Skip to content
12 changes: 12 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -266,8 +266,20 @@ Settings are initialized by `openkb init`, and stored in `.openkb/config.yaml`:
model: gpt-5.4 # LLM model (any LiteLLM-supported provider)
language: en # Wiki output language
pageindex_threshold: 20 # PDF pages threshold for PageIndex
storage_backend: sqlite # Storage backend: sqlite (default) or json
```

### Storage Backend

OpenKB supports two storage backends for the file hash registry:

| Backend | Description | Use Case |
|---------|-------------|----------|
| `sqlite` | SQLite database (default) | Better concurrency and scalability; recommended for production |
| `json` | JSON file | Simple, human-readable, for small installations |

Migration from JSON to SQLite happens automatically when you switch to the `sqlite` backend and a `hashes.json` file exists. The JSON file is preserved but no longer used.

Model names use `provider/model` LiteLLM [format](https://docs.litellm.ai/docs/providers) (OpenAI models can omit the prefix):

| Provider | Model example |
Expand Down
51 changes: 27 additions & 24 deletions openkb/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,14 +277,15 @@ def add_single_file(file_path: Path, kb_dir: Path) -> Literal["added", "skipped"
retry without re-downloading.
"""
from openkb.agent.compiler import compile_long_doc, compile_short_doc
from openkb.state import HashRegistry
from openkb.state import get_registry

logger = logging.getLogger(__name__)
openkb_dir = kb_dir / ".openkb"
config = load_config(openkb_dir / "config.yaml")
_setup_llm_key(kb_dir)
model: str = config.get("model", DEFAULT_CONFIG["model"])
registry = HashRegistry(openkb_dir / "hashes.json")
backend = config.get("storage_backend", "sqlite")
registry = get_registry(openkb_dir, backend=backend)

# 2. Convert document
click.echo(f"Adding: {file_path.name}")
Expand Down Expand Up @@ -553,9 +554,10 @@ def init(model, language):
"model": model,
"language": language,
"pageindex_threshold": DEFAULT_CONFIG["pageindex_threshold"],
"storage_backend": DEFAULT_CONFIG["storage_backend"],
}
save_config(openkb_dir / "config.yaml", config)
(openkb_dir / "hashes.json").write_text(json.dumps({}), encoding="utf-8")
# The SQLite DB is created automatically by get_registry() on first access, so no pre-creation is needed.

# Write API key to KB-local .env (0600) if the user provided one
if api_key:
Expand Down Expand Up @@ -803,15 +805,17 @@ def remove(ctx, identifier, keep_raw, keep_empty_concepts, dry_run, yes):
remove_doc_from_index,
)
from openkb.lint import fix_broken_links
from openkb.state import HashRegistry
from openkb.state import get_registry

kb_dir = _find_kb_dir(ctx.obj.get("kb_dir_override"))
if kb_dir is None:
click.echo("No knowledge base found. Run `openkb init` first.")
return

openkb_dir = kb_dir / ".openkb"
registry = HashRegistry(openkb_dir / "hashes.json")
config = load_config(openkb_dir / "config.yaml")
backend = config.get("storage_backend", "sqlite")
registry = get_registry(openkb_dir, backend=backend)

matches = _resolve_doc_identifier(registry, identifier)
if not matches:
Expand Down Expand Up @@ -1169,20 +1173,16 @@ async def run_lint(kb_dir: Path) -> Path | None:
"""
from openkb.lint import run_structural_lint
from openkb.agent.linter import run_knowledge_lint
from openkb.state import get_registry

openkb_dir = kb_dir / ".openkb"

# Skip lint entirely when the KB has no indexed documents
hashes_file = openkb_dir / "hashes.json"
if hashes_file.exists():
hashes = json.loads(hashes_file.read_text(encoding="utf-8"))
else:
hashes = {}
config = load_config(openkb_dir / "config.yaml")
backend: str = config.get("storage_backend", "sqlite")
registry = get_registry(openkb_dir, backend=backend)
hashes = registry.all_entries()
if not hashes:
click.echo("Nothing to lint — no documents indexed yet. Run `openkb add` first.")
return

config = load_config(openkb_dir / "config.yaml")
_setup_llm_key(kb_dir)
model: str = config.get("model", DEFAULT_CONFIG["model"])

Expand Down Expand Up @@ -1235,13 +1235,13 @@ def lint(ctx, fix):

def print_list(kb_dir: Path) -> None:
"""Print all documents in the knowledge base. Usable from CLI and chat REPL."""
openkb_dir = kb_dir / ".openkb"
hashes_file = openkb_dir / "hashes.json"
if not hashes_file.exists():
click.echo("No documents indexed yet.")
return
from openkb.state import get_registry

hashes = json.loads(hashes_file.read_text(encoding="utf-8"))
openkb_dir = kb_dir / ".openkb"
config = load_config(openkb_dir / "config.yaml")
backend = config.get("storage_backend", "sqlite")
registry = get_registry(openkb_dir, backend=backend)
hashes = registry.all_entries()
if not hashes:
click.echo("No documents indexed yet.")
return
Expand Down Expand Up @@ -1326,11 +1326,14 @@ def print_status(kb_dir: Path) -> None:
click.echo(f" {'raw':<20} {raw_count:<10}")

# Hash registry summary
from openkb.state import get_registry

openkb_dir = kb_dir / ".openkb"
hashes_file = openkb_dir / "hashes.json"
if hashes_file.exists():
hashes = json.loads(hashes_file.read_text(encoding="utf-8"))
click.echo(f"\n Total indexed: {len(hashes)} document(s)")
config = load_config(openkb_dir / "config.yaml")
backend = config.get("storage_backend", "sqlite")
registry = get_registry(openkb_dir, backend=backend)
hashes = registry.all_entries()
click.echo(f"\n Total indexed: {len(hashes)} document(s)")

# Last compile time: newest file in wiki/summaries/
summaries_dir = wiki_dir / "summaries"
Expand Down
1 change: 1 addition & 0 deletions openkb/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
"model": "gpt-5.4-mini",
"language": "en",
"pageindex_threshold": 20,
"storage_backend": "sqlite",
}

GLOBAL_CONFIG_DIR = Path.home() / ".config" / "openkb"
Expand Down
7 changes: 4 additions & 3 deletions openkb/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

from openkb.config import load_config
from openkb.images import copy_relative_images, extract_base64_images, convert_pdf_with_images
from openkb.state import HashRegistry
from openkb.state import get_registry

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -50,12 +50,13 @@ def convert_document(src: Path, kb_dir: Path) -> ConvertResult:
openkb_dir = kb_dir / ".openkb"
config = load_config(openkb_dir / "config.yaml")
threshold: int = config.get("pageindex_threshold", 20)
registry = HashRegistry(openkb_dir / "hashes.json")
backend = config.get("storage_backend", "sqlite")
registry = get_registry(openkb_dir, backend=backend)

# ------------------------------------------------------------------
# 1. Hash check
# ------------------------------------------------------------------
file_hash = HashRegistry.hash_file(src)
file_hash = registry.hash_file(src)
if registry.is_known(file_hash):
logger.info("Skipping already-known file: %s", src.name)
return ConvertResult(skipped=True)
Expand Down
Loading