Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@ uv add semble # Or install with uv

To update Semble, see [Updating](#updating).

Curious how many tokens Semble has saved you? Run `semble savings` to see. See [Savings](#savings) for details.

## Main Features

- **Fast**: indexes an average repo in ~250 ms and answers queries in ~1.5 ms, all on CPU.
Expand Down Expand Up @@ -185,6 +187,29 @@ semble find-related src/auth.py 42 ./my-project

If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its place.

### Savings

`semble savings` shows how many tokens Semble has saved across all your searches:

```bash
semble savings # summary by period
semble savings --verbose # also show breakdown by call type
```

```
Semble Token Savings
════════════════════════════════════════════════════════════════
Period Calls Savings
────────────────────────────────────────────────────────────────
Today 42 [β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‘] ~58.4k tokens (95%)
Last 7 days 287 [β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‘β–‘] ~312.4k tokens (90%)
All time 1.4k [β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‘β–‘] ~1.2M tokens (89%)
```

**How savings are calculated:** for each call, semble records the total character count of the unique files containing returned chunks and the character count of the snippets returned. Estimated tokens saved is `(file chars βˆ’ snippet chars) / 4` (4 chars per token). This is a conservative estimate: the baseline is reading matched files in full, which is how coding agents often explore unfamiliar code.

Stats are stored in `~/.semble/savings.jsonl`.

### Updating

To update/upgrade Semble to the latest version:
Expand Down
10 changes: 9 additions & 1 deletion src/semble/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,11 @@
from model2vec.utils import get_package_extras

from semble.index import SembleIndex
from semble.stats import format_savings_report
from semble.utils import _format_results, _is_git_url, _resolve_chunk

_CLAUDE_FILE_PATH = Path(".claude") / "agents" / "semble-search.md"
_CLI_DISPATCH_ARGS = frozenset({"search", "find-related", "init", "-h", "--help"})
_CLI_DISPATCH_ARGS = frozenset({"search", "find-related", "init", "savings", "-h", "--help"})


def main() -> None:
Expand Down Expand Up @@ -91,12 +92,19 @@ def _cli_main() -> None:
init_p = sub.add_parser("init", help="Write .claude/agents/semble-search.md for Claude Code sub-agent support.")
init_p.add_argument("--force", action="store_true", help="Overwrite if the file already exists.")

savings_p = sub.add_parser("savings", help="Show token savings and usage stats.")
savings_p.add_argument("--verbose", action="store_true", help="Also show usage breakdown by call type.")

args = parser.parse_args()

if args.command == "init":
_run_init(force=args.force)
return

if args.command == "savings":
print(format_savings_report(verbose=args.verbose), end="")
return

include_text = args.include_text_files
index = (
SembleIndex.from_git(args.path, include_text_files=include_text)
Expand Down
43 changes: 35 additions & 8 deletions src/semble/index/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@
from semble.index.create import create_index_from_path
from semble.index.dense import SelectableBasicBackend, load_model
from semble.search import search_bm25, search_hybrid, search_semantic
from semble.types import Chunk, Encoder, IndexStats, SearchMode, SearchResult
from semble.stats import save_search_stats
from semble.types import CallType, Chunk, Encoder, IndexStats, SearchMode, SearchResult

_GIT_CLONE_TIMEOUT = int(os.environ.get("SEMBLE_CLONE_TIMEOUT", 60))

Expand All @@ -39,6 +40,7 @@ def __init__(
self.chunks: list[Chunk] = chunks
self._bm25_index: BM25 = bm25_index
self._semantic_index: SelectableBasicBackend = semantic_index
self._file_sizes: dict[str, int] = {}
self._file_mapping, self._language_mapping = self._populate_mapping()

def _populate_mapping(self) -> tuple[dict[str, list[int]], dict[str, list[int]]]:
Expand All @@ -53,6 +55,20 @@ def _populate_mapping(self) -> tuple[dict[str, list[int]], dict[str, list[int]]]

return dict(file_to_id), dict(language_to_id)

@staticmethod
def _compute_file_sizes(root: Path, chunks: list[Chunk]) -> dict[str, int]:
    """Map each repo-relative file path to its total character count.

    Each distinct path is read at most once; files that cannot be read
    are skipped rather than raising.
    """
    char_counts: dict[str, int] = {}
    for chunk in chunks:
        rel_path = chunk.file_path
        if rel_path not in char_counts:
            try:
                text = (root / rel_path).read_text(encoding="utf-8", errors="replace")
            except OSError:
                continue
            char_counts[rel_path] = len(text)
    return char_counts

@property
def stats(self) -> IndexStats:
"""Stats of an index."""
Expand Down Expand Up @@ -104,6 +120,7 @@ def from_path(
)

index = SembleIndex(model, bm25, vicinity, chunks)
index._file_sizes = SembleIndex._compute_file_sizes(path, chunks)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this can be self._compute_file_sizes. I would not call this here and just make it part of the __init__, or, if it is fast enough, just make it a property. There's also afaik no need to make this a staticmethod, doing

index.recompute_file_sizes()

is completely fine. I realize you do need the root, but the root could be part of the SembleIndex.


return index

Expand Down Expand Up @@ -158,6 +175,7 @@ def from_git(
)

index = SembleIndex(model, bm25, vicinity, chunks)
index._file_sizes = SembleIndex._compute_file_sizes(resolved_path, chunks)

return index

Expand All @@ -171,7 +189,11 @@ def find_related(self, source: Chunk | SearchResult, *, top_k: int = 5) -> list[
target = source.chunk if isinstance(source, SearchResult) else source
selector = self._get_selector_vector(filter_languages=[target.language]) if target.language else None
results = search_semantic(target.content, self.model, self._semantic_index, self.chunks, top_k + 1, selector)
return [r for r in results if r.chunk != target][:top_k]
results = [r for r in results if r.chunk != target][:top_k]
if self._file_sizes:
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think this ever needs to be checked. It's probably just better to populate file_sizes here if it doesn't exist already.

# Save stats if file sizes are available
save_search_stats(results, CallType.FIND_RELATED, self._file_sizes)
return results

def _get_selector_vector(
self, filter_languages: list[str] | None = None, filter_paths: list[str] | None = None
Expand Down Expand Up @@ -216,11 +238,16 @@ def search(
selector = self._get_selector_vector(filter_languages, filter_paths)

if mode == SearchMode.BM25:
return search_bm25(query, bm25_index, self.chunks, top_k, selector=selector)
if mode == SearchMode.SEMANTIC:
return search_semantic(query, self.model, semantic_index, self.chunks, top_k, selector=selector)
if mode == SearchMode.HYBRID:
return search_hybrid(
results = search_bm25(query, bm25_index, self.chunks, top_k, selector=selector)
elif mode == SearchMode.SEMANTIC:
results = search_semantic(query, self.model, semantic_index, self.chunks, top_k, selector=selector)
elif mode == SearchMode.HYBRID:
results = search_hybrid(
query, self.model, semantic_index, bm25_index, self.chunks, top_k, alpha=alpha, selector=selector
)
raise ValueError(f"Unknown search mode: {mode!r}")
else:
raise ValueError(f"Unknown search mode: {mode!r}")
if self._file_sizes:
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

idem

# Save stats if file sizes are available
save_search_stats(results, CallType.SEARCH, self._file_sizes)
return results
137 changes: 137 additions & 0 deletions src/semble/stats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
import json
from collections import defaultdict
from dataclasses import dataclass
from datetime import datetime, timedelta, timezone
from pathlib import Path

from semble.types import CallType, SearchResult

_STATS_FILE = Path.home() / ".semble" / "savings.jsonl"


@dataclass
class BucketStats:
    """Accumulated call count and character totals for one reporting bucket."""

    calls: int = 0
    snippet_chars: int = 0
    file_chars: int = 0

    def add(self, snippet_chars: int, file_chars: int) -> None:
        """Record one call together with its snippet and file character counts."""
        self.calls = self.calls + 1
        self.snippet_chars = self.snippet_chars + snippet_chars
        self.file_chars = self.file_chars + file_chars


@dataclass
class SavingsSummary:
    """Aggregated token-savings statistics read from the stats file."""

    # Per-period totals keyed by label ("Today", "Last 7 days", "All time").
    buckets: dict[str, BucketStats]
    # Number of recorded calls per call type (e.g. "search", "find_related").
    call_type_counts: dict[str, int]


def save_search_stats(
    results: list[SearchResult],
    call_type: CallType,
    file_sizes: dict[str, int],
) -> None:
    """Append one record about a search or find_related call to the stats file.

    Only results whose file has a known size contribute to the character
    totals: a path missing from ``file_sizes`` would add snippet characters
    while adding zero file characters, skewing the savings estimate downward
    (and possibly negative).

    Writing is best-effort: any OSError is swallowed so that stats recording
    can never break a search.
    """
    try:
        # Restrict both sums to results backed by a known file size.
        counted = [r for r in results if r.chunk.file_path in file_sizes]
        snippet_chars = sum(len(r.chunk.content) for r in counted)
        # Deduplicate paths so a file returned in several chunks is counted once.
        file_chars = sum(file_sizes[path] for path in {r.chunk.file_path for r in counted})
        record = {
            "ts": datetime.now(timezone.utc).isoformat(),
            "call": call_type,
            "results": len(results),
            "snippet_chars": snippet_chars,
            "file_chars": file_chars,
        }
        _STATS_FILE.parent.mkdir(parents=True, exist_ok=True)
        with _STATS_FILE.open("a") as f:
            f.write(json.dumps(record) + "\n")
    except OSError:
        pass


def build_savings_summary(path: Path = _STATS_FILE) -> SavingsSummary:
    """Parse the savings.jsonl log at *path* into a SavingsSummary."""
    current = datetime.now(timezone.utc)
    today = current.date()
    week_cutoff = (current - timedelta(days=7)).date()

    buckets = {label: BucketStats() for label in ("Today", "Last 7 days", "All time")}
    counts_by_call: defaultdict[str, int] = defaultdict(int)

    with path.open() as stats_file:
        for raw_line in stats_file:
            try:
                entry = json.loads(raw_line)
            except json.JSONDecodeError:
                continue  # skip corrupt lines rather than failing the whole report
            snippets = entry.get("snippet_chars", 0)
            files = entry.get("file_chars", 0)
            counts_by_call[entry.get("call", "search")] += 1
            try:
                entry_date = datetime.fromisoformat(entry.get("ts", "")).date()
            except ValueError:
                # Unparseable timestamp: the record still counts toward "All time".
                entry_date = None
            buckets["All time"].add(snippets, files)
            if entry_date is not None and entry_date > week_cutoff:
                buckets["Last 7 days"].add(snippets, files)
            if entry_date == today:
                buckets["Today"].add(snippets, files)

    return SavingsSummary(buckets=buckets, call_type_counts=dict(counts_by_call))


def format_savings_report(path: Path | None = None, *, verbose: bool = False) -> str:
"""Return a formatted token-savings report."""
if path is None:
path = _STATS_FILE
if not path.exists():
return "No stats yet. Run a search first."

summary = build_savings_summary(path)
bar_width = 16
heavy_line = " " + "═" * 64
light_line = " " + "─" * 64

lines = [
"",
" Semble Token Savings",
heavy_line,
f" {'Period':<12} {'Calls':<6} Savings",
light_line,
]
for label, bucket in summary.buckets.items():
saved_chars = max(0, bucket.file_chars - bucket.snippet_chars)
saved_tokens = saved_chars // 4 # standard ~4 chars/token approximation
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hmmmm...

if saved_tokens >= 1_000_000:
saved_str = f"~{saved_tokens / 1_000_000:.1f}M"
elif saved_tokens >= 1000:
saved_str = f"~{saved_tokens / 1000:.1f}k"
else:
saved_str = f"~{saved_tokens}"
calls_str = f"{bucket.calls / 1000:.1f}k" if bucket.calls >= 1000 else str(bucket.calls)
if bucket.file_chars > 0:
ratio = saved_chars / bucket.file_chars
filled = round(ratio * bar_width)
bar = "β–ˆ" * filled + "β–‘" * (bar_width - filled)
pct = round(ratio * 100)
lines.append(f" {label:<12} {calls_str:<6} [{bar}] {saved_str} tokens ({pct}%)")
else:
lines.append(f" {label:<12} {calls_str:<6} [{'β–‘' * bar_width}] {saved_str} tokens")
if verbose and summary.call_type_counts:
lines += ["", " Usage Breakdown", light_line, f" {'Call type':<16} Calls"]
for call_type, count in sorted(summary.call_type_counts.items()):
count_str = f"{count / 1000:.1f}k" if count >= 1000 else str(count)
lines.append(f" {call_type:<16} {count_str}")
lines.append(heavy_line)
lines.append("")
return "\n".join(lines)
7 changes: 7 additions & 0 deletions src/semble/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,13 @@ class SearchMode(str, Enum):
BM25 = "bm25"


class CallType(str, Enum):
    """Call type for token-savings tracking.

    The ``str`` mixin lets members compare equal to, and serialize as,
    their plain string values.
    """

    SEARCH = "search"  # a regular `search` call
    FIND_RELATED = "find_related"  # a `find_related` call


class Encoder(Protocol):
"""Protocol for embedding models."""

Expand Down
19 changes: 19 additions & 0 deletions tests/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from semble import SembleIndex
from semble.index.create import _MAX_FILE_BYTES, create_index_from_path
from semble.types import Encoder
from tests.conftest import make_chunk


@pytest.fixture
Expand Down Expand Up @@ -87,6 +88,24 @@ def test_search_empty_query_returns_empty(indexed_index: SembleIndex, mode: str,
assert indexed_index.search(query, mode=mode) == []


@pytest.mark.parametrize(
    ("disk_files", "chunk_paths", "expected"),
    [
        ({"foo.py": "hello world"}, ["foo.py", "foo.py"], {"foo.py": 11}),
        ({}, ["nonexistent.py"], {}),
    ],
    ids=["dedup-same-file", "missing-file-skipped"],
)
def test_compute_file_sizes(
    tmp_path: Path, disk_files: dict[str, str], chunk_paths: list[str], expected: dict[str, int]
) -> None:
    """_compute_file_sizes deduplicates paths and silently skips missing files."""
    for filename, contents in disk_files.items():
        (tmp_path / filename).write_text(contents)
    chunks = [make_chunk("c", path) for path in chunk_paths]
    result = SembleIndex._compute_file_sizes(tmp_path, chunks)
    assert result == expected


def test_find_related(indexed_index: SembleIndex) -> None:
"""find_related returns related chunks for a Chunk or SearchResult seed."""
chunk = indexed_index.chunks[0]
Expand Down
Loading
Loading