-
Notifications
You must be signed in to change notification settings - Fork 60
feat: Add "semble savings" command #76
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
1431444
e529ae9
ac7c3b7
7b18560
93ff6b6
688533f
12963ab
cdee40b
9e4d447
1e3ea12
eec1374
0b4034a
1d30641
fdbcd0a
7a2b438
661dbbc
5b52215
f3d935f
1737c4c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -13,7 +13,8 @@ | |
| from semble.index.create import create_index_from_path | ||
| from semble.index.dense import SelectableBasicBackend, load_model | ||
| from semble.search import search_bm25, search_hybrid, search_semantic | ||
| from semble.types import Chunk, Encoder, IndexStats, SearchMode, SearchResult | ||
| from semble.stats import save_search_stats | ||
| from semble.types import CallType, Chunk, Encoder, IndexStats, SearchMode, SearchResult | ||
|
|
||
| _GIT_CLONE_TIMEOUT = int(os.environ.get("SEMBLE_CLONE_TIMEOUT", 60)) | ||
|
|
||
|
|
@@ -39,6 +40,7 @@ def __init__( | |
| self.chunks: list[Chunk] = chunks | ||
| self._bm25_index: BM25 = bm25_index | ||
| self._semantic_index: SelectableBasicBackend = semantic_index | ||
| self._file_sizes: dict[str, int] = {} | ||
| self._file_mapping, self._language_mapping = self._populate_mapping() | ||
|
|
||
| def _populate_mapping(self) -> tuple[dict[str, list[int]], dict[str, list[int]]]: | ||
|
|
@@ -53,6 +55,20 @@ def _populate_mapping(self) -> tuple[dict[str, list[int]], dict[str, list[int]]] | |
|
|
||
| return dict(file_to_id), dict(language_to_id) | ||
|
|
||
@staticmethod
def _compute_file_sizes(root: Path, chunks: list[Chunk]) -> dict[str, int]:
    """Return a mapping of repo-relative file path to total character count.

    Files that cannot be read are silently skipped (best-effort), so the
    returned mapping may not contain an entry for every chunk's file.
    """
    sizes: dict[str, int] = {}
    for chunk in chunks:
        rel_path = chunk.file_path
        # Each file is measured at most once, even if many chunks share it.
        if rel_path in sizes:
            continue
        try:
            text = (root / rel_path).read_text(encoding="utf-8", errors="replace")
        except OSError:
            # Unreadable file: leave it out of the mapping.
            continue
        sizes[rel_path] = len(text)
    return sizes
|
|
||
| @property | ||
| def stats(self) -> IndexStats: | ||
| """Stats of an index.""" | ||
|
|
@@ -104,6 +120,7 @@ def from_path( | |
| ) | ||
|
|
||
| index = SembleIndex(model, bm25, vicinity, chunks) | ||
| index._file_sizes = SembleIndex._compute_file_sizes(path, chunks) | ||
|
|
||
| return index | ||
|
|
||
|
|
@@ -158,6 +175,7 @@ def from_git( | |
| ) | ||
|
|
||
| index = SembleIndex(model, bm25, vicinity, chunks) | ||
| index._file_sizes = SembleIndex._compute_file_sizes(resolved_path, chunks) | ||
|
|
||
| return index | ||
|
|
||
|
|
@@ -171,7 +189,11 @@ def find_related(self, source: Chunk | SearchResult, *, top_k: int = 5) -> list[ | |
| target = source.chunk if isinstance(source, SearchResult) else source | ||
| selector = self._get_selector_vector(filter_languages=[target.language]) if target.language else None | ||
| results = search_semantic(target.content, self.model, self._semantic_index, self.chunks, top_k + 1, selector) | ||
| return [r for r in results if r.chunk != target][:top_k] | ||
| results = [r for r in results if r.chunk != target][:top_k] | ||
| if self._file_sizes: | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think this ever needs to be checked. It's probably just better to populate file_sizes here if it doesn't exist already. |
||
| # Save stats if file sizes are available | ||
| save_search_stats(results, CallType.FIND_RELATED, self._file_sizes) | ||
| return results | ||
|
|
||
| def _get_selector_vector( | ||
| self, filter_languages: list[str] | None = None, filter_paths: list[str] | None = None | ||
|
|
@@ -216,11 +238,16 @@ def search( | |
| selector = self._get_selector_vector(filter_languages, filter_paths) | ||
|
|
||
| if mode == SearchMode.BM25: | ||
| return search_bm25(query, bm25_index, self.chunks, top_k, selector=selector) | ||
| if mode == SearchMode.SEMANTIC: | ||
| return search_semantic(query, self.model, semantic_index, self.chunks, top_k, selector=selector) | ||
| if mode == SearchMode.HYBRID: | ||
| return search_hybrid( | ||
| results = search_bm25(query, bm25_index, self.chunks, top_k, selector=selector) | ||
| elif mode == SearchMode.SEMANTIC: | ||
| results = search_semantic(query, self.model, semantic_index, self.chunks, top_k, selector=selector) | ||
| elif mode == SearchMode.HYBRID: | ||
| results = search_hybrid( | ||
| query, self.model, semantic_index, bm25_index, self.chunks, top_k, alpha=alpha, selector=selector | ||
| ) | ||
| raise ValueError(f"Unknown search mode: {mode!r}") | ||
| else: | ||
| raise ValueError(f"Unknown search mode: {mode!r}") | ||
| if self._file_sizes: | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. idem |
||
| # Save stats if file sizes are available | ||
| save_search_stats(results, CallType.SEARCH, self._file_sizes) | ||
| return results | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,137 @@ | ||
| import json | ||
| from collections import defaultdict | ||
| from dataclasses import dataclass | ||
| from datetime import datetime, timedelta, timezone | ||
| from pathlib import Path | ||
|
|
||
| from semble.types import CallType, SearchResult | ||
|
|
||
| _STATS_FILE = Path.home() / ".semble" / "savings.jsonl" | ||
|
|
||
|
|
||
@dataclass
class BucketStats:
    """Running totals of calls and character counts for one reporting period."""

    # Number of search/find_related calls recorded in this period.
    calls: int = 0
    # Total characters across the returned snippets.
    snippet_chars: int = 0
    # Total characters counted for the corresponding whole files.
    file_chars: int = 0

    def add(self, snippet_chars: int, file_chars: int) -> None:
        """Fold one call's character counts into the running totals."""
        self.calls = self.calls + 1
        self.snippet_chars = self.snippet_chars + snippet_chars
        self.file_chars = self.file_chars + file_chars
|
|
||
|
|
||
@dataclass
class SavingsSummary:
    """Aggregated savings data ready for reporting."""

    # Period label (e.g. "Today", "Last 7 days", "All time") -> its stats.
    buckets: dict[str, BucketStats]
    # Call-type name -> number of recorded calls of that type.
    call_type_counts: dict[str, int]
|
|
||
|
|
||
def save_search_stats(
    results: list[SearchResult],
    call_type: CallType,
    file_sizes: dict[str, int],
) -> None:
    """Append one JSONL stats record for a search or find_related call.

    Best-effort: any filesystem error is swallowed so stats collection can
    never break a search.

    Args:
        results: Search results whose chunks are being recorded.
        call_type: Which API produced the results.
        file_sizes: Repo-relative file path -> total character count.
    """
    try:
        # Disregard chunks whose file size is unknown: counting their snippet
        # characters without a matching file_chars contribution would skew
        # the savings ratio. A missing entry likely indicates an upstream
        # failure to read the file.
        sized = [r for r in results if r.chunk.file_path in file_sizes]
        snippet_chars = sum(len(r.chunk.content) for r in sized)
        # Each file counts once even when several chunks come from it.
        file_chars = sum(file_sizes[p] for p in {r.chunk.file_path for r in sized})

        record = {
            # NOTE(review): consider storing a numeric epoch timestamp instead
            # of ISO text; the reader (build_savings_summary) must be updated
            # in the same change.
            "ts": datetime.now(timezone.utc).isoformat(),
            "call": call_type,
            "results": len(results),
            "snippet_chars": snippet_chars,
            "file_chars": file_chars,
        }
        _STATS_FILE.parent.mkdir(parents=True, exist_ok=True)
        with _STATS_FILE.open("a") as f:
            f.write(json.dumps(record) + "\n")
    except OSError:
        # Stats are non-essential; never surface storage errors to callers.
        pass
|
|
||
|
|
||
def build_savings_summary(path: Path = _STATS_FILE) -> SavingsSummary:
    """Read savings.jsonl and return a SavingsSummary."""
    current = datetime.now(timezone.utc)
    current_day = current.date()
    window_start = (current - timedelta(days=7)).date()

    period_stats = {label: BucketStats() for label in ("Today", "Last 7 days", "All time")}
    counts_by_call: dict[str, int] = {}

    with path.open() as handle:
        for raw_line in handle:
            try:
                entry = json.loads(raw_line)
            except json.JSONDecodeError:
                # NOTE(review): a corrupt line is unexpected since we write this
                # file ourselves — consider warning instead of skipping silently.
                continue

            chars_snippet = entry.get("snippet_chars", 0)
            chars_file = entry.get("file_chars", 0)
            kind = entry.get("call", "search")
            counts_by_call[kind] = counts_by_call.get(kind, 0) + 1

            try:
                day = datetime.fromisoformat(entry.get("ts", "")).date()
            except ValueError:
                # Unparseable timestamp: the record counts toward "All time" only.
                # NOTE(review): also unexpected for self-written records; a numeric
                # epoch timestamp would avoid parsing altogether.
                is_today = within_week = False
            else:
                is_today = day == current_day
                within_week = day > window_start

            period_stats["All time"].add(chars_snippet, chars_file)
            if within_week:
                period_stats["Last 7 days"].add(chars_snippet, chars_file)
            if is_today:
                period_stats["Today"].add(chars_snippet, chars_file)

    return SavingsSummary(buckets=period_stats, call_type_counts=counts_by_call)
|
|
||
|
|
||
def format_savings_report(path: Path | None = None, *, verbose: bool = False) -> str:
    """Return a formatted token-savings report."""
    stats_path = _STATS_FILE if path is None else path
    if not stats_path.exists():
        return "No stats yet. Run a search first."

    summary = build_savings_summary(stats_path)
    bar_width = 16
    # NOTE(review): the repeated "β" glyphs below look mojibake'd in this copy
    # of the source — confirm the intended box-drawing / bar characters.
    heavy_line = " " + "β" * 64
    light_line = " " + "β" * 64

    def compact(count: int) -> str:
        # Abbreviate large call counts, e.g. 1234 -> "1.2k".
        return f"{count / 1000:.1f}k" if count >= 1000 else str(count)

    out = [
        "",
        " Semble Token Savings",
        heavy_line,
        f" {'Period':<12} {'Calls':<6} Savings",
        light_line,
    ]
    for label, bucket in summary.buckets.items():
        saved_chars = max(0, bucket.file_chars - bucket.snippet_chars)
        # standard ~4 chars/token approximation — NOTE(review): rough heuristic.
        saved_tokens = saved_chars // 4
        if saved_tokens >= 1_000_000:
            saved_str = f"~{saved_tokens / 1_000_000:.1f}M"
        elif saved_tokens >= 1000:
            saved_str = f"~{saved_tokens / 1000:.1f}k"
        else:
            saved_str = f"~{saved_tokens}"
        calls_str = compact(bucket.calls)
        if bucket.file_chars > 0:
            ratio = saved_chars / bucket.file_chars
            filled = round(ratio * bar_width)
            bar = "β" * filled + "β" * (bar_width - filled)
            pct = round(ratio * 100)
            out.append(f" {label:<12} {calls_str:<6} [{bar}] {saved_str} tokens ({pct}%)")
        else:
            # No file data at all for this period: render an empty bar.
            out.append(f" {label:<12} {calls_str:<6} [{'β' * bar_width}] {saved_str} tokens")
    if verbose and summary.call_type_counts:
        out += ["", " Usage Breakdown", light_line, f" {'Call type':<16} Calls"]
        for name, count in sorted(summary.call_type_counts.items()):
            out.append(f" {name:<16} {compact(count)}")
    out.append(heavy_line)
    out.append("")
    return "\n".join(out)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This can be `self._compute_file_sizes`. I would not call this here; just make it part of the `__init__`, or, if it is fast enough, just make it a property. There's also, as far as I know, no need to make this a staticmethod — doing `self._compute_file_sizes(...)` is completely fine. I realize you do need the root, but the root could be part of the `SembleIndex`.