-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: Add sqlite-vss to add similarity search to sqlite
- Loading branch information
1 parent
60b7198
commit 90bd0a1
Showing
12 changed files
with
1,869 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
# Image used to run the project's test suite with pytest.
FROM ubuntu:22.04

# Refresh the package index and install in the SAME layer: a separately cached
# `apt update` layer can otherwise be reused with an outdated index when the
# install line changes. `apt-get` is used because `apt` has no stable CLI for
# scripts. The native libraries are presumably required by sqlite-vss and its
# numeric dependencies -- confirm before trimming the list.
RUN apt-get update \
    && apt-get install -y python3-pip libgomp1 libatlas-base-dev liblapack-dev libsqlite3-dev \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Copy only the dependency manifests first so the dependency layers below stay
# cached until the manifests themselves change. COPY is preferred over ADD for
# plain local files (no archive extraction or URL fetching is wanted here).
COPY poetry.lock /app/poetry.lock
COPY pyproject.toml /app/pyproject.toml

RUN pip install poetry
# Install into the system interpreter instead of a Poetry-managed virtualenv.
RUN poetry config virtualenvs.create false
RUN poetry install

# Copy the rest of the project after dependencies are installed.
COPY . /app

CMD ["pytest", "./tests"]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,29 @@ | ||
# Content map | ||
|
||
A way to share content from a specific domain. Similar to sitemaps but for content. | ||
A way to share content from a specific domain using SQLite as an alternative to | ||
RSS feeds. The purpose of this library is to simply create a dataset for all the | ||
content on your website, using the XML sitemap as a starting point. | ||
|
||
|
||
## Installation | ||
|
||
```bash | ||
|
||
pip install contentmap | ||
|
||
``` | ||
|
||
## Quickstart | ||
|
||
To build your contentmap.db that will contain all your content using your XML | ||
sitemap as a starting point, you only need to write the following: | ||
|
||
```python | ||
from contentmap.sitemap import SitemapToContentDatabase | ||
|
||
database = SitemapToContentDatabase("https://yourblog.com/sitemap.xml") | ||
database.load() | ||
|
||
``` | ||
|
||
You can control how many URLs are crawled concurrently and also set a timeout.
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
""" | ||
Class ContentMapVSS to create vector search dataset from a contentmap | ||
dataset already created. | ||
""" | ||
import sqlite3 | ||
from typing import Optional | ||
|
||
import sqlite_vss | ||
from langchain.text_splitter import RecursiveCharacterTextSplitter | ||
from langchain.text_splitter import CharacterTextSplitter | ||
from langchain_community.document_loaders import TextLoader | ||
from langchain_community.embeddings.sentence_transformer import ( | ||
SentenceTransformerEmbeddings, | ||
) | ||
from langchain_community.vectorstores import SQLiteVSS | ||
|
||
|
||
class ContentMapVSS:
    """Vector-search layer on top of an existing contentmap SQLite database.

    Holds a SQLite connection and provides helpers to check that the
    ``content`` table is present and to set up the ``content_chunks``
    vector store through ``SQLiteVSS``.
    """

    def __init__(self,
                 connection: Optional[sqlite3.Connection] = None,
                 db_file: str = "contentmap.db"
                 ):
        """Store the connection to use, opening one if none is supplied.

        Args:
            connection: An already-open SQLite connection. When omitted, a
                new one is created with ``SQLiteVSS.create_connection``.
            db_file: Path of the database file to open when ``connection``
                is not given.
        """
        if connection is None:
            connection = SQLiteVSS.create_connection(db_file)
        self.connection = connection

    def load(self):
        """Build the ``SQLiteVSS`` store bound to this connection.

        Returns:
            The ``SQLiteVSS`` instance configured for the ``content_chunks``
            table with the ``all-MiniLM-L6-v2`` sentence-transformer
            embeddings.

        Raises:
            AssertionError: If the ``content`` table does not exist.
        """
        # The content table must be there. Raised explicitly (instead of a
        # bare ``assert``) so the check still runs under ``python -O``.
        if not self.table_exists(table_name="content"):
            raise AssertionError(
                "The 'content' table must exist before loading the vector store"
            )

        embedding_function = SentenceTransformerEmbeddings(
            model_name="all-MiniLM-L6-v2"
        )
        vss = SQLiteVSS(
            table="content_chunks",
            embedding=embedding_function,
            connection=self.connection
        )
        return vss

    def table_exists(self, table_name: str) -> bool:
        """Return True when a table named ``table_name`` exists.

        The name is bound as a query parameter rather than interpolated into
        the SQL text, so quotes in ``table_name`` cannot alter the query.
        """
        res = self.connection.execute(
            """
            SELECT name
            FROM sqlite_master
            WHERE type='table' AND name=?;
            """,
            (table_name,),
        )
        return res.fetchone() is not None
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
import sqlite3 | ||
|
||
|
||
def test_fts_extension_enabled():
    """Check that the linked SQLite build reports the ENABLE_FTS5 option."""
    connection = sqlite3.connect(':memory:')
    # Each compile option comes back as a one-element row tuple.
    compile_options = connection.execute('pragma compile_options;').fetchall()
    connection.close()

    assert ('ENABLE_FTS5',) in compile_options
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
from contentmap.vss import ContentMapVSS | ||
import os.path as op | ||
from tests.utils import build_fixture_db | ||
|
||
|
||
class TestContentMapVSS:
    """Checks of ``ContentMapVSS.table_exists`` for the ``content`` table."""

    def test_assertion_content_exists(self):
        # fixture.db lives next to this test module and is expected to
        # already hold a ``content`` table.
        tests_dir = op.dirname(__file__)
        store = ContentMapVSS(db_file=op.join(tests_dir, "fixture.db"))
        found = store.table_exists(table_name="content")
        assert found is True

    def test_assertion_content_not_exists(self):
        # A fresh in-memory database has no tables at all.
        store = ContentMapVSS(db_file=":memory:")
        found = store.table_exists(table_name="content")
        assert found is False
|
||
|
||
class TestVssTablesCreation:
    """Checks that loading the vector store creates its backing table."""

    @classmethod
    def setup_class(cls):
        # Copy the fixture database only when this class's tests are about to
        # run, instead of as a side effect of importing/collecting the module.
        cls.db = build_fixture_db()

    def test_vss_instance(self):
        cm_vss = ContentMapVSS(db_file=self.db)
        cm_vss.load()
        assert cm_vss.table_exists("content_chunks")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
import os.path as op | ||
import shutil | ||
|
||
|
||
def build_fixture_db():
    """Copy ``fixture.db`` to ``contentmap.db`` beside this module.

    Returns:
        The path of the ``contentmap.db`` copy.
    """
    here = op.dirname(__file__)
    source_path = op.join(here, 'fixture.db')
    target_path = op.join(here, 'contentmap.db')
    shutil.copy(source_path, target_path)
    return target_path