Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add sqlite-vss to add similarity search to sqlite #4

Merged
merged 2 commits into from
Feb 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ jobs:
strategy:
fail-fast: true
matrix:
python-version: ["3.8", "3.9", "3.10"]
python-version: ["3.8", "3.9", "3.10", "3.11"]

steps:
- uses: actions/checkout@v3
Expand Down
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
contentmap.db

/scratch

Byte-compiled / optimized / DLL files
__pycache__/
Expand Down
21 changes: 21 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Test image: installs the project with poetry and runs the pytest suite.
FROM ubuntu:22.04


RUN apt update
# NOTE(review): libgomp1/libatlas-base-dev/liblapack-dev presumably back the
# sentence-transformers numeric stack, and libsqlite3-dev the sqlite-vss
# extension — confirm against the dependency docs.
RUN apt install -y python3-pip libgomp1 libatlas-base-dev liblapack-dev libsqlite3-dev

WORKDIR /app

# Copy only the dependency manifests first so the expensive `poetry install`
# layer is cached until the lockfile or pyproject actually changes.
ADD poetry.lock /app/poetry.lock
ADD pyproject.toml /app/pyproject.toml

RUN pip install poetry
# Install into the system interpreter — no virtualenv inside the container.
RUN poetry config virtualenvs.create false
RUN poetry install

# Pre-download the embedding model at build time so tests don't hit the
# network (and don't pay the download cost) at run time.
RUN python3 -c 'from sentence_transformers import SentenceTransformer; embedder = SentenceTransformer("all-MiniLM-L6-v2")'


# Copy the rest of the source last — code edits only invalidate this layer.
ADD . /app

CMD ["pytest", "./tests"]
28 changes: 27 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,29 @@
# Content map

A way to share content from a specific domain. Similar to sitemaps but for content.
A way to share content from a specific domain using SQLite as an alternative to
RSS feeds. The purpose of this library is to simply create a dataset for all the
content on your website, using the XML sitemap as a starting point.


## Installation

```bash

pip install contentmap

```

## Quickstart

To build your contentmap.db that will contain all your content using your XML
sitemap as a starting point, you only need to write the following:

```python
from contentmap.sitemap import SitemapToContentDatabase

database = SitemapToContentDatabase("https://yourblog.com/sitemap.xml")
database.load()

```

You can control how many URLs are crawled concurrently and also set a timeout.
Empty file added contentmap/ftse.py
Empty file.
76 changes: 76 additions & 0 deletions contentmap/vss.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
"""
Class ContentMapVSS to create vector search dataset from a contentmap
dataset already created.
"""
import sqlite3
from typing import Optional

import sqlite_vss
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings.sentence_transformer import (
SentenceTransformerEmbeddings,
)
from langchain_community.vectorstores import SQLiteVSS


class ContentMapVSS:
    """Build and query a sqlite-vss vector-search table from a contentmap DB.

    Wraps a LangChain ``SQLiteVSS`` store: rows from the existing ``content``
    table are chunked, embedded with ``all-MiniLM-L6-v2``, and written to a
    ``content_chunks`` virtual table inside the same SQLite file.
    """

    def __init__(self,
                 connection: Optional[sqlite3.Connection] = None,
                 db_file: str = "contentmap.db"
                 ):
        """Open (or reuse) a SQLite connection and prepare the VSS store.

        :param connection: an existing sqlite3 connection to reuse; when None,
            one is created from ``db_file``.
        :param db_file: path to the contentmap database (ignored when
            ``connection`` is given).
        """
        self.connection = connection
        if not connection:
            # SQLiteVSS.create_connection also loads the sqlite-vss extension,
            # which a plain sqlite3.connect() would not do.
            self.connection = SQLiteVSS.create_connection(db_file)

        embedding_function = SentenceTransformerEmbeddings(
            model_name="all-MiniLM-L6-v2"
        )
        self.vss = SQLiteVSS(
            table="content_chunks",
            embedding=embedding_function,
            connection=self.connection
        )

    def load(self):
        """Chunk every row of the ``content`` table and index it in the store.

        :return: the underlying ``SQLiteVSS`` instance.
        """
        # content table must be there
        assert self.table_exists(table_name="content")
        texts, metadatas = self.prepare_texts_and_metadatas()
        self.vss.add_texts(texts=texts, metadatas=metadatas)
        return self.vss

    def table_exists(self, table_name: str) -> bool:
        """Return True when ``table_name`` exists in the database.

        Uses a parameterized query: the previous f-string interpolation was
        vulnerable to SQL injection and broke on names containing quotes.
        """
        res = self.connection.execute(
            "SELECT name FROM sqlite_master WHERE type='table' AND name=?;",
            (table_name,),
        )
        return len(res.fetchall()) == 1

    def prepare_texts_and_metadatas(self):
        """Split all content rows into chunks paired with per-chunk metadata.

        :return: ``(texts, metadatas)`` — parallel lists where each metadata
            dict carries the source ``url`` of its chunk.
        """
        cursor = self.connection.cursor()
        result = cursor.execute("SELECT content, url FROM content")
        rows = result.fetchall()

        # based on Anyscale analysis (https://t.ly/yjgxQ), it looks like the
        # sweet spot is 700 chunk size and 50 chunk overlap
        text_splitter = CharacterTextSplitter(chunk_size=700, chunk_overlap=50)

        texts = []
        metadatas = []
        # Positional unpacking works for both plain tuple rows and sqlite3.Row,
        # so a caller-supplied connection without row_factory set still works
        # (string keys like row["content"] would require sqlite3.Row).
        for content, url in rows:
            chunks = text_splitter.split_text(content)
            texts += chunks
            metadatas += [{"url": url} for _ in chunks]

        return texts, metadatas

    def similarity_search(self, *args, **kwargs):
        """Proxy straight through to ``SQLiteVSS.similarity_search``."""
        return self.vss.similarity_search(*args, **kwargs)
1,721 changes: 1,720 additions & 1 deletion poetry.lock

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@ tqdm = "^4.66.1"
lxml = "4.9.4"
trafilatura = "^1.6.4"
aiohttp = "^3.9.1"
sqlite-vss = "^0.1.2"
langchain = "^0.1.8"
sentence-transformers = "^2.3.1"


[tool.poetry.group.test.dependencies]
Expand Down
8 changes: 7 additions & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import pytest
import os
import os.path as op
import logging


@pytest.fixture(autouse=True)
Expand All @@ -8,5 +10,9 @@ def remove_created_database_after_test():
# Setup logic
yield # this is where the testing happens
# Teardown logic
os.remove("contentmap.db")

contentmap_db_path = op.join(op.dirname(__file__), "contentmap.db")
if op.exists(contentmap_db_path):
logging.info('Destroying mock sqlite content instance')
os.remove(contentmap_db_path)

Binary file added tests/fixture.db
Binary file not shown.
12 changes: 12 additions & 0 deletions tests/test_fts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import sqlite3


def test_fts_extension_enabled():
    """The local SQLite build must have been compiled with FTS5 support."""
    connection = sqlite3.connect(':memory:')
    compile_options = connection.execute('pragma compile_options;').fetchall()
    connection.close()

    assert ('ENABLE_FTS5',) in compile_options
50 changes: 50 additions & 0 deletions tests/test_vss.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
from contentmap.vss import ContentMapVSS
import os.path as op
from tests.utils import build_fixture_db


class TestContentMapVSS:
    """Checks for ContentMapVSS.table_exists with and without a content table."""

    def test_assertion_content_exists(self):
        # the bundled fixture.db ships with a populated "content" table
        fixture_path = op.join(op.dirname(__file__), "fixture.db")
        content_map = ContentMapVSS(db_file=fixture_path)
        assert content_map.table_exists(table_name="content") is True

    def test_assertion_content_not_exists(self):
        # a fresh in-memory database contains no tables at all
        content_map = ContentMapVSS(db_file=":memory:")
        assert content_map.table_exists(table_name="content") is False


class TestVssTablesCreation:
    """End-to-end checks that load() builds and fills the content_chunks table."""

    def test_vss_instance(self):
        fixture_path = build_fixture_db()
        content_map = ContentMapVSS(db_file=fixture_path)
        content_map.load()
        assert content_map.table_exists("content_chunks")

    def test_prepare_texts_and_metadatas(self):
        fixture_path = build_fixture_db()
        content_map = ContentMapVSS(db_file=fixture_path)
        # texts and metadatas are parallel lists — one metadata dict per chunk
        texts, metadatas = content_map.prepare_texts_and_metadatas()
        assert len(texts) == len(metadatas) >= 1

    def test_chunk_table(self):
        fixture_path = build_fixture_db()
        content_map = ContentMapVSS(db_file=fixture_path)
        content_map.load()
        assert content_map.table_exists("content_chunks")
        cursor = content_map.connection.cursor()
        chunk_rows = cursor.execute("SELECT * FROM content_chunks").fetchall()
        assert len(chunk_rows) >= 15

    def test_similarity_search(self):
        fixture_path = build_fixture_db()
        content_map = ContentMapVSS(db_file=fixture_path)
        content_map.load()
        documents = content_map.similarity_search(query="who is Mistral ai company?", k=2)
        assert len(documents) == 2
        for document in documents:
            assert document.metadata.get("url") == "https://philippeoger.com/pages/ai-scene-in-europe-last-week/"
9 changes: 9 additions & 0 deletions tests/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
import os.path as op
import shutil


def build_fixture_db():
    """Copy the bundled fixture.db next to this file as contentmap.db.

    :return: the path of the freshly created contentmap.db copy.
    """
    tests_dir = op.dirname(__file__)
    source = op.join(tests_dir, 'fixture.db')
    destination = op.join(tests_dir, 'contentmap.db')
    shutil.copy(source, destination)
    return destination
Loading