Skip to content

Commit

Permalink
feat: Add sqlite-vss to add similarity search to sqlite
Browse files Browse the repository at this point in the history
  • Loading branch information
philippe2803 committed Feb 21, 2024
1 parent 60b7198 commit 90bd0a1
Show file tree
Hide file tree
Showing 12 changed files with 1,869 additions and 4 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ jobs:
strategy:
fail-fast: true
matrix:
python-version: ["3.8", "3.9", "3.10"]
python-version: ["3.8", "3.9", "3.10", "3.11"]

steps:
- uses: actions/checkout@v3
Expand Down
18 changes: 18 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Image used to run the test suite with the native libraries installed below.
FROM ubuntu:22.04

# Refresh the package index and install in the SAME layer: with a separate
# "update" layer, Docker may reuse a cached, stale index for a later install.
# The index files are removed afterwards because they are not needed at runtime.
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y \
        python3-pip libgomp1 libatlas-base-dev liblapack-dev libsqlite3-dev \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Copy only the dependency manifests first so the dependency layers below stay
# cached until one of these two files changes.
COPY poetry.lock /app/poetry.lock
COPY pyproject.toml /app/pyproject.toml

RUN pip install poetry
# Install into the system interpreter instead of a Poetry-managed virtualenv.
RUN poetry config virtualenvs.create false
RUN poetry install

# Project sources are copied last: editing code does not invalidate the
# dependency layers above.
COPY . /app

CMD ["pytest", "./tests"]
28 changes: 27 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,29 @@
# Content map

A way to share content from a specific domain. Similar to sitemaps but for content.
A way to share content from a specific domain using SQLite as an alternative to
RSS feeds. The purpose of this library is to simply create a dataset for all the
content on your website, using the XML sitemap as a starting point.


## Installation

```bash

pip install contentmap

```

## Quickstart

To build your contentmap.db that will contain all your content using your XML
sitemap as a starting point, you only need to write the following:

```python
from contentmap.sitemap import SitemapToContentDatabase

database = SitemapToContentDatabase("https://yourblog.com/sitemap.xml")
database.load()

```

You can control how many URLs are crawled concurrently and also set a timeout.
Empty file added contentmap/ftse.py
Empty file.
52 changes: 52 additions & 0 deletions contentmap/vss.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
"""
Class ContentMapVSS to create vector search dataset from a contentmap
dataset already created.
"""
import sqlite3
from typing import Optional

import sqlite_vss
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings.sentence_transformer import (
SentenceTransformerEmbeddings,
)
from langchain_community.vectorstores import SQLiteVSS


class ContentMapVSS:
    """Create a vector-search store on top of an existing contentmap database.

    The database must already contain a ``content`` table; ``load`` refuses
    to run otherwise.
    """

    def __init__(self,
                 connection: Optional[sqlite3.Connection] = None,
                 db_file: str = "contentmap.db"
                 ):
        """Store the connection to use.

        Args:
            connection: An already-open SQLite connection. When omitted, one
                is opened with ``SQLiteVSS.create_connection(db_file)``.
            db_file: Path of the database file; only used when no
                ``connection`` is supplied.
        """
        # Compare against None explicitly: only a missing connection should
        # trigger opening a new one.
        if connection is None:
            connection = SQLiteVSS.create_connection(db_file)
        self.connection = connection

    def load(self):
        """Build and return the ``SQLiteVSS`` store bound to this connection.

        The store is configured with the ``content_chunks`` table and the
        ``all-MiniLM-L6-v2`` sentence-transformer embedding model.

        Raises:
            AssertionError: If the ``content`` table is missing.
        """
        # Raised explicitly (rather than via ``assert``) so the check is not
        # stripped when Python runs with -O; the exception type is unchanged.
        if not self.table_exists(table_name="content"):
            raise AssertionError(
                'table "content" must exist before building the vector store'
            )

        embedding_function = SentenceTransformerEmbeddings(
            model_name="all-MiniLM-L6-v2"
        )
        return SQLiteVSS(
            table="content_chunks",
            embedding=embedding_function,
            connection=self.connection,
        )

    def table_exists(self, table_name: str) -> bool:
        """Return True when a table named ``table_name`` exists.

        The name is passed as a bound parameter, so quotes or SQL fragments
        in ``table_name`` are matched literally and cannot alter the query.
        """
        res = self.connection.execute(
            """
            SELECT name
            FROM sqlite_master
            WHERE type='table' AND name=?;
            """,
            (table_name,),
        )
        return res.fetchone() is not None
1,721 changes: 1,720 additions & 1 deletion poetry.lock

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@ tqdm = "^4.66.1"
lxml = "4.9.4"
trafilatura = "^1.6.4"
aiohttp = "^3.9.1"
sqlite-vss = "^0.1.2"
langchain = "^0.1.8"
sentence-transformers = "^2.3.1"


[tool.poetry.group.test.dependencies]
Expand Down
3 changes: 2 additions & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,6 @@ def remove_created_database_after_test():
# Setup logic
yield # this is where the testing happens
# Teardown logic
os.remove("contentmap.db")
if os.path.exists("contentmap.db"):
os.remove("contentmap.db")

Binary file added tests/fixture.db
Binary file not shown.
12 changes: 12 additions & 0 deletions tests/test_fts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import sqlite3


def test_fts_extension_enabled():
    """Check that the SQLite library in use reports the ENABLE_FTS5 option."""
    con = sqlite3.connect(':memory:')
    try:
        # Each row of ``pragma compile_options`` is a 1-tuple holding one
        # compile-time option name.
        compile_options = con.execute('pragma compile_options;').fetchall()
    finally:
        # Close the connection even if the pragma query raises.
        con.close()

    assert ('ENABLE_FTS5',) in compile_options
25 changes: 25 additions & 0 deletions tests/test_vss.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
from contentmap.vss import ContentMapVSS
import os.path as op
from tests.utils import build_fixture_db


class TestContentMapVSS:
    """Checks for ``ContentMapVSS.table_exists`` with and without a content table."""

    def test_assertion_content_exists(self):
        # fixture.db lives next to this test module and is expected to
        # contain a "content" table.
        here = op.dirname(__file__)
        store = ContentMapVSS(db_file=op.join(here, "fixture.db"))
        found = store.table_exists(table_name="content")
        assert found is True

    def test_assertion_content_not_exists(self):
        # A fresh in-memory database has no tables at all.
        store = ContentMapVSS(db_file=":memory:")
        found = store.table_exists(table_name="content")
        assert found is False


class TestVssTablesCreation:
    """Checks that loading the vector store creates its backing table."""

    def test_vss_instance(self):
        # Build the working copy inside the test, not in the class body:
        # class-body code runs at import/collection time, so the copy would
        # be created even when this test is deselected and could be removed
        # by another test's teardown before this test runs.
        db = build_fixture_db()
        cm_vss = ContentMapVSS(db_file=db)
        cm_vss.load()
        assert cm_vss.table_exists("content_chunks")
assert cm_vss.table_exists("content_chunks")
9 changes: 9 additions & 0 deletions tests/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
import os.path as op
import shutil


def build_fixture_db():
    """Copy fixture.db to contentmap.db beside this module; return the new path."""
    tests_dir = op.dirname(__file__)
    target = op.join(tests_dir, 'contentmap.db')
    shutil.copy(op.join(tests_dir, 'fixture.db'), target)
    return target

0 comments on commit 90bd0a1

Please sign in to comment.