Merged

Changes from all commits (16 commits)
afc3fb4 Refactor README and Vicinity class to support any serializable item type (davidberenstein1957, Jan 20, 2025)
9ffb491 Update README.md to include examples for saving/loading vector stores… (davidberenstein1957, Jan 20, 2025)
7b2bb53 Refactor Vicinity class to streamline token handling (davidberenstein1957, Jan 20, 2025)
a5ce987 Refactor item handling in tests and Vicinity class (davidberenstein1957, Jan 20, 2025)
022c7b1 Apply suggestions from code review (davidberenstein1957, Jan 20, 2025)
eaabbfa Refactor token insertion in Vicinity class to simplify duplicate hand… (davidberenstein1957, Jan 20, 2025)
031c136 Refactor token deletion logic in Vicinity class to improve error hand… (davidberenstein1957, Jan 20, 2025)
26e7ed6 Enhance error handling in Vicinity class for JSON serialization (davidberenstein1957, Jan 20, 2025)
6fb6305 Add non-serializable items fixture and test for Vicinity class (davidberenstein1957, Jan 20, 2025)
c86f7e5 Add Hugging Face integration for Vicinity class (davidberenstein1957, Jan 28, 2025)
a410686 Merge branch 'MinishLab:main' into add-hub-integration (davidberenstein1957, Jan 28, 2025)
4f30d45 Enhance Hugging Face integration with improved error handling and dat… (davidberenstein1957, Feb 25, 2025)
cab15e5 Update pyproject.toml and README.md for improved package installation… (davidberenstein1957, Feb 25, 2025)
65465f3 Add test for Vicinity.load_from_hub method (davidberenstein1957, Feb 25, 2025)
06545dd Remove test files for utils and vicinity modules (davidberenstein1957, Feb 25, 2025)
cc3fbf4 Add comprehensive test suites for Vicinity and utility functions (davidberenstein1957, Feb 25, 2025)
README.md (18 additions, 3 deletions)
@@ -30,7 +30,6 @@

</div>


Vicinity is a light-weight, low-dependency vector store. It provides a simple and intuitive interface for nearest neighbor search, with support for different backends and evaluation.

There are many nearest neighbors packages and methods out there. However, we found it difficult to compare them. Every package has its own interface, quirks, and limitations, and learning a new package can be time-consuming. In addition to that, how do you effectively evaluate different packages? How do you know which one is the best for your use case?
@@ -43,7 +42,7 @@ Install the package with:
```bash
pip install vicinity
```
-Optionally, [install any of the supported backends](#installation), or simply install all of them with:
+Optionally, [install specific backends and integrations](#installation), or simply install all of them with:
```bash
pip install vicinity[all]
```
@@ -87,6 +86,13 @@ vicinity.save('my_vector_store')
vicinity = Vicinity.load('my_vector_store')
```

Pushing and loading a vector store from the Hugging Face Hub:

```python
vicinity.push_to_hub(model_name_or_path='name-of-embedding-model', repo_id='my_vector_store')
vicinity = Vicinity.load_from_hub(repo_id='my_vector_store')
```
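
Both methods also accept authentication options for private repositories. A minimal sketch using the `token` and `private` parameters from the `push_to_hub`/`load_from_hub` signatures in this PR (the token value is a placeholder):

```python
# Push to a private repo and load it back; "hf_..." stands in for a real
# Hugging Face access token (placeholder for illustration).
vicinity.push_to_hub(
    model_name_or_path='name-of-embedding-model',
    repo_id='my_vector_store',
    private=True,
    token='hf_...',
)
vicinity = Vicinity.load_from_hub(repo_id='my_vector_store', token='hf_...')
```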

Evaluating a backend:

```python
@@ -167,9 +173,18 @@ The following installation options are available:
# Install the base package
pip install vicinity

-# Install all backends
+# Install all integrations and backends
pip install vicinity[all]

# Install all integrations
pip install vicinity[integrations]

# Install specific integrations
pip install vicinity[huggingface]

# Install all backends
pip install vicinity[backends]

# Install specific backends
pip install vicinity[annoy]
pip install vicinity[faiss]
pyproject.toml (21 additions, 0 deletions)
@@ -42,6 +42,14 @@ dev = [
"ruff",
"setuptools"
]

# Integrations
huggingface = ["datasets"]
integrations = [
"datasets"
]

# Backends
hnsw = ["hnswlib"]
pynndescent = [
"pynndescent>=0.5.10",
@@ -53,7 +61,20 @@ annoy = ["annoy"]
faiss = ["faiss-cpu"]
usearch = ["usearch"]
voyager = ["voyager"]
backends = [
"hnswlib",
"pynndescent>=0.5.10",
"numba>=0.59.0",
"llvmlite>=0.42.0",
"numpy>=1.24.0",
"annoy",
"faiss-cpu",
"usearch",
"voyager"
]

all = [
"datasets",
"hnswlib",
"pynndescent>=0.5.10",
"numba>=0.59.0",
tests/integrations/test_huggingface.py (33 additions, new file)
@@ -0,0 +1,33 @@
from __future__ import annotations

import io
import sys

from vicinity import Vicinity
from vicinity.datatypes import Backend
from vicinity.integrations.huggingface import _MODEL_NAME_OR_PATH_PRINT_STATEMENT

BackendType = tuple[Backend, str]


def test_load_from_hub(vicinity_instance: Vicinity) -> None:
"""
Test Vicinity.load_from_hub.

:param vicinity_instance: A Vicinity instance.
"""
repo_id = "davidberenstein1957/my-vicinity-repo"
# get the first part of the print statement to test if model name or path is printed
expected_print_statement = _MODEL_NAME_OR_PATH_PRINT_STATEMENT.split(":")[0]

# Capture the output
captured_output = io.StringIO()
sys.stdout = captured_output

Vicinity.load_from_hub(repo_id=repo_id)

# Reset redirect.
sys.stdout = sys.__stdout__

# Check if the expected message is in the output
assert expected_print_statement in captured_output.getvalue()
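
For reference, the same stdout check could be written with pytest's built-in `capsys` fixture instead of redirecting `sys.stdout` by hand; a minimal sketch under the same module imports (not part of this PR):

```python
def test_load_from_hub_capsys(capsys) -> None:
    """Sketch of the same assertion using pytest's capsys fixture."""
    repo_id = "davidberenstein1957/my-vicinity-repo"
    expected = _MODEL_NAME_OR_PATH_PRINT_STATEMENT.split(":")[0]

    Vicinity.load_from_hub(repo_id=repo_id)

    # capsys captures everything written to stdout during the test
    assert expected in capsys.readouterr().out
```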
vicinity/integrations/dataset_card_template.md (30 additions, new file)
@@ -0,0 +1,30 @@
---
tags:
- vicinity
- vector-store
---

# Dataset Card for {repo_id}

This dataset was created using the [vicinity](https://github.com/MinishLab/vicinity) library, a lightweight nearest neighbors library with flexible backends.

It contains a vector space with {num_items} items.

## Usage

You can load this dataset using the following code:

```python
from vicinity import Vicinity
vicinity = Vicinity.load_from_hub("{repo_id}")
```

After loading the dataset, you can use the `vicinity.query` method to find the nearest neighbors to a vector.
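
A minimal sketch of such a query (the exact `query` signature and the vector dimensionality are assumptions for illustration):

```python
import numpy as np

# The query vector must match the dimensionality of the stored vectors;
# 768 here is a placeholder.
query_vector = np.random.rand(768)
results = vicinity.query(query_vector, k=10)
```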

## Configuration

The configuration of the dataset is stored in the `config.json` file. The vector backend is stored in the `backend` folder.

```json
{config}
```
vicinity/integrations/huggingface.py (138 additions, new file)
@@ -0,0 +1,138 @@
from __future__ import annotations

import json
import logging
import tempfile
from pathlib import Path
from typing import TYPE_CHECKING, Any

from vicinity.backends import BasicVectorStore, get_backend_class
from vicinity.datatypes import Backend

if TYPE_CHECKING:
from huggingface_hub import CommitInfo

from vicinity.vicinity import Vicinity

_HUB_IMPORT_ERROR = ImportError(
"`datasets` and `huggingface_hub` are required to push to the Hugging Face Hub. Please install them with `pip install 'vicinity[huggingface]'`"
)
_MODEL_NAME_OR_PATH_PRINT_STATEMENT = (
"Embeddings in Vicinity instance were created from model name or path: {model_name_or_path}"
)

logger = logging.getLogger(__name__)


class HuggingFaceMixin:
def push_to_hub(
self,
model_name_or_path: str,
repo_id: str,
token: str | None = None,
private: bool = False,
**kwargs: Any,
) -> "CommitInfo":
"""
Push the Vicinity instance to the Hugging Face Hub.

:param model_name_or_path: The name of the model or the path to the local directory
that was used to create the embeddings in the Vicinity instance.
:param repo_id: The repository ID on the Hugging Face Hub
:param token: Optional authentication token for private repositories
:param private: Whether to create a private repository
:param **kwargs: Additional arguments passed to Dataset.push_to_hub()
:return: The commit info
"""
try:
from datasets import Dataset
from huggingface_hub import DatasetCard, upload_file, upload_folder
except ImportError:
raise _HUB_IMPORT_ERROR

# Create and push dataset with items and vectors
if isinstance(self.items[0], dict):
dataset_dict = {k: [item[k] for item in self.items] for k in self.items[0].keys()}
else:
dataset_dict = {"items": self.items}
if self.vector_store is not None:
dataset_dict["vectors"] = self.vector_store.vectors
dataset = Dataset.from_dict(dataset_dict)
dataset.push_to_hub(repo_id, token=token, private=private, **kwargs)

# Save backend and config files to temp directory and upload
with tempfile.TemporaryDirectory() as temp_dir:
temp_path = Path(temp_dir)

# Save and upload backend
self.backend.save(temp_path)
upload_folder(
repo_id=repo_id,
folder_path=temp_path,
token=token,
repo_type="dataset",
path_in_repo="backend",
)

# Save and upload config
config = {
"metadata": self.metadata,
"backend_type": self.backend.backend_type.value,
"model_name_or_path": model_name_or_path,
}
config_path = temp_path / "config.json"
config_path.write_text(json.dumps(config))
upload_file(
repo_id=repo_id,
path_or_fileobj=config_path,
token=token,
repo_type="dataset",
path_in_repo="config.json",
)

# Load the dataset card template from the related path
template_path = Path(__file__).parent / "dataset_card_template.md"
template = template_path.read_text()
content = template.format(repo_id=repo_id, num_items=len(self.items), config=json.dumps(config, indent=4))
return DatasetCard(content=content).push_to_hub(repo_id=repo_id, token=token, repo_type="dataset")

@classmethod
def load_from_hub(cls, repo_id: str, token: str | None = None, **kwargs: Any) -> "Vicinity":
"""
Load a Vicinity instance from the Hugging Face Hub.

:param repo_id: The repository ID on the Hugging Face Hub.
:param token: Optional authentication token for private repositories.
:param **kwargs: Additional arguments passed to load_dataset.
:return: A Vicinity instance loaded from the Hub.
"""
try:
from datasets import load_dataset
from huggingface_hub import snapshot_download
except ImportError:
raise _HUB_IMPORT_ERROR

# Load dataset and extract items and vectors
dataset = load_dataset(repo_id, token=token, split="train", **kwargs)
if "items" in dataset.column_names:
items = dataset["items"]
else:
# Create items from all columns except 'vectors'
items = []
columns = [col for col in dataset.column_names if col != "vectors"]
for i in range(len(dataset)):
items.append({col: dataset[col][i] for col in columns})
has_vectors = "vectors" in dataset.column_names
vector_store = BasicVectorStore(vectors=dataset["vectors"]) if has_vectors else None

# Download and load config and backend
repo_path = Path(snapshot_download(repo_id=repo_id, token=token, repo_type="dataset"))
with open(repo_path / "config.json") as f:
config = json.load(f)
model_name_or_path = config.pop("model_name_or_path")

print(_MODEL_NAME_OR_PATH_PRINT_STATEMENT.format(model_name_or_path=model_name_or_path))
backend_type = Backend(config["backend_type"])
backend = get_backend_class(backend_type).load(repo_path / "backend")

return cls(items=items, backend=backend, metadata=config["metadata"], vector_store=vector_store)
vicinity/vicinity.py (3 additions, 1 deletion)
@@ -19,8 +19,10 @@

logger = logging.getLogger(__name__)

+from vicinity.integrations.huggingface import HuggingFaceMixin
+
-class Vicinity:
+class Vicinity(HuggingFaceMixin):
"""
Work with vector representations of items.
