Commit 66de4ab

Merge pull request #33 from iryna-kondr/feature_memory
Dynamic few shot classifier
2 parents 26dd797 + 7bae44e commit 66de4ab

File tree

10 files changed, +420 -29 lines changed

.pre-commit-config.yaml

Lines changed: 0 additions & 1 deletion

@@ -13,7 +13,6 @@ repos:
       - id: check-executables-have-shebangs
       - id: check-case-conflict
       - id: check-added-large-files
-      - id: detect-aws-credentials
       - id: detect-private-key
   # Formatter for Json and Yaml files
   - repo: https://github.com/pre-commit/mirrors-prettier

README.md

Lines changed: 23 additions & 1 deletion

@@ -62,7 +62,8 @@ ZeroShotGPTClassifier(openai_model="gpt4all::ggml-gpt4all-j-v1.3-groovy")
 
 When running for the first time, the model file will be downloaded automatially.
 
-At the moment only the following estimators support gpt4all as a backend:
+At the moment only the following estimators support gpt4all as a backend:
+
 - `ZeroShotGPTClassifier`
 - `MultiLabelZeroShotGPTClassifier`
 - `FewShotGPTClassifier`
@@ -179,6 +180,27 @@ While the api remains the same as for the zero shot classifier, there are a few
 
 Note: as the model is not being re-trained, but uses the training data during inference, one could say that this is still a (different) zero-shot approach.
 
+### Dynamic Few-Shot Text Classification
+
+`DynamicFewShotGPTClassifier` dynamically selects N samples per class to include in the prompt. This allows the few-shot classifier to scale to datasets that are too large for the standard context window of LLMs.
+
+*How does it work?*
+
+During fitting, the whole dataset is partitioned by class, vectorized, and stored.
+
+During inference, the [annoy](https://github.com/spotify/annoy) library is used for fast neighbor lookup, which allows including only the most similar examples in the prompt.
+
+```python
+from skllm import DynamicFewShotGPTClassifier
+from skllm.datasets import get_classification_dataset
+
+X, y = get_classification_dataset()
+
+clf = DynamicFewShotGPTClassifier(n_examples=3)
+clf.fit(X, y)
+labels = clf.predict(X)
+```
+
 ### Text Vectorization
 
 As an alternative to using GPT as a classifier, it can be used solely for data preprocessing. `GPTVectorizer` allows to embed a chunk of text of arbitrary length to a fixed-dimensional vector, that can be used with virtually any classification or regression model.
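The retrieval step described in the README section above boils down to an approximate nearest-neighbour query against annoy. The sketch below is illustrative only and is not part of this commit; the dimensionality, data, and tree count are invented:

```python
# Illustrative sketch of the annoy lookup behind dynamic example selection.
# Not part of this commit; dimensions, data, and n_trees are arbitrary.
import numpy as np
from annoy import AnnoyIndex

dim = 4
embeddings = np.random.rand(100, dim)  # stand-in for the embeddings of one class partition

index = AnnoyIndex(dim, "euclidean")
for i, vector in enumerate(embeddings):
    index.add_item(i, vector)
index.build(10)  # 10 trees; more trees improve recall at the cost of a larger index

query = np.random.rand(dim)  # stand-in for the embedded test sample
nearest_ids = index.get_nns_by_vector(query, 3)  # indices of the 3 most similar examples
print(nearest_ids)
```

In the files added below, this lookup is wrapped by `AnnoyMemoryIndex` (`skllm/memory/_annoy.py`) and performed once per class, so that every class contributes its most similar training samples to the prompt.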

pyproject.toml

Lines changed: 4 additions & 2 deletions

@@ -8,9 +8,10 @@ dependencies = [
     "pandas>=1.5.0",
     "openai>=0.27.0",
     "tqdm>=4.60.0",
+    "annoy>=1.17.2",
 ]
 name = "scikit-llm"
-version = "0.1.1"
+version = "0.2.0"
 authors = [
     { name="Oleg Kostromin", email="[email protected]" },
     { name="Iryna Kondrashchenko", email="[email protected]" },
@@ -79,12 +80,13 @@ target-version = ['py38', 'py39', 'py310', 'py311']
 profile = "black"
 filter_files = true
 known_first_party = ["skllm", "skllm.*"]
+skip = ["__init__.py"]
 
 [tool.docformatter]
 close-quotes-on-newline = true # D209
 
 [tool.interrogate]
-fail-under = 80
+fail-under = 65
 ignore-module = true
 ignore-nested-functions = true
 ignore-private = true

skllm/__init__.py

Lines changed: 3 additions & 1 deletion

@@ -1,5 +1,7 @@
-from skllm.models.gpt_few_shot_clf import FewShotGPTClassifier
+# ordering is important here to prevent circular imports
 from skllm.models.gpt_zero_shot_clf import (
     MultiLabelZeroShotGPTClassifier,
     ZeroShotGPTClassifier,
 )
+from skllm.models.gpt_few_shot_clf import FewShotGPTClassifier
+from skllm.models.gpt_dyn_few_shot_clf import DynamicFewShotGPTClassifier

skllm/memory/__init__.py

Lines changed: 1 addition & 0 deletions

@@ -0,0 +1 @@
+from skllm.memory._annoy import AnnoyMemoryIndex

skllm/memory/_annoy.py

Lines changed: 119 additions & 0 deletions

@@ -0,0 +1,119 @@
+import os
+import tempfile
+from typing import Any, List
+
+from annoy import AnnoyIndex
+from numpy import ndarray
+
+from skllm.memory.base import _BaseMemoryIndex
+
+
+class AnnoyMemoryIndex(_BaseMemoryIndex):
+    """Memory index using Annoy.
+
+    Parameters
+    ----------
+    dim : int
+        dimensionality of the vectors
+    metric : str, optional
+        metric to use, by default "euclidean"
+    """
+
+    def __init__(self, dim: int, metric: str = "euclidean", **kwargs: Any) -> None:
+        self._index = AnnoyIndex(dim, metric)
+        self.metric = metric
+        self.dim = dim
+        self.built = False
+
+    def add(self, id: int, vector: ndarray) -> None:
+        """Adds a vector to the index.
+
+        Parameters
+        ----------
+        id : Any
+            identifier for the vector
+        vector : ndarray
+            vector to add to the index
+        """
+        if self.built:
+            raise RuntimeError("Cannot add vectors after index is built.")
+        self._index.add_item(id, vector)
+
+    def build(self) -> None:
+        """Builds the index.
+
+        No new vectors can be added after building.
+        """
+        self._index.build(-1)
+        self.built = True
+
+    def retrieve(self, vectors: ndarray, k: int) -> List[List[int]]:
+        """Retrieves the k nearest neighbors for each vector.
+
+        Parameters
+        ----------
+        vectors : ndarray
+            vectors to retrieve nearest neighbors for
+        k : int
+            number of nearest neighbors to retrieve
+
+        Returns
+        -------
+        List
+            ids of retrieved nearest neighbors
+        """
+        if not self.built:
+            raise RuntimeError("Cannot retrieve vectors before the index is built.")
+        return [
+            self._index.get_nns_by_vector(v, k, search_k=-1, include_distances=False)
+            for v in vectors
+        ]
+
+    def __getstate__(self) -> dict:
+        """Returns the state of the object. To store the actual annoy index, it
+        has to be written to a temporary file.
+
+        Returns
+        -------
+        dict
+            state of the object
+        """
+        state = self.__dict__.copy()
+
+        # save index to temporary file
+        with tempfile.NamedTemporaryFile(delete=False) as tmp:
+            temp_filename = tmp.name
+            self._index.save(temp_filename)
+
+        # read bytes from the file
+        with open(temp_filename, "rb") as tmp:
+            index_bytes = tmp.read()
+
+        # store bytes representation in state
+        state["_index"] = index_bytes
+
+        # remove temporary file
+        os.remove(temp_filename)
+
+        return state
+
+    def __setstate__(self, state: dict) -> None:
+        """Sets the state of the object. It restores the annoy index from the
+        bytes representation.
+
+        Parameters
+        ----------
+        state : dict
+            state of the object
+        """
+        self.__dict__.update(state)
+        # restore index from bytes
+        with tempfile.NamedTemporaryFile(delete=False) as tmp:
+            temp_filename = tmp.name
+            tmp.write(self._index)
+
+        self._index = AnnoyIndex(self.dim, self.metric)
+        self._index.load(temp_filename)
+
+        # remove temporary file
+        os.remove(temp_filename)
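A minimal usage sketch of this class follows; it is not part of the commit, and the vectors and dimensionality are invented. It relies only on the methods defined above, including the pickling support provided by `__getstate__`/`__setstate__`:

```python
# Sketch only (not in this commit): exercising AnnoyMemoryIndex with random vectors.
import pickle

import numpy as np

from skllm.memory import AnnoyMemoryIndex

rng = np.random.default_rng(0)
vectors = rng.normal(size=(100, 8))  # 100 vectors of dimension 8 (made up)

index = AnnoyMemoryIndex(dim=8)
for i, v in enumerate(vectors):
    index.add(i, v)
index.build()  # after this point, add() raises RuntimeError

# ids of the 3 nearest stored vectors for each of the two query vectors
neighbors = index.retrieve(vectors[:2], k=3)

# pickling works because __getstate__ saves the annoy index to a temporary
# file and __setstate__ reloads it from the stored bytes
restored = pickle.loads(pickle.dumps(index))
assert restored.retrieve(vectors[:2], k=3) == neighbors
```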

skllm/memory/base.py

Lines changed: 45 additions & 0 deletions

@@ -0,0 +1,45 @@
+from abc import ABC, abstractmethod
+from typing import Any, List
+
+from numpy import ndarray
+
+
+class _BaseMemoryIndex(ABC):
+    @abstractmethod
+    def add(self, id: Any, vector: ndarray):
+        """Adds a vector to the index.
+
+        Parameters
+        ----------
+        id : Any
+            identifier for the vector
+        vector : ndarray
+            vector to add to the index
+        """
+        pass
+
+    @abstractmethod
+    def retrieve(self, vectors: ndarray, k: int) -> List:
+        """Retrieves the k nearest neighbors for each vector.
+
+        Parameters
+        ----------
+        vectors : ndarray
+            vectors to retrieve nearest neighbors for
+        k : int
+            number of nearest neighbors to retrieve
+
+        Returns
+        -------
+        List
+            ids of retrieved nearest neighbors
+        """
+        pass
+
+    @abstractmethod
+    def build(self) -> None:
+        """Builds the index.
+
+        All build parameters should be passed to the constructor.
+        """
+        pass
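`_BaseMemoryIndex` only fixes the `add`/`build`/`retrieve` contract; the backend is interchangeable. As a purely hypothetical illustration (not part of this commit), a brute-force implementation of the same interface could look like the following; the class name `ExactMemoryIndex` and the exact-search strategy are invented for the example:

```python
# Hypothetical example (not in this commit): exact nearest-neighbour search
# implementing the _BaseMemoryIndex contract, e.g. for tests or tiny datasets.
from typing import Any, List

import numpy as np
from numpy import ndarray

from skllm.memory.base import _BaseMemoryIndex


class ExactMemoryIndex(_BaseMemoryIndex):
    def __init__(self) -> None:
        self._ids: List[Any] = []
        self._vectors: List[ndarray] = []
        self.built = False

    def add(self, id: Any, vector: ndarray) -> None:
        # store raw vectors; nothing is precomputed at add time
        self._ids.append(id)
        self._vectors.append(np.asarray(vector, dtype=float))

    def build(self) -> None:
        # stack once so retrieval can use vectorized distance computations
        self._matrix = np.vstack(self._vectors)
        self.built = True

    def retrieve(self, vectors: ndarray, k: int) -> List[List[Any]]:
        # exact euclidean search: O(n) per query, versus annoy's approximate lookup
        results = []
        for v in np.atleast_2d(vectors):
            distances = np.linalg.norm(self._matrix - v, axis=1)
            nearest = np.argsort(distances)[:k]
            results.append([self._ids[i] for i in nearest])
        return results
```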

skllm/models/gpt_dyn_few_shot_clf.py

Lines changed: 115 additions & 0 deletions

@@ -0,0 +1,115 @@
+from __future__ import annotations
+
+import numpy as np
+import pandas as pd
+
+from skllm import FewShotGPTClassifier
+from skllm.memory import AnnoyMemoryIndex
+from skllm.models.gpt_few_shot_clf import _TRAINING_SAMPLE_PROMPT_TEMPLATE
+from skllm.preprocessing import GPTVectorizer
+from skllm.prompts.builders import build_few_shot_prompt_slc
+from skllm.utils import to_numpy
+
+
+class DynamicFewShotGPTClassifier(FewShotGPTClassifier):
+    """Dynamic few-shot single-label classifier.
+
+    Parameters
+    ----------
+    n_examples : int, optional
+        number of examples per class, by default 3
+    openai_key : Optional[str] , default : None
+        Your OpenAI API key. If None, the key will be read from the SKLLM_CONFIG_OPENAI_KEY environment variable.
+    openai_org : Optional[str] , default : None
+        Your OpenAI organization. If None, the organization will be read from the SKLLM_CONFIG_OPENAI_ORG
+        environment variable.
+    openai_model : str , default : "gpt-3.5-turbo"
+        The OpenAI model to use. See https://beta.openai.com/docs/api-reference/available-models for a list of
+        available models.
+    default_label : Optional[Union[List[str], str]] , default : 'Random'
+        The default label to use if the LLM could not generate a response for a sample. If set to 'Random' a random
+        label will be chosen based on probabilities from the training set.
+    """
+
+    def __init__(
+        self,
+        n_examples: int = 3,
+        openai_key: str | None = None,
+        openai_org: str | None = None,
+        openai_model: str = "gpt-3.5-turbo",
+        default_label: str | None = "Random",
+    ):
+        super().__init__(openai_key, openai_org, openai_model, default_label)
+        self.n_examples = n_examples
+
+    def fit(
+        self,
+        X: np.ndarray | pd.Series | list[str],
+        y: np.ndarray | pd.Series | list[str],
+    ) -> DynamicFewShotGPTClassifier:
+        """Fits the model to the given data.
+
+        Parameters
+        ----------
+        X : Union[np.ndarray, pd.Series, List[str]]
+            training data
+        y : Union[np.ndarray, pd.Series, List[str]]
+            training labels
+
+        Returns
+        -------
+        DynamicFewShotGPTClassifier
+            self
+        """
+        X = to_numpy(X)
+        y = to_numpy(y)
+        self.embedding_model_ = GPTVectorizer().fit(X)
+        self.classes_, self.probabilities_ = self._get_unique_targets(y)
+
+        self.data_ = {}
+        for cls in self.classes_:
+            print(f"Building index for class `{cls}` ...")
+            self.data_[cls] = {}
+            partition = X[y == cls]
+            self.data_[cls]["partition"] = partition
+            embeddings = self.embedding_model_.transform(partition)
+            index = AnnoyMemoryIndex(embeddings.shape[1])
+            for i, embedding in enumerate(embeddings):
+                index.add(i, embedding)
+            index.build()
+            self.data_[cls]["index"] = index
+
+        return self
+
+    def _get_prompt(self, x: str) -> str:
+        """Generates the prompt for the given input.
+
+        Parameters
+        ----------
+        x : str
+            sample to classify
+
+        Returns
+        -------
+        str
+            final prompt
+        """
+        embedding = self.embedding_model_.transform([x])
+        training_data = []
+        for cls in self.classes_:
+            index = self.data_[cls]["index"]
+            partition = self.data_[cls]["partition"]
+            neighbors = index.retrieve(embedding, min(self.n_examples, len(partition)))
+            neighbors = [partition[i] for i in neighbors[0]]
+            training_data.extend(
+                [
+                    _TRAINING_SAMPLE_PROMPT_TEMPLATE.format(x=neighbor, label=cls)
+                    for neighbor in neighbors
+                ]
+            )
+
+        training_data_str = "\n".join(training_data)
+
+        return build_few_shot_prompt_slc(
+            x=x, training_data=training_data_str, labels=repr(self.classes_)
+        )
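To see the per-class retrieval in action, one can inspect the prompt built for a single sample. This is a sketch, not part of the commit; it assumes a configured OpenAI key (fitting embeds the training data with `GPTVectorizer`) and uses the toy dataset already referenced in the README:

```python
# Sketch (not part of this commit): inspecting the dynamically assembled prompt.
from skllm import DynamicFewShotGPTClassifier
from skllm.datasets import get_classification_dataset

X, y = get_classification_dataset()

clf = DynamicFewShotGPTClassifier(n_examples=2)
clf.fit(X, y)  # partitions X by class and builds one AnnoyMemoryIndex per class

# _get_prompt retrieves the 2 most similar training samples from every class
# and formats them with _TRAINING_SAMPLE_PROMPT_TEMPLATE ahead of the query.
print(clf._get_prompt(X[0]))
```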
