Filimoa · miku · Apr 12, 2024
diff --git a/src/openparse/processing/__init__.py b/src/openparse/processing/__init__.py
@@ -2,6 +2,7 @@
     IngestionPipeline,
     BasicIngestionPipeline,
     SemanticIngestionPipeline,
+    LocalSemanticIngestionPipeline,
     NoOpIngestionPipeline,
 )
 from .basic_transforms import (
@@ -15,7 +16,7 @@
     CombineNodesSpatially,
     RemoveNodesBelowNTokens,
 )
-from .semantic_transforms import CombineNodesSemantically, OpenAIEmbeddings
+from .semantic_transforms import CombineNodesSemantically, OpenAIEmbeddings, OllamaEmbeddings
 
 __all__ = [
     "ProcessingStep",
@@ -29,8 +30,10 @@
     "BasicIngestionPipeline",
     "IngestionPipeline",
     "SemanticIngestionPipeline",
+    "LocalSemanticIngestionPipeline",
     "NoOpIngestionPipeline",
     "RemoveNodesBelowNTokens",
     "CombineNodesSemantically",
     "OpenAIEmbeddings",
+    "OllamaEmbeddings",
 ]
diff --git a/src/openparse/processing/ingest.py b/src/openparse/processing/ingest.py
@@ -17,6 +17,7 @@
 from openparse.processing.semantic_transforms import (
     CombineNodesSemantically,
     OpenAIEmbeddings,
+    OllamaEmbeddings,
     EmbeddingModel,
 )
 
@@ -131,3 +132,45 @@ def __init__(
             ),
             RemoveNodesBelowNTokens(min_tokens=min_tokens),
         ]
+
+class LocalSemanticIngestionPipeline(IngestionPipeline):
+    """Semantic ingestion pipeline that gets its embeddings from an ollama server.
+
+    Configures cleanup steps, two semantic merge passes and a minimum-token filter.
+    """
+
+    def __init__(
+        self,
+        url: str = "http://localhost:11434",
+        model: str = "mxbai-embed-large",
+        min_tokens: int = consts.TOKENIZATION_LOWER_LIMIT,
+        max_tokens: int = consts.TOKENIZATION_UPPER_LIMIT,
+    ) -> None:
+        embedder = OllamaEmbeddings(url=url, model=model)
+        # Bullets and oddly formatted fragments are the main target of this step.
+        merge_small_neighbours = CombineNodesSpatially(
+            x_error_margin=10, y_error_margin=2, criteria="both_small"
+        )
+        # First pass: stricter similarity, half the token budget.
+        tight_merge = CombineNodesSemantically(
+            embedding_client=embedder, min_similarity=0.6, max_tokens=max_tokens // 2
+        )
+        # Second pass: looser similarity, full token budget.
+        loose_merge = CombineNodesSemantically(
+            embedding_client=embedder, min_similarity=0.55, max_tokens=max_tokens
+        )
+
+        self.transformations = [
+            RemoveTextInsideTables(),
+            RemoveFullPageStubs(max_area_pct=0.35),
+            merge_small_neighbours,
+            CombineHeadingsWithClosestText(),
+            CombineBullets(),
+            RemoveMetadataElements(),
+            RemoveRepeatedElements(threshold=2),
+            RemoveNodesBelowNTokens(min_tokens=10),
+            CombineBullets(),
+            tight_merge,
+            loose_merge,
+            RemoveNodesBelowNTokens(min_tokens=min_tokens),
+        ]
diff --git a/src/openparse/processing/semantic_transforms.py b/src/openparse/processing/semantic_transforms.py
@@ -1,4 +1,8 @@
+import json
+
 from typing import List, Literal, Dict, Union
+from urllib.parse import urlparse
+from http.client import HTTPConnection, HTTPSConnection
 
 import numpy as np
 
@@ -68,6 +72,68 @@ def _create_client(self):
         return OpenAI(api_key=self.api_key)
 
 
+class OllamaEmbeddings:
+    """
+    Use local models via ollama for calculating embeddings. Uses the REST API
+    https://github.com/ollama/ollama/blob/main/docs/api.md.
+
+    * nomic-embed-text
+    * mxbai-embed-large
+    """
+
+    def __init__(
+        self,
+        url: str = "http://localhost:11434/",
+        model: str = "mxbai-embed-large",
+        batch_size: int = 256,
+    ):
+        """
+        Used to generate embeddings for Nodes.
+
+        Args:
+            url (str): Server address; only scheme, host and port are used.
+            model (str): Model name sent with every embeddings request.
+            batch_size (int): Stored only; every text is sent in its own request.
+        """
+        self.url = url
+        self.model = model
+        self.batch_size = batch_size
+
+    def embed_many(self, texts: List[str]) -> List[List[float]]:
+        """
+        Generate one embedding per text, in input order, one request per text.
+
+        Args:
+            texts (list[str]): The list of texts to embed.
+        Returns:
+            List[List[float]]: A list of embeddings, one vector per text.
+        Raises:
+            RuntimeError: If the server answers with a status other than 200.
+        """
+        headers = {"Content-Type": "application/json", "Accept": "application/json"}
+        res: List[List[float]] = []
+        conn = self._create_conn()
+        try:
+            for text in texts:
+                params = json.dumps({"model": self.model, "prompt": text})
+                conn.request("POST", "/api/embeddings", params, headers)
+                response = conn.getresponse()
+                if response.status != 200:
+                    status = f"{response.status} {response.reason}"
+                    raise RuntimeError(f"embeddings request failed: {status}")
+                # append (not extend) so each text keeps its own vector
+                res.append(json.loads(response.read())["embedding"])
+        finally:
+            conn.close()
+        return res
+
+    def _create_conn(self):
+        """Return an HTTP(S) connection to the host and port given in self.url."""
+        parsed = urlparse(self.url)
+        conn_cls = HTTPSConnection if parsed.scheme == "https" else HTTPConnection
+        return conn_cls(parsed.hostname, parsed.port)
+
+
 class CombineNodesSemantically(ProcessingStep):
     """
     Combines nodes that are semantically related.