Add doc strings, bump version

jalammar · jalammar · commit 811df14f54d9 · 2022-11-03T11:26:48.000Z
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,19 +1,18 @@
 [tool.poetry]
 name = "topically"
-version = "0.0.1"
+version = "0.0.4"
 description = ""
 authors = ["Jay Alammar <jay@cohere.ai>"]
 license = "MIT"
 
 [tool.poetry.dependencies]
-python = "^3.9"
+python = "^3.7"
 cohere = "^2.1"
-umap-learn = "^0.5"
-streamlit = "^1.12"
-altair = "^4.2"
-pandas = "^1.4"
-matplotlib = "^3.5"
-bertopic = {version = "*", extras = ["bertopic"]}
+pandas = "^1.2"
+bertopic = {version = "*", optional = true}
+
+[tool.poetry.extras]
+bertopic = ["bertopic"]
 
 [tool.poetry.dev-dependencies]
 yapf = "^0.32"
diff --git a/topically/__init__.py b/topically/__init__.py
@@ -8,4 +8,4 @@
 
 from .app import Topically
 
-__version__ = "0.0.2"
+__version__ = "0.0.4"
diff --git a/topically/app.py b/topically/app.py
@@ -72,7 +72,7 @@ def name_clusters(self, X, prompt: str = '', num_generations=1, num_sample_texts
         def name_cluster(cluster_number):
             # Get the texts in this cluster, sample from them
             cluster_texts = texts[cluster_assignments == cluster_number]
-            # sample_texts_from_cluster = cluster_texts.sample(num_sample_texts)
+
             if len(cluster_texts) > num_sample_texts:
                 sample_texts_from_cluster = np.random.choice(cluster_texts, num_sample_texts, replace=False)
             else:
diff --git a/topically/cluster_namers.py b/topically/cluster_namers.py
@@ -12,21 +12,64 @@
 
 
 class ClusterNamer(BaseEstimator):
-
+    """ Assign names to clusters of text based on their content using managed Language Models from Cohere."""
     def __init__(self, co, prompt: str = '', num_generations: int = 1, temperature=0.6):
+        """
+        Name a cluster using the default prompt. Calls the Cohere generate end-point to assign a name to the cluster.
+
+        Parameters
+        ----------
+            co: Python cohere SDK object
+            prompt: str
+                The text prompt the generative model uses to name a cluster
+            num_generations: int
+                The number of candidate generations to get for each cluster. Multiple generations can enhance the quality of cluster names.
+            temperature: float
+                Typically between 0-1, the temperature value used to control the randomness of the generation model. Lower values lead to more predictable, less "creative" names.
+
+        """
         self.co = co
         self.prompt = prompt
         self.num_generations = num_generations
         self.temperature = temperature
 
     def make_prompt(self, cluster_example_texts):
+        """
+        Prepare the naming prompt by adding examples from a single cluster to the prompt.
+
+        Parameters
+        ----------
+            cluster_example_texts: array-like of strings
+                A collection of texts belonging to a single cluster/topic. They are added to the naming prompt
+
+        Returns
+        -------
+            prompt: str
+               The naming prompt including the examples from this cluster
+
+        """
         # Add the data of the current cluster we want to label
         return self.prompt + construct_example_for_prompt(cluster_example_texts)
 
     def generate(self, cluster_example_texts):
+        """
+        Generate suggested topic name(s)
+
+        Parameters
+        ----------
+            cluster_example_texts: array-like of strings
+                A collection of texts belonging to a single cluster/topic. They are added to the naming prompt
+
+        Returns
+        -------
+            generations: list of Cohere Generation objects.
+               The cluster names suggested by the generative model
+
+        """
         # Add the data of the current cluster we want to label
         prompt = self.make_prompt(cluster_example_texts)
 
+        # Generate using the language model
         request = self.co.generate(model='xlarge',
                                    prompt=prompt,
                                    max_tokens=50,
@@ -37,6 +80,20 @@ def generate(self, cluster_example_texts):
         return request.generations
 
     def predict(self, texts):
+        """
+        Generate a name for a single topic
+
+        Parameters
+        ----------
+            texts: array-like of strings
+                A collection of texts belonging to a single cluster/topic. They are used to help the model suggest a name for the cluster.
+
+        Returns
+        -------
+            topic_name: str
+               The suggested name for the topic/cluster.
+
+        """
         gens = self.generate(texts)
 
         if self.num_generations > 1:
@@ -74,6 +131,20 @@ def rerank_by_likelihood(generations: cohere.generation.Generations):
 
 
 def construct_example_for_prompt(cluster_example_texts):
+    """
+    Prepare a single portion of the prompt by stitching the texts as an example
+
+    Parameters
+    ----------
+        cluster_example_texts: array-like of strings
+            A collection of texts belonging to a single cluster/topic. They are used to help the model suggest a name for the cluster.
+
+    Returns
+    -------
+        example_prompt_text: str
+           A portion of a naming prompt
+
+    """
     example_prompt_text = f'\nCluster:\nSample texts from this cluster:\n'
     for text in cluster_example_texts:
         example_prompt_text += f'- {text}\n'

Original file line number	Diff line number	Diff line change
`@@ -8,4 +8,4 @@`
`8`	`8`
`9`	`9`	`from .app import Topically`
`10`	`10`
`11`		`-__version__ = "0.0.2"`
	`11`	`+__version__ = "0.0.4"`