Skip to content

Commit 811df14

Browse files
committed
Add doc strings, bump version
1 parent 2e3bea6 commit 811df14

File tree

4 files changed

+81
-11
lines changed

4 files changed

+81
-11
lines changed

pyproject.toml

+7-8
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,18 @@
11
[tool.poetry]
22
name = "topically"
3-
version = "0.0.1"
3+
version = "0.0.4"
44
description = ""
55
authors = ["Jay Alammar <[email protected]>"]
66
license = "MIT"
77

88
[tool.poetry.dependencies]
9-
python = "^3.9"
9+
python = "^3.7"
1010
cohere = "^2.1"
11-
umap-learn = "^0.5"
12-
streamlit = "^1.12"
13-
altair = "^4.2"
14-
pandas = "^1.4"
15-
matplotlib = "^3.5"
16-
bertopic = {version = "*", extras = ["bertopic"]}
11+
pandas = "^1.2"
12+
bertopic = {version = "*", optional = true}
13+
14+
[tool.poetry.extras]
15+
bertopic = ["bertopic"]
1716

1817
[tool.poetry.dev-dependencies]
1918
yapf = "^0.32"

topically/__init__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -8,4 +8,4 @@
88

99
from .app import Topically
1010

11-
__version__ = "0.0.2"
11+
__version__ = "0.0.4"

topically/app.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ def name_clusters(self, X, prompt: str = '', num_generations=1, num_sample_texts
7272
def name_cluster(cluster_number):
7373
# Get the texts in this cluster, sample from them
7474
cluster_texts = texts[cluster_assignments == cluster_number]
75-
# sample_texts_from_cluster = cluster_texts.sample(num_sample_texts)
75+
7676
if len(cluster_texts) > num_sample_texts:
7777
sample_texts_from_cluster = np.random.choice(cluster_texts, num_sample_texts, replace=False)
7878
else:

topically/cluster_namers.py

+72-1
Original file line numberDiff line numberDiff line change
@@ -12,21 +12,64 @@
1212

1313

1414
class ClusterNamer(BaseEstimator):
15-
15+
""" Assign names to clusters of text based on their content using managed Language Models from Cohere."""
1616
def __init__(self, co, prompt: str = '', num_generations: int = 1, temperature=0.6):
17+
"""
18+
Name a cluster using the default prompt. Calls the Cohere generate end-point to assign a name to the cluster.
19+
20+
Parameters
21+
----------
22+
co: Python cohere SDK object
23+
prompt: str
24+
The text prompt the generative model uses to name a cluster
25+
num_generations: int
26+
The number of candidate generations to get for each cluster. Multiple generations can enhance the quality of cluster names.
27+
temperature: float
28+
Typically between 0-1, the temperature value used to control the randomness of the generation model. Lower values lead to more predictable, less "creative" names.
29+
30+
"""
1731
self.co = co
1832
self.prompt = prompt
1933
self.num_generations = num_generations
2034
self.temperature = temperature
2135

2236
def make_prompt(self, cluster_example_texts):
37+
"""
38+
Prepare the naming prompt by adding examples from a single cluster to the prompt.
39+
40+
Parameters
41+
----------
42+
cluster_example_texts: array-like of strings
43+
A collection of texts belonging to a single cluster/topic. They are added to the naming prompt
44+
45+
Returns
46+
-------
47+
prompt: str
48+
The naming prompt including the examples from this cluster
49+
50+
"""
2351
# Add the data of the current cluster we want to label
2452
return self.prompt + construct_example_for_prompt(cluster_example_texts)
2553

2654
def generate(self, cluster_example_texts):
55+
"""
56+
Generate suggested topic name(s)
57+
58+
Parameters
59+
----------
60+
cluster_example_texts: array-like of strings
61+
A collection of texts belonging to a single cluster/topic. They are added to the naming prompt
62+
63+
Returns
64+
-------
65+
generations: list of Cohere Generation objects.
66+
The cluster names suggested by the generative model
67+
68+
"""
2769
# Add the data of the current cluster we want to label
2870
prompt = self.make_prompt(cluster_example_texts)
2971

72+
# Generate using the language model
3073
request = self.co.generate(model='xlarge',
3174
prompt=prompt,
3275
max_tokens=50,
@@ -37,6 +80,20 @@ def generate(self, cluster_example_texts):
3780
return request.generations
3881

3982
def predict(self, texts):
83+
"""
84+
Generate a name for a single topic
85+
86+
Parameters
87+
----------
88+
texts: array-like of strings
89+
A collection of texts belonging to a single cluster/topic. They are used to help the model suggest a name for the cluster.
90+
91+
Returns
92+
-------
93+
topic_name: str
94+
The suggested name for the topic/cluster.
95+
96+
"""
4097
gens = self.generate(texts)
4198

4299
if self.num_generations > 1:
@@ -74,6 +131,20 @@ def rerank_by_likelihood(generations: cohere.generation.Generations):
74131

75132

76133
def construct_example_for_prompt(cluster_example_texts):
134+
"""
135+
Prepare a single portion of the prompt by stitching the texts as an example
136+
137+
Parameters
138+
----------
139+
cluster_example_texts: array-like of strings
140+
A collection of texts belonging to a single cluster/topic. They are used to help the model suggest a name for the cluster.
141+
142+
Returns
143+
-------
144+
example_prompt_text: str
145+
A portion of a naming prompt
146+
147+
"""
77148
example_prompt_text = f'\nCluster:\nSample texts from this cluster:\n'
78149
for text in cluster_example_texts:
79150
example_prompt_text += f'- {text}\n'

0 commit comments

Comments
 (0)