Skip to content

Commit 4524227

Browse files
Paula Navarrete DíazPaula Navarrete Díaz
Paula Navarrete Díaz
authored and
Paula Navarrete Díaz
committed
token probability feature
1 parent 22fc390 commit 4524227

File tree

3 files changed

+106
-59
lines changed

3 files changed

+106
-59
lines changed

notebooks/word_count_prototype.ipynb

+87-59
Large diffs are not rendered by default.

src/cdstemplate/word_count.py

+13
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,19 @@ def get_token_counts_as_dataframe(self):
101101
)
102102
dataframe = dataframe.sort_values("token")
103103
return dataframe
104+
105+
def get_token_probabilities_as_dataframe(self):
    """Returns the token probabilities of the corpus as a Pandas DataFrame.

    The probability of a token is its count in ``self.token_counter``
    divided by the sum of all counts in ``self.token_counter``.

    Returns:
        pandas.DataFrame: columns 'token' and 'probability', sorted by
        'token'. The frame is empty (with both columns present) when no
        tokens have been counted.
    """
    # Sum straight from the counter: there is no need to build and sort a
    # second DataFrame just to obtain the total.
    total_tokens = sum(self.token_counter.values())

    counts = pd.DataFrame.from_records(
        list(self.token_counter.items()), columns=["token", "count"]
    )
    counts = counts.sort_values("token")

    # Cast to float before dividing so the 'probability' column has a
    # numeric dtype even when the corpus is empty.
    probabilities = counts["count"].astype(float).div(total_tokens)

    # Both Series share the sorted index, so rows stay aligned.
    return pd.DataFrame({"token": counts["token"], "probability": probabilities})
104117

105118
def save_token_counts(self, csv_file):
106119
"""Saves the counts of tokens the corpus to a specified

tests/test_word_count.py

+6
Original file line numberDiff line numberDiff line change
@@ -103,3 +103,9 @@ def test_corpus_counter_save_csv(tmp_path):
103103
assert my_csv.is_file()
104104
expected_csv = "token,count\na,2\nb,1\nc,1\nx,1\ny,1\nz,1\n"
105105
assert my_csv.read_text() == expected_csv
106+
107+
def test_token_probabilities_to_dataframe():
    """Token probabilities form a distribution: the column sums to one."""
    import math

    cc = word_count.CorpusCounter()
    cc.add_doc("A a B b b b b b")
    dataframe = cc.get_token_probabilities_as_dataframe()

    assert list(dataframe.columns) == ["token", "probability"]
    # Probabilities are floats; compare with a tolerance rather than exact
    # equality so the test does not depend on rounding of the individual
    # quotients.
    assert math.isclose(dataframe["probability"].sum(), 1.0)

0 commit comments

Comments
 (0)