Skip to content

Commit 3a7cbf9

Browse files
committed
tf-idf model added to find the most significant words in each book; pandas dataframe added to show similarities of books; updated the file run.py
1 parent 966d9fe commit 3a7cbf9

File tree

2 files changed

+15
-4
lines changed

2 files changed

+15
-4
lines changed

BooksSimilarity/run.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from prepare_texts import get_texts, read, preprocess, stemming
2-
from words_occurrences import get_dict, bows, most_common
2+
from words_occurrences import get_dict, bows, most_common, tf_idf
33

44

55
def discover_similar_books():
@@ -22,8 +22,11 @@ def discover_similar_books():
2222
bags_of_words = bows(stems_dict, stemmed)
2323
# print(bags_of_words[0])
2424

25-
df = most_common(0, bags_of_words, stemmed)
26-
print(df)
25+
common_words_0 = most_common(0, bags_of_words, stemmed)
26+
print(common_words_0)
27+
28+
books_similarities = tf_idf(bags_of_words, titles)
29+
print(books_similarities)
2730

2831

2932
if __name__ == '__main__':

BooksSimilarity/words_occurrences.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
from gensim import corpora
1+
from gensim import corpora, similarities
2+
from gensim.models import TfidfModel
23
import pandas as pd
34

45

@@ -19,3 +20,10 @@ def most_common(book_ind, bag_of_words, stems):
1920
occurences.insert(2, 'tokens', tokens)
2021
occ_sorted = occurences.sort_values(by=['occurences'], ascending=False, inplace=False)
2122
return occ_sorted
23+
24+
25+
def tf_idf(bag_of_words, titles):
    """Build a table of pairwise book similarities from tf-idf weights.

    A gensim ``TfidfModel`` is fitted on ``bag_of_words``, the same corpus
    is re-weighted with it, and a ``MatrixSimilarity`` index is built over
    the re-weighted corpus. Every indexed document is then compared with
    every other one and the scores are laid out in a square DataFrame.

    Args:
        bag_of_words: the corpus handed to ``TfidfModel`` -- presumably one
            gensim bag-of-words vector per book, as produced by ``bows``
            (TODO confirm against the caller).
        titles: labels used for both the rows and the columns of the
            result; must contain one entry per document in
            ``bag_of_words``, in the same order.

    Returns:
        pandas.DataFrame whose cell ``[a, b]`` is the similarity score
        between the books labelled ``a`` and ``b``.
    """
    model = TfidfModel(bag_of_words)
    index = similarities.MatrixSimilarity(model[bag_of_words])
    # Iterating the index yields one row of similarities per indexed
    # document. Materialise the rows explicitly rather than relying on
    # pandas to coerce the index object, which not every pandas release
    # accepts.
    similarity_rows = list(index)
    return pd.DataFrame(similarity_rows, index=titles, columns=titles)

0 commit comments

Comments
 (0)