@@ -12,9 +12,9 @@ class TokenStatistics():
12
12
''' Helper class that stores the tf, idf and number of documents
13
13
for a token. '''
14
14
def __init__ (self ):
15
- self .tf_dict = {} # Key, value = category, tf
16
- self .num_docs_with_token = 0 # Number of documents containing token
17
- self .idf = 0 # idf
15
+ self .tf_dict = defaultdict ( lambda : 0 ) # Key, value = category, tf
16
+ self .num_docs_with_token = 0 # Number of documents with token
17
+ self .idf = 0 # idf
18
18
19
19
20
20
class InvertedIndex ():
@@ -44,7 +44,7 @@ def compute_tfidfs(self, train_labels_filename):
44
44
self .inverted_index [token ].tf_dict [category ] += 1
45
45
46
46
for token in set (token_list ):
47
- self .inverted_index [token ].doc_count += 1
47
+ self .inverted_index [token ].num_docs_with_token += 1
48
48
49
49
self .num_documents += 1
50
50
@@ -83,12 +83,18 @@ def save(self, filename):
83
83
84
84
def tokenize (file_path ):
85
85
''' Takes article path, and returns list of tokens. '''
86
- return ['foobar' ]
86
+ tokens = []
87
+
88
+ with open (file_path , 'r' ) as f :
89
+ for line in f :
90
+ tokens += line .split ()
91
+
92
+ return tokens
87
93
88
94
89
95
if __name__ == '__main__' :
90
- train_labels_filename = input ('Train labels file:' )
91
- model_filename = input ('Train labels file:' )
96
+ train_labels_filename = input ('Train labels file:\t ' )
97
+ model_filename = input ('Model checkpoint file:\t ' )
92
98
93
99
print ('Training text categorizer...' )
94
100
0 commit comments