-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconstruct_index.py
127 lines (90 loc) · 3.79 KB
/
construct_index.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
""" Construct Index Class """
# ---------------------------------------- IMPORT HERE ----------------------------------------
import bisect, nltk, os, pygtrie, string
import multiprocessing
import pandas as pd
from bidict import bidict
from nltk.tokenize import word_tokenize
from word_processor import Word_processor
import math
# ---------------------------------------- Construct_index ----------------------------------------
class Construct_index:
    """Build one positional index (plus a reversed-term index) per file in a folder.

    Each file is read as CSV, its text column is preprocessed, and every
    token is inserted into a character trie.  The value stored under a term
    is a posting dictionary of the form::

        {doc_id: [set_of_positions, tfidf_score], ...}

    where ``doc_id`` is the row number in the CSV and positions are token
    offsets within that row.
    """

    def __init__(self, folder_path):
        """Record the folder to index and number the entries found in it.

        Args:
            folder_path: Directory whose entries are indexed, one index
                per entry.
        """
        self.indexes = list()  # One (index_trie, rev_trie) pair per file
        self.folder_path = folder_path
        # List the directory exactly once: the mapping then comes from a
        # single consistent snapshot instead of one listing per entry.
        file_names = os.listdir(folder_path)
        # Bidirectional mapping between index number and file name for
        # fast two-way lookup.
        self.index_mapping = bidict(
            {number: name for number, name in enumerate(file_names)}
        )
        self.word_processor = Word_processor()
        self.idf_list = list()  # One {term: idf} dict per file

    # ---------------------------------------- PREPROCESS ----------------------------------------
    def pre_process(self, file_path, col="Snippet"):
        """Read a CSV file and add a preprocessed 'Text' column.

        Args:
            file_path: Path of the CSV file to load.
            col: Name of the column holding the raw text.

        Returns:
            The loaded dataframe with an extra 'Text' column containing
            ``self.word_processor.process(value)`` for every value of ``col``.
        """
        df = pd.read_csv(file_path)
        df["Text"] = [self.word_processor.process(row) for row in df[col]]
        return df

    # ---------------------------------------- MISC ----------------------------------------
    def add_tfidf(self, index_trie, rev_trie, corpus_len):
        """Fill in the tf-idf score of every (term, document) posting.

        For a term occurring in ``df`` documents:
            idf   = log10(corpus_len / (1 + df)) + 1
            tfidf = log10(1 + occurrences_in_document) * idf

        Args:
            index_trie: Mapping of term -> {doc_id: [positions, score]};
                the score slot of every posting is overwritten in place.
            rev_trie: Reversed-term index.  Not touched here: the entries
                built by ``construct_index_helper`` reference the very same
                posting dictionaries as ``index_trie``, so they see the
                updated scores automatically.
            corpus_len: Number of documents in the corpus.

        Returns:
            dict mapping each term to its idf value.
        """
        idf_dict = dict()
        for term, postings in index_trie.items():
            idf = math.log(corpus_len / (1 + len(postings)), 10) + 1
            idf_dict[term] = idf
            for posting in postings.values():
                # posting[0] is the set of positions, posting[1] the score.
                posting[1] = math.log(1 + len(posting[0]), 10) * idf
        return idf_dict

    def update_trie(self, term, docid, pos, trie):
        """Record that ``term`` occurs at position ``pos`` of document ``docid``.

        Args:
            term: Token to index.
            docid: Identifier of the document containing the token.
            pos: Token offset inside the document.
            trie: Mapping of term -> {doc_id: [positions, score]}, updated
                in place.  New postings start with a placeholder score of 1.
        """
        if term not in trie:
            trie[term] = {docid: [{pos}, 1]}
            return
        postings = trie[term]
        if docid in postings:
            postings[docid][0].add(pos)
        else:
            postings[docid] = [{pos}, 1]

    # ---------------------------------------- INDEX CONSTRUCTION ----------------------------------------
    def construct_index_helper(self, file_path):
        """Build the forward and reversed positional index of one file.

        Args:
            file_path: Name of the file, relative to ``self.folder_path``.

        Returns:
            ``((index_trie, rev_trie), idf_dict)`` where ``index_trie`` maps
            term -> {doc_id: [set_of_positions, tfidf]}, ``rev_trie`` maps
            each reversed term to the same posting dictionary, and
            ``idf_dict`` maps term -> idf.
        """
        file_path = os.path.join(self.folder_path, file_path)
        df = self.pre_process(file_path)
        corpus = df["Text"]

        index_trie = pygtrie.CharTrie()
        rev_trie = pygtrie.CharTrie()
        for doc_id, text in enumerate(corpus):
            for position, token in enumerate(word_tokenize(text)):
                self.update_trie(token, doc_id, position, index_trie)
                # Alias (not copy) the posting dict under the reversed term
                # so both tries always stay in sync.
                rev_trie[token[::-1]] = index_trie[token]

        idf_dict = self.add_tfidf(index_trie, rev_trie, len(corpus))
        return ((index_trie, rev_trie), idf_dict)

    def construct_index(self):
        """Build the indexes of all files in parallel.

        Client-facing entry point.  Results are stored in ``self.indexes``
        and ``self.idf_list`` in the same order as the file names yielded
        by ``self.index_mapping.inverse``.
        """
        pool = multiprocessing.Pool(multiprocessing.cpu_count())
        try:
            index_info = pool.map(
                self.construct_index_helper, self.index_mapping.inverse
            )
        finally:
            # Always release the worker processes, even when a worker raised.
            pool.close()
            pool.join()

        # Rebuild rather than append, so a repeated call cannot duplicate
        # entries and break the alignment with index_mapping.
        self.indexes = [tries for tries, _ in index_info]
        self.idf_list = [idf_dict for _, idf_dict in index_info]

    # ---------------------------------------- INDEX STORE ----------------------------------------
    def collect_index(self):
        """Return the built indexes, the number/name mapping and the idf tables.

        Client-facing accessor.

        Returns:
            Tuple ``(indexes, index_mapping, idf_list)``.
        """
        return self.indexes, self.index_mapping, self.idf_list