-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdictionary.py
116 lines (87 loc) · 3.37 KB
/
dictionary.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
from constants import *
from preprocess import Preprocess
import os
import re
import csv
import pickle
from collections import defaultdict
import numpy as np
class Dictionary:
    """Word and answer-label lookup tables built from a preprocessed CSV file.

    Instance attributes:
        word2idx:   dict mapping a lowercased word to its integer index.
        idx2word:   list whose position ``i`` holds the word with index ``i``;
                    index 0 is ``pad`` and index 1 is ``oov``.
        labels2idx: dict mapping a lowercased answer (or ``oov``) to its index.
        idx2labels: list of answers ordered by descending frequency, with
                    ``oov`` appended as the last entry.
        pp_data:    the object passed to the constructor; only its
                    ``csv_delimiter`` attribute is read here.
        max_labels: total number of labels to keep (``oov`` included), or the
                    string ``'all'`` to keep every distinct answer.

    The tables live on the instance, not on the class, so two ``Dictionary``
    objects never share (and silently corrupt) each other's vocabulary.
    """

    oov = '<UNK>'
    pad = '<PAD>'

    # The cache check used to be short-circuited by an unconditional
    # ``return False``, which left the real check unreachable.  The default
    # of True keeps that always-rebuild behaviour; set it to False to reuse
    # previously pickled dictionaries when all four files exist.
    force_rebuild = True

    def __init__(self, preprocessed_data, max_labels=1000):
        """Load the dictionaries from disk if available, otherwise build them.

        Args:
            preprocessed_data: object exposing ``csv_delimiter``.
            max_labels: number of labels to keep including ``oov``, or 'all'.
        """
        self.pp_data = preprocessed_data  # type: Preprocess
        self.max_labels = max_labels
        self._resetTables()

        if self.dictionariesAreBuilt():
            self.loadDictionaries()
        else:
            self.generateDictionaries(data_folder + train_data_write_file, True)

    def _resetTables(self):
        """Give this instance fresh, empty lookup tables."""
        self.word2idx = {}
        self.idx2word = []
        self.labels2idx = {}
        self.idx2labels = []

    def _addWord(self, word):
        """Register ``word`` under the next free index unless already known."""
        if word not in self.word2idx:
            self.word2idx[word] = len(self.idx2word)
            self.idx2word.append(word)

    def dictionariesAreBuilt(self):
        """Return True when cached dictionaries should be loaded from disk.

        Always False while ``force_rebuild`` is set; otherwise True only if
        all four pickle files are present in ``data_folder``.
        """
        if self.force_rebuild:
            return False
        cache_files = (idx2word_file, word2idx_file,
                       labels2idx_file, idx2labels_file)
        return all(os.path.isfile(data_folder + name) for name in cache_files)

    def generateDictionaries(self, processed_csv, save):
        """Build the word and label tables from ``processed_csv``.

        Each non-empty row must have exactly four columns; the third is the
        question and the fourth is the answer.  Any previously held tables
        are discarded first, so calling this twice yields the same result.

        Args:
            processed_csv: path of the CSV file to read.
            save: when truthy, pickle the tables via ``saveDictionaries``.

        Raises:
            ValueError: if a non-empty row does not have four columns.
        """
        self._resetTables()
        self._addWord(self.pad)  # index 0
        self._addWord(self.oov)  # index 1

        answers = []
        with open(processed_csv, 'r') as csv_data:
            rows = csv.reader(csv_data, delimiter=self.pp_data.csv_delimiter)
            for row in rows:
                if not row:
                    # csv.reader yields [] for blank lines (e.g. a trailing
                    # newline); skip them instead of failing the unpack.
                    continue
                _, _, question, answer = row
                answers.append(answer)
                # NOTE(review): re.split can yield '' when the text starts or
                # ends with a non-word character, so '' may enter the
                # vocabulary.  Kept as-is because other code may tokenise the
                # same way and look the result up here -- confirm before
                # changing.
                for word in re.split(r'[^\w]+', question + ' ' + answer):
                    self._addWord(word.lower())

        counts = defaultdict(int)
        for answer in answers:
            counts[answer.lower()] += 1

        # Most frequent answers first; sorted() is stable, so ties keep
        # first-seen order.
        ranked = sorted(counts, key=counts.get, reverse=True)
        if str(self.max_labels) != 'all':
            # Reserve one slot for oov.  Clamp at 0 so max_labels <= 0 does
            # not turn into a negative slice bound that silently keeps
            # almost everything.
            ranked = ranked[:max(self.max_labels - 1, 0)]

        self.idx2labels = ranked + [self.oov]  # out-of-vocabulary label last
        self.labels2idx = {
            label: index for index, label in enumerate(self.idx2labels)
        }

        if save:
            self.saveDictionaries()

    def saveDictionaries(self):
        """Pickle the four lookup tables into ``data_folder``."""
        targets = (
            (idx2word_file, self.idx2word),
            (word2idx_file, self.word2idx),
            (labels2idx_file, self.labels2idx),
            (idx2labels_file, self.idx2labels),
        )
        for file_name, table in targets:
            # ``with`` closes the handle even if pickling fails.
            with open(data_folder + file_name, 'wb') as fd:
                pickle.dump(table, fd)

    def loadDictionaries(self):
        """Restore the four lookup tables from the pickles in ``data_folder``.

        SECURITY: ``pickle.load`` can execute arbitrary code; only point
        ``data_folder`` at files written by ``saveDictionaries``.
        """
        with open(data_folder + word2idx_file, 'rb') as fd:
            self.word2idx = pickle.load(fd)
        with open(data_folder + idx2word_file, 'rb') as fd:
            self.idx2word = pickle.load(fd)
        with open(data_folder + idx2labels_file, 'rb') as fd:
            self.idx2labels = pickle.load(fd)
        with open(data_folder + labels2idx_file, 'rb') as fd:
            self.labels2idx = pickle.load(fd)

    def getVocabSize(self):
        """Return the number of entries in the word vocabulary."""
        return len(self.idx2word)