torch_preprocess_word.py

# -*- coding: utf-8 -*-
import os
import sys
import numpy as np
from nltk.tokenize import word_tokenize
import random


def load_data(training_file, validation_file, testing_file):
    """Load the three splits. Each line is expected to hold a label, a label
    description, and a sentence, separated by tabs."""
    texts = []
    labels = []
    label_texts = []
    for file_n in [training_file, validation_file, testing_file]:
        txts = []
        lbs = []
        lb_txts = []
        with open(file_n, 'r', encoding='utf-8', errors='ignore') as f:
            for line in f:
                line = line.strip()
                label, label_text, text = line.split("\t")
                txts.append(text)
                lbs.append(label)
                lb_txts.append(label_text)
        texts.append(txts)
        labels.append(lbs)
        label_texts.append(lb_txts)
    return texts, label_texts, labels
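
# Usage sketch (the file names are placeholders, not paths from this repository);
# each split comes back as parallel lists of sentences, label descriptions and labels:
#   texts, label_texts, labels = load_data("train.tsv", "valid.tsv", "test.tsv")
#   train_texts, valid_texts, test_texts = texts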


def get_vocab(texts, label_texts, labels):
    """Build the token vocabulary and the label-to-index mapping from all splits."""
    x = texts[0] + texts[1] + texts[2]
    label_texts = label_texts[0] + label_texts[1] + label_texts[2]
    y = labels[0] + labels[1] + labels[2]
    label_y_dict = {}
    text_x_dict = {}
    label_text_dict = {}
    # Count token frequencies over the sentences and the label descriptions.
    for index, text_sent in enumerate(x + label_texts):
        for text_item in word_tokenize(text_sent):
            text_item = text_item.lower()
            if text_item not in text_x_dict:
                text_x_dict[text_item] = 1
            else:
                text_x_dict[text_item] += 1
    # Token indices start at 1 so that 0 can be used as the padding index.
    vocab_dict = {token_stem.lower(): index for index, token_stem in enumerate(list(text_x_dict.keys()), start=1)}
    # Reserve an index for out-of-vocabulary tokens so the "UNK" fallback in
    # get_idx_from_sent() cannot raise a KeyError.
    vocab_dict["UNK"] = len(vocab_dict) + 1
    for index, label_elem in enumerate(y):
        if label_elem not in label_text_dict:
            label_text_dict[label_elem] = label_texts[index]
        if label_elem not in label_y_dict:
            label_y_dict[label_elem] = 1
        else:
            label_y_dict[label_elem] += 1
    # Make label descriptions unique: if two labels share the same description,
    # append a random digit to the later one until the name is unused.
    label_text_key_text = {}
    label_text_dict_dupl = {}
    for conc, text in label_text_dict.items():
        if text in label_text_key_text:
            text_new = text + str(random.randint(1, 10))
            while text_new in label_text_key_text:
                text_new = text + str(random.randint(1, 10))
            label_text_dict_dupl[conc] = text_new
            # Remember the generated name so a later duplicate cannot reuse it.
            label_text_key_text[text_new] = conc
        else:
            label_text_key_text[text] = conc
    for conc, text in label_text_dict_dupl.items():
        label_text_dict[conc] = text
    # Index the label descriptions from longest to shortest, then map each
    # label id to the index of its description.
    label_text_list = list(label_text_dict.values())
    label_text_list.sort(key=len, reverse=True)
    label_text2int = {label: index for index, label in enumerate(label_text_list, start=0)}
    label_text_key_text = {text: conc for conc, text in label_text_dict.items()}
    label_dict = {label_text_key_text[label]: index for label, index in label_text2int.items()}
    return vocab_dict, label_dict, label_text_dict
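
# Example (names follow load_data above): with the three splits,
#   vocab_dict, label_dict, label_text_dict = get_vocab(texts, label_texts, labels)
# vocab_dict maps lower-cased tokens to indices starting at 1 (0 stays free for
# padding), label_dict maps each label id to an integer class index, and
# label_text_dict maps each label id to its (de-duplicated) description.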


def get_idx_from_sent(sent, word_idx_map, max_l):
    """
    Transforms a sentence (list of tokens) into a list of indices.
    Post-pads with zeroes and truncates to max_l.
    """
    x = []
    for word in sent:
        word = word.lower()
        if word in word_idx_map:
            x.append(word_idx_map[word])
        else:
            x.append(word_idx_map["UNK"])
    while len(x) < max_l:
        x.append(0)
    return x[:max_l]
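
# Illustration (hypothetical vocabulary): with
# word_idx_map = {"UNK": 1, "good": 2, "movie": 3},
# get_idx_from_sent(["good", "film"], word_idx_map, 4) returns [2, 1, 0, 0]:
# "good" maps to its index, "film" falls back to "UNK", and the result is
# post-padded with zeroes up to max_l.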


def get_idx_from_sent_character(sent, word_idx_map):
    """
    Transforms a sentence (sequence of characters) into a list of indices.
    Unlike get_idx_from_sent, no padding or truncation is applied.
    """
    x = []
    for word in sent:
        word = word.lower()
        if word in word_idx_map:
            x.append(word_idx_map[word])
        else:
            x.append(word_idx_map["UNK"])
    return x
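
# Illustration (hypothetical character vocabulary): with
# word_idx_map = {"UNK": 1, "a": 2, "b": 3},
# get_idx_from_sent_character("ab?", word_idx_map) returns [2, 3, 1]:
# one index per character, unknown characters fall back to "UNK".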


def dataset_preprocess(texts, labels, vocab_dict, label_dict, max_length):
    """Word-level preprocessing of one split: index, pad and collect labels."""
    text_x = []
    label_y = []
    sent_length = []
    for index, sent in enumerate(texts):
        word_list = word_tokenize(sent)
        sent_length.append(len(word_list))
        text_x.append(get_idx_from_sent(word_list, vocab_dict, max_length))
        label_y.append(label_dict[labels[index]])
    return np.asanyarray(text_x), np.asanyarray(label_y), np.asarray(sent_length)
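
# Usage sketch (max_length=50 is an assumed cut-off, not a value from this script):
#   train_x, train_y, train_len = dataset_preprocess(texts[0], labels[0],
#                                                     vocab_dict, label_dict, 50)
# train_x has shape (num_sentences, 50), train_y holds class indices, and
# train_len the untruncated token counts.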


def dataset_preprocess_character(texts, labels, vocab_dict, label_dict):
    """Character-level preprocessing of one split; sequences keep their own lengths."""
    text_x = []
    label_y = []
    sent_length = []
    for index, sent in enumerate(texts):
        sent_length.append(len(sent))
        text_x.append(get_idx_from_sent_character(sent, vocab_dict))
        label_y.append(label_dict[labels[index]])
    return text_x, np.asanyarray(label_y), np.asarray(sent_length)


def label_preprocess_character(label_dict, label_texts_dict, vocab_dict):
    """Encode each label's text description at character level, in class-index order."""
    label_dict_new = {index: label for label, index in label_dict.items()}
    labels_sorted = [label_texts_dict[label_dict_new[i]] for i in range(len(label_dict))]
    # Length of each label description (one entry per class, in class-index order).
    len_max = [len(labels_sorted_single) for labels_sorted_single in labels_sorted]
    label_x = [get_idx_from_sent_character(label, vocab_dict) for label in labels_sorted]
    return label_x, np.asarray(len_max)
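
# Usage sketch (assumes a character-level vocab_dict; the word-level one
# returned by get_vocab would map most single characters to "UNK"):
#   label_x, label_lengths = label_preprocess_character(label_dict,
#                                                       label_text_dict, char_vocab_dict)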


def dataset_preprocess_character_pretrained(texts, labels, vocab_dict, label_dict):
    """Variant that returns the raw texts unchanged, together with label indices
    and sentence lengths (indexing is left to a downstream, e.g. pretrained, tokenizer)."""
    label_y = []
    sent_length = []
    for index, sent in enumerate(texts):
        sent_length.append(len(sent))
        label_y.append(label_dict[labels[index]])
    return texts, np.asanyarray(label_y), np.asarray(sent_length)
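

if __name__ == "__main__":
    # Minimal end-to-end sketch. This command-line entry point is an assumption
    # added for illustration (it is not part of the original pipeline), and
    # max_length = 50 is an arbitrary cut-off.
    train_f, valid_f, test_f = sys.argv[1], sys.argv[2], sys.argv[3]
    texts, label_texts, labels = load_data(train_f, valid_f, test_f)
    vocab_dict, label_dict, label_text_dict = get_vocab(texts, label_texts, labels)
    max_length = 50
    for name, split_texts, split_labels in zip(["train", "valid", "test"], texts, labels):
        x, y, lengths = dataset_preprocess(split_texts, split_labels, vocab_dict, label_dict, max_length)
        print(name, x.shape, y.shape, int(lengths.max()))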