# prepare_data.py
import re
import random
import numpy as np
from tqdm import tqdm
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer


def calc_text_uniq_words(text):
    # Number of distinct whitespace-separated tokens in a text.
    unique_words = set()
    for word in text.split():
        unique_words.add(word)
    return len(unique_words)


# Collapse repeated chunks ('loooool', 'heyheyhey'), as suggested in:
# https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/discussion/46371
def substitute_repeats_fixed_len(text, nchars, ntimes=3):
    # Replace ntimes or more consecutive repeats of any nchars-long
    # non-whitespace chunk with a single occurrence.
    return re.sub(r"(\S{{{}}})(\1{{{},}})".format(nchars, ntimes - 1), r"\1", text)


def substitute_repeats(text, ntimes=3):
    # Apply the repeat-collapsing rule for chunk lengths 1..19.
    for nchars in range(1, 20):
        text = substitute_repeats_fixed_len(text, nchars, ntimes)
    return text
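
# For example:
#   substitute_repeats('looooool')     -> 'lol'
#   substitute_repeats('heyheyheyhey') -> 'hey'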


# Split glued word/digit runs using caller-supplied patterns.
def split_text_and_digits(text, regexps):
    for regexp in regexps:
        result = regexp.match(text)
        if result is not None:
            return ' '.join(result.groups())
    return text
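
# The patterns come from the caller; a plausible pair (an assumption, not
# fixed by this module) would be:
#   regexps = [re.compile('([a-zA-Z]+)([0-9]+)'),  # 'year2018' -> 'year 2018'
#              re.compile('([0-9]+)([a-zA-Z]+)')]  # '2018year' -> '2018 year'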


def read_wrong_words(fname):
    # Build a {misspelling: correction} dict; each line lists the wrong word
    # followed by its replacement text. split() already collapses runs of
    # whitespace, so no extra normalisation is needed.
    wrong_word_dict = {}
    with open(fname) as f:
        for line in f:
            parts = line.rstrip().split()
            if len(parts) < 2:
                continue
            wrong_word_dict[parts[0]] = ' '.join(parts[1:])
    return wrong_word_dict
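
# Expected file format, one correction per line, e.g.:
#   teh the
#   dont do not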


def combine_swear_words(text, swear_words):
    # Merge an adjacent token pair when one side is a single character and the
    # concatenation is a known swear word (catches split-up obfuscations).
    # The loop covers every index up to and including the final token, so the
    # last word of the text is kept even when no merge happens.
    i = 0
    n = len(text)
    result = []
    while i < n:
        word = text[i]
        if i < n - 1:
            next_word = text[i + 1]
            if len(word) == 1 or len(next_word) == 1:
                if not (word.isdigit() or next_word.isdigit()):
                    combine_word = '{}{}'.format(word, next_word)
                    if combine_word in swear_words:
                        i += 1
                        word = combine_word
        result.append(word)
        i += 1
    return result


def clean_text(df, tokenizer, wrong_words_dict, swear_words, regexps, autocorrect=True, swear_combine=True):
    # Normalise a pandas Series of raw comments into a list of cleaned strings.
    df.fillna("__NA__", inplace=True)
    texts = df.tolist()
    result = []
    for text in tqdm(texts):
        tokens = tokenizer.tokenize(text.lower())
        tokens = [split_text_and_digits(token, regexps) for token in tokens]
        tokens = [substitute_repeats(token, 3) for token in tokens]
        if swear_combine:
            tokens = combine_swear_words(tokens, swear_words)
        text = ' '.join(tokens)
        if autocorrect:
            for wrong, right in wrong_words_dict.items():
                text = text.replace(wrong, right)
        result.append(text)
    return result


def convert_text2seq(train_texts, test_texts, max_words, max_seq_len, lower=True, char_level=False):
    # Fit a Keras Tokenizer on train and test texts together, then convert
    # every text to a fixed-length sequence of word indices.
    tokenizer = Tokenizer(num_words=max_words, lower=lower, char_level=char_level)
    tokenizer.fit_on_texts(train_texts + test_texts)
    word_seq_train = tokenizer.texts_to_sequences(train_texts)
    word_seq_test = tokenizer.texts_to_sequences(test_texts)
    word_index = tokenizer.word_index
    word_seq_train = list(sequence.pad_sequences(word_seq_train, maxlen=max_seq_len))
    word_seq_test = list(sequence.pad_sequences(word_seq_test, maxlen=max_seq_len))
    return word_seq_train, word_seq_test, word_index
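
# Note: pad_sequences pads and truncates at the front ('pre') by default, so
# each padded sequence keeps the last max_seq_len tokens of a long comment.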


def get_embedding_matrix(embed_dim, embeds, max_words, word_index):
    # Build an (nb_words x embed_dim) matrix of pretrained vectors; row 0 is
    # left as zeros for the padding index, and words without a usable vector
    # are collected in words_not_found.
    words_not_found = []
    nb_words = min(max_words, len(word_index))
    embedding_matrix = np.zeros((nb_words, embed_dim))
    for word, i in word_index.items():
        if i >= nb_words:
            continue
        embedding_vector = embeds[word]
        if embedding_vector is not None and len(embedding_vector) > 0:
            embedding_matrix[i] = embedding_vector
        else:
            words_not_found.append(word)
    return embedding_matrix, words_not_found
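
# A plain dict would raise KeyError on unknown words, so `embeds` is assumed
# to be a mapping-like wrapper whose lookup returns None (or []) for misses,
# e.g. pretrained fastText/GloVe vectors behind a defaulting __getitem__.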


def split_data_idx(n, test_size=0.2, shuffle=True, random_state=0):
    train_size = 1 - test_size
    idxs = np.arange(n)
    if shuffle:
        random.seed(random_state)
        random.shuffle(idxs)
    return idxs[:int(train_size * n)], idxs[int(train_size * n):]


def split_data(x, y, test_size=0.2, shuffle=True, random_state=0):
    n = len(x)
    train_idxs, test_idxs = split_data_idx(n, test_size, shuffle, random_state)
    return np.array(x[train_idxs]), np.array(x[test_idxs]), y[train_idxs], y[test_idxs], train_idxs, test_idxs


def get_bow(texts, words):
    # Binary bag-of-words matrix: result[i][j] is 1 when words[j] occurs in
    # texts[i]. Note this is a substring test, not a whole-word match.
    result = np.zeros((len(texts), len(words)))
    print(np.shape(result))
    for i, text in tqdm(enumerate(texts)):
        for j, word in enumerate(words):
            try:
                if word in text:
                    result[i][j] = 1
            except UnicodeDecodeError:
                # Skip entries that cannot be decoded for the comparison.
                pass
    return result
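

if __name__ == '__main__':
    # Minimal smoke test on toy data. The TweetTokenizer, the regexps and the
    # toy word lists below are illustrative assumptions; the real pipeline
    # passes in its own tokenizer and resources.
    import pandas as pd
    from nltk.tokenize import TweetTokenizer

    comments = pd.Series(['Heyyyyy year2018 teh trolls', 'd amn spam', None])
    regexps = [re.compile('([a-zA-Z]+)([0-9]+)'),
               re.compile('([0-9]+)([a-zA-Z]+)')]
    cleaned = clean_text(comments, TweetTokenizer(), {'teh': 'the'},
                         {'damn'}, regexps)
    print(cleaned)  # first entry becomes 'hey year 2018 the trolls'

    x_train, x_test, word_index = convert_text2seq(cleaned[:2], cleaned[2:],
                                                   max_words=1000, max_seq_len=10)
    print(len(word_index), 'distinct tokens indexed')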