-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbuild_ner_train_python2.py
204 lines (167 loc) · 7.93 KB
/
build_ner_train_python2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
import itertools
import json
import random
from collections import Counter
import optparse
import re
import ast
import comps.attribute_extraction.extract_from_product as guys_report_module
from run.search_first import run_cmd
# import nltk
import pandas as pd
import spacy
import subprocess
from pathlib import Path
from spacy.lang.en import English
from spacy.gold import GoldParse
from spacy.util import minibatch, compounding
from spacy.scorer import Scorer
from sklearn.model_selection import train_test_split
from comps.product_classifier.db_data_fetch import DataFetchFuncs
from lib.help_functions import df_to_csv, read_excel_range_to_df
from lib.time_utils import time_string
from itertools import izip, islice, tee
# Sentinel for the ``attrs_for_cms`` argument of AttributesIndexer: when the
# argument equals this value, no attribute definition is filtered out.
DONT_LIMIT = "don't limit"
def consecutive_indexes(iterable, n):
    """Return an iterator over sliding windows of ``n`` consecutive items.

    For ``range(4)`` and ``n == 2`` the windows are (0, 1), (1, 2), (2, 3).
    An input shorter than ``n`` produces no windows at all.
    """
    staggered = []
    for offset, branch in enumerate(tee(iterable, n)):
        # Copy number ``offset`` drops its first ``offset`` items, so zipping
        # the copies lines up each item with the ones that follow it.
        staggered.append(islice(branch, offset, None))
    return izip(*staggered)
def tuples_have_overlapping_indexes(tuples_list):
    """Return the neighbouring spans that collide, as a list of pairs.

    ``tuples_list`` holds (start, end) tuples. They are ordered by start and
    each span is compared with the one right after it; a pair is reported
    unless the earlier span ends strictly before the later one starts, so
    spans that merely touch are reported too. Despite the name, the result
    is a list (empty when nothing collides), not a boolean.
    """
    ordered = sorted(tuples_list, key=lambda span: span[0])
    return [
        (earlier, later)
        for earlier, later in zip(ordered, ordered[1:])
        if not earlier[1] < later[0]
    ]
def test_overlaps():
    """Check tuples_have_overlapping_indexes against hand-written span lists.

    The function under test returns a list of colliding pairs rather than a
    boolean, so each result is reduced with ``bool`` before it is compared
    with the expected flag (a list never compares equal to True or False).
    """
    cases = [
        ([(1, 3), (5, 7), (6, 9)], True),   # 6 falls inside (5, 7)
        ([(1, 3), (5, 7), (8, 9)], False),
        ([(1, 5), (5, 7), (8, 9)], True),   # touching at 5 counts as overlap
        ([(5, 7), (1, 5), (8, 9)], True),   # same spans, unsorted input
        ([(5, 7), (1, 3), (8, 9)], False),  # unsorted input, no collision
    ]
    for spans, expected in cases:
        found = tuples_have_overlapping_indexes(spans)
        assert bool(found) == expected, (spans, found)
def dict_of_tuples_with_overlapping_indexes(d):
    """Report colliding spans among the values of ``d``.

    ``d`` maps a name to a (start, end) tuple; the keys are ignored and the
    values are handed to tuples_have_overlapping_indexes, whose result is
    returned unchanged.
    """
    spans = list(d.values())
    return tuples_have_overlapping_indexes(spans)
def map_cms_to_relevant_attribute_names():
    """Build a {cms: [attribute names]} mapping from the Excel sheet.

    Reads range a1:k17 of 'Sheet1' in cms_to_attributes.xlsx; every column
    header becomes a key and the non-null cells below it become the values.
    Keys and values are both normalised with ``trim``.
    """
    # NOTE(review): the trailing True is presumably a "first row is a header"
    # flag of read_excel_range_to_df -- confirm against that helper.
    sheet = read_excel_range_to_df(
        'ner_attributes/data/cms_to_attributes.xlsx', 'Sheet1', 'a1', 'k17', True)
    return {
        trim(cms_col): [trim(cell) for cell in sheet[cms_col] if not pd.isnull(cell)]
        for cms_col in sheet.columns
    }
class AttributesIndexer(object):
    """Annotate a cms report with character spans of attribute values.

    For every row, each attribute column that holds a value is looked up in
    the attribute-definitions JSON and the known spellings of that attribute
    are searched for in the lower-cased product name. The resulting
    ``{attribute: (start, end)}`` dict is stored in the ``attrs_indexes_dict``
    column, rows whose spans collide are dropped, and the frame is written to
    ``ner_attributes/data/<cms>_with_indexes.csv``.
    """

    def __init__(self, cms, df, attrs_for_cms=DONT_LIMIT):
        """Load the attribute definitions and prepare the report frame.

        :param cms: cms name; used to build the output file name.
        :param df: report DataFrame; must expose a ``product_name`` column.
        :param attrs_for_cms: DONT_LIMIT to keep every definition from the
            JSON file, otherwise a container of the attribute names to keep.
        """
        self.cms = cms
        self.df = df
        with open('comps/attribute_extraction/function_files/attributes_dict.json') as json_file:
            definitions = json.load(json_file)
        self.cms_attributes_definitions_dict = {
            k: v for k, v in definitions.items()
            if attrs_for_cms == DONT_LIMIT or k in attrs_for_cms
        }
        self.trim_df_with_needed_columns_from_db(self.df)
        self.attributes_columns = self.get_attributes_columns(self.df)

    def run(self):
        """Annotate all rows, drop colliding ones, save; return the csv path."""
        self.df['attrs_indexes_dict'] = self.df.apply(
            self.annotate_row_with_indexes_of_attr_values, axis=1)
        self.remove_rows_with_overlapping_attributes()
        fn = 'ner_attributes/data/{cms}_with_indexes.csv'.format(cms=self.cms)
        df_to_csv(self.df, fn)
        return fn

    def annotate_row_with_indexes_of_attr_values(self, row):
        """Return ``{attribute: (start, end)}`` for spellings found in the name.

        Only attributes whose column is filled in for this row are considered.
        ``start``/``end`` index into the lower-cased product name, ``end``
        being exclusive. Only the first occurrence of a spelling is recorded,
        and when several spellings of one attribute match, the last one in
        definition order wins. A row without a product name yields ``{}``.
        """
        d = dict()
        product_name = row.product_name
        if pd.isna(product_name):
            # Nothing to search in; the original code crashed on .lower() here.
            return d
        doc = product_name.lower()
        for attr_name in self.attributes_columns:
            if pd.isna(row[attr_name]) or row[attr_name] == '':
                continue
            synonym_groups = self.cms_attributes_definitions_dict.get(attr_name, [])
            # Each definition entry is a comma-joined run of spellings.
            possible_values_of_attribute_in_json = list(itertools.chain.from_iterable(
                syns_joined_with_coma.split(',') for syns_joined_with_coma in synonym_groups))
            if not possible_values_of_attribute_in_json:
                print('warning: %s not in json' % (attr_name,))
                continue
            for possible_value in possible_values_of_attribute_in_json:
                possible_value = possible_value.lower().strip()
                if possible_value == '':
                    continue
                # The spelling is literal text, so it must be escaped: values
                # such as "c++" or "1.5" would otherwise raise re.error or
                # match (and index) unrelated text. The lookarounds behave
                # like \b for spellings that start and end with a word
                # character and still work when the spelling does not.
                match = re.search(r'(?<!\w)' + re.escape(possible_value) + r'(?!\w)', doc)
                if match:
                    d[attr_name] = match.start(), match.end()
        return d

    def trim_df_with_needed_columns_from_db(self, df):
        """Store ``df`` on the instance with a description and tidy headers.

        When ``df`` has no ``description`` column it is first passed through
        DataFetchFuncs.annotate_with_product_data; afterwards spaces in the
        column names are replaced by underscores.
        """
        if 'description' not in df.columns:
            df = DataFetchFuncs.annotate_with_product_data(df, ['description'])
        self.df = df.rename(columns={col: col.replace(' ', '_') for col in df.columns})

    @staticmethod
    def get_attributes_columns(df):
        """Return the columns of ``df`` that are not fixed product fields."""
        fixed_columns = ('product_id', 'product_name', 'description', 'product_type')
        return [col for col in df.columns if col not in fixed_columns]

    def remove_rows_with_overlapping_attributes(self):
        """Drop rows in which two attribute spans collide.

        Adds an ``overlaps`` column holding the colliding pairs of each row
        and keeps only the rows where that list is empty.
        """
        self.df['overlaps'] = self.df['attrs_indexes_dict'].apply(
            dict_of_tuples_with_overlapping_indexes)
        no_collisions = self.df['overlaps'].apply(len) == 0
        self.df = self.df[no_collisions]
def trim(x):
    """Normalise a label: spaces to underscores, strip, then lower-case.

    Spaces are replaced before stripping, so leading or trailing spaces end
    up as underscores; only other surrounding whitespace is removed.
    """
    underscored = x.replace(' ', '_')
    stripped = underscored.strip()
    return stripped.lower()
def main_create_indexed(cms, guys_report, attrs_for_cms):
    """Index one cms report and return the path of the written csv.

    ``guys_report`` is the path of a utf-8 csv. When it holds no rows,
    nothing is written and None is returned; otherwise the frame is handed
    to AttributesIndexer and the file name from its ``run`` is returned.
    """
    report_frame = pd.read_csv(guys_report, encoding='utf-8')
    if not report_frame.empty:
        indexer = AttributesIndexer(cms, report_frame, attrs_for_cms)
        return indexer.run()
    return None
def concat_all_to_one_indexed_report(cms_reports_with_index_dicts_fns):
    """Merge the per-cms indexed reports into one csv.

    :param cms_reports_with_index_dicts_fns: paths of the indexed csv files;
        falsy entries (e.g. None for a cms without data) are skipped.
    :return: whatever ``df_to_csv`` returns for the merged frame, or None
        when there is no report to merge.

    The merged frame keeps the fixed product columns followed by every
    attribute that appears as a key in some ``attrs_indexes_dict`` value;
    columns that are entirely empty are dropped.
    """
    dfs = [
        pd.read_csv(indexed_report_fn, encoding='utf-8')
        for indexed_report_fn in cms_reports_with_index_dicts_fns
        if indexed_report_fn
    ]
    if not dfs:
        # pd.concat raises on an empty list; there is nothing to write.
        return None
    fn = 'ner_attributes/data/all_cms_indexed.csv'
    big_df = pd.concat(dfs)
    first_cols = [
        'product_id',
        'product_type',
        'product_name',
        'attrs_indexes_dict',
    ]
    # The dicts were stored as text in the csv; parsing them into a frame is
    # only used to collect the attribute names that occur as keys.
    attrs_df = pd.DataFrame(
        [ast.literal_eval(d_str) for d_str in big_df['attrs_indexes_dict'].values])
    attribute_cols = [col for col in attrs_df.columns if col not in first_cols]
    big_df = big_df[first_cols + attribute_cols]
    big_df = big_df.dropna(how='all', axis='columns')
    return df_to_csv(big_df, fn)
def main():
    """Build the indexed report of every cms and merge them into one file.

    For each cms: run the attribute-extraction report, index the attribute
    values found in its product names, and remember the resulting csv. A cms
    whose report is empty is reported with a warning and left out of the
    merge, so the merge step never receives a missing file name.
    """
    cms_list = [
        'allinone_desktop',
        'computer',
        'mini_pc',
        'cpu',
        'camcoder',
        'dslr_camera',
        'lens',
        'lens_cap',
        'lens_hood',
        # excluded cms:
        # 'point_shoot_camera',#empty Guy's report
        # 'tablet', error in Guy's report
    ]
    limit_cms_attributes = False
    cms_to_relevant_attribute_names = map_cms_to_relevant_attribute_names()
    cms_reports_with_index_dicts_fns = []
    for cms in cms_list:
        # 1 - Guy's report
        options = guys_report_module.process_command_line()
        options.cms_vertical_list = [cms]
        guys_report_module.main(options)
        # created in extract_from_product.py
        guys_report_fn = 'ner_attributes/data/{cms}_guys_report.csv'.format(cms=cms)
        # 2 - create column with dictionary of indexes
        if limit_cms_attributes:
            attrs_for_cms = cms_to_relevant_attribute_names[cms]
        else:
            attrs_for_cms = DONT_LIMIT
        report_with_indexes_dict_fn = main_create_indexed(cms, guys_report_fn, attrs_for_cms)
        if not report_with_indexes_dict_fn:
            print('warning: cms %s has empty guys report' % (cms,))
            # No file was written for this cms; keeping the None in the list
            # would make the merge step fail when it tries to read it.
            continue
        cms_reports_with_index_dicts_fns.append(report_with_indexes_dict_fn)
    if not cms_reports_with_index_dicts_fns:
        print('warning: no cms produced an indexed report, nothing to merge')
        return
    concat_all_to_one_indexed_report(cms_reports_with_index_dicts_fns)
# Run the whole pipeline only when the file is executed as a script.
if __name__ == '__main__':
    main()