-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmystem.py
160 lines (139 loc) · 4.26 KB
/
mystem.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
import utils
import tempfile
import subprocess
import logging
# Grammar tags searched for in the 'gr' field of each mystem analysis.
# The order matters: counters built from this list are indexed by position,
# and index 2 is read as the middle-name (patronymic) counter.
MYSTEM_NAMES_LEXEMS = [
    'фам',
    'имя',
    'отч',
]

# Stand-alone words that form the second part of a two-word middle name.
# They are kept in lower case when names are capitalised.
STRANGE_MIDDLE_NAMES = [
    'оглы',
    'кызы',
]
def get_words_describe(data):
    """
    Run the external ``./mystem`` binary over the given words.

    The words are written one per line to a temporary file, mystem is
    started with ``-in --format json`` on that file, and the text of its
    output file is handed to ``utils.iload_json``.

    :type data: set[str]
    :rtype: generator
    :return: whatever ``utils.iload_json`` produces for the mystem output
        (presumably one dict per analysed word -- confirm against utils);
        an empty iterator when mystem exits with an error.
    """
    # mystem works in UTF-8 by default, so pin the encoding instead of
    # relying on the locale's preferred one for the Cyrillic payload.
    with tempfile.NamedTemporaryFile(mode='w', encoding='utf-8') as temp_input:
        temp_input.write('\n'.join(data))
        # The file is reopened by name by the child process, so the words
        # have to be flushed out of Python's buffer first.
        temp_input.flush()
        with tempfile.NamedTemporaryFile(mode='r', encoding='utf-8') as temp_output:
            try:
                utils.get_output_from_program('./mystem', '-in', '--format', 'json',
                                              temp_input.name, temp_output.name)
            except subprocess.CalledProcessError as e:
                logging.error("Mystem run error: %s. And returned %s code", e.stderr, e.returncode)
                # Keep the "iterable of descriptions" contract: previously
                # this path fell through and returned None, which made the
                # caller's for-loop fail with TypeError.
                return iter(())
            return utils.iload_json(temp_output.read())
def get_word_describe_dict(data):
    """
    Count, per distinct word, how many mystem analyses carry each name tag.

    :type data: list[str]
    :rtype: dict[str, list[int]]
    :return: mapping from the word text reported by mystem to a list of
        counters aligned with ``MYSTEM_NAMES_LEXEMS`` (one counter per tag,
        in the same order). Empty when mystem produced no descriptions.
    """
    # Duplicates are removed before calling mystem. A None result (mystem
    # failed and the failure was only logged) is treated as "no words".
    descriptions = get_words_describe(set(data)) or ()

    result = {}
    for word in descriptions:
        # Tolerate entries without analyses (or without a 'gr' string)
        # instead of failing on the missing key.
        analyses = word.get('analysis') or ()
        result[word['text']] = [
            sum(1 for lexem in analyses if tag in lexem.get('gr', ''))
            for tag in MYSTEM_NAMES_LEXEMS
        ]
    return result
def prepare_data(data_string):
    """
    Split a string into lower-cased words made of alphabetic characters.

    Every run of non-alphabetic characters acts as a separator; leading and
    trailing separators do not produce empty words.

    :type data_string: str
    :rtype: list[str]
    """
    words = []
    letters = []  # characters of the word currently being collected
    for symbol in data_string:
        if symbol.isalpha():
            # Lower-case character by character, exactly one symbol at a time.
            letters.append(symbol.lower())
        elif letters:
            # First separator after a word: close the word.
            words.append(''.join(letters))
            letters = []
    if letters:
        # The string ended in the middle of a word.
        words.append(''.join(letters))
    return words
def set_upper_letters(s):
    """
    Normalise the letter case of a single name part.

    Words listed in ``STRANGE_MIDDLE_NAMES`` are returned fully lower-cased;
    any other word gets an upper-case first letter and a lower-case rest.

    :type s: str
    :rtype: str
    :raises Exception: if the word is shorter than two characters and is
        not one of ``STRANGE_MIDDLE_NAMES``
    """
    lowered = s.lower()
    if lowered not in STRANGE_MIDDLE_NAMES:
        if len(lowered) <= 1:
            logging.error("Name %s it too short", lowered)
            raise Exception("Process upper letters error")
        head, tail = lowered[0], lowered[1:]
        return head.upper() + tail
    return lowered
def processing_strange_middle_names(data):
    """
    Extract names whose middle name ends with a ``STRANGE_MIDDLE_NAMES`` word.

    Each such marker word, together with the three words before it, is cut
    out of ``data`` as one four-word name.

    :type data: list[str]
    :rtype: (list[str], list[list[str]])
    :return: the words left after the cut, and the extracted four-word
        names, listed from the last occurrence in ``data`` to the first
    :raises Exception: if a marker has fewer than three words before it, or
        is fewer than four positions after the previous marker
    """
    marker_indexes = []
    for index, word in enumerate(data):
        if word not in STRANGE_MIDDLE_NAMES:
            continue
        too_close_to_start = index < 3
        overlaps_previous = bool(marker_indexes) and index - marker_indexes[-1] < 4
        if too_close_to_start or overlaps_previous:
            raise Exception('Strange middle names process error')
        marker_indexes.append(index)

    extracted = []
    # Cut from the end so the indexes of earlier markers stay valid.
    for index in reversed(marker_indexes):
        start, stop = index - 3, index + 1
        extracted.append(data[start:stop])
        data = data[:start] + data[stop:]
    return data, extracted
def gen_is_middle_name_fucntion(data):
    """
    Build a predicate that tells whether a word may be a middle name.

    The words are analysed once via ``get_word_describe_dict``; the returned
    function only looks the answer up.

    :type data: list[str]
    :rtype: function
    :return: ``is_middle_name(s)`` closure over the analysis of ``data``
    """
    word_describe = get_word_describe_dict(data)

    def is_middle_name(s):
        """
        Return True when ``s`` has a middle-name analysis or no name
        analysis of any kind.

        :type s: str
        :rtype: bool
        """
        counts = word_describe.get(s)
        if not counts:
            # The word is absent from the analysis result. Treat it like a
            # word with zero name analyses rather than raising KeyError.
            return True
        # Index 2 is the counter for the third tag in MYSTEM_NAMES_LEXEMS.
        return counts[2] > 0 or sum(counts) == 0

    return is_middle_name
def process_other_names(data):
    """
    Group a flat sequence of words into names of up to three words.

    The first two words of a group are always taken as they come. The third
    word joins the group only if the middle-name predicate accepts it;
    otherwise it opens the next group.

    :type data: list[str]
    :rtype: list[list[str]]
    """
    looks_like_middle_name = gen_is_middle_name_fucntion(data)
    names = []
    for token in data:
        current = names[-1] if names else None
        if current is None or len(current) == 3:
            # Nothing collected yet, or the previous name is complete.
            names.append([token])
        elif len(current) == 1:
            current.append(token)
        elif looks_like_middle_name(token):
            # Third slot: only a plausible middle name completes the group.
            current.append(token)
        else:
            # A two-word name ends here and this word starts the next one.
            names.append([token])
    return names
def process_names(str_data):
    """
    Turn raw text into a list of formatted full names.

    The text is tokenised with ``prepare_data``, names built around a
    ``STRANGE_MIDDLE_NAMES`` word are extracted first, the remaining words
    are grouped by ``process_other_names``, and every part is re-cased with
    ``set_upper_letters`` before the parts are joined with single spaces.

    :type str_data: str
    :rtype: list[str]
    :return: extracted "strange middle name" names first, followed by the
        names built from the remaining words
    """
    words = prepare_data(str_data)
    remaining, names = processing_strange_middle_names(words)
    names.extend(process_other_names(remaining))
    return [
        ' '.join(set_upper_letters(part) for part in name)
        for name in names
    ]