-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmapping.py
94 lines (81 loc) · 3.09 KB
/
mapping.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
from fuzzywuzzy import fuzz #added
import stream_deepspeech
from timeit import default_timer as timer
import json
def get_match(substring):
"""Returns the best match of a given substring, otherwise, returns no match"""
matches = [(correctionJSON()[key], fuzz.ratio(substring, key)) for key in correctionJSON().keys()
if fuzz.ratio(substring, key) > 70]
ordered = sorted(matches, key=lambda x: x[1], reverse=True) #orders by the best matches
if len(ordered) > 0:
return ordered[0][0] #get the best match
else:
return "no match"
def get_substrings(string):
"""Returns a list of n-size substrings from a given string where n is from 1 to number of words in the string"""
string_list = string.split()
combinations = [string_list[i:j]
for i in range(len(string_list))
for j in range (i+1, len(string_list)+ 1)]
return combinations
def filter_stopwords(combinations):
"""Filters out substrings with stopwords"""
filtered = []
for substring in combinations:
append = True
for word in substring:
if word in listing_stopwords("stopwords.txt"):
append = False
if append == True:
filtered.append(substring)
return filtered
def generate_n_grams(list):
"""Prioritizes larger substrings over smaller substrings"""
list.sort(key=len, reverse=True)
concatenated = [" ".join(substring) for substring in list] #concatenated the words within the substrings cuz they were in lists before
return concatenated
def stt_mapper(result):
"""Returns the corrected text"""
best = result
variations = filter_stopwords(get_substrings(result))
n_grams = generate_n_grams(variations)
changed = []
for substring in n_grams:
no_change = False
for phrase in changed: #checks if a particular phrase has already been changed
if substring in phrase:
no_change = True
match = get_match(substring)
if match != "no match" and no_change == False: #there was a match
best = best.replace(substring, match)
changed.append(substring)
return best
def listing_stopwords(filename):
"""Returns a list of all the stopwords in stopwords.txt"""
try:
h = open(filename)
except:
raise FileNotFoundError
h.close()
f = open(filename, 'r')
list = []
lines = f.readlines()
for line in range(len(lines)): # each line has one word
list.append(lines[line].strip())
f.close()
"""Improvement could be where we load the stopwords into a hash table"""
return list
def correctionJSON():
"""Returns the JSON object containing corrected names"""
"""Reads from corrections.json"""
myjsonfile = open('corrections.json')
jsondata = myjsonfile.read()
"""Parsing"""
object = json.loads(jsondata)
return object
if __name__ == "__main__":
starttime = timer()
result = stream_deepspeech.run_stt(4)
correct = stt_mapper(result)
print("The time difference is :", timer() - starttime)
print(correct)