Skip to content

Commit 1f4d19d

Browse files
Merge pull request #304 from Shubh-Goyal-07/restructure
Updates spell_check module
2 parents d8b414e + f0237de commit 1f4d19d

File tree

14 files changed

+397
-14
lines changed

14 files changed

+397
-14
lines changed

src/spell_check/kenlm/local/Dockerfile

+2
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,10 @@ RUN apt-get update && apt-get install -y wget
1717
# Download the files using wget
1818
RUN wget "https://drive.google.com/uc?export=download&id=1frSw5-qfRMgrYs4QL961s2yYuq2KplEM" -O '5gram_model.bin'
1919
RUN wget "https://drive.google.com/uc?export=download&id=1o31Z4TZbAOEt6E8Rx7VMONJOGJH-5Mwk" -O 'lexicon.txt'
20+
RUN wget "https://drive.google.com/uc?export=download&id=14cMmeDPlAODbRe37CdHLnhClGX7JXG-A" -O 'freq_dict.txt'
2021
RUN wget "https://drive.google.com/uc?export=download&id=1-Dtk5socjYdeGyqhbQzG-rvWJfWVFGqv" -O '5gram_model_eng.bin'
2122
RUN wget "https://drive.google.com/uc?export=download&id=1-59pDTvEXCMUZ-NQ8BwmCnHQZh4Eg6Gw" -O 'lexicon_eng.txt'
23+
RUN wget "https://drive.google.com/uc?export=download&id=1Ztj6k0A4BMi_o87qwSDKJQ6cyhvlvneD" -O 'freq_dict_eng.txt'
2224

2325
# Copy the rest of the application code to the working directory
2426
COPY . /app/

src/spell_check/kenlm/local/README.md

+24-3
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
curl request :
1+
curl request:
22

33
curl -X POST -H "Content-Type: application/json" -d '{
44
"text": "ପାମ ମିଶନରୀ ଉପରେ କେତେ % ରିହାତି ଧୈର୍ଯ ହୋଇଛି",
@@ -8,19 +8,40 @@ curl -X POST -H "Content-Type: application/json" -d '{
88
"lang" : "ory"
99
}' http://localhost:8000/
1010

11-
1211
curl -X POST -H "Content-Type: application/json" -d '{
1312
"text": "ପାମ ମିଶନରୀ ଉପରେ କେତେ % ରିହାତି ଧୈର୍ଯ ହୋଇଛି",
1413
"BEAM_WIDTH": 5,
1514
"SCORE_THRESHOLD": 1.5,
1615
"max_distance": 1
1716
}' http://localhost:8000/
1817

19-
2018
curl -X POST -H "Content-Type: application/json" -d '{
2119
"text": "how to apply for go-sugem scheme for my paddi crop",
2220
"BEAM_WIDTH": 5,
2321
"SCORE_THRESHOLD": 1.5,
2422
"max_distance": 1,
2523
"lang" : "eng"
2624
}' http://localhost:8000/
25+
26+
27+
28+
**curl request for update:**
29+
30+
curl -X PUT -H "Content-Type: application/json" -d '{
31+
"text": "ମିଶନରୀ",
32+
"lang" : "ory"
33+
}' http://localhost:8000/
34+
35+
curl -X PUT -H "Content-Type: application/json" -d '{
36+
"text": ["ପାମ ମିଶନରୀ ଉପରେ", "ରିହାତି ଧୈର୍ଯ ହୋଇଛି"]
37+
}' http://localhost:8000/
38+
39+
curl -X PUT -H "Content-Type: application/json" -d '{
40+
"text": "go-sugem",
41+
"lang" : "eng"
42+
}' http://localhost:8000/
43+
44+
curl -X PUT -H "Content-Type: application/json" -d '{
45+
"text": ["how to apply for", "scheme for my paddi crop"],
46+
"lang" : "eng"
47+
}' http://localhost:8000/

src/spell_check/kenlm/local/api.py

+14-2
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from model import Model
2-
from request import ModelRequest
2+
from request import ModelRequest, ModelUpdateRequest
33
from quart import Quart, request
44
import aiohttp
55

@@ -17,13 +17,17 @@
1717
'eng': 'lexicon_eng.txt'
1818
}
1919

20+
freq_dict_paths = {
21+
'ory': 'freq_dict.txt',
22+
'eng': 'freq_dict_eng.txt'
23+
}
2024

2125

2226
@app.before_serving
async def startup():
    """Create the shared HTTP session and load the spell-check model once,
    before the server starts accepting requests."""
    app.client = aiohttp.ClientSession()
    global model
    model = Model(app, model_paths, vocab_paths, freq_dict_paths)
2731

2832
@app.route('/', methods=['POST'])
2933
async def embed():
@@ -33,5 +37,13 @@ async def embed():
3337
result = await model.inference(req)
3438
return result
3539

40+
@app.route('/', methods=['PUT'])
async def update():
    """PUT handler: fold new text into the language's frequency dictionary
    and rebuild its SymSpell model."""
    global model
    payload = await request.get_json()
    update_request = ModelUpdateRequest(**payload)
    return await model.update_symspell(update_request)
47+
3648
if __name__ == "__main__":
3749
app.run()

src/spell_check/kenlm/local/model.py

+85-8
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,11 @@
11
import kenlm
2-
from request import ModelRequest
2+
from request import ModelRequest, ModelUpdateRequest
33
import Levenshtein
44

5+
from symspellpy import SymSpell, Verbosity
6+
7+
from collections import Counter
8+
59
model_paths = {
610
'ory': '5gram_model.bin',
711
'eng': '5gram_model_eng.bin'
@@ -12,9 +16,14 @@
1216
'eng': 'lexicon_eng.txt'
1317
}
1418

19+
freq_dict_paths = {
20+
'ory': 'freq_dict.txt',
21+
'eng': 'freq_dict_eng.txt'
22+
}
23+
1524

1625
class TextCorrector:
17-
def __init__(self, model_paths, vocab_paths):
26+
def __init__(self, model_paths, vocab_paths, freq_dict_paths):
1827
# Initialize both models and vocabularies
1928
self.models = {
2029
'ory': kenlm.Model(model_paths['ory']),
@@ -24,13 +33,19 @@ def __init__(self, model_paths, vocab_paths):
2433
'ory': self.create_vocab_lexicon(vocab_paths['ory']),
2534
'eng': self.create_vocab_lexicon(vocab_paths['eng'])
2635
}
36+
37+
self.symspell_models = {
38+
'ory': self.create_symspell_model(freq_dict_paths['ory']),
39+
'eng': self.create_symspell_model(freq_dict_paths['eng'])
40+
}
2741
# Set the default language
2842
self.set_language('ory')
2943

3044
def set_language(self, lang):
3145
# Switch the model and vocabulary based on language
3246
self.model = self.models[lang]
3347
self.vocab = self.vocabs[lang]
48+
self.symspell_model = self.symspell_models[lang]
3449

3550
def create_vocab_lexicon(self, lexicon_path):
3651
vocabulary = []
@@ -40,14 +55,23 @@ def create_vocab_lexicon(self, lexicon_path):
4055
vocabulary.append(word)
4156
return vocabulary
4257

58+
def create_symspell_model(self, freq_dict_path):
    """Build a SymSpell model from a frequency-dictionary file.

    The file is expected to contain 'word count' pairs separated by a
    single space, one pair per line.
    """
    model = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
    model.load_dictionary(freq_dict_path, term_index=0, count_index=1, separator=' ')
    return model
62+
63+
# def generate_candidates(self, word, max_distance=1):
64+
# len_range = range(len(word) - max_distance, len(word) + max_distance + 1)
65+
# filtered_vocab = [vocab_word for vocab_word in self.vocab if len(vocab_word) in len_range]
66+
# return [vocab_word for vocab_word in filtered_vocab if 0 <= Levenshtein.distance(word, vocab_word) <= max_distance]
67+
4368
def generate_candidates(self, word, max_distance=1):
    """Return spelling candidates for `word` from the active SymSpell model.

    Only the closest suggestions within `max_distance` edits are returned
    (Verbosity.CLOSEST), as bare terms without scores.
    """
    matches = self.symspell_model.lookup(word, Verbosity.CLOSEST, max_distance)
    return [match.term for match in matches]
4771

4872
def beam_search(self, chunk, BEAM_WIDTH=5, SCORE_THRESHOLD=1.5, max_distance=1):
4973
original_score = self.model.score(' '.join(chunk))
50-
74+
5175
initial_candidates = self.generate_candidates(chunk[0], max_distance=1)
5276
if not initial_candidates:
5377
initial_candidates = [chunk[0]]
@@ -88,11 +112,55 @@ def correct_text_with_beam_search(self, text, BEAM_WIDTH=5, SCORE_THRESHOLD=1.5,
88112
corrected_sentences.append(best_sentence)
89113

90114
return ' '.join(corrected_sentences)
115+
116+
def load_freq_dict(self, freq_dict_path):
    """Read a 'word count' frequency-dictionary file into a dict.

    Each valid line holds exactly two whitespace-separated fields: the
    word and its integer count. Blank or malformed lines are skipped
    instead of raising, so a trailing newline in the file cannot crash
    a model update.
    """
    freq_dict = {}
    with open(freq_dict_path, 'r') as f:
        for line in f:
            parts = line.split()
            if len(parts) != 2:
                # Skip blank or malformed lines rather than crashing.
                continue
            word, freq = parts
            freq_dict[word] = int(freq)
    return freq_dict
123+
124+
def make_updation_counter(self, text):
    """Count word frequencies in `text`, which may be a string or a
    list of strings (list items are joined with spaces first).

    Punctuation is stripped so that e.g. 'word,' and 'word' are counted
    together; only alphanumeric characters and whitespace are kept
    before splitting into words. Returns a collections.Counter.
    """
    if isinstance(text, list):
        text = ' '.join(text)

    # Keep only alphanumerics and whitespace, then split into words.
    cleaned = ''.join(ch for ch in text if ch.isalnum() or ch.isspace())
    return Counter(cleaned.split())
137+
138+
def update_symspell_model(self, lang, text):
    """Merge word counts from `text` into the on-disk frequency
    dictionary for `lang`, then rebuild that language's SymSpell model.

    Counter addition sums counts for words present in both the existing
    dictionary and the new text. Returns a success message.
    """
    # Merge the existing on-disk counts with counts from the new text.
    current_counts = Counter(self.load_freq_dict(freq_dict_paths[lang]))
    merged_counts = current_counts + self.make_updation_counter(text)

    # Persist the merged dictionary in 'word count' format (the counts
    # are already ints, so no conversion pass is needed).
    with open(freq_dict_paths[lang], 'w') as f:
        for word, freq in merged_counts.items():
            f.write(word + ' ' + str(freq) + '\n')

    # Rebuild the SymSpell model from the updated dictionary so new
    # words become available to generate_candidates().
    self.symspell_models[lang] = self.create_symspell_model(freq_dict_paths[lang])

    return 'Model updated successfully'
158+
91159

92160
class Model():
93-
def __init__(self, context, model_paths, vocab_paths):
161+
def __init__(self, context, model_paths, vocab_paths, freq_dict_paths):
94162
self.context = context
95-
self.text_corrector = TextCorrector(model_paths, vocab_paths)
163+
self.text_corrector = TextCorrector(model_paths, vocab_paths, freq_dict_paths)
96164

97165
async def inference(self, request: ModelRequest):
98166
# Set the correct language model based on the request
@@ -105,3 +173,12 @@ async def inference(self, request: ModelRequest):
105173
max_distance=request.max_distance
106174
)
107175
return corrected_text
176+
177+
async def update_symspell(self, request: ModelUpdateRequest):
    """Activate the language named in `request`, then fold the request's
    text into that language's frequency dictionary and SymSpell model."""
    # Switch the corrector to the language the update targets.
    self.text_corrector.set_language(request.lang)

    # Learn the new words from the supplied text.
    self.text_corrector.update_symspell_model(request.lang, request.text)

    return 'Model updated successfully'

src/spell_check/kenlm/local/request.py

+8
Original file line numberDiff line numberDiff line change
@@ -11,3 +11,11 @@ def __init__(self, text, BEAM_WIDTH, SCORE_THRESHOLD, max_distance, lang='ory'):
1111

1212
def to_json(self):
1313
return json.dumps(self, default=lambda o: o.__dict__, sort_keys=True, indent=4)
14+
15+
class ModelUpdateRequest():
    """Payload for a PUT / request that teaches the spell checker new words."""

    def __init__(self, text, lang='ory'):
        self.text = text  # str or list of str with the text to learn
        self.lang = lang  # target language code ('ory' or 'eng')

    def to_json(self):
        """Serialize this request as a pretty-printed, key-sorted JSON string."""
        as_dict = lambda obj: obj.__dict__
        return json.dumps(self, default=as_dict, sort_keys=True, indent=4)
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
quart
22
aiohttp
33
python-Levenshtein
4-
requests
4+
requests
5+
symspellpy

src/spell_check/spello/README.md

Whitespace-only changes.
+26
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
# Use an official Python runtime as a parent image
2+
FROM python:3.9-slim
3+
4+
WORKDIR /app
5+
6+
# Install system packages required for building kenlm
7+
RUN apt-get update && apt-get install -y cmake g++ zlib1g-dev
8+
9+
# Install requirements
10+
COPY requirements.txt requirements.txt
11+
RUN pip3 install -r requirements.txt
12+
13+
# Install wget
14+
RUN apt-get update && apt-get install -y wget
15+
16+
# Download the files using wget
17+
RUN wget "https://drive.google.com/uc?export=download&id=14cMmeDPlAODbRe37CdHLnhClGX7JXG-A" -O 'freq_dict.txt'
18+
RUN wget "https://drive.google.com/uc?export=download&id=1Ztj6k0A4BMi_o87qwSDKJQ6cyhvlvneD" -O 'freq_dict_eng.txt'
19+
20+
# Copy the rest of the application code to the working directory
21+
COPY . /app/
22+
23+
EXPOSE 8000
24+
25+
# Set the entrypoint for the container
26+
CMD ["hypercorn", "--bind", "0.0.0.0:8000", "api:app"]
+37
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
**curl request for inference:**
2+
3+
curl -X POST -H "Content-Type: application/json" -d '{
4+
"text": "ପାମ ମିଶନରୀ ଉପରେ କେତେ % ରିହାତି ଧୈର୍ଯ ହୋଇଛି",
5+
"lang" : "ory"
6+
}' http://localhost:8000/
7+
8+
curl -X POST -H "Content-Type: application/json" -d '{
9+
"text": "ପାମ ମିଶନରୀ ଉପରେ କେତେ % ରିହାତି ଧୈର୍ଯ ହୋଇଛି"
10+
}' http://localhost:8000/
11+
12+
curl -X POST -H "Content-Type: application/json" -d '{
13+
"text": "how to apply for go-sugem scheme for my paddi crop",
14+
"lang" : "eng"
15+
}' http://localhost:8000/
16+
17+
18+
**curl request for update:**
19+
20+
curl -X PUT -H "Content-Type: application/json" -d '{
21+
"text": "ମିଶନରୀ",
22+
"lang" : "ory"
23+
}' http://localhost:8000/
24+
25+
curl -X PUT -H "Content-Type: application/json" -d '{
26+
"text": ["ପାମ ମିଶନରୀ ଉପରେ", "ରିହାତି ଧୈର୍ଯ ହୋଇଛି"]
27+
}' http://localhost:8000/
28+
29+
curl -X PUT -H "Content-Type: application/json" -d '{
30+
"text": "go-sugem",
31+
"lang" : "eng"
32+
}' http://localhost:8000/
33+
34+
curl -X PUT -H "Content-Type: application/json" -d '{
35+
"text": ["how to apply for", "scheme for my paddi crop"],
36+
"lang" : "eng"
37+
}' http://localhost:8000/
+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
from .request import *
2+
from .model import *

src/spell_check/spello/local/api.py

+46
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
from model import Model
2+
from request import ModelRequest
3+
from quart import Quart, request
4+
import aiohttp
5+
6+
app = Quart(__name__)
7+
8+
model = None
9+
10+
freq_dict_paths = {
11+
'ory': 'freq_dict.txt',
12+
'eng': 'freq_dict_eng.txt'
13+
}
14+
15+
spello_model_paths = {
16+
'ory': 'spello_model.pkl',
17+
'eng': 'spello_model_eng.pkl'
18+
}
19+
20+
21+
@app.before_serving
async def startup():
    """Create the shared HTTP session and load the spello model once,
    before the server starts accepting requests."""
    app.client = aiohttp.ClientSession()
    global model
    model = Model(app, freq_dict_paths)
26+
27+
@app.route('/', methods=['POST'])
async def infer():
    """POST handler: run spell-check inference on the request's text."""
    global model
    payload = await request.get_json()
    inference_request = ModelRequest(**payload)
    return await model.inference(inference_request)
34+
35+
@app.route('/', methods=['PUT'])
async def update():
    """PUT handler: feed the request's text back into the model so it
    learns new words."""
    global model
    payload = await request.get_json()
    update_request = ModelRequest(**payload)
    return await model.update(update_request)
43+
44+
45+
if __name__ == "__main__":
46+
app.run()

0 commit comments

Comments
 (0)