
Commit 91bb671

Merge pull request #317 from Shubh-Goyal-07/restructure

Updates NER

2 parents f36e68e + 94885de

File tree: 7 files changed, +282 −64 lines

src/ner/agri_ner_akai/local/Dockerfile (+2 −1)

@@ -3,11 +3,12 @@ FROM python:3.9-slim
 
 WORKDIR /app
 
-
 #install requirements
 COPY requirements.txt requirements.txt
 RUN pip3 install -r requirements.txt
 
+RUN python -m spacy download en_core_web_sm
+
 # Copy the rest of the application code to the working directory
 COPY . /app/
 EXPOSE 8000
src/ner/agri_ner_akai/local/README.md (+28 −7)

@@ -1,21 +1,42 @@
 ## NER:
 
-
 ### Purpose :
+
 Model to detect
+
 - crops
 - pests
-- seed type
+- seed type
+- email
+- time
+- phone numbers
+- numbers with units
+- dates
 
+### Testing the model deployment :
 
-### Testing the model deployment :
-To run for testing just the Hugging Face deployment for grievence recognition, you can follow the following steps :
+To test just the Hugging Face deployment for grievance recognition, follow these steps:
 
 - Git clone the repo
-- Go to current folder location i.e. ``` cd /src/ner/agri_ner_akai/local ```
-- Create docker image file and test the api:
+- Go to the current folder, i.e. ``cd /src/ner/agri_ner_akai/local``
+- Build the docker image and test the API:
+
 ```
 docker build -t testmodel .
 docker run -p 8000:8000 testmodel
-curl -X POST -H "Content-Type: application/json" -d '{"text": "What are tomatoes and potaotes that are being attacked by aphids? "}' http://localhost:8000/
+```
+
+### **Request**
+
+```
+curl -X POST -H "Content-Type: application/json" -d '{
+"text": "What are tomatoes and potaotes that are being attacked by aphids will be treated next monday?",
+"type": ["email", "CROP"]
+}' http://localhost:8000/
+```
+
+```
+curl -X POST -H "Content-Type: application/json" -d '{
+"text": "What are tomatoes and potaotes that are being attacked by aphids? "
+}' http://localhost:8000/
 ```
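For reference, the same request from Python rather than curl; a hedged sketch (assumes the service is running on localhost:8000 as in the docker run line above, and that the response is the grouped-entity dict model.py builds):

import requests

payload = {
    "text": "What are tomatoes and potaotes that are being attacked by aphids will be treated next monday?",
    "type": ["email", "CROP"],  # optional; omit it to get every entity group back
}
resp = requests.post("http://localhost:8000/", json=payload)
print(resp.json())  # e.g. {"CROP": [{"name": ..., "start": ..., "end": ..., "score": ...}, ...]}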
src/ner/agri_ner_akai/local/bert_ner.py (+68, new file)

@@ -0,0 +1,68 @@
+from transformers import pipeline
+from request import ModelRequest
+
+class BertNERModel():
+    def __new__(cls):
+        if not hasattr(cls, 'instance'):
+            cls.instance = super(BertNERModel, cls).__new__(cls)
+            cls.nlp_ner = pipeline("ner", model="GautamR/akai_ner", tokenizer="GautamR/akai_ner")
+        return cls.instance
+
+    def inference(self, sentence):
+        entities = self.nlp_ner(sentence)
+        return self.aggregate_entities(sentence, entities)
+
+    @staticmethod
+    def aggregate_entities(sentence, entity_outputs):
+        aggregated_entities = []
+        current_entity = None
+
+        for entity in entity_outputs:
+            entity_type = entity["entity"].split("-")[-1]
+
+            # Handle subwords
+            if entity["word"].startswith("##"):
+                # If we encounter an I-PEST or any other I- entity
+                if "I-" in entity["entity"]:
+                    if current_entity: # Add previous entity
+                        aggregated_entities.append(current_entity)
+
+                    word_start = sentence.rfind(" ", 0, entity["start"]) + 1
+                    word_end = sentence.find(" ", entity["end"])
+                    if word_end == -1:
+                        word_end = len(sentence)
+
+                    current_entity = {
+                        "entity_group": entity_type,
+                        "score": float(entity["score"]),
+                        "word": sentence[word_start:word_end].replace('.','').replace('?',''),
+                        "start": float(word_start),
+                        "end": float(word_end)
+                    }
+                    aggregated_entities.append(current_entity)
+                    current_entity = None
+
+                else:
+                    if current_entity:
+                        # If it's a subword but not an I- entity
+                        current_entity["word"] += entity["word"][2:]
+                        current_entity["end"] = entity["end"]
+                        current_entity["score"] = float((current_entity["score"] + entity["score"]) / 2) # averaging scores
+
+            # Handle full words
+            else:
+                if current_entity:
+                    aggregated_entities.append(current_entity)
+
+                current_entity = {
+                    "entity_group": entity_type,
+                    "score": float(entity["score"]),
+                    "word": entity["word"],
+                    "start": float(entity["start"]),
+                    "end": float(entity["end"])
+                }
+
+        if current_entity:
+            aggregated_entities.append(current_entity)
+
+        return aggregated_entities
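Two small observations: the `ModelRequest` import is unused here, and the pairwise `(a + b) / 2` score update is a running average that weights later subwords more heavily than a uniform mean would. A hedged smoke test of the class as committed (assumes it runs from src/ner/agri_ner_akai/local so the local import resolves, and that the GautamR/akai_ner checkpoint is reachable):

# Hypothetical local smoke test; not part of the commit.
from bert_ner import BertNERModel

model = BertNERModel()   # first call builds and caches the HF pipeline on the class
again = BertNERModel()   # __new__ then returns the same cached instance
assert model is again

print(model.inference("Aphids are attacking my tomatoes"))
# Expected shape: [{"entity_group": ..., "score": ..., "word": ..., "start": ..., "end": ...}, ...]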

src/ner/agri_ner_akai/local/model.py (+37 −55)

@@ -1,69 +1,51 @@
 from transformers import pipeline
 from request import ModelRequest
+from regex_parse_ner import RegNERModel
+from bert_ner import BertNERModel
 
 class Model():
-    def __new__(cls, context):
-        cls.context = context
-        if not hasattr(cls, 'instance'):
-            cls.instance = super(Model, cls).__new__(cls)
-            cls.nlp_ner = pipeline("ner", model="GautamR/akai_ner", tokenizer="GautamR/akai_ner")
-        return cls.instance
+    def __init__(self, context):
+        self.context = context
+        print("Loading models...")
+        self.regex_model = RegNERModel()
+        print("Regex model loaded successfully")
+        self.bert_model = BertNERModel()
+        print("Bert model loaded successfully")
 
-    async def inference(self, request: ModelRequest):
-        entities = self.nlp_ner(request.text)
-        return self.aggregate_entities(request.text, entities)
+    def combine_entities(self, reg_entities, bert_entities):
+        combined_entities = reg_entities
+
+        for entity in bert_entities:
+            if entity['entity_group'] not in combined_entities:
+                combined_entities[entity['entity_group']] = []
 
-    @staticmethod
-    def aggregate_entities(sentence, entity_outputs):
-        aggregated_entities = []
-        current_entity = None
+            entity_info = {
+                'name': entity['word'],
+                'start': entity['start'],
+                'end': entity['end'],
+                'score': entity['score']
+            }
 
-        for entity in entity_outputs:
-            entity_type = entity["entity"].split("-")[-1]
+            combined_entities[entity['entity_group']].append(entity_info)
 
-            # Handle subwords
-            if entity["word"].startswith("##"):
-                # If we encounter an I-PEST or any other I- entity
-                if "I-" in entity["entity"]:
-                    if current_entity: # Add previous entity
-                        aggregated_entities.append(current_entity)
-
-                    word_start = sentence.rfind(" ", 0, entity["start"]) + 1
-                    word_end = sentence.find(" ", entity["end"])
-                    if word_end == -1:
-                        word_end = len(sentence)
+        return combined_entities
+
+    async def inference(self, request: ModelRequest):
+        sentence = request.text
+        types = request.type
 
-                    current_entity = {
-                        "entity_group": entity_type,
-                        "score": float(entity["score"]),
-                        "word": sentence[word_start:word_end].replace('.','').replace('?',''),
-                        "start": float(word_start),
-                        "end": float(word_end)
-                    }
-                    aggregated_entities.append(current_entity)
-                    current_entity = None
+        reg_entities = self.regex_model.inference(sentence)
+        bert_entities = self.bert_model.inference(sentence)
 
-                else:
-                    if current_entity:
-                        # If it's a subword but not an I- entity
-                        current_entity["word"] += entity["word"][2:]
-                        current_entity["end"] = entity["end"]
-                        current_entity["score"] = float((current_entity["score"] + entity["score"]) / 2) # averaging scores
+        combined_entities = self.combine_entities(reg_entities, bert_entities)
 
-            # Handle full words
-            else:
-                if current_entity:
-                    aggregated_entities.append(current_entity)
+        final_entities = {}
 
-                current_entity = {
-                    "entity_group": entity_type,
-                    "score": float(entity["score"]),
-                    "word": entity["word"],
-                    "start": float(entity["start"]),
-                    "end": float(entity["end"])
-                }
+        if types is None:
+            return combined_entities
 
-        if current_entity:
-            aggregated_entities.append(current_entity)
+        for entity_group in combined_entities:
+            if entity_group in types:
+                final_entities[entity_group] = combined_entities[entity_group]
 
-        return aggregated_entities
+        return final_entities
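Note that `combine_entities` starts from `combined_entities = reg_entities`, so it appends the BERT entities into the regex model's own output dict rather than a copy. That works for a single request but mutates the input in place; a hedged non-mutating sketch (my naming, not part of the commit):

import copy

def combine_entities(reg_entities, bert_entities):
    # Deep-copy so the regex model's output is left untouched.
    combined = copy.deepcopy(reg_entities)
    for entity in bert_entities:
        combined.setdefault(entity['entity_group'], []).append({
            'name': entity['word'],
            'start': entity['start'],
            'end': entity['end'],
            'score': entity['score'],
        })
    return combined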
src/ner/agri_ner_akai/local/regex_parse_ner.py (+143, new file)

@@ -0,0 +1,143 @@
+import re
+import spacy
+from datetime import datetime, timedelta
+
+class RegNERModel():
+    def __init__(self):
+        self.nlp = spacy.load("en_core_web_sm")
+
+        print("Model loaded successfully")
+
+    def detect_email(self, sentence):
+        email_regex_pattern = '[A-Za-z0-9._%+-]*@[A-Za-z0-9.-]*\.[A-Z|a-z]*'
+        emails_matches = []
+
+        for match in re.finditer(email_regex_pattern, sentence):
+            emails_matches.append( {"name": match.group(), "start": match.start(), "end": match.end(), "score": 1.0} )
+
+        return emails_matches
+
+    def detect_time(self, sentence):
+        time_regex = r'\b(?:1[0-2]|0?[1-9])(?::[0-5][0-9])?(?:\s?[ap]m)?\b'
+        times = []
+
+        for match in re.finditer(time_regex, sentence, re.IGNORECASE):
+            times.append( {"name": match.group(), "start": match.start(), "end": match.end(), "score": 1.0} )
+
+        return times
+
+    def detect_phone_numbers(self, sentence):
+        phone_regex = r'(\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4})'
+
+        phone_numbers = []
+        for match in re.finditer(phone_regex, sentence):
+            phone_numbers.append( {"name": match.group(), "start": match.start(), "end": match.end(), "score": 1.0} )
+
+        return phone_numbers
+
+    def detect_numbers_with_units(self, sentence, phone_numbers):
+        number_unit_regex = r'(?<!\d)(\d+(?:\.\d+)?)(?:\s+)(\w+)(?!\d)'
+
+        numbers_with_units = []
+
+        for match in re.finditer(number_unit_regex, sentence):
+            number, unit = match.groups()
+            if number not in phone_numbers:
+                numbers_with_units.append( {"name": f"{number} {unit}", "start": match.start(), "end": match.end(), "score": 1.0} )
+
+        return numbers_with_units
+
+    def detect_dates(self, sentence):
+        # Current date
+        today = datetime.now()
+
+        # Define regex patterns for relative date expressions
+        patterns = [
+            r"(next|agle)\s+(monday|tuesday|wednesday|thursday|friday|saturday|sunday|somvar|mangalwar|budhwar|guruwar|shukrawar|shaniwar|raviwar)",
+            r"(kal)",
+            r"(next|agle)\s+(week|month|year|hafte|mahine|saal)"
+        ]
+
+        # Initialize empty list to store detected dates
+        detected_dates = []
+
+        # Iterate through patterns and search for matches in text
+        for pattern in patterns:
+            for matchdates in re.finditer(pattern, sentence.lower()):
+                match = matchdates.groups()
+                if match[0] in ['next', 'agle']:
+                    if match[1] in ['monday', 'somvar']:
+                        # Find next Monday
+                        days_until_weekday = (today.weekday() - 1) % 7
+                        next_date = today + timedelta(days=days_until_weekday)
+                        detected_dates.append({"name": next_date.strftime("%d-%m-%Y"), "start": matchdates.start(), "end": matchdates.end(), "score": 1.0})
+                    elif match[1] in ['tuesday', 'mangalwar']:
+                        # Find next Tuesday
+                        days_until_weekday = (today.weekday() - 0) % 7
+                        next_date = today + timedelta(days=days_until_weekday)
+                        detected_dates.append({"name": next_date.strftime("%d-%m-%Y"), "start": matchdates.start(), "end": matchdates.end(), "score": 1.0})
+                    elif match[1] in ['wednesday', 'budhwar']:
+                        # Find next Wednesday
+                        days_until_weekday = (today.weekday() + 1) % 7
+                        next_date = today + timedelta(days=days_until_weekday)
+                        detected_dates.append({"name": next_date.strftime("%d-%m-%Y"), "start": matchdates.start(), "end": matchdates.end(), "score": 1.0})
+                    elif match[1] in ['thursday', 'guruwar']:
+                        # Find next Thursday
+                        days_until_weekday = (today.weekday() + 2) % 7
+                        next_date = today + timedelta(days=days_until_weekday)
+                        detected_dates.append({"name": next_date.strftime("%d-%m-%Y"), "start": matchdates.start(), "end": matchdates.end(), "score": 1.0})
+                    elif match[1] in ['friday', 'shukrawar']:
+                        # Find next Friday
+                        days_until_weekday = (today.weekday() + 3) % 7
+                        next_date = today + timedelta(days=days_until_weekday)
+                        detected_dates.append({"name": next_date.strftime("%d-%m-%Y"), "start": matchdates.start(), "end": matchdates.end(), "score": 1.0})
+                    elif match[1] in ['saturday', 'shaniwar']:
+                        # Find next Saturday
+                        days_until_weekday = (today.weekday() + 4) % 7
+                        next_date = today + timedelta(days=days_until_weekday)
+                        detected_dates.append({"name": next_date.strftime("%d-%m-%Y"), "start": matchdates.start(), "end": matchdates.end(), "score": 1.0})
+                    elif match[1] in ['sunday', 'raviwar']:
+                        # Find next Sunday
+                        days_until_weekday = (today.weekday() + 5) % 7
+                        next_date = today + timedelta(days=days_until_weekday)
+                        detected_dates.append({"name": next_date.strftime("%d-%m-%Y"), "start": matchdates.start(), "end": matchdates.end(), "score": 1.0})
+                    elif match[1] in ['week', 'hafte']:
+                        # Find next week
+                        next_date = today + timedelta(days=(7 - today.weekday()) + 6)
+                        detected_dates.append({"name": next_date.strftime("%d-%m-%Y"), "start": matchdates.start(), "end": matchdates.end(), "score": 1.0})
+                    elif match[1] in ['month', 'mahine']:
+                        # Find next month
+                        next_date = today.replace(day=1, month=today.month+1)
+                        detected_dates.append({"name": next_date.strftime("%d-%m-%Y"), "start": matchdates.start(), "end": matchdates.end(), "score": 1.0})
+                    elif match[1] in ['year', 'saal']:
+                        # Find next year
+                        next_date = today.replace(day=1, month=1, year=today.year+1)
+                        detected_dates.append({"name": next_date.strftime("%d-%m-%Y"), "start": matchdates.start(), "end": matchdates.end(), "score": 1.0})
+                elif match[0] == 'kal':
+                    # Find tomorrow's date
+                    next_date = today + timedelta(1)
+                    detected_dates.append({"name": next_date.strftime("%d-%m-%Y"), "start": matchdates.start(), "end": matchdates.end(), "score": 1.0})
+
+        return detected_dates
+
+    def inference(self, sentence):
+        detected_emails = self.detect_email(sentence)
+        detected_time = self.detect_time(sentence)
+        detected_phone_numbers = self.detect_phone_numbers(sentence)
+        detected_number_units = self.detect_numbers_with_units(sentence, detected_phone_numbers)
+        detected_dates = self.detect_dates(sentence)
+
+        aggregated_entities = {}
+
+        if detected_emails:
+            aggregated_entities["email"] = detected_emails
+        if detected_time:
+            aggregated_entities["time"] = detected_time
+        if detected_phone_numbers:
+            aggregated_entities["phone_number"] = detected_phone_numbers
+        if detected_number_units:
+            aggregated_entities["number_with_unit"] = detected_number_units
+        if detected_dates:
+            aggregated_entities["date"] = detected_dates
+
+        return aggregated_entities
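Two spots in RegNERModel look worth a second pass. `detect_numbers_with_units` tests `number not in phone_numbers`, but `phone_numbers` is a list of dicts, so a bare string never matches and the filter is a no-op. And the weekday offsets in `detect_dates` don't line up with Python's `weekday()` (Monday is 0): "next monday" computes `(today.weekday() - 1) % 7`, which yields 0 days, i.e. today, whenever today is Tuesday; `today.replace(month=today.month + 1)` also raises ValueError every December. A hedged sketch of corrected date helpers (my formulas and names, not from the commit):

from datetime import datetime, timedelta

# Maps both the English and transliterated Hindi day names used in the patterns.
WEEKDAYS = {
    "monday": 0, "somvar": 0, "tuesday": 1, "mangalwar": 1,
    "wednesday": 2, "budhwar": 2, "thursday": 3, "guruwar": 3,
    "friday": 4, "shukrawar": 4, "saturday": 5, "shaniwar": 5,
    "sunday": 6, "raviwar": 6,
}

def next_weekday(today: datetime, day_name: str) -> datetime:
    # Days until the next occurrence; `or 7` pushes "today" out to next week.
    delta = (WEEKDAYS[day_name] - today.weekday()) % 7 or 7
    return today + timedelta(days=delta)

def next_month_start(today: datetime) -> datetime:
    # Roll the year over instead of raising ValueError in December.
    if today.month == 12:
        return today.replace(year=today.year + 1, month=1, day=1)
    return today.replace(month=today.month + 1, day=1)

print(next_weekday(datetime(2024, 1, 2), "monday").strftime("%d-%m-%Y"))  # 08-01-2024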

src/ner/agri_ner_akai/local/request.py (+2 −1)

@@ -3,8 +3,9 @@
 
 
 class ModelRequest():
-    def __init__(self, text):
+    def __init__(self, text, type=None):
         self.text = text
+        self.type = type
 
     def to_json(self):
         return json.dumps(self, default=lambda o: o.__dict__,
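The new optional `type` field is what model.py's inference uses to filter entity groups; a minimal usage sketch with hypothetical values (assumes it runs next to request.py):

from request import ModelRequest

# type=None (the default) keeps the old behavior: no filtering, all groups returned.
req = ModelRequest(
    text="Aphids on tomatoes, call 987 654 3210",
    type=["CROP", "phone_number"],
)
print(req.to_json())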
