-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtrain.py
119 lines (94 loc) · 3.56 KB
/
train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import random
import spacy
from spacy.tokens import DocBin
from spacy.training.example import Example
from misc import TRAINING_EPOCHS, MODEL_NAME, load_json_dataset
# Initialize a blank French pipeline (no pretrained components).
nlp = spacy.blank("fr")

# Ensure an NER component exists on the pipeline (appended last if missing).
if "ner" not in nlp.pipe_names:
    ner = nlp.add_pipe("ner", last=True)
else:
    ner = nlp.get_pipe("ner")

# Load the annotated dataset and split it 80/20 into train/eval sets.
# NOTE(review): load_json_dataset comes from the project-local `misc` module;
# based on how entries are consumed below it presumably returns a list of
# (text, {"entities": [(start, end, label), ...]}) pairs — confirm in misc.py.
DATASET = load_json_dataset('train.json')
random.shuffle(DATASET)
SPLIT_INDEX = int(0.8 * len(DATASET))
TRAIN_SET = DATASET[:SPLIT_INDEX]
EVAL_SET = DATASET[SPLIT_INDEX:]
print(f"Training set size: {len(TRAIN_SET)}")
print(f"Evaluation set size: {len(EVAL_SET)}")
def align_entities(text, entities):
    """Snap raw character offsets to SpaCy token boundaries.

    The previous implementation searched for a single token whose text
    exactly equaled the entity text, which silently dropped every
    multi-token entity and could remap an entity to the wrong occurrence
    of a repeated word. Using ``Doc.char_span`` with
    ``alignment_mode="contract"`` aligns arbitrary spans correctly.

    Parameters
    ----------
    text : str
        The raw document text.
    entities : iterable of (int, int, str)
        ``(start, end, label)`` character-offset triples.

    Returns
    -------
    list of (int, int, str)
        Offsets contracted to token boundaries; entities that do not
        cover at least one full token are dropped (as before, unalignable
        entities are omitted rather than raising).
    """
    doc = nlp.make_doc(text)
    aligned = []
    for start, end, label in entities:
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is not None:
            aligned.append((span.start_char, span.end_char, label))
    return aligned
def training():
    """Train the NER component on TRAIN_SET and save the pipeline.

    Uses the module-level ``nlp``, ``ner``, and ``TRAIN_SET`` globals and
    writes the trained pipeline to ``MODEL_NAME``. Unlike the original,
    it aligns entity offsets once up front instead of re-aligning (and
    mutating the shared dataset in place) on every epoch.
    """
    # Register every entity label present in the training annotations.
    for _, annotations in TRAIN_SET:
        for ent in annotations["entities"]:
            ner.add_label(ent[2])  # ent = (start, end, label)

    # Sanity check: count how many raw examples produce valid char spans.
    # The DocBin is only used for this count (mirrors original behavior);
    # it is never serialized.
    db = DocBin()
    for text, annotations in TRAIN_SET:
        doc = nlp.make_doc(text)
        spans = []
        for start, end, label in annotations["entities"]:
            span = doc.char_span(start, end, label=label)
            if span:
                spans.append(span)
        try:
            doc.ents = spans  # raises ValueError on overlapping spans
            db.add(doc)
        except ValueError:
            continue
    print(f"Used {len(db)} out of {len(TRAIN_SET)} examples")

    # Initialize weights; the returned optimizer is unused because
    # nlp.update() manages its own by default.
    nlp.initialize()

    # Align offsets to token boundaries once, outside the epoch loop,
    # without mutating TRAIN_SET entries in place.
    aligned_set = [
        (text, {"entities": align_entities(text, annotations["entities"])})
        for text, annotations in TRAIN_SET
    ]

    for epoch in range(TRAINING_EPOCHS):
        losses = {}
        random.shuffle(aligned_set)  # new example order each epoch
        for text, annotations in aligned_set:
            try:
                doc = nlp.make_doc(text)
                example = Example.from_dict(doc, annotations)
                nlp.update([example], drop=0.3, losses=losses)
            except ValueError:
                # Skip examples whose annotations SpaCy still rejects;
                # narrower than the original bare `except Exception`.
                continue
        print(f"Epoch {epoch+1} Loss: {losses}")

    nlp.to_disk(MODEL_NAME)
    print(f"Model saved to {MODEL_NAME}")
def evaluation():
    """Load the saved model from MODEL_NAME and report NER metrics on EVAL_SET."""
    nlp = spacy.load(MODEL_NAME)
    examples = []
    for text, annotations in EVAL_SET:
        # Build aligned reference annotations without mutating the shared
        # EVAL_SET entries in place (the original overwrote them).
        aligned = {"entities": align_entities(text, annotations["entities"])}
        try:
            # make_doc is sufficient here: Language.evaluate() runs the
            # pipeline over the predicted docs itself, so pre-processing
            # with nlp(text) did the work twice.
            example = Example.from_dict(nlp.make_doc(text), aligned)
        except ValueError:
            # Skip examples SpaCy rejects; narrower than bare `except Exception`.
            continue
        examples.append(example)
    print(f"Used {len(examples)} out of {len(EVAL_SET)} examples")

    # Score the pipeline against the gold annotations.
    scores = nlp.evaluate(examples)
    print("\n🔍 Evaluation Metrics:")
    print(f"Precision: {scores['ents_p']:.3f}")
    print(f"Recall: {scores['ents_r']:.3f}")
    print(f"F1-score: {scores['ents_f']:.3f}")
    if "ents_per_type" in scores:
        print("\n📌 Per-Entity Scores:")
        for entity, metrics in scores["ents_per_type"].items():
            print(f" - {entity}: Precision={metrics['p']:.3f}, Recall={metrics['r']:.3f}, F1={metrics['f']:.3f}")
# Script entry point: train the model first, then evaluate the saved copy.
if __name__ == "__main__":
    training()
    evaluation()