@@ -45,7 +45,9 @@ def get_token_span(self, span: tuple[int, int]) -> Span:
45
45
46
46
47
47
class RegexpTagger :
48
- def __init__ (self , mapping : Union [list [tuple [str , str ]], tuple [str , str ]]) -> None :
48
+ def __init__ (
49
+ self , mapping : Union [list [Union [tuple [str , str ], tuple [str , str , int ]]], tuple [str , str ], tuple [str , str , int ]]
50
+ ) -> None :
49
51
r"""This tagger is capable of tagging sentence objects with given regexp -> label mappings.
50
52
51
53
I.e: The tuple (r'(["\'])(?:(?=(\\?))\2.)*?\1', 'QUOTE') maps every match of the regexp to
@@ -58,24 +60,33 @@ def __init__(self, mapping: Union[list[tuple[str, str]], tuple[str, str]]) -> No
58
60
Args:
59
61
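mapping: A list of tuples or a single tuple representing a mapping as regexp -> label, optionally followed by the index of the capture group whose span should be labeled (defaults to 0, i.e. the whole match)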
60
62
"""
61
- self ._regexp_mapping : dict [str , typing .Pattern ] = {}
63
+ self ._regexp_mapping : list [str , typing .Pattern , int ] = []
62
64
self .register_labels (mapping = mapping )
63
65
66
def label_type(self) -> typing.Optional[str]:
    """Return the label of the first registered mapping.

    Entries are stored as ``(pattern, label, group)`` tuples in registration
    order. Only the first entry's label is reported, even when mappings for
    several different labels are registered.

    Returns:
        The first registered label, or ``None`` when no mapping is registered.
    """
    if not self._regexp_mapping:
        return None
    # Unpack the full entry so a malformed (non 3-tuple) entry still fails loudly.
    _pattern, label, _group = self._regexp_mapping[0]
    return label
69
+
64
70
@property
def registered_labels(self):
    """Expose the mappings currently registered on this tagger.

    Returns:
        The tagger's internal mapping collection itself (not a copy).
    """
    registered = self._regexp_mapping
    return registered
67
73
68
- def register_labels (self , mapping : Union [list [tuple [str , str ]], tuple [str , str ]]):
74
+ def register_labels (self , mapping : Union [list [tuple [str , str , int ]], tuple [str , str , int ]]):
69
75
"""Register a regexp -> label mapping.
70
76
71
77
Args:
72
78
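mapping: A list of tuples or a single tuple representing a mapping as regexp -> label, optionally followed by the index of the capture group whose span should be labeled (defaults to 0, i.e. the whole match)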
73
79
"""
74
80
mapping = self ._listify (mapping )
75
81
76
- for regexp , label in mapping :
82
+ for entry in mapping :
83
+ regexp = entry [0 ]
84
+ label = entry [1 ]
85
+ group = entry [2 ] if len (entry ) > 2 else 0
77
86
try :
78
- self ._regexp_mapping [label ] = re .compile (regexp )
87
+ pattern = re .compile (regexp )
88
+ self ._regexp_mapping .append ((pattern , label , group ))
89
+
79
90
except re .error as err :
80
91
raise re .error (
81
92
f"Couldn't compile regexp '{ regexp } ' for label '{ label } '. Aborted with error: '{ err .msg } '"
@@ -89,10 +100,7 @@ def remove_labels(self, labels: Union[list[str], str]):
89
100
"""
90
101
labels = self ._listify (labels )
91
102
92
- for label in labels :
93
- if not self ._regexp_mapping .get (label ):
94
- continue
95
- self ._regexp_mapping .pop (label )
103
+ self ._regexp_mapping = [mapping for mapping in self ._regexp_mapping if mapping [1 ] not in labels ]
96
104
97
105
@staticmethod
98
106
def _listify (element : object ) -> list :
@@ -120,9 +128,11 @@ def _label(self, sentence: Sentence):
120
128
"""
121
129
collection = TokenCollection (sentence )
122
130
123
- for label , pattern in self ._regexp_mapping . items () :
131
+ for pattern , label , group in self ._regexp_mapping :
124
132
for match in pattern .finditer (sentence .to_original_text ()):
125
- span : tuple [int , int ] = match .span ()
133
+ # print(match)
134
+ span : tuple [int , int ] = match .span (group )
135
+ # print(span)
126
136
try :
127
137
token_span = collection .get_token_span (span )
128
138
except ValueError :
0 commit comments