@@ -175,6 +175,86 @@ public List<SegToken> process(String paragraph, SegMode mode) {
175
175
return tokens ;
176
176
}
177
177
178
+ public List <SegToken > processExtend (String paragraph , SegMode mode ) {
179
+ List <SegToken > tokens = new ArrayList <SegToken >();
180
+ StringBuilder sb = new StringBuilder ();
181
+ int offset = 0 ;
182
+ for (int i = 0 ; i < paragraph .length (); ++i ) {
183
+ char ch = CharacterUtil .regularize (paragraph .charAt (i ));
184
+ if (CharacterUtil .ccFindExtend (ch ))
185
+ sb .append (ch );
186
+ else {
187
+ if (sb .length () > 0 ) {
188
+ // process
189
+ if (mode == SegMode .SEARCH ) {
190
+ for (String word : sentenceProcess (sb .toString ())) {
191
+ tokens .add (new SegToken (word , offset , offset += word .length ()));
192
+ }
193
+ }
194
+ else {
195
+ for (String token : sentenceProcess (sb .toString ())) {
196
+ if (token .length () > 2 ) {
197
+ String gram2 ;
198
+ int j = 0 ;
199
+ for (; j < token .length () - 1 ; ++j ) {
200
+ gram2 = token .substring (j , j + 2 );
201
+ if (wordDict .containsWord (gram2 ))
202
+ tokens .add (new SegToken (gram2 , offset + j , offset + j + 2 ));
203
+ }
204
+ }
205
+ if (token .length () > 3 ) {
206
+ String gram3 ;
207
+ int j = 0 ;
208
+ for (; j < token .length () - 2 ; ++j ) {
209
+ gram3 = token .substring (j , j + 3 );
210
+ if (wordDict .containsWord (gram3 ))
211
+ tokens .add (new SegToken (gram3 , offset + j , offset + j + 3 ));
212
+ }
213
+ }
214
+ tokens .add (new SegToken (token , offset , offset += token .length ()));
215
+ }
216
+ }
217
+ sb = new StringBuilder ();
218
+ offset = i ;
219
+ }
220
+ if (wordDict .containsWord (paragraph .substring (i , i + 1 )))
221
+ tokens .add (new SegToken (paragraph .substring (i , i + 1 ), offset , ++offset ));
222
+ else
223
+ tokens .add (new SegToken (paragraph .substring (i , i + 1 ), offset , ++offset ));
224
+ }
225
+ }
226
+ if (sb .length () > 0 )
227
+ if (mode == SegMode .SEARCH ) {
228
+ for (String token : sentenceProcess (sb .toString ())) {
229
+ tokens .add (new SegToken (token , offset , offset += token .length ()));
230
+ }
231
+ }
232
+ else {
233
+ for (String token : sentenceProcess (sb .toString ())) {
234
+ if (token .length () > 2 ) {
235
+ String gram2 ;
236
+ int j = 0 ;
237
+ for (; j < token .length () - 1 ; ++j ) {
238
+ gram2 = token .substring (j , j + 2 );
239
+ if (wordDict .containsWord (gram2 ))
240
+ tokens .add (new SegToken (gram2 , offset + j , offset + j + 2 ));
241
+ }
242
+ }
243
+ if (token .length () > 3 ) {
244
+ String gram3 ;
245
+ int j = 0 ;
246
+ for (; j < token .length () - 2 ; ++j ) {
247
+ gram3 = token .substring (j , j + 3 );
248
+ if (wordDict .containsWord (gram3 ))
249
+ tokens .add (new SegToken (gram3 , offset + j , offset + j + 3 ));
250
+ }
251
+ }
252
+ tokens .add (new SegToken (token , offset , offset += token .length ()));
253
+ }
254
+ }
255
+
256
+ return tokens ;
257
+ }
178
258
179
259
/*
180
260
*
0 commit comments