|
10 | 10 |
|
11 | 11 | #include "AutomatonSearchIter.h"
|
12 | 12 | #include <wctype.h>
|
| 13 | +typedef uint64_t opt_int; |
13 | 14 |
|
14 | 15 | static PyTypeObject automaton_search_iter_type;
|
15 | 16 |
|
@@ -197,6 +198,31 @@ automaton_build_output(PyObject* self, PyObject** result) {
|
197 | 198 | }
|
198 | 199 |
|
199 | 200 |
|
| 201 | +static int |
| 202 | +automaton_build_output_optimized(PyObject* self, opt_int* results, int results_size) { |
| 203 | + TrieNode* node; |
| 204 | + int value; |
| 205 | + |
| 206 | + while (iter->output && !iter->output->eow) { |
| 207 | + iter->output = iter->output->fail; |
| 208 | + } |
| 209 | + |
| 210 | + if (iter->output) { |
| 211 | + node = iter->output; |
| 212 | + iter->output = iter->output->fail; |
| 213 | + value = node->output.integer; |
| 214 | + if ((0 <= value) && (value < results_size)) { |
| 215 | + results[value]++; |
| 216 | + return OutputValue; |
| 217 | + } else { |
| 218 | + PyErr_SetString(PyExc_ValueError, "results buffer is too small"); |
| 219 | + return OutputError; |
| 220 | + } |
| 221 | + } else { |
| 222 | + return OutputNone; |
| 223 | + } |
| 224 | +} |
| 225 | + |
200 | 226 |
|
201 | 227 | #ifdef VARIABLE_LEN_CHARCODES
|
202 | 228 | static bool
|
@@ -300,6 +326,69 @@ automaton_search_iter_next(PyObject* self) {
|
300 | 326 | }
|
301 | 327 |
|
302 | 328 |
|
| 329 | +static int |
| 330 | +automaton_search_iter_next_optimized(PyObject* self, opt_int* results, int results_size) { |
| 331 | + if (iter->version != iter->automaton->version) { |
| 332 | + PyErr_SetString(PyExc_ValueError, "underlaying automaton has changed, iterator is not valid anymore"); |
| 333 | + return 0; |
| 334 | + } |
| 335 | + |
| 336 | + if (iter->automaton->store != STORE_INTS) { |
| 337 | + PyErr_SetString(PyExc_ValueError, "underlaying automaton is not an integer key store"); |
| 338 | + return 0; |
| 339 | + } |
| 340 | + |
| 341 | +return_output: |
| 342 | + switch (automaton_build_output_optimized(self, results, results_size)) { |
| 343 | + case OutputValue: |
| 344 | + return 1; |
| 345 | + |
| 346 | + case OutputNone: |
| 347 | + break; |
| 348 | + |
| 349 | + case OutputError: |
| 350 | + return 0; |
| 351 | + } |
| 352 | + |
| 353 | +#ifdef VARIABLE_LEN_CHARCODES |
| 354 | + if (!automaton_search_iter_advance_index(self)) { |
| 355 | + return 0; |
| 356 | + } |
| 357 | +#else |
| 358 | + iter->index += 1; |
| 359 | + if (iter->ignore_white_space) { |
| 360 | + while (iswspace(iter->input.word[iter->index]) && (iter->index < iter->end)) { |
| 361 | + iter->index += 1; |
| 362 | + } |
| 363 | + } |
| 364 | +#endif |
| 365 | + while (iter->index < iter->end) { |
| 366 | + // process single char |
| 367 | + iter->state = ahocorasick_next( |
| 368 | + iter->state, |
| 369 | + iter->automaton->root, |
| 370 | + iter->input.word[iter->index] |
| 371 | + ); |
| 372 | + |
| 373 | + ASSERT(iter->state); |
| 374 | + |
| 375 | + iter->output = iter->state; |
| 376 | + goto return_output; |
| 377 | + |
| 378 | +#ifdef VARIABLE_LEN_CHARCODES |
| 379 | + if (!automaton_search_iter_advance_index(self)) { |
| 380 | + return 0; |
| 381 | + } |
| 382 | +#else |
| 383 | + iter->index += 1; |
| 384 | +#endif |
| 385 | + |
| 386 | + } // while |
| 387 | + |
| 388 | + return 0; // StopIteration |
| 389 | +} |
| 390 | + |
| 391 | + |
303 | 392 | static PyObject*
|
304 | 393 | automaton_search_iter_set(PyObject* self, PyObject* args) {
|
305 | 394 | PyObject* object;
|
@@ -367,12 +456,36 @@ automaton_search_iter_set(PyObject* self, PyObject* args) {
|
367 | 456 | }
|
368 | 457 |
|
369 | 458 |
|
| 459 | +static PyObject* |
| 460 | +automaton_search_iter_all(PyObject* self, PyObject* args) { |
| 461 | + Py_buffer buf; |
| 462 | + |
| 463 | + if (!PyArg_ParseTuple(args, "w*", &buf)) { |
| 464 | + return NULL; |
| 465 | + } |
| 466 | + |
| 467 | + if (buf.itemsize != sizeof(opt_int)) { |
| 468 | + PyErr_SetString(PyExc_ValueError, "invalid buffer type (expected int64)"); |
| 469 | + PyBuffer_Release(&buf); |
| 470 | + return NULL; |
| 471 | + } |
| 472 | + |
| 473 | + long out = 0; |
| 474 | + while (automaton_search_iter_next_optimized(self, (opt_int*) buf.buf, buf.len / buf.itemsize)) { |
| 475 | + out++; |
| 476 | + } |
| 477 | + |
| 478 | + PyBuffer_Release(&buf); |
| 479 | + return PyLong_FromLong(out); |
| 480 | +} |
| 481 | + |
370 | 482 | #undef iter
|
371 | 483 |
|
372 | 484 | #define method(name, kind) {#name, automaton_search_iter_##name, kind, automaton_search_iter_##name##_doc}
|
373 | 485 |
|
374 | 486 | static
|
375 | 487 | PyMethodDef automaton_search_iter_methods[] = {
|
| 488 | + method(all, METH_VARARGS), |
376 | 489 | method(set, METH_VARARGS),
|
377 | 490 |
|
378 | 491 | {NULL, NULL, 0, NULL}
|
|
0 commit comments