Skip to content

Commit c662fe6

Browse files
committed
Add an aggregated counting interface.
1 parent 53842f7 commit c662fe6

File tree

3 files changed

+127
-0
lines changed

3 files changed

+127
-0
lines changed

AutomatonSearchIter.c

Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010

1111
#include "AutomatonSearchIter.h"
1212
#include <wctype.h>
13+
/* Counter element type of the caller-supplied results buffer: all() checks the
 * buffer's item size against sizeof(opt_int) and casts the buffer to opt_int*. */
typedef uint64_t opt_int;
1314

1415
static PyTypeObject automaton_search_iter_type;
1516

@@ -197,6 +198,31 @@ automaton_build_output(PyObject* self, PyObject** result) {
197198
}
198199

199200

201+
static int
202+
automaton_build_output_optimized(PyObject* self, opt_int* results, int results_size) {
203+
TrieNode* node;
204+
int value;
205+
206+
while (iter->output && !iter->output->eow) {
207+
iter->output = iter->output->fail;
208+
}
209+
210+
if (iter->output) {
211+
node = iter->output;
212+
iter->output = iter->output->fail;
213+
value = node->output.integer;
214+
if ((0 <= value) && (value < results_size)) {
215+
results[value]++;
216+
return OutputValue;
217+
} else {
218+
PyErr_SetString(PyExc_ValueError, "results buffer is too small");
219+
return OutputError;
220+
}
221+
} else {
222+
return OutputNone;
223+
}
224+
}
225+
200226

201227
#ifdef VARIABLE_LEN_CHARCODES
202228
static bool
@@ -300,6 +326,69 @@ automaton_search_iter_next(PyObject* self) {
300326
}
301327

302328

329+
static int
330+
automaton_search_iter_next_optimized(PyObject* self, opt_int* results, int results_size) {
331+
if (iter->version != iter->automaton->version) {
332+
PyErr_SetString(PyExc_ValueError, "underlaying automaton has changed, iterator is not valid anymore");
333+
return 0;
334+
}
335+
336+
if (iter->automaton->store != STORE_INTS) {
337+
PyErr_SetString(PyExc_ValueError, "underlaying automaton is not an integer key store");
338+
return 0;
339+
}
340+
341+
return_output:
342+
switch (automaton_build_output_optimized(self, results, results_size)) {
343+
case OutputValue:
344+
return 1;
345+
346+
case OutputNone:
347+
break;
348+
349+
case OutputError:
350+
return 0;
351+
}
352+
353+
#ifdef VARIABLE_LEN_CHARCODES
354+
if (!automaton_search_iter_advance_index(self)) {
355+
return 0;
356+
}
357+
#else
358+
iter->index += 1;
359+
if (iter->ignore_white_space) {
360+
while (iswspace(iter->input.word[iter->index]) && (iter->index < iter->end)) {
361+
iter->index += 1;
362+
}
363+
}
364+
#endif
365+
while (iter->index < iter->end) {
366+
// process single char
367+
iter->state = ahocorasick_next(
368+
iter->state,
369+
iter->automaton->root,
370+
iter->input.word[iter->index]
371+
);
372+
373+
ASSERT(iter->state);
374+
375+
iter->output = iter->state;
376+
goto return_output;
377+
378+
#ifdef VARIABLE_LEN_CHARCODES
379+
if (!automaton_search_iter_advance_index(self)) {
380+
return 0;
381+
}
382+
#else
383+
iter->index += 1;
384+
#endif
385+
386+
} // while
387+
388+
return 0; // StopIteration
389+
}
390+
391+
303392
static PyObject*
304393
automaton_search_iter_set(PyObject* self, PyObject* args) {
305394
PyObject* object;
@@ -367,12 +456,36 @@ automaton_search_iter_set(PyObject* self, PyObject* args) {
367456
}
368457

369458

459+
/*
 * all(array): count every remaining match into a caller-supplied buffer.
 *
 * `args` must hold one object exposing a writable buffer ("w*") whose item
 * size equals sizeof(opt_int). Each remaining match increments one element of
 * that buffer (see automaton_search_iter_next_optimized).
 *
 * Returns a Python int with the number of matches counted, or NULL with an
 * exception set when argument parsing fails, the item size is wrong, or the
 * search itself raised.
 */
static PyObject*
automaton_search_iter_all(PyObject* self, PyObject* args) {
    Py_buffer buf;
    long out = 0;

    if (!PyArg_ParseTuple(args, "w*", &buf)) {
        return NULL;
    }

    if (buf.itemsize != sizeof(opt_int)) {
        PyErr_SetString(PyExc_ValueError, "invalid buffer type (expected int64)");
        PyBuffer_Release(&buf);
        return NULL;
    }

    // NOTE(review): the element count is narrowed to the helper's `int`
    // parameter; buffers with more than INT_MAX items are not handled specially.
    while (automaton_search_iter_next_optimized(self, (opt_int*) buf.buf, buf.len / buf.itemsize)) {
        out++;
    }

    PyBuffer_Release(&buf);

    // The helper returns 0 both at the end of input and on failure; only the
    // pending exception distinguishes them. Returning a value while an
    // exception is set would violate the C-API contract.
    if (PyErr_Occurred()) {
        return NULL;
    }

    return PyLong_FromLong(out);
}

481+
370482
#undef iter
371483

372484
#define method(name, kind) {#name, automaton_search_iter_##name, kind, automaton_search_iter_##name##_doc}
373485

374486
static
375487
PyMethodDef automaton_search_iter_methods[] = {
488+
method(all, METH_VARARGS),
376489
method(set, METH_VARARGS),
377490

378491
{NULL, NULL, 0, NULL}

docs/automaton_search_iter_all.rst

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
all(array)
2+
----------------------------------------------------------------------
3+
4+
Collect all remaining match counts in a buffer.
5+
6+
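Increments array[v] once for every remaining match whose stored integer value is v. Existing contents of array are not cleared. Returns the number of matches counted.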

src/inline_doc.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -236,6 +236,14 @@
236236
"Automaton. This iterator can be manipulated through its\n" \
237237
"set() method."
238238

239+
/* Docstring of the search iterator's all() method. */
#define automaton_search_iter_all_doc \
    "all(array)\n" \
    "\n" \
    "Collect all remaining match counts in a buffer.\n" \
    "\n" \
    "Increments array[v] once for every remaining match whose\n" \
    "stored integer value is v. Existing contents of array are\n" \
    "not cleared. Returns the number of matches counted."

246+
239247
#define automaton_search_iter_set_doc \
240248
"set(string, reset=False)\n" \
241249
"\n" \

0 commit comments

Comments
 (0)