Skip to content

Commit c8e4eaa

Browse files
authored Jan 9, 2024
Merge pull request #97 from iscgar/release_gil_for_bytes_haystack
Release the GIL for `BytesAhoCorasick` when the haystack is `bytes`
2 parents ab9d0b3 + 8994049 commit c8e4eaa

File tree

3 files changed

+47
-12
lines changed

3 files changed

+47
-12
lines changed
 

‎README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,7 @@ You can control the behavior by using the `store_patterns` keyword argument to `
193193
## Implementation details <a name="implementation"></a>
194194

195195
* Matching on strings releases the GIL, to enable concurrency.
196-
Matching on bytes does not currently release the GIL, but see https://github.com/G-Research/ahocorasick_rs/issues/94 for a case where it could.
196+
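Matching on bytes releases the GIL only when the haystack is a `bytes` object, which is guaranteed to be immutable; for other buffer types (e.g. `bytearray` or `memoryview`) the GIL is held for memory-safety reasons.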
197197
* Not all features from the underlying library are exposed; if you would like additional features, please [file an issue](https://github.com/g-research/ahocorasick_rs/issues/new) or submit a PR.
198198

199199
## Benchmarks <a name="benchmarks"></a>

‎src/lib.rs

+17-11
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ use pyo3::{
88
buffer::{PyBuffer, ReadOnlyCell},
99
exceptions::{PyTypeError, PyValueError},
1010
prelude::*,
11-
types::{PyList, PyUnicode},
11+
types::{PyBytes, PyList, PyUnicode},
1212
};
1313

1414
/// Search for multiple pattern strings against a single haystack string.
@@ -408,16 +408,22 @@ impl PyBytesAhoCorasick {
408408
haystack: &PyAny,
409409
overlapping: bool,
410410
) -> PyResult<Vec<(u64, usize, usize)>> {
411-
let haystack = PyBufferBytes::try_from(haystack)?;
412-
let matches = get_matches(&self_.ac_impl, haystack.as_ref(), overlapping)?;
413-
414-
// Note: we must collect here and not release the GIL or return an iterator
415-
// from this function due to the safety caveat in the implementation of
416-
// AsRef<[u8]> for PyBufferBytes, which is relevant here since the matches
417-
// iterator is holding an AsRef reference on the haystack.
418-
Ok(matches
419-
.map(|m| (m.pattern().as_u64(), m.start(), m.end()))
420-
.collect())
411+
let haystack_buffer = PyBufferBytes::try_from(haystack)?;
412+
let matches = get_matches(&self_.ac_impl, haystack_buffer.as_ref(), overlapping)?
413+
.map(|m| (m.pattern().as_u64(), m.start(), m.end()));
414+
415+
if !haystack.is_instance_of::<PyBytes>() {
416+
// Note: we must collect here and not release the GIL or return an iterator
417+
// from this function due to the safety caveat in the implementation of
418+
// AsRef<[u8]> for PyBufferBytes, which is relevant here since the matches
419+
// iterator is holding an AsRef reference to the haystack.
420+
Ok(matches.collect())
421+
} else {
422+
// However, if the haystack is a PyBytes, it's guaranteed to be immutable,
423+
// so the safety caveat doesn't apply, and we can safely release the GIL
424+
// while the matches iterator is holding a reference to the haystack.
425+
haystack.py().allow_threads(|| Ok(matches.collect()))
426+
}
421427
}
422428
}
423429

‎tests/test_ac_bytes.py

+29
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,35 @@ def test_different_byte_objects_matching(
7171
assert [haystack[s:e] for (_, s, e) in index_matches] == expected
7272

7373

74+
@pytest.mark.parametrize(
75+
"implementation",
76+
[
77+
None,
78+
Implementation.NoncontiguousNFA,
79+
Implementation.ContiguousNFA,
80+
Implementation.DFA,
81+
],
82+
)
83+
@pytest.mark.parametrize("haystack_type", [bytes, bytearray, memoryview])
84+
def test_different_byte_haystacks_matching(
85+
implementation: Optional[Implementation],
86+
haystack_type: type[bytes | bytearray | memoryview],
87+
) -> None:
88+
"""
89+
find_matches_as_indexes() returns matching patterns in the given byte string.
90+
"""
91+
haystack = haystack_type(b"hello, world, hello again")
92+
patterns = [b"hello", b"world"]
93+
ac = BytesAhoCorasick(patterns, implementation=implementation)
94+
95+
expected = [b"hello", b"world", b"hello"]
96+
97+
# find_matches_as_indexes()
98+
index_matches = ac.find_matches_as_indexes(haystack)
99+
assert [patterns[i] for (i, _, _) in index_matches] == expected
100+
assert [haystack[s:e] for (_, s, e) in index_matches] == expected
101+
102+
74103
def test_iterator_of_patterns() -> None:
75104
"""
76105
It's possible to construct ``BytesAhoCorasick()`` with an iterator.

0 commit comments

Comments
 (0)
Please sign in to comment.