diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 58ee2c703..dfee42576 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -37,7 +37,6 @@ jobs: - name: Install dependencies run: | - sudo apt-get update && sudo apt-get -y install libhyperscan-dev librdkafka-dev pip install --upgrade pip wheel pip install .[dev] diff --git a/CHANGELOG.md b/CHANGELOG.md index ed3ad3a05..47805bc1f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,9 @@ ## next release ### Breaking + +* remove `hyperscan_resolver` processor because it is not significantly faster than the `generic_resolver` with enabled cache + ### Features ### Improvements ### Bugfix diff --git a/doc/source/configuration/processor.rst b/doc/source/configuration/processor.rst index 54165aba3..cd5c6b308 100644 --- a/doc/source/configuration/processor.rst +++ b/doc/source/configuration/processor.rst @@ -19,7 +19,6 @@ Processors .. automodule:: logprep.processor.generic_resolver.processor .. automodule:: logprep.processor.geoip_enricher.processor .. automodule:: logprep.processor.grokker.processor -.. automodule:: logprep.processor.hyperscan_resolver.processor .. automodule:: logprep.processor.ip_informer.processor .. automodule:: logprep.processor.key_checker.processor .. 
automodule:: logprep.processor.labeler.processor diff --git a/doc/source/development/architecture/diagramms/process-Combined.drawio b/doc/source/development/architecture/diagramms/process-Combined.drawio index 4750edeef..b61df2c35 100644 --- a/doc/source/development/architecture/diagramms/process-Combined.drawio +++ b/doc/source/development/architecture/diagramms/process-Combined.drawio @@ -1,11 +1,11 @@ - + - + - + @@ -112,7 +112,7 @@ - + @@ -243,7 +243,7 @@ - + @@ -322,7 +322,7 @@ - + @@ -1637,85 +1637,8 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + @@ -1846,7 +1769,7 @@ - + @@ -1922,7 +1845,7 @@ - + @@ -1960,7 +1883,7 @@ - + @@ -2040,7 +1963,7 @@ - + @@ -2136,7 +2059,7 @@ - + @@ -2228,7 +2151,7 @@ - + @@ -2377,7 +2300,7 @@ - + @@ -2465,7 +2388,7 @@ - + @@ -2603,7 +2526,7 @@ - + @@ -2674,7 +2597,7 @@ - + @@ -2805,7 +2728,7 @@ - + diff --git a/doc/source/development/architecture/diagramms/process-Combined.drawio.html b/doc/source/development/architecture/diagramms/process-Combined.drawio.html index c83f705e9..4021ee2ae 100644 --- a/doc/source/development/architecture/diagramms/process-Combined.drawio.html +++ b/doc/source/development/architecture/diagramms/process-Combined.drawio.html @@ -5,8 +5,7 @@ process-Combined - -
- +
+ - + \ No newline at end of file diff --git a/logprep/processor/generic_resolver/rule.py b/logprep/processor/generic_resolver/rule.py index e9f9c9efe..8b71a3d66 100644 --- a/logprep/processor/generic_resolver/rule.py +++ b/logprep/processor/generic_resolver/rule.py @@ -3,7 +3,6 @@ ^^^^^^^^^^^^^^^^^^ The generic resolver requires the additional field :code:`generic_resolver`. -It works similarly to the hyperscan resolver, which utilizes hyperscan to process resolve lists. Configurable fields are being checked by regex patterns and a configurable value will be added if a pattern matches. The parameters within :code:`generic_resolver` must be of the form diff --git a/logprep/processor/hyperscan_resolver/__init__.py b/logprep/processor/hyperscan_resolver/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/logprep/processor/hyperscan_resolver/processor.py b/logprep/processor/hyperscan_resolver/processor.py deleted file mode 100644 index f75c0174f..000000000 --- a/logprep/processor/hyperscan_resolver/processor.py +++ /dev/null @@ -1,211 +0,0 @@ -""" -HyperscanResolver -================= - -The `hyperscan_resolver` is a processor that can resolve fields by using a map of resolve patterns -and resolve values. The map can be defined within rules or within a file. It uses python hyperscan -to speedup the pattern matching. -It works similarly to the generic resolver, but utilized hyperscan to process resolve lists. - -For further information see: `GenericResolver`_. - -Processor Configuration -^^^^^^^^^^^^^^^^^^^^^^^ -.. code-block:: yaml - :linenos: - - - hyperscanresolvername: - type: hyperscan_resolver - rules: - - tests/testdata/rules/rules - hyperscan_db_path: tmp/path/scan.db - -.. autoclass:: logprep.processor.hyperscan_resolver.processor.HyperscanResolver.Config - :members: - :undoc-members: - :inherited-members: - :noindex: - -.. 
automodule:: logprep.processor.hyperscan_resolver.rule -""" - -import errno -from os import makedirs, path -from typing import Any, Dict, Tuple - -from attr import define, field - -from logprep.processor.base.exceptions import ( - FieldExistsWarning, - ProcessingCriticalError, - SkipImportError, -) -from logprep.processor.field_manager.processor import FieldManager -from logprep.util.helper import add_fields_to, get_dotted_field_value -from logprep.util.validators import directory_validator - -# pylint: disable=no-name-in-module -try: - from hyperscan import HS_FLAG_CASELESS, HS_FLAG_SINGLEMATCH, Database, dumpb, loadb -except ModuleNotFoundError as error: # pragma: no cover - raise SkipImportError("hyperscan_resolver") from error - -# pylint: enable=no-name-in-module - -# pylint: disable=ungrouped-imports -from logprep.processor.hyperscan_resolver.rule import HyperscanResolverRule - -# pylint: enable=ungrouped-imports - - -class HyperscanResolver(FieldManager): - """Resolve values in documents by referencing a mapping list.""" - - @define(kw_only=True) - class Config(FieldManager.Config): - """HyperscanResolver config""" - - hyperscan_db_path: str = field(validator=directory_validator) - """Path to a directory where the compiled - `Hyperscan `_ - databases will be stored persistently. - Persistent storage is set to false per default. - If the specified directory does not exist, it will be created. - The database will be stored in the directory of the `hyperscan_resolver` if no path has - been specified within the pipeline config. - To update and recompile a persistently stored databases simply delete the whole directory. 
- The databases will be compiled again during the next run.""" - - __slots__ = ["_hyperscan_database_path", "_hyperscan_databases", "_replacements_from_file"] - - _replacements_from_file: dict - - _hyperscan_database_path: str - - _hyperscan_databases: dict - - rule_class = HyperscanResolverRule - - def __init__(self, name: str, configuration: FieldManager.Config): - super().__init__(name=name, configuration=configuration) - self._hyperscan_databases = {} - - hyperscan_db_path = configuration.hyperscan_db_path - if hyperscan_db_path: - self._hyperscan_database_path = hyperscan_db_path - else: - self._hyperscan_database_path = f"{path.dirname(path.abspath(__file__))}/hyperscan_dbs/" - - self._replacements_from_file = {} - - def _apply_rules(self, event: dict, rule: HyperscanResolverRule): - """Apply the given rule to the current event""" - conflicting_fields = [] - hyperscan_db, pattern_id_to_dest_val_map = self._get_hyperscan_database(rule) - - source_values = [] - for resolve_source, resolve_target in rule.field_mapping.items(): - src_val = get_dotted_field_value(event, resolve_source) - source_values.append(src_val) - matches = self._match_with_hyperscan(hyperscan_db, src_val) - if matches: - dest_val = pattern_id_to_dest_val_map[matches[matches.index(min(matches))]] - if dest_val: - current_content = get_dotted_field_value(event, resolve_target) - if isinstance(current_content, list) and dest_val in current_content: - continue - if rule.merge_with_target and current_content is None: - dest_val = [dest_val] - try: - add_fields_to( - event, - fields={resolve_target: dest_val}, - rule=rule, - merge_with_target=rule.merge_with_target, - overwrite_target=rule.overwrite_target, - ) - except FieldExistsWarning as error: - conflicting_fields.extend(error.skipped_fields) - self._handle_missing_fields(event, rule, rule.field_mapping.keys(), source_values) - if conflicting_fields: - raise FieldExistsWarning(rule, event, conflicting_fields) - - @staticmethod - def 
_match_with_hyperscan(hyperscan_db: Database, src_val: str) -> list: - if not src_val: - return [] - - def on_match(matching_pattern_id: int, _fr, _to, _flags, _context): - result.append(matching_pattern_id) - - result = [] - - hyperscan_db.scan(src_val.encode("utf-8"), match_event_handler=on_match) - return result - - def _get_hyperscan_database(self, rule: HyperscanResolverRule): - database_id = rule.file_name - resolve_list = rule.resolve_list - - if database_id not in self._hyperscan_databases: - try: - database, value_mapping = self._load_database(database_id, resolve_list) - except FileNotFoundError: - database, value_mapping = self._create_database(resolve_list, rule) - - if rule.store_db_persistent: - self._save_database(database, database_id) - - self._hyperscan_databases[database_id] = {} - self._hyperscan_databases[database_id]["db"] = database - self._hyperscan_databases[database_id]["value_mapping"] = value_mapping - - return ( - self._hyperscan_databases[database_id]["db"], - self._hyperscan_databases[database_id]["value_mapping"], - ) - - def _load_database(self, database_id: int, resolve_list: dict) -> Tuple[Any, Dict[int, Any]]: - value_mapping = {} - - with open(f"{self._hyperscan_database_path}/{database_id}.db", "rb") as db_file: - data = db_file.read() - - for idx, pattern in enumerate(resolve_list.keys()): - value_mapping[idx] = resolve_list[pattern] - - return loadb(data), value_mapping - - def _save_database(self, database: Database, database_id: int): - _create_hyperscan_dbs_dir(self._hyperscan_database_path) - serialized_db = dumpb(database) - - with open(f"{self._hyperscan_database_path}/{database_id}.db", "wb") as db_file: - db_file.write(serialized_db) - - def _create_database(self, resolve_list: dict, rule): - database = Database() - value_mapping = {} - db_patterns = [] - - for idx, pattern in enumerate(resolve_list.keys()): - db_patterns += [(pattern.encode("utf-8"), idx, HS_FLAG_SINGLEMATCH | HS_FLAG_CASELESS)] - value_mapping[idx] 
= resolve_list[pattern] - - if not db_patterns: - raise ProcessingCriticalError( - f"{self.name} No patter to compile for hyperscan database!", rule - ) - - expressions, ids, flags = zip(*db_patterns) - database.compile(expressions=expressions, ids=ids, elements=len(db_patterns), flags=flags) - - return database, value_mapping - - -def _create_hyperscan_dbs_dir(path_: str): - try: - makedirs(path_) - except OSError as err: - if err.errno != errno.EEXIST: - raise diff --git a/logprep/processor/hyperscan_resolver/rule.py b/logprep/processor/hyperscan_resolver/rule.py deleted file mode 100644 index 51480e529..000000000 --- a/logprep/processor/hyperscan_resolver/rule.py +++ /dev/null @@ -1,181 +0,0 @@ -""" -Rule Configuration -^^^^^^^^^^^^^^^^^^ - -The hyperscan resolver requires the additional field :code:`hyperscan_resolver`. - -The hyperscan resolver uses the -`Python Hyperscan library `_ -to check regex patterns. -By default, the compiled Hyperscan databases will be stored persistently in the directory -specified in the :code:`pipeline.yml`. -The field :code:`store_db_persistent` can be used to configure -if a database compiled from a rule's :code:`resolve_list` should be stored persistently. - -.. 
autoclass:: logprep.processor.hyperscan_resolver.rule.HyperscanResolverRule.Config - :members: - :undoc-members: - :inherited-members: - :noindex: -""" - -import re -from typing import Tuple - -from attrs import define, field, validators - -from logprep.processor.base.rule import InvalidRuleDefinitionError -from logprep.processor.generic_resolver.rule import GenericResolverRule -from logprep.util.getter import GetterFactory - - -class HyperscanResolverRuleError(InvalidRuleDefinitionError): - """Base class for HyperscanResolver rule related exceptions.""" - - def __init__(self, message: str): - super().__init__(f"HyperscanResolver rule ({message}): ") - - -class InvalidHyperscanResolverDefinition(HyperscanResolverRuleError): - """Raise if HyperscanResolver definition invalid.""" - - def __init__(self, definition): - message = f"The following HyperscanResolver definition is invalid: {definition}" - super().__init__(message) - - -class HyperscanResolverRule(GenericResolverRule): - """Check if documents match a filter.""" - - @define(kw_only=True) - class Config(GenericResolverRule.Config): - """RuleConfig for HyperscanResolver""" - - # Not used to check for equality, since it's results are reflected in resolve_list - resolve_from_file: dict = field( - validator=[ - validators.instance_of(dict), - validators.deep_mapping( - key_validator=validators.in_(["path", "pattern"]), - value_validator=validators.instance_of(str), - ), - ], - converter=lambda x: {"path": x, "pattern": ""} if isinstance(x, str) else x, - factory=dict, - eq=False, - ) - """A YML file with a resolve list and an optional regex pattern can - be used to resolve values (for string format see :ref:`getters`). - For this, either a field :code:`resolve_from_file` with a path to a resolve list - file must be added or dictionary field :code:`resolve_from_file` with the subfields - :code:`path` and :code:`pattern`. 
- Using the :code:`pattern` option allows to define one regex pattern that - can be used on all entries within a resolve list instead of having - to write a regex pattern for each entry in the list.""" - store_db_persistent: bool = field(validator=validators.instance_of(bool), default=False) - """Can be used to configure if a database compiled from - a rule's :code:`resolve_list` should be stored persistently.""" - - def __attrs_post_init__(self): - if self.resolve_from_file: - self._init_resolve_from_file() - - def _init_resolve_from_file(self): - pattern, resolve_file_path = self._get_resolve_file_path_and_pattern() - try: - add_dict = GetterFactory.from_string(resolve_file_path).get_yaml() - - if isinstance(add_dict, dict) and all( - isinstance(value, str) for value in add_dict.values() - ): - self._add_dict_to_resolve_list(add_dict, pattern) - else: - raise InvalidHyperscanResolverDefinition( - f"Additions file '{self.resolve_from_file} must be a dictionary with " - f"string values!" - ) - except FileNotFoundError as error: - raise InvalidHyperscanResolverDefinition( - f"Additions file '{self.resolve_from_file}' not found!" 
- ) from error - - def _add_dict_to_resolve_list(self, add_dict: dict, pattern: str): - if pattern: - add_dict = self._replace_patterns_in_resolve_dict(add_dict, pattern) - self.resolve_list = {**self.resolve_list, **add_dict} - - @staticmethod - def _replace_patterns_in_resolve_dict(add_dict: dict, pattern: str): - replaced_add_dict = {} - for key, value in add_dict.items(): - matches = re.match(pattern, key) - if matches: - mapping = matches.group("mapping") - if mapping: - match_key = re.match(f"^{pattern}$", key) - if match_key: - replaced_pattern = HyperscanResolverRule.Config._replace_pattern( - mapping, pattern - ) - replaced_add_dict[replaced_pattern] = value - add_dict = replaced_add_dict - return add_dict - - @staticmethod - def _replace_pattern(mapping: str, pattern: str) -> str: - first_pos = pattern.find("(?P") - last_pos = first_pos - bracket_cnt = 0 - escape_cnt = 0 - for char in pattern[first_pos:]: - if char == "\\": - escape_cnt += 1 - elif char == "(": - if escape_cnt % 2 == 0: - bracket_cnt += 1 - escape_cnt = 0 - elif char == ")": - if escape_cnt % 2 == 0: - bracket_cnt -= 1 - escape_cnt = 0 - else: - escape_cnt = 0 - last_pos += 1 - if bracket_cnt <= 0: - break - replaced_pattern = pattern[:first_pos] + re.escape(mapping) + pattern[last_pos:] - return replaced_pattern - - def _get_resolve_file_path_and_pattern(self) -> Tuple[str, str]: - resolve_file_path = None - pattern = None - if isinstance(self.resolve_from_file, str): - resolve_file_path = self.resolve_from_file - elif isinstance(self.resolve_from_file, dict): - resolve_file_path = self.resolve_from_file.get("path") - pattern = self.resolve_from_file.get("pattern") - if resolve_file_path is None or pattern is None: - raise InvalidHyperscanResolverDefinition( - f"Parameter 'resolve_from_file' ({self.resolve_from_file}) must be " - f"either a dictionary with path and pattern or a string containing a path!" 
- ) - return pattern, resolve_file_path - - # pylint: disable=C0111 - @property - def field_mapping(self) -> dict: - return self._config.field_mapping - - @property - def resolve_list(self) -> dict: - return self._config.resolve_list - - @property - def resolve_from_file(self) -> str: - return self._config.resolve_from_file - - @property - def store_db_persistent(self) -> bool: - return self._config.store_db_persistent - - # pylint: enable=C0111 diff --git a/logprep/registry.py b/logprep/registry.py index c9329e52b..9bd6eedb1 100644 --- a/logprep/registry.py +++ b/logprep/registry.py @@ -31,7 +31,6 @@ from logprep.processor.generic_resolver.processor import GenericResolver from logprep.processor.geoip_enricher.processor import GeoipEnricher from logprep.processor.grokker.processor import Grokker -from logprep.processor.hyperscan_resolver.processor import HyperscanResolver from logprep.processor.ip_informer.processor import IpInformer from logprep.processor.key_checker.processor import KeyChecker from logprep.processor.labeler.processor import Labeler @@ -66,7 +65,6 @@ class Registry: "generic_resolver": GenericResolver, "geoip_enricher": GeoipEnricher, "grokker": Grokker, - "hyperscan_resolver": HyperscanResolver, "ip_informer": IpInformer, "key_checker": KeyChecker, "labeler": Labeler, diff --git a/pyproject.toml b/pyproject.toml index a32874dd6..b13ae1b08 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -63,7 +63,6 @@ dependencies = [ "confluent-kafka>2", "filelock", "geoip2", - "hyperscan>=0.7.0", "jsonref", "luqum", "more-itertools==8.10.0", diff --git a/tests/testdata/unit/hyperscan_resolver/resolve_mapping_different.yml b/tests/testdata/unit/hyperscan_resolver/resolve_mapping_different.yml deleted file mode 100644 index 2ee1db032..000000000 --- a/tests/testdata/unit/hyperscan_resolver/resolve_mapping_different.yml +++ /dev/null @@ -1,2 +0,0 @@ -to_resolve: resolved -other_to_resolve: other_resolved \ No newline at end of file diff --git 
a/tests/testdata/unit/hyperscan_resolver/resolve_mapping_no_regex.yml b/tests/testdata/unit/hyperscan_resolver/resolve_mapping_no_regex.yml deleted file mode 100644 index 2617cf2d4..000000000 --- a/tests/testdata/unit/hyperscan_resolver/resolve_mapping_no_regex.yml +++ /dev/null @@ -1,3 +0,0 @@ -ab: ab_resolved -de: de_resolved -0gh0: gh_resolved \ No newline at end of file diff --git a/tests/testdata/unit/hyperscan_resolver/resolve_mapping_regex.yml b/tests/testdata/unit/hyperscan_resolver/resolve_mapping_regex.yml deleted file mode 100644 index c832febde..000000000 --- a/tests/testdata/unit/hyperscan_resolver/resolve_mapping_regex.yml +++ /dev/null @@ -1,3 +0,0 @@ -\d*?ab\d*: ab_resolved -\d*?de\d*: de_resolved -\d*?0gh0\d*: gh_resolved \ No newline at end of file diff --git a/tests/testdata/unit/hyperscan_resolver/resolve_mapping_same.yml b/tests/testdata/unit/hyperscan_resolver/resolve_mapping_same.yml deleted file mode 100644 index ea529e888..000000000 --- a/tests/testdata/unit/hyperscan_resolver/resolve_mapping_same.yml +++ /dev/null @@ -1 +0,0 @@ -to_resolve: resolved \ No newline at end of file diff --git a/tests/testdata/unit/hyperscan_resolver/resolve_mapping_with_parenthesis.yml b/tests/testdata/unit/hyperscan_resolver/resolve_mapping_with_parenthesis.yml deleted file mode 100644 index 818829a4c..000000000 --- a/tests/testdata/unit/hyperscan_resolver/resolve_mapping_with_parenthesis.yml +++ /dev/null @@ -1,3 +0,0 @@ -ab: ab_resolved -ab)c: ab)c_resolved -ab\)c: ab\)c_resolved \ No newline at end of file diff --git a/tests/testdata/unit/hyperscan_resolver/rules/rule_1.json b/tests/testdata/unit/hyperscan_resolver/rules/rule_1.json deleted file mode 100644 index a6dbd75d5..000000000 --- a/tests/testdata/unit/hyperscan_resolver/rules/rule_1.json +++ /dev/null @@ -1,11 +0,0 @@ -[{ - "filter": "anything", - "hyperscan_resolver": { - "field_mapping": { - "to_resolve": "resolved" - }, - "resolve_list": { - "to_resolve": "resolved_value" - } - } -}] diff --git 
a/tests/testdata/unit/hyperscan_resolver/rules/rule_2.json b/tests/testdata/unit/hyperscan_resolver/rules/rule_2.json deleted file mode 100644 index 08605d54b..000000000 --- a/tests/testdata/unit/hyperscan_resolver/rules/rule_2.json +++ /dev/null @@ -1,11 +0,0 @@ -[{ - "filter": "something-else", - "hyperscan_resolver": { - "field_mapping": { - "to_resolve": "well-resolved" - }, - "resolve_list": { - "to_resolve": "resolved_value" - } - } -}] diff --git a/tests/unit/processor/hyperscan_resolver/__init__.py b/tests/unit/processor/hyperscan_resolver/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/unit/processor/hyperscan_resolver/test_hyperscan_resolver.py b/tests/unit/processor/hyperscan_resolver/test_hyperscan_resolver.py deleted file mode 100644 index 7b0c81f68..000000000 --- a/tests/unit/processor/hyperscan_resolver/test_hyperscan_resolver.py +++ /dev/null @@ -1,734 +0,0 @@ -# pylint: disable=protected-access -# pylint: disable=missing-docstring -# pylint: disable=wrong-import-position -# pylint: disable=wrong-import-order -from collections import OrderedDict -from copy import deepcopy - -import pytest - -from logprep.processor.base.exceptions import ( - FieldExistsWarning, - ProcessingCriticalError, -) - -pytest.importorskip("hyperscan") - -# pylint: disable=ungrouped-imports -from logprep.processor.hyperscan_resolver.rule import InvalidHyperscanResolverDefinition -from tests.unit.processor.base import BaseProcessorTestCase - -# pylint: enable=ungrouped-imports - -pytest.importorskip("logprep.processor.hyperscan_resolver") - -from logprep.processor.hyperscan_resolver.processor import HyperscanResolver - - -class TestHyperscanResolverProcessor(BaseProcessorTestCase): - CONFIG = { - "type": "hyperscan_resolver", - "rules": ["tests/testdata/unit/hyperscan_resolver/rules"], - "tree_config": "tests/testdata/unit/shared_data/tree_config.json", - "hyperscan_db_path": "/tmp", - } - - def test_resolve_instantiates(self): - rule = 
{"filter": "anything", "hyperscan_resolver": {"field_mapping": {}}} - - self._load_rule(rule) - - assert isinstance(self.object, HyperscanResolver) - - def test_resolve_not_dotted_field_no_conflict_match(self): - rule = { - "filter": "to_resolve", - "hyperscan_resolver": { - "field_mapping": {"to_resolve": "resolved"}, - "resolve_list": {".*HELLO\\d": "Greeting"}, - }, - } - - self._load_rule(rule) - - expected = {"to_resolve": "something HELLO1", "resolved": "Greeting"} - document = {"to_resolve": "something HELLO1"} - - self.object.process(document) - - assert document == expected - - def test_resolve_not_dotted_field_no_conflict_and_to_list_entries_match( - self, - ): - rule = { - "filter": "to_resolve", - "hyperscan_resolver": { - "field_mapping": {"to_resolve": "resolved"}, - "resolve_list": {".*HELLO\\d": "Greeting", ".*BYE\\d": "Farewell"}, - }, - } - - self._load_rule(rule) - expected = {"to_resolve": "something HELLO1", "resolved": "Greeting"} - document = {"to_resolve": "something HELLO1"} - - self.object.process(document) - assert document == expected - - expected = {"to_resolve": "something BYE1", "resolved": "Farewell"} - document = {"to_resolve": "something BYE1"} - - self.object.process(document) - assert document == expected - - def test_resolve_not_dotted_field_no_conflict_no_match(self): - rule = { - "filter": "to_resolve", - "hyperscan_resolver": { - "field_mapping": {"to_resolve": "resolved"}, - "resolve_list": {".*HELLO\\d": "Greeting"}, - }, - } - - self._load_rule(rule) - - expected = {"to_resolve": "something no"} - document = {"to_resolve": "something no"} - - self.object.process(document) - - assert document == expected - - def test_resolve_dotted_no_conflict_match(self): - rule = { - "filter": "to.resolve", - "hyperscan_resolver": { - "field_mapping": {"to.resolve": "resolved"}, - "resolve_list": {".*HELLO\\d": "Greeting"}, - }, - } - - self._load_rule(rule) - - expected = {"to": {"resolve": "something HELLO1"}, "resolved": "Greeting"} - 
document = {"to": {"resolve": "something HELLO1"}} - - self.object.process(document) - - assert document == expected - - def test_resolve_dotted_no_conflict_from_file(self): - rule = { - "filter": "to_resolve", - "hyperscan_resolver": { - "field_mapping": {"to_resolve": "resolved"}, - "resolve_from_file": "tests/testdata/unit/hyperscan_resolver/" - "resolve_mapping_no_regex.yml", - "resolve_list": {"FOO": "BAR"}, - }, - } - - self._load_rule(rule) - - expected = {"to_resolve": "ab", "resolved": "ab_resolved"} - document = {"to_resolve": "ab"} - - self.object.process(document) - - assert document == expected - - def test_resolve_from_file_and_from_list(self): - rule = { - "filter": "to_resolve_1 AND to_resolve_2", - "hyperscan_resolver": { - "field_mapping": {"to_resolve_1": "resolved_1", "to_resolve_2": "resolved_2"}, - "resolve_from_file": "tests/testdata/unit/hyperscan_resolver/" - "resolve_mapping_no_regex.yml", - "resolve_list": {"fg": "fg_resolved"}, - }, - } - - self._load_rule(rule) - - expected = { - "to_resolve_1": "ab", - "to_resolve_2": "fg", - "resolved_1": "ab_resolved", - "resolved_2": "fg_resolved", - } - document = {"to_resolve_1": "ab", "to_resolve_2": "fg"} - - self.object.process(document) - - assert document == expected - - def test_resolve_dotted_no_conflict_no_from_file(self): - rule = { - "filter": "to_resolve", - "hyperscan_resolver": { - "field_mapping": {"to_resolve": "resolved"}, - "resolve_from_file": "tests/testdata/unit/hyperscan_resolver/" - "resolve_mapping_no_regex.yml", - "resolve_list": {"FOO": "BAR"}, - }, - } - - self._load_rule(rule) - - expected = { - "to_resolve": "not_in_list", - } - document = {"to_resolve": "not_in_list"} - - self.object.process(document) - - assert document == expected - - def test_resolve_dotted_no_conflict_from_file_and_list( - self, - ): - rule = { - "filter": "to_resolve", - "hyperscan_resolver": { - "field_mapping": {"to_resolve": "resolved"}, - "resolve_from_file": 
"tests/testdata/unit/hyperscan_resolver/" - "resolve_mapping_no_regex.yml", - "merge_with_target": True, - }, - } - - self._load_rule(rule) - - expected = {"to_resolve": "12ab34", "resolved": ["ab_resolved"]} - document = {"to_resolve": "12ab34"} - - self.object.process(document) - - assert document == expected - - def test_resolve_dotted_no_conflict_from_file_and_list_has_conflict( - self, - ): - rule = { - "filter": "to_resolve", - "hyperscan_resolver": { - "field_mapping": {"to_resolve": "resolved"}, - "resolve_from_file": "tests/testdata/unit/hyperscan_resolver/" - "resolve_mapping_no_regex.yml", - "merge_with_target": True, - }, - } - - self._load_rule(rule) - - expected = {"to_resolve": "12ab34", "resolved": ["ab_resolved"]} - document = {"to_resolve": "12ab34"} - - self.object.process(document) - self.object.process(document) - - assert document == expected - - def test_resolve_dotted_no_conflict_from_file_and_list_has_conflict_and_diff_inputs( - self, - ): - rule = { - "filter": "to_resolve", - "hyperscan_resolver": { - "field_mapping": {"to_resolve": "resolved", "other_to_resolve": "resolved"}, - "resolve_from_file": "tests/testdata/unit/hyperscan_resolver/" - "resolve_mapping_no_regex.yml", - "merge_with_target": True, - }, - } - - self._load_rule(rule) - - expected = { - "to_resolve": "12ab34", - "other_to_resolve": "00de11", - "resolved": ["ab_resolved", "de_resolved"], - } - document = {"to_resolve": "12ab34", "other_to_resolve": "00de11"} - - self.object.process(document) - self.object.process(document) - - assert document == expected - - def test_resolve_from_file_and_file_does_not_exist(self): - rule = { - "filter": "to.resolve", - "hyperscan_resolver": { - "field_mapping": {"to.resolve": "resolved"}, - "resolve_from_file": "i/do/not/exist", - }, - } - - with pytest.raises(InvalidHyperscanResolverDefinition): - self._load_rule(rule) - - def test_resolve_dotted_no_conflict_no_match(self): - rule = { - "filter": "to.resolve", - "hyperscan_resolver": { 
- "field_mapping": {"to.resolve": "resolved"}, - "resolve_list": {".*HELLO\\d": "Greeting"}, - }, - } - - self._load_rule(rule) - - expected = {"to": {"resolve": "something no"}} - document = {"to": {"resolve": "something no"}} - - self.object.process(document) - - assert document == expected - - def test_resolve_dotted_field_is_missing(self): - rule = { - "filter": "to.other_field", - "hyperscan_resolver": { - "field_mapping": {"to.resolve": "resolved"}, - "resolve_list": {".*HELLO\\d": "Greeting"}, - }, - } - self._load_rule(rule) - - expected = { - "to": {"other_field": "something no"}, - "tags": ["_hyperscan_resolver_missing_field_warning"], - } - document = {"to": {"other_field": "something no"}} - - self.object.process(document) - - assert document == expected - - def test_resolve_dotted_dest_field_no_conflict_match(self): - rule = { - "filter": "to_resolve", - "hyperscan_resolver": { - "field_mapping": {"to_resolve": "re.solved"}, - "resolve_list": {".*HELLO\\d": "Greeting"}, - }, - } - - self._load_rule(rule) - - expected = {"to_resolve": "something HELLO1", "re": {"solved": "Greeting"}} - document = {"to_resolve": "something HELLO1"} - - self.object.process(document) - - assert document == expected - - def test_resolve_dotted_dest_field_no_conflict_no_match(self): - rule = { - "filter": "to_resolve", - "hyperscan_resolver": { - "field_mapping": {"to_resolve": "re.solved"}, - "resolve_list": {".*HELLO\\d": "Greeting"}, - }, - } - - self._load_rule(rule) - - expected = {"to_resolve": "something no"} - document = {"to_resolve": "something no"} - - self.object.process(document) - - assert document == expected - - def test_resolve_dotted_and_dest_field_no_conflict_match(self): - rule = { - "filter": "to.resolve", - "hyperscan_resolver": { - "field_mapping": {"to.resolve": "re.solved"}, - "resolve_list": {".*HELLO\\d": "Greeting"}, - }, - } - - self._load_rule(rule) - - expected = {"to": {"resolve": "something HELLO1"}, "re": {"solved": "Greeting"}} - document = 
{"to": {"resolve": "something HELLO1"}} - - self.object.process(document) - - assert document == expected - - def test_resolve_dotted_and_dest_field_with_conflict_match(self): - rule = { - "filter": "to.resolve", - "hyperscan_resolver": { - "field_mapping": {"to.resolve": "re.solved"}, - "resolve_list": {".*HELLO\\d": "Greeting"}, - }, - } - self._load_rule(rule) - document = {"to": {"resolve": "something HELLO1"}, "re": {"solved": "I already exist!"}} - expected = { - "to": {"resolve": "something HELLO1"}, - "re": {"solved": "I already exist!"}, - "tags": ["_hyperscan_resolver_failure"], - } - result = self.object.process(document) - assert len(result.warnings) == 1 - assert isinstance(result.warnings[0], FieldExistsWarning) - assert document == expected - - def test_resolve_with_multiple_match_first_only(self): - rule = { - "filter": "to.resolve", - "hyperscan_resolver": { - "field_mapping": {"to.resolve": "re.solved"}, - "resolve_list": OrderedDict( - { - ".*HELLO\\d": "Greeting", - ".*HELL.\\d": "Greeting2", - ".*HEL..\\d": "Greeting3", - } - ), - }, - } - - self._load_rule(rule) - - expected = {"to": {"resolve": "something HELLO1"}, "re": {"solved": "Greeting"}} - document = {"to": {"resolve": "something HELLO1"}} - - self.object.process(document) - - assert document == expected - - -class TestHyperscanResolverProcessorWithPatterns(BaseProcessorTestCase): - CONFIG = deepcopy(TestHyperscanResolverProcessor.CONFIG) - - def test_resolve_no_conflict_from_file(self): - rule = { - "filter": "to_resolve", - "hyperscan_resolver": { - "field_mapping": {"to_resolve": "resolved"}, - "resolve_from_file": { - "path": "tests/testdata/unit/hyperscan_resolver" - "/resolve_mapping_no_regex.yml", - "pattern": r"\d*(?P[a-z]+)\d*", - }, - "resolve_list": {"FOO": "BAR"}, - }, - } - - self._load_rule(rule) - - expected = {"to_resolve": "ab", "resolved": "ab_resolved"} - document = {"to_resolve": "ab"} - - self.object.process(document) - - assert document == expected - - def 
test_resolve_no_conflict_from_file_and_escaped_parenthesis( - self, - ): - rule = { - "filter": "to_resolve", - "hyperscan_resolver": { - "field_mapping": {"to_resolve": "resolved"}, - "resolve_from_file": { - "path": "tests/testdata/unit/hyperscan_resolver" - "/resolve_mapping_with_parenthesis.yml", - "pattern": r"\d*(?P[a-z]+\)c)\d*", - }, - }, - } - - self._load_rule(rule) - - expected = {"to_resolve": "ab)c", "resolved": "ab)c_resolved"} - document = {"to_resolve": "ab)c"} - - self.object.process(document) - - assert document == expected - - def test_resolve_dotted_no_conflict_from_file_and_escaped_parenthesis_and_backslash( - self, - ): - rule = { - "filter": "to_resolve", - "hyperscan_resolver": { - "field_mapping": {"to_resolve": "resolved"}, - "resolve_from_file": { - "path": "tests/testdata/unit/hyperscan_resolver" - "/resolve_mapping_with_parenthesis.yml", - "pattern": r"\d*(?P[a-z]+\\\)c)\d*", - }, - }, - } - - self._load_rule(rule) - - expected = {"to_resolve": r"ab\)c", "resolved": r"ab\)c_resolved"} - document = {"to_resolve": r"ab\)c"} - - self.object.process(document) - - assert document == expected - - def test_resolve_dotted_no_conflict_from_file_and_escaped_to_unbalanced_parenthesis( - self, - ): - rule = { - "filter": "to_resolve", - "hyperscan_resolver": { - "field_mapping": {"to_resolve": "resolved"}, - "resolve_from_file": { - "path": "tests/testdata/unit/hyperscan_resolver" - "/resolve_mapping_with_parenthesis.yml", - "pattern": r"\d*(?P[a-z]+\\\\)c)\d*", - }, - }, - } - - with pytest.raises(Exception, match="unbalanced parenthesis"): - self._load_rule(rule) - - def test_resolve_from_file_and_from_list(self): - rule = { - "filter": "to_resolve_1 AND to_resolve_2", - "hyperscan_resolver": { - "field_mapping": {"to_resolve_1": "resolved_1", "to_resolve_2": "resolved_2"}, - "resolve_from_file": { - "path": "tests/testdata/unit/hyperscan_resolver" - "/resolve_mapping_no_regex.yml", - "pattern": r"\d*(?P[a-z]+)\d*", - }, - "resolve_list": {"fg": 
"fg_resolved"}, - }, - } - - self._load_rule(rule) - - expected = { - "to_resolve_1": "ab", - "to_resolve_2": "fg", - "resolved_1": "ab_resolved", - "resolved_2": "fg_resolved", - } - document = {"to_resolve_1": "ab", "to_resolve_2": "fg"} - - self.object.process(document) - - assert document == expected - - def test_resolve_no_conflict_no_from_file(self): - rule = { - "filter": "to_resolve", - "hyperscan_resolver": { - "field_mapping": {"to_resolve": "resolved"}, - "resolve_from_file": { - "path": "tests/testdata/unit/hyperscan_resolver" - "/resolve_mapping_no_regex.yml", - "pattern": r"\d*(?P[a-z]+)\d*", - }, - "resolve_list": {"FOO": "BAR"}, - }, - } - - self._load_rule(rule) - - expected = { - "to_resolve": "not_in_list", - } - document = {"to_resolve": "not_in_list"} - - self.object.process(document) - - assert document == expected - - def test_resolve_no_conflict_from_file_and_list( - self, - ): - rule = { - "filter": "to_resolve", - "hyperscan_resolver": { - "field_mapping": {"to_resolve": "resolved"}, - "resolve_from_file": { - "path": "tests/testdata/unit/hyperscan_resolver" - "/resolve_mapping_no_regex.yml", - "pattern": r"\d*(?P[a-z]+)\d*", - }, - "merge_with_target": True, - }, - } - - self._load_rule(rule) - - expected = {"to_resolve": "12ab34", "resolved": ["ab_resolved"]} - document = {"to_resolve": "12ab34"} - - self.object.process(document) - - assert document == expected - - def test_resolve_with_parenthesis_in_mapping(self): - rule = { - "filter": "to_resolve", - "hyperscan_resolver": { - "field_mapping": {"to_resolve": "resolved"}, - "resolve_from_file": { - "path": "tests/testdata/unit/hyperscan_resolver" - "/resolve_mapping_with_parenthesis.yml", - "pattern": r"\d*(?P(([a-z])+)())\d*", - }, - "merge_with_target": True, - }, - } - - self._load_rule(rule) - - expected = {"to_resolve": "12ab34", "resolved": ["ab_resolved"]} - document = {"to_resolve": "12ab34"} - - self.object.process(document) - - assert document == expected - - def 
test_resolve_with_partially_matching_mapping(self): - rule = { - "filter": "to_resolve", - "hyperscan_resolver": { - "field_mapping": {"to_resolve": "resolved"}, - "resolve_from_file": { - "path": "tests/testdata/unit/hyperscan_resolver" - "/resolve_mapping_no_regex.yml", - "pattern": r"\d*(?P[a-z]+)\d*", - }, - "merge_with_target": True, - }, - } - - self._load_rule(rule) - - expected = {"to_resolve": "gh", "resolved": ["gh_resolved"]} - document = {"to_resolve": "gh"} - - self.object.process(document) - - assert document == expected - - def test_resolve_no_matching_pattern(self): - rule = { - "filter": "to_resolve", - "hyperscan_resolver": { - "field_mapping": {"to_resolve": "resolved"}, - "resolve_from_file": { - "path": "tests/testdata/unit/hyperscan_resolver" - "/resolve_mapping_no_regex.yml", - "pattern": r"\d*(?P[123]+)\d*", - }, - "merge_with_target": True, - }, - } - self._load_rule(rule) - document = {"to_resolve": "12ab34"} - result = self.object.process(document) - assert isinstance(result.errors[0], ProcessingCriticalError) - - def test_resolve_no_conflict_from_file_and_list_has_conflict( - self, - ): - rule = { - "filter": "to_resolve", - "hyperscan_resolver": { - "field_mapping": {"to_resolve": "resolved"}, - "resolve_from_file": { - "path": "tests/testdata/unit/hyperscan_resolver" - "/resolve_mapping_no_regex.yml", - "pattern": r"\d*(?P[a-z]+)\d*", - }, - "merge_with_target": True, - }, - } - - self._load_rule(rule) - - expected = {"to_resolve": "12ab34", "resolved": ["ab_resolved"]} - document = {"to_resolve": "12ab34"} - - self.object.process(document) - self.object.process(document) - - assert document == expected - - def test_resolve_no_conflict_from_file_and_list_has_conflict_and_diff_inputs( - self, - ): - rule = { - "filter": "to_resolve", - "hyperscan_resolver": { - "field_mapping": {"to_resolve": "resolved", "other_to_resolve": "resolved"}, - "resolve_from_file": { - "path": "tests/testdata/unit/hyperscan_resolver" - 
"/resolve_mapping_no_regex.yml", - "pattern": r"\d*(?P[a-z]+)\d*", - }, - "merge_with_target": True, - }, - } - - self._load_rule(rule) - - expected = { - "to_resolve": "12ab34", - "other_to_resolve": "00de11", - "resolved": ["ab_resolved", "de_resolved"], - } - document = {"to_resolve": "12ab34", "other_to_resolve": "00de11"} - - self.object.process(document) - self.object.process(document) - - assert document == expected - - def test_resolve_dotted_no_conflict_from_file_group_mapping_does_not_exist( - self, - ): - rule = { - "filter": "to_resolve", - "hyperscan_resolver": { - "field_mapping": {"to_resolve": "resolved"}, - "resolve_from_file": { - "path": "tests/testdata/unit/hyperscan_resolver/resolve_mapping_regex.yml", - "pattern": r"\d*(?P[a-z]+)\d*", - }, - "resolve_list": {"FOO": "BAR"}, - }, - } - - self._load_rule(rule) - - document = {"to_resolve": "ab"} - - self.object.process(document) - - def test_resolve_from_file_and_file_does_not_exist(self): - rule = { - "filter": "to.resolve", - "hyperscan_resolver": { - "field_mapping": {"to.resolve": "resolved"}, - "resolve_from_file": {"path": "i/do/not/exist", "pattern": "bar"}, - }, - } - - with pytest.raises( - InvalidHyperscanResolverDefinition, - match=r"The following HyperscanResolver definition is invalid: Additions file '{" - r"'path': 'i/do/not/exist', 'pattern': 'bar'}' not found!", - ): - self._load_rule(rule) diff --git a/tests/unit/processor/hyperscan_resolver/test_hyperscan_resolver_rule.py b/tests/unit/processor/hyperscan_resolver/test_hyperscan_resolver_rule.py deleted file mode 100644 index 3982c5560..000000000 --- a/tests/unit/processor/hyperscan_resolver/test_hyperscan_resolver_rule.py +++ /dev/null @@ -1,193 +0,0 @@ -# pylint: disable=protected-access -# pylint: disable=missing-docstring -# pylint: disable=wrong-import-position -# pylint: disable=wrong-import-order -import pytest - -from logprep.processor.hyperscan_resolver.rule import HyperscanResolverRule - 
-pytest.importorskip("hyperscan") - - -@pytest.fixture(name="rule_definition") -def fixture_rule_definition(): - return { - "filter": "some_filter", - "hyperscan_resolver": { - "field_mapping": {"to_resolve": "resolved"}, - "resolve_list": {"to_resolve": "resolved"}, - "store_db_persistent": True, - }, - "description": "insert a description text", - } - - -@pytest.fixture(name="rule_with_resolve_file_definition") -def fixture_rule_with_resolve_file_definition(): - return { - "filter": "some_filter", - "hyperscan_resolver": { - "field_mapping": {"to_resolve": "resolved"}, - "resolve_from_file": { - "path": "tests/testdata/unit/hyperscan_resolver/resolve_mapping_no_regex.yml", - "pattern": r"\d*?(?P[a-z0]+)\d*", - }, - }, - "description": "insert a description text", - } - - -@pytest.mark.parametrize( - "testcase, other_rule_definition, is_equal", - [ - ( - "Should be equal cause the same", - { - "filter": "some_filter", - "hyperscan_resolver": { - "field_mapping": {"to_resolve": "resolved"}, - "resolve_list": {"to_resolve": "resolved"}, - "store_db_persistent": True, - }, - }, - True, - ), - ( - "Should be not equal cause of other store_db_persistent", - { - "filter": "some_filter", - "hyperscan_resolver": { - "field_mapping": {"to_resolve": "resolved"}, - "resolve_list": {"to_resolve": "resolved"}, - "store_db_persistent": False, - }, - }, - False, - ), - ( - "Should be not equal cause of no store_db_persistent and default is different", - { - "filter": "some_filter", - "hyperscan_resolver": { - "field_mapping": {"to_resolve": "resolved"}, - "resolve_list": {"to_resolve": "resolved"}, - }, - }, - False, - ), - ( - "Should be not equal cause of other filter", - { - "filter": "other_filter", - "hyperscan_resolver": { - "field_mapping": {"to_resolve": "resolved"}, - "resolve_list": {"to_resolve": "resolved"}, - "store_db_persistent": True, - }, - }, - False, - ), - ( - "Should be not equal cause of other resolve", - { - "filter": "some_filter", - "hyperscan_resolver": 
{ - "field_mapping": {"to_resolve": "resolved"}, - "resolve_list": {"other_to_resolve": "other_resolved"}, - "store_db_persistent": True, - }, - }, - False, - ), - ( - "Should be not equal cause of more resolves", - { - "filter": "some_filter", - "hyperscan_resolver": { - "field_mapping": {"to_resolve": "resolved"}, - "resolve_list": { - "to_resolve": "resolved", - "other_to_resolve": "other_resolved", - }, - "store_db_persistent": True, - }, - }, - False, - ), - ( - "Should be equal cause file value results in same resolve values", - { - "filter": "some_filter", - "hyperscan_resolver": { - "field_mapping": {"to_resolve": "resolved"}, - "resolve_from_file": "tests/testdata/unit/hyperscan_resolver/" - "resolve_mapping_same.yml", - "store_db_persistent": True, - }, - }, - True, - ), - ( - "Should be not equal cause file value results in different resolve values", - { - "filter": "some_filter", - "hyperscan_resolver": { - "field_mapping": {"to_resolve": "resolved"}, - "resolve_from_file": "tests/testdata/unit/hyperscan_resolver/" - "resolve_mapping_different.yml", - "store_db_persistent": True, - }, - }, - False, - ), - ], -) -def test_rules_equality( - rule_definition, - testcase, - other_rule_definition, - is_equal, -): - rule1 = HyperscanResolverRule._create_from_dict( - rule_definition, - ) - - rule2 = HyperscanResolverRule._create_from_dict( - other_rule_definition, - ) - - assert (rule1 == rule2) == is_equal, testcase - - -def test_rules_with_differently_defined_but_equivalent_regex_pattern_definition_types_are_equal( - rule_with_resolve_file_definition, -): - rule_no_regex = HyperscanResolverRule._create_from_dict( - rule_with_resolve_file_definition, - ) - - rule_with_resolve_file_definition["hyperscan_resolver"][ - "resolve_from_file" - ] = "tests/testdata/unit/hyperscan_resolver/resolve_mapping_regex.yml" - rule_regex = HyperscanResolverRule._create_from_dict( - rule_with_resolve_file_definition, - ) - - assert rule_no_regex == rule_regex - - -def 
test_replace_pattern_with_parenthesis_after_closing_parenthesis_not_included_in_replacement(): - replaced_pattern = HyperscanResolverRule.Config._replace_pattern( - "123abc456", - r"\d*(?P[a-z]+)c)\d*", - ) - - assert replaced_pattern == "\\d*123abc456c)\\d*" - - -def test_replace_pattern_with_escaped_parenthesis_is_included_in_replacement(): - replaced_pattern = HyperscanResolverRule.Config._replace_pattern( - r"123ab\)c123", r"\d*(?P[a-z]+\)c)\d*" - ) - - assert replaced_pattern == "\\d*123ab\\\\\\)c123\\d*"