Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enable regex flags manipulation #1193

Merged
merged 12 commits into from
Oct 26, 2023
2 changes: 2 additions & 0 deletions presidio-analyzer/presidio_analyzer/analysis_explanation.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ def __init__(
pattern: str = None,
validation_result: float = None,
textual_explanation: str = None,
regex_flags: int = None,
):

self.recognizer = recognizer
Expand All @@ -34,6 +35,7 @@ def __init__(
self.score_context_improvement = 0
self.supportive_context_word = ""
self.validation_result = validation_result
self.regex_flags = regex_flags

def __repr__(self):
"""Create string representation of the object."""
Expand Down
23 changes: 17 additions & 6 deletions presidio-analyzer/presidio_analyzer/pattern_recognizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ class PatternRecognizer(LocalRecognizer):
:param context: list of context words
:param deny_list_score: confidence score for a term
identified using a deny-list
:param global_regex_flags: regex flags to be used in regex matching,
including deny-lists.
"""

def __init__(
Expand All @@ -37,9 +39,9 @@ def __init__(
deny_list: List[str] = None,
context: List[str] = None,
deny_list_score: float = 1.0,
global_regex_flags: Optional[int] = re.DOTALL | re.MULTILINE | re.IGNORECASE,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is pretty bad. You just changed the default value from re.DOTALL | re.MULTILINE to re.DOTALL | re.MULTILINE | re.IGNORECASE.

This affects us as a client of this library.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@ducquangkstn thanks for the feedback. This change allows you to have more customizability with regards to regex flags. Is this blocking you in any way?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this blocking you in any way?

Actually, no. It just took me a while to figure out why the behavior changed when bumping the Presidio version.

We (my company) are lucky that we have some unit tests. Not sure about other people.

version: str = "0.0.1",
):

if not supported_entity:
raise ValueError("Pattern recognizer should be initialized with entity")

Expand All @@ -61,6 +63,7 @@ def __init__(
self.patterns = patterns
self.context = context
self.deny_list_score = deny_list_score
self.global_regex_flags = global_regex_flags

if deny_list:
deny_list_pattern = self._deny_list_to_regex(deny_list)
Expand All @@ -76,16 +79,16 @@ def analyze(
self,
text: str,
entities: List[str],
nlp_artifacts: NlpArtifacts = None,
regex_flags: int = None,
nlp_artifacts: Optional[NlpArtifacts] = None,
regex_flags: Optional[int] = None,
) -> List[RecognizerResult]:
"""
Analyzes text to detect PII using regular expressions or deny-lists.

:param text: Text to be analyzed
:param entities: Entities this recognizer can detect
:param nlp_artifacts: Output values from the NLP engine
:param regex_flags:
:param regex_flags: regex flags to be used in regex matching
:return:
"""
results = []
Expand Down Expand Up @@ -140,6 +143,7 @@ def build_regex_explanation(
pattern: str,
original_score: float,
validation_result: bool,
regex_flags: int,
) -> AnalysisExplanation:
"""
Construct an explanation for why this entity was detected.
Expand All @@ -149,6 +153,7 @@ def build_regex_explanation(
:param pattern: Regex pattern logic
:param original_score: Score given by the recognizer
:param validation_result: Whether validation was used and its result
:param regex_flags: Regex flags used in the regex matching
:return: Analysis explanation
"""
explanation = AnalysisExplanation(
Expand All @@ -157,6 +162,7 @@ def build_regex_explanation(
pattern_name=pattern_name,
pattern=pattern,
validation_result=validation_result,
regex_flags=regex_flags,
)
return explanation

Expand All @@ -172,7 +178,7 @@ def __analyze_patterns(
:param flags: regex flags
:return: A list of RecognizerResult
"""
flags = flags if flags else re.DOTALL | re.MULTILINE
flags = flags if flags else self.global_regex_flags
results = []
for pattern in self.patterns:
match_start_time = datetime.datetime.now()
Expand All @@ -197,7 +203,12 @@ def __analyze_patterns(

validation_result = self.validate_result(current_match)
description = self.build_regex_explanation(
self.name, pattern.name, pattern.regex, score, validation_result
self.name,
pattern.name,
pattern.regex,
score,
validation_result,
flags,
)
pattern_result = RecognizerResult(
entity_type=self.supported_entities[0],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -96,32 +96,3 @@ def __init__(
context=context,
supported_language=supported_language,
)

def analyze(
    self,
    text: str,
    entities: List[str],
    nlp_artifacts: NlpArtifacts = None,
    regex_flags: int = None,
) -> List[RecognizerResult]:
    """
    Run the parent analysis with case-insensitive matching always enabled.

    :param text: Text to be analyzed
    :param entities: Entities this recognizer can detect
    :param nlp_artifacts: Output values from the NLP engine
    :param regex_flags: Regex flags supplied by the caller; ``re.IGNORECASE``
        is OR-ed into them. When falsy, ``re.DOTALL | re.MULTILINE |
        re.IGNORECASE`` is used instead
    :return: Whatever the parent class ``analyze`` returns for the
        adjusted flags
    """
    if regex_flags:
        effective_flags = regex_flags | re.IGNORECASE
    else:
        effective_flags = re.DOTALL | re.MULTILINE | re.IGNORECASE

    return super().analyze(
        text=text,
        entities=entities,
        nlp_artifacts=nlp_artifacts,
        regex_flags=effective_flags,
    )
Original file line number Diff line number Diff line change
Expand Up @@ -67,14 +67,14 @@ def __init__(
self.replacement_pairs = replacement_pairs or [("-", ""), (" ", "")]
self.exact_match = exact_match
self.BOSEOS = bos_eos if exact_match else ()
self.flags = regex_flags
patterns = patterns if patterns else self.PATTERNS
context = context if context else self.CONTEXT
super().__init__(
supported_entity=supported_entity,
patterns=patterns,
context=context,
supported_language=supported_language,
global_regex_flags=regex_flags
)

def validate_result(self, pattern_text: str): # noqa D102
Expand Down Expand Up @@ -126,9 +126,10 @@ def __analyze_patterns(self, text: str, flags: int = None):
:param flags: regex flags
:return: A list of RecognizerResult
"""
flags = flags if flags else self.global_regex_flags
results = []
for pattern in self.patterns:
matches = re.finditer(pattern.regex, text, flags=self.flags)
matches = re.finditer(pattern.regex, text, flags=flags)

for match in matches:
for grp_num in reversed(range(1, len(match.groups()) + 1)):
Expand All @@ -148,7 +149,12 @@ def __analyze_patterns(self, text: str, flags: int = None):

validation_result = self.validate_result(current_match)
description = PatternRecognizer.build_regex_explanation(
self.name, pattern.name, pattern.regex, score, validation_result
self.name,
pattern.name,
pattern.regex,
score,
validation_result,
flags,
)
pattern_result = RecognizerResult(
entity_type=self.supported_entities[0],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
import logging
from typing import Optional, List, Iterable, Union, Type, Dict

import regex as re

from pathlib import Path
from presidio_analyzer.nlp_engine.transformers_nlp_engine import (
TransformersNlpEngine,
Expand Down Expand Up @@ -53,13 +55,21 @@ class RecognizerRegistry:

:param recognizers: An optional list of recognizers,
that will be available instead of the predefined recognizers
:param global_regex_flags : regex flags to be used in regex matching,
including deny-lists

"""

def __init__(self, recognizers: Optional[Iterable[EntityRecognizer]] = None):
def __init__(
    self,
    recognizers: Optional[Iterable[EntityRecognizer]] = None,
    global_regex_flags: Optional[int] = re.DOTALL | re.MULTILINE | re.IGNORECASE,
):
    """
    Create a registry, optionally seeded with recognizers.

    :param recognizers: Recognizers to start with. When omitted or empty,
        the registry starts with an empty list
    :param global_regex_flags: Regex flags stored on the registry and
        assigned to the pattern recognizers it instantiates
    """
    if not recognizers:
        self.recognizers = []
    elif isinstance(recognizers, list):
        # Keep using the caller's own list object, as before.
        self.recognizers = recognizers
    else:
        # The registry grows this collection with ``extend``, so any other
        # iterable (tuple, generator, ...) must be materialised into a list.
        self.recognizers = list(recognizers)
    self.global_regex_flags = global_regex_flags

def load_predefined_recognizers(
self, languages: Optional[List[str]] = None, nlp_engine: NlpEngine = None
Expand Down Expand Up @@ -112,10 +122,18 @@ def load_predefined_recognizers(
],
}
for lang in languages:
lang_recognizers = [rc() for rc in recognizers_map.get(lang, [])]
lang_recognizers = [
self.__instantiate_recognizer(
recognizer_class=rc, supported_language=lang
)
for rc in recognizers_map.get(lang, [])
]
self.recognizers.extend(lang_recognizers)
all_recognizers = [
rc(supported_language=lang) for rc in recognizers_map.get("ALL", [])
self.__instantiate_recognizer(
recognizer_class=rc, supported_language=lang
)
for rc in recognizers_map.get("ALL", [])
]
self.recognizers.extend(all_recognizers)
if nlp_engine:
Expand Down Expand Up @@ -283,3 +301,18 @@ def add_recognizers_from_yaml(self, yml_path: Union[str, Path]) -> None:
except TypeError as yaml_error:
print(f"Failed to parse file {yml_path}")
raise yaml_error

def __instantiate_recognizer(
    self, recognizer_class: Type[EntityRecognizer], supported_language: str
):
    """
    Build a recognizer of the given class for one language.

    Instances of ``PatternRecognizer`` additionally get this registry's
    ``global_regex_flags`` assigned to them.

    :param recognizer_class: Class object of the recognizer
    :param supported_language: Language this recognizer should support
    :return: The newly created recognizer instance
    """
    recognizer = recognizer_class(supported_language=supported_language)
    if not isinstance(recognizer, PatternRecognizer):
        return recognizer

    # Only pattern-based recognizers carry regex flags.
    recognizer.global_regex_flags = self.global_regex_flags
    return recognizer

This file was deleted.

5 changes: 2 additions & 3 deletions presidio-analyzer/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,17 +26,16 @@
"presidio_analyzer",
"presidio_analyzer.predefined_recognizers",
"presidio_analyzer.nlp_engine",
"presidio_analyzer.recognizer_registry",
"presidio_analyzer.context_aware_enhancers",
],
trusted_host=["pypi.org"],
tests_require=["pytest", "flake8>=3.7.9"],
install_requires=[
"spacy>=3.4.4",
"spacy>=3.4.4, <4.0.0",
"regex",
"tldextract",
"pyyaml",
"phonenumbers>=8.12",
"phonenumbers>=8.12,<9.0.0",
],
extras_require={
"transformers": ["spacy_huggingface_pipelines"],
Expand Down
20 changes: 19 additions & 1 deletion presidio-analyzer/tests/test_pattern_recognizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ def test_deny_list_score_change():

@pytest.mark.parametrize(
"text,flag,expected_len",
[("mrs. Kennedy", re.IGNORECASE, 1), ("mrs. Kennedy", None, 0)],
[("mrs. Kennedy", re.IGNORECASE, 1), ("mrs. Kennedy", re.DOTALL, 0)],
)
def test_deny_list_regex_flags(text, flag, expected_len):
deny_list = ["Mr.", "Mrs."]
Expand All @@ -201,3 +201,21 @@ def test_empty_deny_list_raises_value_error():
supported_language="en",
deny_list=[],
)


@pytest.mark.parametrize(
    "global_flag,expected_len",
    [(re.IGNORECASE | re.MULTILINE, 2), (re.MULTILINE, 0)],
)
def test_global_regex_flag_deny_list_returns_right_result(global_flag, expected_len):
    """Check that the recognizer-level flags drive deny-list matching."""
    # The deny-list casing deliberately differs from the casing in the text,
    # so matches are only expected when IGNORECASE is among the flags.
    text = "Mrs. smith \n\nand Mr. Jones were sitting in the room."

    recognizer = PatternRecognizer(
        supported_entity="TITLE",
        name="TitlesRecognizer",
        deny_list=["MrS", "mR"],
        global_regex_flags=global_flag,
    )

    results = recognizer.analyze(text=text, entities=["TITLE"])
    assert len(results) == expected_len
9 changes: 9 additions & 0 deletions presidio-analyzer/tests/test_recognizer_registry.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from pathlib import Path

import pytest
import regex as re

from presidio_analyzer import (
RecognizerRegistry,
Expand Down Expand Up @@ -204,3 +205,11 @@ def test_recognizer_registry_exception_erroneous_yaml():
with pytest.raises(TypeError):
registry = RecognizerRegistry()
registry.add_recognizers_from_yaml(test_yaml)


def test_predefined_pattern_recognizers_have_the_right_regex_flags():
    """Check that registry-level flags reach each predefined pattern recognizer."""
    registry = RecognizerRegistry(global_regex_flags=re.DOTALL)
    registry.load_predefined_recognizers()

    pattern_recognizers = [
        rec for rec in registry.recognizers if isinstance(rec, PatternRecognizer)
    ]
    for rec in pattern_recognizers:
        assert rec.global_regex_flags == re.DOTALL