From 401e40559e86a9726611754f6a5d2b18a40dba05 Mon Sep 17 00:00:00 2001 From: Gokulakrishnan Shankar Date: Fri, 16 Jun 2023 00:02:10 +0530 Subject: [PATCH 1/4] Initial logic check for merging 2 entities --- .../presidio_anonymizer/anonymizer_engine.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/presidio-anonymizer/presidio_anonymizer/anonymizer_engine.py b/presidio-anonymizer/presidio_anonymizer/anonymizer_engine.py index 9741167a7..834173b57 100644 --- a/presidio-anonymizer/presidio_anonymizer/anonymizer_engine.py +++ b/presidio-anonymizer/presidio_anonymizer/anonymizer_engine.py @@ -1,5 +1,6 @@ """Handles the entire logic of the Presidio-anonymizer and text anonymizing.""" import logging +import re from typing import List, Dict, Optional from presidio_anonymizer.core import EngineBase @@ -78,9 +79,22 @@ def anonymize( analyzer_results ) + # This list merges adjacent entities of the same type + # if there is whitespace between them. + merged_results = [] + prev_result = None + for result in analyzer_results: + if prev_result is not None: + if prev_result.entity_type == result.entity_type: + if re.search(r'^( )+$', text[prev_result.end:result.start]): + merged_results.remove(prev_result) + result.start = prev_result.start + merged_results.append(result) + prev_result = result + operators = self.__check_or_add_default_operator(operators) - return self._operate(text, analyzer_results, operators, OperatorType.Anonymize) + return self._operate(text, merged_results, operators, OperatorType.Anonymize) def _remove_conflicts_and_get_text_manipulation_data( self, analyzer_results: List[RecognizerResult] From 6181df8743e90e3523bc71aae4467a66584bbeb5 Mon Sep 17 00:00:00 2001 From: Gokulakrishnan Shankar Date: Sat, 17 Jun 2023 17:00:20 +0530 Subject: [PATCH 2/4] Unit test added --- .../tests/test_anonymizer_engine.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/presidio-anonymizer/tests/test_anonymizer_engine.py b/presidio-anonymizer/tests/test_anonymizer_engine.py index b547002ad..c498c5e16 100644 --- a/presidio-anonymizer/tests/test_anonymizer_engine.py +++ b/presidio-anonymizer/tests/test_anonymizer_engine.py @@ -157,6 +157,25 @@ def test_given_several_results_then_we_filter_them_and_get_correct_mocked_result assert result.items[0].text == "text" +def test_given_sorted_analyzer_results_merge_entities_separated_by_white_space(): + analyzer_results = [ + RecognizerResult(start=11, end=16, score=0.8, entity_type="PERSON"), + RecognizerResult(start=17, end=22, score=0.8, entity_type="PERSON"), + ] + engine = AnonymizerEngine() + result = engine.anonymize( + "My name is David Jones", + analyzer_results, + operators={"PERSON": OperatorConfig("replace", {"new_value": "BIP"})}, + ) + assert result.text == "My name is BIP" + assert result.items[0].operator == "replace" + assert result.items[0].entity_type == "PERSON" + assert result.items[0].start == 11 + assert result.items[0].end == 14 + assert result.items[0].text == "BIP" + + def _operate( text: str, text_metadata: List[PIIEntity], From 714cca506c9d1ad7fe21bb60995fc8935fed380b Mon Sep 17 00:00:00 2001 From: Gokulakrishnan Shankar Date: Mon, 19 Jun 2023 18:54:52 +0530 Subject: [PATCH 3/4] Merge logic moved to function --- .../presidio_anonymizer/anonymizer_engine.py | 33 ++++++++++++------- 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/presidio-anonymizer/presidio_anonymizer/anonymizer_engine.py b/presidio-anonymizer/presidio_anonymizer/anonymizer_engine.py index 834173b57..8800dc0b0 100644 --- a/presidio-anonymizer/presidio_anonymizer/anonymizer_engine.py +++ b/presidio-anonymizer/presidio_anonymizer/anonymizer_engine.py @@ -79,18 +79,9 @@ def anonymize( analyzer_results ) - # This list merges adjacent entities of the same type - # if there is whitespace between them. - merged_results = [] - prev_result = None - for result in analyzer_results: - if prev_result is not None: - if prev_result.entity_type == result.entity_type: - if re.search(r'^( )+$', text[prev_result.end:result.start]): - merged_results.remove(prev_result) - result.start = prev_result.start - merged_results.append(result) - prev_result = result + merged_results = self._merge_entities_with_whitespace_between( + text, analyzer_results + ) operators = self.__check_or_add_default_operator(operators) @@ -153,6 +144,24 @@ def _remove_conflicts_and_get_text_manipulation_data( ) return unique_text_metadata_elements + def _merge_entities_with_whitespace_between( + self, + text: str, + analyzer_results: List[RecognizerResult] + ) -> List[RecognizerResult]: + """Merge adjacent entities of the same type separated by whitespace.""" + merged_results = [] + prev_result = None + for result in analyzer_results: + if prev_result is not None: + if prev_result.entity_type == result.entity_type: + if re.search(r'^( )+$', text[prev_result.end:result.start]): + merged_results.remove(prev_result) + result.start = prev_result.start + merged_results.append(result) + prev_result = result + return merged_results + def get_anonymizers(self) -> List[str]: """Return a list of supported anonymizers.""" names = [p for p in self.operators_factory.get_anonymizers().keys()] From 86205342be1e9ec909a78e5bc2e9033cf48c0e4b Mon Sep 17 00:00:00 2001 From: Gokulakrishnan Shankar Date: Mon, 19 Jun 2023 18:57:46 +0530 Subject: [PATCH 4/4] Test cases added --- .../tests/test_anonymizer_engine.py | 84 ++++++++++++++++--- 1 file changed, 72 insertions(+), 12 deletions(-) diff --git a/presidio-anonymizer/tests/test_anonymizer_engine.py b/presidio-anonymizer/tests/test_anonymizer_engine.py index c498c5e16..9d8ae88b7 100644 --- a/presidio-anonymizer/tests/test_anonymizer_engine.py +++ b/presidio-anonymizer/tests/test_anonymizer_engine.py @@ -157,23 +157,83 @@ def test_given_several_results_then_we_filter_them_and_get_correct_mocked_result assert result.items[0].text == "text" -def test_given_sorted_analyzer_results_merge_entities_separated_by_white_space(): - analyzer_results = [ - RecognizerResult(start=11, end=16, score=0.8, entity_type="PERSON"), - RecognizerResult(start=17, end=22, score=0.8, entity_type="PERSON"), +@pytest.mark.parametrize( + # fmt: off + "text, analyzer_results, expected", + [ + ( + "My name is David Jones", + [ + RecognizerResult(start=11, end=16, score=0.8, entity_type="PERSON"), + RecognizerResult(start=17, end=22, score=0.8, entity_type="PERSON"), + ], + EngineResult( + text="My name is BIP", + items=[ + OperatorResult(11, 14, "PERSON", "BIP", "replace"), + ] + ) + ), + ( + "My name is David Jones", + [ + RecognizerResult(start=11, end=16, score=0.8, entity_type="PERSON"), + RecognizerResult(start=19, end=24, score=0.8, entity_type="PERSON"), + ], + EngineResult( + text="My name is BIP", + items=[ + OperatorResult(11, 14, "PERSON", "BIP", "replace"), + ] + ) + ), + ( + "My name is Jones, David", + [ + RecognizerResult(start=11, end=16, score=0.8, entity_type="PERSON"), + RecognizerResult(start=18, end=23, score=0.8, entity_type="PERSON"), + ], + EngineResult( + text="My name is BIP, BIP", + items=[ + OperatorResult(11, 14, "PERSON", "BIP", "replace"), + OperatorResult(16, 19, "PERSON", "BIP", "replace"), + ] + ) + ), + ( + "The phone book said: Jones 212-555-5555", + [ + RecognizerResult(start=21, end=26, score=0.8, entity_type="PERSON"), + RecognizerResult( + start=27, end=39, score=0.8, entity_type="PHONE NUMBER" + ), + ], + EngineResult( + text="The phone book said: BIP BEEP", + items=[ + OperatorResult(21, 24, "PERSON", "BIP", "replace"), + OperatorResult(25, 29, "PHONE NUMBER", "BEEP", "replace"), + ] + ) + ), ] + # fmt: on +) +def test_given_sorted_analyzer_results_merge_entities_separated_by_white_space( + text, analyzer_results, expected +): engine = AnonymizerEngine() result = engine.anonymize( - "My name is David Jones", + text, analyzer_results, - operators={"PERSON": OperatorConfig("replace", {"new_value": "BIP"})}, + operators={ + "PERSON": OperatorConfig("replace", {"new_value": "BIP"}), + "PHONE NUMBER": OperatorConfig("replace", {"new_value": "BEEP"}), + }, ) - assert result.text == "My name is BIP" - assert result.items[0].operator == "replace" - assert result.items[0].entity_type == "PERSON" - assert result.items[0].start == 11 - assert result.items[0].end == 14 - assert result.items[0].text == "BIP" + assert result.text == expected.text + assert sorted(result.items) == sorted(expected.items) def _operate(