From 35cdacf4b98b59cb522598041373f99874e7de58 Mon Sep 17 00:00:00 2001
From: omri374
Date: Thu, 22 Apr 2021 12:53:34 +0300
Subject: [PATCH 1/8] initial commit

---
 docs/analyzer/adding_recognizers.md        | 53 +++++++++++++
 docs/api-docs/api-docs.yml                 | 59 ++++++++++++++-
 .../customizing_presidio_analyzer.ipynb    | 74 ++++++++++++++++++-
 e2e-tests/tests/test_analyzer.py           | 73 ++++++++++++++++++
 presidio-analyzer/app.py                   |  1 +
 .../presidio_analyzer/analyzer_engine.py   |  6 ++
 .../presidio_analyzer/analyzer_request.py  |  8 ++
 .../tests/test_analyzer_engine.py          | 74 +++++++++++++++++++
 8 files changed, 341 insertions(+), 7 deletions(-)

diff --git a/docs/analyzer/adding_recognizers.md b/docs/analyzer/adding_recognizers.md
index 15de9f582..18987712c 100644
--- a/docs/analyzer/adding_recognizers.md
+++ b/docs/analyzer/adding_recognizers.md
@@ -126,6 +126,59 @@ To add a recognizer to the list of pre-defined recognizers:
 3. Add the recognizer to the `recognizers_map` dict in the `RecognizerRegistry.load_predefined_recognizers` method. In this map, the key is the language the recognizer supports, and the value is the class itself. If your recognizer detects entities in multiple languages, add it under the "ALL" key.
 4. Optional: Update documentation (e.g., the [supported entities list](../supported_entities.md)).
 
+### Creating ad-hoc recognizers
+
+In addition to recognizers in code, it is possible to create ad-hoc recognizers via the Presidio Analyzer API for regex and deny-list based logic.
+These recognizers, in JSON form, are added to the `/analyze` request and are only used in the context of this request.
+
+- The JSON structure for a regex ad-hoc recognizer is the following:
+
+  ```json
+  {
+    "text": "John Smith drivers license is AC432223. Zip code: 10023",
+    "language": "en",
+    "ad_hoc_recognizers":[
+      {
+        "name": "Zip code Recognizer",
+        "supported_language": "en",
+        "patterns": [
+          {
+            "name": "zip code (weak)",
+            "regex": "(\\b\\d{5}(?:\\-\\d{4})?\\b)",
+            "score": 0.01
+          }
+        ],
+        "supported_entity":"ZIP"
+      }
+    ]
+  }
+  ```
+
+- The JSON structure for a deny-list based recognizer is the following:
+
+  ```json
+  {
+    "text": "Mr. John Smith's drivers license is AC432223",
+    "language": "en",
+    "ad_hoc_recognizers":[
+      {
+        "name": "Mr. Recognizer",
+        "supported_language": "en",
+        "deny_list": ["Mr", "Mr.", "Mister"],
+        "supported_entity":"MR_TITLE"
+      },
+      {
+        "name": "Ms. Recognizer",
+        "supported_language": "en",
+        "deny_list": ["Ms", "Ms.", "Miss", "Mrs", "Mrs."],
+        "supported_entity":"MS_TITLE"
+      }
+    ]
+  }
+  ```
+
+In both examples, the `/analyze` request is extended with a list of `ad_hoc_recognizers`, each of which can define `patterns`, a `deny_list`, or both.
+
 ## PII detection in different languages
 
 For recognizers in new languages, refer to the [languages documentation](languages.md).
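The documented request above can be exercised directly against a running analyzer service. A minimal client-side sketch, assuming a local Presidio Analyzer container listening on port 5002 (the host port used in Presidio's Docker examples; the URL and port are assumptions, only the request body format comes from the documentation above):

```python
# Post the documented ad-hoc regex recognizer to the /analyze endpoint.
import requests

request_body = {
    "text": "John Smith drivers license is AC432223. Zip code: 10023",
    "language": "en",
    "ad_hoc_recognizers": [
        {
            "name": "Zip code Recognizer",
            "supported_language": "en",
            "patterns": [
                # json serialization escapes the backslashes, producing the
                # documented "(\\b\\d{5}(?:\\-\\d{4})?\\b)" wire format
                {
                    "name": "zip code (weak)",
                    "regex": r"(\b\d{5}(?:\-\d{4})?\b)",
                    "score": 0.01,
                }
            ],
            "supported_entity": "ZIP",
        }
    ],
}

resp = requests.post("http://localhost:5002/analyze", json=request_body)
print(resp.json())  # expect PERSON, US_DRIVER_LICENSE and a low-score ZIP hit
```

Nothing is registered server-side; the recognizer lives only for the duration of the request, which is what the `analyzer_request.py` and `analyzer_engine.py` changes below implement.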
diff --git a/docs/api-docs/api-docs.yml b/docs/api-docs/api-docs.yml
index 2f4a76776..ff6a14da2 100644
--- a/docs/api-docs/api-docs.yml
+++ b/docs/api-docs/api-docs.yml
@@ -281,13 +281,27 @@ components:
       Enhanced Request :
         value:
           {
-            "text": "John Smith drivers license is AC432223",
+            "text": "John Smith drivers license is AC432223 and the zip code is 12345",
             "language": "en",
             "return_decision_process": false,
             "correlation_id": "123e4567-e89b-12d3-a456-426614174000",
             "score_threshold": 0.6,
-            "entities": ["US_DRIVER_LICENSE"],
-            "trace": false
+            "entities": ["US_DRIVER_LICENSE", "ZIP"],
+            "trace": false,
+            "ad_hoc_recognizers":[
+              {
+                "name": "Zip code Recognizer",
+                "supported_language": "en",
+                "patterns": [
+                  {
+                    "name": "zip code (weak)",
+                    "regex": "(\\b\\d{5}(?:\\-\\d{4})?\\b)",
+                    "score": 0.01
+                  }
+                ],
+                "supported_entity":"ZIP"
+              }
+            ]
           }

       AnonymizeRequest:
@@ -377,6 +391,10 @@ components:
         return_decision_process:
           type: boolean
           description: "Whether to include analysis explanation in the response"
+        ad_hoc_recognizers:
+          type: array
+          items:
+            $ref: "#/components/schemas/PatternRecognizer"

       AnonymizeRequest:
         type: object
@@ -515,7 +533,42 @@ components:
           type: number
           format: double
           description: "Result of a validation (e.g. checksum)"
+
+    Pattern:
+      type: object
+      properties:
+        name:
+          type: string
+          description: "Name of regular expression pattern"
+        regex:
+          type: string
+          description: "Regex pattern string"
+        score:
+          type: double
+          description: "Detection confidence of this pattern (0.01 if very noisy, 0.6-1.0 if very specific)"
+
+    PatternRecognizer:
+      type: object
+      description: "A regular-expression or deny-list based recognizer"
+      properties:
+        name:
+          type: string
+          description: "Name of recognizer"
+        supported_language:
+          type: string
+          description: "Language code supported by this recognizer"
+        patterns:
+          type: array
+          items:
+            $ref: "#/components/schemas/Pattern"
+        deny_list:
+          type: array
+          items:
+            type: string
+        supported_entity:
+          type: string
+          description: "The name of the entity this ad-hoc recognizer detects"

     EntityTypes:
       description: "The supported PII entity types."

diff --git a/docs/samples/python/customizing_presidio_analyzer.ipynb b/docs/samples/python/customizing_presidio_analyzer.ipynb
index a8c03576d..97512cfb4 100644
--- a/docs/samples/python/customizing_presidio_analyzer.ipynb
+++ b/docs/samples/python/customizing_presidio_analyzer.ipynb
@@ -145,7 +145,27 @@
    "cell_type": "code",
    "execution_count": 6,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2021-04-21 17:59:32,749] [WARNING] [W108] The rule-based lemmatizer did not find POS annotation for the token 'I'. Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.\n",
+      "[2021-04-21 17:59:32,752] [WARNING] [W108] The rule-based lemmatizer did not find POS annotation for the token 'suspect'. Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.\n",
+      "[2021-04-21 17:59:32,754] [WARNING] [W108] The rule-based lemmatizer did not find POS annotation for the token 'Professor'. Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.\n",
+      "[2021-04-21 17:59:32,756] [WARNING] [W108] The rule-based lemmatizer did not find POS annotation for the token 'Plum'. 
Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.\n", + "[2021-04-21 17:59:32,758] [WARNING] [W108] The rule-based lemmatizer did not find POS annotation for the token ','. Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.\n", + "[2021-04-21 17:59:32,759] [WARNING] [W108] The rule-based lemmatizer did not find POS annotation for the token 'in'. Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.\n", + "[2021-04-21 17:59:32,763] [WARNING] [W108] The rule-based lemmatizer did not find POS annotation for the token 'the'. Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.\n", + "[2021-04-21 17:59:32,766] [WARNING] [W108] The rule-based lemmatizer did not find POS annotation for the token 'Dining'. Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.\n", + "[2021-04-21 17:59:32,768] [WARNING] [W108] The rule-based lemmatizer did not find POS annotation for the token 'Room'. Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.\n", + "[2021-04-21 17:59:32,770] [WARNING] [W108] The rule-based lemmatizer did not find POS annotation for the token ','. Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.\n", + "[2021-04-21 17:59:32,771] [WARNING] [W108] The rule-based lemmatizer did not find POS annotation for the token 'with'. Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.\n", + "[2021-04-21 17:59:32,774] [WARNING] [W108] The rule-based lemmatizer did not find POS annotation for the token 'the'. Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.\n", + "[2021-04-21 17:59:32,776] [WARNING] [W108] The rule-based lemmatizer did not find POS annotation for the token 'candlestick'. Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.\n" + ] + } + ], "source": [ "results = analyzer.analyze(text=text1, language=\"en\")" ] @@ -356,6 +376,20 @@ "execution_count": 14, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2021-04-21 17:59:35,950] [WARNING] [W108] The rule-based lemmatizer did not find POS annotation for the token 'Roberto'. Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.\n", + "[2021-04-21 17:59:35,953] [WARNING] [W108] The rule-based lemmatizer did not find POS annotation for the token 'lives'. Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.\n", + "[2021-04-21 17:59:35,954] [WARNING] [W108] The rule-based lemmatizer did not find POS annotation for the token 'in'. Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.\n", + "[2021-04-21 17:59:35,956] [WARNING] [W108] The rule-based lemmatizer did not find POS annotation for the token 'Five'. 
Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.\n", + "[2021-04-21 17:59:35,958] [WARNING] [W108] The rule-based lemmatizer did not find POS annotation for the token '10'. Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.\n", + "[2021-04-21 17:59:35,959] [WARNING] [W108] The rule-based lemmatizer did not find POS annotation for the token 'Broad'. Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.\n", + "[2021-04-21 17:59:35,960] [WARNING] [W108] The rule-based lemmatizer did not find POS annotation for the token 'st'. Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.\n", + "[2021-04-21 17:59:35,962] [WARNING] [W108] The rule-based lemmatizer did not find POS annotation for the token '.'. Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.\n" + ] + }, { "name": "stdout", "output_type": "stream", @@ -427,6 +461,16 @@ "execution_count": 15, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2021-04-21 17:59:39,783] [WARNING] [W108] The rule-based lemmatizer did not find POS annotation for the token 'My'. Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.\n", + "[2021-04-21 17:59:39,784] [WARNING] [W108] The rule-based lemmatizer did not find POS annotation for the token 'name'. Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.\n", + "[2021-04-21 17:59:39,785] [WARNING] [W108] The rule-based lemmatizer did not find POS annotation for the token 'is'. Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.\n", + "[2021-04-21 17:59:39,786] [WARNING] [W108] The rule-based lemmatizer did not find POS annotation for the token 'David'. Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.\n" + ] + }, { "name": "stdout", "output_type": "stream", @@ -434,7 +478,7 @@ "Results from Spanish request:\n", "[type: PERSON, start: 13, end: 18, score: 0.85]\n", "Results from English request:\n", - "[type: PERSON, start: 11, end: 16, score: 0.85]\n" + "[]\n" ] } ], @@ -506,7 +550,7 @@ ], "source": [ "# Define the regex pattern\n", - "regex = \"(\\d{5}(?:\\-\\d{4})?)$\" # very weak regex pattern, taken from here: https://stackoverflow.com/questions/2577236/regex-for-zip-code\n", + "regex = r\"(\\b\\d{5}(?:\\-\\d{4})?\\b)\" # very weak regex pattern\n", "zipcode_pattern = Pattern(name=\"zip code (weak)\", regex=regex, score=0.01)\n", "\n", "# Define the recognizer with the defined pattern\n", @@ -559,6 +603,17 @@ "execution_count": 19, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2021-04-21 17:59:42,049] [WARNING] [W108] The rule-based lemmatizer did not find POS annotation for the token 'My'. Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.\n", + "[2021-04-21 17:59:42,049] [WARNING] [W108] The rule-based lemmatizer did not find POS annotation for the token 'zip'. 
Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.\n", + "[2021-04-21 17:59:42,049] [WARNING] [W108] The rule-based lemmatizer did not find POS annotation for the token 'code'. Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.\n", + "[2021-04-21 17:59:42,052] [WARNING] [W108] The rule-based lemmatizer did not find POS annotation for the token 'is'. Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.\n", + "[2021-04-21 17:59:42,052] [WARNING] [W108] The rule-based lemmatizer did not find POS annotation for the token '90210'. Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.\n" + ] + }, { "name": "stdout", "output_type": "stream", @@ -607,6 +662,17 @@ "execution_count": 20, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2021-04-21 17:59:42,084] [WARNING] [W108] The rule-based lemmatizer did not find POS annotation for the token 'My'. Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.\n", + "[2021-04-21 17:59:42,084] [WARNING] [W108] The rule-based lemmatizer did not find POS annotation for the token 'zip'. Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.\n", + "[2021-04-21 17:59:42,084] [WARNING] [W108] The rule-based lemmatizer did not find POS annotation for the token 'code'. Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.\n", + "[2021-04-21 17:59:42,084] [WARNING] [W108] The rule-based lemmatizer did not find POS annotation for the token 'is'. Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.\n", + "[2021-04-21 17:59:42,088] [WARNING] [W108] The rule-based lemmatizer did not find POS annotation for the token '90210'. Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.\n" + ] + }, { "name": "stdout", "output_type": "stream", @@ -614,7 +680,7 @@ "Decision process output:\n", "\n", "{'original_score': 0.01,\n", - " 'pattern': '(\\\\d{5}(?:\\\\-\\\\d{4})?)$',\n", + " 'pattern': '(\\\\b\\\\d{5}(?:\\\\-\\\\d{4})?\\\\b)',\n", " 'pattern_name': 'zip code (weak)',\n", " 'recognizer': 'PatternRecognizer',\n", " 'score': 0.4,\n", diff --git a/e2e-tests/tests/test_analyzer.py b/e2e-tests/tests/test_analyzer.py index 4a9f28238..9260da23e 100644 --- a/e2e-tests/tests/test_analyzer.py +++ b/e2e-tests/tests/test_analyzer.py @@ -1,4 +1,5 @@ import pytest + from common.assertions import equal_json_strings from common.methods import analyze, analyzer_supported_entities @@ -269,3 +270,75 @@ def test_given_an_illegal_input_for_supported_entities_then_igonre_and_proceed() """ assert response_status == 200 assert equal_json_strings(expected_response, response_content) + + +@pytest.mark.api +def test_given_ad_hoc_pattern_recognizer_the_right_entities_are_returned(): + request_body = r""" + { + "text": "John Smith drivers license is AC432223. 
Zip code: 10023", + "language": "en", + "ad_hoc_recognizers":[ + { + "name": "Zip code Recognizer", + "supported_language": "en", + "patterns": [ + { + "name": "zip code (weak)", + "regex": "(\\b\\d{5}(?:\\-\\d{4})?\\b)", + "score": 0.01 + } + ], + "supported_entity":"ZIP" + } + ] + } + """ + + response_status, response_content = analyze(request_body) + + expected_response = """ + [ + {"entity_type": "PERSON", "start": 0, "end": 10, "score": 0.85, "analysis_explanation":null}, + {"entity_type": "US_DRIVER_LICENSE", "start": 30, "end": 38, "score": 0.6499999999999999, "analysis_explanation":null}, + {"entity_type": "ZIP", "start": 50, "end": 55, "score": 0.01, "analysis_explanation":null} + ] + """ + assert response_status == 200 + assert equal_json_strings(expected_response, response_content) + + +@pytest.mark.api +def test_given_ad_hoc_deny_list_recognizer_the_right_entities_are_returned(): + request_body = r""" + { + "text": "Mr. John Smith's drivers license is AC432223", + "language": "en", + "ad_hoc_recognizers":[ + { + "name": "Mr. Recognizer", + "supported_language": "en", + "deny_list": ["Mr", "Mr.", "Mister"], + "supported_entity":"MR_TITLE" + }, + { + "name": "Ms. Recognizer", + "supported_language": "en", + "deny_list": ["Ms", "Ms.", "Miss", "Mrs", "Mrs."], + "supported_entity":"MS_TITLE" + } + ] + } + """ + + response_status, response_content = analyze(request_body) + + expected_response = """ + [ + {"entity_type": "PERSON", "start": 4, "end": 14, "score": 0.85, "analysis_explanation":null}, + {"entity_type": "US_DRIVER_LICENSE", "start": 36, "end": 44, "score": 0.6499999999999999, "analysis_explanation":null}, + {"entity_type": "MR_TITLE", "start": 0, "end": 3, "score": 1.0, "analysis_explanation":null} + ] + """ + assert response_status == 200 + assert equal_json_strings(expected_response, response_content) diff --git a/presidio-analyzer/app.py b/presidio-analyzer/app.py index ae3557521..7174843d6 100644 --- a/presidio-analyzer/app.py +++ b/presidio-analyzer/app.py @@ -64,6 +64,7 @@ def analyze() -> Tuple[str, int]: score_threshold=req_data.score_threshold, entities=req_data.entities, return_decision_process=req_data.return_decision_process, + ad_hoc_recognizers=req_data.ad_hoc_recognizers, ) return Response( diff --git a/presidio-analyzer/presidio_analyzer/analyzer_engine.py b/presidio-analyzer/presidio_analyzer/analyzer_engine.py index 43e1ba235..b83b2f74b 100644 --- a/presidio-analyzer/presidio_analyzer/analyzer_engine.py +++ b/presidio-analyzer/presidio_analyzer/analyzer_engine.py @@ -113,6 +113,7 @@ def analyze( correlation_id: Optional[str] = None, score_threshold: Optional[float] = None, return_decision_process: Optional[bool] = False, + ad_hoc_recognizers: Optional[List[EntityRecognizer]] = None, ) -> List[RecognizerResult]: """ Find PII entities in text using different PII recognizers for a given language. @@ -126,6 +127,8 @@ def analyze( to return an identified entity :param return_decision_process: Whether the analysis decision process steps returned in the response. + :param ad_hoc_recognizers: List of recognizers which will be used only + for this specific request. 
:return: an array of the found entities in the text :example: @@ -147,6 +150,9 @@ def analyze( language=language, entities=entities, all_fields=all_fields ) + if ad_hoc_recognizers: + recognizers.extend(ad_hoc_recognizers) + if all_fields: # Since all_fields=True, list all entities by iterating # over all recognizers diff --git a/presidio-analyzer/presidio_analyzer/analyzer_request.py b/presidio-analyzer/presidio_analyzer/analyzer_request.py index 3f3450d14..ec936e6f4 100644 --- a/presidio-analyzer/presidio_analyzer/analyzer_request.py +++ b/presidio-analyzer/presidio_analyzer/analyzer_request.py @@ -1,5 +1,7 @@ from typing import Dict +from presidio_analyzer import PatternRecognizer + class AnalyzerRequest: """ @@ -25,3 +27,9 @@ def __init__(self, req_data: Dict): self.correlation_id = req_data.get("correlation_id") self.score_threshold = req_data.get("score_threshold") self.return_decision_process = req_data.get("return_decision_process") + ad_hoc_recognizers = req_data.get("ad_hoc_recognizers") + self.ad_hoc_recognizers = [] + if ad_hoc_recognizers: + self.ad_hoc_recognizers = [ + PatternRecognizer.from_dict(rec) for rec in ad_hoc_recognizers + ] diff --git a/presidio-analyzer/tests/test_analyzer_engine.py b/presidio-analyzer/tests/test_analyzer_engine.py index b12e6c921..69270cb3d 100644 --- a/presidio-analyzer/tests/test_analyzer_engine.py +++ b/presidio-analyzer/tests/test_analyzer_engine.py @@ -519,3 +519,77 @@ def test_when_read_test_spacy_nlp_conf_file_then_returns_spacy_nlp_engine( assert isinstance(engine.nlp_engine, SpacyNlpEngine) assert engine.nlp_engine.nlp is not None + + +def test_when_ad_hoc_pattern_recognizer_is_added_then_result_contains_result( + loaded_analyzer_engine, +): + text = "John Smith drivers license is AC432223 and his zip code is 10023" + regex = r"(\b\d{5}(?:\-\d{4})?\b)" + zipcode_pattern = Pattern(name="zip code (weak)", regex=regex, score=0.01) + + zip_code_recognizer = PatternRecognizer( + supported_entity="ZIP", patterns=[zipcode_pattern] + ) + + responses = loaded_analyzer_engine.analyze( + text=text, language="en", ad_hoc_recognizers=[zip_code_recognizer] + ) + + detected_entities = [response.entity_type for response in responses] + assert "ZIP" in detected_entities + + +def test_when_ad_hoc_deny_list_recognizer_is_added_then_result_contains_result( + loaded_analyzer_engine, +): + text = "Mr. John Smith's drivers license is AC432223" + + mr_recognizer = PatternRecognizer(supported_entity="MR", deny_list=["Mr.", "Mr"]) + + responses = loaded_analyzer_engine.analyze( + text=text, language="en", ad_hoc_recognizers=[mr_recognizer] + ) + + detected_entities = [response.entity_type for response in responses] + assert "MR" in detected_entities + + +def test_when_ad_hoc_deny_list_recognizer_is_added_then_result_does_not_persist( + loaded_analyzer_engine, +): + text = "Mr. John Smith's drivers license is AC432223" + + mr_recognizer = PatternRecognizer(supported_entity="MR", deny_list=["Mr.", "Mr"]) + + responses1 = loaded_analyzer_engine.analyze( + text=text, language="en", ad_hoc_recognizers=[mr_recognizer] + ) + responses2 = loaded_analyzer_engine.analyze(text=text, language="en") + + detected_entities1 = [response.entity_type for response in responses1] + assert "MR" in detected_entities1 + + detected_entities2 = [response.entity_type for response in responses2] + assert "MR" not in detected_entities2 + + +def test_when_ad_hoc_deny_list_recognizer_contains_both_regex_and_deny_list( + loaded_analyzer_engine, +): + text = "Mr. 
John Smith's zip code is 10023 or 999" + regex = r"(\b\d{5}(?:\-\d{4})?\b)" + zipcode_pattern = Pattern(name="zip code (weak)", regex=regex, score=0.01) + + zip_recognizer = PatternRecognizer( + supported_entity="ZIP", deny_list=["999"], patterns=[zipcode_pattern] + ) + + responses = loaded_analyzer_engine.analyze( + text=text, language="en", ad_hoc_recognizers=[zip_recognizer] + ) + + detected_zips = [ + response.entity_type for response in responses if response.entity_type == "ZIP" + ] + assert len(detected_zips) == 2 From 0c2dad72b74fbd44fb9cc43173a8564153655449 Mon Sep 17 00:00:00 2001 From: omri374 Date: Thu, 22 Apr 2021 13:06:24 +0300 Subject: [PATCH 2/8] yaml fix --- docs/api-docs/api-docs.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/api-docs/api-docs.yml b/docs/api-docs/api-docs.yml index ff6a14da2..5b0e04d6e 100644 --- a/docs/api-docs/api-docs.yml +++ b/docs/api-docs/api-docs.yml @@ -544,7 +544,8 @@ components: type: string description: "Regex pattern string" score: - type: double + type: number + format: double description: "Detection confidence of this pattern (0.01 if very noisy, 0.6-1.0 if very specific)" From 201e672c096e613d8919134b30309f3938d5053b Mon Sep 17 00:00:00 2001 From: omri374 Date: Thu, 22 Apr 2021 14:39:08 +0300 Subject: [PATCH 3/8] updated jupyter notebook and removed disabling of tagger (required in spacy 3) --- docs/api-docs/api-docs.yml | 1 + .../customizing_presidio_analyzer.ipynb | 68 +------------------ .../nlp_engine/spacy_nlp_engine.py | 2 +- 3 files changed, 3 insertions(+), 68 deletions(-) diff --git a/docs/api-docs/api-docs.yml b/docs/api-docs/api-docs.yml index 5b0e04d6e..6fa8652c6 100644 --- a/docs/api-docs/api-docs.yml +++ b/docs/api-docs/api-docs.yml @@ -393,6 +393,7 @@ components: description: "Whether to include analysis explanation in the response" ad_hoc_recognizers: type: array + description: "list of recognizers to be used in the context of this request only (ad-hoc)." items: $ref: "#/components/schemas/PatternRecognizer" diff --git a/docs/samples/python/customizing_presidio_analyzer.ipynb b/docs/samples/python/customizing_presidio_analyzer.ipynb index 97512cfb4..ea4258b26 100644 --- a/docs/samples/python/customizing_presidio_analyzer.ipynb +++ b/docs/samples/python/customizing_presidio_analyzer.ipynb @@ -145,27 +145,7 @@ "cell_type": "code", "execution_count": 6, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[2021-04-21 17:59:32,749] [WARNING] [W108] The rule-based lemmatizer did not find POS annotation for the token 'I'. Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.\n", - "[2021-04-21 17:59:32,752] [WARNING] [W108] The rule-based lemmatizer did not find POS annotation for the token 'suspect'. Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.\n", - "[2021-04-21 17:59:32,754] [WARNING] [W108] The rule-based lemmatizer did not find POS annotation for the token 'Professor'. Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.\n", - "[2021-04-21 17:59:32,756] [WARNING] [W108] The rule-based lemmatizer did not find POS annotation for the token 'Plum'. 
Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.\n", - "[2021-04-21 17:59:32,758] [WARNING] [W108] The rule-based lemmatizer did not find POS annotation for the token ','. Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.\n", - "[2021-04-21 17:59:32,759] [WARNING] [W108] The rule-based lemmatizer did not find POS annotation for the token 'in'. Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.\n", - "[2021-04-21 17:59:32,763] [WARNING] [W108] The rule-based lemmatizer did not find POS annotation for the token 'the'. Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.\n", - "[2021-04-21 17:59:32,766] [WARNING] [W108] The rule-based lemmatizer did not find POS annotation for the token 'Dining'. Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.\n", - "[2021-04-21 17:59:32,768] [WARNING] [W108] The rule-based lemmatizer did not find POS annotation for the token 'Room'. Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.\n", - "[2021-04-21 17:59:32,770] [WARNING] [W108] The rule-based lemmatizer did not find POS annotation for the token ','. Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.\n", - "[2021-04-21 17:59:32,771] [WARNING] [W108] The rule-based lemmatizer did not find POS annotation for the token 'with'. Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.\n", - "[2021-04-21 17:59:32,774] [WARNING] [W108] The rule-based lemmatizer did not find POS annotation for the token 'the'. Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.\n", - "[2021-04-21 17:59:32,776] [WARNING] [W108] The rule-based lemmatizer did not find POS annotation for the token 'candlestick'. Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.\n" - ] - } - ], + "outputs": [], "source": [ "results = analyzer.analyze(text=text1, language=\"en\")" ] @@ -376,20 +356,6 @@ "execution_count": 14, "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[2021-04-21 17:59:35,950] [WARNING] [W108] The rule-based lemmatizer did not find POS annotation for the token 'Roberto'. Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.\n", - "[2021-04-21 17:59:35,953] [WARNING] [W108] The rule-based lemmatizer did not find POS annotation for the token 'lives'. Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.\n", - "[2021-04-21 17:59:35,954] [WARNING] [W108] The rule-based lemmatizer did not find POS annotation for the token 'in'. Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.\n", - "[2021-04-21 17:59:35,956] [WARNING] [W108] The rule-based lemmatizer did not find POS annotation for the token 'Five'. 
Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.\n", - "[2021-04-21 17:59:35,958] [WARNING] [W108] The rule-based lemmatizer did not find POS annotation for the token '10'. Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.\n", - "[2021-04-21 17:59:35,959] [WARNING] [W108] The rule-based lemmatizer did not find POS annotation for the token 'Broad'. Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.\n", - "[2021-04-21 17:59:35,960] [WARNING] [W108] The rule-based lemmatizer did not find POS annotation for the token 'st'. Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.\n", - "[2021-04-21 17:59:35,962] [WARNING] [W108] The rule-based lemmatizer did not find POS annotation for the token '.'. Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.\n" - ] - }, { "name": "stdout", "output_type": "stream", @@ -461,16 +427,6 @@ "execution_count": 15, "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[2021-04-21 17:59:39,783] [WARNING] [W108] The rule-based lemmatizer did not find POS annotation for the token 'My'. Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.\n", - "[2021-04-21 17:59:39,784] [WARNING] [W108] The rule-based lemmatizer did not find POS annotation for the token 'name'. Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.\n", - "[2021-04-21 17:59:39,785] [WARNING] [W108] The rule-based lemmatizer did not find POS annotation for the token 'is'. Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.\n", - "[2021-04-21 17:59:39,786] [WARNING] [W108] The rule-based lemmatizer did not find POS annotation for the token 'David'. Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.\n" - ] - }, { "name": "stdout", "output_type": "stream", @@ -603,17 +559,6 @@ "execution_count": 19, "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[2021-04-21 17:59:42,049] [WARNING] [W108] The rule-based lemmatizer did not find POS annotation for the token 'My'. Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.\n", - "[2021-04-21 17:59:42,049] [WARNING] [W108] The rule-based lemmatizer did not find POS annotation for the token 'zip'. Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.\n", - "[2021-04-21 17:59:42,049] [WARNING] [W108] The rule-based lemmatizer did not find POS annotation for the token 'code'. Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.\n", - "[2021-04-21 17:59:42,052] [WARNING] [W108] The rule-based lemmatizer did not find POS annotation for the token 'is'. 
Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.\n", - "[2021-04-21 17:59:42,052] [WARNING] [W108] The rule-based lemmatizer did not find POS annotation for the token '90210'. Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.\n" - ] - }, { "name": "stdout", "output_type": "stream", @@ -662,17 +607,6 @@ "execution_count": 20, "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[2021-04-21 17:59:42,084] [WARNING] [W108] The rule-based lemmatizer did not find POS annotation for the token 'My'. Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.\n", - "[2021-04-21 17:59:42,084] [WARNING] [W108] The rule-based lemmatizer did not find POS annotation for the token 'zip'. Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.\n", - "[2021-04-21 17:59:42,084] [WARNING] [W108] The rule-based lemmatizer did not find POS annotation for the token 'code'. Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.\n", - "[2021-04-21 17:59:42,084] [WARNING] [W108] The rule-based lemmatizer did not find POS annotation for the token 'is'. Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.\n", - "[2021-04-21 17:59:42,088] [WARNING] [W108] The rule-based lemmatizer did not find POS annotation for the token '90210'. Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.\n" - ] - }, { "name": "stdout", "output_type": "stream", diff --git a/presidio-analyzer/presidio_analyzer/nlp_engine/spacy_nlp_engine.py b/presidio-analyzer/presidio_analyzer/nlp_engine/spacy_nlp_engine.py index 92f7ca5a1..94035bc91 100644 --- a/presidio-analyzer/presidio_analyzer/nlp_engine/spacy_nlp_engine.py +++ b/presidio-analyzer/presidio_analyzer/nlp_engine/spacy_nlp_engine.py @@ -34,7 +34,7 @@ def __init__(self, models: Optional[Dict[str, str]] = None): logger.debug(f"Loading SpaCy models: {models.values()}") self.nlp = { - lang_code: spacy.load(model_name, disable=["parser", "tagger"]) + lang_code: spacy.load(model_name, disable=["parser"]) for lang_code, model_name in models.items() } From 5d58e68bb9da804e6a50e08c87f641dddf738494 Mon Sep 17 00:00:00 2001 From: omri374 Date: Thu, 22 Apr 2021 14:46:26 +0300 Subject: [PATCH 4/8] remove David --- .../samples/python/customizing_presidio_analyzer.ipynb | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/samples/python/customizing_presidio_analyzer.ipynb b/docs/samples/python/customizing_presidio_analyzer.ipynb index ea4258b26..378370280 100644 --- a/docs/samples/python/customizing_presidio_analyzer.ipynb +++ b/docs/samples/python/customizing_presidio_analyzer.ipynb @@ -424,7 +424,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 21, "metadata": {}, "outputs": [ { @@ -432,9 +432,9 @@ "output_type": "stream", "text": [ "Results from Spanish request:\n", - "[type: PERSON, start: 13, end: 18, score: 0.85]\n", + "[type: PERSON, start: 13, end: 19, score: 0.85]\n", "Results from English request:\n", - "[]\n" + "[type: PERSON, start: 11, end: 17, score: 0.85]\n" ] } ], @@ -462,11 +462,11 @@ ")\n", "\n", "# 
Analyze in different languages\n", - "results_spanish = analyzer.analyze(text=\"Mi nombre es David\", language=\"es\")\n", + "results_spanish = analyzer.analyze(text=\"Mi nombre es Morris\", language=\"es\")\n", "print(\"Results from Spanish request:\")\n", "print(results_spanish)\n", "\n", - "results_english = analyzer.analyze(text=\"My name is David\", language=\"en\")\n", + "results_english = analyzer.analyze(text=\"My name is Morris\", language=\"en\")\n", "print(\"Results from English request:\")\n", "print(results_english)" ] From dfeb19d066751dd364b82ae6c7ef53d4d95340b7 Mon Sep 17 00:00:00 2001 From: omri374 Date: Tue, 4 May 2021 11:09:52 +0300 Subject: [PATCH 5/8] added error handling + additional tests --- e2e-tests/tests/test_analyzer.py | 69 +++++++++++++++++++ presidio-analyzer/app.py | 10 ++- .../presidio_analyzer/analyzer_engine.py | 34 ++++++++- presidio-analyzer/tests/conftest.py | 27 +++++++- .../tests/test_analyzer_engine.py | 55 +++++++++++---- .../tests/test_context_support.py | 4 +- .../tests/test_iban_recognizer.py | 4 +- .../tests/test_spacy_recognizer.py | 5 -- .../tests/test_stanza_recognizer.py | 4 +- 9 files changed, 186 insertions(+), 26 deletions(-) diff --git a/e2e-tests/tests/test_analyzer.py b/e2e-tests/tests/test_analyzer.py index 9260da23e..59968012c 100644 --- a/e2e-tests/tests/test_analyzer.py +++ b/e2e-tests/tests/test_analyzer.py @@ -308,6 +308,75 @@ def test_given_ad_hoc_pattern_recognizer_the_right_entities_are_returned(): assert equal_json_strings(expected_response, response_content) +def test_given_wrong_ad_hoc_json_exception_is_given(): + malformed_request_body = r""" + { + "text": "John Smith drivers license is AC432223. Zip code: 10023", + "language": "en", + "ad_hoc_recognizers":[ + { + "name": "Zip code Recognizer", + "supported_language": "en", + "patterns": [ + { + "type": "zip code (weak)", + "bebex": "(\\b\\d{5}(?:\\-\\d{4})?\\b)", + "confidence": 0.01 + } + ], + "supported_entity":"ZIP" + } + ] + } + """ + response_status, response_content = analyze(malformed_request_body) + + expected_response = """ + { + "error":"Failed to parse /analyze request for AnalyzerEngine.analyze(). __init__() got an unexpected keyword argument \'type\'" + } + """ + + assert response_status == 400 + assert equal_json_strings(expected_response, response_content) + + +def test_given_ad_hoc_pattern_recognizer_context_raises_confidence(): + request_body = r""" + { + "text": "John Smith drivers license is AC432223. 
Zip code: 10023", + "language": "en", + "ad_hoc_recognizers":[ + { + "name": "Zip code Recognizer", + "supported_language": "en", + "patterns": [ + { + "name": "zip code (weak)", + "regex": "(\\b\\d{5}(?:\\-\\d{4})?\\b)", + "score": 0.01 + } + ], + "context": ["Zip","code"], + "supported_entity":"ZIP" + } + ] + } + """ + + response_status, response_content = analyze(request_body) + + expected_response = """ + [ + {"entity_type": "PERSON", "start": 0, "end": 10, "score": 0.85, "analysis_explanation":null}, + {"entity_type": "US_DRIVER_LICENSE", "start": 30, "end": 38, "score": 0.6499999999999999, "analysis_explanation":null}, + {"entity_type": "ZIP", "start": 50, "end": 55, "score": 0.4, "analysis_explanation":null} + ] + """ + assert response_status == 200 + assert equal_json_strings(expected_response, response_content) + + @pytest.mark.api def test_given_ad_hoc_deny_list_recognizer_the_right_entities_are_returned(): request_body = r""" diff --git a/presidio-analyzer/app.py b/presidio-analyzer/app.py index 7174843d6..82ec516e8 100644 --- a/presidio-analyzer/app.py +++ b/presidio-analyzer/app.py @@ -49,8 +49,8 @@ def health() -> str: def analyze() -> Tuple[str, int]: """Execute the analyzer function.""" # Parse the request params - req_data = AnalyzerRequest(request.get_json()) try: + req_data = AnalyzerRequest(request.get_json()) if not req_data.text: raise Exception("No text provided") @@ -75,6 +75,14 @@ def analyze() -> Tuple[str, int]: ), content_type="application/json", ) + except TypeError as te: + error_msg = ( + f"Failed to parse /analyze request " + f"for AnalyzerEngine.analyze(). {te.args[0]}" + ) + self.logger.error(error_msg) + return jsonify(error=error_msg), 400 + except Exception as e: self.logger.error( f"A fatal error occurred during execution of " diff --git a/presidio-analyzer/presidio_analyzer/analyzer_engine.py b/presidio-analyzer/presidio_analyzer/analyzer_engine.py index b83b2f74b..f0294d265 100644 --- a/presidio-analyzer/presidio_analyzer/analyzer_engine.py +++ b/presidio-analyzer/presidio_analyzer/analyzer_engine.py @@ -151,7 +151,12 @@ def analyze( ) if ad_hoc_recognizers: - recognizers.extend(ad_hoc_recognizers) + recognizers_to_add = self._filter_ad_hoc_recognizers( + ad_hoc_recognizers=ad_hoc_recognizers, + language=language, + entities=entities, + ) + recognizers.extend(recognizers_to_add) if all_fields: # Since all_fields=True, list all entities by iterating @@ -196,6 +201,33 @@ def analyze( return results + @staticmethod + def _filter_ad_hoc_recognizers( + ad_hoc_recognizers: List[EntityRecognizer], + language: str, + entities: Optional[List[str]] = None, + ) -> List[EntityRecognizer]: + """ + Return the ad hoc recognizers relevant for the requested entities and language. 
+ + :param ad_hoc_recognizers: List of recognizers provided in the analyze request + :param entities: List of entities to return for this request + :param language: Language code for this request + :return: A list of EntityRecognizer + """ + if not entities: + return ad_hoc_recognizers + + subset = [] + for entity in entities: + subset = [ + rec + for rec in ad_hoc_recognizers + if entity in rec.supported_entities + and language == rec.supported_language + ] + return subset + def __remove_low_scores( self, results: List[RecognizerResult], score_threshold: float = None ) -> List[RecognizerResult]: diff --git a/presidio-analyzer/tests/conftest.py b/presidio-analyzer/tests/conftest.py index 436e57d89..d13c4e5ca 100644 --- a/presidio-analyzer/tests/conftest.py +++ b/presidio-analyzer/tests/conftest.py @@ -4,7 +4,7 @@ import pytest import spacy -from presidio_analyzer import EntityRecognizer +from presidio_analyzer import EntityRecognizer, Pattern, PatternRecognizer from presidio_analyzer import RecognizerRegistry from presidio_analyzer.nlp_engine import NlpEngineProvider from presidio_analyzer.predefined_recognizers import NLP_RECOGNIZERS @@ -81,6 +81,11 @@ def loaded_registry(): return RecognizerRegistry() +@pytest.fixture(scope="module") +def nlp_engine(nlp_engines): + return nlp_engines["spacy_en"] + + @pytest.fixture(scope="module") def mock_registry(): return RecognizerRegistryMock() @@ -108,6 +113,26 @@ def mock_bn_model(): bn.to_disk("bn_test") +@pytest.fixture(scope="session") +def zip_code_recognizer(): + regex = r"(\b\d{5}(?:\-\d{4})?\b)" + zipcode_pattern = Pattern(name="zip code (weak)", regex=regex, score=0.01) + zip_recognizer = PatternRecognizer( + supported_entity="ZIP", patterns=[zipcode_pattern] + ) + return zip_recognizer + + +@pytest.fixture(scope="session") +def zip_code_deny_list_recognizer(): + regex = r"(\b\d{5}(?:\-\d{4})?\b)" + zipcode_pattern = Pattern(name="zip code (weak)", regex=regex, score=0.01) + zip_recognizer = PatternRecognizer( + supported_entity="ZIP", deny_list=["999"], patterns=[zipcode_pattern] + ) + return zip_recognizer + + def pytest_sessionfinish(): """Remove files created during mock spaCy models creation.""" he_test_model_path = Path(Path(__file__).parent.parent, "he_test") diff --git a/presidio-analyzer/tests/test_analyzer_engine.py b/presidio-analyzer/tests/test_analyzer_engine.py index 69270cb3d..69921210d 100644 --- a/presidio-analyzer/tests/test_analyzer_engine.py +++ b/presidio-analyzer/tests/test_analyzer_engine.py @@ -522,15 +522,9 @@ def test_when_read_test_spacy_nlp_conf_file_then_returns_spacy_nlp_engine( def test_when_ad_hoc_pattern_recognizer_is_added_then_result_contains_result( - loaded_analyzer_engine, + loaded_analyzer_engine, zip_code_recognizer ): text = "John Smith drivers license is AC432223 and his zip code is 10023" - regex = r"(\b\d{5}(?:\-\d{4})?\b)" - zipcode_pattern = Pattern(name="zip code (weak)", regex=regex, score=0.01) - - zip_code_recognizer = PatternRecognizer( - supported_entity="ZIP", patterns=[zipcode_pattern] - ) responses = loaded_analyzer_engine.analyze( text=text, language="en", ad_hoc_recognizers=[zip_code_recognizer] @@ -575,21 +569,52 @@ def test_when_ad_hoc_deny_list_recognizer_is_added_then_result_does_not_persist( def test_when_ad_hoc_deny_list_recognizer_contains_both_regex_and_deny_list( - loaded_analyzer_engine, + loaded_analyzer_engine, zip_code_deny_list_recognizer ): text = "Mr. 
John Smith's zip code is 10023 or 999" - regex = r"(\b\d{5}(?:\-\d{4})?\b)" - zipcode_pattern = Pattern(name="zip code (weak)", regex=regex, score=0.01) - - zip_recognizer = PatternRecognizer( - supported_entity="ZIP", deny_list=["999"], patterns=[zipcode_pattern] - ) responses = loaded_analyzer_engine.analyze( - text=text, language="en", ad_hoc_recognizers=[zip_recognizer] + text=text, language="en", ad_hoc_recognizers=[zip_code_deny_list_recognizer] ) detected_zips = [ response.entity_type for response in responses if response.entity_type == "ZIP" ] assert len(detected_zips) == 2 + + +def test_entities_filter_for_ad_hoc_removes_recognizer(loaded_analyzer_engine): + text = "Mr. John Smith's zip code is 10002" + + mr_recognizer = PatternRecognizer(supported_entity="MR", deny_list=["Mr.", "Mr"]) + responses1 = loaded_analyzer_engine.analyze( + text=text, language="en", ad_hoc_recognizers=[mr_recognizer] + ) + responses2 = loaded_analyzer_engine.analyze( + text=text, + language="en", + ad_hoc_recognizers=[mr_recognizer], + entities=["PERSON"], + ) + + assert "MR" in [resp.entity_type for resp in responses1] + assert "MR" not in [resp.entity_type for resp in responses2] + + +def test_ad_hoc_with_context_support_higher_confidence(nlp_engine, zip_code_recognizer): + text = "Mr. John Smith's zip code is 10023" + analyzer_engine = AnalyzerEngine(nlp_engine=nlp_engine) + + responses1 = analyzer_engine.analyze( + text=text, language="en", ad_hoc_recognizers=[zip_code_recognizer] + ) + + zip_code_recognizer.context = ["zip", "code"] + responses2 = analyzer_engine.analyze( + text=text, language="en", ad_hoc_recognizers=[zip_code_recognizer] + ) + + zip_result_no_context = [resp for resp in responses1 if resp.entity_type == "ZIP"] + zip_result_with_context = [resp for resp in responses2 if resp.entity_type == "ZIP"] + + assert zip_result_no_context[0].score < zip_result_with_context[0].score diff --git a/presidio-analyzer/tests/test_context_support.py b/presidio-analyzer/tests/test_context_support.py index dc52baedb..1d9b6f9d3 100644 --- a/presidio-analyzer/tests/test_context_support.py +++ b/presidio-analyzer/tests/test_context_support.py @@ -72,7 +72,9 @@ def mock_nlp_artifacts(): return NlpArtifacts([], [], [], [], None, "en") -def test_when_text_with_context_then_improves_score(dataset, nlp_engine, mock_nlp_artifacts): +def test_when_text_with_context_then_improves_score( + dataset, nlp_engine, mock_nlp_artifacts +): for item in dataset: text, recognizer, entities = item nlp_artifacts = nlp_engine.process_text(text, "en") diff --git a/presidio-analyzer/tests/test_iban_recognizer.py b/presidio-analyzer/tests/test_iban_recognizer.py index e8b7f9a44..48e8c275a 100644 --- a/presidio-analyzer/tests/test_iban_recognizer.py +++ b/presidio-analyzer/tests/test_iban_recognizer.py @@ -360,7 +360,9 @@ def update_iban_checksum(iban): # fmt: on ], ) -def test_when_all_ibans_then_succeed(iban, expected_len, expected_res, recognizer, entities, max_score): +def test_when_all_ibans_then_succeed( + iban, expected_len, expected_res, recognizer, entities, max_score +): results = recognizer.analyze(iban, entities) assert len(results) == expected_len for res, (start, end) in zip(results, expected_res): diff --git a/presidio-analyzer/tests/test_spacy_recognizer.py b/presidio-analyzer/tests/test_spacy_recognizer.py index 2381aa4dd..9c41d492e 100644 --- a/presidio-analyzer/tests/test_spacy_recognizer.py +++ b/presidio-analyzer/tests/test_spacy_recognizer.py @@ -8,11 +8,6 @@ def entities(): return ["PERSON", "DATE_TIME"] 
-@pytest.fixture(scope="module") -def nlp_engine(nlp_engines): - return nlp_engines["spacy_en"] - - @pytest.fixture(scope="module") def nlp_recognizer(nlp_recognizers): return nlp_recognizers["spacy"] diff --git a/presidio-analyzer/tests/test_stanza_recognizer.py b/presidio-analyzer/tests/test_stanza_recognizer.py index 3c08900e9..bd9e4b57b 100644 --- a/presidio-analyzer/tests/test_stanza_recognizer.py +++ b/presidio-analyzer/tests/test_stanza_recognizer.py @@ -71,7 +71,9 @@ def test_when_using_stanze_then_all_stanza_result_correct( @pytest.mark.skip_engine("stanza_en") -def test_when_person_in_text_then_person_full_name_complex_found(nlp_engine, nlp_recognizer, entities): +def test_when_person_in_text_then_person_full_name_complex_found( + nlp_engine, nlp_recognizer, entities +): text = "Richard (Rick) C. Henderson" results = prepare_and_analyze(nlp_engine, nlp_recognizer, text, entities) From db8dde3951707ce8f2257b98289d8917b486e4b4 Mon Sep 17 00:00:00 2001 From: omri374 Date: Tue, 4 May 2021 11:40:01 +0300 Subject: [PATCH 6/8] updates to docs --- docs/analyzer/adding_recognizers.md | 3 +++ docs/api-docs/api-docs.yml | 1 + e2e-tests/tests/test_analyzer.py | 4 ++-- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/docs/analyzer/adding_recognizers.md b/docs/analyzer/adding_recognizers.md index 18987712c..5ac984a10 100644 --- a/docs/analyzer/adding_recognizers.md +++ b/docs/analyzer/adding_recognizers.md @@ -148,6 +148,7 @@ These recognizers, in JSON form, are added to the `/analyze` request and are onl "score": 0.01 } ], + "context": ["zip", "code"], "supported_entity":"ZIP" } ] @@ -179,6 +180,8 @@ These recognizers, in JSON form, are added to the `/analyze` request and are onl In both examples, the `/analyze` request is extended with a list of `ad_hoc_recognizers`, which could be either `patterns`, `deny_list` or both. +Additional examples can be found in the [OpenAPI spec](../api-docs/api-docs.html). + ## PII detection in different languages For recognizers in new languages, refer to the [languages documentation](languages.md). 
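As background for the `context` field documented above: on the service side, each entry in `ad_hoc_recognizers` is deserialized with `PatternRecognizer.from_dict` (see the `AnalyzerRequest` change in patch 1), so the JSON keys map onto the recognizer's constructor arguments. A minimal sketch of that round trip; the dict is the documented example, and the printed values are what it should yield:

```python
# Deserialize an ad-hoc recognizer definition the same way AnalyzerRequest
# does. "context" flows through to the recognizer and enables the
# context-based score boost exercised in the e2e test above.
from presidio_analyzer import PatternRecognizer

recognizer_dict = {
    "name": "Zip code Recognizer",
    "supported_language": "en",
    "patterns": [
        {
            "name": "zip code (weak)",
            "regex": r"(\b\d{5}(?:\-\d{4})?\b)",
            "score": 0.01,
        }
    ],
    "context": ["zip", "code"],
    "supported_entity": "ZIP",
}

zip_recognizer = PatternRecognizer.from_dict(recognizer_dict)
print(zip_recognizer.supported_entities)  # ['ZIP']
print(zip_recognizer.context)             # ['zip', 'code']
```

Because the dict is passed straight to the constructor, malformed keys (e.g. `"bebex"` in the e2e test above) surface as a `TypeError` and are returned to the caller as a 400, per the error handling added in patch 5.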
diff --git a/docs/api-docs/api-docs.yml b/docs/api-docs/api-docs.yml index 6fa8652c6..39d018c38 100644 --- a/docs/api-docs/api-docs.yml +++ b/docs/api-docs/api-docs.yml @@ -299,6 +299,7 @@ components: "score": 0.01 } ], + "context": ["zip", "code"], "supported_entity":"ZIP" } ] diff --git a/e2e-tests/tests/test_analyzer.py b/e2e-tests/tests/test_analyzer.py index 59968012c..664ced509 100644 --- a/e2e-tests/tests/test_analyzer.py +++ b/e2e-tests/tests/test_analyzer.py @@ -337,8 +337,8 @@ def test_given_wrong_ad_hoc_json_exception_is_given(): } """ - assert response_status == 400 assert equal_json_strings(expected_response, response_content) + assert response_status == 400 def test_given_ad_hoc_pattern_recognizer_context_raises_confidence(): @@ -357,7 +357,7 @@ def test_given_ad_hoc_pattern_recognizer_context_raises_confidence(): "score": 0.01 } ], - "context": ["Zip","code"], + "context": ["zip", "code"], "supported_entity":"ZIP" } ] From a956ce94905173e6603a3deab6311b7b20734bca Mon Sep 17 00:00:00 2001 From: omri374 Date: Tue, 4 May 2021 11:43:20 +0300 Subject: [PATCH 7/8] added description in open API --- docs/api-docs/api-docs.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/docs/api-docs/api-docs.yml b/docs/api-docs/api-docs.yml index 39d018c38..e026e2963 100644 --- a/docs/api-docs/api-docs.yml +++ b/docs/api-docs/api-docs.yml @@ -562,10 +562,17 @@ components: type: string description: "Language code supported by this recognizer" patterns: + description: "List of type Pattern containing regex expressions with additional metadata." type: array items: $ref: "#/components/schemas/Pattern" deny_list: + type: array + description: "List of words to be returned as PII if found." + items: + type: string + context: + description: "List of words to be used to increase confidence if found in the vicinity of detected entities." type: array items: type: string From 87b157cc05449afff46bc82f7c392fe1b28b8b37 Mon Sep 17 00:00:00 2001 From: omri374 Date: Tue, 4 May 2021 15:27:40 +0300 Subject: [PATCH 8/8] bug fix for entities filtering --- .../presidio_analyzer/analyzer_engine.py | 40 ++----------------- .../recognizer_registry.py | 8 +++- .../tests/test_analyzer_engine.py | 15 +++++++ 3 files changed, 26 insertions(+), 37 deletions(-) diff --git a/presidio-analyzer/presidio_analyzer/analyzer_engine.py b/presidio-analyzer/presidio_analyzer/analyzer_engine.py index f0294d265..f7a52f1f9 100644 --- a/presidio-analyzer/presidio_analyzer/analyzer_engine.py +++ b/presidio-analyzer/presidio_analyzer/analyzer_engine.py @@ -147,17 +147,12 @@ def analyze( all_fields = not entities recognizers = self.registry.get_recognizers( - language=language, entities=entities, all_fields=all_fields + language=language, + entities=entities, + all_fields=all_fields, + ad_hoc_recognizers=ad_hoc_recognizers, ) - if ad_hoc_recognizers: - recognizers_to_add = self._filter_ad_hoc_recognizers( - ad_hoc_recognizers=ad_hoc_recognizers, - language=language, - entities=entities, - ) - recognizers.extend(recognizers_to_add) - if all_fields: # Since all_fields=True, list all entities by iterating # over all recognizers @@ -201,33 +196,6 @@ def analyze( return results - @staticmethod - def _filter_ad_hoc_recognizers( - ad_hoc_recognizers: List[EntityRecognizer], - language: str, - entities: Optional[List[str]] = None, - ) -> List[EntityRecognizer]: - """ - Return the ad hoc recognizers relevant for the requested entities and language. 
- - :param ad_hoc_recognizers: List of recognizers provided in the analyze request - :param entities: List of entities to return for this request - :param language: Language code for this request - :return: A list of EntityRecognizer - """ - if not entities: - return ad_hoc_recognizers - - subset = [] - for entity in entities: - subset = [ - rec - for rec in ad_hoc_recognizers - if entity in rec.supported_entities - and language == rec.supported_language - ] - return subset - def __remove_low_scores( self, results: List[RecognizerResult], score_threshold: float = None ) -> List[RecognizerResult]: diff --git a/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry.py b/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry.py index cbc2862aa..2c10c01a1 100644 --- a/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry.py +++ b/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry.py @@ -1,3 +1,4 @@ +import copy import logging from typing import Optional, List, Iterable, Union, Type @@ -107,6 +108,7 @@ def get_recognizers( language: str, entities: Optional[List[str]] = None, all_fields: bool = False, + ad_hoc_recognizers: Optional[List[EntityRecognizer]] = None, ) -> List[EntityRecognizer]: """ Return a list of recognizers which supports the specified name and language. @@ -114,6 +116,8 @@ def get_recognizers( :param entities: the requested entities :param language: the requested language :param all_fields: a flag to return all fields of a requested language. + :param ad_hoc_recognizers: Additional recognizers provided by the user + as part of the request :return: A list of the recognizers which supports the supplied entities and language """ @@ -123,7 +127,9 @@ def get_recognizers( if entities is None and all_fields is False: raise ValueError("No entities provided") - all_possible_recognizers = self.recognizers + all_possible_recognizers = copy.copy(self.recognizers) + if ad_hoc_recognizers: + all_possible_recognizers.extend(ad_hoc_recognizers) # filter out unwanted recognizers to_return = set() diff --git a/presidio-analyzer/tests/test_analyzer_engine.py b/presidio-analyzer/tests/test_analyzer_engine.py index 69921210d..b0ae5210d 100644 --- a/presidio-analyzer/tests/test_analyzer_engine.py +++ b/presidio-analyzer/tests/test_analyzer_engine.py @@ -618,3 +618,18 @@ def test_ad_hoc_with_context_support_higher_confidence(nlp_engine, zip_code_reco zip_result_with_context = [resp for resp in responses2 if resp.entity_type == "ZIP"] assert zip_result_no_context[0].score < zip_result_with_context[0].score + + +def test_ad_hoc_when_no_other_recognizers_are_requested_returns_only_ad_hoc_results( + loaded_analyzer_engine, zip_code_recognizer +): + text = "Mr. John Smith's zip code is 10023" + + responses = loaded_analyzer_engine.analyze( + text=text, + language="en", + ad_hoc_recognizers=[zip_code_recognizer], + entities=["ZIP"], + ) + + assert "ZIP" in [resp.entity_type for resp in responses]
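Taken together, the series enables the same flow from Python without the HTTP layer. A minimal sketch mirroring the tests above, assuming a default `AnalyzerEngine()` (which requires the standard spaCy English model to be installed):

```python
# Engine-level equivalent of the e2e tests: pass the recognizer per call
# via ad_hoc_recognizers instead of registering it in the RecognizerRegistry.
from presidio_analyzer import AnalyzerEngine, Pattern, PatternRecognizer

zip_pattern = Pattern(
    name="zip code (weak)", regex=r"(\b\d{5}(?:\-\d{4})?\b)", score=0.01
)
zip_recognizer = PatternRecognizer(
    supported_entity="ZIP", patterns=[zip_pattern], context=["zip", "code"]
)

analyzer = AnalyzerEngine()
results = analyzer.analyze(
    text="Mr. John Smith's zip code is 10023",
    language="en",
    ad_hoc_recognizers=[zip_recognizer],
)
# The ZIP result scores above the raw 0.01 because the surrounding
# "zip code" context words boost its confidence.
print(results)
```

As the persistence test above asserts, the ad-hoc recognizer does not survive into subsequent `analyze` calls.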