microsoft · omri374 · May 5, 2021 · Apr 22, 2021 · Apr 22, 2021 · Apr 22, 2021
diff --git a/docs/analyzer/adding_recognizers.md b/docs/analyzer/adding_recognizers.md
@@ -126,6 +126,59 @@ To add a recognizer to the list of pre-defined recognizers:
 3. Add the recognizer to the `recognizers_map` dict in the `RecognizerRegistry.load_predefined_recognizers` method. In this map, the key is the language the recognizer supports, and the value is the class itself. If your recognizer detects entities in multiple languages, add it to under the "ALL" key.
 4. Optional: Update documentation (e.g., the [supported entities list](../supported_entities.md)).
 
+### Creating ad-hoc recognizers
+
+In addition to recognizers in code, it is possible to create ad-hoc recognizers via the Presidio Analyzer API for regex and deny-list based logic.
+These recognizers, in JSON form, are added to the `/analyze` request and are only used in the context of this request.
+
+- The JSON structure for a regex-based ad-hoc recognizer is the following:
+
+    ```json
+    {
+        "text": "John Smith drivers license is AC432223. Zip code: 10023",
+        "language": "en",
+        "ad_hoc_recognizers":[
+            {
+            "name": "Zip code Recognizer",
+            "supported_language": "en",
+            "patterns": [
+                {
+                "name": "zip code (weak)", 
+                "regex": "(\\b\\d{5}(?:\\-\\d{4})?\\b)", 
+                "score": 0.01
+                }
+            ],
+            "supported_entity":"ZIP"
+            }
+        ]
+    }
+    ```
+
+- The JSON structure for deny-list based ad-hoc recognizers is the following:
+
+    ```json
+    {
+        "text": "Mr. John Smith's drivers license is AC432223",
+        "language": "en",
+        "ad_hoc_recognizers":[
+            {
+            "name": "Mr. Recognizer",
+            "supported_language": "en",
+            "deny_list": ["Mr", "Mr.", "Mister"],
+            "supported_entity":"MR_TITLE"
+            },
+            {
+            "name": "Ms. Recognizer",
+            "supported_language": "en",
+            "deny_list": ["Ms", "Ms.", "Miss", "Mrs", "Mrs."],
+            "supported_entity":"MS_TITLE"
+            }
+        ]
+    }
+    ```
+
+In both examples, the `/analyze` request is extended with a list of `ad_hoc_recognizers`, each of which can define `patterns`, a `deny_list`, or both.
+
 ## PII detection in different languages
 
 For recognizers in new languages, refer to the [languages documentation](languages.md).

diff --git a/docs/api-docs/api-docs.yml b/docs/api-docs/api-docs.yml
@@ -281,13 +281,27 @@ components:
             Enhanced Request :
               value:
                 {
-                  "text": "John Smith drivers license is AC432223",
+                  "text": "John Smith drivers license is AC432223 and the zip code is 12345",
                   "language": "en",
                   "return_decision_process": false,
                   "correlation_id": "123e4567-e89b-12d3-a456-426614174000",
                   "score_threshold": 0.6,
-                  "entities": ["US_DRIVER_LICENSE"],
-                  "trace": false
+                  "entities": ["US_DRIVER_LICENSE", "ZIP"],
+                  "trace": false,
+                  "ad_hoc_recognizers":[
+                    {
+                    "name": "Zip code Recognizer",
+                    "supported_language": "en",
+                    "patterns": [
+                        {
+                        "name": "zip code (weak)", 
+                        "regex": "(\\b\\d{5}(?:\\-\\d{4})?\\b)", 
+                        "score": 0.01
+                        }
+                    ],
+                    "supported_entity":"ZIP"
+                    }
+                  ]
                 }
 
     AnonymizeRequest:
@@ -377,6 +391,11 @@ components:
         return_decision_process:
           type: boolean
           description: "Whether to include analysis explanation in the response"
+        ad_hoc_recognizers:
+          type: array
+          description: "List of recognizers to be used in the context of this request only (ad-hoc)."
+          items:
+            $ref: "#/components/schemas/PatternRecognizer"
 
     AnonymizeRequest:
       type: object
@@ -515,8 +534,44 @@ components:
           type: number
           format: double
           description: "Result of a validation (e.g. checksum)"
+
+    Pattern:
+      type: object
+      properties:
+        name:
+          type: string
+          description: "Name of regular expression pattern"
+        regex:
+          type: string
+          description: "Regex pattern string"
+        score:
+          type: number
+          format: double
+          description: "Detection confidence of this pattern (0.01 if very noisy, 0.6-1.0 if very specific)"
 
 
+    PatternRecognizer:
+      type: object
+      description: "A regular-expression or deny-list based recognizer"
+      properties:
+        name:
+          type: string
+          description: "Name of recognizer"
+        supported_language:
+          type: string
+          description: "Language code supported by this recognizer"
+        patterns:
+          type: array
+          items:
+            $ref: "#/components/schemas/Pattern"
+        deny_list:
+          type: array
+          items:
+            type: string
+        supported_entity:
+          type: string
+          description: "The name of the entity this ad-hoc recognizer detects"
+
     EntityTypes:
       description: "The supported PII entity types."
       type: string

diff --git a/docs/samples/python/customizing_presidio_analyzer.ipynb b/docs/samples/python/customizing_presidio_analyzer.ipynb
@@ -424,17 +424,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 21,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
       "Results from Spanish request:\n",
-      "[type: PERSON, start: 13, end: 18, score: 0.85]\n",
+      "[type: PERSON, start: 13, end: 19, score: 0.85]\n",
       "Results from English request:\n",
-      "[type: PERSON, start: 11, end: 16, score: 0.85]\n"
+      "[type: PERSON, start: 11, end: 17, score: 0.85]\n"
      ]
     }
    ],
@@ -462,11 +462,11 @@
     ")\n",
     "\n",
     "# Analyze in different languages\n",
-    "results_spanish = analyzer.analyze(text=\"Mi nombre es David\", language=\"es\")\n",
+    "results_spanish = analyzer.analyze(text=\"Mi nombre es Morris\", language=\"es\")\n",
     "print(\"Results from Spanish request:\")\n",
     "print(results_spanish)\n",
     "\n",
-    "results_english = analyzer.analyze(text=\"My name is David\", language=\"en\")\n",
+    "results_english = analyzer.analyze(text=\"My name is Morris\", language=\"en\")\n",
     "print(\"Results from English request:\")\n",
     "print(results_english)"
    ]
@@ -506,7 +506,7 @@
    ],
    "source": [
     "# Define the regex pattern\n",
-    "regex = \"(\\d{5}(?:\\-\\d{4})?)$\" # very weak regex pattern, taken from here: https://stackoverflow.com/questions/2577236/regex-for-zip-code\n",
+    "regex = r\"(\\b\\d{5}(?:\\-\\d{4})?\\b)\" # very weak regex pattern\n",
     "zipcode_pattern = Pattern(name=\"zip code (weak)\", regex=regex, score=0.01)\n",
     "\n",
     "# Define the recognizer with the defined pattern\n",
@@ -614,7 +614,7 @@
       "Decision process output:\n",
       "\n",
       "{'original_score': 0.01,\n",
-      " 'pattern': '(\\\\d{5}(?:\\\\-\\\\d{4})?)$',\n",
+      " 'pattern': '(\\\\b\\\\d{5}(?:\\\\-\\\\d{4})?\\\\b)',\n",
       " 'pattern_name': 'zip code (weak)',\n",
       " 'recognizer': 'PatternRecognizer',\n",
       " 'score': 0.4,\n",

diff --git a/e2e-tests/tests/test_analyzer.py b/e2e-tests/tests/test_analyzer.py
@@ -1,4 +1,5 @@
 import pytest
+
 from common.assertions import equal_json_strings
 from common.methods import analyze, analyzer_supported_entities
 
@@ -269,3 +270,75 @@ def test_given_an_illegal_input_for_supported_entities_then_igonre_and_proceed()
     """
     assert response_status == 200
     assert equal_json_strings(expected_response, response_content)
+
+
+@pytest.mark.api
+def test_given_ad_hoc_pattern_recognizer_the_right_entities_are_returned():
+    request_body = r"""
+     {
+         "text": "John Smith drivers license is AC432223. Zip code: 10023",
+         "language": "en",
+         "ad_hoc_recognizers":[
+             {
+                "name": "Zip code Recognizer",
+                "supported_language": "en",
+                "patterns": [
+                    {
+                    "name": "zip code (weak)", 
+                    "regex": "(\\b\\d{5}(?:\\-\\d{4})?\\b)", 
+                    "score": 0.01
+                    }
+                ],
+                "supported_entity":"ZIP"
+            }
+        ]
+     }
+     """
+
+    response_status, response_content = analyze(request_body)
+
+    expected_response = """
+     [
+         {"entity_type": "PERSON", "start": 0, "end": 10, "score": 0.85, "analysis_explanation":null},
+         {"entity_type": "US_DRIVER_LICENSE", "start": 30, "end": 38, "score": 0.6499999999999999, "analysis_explanation":null},
+         {"entity_type": "ZIP", "start": 50, "end": 55, "score": 0.01, "analysis_explanation":null}
+     ]
+     """
+    assert response_status == 200
+    assert equal_json_strings(expected_response, response_content)
+
+
+@pytest.mark.api
+def test_given_ad_hoc_deny_list_recognizer_the_right_entities_are_returned():
+    request_body = r"""
+    {
+        "text": "Mr. John Smith's drivers license is AC432223",
+        "language": "en",
+        "ad_hoc_recognizers":[
+            {
+            "name": "Mr. Recognizer",
+            "supported_language": "en",
+            "deny_list": ["Mr", "Mr.", "Mister"],
+            "supported_entity":"MR_TITLE"
+            },
+            {
+            "name": "Ms. Recognizer",
+            "supported_language": "en",
+            "deny_list": ["Ms", "Ms.", "Miss", "Mrs", "Mrs."],
+            "supported_entity":"MS_TITLE"
+            }
+        ]
+    }
+     """
+
+    response_status, response_content = analyze(request_body)
+
+    expected_response = """
+     [
+         {"entity_type": "PERSON", "start": 4, "end": 14, "score": 0.85, "analysis_explanation":null},
+         {"entity_type": "US_DRIVER_LICENSE", "start": 36, "end": 44, "score": 0.6499999999999999, "analysis_explanation":null},
+         {"entity_type": "MR_TITLE", "start": 0, "end": 3, "score": 1.0, "analysis_explanation":null}
+     ]
+     """
+    assert response_status == 200
+    assert equal_json_strings(expected_response, response_content)
diff --git a/presidio-analyzer/app.py b/presidio-analyzer/app.py
@@ -64,6 +64,7 @@ def analyze() -> Tuple[str, int]:
                     score_threshold=req_data.score_threshold,
                     entities=req_data.entities,
                     return_decision_process=req_data.return_decision_process,
+                    ad_hoc_recognizers=req_data.ad_hoc_recognizers,
                 )
 
                 return Response(

diff --git a/presidio-analyzer/presidio_analyzer/analyzer_engine.py b/presidio-analyzer/presidio_analyzer/analyzer_engine.py
@@ -113,6 +113,7 @@ def analyze(
         correlation_id: Optional[str] = None,
         score_threshold: Optional[float] = None,
         return_decision_process: Optional[bool] = False,
+        ad_hoc_recognizers: Optional[List[EntityRecognizer]] = None,
     ) -> List[RecognizerResult]:
         """
         Find PII entities in text using different PII recognizers for a given language.
@@ -126,6 +127,8 @@ def analyze(
         to return an identified entity
         :param return_decision_process: Whether the analysis decision process steps
         returned in the response.
+        :param ad_hoc_recognizers: List of recognizers which will be used only
+        for this specific request.
         :return: an array of the found entities in the text
 
         :example:
@@ -147,6 +150,9 @@ def analyze(
             language=language, entities=entities, all_fields=all_fields
         )
 
+        if ad_hoc_recognizers:
+            recognizers.extend(ad_hoc_recognizers)
+
         if all_fields:
             # Since all_fields=True, list all entities by iterating
             # over all recognizers

diff --git a/presidio-analyzer/presidio_analyzer/analyzer_request.py b/presidio-analyzer/presidio_analyzer/analyzer_request.py
@@ -1,5 +1,7 @@
 from typing import Dict
 
+from presidio_analyzer import PatternRecognizer
+
 
 class AnalyzerRequest:
     """
@@ -25,3 +27,9 @@ def __init__(self, req_data: Dict):
         self.correlation_id = req_data.get("correlation_id")
         self.score_threshold = req_data.get("score_threshold")
         self.return_decision_process = req_data.get("return_decision_process")
+        ad_hoc_recognizers = req_data.get("ad_hoc_recognizers")
+        self.ad_hoc_recognizers = []
+        if ad_hoc_recognizers:
+            self.ad_hoc_recognizers = [
+                PatternRecognizer.from_dict(rec) for rec in ad_hoc_recognizers
+            ]
diff --git a/presidio-analyzer/presidio_analyzer/nlp_engine/spacy_nlp_engine.py b/presidio-analyzer/presidio_analyzer/nlp_engine/spacy_nlp_engine.py
@@ -34,7 +34,7 @@ def __init__(self, models: Optional[Dict[str, str]] = None):
         logger.debug(f"Loading SpaCy models: {models.values()}")
 
         self.nlp = {
-            lang_code: spacy.load(model_name, disable=["parser", "tagger"])
+            lang_code: spacy.load(model_name, disable=["parser"])
             for lang_code, model_name in models.items()
         }