Merge branch 'main' into docs/clarify-k8s-installation
omri374 authored Oct 14, 2024
2 parents 21f8257 + 21361f9 commit b989181
Showing 18 changed files with 278 additions and 46 deletions.
2 changes: 1 addition & 1 deletion docs/analyzer/adding_recognizers.md
@@ -150,7 +150,7 @@ To add a recognizer to the list of pre-defined recognizers:

1. Clone the repo.
2. Create a file containing the new recognizer Python class.
3. Add the recognizer to the `recognizers_map` dict in the `RecognizerRegistry.load_predefined_recognizers` method. In this map, the key is the language the recognizer supports, and the value is the class itself. If your recognizer detects entities in multiple languages, add it to under the "ALL" key.
3. Add the recognizer to the `recognizers` list in the [`default_recognizers`](../../presidio-analyzer/presidio_analyzer/conf/default_recognizers.yaml) config. Details of the recognizer parameters are given [here](./recognizer_registry_provider.md#the-recognizer-parameters). A quick way to verify the registration is sketched after this list.
4. Optional: Update documentation (e.g., the [supported entities list](../supported_entities.md)).
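
For illustration only, a quick way to verify that the new recognizer is registered is to load the default configuration and list the recognizer names (this sketch assumes `RecognizerRegistry.load_predefined_recognizers` still loads the default config):

```python
from presidio_analyzer import RecognizerRegistry

# Hypothetical check: load the default recognizers and confirm the new one is listed
registry = RecognizerRegistry()
registry.load_predefined_recognizers()
print([recognizer.name for recognizer in registry.recognizers])
```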

### Azure AI Language recognizer
118 changes: 94 additions & 24 deletions docs/analyzer/nlp_engines/transformers.md
@@ -55,7 +55,75 @@ Then, also download a spaCy pipeline/model:
python -m spacy download en_core_web_sm
```

#### Creating a configuration file

### Configuring the NER pipeline

Once the models are downloaded, the NER pipeline needs to be configured.
Note that the configuration must contain both a `spaCy` pipeline name and a transformers model name.
In addition, different options for parsing the results of the transformers model can be set.

The NER model configuration can be provided either in Python or in a YAML file:

#### Configuring the NER pipeline via code

Example configuration in Python:

```python
from presidio_analyzer import AnalyzerEngine
from presidio_analyzer.nlp_engine import NerModelConfiguration, TransformersNlpEngine

# Transformer model config
model_config = [
    {"lang_code": "en",
     "model_name": {
         "spacy": "en_core_web_sm",  # for tokenization, lemmatization
         "transformers": "StanfordAIMI/stanford-deidentifier-base"  # for NER
     }
    }]

# Entity mappings between the model's and Presidio's
mapping = dict(
    PER="PERSON",
    LOC="LOCATION",
    ORG="ORGANIZATION",
    AGE="AGE",
    ID="ID",
    EMAIL="EMAIL",
    DATE="DATE_TIME",
    PHONE="PHONE_NUMBER",
    PERSON="PERSON",
    LOCATION="LOCATION",
    GPE="LOCATION",
    ORGANIZATION="ORGANIZATION",
    NORP="NRP",
    PATIENT="PERSON",
    STAFF="PERSON",
    HOSP="LOCATION",
    PATORG="ORGANIZATION",
    TIME="DATE_TIME",
    HCW="PERSON",
    HOSPITAL="LOCATION",
    FACILITY="LOCATION",
    VENDOR="ORGANIZATION",
)

labels_to_ignore = ["O"]

ner_model_configuration = NerModelConfiguration(
    model_to_presidio_entity_mapping=mapping,
    alignment_mode="expand",  # "strict", "contract", "expand"
    aggregation_strategy="max",  # "simple", "first", "average", "max"
    labels_to_ignore=labels_to_ignore,
)

transformers_nlp_engine = TransformersNlpEngine(
    models=model_config,
    ner_model_configuration=ner_model_configuration,
)

# Transformer-based analyzer
analyzer = AnalyzerEngine(
    nlp_engine=transformers_nlp_engine,
    supported_languages=["en"]
)
```
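
Once created this way, the analyzer can be called like any other `AnalyzerEngine` instance. A minimal call, using an illustrative sample text:

```python
results = analyzer.analyze(
    text="My name is Dan and my phone number is 212-555-5555",
    language="en",
)
print(results)
```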

#### Creating a YAML configuration file

Alternatively, the models can be configured through a YAML configuration file.
As in the Python example above, the configuration needs to contain both a `spaCy` pipeline name and a transformers model name.
@@ -75,9 +143,9 @@ models:
ner_model_configuration:
labels_to_ignore:
- O
aggregation_strategy: simple # "simple", "first", "average", "max"
aggregation_strategy: max # "simple", "first", "average", "max"
stride: 16
alignment_mode: strict # "strict", "contract", "expand"
alignment_mode: expand # "strict", "contract", "expand"
model_to_presidio_entity_mapping:
PER: PERSON
LOC: LOCATION
@@ -92,33 +160,15 @@ ner_model_configuration:
DATE: DATE_TIME
PHONE: PHONE_NUMBER
HCW: PERSON
HOSPITAL: ORGANIZATION
HOSPITAL: LOCATION
VENDOR: ORGANIZATION

low_confidence_score_multiplier: 0.4
low_score_entity_names:
- ID
```
Where:
- `model_name.spacy` is a name of a spaCy model/pipeline, which would wrap the transformers NER model. For example, `en_core_web_sm`.
- The `model_name.transformers` is the full path for a huggingface model. Models can be found on [HuggingFace Models Hub](https://huggingface.co/models?pipeline_tag=token-classification). For example, `obi/deid_roberta_i2b2`

The `ner_model_configuration` section contains the following parameters:

- `labels_to_ignore`: A list of labels to ignore. For example, `O` (no entity) or entities you are not interested in returning.
- `aggregation_strategy`: The strategy to use when aggregating the results of the transformers model.
- `stride`: The value is the length of the window overlap in transformer tokenizer tokens.
- `alignment_mode`: The strategy to use when aligning the results of the transformers model to the original text.
- `model_to_presidio_entity_mapping`: A mapping between the transformers model labels and the Presidio entity types.
- `low_confidence_score_multiplier`: A multiplier to apply to the score of entities with low confidence.
- `low_score_entity_names`: A list of entity types to apply the low confidence score multiplier to.

See more information on parameters on the [spacy-huggingface-pipelines Github repo](https://github.com/explosion/spacy-huggingface-pipelines#token-classification).

Once created, see [the NLP configuration documentation](../customizing_nlp_models.md#Configure-Presidio-to-use-the-new-model) for more information.

#### Calling the new model
##### Calling the new model
Once the configuration file is created, it can be used to create a new `TransformersNlpEngine`:

@@ -143,6 +193,26 @@ Once the configuration file is created, it can be used to create a new `Transfor
print(results_english)
```

#### Explaining the configuration options

- `model_name.spacy` is the name of a spaCy model/pipeline, which wraps the transformers NER model. For example, `en_core_web_sm`.
- `model_name.transformers` is the full name of a Hugging Face model. Models can be found on the [HuggingFace Models Hub](https://huggingface.co/models?pipeline_tag=token-classification). For example, `obi/deid_roberta_i2b2`.

The `ner_model_configuration` section contains the following parameters (a Python sketch follows the list):

- `labels_to_ignore`: A list of labels to ignore. For example, `O` (no entity) or entities you are not interested in returning.
- `aggregation_strategy`: The strategy to use when aggregating the results of the transformers model.
- `stride`: The length of the window overlap, in transformer tokenizer tokens.
- `alignment_mode`: The strategy to use when aligning the results of the transformers model to the original text.
- `model_to_presidio_entity_mapping`: A mapping between the transformers model labels and the Presidio entity types.
- `low_confidence_score_multiplier`: A multiplier to apply to the score of entities with low confidence.
- `low_score_entity_names`: A list of entity types to apply the low confidence score multiplier to.
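
The same options can also be set from Python; a minimal sketch, assuming the `NerModelConfiguration` constructor arguments mirror the YAML keys above:

```python
from presidio_analyzer.nlp_engine import NerModelConfiguration

# Sketch only: argument names are assumed to mirror the YAML keys
ner_model_configuration = NerModelConfiguration(
    labels_to_ignore=["O"],
    aggregation_strategy="max",
    stride=16,
    alignment_mode="expand",
    model_to_presidio_entity_mapping={"PER": "PERSON", "LOC": "LOCATION"},
    low_confidence_score_multiplier=0.4,
    low_score_entity_names=["ID"],
)
```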

See more information on parameters on the [spacy-huggingface-pipelines Github repo](https://github.com/explosion/spacy-huggingface-pipelines#token-classification).

Once created, see [the NLP configuration documentation](../customizing_nlp_models.md#Configure-Presidio-to-use-the-new-model) for more information.


### Training your own model

!!! note "Note"
2 changes: 1 addition & 1 deletion docs/analyzer/recognizer_registry_provider.md
@@ -89,7 +89,7 @@ The recognizer list comprises of both the predefined and custom recognizers, for
deny_list_score: 1
```
The recognizer parameters:
### The recognizer parameters
- `supported_languages`: A list of languages the recognizer supports. If this field is missing, a recognizer will be created for each language provided to the `AnalyzerEngine`.
In addition to the language code, this field can also contain a list of context words, which increase confidence in the detection when they are found in the surroundings of a detected entity (as seen in the credit card example above).
1 change: 1 addition & 0 deletions docs/supported_entities.md
@@ -40,6 +40,7 @@ For more information, refer to the [adding new recognizers documentation](analyz
|Entity Type|Description|Detection Method|
|--- |--- |--- |
|UK_NHS|A UK NHS number is 10 digits.|Pattern match, context and checksum|
|UK_NINO|UK [National Insurance Number](https://en.wikipedia.org/wiki/National_Insurance_number) is a unique identifier used in the administration of National Insurance and tax.|Pattern match and context|

### Spain

@@ -53,6 +53,11 @@ recognizers:
- en
type: predefined

- name: UkNinoRecognizer
supported_languages:
- en
type: predefined

- name: SgFinRecognizer
supported_languages:
- en
@@ -163,4 +168,4 @@ recognizers:
type: predefined

- name: InVoterRecognizer
type: predefined
type: predefined
3 changes: 2 additions & 1 deletion presidio-analyzer/presidio_analyzer/conf/transformers.yaml
@@ -36,8 +36,9 @@ ner_model_configuration:
TIME: DATE_TIME
PHONE: PHONE_NUMBER
HCW: PERSON
HOSPITAL: ORGANIZATION
HOSPITAL: LOCATION
FACILITY: LOCATION
VENDOR: ORGANIZATION

low_confidence_score_multiplier: 0.4
low_score_entity_names:
@@ -37,6 +37,7 @@
from .spacy_recognizer import SpacyRecognizer
from .stanza_recognizer import StanzaRecognizer
from .uk_nhs_recognizer import NhsRecognizer
from .uk_nino_recognizer import UkNinoRecognizer
from .url_recognizer import UrlRecognizer
from .us_bank_recognizer import UsBankRecognizer
from .us_driver_license_recognizer import UsLicenseRecognizer
@@ -104,4 +105,5 @@
"InPassportRecognizer",
"FiPersonalIdentityCodeRecognizer",
"EsNieRecognizer",
"UkNinoRecognizer",
]
@@ -25,6 +25,7 @@ def __init__(
ta_client: Optional["TextAnalyticsClient"] = None,
azure_ai_key: Optional[str] = None,
azure_ai_endpoint: Optional[str] = None,
**kwargs
):
"""
Wrap the PII detection in Azure AI Language.
@@ -36,6 +37,7 @@
the client will be created using the key and endpoint.
:param azure_ai_key: Azure AI for language key
:param azure_ai_endpoint: Azure AI for language endpoint
:param kwargs: Additional arguments required by the parent class
For more info, see https://learn.microsoft.com/en-us/azure/ai-services/language-service/personally-identifiable-information/overview
""" # noqa E501
@@ -45,6 +47,7 @@
supported_language=supported_language,
name="Azure AI Language PII",
version="5.2.0",
**kwargs
)

is_available = bool(TextAnalyticsClient)
@@ -0,0 +1,40 @@
from typing import List, Optional

from presidio_analyzer import Pattern, PatternRecognizer


class UkNinoRecognizer(PatternRecognizer):
"""
Recognizes UK National Insurance Number using regex.
:param patterns: List of patterns to be used by this recognizer
:param context: List of context words to increase confidence in detection
:param supported_language: Language this recognizer supports
:param supported_entity: The entity this recognizer can detect
"""

PATTERNS = [
Pattern(
"NINO (medium)",
r"\b(?!bg|gb|nk|kn|nt|tn|zz|BG|GB|NK|KN|NT|TN|ZZ) ?([a-ceghj-pr-tw-zA-CEGHJ-PR-TW-Z]{1}[a-ceghj-npr-tw-zA-CEGHJ-NPR-TW-Z]{1}) ?([0-9]{2}) ?([0-9]{2}) ?([0-9]{2}) ?([a-dA-D{1}])\b", # noqa: E501
0.5,
),
]

    CONTEXT = ["national insurance", "ni number", "nino"]

    def __init__(
        self,
        patterns: Optional[List[Pattern]] = None,
        context: Optional[List[str]] = None,
        supported_language: str = "en",
        supported_entity: str = "UK_NINO",
    ):
        patterns = patterns if patterns else self.PATTERNS
        context = context if context else self.CONTEXT
        super().__init__(
            supported_entity=supported_entity,
            patterns=patterns,
            context=context,
            supported_language=supported_language,
        )
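
For a quick, informal check of the new recognizer on its own, something like the following could be used (the NI number below is a made-up example):

```python
from presidio_analyzer.predefined_recognizers import UkNinoRecognizer

# Hypothetical smoke test; "AB 12 34 56 C" is a fabricated NINO-like string
recognizer = UkNinoRecognizer()
results = recognizer.analyze(text="my ni number is AB 12 34 56 C", entities=["UK_NINO"])
print(results)
```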
@@ -27,8 +27,8 @@ class UsSsnRecognizer(PatternRecognizer):
# "sec", # Task #603: Support keyphrases ("social sec")
"ssn",
"ssns",
"ssn#",
"ss#",
# "ssn#", # iss:1452 - a # does not work with LemmaContextAwareEnhancer
# "ss#", # iss:1452 - a # does not work with LemmaContextAwareEnhancer
"ssid",
]
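
If the removed context words matter for a particular deployment, they can be passed back in when constructing the recognizer; a hypothetical example:

```python
from presidio_analyzer.predefined_recognizers import UsSsnRecognizer

# Hypothetical: override the default context words at construction time
ssn_recognizer = UsSsnRecognizer(context=["social", "security", "ssn", "ssns", "ssid", "ssn#", "ss#"])
```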

18 changes: 18 additions & 0 deletions presidio-analyzer/tests/conf/test_azure_ai_language_reco.yaml
@@ -0,0 +1,18 @@
recognizer_registry:
  global_regex_flags: 26
  recognizers:
    - name: MockAzureAiLanguageRecognizer
      type: predefined
      ta_client: "test" # This is a placeholder for testing purposes


supported_languages:
  - en
default_score_threshold: 0.7

nlp_configuration:
  nlp_engine_name: spacy
  models:
    -
      lang_code: en
      model_name: en_core_web_lg
12 changes: 9 additions & 3 deletions presidio-analyzer/tests/data/context_sentences_tests.txt
@@ -8,16 +8,22 @@ IP_ADDRESS
my ip: 192.168.0.1

US_SSN
my ssn is 078-051120 07805-1120
my ssn is 078-051121

US_SSN
my social security number is 078051120

US_SSN
my social security number is 078-05-1120
my social security number is 078-05-1121

US_SSN
my social security number is 078051120
my social security number is 078051121

US_SSN
my ssns is 078-05-1121

US_SSN
my ssid is 078-05-1121

PHONE_NUMBER
my phone number is (425) 882-9090