Skip to content

Commit

Permalink
predefined pattern recognizer : IN_VEHICLE_REGISTRATION (#1288)
Browse files Browse the repository at this point in the history
* IN_PAN pattern recognizer

Added India PAN (Permanent Account Number) recognizer

* refined IN_PAN regex

refined the regex for better recognition and enhanced the test cases accordingly

* Update recognizer_registry.py

Fixed lint error that was missed earlier.

* Fixed Lint errors

Added test cases, verification and context data

* Added more test cases in test_in_pan_recognizer.py

Added negative test cases per review comments.

* added IN_AADHAAR recognizer

* Update in_aadhaar_recognizer.py

linted code

* Update in_aadhaar_recognizer.py

update pattern recognizer value per suggestion in review

* added utility function class

added PresidioAnalyzerUtils class with generic functions. removed usage of stdnum

* Create test_analyzer_utils.py

added test cases for analyzer_utils.py in prescribed format

* Update test_recognizer_registry.py

added to the count of predefined recognizers

* added predefined recognizer : IN_VEHICLE_REGISTRATION

Added India specific predefined pattern recognizer for vehicle registration number

* review comments incorporated

reinstated python 3.9 compatibility, reorganized code

* review comments incorporated

Logic reverted from analyzer_utils to recognizer classfile

* added null/min vehicle number size

added min size check to avoid failures per review comment

* incorporated review comments

---------

Co-authored-by: Omri Mendels <omri374@users.noreply.github.com>
  • Loading branch information
devopam and omri374 authored Feb 21, 2024
1 parent a8d2c90 commit dee6562
Show file tree
Hide file tree
Showing 8 changed files with 518 additions and 11 deletions.
1 change: 1 addition & 0 deletions docs/supported_entities.md
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ For more information, refer to the [adding new recognizers documentation](analyz
|------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------|--- |
| IN_PAN | The Indian Permanent Account Number (PAN) is a unique 10 character alphanumeric identifier issued to all business and individual entities registered as Tax Payers. | Pattern match, context |
| IN_AADHAAR | Indian government issued unique 12 digit individual identity number | Pattern match, context, and checksum |
| IN_VEHICLE_REGISTRATION | Indian government issued transport (govt, personal, diplomatic, defence) vehicle registration number | Pattern match, context, and checksum |

## Adding a custom PII entity

Expand Down
4 changes: 2 additions & 2 deletions presidio-analyzer/presidio_analyzer/analyzer_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@ class PresidioAnalyzerUtils:
"""
Utility functions for Presidio Analyzer.
The class provides a bundle of utility functions that help centralizing the logic
for reusability and maintainability
The class provides a bundle of utility functions that help centralizing the
logic for re-usability and maintainability
"""

@staticmethod
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
from .pl_pesel_recognizer import PlPeselRecognizer
from .azure_ai_language import AzureAILanguageRecognizer
from .in_aadhaar_recognizer import InAadhaarRecognizer
from .in_vehicle_registration_recognizer import InVehicleRegistrationRecognizer

NLP_RECOGNIZERS = {
"spacy": SpacyRecognizer,
Expand Down Expand Up @@ -78,5 +79,6 @@
"InPanRecognizer",
"PlPeselRecognizer",
"AzureAILanguageRecognizer",
"InAadhaarRecognizer"
"InAadhaarRecognizer",
"InVehicleRegistrationRecognizer",
]

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions presidio-analyzer/presidio_analyzer/recognizer_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
InPanRecognizer,
PlPeselRecognizer,
InAadhaarRecognizer,
InVehicleRegistrationRecognizer,
)

logger = logging.getLogger("presidio-analyzer")
Expand Down Expand Up @@ -103,6 +104,7 @@ def load_predefined_recognizers(
AuMedicareRecognizer,
InPanRecognizer,
InAadhaarRecognizer,
InVehicleRegistrationRecognizer,
],
"es": [EsNifRecognizer],
"it": [
Expand Down
2 changes: 1 addition & 1 deletion presidio-analyzer/tests/test_analyzer_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,9 @@
["abCba", False, True],
["ABBA", False, True],
["aBba", True, True],
["NotAPalindrome", True, False],
]


sanitizer_test_set = [
[" a|b:c ::-", [("-", ""), (" ", ""), (":", ""), ("|", "")], "abc"],
["def", "", "def"],
Expand Down
102 changes: 102 additions & 0 deletions presidio-analyzer/tests/test_in_vehicle_registration_recognizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
import pytest

from tests import assert_result
from presidio_analyzer.predefined_recognizers import InVehicleRegistrationRecognizer


@pytest.fixture(scope="module")
def recognizer():
    """Build one InVehicleRegistrationRecognizer shared by the whole module."""
    vehicle_recognizer = InVehicleRegistrationRecognizer()
    return vehicle_recognizer


@pytest.fixture(scope="module")
def entities():
    """Return the list of entity names requested from the recognizer."""
    requested_entities = ["IN_VEHICLE_REGISTRATION"]
    return requested_entities


@pytest.mark.parametrize(
    "text, expected_len, expected_position, expected_score",
    [
        # fmt: off
        ("KA53ME3456", 1, (0, 10), 1),
        ("KA99ME3456", 1, (0, 10), 0.50),
        ("MN2412", 1, (0, 6), 0.01),
        ("MCX1243", 1, (0, 7), 0.2),
        ("I15432", 1, (0, 6), 0.01),
        ("DL3CJI0001", 1, (0, 10), 1),
        ("ABNE123456", 0, (), (),),
        ("My Bike's registration number is OD02BA2341 with a lot of text beyond",
         1, (33, 43), 1),
        # fmt: on
    ],
)
def test_when_regn_in_text_then_all_regns_found(
    text,
    expected_len,
    expected_position,
    expected_score,
    recognizer,
    entities,
):
    """
    Analyze each sample text and verify the recognizer output.

    The number of results must equal ``expected_len``. When at least one
    result is returned, the first one is handed to ``assert_result`` together
    with the expected entity name, start/end offsets and score.
    """
    results = recognizer.analyze(text, entities)

    assert len(results) == expected_len

    # Nothing further to verify when the recognizer returned no results.
    if not results:
        return

    first_result = results[0]
    entity_name = entities[0]
    start = expected_position[0]
    end = expected_position[1]
    assert_result(first_result, entity_name, start, end, expected_score)


def test_list_length():
    """
    Check the static size of every metadata list defined on the recognizer.

    Each entry maps a class attribute of InVehicleRegistrationRecognizer to
    the number of items it is expected to hold; entries are checked in order
    and the first mismatch fails the test.
    """
    expected_sizes = {
        "in_old_states": 3,
        "in_non_standard_state_or_ut": 1,
        "in_states": 29,
        "in_old_union_territories": 2,
        "in_vehicle_dist_wb": 97,
        "in_vehicle_dist_up": 85,
        "in_vehicle_dist_uk": 20,
        "in_vehicle_dist_ts": 37,
        "in_vehicle_dist_tr": 8,
        "in_vehicle_dist_tn": 98,
        "in_vehicle_dist_sk": 8,
        "in_vehicle_dist_rj": 57,
        "in_vehicle_dist_py": 5,
        "in_vehicle_dist_pb": 98,
        "in_vehicle_dist_or": 30,
        "in_vehicle_dist_od": 34,
        "in_vehicle_dist_nl": 10,
        "in_vehicle_dist_mz": 8,
        "in_vehicle_dist_mp": 70,
        "in_vehicle_dist_mn": 7,
        "in_vehicle_dist_ml": 10,
        "in_vehicle_dist_mh": 50,
        "in_vehicle_dist_ld": 9,
        "in_vehicle_dist_la": 2,
        "in_vehicle_dist_kl": 98,
        "in_vehicle_dist_ka": 70,
        "in_vehicle_dist_jh": 23,
        "in_vehicle_dist_hr": 98,
        "in_vehicle_dist_hp": 98,
        "in_vehicle_dist_gj": 39,
        "in_vehicle_dist_ga": 12,
        "in_vehicle_dist_dl": 13,
        "in_vehicle_dist_dn": 1,
        "in_vehicle_dist_dd": 3,
        "in_vehicle_dist_ch": 4,
        "in_vehicle_dist_cg": 30,
        "in_vehicle_dist_br": 38,
        "in_vehicle_dist_as": 33,
        "in_vehicle_dist_ar": 20,
        "in_vehicle_dist_ap": 2,
        "in_vehicle_dist_an": 1,
        "in_vehicle_diplomatic_codes": 3,
        "in_vehicle_armed_forces_codes": 11,
        "in_vehicle_foreign_mission_codes": 41,
    }

    for attribute_name, expected_size in expected_sizes.items():
        metadata_list = getattr(InVehicleRegistrationRecognizer, attribute_name)
        assert len(metadata_list) == expected_size
11 changes: 4 additions & 7 deletions presidio-analyzer/tests/test_recognizer_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
PatternRecognizer,
EntityRecognizer,
Pattern,
AnalyzerEngine
AnalyzerEngine,
)
from presidio_analyzer.predefined_recognizers import SpacyRecognizer

Expand Down Expand Up @@ -57,8 +57,8 @@ def test_when_get_recognizers_then_all_recognizers_returned(mock_recognizer_regi
registry = mock_recognizer_registry
registry.load_predefined_recognizers()
recognizers = registry.get_recognizers(language="en", all_fields=True)
# 1 custom recognizer in english + 23 predefined
assert len(recognizers) == 1 + 23
# 1 custom recognizer in english + 24 predefined
assert len(recognizers) == 1 + 24


def test_when_get_recognizers_then_return_all_fields(mock_recognizer_registry):
Expand Down Expand Up @@ -229,9 +229,6 @@ def test_recognizer_removed_and_returned_entities_are_correct():
assert "DATE_TIME" in supported_entities
assert "PERSON" not in supported_entities

analyzer = AnalyzerEngine(
registry=registry,
supported_languages='en'
)
analyzer = AnalyzerEngine(registry=registry, supported_languages="en")

analyzer.analyze("My name is David", language="en")

0 comments on commit dee6562

Please sign in to comment.