From f22695be7e43f02f4c28a84340dfd0429c078e33 Mon Sep 17 00:00:00 2001 From: miltonsim Date: Sat, 17 Feb 2024 09:52:32 +0800 Subject: [PATCH 1/3] feat: Support 'M' prefix in SG_NRIC_FIN Recognizer and expand tests - Updated SG_NRIC_FIN Recognizer to include 'M' prefix for validating NRIC numbers issued to foreigners from 2022 onwards. - Added new test cases to ensure comprehensive coverage and validation accuracy for all supported NRIC prefixes (S, T, F, G, M). --- .../sg_fin_recognizer.py | 2 +- .../tests/test_sg_fin_recognizer.py | 55 ++++++++++++++----- 2 files changed, 43 insertions(+), 14 deletions(-) diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/sg_fin_recognizer.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/sg_fin_recognizer.py index 35c04a2c1..985c8ace7 100644 --- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/sg_fin_recognizer.py +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/sg_fin_recognizer.py @@ -19,7 +19,7 @@ class SgFinRecognizer(PatternRecognizer): PATTERNS = [ Pattern("Nric (weak)", r"(?i)(\b[A-Z][0-9]{7}[A-Z]\b)", 0.3), - Pattern("Nric (medium)", r"(?i)(\b[STFG][0-9]{7}[A-Z]\b)", 0.5), + Pattern("Nric (medium)", r"(?i)(\b[STFGM][0-9]{7}[A-Z]\b)", 0.5), ] CONTEXT = ["fin", "fin#", "nric", "nric#"] diff --git a/presidio-analyzer/tests/test_sg_fin_recognizer.py b/presidio-analyzer/tests/test_sg_fin_recognizer.py index 92b068dc4..8e8ff45c7 100644 --- a/presidio-analyzer/tests/test_sg_fin_recognizer.py +++ b/presidio-analyzer/tests/test_sg_fin_recognizer.py @@ -1,6 +1,6 @@ import pytest -from tests import assert_result +from tests import assert_result_within_score_range from presidio_analyzer.predefined_recognizers import SgFinRecognizer @@ -15,28 +15,57 @@ def entities(): @pytest.mark.parametrize( - "text, expected_len, expected_position, expected_score", + "text, expected_len, expected_positions, expected_score_ranges", [ # fmt: off - ("G1122144L", 1, (0, 9), 0.5), 
("PA12348L", 0, (), (),) + ## Medium match + # Test with valid NRIC/FIN starting with S + ("S2740116C", 1, [(0, 9)], [(0.5, 0.8)]), + # Test with valid NRIC/FIN starting with T + ("T1234567Z", 1, [(0, 9)], [(0.5, 0.8)]), + # Test with valid NRIC/FIN starting with F + ("F2346401L", 1, [(0, 9)], [(0.5, 0.8)]), + # Test with valid NRIC/FIN starting with G + ("G1122144L", 1, [(0, 9)], [(0.5, 0.8)]), + # Test with valid NRIC/FIN starting with M + ("M4332674T", 1, [(0, 9)], [(0.5, 0.8)]), + # Test with multiple valid NRIC/FINs + ("S9108268C T7572225C", 2, [(0, 9), (10, 19)], [(0.5, 0.8)] * 2), + + # ## Weak match + # Test with invalid NRIC/FIN starting with A + ("A1234567Z", 1, [(0, 9)], [(0, 0.3)]), + # # Test with invalid NRIC/FIN starting with B + ("B1234567Z", 1, [(0, 9)], [(0, 0.3)]), + + ## No match + # Test with invalid length + ("PA12348L", 0, [], []), + # Test with empty string + ("", 0, [], []), # fmt: on ], ) def test_when_sgfins_in_text_then_all_sg_fins_found( text, expected_len, - expected_position, - expected_score, + expected_positions, + expected_score_ranges, recognizer, entities, + max_score, ): results = recognizer.analyze(text, entities) assert len(results) == expected_len - if results: - assert_result( - results[0], - entities[0], - expected_position[0], - expected_position[1], - expected_score, - ) + + for result, (start_pos, end_pos), (start_score, end_score) in zip( + results, expected_positions, expected_score_ranges + ): + # Adjust end_score if it's marked with a placeholder value that indicates it should be considered as max_score + if end_score == "max": + end_score = max_score + + # Assuming assert_result_within_score_range checks the position and verifies the score is within the specified range + assert_result_within_score_range( + result, entities[0], start_pos, end_pos, start_score, end_score + ) \ No newline at end of file From ac409156b3a3dc16542f9dd49ecd6f5680ce3a9c Mon Sep 17 00:00:00 2001 From: miltonsim Date: Sat, 17 Feb 2024 
11:45:56 +0800 Subject: [PATCH 2/3] test: Add NRIC/FIN recognition test with surrounding text - Introduced a new test case to validate the recognition of valid NRIC/FIN numbers when embedded within a sentence --- presidio-analyzer/tests/test_sg_fin_recognizer.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/presidio-analyzer/tests/test_sg_fin_recognizer.py b/presidio-analyzer/tests/test_sg_fin_recognizer.py index 8e8ff45c7..e18720abe 100644 --- a/presidio-analyzer/tests/test_sg_fin_recognizer.py +++ b/presidio-analyzer/tests/test_sg_fin_recognizer.py @@ -31,6 +31,8 @@ def entities(): ("M4332674T", 1, [(0, 9)], [(0.5, 0.8)]), # Test with multiple valid NRIC/FINs ("S9108268C T7572225C", 2, [(0, 9), (10, 19)], [(0.5, 0.8)] * 2), + # Test with valid NRIC/FIN in a sentence + ("NRIC S2740116C was processed", 1, [(5, 14)], [(0.5, 0.8)]), # ## Weak match # Test with invalid NRIC/FIN starting with A @@ -61,6 +63,8 @@ def test_when_sgfins_in_text_then_all_sg_fins_found( for result, (start_pos, end_pos), (start_score, end_score) in zip( results, expected_positions, expected_score_ranges ): + import logging + logging.info(f"result: {result}") # Adjust end_score if it's marked with a placeholder value that indicates it should be considered as max_score if end_score == "max": end_score = max_score From 9b0f6d4518a106a4592ce38e34b57c677e22c2bf Mon Sep 17 00:00:00 2001 From: miltonsim Date: Sat, 17 Feb 2024 11:46:26 +0800 Subject: [PATCH 3/3] docs: Update FIN number pattern to include 'M' --- .../predefined_recognizers/sg_fin_recognizer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/sg_fin_recognizer.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/sg_fin_recognizer.py index 985c8ace7..eb8b31748 100644 --- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/sg_fin_recognizer.py +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/sg_fin_recognizer.py @@ 
-2,8 +2,8 @@ from presidio_analyzer import Pattern, PatternRecognizer -# Weak pattern: all FIN number start with "S", "T", "F" or "G" -# and ends with a character, e.g., G3311100L +# Weak pattern: all FIN numbers start with "S", "T", "F", "G" or "M" +# and end with a letter, e.g., S2740116C # Ref: https://en.wikipedia.org/wiki/National_Registration_Identity_Card