microsoft · omri374 · Feb 19, 2024 · Feb 17, 2024 · Feb 17, 2024 · Feb 17, 2024
diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/sg_fin_recognizer.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/sg_fin_recognizer.py
@@ -2,8 +2,8 @@
 
 from presidio_analyzer import Pattern, PatternRecognizer
 
-# Weak pattern: all FIN number start with "S", "T", "F" or "G"
-# and ends with a character, e.g., G3311100L
+# Weak pattern: all FIN numbers start with "S", "T", "F", "G" or "M"
+# and end with a letter, e.g., S2740116C
 # Ref: https://en.wikipedia.org/wiki/National_Registration_Identity_Card
 
 
@@ -19,7 +19,7 @@ class SgFinRecognizer(PatternRecognizer):
 
     PATTERNS = [
         Pattern("Nric (weak)", r"(?i)(\b[A-Z][0-9]{7}[A-Z]\b)", 0.3),
-        Pattern("Nric (medium)", r"(?i)(\b[STFG][0-9]{7}[A-Z]\b)", 0.5),
+        Pattern("Nric (medium)", r"(?i)(\b[STFGM][0-9]{7}[A-Z]\b)", 0.5),
     ]
 
     CONTEXT = ["fin", "fin#", "nric", "nric#"]

diff --git a/presidio-analyzer/tests/test_sg_fin_recognizer.py b/presidio-analyzer/tests/test_sg_fin_recognizer.py
@@ -1,6 +1,6 @@
 import pytest
 
-from tests import assert_result
+from tests import assert_result_within_score_range
 from presidio_analyzer.predefined_recognizers import SgFinRecognizer
 
 
@@ -15,28 +15,61 @@ def entities():
 
 
 @pytest.mark.parametrize(
-    "text, expected_len, expected_position, expected_score",
+    "text, expected_len, expected_positions, expected_score_ranges",
     [
         # fmt: off
-        ("G1122144L", 1, (0, 9), 0.5), ("PA12348L", 0, (), (),)
+        ## Medium match
+        # Test with valid NRIC/FIN starting with S
+        ("S2740116C", 1, [(0, 9)], [(0.5, 0.8)]),
+        # Test with valid NRIC/FIN starting with T
+        ("T1234567Z", 1, [(0, 9)], [(0.5, 0.8)]),
+        # Test with valid NRIC/FIN starting with F
+        ("F2346401L", 1, [(0, 9)], [(0.5, 0.8)]),
+        # Test with valid NRIC/FIN starting with G
+        ("G1122144L", 1, [(0, 9)], [(0.5, 0.8)]),
+        # Test with valid NRIC/FIN starting with M
+        ("M4332674T", 1, [(0, 9)], [(0.5, 0.8)]),
+        # Test with multiple valid NRIC/FINs
+        ("S9108268C T7572225C", 2, [(0, 9), (10, 19)], [(0.5, 0.8)] * 2),
+        # Test with valid NRIC/FIN in a sentence
+        ("NRIC S2740116C was processed", 1, [(5, 14)], [(0.5, 0.8)]),
+
+        ## Weak match
+        # Test with invalid NRIC/FIN starting with A
+        ("A1234567Z", 1, [(0, 9)], [(0, 0.3)]),
+        # Test with invalid NRIC/FIN starting with B
+        ("B1234567Z", 1, [(0, 9)], [(0, 0.3)]),
+
+        ## No match
+        # Test with invalid length
+        ("PA12348L", 0, [], []),
+        # Test with empty string
+        ("", 0, [], []),
         # fmt: on
     ],
 )
 def test_when_sgfins_in_text_then_all_sg_fins_found(
     text,
     expected_len,
-    expected_position,
-    expected_score,
+    expected_positions,
+    expected_score_ranges,
     recognizer,
     entities,
+    max_score,
 ):
     results = recognizer.analyze(text, entities)
     assert len(results) == expected_len
-    if results:
-        assert_result(
-            results[0],
-            entities[0],
-            expected_position[0],
-            expected_position[1],
-            expected_score,
-        )
+
+    for result, (start_pos, end_pos), (start_score, end_score) in zip(
+        results, expected_positions, expected_score_ranges
+    ):
+        import logging  # NOTE(review): imported on every loop iteration; looks like a debugging leftover
+        logging.info(f"result: {result}")  # NOTE(review): debug output only; no assertion depends on it
+        # An upper bound of "max" is a placeholder that is replaced by the max_score argument
+        if end_score == "max":
+            end_score = max_score
+
+        # Presumably the helper checks entity type and position and that the score lies within [start_score, end_score] -- confirm in tests/__init__.py
+        assert_result_within_score_range(
+            result, entities[0], start_pos, end_pos, start_score, end_score
+        )