From f22695be7e43f02f4c28a84340dfd0429c078e33 Mon Sep 17 00:00:00 2001
From: miltonsim <miltonsim2000@gmail.com>
Date: Sat, 17 Feb 2024 09:52:32 +0800
Subject: [PATCH 1/3] feat: Support 'M' prefix in SG_NRIC_FIN Recognizer and
 expand tests

- Updated the SG_NRIC_FIN recognizer to accept the 'M' prefix, which is used for FINs (Foreign Identification Numbers) issued to foreigners from 2022 onwards.
- Added new test cases to ensure comprehensive coverage and validation accuracy for all supported NRIC prefixes (S, T, F, G, M).
---
 .../sg_fin_recognizer.py                      |  2 +-
 .../tests/test_sg_fin_recognizer.py           | 55 ++++++++++++++-----
 2 files changed, 43 insertions(+), 14 deletions(-)

diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/sg_fin_recognizer.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/sg_fin_recognizer.py
index 35c04a2c1..985c8ace7 100644
--- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/sg_fin_recognizer.py
+++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/sg_fin_recognizer.py
@@ -19,7 +19,7 @@ class SgFinRecognizer(PatternRecognizer):
 
     PATTERNS = [
         Pattern("Nric (weak)", r"(?i)(\b[A-Z][0-9]{7}[A-Z]\b)", 0.3),
-        Pattern("Nric (medium)", r"(?i)(\b[STFG][0-9]{7}[A-Z]\b)", 0.5),
+        Pattern("Nric (medium)", r"(?i)(\b[STFGM][0-9]{7}[A-Z]\b)", 0.5),
     ]
 
     CONTEXT = ["fin", "fin#", "nric", "nric#"]
diff --git a/presidio-analyzer/tests/test_sg_fin_recognizer.py b/presidio-analyzer/tests/test_sg_fin_recognizer.py
index 92b068dc4..8e8ff45c7 100644
--- a/presidio-analyzer/tests/test_sg_fin_recognizer.py
+++ b/presidio-analyzer/tests/test_sg_fin_recognizer.py
@@ -1,6 +1,6 @@
 import pytest
 
-from tests import assert_result
+from tests import assert_result_within_score_range
 from presidio_analyzer.predefined_recognizers import SgFinRecognizer
 
 
@@ -15,28 +15,57 @@ def entities():
 
 
 @pytest.mark.parametrize(
-    "text, expected_len, expected_position, expected_score",
+    "text, expected_len, expected_positions, expected_score_ranges",
     [
         # fmt: off
-        ("G1122144L", 1, (0, 9), 0.5), ("PA12348L", 0, (), (),)
+        ## Medium match
+        # Test with valid NRIC/FIN starting with S
+        ("S2740116C", 1, [(0, 9)], [(0.5, 0.8)]),
+        # Test with valid NRIC/FIN starting with T
+        ("T1234567Z", 1, [(0, 9)], [(0.5, 0.8)]),
+        # Test with valid NRIC/FIN starting with F
+        ("F2346401L", 1, [(0, 9)], [(0.5, 0.8)]),
+        # Test with valid NRIC/FIN starting with G
+        ("G1122144L", 1, [(0, 9)], [(0.5, 0.8)]),
+        # Test with valid NRIC/FIN starting with M
+        ("M4332674T", 1, [(0, 9)], [(0.5, 0.8)]),
+        # Test with multiple valid NRIC/FINs
+        ("S9108268C T7572225C", 2, [(0, 9), (10, 19)], [(0.5, 0.8)] * 2),
+
+        # ## Weak match
+        # Test with invalid NRIC/FIN starting with A
+        ("A1234567Z", 1, [(0, 9)], [(0, 0.3)]),
+        # # Test with invalid NRIC/FIN starting with B
+        ("B1234567Z", 1, [(0, 9)], [(0, 0.3)]),
+        
+        ## No match
+        # Test with invalid length
+        ("PA12348L", 0, [], []),
+        # Test with empty string
+        ("", 0, [], []),
         # fmt: on
     ],
 )
 def test_when_sgfins_in_text_then_all_sg_fins_found(
     text,
     expected_len,
-    expected_position,
-    expected_score,
+    expected_positions,
+    expected_score_ranges,
     recognizer,
     entities,
+    max_score,
 ):
     results = recognizer.analyze(text, entities)
     assert len(results) == expected_len
-    if results:
-        assert_result(
-            results[0],
-            entities[0],
-            expected_position[0],
-            expected_position[1],
-            expected_score,
-        )
+
+    for result, (start_pos, end_pos), (start_score, end_score) in zip(
+        results, expected_positions, expected_score_ranges
+    ):
+        # Adjust end_score if it's marked with a placeholder value that indicates it should be considered as max_score
+        if end_score == "max":
+            end_score = max_score
+
+        # Assuming assert_result_within_score_range checks the position and verifies the score is within the specified range
+        assert_result_within_score_range(
+            result, entities[0], start_pos, end_pos, start_score, end_score
+        )
\ No newline at end of file

From ac409156b3a3dc16542f9dd49ecd6f5680ce3a9c Mon Sep 17 00:00:00 2001
From: miltonsim <miltonsim2000@gmail.com>
Date: Sat, 17 Feb 2024 11:45:56 +0800
Subject: [PATCH 2/3] test: Add NRIC/FIN recognition test with surrounding text

- Introduced a new test case to validate that a valid NRIC/FIN number is recognized when it is embedded within a sentence.
---
 presidio-analyzer/tests/test_sg_fin_recognizer.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/presidio-analyzer/tests/test_sg_fin_recognizer.py b/presidio-analyzer/tests/test_sg_fin_recognizer.py
index 8e8ff45c7..e18720abe 100644
--- a/presidio-analyzer/tests/test_sg_fin_recognizer.py
+++ b/presidio-analyzer/tests/test_sg_fin_recognizer.py
@@ -31,6 +31,8 @@ def entities():
         ("M4332674T", 1, [(0, 9)], [(0.5, 0.8)]),
         # Test with multiple valid NRIC/FINs
         ("S9108268C T7572225C", 2, [(0, 9), (10, 19)], [(0.5, 0.8)] * 2),
+        # Test with valid NRIC/FIN in a sentence
+        ("NRIC S2740116C was processed", 1, [(5, 14)], [(0.5, 0.8)]),
 
         # ## Weak match
         # Test with invalid NRIC/FIN starting with A
@@ -61,6 +63,8 @@ def test_when_sgfins_in_text_then_all_sg_fins_found(
     for result, (start_pos, end_pos), (start_score, end_score) in zip(
         results, expected_positions, expected_score_ranges
     ):
+        import logging
+        logging.info(f"result: {result}")
         # Adjust end_score if it's marked with a placeholder value that indicates it should be considered as max_score
         if end_score == "max":
             end_score = max_score

From 9b0f6d4518a106a4592ce38e34b57c677e22c2bf Mon Sep 17 00:00:00 2001
From: miltonsim <miltonsim2000@gmail.com>
Date: Sat, 17 Feb 2024 11:46:26 +0800
Subject: [PATCH 3/3] docs: Update FIN number pattern to include 'M'

---
 .../predefined_recognizers/sg_fin_recognizer.py               | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/sg_fin_recognizer.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/sg_fin_recognizer.py
index 985c8ace7..eb8b31748 100644
--- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/sg_fin_recognizer.py
+++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/sg_fin_recognizer.py
@@ -2,8 +2,8 @@
 
 from presidio_analyzer import Pattern, PatternRecognizer
 
-# Weak pattern: all FIN number start with "S", "T", "F" or "G"
-# and ends with a character, e.g., G3311100L
+# Weak pattern: all FIN numbers start with "S", "T", "F", "G" or "M"
+# and end with a letter, e.g., S2740116C
 # Ref: https://en.wikipedia.org/wiki/National_Registration_Identity_Card