Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Support 'M' prefix in SG_NRIC_FIN Recognizer and expand tests #1304

Merged
merged 3 commits into from
Feb 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@

from presidio_analyzer import Pattern, PatternRecognizer

# Weak pattern: all FIN number start with "S", "T", "F" or "G"
# and ends with a character, e.g., G3311100L
# Weak pattern: all FIN numbers start with "S", "T", "F", "G" or "M"
# and end with a letter, e.g., S2740116C
# Ref: https://en.wikipedia.org/wiki/National_Registration_Identity_Card


Expand All @@ -19,7 +19,7 @@ class SgFinRecognizer(PatternRecognizer):

PATTERNS = [
Pattern("Nric (weak)", r"(?i)(\b[A-Z][0-9]{7}[A-Z]\b)", 0.3),
Pattern("Nric (medium)", r"(?i)(\b[STFG][0-9]{7}[A-Z]\b)", 0.5),
Pattern("Nric (medium)", r"(?i)(\b[STFGM][0-9]{7}[A-Z]\b)", 0.5),
]

CONTEXT = ["fin", "fin#", "nric", "nric#"]
Expand Down
59 changes: 46 additions & 13 deletions presidio-analyzer/tests/test_sg_fin_recognizer.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import pytest

from tests import assert_result
from tests import assert_result_within_score_range
from presidio_analyzer.predefined_recognizers import SgFinRecognizer


Expand All @@ -15,28 +15,61 @@ def entities():


@pytest.mark.parametrize(
"text, expected_len, expected_position, expected_score",
"text, expected_len, expected_positions, expected_score_ranges",
[
# fmt: off
("G1122144L", 1, (0, 9), 0.5), ("PA12348L", 0, (), (),)
## Medium match
# Test with valid NRIC/FIN starting with S
("S2740116C", 1, [(0, 9)], [(0.5, 0.8)]),
# Test with valid NRIC/FIN starting with T
("T1234567Z", 1, [(0, 9)], [(0.5, 0.8)]),
# Test with valid NRIC/FIN starting with F
("F2346401L", 1, [(0, 9)], [(0.5, 0.8)]),
# Test with valid NRIC/FIN starting with G
("G1122144L", 1, [(0, 9)], [(0.5, 0.8)]),
# Test with valid NRIC/FIN starting with M
("M4332674T", 1, [(0, 9)], [(0.5, 0.8)]),
# Test with multiple valid NRIC/FINs
("S9108268C T7572225C", 2, [(0, 9), (10, 19)], [(0.5, 0.8)] * 2),
# Test with valid NRIC/FIN in a sentence
("NRIC S2740116C was processed", 1, [(5, 14)], [(0.5, 0.8)]),

## Weak match
# Test with invalid NRIC/FIN starting with A
("A1234567Z", 1, [(0, 9)], [(0, 0.3)]),
# Test with invalid NRIC/FIN starting with B
("B1234567Z", 1, [(0, 9)], [(0, 0.3)]),

## No match
# Test with invalid length
("PA12348L", 0, [], []),
# Test with empty string
("", 0, [], []),
# fmt: on
],
)
def test_when_sgfins_in_text_then_all_sg_fins_found(
text,
expected_len,
expected_position,
expected_score,
expected_positions,
expected_score_ranges,
recognizer,
entities,
max_score,
):
results = recognizer.analyze(text, entities)
assert len(results) == expected_len
if results:
assert_result(
results[0],
entities[0],
expected_position[0],
expected_position[1],
expected_score,
)

for result, (start_pos, end_pos), (start_score, end_score) in zip(
results, expected_positions, expected_score_ranges
):
import logging
logging.info(f"result: {result}")
# An end_score of "max" is a placeholder meaning "use max_score" as the upper bound
if end_score == "max":
end_score = max_score

# Assuming assert_result_within_score_range checks the position and verifies the score is within the specified range
assert_result_within_score_range(
result, entities[0], start_pos, end_pos, start_score, end_score
)
Loading