Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Regex Inspector and Email Inspector example. #115

Merged
merged 37 commits into from
Jan 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
eaaac1e
add InspectorInitError
MooooCat Jan 18, 2024
b88689b
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 18, 2024
bec9bec
[Sweep GHA Fix] The GitHub Actions run failed with... (#116)
sweep-ai[bot] Jan 18, 2024
058b810
add regex inspector (still draft)
MooooCat Jan 18, 2024
eaaae27
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 18, 2024
2b4daac
Merge branch 'main' into feature-regex-inspector
MooooCat Jan 19, 2024
d8e8ab1
add regex base inspector
MooooCat Jan 19, 2024
6843762
add some personal info inspector
MooooCat Jan 19, 2024
c4f5c61
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 19, 2024
3686323
fix hookimpl
MooooCat Jan 19, 2024
1de7d6a
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 19, 2024
b1df970
Merge branch 'main' into feature-regex-inspector
MooooCat Jan 20, 2024
4bfde14
Merge branch 'main' into feature-regex-inspector
MooooCat Jan 20, 2024
3223ca9
add _inspect_level
MooooCat Jan 22, 2024
bd69436
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 22, 2024
2deef03
discard weird change from sweep
MooooCat Jan 22, 2024
de216e5
fix typo in sweep commits
MooooCat Jan 22, 2024
b0404ef
add PII attribute
MooooCat Jan 22, 2024
f752406
add email test case (still draft)
MooooCat Jan 22, 2024
4e8d1b6
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 22, 2024
73fce1b
add personal info inspector
MooooCat Jan 23, 2024
1248f11
add test cases (still draft)
MooooCat Jan 23, 2024
91bf735
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 23, 2024
861b9a0
add domain_verification
MooooCat Jan 24, 2024
10f27b2
update localized inspectors
MooooCat Jan 24, 2024
b1cad2d
add test cases
MooooCat Jan 24, 2024
212eb98
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 24, 2024
945b548
discard sweep change
MooooCat Jan 24, 2024
bc1d724
fix col name typo
MooooCat Jan 24, 2024
9ebe668
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 24, 2024
0c9d08b
fix expection type
MooooCat Jan 24, 2024
17c117d
Merge branch 'main' into feature-regex-inspector
MooooCat Jan 24, 2024
f78a710
add corner test cases
MooooCat Jan 24, 2024
7e064b9
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 24, 2024
62b576e
fix version typo
MooooCat Jan 26, 2024
1166c32
add inspector manager testcase
MooooCat Jan 26, 2024
378d0a9
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 26, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions sdgx/data_models/inspectors/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from sdgx.data_models.metadata import Metadata

from sdgx.data_models.relationship import Relationship
from sdgx.exceptions import DataModelError
from sdgx.exceptions import InspectorInitError


class Inspector:
Expand All @@ -28,7 +28,7 @@ class Inspector:

_inspect_level: int = 10
"""
Inspected level is a concept newly introduced in version 0.1.5. Since a single column in the table may be marked by different inspectors at the same time (for example: the email column may be recognized as email, but it may also be recognized as the id column, and it may also be recognized by different inspectors at the same time identified as a discrete column, which will cause confusion in subsequent processing), the inspect_leve is used when determining the specific type of a column.
Inspect level is a concept newly introduced in version 0.1.6. Since a single column in the table may be marked by several inspectors at the same time (for example, an email column may be recognized as email, as an id column, and as a discrete column all at once, which would cause confusion in subsequent processing), the inspect_level is used when determining the specific type of a column.

We will preset different inspect levels for different inspectors: usually, more specific inspectors will get higher levels, and general inspectors (like discrete) will have a lower inspect_level.

Expand All @@ -44,7 +44,7 @@ def inspect_level(self, value: int):
if value > 0 and value <= 100:
self._inspect_level = value
else:
raise DataModelError("The inspect_level should be set in [1, 100].")
raise InspectorInitError("The inspect_level should be set in [1, 100].")

def __init__(self, inspect_level=None, *args, **kwargs):
self.ready: bool = False
Expand Down
87 changes: 87 additions & 0 deletions sdgx/data_models/inspectors/personal.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
import re

from sdgx.data_models.inspectors.extension import hookimpl
from sdgx.data_models.inspectors.regex import RegexInspector


class EmailInspector(RegexInspector):
    """Regex inspector for columns whose values look like e-mail addresses."""

    # Columns of this type are flagged as personally identifiable information.
    pii = True

    # Level of this inspector; overrides the class-level value inherited
    # from the base inspector.
    _inspect_level = 30

    # Name under which matching columns are reported.
    data_type_name = "email"

    # Anchored form: <local part> "@" <domain label> "." <rest of domain>.
    pattern = r"^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$"


class ChinaMainlandIDInspector(RegexInspector):
    """Regex inspector for 18-character China mainland ID numbers."""

    # Columns of this type are flagged as personally identifiable information.
    pii = True

    _inspect_level = 30

    # Name under which matching columns are reported.
    data_type_name = "china_mainland_id"

    # Layout enforced by the expression: six digits (the first one non-zero),
    # a four-digit year starting with 18/19/20, a two-digit month (01-12),
    # a two-digit day (01-31), three more digits and a final digit or X/x.
    # Only the shape is checked here; no checksum is computed.
    pattern = (
        r"^[1-9]\d{5}(18|19|20)\d{2}((0[1-9])|(1[0-2]))(([0-2][1-9])|10|20|30|31)\d{3}[0-9Xx]$"
    )


class ChinaMainlandMobilePhoneInspector(RegexInspector):
    """Regex inspector for 11-digit China mainland mobile phone numbers."""

    # Columns of this type are flagged as personally identifiable information.
    pii = True

    _inspect_level = 30

    # Name under which matching columns are reported.
    data_type_name = "china_mainland_mobile_phone"

    # A leading "1", a second digit in 3-9, then nine more digits.
    pattern = r"^1[3-9]\d{9}$"


# Postal code
class ChinaMainlandPostCode(RegexInspector):
    """Regex inspector for six-digit China mainland postal codes."""

    # Postal codes are not treated as personally identifiable information.
    pii = False

    # Lower level than the other inspectors in this module (which use 30).
    _inspect_level = 20

    # Name under which matching columns are reported.
    data_type_name = "china_mainland_postcode"

    # Since zip codes and six-digit integers look the same, the required match
    # ratio is raised here to prevent some pure integer columns from being
    # recognized as postal codes.
    _match_percentage = 0.95

    # Exactly six decimal digits.
    pattern = r"^[0-9]{6}$"


# Unified social credit code
class ChinaMainlandUnifiedSocialCreditCode(RegexInspector):
    """Regex inspector for 18-character unified social credit codes.

    An 18-character ID number can also satisfy the main expression, so
    ``domain_verification`` additionally rejects every value that has the
    shape of an ID number.
    """

    pii = True

    _inspect_level = 30

    # Name under which matching columns are reported.
    data_type_name = "unified_social_credit_code"

    # Two characters from the allowed alphabet, six digits, then ten more
    # characters from the allowed alphabet.
    pattern = r"^[0-9A-HJ-NPQRTUWXY]{2}\d{6}[0-9A-HJ-NPQRTUWXY]{10}$"

    # Shape of an ID number; used only to exclude such values.
    pattern_ID = (
        r"^[1-9]\d{5}(18|19|20)\d{2}((0[1-9])|(1[0-2]))(([0-2][1-9])|10|20|30|31)\d{3}[0-9Xx]$"
    )

    # Compiled once at class creation so every sample reuses the same object.
    p_id = re.compile(pattern_ID)

    def domain_verification(self, each_sample):
        """Return ``False`` when ``each_sample`` looks like an ID number, else ``True``."""
        return re.match(self.p_id, each_sample) is None


@hookimpl
def register(manager):
    """Register every inspector defined in this module with ``manager``.

    Each inspector is registered under a name equal to its class name, in the
    order listed below.
    """
    inspectors = (
        ("EmailInspector", EmailInspector),
        ("ChinaMainlandIDInspector", ChinaMainlandIDInspector),
        ("ChinaMainlandMobilePhoneInspector", ChinaMainlandMobilePhoneInspector),
        ("ChinaMainlandPostCode", ChinaMainlandPostCode),
        ("ChinaMainlandUnifiedSocialCreditCode", ChinaMainlandUnifiedSocialCreditCode),
    )
    for registered_name, inspector_cls in inspectors:
        manager.register(registered_name, inspector_cls)
117 changes: 117 additions & 0 deletions sdgx/data_models/inspectors/regex.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
from __future__ import annotations

import re
from typing import Any

import pandas as pd

from sdgx.data_models.inspectors.base import Inspector
from sdgx.exceptions import InspectorInitError

# By default, we will not directly register the RegexInspector to the Inspector Manager
# Instead, use it as a baseclass or user-defined regex, then put it into the Inspector Manager or use it alone


class RegexInspector(Inspector):
    """Inspector that detects column data types with a regular expression.

    RegexInspector can be initialized with a custom expression, or it can be
    inherited and given class-level ``pattern`` / ``data_type_name`` values for
    a specific data type, such as email, US address, HKID etc.
    """

    pattern: str = None
    """
    pattern is the regular expression string of current inspector.
    """

    data_type_name: str = None
    """
    data_type_name is the name of the data type, such as email, US address, HKID etc.
    """

    _match_percentage: float = 0.8
    """
    match_percentage should be > 0.5 and <= 1.

    Due to the existence of empty data, wrong data, etc., not every value of a column is expected to match. match_percentage is the minimum proportion of values that must match the regular expression for the column to be considered as fitting the current data type.
    """

    @property
    def match_percentage(self):
        """Minimum ratio of matching values required to accept a column."""
        return self._match_percentage

    @match_percentage.setter
    def match_percentage(self, value):
        """Set the match ratio.

        Raises:
            InspectorInitError: if ``value`` is outside ``(0.5, 1]``.
        """
        if 0.5 < value <= 1:
            self._match_percentage = value
        else:
            raise InspectorInitError("The match_percentage should be set in (0.5, 1].")

    def __init__(
        self,
        pattern: str = None,
        data_type_name: str = None,
        match_percentage: float = None,
        *args,
        **kwargs,
    ):
        """Create the inspector.

        Args:
            pattern: Regular expression string; falls back to the class-level
                ``pattern`` when omitted.
            data_type_name: Name of the detected data type. A trailing
                ``"_columns"`` is stripped because ``inspect`` appends it.
            match_percentage: Optional override of the match ratio, validated
                by the ``match_percentage`` setter.
            *args, **kwargs: Forwarded to ``Inspector.__init__``.

        Raises:
            InspectorInitError: if no pattern is available or
                ``match_percentage`` is out of range.
            re.error: if the pattern is not a valid regular expression.
        """
        super().__init__(*args, **kwargs)
        self.regex_columns: set[str] = set()

        # An explicit pattern overrides the class-level one.
        if pattern:
            self.pattern = pattern
        if self.pattern is None:
            raise InspectorInitError("Regular expression NOT found.")
        self.p = re.compile(self.pattern)

        suffix = "_columns"
        if data_type_name:
            if data_type_name.endswith(suffix):
                self.data_type_name = data_type_name[: -len(suffix)]
            else:
                self.data_type_name = data_type_name
        elif not self.data_type_name:
            # NOTE(review): this default already ends with "_columns", so
            # inspect() reports "regex_<pattern>_columns_columns". Kept as is
            # because changing it would alter the key callers receive.
            self.data_type_name = f"regex_{self.pattern}_columns"
        # data_type_name is always a string at this point, so no None check is needed.

        # Compare with None so that an explicit invalid value such as 0 is
        # rejected by the setter instead of being silently ignored.
        if match_percentage is not None:
            self.match_percentage = match_percentage

    def fit(self, raw_data: pd.DataFrame, *args, **kwargs):
        """Fit the inspector.

        Finds the list of regex columns from the raw data.

        Args:
            raw_data (pd.DataFrame): Raw data
        """
        for each_col in raw_data.columns:
            each_match_rate = self._fit_column(raw_data[each_col])
            # ">=" rather than ">": the setter accepts match_percentage == 1,
            # and a ratio can never be strictly greater than 1.
            if each_match_rate >= self.match_percentage:
                self.regex_columns.add(each_col)

        self.ready = True

    def domain_verification(self, each_sample) -> bool:
        """Extra per-value check applied after the regex matched.

        The base implementation accepts every value; subclasses may override it.
        """
        return True

    def _fit_column(self, column_data: pd.Series) -> float:
        """
        Regular expression matching for a single column, returning the matching ratio.

        An empty column yields 0.0 instead of raising ZeroDivisionError.
        """
        total = len(column_data)
        if total == 0:
            return 0.0
        # Values are converted with str() before matching; domain_verification
        # only runs for values the regex already accepted.
        match_cnt = sum(
            1
            for sample in map(str, column_data)
            if self.p.match(sample) and self.domain_verification(sample)
        )
        return match_cnt / total

    def inspect(self, *args, **kwargs) -> dict[str, Any]:
        """Inspect raw data and generate metadata."""

        return {self.data_type_name + "_columns": list(self.regex_columns)}
4 changes: 4 additions & 0 deletions sdgx/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,3 +137,7 @@ class MetadataCombinerInvalidError(MetadataCombinerError):

class MetadataCombinerInitError(MetadataCombinerError):
    """Metadata combiner error identified by error code 9006."""

    ERROR_CODE = 9006


class InspectorInitError(DataModelError):
    """Data model error for invalid inspector configuration (error code 9007)."""

    ERROR_CODE = 9007
12 changes: 1 addition & 11 deletions tests/data_models/inspector/test_bool.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import pytest

from sdgx.data_models.inspectors.bool import BoolInspector
from sdgx.exceptions import DataModelError
from sdgx.exceptions import InspectorInitError


@pytest.fixture
Expand Down Expand Up @@ -39,23 +39,13 @@ def test_inspector_demo_data(inspector: BoolInspector, raw_data):
assert not inspector.bool_columns
assert sorted(inspector.inspect()["bool_columns"]) == sorted([])
assert inspector.inspect_level == 10
# test inspect_level.setter
try:
inspector.inspect_level = 120
except Exception as e:
assert type(e) == DataModelError


def test_inspector_generated_data(inspector: BoolInspector, bool_test_df: pd.DataFrame):
# use generated id data
inspector.fit(bool_test_df)
assert inspector.bool_columns
assert sorted(inspector.inspect()["bool_columns"]) == sorted(["bool_random"])
assert inspector.inspect_level == 10
try:
inspector.inspect_level = 0
except Exception as e:
assert type(e) == DataModelError


if __name__ == "__main__":
Expand Down
Loading
Loading