-
Notifications
You must be signed in to change notification settings - Fork 584
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
changelog Static analysis docstrings, types preliminary tests engine static analysis isort Minor refactorings Update README.md Fix late binding issues and example removal of old samples Refactoring, adding example pre-clean-break-commit broken commit, fixing TabularConfigBuilder Rename TabularConfig pre-breaking replace commit removal of some old experimental files rename tabular to structured restructuring presidio tabular - pre del commit Add project TODOs testing dump presidio tabular
- Loading branch information
Showing
17 changed files
with
1,238 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
# Presidio structured | ||
|
||
## Status | ||
|
||
### TODO | ||
|
||
For TODOs, see draft PR. | ||
|
||
## Description | ||
|
||
The Presidio structured is.. | ||
|
||
## Deploy Presidio analyzer to Azure | ||
|
||
## Simple usage example | ||
|
||
## Documentation | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
"""Anonymizer root module.""" | ||
import logging | ||
|
||
# Set up default logging (with NullHandler) | ||
|
||
|
||
# logging.getLogger("presidio-str").addHandler(logging.NullHandler()) | ||
|
||
# __all__ = ["AnonymizerEngine", "DeanonymizeEngine", "BatchAnonymizerEngine"] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
from .analysis_builder import JsonAnalysisBuilder, TabularAnalysisBuilder | ||
from .config import StructuredAnalysis | ||
from .data import CsvReader, JsonDataTransformer, JsonReader, PandasDataTransformer | ||
from .tabular_engine import TabularEngine | ||
|
||
__all__ = [ | ||
"TabularEngine", | ||
"JsonAnalysisBuilder", | ||
"TabularAnalysisBuilder", | ||
"StructuredAnalysis", | ||
"CsvReader", | ||
"JsonReader", | ||
"PandasDataTransformer", | ||
"JsonDataTransformer", | ||
] |
163 changes: 163 additions & 0 deletions
163
presidio-structured/presidio_structured/analysis_builder.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,163 @@ | ||
from abc import ABC, abstractmethod | ||
from collections import Counter | ||
from collections.abc import Iterable | ||
from typing import Any, Dict, Iterator, Union | ||
|
||
from pandas import DataFrame | ||
from presidio_analyzer import ( | ||
AnalyzerEngine, | ||
BatchAnalyzerEngine, | ||
DictAnalyzerResult, | ||
RecognizerResult, | ||
) | ||
|
||
from presidio_structured.config import StructuredAnalysis | ||
|
||
|
||
class AnalysisBuilder(ABC):
    """
    Abstract base class for a configuration generator.

    Subclasses implement :meth:`generate_analysis` for one concrete data
    format and use ``self.analyzer`` to detect entities.
    """

    def __init__(self, analyzer: Union[AnalyzerEngine, None] = None):
        """
        Initialize the configuration generator.

        :param analyzer: Analyzer engine to use. When omitted, a default
            ``AnalyzerEngine()`` is created, as before.
        :type analyzer: Union[AnalyzerEngine, None]
        """
        # Allowing injection lets callers reuse an already configured engine
        # instead of always paying for a fresh default one.
        self.analyzer = AnalyzerEngine() if analyzer is None else analyzer

    @abstractmethod
    def generate_analysis(self, data: Union[Dict, DataFrame]) -> StructuredAnalysis:
        """
        Abstract method to generate a configuration from the given data.

        :param data: The input data. Can be a dictionary or DataFrame instance.
        :type data: Union[Dict, DataFrame]
        :return: The generated configuration.
        :rtype: StructuredAnalysis
        """
        pass
|
||
|
||
class JsonAnalysisBuilder(AnalysisBuilder):
    """Concrete configuration generator for JSON data."""

    def generate_analysis(self, data: Dict, language: str = "en") -> StructuredAnalysis:
        """
        Generate a configuration from the given JSON data.

        :param data: The input JSON data.
        :type data: Dict
        :param language: The language to be used for analysis.
        :type language: str
        :return: The generated configuration.
        :rtype: StructuredAnalysis
        """
        batch_analyzer = BatchAnalyzerEngine(analyzer_engine=self.analyzer)
        analyzer_results = batch_analyzer.analyze_dict(
            input_dict=data, language=language
        )
        return self._generate_analysis_from_results_json(analyzer_results)

    def _generate_analysis_from_results_json(
        self, analyzer_results: Iterator[DictAnalyzerResult], prefix: str = ""
    ) -> StructuredAnalysis:
        """
        Generate a configuration from the given analyzer results.

        Nested dictionaries are flattened: their keys are joined to the parent
        key with a dot.

        :param analyzer_results: The analyzer results.
        :type analyzer_results: Iterator[DictAnalyzerResult]
        :param prefix: The prefix for the configuration keys.
        :type prefix: str
        :return: The generated configuration.
        :rtype: StructuredAnalysis
        """
        mappings = {}

        if not isinstance(analyzer_results, Iterable):
            # Always return a StructuredAnalysis: the recursive call below
            # reads ``.entity_mapping`` from whatever comes back.
            return StructuredAnalysis(entity_mapping=mappings)

        for result in analyzer_results:
            current_key = prefix + result.key

            if isinstance(result.value, dict):
                nested_mappings = self._generate_analysis_from_results_json(
                    result.recognizer_results, prefix=current_key + "."
                )
                mappings.update(nested_mappings.entity_mapping)
                # The nested results were consumed by the recursion and are
                # not leaf recognizer results, so there is nothing more to map.
                continue

            # Materialize once: recognizer_results may be a one-shot iterator,
            # and counting it first would leave nothing to read afterwards.
            recognizer_results = list(result.recognizer_results)
            if recognizer_results:
                # The last recognizer result decides the entity type for a key.
                mappings[current_key] = recognizer_results[-1].entity_type
        return StructuredAnalysis(entity_mapping=mappings)
|
||
|
||
class TabularAnalysisBuilder(AnalysisBuilder):
    """Concrete configuration generator for tabular data."""

    def generate_analysis(
        self, df: DataFrame, n: int = 100, language: str = "en"
    ) -> StructuredAnalysis:
        """
        Generate a configuration from the given tabular data.

        :param df: The input tabular data (dataframe).
        :type df: DataFrame
        :param n: The number of samples to be taken from the dataframe.
        :type n: int
        :param language: The language to be used for analysis.
        :type language: str
        :return: The generated configuration.
        :rtype: StructuredAnalysis
        """
        # Never ask for more rows than the dataframe holds.
        n = min(n, len(df))
        df = df.sample(n)

        key_recognizer_result_map = self._find_most_common_entity(df, language)

        # Columns flagged as NON_PII are left out of the mapping.
        key_entity_map = {
            key: result.entity_type
            for key, result in key_recognizer_result_map.items()
            if result.entity_type != "NON_PII"
        }

        return StructuredAnalysis(entity_mapping=key_entity_map)

    def _find_most_common_entity(
        self, df: DataFrame, language: str
    ) -> Dict[str, RecognizerResult]:
        """
        Find the most common entity in each dataframe column.

        :param df: The dataframe where entities will be searched.
        :type df: DataFrame
        :param language: Language to be used in the analysis engine.
        :type language: str
        :return: A dictionary mapping column names to the most common RecognizerResult.
        :rtype: Dict[str, RecognizerResult]
        """
        key_recognizer_result_map = {}
        # The batch engine does not depend on the column, so build it once.
        batch_analyzer = BatchAnalyzerEngine(analyzer_engine=self.analyzer)

        for column in df.columns:
            analyzer_results = batch_analyzer.analyze_iterator(
                list(df[column]), language=language
            )
            # Only the first recognizer result of each cell takes part in the
            # vote; cells without any result are ignored.
            first_hits = [res[0] for res in analyzer_results if len(res) > 0]

            if not first_hits:
                key_recognizer_result_map[column] = RecognizerResult(
                    entity_type="NON_PII", start=0, end=1, score=1.0
                )
                continue

            # Grabbing most common type
            type_counter = Counter(hit.entity_type for hit in first_hits)
            most_common_type = type_counter.most_common(1)[0][0]
            # Grabbing the average confidence score for the most common type.
            scores = [
                hit.score for hit in first_hits if hit.entity_type == most_common_type
            ]
            average_score = sum(scores) / len(scores) if scores else 0.0
            key_recognizer_result_map[column] = RecognizerResult(
                entity_type=most_common_type, start=0, end=1, score=average_score
            )
        return key_recognizer_result_map
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
from .structured_analysis import StructuredAnalysis | ||
|
||
__all__ = [ | ||
"StructuredAnalysis", | ||
] |
13 changes: 13 additions & 0 deletions
13
presidio-structured/presidio_structured/config/structured_analysis.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
""" Structured Analysis module. """ | ||
|
||
from dataclasses import dataclass | ||
from typing import Dict | ||
|
||
|
||
@dataclass
class StructuredAnalysis:
    """
    Dataclass containing entity analysis from structured data.

    Currently only contains entity mapping.
    """

    # Maps a field key (a column name, or a dot-joined key for nested JSON)
    # to the entity type detected for it.
    # NOTE ideally Literal[...] with allowed EntityTypes, but cannot unpack in Literal.
    entity_mapping: Dict[str, str]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
from .data_reader import CsvReader, JsonReader | ||
from .data_transformers import JsonDataTransformer, PandasDataTransformer | ||
|
||
__all__ = [ | ||
"CsvReader", | ||
"JsonReader", | ||
"PandasDataTransformer", | ||
"JsonDataTransformer", | ||
] |
67 changes: 67 additions & 0 deletions
67
presidio-structured/presidio_structured/data/data_reader.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
import json | ||
from abc import ABC, abstractmethod | ||
from typing import Any, Dict | ||
|
||
import pandas as pd | ||
|
||
|
||
class ReaderBase(ABC):
    """
    Base class for data readers.

    This class should not be instantiated directly. Instead use or define a
    reader subclass.
    """

    @abstractmethod
    def read(self, path: str) -> Any:
        """
        Extract data from file located at path.

        :param path: String defining the location of the file to read.
        :type path: str
        :return: The data read from the file.
        :rtype: Any
        """
        pass
|
||
|
||
class CsvReader(ReaderBase):
    """
    Reader for reading csv files.

    Usage::

        reader = CsvReader()
        data = reader.read(path="filepath.csv")
    """

    def read(self, path: str) -> pd.DataFrame:
        """
        Read csv file to pandas dataframe.

        :param path: String defining the location of the csv file to read.
        :type path: str
        :return: Pandas DataFrame with the data read from the csv file.
        :rtype: pd.DataFrame
        """
        return pd.read_csv(path)
|
||
|
||
class JsonReader(ReaderBase):
    """
    Reader for reading json files.

    Usage::

        reader = JsonReader()
        data = reader.read(path="filepath.json")
    """

    def read(self, path: str) -> Dict[str, Any]:
        """
        Read json file to dict.

        :param path: String defining the location of the json file to read.
        :type path: str
        :return: dictionary with the data read from the json file.
        :rtype: Dict[str, Any]
        """
        # JSON text is UTF-8; without an explicit encoding, open() falls back
        # to the platform locale and can mis-decode non-ASCII content.
        with open(path, encoding="utf-8") as f:
            data = json.load(f)
        return data
Oops, something went wrong.