Skip to content

Commit

Permalink
presidio-structured
Browse files Browse the repository at this point in the history
changelog

Static analysis

docstrings, types

preliminary tests engine

static analysis

isort

Minor refactorings

Update README.md

Fix late binding issues and example

removal of old samples

Refactoring, adding example

pre-clean-break-commit

broken commit, fixing TabularConfigBuilder

Rename TabularConfig

pre-breaking replace commit

removal of some old experimental files

rename tabular to structured

restructuring presidio tabular - pre del commit

Add project TODOs

testing dump presidio tabular
  • Loading branch information
Jakob-98 committed Oct 26, 2023
1 parent d8541e9 commit 8c6be26
Show file tree
Hide file tree
Showing 17 changed files with 1,238 additions and 0 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,10 @@
All notable changes to this project will be documented in this file.

## [Unreleased]
## [2.2.3x] - 24.10.23
### Added
#### Structured
* Added V1 of presidio-structured, a library which re-uses logic from existing presidio components to allow anonymization of (semi-)structured data.

## [2.2.33] - June 1st 2023
### Added
Expand Down
18 changes: 18 additions & 0 deletions presidio-structured/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Presidio structured

## Status

### TODO

For TODOs, see draft PR.

## Description

Presidio structured is a library that extends Presidio's PII analysis and anonymization capabilities to structured and semi-structured data.

## Deploy Presidio structured to Azure

## Simple usage example

## Documentation

9 changes: 9 additions & 0 deletions presidio-structured/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
"""Anonymizer root module."""
import logging

# Set up default logging (with NullHandler)


# logging.getLogger("presidio-str").addHandler(logging.NullHandler())

# __all__ = ["AnonymizerEngine", "DeanonymizeEngine", "BatchAnonymizerEngine"]
15 changes: 15 additions & 0 deletions presidio-structured/presidio_structured/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
"""Public API of the presidio_structured package."""
from .analysis_builder import JsonAnalysisBuilder, TabularAnalysisBuilder
from .config import StructuredAnalysis
from .data import CsvReader, JsonDataTransformer, JsonReader, PandasDataTransformer
from .tabular_engine import TabularEngine

# Names re-exported as the package's public interface.
__all__ = [
"TabularEngine",
"JsonAnalysisBuilder",
"TabularAnalysisBuilder",
"StructuredAnalysis",
"CsvReader",
"JsonReader",
"PandasDataTransformer",
"JsonDataTransformer",
]
163 changes: 163 additions & 0 deletions presidio-structured/presidio_structured/analysis_builder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
from abc import ABC, abstractmethod
from collections import Counter
from collections.abc import Iterable
from typing import Any, Dict, Iterator, Union

from pandas import DataFrame
from presidio_analyzer import (
AnalyzerEngine,
BatchAnalyzerEngine,
DictAnalyzerResult,
RecognizerResult,
)

from presidio_structured.config import StructuredAnalysis


class AnalysisBuilder(ABC):
    """Abstract base for builders that derive a StructuredAnalysis from raw data."""

    def __init__(self):
        """Create the builder with a default Presidio analyzer engine."""
        self.analyzer = AnalyzerEngine()

    @abstractmethod
    def generate_analysis(self, data: Union[Dict, DataFrame]) -> StructuredAnalysis:
        """
        Produce a StructuredAnalysis describing the entities found in the data.

        :param data: The input data, either a dict or a DataFrame instance.
        :type data: Union[Dict, DataFrame]
        :return: The generated configuration.
        :rtype StructuredAnalysis:
        """


class JsonAnalysisBuilder(AnalysisBuilder):
    """Concrete configuration generator for (nested) JSON data."""

    def generate_analysis(self, data: Dict) -> StructuredAnalysis:
        """
        Generate a configuration from the given JSON data.

        :param data: The input JSON data.
        :type data: Dict
        :return: The generated configuration.
        :rtype StructuredAnalysis:
        """
        batch_analyzer = BatchAnalyzerEngine(analyzer_engine=self.analyzer)
        analyzer_results = batch_analyzer.analyze_dict(input_dict=data, language="en")
        return self._generate_analysis_from_results_json(analyzer_results)

    def _generate_analysis_from_results_json(
        self, analyzer_results: Iterator[DictAnalyzerResult], prefix: str = ""
    ) -> StructuredAnalysis:
        """
        Generate a configuration from the given analyzer results.

        :param analyzer_results: The analyzer results.
        :type analyzer_results: Iterator[DictAnalyzerResult]
        :param prefix: Dotted-path prefix for keys of nested dicts.
        :type prefix: str
        :return: The generated configuration.
        :rtype StructuredAnalysis:
        """
        mappings = {}

        # BUGFIX: previously returned the bare dict here, while every other
        # path (and the recursive caller, which reads ``.entity_mapping``)
        # expects a StructuredAnalysis.
        if not isinstance(analyzer_results, Iterable):
            return StructuredAnalysis(entity_mapping=mappings)

        for result in analyzer_results:
            current_key = prefix + result.key

            if isinstance(result.value, dict):
                # Nested dict: recurse and merge its mappings under the
                # dotted prefix, then move on — its recognizer_results hold
                # nested DictAnalyzerResults, not leaf RecognizerResults.
                nested_mappings = self._generate_analysis_from_results_json(
                    result.recognizer_results, prefix=current_key + "."
                )
                mappings.update(nested_mappings.entity_mapping)
                continue

            # BUGFIX: the old ``sum(1 for _ in result.recognizer_results)``
            # pre-count exhausted generator-valued results, leaving the loop
            # below with nothing and silently dropping leaf mappings.
            # Iterating directly works for both lists and generators.
            # If several entities were recognized, the last one wins
            # (unchanged behavior).
            for recognizer_result in result.recognizer_results:
                mappings[current_key] = recognizer_result.entity_type
        return StructuredAnalysis(entity_mapping=mappings)


class TabularAnalysisBuilder(AnalysisBuilder):
    """Concrete configuration generator for tabular data."""

    def generate_analysis(
        self, df: DataFrame, n: int = 100, language: str = "en"
    ) -> StructuredAnalysis:
        """
        Generate a configuration from the given tabular data.

        :param df: The input tabular data (dataframe).
        :type df: DataFrame
        :param n: The number of rows to sample from the dataframe.
        :type n: int
        :param language: The language to be used for analysis.
        :type language: str
        :return: The generated configuration (columns mapped to entity types,
            excluding columns judged NON_PII).
        :rtype StructuredAnalysis:
        """
        # Cap the sample size at the dataframe length.
        n = min(n, len(df))
        df = df.sample(n)

        key_recognizer_result_map = self._find_most_common_entity(df, language)

        key_entity_map = {
            key: result.entity_type
            for key, result in key_recognizer_result_map.items()
            if result.entity_type != "NON_PII"
        }

        return StructuredAnalysis(entity_mapping=key_entity_map)

    def _find_most_common_entity(
        self, df: DataFrame, language: str
    ) -> Dict[str, RecognizerResult]:
        """
        Find the most common entity in each dataframe column.

        :param df: The dataframe where entities will be searched.
        :type df: DataFrame
        :param language: Language to be used in the analysis engine.
        :type language: str
        :return: A dictionary mapping column names to the most common
            RecognizerResult (entity type NON_PII when nothing was found).
        :rtype: Dict[str, RecognizerResult]
        """
        key_recognizer_result_map = {}

        # One engine for all columns — it was previously rebuilt per column,
        # which is pure overhead since it only wraps self.analyzer.
        batch_analyzer = BatchAnalyzerEngine(analyzer_engine=self.analyzer)

        for column in df.columns:
            # Materialize once: the results are scanned three times below,
            # which would silently misbehave on a generator.
            analyzer_results = list(
                batch_analyzer.analyze_iterator(
                    [val for val in df[column]], language=language
                )
            )

            if all(len(res) == 0 for res in analyzer_results):
                # No entity found in any sampled cell of this column.
                key_recognizer_result_map[column] = RecognizerResult(
                    entity_type="NON_PII", start=0, end=1, score=1.0
                )
                continue

            # Grabbing the most common type.
            types_list = [
                res[0].entity_type for res in analyzer_results if len(res) > 0
            ]
            most_common_type = Counter(types_list).most_common(1)[0][0]

            # Grabbing the average confidence score for the most common type.
            scores = [
                res[0].score
                for res in analyzer_results
                if len(res) > 0 and res[0].entity_type == most_common_type
            ]
            average_score = sum(scores) / len(scores) if scores else 0.0
            key_recognizer_result_map[column] = RecognizerResult(
                most_common_type, 0, 1, average_score
            )
        return key_recognizer_result_map
5 changes: 5 additions & 0 deletions presidio-structured/presidio_structured/config/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
"""Configuration objects for presidio-structured."""
from .structured_analysis import StructuredAnalysis

__all__ = [
"StructuredAnalysis",
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
""" Structured Analysis module. """

from dataclasses import dataclass
from typing import Dict


@dataclass
class StructuredAnalysis:
"""Dataclass containing entity analysis from structured data. Currently only contains entity mapping."""

entity_mapping: Dict[
str, str
] # NOTE ideally Literal[...] with allowed EntityTypes, but cannot unpack in Literal.
9 changes: 9 additions & 0 deletions presidio-structured/presidio_structured/data/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
"""Data readers and transformers for presidio-structured."""
from .data_reader import CsvReader, JsonReader
from .data_transformers import JsonDataTransformer, PandasDataTransformer

__all__ = [
"CsvReader",
"JsonReader",
"PandasDataTransformer",
"JsonDataTransformer",
]
67 changes: 67 additions & 0 deletions presidio-structured/presidio_structured/data/data_reader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
import json
from abc import ABC, abstractmethod
from typing import Any, Dict

import pandas as pd


class ReaderBase(ABC):
    """
    Abstract interface for file-based data readers.

    Do not instantiate directly — use or define a concrete reader subclass.
    """

    @abstractmethod
    def read(self, path: str) -> Any:
        """
        Read and return the data held in the file at *path*.

        :param path: String defining the location of the file to read.
        :return: The data read from the file.
        """


class CsvReader(ReaderBase):
    """
    Reader for csv files.

    Usage::
        reader = CsvReader()
        data = reader.read(path="filepath.csv")
    """

    def read(self, path: str) -> pd.DataFrame:
        """
        Load the csv file at *path* into a pandas DataFrame.

        :param path: String defining the location of the csv file to read.
        :return: Pandas DataFrame with the data read from the csv file.
        """
        frame = pd.read_csv(path)
        return frame


class JsonReader(ReaderBase):
    """
    Reader for reading json files.
    Usage::
        reader = JsonReader()
        data = reader.read(path="filepath.json")
    """

    def read(self, path: str) -> Dict[str, Any]:
        """
        Read json file to dict.
        :param path: String defining the location of the json file to read.
        :return: dictionary with the data read from the json file.
        """
        # Explicit encoding: JSON is UTF-8 by spec; relying on the platform
        # default encoding can mis-decode non-ASCII content (e.g. on Windows).
        with open(path, encoding="utf-8") as f:
            data = json.load(f)
        return data
Loading

0 comments on commit 8c6be26

Please sign in to comment.