From 8c6be263ffd50844d849e40939a3fe3ddb1efab2 Mon Sep 17 00:00:00 2001 From: Jakob Serlier <37184788+Jakob-98@users.noreply.github.com> Date: Wed, 13 Sep 2023 10:35:10 -0700 Subject: [PATCH 01/52] presidio-structured changelog Static analysis docstrings, types preliminary tests engine static analysis isort Minor refactorings Update README.md Fix late binding issues and example removal of old samples Refactoring, adding example pre-clean-break-commit broken commit, fixing TabularConfigBuilder Rename TabularConfig pre-breaking replace commit removal of some old experimental files rename tabular to structured restructuring presidio tabular - pre del commit Add project TODOs testing dump presidio tabular --- CHANGELOG.md | 4 + presidio-structured/README.md | 18 + presidio-structured/__init__.py | 9 + .../presidio_structured/__init__.py | 15 + .../presidio_structured/analysis_builder.py | 163 +++++ .../presidio_structured/config/__init__.py | 5 + .../config/structured_analysis.py | 13 + .../presidio_structured/data/__init__.py | 9 + .../presidio_structured/data/data_reader.py | 67 +++ .../data/data_transformers.py | 206 +++++++ .../presidio_structured/tabular_engine.py | 59 ++ presidio-structured/sample/example.ipynb | 561 ++++++++++++++++++ .../sample/sample_data/test_complex_json.json | 37 ++ .../sample/sample_data/test_csv.csv | 4 + .../sample/sample_data/test_json.json | 11 + presidio-structured/tests/__init__.py | 1 + .../tests/test_tabular_engine.py | 56 ++ 17 files changed, 1238 insertions(+) create mode 100644 presidio-structured/README.md create mode 100644 presidio-structured/__init__.py create mode 100644 presidio-structured/presidio_structured/__init__.py create mode 100644 presidio-structured/presidio_structured/analysis_builder.py create mode 100644 presidio-structured/presidio_structured/config/__init__.py create mode 100644 presidio-structured/presidio_structured/config/structured_analysis.py create mode 100644 
presidio-structured/presidio_structured/data/__init__.py create mode 100644 presidio-structured/presidio_structured/data/data_reader.py create mode 100644 presidio-structured/presidio_structured/data/data_transformers.py create mode 100644 presidio-structured/presidio_structured/tabular_engine.py create mode 100644 presidio-structured/sample/example.ipynb create mode 100644 presidio-structured/sample/sample_data/test_complex_json.json create mode 100644 presidio-structured/sample/sample_data/test_csv.csv create mode 100644 presidio-structured/sample/sample_data/test_json.json create mode 100644 presidio-structured/tests/__init__.py create mode 100644 presidio-structured/tests/test_tabular_engine.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 02be21136..9e5f09d36 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,10 @@ All notable changes to this project will be documented in this file. ## [Unreleased] +## [2.2.3x] - 24.10.23 +### Added +#### Structured +* Added V1 of presidio-structured, a library (presidio-structured) which re-uses existing logic from existing presidio components to allow anonymization of (semi-)structured data. ## [2.2.33] - June 1st 2023 ### Added diff --git a/presidio-structured/README.md b/presidio-structured/README.md new file mode 100644 index 000000000..27c5c7c43 --- /dev/null +++ b/presidio-structured/README.md @@ -0,0 +1,18 @@ +# Presidio structured + +## Status + +### TODO + +For TODOs, see draft PR. + +## Description + +The Presidio structured is.. 
+ +## Deploy Presidio analyzer to Azure + +## Simple usage example + +## Documentation + diff --git a/presidio-structured/__init__.py b/presidio-structured/__init__.py new file mode 100644 index 000000000..358dbb403 --- /dev/null +++ b/presidio-structured/__init__.py @@ -0,0 +1,9 @@ +"""Anonymizer root module.""" +import logging + +# Set up default logging (with NullHandler) + + +# logging.getLogger("presidio-str").addHandler(logging.NullHandler()) + +# __all__ = ["AnonymizerEngine", "DeanonymizeEngine", "BatchAnonymizerEngine"] diff --git a/presidio-structured/presidio_structured/__init__.py b/presidio-structured/presidio_structured/__init__.py new file mode 100644 index 000000000..b121a198d --- /dev/null +++ b/presidio-structured/presidio_structured/__init__.py @@ -0,0 +1,15 @@ +from .analysis_builder import JsonAnalysisBuilder, TabularAnalysisBuilder +from .config import StructuredAnalysis +from .data import CsvReader, JsonDataTransformer, JsonReader, PandasDataTransformer +from .tabular_engine import TabularEngine + +__all__ = [ + "TabularEngine", + "JsonAnalysisBuilder", + "TabularAnalysisBuilder", + "StructuredAnalysis", + "CsvReader", + "JsonReader", + "PandasDataTransformer", + "JsonDataTransformer", +] diff --git a/presidio-structured/presidio_structured/analysis_builder.py b/presidio-structured/presidio_structured/analysis_builder.py new file mode 100644 index 000000000..01a282e92 --- /dev/null +++ b/presidio-structured/presidio_structured/analysis_builder.py @@ -0,0 +1,163 @@ +from abc import ABC, abstractmethod +from collections import Counter +from collections.abc import Iterable +from typing import Any, Dict, Iterator, Union + +from pandas import DataFrame +from presidio_analyzer import ( + AnalyzerEngine, + BatchAnalyzerEngine, + DictAnalyzerResult, + RecognizerResult, +) + +from presidio_structured.config import StructuredAnalysis + + +class AnalysisBuilder(ABC): + """ + Abstract base class for a configuration generator. 
+ """ + + def __init__(self): + """Initialize the configuration generator.""" + self.analyzer = AnalyzerEngine() + + @abstractmethod + def generate_analysis(self, data: Union[Dict, DataFrame]) -> StructuredAnalysis: + """ + Abstract method to generate a configuration from the given data. + + :param data: The input data. Can be a dictionary or DataFrame instance. + :type data: Union[Dict, DataFrame] + :return: The generated configuration. + :rtype StructuredAnalysis: + """ + pass + + +class JsonAnalysisBuilder(AnalysisBuilder): + """Concrete configuration generator for JSON data.""" + + def generate_analysis(self, data: Dict) -> StructuredAnalysis: + """ + Generate a configuration from the given JSON data. + + :param data: The input JSON data. + :type data: Dict + :return: The generated configuration. + :rtype StructuredAnalysis: + """ + batch_analyzer = BatchAnalyzerEngine(analyzer_engine=self.analyzer) + analyzer_results = batch_analyzer.analyze_dict(input_dict=data, language="en") + return self._generate_analysis_from_results_json(analyzer_results) + + def _generate_analysis_from_results_json( + self, analyzer_results: Iterator[DictAnalyzerResult], prefix: str = "" + ) -> StructuredAnalysis: + """ + Generate a configuration from the given analyzer results. + + :param analyzer_results: The analyzer results. + :type analyzer_results: Iterator[DictAnalyzerResult] + :param prefix: The prefix for the configuration keys. + :type prefix: str + :return: The generated configuration. + :rtype StructuredAnalysis: + """ + mappings = {} + + if not isinstance(analyzer_results, Iterable): + return StructuredAnalysis(entity_mapping=mappings) + + for result in analyzer_results: + current_key = prefix + result.key + + if isinstance(result.value, dict): + nested_mappings = self._generate_analysis_from_results_json( + result.recognizer_results, prefix=current_key + "." 
+ ) + mappings.update(nested_mappings.entity_mapping) + + if sum(1 for _ in result.recognizer_results) > 0: + for recognizer_result in result.recognizer_results: + mappings[current_key] = recognizer_result.entity_type + return StructuredAnalysis(entity_mapping=mappings) + + +class TabularAnalysisBuilder(AnalysisBuilder): + """Concrete configuration generator for tabular data.""" + + def generate_analysis( + self, df: DataFrame, n: int = 100, language: str = "en" + ) -> StructuredAnalysis: + """ + Generate a configuration from the given tabular data. + + :param df: The input tabular data (dataframe). + :type df: DataFrame + :param n: The number of samples to be taken from the dataframe. + :type n: int + :param language: The language to be used for analysis. + :type language: str + :return: The generated configuration. + :rtype StructuredAnalysis: + """ + if n > len(df): + n = len(df) + + df = df.sample(n) + + key_recognizer_result_map = self._find_most_common_entity(df, language) + + key_entity_map = { + key: result.entity_type + for key, result in key_recognizer_result_map.items() + if result.entity_type != "NON_PII" + } + + return StructuredAnalysis(entity_mapping=key_entity_map) + + def _find_most_common_entity( + self, df: DataFrame, language: str + ) -> Dict[str, RecognizerResult]: + """ + Find the most common entity in a dataframe column. + + :param df: The dataframe where entities will be searched. + :type df: DataFrame + :param language: Language to be used in the analysis engine. + :type language: str + :return: A dictionary mapping column names to the most common RecognizerResult. 
+ :rtype: Dict[str, RecognizerResult] + """ + key_recognizer_result_map = {} + + for column in df.columns: + batch_analyzer = BatchAnalyzerEngine(analyzer_engine=self.analyzer) + analyzer_results = batch_analyzer.analyze_iterator( + [val for val in df[column]], language=language + ) + + if all(len(res) == 0 for res in analyzer_results): + key_recognizer_result_map[column] = RecognizerResult( + entity_type="NON_PII", start=0, end=1, score=1.0 + ) + continue + # Grabbing most common type + types_list = [ + res[0].entity_type for res in analyzer_results if len(res) > 0 + ] + type_counter = Counter(types_list) + most_common_type = type_counter.most_common(1)[0][0] + # Grabbing the average confidence score for the most common type. + scores = [ + res[0].score + for res in analyzer_results + if len(res) > 0 and res[0].entity_type == most_common_type + ] + average_score = sum(scores) / len(scores) if scores else 0.0 + key_recognizer_result_map[column] = RecognizerResult( + most_common_type, 0, 1, average_score + ) + return key_recognizer_result_map diff --git a/presidio-structured/presidio_structured/config/__init__.py b/presidio-structured/presidio_structured/config/__init__.py new file mode 100644 index 000000000..85341c3e5 --- /dev/null +++ b/presidio-structured/presidio_structured/config/__init__.py @@ -0,0 +1,5 @@ +from .structured_analysis import StructuredAnalysis + +__all__ = [ + "StructuredAnalysis", +] diff --git a/presidio-structured/presidio_structured/config/structured_analysis.py b/presidio-structured/presidio_structured/config/structured_analysis.py new file mode 100644 index 000000000..f9a00c519 --- /dev/null +++ b/presidio-structured/presidio_structured/config/structured_analysis.py @@ -0,0 +1,13 @@ +""" Structured Analysis module. """ + +from dataclasses import dataclass +from typing import Dict + + +@dataclass +class StructuredAnalysis: + """Dataclass containing entity analysis from structured data. 
Currently only contains entity mapping.""" + + entity_mapping: Dict[ + str, str + ] # NOTE ideally Literal[...] with allowed EntityTypes, but cannot unpack in Literal. diff --git a/presidio-structured/presidio_structured/data/__init__.py b/presidio-structured/presidio_structured/data/__init__.py new file mode 100644 index 000000000..4b97eb048 --- /dev/null +++ b/presidio-structured/presidio_structured/data/__init__.py @@ -0,0 +1,9 @@ +from .data_reader import CsvReader, JsonReader +from .data_transformers import JsonDataTransformer, PandasDataTransformer + +__all__ = [ + "CsvReader", + "JsonReader", + "PandasDataTransformer", + "JsonDataTransformer", +] diff --git a/presidio-structured/presidio_structured/data/data_reader.py b/presidio-structured/presidio_structured/data/data_reader.py new file mode 100644 index 000000000..bbcb3d672 --- /dev/null +++ b/presidio-structured/presidio_structured/data/data_reader.py @@ -0,0 +1,67 @@ +import json +from abc import ABC, abstractmethod +from typing import Any, Dict + +import pandas as pd + + +class ReaderBase(ABC): + """ + Base class for data readers. + + This class should not be instantiated directly. Instead use or define a reader subclass. + """ + + @abstractmethod + def read(self, path: str) -> Any: + """ + Extract data from file located at path. + + :param path: String defining the location of the file to read. + :return: The data read from the file. + """ + pass + + +class CsvReader(ReaderBase): + """ + Reader for reading csv files. + + Usage:: + + reader = CsvReader() + data = reader.read(path="filepath.csv") + + """ + + def read(self, path: str) -> pd.DataFrame: + """ + Read csv file to pandas dataframe. + + :param path: String defining the location of the csv file to read. + :return: Pandas DataFrame with the data read from the csv file. + """ + return pd.read_csv(path) + + +class JsonReader(ReaderBase): + """ + Reader for reading json files. 
+ + Usage:: + + reader = JsonReader() + data = reader.read(path="filepath.json") + + """ + + def read(self, path: str) -> Dict[str, Any]: + """ + Read json file to dict. + + :param path: String defining the location of the json file to read. + :return: dictionary with the data read from the json file. + """ + with open(path) as f: + data = json.load(f) + return data diff --git a/presidio-structured/presidio_structured/data/data_transformers.py b/presidio-structured/presidio_structured/data/data_transformers.py new file mode 100644 index 000000000..20a8decf6 --- /dev/null +++ b/presidio-structured/presidio_structured/data/data_transformers.py @@ -0,0 +1,206 @@ +from abc import ABC, abstractmethod +from typing import Any, Callable, Dict, List, Union + +from pandas import DataFrame +from presidio_anonymizer.entities import OperatorConfig +from presidio_anonymizer.operators import OperatorsFactory, OperatorType + + +class DataTransformerBase(ABC): + """ + Abstract base class to handle logic of operations over the text using the operators. + """ + + def __init__(self) -> None: + """Initializes DataTransformerBase object.""" + pass + + def operate( + self, + data: Any, + structured_analysis: "StructuredAnalysis", + operators: Dict[str, OperatorConfig], + ) -> Any: + """ + Performs operations over the text using the operators, as per the structured analysis. + + :param data: Data to be operated on. + :param structured_analysis: Analysis schema as per the structured data. + :param operators: Dictionary containing operator configuration objects. + :return: Data after being operated upon. + """ + key_to_operator_mapping = self._generate_operator_mapping( + structured_analysis, operators + ) + return self._process(data, key_to_operator_mapping) + + @abstractmethod + def _process( + self, data: Union[Dict, DataFrame], key_to_operator_mapping: Dict[str, Callable] + ) -> Union[Dict, DataFrame]: + """ + Abstract method for subclasses to provide operation implementation. 
+ + :param data: Data to be operated on. + :param key_to_operator_mapping: Mapping of keys to operators. + :return: Operated data. + """ + pass + + @staticmethod + def _create_operator_callable(operator, params): + def operator_callable(text): + return operator.operate(params=params, text=text) + + return operator_callable + + def _generate_operator_mapping( + self, config, operators: Dict[str, OperatorConfig] + ) -> Dict[str, Callable]: + """ + Generate a mapping of keys to operator callables. + + :param config: Configuration object containing mapping of entity types to keys. + :param operators: Dictionary containing operator configuration objects. + :return: Dictionary mapping keys to operator callables. + """ + key_to_operator_mapping = {} + + operators_factory = OperatorsFactory() + for key, entity in config.entity_mapping.items(): + operator_config = operators.get(entity, operators.get("DEFAULT", None)) + if operator_config is None: + raise ValueError(f"Operator for entity {entity} not found") + # NOTE: hardcoded OperatorType.Anonymize, as this is the only one supported for now. + operator = operators_factory.create_operator_class( + operator_config.operator_name, OperatorType.Anonymize + ) + operator_callable = self._create_operator_callable( + operator, operator_config.params + ) + key_to_operator_mapping[key] = operator_callable + + return key_to_operator_mapping + + def _operate_on_text( + self, + text_to_operate_on: str, + operator_callable: Callable, + ) -> str: + """ + Operates on the provided text using the operator callable. + + :param text_to_operate_on: Text to be operated on. + :param operator_callable: Callable that performs operation on the text. + :return: Text after operation. 
+ """ + return operator_callable(text_to_operate_on) + + +class PandasDataTransformer(DataTransformerBase): + def _process( + self, data: DataFrame, key_to_operator_mapping: Dict[str, Callable] + ) -> DataFrame: + """ + Operates on the given pandas DataFrame based on the provided operators. + + :param data: DataFrame to be operated on. + :param key_to_operator_mapping: Mapping of keys to operator callables. + :return: DataFrame after the operation. + """ + + if not isinstance(data, DataFrame): + raise ValueError("Data must be a pandas DataFrame") + + for key, operator_callable in key_to_operator_mapping.items(): + for idx, row in data.iterrows(): + text_to_operate_on = row[key] + operated_text = self._operate_on_text( + text_to_operate_on, operator_callable + ) + data.at[idx, key] = operated_text + return data + + +class JsonDataTransformer(DataTransformerBase): + """JSON Data Transformer, Supports arbitrary nesting of dictionaries and lists.""" + + @staticmethod + def _get_nested_value(data: Union[Dict, List], path: List[str]) -> Any: + """ + Recursively retrieves the value from nested data using a given path. + + :param data: Nested data (list or dictionary). + :param path: List of keys/indexes representing the path. + :return: Retrieved value. + """ + for i, key in enumerate(path): + if isinstance(data, list): + if key.isdigit(): + data = data[int(key)] + else: + return [ + JsonDataTransformer._get_nested_value(item, path[i:]) + for item in data + ] + elif isinstance(data, dict): + data = data.get(key) + else: + return data + return data + + @staticmethod + def _set_nested_value(data: Union[Dict, List], path: List[str], value: Any) -> None: + """ + Recursively sets a value in nested data using a given path. + :param data: Nested data (JSON-like). + :param path: List of keys/indexes representing the path. + :param value: Value to be set. 
+ """ + for i, key in enumerate(path): + if isinstance(data, list): + if i + 1 < len(path) and path[i + 1].isdigit(): + idx = int(path[i + 1]) + while len(data) <= idx: + data.append({}) + data = data[idx] + continue + else: + for item in data: + JsonDataTransformer._set_nested_value(item, path[i:], value) + return + elif isinstance(data, dict): + if i == len(path) - 1: + data[key] = value + else: + data = data.setdefault(key, {}) + + def _process( + self, data: Union[Dict, List], key_to_operator_mapping: Dict[str, Callable] + ) -> Union[Dict, List]: + """ + Operates on the given JSON-like data (nested dictionary/list) based on the provided configuration. + :param data: JSON-like data to be operated on. + :param config: Configuration object containing operator information. + :return: JSON-like data after the operation. + """ + for key, operator_callable in key_to_operator_mapping.items(): + keys = key.split(".") + if isinstance(data, list): + for idx, item in enumerate(data): + self._process(item, key_to_operator_mapping) + else: + text_to_operate_on = self._get_nested_value(data, keys) + if text_to_operate_on is not None: + if isinstance(text_to_operate_on, list): + for text in text_to_operate_on: + operated_text = self._operate_on_text( + text, operator_callable + ) + self._set_nested_value(data, keys, operated_text) + else: + operated_text = self._operate_on_text( + text_to_operate_on, operator_callable + ) + self._set_nested_value(data, keys, operated_text) + return data diff --git a/presidio-structured/presidio_structured/tabular_engine.py b/presidio-structured/presidio_structured/tabular_engine.py new file mode 100644 index 000000000..010277d4f --- /dev/null +++ b/presidio-structured/presidio_structured/tabular_engine.py @@ -0,0 +1,59 @@ +from typing import Any, Dict, Union + +from pandas import DataFrame +from presidio_anonymizer.entities import OperatorConfig + +from presidio_structured.config import StructuredAnalysis +from 
presidio_structured.data.data_transformers import DataTransformerBase + +DEFAULT = "replace" + + +class TabularEngine: + """ + Class to implement methods for anonymizing tabular data. + """ + + def __init__(self, data_transformer: DataTransformerBase): + """ + Initialize the class with a data transformer. + + :param data_transformer: Instance of DataTransformerBase. + """ + self.data_transformer = data_transformer + + def anonymize( + self, + data: Union[Dict, DataFrame], + structured_analysis: StructuredAnalysis, + operators: Dict[str, OperatorConfig] = None, + ) -> Union[Dict, DataFrame]: + """ + Anonymize the given data using the given configuration. + + :param data: input data as dictionary or pandas DataFrame. + :param structured_analysis: structured analysis configuration. + :param operators: a dictionary of operator configurations, optional. + :return: Anonymized dictionary or DataFrame. + """ + operators = self.__check_or_add_default_operator(operators) + + return self.data_transformer.operate(data, structured_analysis, operators) + + @staticmethod + def __check_or_add_default_operator( + operators: Dict[str, OperatorConfig] + ) -> Dict[str, OperatorConfig]: + """ + Check if the provided operators dictionary has a default operator. + If not, add a default operator. + + :param operators: dictionary of operator configurations. + :return: operators dictionary with the default operator added if it was not initially present. 
+ """ + default_operator = OperatorConfig(DEFAULT) + if not operators: + return {"DEFAULT": default_operator} + if not operators.get("DEFAULT"): + operators["DEFAULT"] = default_operator + return operators diff --git a/presidio-structured/sample/example.ipynb b/presidio-structured/sample/example.ipynb new file mode 100644 index 000000000..c04f51ffd --- /dev/null +++ b/presidio-structured/sample/example.ipynb @@ -0,0 +1,561 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "sys.path.append(os.path.abspath(\"..\"))\n", + "from presidio_structured import TabularEngine, JsonAnalysisBuilder, TabularAnalysisBuilder, StructuredAnalysis, CsvReader, JsonReader, JsonDataTransformer, PandasDataTransformer" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Loading in data" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnameemailstreetcitystatepostal_code
01John Doejohn.doe@example.com123 Main StAnytownCA12345
12Jane Smithjane.smith@example.com456 Elm StSomewhereTX67890
23Alice Johnsonalice.johnson@example.com789 Pine StElsewhereNY11223
\n", + "
" + ], + "text/plain": [ + " id name email street city state \\\n", + "0 1 John Doe john.doe@example.com 123 Main St Anytown CA \n", + "1 2 Jane Smith jane.smith@example.com 456 Elm St Somewhere TX \n", + "2 3 Alice Johnson alice.johnson@example.com 789 Pine St Elsewhere NY \n", + "\n", + " postal_code \n", + "0 12345 \n", + "1 67890 \n", + "2 11223 " + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sample_df = CsvReader().read(\"./sample_data/test_csv.csv\")\n", + "sample_df" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'id': 1,\n", + " 'name': 'John Doe',\n", + " 'email': 'john.doe@example.com',\n", + " 'address': {'street': '123 Main St',\n", + " 'city': 'Anytown',\n", + " 'state': 'CA',\n", + " 'postal_code': '12345'}}" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sample_json = JsonReader().read(\"./sample_data/test_json.json\")\n", + "sample_json" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'users': [{'id': 1,\n", + " 'name': 'John Doe',\n", + " 'email': 'john.doe@example.com',\n", + " 'address': {'street': '123 Main St',\n", + " 'city': 'Anytown',\n", + " 'state': 'CA',\n", + " 'postal_code': '12345'}},\n", + " {'id': 2,\n", + " 'name': 'Jane Smith',\n", + " 'email': 'jane.smith@example.com',\n", + " 'address': {'street': '456 Elm St',\n", + " 'city': 'Somewhere',\n", + " 'state': 'TX',\n", + " 'postal_code': '67890'}},\n", + " {'id': 3,\n", + " 'name': 'Alice Johnson',\n", + " 'email': 'alice.johnson@example.com',\n", + " 'address': {'street': '789 Pine St',\n", + " 'city': 'Elsewhere',\n", + " 'state': 'NY',\n", + " 'postal_code': '11223'}}]}" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 
contains nested objects in lists\n", + "sample_complex_json = JsonReader().read(\"./sample_data/test_complex_json.json\")\n", + "sample_complex_json" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Tabular (csv) data: defining & generating tabular analysis, anonymization." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "StructuredAnalysis(entity_mapping={'name': 'PERSON', 'email': 'EMAIL_ADDRESS', 'city': 'LOCATION', 'state': 'LOCATION'})" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Automatically detect the entity for the columns\n", + "tabular_analysis = TabularAnalysisBuilder().generate_analysis(sample_df)\n", + "tabular_analysis" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnameemailstreetcitystatepostal_code
01<None><None>123 Main St<None><None>12345
12<None><None>456 Elm St<None><None>67890
23<None><None>789 Pine St<None><None>11223
\n", + "
" + ], + "text/plain": [ + " id name email street city state postal_code\n", + "0 1 123 Main St 12345\n", + "1 2 456 Elm St 67890\n", + "2 3 789 Pine St 11223" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# anonymized data defaults to be replaced with None, unless operators is specified\n", + "\n", + "pandas_engine = TabularEngine(data_transformer=PandasDataTransformer())\n", + "df_to_be_anonymized = sample_df.copy() # in-place anonymization\n", + "anonymized_df = pandas_engine.anonymize(df_to_be_anonymized, tabular_analysis, operators=None) # explicit None for clarity\n", + "anonymized_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### We can also define operators using OperatorConfig similar as to the AnonymizerEngine:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnameemailstreetcitystatepostal_code
01person...davidbarnett@example.org123 Main St<None><None>12345
12person...hortondaniel@example.org456 Elm St<None><None>67890
23person...mcleanmelissa@example.net789 Pine St<None><None>11223
\n", + "
" + ], + "text/plain": [ + " id name email street city state \\\n", + "0 1 person... davidbarnett@example.org 123 Main St \n", + "1 2 person... hortondaniel@example.org 456 Elm St \n", + "2 3 person... mcleanmelissa@example.net 789 Pine St \n", + "\n", + " postal_code \n", + "0 12345 \n", + "1 67890 \n", + "2 11223 " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from presidio_anonymizer.entities.engine import OperatorConfig\n", + "from faker import Faker\n", + "fake = Faker()\n", + "\n", + "operators = {\n", + " \"PERSON\": OperatorConfig(\"replace\", {\"new_value\": \"person...\"}),\n", + " \"EMAIL_ADDRESS\": OperatorConfig(\"custom\", {\"lambda\": lambda x: fake.safe_email()})\n", + " # etc...\n", + " }\n", + "anonymized_df = pandas_engine.anonymize(sample_df, tabular_analysis, operators=operators)\n", + "anonymized_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Semi-structured (JSON) data: simple and complex analysis, anonymization" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "StructuredAnalysis(entity_mapping={'name': 'PERSON', 'email': 'URL', 'address.city': 'LOCATION', 'address.state': 'LOCATION'})" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "json_analysis = JsonAnalysisBuilder().generate_analysis(sample_json)\n", + "json_analysis" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Analyzer.analyze_iterator only works on primitive types (int, float, bool, str). 
Lists of objects are not yet supported.\n" + ] + } + ], + "source": [ + "# Currently does not support nested objects in lists\n", + "try:\n", + " json_complex_analysis = JsonAnalysisBuilder().generate_analysis(sample_complex_json)\n", + "except ValueError as e:\n", + " print(e)\n", + "\n", + "# however, we can define it manually:\n", + "json_complex_analysis = StructuredAnalysis(entity_mapping={\n", + " \"users.name\":\"PERSON\",\n", + " \"users.address.street\":\"LOCATION\",\n", + " \"users.address.city\":\"LOCATION\",\n", + " \"users.address.state\":\"LOCATION\",\n", + " \"users.email\": \"EMAIL_ADDRESS\",\n", + "})" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'id': 1,\n", + " 'name': 'person...',\n", + " 'email': '',\n", + " 'address': {'street': '123 Main St',\n", + " 'city': '',\n", + " 'state': '',\n", + " 'postal_code': '12345'}}" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# anonymizing simple data\n", + "json_engine = TabularEngine(data_transformer=JsonDataTransformer())\n", + "anonymized_json = json_engine.anonymize(sample_json, json_analysis, operators=operators)\n", + "anonymized_json" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'users': [{'id': 1,\n", + " 'name': 'person...',\n", + " 'email': 'wendyboyd@example.org',\n", + " 'address': {'street': '',\n", + " 'city': '',\n", + " 'state': '',\n", + " 'postal_code': '12345'}},\n", + " {'id': 2,\n", + " 'name': 'person...',\n", + " 'email': 'wendyboyd@example.org',\n", + " 'address': {'street': '',\n", + " 'city': '',\n", + " 'state': '',\n", + " 'postal_code': '67890'}},\n", + " {'id': 3,\n", + " 'name': 'person...',\n", + " 'email': 'wendyboyd@example.org',\n", + " 'address': {'street': '',\n", + " 'city': '',\n", + " 'state': '',\n", + " 'postal_code': 
'11223'}}]}" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "anonymized_complex_json = json_engine.anonymize(sample_complex_json, json_complex_analysis, operators=operators)\n", + "anonymized_complex_json" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/presidio-structured/sample/sample_data/test_complex_json.json b/presidio-structured/sample/sample_data/test_complex_json.json new file mode 100644 index 000000000..fce99ead7 --- /dev/null +++ b/presidio-structured/sample/sample_data/test_complex_json.json @@ -0,0 +1,37 @@ +{ + "users": [ + { + "id": 1, + "name": "John Doe", + "email": "john.doe@example.com", + "address": { + "street": "123 Main St", + "city": "Anytown", + "state": "CA", + "postal_code": "12345" + } + }, + { + "id": 2, + "name": "Jane Smith", + "email": "jane.smith@example.com", + "address": { + "street": "456 Elm St", + "city": "Somewhere", + "state": "TX", + "postal_code": "67890" + } + }, + { + "id": 3, + "name": "Alice Johnson", + "email": "alice.johnson@example.com", + "address": { + "street": "789 Pine St", + "city": "Elsewhere", + "state": "NY", + "postal_code": "11223" + } + } + ] +} diff --git a/presidio-structured/sample/sample_data/test_csv.csv b/presidio-structured/sample/sample_data/test_csv.csv new file mode 100644 index 000000000..64e235473 --- /dev/null +++ b/presidio-structured/sample/sample_data/test_csv.csv @@ -0,0 +1,4 @@ +id,name,email,street,city,state,postal_code +1,John Doe,john.doe@example.com,123 Main St,Anytown,CA,12345 +2,Jane Smith,jane.smith@example.com,456 Elm 
St,Somewhere,TX,67890 +3,Alice Johnson,alice.johnson@example.com,789 Pine St,Elsewhere,NY,11223 \ No newline at end of file diff --git a/presidio-structured/sample/sample_data/test_json.json b/presidio-structured/sample/sample_data/test_json.json new file mode 100644 index 000000000..9e416fc90 --- /dev/null +++ b/presidio-structured/sample/sample_data/test_json.json @@ -0,0 +1,11 @@ +{ + "id": 1, + "name": "John Doe", + "email": "john.doe@example.com", + "address": { + "street": "123 Main St", + "city": "Anytown", + "state": "CA", + "postal_code": "12345" + } +} \ No newline at end of file diff --git a/presidio-structured/tests/__init__.py b/presidio-structured/tests/__init__.py new file mode 100644 index 000000000..53dc04c35 --- /dev/null +++ b/presidio-structured/tests/__init__.py @@ -0,0 +1 @@ +"""Unit test package for presidio_structured.""" diff --git a/presidio-structured/tests/test_tabular_engine.py b/presidio-structured/tests/test_tabular_engine.py new file mode 100644 index 000000000..e3c433583 --- /dev/null +++ b/presidio-structured/tests/test_tabular_engine.py @@ -0,0 +1,56 @@ +from unittest.mock import Mock + +import pytest +from presidio_anonymizer.entities import OperatorConfig + +from presidio_structured import TabularEngine + + +def test_tabular_engine_anonymize_calls_data_transformer_operate(): + # Arrange + data_transformer = Mock() + tabular_engine = TabularEngine(data_transformer) + data = Mock() + structured_analysis = Mock() + operators = {"DEFAULT": OperatorConfig("replace")} + + # Act + tabular_engine.anonymize(data, structured_analysis, operators) + + # Assert + data_transformer.operate.assert_called_once_with( + data, structured_analysis, operators + ) + + +def test_tabular_engine_anonymize_adds_default_operator_if_none_provided(): + # Arrange + data_transformer = Mock() + tabular_engine = TabularEngine(data_transformer) + data = Mock() + structured_analysis = Mock() + + # Act + tabular_engine.anonymize(data, structured_analysis) + + # 
Assert + data_transformer.operate.assert_called_once() + args, _ = data_transformer.operate.call_args + assert "DEFAULT" in args[2] + + +def test_tabular_engine_anonymize_does_not_override_existing_default_operator(): + # Arrange + data_transformer = Mock() + tabular_engine = TabularEngine(data_transformer) + data = Mock() + structured_analysis = Mock() + operators = {"DEFAULT": OperatorConfig("custom")} + + # Act + tabular_engine.anonymize(data, structured_analysis, operators) + + # Assert + data_transformer.operate.assert_called_once_with( + data, structured_analysis, operators + ) From 87e4d184282c8103b28c80b8739f81281969088c Mon Sep 17 00:00:00 2001 From: Jakob Serlier <37184788+Jakob-98@users.noreply.github.com> Date: Tue, 31 Oct 2023 13:44:02 +0100 Subject: [PATCH 02/52] Add unit tests --- .../presidio_structured/data/data_reader.py | 2 + .../data/data_transformers.py | 8 +- presidio-structured/tests/conftest.py | 82 +++++++++++++++++++ presidio-structured/tests/data/__init__.py | 0 .../tests/data/test_data_transformers.py | 56 +++++++++++++ .../tests/test_analysis_builder.py | 55 +++++++++++++ 6 files changed, 202 insertions(+), 1 deletion(-) create mode 100644 presidio-structured/tests/conftest.py create mode 100644 presidio-structured/tests/data/__init__.py create mode 100644 presidio-structured/tests/data/test_data_transformers.py create mode 100644 presidio-structured/tests/test_analysis_builder.py diff --git a/presidio-structured/presidio_structured/data/data_reader.py b/presidio-structured/presidio_structured/data/data_reader.py index bbcb3d672..f453ba051 100644 --- a/presidio-structured/presidio_structured/data/data_reader.py +++ b/presidio-structured/presidio_structured/data/data_reader.py @@ -1,3 +1,5 @@ +""" Helper data classes, mostly simple wrappers to ensure consistent user interface. 
""" + import json from abc import ABC, abstractmethod from typing import Any, Dict diff --git a/presidio-structured/presidio_structured/data/data_transformers.py b/presidio-structured/presidio_structured/data/data_transformers.py index 20a8decf6..b57330443 100644 --- a/presidio-structured/presidio_structured/data/data_transformers.py +++ b/presidio-structured/presidio_structured/data/data_transformers.py @@ -5,6 +5,8 @@ from presidio_anonymizer.entities import OperatorConfig from presidio_anonymizer.operators import OperatorsFactory, OperatorType +from presidio_structured.config import StructuredAnalysis + class DataTransformerBase(ABC): """ @@ -18,7 +20,7 @@ def __init__(self) -> None: def operate( self, data: Any, - structured_analysis: "StructuredAnalysis", + structured_analysis: StructuredAnalysis, operators: Dict[str, OperatorConfig], ) -> Any: """ @@ -184,6 +186,10 @@ def _process( :param config: Configuration object containing operator information. :return: JSON-like data after the operation. 
""" + + if not isinstance(data, (dict, list)): + raise ValueError("Data must be a JSON-like object") + for key, operator_callable in key_to_operator_mapping.items(): keys = key.split(".") if isinstance(data, list): diff --git a/presidio-structured/tests/conftest.py b/presidio-structured/tests/conftest.py new file mode 100644 index 000000000..e8b42dd33 --- /dev/null +++ b/presidio-structured/tests/conftest.py @@ -0,0 +1,82 @@ +import pandas as pd +import pytest +from presidio_anonymizer.entities import OperatorConfig +from presidio_structured import TabularAnalysisBuilder, JsonAnalysisBuilder +from presidio_structured.config import StructuredAnalysis + + +@pytest.fixture +def sample_df(): + data = { + "name": ["John Doe", "Jane Doe", "John Smith"], + "email": ["john@example.com", "jane@example.com", "johnsmith@example.com"], + "phone": ["1234567890", "0987654321", "1122334455"], + } + return pd.DataFrame(data) + + +@pytest.fixture +def sample_json(): + data = { + "id": 1, + "name": "John Doe", + "email": "john.doe@example.com", + "address": { + "street": "123 Main St", + "city": "Anytown", + "state": "CA", + "postal_code": "12345" + } + } + return data + + +@pytest.fixture +def sample_json_with_array(): + data = {'users': + [ + {'id': 1, 'name': 'John Doe'}, + {'id': 2, 'name': 'Jane Doe'} + ]} + return data + + +@pytest.fixture +def json_analysis_builder(): + return JsonAnalysisBuilder() + + +@pytest.fixture +def tabular_analysis_builder(): + return TabularAnalysisBuilder() + + +@pytest.fixture +def operators(): + return { + "PERSON": OperatorConfig("replace", {"new_value": "PERSON_REPLACEMENT"}), + "DEFAULT": OperatorConfig("replace", {"new_value": "DEFAULT_REPLACEMENT"}) + } + +@pytest.fixture +def operators_no_default(): + return { + "PERSON": OperatorConfig("replace", {"new_value": "PERSON_REPLACEMENT"}), + } + +@pytest.fixture +def tabular_analysis(): + return StructuredAnalysis(entity_mapping={ + "name": "PERSON", + "email": "EMAIL_ADDRESS", + "phone": 
"PHONE_NUMBER", + }) + + +@pytest.fixture +def json_analysis(): + return StructuredAnalysis(entity_mapping={ + "name": "PERSON", + "address.city": "LOCATION", + } + ) diff --git a/presidio-structured/tests/data/__init__.py b/presidio-structured/tests/data/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/presidio-structured/tests/data/test_data_transformers.py b/presidio-structured/tests/data/test_data_transformers.py new file mode 100644 index 000000000..0156d77ad --- /dev/null +++ b/presidio-structured/tests/data/test_data_transformers.py @@ -0,0 +1,56 @@ +import pytest +from pandas import DataFrame +from presidio_anonymizer.entities import OperatorConfig +from presidio_structured.data.data_transformers import DataTransformerBase, PandasDataTransformer, JsonDataTransformer +from presidio_structured.config import StructuredAnalysis + +class TestDataTransformerBase: + def test_abstract_init_raises(self, sample_df, tabular_analysis_builder, operators): + with pytest.raises(TypeError): + DataTransformerBase() + +class TestPandasDataTransformer: + def test_process(self, sample_df, operators, tabular_analysis): + transformer = PandasDataTransformer() + result = transformer.operate(sample_df, tabular_analysis, operators) + assert isinstance(result, DataFrame) + for key in tabular_analysis.entity_mapping: + if key == 'name': + assert all(result[key] == "PERSON_REPLACEMENT") + else: + assert all(result[key] == "DEFAULT_REPLACEMENT") + + def test_process_no_default_should_raise(self, sample_df, operators_no_default, tabular_analysis): + transformer = PandasDataTransformer() + with pytest.raises(ValueError): + transformer.operate(sample_df, tabular_analysis, operators_no_default) + + def test_process_invalid_data(self, sample_json, tabular_analysis, operators): + transformer = PandasDataTransformer() + with pytest.raises(ValueError): + transformer.operate(sample_json, tabular_analysis, operators) + +class TestJsonDataTransformer: + def 
test_process(self, sample_json, operators, json_analysis): + transformer = JsonDataTransformer() + result = transformer.operate(sample_json, json_analysis, operators) + assert isinstance(result, dict) + for key, value in json_analysis.entity_mapping.items(): + keys = key.split(".") + nested_value = sample_json + for inner_key in keys: + nested_value = nested_value[inner_key] + if value == 'PERSON': + assert nested_value == "PERSON_REPLACEMENT" + else: + assert nested_value == "DEFAULT_REPLACEMENT" + + def test_process_no_default_should_raise(self, sample_json, operators_no_default, json_analysis): + transformer = JsonDataTransformer() + with pytest.raises(ValueError): + transformer.operate(sample_json, json_analysis, operators_no_default) + + def test_process_invalid_data(self, sample_df, json_analysis, operators): + transformer = JsonDataTransformer() + with pytest.raises(ValueError): + transformer.operate(sample_df, json_analysis, operators) \ No newline at end of file diff --git a/presidio-structured/tests/test_analysis_builder.py b/presidio-structured/tests/test_analysis_builder.py new file mode 100644 index 000000000..d7ef97b30 --- /dev/null +++ b/presidio-structured/tests/test_analysis_builder.py @@ -0,0 +1,55 @@ +import pandas as pd +import pytest +from presidio_structured import TabularAnalysisBuilder, JsonAnalysisBuilder + +# NOTE: we won't go into depth unit-testing all analyzers, as that is covered in the presidio-analyzer tests + +def test_generate_analysis_tabular(tabular_analysis_builder, sample_df): + structured_analysis = tabular_analysis_builder.generate_analysis(sample_df) + + assert structured_analysis.entity_mapping["name"] == "PERSON" + assert structured_analysis.entity_mapping["email"] == "EMAIL_ADDRESS" + assert structured_analysis.entity_mapping["phone"] == "PHONE_NUMBER" + +def test_generate_analysis_tabular_with_sampling(tabular_analysis_builder, sample_df): + structured_analysis = tabular_analysis_builder.generate_analysis(sample_df, n=2) 
+ + assert len(structured_analysis.entity_mapping) == 3 + assert structured_analysis.entity_mapping["name"] == "PERSON" + assert structured_analysis.entity_mapping["email"] == "EMAIL_ADDRESS" + assert structured_analysis.entity_mapping["phone"] == "PHONE_NUMBER" + +def test_generate_analysis_tabular_with_invalid_sampling(tabular_analysis_builder, sample_df): + with pytest.raises(ValueError): + tabular_analysis_builder.generate_analysis(sample_df, n=-1) + +def test_find_most_common_entity(tabular_analysis_builder, sample_df): + key_recognizer_result_map = tabular_analysis_builder._find_most_common_entity(sample_df, "en") + + assert len(key_recognizer_result_map) == 3 + assert key_recognizer_result_map["name"].entity_type == "PERSON" + assert key_recognizer_result_map["email"].entity_type == "EMAIL_ADDRESS" + assert key_recognizer_result_map["phone"].entity_type == "PHONE_NUMBER" + +def test_find_most_common_entity_with_empty_df(tabular_analysis_builder): + df = pd.DataFrame() + key_recognizer_result_map = tabular_analysis_builder._find_most_common_entity(df, "en") + + assert len(key_recognizer_result_map) == 0 + +def test_generate_analysis_json(json_analysis_builder, sample_json): + structured_analysis = json_analysis_builder.generate_analysis(sample_json) + + assert structured_analysis.entity_mapping["name"] == "PERSON" + assert structured_analysis.entity_mapping["address.city"] == "LOCATION" + +def test_generate_analysis_json_with_list_should_raise(json_analysis_builder, sample_json_with_array): + # this feature is not supported by the BatchAnalyzerEngine used in the JsonAnalysisBuilder + with pytest.raises(ValueError): + json_analysis_builder.generate_analysis(sample_json_with_array) + +def test_generate_analysis_json_with_empty_data(json_analysis_builder): + data = {} + structured_analysis = json_analysis_builder.generate_analysis(data) + + assert len(structured_analysis.entity_mapping) == 0 From f9ec126292c49966f9cfd94c9d4103035f3d8037 Mon Sep 17 00:00:00 2001 
From: Jakob Serlier <37184788+Jakob-98@users.noreply.github.com> Date: Thu, 9 Nov 2023 09:55:20 +0100 Subject: [PATCH 03/52] rename engine, add buildfile --- .pipelines/templates/build-structured.yml | 25 +++++++++ presidio-structured/Pipfile | 16 ++++++ presidio-structured/logging.ini | 27 ++++++++++ .../presidio_structured/__init__.py | 4 +- ...tabular_engine.py => structured_engine.py} | 2 +- presidio-structured/sample/example.ipynb | 32 ++++++------ presidio-structured/setup.py | 52 +++++++++++++++++++ .../tests/test_tabular_engine.py | 20 +++---- 8 files changed, 149 insertions(+), 29 deletions(-) create mode 100644 .pipelines/templates/build-structured.yml create mode 100644 presidio-structured/Pipfile create mode 100644 presidio-structured/logging.ini rename presidio-structured/presidio_structured/{tabular_engine.py => structured_engine.py} (98%) create mode 100644 presidio-structured/setup.py diff --git a/.pipelines/templates/build-structured.yml b/.pipelines/templates/build-structured.yml new file mode 100644 index 000000000..2c9d4c5f4 --- /dev/null +++ b/.pipelines/templates/build-structured.yml @@ -0,0 +1,25 @@ +steps: + - task: Bash@3 + displayName: 'Setup pipenv' + inputs: + targetType: 'inline' + script: | + set -eux # fail on error + python -m pip install --upgrade pip + python -m pip install pipenv + pipenv --python 3 + + - task: Bash@3 + displayName: 'Install deps' + inputs: + targetType: 'inline' + workingDirectory: 'presidio-structured' + script: | + set -eux # fail on error + pipenv install --deploy --dev + + - template: ./build-python.yml + parameters: + SERVICE: 'Structured' + WORKING_FOLDER: 'presidio-structured' + diff --git a/presidio-structured/Pipfile b/presidio-structured/Pipfile new file mode 100644 index 000000000..bc9ae43e1 --- /dev/null +++ b/presidio-structured/Pipfile @@ -0,0 +1,16 @@ +[[source]] +url = "https://pypi.python.org/simple" +verify_ssl = true +name = "pypi" + +[packages] +flask = ">=1.1" +presidio-analyzer = ">=2.2.31" 
+presidio-anonymizer = ">=2.2.31" + +[dev-packages] +pytest = "*" +flake8 = { version = ">=3.7.9" } +pep8-naming = "*" +flake8-docstrings = "*" +pre_commit = "*" diff --git a/presidio-structured/logging.ini b/presidio-structured/logging.ini new file mode 100644 index 000000000..62c58c1f8 --- /dev/null +++ b/presidio-structured/logging.ini @@ -0,0 +1,27 @@ +[loggers] +keys=root,presidio-structured + +[handlers] +keys=consoleHandler + +[formatters] +keys=simpleFormatter + +[logger_root] +level=INFO +handlers=consoleHandler + +[logger_presidio-structured] +level=INFO +handlers=consoleHandler +qualname=presidio-structured +propagate=0 + +[handler_consoleHandler] +class=StreamHandler +level=INFO +formatter=simpleFormatter +args=(sys.stdout,) + +[formatter_simpleFormatter] +format=%(asctime)s - %(name)s - %(levelname)s - %(message)s \ No newline at end of file diff --git a/presidio-structured/presidio_structured/__init__.py b/presidio-structured/presidio_structured/__init__.py index b121a198d..572d9981a 100644 --- a/presidio-structured/presidio_structured/__init__.py +++ b/presidio-structured/presidio_structured/__init__.py @@ -1,10 +1,10 @@ from .analysis_builder import JsonAnalysisBuilder, TabularAnalysisBuilder from .config import StructuredAnalysis from .data import CsvReader, JsonDataTransformer, JsonReader, PandasDataTransformer -from .tabular_engine import TabularEngine +from .structured_engine import StructuredEngine __all__ = [ - "TabularEngine", + "StructuredEngine", "JsonAnalysisBuilder", "TabularAnalysisBuilder", "StructuredAnalysis", diff --git a/presidio-structured/presidio_structured/tabular_engine.py b/presidio-structured/presidio_structured/structured_engine.py similarity index 98% rename from presidio-structured/presidio_structured/tabular_engine.py rename to presidio-structured/presidio_structured/structured_engine.py index 010277d4f..bda14790b 100644 --- a/presidio-structured/presidio_structured/tabular_engine.py +++ 
b/presidio-structured/presidio_structured/structured_engine.py @@ -9,7 +9,7 @@ DEFAULT = "replace" -class TabularEngine: +class StructuredEngine: """ Class to implement methods for anonymizing tabular data. """ diff --git a/presidio-structured/sample/example.ipynb b/presidio-structured/sample/example.ipynb index c04f51ffd..4b552b797 100644 --- a/presidio-structured/sample/example.ipynb +++ b/presidio-structured/sample/example.ipynb @@ -9,7 +9,7 @@ "import os\n", "import sys\n", "sys.path.append(os.path.abspath(\"..\"))\n", - "from presidio_structured import TabularEngine, JsonAnalysisBuilder, TabularAnalysisBuilder, StructuredAnalysis, CsvReader, JsonReader, JsonDataTransformer, PandasDataTransformer" + "from presidio_structured import StructuredEngine, JsonAnalysisBuilder, TabularAnalysisBuilder, StructuredAnalysis, CsvReader, JsonReader, JsonDataTransformer, PandasDataTransformer" ] }, { @@ -211,7 +211,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -286,7 +286,7 @@ "2 3 789 Pine St 11223" ] }, - "execution_count": 6, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -294,7 +294,7 @@ "source": [ "# anonymized data defaults to be replaced with None, unless operators is specified\n", "\n", - "pandas_engine = TabularEngine(data_transformer=PandasDataTransformer())\n", + "pandas_engine = StructuredEngine(data_transformer=PandasDataTransformer())\n", "df_to_be_anonymized = sample_df.copy() # in-place anonymization\n", "anonymized_df = pandas_engine.anonymize(df_to_be_anonymized, tabular_analysis, operators=None) # explicit None for clarity\n", "anonymized_df" @@ -347,7 +347,7 @@ " 0\n", " 1\n", " person...\n", - " davidbarnett@example.org\n", + " mccoyryan@example.com\n", " 123 Main St\n", " <None>\n", " <None>\n", @@ -357,7 +357,7 @@ " 1\n", " 2\n", " person...\n", - " hortondaniel@example.org\n", + " harristricia@example.net\n", " 456 Elm St\n", " <None>\n", " <None>\n", @@ 
-367,7 +367,7 @@ " 2\n", " 3\n", " person...\n", - " mcleanmelissa@example.net\n", + " thomasmikayla@example.org\n", " 789 Pine St\n", " <None>\n", " <None>\n", @@ -379,9 +379,9 @@ ], "text/plain": [ " id name email street city state \\\n", - "0 1 person... davidbarnett@example.org 123 Main St \n", - "1 2 person... hortondaniel@example.org 456 Elm St \n", - "2 3 person... mcleanmelissa@example.net 789 Pine St \n", + "0 1 person... mccoyryan@example.com 123 Main St \n", + "1 2 person... harristricia@example.net 456 Elm St \n", + "2 3 person... thomasmikayla@example.org 789 Pine St \n", "\n", " postal_code \n", "0 12345 \n", @@ -490,14 +490,14 @@ ], "source": [ "# anonymizing simple data\n", - "json_engine = TabularEngine(data_transformer=JsonDataTransformer())\n", + "json_engine = StructuredEngine(data_transformer=JsonDataTransformer())\n", "anonymized_json = json_engine.anonymize(sample_json, json_analysis, operators=operators)\n", "anonymized_json" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -505,28 +505,28 @@ "text/plain": [ "{'users': [{'id': 1,\n", " 'name': 'person...',\n", - " 'email': 'wendyboyd@example.org',\n", + " 'email': 'tricia10@example.com',\n", " 'address': {'street': '',\n", " 'city': '',\n", " 'state': '',\n", " 'postal_code': '12345'}},\n", " {'id': 2,\n", " 'name': 'person...',\n", - " 'email': 'wendyboyd@example.org',\n", + " 'email': 'tricia10@example.com',\n", " 'address': {'street': '',\n", " 'city': '',\n", " 'state': '',\n", " 'postal_code': '67890'}},\n", " {'id': 3,\n", " 'name': 'person...',\n", - " 'email': 'wendyboyd@example.org',\n", + " 'email': 'tricia10@example.com',\n", " 'address': {'street': '',\n", " 'city': '',\n", " 'state': '',\n", " 'postal_code': '11223'}}]}" ] }, - "execution_count": 12, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } diff --git a/presidio-structured/setup.py b/presidio-structured/setup.py new file mode 100644 
index 000000000..136c609e7 --- /dev/null +++ b/presidio-structured/setup.py @@ -0,0 +1,52 @@ +#!/usr/bin/env python +# noqa: D100 +import os.path +from os import path + +from setuptools import setup, find_packages + +test_requirements = ["pytest>=3", "flake8==3.7.9"] + +__version__ = "" +this_directory = path.abspath(path.dirname(__file__)) +parent_directory = os.path.abspath(os.path.join(this_directory, os.pardir)) + +with open(path.join(this_directory, "README.MD"), encoding="utf-8") as f: + long_description = f.read() + +try: + with open(os.path.join(parent_directory, "VERSION")) as version_file: + __version__ = version_file.read().strip() +except Exception: + __version__ = os.environ.get("PRESIDIO_VERSION", "0.0.1-alpha") + +setup( + name="presidio_anonymizer", + python_requires=">=3.5", + version=__version__, + classifiers=[ + "Intended Audience :: Developers", + "License :: OSI Approved :: MIT License", + "Natural Language :: English", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + ], + description="Persidio structured package - analyses and anonymizes structured and semistructured data.", + license="MIT license", + include_package_data=True, + keywords="presidio_structured", + install_requires=["pycryptodome>=3.10.1"], + # packages=find_packages(include=["presidio_anonymizer", "presidio_anonymizer.*"]),#TODO? 
+ test_suite="tests", + tests_require=test_requirements, + url="https://github.com/microsoft/presidio", + zip_safe=False, + trusted_host=["pypi.org"], + long_description=long_description, + long_description_content_type="text/markdown", +) + diff --git a/presidio-structured/tests/test_tabular_engine.py b/presidio-structured/tests/test_tabular_engine.py index e3c433583..96d0f9188 100644 --- a/presidio-structured/tests/test_tabular_engine.py +++ b/presidio-structured/tests/test_tabular_engine.py @@ -3,19 +3,19 @@ import pytest from presidio_anonymizer.entities import OperatorConfig -from presidio_structured import TabularEngine +from presidio_structured import StructuredEngine -def test_tabular_engine_anonymize_calls_data_transformer_operate(): +def test_structured_engine_anonymize_calls_data_transformer_operate(): # Arrange data_transformer = Mock() - tabular_engine = TabularEngine(data_transformer) + structured_engine = StructuredEngine(data_transformer) data = Mock() structured_analysis = Mock() operators = {"DEFAULT": OperatorConfig("replace")} # Act - tabular_engine.anonymize(data, structured_analysis, operators) + structured_engine.anonymize(data, structured_analysis, operators) # Assert data_transformer.operate.assert_called_once_with( @@ -23,15 +23,15 @@ def test_tabular_engine_anonymize_calls_data_transformer_operate(): ) -def test_tabular_engine_anonymize_adds_default_operator_if_none_provided(): +def test_structured_engine_anonymize_adds_default_operator_if_none_provided(): # Arrange data_transformer = Mock() - tabular_engine = TabularEngine(data_transformer) + structured_engine = StructuredEngine(data_transformer) data = Mock() structured_analysis = Mock() # Act - tabular_engine.anonymize(data, structured_analysis) + structured_engine.anonymize(data, structured_analysis) # Assert data_transformer.operate.assert_called_once() @@ -39,16 +39,16 @@ def test_tabular_engine_anonymize_adds_default_operator_if_none_provided(): assert "DEFAULT" in args[2] -def 
test_tabular_engine_anonymize_does_not_override_existing_default_operator(): +def test_structured_engine_anonymize_does_not_override_existing_default_operator(): # Arrange data_transformer = Mock() - tabular_engine = TabularEngine(data_transformer) + structured_engine = StructuredEngine(data_transformer) data = Mock() structured_analysis = Mock() operators = {"DEFAULT": OperatorConfig("custom")} # Act - tabular_engine.anonymize(data, structured_analysis, operators) + structured_engine.anonymize(data, structured_analysis, operators) # Assert data_transformer.operate.assert_called_once_with( From e4622fd5e6bc739793c69fba41ca0d92a9815605 Mon Sep 17 00:00:00 2001 From: Jakob Serlier <37184788+Jakob-98@users.noreply.github.com> Date: Thu, 9 Nov 2023 09:59:57 +0100 Subject: [PATCH 04/52] Update setup.py --- presidio-structured/setup.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/presidio-structured/setup.py b/presidio-structured/setup.py index 136c609e7..076d536b7 100644 --- a/presidio-structured/setup.py +++ b/presidio-structured/setup.py @@ -35,12 +35,11 @@ "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", ], - description="Persidio structured package - analyses and anonymizes structured and semistructured data.", + description="Presidio structured package - analyses and anonymizes structured and semistructured data.", license="MIT license", include_package_data=True, keywords="presidio_structured", - install_requires=["pycryptodome>=3.10.1"], - # packages=find_packages(include=["presidio_anonymizer", "presidio_anonymizer.*"]),#TODO? 
+ install_requires=["presidio-analyzer>=2.2", "presidio-anonymizer>=2.2"], test_suite="tests", tests_require=test_requirements, url="https://github.com/microsoft/presidio", From 1427528bf8163f7785c7f554ccf03cdf7cd208d3 Mon Sep 17 00:00:00 2001 From: Jakob Serlier <37184788+Jakob-98@users.noreply.github.com> Date: Fri, 17 Nov 2023 15:32:31 +0100 Subject: [PATCH 05/52] lint-build-test --- .pipelines/templates/build-structured.yml | 2 ++ .pipelines/templates/lint-build-test.yml | 23 +++++++++++++++++++++++ presidio-structured/setup.py | 4 ++-- 3 files changed, 27 insertions(+), 2 deletions(-) diff --git a/.pipelines/templates/build-structured.yml b/.pipelines/templates/build-structured.yml index 2c9d4c5f4..13064583c 100644 --- a/.pipelines/templates/build-structured.yml +++ b/.pipelines/templates/build-structured.yml @@ -17,6 +17,8 @@ steps: script: | set -eux # fail on error pipenv install --deploy --dev + pipenv run pip install -e ../presidio-analyzer/. # Use the existing analyzer and not the one in PyPI + pipenv run pip install -e ../presidio-anonymizer/. 
# Use the existing analyzer and not the one in PyPI - template: ./build-python.yml parameters: diff --git a/.pipelines/templates/lint-build-test.yml b/.pipelines/templates/lint-build-test.yml index eb72380a4..7d36259c4 100644 --- a/.pipelines/templates/lint-build-test.yml +++ b/.pipelines/templates/lint-build-test.yml @@ -109,3 +109,26 @@ stages: versionSpec: '$(python.version)' displayName: 'Use Python $(python.version)' - template: ./build-cli.yml + - job: TestStructured + displayName: Test Presidio Structured + dependsOn: + - 'Inclusivelint' + pool: + vmImage: 'ubuntu-latest' + strategy: + matrix: + Python38: + python.version: '3.8' + Python39: + python.version: '3.9' + Python310: + python.version: '3.10' + Python311: + python.version: '3.11' + + steps: + - task: UsePythonVersion@0 + inputs: + versionSpec: '$(python.version)' + displayName: 'Use Python $(python.version)' + - template: ./build-structured.yml diff --git a/presidio-structured/setup.py b/presidio-structured/setup.py index 076d536b7..12f59533f 100644 --- a/presidio-structured/setup.py +++ b/presidio-structured/setup.py @@ -3,7 +3,7 @@ import os.path from os import path -from setuptools import setup, find_packages +from setuptools import setup test_requirements = ["pytest>=3", "flake8==3.7.9"] @@ -21,7 +21,7 @@ __version__ = os.environ.get("PRESIDIO_VERSION", "0.0.1-alpha") setup( - name="presidio_anonymizer", + name="presidio_structured", python_requires=">=3.5", version=__version__, classifiers=[ From 463bebaac0a4efed17ae4ce5c2091f5d80aa4554 Mon Sep 17 00:00:00 2001 From: Jakob Serlier <37184788+Jakob-98@users.noreply.github.com> Date: Fri, 17 Nov 2023 15:35:33 +0100 Subject: [PATCH 06/52] Update lint-build-test.yml --- .pipelines/templates/lint-build-test.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.pipelines/templates/lint-build-test.yml b/.pipelines/templates/lint-build-test.yml index 68c716e9b..d362273cf 100644 --- a/.pipelines/templates/lint-build-test.yml +++ 
b/.pipelines/templates/lint-build-test.yml @@ -76,6 +76,7 @@ stages: versionSpec: '$(python.version)' displayName: 'Use Python $(python.version)' - template: ./build-image-redactor.yml + - job: TestCli displayName: Test Cli pool: @@ -97,10 +98,9 @@ stages: versionSpec: '$(python.version)' displayName: 'Use Python $(python.version)' - template: ./build-cli.yml - - job: TestStructured + + - job: TestStructured displayName: Test Presidio Structured - dependsOn: - - 'Inclusivelint' pool: vmImage: 'ubuntu-latest' strategy: From 5f36b40e3de49b727ef95b02686c18ae2a8dc571 Mon Sep 17 00:00:00 2001 From: "enrique.botia" Date: Wed, 22 Nov 2023 13:03:13 +0100 Subject: [PATCH 07/52] Add packages to setup.py --- presidio-structured/setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/presidio-structured/setup.py b/presidio-structured/setup.py index 12f59533f..d813203e3 100644 --- a/presidio-structured/setup.py +++ b/presidio-structured/setup.py @@ -3,7 +3,7 @@ import os.path from os import path -from setuptools import setup +from setuptools import setup, find_packages test_requirements = ["pytest>=3", "flake8==3.7.9"] @@ -24,6 +24,7 @@ name="presidio_structured", python_requires=">=3.5", version=__version__, + packages=find_packages(include=["presidio_structured", "presidio_structured.*"]), classifiers=[ "Intended Audience :: Developers", "License :: OSI Approved :: MIT License", From 6693817a36e3a73f2bb69da53a1262ddea426e03 Mon Sep 17 00:00:00 2001 From: "enrique.botia" Date: Wed, 22 Nov 2023 13:14:04 +0100 Subject: [PATCH 08/52] Update presidio-structured to alpha version --- CHANGELOG.md | 2 +- presidio-structured/README.md | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bb3f2a009..b5482076d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,7 +6,7 @@ All notable changes to this project will be documented in this file. 
## [Unreleased] ### Added #### Structured -* Added V1 of presidio-structured, a library (presidio-structured) which re-uses existing logic from existing presidio components to allow anonymization of (semi-)structured data. +* Added alpha of presidio-structured, a library (presidio-structured) which re-uses existing logic from existing presidio components to allow anonymization of (semi-)structured data. ## [2.2.351] - Nov. 6th 2024 ### Changed diff --git a/presidio-structured/README.md b/presidio-structured/README.md index 27c5c7c43..90193c554 100644 --- a/presidio-structured/README.md +++ b/presidio-structured/README.md @@ -2,6 +2,8 @@ ## Status +**Alpha**: This package is currently in alpha, meaning it is in its early stages of development. Features and functionality may change as the project evolves. + ### TODO For TODOs, see draft PR. From 25e961ed98365ec34d3d080af8a3d2ec0e53d66e Mon Sep 17 00:00:00 2001 From: "enrique.botia" Date: Wed, 22 Nov 2023 13:18:37 +0100 Subject: [PATCH 09/52] Update Presidio structured README.md --- presidio-structured/README.md | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/presidio-structured/README.md b/presidio-structured/README.md index 90193c554..b534e2718 100644 --- a/presidio-structured/README.md +++ b/presidio-structured/README.md @@ -4,17 +4,18 @@ **Alpha**: This package is currently in alpha, meaning it is in its early stages of development. Features and functionality may change as the project evolves. -### TODO - -For TODOs, see draft PR. - ## Description -The Presidio stuctured is.. +The Presidio structured package is a flexible and customizable framework designed to identify and protect structured sensitive data. This tool extends the capabilities of Presidio, focusing on structured data formats. 
## Deploy Presidio analyzer to Azure +TODO: [Instructions on deploying the Presidio analyzer to Azure will be here] + ## Simple usage example +TODO: [A basic example of how to use the Presidio structured package will be here] + ## Documentation +TODO: [Link to the comprehensive documentation, guides, and API references] From c356dd2b1eed81480b1f7420f61a853d4ad6f150 Mon Sep 17 00:00:00 2001 From: "enrique.botia" Date: Wed, 22 Nov 2023 13:19:39 +0100 Subject: [PATCH 10/52] Add logging configuration to presidio-structured module --- presidio-structured/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/presidio-structured/__init__.py b/presidio-structured/__init__.py index 358dbb403..a3ec0a887 100644 --- a/presidio-structured/__init__.py +++ b/presidio-structured/__init__.py @@ -4,6 +4,6 @@ # Set up default logging (with NullHandler) -# logging.getLogger("presidio-str").addHandler(logging.NullHandler()) +logging.getLogger("presidio-str").addHandler(logging.NullHandler()) # __all__ = ["AnonymizerEngine", "DeanonymizeEngine", "BatchAnonymizerEngine"] From 3d9bf2f8bf24bc14b074690a061f1dd776f73ce7 Mon Sep 17 00:00:00 2001 From: "enrique.botia" Date: Wed, 22 Nov 2023 13:21:48 +0100 Subject: [PATCH 11/52] Refactor AnalysisBuilder constructor to accept an optional AnalyzerEngine parameter --- presidio-structured/presidio_structured/analysis_builder.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/presidio-structured/presidio_structured/analysis_builder.py b/presidio-structured/presidio_structured/analysis_builder.py index 01a282e92..8de828f81 100644 --- a/presidio-structured/presidio_structured/analysis_builder.py +++ b/presidio-structured/presidio_structured/analysis_builder.py @@ -19,9 +19,9 @@ class AnalysisBuilder(ABC): Abstract base class for a configuration generator. 
""" - def __init__(self): + def __init__(self, analyzer: AnalyzerEngine = None) -> None: """Initialize the configuration generator.""" - self.analyzer = AnalyzerEngine() + self.analyzer = AnalyzerEngine() if analyzer is None else analyzer @abstractmethod def generate_analysis(self, data: Union[Dict, DataFrame]) -> StructuredAnalysis: From fe0750f5ca511adcc3e12ad1444bc0c68d8e8065 Mon Sep 17 00:00:00 2001 From: "enrique.botia" Date: Wed, 22 Nov 2023 13:37:27 +0100 Subject: [PATCH 12/52] Fix entity mapping in JsonAnalysisBuilder --- presidio-structured/presidio_structured/analysis_builder.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/presidio-structured/presidio_structured/analysis_builder.py b/presidio-structured/presidio_structured/analysis_builder.py index 8de828f81..dd77e22e9 100644 --- a/presidio-structured/presidio_structured/analysis_builder.py +++ b/presidio-structured/presidio_structured/analysis_builder.py @@ -79,9 +79,8 @@ def _generate_analysis_from_results_json( ) mappings.update(nested_mappings.entity_mapping) - if sum(1 for _ in result.recognizer_results) > 0: - for recognizer_result in result.recognizer_results: - mappings[current_key] = recognizer_result.entity_type + if result.recognizer_results > 0: + mappings[current_key] = result.recognizer_results[0].entity_type return StructuredAnalysis(entity_mapping=mappings) From 48a0cd67dd90d6e4f32f9984830b5c22a89d3167 Mon Sep 17 00:00:00 2001 From: "enrique.botia" Date: Wed, 22 Nov 2023 13:40:42 +0100 Subject: [PATCH 13/52] Drop type in docstring in analysis builder classes --- .../presidio_structured/analysis_builder.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/presidio-structured/presidio_structured/analysis_builder.py b/presidio-structured/presidio_structured/analysis_builder.py index dd77e22e9..5ae833afb 100644 --- a/presidio-structured/presidio_structured/analysis_builder.py +++ b/presidio-structured/presidio_structured/analysis_builder.py @@ -29,9 
+29,7 @@ def generate_analysis(self, data: Union[Dict, DataFrame]) -> StructuredAnalysis: Abstract method to generate a configuration from the given data. :param data: The input data. Can be a dictionary or DataFrame instance. - :type data: Union[Dict, DataFrame] :return: The generated configuration. - :rtype StructuredAnalysis: """ pass @@ -44,9 +42,7 @@ def generate_analysis(self, data: Dict) -> StructuredAnalysis: Generate a configuration from the given JSON data. :param data: The input JSON data. - :type data: Dict :return: The generated configuration. - :rtype StructuredAnalysis: """ batch_analyzer = BatchAnalyzerEngine(analyzer_engine=self.analyzer) analyzer_results = batch_analyzer.analyze_dict(input_dict=data, language="en") @@ -59,11 +55,8 @@ def _generate_analysis_from_results_json( Generate a configuration from the given analyzer results. :param analyzer_results: The analyzer results. - :type analyzer_results: Iterator[DictAnalyzerResult] :param prefix: The prefix for the configuration keys. - :type prefix: str :return: The generated configuration. - :rtype StructuredAnalysis: """ mappings = {} @@ -94,13 +87,9 @@ def generate_analysis( Generate a configuration from the given tabular data. :param df: The input tabular data (dataframe). - :type df: DataFrame :param n: The number of samples to be taken from the dataframe. - :type n: int :param language: The language to be used for analysis. - :type language: str :return: The generated configuration. - :rtype StructuredAnalysis: """ if n > len(df): n = len(df) @@ -124,11 +113,8 @@ def _find_most_common_entity( Find the most common entity in a dataframe column. :param df: The dataframe where entities will be searched. - :type df: DataFrame :param language: Language to be used in the analysis engine. - :type language: str :return: A dictionary mapping column names to the most common RecognizerResult. 
- :rtype: Dict[str, RecognizerResult] """ key_recognizer_result_map = {} From 7a6ed72d7f80af163865da7c9dd35ba914533dcc Mon Sep 17 00:00:00 2001 From: "enrique.botia" Date: Wed, 22 Nov 2023 13:42:10 +0100 Subject: [PATCH 14/52] Refactor TabularAnalysisBuilder to use BatchAnalyzerEngine for all columns --- presidio-structured/presidio_structured/analysis_builder.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/presidio-structured/presidio_structured/analysis_builder.py b/presidio-structured/presidio_structured/analysis_builder.py index 5ae833afb..75be5997f 100644 --- a/presidio-structured/presidio_structured/analysis_builder.py +++ b/presidio-structured/presidio_structured/analysis_builder.py @@ -118,8 +118,9 @@ def _find_most_common_entity( """ key_recognizer_result_map = {} + batch_analyzer = BatchAnalyzerEngine(analyzer_engine=self.analyzer) + for column in df.columns: - batch_analyzer = BatchAnalyzerEngine(analyzer_engine=self.analyzer) analyzer_results = batch_analyzer.analyze_iterator( [val for val in df[column]], language=language ) From fff9a36eeb0799880582d200141335e2105992da Mon Sep 17 00:00:00 2001 From: "enrique.botia" Date: Wed, 22 Nov 2023 13:44:16 +0100 Subject: [PATCH 15/52] Update data_reader.py with type hints for file paths --- .../presidio_structured/data/data_reader.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/presidio-structured/presidio_structured/data/data_reader.py b/presidio-structured/presidio_structured/data/data_reader.py index f453ba051..fd36ec72a 100644 --- a/presidio-structured/presidio_structured/data/data_reader.py +++ b/presidio-structured/presidio_structured/data/data_reader.py @@ -2,7 +2,8 @@ import json from abc import ABC, abstractmethod -from typing import Any, Dict +from pathlib import Path +from typing import Any, Dict, Union import pandas as pd @@ -15,7 +16,7 @@ class ReaderBase(ABC): """ @abstractmethod - def read(self, path: str) -> Any: + def read(self, path: 
Union[str, Path]) -> Any: """ Extract data from file located at path. @@ -36,7 +37,7 @@ class CsvReader(ReaderBase): """ - def read(self, path: str) -> pd.DataFrame: + def read(self, path: Union[str, Path]) -> pd.DataFrame: """ Read csv file to pandas dataframe. @@ -57,7 +58,7 @@ class JsonReader(ReaderBase): """ - def read(self, path: str) -> Dict[str, Any]: + def read(self, path: Union[str, Path]) -> Dict[str, Any]: """ Read json file to dict. From 0915d9f1c82e15fdcee67f407e0dec4dd3eee2e6 Mon Sep 17 00:00:00 2001 From: "enrique.botia" Date: Wed, 22 Nov 2023 13:45:23 +0100 Subject: [PATCH 16/52] Update data_reader.py to include additional keyword arguments in read() method --- .../presidio_structured/data/data_reader.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/presidio-structured/presidio_structured/data/data_reader.py b/presidio-structured/presidio_structured/data/data_reader.py index fd36ec72a..0149f6527 100644 --- a/presidio-structured/presidio_structured/data/data_reader.py +++ b/presidio-structured/presidio_structured/data/data_reader.py @@ -16,7 +16,7 @@ class ReaderBase(ABC): """ @abstractmethod - def read(self, path: Union[str, Path]) -> Any: + def read(self, path: Union[str, Path], **kwargs) -> Any: """ Extract data from file located at path. @@ -37,14 +37,14 @@ class CsvReader(ReaderBase): """ - def read(self, path: Union[str, Path]) -> pd.DataFrame: + def read(self, path: Union[str, Path], **kwargs) -> pd.DataFrame: """ Read csv file to pandas dataframe. :param path: String defining the location of the csv file to read. :return: Pandas DataFrame with the data read from the csv file. """ - return pd.read_csv(path) + return pd.read_csv(path, **kwargs) class JsonReader(ReaderBase): @@ -58,7 +58,7 @@ class JsonReader(ReaderBase): """ - def read(self, path: Union[str, Path]) -> Dict[str, Any]: + def read(self, path: Union[str, Path], **kwargs) -> Dict[str, Any]: """ Read json file to dict. 
@@ -66,5 +66,5 @@ def read(self, path: Union[str, Path]) -> Dict[str, Any]: :return: dictionary with the data read from the json file. """ with open(path) as f: - data = json.load(f) + data = json.load(f, **kwargs) return data From d0db1c312c5b168cc0fc3375ac6ee5d8d685d129 Mon Sep 17 00:00:00 2001 From: "enrique.botia" Date: Wed, 22 Nov 2023 13:54:46 +0100 Subject: [PATCH 17/52] Update Transformer to Processor term in StructuredEngine --- .../{data_transformers.py => data_processors.py} | 14 +++++++------- .../presidio_structured/structured_engine.py | 12 ++++++------ presidio-structured/sample/example.ipynb | 6 +++--- 3 files changed, 16 insertions(+), 16 deletions(-) rename presidio-structured/presidio_structured/data/{data_transformers.py => data_processors.py} (94%) diff --git a/presidio-structured/presidio_structured/data/data_transformers.py b/presidio-structured/presidio_structured/data/data_processors.py similarity index 94% rename from presidio-structured/presidio_structured/data/data_transformers.py rename to presidio-structured/presidio_structured/data/data_processors.py index b57330443..d191c6d11 100644 --- a/presidio-structured/presidio_structured/data/data_transformers.py +++ b/presidio-structured/presidio_structured/data/data_processors.py @@ -8,13 +8,13 @@ from presidio_structured.config import StructuredAnalysis -class DataTransformerBase(ABC): +class DataProcessorBase(ABC): """ Abstract base class to handle logic of operations over the text using the operators. 
""" def __init__(self) -> None: - """Initializes DataTransformerBase object.""" + """Initializes DataProcessorBase object.""" pass def operate( @@ -99,7 +99,7 @@ def _operate_on_text( return operator_callable(text_to_operate_on) -class PandasDataTransformer(DataTransformerBase): +class PandasDataProcessor(DataProcessorBase): def _process( self, data: DataFrame, key_to_operator_mapping: Dict[str, Callable] ) -> DataFrame: @@ -124,8 +124,8 @@ def _process( return data -class JsonDataTransformer(DataTransformerBase): - """JSON Data Transformer, Supports arbitrary nesting of dictionaries and lists.""" +class JsonDataProcessor(DataProcessorBase): + """JSON Data Processor, Supports arbitrary nesting of dictionaries and lists.""" @staticmethod def _get_nested_value(data: Union[Dict, List], path: List[str]) -> Any: @@ -142,7 +142,7 @@ def _get_nested_value(data: Union[Dict, List], path: List[str]) -> Any: data = data[int(key)] else: return [ - JsonDataTransformer._get_nested_value(item, path[i:]) + JsonDataProcessor._get_nested_value(item, path[i:]) for item in data ] elif isinstance(data, dict): @@ -169,7 +169,7 @@ def _set_nested_value(data: Union[Dict, List], path: List[str], value: Any) -> N continue else: for item in data: - JsonDataTransformer._set_nested_value(item, path[i:], value) + JsonDataProcessor._set_nested_value(item, path[i:], value) return elif isinstance(data, dict): if i == len(path) - 1: diff --git a/presidio-structured/presidio_structured/structured_engine.py b/presidio-structured/presidio_structured/structured_engine.py index bda14790b..bc0a2f765 100644 --- a/presidio-structured/presidio_structured/structured_engine.py +++ b/presidio-structured/presidio_structured/structured_engine.py @@ -4,7 +4,7 @@ from presidio_anonymizer.entities import OperatorConfig from presidio_structured.config import StructuredAnalysis -from presidio_structured.data.data_transformers import DataTransformerBase +from presidio_structured.data.data_processors import 
DataProcessorBase DEFAULT = "replace" @@ -14,13 +14,13 @@ class StructuredEngine: Class to implement methods for anonymizing tabular data. """ - def __init__(self, data_transformer: DataTransformerBase): + def __init__(self, data_processor: DataProcessorBase): """ - Initialize the class with a data transformer. + Initialize the class with a data processor. - :param data_transformer: Instance of DataTransformerBase. + :param data_processor: Instance of DataProcessorBase. """ - self.data_transformer = data_transformer + self.data_processor = data_processor def anonymize( self, @@ -38,7 +38,7 @@ def anonymize( """ operators = self.__check_or_add_default_operator(operators) - return self.data_transformer.operate(data, structured_analysis, operators) + return self.data_processor.operate(data, structured_analysis, operators) @staticmethod def __check_or_add_default_operator( diff --git a/presidio-structured/sample/example.ipynb b/presidio-structured/sample/example.ipynb index 4b552b797..24e19de19 100644 --- a/presidio-structured/sample/example.ipynb +++ b/presidio-structured/sample/example.ipynb @@ -9,7 +9,7 @@ "import os\n", "import sys\n", "sys.path.append(os.path.abspath(\"..\"))\n", - "from presidio_structured import StructuredEngine, JsonAnalysisBuilder, TabularAnalysisBuilder, StructuredAnalysis, CsvReader, JsonReader, JsonDataTransformer, PandasDataTransformer" + "from presidio_structured import StructuredEngine, JsonAnalysisBuilder, TabularAnalysisBuilder, StructuredAnalysis, CsvReader, JsonReader, JsonDataProcessor, PandasDataProcessor" ] }, { @@ -294,7 +294,7 @@ "source": [ "# anonymized data defaults to be replaced with None, unless operators is specified\n", "\n", - "pandas_engine = StructuredEngine(data_transformer=PandasDataTransformer())\n", + "pandas_engine = StructuredEngine(data_processor=PandasDataProcessor())\n", "df_to_be_anonymized = sample_df.copy() # in-place anonymization\n", "anonymized_df = pandas_engine.anonymize(df_to_be_anonymized, 
tabular_analysis, operators=None) # explicit None for clarity\n", "anonymized_df" @@ -490,7 +490,7 @@ ], "source": [ "# anonymizing simple data\n", - "json_engine = StructuredEngine(data_transformer=JsonDataTransformer())\n", + "json_engine = StructuredEngine(data_processor=JsonDataProcessor())\n", "anonymized_json = json_engine.anonymize(sample_json, json_analysis, operators=operators)\n", "anonymized_json" ] From 39315589c1d490021237ac5ef3af6dc304da6bc9 Mon Sep 17 00:00:00 2001 From: "enrique.botia" Date: Wed, 22 Nov 2023 14:09:54 +0100 Subject: [PATCH 18/52] Add PandasDataProcessor as default to StructuredEngine init --- .../presidio_structured/structured_engine.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/presidio-structured/presidio_structured/structured_engine.py b/presidio-structured/presidio_structured/structured_engine.py index bc0a2f765..3c76f3a55 100644 --- a/presidio-structured/presidio_structured/structured_engine.py +++ b/presidio-structured/presidio_structured/structured_engine.py @@ -4,7 +4,10 @@ from presidio_anonymizer.entities import OperatorConfig from presidio_structured.config import StructuredAnalysis -from presidio_structured.data.data_processors import DataProcessorBase +from presidio_structured.data.data_processors import ( + DataProcessorBase, + PandasDataProcessor, +) DEFAULT = "replace" @@ -14,12 +17,14 @@ class StructuredEngine: Class to implement methods for anonymizing tabular data. """ - def __init__(self, data_processor: DataProcessorBase): + def __init__(self, data_processor: DataProcessorBase = None) -> None: """ Initialize the class with a data processor. :param data_processor: Instance of DataProcessorBase. 
""" + if data_processor is None: + data_processor = PandasDataProcessor() self.data_processor = data_processor def anonymize( From 5977230edbd0a08770b2518fc69dfbad098077ff Mon Sep 17 00:00:00 2001 From: "enrique.botia" Date: Wed, 22 Nov 2023 14:19:05 +0100 Subject: [PATCH 19/52] Move structured sample files to the docs --- .../samples/python/csv_sample_data/test_structured.csv | 0 .../samples/python/example_structured.ipynb | 6 +++--- .../samples/python/sample_data/test_structured.json | 0 .../samples/python/sample_data/test_structured_complex.json | 0 4 files changed, 3 insertions(+), 3 deletions(-) rename presidio-structured/sample/sample_data/test_csv.csv => docs/samples/python/csv_sample_data/test_structured.csv (100%) rename presidio-structured/sample/example.ipynb => docs/samples/python/example_structured.ipynb (98%) rename presidio-structured/sample/sample_data/test_json.json => docs/samples/python/sample_data/test_structured.json (100%) rename presidio-structured/sample/sample_data/test_complex_json.json => docs/samples/python/sample_data/test_structured_complex.json (100%) diff --git a/presidio-structured/sample/sample_data/test_csv.csv b/docs/samples/python/csv_sample_data/test_structured.csv similarity index 100% rename from presidio-structured/sample/sample_data/test_csv.csv rename to docs/samples/python/csv_sample_data/test_structured.csv diff --git a/presidio-structured/sample/example.ipynb b/docs/samples/python/example_structured.ipynb similarity index 98% rename from presidio-structured/sample/example.ipynb rename to docs/samples/python/example_structured.ipynb index 24e19de19..b6ca5b792 100644 --- a/presidio-structured/sample/example.ipynb +++ b/docs/samples/python/example_structured.ipynb @@ -107,7 +107,7 @@ } ], "source": [ - "sample_df = CsvReader().read(\"./sample_data/test_csv.csv\")\n", + "sample_df = CsvReader().read(\"./csv_sample_data/test_structured.csv\")\n", "sample_df" ] }, @@ -134,7 +134,7 @@ } ], "source": [ - "sample_json = 
JsonReader().read(\"./sample_data/test_json.json\")\n", + "sample_json = JsonReader().read(\"./sample_data/test_structured.json\")\n", "sample_json" ] }, @@ -176,7 +176,7 @@ ], "source": [ "# contains nested objects in lists\n", - "sample_complex_json = JsonReader().read(\"./sample_data/test_complex_json.json\")\n", + "sample_complex_json = JsonReader().read(\"./sample_data/test_structured_complex.json\")\n", "sample_complex_json" ] }, diff --git a/presidio-structured/sample/sample_data/test_json.json b/docs/samples/python/sample_data/test_structured.json similarity index 100% rename from presidio-structured/sample/sample_data/test_json.json rename to docs/samples/python/sample_data/test_structured.json diff --git a/presidio-structured/sample/sample_data/test_complex_json.json b/docs/samples/python/sample_data/test_structured_complex.json similarity index 100% rename from presidio-structured/sample/sample_data/test_complex_json.json rename to docs/samples/python/sample_data/test_structured_complex.json From 1770112680681092e14791795e5e13e8748ed090 Mon Sep 17 00:00:00 2001 From: "enrique.botia" Date: Wed, 22 Nov 2023 14:26:25 +0100 Subject: [PATCH 20/52] Add Presidio Structured Notebook to samples index --- docs/samples/index.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/samples/index.md b/docs/samples/index.md index 3d7f462e1..4a113aa54 100644 --- a/docs/samples/index.md +++ b/docs/samples/index.md @@ -14,6 +14,7 @@ | Usage | Images | Python Notebook | [Plot custom bounding boxes](https://github.com/microsoft/presidio/blob/main/docs/samples/python/plot_custom_bboxes.ipynb) | Usage | Text | Python Notebook | [Integrating with external services](https://github.com/microsoft/presidio/blob/main/docs/samples/python/integrating_with_external_services.ipynb) | | Usage | Text | Python file | [Remote Recognizer](https://github.com/microsoft/presidio/blob/main/docs/samples/python/example_remote_recognizer.py) | +| Usage | Structured | Python Notebook | [Presidio 
Structured Basic Usage Notebook](https://github.com/microsoft/presidio/blob/main/docs/samples/python/example_structured.ipynb) | | Usage | Text | Python file | [Azure AI Language as a Remote Recognizer](python/text_analytics/index.md) | | Usage | CSV | Python file | [Analyze and Anonymize CSV file](https://github.com/microsoft/presidio/blob/main/docs/samples/python/process_csv_file.py) | | Usage | Text | Python | [Using Flair as an external PII model](https://github.com/microsoft/presidio/blob/main/docs/samples/python/flair_recognizer.py)| From c202f0c954b49dd4540e3daf6733db53b46f2e14 Mon Sep 17 00:00:00 2001 From: "enrique.botia" Date: Wed, 22 Nov 2023 14:34:58 +0100 Subject: [PATCH 21/52] Remove unnecessary imports in structured sample --- docs/samples/python/example_structured.ipynb | 2 -- 1 file changed, 2 deletions(-) diff --git a/docs/samples/python/example_structured.ipynb b/docs/samples/python/example_structured.ipynb index b6ca5b792..031f0576f 100644 --- a/docs/samples/python/example_structured.ipynb +++ b/docs/samples/python/example_structured.ipynb @@ -7,8 +7,6 @@ "outputs": [], "source": [ "import os\n", - "import sys\n", - "sys.path.append(os.path.abspath(\"..\"))\n", "from presidio_structured import StructuredEngine, JsonAnalysisBuilder, TabularAnalysisBuilder, StructuredAnalysis, CsvReader, JsonReader, JsonDataProcessor, PandasDataProcessor" ] }, From 91f9f6b4c7bb55affd4053f236c669102a230019 Mon Sep 17 00:00:00 2001 From: "enrique.botia" Date: Wed, 22 Nov 2023 14:38:59 +0100 Subject: [PATCH 22/52] Update to processors in structured __init__ files --- presidio-structured/presidio_structured/__init__.py | 6 +++--- presidio-structured/presidio_structured/data/__init__.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/presidio-structured/presidio_structured/__init__.py b/presidio-structured/presidio_structured/__init__.py index 572d9981a..71138e63c 100644 --- a/presidio-structured/presidio_structured/__init__.py +++ 
b/presidio-structured/presidio_structured/__init__.py @@ -1,6 +1,6 @@ from .analysis_builder import JsonAnalysisBuilder, TabularAnalysisBuilder from .config import StructuredAnalysis -from .data import CsvReader, JsonDataTransformer, JsonReader, PandasDataTransformer +from .data import CsvReader, JsonDataProcessor, JsonReader, PandasDataProcessor from .structured_engine import StructuredEngine __all__ = [ @@ -10,6 +10,6 @@ "StructuredAnalysis", "CsvReader", "JsonReader", - "PandasDataTransformer", - "JsonDataTransformer", + "PandasDataProcessor", + "JsonDataProcessor", ] diff --git a/presidio-structured/presidio_structured/data/__init__.py b/presidio-structured/presidio_structured/data/__init__.py index 4b97eb048..b888f9829 100644 --- a/presidio-structured/presidio_structured/data/__init__.py +++ b/presidio-structured/presidio_structured/data/__init__.py @@ -1,9 +1,9 @@ from .data_reader import CsvReader, JsonReader -from .data_transformers import JsonDataTransformer, PandasDataTransformer +from .data_processors import JsonDataProcessor, PandasDataProcessor __all__ = [ "CsvReader", "JsonReader", - "PandasDataTransformer", - "JsonDataTransformer", + "PandasDataProcessor", + "JsonDataProcessor", ] From d71ff88a3b8355fc2e645cd89079441aa3c63f0e Mon Sep 17 00:00:00 2001 From: "enrique.botia" Date: Wed, 22 Nov 2023 14:52:34 +0100 Subject: [PATCH 23/52] Add explanation for structured table sample --- docs/samples/python/example_structured.ipynb | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/docs/samples/python/example_structured.ipynb b/docs/samples/python/example_structured.ipynb index 031f0576f..d7f6caf93 100644 --- a/docs/samples/python/example_structured.ipynb +++ b/docs/samples/python/example_structured.ipynb @@ -109,6 +109,13 @@ "sample_df" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This sample is a structured dataset containing sensitive data like names, emails, and addresses. 
It differs from the sample for the batch analyzer/anonymizer engines example, which includes narrative phrases that might contain sensitive data. The presence of personal data embedded in these phrases requires to analyze and to anonymize the text inside the cells, which is not the case for our structured sample, where the sensitive data is already separated into columns." + ] + }, { "cell_type": "code", "execution_count": 3, From 15e03c38d5a00264e082568205bfede7f6ff9b73 Mon Sep 17 00:00:00 2001 From: "enrique.botia" Date: Wed, 22 Nov 2023 14:55:23 +0100 Subject: [PATCH 24/52] Delete unnecessary __init__s in structured test --- presidio-structured/tests/__init__.py | 1 - presidio-structured/tests/data/__init__.py | 0 2 files changed, 1 deletion(-) delete mode 100644 presidio-structured/tests/__init__.py delete mode 100644 presidio-structured/tests/data/__init__.py diff --git a/presidio-structured/tests/__init__.py b/presidio-structured/tests/__init__.py deleted file mode 100644 index 53dc04c35..000000000 --- a/presidio-structured/tests/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Unit test package for presidio_structured.""" diff --git a/presidio-structured/tests/data/__init__.py b/presidio-structured/tests/data/__init__.py deleted file mode 100644 index e69de29bb..000000000 From 354e223198edc6eb31679d8bd817cbaae87e50e1 Mon Sep 17 00:00:00 2001 From: "enrique.botia" Date: Thu, 23 Nov 2023 09:46:56 +0100 Subject: [PATCH 25/52] Fix bug in JsonAnalysisBuilder entity mapping --- presidio-structured/presidio_structured/analysis_builder.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/presidio-structured/presidio_structured/analysis_builder.py b/presidio-structured/presidio_structured/analysis_builder.py index 75be5997f..424eab96e 100644 --- a/presidio-structured/presidio_structured/analysis_builder.py +++ b/presidio-structured/presidio_structured/analysis_builder.py @@ -71,9 +71,9 @@ def _generate_analysis_from_results_json( 
result.recognizer_results, prefix=current_key + "." ) mappings.update(nested_mappings.entity_mapping) - - if result.recognizer_results > 0: - mappings[current_key] = result.recognizer_results[0].entity_type + first_recognizer_result = next(iter(result.recognizer_results), None) + if first_recognizer_result is not None: + mappings[current_key] = first_recognizer_result.entity_type return StructuredAnalysis(entity_mapping=mappings) From db1f3d8c378f822843cb1d56bbcd7a69fd14610c Mon Sep 17 00:00:00 2001 From: Jakob Serlier <37184788+Jakob-98@users.noreply.github.com> Date: Fri, 24 Nov 2023 16:11:10 +0100 Subject: [PATCH 26/52] pr comments, nits, minor tests --- docs/samples/python/example_structured.ipynb | 43 +++++++++---------- presidio-structured/__init__.py | 5 +-- .../presidio_structured/analysis_builder.py | 2 +- .../presidio_structured/structured_engine.py | 4 +- .../tests/data/test_data_transformers.py | 34 +++++++-------- .../tests/test_tabular_engine.py | 38 +++++++++++----- 6 files changed, 68 insertions(+), 58 deletions(-) diff --git a/docs/samples/python/example_structured.ipynb b/docs/samples/python/example_structured.ipynb index d7f6caf93..f0630a44f 100644 --- a/docs/samples/python/example_structured.ipynb +++ b/docs/samples/python/example_structured.ipynb @@ -6,10 +6,16 @@ "metadata": {}, "outputs": [], "source": [ - "import os\n", "from presidio_structured import StructuredEngine, JsonAnalysisBuilder, TabularAnalysisBuilder, StructuredAnalysis, CsvReader, JsonReader, JsonDataProcessor, PandasDataProcessor" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This sample showcases presidio-structured on structured and semi-structured data containing sensitive data like names, emails, and addresses. It differs from the sample for the batch analyzer/anonymizer engines example, which includes narrative phrases that might contain sensitive data. 
The presence of personal data embedded in these phrases requires to analyze and to anonymize the text inside the cells, which is not the case for our structured sample, where the sensitive data is already separated into columns." + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -109,13 +115,6 @@ "sample_df" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This sample is a structured dataset containing sensitive data like names, emails, and addresses. It differs from the sample for the batch analyzer/anonymizer engines example, which includes narrative phrases that might contain sensitive data. The presence of personal data embedded in these phrases requires to analyze and to anonymize the text inside the cells, which is not the case for our structured sample, where the sensitive data is already separated into columns." - ] - }, { "cell_type": "code", "execution_count": 3, @@ -216,7 +215,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -291,7 +290,7 @@ "2 3 789 Pine St 11223" ] }, - "execution_count": 7, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -352,7 +351,7 @@ " 0\n", " 1\n", " person...\n", - " mccoyryan@example.com\n", + " jamestaylor@example.net\n", " 123 Main St\n", " <None>\n", " <None>\n", @@ -362,7 +361,7 @@ " 1\n", " 2\n", " person...\n", - " harristricia@example.net\n", + " brian49@example.com\n", " 456 Elm St\n", " <None>\n", " <None>\n", @@ -372,7 +371,7 @@ " 2\n", " 3\n", " person...\n", - " thomasmikayla@example.org\n", + " clarkcody@example.org\n", " 789 Pine St\n", " <None>\n", " <None>\n", @@ -383,10 +382,10 @@ "" ], "text/plain": [ - " id name email street city state \\\n", - "0 1 person... mccoyryan@example.com 123 Main St \n", - "1 2 person... harristricia@example.net 456 Elm St \n", - "2 3 person... thomasmikayla@example.org 789 Pine St \n", + " id name email street city state \\\n", + "0 1 person... 
jamestaylor@example.net 123 Main St \n", + "1 2 person... brian49@example.com 456 Elm St \n", + "2 3 person... clarkcody@example.org 789 Pine St \n", "\n", " postal_code \n", "0 12345 \n", @@ -428,7 +427,7 @@ { "data": { "text/plain": [ - "StructuredAnalysis(entity_mapping={'name': 'PERSON', 'email': 'URL', 'address.city': 'LOCATION', 'address.state': 'LOCATION'})" + "StructuredAnalysis(entity_mapping={'name': 'PERSON', 'email': 'EMAIL_ADDRESS', 'address.city': 'LOCATION', 'address.state': 'LOCATION'})" ] }, "execution_count": 8, @@ -481,7 +480,7 @@ "text/plain": [ "{'id': 1,\n", " 'name': 'person...',\n", - " 'email': '',\n", + " 'email': 'virginia29@example.org',\n", " 'address': {'street': '123 Main St',\n", " 'city': '',\n", " 'state': '',\n", @@ -510,21 +509,21 @@ "text/plain": [ "{'users': [{'id': 1,\n", " 'name': 'person...',\n", - " 'email': 'tricia10@example.com',\n", + " 'email': 'david90@example.org',\n", " 'address': {'street': '',\n", " 'city': '',\n", " 'state': '',\n", " 'postal_code': '12345'}},\n", " {'id': 2,\n", " 'name': 'person...',\n", - " 'email': 'tricia10@example.com',\n", + " 'email': 'david90@example.org',\n", " 'address': {'street': '',\n", " 'city': '',\n", " 'state': '',\n", " 'postal_code': '67890'}},\n", " {'id': 3,\n", " 'name': 'person...',\n", - " 'email': 'tricia10@example.com',\n", + " 'email': 'david90@example.org',\n", " 'address': {'street': '',\n", " 'city': '',\n", " 'state': '',\n", diff --git a/presidio-structured/__init__.py b/presidio-structured/__init__.py index a3ec0a887..ed6a669d8 100644 --- a/presidio-structured/__init__.py +++ b/presidio-structured/__init__.py @@ -3,7 +3,4 @@ # Set up default logging (with NullHandler) - -logging.getLogger("presidio-str").addHandler(logging.NullHandler()) - -# __all__ = ["AnonymizerEngine", "DeanonymizeEngine", "BatchAnonymizerEngine"] +logging.getLogger("presidio-structured").addHandler(logging.NullHandler()) diff --git a/presidio-structured/presidio_structured/analysis_builder.py 
b/presidio-structured/presidio_structured/analysis_builder.py index 424eab96e..1955c505e 100644 --- a/presidio-structured/presidio_structured/analysis_builder.py +++ b/presidio-structured/presidio_structured/analysis_builder.py @@ -52,7 +52,7 @@ def _generate_analysis_from_results_json( self, analyzer_results: Iterator[DictAnalyzerResult], prefix: str = "" ) -> StructuredAnalysis: """ - Generate a configuration from the given analyzer results. + Generate a configuration from the given analyzer results. Always uses the first recognizer result if there are more than one. :param analyzer_results: The analyzer results. :param prefix: The prefix for the configuration keys. diff --git a/presidio-structured/presidio_structured/structured_engine.py b/presidio-structured/presidio_structured/structured_engine.py index 3c76f3a55..b36a8f59b 100644 --- a/presidio-structured/presidio_structured/structured_engine.py +++ b/presidio-structured/presidio_structured/structured_engine.py @@ -17,14 +17,12 @@ class StructuredEngine: Class to implement methods for anonymizing tabular data. """ - def __init__(self, data_processor: DataProcessorBase = None) -> None: + def __init__(self, data_processor: DataProcessorBase = PandasDataProcessor()) -> None: """ Initialize the class with a data processor. :param data_processor: Instance of DataProcessorBase. 
""" - if data_processor is None: - data_processor = PandasDataProcessor() self.data_processor = data_processor def anonymize( diff --git a/presidio-structured/tests/data/test_data_transformers.py b/presidio-structured/tests/data/test_data_transformers.py index 0156d77ad..638ed03c9 100644 --- a/presidio-structured/tests/data/test_data_transformers.py +++ b/presidio-structured/tests/data/test_data_transformers.py @@ -1,18 +1,18 @@ import pytest from pandas import DataFrame from presidio_anonymizer.entities import OperatorConfig -from presidio_structured.data.data_transformers import DataTransformerBase, PandasDataTransformer, JsonDataTransformer +from presidio_structured.data.data_processors import DataProcessorBase, PandasDataProcessor, JsonDataProcessor from presidio_structured.config import StructuredAnalysis -class TestDataTransformerBase: +class TestDataProcessorBase: def test_abstract_init_raises(self, sample_df, tabular_analysis_builder, operators): with pytest.raises(TypeError): - DataTransformerBase() + DataProcessorBase() -class TestPandasDataTransformer: +class TestPandasDataProcessor: def test_process(self, sample_df, operators, tabular_analysis): - transformer = PandasDataTransformer() - result = transformer.operate(sample_df, tabular_analysis, operators) + processor = PandasDataProcessor() + result = processor.operate(sample_df, tabular_analysis, operators) assert isinstance(result, DataFrame) for key in tabular_analysis.entity_mapping: if key == 'name': @@ -21,19 +21,19 @@ def test_process(self, sample_df, operators, tabular_analysis): assert all(result[key] == "DEFAULT_REPLACEMENT") def test_process_no_default_should_raise(self, sample_df, operators_no_default, tabular_analysis): - transformer = PandasDataTransformer() + processor = PandasDataProcessor() with pytest.raises(ValueError): - transformer.operate(sample_df, tabular_analysis, operators_no_default) + processor.operate(sample_df, tabular_analysis, operators_no_default) def 
test_process_invalid_data(self, sample_json, tabular_analysis, operators): - transformer = PandasDataTransformer() + processor = PandasDataProcessor() with pytest.raises(ValueError): - transformer.operate(sample_json, tabular_analysis, operators) + processor.operate(sample_json, tabular_analysis, operators) -class TestJsonDataTransformer: +class TestJsonDataProcessor: def test_process(self, sample_json, operators, json_analysis): - transformer = JsonDataTransformer() - result = transformer.operate(sample_json, json_analysis, operators) + processor = JsonDataProcessor() + result = processor.operate(sample_json, json_analysis, operators) assert isinstance(result, dict) for key, value in json_analysis.entity_mapping.items(): keys = key.split(".") @@ -46,11 +46,11 @@ def test_process(self, sample_json, operators, json_analysis): assert nested_value == "DEFAULT_REPLACEMENT" def test_process_no_default_should_raise(self, sample_json, operators_no_default, json_analysis): - transformer = JsonDataTransformer() + processor = JsonDataProcessor() with pytest.raises(ValueError): - transformer.operate(sample_json, json_analysis, operators_no_default) + processor.operate(sample_json, json_analysis, operators_no_default) def test_process_invalid_data(self, sample_df, json_analysis, operators): - transformer = JsonDataTransformer() + processor = JsonDataProcessor() with pytest.raises(ValueError): - transformer.operate(sample_df, json_analysis, operators) \ No newline at end of file + processor.operate(sample_df, json_analysis, operators) \ No newline at end of file diff --git a/presidio-structured/tests/test_tabular_engine.py b/presidio-structured/tests/test_tabular_engine.py index 96d0f9188..4d758a94f 100644 --- a/presidio-structured/tests/test_tabular_engine.py +++ b/presidio-structured/tests/test_tabular_engine.py @@ -1,15 +1,18 @@ from unittest.mock import Mock +import pandas as pd import pytest + from presidio_anonymizer.entities import OperatorConfig from presidio_structured 
import StructuredEngine +from presidio_structured.data.data_processors import JsonDataProcessor -def test_structured_engine_anonymize_calls_data_transformer_operate(): +def test_structured_engine_anonymize_calls_data_processor_operate(): # Arrange - data_transformer = Mock() - structured_engine = StructuredEngine(data_transformer) + data_processor = Mock() + structured_engine = StructuredEngine(data_processor) data = Mock() structured_analysis = Mock() operators = {"DEFAULT": OperatorConfig("replace")} @@ -18,15 +21,15 @@ def test_structured_engine_anonymize_calls_data_transformer_operate(): structured_engine.anonymize(data, structured_analysis, operators) # Assert - data_transformer.operate.assert_called_once_with( + data_processor.operate.assert_called_once_with( data, structured_analysis, operators ) def test_structured_engine_anonymize_adds_default_operator_if_none_provided(): # Arrange - data_transformer = Mock() - structured_engine = StructuredEngine(data_transformer) + data_processor = Mock() + structured_engine = StructuredEngine(data_processor) data = Mock() structured_analysis = Mock() @@ -34,15 +37,15 @@ def test_structured_engine_anonymize_adds_default_operator_if_none_provided(): structured_engine.anonymize(data, structured_analysis) # Assert - data_transformer.operate.assert_called_once() - args, _ = data_transformer.operate.call_args + data_processor.operate.assert_called_once() + args, _ = data_processor.operate.call_args assert "DEFAULT" in args[2] def test_structured_engine_anonymize_does_not_override_existing_default_operator(): # Arrange - data_transformer = Mock() - structured_engine = StructuredEngine(data_transformer) + data_processor = Mock() + structured_engine = StructuredEngine(data_processor) data = Mock() structured_analysis = Mock() operators = {"DEFAULT": OperatorConfig("custom")} @@ -51,6 +54,19 @@ def test_structured_engine_anonymize_does_not_override_existing_default_operator structured_engine.anonymize(data, structured_analysis, 
operators) # Assert - data_transformer.operate.assert_called_once_with( + data_processor.operate.assert_called_once_with( data, structured_analysis, operators ) + +def test_json_processor_with_pandas_dataframe_will_raise(tabular_analysis): + data_processor = JsonDataProcessor() + structured_engine = StructuredEngine(data_processor) + data = pd.DataFrame({"name": ["John", "Jane"]}) + with pytest.raises(ValueError): + structured_engine.anonymize(data, tabular_analysis) + +def test_pandas_processor_with_json_will_raise(json_analysis): + structured_engine = StructuredEngine() # default PandasDataProcessor + data = {"name": ["John", "Jane"]} + with pytest.raises(ValueError): + structured_engine.anonymize(data, json_analysis) \ No newline at end of file From 29f7f8a16f01390ba6b9d800adbe5651eac86162 Mon Sep 17 00:00:00 2001 From: Jakob Serlier <37184788+Jakob-98@users.noreply.github.com> Date: Mon, 27 Nov 2023 11:34:38 +0100 Subject: [PATCH 27/52] README --- presidio-structured/README.md | 96 +++++++++++++++++-- presidio-structured/__init__.py | 6 -- .../presidio_structured/__init__.py | 5 + 3 files changed, 94 insertions(+), 13 deletions(-) delete mode 100644 presidio-structured/__init__.py diff --git a/presidio-structured/README.md b/presidio-structured/README.md index b534e2718..c89c5fb14 100644 --- a/presidio-structured/README.md +++ b/presidio-structured/README.md @@ -6,16 +6,98 @@ ## Description -The Presidio structured package is a flexible and customizable framework designed to identify and protect structured sensitive data. This tool extends the capabilities of Presidio, focusing on structured data formats. +The Presidio structured package is a flexible and customizable framework designed to identify and protect structured sensitive data. This tool extends the capabilities of Presidio, focusing on structured data formats such as tabular formats and semi-structured formats (JSON). 
-## Deploy Presidio analyzer to Azure +## Installation -TODO: [Instructions on deploying the Presidio analyzer to Azure will be here] +### As a Python package: -## Simple usage example +To install the `presidio-structured` package, run the following command: -TODO: [A basic example of how to use the Presidio structured package will be here] +```sh +pip install presidio-structured +``` -## Documentation +#### Getting started -TODO: [Link to the comprehensive documentation, guides, and API references] +Example 1: Anonymizing DataFrames + +```python +import pandas as pd +from presidio_structured import StructuredEngine, TabularAnalysisBuilder +from presidio_anonymizer.entities import OperatorConfig +from faker import Faker # optionally using faker as an example + +# Initialize the engine with a Pandas data processor (default) +pandas_engine = StructuredEngine() + +# Create a sample DataFrame +sample_df = pd.DataFrame({'name': ['John Doe', 'Jane Smith'], 'email': ['john.doe@example.com', 'jane.smith@example.com']}) + +# Generate a tabular analysis which describes PII entities in the DataFrame. 
+tabular_analysis = TabularAnalysisBuilder().generate_analysis(sample_df) + +# Define anonymization operators +fake = Faker() +operators = { + "PERSON": OperatorConfig("replace", {"new_value": "REDACTED"}), + "EMAIL_ADDRESS": OperatorConfig("custom", {"lambda": lambda x: fake.safe_email()}) +} + +# Anonymize DataFrame +anonymized_df = pandas_engine.anonymize(sample_df, tabular_analysis, operators=operators) +print(anonymized_df) +``` + +Example 2: Anonymizing JSON Data + +```python +from presidio_structured import StructuredEngine, JsonAnalysisBuilder, StructuredAnalysis, JsonDataProcessor +from presidio_anonymizer.entities import OperatorConfig +from faker import Faker # optionally using faker as an example + +# Initialize the engine with a JSON data processor +json_engine = StructuredEngine(data_processor=JsonDataProcessor()) + + +# Sample JSON data +sample_json = { + "user": { + "name": "John Doe", + "email": "john.doe@example.com" + } +} + +# Generate analysis for simple JSON data +json_analysis = JsonAnalysisBuilder().generate_analysis(sample_json) + +# Define anonymization operators +fake = Faker() # using faker for email generation. 
+operators = { + "PERSON": OperatorConfig("replace", {"new_value": "REDACTED"}), + "EMAIL_ADDRESS": OperatorConfig("custom", {"lambda": lambda x: fake.safe_email()}) +} + +# Anonymize JSON data +anonymized_json = json_engine.anonymize(sample_json, json_analysis, operators=operators) +print(anonymized_json) + +# Handling Json Data with nested objects in lists +sample_complex_json = { + "users": [ + {"name": "John Doe", "email": "john.doe@example.com"}, + {"name": "Jane Smith", "email": "jane.smith@example.com"} + ] +} + +# Nesting objects in lists is not supported in JsonAnalysisBuilder for now, +# Manually defining the analysis for complex JSON data +json_complex_analysis = StructuredAnalysis(entity_mapping={ + "users.name": "PERSON", + "users.email": "EMAIL_ADDRESS" +}) + +# Anonymize complex JSON data +anonymized_complex_json = json_engine.anonymize(sample_complex_json, json_complex_analysis, operators=operators) +print(anonymized_complex_json) +``` diff --git a/presidio-structured/__init__.py b/presidio-structured/__init__.py deleted file mode 100644 index ed6a669d8..000000000 --- a/presidio-structured/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -"""Anonymizer root module.""" -import logging - -# Set up default logging (with NullHandler) - -logging.getLogger("presidio-structured").addHandler(logging.NullHandler()) diff --git a/presidio-structured/presidio_structured/__init__.py b/presidio-structured/presidio_structured/__init__.py index 71138e63c..80bc83405 100644 --- a/presidio-structured/presidio_structured/__init__.py +++ b/presidio-structured/presidio_structured/__init__.py @@ -1,8 +1,13 @@ +""" presidio-structured root module. 
""" +import logging + from .analysis_builder import JsonAnalysisBuilder, TabularAnalysisBuilder from .config import StructuredAnalysis from .data import CsvReader, JsonDataProcessor, JsonReader, PandasDataProcessor from .structured_engine import StructuredEngine +logging.getLogger("presidio-structured").addHandler(logging.NullHandler()) + __all__ = [ "StructuredEngine", "JsonAnalysisBuilder", From 33182bb53f81f0ad631a0932ade4157ed314008b Mon Sep 17 00:00:00 2001 From: Jakob Serlier <37184788+Jakob-98@users.noreply.github.com> Date: Mon, 27 Nov 2023 11:44:14 +0100 Subject: [PATCH 28/52] Add TabularAnalysisBuilder --- docs/samples/python/example_structured.ipynb | 4 ++-- presidio-structured/README.md | 4 ++-- presidio-structured/presidio_structured/__init__.py | 4 ++-- presidio-structured/presidio_structured/analysis_builder.py | 5 ++++- .../presidio_structured/data/data_processors.py | 1 - presidio-structured/tests/conftest.py | 4 ++-- presidio-structured/tests/test_analysis_builder.py | 2 +- 7 files changed, 13 insertions(+), 11 deletions(-) diff --git a/docs/samples/python/example_structured.ipynb b/docs/samples/python/example_structured.ipynb index f0630a44f..33968e691 100644 --- a/docs/samples/python/example_structured.ipynb +++ b/docs/samples/python/example_structured.ipynb @@ -6,7 +6,7 @@ "metadata": {}, "outputs": [], "source": [ - "from presidio_structured import StructuredEngine, JsonAnalysisBuilder, TabularAnalysisBuilder, StructuredAnalysis, CsvReader, JsonReader, JsonDataProcessor, PandasDataProcessor" + "from presidio_structured import StructuredEngine, JsonAnalysisBuilder, PandasAnalysisBuilder, StructuredAnalysis, CsvReader, JsonReader, JsonDataProcessor, PandasDataProcessor" ] }, { @@ -209,7 +209,7 @@ ], "source": [ "# Automatically detect the entity for the columns\n", - "tabular_analysis = TabularAnalysisBuilder().generate_analysis(sample_df)\n", + "tabular_analysis = PandasAnalysisBuilder().generate_analysis(sample_df)\n", "tabular_analysis" ] }, 
diff --git a/presidio-structured/README.md b/presidio-structured/README.md index c89c5fb14..b80efdfea 100644 --- a/presidio-structured/README.md +++ b/presidio-structured/README.md @@ -24,7 +24,7 @@ Example 1: Anonymizing DataFrames ```python import pandas as pd -from presidio_structured import StructuredEngine, TabularAnalysisBuilder +from presidio_structured import StructuredEngine, PandasAnalysisBuilder from presidio_anonymizer.entities import OperatorConfig from faker import Faker # optionally using faker as an example @@ -35,7 +35,7 @@ pandas_engine = StructuredEngine() sample_df = pd.DataFrame({'name': ['John Doe', 'Jane Smith'], 'email': ['john.doe@example.com', 'jane.smith@example.com']}) # Generate a tabular analysis which describes PII entities in the DataFrame. -tabular_analysis = TabularAnalysisBuilder().generate_analysis(sample_df) +tabular_analysis = PandasAnalysisBuilder().generate_analysis(sample_df) # Define anonymization operators fake = Faker() diff --git a/presidio-structured/presidio_structured/__init__.py b/presidio-structured/presidio_structured/__init__.py index 80bc83405..83242132e 100644 --- a/presidio-structured/presidio_structured/__init__.py +++ b/presidio-structured/presidio_structured/__init__.py @@ -1,7 +1,7 @@ """ presidio-structured root module. 
""" import logging -from .analysis_builder import JsonAnalysisBuilder, TabularAnalysisBuilder +from .analysis_builder import JsonAnalysisBuilder, PandasAnalysisBuilder from .config import StructuredAnalysis from .data import CsvReader, JsonDataProcessor, JsonReader, PandasDataProcessor from .structured_engine import StructuredEngine @@ -11,7 +11,7 @@ __all__ = [ "StructuredEngine", "JsonAnalysisBuilder", - "TabularAnalysisBuilder", + "PandasAnalysisBuilder", "StructuredAnalysis", "CsvReader", "JsonReader", diff --git a/presidio-structured/presidio_structured/analysis_builder.py b/presidio-structured/presidio_structured/analysis_builder.py index 1955c505e..4d2fda978 100644 --- a/presidio-structured/presidio_structured/analysis_builder.py +++ b/presidio-structured/presidio_structured/analysis_builder.py @@ -76,8 +76,11 @@ def _generate_analysis_from_results_json( mappings[current_key] = first_recognizer_result.entity_type return StructuredAnalysis(entity_mapping=mappings) +class TabularAnalysisbuilder(AnalysisBuilder): + """ Placeholder class for generalizing tabular data analysis builders (e.g. PySpark). 
Only implemented as PandasAnalysisBuilder for now.""" + pass -class TabularAnalysisBuilder(AnalysisBuilder): +class PandasAnalysisBuilder(TabularAnalysisbuilder): """Concrete configuration generator for tabular data.""" def generate_analysis( diff --git a/presidio-structured/presidio_structured/data/data_processors.py b/presidio-structured/presidio_structured/data/data_processors.py index d191c6d11..710799b3f 100644 --- a/presidio-structured/presidio_structured/data/data_processors.py +++ b/presidio-structured/presidio_structured/data/data_processors.py @@ -98,7 +98,6 @@ def _operate_on_text( """ return operator_callable(text_to_operate_on) - class PandasDataProcessor(DataProcessorBase): def _process( self, data: DataFrame, key_to_operator_mapping: Dict[str, Callable] diff --git a/presidio-structured/tests/conftest.py b/presidio-structured/tests/conftest.py index e8b42dd33..d896d809e 100644 --- a/presidio-structured/tests/conftest.py +++ b/presidio-structured/tests/conftest.py @@ -1,7 +1,7 @@ import pandas as pd import pytest from presidio_anonymizer.entities import OperatorConfig -from presidio_structured import TabularAnalysisBuilder, JsonAnalysisBuilder +from presidio_structured import PandasAnalysisBuilder, JsonAnalysisBuilder from presidio_structured.config import StructuredAnalysis @@ -48,7 +48,7 @@ def json_analysis_builder(): @pytest.fixture def tabular_analysis_builder(): - return TabularAnalysisBuilder() + return PandasAnalysisBuilder() @pytest.fixture diff --git a/presidio-structured/tests/test_analysis_builder.py b/presidio-structured/tests/test_analysis_builder.py index d7ef97b30..0438d19c5 100644 --- a/presidio-structured/tests/test_analysis_builder.py +++ b/presidio-structured/tests/test_analysis_builder.py @@ -1,6 +1,6 @@ import pandas as pd import pytest -from presidio_structured import TabularAnalysisBuilder, JsonAnalysisBuilder +from presidio_structured import PandasAnalysisBuilder, JsonAnalysisBuilder # NOTE: we won't go into depth unit-testing 
all analyzers, as that is covered in the presidio-analyzer tests From 43c39d861a7587484e6f0ced1aa9fc57f519926e Mon Sep 17 00:00:00 2001 From: Jakob Serlier <37184788+Jakob-98@users.noreply.github.com> Date: Mon, 27 Nov 2023 12:03:24 +0100 Subject: [PATCH 29/52] Some basic logging --- .../presidio_structured/analysis_builder.py | 15 +++++++++++++-- .../presidio_structured/data/data_processors.py | 6 +++++- .../presidio_structured/structured_engine.py | 10 +++++++--- 3 files changed, 25 insertions(+), 6 deletions(-) diff --git a/presidio-structured/presidio_structured/analysis_builder.py b/presidio-structured/presidio_structured/analysis_builder.py index 4d2fda978..d8f88087c 100644 --- a/presidio-structured/presidio_structured/analysis_builder.py +++ b/presidio-structured/presidio_structured/analysis_builder.py @@ -1,7 +1,8 @@ +import logging from abc import ABC, abstractmethod from collections import Counter from collections.abc import Iterable -from typing import Any, Dict, Iterator, Union +from typing import Dict, Iterator, Union from pandas import DataFrame from presidio_analyzer import ( @@ -22,6 +23,7 @@ class AnalysisBuilder(ABC): def __init__(self, analyzer: AnalyzerEngine = None) -> None: """Initialize the configuration generator.""" self.analyzer = AnalyzerEngine() if analyzer is None else analyzer + self.logger = logging.getLogger("presidio-structured") @abstractmethod def generate_analysis(self, data: Union[Dict, DataFrame]) -> StructuredAnalysis: @@ -44,6 +46,7 @@ def generate_analysis(self, data: Dict) -> StructuredAnalysis: :param data: The input JSON data. :return: The generated configuration. 
""" + self.logger.debug("Starting JSON BatchAnalyzer analysis") batch_analyzer = BatchAnalyzerEngine(analyzer_engine=self.analyzer) analyzer_results = batch_analyzer.analyze_dict(input_dict=data, language="en") return self._generate_analysis_from_results_json(analyzer_results) @@ -61,7 +64,8 @@ def _generate_analysis_from_results_json( mappings = {} if not isinstance(analyzer_results, Iterable): - return mappings + self.logger.debug("No analyzer results found, returning empty StructuredAnalysis") + return StructuredAnalysis(entity_mapping=mappings) for result in analyzer_results: current_key = prefix + result.key @@ -73,6 +77,9 @@ def _generate_analysis_from_results_json( mappings.update(nested_mappings.entity_mapping) first_recognizer_result = next(iter(result.recognizer_results), None) if first_recognizer_result is not None: + self.logger.debug( + f"Found entity {first_recognizer_result.entity_type} in {current_key}" + ) mappings[current_key] = first_recognizer_result.entity_type return StructuredAnalysis(entity_mapping=mappings) @@ -95,6 +102,9 @@ def generate_analysis( :return: The generated configuration. 
""" if n > len(df): + self.logger.debug( + f"Number of samples ({n}) is larger than the number of rows ({len(df)}), using all rows" + ) n = len(df) df = df.sample(n) @@ -124,6 +134,7 @@ def _find_most_common_entity( batch_analyzer = BatchAnalyzerEngine(analyzer_engine=self.analyzer) for column in df.columns: + self.logger.debug(f"Finding most common PII entity for column {column}") analyzer_results = batch_analyzer.analyze_iterator( [val for val in df[column]], language=language ) diff --git a/presidio-structured/presidio_structured/data/data_processors.py b/presidio-structured/presidio_structured/data/data_processors.py index 710799b3f..afe88ab2a 100644 --- a/presidio-structured/presidio_structured/data/data_processors.py +++ b/presidio-structured/presidio_structured/data/data_processors.py @@ -1,3 +1,4 @@ +import logging from abc import ABC, abstractmethod from typing import Any, Callable, Dict, List, Union @@ -15,7 +16,7 @@ class DataProcessorBase(ABC): def __init__(self) -> None: """Initializes DataProcessorBase object.""" - pass + self.logger = logging.getLogger("presidio-structured") def operate( self, @@ -70,6 +71,7 @@ def _generate_operator_mapping( operators_factory = OperatorsFactory() for key, entity in config.entity_mapping.items(): + self.logger.debug(f"Creating operator for key {key} and entity {entity}") operator_config = operators.get(entity, operators.get("DEFAULT", None)) if operator_config is None: raise ValueError(f"Operator for entity {entity} not found") @@ -114,6 +116,7 @@ def _process( raise ValueError("Data must be a pandas DataFrame") for key, operator_callable in key_to_operator_mapping.items(): + self.logger.debug(f"Operating on column {key}") for idx, row in data.iterrows(): text_to_operate_on = row[key] operated_text = self._operate_on_text( @@ -190,6 +193,7 @@ def _process( raise ValueError("Data must be a JSON-like object") for key, operator_callable in key_to_operator_mapping.items(): + self.logger.debug(f"Operating on key {key}") 
keys = key.split(".") if isinstance(data, list): for idx, item in enumerate(data): diff --git a/presidio-structured/presidio_structured/structured_engine.py b/presidio-structured/presidio_structured/structured_engine.py index b36a8f59b..f35f063a9 100644 --- a/presidio-structured/presidio_structured/structured_engine.py +++ b/presidio-structured/presidio_structured/structured_engine.py @@ -1,4 +1,5 @@ -from typing import Any, Dict, Union +import logging +from typing import Dict, Union from pandas import DataFrame from presidio_anonymizer.entities import OperatorConfig @@ -24,6 +25,7 @@ def __init__(self, data_processor: DataProcessorBase = PandasDataProcessor()) -> :param data_processor: Instance of DataProcessorBase. """ self.data_processor = data_processor + self.logger = logging.getLogger("presidio-structured") def anonymize( self, @@ -39,12 +41,12 @@ def anonymize( :param operators: a dictionary of operator configurations, optional. :return: Anonymized dictionary or DataFrame. """ + self.loggger.debug("Starting anonymization") operators = self.__check_or_add_default_operator(operators) return self.data_processor.operate(data, structured_analysis, operators) - @staticmethod - def __check_or_add_default_operator( + def __check_or_add_default_operator(self, operators: Dict[str, OperatorConfig] ) -> Dict[str, OperatorConfig]: """ @@ -56,7 +58,9 @@ def __check_or_add_default_operator( """ default_operator = OperatorConfig(DEFAULT) if not operators: + self.logger.debug("No operators provided, using default operator") return {"DEFAULT": default_operator} if not operators.get("DEFAULT"): + self.logger.debug("No default operator provided, using default operator") operators["DEFAULT"] = default_operator return operators From 411f1bd3cd7242ac6faa881358386ebabdd05179 Mon Sep 17 00:00:00 2001 From: Jakob Serlier <37184788+Jakob-98@users.noreply.github.com> Date: Mon, 27 Nov 2023 12:06:09 +0100 Subject: [PATCH 30/52] linting --- .../presidio_structured/analysis_builder.py | 9 
++++-- .../data/data_processors.py | 1 + .../presidio_structured/structured_engine.py | 8 +++-- presidio-structured/setup.py | 1 - presidio-structured/tests/conftest.py | 29 ++++++++++--------- .../tests/data/test_data_transformers.py | 25 +++++++++++----- .../tests/test_analysis_builder.py | 25 ++++++++++++---- .../tests/test_tabular_engine.py | 14 ++++----- 8 files changed, 71 insertions(+), 41 deletions(-) diff --git a/presidio-structured/presidio_structured/analysis_builder.py b/presidio-structured/presidio_structured/analysis_builder.py index d8f88087c..473a06927 100644 --- a/presidio-structured/presidio_structured/analysis_builder.py +++ b/presidio-structured/presidio_structured/analysis_builder.py @@ -64,7 +64,9 @@ def _generate_analysis_from_results_json( mappings = {} if not isinstance(analyzer_results, Iterable): - self.logger.debug("No analyzer results found, returning empty StructuredAnalysis") + self.logger.debug( + "No analyzer results found, returning empty StructuredAnalysis" + ) return StructuredAnalysis(entity_mapping=mappings) for result in analyzer_results: @@ -83,10 +85,13 @@ def _generate_analysis_from_results_json( mappings[current_key] = first_recognizer_result.entity_type return StructuredAnalysis(entity_mapping=mappings) + class TabularAnalysisbuilder(AnalysisBuilder): - """ Placeholder class for generalizing tabular data analysis builders (e.g. PySpark). Only implemented as PandasAnalysisBuilder for now.""" + """Placeholder class for generalizing tabular data analysis builders (e.g. PySpark). 
Only implemented as PandasAnalysisBuilder for now.""" + pass + class PandasAnalysisBuilder(TabularAnalysisbuilder): """Concrete configuration generator for tabular data.""" diff --git a/presidio-structured/presidio_structured/data/data_processors.py b/presidio-structured/presidio_structured/data/data_processors.py index afe88ab2a..d2eaf9fb1 100644 --- a/presidio-structured/presidio_structured/data/data_processors.py +++ b/presidio-structured/presidio_structured/data/data_processors.py @@ -100,6 +100,7 @@ def _operate_on_text( """ return operator_callable(text_to_operate_on) + class PandasDataProcessor(DataProcessorBase): def _process( self, data: DataFrame, key_to_operator_mapping: Dict[str, Callable] diff --git a/presidio-structured/presidio_structured/structured_engine.py b/presidio-structured/presidio_structured/structured_engine.py index f35f063a9..47d367073 100644 --- a/presidio-structured/presidio_structured/structured_engine.py +++ b/presidio-structured/presidio_structured/structured_engine.py @@ -18,7 +18,9 @@ class StructuredEngine: Class to implement methods for anonymizing tabular data. """ - def __init__(self, data_processor: DataProcessorBase = PandasDataProcessor()) -> None: + def __init__( + self, data_processor: DataProcessorBase = PandasDataProcessor() + ) -> None: """ Initialize the class with a data processor. @@ -46,8 +48,8 @@ def anonymize( return self.data_processor.operate(data, structured_analysis, operators) - def __check_or_add_default_operator(self, - operators: Dict[str, OperatorConfig] + def __check_or_add_default_operator( + self, operators: Dict[str, OperatorConfig] ) -> Dict[str, OperatorConfig]: """ Check if the provided operators dictionary has a default operator. 
diff --git a/presidio-structured/setup.py b/presidio-structured/setup.py index d813203e3..2b3a7e04d 100644 --- a/presidio-structured/setup.py +++ b/presidio-structured/setup.py @@ -49,4 +49,3 @@ long_description=long_description, long_description_content_type="text/markdown", ) - diff --git a/presidio-structured/tests/conftest.py b/presidio-structured/tests/conftest.py index d896d809e..c7560b477 100644 --- a/presidio-structured/tests/conftest.py +++ b/presidio-structured/tests/conftest.py @@ -25,19 +25,15 @@ def sample_json(): "street": "123 Main St", "city": "Anytown", "state": "CA", - "postal_code": "12345" - } + "postal_code": "12345", + }, } return data @pytest.fixture def sample_json_with_array(): - data = {'users': - [ - {'id': 1, 'name': 'John Doe'}, - {'id': 2, 'name': 'Jane Doe'} - ]} + data = {"users": [{"id": 1, "name": "John Doe"}, {"id": 2, "name": "Jane Doe"}]} return data @@ -55,28 +51,33 @@ def tabular_analysis_builder(): def operators(): return { "PERSON": OperatorConfig("replace", {"new_value": "PERSON_REPLACEMENT"}), - "DEFAULT": OperatorConfig("replace", {"new_value": "DEFAULT_REPLACEMENT"}) + "DEFAULT": OperatorConfig("replace", {"new_value": "DEFAULT_REPLACEMENT"}), } + @pytest.fixture def operators_no_default(): return { "PERSON": OperatorConfig("replace", {"new_value": "PERSON_REPLACEMENT"}), } + @pytest.fixture def tabular_analysis(): - return StructuredAnalysis(entity_mapping={ + return StructuredAnalysis( + entity_mapping={ "name": "PERSON", "email": "EMAIL_ADDRESS", "phone": "PHONE_NUMBER", - }) + } + ) @pytest.fixture def json_analysis(): - return StructuredAnalysis(entity_mapping={ - "name": "PERSON", - "address.city": "LOCATION", - } + return StructuredAnalysis( + entity_mapping={ + "name": "PERSON", + "address.city": "LOCATION", + } ) diff --git a/presidio-structured/tests/data/test_data_transformers.py b/presidio-structured/tests/data/test_data_transformers.py index 638ed03c9..c9bd365f2 100644 --- 
a/presidio-structured/tests/data/test_data_transformers.py +++ b/presidio-structured/tests/data/test_data_transformers.py @@ -1,26 +1,32 @@ import pytest from pandas import DataFrame -from presidio_anonymizer.entities import OperatorConfig -from presidio_structured.data.data_processors import DataProcessorBase, PandasDataProcessor, JsonDataProcessor -from presidio_structured.config import StructuredAnalysis +from presidio_structured.data.data_processors import ( + DataProcessorBase, + PandasDataProcessor, + JsonDataProcessor, +) + class TestDataProcessorBase: def test_abstract_init_raises(self, sample_df, tabular_analysis_builder, operators): with pytest.raises(TypeError): DataProcessorBase() + class TestPandasDataProcessor: def test_process(self, sample_df, operators, tabular_analysis): processor = PandasDataProcessor() result = processor.operate(sample_df, tabular_analysis, operators) assert isinstance(result, DataFrame) for key in tabular_analysis.entity_mapping: - if key == 'name': + if key == "name": assert all(result[key] == "PERSON_REPLACEMENT") else: assert all(result[key] == "DEFAULT_REPLACEMENT") - def test_process_no_default_should_raise(self, sample_df, operators_no_default, tabular_analysis): + def test_process_no_default_should_raise( + self, sample_df, operators_no_default, tabular_analysis + ): processor = PandasDataProcessor() with pytest.raises(ValueError): processor.operate(sample_df, tabular_analysis, operators_no_default) @@ -30,6 +36,7 @@ def test_process_invalid_data(self, sample_json, tabular_analysis, operators): with pytest.raises(ValueError): processor.operate(sample_json, tabular_analysis, operators) + class TestJsonDataProcessor: def test_process(self, sample_json, operators, json_analysis): processor = JsonDataProcessor() @@ -40,12 +47,14 @@ def test_process(self, sample_json, operators, json_analysis): nested_value = sample_json for inner_key in keys: nested_value = nested_value[inner_key] - if value == 'PERSON': + if value == 
"PERSON": assert nested_value == "PERSON_REPLACEMENT" else: assert nested_value == "DEFAULT_REPLACEMENT" - def test_process_no_default_should_raise(self, sample_json, operators_no_default, json_analysis): + def test_process_no_default_should_raise( + self, sample_json, operators_no_default, json_analysis + ): processor = JsonDataProcessor() with pytest.raises(ValueError): processor.operate(sample_json, json_analysis, operators_no_default) @@ -53,4 +62,4 @@ def test_process_no_default_should_raise(self, sample_json, operators_no_default def test_process_invalid_data(self, sample_df, json_analysis, operators): processor = JsonDataProcessor() with pytest.raises(ValueError): - processor.operate(sample_df, json_analysis, operators) \ No newline at end of file + processor.operate(sample_df, json_analysis, operators) diff --git a/presidio-structured/tests/test_analysis_builder.py b/presidio-structured/tests/test_analysis_builder.py index 0438d19c5..4d1a6a834 100644 --- a/presidio-structured/tests/test_analysis_builder.py +++ b/presidio-structured/tests/test_analysis_builder.py @@ -1,9 +1,9 @@ import pandas as pd import pytest -from presidio_structured import PandasAnalysisBuilder, JsonAnalysisBuilder # NOTE: we won't go into depth unit-testing all analyzers, as that is covered in the presidio-analyzer tests + def test_generate_analysis_tabular(tabular_analysis_builder, sample_df): structured_analysis = tabular_analysis_builder.generate_analysis(sample_df) @@ -11,6 +11,7 @@ def test_generate_analysis_tabular(tabular_analysis_builder, sample_df): assert structured_analysis.entity_mapping["email"] == "EMAIL_ADDRESS" assert structured_analysis.entity_mapping["phone"] == "PHONE_NUMBER" + def test_generate_analysis_tabular_with_sampling(tabular_analysis_builder, sample_df): structured_analysis = tabular_analysis_builder.generate_analysis(sample_df, n=2) @@ -19,35 +20,49 @@ def test_generate_analysis_tabular_with_sampling(tabular_analysis_builder, sampl assert 
structured_analysis.entity_mapping["email"] == "EMAIL_ADDRESS" assert structured_analysis.entity_mapping["phone"] == "PHONE_NUMBER" -def test_generate_analysis_tabular_with_invalid_sampling(tabular_analysis_builder, sample_df): + +def test_generate_analysis_tabular_with_invalid_sampling( + tabular_analysis_builder, sample_df +): with pytest.raises(ValueError): tabular_analysis_builder.generate_analysis(sample_df, n=-1) + def test_find_most_common_entity(tabular_analysis_builder, sample_df): - key_recognizer_result_map = tabular_analysis_builder._find_most_common_entity(sample_df, "en") + key_recognizer_result_map = tabular_analysis_builder._find_most_common_entity( + sample_df, "en" + ) assert len(key_recognizer_result_map) == 3 assert key_recognizer_result_map["name"].entity_type == "PERSON" assert key_recognizer_result_map["email"].entity_type == "EMAIL_ADDRESS" assert key_recognizer_result_map["phone"].entity_type == "PHONE_NUMBER" + def test_find_most_common_entity_with_empty_df(tabular_analysis_builder): df = pd.DataFrame() - key_recognizer_result_map = tabular_analysis_builder._find_most_common_entity(df, "en") + key_recognizer_result_map = tabular_analysis_builder._find_most_common_entity( + df, "en" + ) assert len(key_recognizer_result_map) == 0 + def test_generate_analysis_json(json_analysis_builder, sample_json): structured_analysis = json_analysis_builder.generate_analysis(sample_json) assert structured_analysis.entity_mapping["name"] == "PERSON" assert structured_analysis.entity_mapping["address.city"] == "LOCATION" -def test_generate_analysis_json_with_list_should_raise(json_analysis_builder, sample_json_with_array): + +def test_generate_analysis_json_with_list_should_raise( + json_analysis_builder, sample_json_with_array +): # this feature is not supported by the BatchAnalyzerEngine used in the JsonAnalysisBuilder with pytest.raises(ValueError): json_analysis_builder.generate_analysis(sample_json_with_array) + def 
test_generate_analysis_json_with_empty_data(json_analysis_builder): data = {} structured_analysis = json_analysis_builder.generate_analysis(data) diff --git a/presidio-structured/tests/test_tabular_engine.py b/presidio-structured/tests/test_tabular_engine.py index 4d758a94f..45aca7804 100644 --- a/presidio-structured/tests/test_tabular_engine.py +++ b/presidio-structured/tests/test_tabular_engine.py @@ -21,9 +21,7 @@ def test_structured_engine_anonymize_calls_data_processor_operate(): structured_engine.anonymize(data, structured_analysis, operators) # Assert - data_processor.operate.assert_called_once_with( - data, structured_analysis, operators - ) + data_processor.operate.assert_called_once_with(data, structured_analysis, operators) def test_structured_engine_anonymize_adds_default_operator_if_none_provided(): @@ -54,9 +52,8 @@ def test_structured_engine_anonymize_does_not_override_existing_default_operator structured_engine.anonymize(data, structured_analysis, operators) # Assert - data_processor.operate.assert_called_once_with( - data, structured_analysis, operators - ) + data_processor.operate.assert_called_once_with(data, structured_analysis, operators) + def test_json_processor_with_pandas_dataframe_will_raise(tabular_analysis): data_processor = JsonDataProcessor() @@ -65,8 +62,9 @@ def test_json_processor_with_pandas_dataframe_will_raise(tabular_analysis): with pytest.raises(ValueError): structured_engine.anonymize(data, tabular_analysis) + def test_pandas_processor_with_json_will_raise(json_analysis): - structured_engine = StructuredEngine() # default PandasDataProcessor + structured_engine = StructuredEngine() # default PandasDataProcessor data = {"name": ["John", "Jane"]} with pytest.raises(ValueError): - structured_engine.anonymize(data, json_analysis) \ No newline at end of file + structured_engine.anonymize(data, json_analysis) From e31ff12ca9f5e104a5a3b4b85ba69eb596357f47 Mon Sep 17 00:00:00 2001 From: "enrique.botia" Date: Mon, 27 Nov 2023 13:49:30 
+0100 Subject: [PATCH 31/52] Fix typo in logger variable name --- presidio-structured/presidio_structured/structured_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/presidio-structured/presidio_structured/structured_engine.py b/presidio-structured/presidio_structured/structured_engine.py index 47d367073..d6f0e0369 100644 --- a/presidio-structured/presidio_structured/structured_engine.py +++ b/presidio-structured/presidio_structured/structured_engine.py @@ -43,7 +43,7 @@ def anonymize( :param operators: a dictionary of operator configurations, optional. :return: Anonymized dictionary or DataFrame. """ - self.loggger.debug("Starting anonymization") + self.logger.debug("Starting anonymization") operators = self.__check_or_add_default_operator(operators) return self.data_processor.operate(data, structured_analysis, operators) From bdd7e20595ceb7c3eb60b4bd4c234e3f68d2655a Mon Sep 17 00:00:00 2001 From: "enrique.botia" Date: Mon, 27 Nov 2023 13:49:37 +0100 Subject: [PATCH 32/52] Refactor analysis builder to include score threshold --- .../presidio_structured/analysis_builder.py | 35 +++++++++++++++- .../tests/test_analysis_builder.py | 40 +++++++++++++++++++ 2 files changed, 73 insertions(+), 2 deletions(-) diff --git a/presidio-structured/presidio_structured/analysis_builder.py b/presidio-structured/presidio_structured/analysis_builder.py index 473a06927..542c2363c 100644 --- a/presidio-structured/presidio_structured/analysis_builder.py +++ b/presidio-structured/presidio_structured/analysis_builder.py @@ -2,7 +2,7 @@ from abc import ABC, abstractmethod from collections import Counter from collections.abc import Iterable -from typing import Dict, Iterator, Union +from typing import Dict, Iterator, List, Optional, Union from pandas import DataFrame from presidio_analyzer import ( @@ -96,7 +96,11 @@ class PandasAnalysisBuilder(TabularAnalysisbuilder): """Concrete configuration generator for tabular data.""" def generate_analysis( - self, df: 
DataFrame, n: int = 100, language: str = "en" + self, + df: DataFrame, + n: int = 100, + language: str = "en", + score_threshold: Optional[float] = None, ) -> StructuredAnalysis: """ Generate a configuration from the given tabular data. @@ -116,6 +120,11 @@ def generate_analysis( key_recognizer_result_map = self._find_most_common_entity(df, language) + # Remove low score results + key_recognizer_result_map = self.__remove_low_scores( + key_recognizer_result_map, score_threshold + ) + key_entity_map = { key: result.entity_type for key, result in key_recognizer_result_map.items() @@ -166,3 +175,25 @@ def _find_most_common_entity( most_common_type, 0, 1, average_score ) return key_recognizer_result_map + + def __remove_low_scores( + self, + key_recognizer_result_map: Dict[str, RecognizerResult], + score_threshold: float = None, + ) -> List[RecognizerResult]: + """ + Remove results for which the confidence is lower than the threshold. + + :param results: Dict of column names to RecognizerResult + :param score_threshold: float value for minimum possible confidence + :return: List[RecognizerResult] + """ + if score_threshold is None: + score_threshold = self.analyzer.default_score_threshold + + new_key_recognizer_result_map = {} + for column, result in key_recognizer_result_map.items(): + if result.score >= score_threshold: + new_key_recognizer_result_map[column] = result + + return new_key_recognizer_result_map diff --git a/presidio-structured/tests/test_analysis_builder.py b/presidio-structured/tests/test_analysis_builder.py index 4d1a6a834..201859053 100644 --- a/presidio-structured/tests/test_analysis_builder.py +++ b/presidio-structured/tests/test_analysis_builder.py @@ -1,6 +1,10 @@ import pandas as pd import pytest +from presidio_analyzer import AnalyzerEngine + +from presidio_structured import PandasAnalysisBuilder + # NOTE: we won't go into depth unit-testing all analyzers, as that is covered in the presidio-analyzer tests @@ -48,6 +52,42 @@ def 
test_find_most_common_entity_with_empty_df(tabular_analysis_builder): assert len(key_recognizer_result_map) == 0 +def test_when_threshold_is_zero_then_all_results_pass( + tabular_analysis_builder, sample_df +): + structured_analysis = tabular_analysis_builder.generate_analysis( + sample_df, score_threshold=0 + ) + + assert len(structured_analysis.entity_mapping) == 3 + + +def test_when_threshold_is_half_then_phone_does_not_pass( + tabular_analysis_builder, sample_df +): + structured_analysis = tabular_analysis_builder.generate_analysis( + sample_df, score_threshold=0.5 + ) + + assert len(structured_analysis.entity_mapping) == 2 + + +def test_when_default_threshold_is_half_then_phone_does_not_pass(sample_df): + analyzer_engine = AnalyzerEngine(default_score_threshold=0.5) + tabular_analysis_builder = PandasAnalysisBuilder(analyzer_engine) + structured_analysis = tabular_analysis_builder.generate_analysis(sample_df) + + assert len(structured_analysis.entity_mapping) == 2 + + +def test_when_default_threshold_is_zero_then_all_results_pass( + tabular_analysis_builder, sample_df +): + structured_analysis = tabular_analysis_builder.generate_analysis(sample_df) + + assert len(structured_analysis.entity_mapping) == 3 + + def test_generate_analysis_json(json_analysis_builder, sample_json): structured_analysis = json_analysis_builder.generate_analysis(sample_json) From 15b756f8f42b5541a31c2e9119b27839a8c61b1e Mon Sep 17 00:00:00 2001 From: Jakob Serlier <37184788+Jakob-98@users.noreply.github.com> Date: Mon, 27 Nov 2023 14:22:01 +0100 Subject: [PATCH 33/52] Linting, continued --- .../presidio_structured/__init__.py | 9 +++- .../presidio_structured/analysis_builder.py | 36 ++++++++++------ .../presidio_structured/config/__init__.py | 1 + .../config/structured_analysis.py | 7 ++-- .../presidio_structured/data/__init__.py | 2 + .../data/data_processors.py | 41 +++++++++++++------ .../presidio_structured/data/data_reader.py | 4 +- .../presidio_structured/structured_engine.py | 19 
+++++---- presidio-structured/setup.cfg | 10 +++++ presidio-structured/setup.py | 7 +++- presidio-structured/tests/conftest.py | 27 +++++++++--- .../tests/data/test_data_transformers.py | 16 ++++++-- .../tests/test_analysis_builder.py | 22 ++++++---- .../tests/test_tabular_engine.py | 10 +++-- 14 files changed, 150 insertions(+), 61 deletions(-) create mode 100644 presidio-structured/setup.cfg diff --git a/presidio-structured/presidio_structured/__init__.py b/presidio-structured/presidio_structured/__init__.py index 83242132e..7ad40b67a 100644 --- a/presidio-structured/presidio_structured/__init__.py +++ b/presidio-structured/presidio_structured/__init__.py @@ -1,9 +1,14 @@ -""" presidio-structured root module. """ +"""presidio-structured root module.""" import logging from .analysis_builder import JsonAnalysisBuilder, PandasAnalysisBuilder from .config import StructuredAnalysis -from .data import CsvReader, JsonDataProcessor, JsonReader, PandasDataProcessor +from .data import ( + CsvReader, + JsonDataProcessor, + JsonReader, + PandasDataProcessor, +) from .structured_engine import StructuredEngine logging.getLogger("presidio-structured").addHandler(logging.NullHandler()) diff --git a/presidio-structured/presidio_structured/analysis_builder.py b/presidio-structured/presidio_structured/analysis_builder.py index 473a06927..b679977b9 100644 --- a/presidio-structured/presidio_structured/analysis_builder.py +++ b/presidio-structured/presidio_structured/analysis_builder.py @@ -16,9 +16,7 @@ class AnalysisBuilder(ABC): - """ - Abstract base class for a configuration generator. 
- """ + """Abstract base class for a configuration generator.""" def __init__(self, analyzer: AnalyzerEngine = None) -> None: """Initialize the configuration generator.""" @@ -26,7 +24,9 @@ def __init__(self, analyzer: AnalyzerEngine = None) -> None: self.logger = logging.getLogger("presidio-structured") @abstractmethod - def generate_analysis(self, data: Union[Dict, DataFrame]) -> StructuredAnalysis: + def generate_analysis( + self, data: Union[Dict, DataFrame] + ) -> StructuredAnalysis: """ Abstract method to generate a configuration from the given data. @@ -48,14 +48,17 @@ def generate_analysis(self, data: Dict) -> StructuredAnalysis: """ self.logger.debug("Starting JSON BatchAnalyzer analysis") batch_analyzer = BatchAnalyzerEngine(analyzer_engine=self.analyzer) - analyzer_results = batch_analyzer.analyze_dict(input_dict=data, language="en") + analyzer_results = batch_analyzer.analyze_dict( + input_dict=data, language="en" + ) return self._generate_analysis_from_results_json(analyzer_results) def _generate_analysis_from_results_json( self, analyzer_results: Iterator[DictAnalyzerResult], prefix: str = "" ) -> StructuredAnalysis: """ - Generate a configuration from the given analyzer results. Always uses the first recognizer result if there are more than one. + Generate a configuration from the given analyzer results. \ + Always uses the first recognizer result if there are more than one. :param analyzer_results: The analyzer results. :param prefix: The prefix for the configuration keys. @@ -77,17 +80,21 @@ def _generate_analysis_from_results_json( result.recognizer_results, prefix=current_key + "." 
) mappings.update(nested_mappings.entity_mapping) - first_recognizer_result = next(iter(result.recognizer_results), None) + first_recognizer_result = next( + iter(result.recognizer_results), None + ) if first_recognizer_result is not None: self.logger.debug( - f"Found entity {first_recognizer_result.entity_type} in {current_key}" + f"Found entity {first_recognizer_result.entity_type} \ + in {current_key}" ) mappings[current_key] = first_recognizer_result.entity_type return StructuredAnalysis(entity_mapping=mappings) class TabularAnalysisbuilder(AnalysisBuilder): - """Placeholder class for generalizing tabular data analysis builders (e.g. PySpark). Only implemented as PandasAnalysisBuilder for now.""" + """Placeholder class for generalizing tabular data analysis builders \ + (e.g. PySpark). Only implemented as PandasAnalysisBuilder for now.""" pass @@ -108,13 +115,16 @@ def generate_analysis( """ if n > len(df): self.logger.debug( - f"Number of samples ({n}) is larger than the number of rows ({len(df)}), using all rows" + f"Number of samples ({n}) is larger than the number of rows \ + ({len(df)}), using all rows" ) n = len(df) df = df.sample(n) - key_recognizer_result_map = self._find_most_common_entity(df, language) + key_recognizer_result_map = self._find_most_common_entity( + df, language + ) key_entity_map = { key: result.entity_type @@ -139,7 +149,9 @@ def _find_most_common_entity( batch_analyzer = BatchAnalyzerEngine(analyzer_engine=self.analyzer) for column in df.columns: - self.logger.debug(f"Finding most common PII entity for column {column}") + self.logger.debug( + f"Finding most common PII entity for column {column}" + ) analyzer_results = batch_analyzer.analyze_iterator( [val for val in df[column]], language=language ) diff --git a/presidio-structured/presidio_structured/config/__init__.py b/presidio-structured/presidio_structured/config/__init__.py index 85341c3e5..f7724c726 100644 --- a/presidio-structured/presidio_structured/config/__init__.py +++ 
b/presidio-structured/presidio_structured/config/__init__.py @@ -1,3 +1,4 @@ +"""Config module for presidio-structured.""" from .structured_analysis import StructuredAnalysis __all__ = [ diff --git a/presidio-structured/presidio_structured/config/structured_analysis.py b/presidio-structured/presidio_structured/config/structured_analysis.py index f9a00c519..ca1b75d2c 100644 --- a/presidio-structured/presidio_structured/config/structured_analysis.py +++ b/presidio-structured/presidio_structured/config/structured_analysis.py @@ -1,4 +1,4 @@ -""" Structured Analysis module. """ +"""Structured Analysis module.""" from dataclasses import dataclass from typing import Dict @@ -6,8 +6,9 @@ @dataclass class StructuredAnalysis: - """Dataclass containing entity analysis from structured data. Currently only contains entity mapping.""" + """Dataclass containing entity analysis from structured data.\ + Currently only contains entity mapping.""" entity_mapping: Dict[ str, str - ] # NOTE ideally Literal[...] with allowed EntityTypes, but cannot unpack in Literal. + ] diff --git a/presidio-structured/presidio_structured/data/__init__.py b/presidio-structured/presidio_structured/data/__init__.py index b888f9829..a65a622dd 100644 --- a/presidio-structured/presidio_structured/data/__init__.py +++ b/presidio-structured/presidio_structured/data/__init__.py @@ -1,3 +1,5 @@ +"""Data module.""" + from .data_reader import CsvReader, JsonReader from .data_processors import JsonDataProcessor, PandasDataProcessor diff --git a/presidio-structured/presidio_structured/data/data_processors.py b/presidio-structured/presidio_structured/data/data_processors.py index d2eaf9fb1..05b8b82df 100644 --- a/presidio-structured/presidio_structured/data/data_processors.py +++ b/presidio-structured/presidio_structured/data/data_processors.py @@ -10,12 +10,10 @@ class DataProcessorBase(ABC): - """ - Abstract base class to handle logic of operations over the text using the operators. 
- """ + """Abstract class to handle logic of operations over text using the operators.""" def __init__(self) -> None: - """Initializes DataProcessorBase object.""" + """Initialize DataProcessorBase object.""" self.logger = logging.getLogger("presidio-structured") def operate( @@ -25,7 +23,8 @@ def operate( operators: Dict[str, OperatorConfig], ) -> Any: """ - Performs operations over the text using the operators, as per the structured analysis. + Perform operations over the text using the operators, \ + as per the structured analysis. :param data: Data to be operated on. :param structured_analysis: Analysis schema as per the structured data. @@ -39,7 +38,9 @@ def operate( @abstractmethod def _process( - self, data: Dict | DataFrame, key_to_operator_mapping: Dict[str, Callable] + self, + data: Dict | DataFrame, + key_to_operator_mapping: Dict[str, Callable], ) -> Dict | DataFrame: """ Abstract method for subclasses to provide operation implementation. @@ -71,11 +72,15 @@ def _generate_operator_mapping( operators_factory = OperatorsFactory() for key, entity in config.entity_mapping.items(): - self.logger.debug(f"Creating operator for key {key} and entity {entity}") - operator_config = operators.get(entity, operators.get("DEFAULT", None)) + self.logger.debug( + f"Creating operator for key {key} and entity {entity}" + ) + operator_config = operators.get( + entity, operators.get("DEFAULT", None) + ) if operator_config is None: raise ValueError(f"Operator for entity {entity} not found") - # NOTE: hardcoded OperatorType.Anonymize, as this is the only one supported for now. + # NOTE: hardcoded OperatorType.Anonymize, as this is the only one supported. 
operator = operators_factory.create_operator_class( operator_config.operator_name, OperatorType.Anonymize ) @@ -102,6 +107,8 @@ def _operate_on_text( class PandasDataProcessor(DataProcessorBase): + """Pandas Data Processor.""" + def _process( self, data: DataFrame, key_to_operator_mapping: Dict[str, Callable] ) -> DataFrame: @@ -155,9 +162,12 @@ def _get_nested_value(data: Union[Dict, List], path: List[str]) -> Any: return data @staticmethod - def _set_nested_value(data: Union[Dict, List], path: List[str], value: Any) -> None: + def _set_nested_value( + data: Union[Dict, List], path: List[str], value: Any + ) -> None: """ Recursively sets a value in nested data using a given path. + :param data: Nested data (JSON-like). :param path: List of keys/indexes representing the path. :param value: Value to be set. @@ -172,7 +182,9 @@ def _set_nested_value(data: Union[Dict, List], path: List[str], value: Any) -> N continue else: for item in data: - JsonDataProcessor._set_nested_value(item, path[i:], value) + JsonDataProcessor._set_nested_value( + item, path[i:], value + ) return elif isinstance(data, dict): if i == len(path) - 1: @@ -181,10 +193,13 @@ def _set_nested_value(data: Union[Dict, List], path: List[str], value: Any) -> N data = data.setdefault(key, {}) def _process( - self, data: Union[Dict, List], key_to_operator_mapping: Dict[str, Callable] + self, + data: Union[Dict, List], + key_to_operator_mapping: Dict[str, Callable], ) -> Union[Dict, List]: """ - Operates on the given JSON-like data (nested dictionary/list) based on the provided configuration. + Operates on the given JSON-like data based on the provided configuration. + :param data: JSON-like data to be operated on. :param config: Configuration object containing operator information. :return: JSON-like data after the operation. 
diff --git a/presidio-structured/presidio_structured/data/data_reader.py b/presidio-structured/presidio_structured/data/data_reader.py index 0149f6527..ab1d675a1 100644 --- a/presidio-structured/presidio_structured/data/data_reader.py +++ b/presidio-structured/presidio_structured/data/data_reader.py @@ -1,4 +1,4 @@ -""" Helper data classes, mostly simple wrappers to ensure consistent user interface. """ +"""Helper data classes, mostly simple wrappers to ensure consistent user interface.""" import json from abc import ABC, abstractmethod @@ -12,7 +12,7 @@ class ReaderBase(ABC): """ Base class for data readers. - This class should not be instantiated directly. Instead use or define a reader subclass. + This class should not be instantiated directly, instead init a subclass. """ @abstractmethod diff --git a/presidio-structured/presidio_structured/structured_engine.py b/presidio-structured/presidio_structured/structured_engine.py index 47d367073..aa046693c 100644 --- a/presidio-structured/presidio_structured/structured_engine.py +++ b/presidio-structured/presidio_structured/structured_engine.py @@ -14,9 +14,7 @@ class StructuredEngine: - """ - Class to implement methods for anonymizing tabular data. - """ + """Class to implement methods for anonymizing tabular data.""" def __init__( self, data_processor: DataProcessorBase = PandasDataProcessor() @@ -46,23 +44,28 @@ def anonymize( self.loggger.debug("Starting anonymization") operators = self.__check_or_add_default_operator(operators) - return self.data_processor.operate(data, structured_analysis, operators) + return self.data_processor.operate( + data, structured_analysis, operators + ) def __check_or_add_default_operator( self, operators: Dict[str, OperatorConfig] ) -> Dict[str, OperatorConfig]: """ - Check if the provided operators dictionary has a default operator. - If not, add a default operator. + Check if the provided operators dictionary has a default operator. \ + If not, add a default operator. 
:param operators: dictionary of operator configurations. - :return: operators dictionary with the default operator added if it was not initially present. + :return: operators dictionary with the default operator added \ + if it was not initially present. """ default_operator = OperatorConfig(DEFAULT) if not operators: self.logger.debug("No operators provided, using default operator") return {"DEFAULT": default_operator} if not operators.get("DEFAULT"): - self.logger.debug("No default operator provided, using default operator") + self.logger.debug( + "No default operator provided, using default operator" + ) operators["DEFAULT"] = default_operator return operators diff --git a/presidio-structured/setup.cfg b/presidio-structured/setup.cfg new file mode 100644 index 000000000..732559f8e --- /dev/null +++ b/presidio-structured/setup.cfg @@ -0,0 +1,10 @@ +[flake8] +max-line-length = 88 +exclude = + .git, + __pycache__, + build, + dist, + tests +docstring-convention = numpy +extend-ignore = E203 D100 D202 ANN101 ANN102 ANN204 ANN203 TC \ No newline at end of file diff --git a/presidio-structured/setup.py b/presidio-structured/setup.py index 2b3a7e04d..ab9aeeb80 100644 --- a/presidio-structured/setup.py +++ b/presidio-structured/setup.py @@ -24,7 +24,9 @@ name="presidio_structured", python_requires=">=3.5", version=__version__, - packages=find_packages(include=["presidio_structured", "presidio_structured.*"]), + packages=find_packages( + include=["presidio_structured", "presidio_structured.*"] + ), classifiers=[ "Intended Audience :: Developers", "License :: OSI Approved :: MIT License", @@ -36,7 +38,8 @@ "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", ], - description="Presidio structured package - analyses and anonymizes structured and semistructured data.", + description="Presidio structured package - analyses and anonymizes \ + structured and semistructured data.", license="MIT license", include_package_data=True, 
keywords="presidio_structured", diff --git a/presidio-structured/tests/conftest.py b/presidio-structured/tests/conftest.py index c7560b477..11f78a805 100644 --- a/presidio-structured/tests/conftest.py +++ b/presidio-structured/tests/conftest.py @@ -1,3 +1,5 @@ +""" Pytest fixtures for presidio-structured tests. """ + import pandas as pd import pytest from presidio_anonymizer.entities import OperatorConfig @@ -9,7 +11,11 @@ def sample_df(): data = { "name": ["John Doe", "Jane Doe", "John Smith"], - "email": ["john@example.com", "jane@example.com", "johnsmith@example.com"], + "email": [ + "john@example.com", + "jane@example.com", + "johnsmith@example.com", + ], "phone": ["1234567890", "0987654321", "1122334455"], } return pd.DataFrame(data) @@ -33,7 +39,12 @@ def sample_json(): @pytest.fixture def sample_json_with_array(): - data = {"users": [{"id": 1, "name": "John Doe"}, {"id": 2, "name": "Jane Doe"}]} + data = { + "users": [ + {"id": 1, "name": "John Doe"}, + {"id": 2, "name": "Jane Doe"}, + ] + } return data @@ -50,15 +61,21 @@ def tabular_analysis_builder(): @pytest.fixture def operators(): return { - "PERSON": OperatorConfig("replace", {"new_value": "PERSON_REPLACEMENT"}), - "DEFAULT": OperatorConfig("replace", {"new_value": "DEFAULT_REPLACEMENT"}), + "PERSON": OperatorConfig( + "replace", {"new_value": "PERSON_REPLACEMENT"} + ), + "DEFAULT": OperatorConfig( + "replace", {"new_value": "DEFAULT_REPLACEMENT"} + ), } @pytest.fixture def operators_no_default(): return { - "PERSON": OperatorConfig("replace", {"new_value": "PERSON_REPLACEMENT"}), + "PERSON": OperatorConfig( + "replace", {"new_value": "PERSON_REPLACEMENT"} + ), } diff --git a/presidio-structured/tests/data/test_data_transformers.py b/presidio-structured/tests/data/test_data_transformers.py index c9bd365f2..514569ce9 100644 --- a/presidio-structured/tests/data/test_data_transformers.py +++ b/presidio-structured/tests/data/test_data_transformers.py @@ -8,7 +8,9 @@ class TestDataProcessorBase: - def 
test_abstract_init_raises(self, sample_df, tabular_analysis_builder, operators): + def test_abstract_init_raises( + self, sample_df, tabular_analysis_builder, operators + ): with pytest.raises(TypeError): DataProcessorBase() @@ -29,9 +31,13 @@ def test_process_no_default_should_raise( ): processor = PandasDataProcessor() with pytest.raises(ValueError): - processor.operate(sample_df, tabular_analysis, operators_no_default) + processor.operate( + sample_df, tabular_analysis, operators_no_default + ) - def test_process_invalid_data(self, sample_json, tabular_analysis, operators): + def test_process_invalid_data( + self, sample_json, tabular_analysis, operators + ): processor = PandasDataProcessor() with pytest.raises(ValueError): processor.operate(sample_json, tabular_analysis, operators) @@ -57,7 +63,9 @@ def test_process_no_default_should_raise( ): processor = JsonDataProcessor() with pytest.raises(ValueError): - processor.operate(sample_json, json_analysis, operators_no_default) + processor.operate( + sample_json, json_analysis, operators_no_default + ) def test_process_invalid_data(self, sample_df, json_analysis, operators): processor = JsonDataProcessor() diff --git a/presidio-structured/tests/test_analysis_builder.py b/presidio-structured/tests/test_analysis_builder.py index 4d1a6a834..92ad91720 100644 --- a/presidio-structured/tests/test_analysis_builder.py +++ b/presidio-structured/tests/test_analysis_builder.py @@ -1,3 +1,5 @@ +""" Test the analysis builder """ + import pandas as pd import pytest @@ -5,15 +7,21 @@ def test_generate_analysis_tabular(tabular_analysis_builder, sample_df): - structured_analysis = tabular_analysis_builder.generate_analysis(sample_df) + structured_analysis = tabular_analysis_builder.generate_analysis( + sample_df + ) assert structured_analysis.entity_mapping["name"] == "PERSON" assert structured_analysis.entity_mapping["email"] == "EMAIL_ADDRESS" assert structured_analysis.entity_mapping["phone"] == "PHONE_NUMBER" -def 
test_generate_analysis_tabular_with_sampling(tabular_analysis_builder, sample_df): - structured_analysis = tabular_analysis_builder.generate_analysis(sample_df, n=2) +def test_generate_analysis_tabular_with_sampling( + tabular_analysis_builder, sample_df +): + structured_analysis = tabular_analysis_builder.generate_analysis( + sample_df, n=2 + ) assert len(structured_analysis.entity_mapping) == 3 assert structured_analysis.entity_mapping["name"] == "PERSON" @@ -29,8 +37,8 @@ def test_generate_analysis_tabular_with_invalid_sampling( def test_find_most_common_entity(tabular_analysis_builder, sample_df): - key_recognizer_result_map = tabular_analysis_builder._find_most_common_entity( - sample_df, "en" + key_recognizer_result_map = ( + tabular_analysis_builder._find_most_common_entity(sample_df, "en") ) assert len(key_recognizer_result_map) == 3 @@ -41,8 +49,8 @@ def test_find_most_common_entity(tabular_analysis_builder, sample_df): def test_find_most_common_entity_with_empty_df(tabular_analysis_builder): df = pd.DataFrame() - key_recognizer_result_map = tabular_analysis_builder._find_most_common_entity( - df, "en" + key_recognizer_result_map = ( + tabular_analysis_builder._find_most_common_entity(df, "en") ) assert len(key_recognizer_result_map) == 0 diff --git a/presidio-structured/tests/test_tabular_engine.py b/presidio-structured/tests/test_tabular_engine.py index 45aca7804..3fe02f272 100644 --- a/presidio-structured/tests/test_tabular_engine.py +++ b/presidio-structured/tests/test_tabular_engine.py @@ -21,7 +21,9 @@ def test_structured_engine_anonymize_calls_data_processor_operate(): structured_engine.anonymize(data, structured_analysis, operators) # Assert - data_processor.operate.assert_called_once_with(data, structured_analysis, operators) + data_processor.operate.assert_called_once_with( + data, structured_analysis, operators + ) def test_structured_engine_anonymize_adds_default_operator_if_none_provided(): @@ -40,7 +42,7 @@ def 
test_structured_engine_anonymize_adds_default_operator_if_none_provided(): assert "DEFAULT" in args[2] -def test_structured_engine_anonymize_does_not_override_existing_default_operator(): +def test_structured_engine_anonymize_doesnt_override_existing_default_operator(): # Arrange data_processor = Mock() structured_engine = StructuredEngine(data_processor) @@ -52,7 +54,9 @@ def test_structured_engine_anonymize_does_not_override_existing_default_operator structured_engine.anonymize(data, structured_analysis, operators) # Assert - data_processor.operate.assert_called_once_with(data, structured_analysis, operators) + data_processor.operate.assert_called_once_with( + data, structured_analysis, operators + ) def test_json_processor_with_pandas_dataframe_will_raise(tabular_analysis): From d4e317cfcf76d2e61e5e6f2a5ca0c282c2436dab Mon Sep 17 00:00:00 2001 From: Jakob Serlier <37184788+Jakob-98@users.noreply.github.com> Date: Mon, 27 Nov 2023 14:33:09 +0100 Subject: [PATCH 34/52] Update Pipfile --- presidio-structured/Pipfile | 1 + 1 file changed, 1 insertion(+) diff --git a/presidio-structured/Pipfile b/presidio-structured/Pipfile index bc9ae43e1..4205f8b63 100644 --- a/presidio-structured/Pipfile +++ b/presidio-structured/Pipfile @@ -7,6 +7,7 @@ name = "pypi" flask = ">=1.1" presidio-analyzer = ">=2.2.31" presidio-anonymizer = ">=2.2.31" +pandas = ">=1.5.2" [dev-packages] pytest = "*" From 6513668bfec1d3fe6ec5d8d6e24cf9b5f60b7947 Mon Sep 17 00:00:00 2001 From: "enrique.botia" Date: Mon, 27 Nov 2023 17:49:30 +0100 Subject: [PATCH 35/52] Refactor JsonAnalysisBuilder to support language parameter --- .../presidio_structured/analysis_builder.py | 26 +++++++++---------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/presidio-structured/presidio_structured/analysis_builder.py b/presidio-structured/presidio_structured/analysis_builder.py index c942761cd..80dcc0232 100644 --- a/presidio-structured/presidio_structured/analysis_builder.py +++ 
b/presidio-structured/presidio_structured/analysis_builder.py @@ -25,7 +25,9 @@ def __init__(self, analyzer: AnalyzerEngine = None) -> None: @abstractmethod def generate_analysis( - self, data: Union[Dict, DataFrame] + self, + data: Union[Dict, DataFrame], + language: str = "en", ) -> StructuredAnalysis: """ Abstract method to generate a configuration from the given data. @@ -39,7 +41,11 @@ def generate_analysis( class JsonAnalysisBuilder(AnalysisBuilder): """Concrete configuration generator for JSON data.""" - def generate_analysis(self, data: Dict) -> StructuredAnalysis: + def generate_analysis( + self, + data: Dict, + language: str = "en", + ) -> StructuredAnalysis: """ Generate a configuration from the given JSON data. @@ -48,9 +54,7 @@ def generate_analysis(self, data: Dict) -> StructuredAnalysis: """ self.logger.debug("Starting JSON BatchAnalyzer analysis") batch_analyzer = BatchAnalyzerEngine(analyzer_engine=self.analyzer) - analyzer_results = batch_analyzer.analyze_dict( - input_dict=data, language="en" - ) + analyzer_results = batch_analyzer.analyze_dict(input_dict=data, language=language) return self._generate_analysis_from_results_json(analyzer_results) def _generate_analysis_from_results_json( @@ -80,9 +84,7 @@ def _generate_analysis_from_results_json( result.recognizer_results, prefix=current_key + "." 
) mappings.update(nested_mappings.entity_mapping) - first_recognizer_result = next( - iter(result.recognizer_results), None - ) + first_recognizer_result = next(iter(result.recognizer_results), None) if first_recognizer_result is not None: self.logger.debug( f"Found entity {first_recognizer_result.entity_type} \ @@ -126,9 +128,7 @@ def generate_analysis( df = df.sample(n) - key_recognizer_result_map = self._find_most_common_entity( - df, language - ) + key_recognizer_result_map = self._find_most_common_entity(df, language) # Remove low score results key_recognizer_result_map = self.__remove_low_scores( @@ -158,9 +158,7 @@ def _find_most_common_entity( batch_analyzer = BatchAnalyzerEngine(analyzer_engine=self.analyzer) for column in df.columns: - self.logger.debug( - f"Finding most common PII entity for column {column}" - ) + self.logger.debug(f"Finding most common PII entity for column {column}") analyzer_results = batch_analyzer.analyze_iterator( [val for val in df[column]], language=language ) From df2a4e02115f372191d72d2d7b27e850500c8b2a Mon Sep 17 00:00:00 2001 From: "enrique.botia" Date: Mon, 27 Nov 2023 17:51:18 +0100 Subject: [PATCH 36/52] Fix not camel case in TabularAnalysisBuilder --- presidio-structured/presidio_structured/analysis_builder.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/presidio-structured/presidio_structured/analysis_builder.py b/presidio-structured/presidio_structured/analysis_builder.py index 80dcc0232..1a933bece 100644 --- a/presidio-structured/presidio_structured/analysis_builder.py +++ b/presidio-structured/presidio_structured/analysis_builder.py @@ -94,14 +94,14 @@ def _generate_analysis_from_results_json( return StructuredAnalysis(entity_mapping=mappings) -class TabularAnalysisbuilder(AnalysisBuilder): +class TabularAnalysisBuilder(AnalysisBuilder): """Placeholder class for generalizing tabular data analysis builders \ (e.g. PySpark). 
Only implemented as PandasAnalysisBuilder for now.""" pass -class PandasAnalysisBuilder(TabularAnalysisbuilder): +class PandasAnalysisBuilder(TabularAnalysisBuilder): """Concrete configuration generator for tabular data.""" def generate_analysis( From 75da36a5ecf62110e0b28c6f79ba8966f7cbe0ad Mon Sep 17 00:00:00 2001 From: "enrique.botia" Date: Mon, 27 Nov 2023 17:52:30 +0100 Subject: [PATCH 37/52] Add score_threshold parameter to AnalysisBuilder --- .../presidio_structured/analysis_builder.py | 49 ++++++++++--------- 1 file changed, 25 insertions(+), 24 deletions(-) diff --git a/presidio-structured/presidio_structured/analysis_builder.py b/presidio-structured/presidio_structured/analysis_builder.py index 1a933bece..5e7432db1 100644 --- a/presidio-structured/presidio_structured/analysis_builder.py +++ b/presidio-structured/presidio_structured/analysis_builder.py @@ -28,6 +28,7 @@ def generate_analysis( self, data: Union[Dict, DataFrame], language: str = "en", + score_threshold: Optional[float] = None, ) -> StructuredAnalysis: """ Abstract method to generate a configuration from the given data. @@ -37,6 +38,27 @@ def generate_analysis( """ pass + def _remove_low_scores( + self, + key_recognizer_result_map: Dict[str, RecognizerResult], + score_threshold: float = None, + ) -> List[RecognizerResult]: + """ + Remove results for which the confidence is lower than the threshold. 
+ + :param results: Dict of column names to RecognizerResult + :param score_threshold: float value for minimum possible confidence + :return: List[RecognizerResult] + """ + if score_threshold is None: + score_threshold = self.analyzer.default_score_threshold + + new_key_recognizer_result_map = {} + for column, result in key_recognizer_result_map.items(): + if result.score >= score_threshold: + new_key_recognizer_result_map[column] = result + + return new_key_recognizer_result_map class JsonAnalysisBuilder(AnalysisBuilder): """Concrete configuration generator for JSON data.""" @@ -45,6 +67,7 @@ def generate_analysis( self, data: Dict, language: str = "en", + score_threshold: Optional[float] = None, ) -> StructuredAnalysis: """ Generate a configuration from the given JSON data. @@ -131,7 +154,7 @@ def generate_analysis( key_recognizer_result_map = self._find_most_common_entity(df, language) # Remove low score results - key_recognizer_result_map = self.__remove_low_scores( + key_recognizer_result_map = self._remove_low_scores( key_recognizer_result_map, score_threshold ) @@ -184,26 +207,4 @@ def _find_most_common_entity( key_recognizer_result_map[column] = RecognizerResult( most_common_type, 0, 1, average_score ) - return key_recognizer_result_map - - def __remove_low_scores( - self, - key_recognizer_result_map: Dict[str, RecognizerResult], - score_threshold: float = None, - ) -> List[RecognizerResult]: - """ - Remove results for which the confidence is lower than the threshold. 
- - :param results: Dict of column names to RecognizerResult - :param score_threshold: float value for minimum possible confidence - :return: List[RecognizerResult] - """ - if score_threshold is None: - score_threshold = self.analyzer.default_score_threshold - - new_key_recognizer_result_map = {} - for column, result in key_recognizer_result_map.items(): - if result.score >= score_threshold: - new_key_recognizer_result_map[column] = result - - return new_key_recognizer_result_map + return key_recognizer_result_map \ No newline at end of file From 54fb99cb767d01d5bcea802b0af238dee988786d Mon Sep 17 00:00:00 2001 From: "enrique.botia" Date: Mon, 27 Nov 2023 17:57:37 +0100 Subject: [PATCH 38/52] Refactor JSON analysis builder to gain consistency --- .../presidio_structured/analysis_builder.py | 28 ++++++++++++------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/presidio-structured/presidio_structured/analysis_builder.py b/presidio-structured/presidio_structured/analysis_builder.py index 5e7432db1..89371c265 100644 --- a/presidio-structured/presidio_structured/analysis_builder.py +++ b/presidio-structured/presidio_structured/analysis_builder.py @@ -60,6 +60,7 @@ def _remove_low_scores( return new_key_recognizer_result_map + class JsonAnalysisBuilder(AnalysisBuilder): """Concrete configuration generator for JSON data.""" @@ -77,12 +78,19 @@ def generate_analysis( """ self.logger.debug("Starting JSON BatchAnalyzer analysis") batch_analyzer = BatchAnalyzerEngine(analyzer_engine=self.analyzer) - analyzer_results = batch_analyzer.analyze_dict(input_dict=data, language=language) - return self._generate_analysis_from_results_json(analyzer_results) + analyzer_results = batch_analyzer.analyze_dict( + input_dict=data, language=language + ) + + key_recognizer_result_map = self._generate_analysis_from_results_json( + analyzer_results + ) + + return StructuredAnalysis(entity_mapping=key_recognizer_result_map) def _generate_analysis_from_results_json( self, 
analyzer_results: Iterator[DictAnalyzerResult], prefix: str = "" - ) -> StructuredAnalysis: + ) -> Dict[str, RecognizerResult]: """ Generate a configuration from the given analyzer results. \ Always uses the first recognizer result if there are more than one. @@ -91,13 +99,13 @@ def _generate_analysis_from_results_json( :param prefix: The prefix for the configuration keys. :return: The generated configuration. """ - mappings = {} + key_recognizer_result_map = {} if not isinstance(analyzer_results, Iterable): self.logger.debug( "No analyzer results found, returning empty StructuredAnalysis" ) - return StructuredAnalysis(entity_mapping=mappings) + return key_recognizer_result_map for result in analyzer_results: current_key = prefix + result.key @@ -106,15 +114,15 @@ def _generate_analysis_from_results_json( nested_mappings = self._generate_analysis_from_results_json( result.recognizer_results, prefix=current_key + "." ) - mappings.update(nested_mappings.entity_mapping) + key_recognizer_result_map.update(nested_mappings.entity_mapping) first_recognizer_result = next(iter(result.recognizer_results), None) if first_recognizer_result is not None: self.logger.debug( - f"Found entity {first_recognizer_result.entity_type} \ + f"Found result with entity {first_recognizer_result.entity_type} \ in {current_key}" ) - mappings[current_key] = first_recognizer_result.entity_type - return StructuredAnalysis(entity_mapping=mappings) + key_recognizer_result_map[current_key] = first_recognizer_result + return key_recognizer_result_map class TabularAnalysisBuilder(AnalysisBuilder): @@ -207,4 +215,4 @@ def _find_most_common_entity( key_recognizer_result_map[column] = RecognizerResult( most_common_type, 0, 1, average_score ) - return key_recognizer_result_map \ No newline at end of file + return key_recognizer_result_map From 7fe314afb1dcb71dd6c7ca680c690014c83f3ef8 Mon Sep 17 00:00:00 2001 From: "enrique.botia" Date: Mon, 27 Nov 2023 17:59:30 +0100 Subject: [PATCH 39/52] Remove low 
score results in JsonAnalysisBuilder --- presidio-structured/presidio_structured/analysis_builder.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/presidio-structured/presidio_structured/analysis_builder.py b/presidio-structured/presidio_structured/analysis_builder.py index 89371c265..385ea829d 100644 --- a/presidio-structured/presidio_structured/analysis_builder.py +++ b/presidio-structured/presidio_structured/analysis_builder.py @@ -86,6 +86,11 @@ def generate_analysis( analyzer_results ) + # Remove low score results + key_recognizer_result_map = self._remove_low_scores( + key_recognizer_result_map, score_threshold + ) + return StructuredAnalysis(entity_mapping=key_recognizer_result_map) def _generate_analysis_from_results_json( From c25d82fee481746330be83994d650c955a3d38d3 Mon Sep 17 00:00:00 2001 From: "enrique.botia" Date: Mon, 27 Nov 2023 18:47:46 +0100 Subject: [PATCH 40/52] Add tests to json analysis with score threshold --- .../tests/test_analysis_builder.py | 46 +++++++++++++++++-- 1 file changed, 41 insertions(+), 5 deletions(-) diff --git a/presidio-structured/tests/test_analysis_builder.py b/presidio-structured/tests/test_analysis_builder.py index 37dec3bc5..a272ea1df 100644 --- a/presidio-structured/tests/test_analysis_builder.py +++ b/presidio-structured/tests/test_analysis_builder.py @@ -5,7 +5,7 @@ from presidio_analyzer import AnalyzerEngine -from presidio_structured import PandasAnalysisBuilder +from presidio_structured import JsonAnalysisBuilder, PandasAnalysisBuilder # NOTE: we won't go into depth unit-testing all analyzers, as that is covered in the presidio-analyzer tests @@ -60,7 +60,7 @@ def test_find_most_common_entity_with_empty_df(tabular_analysis_builder): assert len(key_recognizer_result_map) == 0 -def test_when_threshold_is_zero_then_all_results_pass( +def test_analysis_tabular_when_threshold_is_zero_then_all_results_pass( tabular_analysis_builder, sample_df ): structured_analysis = tabular_analysis_builder.generate_analysis( 
@@ -70,7 +70,7 @@ def test_when_threshold_is_zero_then_all_results_pass( assert len(structured_analysis.entity_mapping) == 3 -def test_when_threshold_is_half_then_phone_does_not_pass( +def test_analysis_tabular_when_threshold_is_half_then_phone_does_not_pass( tabular_analysis_builder, sample_df ): structured_analysis = tabular_analysis_builder.generate_analysis( @@ -80,7 +80,7 @@ def test_when_threshold_is_half_then_phone_does_not_pass( assert len(structured_analysis.entity_mapping) == 2 -def test_when_default_threshold_is_half_then_phone_does_not_pass(sample_df): +def test_analysis_tabular_when_default_threshold_is_half_then_phone_does_not_pass(sample_df): analyzer_engine = AnalyzerEngine(default_score_threshold=0.5) tabular_analysis_builder = PandasAnalysisBuilder(analyzer_engine) structured_analysis = tabular_analysis_builder.generate_analysis(sample_df) @@ -88,7 +88,7 @@ def test_when_default_threshold_is_half_then_phone_does_not_pass(sample_df): assert len(structured_analysis.entity_mapping) == 2 -def test_when_default_threshold_is_zero_then_all_results_pass( +def test_analysis_tabular_when_default_threshold_is_zero_then_all_results_pass( tabular_analysis_builder, sample_df ): structured_analysis = tabular_analysis_builder.generate_analysis(sample_df) @@ -116,3 +116,39 @@ def test_generate_analysis_json_with_empty_data(json_analysis_builder): structured_analysis = json_analysis_builder.generate_analysis(data) assert len(structured_analysis.entity_mapping) == 0 + + +def test_analysis_json_when_threshold_is_zero_then_all_results_pass( + json_analysis_builder, sample_json +): + structured_analysis = json_analysis_builder.generate_analysis( + sample_json, score_threshold=0 + ) + + assert len(structured_analysis.entity_mapping) == 4 + + +def test_analysis_tabular_when_threshold_is_half_then_phone_does_not_pass( + json_analysis_builder, sample_json +): + structured_analysis = json_analysis_builder.generate_analysis( + sample_json, score_threshold=0.9 + ) + + assert 
len(structured_analysis.entity_mapping) == 2 + + +def test_analysis_tabular_when_default_threshold_is_half_then_phone_does_not_pass(sample_json): + analyzer_engine = AnalyzerEngine(default_score_threshold=0.9) + json_analysis_builder = JsonAnalysisBuilder(analyzer_engine) + structured_analysis = json_analysis_builder.generate_analysis(sample_json) + + assert len(structured_analysis.entity_mapping) == 2 + + +def test_analysis_tabular_when_default_threshold_is_zero_then_all_results_pass( + json_analysis_builder, sample_json +): + structured_analysis = json_analysis_builder.generate_analysis(sample_json) + + assert len(structured_analysis.entity_mapping) == 4 \ No newline at end of file From 0f3364d61c9dbccb556e2d00fa51bded794080fb Mon Sep 17 00:00:00 2001 From: "enrique.botia" Date: Mon, 27 Nov 2023 18:48:50 +0100 Subject: [PATCH 41/52] Fix bug in JSON analysis to update map with nested_mappings --- presidio-structured/presidio_structured/analysis_builder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/presidio-structured/presidio_structured/analysis_builder.py b/presidio-structured/presidio_structured/analysis_builder.py index 385ea829d..bd6aec421 100644 --- a/presidio-structured/presidio_structured/analysis_builder.py +++ b/presidio-structured/presidio_structured/analysis_builder.py @@ -119,7 +119,7 @@ def _generate_analysis_from_results_json( nested_mappings = self._generate_analysis_from_results_json( result.recognizer_results, prefix=current_key + "." 
) - key_recognizer_result_map.update(nested_mappings.entity_mapping) + key_recognizer_result_map.update(nested_mappings) first_recognizer_result = next(iter(result.recognizer_results), None) if first_recognizer_result is not None: self.logger.debug( From 0d6ebfcb2531fbe48f19b7b01844424c9ea7a5b0 Mon Sep 17 00:00:00 2001 From: "enrique.botia" Date: Mon, 27 Nov 2023 18:50:35 +0100 Subject: [PATCH 42/52] Fix bug in JSON analysis to take only entity types --- presidio-structured/presidio_structured/analysis_builder.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/presidio-structured/presidio_structured/analysis_builder.py b/presidio-structured/presidio_structured/analysis_builder.py index bd6aec421..4c1e7b949 100644 --- a/presidio-structured/presidio_structured/analysis_builder.py +++ b/presidio-structured/presidio_structured/analysis_builder.py @@ -91,7 +91,11 @@ def generate_analysis( key_recognizer_result_map, score_threshold ) - return StructuredAnalysis(entity_mapping=key_recognizer_result_map) + key_entity_map = { + key: result.entity_type for key, result in key_recognizer_result_map.items() + } + + return StructuredAnalysis(entity_mapping=key_entity_map) def _generate_analysis_from_results_json( self, analyzer_results: Iterator[DictAnalyzerResult], prefix: str = "" From 5f60ee52a88c1752b88ec695f8deb591f66c7852 Mon Sep 17 00:00:00 2001 From: "enrique.botia" Date: Mon, 27 Nov 2023 18:56:26 +0100 Subject: [PATCH 43/52] Fix typos in test anl json names and assert values --- presidio-structured/tests/test_analysis_builder.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/presidio-structured/tests/test_analysis_builder.py b/presidio-structured/tests/test_analysis_builder.py index a272ea1df..45db88f21 100644 --- a/presidio-structured/tests/test_analysis_builder.py +++ b/presidio-structured/tests/test_analysis_builder.py @@ -128,25 +128,25 @@ def test_analysis_json_when_threshold_is_zero_then_all_results_pass( assert 
len(structured_analysis.entity_mapping) == 4 -def test_analysis_tabular_when_threshold_is_half_then_phone_does_not_pass( +def test_analysis_json_when_threshold_is_high_then_only_email_passes( json_analysis_builder, sample_json ): structured_analysis = json_analysis_builder.generate_analysis( sample_json, score_threshold=0.9 ) - assert len(structured_analysis.entity_mapping) == 2 + assert len(structured_analysis.entity_mapping) == 1 -def test_analysis_tabular_when_default_threshold_is_half_then_phone_does_not_pass(sample_json): +def test_analysis_json_when_default_threshold_is_high_then_only_email_passes(sample_json): analyzer_engine = AnalyzerEngine(default_score_threshold=0.9) json_analysis_builder = JsonAnalysisBuilder(analyzer_engine) structured_analysis = json_analysis_builder.generate_analysis(sample_json) - assert len(structured_analysis.entity_mapping) == 2 + assert len(structured_analysis.entity_mapping) == 1 -def test_analysis_tabular_when_default_threshold_is_zero_then_all_results_pass( +def test_analysis_json_when_default_threshold_is_zero_then_all_results_pass( json_analysis_builder, sample_json ): structured_analysis = json_analysis_builder.generate_analysis(sample_json) From b94251389fb90116ff340a30184ccc99a739593d Mon Sep 17 00:00:00 2001 From: Jakob Serlier <37184788+Jakob-98@users.noreply.github.com> Date: Tue, 28 Nov 2023 13:13:26 +0100 Subject: [PATCH 44/52] Update build-structured.yml --- .pipelines/templates/build-structured.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.pipelines/templates/build-structured.yml b/.pipelines/templates/build-structured.yml index 13064583c..0f4689aca 100644 --- a/.pipelines/templates/build-structured.yml +++ b/.pipelines/templates/build-structured.yml @@ -16,6 +16,7 @@ steps: workingDirectory: 'presidio-structured' script: | set -eux # fail on error + export PYTHONPATH=. pipenv install --deploy --dev pipenv run pip install -e ../presidio-analyzer/. 
# Use the existing analyzer and not the one in PyPI pipenv run pip install -e ../presidio-anonymizer/. # Use the existing analyzer and not the one in PyPI From f042ffea2b75ba70ea86eeccf919b90e4b9a59b3 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Wed, 29 Nov 2023 10:57:51 +0200 Subject: [PATCH 45/52] Create __init__.py --- presidio-structured/tests/__init__.py | 1 + 1 file changed, 1 insertion(+) create mode 100644 presidio-structured/tests/__init__.py diff --git a/presidio-structured/tests/__init__.py b/presidio-structured/tests/__init__.py new file mode 100644 index 000000000..8b1378917 --- /dev/null +++ b/presidio-structured/tests/__init__.py @@ -0,0 +1 @@ + From 22ee87d911367acaf2ead4faf5b810e4ebfd8097 Mon Sep 17 00:00:00 2001 From: Jakob Serlier <37184788+Jakob-98@users.noreply.github.com> Date: Wed, 29 Nov 2023 11:41:38 +0100 Subject: [PATCH 46/52] Type hint fix python <3.10, loggger typo --- .../presidio_structured/data/data_processors.py | 4 ++-- presidio-structured/presidio_structured/structured_engine.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/presidio-structured/presidio_structured/data/data_processors.py b/presidio-structured/presidio_structured/data/data_processors.py index 05b8b82df..ed1de168f 100644 --- a/presidio-structured/presidio_structured/data/data_processors.py +++ b/presidio-structured/presidio_structured/data/data_processors.py @@ -39,9 +39,9 @@ def operate( @abstractmethod def _process( self, - data: Dict | DataFrame, + data: Union[Dict, DataFrame], key_to_operator_mapping: Dict[str, Callable], - ) -> Dict | DataFrame: + ) -> Union[Dict, DataFrame]: """ Abstract method for subclasses to provide operation implementation. 
diff --git a/presidio-structured/presidio_structured/structured_engine.py b/presidio-structured/presidio_structured/structured_engine.py index aa046693c..db7419d77 100644 --- a/presidio-structured/presidio_structured/structured_engine.py +++ b/presidio-structured/presidio_structured/structured_engine.py @@ -41,7 +41,7 @@ def anonymize( :param operators: a dictionary of operator configurations, optional. :return: Anonymized dictionary or DataFrame. """ - self.loggger.debug("Starting anonymization") + self.logger.debug("Starting anonymization") operators = self.__check_or_add_default_operator(operators) return self.data_processor.operate( From 575498f46600af6b5bf29346208529f0dcaa7a51 Mon Sep 17 00:00:00 2001 From: Jakob Serlier <37184788+Jakob-98@users.noreply.github.com> Date: Wed, 29 Nov 2023 11:49:24 +0100 Subject: [PATCH 47/52] Update setup.py --- presidio-structured/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/presidio-structured/setup.py b/presidio-structured/setup.py index ab9aeeb80..64f4df664 100644 --- a/presidio-structured/setup.py +++ b/presidio-structured/setup.py @@ -11,7 +11,7 @@ this_directory = path.abspath(path.dirname(__file__)) parent_directory = os.path.abspath(os.path.join(this_directory, os.pardir)) -with open(path.join(this_directory, "README.MD"), encoding="utf-8") as f: +with open(path.join(this_directory, "README.md"), encoding="utf-8") as f: long_description = f.read() try: From 4e2bea40b170e1375f6cfb0e09410a6613189b7b Mon Sep 17 00:00:00 2001 From: Jakob Serlier <37184788+Jakob-98@users.noreply.github.com> Date: Tue, 9 Jan 2024 09:13:08 +0100 Subject: [PATCH 48/52] PR comments variety --- .pipelines/templates/build-structured.yml | 1 - .../presidio_structured/analysis_builder.py | 20 ++++++++++--------- .../data/data_processors.py | 6 +++--- 3 files changed, 14 insertions(+), 13 deletions(-) diff --git a/.pipelines/templates/build-structured.yml b/.pipelines/templates/build-structured.yml index 
0f4689aca..13064583c 100644 --- a/.pipelines/templates/build-structured.yml +++ b/.pipelines/templates/build-structured.yml @@ -16,7 +16,6 @@ steps: workingDirectory: 'presidio-structured' script: | set -eux # fail on error - export PYTHONPATH=. pipenv install --deploy --dev pipenv run pip install -e ../presidio-analyzer/. # Use the existing analyzer and not the one in PyPI pipenv run pip install -e ../presidio-anonymizer/. # Use the existing analyzer and not the one in PyPI diff --git a/presidio-structured/presidio_structured/analysis_builder.py b/presidio-structured/presidio_structured/analysis_builder.py index 4c1e7b949..c2a5fe4c1 100644 --- a/presidio-structured/presidio_structured/analysis_builder.py +++ b/presidio-structured/presidio_structured/analysis_builder.py @@ -14,14 +14,16 @@ from presidio_structured.config import StructuredAnalysis +NON_PII_ENTITY_TYPE = "NON_PII" + +logger = logging.getLogger("presidio-structured") class AnalysisBuilder(ABC): """Abstract base class for a configuration generator.""" - def __init__(self, analyzer: AnalyzerEngine = None) -> None: + def __init__(self, analyzer: Optional[AnalyzerEngine] = None) -> None: """Initialize the configuration generator.""" self.analyzer = AnalyzerEngine() if analyzer is None else analyzer - self.logger = logging.getLogger("presidio-structured") @abstractmethod def generate_analysis( @@ -76,7 +78,7 @@ def generate_analysis( :param data: The input JSON data. :return: The generated configuration. 
""" - self.logger.debug("Starting JSON BatchAnalyzer analysis") + logger.debug("Starting JSON BatchAnalyzer analysis") batch_analyzer = BatchAnalyzerEngine(analyzer_engine=self.analyzer) analyzer_results = batch_analyzer.analyze_dict( input_dict=data, language=language @@ -111,7 +113,7 @@ def _generate_analysis_from_results_json( key_recognizer_result_map = {} if not isinstance(analyzer_results, Iterable): - self.logger.debug( + logger.debug( "No analyzer results found, returning empty StructuredAnalysis" ) return key_recognizer_result_map @@ -126,7 +128,7 @@ def _generate_analysis_from_results_json( key_recognizer_result_map.update(nested_mappings) first_recognizer_result = next(iter(result.recognizer_results), None) if first_recognizer_result is not None: - self.logger.debug( + logger.debug( f"Found result with entity {first_recognizer_result.entity_type} \ in {current_key}" ) @@ -160,7 +162,7 @@ def generate_analysis( :return: The generated configuration. """ if n > len(df): - self.logger.debug( + logger.debug( f"Number of samples ({n}) is larger than the number of rows \ ({len(df)}), using all rows" ) @@ -178,7 +180,7 @@ def generate_analysis( key_entity_map = { key: result.entity_type for key, result in key_recognizer_result_map.items() - if result.entity_type != "NON_PII" + if result.entity_type != NON_PII_ENTITY_TYPE } return StructuredAnalysis(entity_mapping=key_entity_map) @@ -198,14 +200,14 @@ def _find_most_common_entity( batch_analyzer = BatchAnalyzerEngine(analyzer_engine=self.analyzer) for column in df.columns: - self.logger.debug(f"Finding most common PII entity for column {column}") + logger.debug(f"Finding most common PII entity for column {column}") analyzer_results = batch_analyzer.analyze_iterator( [val for val in df[column]], language=language ) if all(len(res) == 0 for res in analyzer_results): key_recognizer_result_map[column] = RecognizerResult( - entity_type="NON_PII", start=0, end=1, score=1.0 + entity_type=NON_PII_ENTITY_TYPE, start=0, 
end=1, score=1.0 ) continue # Grabbing most common type diff --git a/presidio-structured/presidio_structured/data/data_processors.py b/presidio-structured/presidio_structured/data/data_processors.py index ed1de168f..e3204a52d 100644 --- a/presidio-structured/presidio_structured/data/data_processors.py +++ b/presidio-structured/presidio_structured/data/data_processors.py @@ -201,7 +201,7 @@ def _process( Operates on the given JSON-like data based on the provided configuration. :param data: JSON-like data to be operated on. - :param config: Configuration object containing operator information. + :param key_to_operator_mapping: maps keys to Callable operators. :return: JSON-like data after the operation. """ @@ -212,11 +212,11 @@ def _process( self.logger.debug(f"Operating on key {key}") keys = key.split(".") if isinstance(data, list): - for idx, item in enumerate(data): + for item in data: self._process(item, key_to_operator_mapping) else: text_to_operate_on = self._get_nested_value(data, keys) - if text_to_operate_on is not None: + if text_to_operate_on: if isinstance(text_to_operate_on, list): for text in text_to_operate_on: operated_text = self._operate_on_text( From 0388c8934b993d7ec500f81a16679d4d9c9d7d40 Mon Sep 17 00:00:00 2001 From: Jakob Serlier <37184788+Jakob-98@users.noreply.github.com> Date: Tue, 9 Jan 2024 10:24:52 +0100 Subject: [PATCH 49/52] further pr comments --- .../presidio_structured/analysis_builder.py | 15 +++++++++++---- .../presidio_structured/data/data_processors.py | 6 +++--- .../presidio_structured/structured_engine.py | 7 ++++--- 3 files changed, 18 insertions(+), 10 deletions(-) diff --git a/presidio-structured/presidio_structured/analysis_builder.py b/presidio-structured/presidio_structured/analysis_builder.py index c2a5fe4c1..216c6a63e 100644 --- a/presidio-structured/presidio_structured/analysis_builder.py +++ b/presidio-structured/presidio_structured/analysis_builder.py @@ -18,6 +18,7 @@ logger = 
logging.getLogger("presidio-structured") + class AnalysisBuilder(ABC): """Abstract base class for a configuration generator.""" @@ -66,6 +67,11 @@ def _remove_low_scores( class JsonAnalysisBuilder(AnalysisBuilder): """Concrete configuration generator for JSON data.""" + def __init__(self, analyzer: Optional[AnalyzerEngine] = None) -> None: + """Initialize the JSON analysis builder.""" + super().__init__(analyzer) + self.batch_analyzer = BatchAnalyzerEngine(analyzer_engine=self.analyzer) + def generate_analysis( self, data: Dict, @@ -79,8 +85,7 @@ def generate_analysis( :return: The generated configuration. """ logger.debug("Starting JSON BatchAnalyzer analysis") - batch_analyzer = BatchAnalyzerEngine(analyzer_engine=self.analyzer) - analyzer_results = batch_analyzer.analyze_dict( + analyzer_results = self.batch_analyzer.analyze_dict( input_dict=data, language=language ) @@ -149,7 +154,7 @@ class PandasAnalysisBuilder(TabularAnalysisBuilder): def generate_analysis( self, df: DataFrame, - n: int = 100, + n: Optional[int] = None, language: str = "en", score_threshold: Optional[float] = None, ) -> StructuredAnalysis: @@ -161,7 +166,9 @@ def generate_analysis( :param language: The language to be used for analysis. :return: The generated configuration. 
""" - if n > len(df): + if not n: + n = len(df) + elif n > len(df): logger.debug( f"Number of samples ({n}) is larger than the number of rows \ ({len(df)}), using all rows" diff --git a/presidio-structured/presidio_structured/data/data_processors.py b/presidio-structured/presidio_structured/data/data_processors.py index e3204a52d..32b99b020 100644 --- a/presidio-structured/presidio_structured/data/data_processors.py +++ b/presidio-structured/presidio_structured/data/data_processors.py @@ -125,12 +125,12 @@ def _process( for key, operator_callable in key_to_operator_mapping.items(): self.logger.debug(f"Operating on column {key}") - for idx, row in data.iterrows(): - text_to_operate_on = row[key] + for row in data.itertuples(index=True): + text_to_operate_on = getattr(row, key) operated_text = self._operate_on_text( text_to_operate_on, operator_callable ) - data.at[idx, key] = operated_text + data.at[row.Index, key] = operated_text return data diff --git a/presidio-structured/presidio_structured/structured_engine.py b/presidio-structured/presidio_structured/structured_engine.py index db7419d77..b0cccda3d 100644 --- a/presidio-structured/presidio_structured/structured_engine.py +++ b/presidio-structured/presidio_structured/structured_engine.py @@ -1,5 +1,5 @@ import logging -from typing import Dict, Union +from typing import Dict, Union, Optional from pandas import DataFrame from presidio_anonymizer.entities import OperatorConfig @@ -17,14 +17,15 @@ class StructuredEngine: """Class to implement methods for anonymizing tabular data.""" def __init__( - self, data_processor: DataProcessorBase = PandasDataProcessor() + self, data_processor: Optional[DataProcessorBase] = None ) -> None: """ Initialize the class with a data processor. :param data_processor: Instance of DataProcessorBase. 
""" - self.data_processor = data_processor + if data_processor is None: + data_processor = PandasDataProcessor() self.logger = logging.getLogger("presidio-structured") def anonymize( From cdc8923834c3b9e0f92c34ad2a8508e377ca51b6 Mon Sep 17 00:00:00 2001 From: Jakob Serlier <37184788+Jakob-98@users.noreply.github.com> Date: Tue, 9 Jan 2024 14:44:47 +0100 Subject: [PATCH 50/52] readme, refactor score, refactor tabular analysis --- presidio-structured/README.md | 2 +- .../presidio_structured/analysis_builder.py | 123 +++++++++++------- .../config/structured_analysis.py | 12 +- .../presidio_structured/structured_engine.py | 5 +- .../tests/test_analysis_builder.py | 59 +-------- 5 files changed, 98 insertions(+), 103 deletions(-) diff --git a/presidio-structured/README.md b/presidio-structured/README.md index b80efdfea..bae17ca3a 100644 --- a/presidio-structured/README.md +++ b/presidio-structured/README.md @@ -6,7 +6,7 @@ ## Description -The Presidio structured package is a flexible and customizable framework designed to identify and protect structured sensitive data. This tool extends the capabilities of Presidio, focusing on structured data formats such as tabular formats and semi-structured formats (JSON). +The Presidio structured package is a flexible and customizable framework designed to identify and protect structured sensitive data. This tool extends the capabilities of Presidio, focusing on structured data formats such as tabular formats and semi-structured formats (JSON). It leverages the detection capabilities of Presidio-Analyzer to identify columns or keys containing personally identifiable information (PII), and establishes a mapping between these column/keys names and the detected PII entities. Following the detection, Presidio-Anonymizer is used to apply de-identification techniques to each value in columns identified as containing PII, ensuring the sensitive data is appropriately protected. 
## Installation diff --git a/presidio-structured/presidio_structured/analysis_builder.py b/presidio-structured/presidio_structured/analysis_builder.py index 216c6a63e..f58d8bce8 100644 --- a/presidio-structured/presidio_structured/analysis_builder.py +++ b/presidio-structured/presidio_structured/analysis_builder.py @@ -22,9 +22,15 @@ class AnalysisBuilder(ABC): """Abstract base class for a configuration generator.""" - def __init__(self, analyzer: Optional[AnalyzerEngine] = None) -> None: + def __init__(self, analyzer: Optional[AnalyzerEngine] = None, \ + analyzer_score_threshold: Optional[float] = None) -> None: """Initialize the configuration generator.""" - self.analyzer = AnalyzerEngine() if analyzer is None else analyzer + default_score_threshold = analyzer_score_threshold if \ + analyzer_score_threshold is not None else 0 + self.analyzer = \ + AnalyzerEngine(default_score_threshold = default_score_threshold) \ + if analyzer is None else analyzer + self.batch_analyzer = BatchAnalyzerEngine(analyzer_engine=self.analyzer) @abstractmethod def generate_analysis( @@ -46,6 +52,8 @@ def _remove_low_scores( key_recognizer_result_map: Dict[str, RecognizerResult], score_threshold: float = None, ) -> List[RecognizerResult]: + #FIXME: score threshold between anaylzer engine (Top level) and analysis builder (lower level) are different in the way they work. + # it should never be used in here, but always in analyzer engine.... """ Remove results for which the confidence is lower than the threshold. 
@@ -67,16 +75,10 @@ def _remove_low_scores( class JsonAnalysisBuilder(AnalysisBuilder): """Concrete configuration generator for JSON data.""" - def __init__(self, analyzer: Optional[AnalyzerEngine] = None) -> None: - """Initialize the JSON analysis builder.""" - super().__init__(analyzer) - self.batch_analyzer = BatchAnalyzerEngine(analyzer_engine=self.analyzer) - def generate_analysis( self, data: Dict, language: str = "en", - score_threshold: Optional[float] = None, ) -> StructuredAnalysis: """ Generate a configuration from the given JSON data. @@ -93,10 +95,6 @@ def generate_analysis( analyzer_results ) - # Remove low score results - key_recognizer_result_map = self._remove_low_scores( - key_recognizer_result_map, score_threshold - ) key_entity_map = { key: result.entity_type for key, result in key_recognizer_result_map.items() @@ -156,7 +154,6 @@ def generate_analysis( df: DataFrame, n: Optional[int] = None, language: str = "en", - score_threshold: Optional[float] = None, ) -> StructuredAnalysis: """ Generate a configuration from the given tabular data. @@ -164,7 +161,7 @@ def generate_analysis( :param df: The input tabular data (dataframe). :param n: The number of samples to be taken from the dataframe. :param language: The language to be used for analysis. - :return: The generated configuration. + :return: A StructuredAnalysis object containing the analysis results. 
""" if not n: n = len(df) @@ -175,14 +172,9 @@ def generate_analysis( ) n = len(df) - df = df.sample(n) + df = df.sample(n, random_state=123) - key_recognizer_result_map = self._find_most_common_entity(df, language) - - # Remove low score results - key_recognizer_result_map = self._remove_low_scores( - key_recognizer_result_map, score_threshold - ) + key_recognizer_result_map = self._generate_key_rec_results_map(df, language) key_entity_map = { key: result.entity_type @@ -192,45 +184,86 @@ def generate_analysis( return StructuredAnalysis(entity_mapping=key_entity_map) - def _find_most_common_entity( + def _generate_key_rec_results_map( self, df: DataFrame, language: str ) -> Dict[str, RecognizerResult]: """ Find the most common entity in a dataframe column. + If more than one entity is found in a cell, the first one is used. + :param df: The dataframe where entities will be searched. :param language: Language to be used in the analysis engine. :return: A dictionary mapping column names to the most common RecognizerResult. """ + column_analyzer_results_map = self._batch_analyze_df(df, language) key_recognizer_result_map = {} + for column, analyzer_result in column_analyzer_results_map.items(): + key_recognizer_result_map[column] = self._find_most_common_entity( + analyzer_result + ) + return key_recognizer_result_map - batch_analyzer = BatchAnalyzerEngine(analyzer_engine=self.analyzer) + def _batch_analyze_df(self, df: DataFrame, language: str) \ + -> Dict[str, List[List[RecognizerResult]]]: + """ + Analyze each column in the dataframe for entities using the batch analyzer. + :param df: The dataframe to be analyzed. + :param language: The language configuration for the analyzer. + :return: A dictionary mapping each column name to a \ + list of lists of RecognizerResults. 
+ """ + column_analyzer_results_map = {} for column in df.columns: logger.debug(f"Finding most common PII entity for column {column}") - analyzer_results = batch_analyzer.analyze_iterator( + analyzer_results = self.batch_analyzer.analyze_iterator( [val for val in df[column]], language=language ) + column_analyzer_results_map[column] = analyzer_results - if all(len(res) == 0 for res in analyzer_results): - key_recognizer_result_map[column] = RecognizerResult( - entity_type=NON_PII_ENTITY_TYPE, start=0, end=1, score=1.0 - ) - continue - # Grabbing most common type - types_list = [ - res[0].entity_type for res in analyzer_results if len(res) > 0 - ] - type_counter = Counter(types_list) - most_common_type = type_counter.most_common(1)[0][0] - # Grabbing the average confidence score for the most common type. - scores = [ - res[0].score - for res in analyzer_results - if len(res) > 0 and res[0].entity_type == most_common_type - ] - average_score = sum(scores) / len(scores) if scores else 0.0 - key_recognizer_result_map[column] = RecognizerResult( - most_common_type, 0, 1, average_score + return column_analyzer_results_map + + def _find_most_common_entity( + self, analyzer_results: List[List[RecognizerResult]] + ) -> RecognizerResult: + """ + Find the most common entity in a list of analyzer results for \ + a dataframe column. + + It takes the most common entity type and calculates the confidence score based + on the number of cells it appears in. + + :param analyzer_results: List of lists of RecognizerResults for each \ + cell in the column. + :return: A RecognizerResult with the most common entity type and the \ + calculated confidence score. 
+ """ + + if not any(analyzer_results): + return RecognizerResult( + entity_type=NON_PII_ENTITY_TYPE, start=0, end=1, score=1.0 ) - return key_recognizer_result_map + + # Flatten the list of lists while keeping track of the cell index + flat_results = [ + (cell_idx, res) + for cell_idx, cell_results in enumerate(analyzer_results) + for res in cell_results + ] + + # Count the occurrences of each entity type in different cells + type_counter = Counter( + res.entity_type for cell_idx, res in flat_results + ) + + # Find the most common entity type based on the number of cells it appears in + most_common_type, _ = type_counter.most_common(1)[0] + + # The score is the ratio of the most common entity type's count to the total + most_common_count = type_counter[most_common_type] + score = most_common_count / len(analyzer_results) + + return RecognizerResult( + entity_type=most_common_type, start=0, end=1, score=score + ) \ No newline at end of file diff --git a/presidio-structured/presidio_structured/config/structured_analysis.py b/presidio-structured/presidio_structured/config/structured_analysis.py index ca1b75d2c..b1480f758 100644 --- a/presidio-structured/presidio_structured/config/structured_analysis.py +++ b/presidio-structured/presidio_structured/config/structured_analysis.py @@ -6,8 +6,16 @@ @dataclass class StructuredAnalysis: - """Dataclass containing entity analysis from structured data.\ - Currently only contains entity mapping.""" + """ + Dataclass containing entity analysis from structured data. + + Currently, this class only contains entity mapping. + + param entity_mapping : dict. 
Mapping column/key names to entity types, e.g., { + "person.name": "PERSON", + "person.address": "LOCATION" + } + """ entity_mapping: Dict[ str, str diff --git a/presidio-structured/presidio_structured/structured_engine.py b/presidio-structured/presidio_structured/structured_engine.py index b0cccda3d..7a5cb9a72 100644 --- a/presidio-structured/presidio_structured/structured_engine.py +++ b/presidio-structured/presidio_structured/structured_engine.py @@ -25,7 +25,10 @@ def __init__( :param data_processor: Instance of DataProcessorBase. """ if data_processor is None: - data_processor = PandasDataProcessor() + self.data_processor = PandasDataProcessor() + else: + self.data_processor = data_processor + self.logger = logging.getLogger("presidio-structured") def anonymize( diff --git a/presidio-structured/tests/test_analysis_builder.py b/presidio-structured/tests/test_analysis_builder.py index 45db88f21..8666bbe9f 100644 --- a/presidio-structured/tests/test_analysis_builder.py +++ b/presidio-structured/tests/test_analysis_builder.py @@ -42,7 +42,7 @@ def test_generate_analysis_tabular_with_invalid_sampling( def test_find_most_common_entity(tabular_analysis_builder, sample_df): key_recognizer_result_map = ( - tabular_analysis_builder._find_most_common_entity(sample_df, "en") + tabular_analysis_builder._generate_key_rec_results_map(sample_df, "en") ) assert len(key_recognizer_result_map) == 3 @@ -54,32 +54,12 @@ def test_find_most_common_entity(tabular_analysis_builder, sample_df): def test_find_most_common_entity_with_empty_df(tabular_analysis_builder): df = pd.DataFrame() key_recognizer_result_map = ( - tabular_analysis_builder._find_most_common_entity(df, "en") + tabular_analysis_builder._generate_key_rec_results_map(df, "en") ) assert len(key_recognizer_result_map) == 0 -def test_analysis_tabular_when_threshold_is_zero_then_all_results_pass( - tabular_analysis_builder, sample_df -): - structured_analysis = tabular_analysis_builder.generate_analysis( - sample_df, 
score_threshold=0 - ) - - assert len(structured_analysis.entity_mapping) == 3 - - -def test_analysis_tabular_when_threshold_is_half_then_phone_does_not_pass( - tabular_analysis_builder, sample_df -): - structured_analysis = tabular_analysis_builder.generate_analysis( - sample_df, score_threshold=0.5 - ) - - assert len(structured_analysis.entity_mapping) == 2 - - def test_analysis_tabular_when_default_threshold_is_half_then_phone_does_not_pass(sample_df): analyzer_engine = AnalyzerEngine(default_score_threshold=0.5) tabular_analysis_builder = PandasAnalysisBuilder(analyzer_engine) @@ -87,10 +67,9 @@ def test_analysis_tabular_when_default_threshold_is_half_then_phone_does_not_pas assert len(structured_analysis.entity_mapping) == 2 - -def test_analysis_tabular_when_default_threshold_is_zero_then_all_results_pass( - tabular_analysis_builder, sample_df -): +def test_analysis_tabular_when_default_threshold_is_zero_then_all_results_pass(sample_df): + analyzer_engine = AnalyzerEngine(default_score_threshold=0.5) + tabular_analysis_builder = PandasAnalysisBuilder(analyzer_engine) structured_analysis = tabular_analysis_builder.generate_analysis(sample_df) assert len(structured_analysis.entity_mapping) == 3 @@ -118,37 +97,9 @@ def test_generate_analysis_json_with_empty_data(json_analysis_builder): assert len(structured_analysis.entity_mapping) == 0 -def test_analysis_json_when_threshold_is_zero_then_all_results_pass( - json_analysis_builder, sample_json -): - structured_analysis = json_analysis_builder.generate_analysis( - sample_json, score_threshold=0 - ) - - assert len(structured_analysis.entity_mapping) == 4 - - -def test_analysis_json_when_threshold_is_high_then_only_email_passes( - json_analysis_builder, sample_json -): - structured_analysis = json_analysis_builder.generate_analysis( - sample_json, score_threshold=0.9 - ) - - assert len(structured_analysis.entity_mapping) == 1 - - def test_analysis_json_when_default_threshold_is_high_then_only_email_passes(sample_json): 
analyzer_engine = AnalyzerEngine(default_score_threshold=0.9) json_analysis_builder = JsonAnalysisBuilder(analyzer_engine) structured_analysis = json_analysis_builder.generate_analysis(sample_json) assert len(structured_analysis.entity_mapping) == 1 - - -def test_analysis_json_when_default_threshold_is_zero_then_all_results_pass( - json_analysis_builder, sample_json -): - structured_analysis = json_analysis_builder.generate_analysis(sample_json) - - assert len(structured_analysis.entity_mapping) == 4 \ No newline at end of file From 6985aa70e5560a470ed944b9306c1c69e9364e63 Mon Sep 17 00:00:00 2001 From: Jakob Serlier <37184788+Jakob-98@users.noreply.github.com> Date: Tue, 9 Jan 2024 14:45:54 +0100 Subject: [PATCH 51/52] Update test_analysis_builder.py --- presidio-structured/tests/test_analysis_builder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/presidio-structured/tests/test_analysis_builder.py b/presidio-structured/tests/test_analysis_builder.py index 8666bbe9f..8479e5e2b 100644 --- a/presidio-structured/tests/test_analysis_builder.py +++ b/presidio-structured/tests/test_analysis_builder.py @@ -68,7 +68,7 @@ def test_analysis_tabular_when_default_threshold_is_half_then_phone_does_not_pas assert len(structured_analysis.entity_mapping) == 2 def test_analysis_tabular_when_default_threshold_is_zero_then_all_results_pass(sample_df): - analyzer_engine = AnalyzerEngine(default_score_threshold=0.5) + analyzer_engine = AnalyzerEngine(default_score_threshold=0) tabular_analysis_builder = PandasAnalysisBuilder(analyzer_engine) structured_analysis = tabular_analysis_builder.generate_analysis(sample_df) From 0a8778383382d2dfacb5d8fb65046a1261d1aa95 Mon Sep 17 00:00:00 2001 From: Jakob Serlier <37184788+Jakob-98@users.noreply.github.com> Date: Wed, 10 Jan 2024 10:16:06 +0100 Subject: [PATCH 52/52] lint --- .../presidio_structured/analysis_builder.py | 34 ++++++++++--------- .../config/structured_analysis.py | 4 +-- .../data/data_processors.py | 18 
+++------- .../presidio_structured/structured_engine.py | 12 ++----- presidio-structured/setup.py | 4 +-- presidio-structured/tests/conftest.py | 12 ++----- .../tests/data/test_data_transformers.py | 16 +++------ .../tests/test_analysis_builder.py | 33 +++++++++--------- .../tests/test_tabular_engine.py | 8 ++--- 9 files changed, 54 insertions(+), 87 deletions(-) diff --git a/presidio-structured/presidio_structured/analysis_builder.py b/presidio-structured/presidio_structured/analysis_builder.py index f58d8bce8..b2db7ef6f 100644 --- a/presidio-structured/presidio_structured/analysis_builder.py +++ b/presidio-structured/presidio_structured/analysis_builder.py @@ -22,14 +22,20 @@ class AnalysisBuilder(ABC): """Abstract base class for a configuration generator.""" - def __init__(self, analyzer: Optional[AnalyzerEngine] = None, \ - analyzer_score_threshold: Optional[float] = None) -> None: + def __init__( + self, + analyzer: Optional[AnalyzerEngine] = None, + analyzer_score_threshold: Optional[float] = None, + ) -> None: """Initialize the configuration generator.""" - default_score_threshold = analyzer_score_threshold if \ - analyzer_score_threshold is not None else 0 - self.analyzer = \ - AnalyzerEngine(default_score_threshold = default_score_threshold) \ - if analyzer is None else analyzer + default_score_threshold = ( + analyzer_score_threshold if analyzer_score_threshold is not None else 0 + ) + self.analyzer = ( + AnalyzerEngine(default_score_threshold=default_score_threshold) + if analyzer is None + else analyzer + ) self.batch_analyzer = BatchAnalyzerEngine(analyzer_engine=self.analyzer) @abstractmethod @@ -52,8 +58,6 @@ def _remove_low_scores( key_recognizer_result_map: Dict[str, RecognizerResult], score_threshold: float = None, ) -> List[RecognizerResult]: - #FIXME: score threshold between anaylzer engine (Top level) and analysis builder (lower level) are different in the way they work. - # it should never be used in here, but always in analyzer engine.... 
""" Remove results for which the confidence is lower than the threshold. @@ -95,7 +99,6 @@ def generate_analysis( analyzer_results ) - key_entity_map = { key: result.entity_type for key, result in key_recognizer_result_map.items() } @@ -204,8 +207,9 @@ def _generate_key_rec_results_map( ) return key_recognizer_result_map - def _batch_analyze_df(self, df: DataFrame, language: str) \ - -> Dict[str, List[List[RecognizerResult]]]: + def _batch_analyze_df( + self, df: DataFrame, language: str + ) -> Dict[str, List[List[RecognizerResult]]]: """ Analyze each column in the dataframe for entities using the batch analyzer. @@ -253,9 +257,7 @@ def _find_most_common_entity( ] # Count the occurrences of each entity type in different cells - type_counter = Counter( - res.entity_type for cell_idx, res in flat_results - ) + type_counter = Counter(res.entity_type for cell_idx, res in flat_results) # Find the most common entity type based on the number of cells it appears in most_common_type, _ = type_counter.most_common(1)[0] @@ -266,4 +268,4 @@ def _find_most_common_entity( return RecognizerResult( entity_type=most_common_type, start=0, end=1, score=score - ) \ No newline at end of file + ) diff --git a/presidio-structured/presidio_structured/config/structured_analysis.py b/presidio-structured/presidio_structured/config/structured_analysis.py index b1480f758..261d1e713 100644 --- a/presidio-structured/presidio_structured/config/structured_analysis.py +++ b/presidio-structured/presidio_structured/config/structured_analysis.py @@ -17,6 +17,4 @@ class StructuredAnalysis: } """ - entity_mapping: Dict[ - str, str - ] + entity_mapping: Dict[str, str] diff --git a/presidio-structured/presidio_structured/data/data_processors.py b/presidio-structured/presidio_structured/data/data_processors.py index 32b99b020..4e09d1cd9 100644 --- a/presidio-structured/presidio_structured/data/data_processors.py +++ b/presidio-structured/presidio_structured/data/data_processors.py @@ -39,7 +39,7 @@ def 
operate( @abstractmethod def _process( self, - data: Union[Dict, DataFrame], + data: Union[Dict, DataFrame], key_to_operator_mapping: Dict[str, Callable], ) -> Union[Dict, DataFrame]: """ @@ -72,12 +72,8 @@ def _generate_operator_mapping( operators_factory = OperatorsFactory() for key, entity in config.entity_mapping.items(): - self.logger.debug( - f"Creating operator for key {key} and entity {entity}" - ) - operator_config = operators.get( - entity, operators.get("DEFAULT", None) - ) + self.logger.debug(f"Creating operator for key {key} and entity {entity}") + operator_config = operators.get(entity, operators.get("DEFAULT", None)) if operator_config is None: raise ValueError(f"Operator for entity {entity} not found") # NOTE: hardcoded OperatorType.Anonymize, as this is the only one supported. @@ -162,9 +158,7 @@ def _get_nested_value(data: Union[Dict, List], path: List[str]) -> Any: return data @staticmethod - def _set_nested_value( - data: Union[Dict, List], path: List[str], value: Any - ) -> None: + def _set_nested_value(data: Union[Dict, List], path: List[str], value: Any) -> None: """ Recursively sets a value in nested data using a given path. 
@@ -182,9 +176,7 @@ def _set_nested_value( continue else: for item in data: - JsonDataProcessor._set_nested_value( - item, path[i:], value - ) + JsonDataProcessor._set_nested_value(item, path[i:], value) return elif isinstance(data, dict): if i == len(path) - 1: diff --git a/presidio-structured/presidio_structured/structured_engine.py b/presidio-structured/presidio_structured/structured_engine.py index 7a5cb9a72..e36dc86fb 100644 --- a/presidio-structured/presidio_structured/structured_engine.py +++ b/presidio-structured/presidio_structured/structured_engine.py @@ -16,9 +16,7 @@ class StructuredEngine: """Class to implement methods for anonymizing tabular data.""" - def __init__( - self, data_processor: Optional[DataProcessorBase] = None - ) -> None: + def __init__(self, data_processor: Optional[DataProcessorBase] = None) -> None: """ Initialize the class with a data processor. @@ -48,9 +46,7 @@ def anonymize( self.logger.debug("Starting anonymization") operators = self.__check_or_add_default_operator(operators) - return self.data_processor.operate( - data, structured_analysis, operators - ) + return self.data_processor.operate(data, structured_analysis, operators) def __check_or_add_default_operator( self, operators: Dict[str, OperatorConfig] @@ -68,8 +64,6 @@ def __check_or_add_default_operator( self.logger.debug("No operators provided, using default operator") return {"DEFAULT": default_operator} if not operators.get("DEFAULT"): - self.logger.debug( - "No default operator provided, using default operator" - ) + self.logger.debug("No default operator provided, using default operator") operators["DEFAULT"] = default_operator return operators diff --git a/presidio-structured/setup.py b/presidio-structured/setup.py index 64f4df664..bd85e70b0 100644 --- a/presidio-structured/setup.py +++ b/presidio-structured/setup.py @@ -24,9 +24,7 @@ name="presidio_structured", python_requires=">=3.5", version=__version__, - packages=find_packages( - include=["presidio_structured", 
"presidio_structured.*"] - ), + packages=find_packages(include=["presidio_structured", "presidio_structured.*"]), classifiers=[ "Intended Audience :: Developers", "License :: OSI Approved :: MIT License", diff --git a/presidio-structured/tests/conftest.py b/presidio-structured/tests/conftest.py index 11f78a805..96bd0f7a1 100644 --- a/presidio-structured/tests/conftest.py +++ b/presidio-structured/tests/conftest.py @@ -61,21 +61,15 @@ def tabular_analysis_builder(): @pytest.fixture def operators(): return { - "PERSON": OperatorConfig( - "replace", {"new_value": "PERSON_REPLACEMENT"} - ), - "DEFAULT": OperatorConfig( - "replace", {"new_value": "DEFAULT_REPLACEMENT"} - ), + "PERSON": OperatorConfig("replace", {"new_value": "PERSON_REPLACEMENT"}), + "DEFAULT": OperatorConfig("replace", {"new_value": "DEFAULT_REPLACEMENT"}), } @pytest.fixture def operators_no_default(): return { - "PERSON": OperatorConfig( - "replace", {"new_value": "PERSON_REPLACEMENT"} - ), + "PERSON": OperatorConfig("replace", {"new_value": "PERSON_REPLACEMENT"}), } diff --git a/presidio-structured/tests/data/test_data_transformers.py b/presidio-structured/tests/data/test_data_transformers.py index 514569ce9..c9bd365f2 100644 --- a/presidio-structured/tests/data/test_data_transformers.py +++ b/presidio-structured/tests/data/test_data_transformers.py @@ -8,9 +8,7 @@ class TestDataProcessorBase: - def test_abstract_init_raises( - self, sample_df, tabular_analysis_builder, operators - ): + def test_abstract_init_raises(self, sample_df, tabular_analysis_builder, operators): with pytest.raises(TypeError): DataProcessorBase() @@ -31,13 +29,9 @@ def test_process_no_default_should_raise( ): processor = PandasDataProcessor() with pytest.raises(ValueError): - processor.operate( - sample_df, tabular_analysis, operators_no_default - ) + processor.operate(sample_df, tabular_analysis, operators_no_default) - def test_process_invalid_data( - self, sample_json, tabular_analysis, operators - ): + def 
test_process_invalid_data(self, sample_json, tabular_analysis, operators): processor = PandasDataProcessor() with pytest.raises(ValueError): processor.operate(sample_json, tabular_analysis, operators) @@ -63,9 +57,7 @@ def test_process_no_default_should_raise( ): processor = JsonDataProcessor() with pytest.raises(ValueError): - processor.operate( - sample_json, json_analysis, operators_no_default - ) + processor.operate(sample_json, json_analysis, operators_no_default) def test_process_invalid_data(self, sample_df, json_analysis, operators): processor = JsonDataProcessor() diff --git a/presidio-structured/tests/test_analysis_builder.py b/presidio-structured/tests/test_analysis_builder.py index 8479e5e2b..101f2f637 100644 --- a/presidio-structured/tests/test_analysis_builder.py +++ b/presidio-structured/tests/test_analysis_builder.py @@ -11,21 +11,15 @@ def test_generate_analysis_tabular(tabular_analysis_builder, sample_df): - structured_analysis = tabular_analysis_builder.generate_analysis( - sample_df - ) + structured_analysis = tabular_analysis_builder.generate_analysis(sample_df) assert structured_analysis.entity_mapping["name"] == "PERSON" assert structured_analysis.entity_mapping["email"] == "EMAIL_ADDRESS" assert structured_analysis.entity_mapping["phone"] == "PHONE_NUMBER" -def test_generate_analysis_tabular_with_sampling( - tabular_analysis_builder, sample_df -): - structured_analysis = tabular_analysis_builder.generate_analysis( - sample_df, n=2 - ) +def test_generate_analysis_tabular_with_sampling(tabular_analysis_builder, sample_df): + structured_analysis = tabular_analysis_builder.generate_analysis(sample_df, n=2) assert len(structured_analysis.entity_mapping) == 3 assert structured_analysis.entity_mapping["name"] == "PERSON" @@ -41,8 +35,8 @@ def test_generate_analysis_tabular_with_invalid_sampling( def test_find_most_common_entity(tabular_analysis_builder, sample_df): - key_recognizer_result_map = ( - 
tabular_analysis_builder._generate_key_rec_results_map(sample_df, "en") + key_recognizer_result_map = tabular_analysis_builder._generate_key_rec_results_map( + sample_df, "en" ) assert len(key_recognizer_result_map) == 3 @@ -53,21 +47,26 @@ def test_find_most_common_entity(tabular_analysis_builder, sample_df): def test_find_most_common_entity_with_empty_df(tabular_analysis_builder): df = pd.DataFrame() - key_recognizer_result_map = ( - tabular_analysis_builder._generate_key_rec_results_map(df, "en") + key_recognizer_result_map = tabular_analysis_builder._generate_key_rec_results_map( + df, "en" ) assert len(key_recognizer_result_map) == 0 -def test_analysis_tabular_when_default_threshold_is_half_then_phone_does_not_pass(sample_df): +def test_analysis_tabular_when_default_threshold_is_half_then_phone_does_not_pass( + sample_df, +): analyzer_engine = AnalyzerEngine(default_score_threshold=0.5) tabular_analysis_builder = PandasAnalysisBuilder(analyzer_engine) structured_analysis = tabular_analysis_builder.generate_analysis(sample_df) assert len(structured_analysis.entity_mapping) == 2 -def test_analysis_tabular_when_default_threshold_is_zero_then_all_results_pass(sample_df): + +def test_analysis_tabular_when_default_threshold_is_zero_then_all_results_pass( + sample_df, +): analyzer_engine = AnalyzerEngine(default_score_threshold=0) tabular_analysis_builder = PandasAnalysisBuilder(analyzer_engine) structured_analysis = tabular_analysis_builder.generate_analysis(sample_df) @@ -97,7 +96,9 @@ def test_generate_analysis_json_with_empty_data(json_analysis_builder): assert len(structured_analysis.entity_mapping) == 0 -def test_analysis_json_when_default_threshold_is_high_then_only_email_passes(sample_json): +def test_analysis_json_when_default_threshold_is_high_then_only_email_passes( + sample_json, +): analyzer_engine = AnalyzerEngine(default_score_threshold=0.9) json_analysis_builder = JsonAnalysisBuilder(analyzer_engine) structured_analysis = 
json_analysis_builder.generate_analysis(sample_json) diff --git a/presidio-structured/tests/test_tabular_engine.py b/presidio-structured/tests/test_tabular_engine.py index 3fe02f272..87a9dd287 100644 --- a/presidio-structured/tests/test_tabular_engine.py +++ b/presidio-structured/tests/test_tabular_engine.py @@ -21,9 +21,7 @@ def test_structured_engine_anonymize_calls_data_processor_operate(): structured_engine.anonymize(data, structured_analysis, operators) # Assert - data_processor.operate.assert_called_once_with( - data, structured_analysis, operators - ) + data_processor.operate.assert_called_once_with(data, structured_analysis, operators) def test_structured_engine_anonymize_adds_default_operator_if_none_provided(): @@ -54,9 +52,7 @@ def test_structured_engine_anonymize_doesnt_override_existing_default_operator() structured_engine.anonymize(data, structured_analysis, operators) # Assert - data_processor.operate.assert_called_once_with( - data, structured_analysis, operators - ) + data_processor.operate.assert_called_once_with(data, structured_analysis, operators) def test_json_processor_with_pandas_dataframe_will_raise(tabular_analysis):