diff --git a/.pipelines/templates/build-structured.yml b/.pipelines/templates/build-structured.yml new file mode 100644 index 000000000..13064583c --- /dev/null +++ b/.pipelines/templates/build-structured.yml @@ -0,0 +1,27 @@ +steps: + - task: Bash@3 + displayName: 'Setup pipenv' + inputs: + targetType: 'inline' + script: | + set -eux # fail on error + python -m pip install --upgrade pip + python -m pip install pipenv + pipenv --python 3 + + - task: Bash@3 + displayName: 'Install deps' + inputs: + targetType: 'inline' + workingDirectory: 'presidio-structured' + script: | + set -eux # fail on error + pipenv install --deploy --dev + pipenv run pip install -e ../presidio-analyzer/. # Use the existing analyzer and not the one in PyPI + pipenv run pip install -e ../presidio-anonymizer/. # Use the existing analyzer and not the one in PyPI + + - template: ./build-python.yml + parameters: + SERVICE: 'Structured' + WORKING_FOLDER: 'presidio-structured' + diff --git a/.pipelines/templates/lint-build-test.yml b/.pipelines/templates/lint-build-test.yml index 6505f788c..d362273cf 100644 --- a/.pipelines/templates/lint-build-test.yml +++ b/.pipelines/templates/lint-build-test.yml @@ -76,6 +76,7 @@ stages: versionSpec: '$(python.version)' displayName: 'Use Python $(python.version)' - template: ./build-image-redactor.yml + - job: TestCli displayName: Test Cli pool: @@ -97,3 +98,25 @@ stages: versionSpec: '$(python.version)' displayName: 'Use Python $(python.version)' - template: ./build-cli.yml + + - job: TestStructured + displayName: Test Presidio Structured + pool: + vmImage: 'ubuntu-latest' + strategy: + matrix: + Python38: + python.version: '3.8' + Python39: + python.version: '3.9' + Python310: + python.version: '3.10' + Python311: + python.version: '3.11' + + steps: + - task: UsePythonVersion@0 + inputs: + versionSpec: '$(python.version)' + displayName: 'Use Python $(python.version)' + - template: ./build-structured.yml diff --git a/CHANGELOG.md b/CHANGELOG.md index 
199e098bc..b5482076d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,12 @@ All notable changes to this project will be documented in this file. + +## [Unreleased] +### Added +#### Structured +* Added alpha of presidio-structured, a library (presidio-structured) which re-uses existing logic from existing presidio components to allow anonymization of (semi-)structured data. + ## [2.2.351] - Nov. 6th 2024 ### Changed #### Analyzer @@ -17,6 +23,7 @@ All notable changes to this project will be documented in this file. #### Analyzer * Put org in ignore as it has many FPs (#1200) + ## [2.2.34] - Oct. 30th 2024 ### Added @@ -66,7 +73,6 @@ All notable changes to this project will be documented in this file. * Changed the ACR instance (#1089) * Updated to Cred Scan V3 (#1154) - ## [2.2.33] - June 1st 2023 ### Added #### Anonymizer diff --git a/docs/samples/index.md b/docs/samples/index.md index 3d7f462e1..4a113aa54 100644 --- a/docs/samples/index.md +++ b/docs/samples/index.md @@ -14,6 +14,7 @@ | Usage | Images | Python Notebook | [Plot custom bounding boxes](https://github.com/microsoft/presidio/blob/main/docs/samples/python/plot_custom_bboxes.ipynb) | Usage | Text | Python Notebook | [Integrating with external services](https://github.com/microsoft/presidio/blob/main/docs/samples/python/integrating_with_external_services.ipynb) | | Usage | Text | Python file | [Remote Recognizer](https://github.com/microsoft/presidio/blob/main/docs/samples/python/example_remote_recognizer.py) | +| Usage | Structured | Python Notebook | [Presidio Structured Basic Usage Notebook](https://github.com/microsoft/presidio/blob/main/docs/samples/python/example_structured.ipynb) | | Usage | Text | Python file | [Azure AI Language as a Remote Recognizer](python/text_analytics/index.md) | | Usage | CSV | Python file | [Analyze and Anonymize CSV file](https://github.com/microsoft/presidio/blob/main/docs/samples/python/process_csv_file.py) | | Usage | Text | Python | [Using Flair as an external 
PII model](https://github.com/microsoft/presidio/blob/main/docs/samples/python/flair_recognizer.py)| diff --git a/docs/samples/python/csv_sample_data/test_structured.csv b/docs/samples/python/csv_sample_data/test_structured.csv new file mode 100644 index 000000000..64e235473 --- /dev/null +++ b/docs/samples/python/csv_sample_data/test_structured.csv @@ -0,0 +1,4 @@ +id,name,email,street,city,state,postal_code +1,John Doe,john.doe@example.com,123 Main St,Anytown,CA,12345 +2,Jane Smith,jane.smith@example.com,456 Elm St,Somewhere,TX,67890 +3,Alice Johnson,alice.johnson@example.com,789 Pine St,Elsewhere,NY,11223 \ No newline at end of file diff --git a/docs/samples/python/example_structured.ipynb b/docs/samples/python/example_structured.ipynb new file mode 100644 index 000000000..33968e691 --- /dev/null +++ b/docs/samples/python/example_structured.ipynb @@ -0,0 +1,565 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from presidio_structured import StructuredEngine, JsonAnalysisBuilder, PandasAnalysisBuilder, StructuredAnalysis, CsvReader, JsonReader, JsonDataProcessor, PandasDataProcessor" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This sample showcases presidio-structured on structured and semi-structured data containing sensitive data like names, emails, and addresses. It differs from the sample for the batch analyzer/anonymizer engines example, which includes narrative phrases that might contain sensitive data. The presence of personal data embedded in these phrases requires to analyze and to anonymize the text inside the cells, which is not the case for our structured sample, where the sensitive data is already separated into columns." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Loading in data" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnameemailstreetcitystatepostal_code
01John Doejohn.doe@example.com123 Main StAnytownCA12345
12Jane Smithjane.smith@example.com456 Elm StSomewhereTX67890
23Alice Johnsonalice.johnson@example.com789 Pine StElsewhereNY11223
\n", + "
" + ], + "text/plain": [ + " id name email street city state \\\n", + "0 1 John Doe john.doe@example.com 123 Main St Anytown CA \n", + "1 2 Jane Smith jane.smith@example.com 456 Elm St Somewhere TX \n", + "2 3 Alice Johnson alice.johnson@example.com 789 Pine St Elsewhere NY \n", + "\n", + " postal_code \n", + "0 12345 \n", + "1 67890 \n", + "2 11223 " + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sample_df = CsvReader().read(\"./csv_sample_data/test_structured.csv\")\n", + "sample_df" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'id': 1,\n", + " 'name': 'John Doe',\n", + " 'email': 'john.doe@example.com',\n", + " 'address': {'street': '123 Main St',\n", + " 'city': 'Anytown',\n", + " 'state': 'CA',\n", + " 'postal_code': '12345'}}" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sample_json = JsonReader().read(\"./sample_data/test_structured.json\")\n", + "sample_json" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'users': [{'id': 1,\n", + " 'name': 'John Doe',\n", + " 'email': 'john.doe@example.com',\n", + " 'address': {'street': '123 Main St',\n", + " 'city': 'Anytown',\n", + " 'state': 'CA',\n", + " 'postal_code': '12345'}},\n", + " {'id': 2,\n", + " 'name': 'Jane Smith',\n", + " 'email': 'jane.smith@example.com',\n", + " 'address': {'street': '456 Elm St',\n", + " 'city': 'Somewhere',\n", + " 'state': 'TX',\n", + " 'postal_code': '67890'}},\n", + " {'id': 3,\n", + " 'name': 'Alice Johnson',\n", + " 'email': 'alice.johnson@example.com',\n", + " 'address': {'street': '789 Pine St',\n", + " 'city': 'Elsewhere',\n", + " 'state': 'NY',\n", + " 'postal_code': '11223'}}]}" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + 
"source": [ + "# contains nested objects in lists\n", + "sample_complex_json = JsonReader().read(\"./sample_data/test_structured_complex.json\")\n", + "sample_complex_json" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Tabular (csv) data: defining & generating tabular analysis, anonymization." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "StructuredAnalysis(entity_mapping={'name': 'PERSON', 'email': 'EMAIL_ADDRESS', 'city': 'LOCATION', 'state': 'LOCATION'})" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Automatically detect the entity for the columns\n", + "tabular_analysis = PandasAnalysisBuilder().generate_analysis(sample_df)\n", + "tabular_analysis" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnameemailstreetcitystatepostal_code
01<None><None>123 Main St<None><None>12345
12<None><None>456 Elm St<None><None>67890
23<None><None>789 Pine St<None><None>11223
\n", + "
" + ], + "text/plain": [ + " id name email street city state postal_code\n", + "0 1 123 Main St 12345\n", + "1 2 456 Elm St 67890\n", + "2 3 789 Pine St 11223" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# anonymized data defaults to be replaced with None, unless operators is specified\n", + "\n", + "pandas_engine = StructuredEngine(data_processor=PandasDataProcessor())\n", + "df_to_be_anonymized = sample_df.copy() # in-place anonymization\n", + "anonymized_df = pandas_engine.anonymize(df_to_be_anonymized, tabular_analysis, operators=None) # explicit None for clarity\n", + "anonymized_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### We can also define operators using OperatorConfig similar as to the AnonymizerEngine:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnameemailstreetcitystatepostal_code
01person...jamestaylor@example.net123 Main St<None><None>12345
12person...brian49@example.com456 Elm St<None><None>67890
23person...clarkcody@example.org789 Pine St<None><None>11223
\n", + "
" + ], + "text/plain": [ + " id name email street city state \\\n", + "0 1 person... jamestaylor@example.net 123 Main St \n", + "1 2 person... brian49@example.com 456 Elm St \n", + "2 3 person... clarkcody@example.org 789 Pine St \n", + "\n", + " postal_code \n", + "0 12345 \n", + "1 67890 \n", + "2 11223 " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from presidio_anonymizer.entities.engine import OperatorConfig\n", + "from faker import Faker\n", + "fake = Faker()\n", + "\n", + "operators = {\n", + " \"PERSON\": OperatorConfig(\"replace\", {\"new_value\": \"person...\"}),\n", + " \"EMAIL_ADDRESS\": OperatorConfig(\"custom\", {\"lambda\": lambda x: fake.safe_email()})\n", + " # etc...\n", + " }\n", + "anonymized_df = pandas_engine.anonymize(sample_df, tabular_analysis, operators=operators)\n", + "anonymized_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Semi-structured (JSON) data: simple and complex analysis, anonymization" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "StructuredAnalysis(entity_mapping={'name': 'PERSON', 'email': 'EMAIL_ADDRESS', 'address.city': 'LOCATION', 'address.state': 'LOCATION'})" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "json_analysis = JsonAnalysisBuilder().generate_analysis(sample_json)\n", + "json_analysis" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Analyzer.analyze_iterator only works on primitive types (int, float, bool, str). 
Lists of objects are not yet supported.\n" + ] + } + ], + "source": [ + "# Currently does not support nested objects in lists\n", + "try:\n", + " json_complex_analysis = JsonAnalysisBuilder().generate_analysis(sample_complex_json)\n", + "except ValueError as e:\n", + " print(e)\n", + "\n", + "# however, we can define it manually:\n", + "json_complex_analysis = StructuredAnalysis(entity_mapping={\n", + " \"users.name\":\"PERSON\",\n", + " \"users.address.street\":\"LOCATION\",\n", + " \"users.address.city\":\"LOCATION\",\n", + " \"users.address.state\":\"LOCATION\",\n", + " \"users.email\": \"EMAIL_ADDRESS\",\n", + "})" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'id': 1,\n", + " 'name': 'person...',\n", + " 'email': 'virginia29@example.org',\n", + " 'address': {'street': '123 Main St',\n", + " 'city': '',\n", + " 'state': '',\n", + " 'postal_code': '12345'}}" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# anonymizing simple data\n", + "json_engine = StructuredEngine(data_processor=JsonDataProcessor())\n", + "anonymized_json = json_engine.anonymize(sample_json, json_analysis, operators=operators)\n", + "anonymized_json" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'users': [{'id': 1,\n", + " 'name': 'person...',\n", + " 'email': 'david90@example.org',\n", + " 'address': {'street': '',\n", + " 'city': '',\n", + " 'state': '',\n", + " 'postal_code': '12345'}},\n", + " {'id': 2,\n", + " 'name': 'person...',\n", + " 'email': 'david90@example.org',\n", + " 'address': {'street': '',\n", + " 'city': '',\n", + " 'state': '',\n", + " 'postal_code': '67890'}},\n", + " {'id': 3,\n", + " 'name': 'person...',\n", + " 'email': 'david90@example.org',\n", + " 'address': {'street': '',\n", + " 'city': '',\n", + " 'state': '',\n", + " 
'postal_code': '11223'}}]}" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "anonymized_complex_json = json_engine.anonymize(sample_complex_json, json_complex_analysis, operators=operators)\n", + "anonymized_complex_json" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/samples/python/sample_data/test_structured.json b/docs/samples/python/sample_data/test_structured.json new file mode 100644 index 000000000..9e416fc90 --- /dev/null +++ b/docs/samples/python/sample_data/test_structured.json @@ -0,0 +1,11 @@ +{ + "id": 1, + "name": "John Doe", + "email": "john.doe@example.com", + "address": { + "street": "123 Main St", + "city": "Anytown", + "state": "CA", + "postal_code": "12345" + } +} \ No newline at end of file diff --git a/docs/samples/python/sample_data/test_structured_complex.json b/docs/samples/python/sample_data/test_structured_complex.json new file mode 100644 index 000000000..fce99ead7 --- /dev/null +++ b/docs/samples/python/sample_data/test_structured_complex.json @@ -0,0 +1,37 @@ +{ + "users": [ + { + "id": 1, + "name": "John Doe", + "email": "john.doe@example.com", + "address": { + "street": "123 Main St", + "city": "Anytown", + "state": "CA", + "postal_code": "12345" + } + }, + { + "id": 2, + "name": "Jane Smith", + "email": "jane.smith@example.com", + "address": { + "street": "456 Elm St", + "city": "Somewhere", + "state": "TX", + "postal_code": "67890" + } + }, + { + "id": 3, + "name": "Alice Johnson", + "email": "alice.johnson@example.com", + "address": { + "street": "789 Pine St", + "city": 
"Elsewhere", + "state": "NY", + "postal_code": "11223" + } + } + ] +} diff --git a/presidio-structured/Pipfile b/presidio-structured/Pipfile new file mode 100644 index 000000000..4205f8b63 --- /dev/null +++ b/presidio-structured/Pipfile @@ -0,0 +1,17 @@ +[[source]] +url = "https://pypi.python.org/simple" +verify_ssl = true +name = "pypi" + +[packages] +flask = ">=1.1" +presidio-analyzer = ">=2.2.31" +presidio-anonymizer = ">=2.2.31" +pandas = ">=1.5.2" + +[dev-packages] +pytest = "*" +flake8 = { version = ">=3.7.9" } +pep8-naming = "*" +flake8-docstrings = "*" +pre_commit = "*" diff --git a/presidio-structured/README.md b/presidio-structured/README.md new file mode 100644 index 000000000..bae17ca3a --- /dev/null +++ b/presidio-structured/README.md @@ -0,0 +1,103 @@ +# Presidio structured + +## Status + +**Alpha**: This package is currently in alpha, meaning it is in its early stages of development. Features and functionality may change as the project evolves. + +## Description + +The Presidio structured package is a flexible and customizable framework designed to identify and protect structured sensitive data. This tool extends the capabilities of Presidio, focusing on structured data formats such as tabular formats and semi-structured formats (JSON). It leverages the detection capabilities of Presidio-Analyzer to identify columns or keys containing personally identifiable information (PII), and establishes a mapping between these column/keys names and the detected PII entities. Following the detection, Presidio-Anonymizer is used to apply de-identification techniques to each value in columns identified as containing PII, ensuring the sensitive data is appropriately protected. 
+ +## Installation + +### As a python package: + +To install the `presidio-structured` package, run the following command: + +```sh +pip install presidio-structured +``` + +#### Getting started + +Example 1: Anonymizing DataFrames + +```python +import pandas as pd +from presidio_structured import StructuredEngine, PandasAnalysisBuilder +from presidio_anonymizer.entities import OperatorConfig +from faker import Faker # optionally using faker as an example + +# Initialize the engine with a Pandas data processor (default) +pandas_engine = StructuredEngine() + +# Create a sample DataFrame +sample_df = pd.DataFrame({'name': ['John Doe', 'Jane Smith'], 'email': ['john.doe@example.com', 'jane.smith@example.com']}) + +# Generate a tabular analysis which describes PII entities in the DataFrame. +tabular_analysis = PandasAnalysisBuilder().generate_analysis(sample_df) + +# Define anonymization operators +fake = Faker() +operators = { + "PERSON": OperatorConfig("replace", {"new_value": "REDACTED"}), + "EMAIL_ADDRESS": OperatorConfig("custom", {"lambda": lambda x: fake.safe_email()}) +} + +# Anonymize DataFrame +anonymized_df = pandas_engine.anonymize(sample_df, tabular_analysis, operators=operators) +print(anonymized_df) +``` + +Example 2: Anonymizing JSON Data + +```python +from presidio_structured import StructuredEngine, JsonAnalysisBuilder, StructuredAnalysis, JsonDataProcessor +from presidio_anonymizer.entities import OperatorConfig +from faker import Faker # optionally using faker as an example + +# Initialize the engine with a JSON data processor +json_engine = StructuredEngine(data_processor=JsonDataProcessor()) + + +# Sample JSON data +sample_json = { + "user": { + "name": "John Doe", + "email": "john.doe@example.com" + } +} + +# Generate analysis for simple JSON data +json_analysis = JsonAnalysisBuilder().generate_analysis(sample_json) + +# Define anonymization operators +fake = Faker() # using faker for email generation. 
+operators = { + "PERSON": OperatorConfig("replace", {"new_value": "REDACTED"}), + "EMAIL_ADDRESS": OperatorConfig("custom", {"lambda": lambda x: fake.safe_email()}) +} + +# Anonymize JSON data +anonymized_json = json_engine.anonymize(sample_json, json_analysis, operators=operators) +print(anonymized_json) + +# Handling Json Data with nested objects in lists +sample_complex_json = { + "users": [ + {"name": "John Doe", "email": "john.doe@example.com"}, + {"name": "Jane Smith", "email": "jane.smith@example.com"} + ] +} + +# Nesting objects in lists is not supported in JsonAnalysisBuilder for now, +# Manually defining the analysis for complex JSON data +json_complex_analysis = StructuredAnalysis(entity_mapping={ + "users.name": "PERSON", + "users.email": "EMAIL_ADDRESS" +}) + +# Anonymize complex JSON data +anonymized_complex_json = json_engine.anonymize(sample_complex_json, json_complex_analysis, operators=operators) +print(anonymized_complex_json) +``` diff --git a/presidio-structured/logging.ini b/presidio-structured/logging.ini new file mode 100644 index 000000000..62c58c1f8 --- /dev/null +++ b/presidio-structured/logging.ini @@ -0,0 +1,27 @@ +[loggers] +keys=root,presidio-structured + +[handlers] +keys=consoleHandler + +[formatters] +keys=simpleFormatter + +[logger_root] +level=INFO +handlers=consoleHandler + +[logger_presidio-structured] +level=INFO +handlers=consoleHandler +qualname=presidio-structured +propagate=0 + +[handler_consoleHandler] +class=StreamHandler +level=INFO +formatter=simpleFormatter +args=(sys.stdout,) + +[formatter_simpleFormatter] +format=%(asctime)s - %(name)s - %(levelname)s - %(message)s \ No newline at end of file diff --git a/presidio-structured/presidio_structured/__init__.py b/presidio-structured/presidio_structured/__init__.py new file mode 100644 index 000000000..7ad40b67a --- /dev/null +++ b/presidio-structured/presidio_structured/__init__.py @@ -0,0 +1,25 @@ +"""presidio-structured root module.""" +import logging + +from 
.analysis_builder import JsonAnalysisBuilder, PandasAnalysisBuilder +from .config import StructuredAnalysis +from .data import ( + CsvReader, + JsonDataProcessor, + JsonReader, + PandasDataProcessor, +) +from .structured_engine import StructuredEngine + +logging.getLogger("presidio-structured").addHandler(logging.NullHandler()) + +__all__ = [ + "StructuredEngine", + "JsonAnalysisBuilder", + "PandasAnalysisBuilder", + "StructuredAnalysis", + "CsvReader", + "JsonReader", + "PandasDataProcessor", + "JsonDataProcessor", +] diff --git a/presidio-structured/presidio_structured/analysis_builder.py b/presidio-structured/presidio_structured/analysis_builder.py new file mode 100644 index 000000000..b2db7ef6f --- /dev/null +++ b/presidio-structured/presidio_structured/analysis_builder.py @@ -0,0 +1,271 @@ +import logging +from abc import ABC, abstractmethod +from collections import Counter +from collections.abc import Iterable +from typing import Dict, Iterator, List, Optional, Union + +from pandas import DataFrame +from presidio_analyzer import ( + AnalyzerEngine, + BatchAnalyzerEngine, + DictAnalyzerResult, + RecognizerResult, +) + +from presidio_structured.config import StructuredAnalysis + +NON_PII_ENTITY_TYPE = "NON_PII" + +logger = logging.getLogger("presidio-structured") + + +class AnalysisBuilder(ABC): + """Abstract base class for a configuration generator.""" + + def __init__( + self, + analyzer: Optional[AnalyzerEngine] = None, + analyzer_score_threshold: Optional[float] = None, + ) -> None: + """Initialize the configuration generator.""" + default_score_threshold = ( + analyzer_score_threshold if analyzer_score_threshold is not None else 0 + ) + self.analyzer = ( + AnalyzerEngine(default_score_threshold=default_score_threshold) + if analyzer is None + else analyzer + ) + self.batch_analyzer = BatchAnalyzerEngine(analyzer_engine=self.analyzer) + + @abstractmethod + def generate_analysis( + self, + data: Union[Dict, DataFrame], + language: str = "en", + 
score_threshold: Optional[float] = None, + ) -> StructuredAnalysis: + """ + Abstract method to generate a configuration from the given data. + + :param data: The input data. Can be a dictionary or DataFrame instance. + :return: The generated configuration. + """ + pass + + def _remove_low_scores( + self, + key_recognizer_result_map: Dict[str, RecognizerResult], + score_threshold: float = None, + ) -> List[RecognizerResult]: + """ + Remove results for which the confidence is lower than the threshold. + + :param results: Dict of column names to RecognizerResult + :param score_threshold: float value for minimum possible confidence + :return: List[RecognizerResult] + """ + if score_threshold is None: + score_threshold = self.analyzer.default_score_threshold + + new_key_recognizer_result_map = {} + for column, result in key_recognizer_result_map.items(): + if result.score >= score_threshold: + new_key_recognizer_result_map[column] = result + + return new_key_recognizer_result_map + + +class JsonAnalysisBuilder(AnalysisBuilder): + """Concrete configuration generator for JSON data.""" + + def generate_analysis( + self, + data: Dict, + language: str = "en", + ) -> StructuredAnalysis: + """ + Generate a configuration from the given JSON data. + + :param data: The input JSON data. + :return: The generated configuration. + """ + logger.debug("Starting JSON BatchAnalyzer analysis") + analyzer_results = self.batch_analyzer.analyze_dict( + input_dict=data, language=language + ) + + key_recognizer_result_map = self._generate_analysis_from_results_json( + analyzer_results + ) + + key_entity_map = { + key: result.entity_type for key, result in key_recognizer_result_map.items() + } + + return StructuredAnalysis(entity_mapping=key_entity_map) + + def _generate_analysis_from_results_json( + self, analyzer_results: Iterator[DictAnalyzerResult], prefix: str = "" + ) -> Dict[str, RecognizerResult]: + """ + Generate a configuration from the given analyzer results. 
\ + Always uses the first recognizer result if there are more than one. + + :param analyzer_results: The analyzer results. + :param prefix: The prefix for the configuration keys. + :return: The generated configuration. + """ + key_recognizer_result_map = {} + + if not isinstance(analyzer_results, Iterable): + logger.debug( + "No analyzer results found, returning empty StructuredAnalysis" + ) + return key_recognizer_result_map + + for result in analyzer_results: + current_key = prefix + result.key + + if isinstance(result.value, dict): + nested_mappings = self._generate_analysis_from_results_json( + result.recognizer_results, prefix=current_key + "." + ) + key_recognizer_result_map.update(nested_mappings) + first_recognizer_result = next(iter(result.recognizer_results), None) + if first_recognizer_result is not None: + logger.debug( + f"Found result with entity {first_recognizer_result.entity_type} \ + in {current_key}" + ) + key_recognizer_result_map[current_key] = first_recognizer_result + return key_recognizer_result_map + + +class TabularAnalysisBuilder(AnalysisBuilder): + """Placeholder class for generalizing tabular data analysis builders \ + (e.g. PySpark). Only implemented as PandasAnalysisBuilder for now.""" + + pass + + +class PandasAnalysisBuilder(TabularAnalysisBuilder): + """Concrete configuration generator for tabular data.""" + + def generate_analysis( + self, + df: DataFrame, + n: Optional[int] = None, + language: str = "en", + ) -> StructuredAnalysis: + """ + Generate a configuration from the given tabular data. + + :param df: The input tabular data (dataframe). + :param n: The number of samples to be taken from the dataframe. + :param language: The language to be used for analysis. + :return: A StructuredAnalysis object containing the analysis results. 
+ """ + if not n: + n = len(df) + elif n > len(df): + logger.debug( + f"Number of samples ({n}) is larger than the number of rows \ + ({len(df)}), using all rows" + ) + n = len(df) + + df = df.sample(n, random_state=123) + + key_recognizer_result_map = self._generate_key_rec_results_map(df, language) + + key_entity_map = { + key: result.entity_type + for key, result in key_recognizer_result_map.items() + if result.entity_type != NON_PII_ENTITY_TYPE + } + + return StructuredAnalysis(entity_mapping=key_entity_map) + + def _generate_key_rec_results_map( + self, df: DataFrame, language: str + ) -> Dict[str, RecognizerResult]: + """ + Find the most common entity in a dataframe column. + + If more than one entity is found in a cell, the first one is used. + + :param df: The dataframe where entities will be searched. + :param language: Language to be used in the analysis engine. + :return: A dictionary mapping column names to the most common RecognizerResult. + """ + column_analyzer_results_map = self._batch_analyze_df(df, language) + key_recognizer_result_map = {} + for column, analyzer_result in column_analyzer_results_map.items(): + key_recognizer_result_map[column] = self._find_most_common_entity( + analyzer_result + ) + return key_recognizer_result_map + + def _batch_analyze_df( + self, df: DataFrame, language: str + ) -> Dict[str, List[List[RecognizerResult]]]: + """ + Analyze each column in the dataframe for entities using the batch analyzer. + + :param df: The dataframe to be analyzed. + :param language: The language configuration for the analyzer. + :return: A dictionary mapping each column name to a \ + list of lists of RecognizerResults. 
+ """ + column_analyzer_results_map = {} + for column in df.columns: + logger.debug(f"Finding most common PII entity for column {column}") + analyzer_results = self.batch_analyzer.analyze_iterator( + [val for val in df[column]], language=language + ) + column_analyzer_results_map[column] = analyzer_results + + return column_analyzer_results_map + + def _find_most_common_entity( + self, analyzer_results: List[List[RecognizerResult]] + ) -> RecognizerResult: + """ + Find the most common entity in a list of analyzer results for \ + a dataframe column. + + It takes the most common entity type and calculates the confidence score based + on the number of cells it appears in. + + :param analyzer_results: List of lists of RecognizerResults for each \ + cell in the column. + :return: A RecognizerResult with the most common entity type and the \ + calculated confidence score. + """ + + if not any(analyzer_results): + return RecognizerResult( + entity_type=NON_PII_ENTITY_TYPE, start=0, end=1, score=1.0 + ) + + # Flatten the list of lists while keeping track of the cell index + flat_results = [ + (cell_idx, res) + for cell_idx, cell_results in enumerate(analyzer_results) + for res in cell_results + ] + + # Count the occurrences of each entity type in different cells + type_counter = Counter(res.entity_type for cell_idx, res in flat_results) + + # Find the most common entity type based on the number of cells it appears in + most_common_type, _ = type_counter.most_common(1)[0] + + # The score is the ratio of the most common entity type's count to the total + most_common_count = type_counter[most_common_type] + score = most_common_count / len(analyzer_results) + + return RecognizerResult( + entity_type=most_common_type, start=0, end=1, score=score + ) diff --git a/presidio-structured/presidio_structured/config/__init__.py b/presidio-structured/presidio_structured/config/__init__.py new file mode 100644 index 000000000..f7724c726 --- /dev/null +++ 
b/presidio-structured/presidio_structured/config/__init__.py
@@ -0,0 +1,6 @@
+"""Config module for presidio-structured."""
+from .structured_analysis import StructuredAnalysis
+
+__all__ = [
+    "StructuredAnalysis",
+]
diff --git a/presidio-structured/presidio_structured/config/structured_analysis.py b/presidio-structured/presidio_structured/config/structured_analysis.py
new file mode 100644
index 000000000..261d1e713
--- /dev/null
+++ b/presidio-structured/presidio_structured/config/structured_analysis.py
@@ -0,0 +1,20 @@
+"""Structured Analysis module."""
+
+from dataclasses import dataclass
+from typing import Dict
+
+
+@dataclass
+class StructuredAnalysis:
+    """
+    Dataclass containing entity analysis from structured data.
+
+    Currently, this class only contains entity mapping.
+
+    :param entity_mapping: Dict mapping column/key names to entity types, e.g., {
+        "person.name": "PERSON",
+        "person.address": "LOCATION"
+    }
+    """
+
+    entity_mapping: Dict[str, str]
diff --git a/presidio-structured/presidio_structured/data/__init__.py b/presidio-structured/presidio_structured/data/__init__.py
new file mode 100644
index 000000000..a65a622dd
--- /dev/null
+++ b/presidio-structured/presidio_structured/data/__init__.py
@@ -0,0 +1,11 @@
+"""Data module."""
+
+from .data_reader import CsvReader, JsonReader
+from .data_processors import JsonDataProcessor, PandasDataProcessor
+
+__all__ = [
+    "CsvReader",
+    "JsonReader",
+    "PandasDataProcessor",
+    "JsonDataProcessor",
+]
diff --git a/presidio-structured/presidio_structured/data/data_processors.py b/presidio-structured/presidio_structured/data/data_processors.py
new file mode 100644
index 000000000..4e09d1cd9
--- /dev/null
+++ b/presidio-structured/presidio_structured/data/data_processors.py
@@ -0,0 +1,223 @@
+import logging
+from abc import ABC, abstractmethod
+from typing import Any, Callable, Dict, List, Union
+
+from pandas import DataFrame
+from presidio_anonymizer.entities import OperatorConfig
+from 
presidio_anonymizer.operators import OperatorsFactory, OperatorType + +from presidio_structured.config import StructuredAnalysis + + +class DataProcessorBase(ABC): + """Abstract class to handle logic of operations over text using the operators.""" + + def __init__(self) -> None: + """Initialize DataProcessorBase object.""" + self.logger = logging.getLogger("presidio-structured") + + def operate( + self, + data: Any, + structured_analysis: StructuredAnalysis, + operators: Dict[str, OperatorConfig], + ) -> Any: + """ + Perform operations over the text using the operators, \ + as per the structured analysis. + + :param data: Data to be operated on. + :param structured_analysis: Analysis schema as per the structured data. + :param operators: Dictionary containing operator configuration objects. + :return: Data after being operated upon. + """ + key_to_operator_mapping = self._generate_operator_mapping( + structured_analysis, operators + ) + return self._process(data, key_to_operator_mapping) + + @abstractmethod + def _process( + self, + data: Union[Dict, DataFrame], + key_to_operator_mapping: Dict[str, Callable], + ) -> Union[Dict, DataFrame]: + """ + Abstract method for subclasses to provide operation implementation. + + :param data: Data to be operated on. + :param key_to_operator_mapping: Mapping of keys to operators. + :return: Operated data. + """ + pass + + @staticmethod + def _create_operator_callable(operator, params): + def operator_callable(text): + return operator.operate(params=params, text=text) + + return operator_callable + + def _generate_operator_mapping( + self, config, operators: Dict[str, OperatorConfig] + ) -> Dict[str, Callable]: + """ + Generate a mapping of keys to operator callables. + + :param config: Configuration object containing mapping of entity types to keys. + :param operators: Dictionary containing operator configuration objects. + :return: Dictionary mapping keys to operator callables. 
+ """ + key_to_operator_mapping = {} + + operators_factory = OperatorsFactory() + for key, entity in config.entity_mapping.items(): + self.logger.debug(f"Creating operator for key {key} and entity {entity}") + operator_config = operators.get(entity, operators.get("DEFAULT", None)) + if operator_config is None: + raise ValueError(f"Operator for entity {entity} not found") + # NOTE: hardcoded OperatorType.Anonymize, as this is the only one supported. + operator = operators_factory.create_operator_class( + operator_config.operator_name, OperatorType.Anonymize + ) + operator_callable = self._create_operator_callable( + operator, operator_config.params + ) + key_to_operator_mapping[key] = operator_callable + + return key_to_operator_mapping + + def _operate_on_text( + self, + text_to_operate_on: str, + operator_callable: Callable, + ) -> str: + """ + Operates on the provided text using the operator callable. + + :param text_to_operate_on: Text to be operated on. + :param operator_callable: Callable that performs operation on the text. + :return: Text after operation. + """ + return operator_callable(text_to_operate_on) + + +class PandasDataProcessor(DataProcessorBase): + """Pandas Data Processor.""" + + def _process( + self, data: DataFrame, key_to_operator_mapping: Dict[str, Callable] + ) -> DataFrame: + """ + Operates on the given pandas DataFrame based on the provided operators. + + :param data: DataFrame to be operated on. + :param key_to_operator_mapping: Mapping of keys to operator callables. + :return: DataFrame after the operation. 
+ """ + + if not isinstance(data, DataFrame): + raise ValueError("Data must be a pandas DataFrame") + + for key, operator_callable in key_to_operator_mapping.items(): + self.logger.debug(f"Operating on column {key}") + for row in data.itertuples(index=True): + text_to_operate_on = getattr(row, key) + operated_text = self._operate_on_text( + text_to_operate_on, operator_callable + ) + data.at[row.Index, key] = operated_text + return data + + +class JsonDataProcessor(DataProcessorBase): + """JSON Data Processor, Supports arbitrary nesting of dictionaries and lists.""" + + @staticmethod + def _get_nested_value(data: Union[Dict, List], path: List[str]) -> Any: + """ + Recursively retrieves the value from nested data using a given path. + + :param data: Nested data (list or dictionary). + :param path: List of keys/indexes representing the path. + :return: Retrieved value. + """ + for i, key in enumerate(path): + if isinstance(data, list): + if key.isdigit(): + data = data[int(key)] + else: + return [ + JsonDataProcessor._get_nested_value(item, path[i:]) + for item in data + ] + elif isinstance(data, dict): + data = data.get(key) + else: + return data + return data + + @staticmethod + def _set_nested_value(data: Union[Dict, List], path: List[str], value: Any) -> None: + """ + Recursively sets a value in nested data using a given path. + + :param data: Nested data (JSON-like). + :param path: List of keys/indexes representing the path. + :param value: Value to be set. 
+ """ + for i, key in enumerate(path): + if isinstance(data, list): + if i + 1 < len(path) and path[i + 1].isdigit(): + idx = int(path[i + 1]) + while len(data) <= idx: + data.append({}) + data = data[idx] + continue + else: + for item in data: + JsonDataProcessor._set_nested_value(item, path[i:], value) + return + elif isinstance(data, dict): + if i == len(path) - 1: + data[key] = value + else: + data = data.setdefault(key, {}) + + def _process( + self, + data: Union[Dict, List], + key_to_operator_mapping: Dict[str, Callable], + ) -> Union[Dict, List]: + """ + Operates on the given JSON-like data based on the provided configuration. + + :param data: JSON-like data to be operated on. + :param key_to_operator_mapping: maps keys to Callable operators. + :return: JSON-like data after the operation. + """ + + if not isinstance(data, (dict, list)): + raise ValueError("Data must be a JSON-like object") + + for key, operator_callable in key_to_operator_mapping.items(): + self.logger.debug(f"Operating on key {key}") + keys = key.split(".") + if isinstance(data, list): + for item in data: + self._process(item, key_to_operator_mapping) + else: + text_to_operate_on = self._get_nested_value(data, keys) + if text_to_operate_on: + if isinstance(text_to_operate_on, list): + for text in text_to_operate_on: + operated_text = self._operate_on_text( + text, operator_callable + ) + self._set_nested_value(data, keys, operated_text) + else: + operated_text = self._operate_on_text( + text_to_operate_on, operator_callable + ) + self._set_nested_value(data, keys, operated_text) + return data diff --git a/presidio-structured/presidio_structured/data/data_reader.py b/presidio-structured/presidio_structured/data/data_reader.py new file mode 100644 index 000000000..ab1d675a1 --- /dev/null +++ b/presidio-structured/presidio_structured/data/data_reader.py @@ -0,0 +1,70 @@ +"""Helper data classes, mostly simple wrappers to ensure consistent user interface.""" + +import json +from abc import ABC, 
abstractmethod +from pathlib import Path +from typing import Any, Dict, Union + +import pandas as pd + + +class ReaderBase(ABC): + """ + Base class for data readers. + + This class should not be instantiated directly, instead init a subclass. + """ + + @abstractmethod + def read(self, path: Union[str, Path], **kwargs) -> Any: + """ + Extract data from file located at path. + + :param path: String defining the location of the file to read. + :return: The data read from the file. + """ + pass + + +class CsvReader(ReaderBase): + """ + Reader for reading csv files. + + Usage:: + + reader = CsvReader() + data = reader.read(path="filepath.csv") + + """ + + def read(self, path: Union[str, Path], **kwargs) -> pd.DataFrame: + """ + Read csv file to pandas dataframe. + + :param path: String defining the location of the csv file to read. + :return: Pandas DataFrame with the data read from the csv file. + """ + return pd.read_csv(path, **kwargs) + + +class JsonReader(ReaderBase): + """ + Reader for reading json files. + + Usage:: + + reader = JsonReader() + data = reader.read(path="filepath.json") + + """ + + def read(self, path: Union[str, Path], **kwargs) -> Dict[str, Any]: + """ + Read json file to dict. + + :param path: String defining the location of the json file to read. + :return: dictionary with the data read from the json file. 
+ """ + with open(path) as f: + data = json.load(f, **kwargs) + return data diff --git a/presidio-structured/presidio_structured/structured_engine.py b/presidio-structured/presidio_structured/structured_engine.py new file mode 100644 index 000000000..e36dc86fb --- /dev/null +++ b/presidio-structured/presidio_structured/structured_engine.py @@ -0,0 +1,69 @@ +import logging +from typing import Dict, Union, Optional + +from pandas import DataFrame +from presidio_anonymizer.entities import OperatorConfig + +from presidio_structured.config import StructuredAnalysis +from presidio_structured.data.data_processors import ( + DataProcessorBase, + PandasDataProcessor, +) + +DEFAULT = "replace" + + +class StructuredEngine: + """Class to implement methods for anonymizing tabular data.""" + + def __init__(self, data_processor: Optional[DataProcessorBase] = None) -> None: + """ + Initialize the class with a data processor. + + :param data_processor: Instance of DataProcessorBase. + """ + if data_processor is None: + self.data_processor = PandasDataProcessor() + else: + self.data_processor = data_processor + + self.logger = logging.getLogger("presidio-structured") + + def anonymize( + self, + data: Union[Dict, DataFrame], + structured_analysis: StructuredAnalysis, + operators: Dict[str, OperatorConfig] = None, + ) -> Union[Dict, DataFrame]: + """ + Anonymize the given data using the given configuration. + + :param data: input data as dictionary or pandas DataFrame. + :param structured_analysis: structured analysis configuration. + :param operators: a dictionary of operator configurations, optional. + :return: Anonymized dictionary or DataFrame. 
+ """ + self.logger.debug("Starting anonymization") + operators = self.__check_or_add_default_operator(operators) + + return self.data_processor.operate(data, structured_analysis, operators) + + def __check_or_add_default_operator( + self, operators: Dict[str, OperatorConfig] + ) -> Dict[str, OperatorConfig]: + """ + Check if the provided operators dictionary has a default operator. \ + If not, add a default operator. + + :param operators: dictionary of operator configurations. + :return: operators dictionary with the default operator added \ + if it was not initially present. + """ + default_operator = OperatorConfig(DEFAULT) + if not operators: + self.logger.debug("No operators provided, using default operator") + return {"DEFAULT": default_operator} + if not operators.get("DEFAULT"): + self.logger.debug("No default operator provided, using default operator") + operators["DEFAULT"] = default_operator + return operators diff --git a/presidio-structured/setup.cfg b/presidio-structured/setup.cfg new file mode 100644 index 000000000..732559f8e --- /dev/null +++ b/presidio-structured/setup.cfg @@ -0,0 +1,10 @@ +[flake8] +max-line-length = 88 +exclude = + .git, + __pycache__, + build, + dist, + tests +docstring-convention = numpy +extend-ignore = E203 D100 D202 ANN101 ANN102 ANN204 ANN203 TC \ No newline at end of file diff --git a/presidio-structured/setup.py b/presidio-structured/setup.py new file mode 100644 index 000000000..bd85e70b0 --- /dev/null +++ b/presidio-structured/setup.py @@ -0,0 +1,52 @@ +#!/usr/bin/env python +# noqa: D100 +import os.path +from os import path + +from setuptools import setup, find_packages + +test_requirements = ["pytest>=3", "flake8==3.7.9"] + +__version__ = "" +this_directory = path.abspath(path.dirname(__file__)) +parent_directory = os.path.abspath(os.path.join(this_directory, os.pardir)) + +with open(path.join(this_directory, "README.md"), encoding="utf-8") as f: + long_description = f.read() + +try: + with 
open(os.path.join(parent_directory, "VERSION")) as version_file: + __version__ = version_file.read().strip() +except Exception: + __version__ = os.environ.get("PRESIDIO_VERSION", "0.0.1-alpha") + +setup( + name="presidio_structured", + python_requires=">=3.5", + version=__version__, + packages=find_packages(include=["presidio_structured", "presidio_structured.*"]), + classifiers=[ + "Intended Audience :: Developers", + "License :: OSI Approved :: MIT License", + "Natural Language :: English", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + ], + description="Presidio structured package - analyses and anonymizes \ + structured and semistructured data.", + license="MIT license", + include_package_data=True, + keywords="presidio_structured", + install_requires=["presidio-analyzer>=2.2", "presidio-anonymizer>=2.2"], + test_suite="tests", + tests_require=test_requirements, + url="https://github.com/microsoft/presidio", + zip_safe=False, + trusted_host=["pypi.org"], + long_description=long_description, + long_description_content_type="text/markdown", +) diff --git a/presidio-structured/tests/__init__.py b/presidio-structured/tests/__init__.py new file mode 100644 index 000000000..8b1378917 --- /dev/null +++ b/presidio-structured/tests/__init__.py @@ -0,0 +1 @@ + diff --git a/presidio-structured/tests/conftest.py b/presidio-structured/tests/conftest.py new file mode 100644 index 000000000..96bd0f7a1 --- /dev/null +++ b/presidio-structured/tests/conftest.py @@ -0,0 +1,94 @@ +""" Pytest fixtures for presidio-structured tests. 
""" + +import pandas as pd +import pytest +from presidio_anonymizer.entities import OperatorConfig +from presidio_structured import PandasAnalysisBuilder, JsonAnalysisBuilder +from presidio_structured.config import StructuredAnalysis + + +@pytest.fixture +def sample_df(): + data = { + "name": ["John Doe", "Jane Doe", "John Smith"], + "email": [ + "john@example.com", + "jane@example.com", + "johnsmith@example.com", + ], + "phone": ["1234567890", "0987654321", "1122334455"], + } + return pd.DataFrame(data) + + +@pytest.fixture +def sample_json(): + data = { + "id": 1, + "name": "John Doe", + "email": "john.doe@example.com", + "address": { + "street": "123 Main St", + "city": "Anytown", + "state": "CA", + "postal_code": "12345", + }, + } + return data + + +@pytest.fixture +def sample_json_with_array(): + data = { + "users": [ + {"id": 1, "name": "John Doe"}, + {"id": 2, "name": "Jane Doe"}, + ] + } + return data + + +@pytest.fixture +def json_analysis_builder(): + return JsonAnalysisBuilder() + + +@pytest.fixture +def tabular_analysis_builder(): + return PandasAnalysisBuilder() + + +@pytest.fixture +def operators(): + return { + "PERSON": OperatorConfig("replace", {"new_value": "PERSON_REPLACEMENT"}), + "DEFAULT": OperatorConfig("replace", {"new_value": "DEFAULT_REPLACEMENT"}), + } + + +@pytest.fixture +def operators_no_default(): + return { + "PERSON": OperatorConfig("replace", {"new_value": "PERSON_REPLACEMENT"}), + } + + +@pytest.fixture +def tabular_analysis(): + return StructuredAnalysis( + entity_mapping={ + "name": "PERSON", + "email": "EMAIL_ADDRESS", + "phone": "PHONE_NUMBER", + } + ) + + +@pytest.fixture +def json_analysis(): + return StructuredAnalysis( + entity_mapping={ + "name": "PERSON", + "address.city": "LOCATION", + } + ) diff --git a/presidio-structured/tests/data/test_data_transformers.py b/presidio-structured/tests/data/test_data_transformers.py new file mode 100644 index 000000000..c9bd365f2 --- /dev/null +++ 
b/presidio-structured/tests/data/test_data_transformers.py @@ -0,0 +1,65 @@ +import pytest +from pandas import DataFrame +from presidio_structured.data.data_processors import ( + DataProcessorBase, + PandasDataProcessor, + JsonDataProcessor, +) + + +class TestDataProcessorBase: + def test_abstract_init_raises(self, sample_df, tabular_analysis_builder, operators): + with pytest.raises(TypeError): + DataProcessorBase() + + +class TestPandasDataProcessor: + def test_process(self, sample_df, operators, tabular_analysis): + processor = PandasDataProcessor() + result = processor.operate(sample_df, tabular_analysis, operators) + assert isinstance(result, DataFrame) + for key in tabular_analysis.entity_mapping: + if key == "name": + assert all(result[key] == "PERSON_REPLACEMENT") + else: + assert all(result[key] == "DEFAULT_REPLACEMENT") + + def test_process_no_default_should_raise( + self, sample_df, operators_no_default, tabular_analysis + ): + processor = PandasDataProcessor() + with pytest.raises(ValueError): + processor.operate(sample_df, tabular_analysis, operators_no_default) + + def test_process_invalid_data(self, sample_json, tabular_analysis, operators): + processor = PandasDataProcessor() + with pytest.raises(ValueError): + processor.operate(sample_json, tabular_analysis, operators) + + +class TestJsonDataProcessor: + def test_process(self, sample_json, operators, json_analysis): + processor = JsonDataProcessor() + result = processor.operate(sample_json, json_analysis, operators) + assert isinstance(result, dict) + for key, value in json_analysis.entity_mapping.items(): + keys = key.split(".") + nested_value = sample_json + for inner_key in keys: + nested_value = nested_value[inner_key] + if value == "PERSON": + assert nested_value == "PERSON_REPLACEMENT" + else: + assert nested_value == "DEFAULT_REPLACEMENT" + + def test_process_no_default_should_raise( + self, sample_json, operators_no_default, json_analysis + ): + processor = JsonDataProcessor() + with 
pytest.raises(ValueError): + processor.operate(sample_json, json_analysis, operators_no_default) + + def test_process_invalid_data(self, sample_df, json_analysis, operators): + processor = JsonDataProcessor() + with pytest.raises(ValueError): + processor.operate(sample_df, json_analysis, operators) diff --git a/presidio-structured/tests/test_analysis_builder.py b/presidio-structured/tests/test_analysis_builder.py new file mode 100644 index 000000000..101f2f637 --- /dev/null +++ b/presidio-structured/tests/test_analysis_builder.py @@ -0,0 +1,106 @@ +""" Test the analysis builder """ + +import pandas as pd +import pytest + +from presidio_analyzer import AnalyzerEngine + +from presidio_structured import JsonAnalysisBuilder, PandasAnalysisBuilder + +# NOTE: we won't go into depth unit-testing all analyzers, as that is covered in the presidio-analyzer tests + + +def test_generate_analysis_tabular(tabular_analysis_builder, sample_df): + structured_analysis = tabular_analysis_builder.generate_analysis(sample_df) + + assert structured_analysis.entity_mapping["name"] == "PERSON" + assert structured_analysis.entity_mapping["email"] == "EMAIL_ADDRESS" + assert structured_analysis.entity_mapping["phone"] == "PHONE_NUMBER" + + +def test_generate_analysis_tabular_with_sampling(tabular_analysis_builder, sample_df): + structured_analysis = tabular_analysis_builder.generate_analysis(sample_df, n=2) + + assert len(structured_analysis.entity_mapping) == 3 + assert structured_analysis.entity_mapping["name"] == "PERSON" + assert structured_analysis.entity_mapping["email"] == "EMAIL_ADDRESS" + assert structured_analysis.entity_mapping["phone"] == "PHONE_NUMBER" + + +def test_generate_analysis_tabular_with_invalid_sampling( + tabular_analysis_builder, sample_df +): + with pytest.raises(ValueError): + tabular_analysis_builder.generate_analysis(sample_df, n=-1) + + +def test_find_most_common_entity(tabular_analysis_builder, sample_df): + key_recognizer_result_map = 
tabular_analysis_builder._generate_key_rec_results_map( + sample_df, "en" + ) + + assert len(key_recognizer_result_map) == 3 + assert key_recognizer_result_map["name"].entity_type == "PERSON" + assert key_recognizer_result_map["email"].entity_type == "EMAIL_ADDRESS" + assert key_recognizer_result_map["phone"].entity_type == "PHONE_NUMBER" + + +def test_find_most_common_entity_with_empty_df(tabular_analysis_builder): + df = pd.DataFrame() + key_recognizer_result_map = tabular_analysis_builder._generate_key_rec_results_map( + df, "en" + ) + + assert len(key_recognizer_result_map) == 0 + + +def test_analysis_tabular_when_default_threshold_is_half_then_phone_does_not_pass( + sample_df, +): + analyzer_engine = AnalyzerEngine(default_score_threshold=0.5) + tabular_analysis_builder = PandasAnalysisBuilder(analyzer_engine) + structured_analysis = tabular_analysis_builder.generate_analysis(sample_df) + + assert len(structured_analysis.entity_mapping) == 2 + + +def test_analysis_tabular_when_default_threshold_is_zero_then_all_results_pass( + sample_df, +): + analyzer_engine = AnalyzerEngine(default_score_threshold=0) + tabular_analysis_builder = PandasAnalysisBuilder(analyzer_engine) + structured_analysis = tabular_analysis_builder.generate_analysis(sample_df) + + assert len(structured_analysis.entity_mapping) == 3 + + +def test_generate_analysis_json(json_analysis_builder, sample_json): + structured_analysis = json_analysis_builder.generate_analysis(sample_json) + + assert structured_analysis.entity_mapping["name"] == "PERSON" + assert structured_analysis.entity_mapping["address.city"] == "LOCATION" + + +def test_generate_analysis_json_with_list_should_raise( + json_analysis_builder, sample_json_with_array +): + # this feature is not supported by the BatchAnalyzerEngine used in the JsonAnalysisBuilder + with pytest.raises(ValueError): + json_analysis_builder.generate_analysis(sample_json_with_array) + + +def 
test_generate_analysis_json_with_empty_data(json_analysis_builder): + data = {} + structured_analysis = json_analysis_builder.generate_analysis(data) + + assert len(structured_analysis.entity_mapping) == 0 + + +def test_analysis_json_when_default_threshold_is_high_then_only_email_passes( + sample_json, +): + analyzer_engine = AnalyzerEngine(default_score_threshold=0.9) + json_analysis_builder = JsonAnalysisBuilder(analyzer_engine) + structured_analysis = json_analysis_builder.generate_analysis(sample_json) + + assert len(structured_analysis.entity_mapping) == 1 diff --git a/presidio-structured/tests/test_tabular_engine.py b/presidio-structured/tests/test_tabular_engine.py new file mode 100644 index 000000000..87a9dd287 --- /dev/null +++ b/presidio-structured/tests/test_tabular_engine.py @@ -0,0 +1,70 @@ +from unittest.mock import Mock +import pandas as pd + +import pytest + +from presidio_anonymizer.entities import OperatorConfig + +from presidio_structured import StructuredEngine +from presidio_structured.data.data_processors import JsonDataProcessor + + +def test_structured_engine_anonymize_calls_data_processor_operate(): + # Arrange + data_processor = Mock() + structured_engine = StructuredEngine(data_processor) + data = Mock() + structured_analysis = Mock() + operators = {"DEFAULT": OperatorConfig("replace")} + + # Act + structured_engine.anonymize(data, structured_analysis, operators) + + # Assert + data_processor.operate.assert_called_once_with(data, structured_analysis, operators) + + +def test_structured_engine_anonymize_adds_default_operator_if_none_provided(): + # Arrange + data_processor = Mock() + structured_engine = StructuredEngine(data_processor) + data = Mock() + structured_analysis = Mock() + + # Act + structured_engine.anonymize(data, structured_analysis) + + # Assert + data_processor.operate.assert_called_once() + args, _ = data_processor.operate.call_args + assert "DEFAULT" in args[2] + + +def 
test_structured_engine_anonymize_doesnt_override_existing_default_operator(): + # Arrange + data_processor = Mock() + structured_engine = StructuredEngine(data_processor) + data = Mock() + structured_analysis = Mock() + operators = {"DEFAULT": OperatorConfig("custom")} + + # Act + structured_engine.anonymize(data, structured_analysis, operators) + + # Assert + data_processor.operate.assert_called_once_with(data, structured_analysis, operators) + + +def test_json_processor_with_pandas_dataframe_will_raise(tabular_analysis): + data_processor = JsonDataProcessor() + structured_engine = StructuredEngine(data_processor) + data = pd.DataFrame({"name": ["John", "Jane"]}) + with pytest.raises(ValueError): + structured_engine.anonymize(data, tabular_analysis) + + +def test_pandas_processor_with_json_will_raise(json_analysis): + structured_engine = StructuredEngine() # default PandasDataProcessor + data = {"name": ["John", "Jane"]} + with pytest.raises(ValueError): + structured_engine.anonymize(data, json_analysis)