From 4024d3dd55e6a22f65ff601c35bbecfcb25949a1 Mon Sep 17 00:00:00 2001 From: Dristy Srivastava <58721149+dristysrivastava@users.noreply.github.com> Date: Fri, 20 Sep 2024 10:02:16 +0530 Subject: [PATCH] Adding anonymize_snippets as an input field in /loader/doc API (#558) * Adding anonymize_snippets as an input field in /loader/doc API * Added type checking, and some private functions * Added UTs --- pebblo/app/api/req_models.py | 1 + pebblo/app/service/doc_helper.py | 24 ++- .../app/service/loader/loader_doc_service.py | 47 +++-- pebblo/app/service/service.py | 36 +++- tests/app/service/test_doc_helper.py | 178 ++++++++++++++++++ tests/app/service/test_loader_doc_service.py | 116 ++++++++++++ 6 files changed, 376 insertions(+), 26 deletions(-) create mode 100644 tests/app/service/test_doc_helper.py create mode 100644 tests/app/service/test_loader_doc_service.py diff --git a/pebblo/app/api/req_models.py b/pebblo/app/api/req_models.py index dc5c2bc1..00401e49 100644 --- a/pebblo/app/api/req_models.py +++ b/pebblo/app/api/req_models.py @@ -66,6 +66,7 @@ class ReqLoaderDoc(BaseModel): source_owner: str classifier_location: str classifier_mode: Optional[str] = None + anonymize_snippets: Optional[bool] = None class Context(BaseModel): diff --git a/pebblo/app/service/doc_helper.py b/pebblo/app/service/doc_helper.py index a823769c..6a0c8c82 100644 --- a/pebblo/app/service/doc_helper.py +++ b/pebblo/app/service/doc_helper.py @@ -7,7 +7,7 @@ from datetime import datetime from pebblo.app.enums.common import ClassificationMode -from pebblo.app.enums.enums import CacheDir, ClassifierConstants, ReportConstants +from pebblo.app.enums.enums import CacheDir, ReportConstants from pebblo.app.models.models import ( AiDataModel, AiDocs, @@ -37,16 +37,24 @@ class LoaderHelper: Class for loader doc related task """ - def __init__(self, app_details, data, load_id, classifier_mode): + def __init__( + self, + app_details: dict, + data: dict, + load_id: str, + classifier_mode: str = "all", + 
anonymize_snippets: bool = False, + ): self.app_details = app_details self.data = data self.load_id = load_id self.loader_mapper = {} self.classifier_mode = classifier_mode + self.anonymize_snippets = anonymize_snippets self.entity_classifier_obj = EntityClassifier() # Initialization - def _initialize_raw_data(self): + def _initialize_raw_data(self) -> dict: """ Initializing raw data and return as dict object """ @@ -69,7 +77,7 @@ def _initialize_raw_data(self): return raw_data @staticmethod - def _fetch_variables(raw_data): + def _fetch_variables(raw_data: dict): """ Return list of variable's """ @@ -111,7 +119,7 @@ def _update_raw_data( ) # Model Creation - def _create_doc_model(self, doc, doc_info): + def _create_doc_model(self, doc: dict, doc_info: AiDataModel) -> dict: """ Create doc model and return its object """ @@ -163,7 +171,7 @@ def _get_top_n_findings(raw_data): ] return top_n_findings - def _count_files_with_findings(self): + def _count_files_with_findings(self) -> int: """ Return the count of files that have associated findings. 
""" @@ -176,7 +184,7 @@ def _count_files_with_findings(self): files_with_findings_count += 1 return files_with_findings_count - def _get_classifier_response(self, doc): + def _get_classifier_response(self, doc: dict) -> AiDataModel: doc_info = AiDataModel( data=doc.get("doc", None), entities={}, @@ -209,7 +217,7 @@ def _get_classifier_response(self, doc): entity_details, ) = self.entity_classifier_obj.presidio_entity_classifier_and_anonymizer( doc_info.data, - anonymize_snippets=ClassifierConstants.anonymize_snippets.value, + anonymize_snippets=self.anonymize_snippets, ) doc_info.entities = entities doc_info.entityDetails = entity_details diff --git a/pebblo/app/service/loader/loader_doc_service.py b/pebblo/app/service/loader/loader_doc_service.py index 8a7f5bb3..5549b0a6 100644 --- a/pebblo/app/service/loader/loader_doc_service.py +++ b/pebblo/app/service/loader/loader_doc_service.py @@ -5,7 +5,7 @@ from pebblo.app.config.config import var_server_config_dict from pebblo.app.enums.common import ClassificationMode -from pebblo.app.enums.enums import ApplicationTypes, CacheDir, ClassifierConstants +from pebblo.app.enums.enums import ApplicationTypes, CacheDir from pebblo.app.libs.responses import PebbloJsonResponse from pebblo.app.models.db_models import ( AiDataModel, @@ -40,8 +40,16 @@ def __init__(self): self.data = None self.app_name = None self.classifier_mode = None + self.anonymize_snippets = None self.entity_classifier_obj = EntityClassifier() + def _initialize_data(self, data: dict): + self.db = SQLiteClient() + self.data = data + self.app_name = data.get("name") + self._set_classifier_mode() + self._set_anonymize_snippets() + @staticmethod def _create_return_response(message, output=None, status_code=200): if output is None: @@ -202,7 +210,7 @@ def _get_doc_classification(self, doc): entity_details, ) = self.entity_classifier_obj.presidio_entity_classifier_and_anonymizer( doc_info.data, - anonymize_snippets=ClassifierConstants.anonymize_snippets.value, + 
anonymize_snippets=self.anonymize_snippets, ) doc_info.entities = entities doc_info.entityDetails = entity_details @@ -276,19 +284,34 @@ def _get_or_create_data_source(self): logger.debug("Data Source has been created successfully.") return data_source_obj.data + def _set_classifier_mode(self): + """ + This function defines the value of the classifier_mode: if it is included in the API request, + it will be used; otherwise, the value will be taken from the config. + """ + if not self.data.get("classifier_mode"): + self.classifier_mode = config_details.get("classifier", {}).get( + "mode", ClassificationMode.ALL.value + ) + else: + self.classifier_mode = self.data.get("classifier_mode") + + def _set_anonymize_snippets(self): + """ + This function defines the value of the anonymize_snippets: if it is included in the API request, + it will be used; otherwise, the value will be taken from the config. + """ + if not self.data.get("anonymize_snippets"): + self.anonymize_snippets = config_details.get("classifier", {}).get( + "anonymizeSnippets", False + ) + else: + self.anonymize_snippets = self.data.get("anonymize_snippets") + @timeit def process_request(self, data): try: - self.db = SQLiteClient() - self.data = data - self.app_name = data.get("name") - - if not self.data.get("classifier_mode"): - self.classifier_mode = config_details.get("classifier", {}).get( - "mode", ClassificationMode.ALL.value - ) - else: - self.classifier_mode = self.data.get("classifier_mode") + self._initialize_data(data) # create session self.db.create_session() diff --git a/pebblo/app/service/service.py b/pebblo/app/service/service.py index e8ce7912..31dffd84 100644 --- a/pebblo/app/service/service.py +++ b/pebblo/app/service/service.py @@ -30,10 +30,13 @@ def __init__(self): self.data = None self.app_name = None self.classifier_mode = None + self.anonymize_snippets = None - def _initialize_data(self, data): + def _initialize_data(self, data: dict): self.data = data self.app_name = 
data.get("name") + self._set_classifier_mode() + self._set_anonymize_snippets() def _write_pdf_report(self, final_report): """ @@ -122,17 +125,34 @@ def _upsert_loader_details(self, app_details): loader_list.append(new_loader_data.model_dump()) app_details["loaders"] = loader_list - def process_request(self, data): + def _set_classifier_mode(self): """ - This process is entrypoint function for loader doc API implementation. + This function defines the value of the classifier_mode: if it is included in the API request, + it will be used; otherwise, the value will be taken from the config. """ - if not data.get("classifier_mode"): + if not self.data.get("classifier_mode"): self.classifier_mode = config_details.get("classifier", {}).get( "mode", ClassificationMode.ALL.value ) else: - self.classifier_mode = data.get("classifier_mode") + self.classifier_mode = self.data.get("classifier_mode") + + def _set_anonymize_snippets(self): + """ + This function defines the value of the anonymize_snippets: if it is included in the API request, + it will be used; otherwise, the value will be taken from the config. + """ + if not self.data.get("anonymize_snippets"): + self.anonymize_snippets = config_details.get("classifier", {}).get( + "anonymizeSnippets", False + ) + else: + self.anonymize_snippets = self.data.get("anonymize_snippets") + def process_request(self, data: dict): + """ + This process is entrypoint function for loader doc API implementation. 
+ """ self._initialize_data(data) try: @@ -173,7 +193,11 @@ def process_request(self, data): # process input docs, app details, and generate final report loader_helper_obj = LoaderHelper( - app_details, self.data, load_id, self.classifier_mode + app_details, + self.data, + load_id, + self.classifier_mode, + self.anonymize_snippets, ) ( app_details, diff --git a/tests/app/service/test_doc_helper.py b/tests/app/service/test_doc_helper.py new file mode 100644 index 00000000..dc6147ca --- /dev/null +++ b/tests/app/service/test_doc_helper.py @@ -0,0 +1,178 @@ +import datetime + +from pebblo.app.service.doc_helper import LoaderHelper + +app_details = { + "metadata": { + "createdAt": "2024-09-19 11:41:18.192182", + "modifiedAt": "2024-09-19 11:41:18.192183", + }, + "name": "UnitTestApp", + "description": "Loader App using Pebblo", + "owner": "AppOwner", + "pluginVersion": "0.1.1", + "instanceDetails": { + "type": "desktop", + "host": "AppOwner-MBP", + "path": "/home/data/scripts", + "runtime": "Mac OSX", + "ip": "192.168.1.39", + "language": "python", + "languageVersion": "3.11.9", + "platform": "macOS-14.6.1-arm64-i386-64bit", + "os": "Darwin", + "osVersion": "Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:04 PDT 2024; root:xnu-10063.141.2~1/RELEASE_ARM64_T6020", + "createdAt": "2024-09-19 11:41:18.192116", + }, + "framework": {"name": "langchain", "version": "0.2.35"}, + "lastUsed": "2024-09-19 11:41:18.192181", + "pebbloServerVersion": "0.1.19", + "pebbloClientVersion": "0.1.1", + "clientVersion": {"name": "langchain_community", "version": "0.2.12"}, + "loaders": [ + { + "name": "TextLoader", + "sourcePath": "/home/data/sample.txt", + "sourceType": "unsupported", + "sourceSize": 211, + "sourceFiles": [], + "lastModified": datetime.datetime.now(), + } + ], +} + +classifier_response_input_doc = { + "doc": "Sachin's SSN is 222-85-4836. His passport ID is 5484880UA. His American express credit card number is\n371449635398431. AWS Access Key AKIAQIPT4PDORIRTV6PH. 
client-secret is de1d4a2d-d9fa-44f1-84bb-4f73c004afda\n", + "source_path": "/home/data/sens_data.csv", + "last_modified": datetime.datetime.now(), + "file_owner": "fileOwner", + "source_path_size": 211, +} + +data = { + "name": "UnitTestApp", + "owner": "AppOwner", + "docs": [classifier_response_input_doc], + "plugin_version": "0.1.1", + "load_id": "26db970f-b4c6-44ae-9235-8f18236695a1", + "loader_details": { + "loader": "TextLoader", + "source_path": "/home/data/sample.txt", + "source_type": "unsupported", + "source_path_size": "211", + "source_aggregate_size": 211, + }, + "loading_end": True, + "source_owner": "AppOwner", + "classifier_location": "local", + "classifier_mode": None, + "anonymize_snippets": None, +} + +expected_output = { + "data": "Sachin's SSN is 222-85-4836. His passport ID is 5484880UA. His American express credit card number is\n371449635398431. AWS Access Key AKIAQIPT4PDORIRTV6PH. client-secret is de1d4a2d-d9fa-44f1-84bb-4f73c004afda\n", + "entityCount": 3, + "entities": {"us-ssn": 1, "credit-card-number": 1, "aws-access-key": 1}, + "entityDetails": { + "us-ssn": [ + { + "location": "16_27", + "confidence_score": "HIGH", + "entity_group": "pii-identification", + } + ], + "credit-card-number": [ + { + "location": "102_117", + "confidence_score": "HIGH", + "entity_group": "pii-financial", + } + ], + "aws-access-key": [ + { + "location": "134_154", + "confidence_score": "HIGH", + "entity_group": "secrets_and_tokens", + } + ], + }, + "topicCount": 0, + "topics": {}, + "topicDetails": {}, +} + + +def test_get_classifier_response(): + loader_helper = LoaderHelper(app_details, data=data, load_id=data.get("load_id")) + output = loader_helper._get_classifier_response(classifier_response_input_doc) + assert output.model_dump() == expected_output + + +def test_get_classifier_response_classifier_mode_entity(): + loader_helper_classifier_mode_entity = LoaderHelper( + app_details, data=data, load_id=data.get("load_id"), classifier_mode="entity" + ) + 
output = loader_helper_classifier_mode_entity._get_classifier_response( + classifier_response_input_doc + ) + + assert output.model_dump() == expected_output + + +def test_get_classifier_response_classifier_mode_topic(): + loader_helper_classifier_mode_topic = LoaderHelper( + app_details, data=data, load_id=data.get("load_id"), classifier_mode="topic" + ) + output = loader_helper_classifier_mode_topic._get_classifier_response( + classifier_response_input_doc + ) + + expected_output.update( + { + "entityCount": 0, + "entities": {}, + "entityDetails": {}, + } + ) + assert output.model_dump() == expected_output + + +def test_get_classifier_response_anonymize_true(): + loader_helper_anonymize_true = LoaderHelper( + app_details, data=data, load_id=data.get("load_id"), anonymize_snippets=True + ) + output = loader_helper_anonymize_true._get_classifier_response( + classifier_response_input_doc + ) + + expected_output.update( + { + "data": "Sachin's SSN is <US_SSN>. His passport ID is 5484880UA. His American express credit card number is\n<CREDIT_CARD>. AWS Access Key <AWS_ACCESS_KEY>. 
client-secret is de1d4a2d-d9fa-44f1-84bb-4f73c004afda\n", + "entities": {"aws-access-key": 1, "credit-card-number": 1, "us-ssn": 1}, + "entityCount": 3, + "entityDetails": { + "aws-access-key": [ + { + "confidence_score": "HIGH", + "entity_group": "secrets_and_tokens", + "location": "141_163", + } + ], + "credit-card-number": [ + { + "confidence_score": "HIGH", + "entity_group": "pii-financial", + "location": "105_124", + } + ], + "us-ssn": [ + { + "confidence_score": "HIGH", + "entity_group": "pii-identification", + "location": "16_30", + } + ], + }, + } + ) + assert output.model_dump() == expected_output diff --git a/tests/app/service/test_loader_doc_service.py b/tests/app/service/test_loader_doc_service.py new file mode 100644 index 00000000..52d25e58 --- /dev/null +++ b/tests/app/service/test_loader_doc_service.py @@ -0,0 +1,116 @@ +import datetime + +import pytest + +from pebblo.app.enums.common import ClassificationMode +from pebblo.app.service.loader.loader_doc_service import AppLoaderDoc + +classifier_response_input_doc = { + "doc": "Sachin's SSN is 222-85-4836. His passport ID is 5484880UA. His American express credit card number is\n371449635398431. AWS Access Key AKIAQIPT4PDORIRTV6PH. client-secret is de1d4a2d-d9fa-44f1-84bb-4f73c004afda\n", + "source_path": "/home/data/sens_data.csv", + "last_modified": datetime.datetime.now(), + "file_owner": "fileOwner", + "source_path_size": 211, +} + +expected_output = { + "data": "Sachin's SSN is 222-85-4836. His passport ID is 5484880UA. His American express credit card number is\n371449635398431. AWS Access Key AKIAQIPT4PDORIRTV6PH. 
client-secret is de1d4a2d-d9fa-44f1-84bb-4f73c004afda\n", + "entityCount": 3, + "entities": {"us-ssn": 1, "credit-card-number": 1, "aws-access-key": 1}, + "entityDetails": { + "us-ssn": [ + { + "location": "16_27", + "confidence_score": "HIGH", + "entity_group": "pii-identification", + } + ], + "credit-card-number": [ + { + "location": "102_117", + "confidence_score": "HIGH", + "entity_group": "pii-financial", + } + ], + "aws-access-key": [ + { + "location": "134_154", + "confidence_score": "HIGH", + "entity_group": "secrets_and_tokens", + } + ], + }, + "topicCount": 0, + "topics": {}, + "topicDetails": {}, +} + + +@pytest.fixture +def app_loader_helper(): + return AppLoaderDoc() + + +def test_get_classifier_response(app_loader_helper): + app_loader_helper.classifier_mode = ClassificationMode.ALL.value + app_loader_helper.anonymize_snippets = False + output = app_loader_helper._get_doc_classification(classifier_response_input_doc) + assert output.model_dump() == expected_output + + +def test_get_classifier_response_classifier_mode_entity(app_loader_helper): + app_loader_helper.classifier_mode = ClassificationMode.ENTITY.value + app_loader_helper.anonymize_snippets = False + output = app_loader_helper._get_doc_classification(classifier_response_input_doc) + assert output.model_dump() == expected_output + + +def test_get_classifier_response_classifier_mode_topic(app_loader_helper): + app_loader_helper.classifier_mode = ClassificationMode.TOPIC.value + app_loader_helper.anonymize_snippets = False + output = app_loader_helper._get_doc_classification(classifier_response_input_doc) + expected_output.update( + { + "entityCount": 0, + "entities": {}, + "entityDetails": {}, + } + ) + assert output.model_dump() == expected_output + + +def test_get_classifier_response_anonymize_true(app_loader_helper): + app_loader_helper.classifier_mode = ClassificationMode.ALL.value + app_loader_helper.anonymize_snippets = True + output = 
app_loader_helper._get_doc_classification(classifier_response_input_doc) + expected_output.update( + { + "data": "Sachin's SSN is <US_SSN>. His passport ID is 5484880UA. His American express credit card number is\n<CREDIT_CARD>. AWS Access Key <AWS_ACCESS_KEY>. client-secret is de1d4a2d-d9fa-44f1-84bb-4f73c004afda\n", + "entities": {"aws-access-key": 1, "credit-card-number": 1, "us-ssn": 1}, + "entityCount": 3, + "entityDetails": { + "aws-access-key": [ + { + "confidence_score": "HIGH", + "entity_group": "secrets_and_tokens", + "location": "141_163", + } + ], + "credit-card-number": [ + { + "confidence_score": "HIGH", + "entity_group": "pii-financial", + "location": "105_124", + } + ], + "us-ssn": [ + { + "confidence_score": "HIGH", + "entity_group": "pii-identification", + "location": "16_30", + } + ], + }, + } + ) + assert output.model_dump() == expected_output