From 4024d3dd55e6a22f65ff601c35bbecfcb25949a1 Mon Sep 17 00:00:00 2001
From: Dristy Srivastava <58721149+dristysrivastava@users.noreply.github.com>
Date: Fri, 20 Sep 2024 10:02:16 +0530
Subject: [PATCH] Adding anonymize_snippets as an input field in /loader/doc API
 (#558)

* Adding anonymize_snippets as an input field in /loader/doc API

* Added type checking, and some private functions

* Added UTs
---
 pebblo/app/api/req_models.py                  |   1 +
 pebblo/app/service/doc_helper.py              |  24 ++-
 .../app/service/loader/loader_doc_service.py  |  47 +++--
 pebblo/app/service/service.py                 |  36 +++-
 tests/app/service/test_doc_helper.py          | 178 ++++++++++++++++++
 tests/app/service/test_loader_doc_service.py  | 116 ++++++++++++
 6 files changed, 376 insertions(+), 26 deletions(-)
 create mode 100644 tests/app/service/test_doc_helper.py
 create mode 100644 tests/app/service/test_loader_doc_service.py

diff --git a/pebblo/app/api/req_models.py b/pebblo/app/api/req_models.py
index dc5c2bc1..00401e49 100644
--- a/pebblo/app/api/req_models.py
+++ b/pebblo/app/api/req_models.py
@@ -66,6 +66,7 @@ class ReqLoaderDoc(BaseModel):
     source_owner: str
     classifier_location: str
     classifier_mode: Optional[str] = None
+    anonymize_snippets: Optional[bool] = None
 
 
 class Context(BaseModel):
diff --git a/pebblo/app/service/doc_helper.py b/pebblo/app/service/doc_helper.py
index a823769c..6a0c8c82 100644
--- a/pebblo/app/service/doc_helper.py
+++ b/pebblo/app/service/doc_helper.py
@@ -7,7 +7,7 @@
 from datetime import datetime
 
 from pebblo.app.enums.common import ClassificationMode
-from pebblo.app.enums.enums import CacheDir, ClassifierConstants, ReportConstants
+from pebblo.app.enums.enums import CacheDir, ReportConstants
 from pebblo.app.models.models import (
     AiDataModel,
     AiDocs,
@@ -37,16 +37,24 @@ class LoaderHelper:
     Class for loader doc related task
     """
 
-    def __init__(self, app_details, data, load_id, classifier_mode):
+    def __init__(
+        self,
+        app_details: dict,
+        data: dict,
+        load_id: str,
+        classifier_mode: str = "all",
+        anonymize_snippets: bool = False,
+    ):
         self.app_details = app_details
         self.data = data
         self.load_id = load_id
         self.loader_mapper = {}
         self.classifier_mode = classifier_mode
+        self.anonymize_snippets = anonymize_snippets
         self.entity_classifier_obj = EntityClassifier()
 
     # Initialization
-    def _initialize_raw_data(self):
+    def _initialize_raw_data(self) -> dict:
         """
         Initializing raw data and return as dict object
         """
@@ -69,7 +77,7 @@ def _initialize_raw_data(self):
         return raw_data
 
     @staticmethod
-    def _fetch_variables(raw_data):
+    def _fetch_variables(raw_data: dict):
         """
         Return list of variable's
         """
@@ -111,7 +119,7 @@ def _update_raw_data(
         )
 
     # Model Creation
-    def _create_doc_model(self, doc, doc_info):
+    def _create_doc_model(self, doc: dict, doc_info: AiDataModel) -> dict:
         """
         Create doc model and return its object
         """
@@ -163,7 +171,7 @@ def _get_top_n_findings(raw_data):
         ]
         return top_n_findings
 
-    def _count_files_with_findings(self):
+    def _count_files_with_findings(self) -> int:
         """
         Return the count of files that have associated findings.
         """
@@ -176,7 +184,7 @@ def _count_files_with_findings(self):
                     files_with_findings_count += 1
         return files_with_findings_count
 
-    def _get_classifier_response(self, doc):
+    def _get_classifier_response(self, doc: dict) -> AiDataModel:
         doc_info = AiDataModel(
             data=doc.get("doc", None),
             entities={},
@@ -209,7 +217,7 @@ def _get_classifier_response(self, doc):
                         entity_details,
                     ) = self.entity_classifier_obj.presidio_entity_classifier_and_anonymizer(
                         doc_info.data,
-                        anonymize_snippets=ClassifierConstants.anonymize_snippets.value,
+                        anonymize_snippets=self.anonymize_snippets,
                     )
                     doc_info.entities = entities
                     doc_info.entityDetails = entity_details
diff --git a/pebblo/app/service/loader/loader_doc_service.py b/pebblo/app/service/loader/loader_doc_service.py
index 8a7f5bb3..5549b0a6 100644
--- a/pebblo/app/service/loader/loader_doc_service.py
+++ b/pebblo/app/service/loader/loader_doc_service.py
@@ -5,7 +5,7 @@
 
 from pebblo.app.config.config import var_server_config_dict
 from pebblo.app.enums.common import ClassificationMode
-from pebblo.app.enums.enums import ApplicationTypes, CacheDir, ClassifierConstants
+from pebblo.app.enums.enums import ApplicationTypes, CacheDir
 from pebblo.app.libs.responses import PebbloJsonResponse
 from pebblo.app.models.db_models import (
     AiDataModel,
@@ -40,8 +40,16 @@ def __init__(self):
         self.data = None
         self.app_name = None
         self.classifier_mode = None
+        self.anonymize_snippets = None
         self.entity_classifier_obj = EntityClassifier()
 
+    def _initialize_data(self, data: dict):
+        self.db = SQLiteClient()
+        self.data = data  # assigned first: the _set_* helpers below read self.data
+        self.app_name = data.get("name")
+        self._set_classifier_mode()  # request value if truthy, else config
+        self._set_anonymize_snippets()
+
     @staticmethod
     def _create_return_response(message, output=None, status_code=200):
         if output is None:
@@ -202,7 +210,7 @@ def _get_doc_classification(self, doc):
                         entity_details,
                     ) = self.entity_classifier_obj.presidio_entity_classifier_and_anonymizer(
                         doc_info.data,
-                        anonymize_snippets=ClassifierConstants.anonymize_snippets.value,
+                        anonymize_snippets=self.anonymize_snippets,
                     )
                     doc_info.entities = entities
                     doc_info.entityDetails = entity_details
@@ -276,19 +284,34 @@ def _get_or_create_data_source(self):
         logger.debug("Data Source has been created successfully.")
         return data_source_obj.data
 
+    def _set_classifier_mode(self):
+        """
+        Resolve classifier_mode: a truthy value in the API request is used as-is;
+        otherwise the mode is read from the config, defaulting to ALL.
+        """
+        requested_mode = self.data.get("classifier_mode")
+        if requested_mode:
+            self.classifier_mode = requested_mode
+            return
+        classifier_cfg = config_details.get("classifier", {})
+        self.classifier_mode = classifier_cfg.get("mode", ClassificationMode.ALL.value)
+
+    def _set_anonymize_snippets(self):
+        """
+        Resolve anonymize_snippets: an explicit True/False in the API request wins;
+        only a missing/None value falls back to the config (default False).
+        """
+        requested = self.data.get("anonymize_snippets")
+        # `is None`, not truthiness, so an explicit False can override the config.
+        if requested is None:
+            classifier_cfg = config_details.get("classifier", {})
+            requested = classifier_cfg.get("anonymizeSnippets", False)
+        self.anonymize_snippets = requested
+
     @timeit
     def process_request(self, data):
         try:
-            self.db = SQLiteClient()
-            self.data = data
-            self.app_name = data.get("name")
-
-            if not self.data.get("classifier_mode"):
-                self.classifier_mode = config_details.get("classifier", {}).get(
-                    "mode", ClassificationMode.ALL.value
-                )
-            else:
-                self.classifier_mode = self.data.get("classifier_mode")
+            self._initialize_data(data)
 
             # create session
             self.db.create_session()
diff --git a/pebblo/app/service/service.py b/pebblo/app/service/service.py
index e8ce7912..31dffd84 100644
--- a/pebblo/app/service/service.py
+++ b/pebblo/app/service/service.py
@@ -30,10 +30,13 @@ def __init__(self):
         self.data = None
         self.app_name = None
         self.classifier_mode = None
+        self.anonymize_snippets = None
 
-    def _initialize_data(self, data):
+    def _initialize_data(self, data: dict):
         self.data = data
         self.app_name = data.get("name")
+        self._set_classifier_mode()
+        self._set_anonymize_snippets()
 
     def _write_pdf_report(self, final_report):
         """
@@ -122,17 +125,34 @@ def _upsert_loader_details(self, app_details):
                 loader_list.append(new_loader_data.model_dump())
                 app_details["loaders"] = loader_list
 
-    def process_request(self, data):
+    def _set_classifier_mode(self):
         """
-        This process is entrypoint function for loader doc API implementation.
+        This function defines the value of the classifier_mode: if it is included in the API request,
+        it will be used; otherwise, the value will be taken from the config.
         """
-        if not data.get("classifier_mode"):
+        if not self.data.get("classifier_mode"):
             self.classifier_mode = config_details.get("classifier", {}).get(
                 "mode", ClassificationMode.ALL.value
             )
         else:
-            self.classifier_mode = data.get("classifier_mode")
+            self.classifier_mode = self.data.get("classifier_mode")
+
+    def _set_anonymize_snippets(self):
+        """
+        Resolve anonymize_snippets: an explicit True/False in the API request wins;
+        only a missing/None value falls back to the config (default False).
+        """
+        requested = self.data.get("anonymize_snippets")
+        # `is None`, not truthiness, so an explicit False can override the config.
+        if requested is None:
+            classifier_cfg = config_details.get("classifier", {})
+            requested = classifier_cfg.get("anonymizeSnippets", False)
+        self.anonymize_snippets = requested
 
+    def process_request(self, data: dict):
+        """
+        This process is entrypoint function for loader doc API implementation.
+        """
         self._initialize_data(data)
 
         try:
@@ -173,7 +193,11 @@ def process_request(self, data):
 
             # process input docs, app details, and generate final report
             loader_helper_obj = LoaderHelper(
-                app_details, self.data, load_id, self.classifier_mode
+                app_details,
+                self.data,
+                load_id,
+                self.classifier_mode,
+                self.anonymize_snippets,
             )
             (
                 app_details,
diff --git a/tests/app/service/test_doc_helper.py b/tests/app/service/test_doc_helper.py
new file mode 100644
index 00000000..dc6147ca
--- /dev/null
+++ b/tests/app/service/test_doc_helper.py
@@ -0,0 +1,178 @@
+import datetime
+
+from pebblo.app.service.doc_helper import LoaderHelper
+
+app_details = {
+    "metadata": {
+        "createdAt": "2024-09-19 11:41:18.192182",
+        "modifiedAt": "2024-09-19 11:41:18.192183",
+    },
+    "name": "UnitTestApp",
+    "description": "Loader App using Pebblo",
+    "owner": "AppOwner",
+    "pluginVersion": "0.1.1",
+    "instanceDetails": {
+        "type": "desktop",
+        "host": "AppOwner-MBP",
+        "path": "/home/data/scripts",
+        "runtime": "Mac OSX",
+        "ip": "192.168.1.39",
+        "language": "python",
+        "languageVersion": "3.11.9",
+        "platform": "macOS-14.6.1-arm64-i386-64bit",
+        "os": "Darwin",
+        "osVersion": "Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:04 PDT 2024; root:xnu-10063.141.2~1/RELEASE_ARM64_T6020",
+        "createdAt": "2024-09-19 11:41:18.192116",
+    },
+    "framework": {"name": "langchain", "version": "0.2.35"},
+    "lastUsed": "2024-09-19 11:41:18.192181",
+    "pebbloServerVersion": "0.1.19",
+    "pebbloClientVersion": "0.1.1",
+    "clientVersion": {"name": "langchain_community", "version": "0.2.12"},
+    "loaders": [
+        {
+            "name": "TextLoader",
+            "sourcePath": "/home/data/sample.txt",
+            "sourceType": "unsupported",
+            "sourceSize": 211,
+            "sourceFiles": [],
+            "lastModified": datetime.datetime.now(),
+        }
+    ],
+}
+
+classifier_response_input_doc = {
+    "doc": "Sachin's SSN is 222-85-4836. His passport ID is 5484880UA. His American express credit card number is\n371449635398431. AWS Access Key AKIAQIPT4PDORIRTV6PH. client-secret is de1d4a2d-d9fa-44f1-84bb-4f73c004afda\n",
+    "source_path": "/home/data/sens_data.csv",
+    "last_modified": datetime.datetime.now(),
+    "file_owner": "fileOwner",
+    "source_path_size": 211,
+}
+
+data = {
+    "name": "UnitTestApp",
+    "owner": "AppOwner",
+    "docs": [classifier_response_input_doc],
+    "plugin_version": "0.1.1",
+    "load_id": "26db970f-b4c6-44ae-9235-8f18236695a1",
+    "loader_details": {
+        "loader": "TextLoader",
+        "source_path": "/home/data/sample.txt",
+        "source_type": "unsupported",
+        "source_path_size": "211",
+        "source_aggregate_size": 211,
+    },
+    "loading_end": True,
+    "source_owner": "AppOwner",
+    "classifier_location": "local",
+    "classifier_mode": None,
+    "anonymize_snippets": None,
+}
+
+expected_output = {
+    "data": "Sachin's SSN is 222-85-4836. His passport ID is 5484880UA. His American express credit card number is\n371449635398431. AWS Access Key AKIAQIPT4PDORIRTV6PH. client-secret is de1d4a2d-d9fa-44f1-84bb-4f73c004afda\n",
+    "entityCount": 3,
+    "entities": {"us-ssn": 1, "credit-card-number": 1, "aws-access-key": 1},
+    "entityDetails": {
+        "us-ssn": [
+            {
+                "location": "16_27",
+                "confidence_score": "HIGH",
+                "entity_group": "pii-identification",
+            }
+        ],
+        "credit-card-number": [
+            {
+                "location": "102_117",
+                "confidence_score": "HIGH",
+                "entity_group": "pii-financial",
+            }
+        ],
+        "aws-access-key": [
+            {
+                "location": "134_154",
+                "confidence_score": "HIGH",
+                "entity_group": "secrets_and_tokens",
+            }
+        ],
+    },
+    "topicCount": 0,
+    "topics": {},
+    "topicDetails": {},
+}
+
+
+def test_get_classifier_response():
+    default_helper = LoaderHelper(app_details, data, data.get("load_id"))
+    classified = default_helper._get_classifier_response(classifier_response_input_doc)
+    assert classified.model_dump() == expected_output
+
+
+def test_get_classifier_response_classifier_mode_entity():
+    """Entity-only mode is expected to match the shared baseline output."""
+    load_id = data.get("load_id")
+    entity_helper = LoaderHelper(
+        app_details, data=data, load_id=load_id, classifier_mode="entity"
+    )
+    classified = entity_helper._get_classifier_response(classifier_response_input_doc)
+
+    assert classified.model_dump() == expected_output
+
+
+def test_get_classifier_response_classifier_mode_topic():
+    loader_helper_classifier_mode_topic = LoaderHelper(
+        app_details, data=data, load_id=data.get("load_id"), classifier_mode="topic"
+    )
+    output = loader_helper_classifier_mode_topic._get_classifier_response(
+        classifier_response_input_doc
+    )
+    # Build a per-test copy: mutating the shared module-level expected_output
+    # would make the other tests in this file depend on execution order.
+    expected = {
+        **expected_output,
+        "entityCount": 0,
+        "entities": {},
+        "entityDetails": {},
+    }
+    assert output.model_dump() == expected
+
+
+def test_get_classifier_response_anonymize_true():
+    loader_helper_anonymize_true = LoaderHelper(
+        app_details, data=data, load_id=data.get("load_id"), anonymize_snippets=True
+    )
+    output = loader_helper_anonymize_true._get_classifier_response(
+        classifier_response_input_doc
+    )
+
+    # Per-test copy: the shared module-level expected_output must stay untouched.
+    expected = {
+        **expected_output,
+        "data": "Sachin's SSN is &lt;US_SSN&gt;. His passport ID is 5484880UA. His American express credit card number is\n&lt;CREDIT_CARD&gt;. AWS Access Key &lt;AWS_ACCESS_KEY&gt;. client-secret is de1d4a2d-d9fa-44f1-84bb-4f73c004afda\n",
+        "entities": {"aws-access-key": 1, "credit-card-number": 1, "us-ssn": 1},
+        "entityCount": 3,
+        "entityDetails": {
+            "aws-access-key": [
+                {
+                    "confidence_score": "HIGH",
+                    "entity_group": "secrets_and_tokens",
+                    "location": "141_163",
+                }
+            ],
+            "credit-card-number": [
+                {
+                    "confidence_score": "HIGH",
+                    "entity_group": "pii-financial",
+                    "location": "105_124",
+                }
+            ],
+            "us-ssn": [
+                {
+                    "confidence_score": "HIGH",
+                    "entity_group": "pii-identification",
+                    "location": "16_30",
+                }
+            ],
+        },
+    }
+    assert output.model_dump() == expected
diff --git a/tests/app/service/test_loader_doc_service.py b/tests/app/service/test_loader_doc_service.py
new file mode 100644
index 00000000..52d25e58
--- /dev/null
+++ b/tests/app/service/test_loader_doc_service.py
@@ -0,0 +1,116 @@
+import datetime
+
+import pytest
+
+from pebblo.app.enums.common import ClassificationMode
+from pebblo.app.service.loader.loader_doc_service import AppLoaderDoc
+
+classifier_response_input_doc = {
+    "doc": "Sachin's SSN is 222-85-4836. His passport ID is 5484880UA. His American express credit card number is\n371449635398431. AWS Access Key AKIAQIPT4PDORIRTV6PH. client-secret is de1d4a2d-d9fa-44f1-84bb-4f73c004afda\n",
+    "source_path": "/home/data/sens_data.csv",
+    "last_modified": datetime.datetime.now(),
+    "file_owner": "fileOwner",
+    "source_path_size": 211,
+}
+
+expected_output = {
+    "data": "Sachin's SSN is 222-85-4836. His passport ID is 5484880UA. His American express credit card number is\n371449635398431. AWS Access Key AKIAQIPT4PDORIRTV6PH. client-secret is de1d4a2d-d9fa-44f1-84bb-4f73c004afda\n",
+    "entityCount": 3,
+    "entities": {"us-ssn": 1, "credit-card-number": 1, "aws-access-key": 1},
+    "entityDetails": {
+        "us-ssn": [
+            {
+                "location": "16_27",
+                "confidence_score": "HIGH",
+                "entity_group": "pii-identification",
+            }
+        ],
+        "credit-card-number": [
+            {
+                "location": "102_117",
+                "confidence_score": "HIGH",
+                "entity_group": "pii-financial",
+            }
+        ],
+        "aws-access-key": [
+            {
+                "location": "134_154",
+                "confidence_score": "HIGH",
+                "entity_group": "secrets_and_tokens",
+            }
+        ],
+    },
+    "topicCount": 0,
+    "topics": {},
+    "topicDetails": {},
+}
+
+
+@pytest.fixture
+def app_loader_helper():
+    return AppLoaderDoc()  # default fixture scope: a new instance for every test
+
+
+def test_get_classifier_response(app_loader_helper):
+    app_loader_helper.anonymize_snippets = False
+    app_loader_helper.classifier_mode = ClassificationMode.ALL.value
+    doc_info = app_loader_helper._get_doc_classification(classifier_response_input_doc)
+    assert doc_info.model_dump() == expected_output
+
+
+def test_get_classifier_response_classifier_mode_entity(app_loader_helper):
+    app_loader_helper.anonymize_snippets = False
+    app_loader_helper.classifier_mode = ClassificationMode.ENTITY.value
+    doc_info = app_loader_helper._get_doc_classification(classifier_response_input_doc)
+    assert doc_info.model_dump() == expected_output
+
+
+def test_get_classifier_response_classifier_mode_topic(app_loader_helper):
+    app_loader_helper.classifier_mode = ClassificationMode.TOPIC.value
+    app_loader_helper.anonymize_snippets = False
+    output = app_loader_helper._get_doc_classification(classifier_response_input_doc)
+    # Per-test copy: the shared module-level expected_output must stay untouched.
+    expected = {
+        **expected_output,
+        "entityCount": 0,
+        "entities": {},
+        "entityDetails": {},
+    }
+    assert output.model_dump() == expected
+
+
+def test_get_classifier_response_anonymize_true(app_loader_helper):
+    app_loader_helper.classifier_mode = ClassificationMode.ALL.value
+    app_loader_helper.anonymize_snippets = True
+    output = app_loader_helper._get_doc_classification(classifier_response_input_doc)
+    # Per-test copy: the shared module-level expected_output must stay untouched.
+    expected = {
+        **expected_output,
+        "data": "Sachin's SSN is &lt;US_SSN&gt;. His passport ID is 5484880UA. His American express credit card number is\n&lt;CREDIT_CARD&gt;. AWS Access Key &lt;AWS_ACCESS_KEY&gt;. client-secret is de1d4a2d-d9fa-44f1-84bb-4f73c004afda\n",
+        "entities": {"aws-access-key": 1, "credit-card-number": 1, "us-ssn": 1},
+        "entityCount": 3,
+        "entityDetails": {
+            "aws-access-key": [
+                {
+                    "confidence_score": "HIGH",
+                    "entity_group": "secrets_and_tokens",
+                    "location": "141_163",
+                }
+            ],
+            "credit-card-number": [
+                {
+                    "confidence_score": "HIGH",
+                    "entity_group": "pii-financial",
+                    "location": "105_124",
+                }
+            ],
+            "us-ssn": [
+                {
+                    "confidence_score": "HIGH",
+                    "entity_group": "pii-identification",
+                    "location": "16_30",
+                }
+            ],
+        },
+    }
+    assert output.model_dump() == expected