Bugfix/235 all steps bdc errors #239

Merged (9 commits), Feb 6, 2024
7 changes: 7 additions & 0 deletions Documentation/ideas.md
@@ -25,6 +25,13 @@ The current implementation of the module supports queueing messages from the BDC

This step was supposed to be used for querying lead data from Facebook, using either the business owner's name or the company name. The attempt was deprecated because the cost of the required API token was deemed too high and because the usage permissions of the Facebook API had changed. Furthermore, it is paramount to check the legal ramifications of querying Facebook for this kind of data: searching for individuals on Facebook instead of their businesses may have legal consequences under EU data privacy regulations.

### ScrapeAddresses

This step was an early experiment that used only the custom domain from an email address. We check whether a live website is running for the domain and then try to parse the main site for a business address using a RegEx pattern. The pattern is not very precise, and calling the website, as well as parsing it, takes quite some time, which accumulates over many entries. The Google Places step yields better results for the business address and is faster, which is why `scrape_addresses.py` was deprecated. A rough sketch of the approach follows.
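
A rough sketch of what this deprecated step did; the function name, the pattern, and the URL handling below are illustrative rather than the original implementation:

```python
import re

import requests

# Naive pattern for a German street address such as "Musterstraße 12".
# Purely illustrative; the deprecated step used its own (similarly imprecise) RegEx.
ADDRESS_PATTERN = re.compile(
    r"[A-ZÄÖÜ][a-zäöüß]+(?:stra(?:ß|ss)e|weg|platz|gasse)\s+\d+[a-z]?"
)


def scrape_address(email_domain: str, timeout: float = 5.0) -> str | None:
    """Fetch the domain's landing page and return the first address-like match."""
    try:
        # One network round-trip per lead; this latency is what made the step slow.
        resp = requests.get(f"https://{email_domain}", timeout=timeout)
        resp.raise_for_status()
    except requests.RequestException:
        return None  # no live website behind the domain
    match = ADDRESS_PATTERN.search(resp.text)
    return match.group(0) if match else None
```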

## Possible ML improvements

### Creating data subsets
File renamed without changes.
1 change: 0 additions & 1 deletion src/bdc/steps/__init__.py
@@ -9,6 +9,5 @@
 from .hash_generator import *
 from .preprocess_phonenumbers import *
 from .regionalatlas import *
-from .scrape_address import *
 from .search_offeneregister import *
 from .step import *
5 changes: 2 additions & 3 deletions src/bdc/steps/regionalatlas.py
@@ -6,7 +6,6 @@
 import geopandas as gpd
 import osmnx
 import pandas as pd
-from geopandas.tools import sjoin
 from pandas import DataFrame
 from tqdm import tqdm

@@ -144,13 +143,13 @@ def run(self) -> DataFrame:

         tqdm.pandas(desc="Computing Regional Score")

-        self.df[self.added_cols[:-1]] = self.df.progress_apply(
+        self.df[self.added_cols[-1:]] = self.df.progress_apply(
             lambda lead: pd.Series(
                 get_lead_hash_generator().hash_check(
                     lead,
                     self.calculate_regional_score,
                     self.name + "_Regional-Score",
-                    self.added_cols[:-1],
+                    self.added_cols[-1:],
                     lead,
                 )
             ),
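
The heart of this fix is the slice change: `added_cols[:-1]` selects every added column except the last, while `added_cols[-1:]` is a one-element list holding only the last column, the regional score that `calculate_regional_score` produces. A minimal illustration with made-up column names:

```python
added_cols = ["pop_density", "employment_rate", "regional_score"]  # hypothetical names

print(added_cols[:-1])  # ['pop_density', 'employment_rate'] -- everything but the score
print(added_cols[-1:])  # ['regional_score'] -- only the score, still wrapped in a list
```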
6 changes: 5 additions & 1 deletion src/database/leads/local_repository.py
@@ -4,7 +4,6 @@
 import csv
 import json
 import os
-from datetime import datetime
 from pathlib import Path

 import joblib
@@ -197,6 +196,11 @@ def fetch_gpt_result(self, file_id, operation_name):
         try:
             with open(json_file_path, "r", encoding="utf-8") as json_file:
                 data = json.load(json_file)
+                if operation_name not in data:
+                    log.info(
+                        f"Data for operation {operation_name} was not found in {json_file_path}"
+                    )
+                    return ""
                 return data[operation_name]
         except:
             log.warning(f"Error loading GPT results from path {json_file_path}.")
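
Before this guard, a cached results file that lacked the requested operation fell straight through to `data[operation_name]`, raised a `KeyError`, and was reported by the bare `except:` as a generic loading error. The guard turns an expected cache miss into an info-level log and a well-defined empty result. A self-contained sketch of the pattern, with simplified names rather than the repository class itself:

```python
import json
import logging

log = logging.getLogger(__name__)


def fetch_cached_result(json_file_path: str, operation_name: str) -> str:
    """Return the cached value for operation_name, or "" on a cache miss."""
    try:
        with open(json_file_path, "r", encoding="utf-8") as json_file:
            data = json.load(json_file)
        if operation_name not in data:
            # An absent key is an expected miss, not an error; the caller recomputes.
            log.info(f"Data for operation {operation_name} was not found in {json_file_path}")
            return ""
        return data[operation_name]
    except (OSError, json.JSONDecodeError):
        # Catching specific exceptions (unlike the bare except above) keeps real bugs visible.
        log.warning(f"Error loading GPT results from path {json_file_path}.")
        return ""
```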
6 changes: 3 additions & 3 deletions src/database/leads/s3_repository.py
@@ -95,7 +95,7 @@ def _fetch_object_s3(self, bucket, obj_key):
             obj = s3.get_object(Bucket=bucket, Key=obj_key)
         except botocore.exceptions.ClientError as e:
             log.warning(
-                f"{e.response['Error']['Code']}: {e.response['Error']['Message']}"
+                f"{e.response['Error']['Code']}: {e.response['Error']['Message']} (s3://{bucket}/{obj_key})"
                 if "Error" in e.response
                 else f"Error while getting object s3://{bucket}/{obj_key}"
             )
@@ -209,8 +209,8 @@ def fetch_review(self, place_id):
             json_content = json.loads(file_content)
             return json_content
         except Exception as e:
-            log.error(
-                f"Error loading review from S3 with id {place_id}. Error: {str(e)}"
+            log.info(
+                f"No reviews in S3 for place at s3://{bucket}/{key}. Error: {str(e)}"
             )
             return []

4 changes: 0 additions & 4 deletions src/demo/pipeline_configs/config_sprint09_release.json
@@ -6,10 +6,6 @@
         "name": "AnalyzeEmails",
         "force_refresh": true
       },
-      {
-        "name": "ScrapeAddress",
-        "force_refresh": true
-      },
       {
         "name": "PreprocessPhonenumbers",
         "force_refresh": true
4 changes: 0 additions & 4 deletions src/demo/pipeline_configs/config_template
@@ -7,10 +7,6 @@
         "name": "AnalyzeEmails",
         "force_refresh": true
       },
-      {
-        "name": "ScrapeAddress",
-        "force_refresh": true
-      },
       {
         "name": "PreprocessPhonenumbers",
         "force_refresh": true
43 changes: 43 additions & 0 deletions src/demo/pipeline_configs/force_refresh_all_steps.json
@@ -0,0 +1,43 @@
{
  "description": "This config runs all steps with force_refresh set to true.",
  "config": {
    "steps": [
      {
        "name": "HashGenerator",
        "force_refresh": true
      },
      {
        "name": "AnalyzeEmails",
        "force_refresh": true
      },
      {
        "name": "PreprocessPhonenumbers",
        "force_refresh": true
      },
      {
        "name": "GooglePlaces",
        "force_refresh": true
      },
      {
        "name": "GooglePlacesDetailed",
        "force_refresh": true
      },
      {
        "name": "GPTReviewSentimentAnalyzer",
        "force_refresh": true
      },
      {
        "name": "GPTSummarizer",
        "force_refresh": true
      },
      {
        "name": "SmartReviewInsightsEnhancer",
        "force_refresh": true
      },
      {
        "name": "RegionalAtlas",
        "force_refresh": true
      }
    ]
  }
}
2 changes: 2 additions & 0 deletions src/demo/pipeline_configs/force_refresh_all_steps.json.license
@@ -0,0 +1,2 @@
SPDX-License-Identifier: MIT
SPDX-FileCopyrightText: 2023 Berkay Bozkurt <resitberkaybozkurt@gmail.com>
27 changes: 27 additions & 0 deletions src/demo/pipeline_configs/regionalatlas_only.json
**Review comment (Contributor):** Do we need force_refresh on all the earlier steps if we only want to run RegionalAtlas?

**Reply (Collaborator, author):** Depends on whether you expect that newer data can be found than what you have locally. I don't expect the addresses to change much, and the RegionalAtlas data is local anyway, so it can't change.

**Reply (Collaborator, author, @luccalb, Feb 6, 2024):** In any case, it's ensured that the data is present, it's just not refreshed :)
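
To make the thread concrete: `force_refresh` decides whether a step recomputes a lead's data or reuses the value already stored under the lead's hash. A simplified sketch of the idea; the names are illustrative, and the project's actual logic lives in the `hash_check` mechanism seen in `regionalatlas.py` above:

```python
def run_step(step, lead: dict, cache: dict, force_refresh: bool):
    """Reuse cached step output unless a refresh is forced."""
    key = (step.name, hash(frozenset(lead.items())))  # illustrative cache key
    if not force_refresh and key in cache:
        return cache[key]        # the data is present, it's just not refreshed
    result = step.compute(lead)  # fetch or compute fresh data
    cache[key] = result
    return result
```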

@@ -0,0 +1,27 @@
{
  "description": "This config runs only the steps needed for RegionalAtlas, with force_refresh set to true.",
  "config": {
    "steps": [
      {
        "name": "HashGenerator",
        "force_refresh": true
      },
      {
        "name": "AnalyzeEmails",
        "force_refresh": true
      },
      {
        "name": "PreprocessPhonenumbers",
        "force_refresh": true
      },
      {
        "name": "GooglePlaces",
        "force_refresh": true
      },
      {
        "name": "RegionalAtlas",
        "force_refresh": true
      }
    ]
  }
}
2 changes: 2 additions & 0 deletions src/demo/pipeline_configs/regionalatlas_only.json.license
@@ -0,0 +1,2 @@
SPDX-License-Identifier: MIT
SPDX-FileCopyrightText: 2023 Lucca Baumgärtner <lucca.baumgaertner@fau.de>
24 changes: 10 additions & 14 deletions src/demo/pipeline_configs/run_all_steps.json
@@ -1,46 +1,42 @@
 {
-  "description": "This config runs all steps with force_refresh set to true.",
+  "description": "This config runs all steps with force_refresh set to false.",
   "config": {
     "steps": [
       {
         "name": "HashGenerator",
-        "force_refresh": true
+        "force_refresh": false
       },
       {
         "name": "AnalyzeEmails",
-        "force_refresh": true
-      },
-      {
-        "name": "ScrapeAddress",
-        "force_refresh": true
+        "force_refresh": false
       },
       {
         "name": "PreprocessPhonenumbers",
-        "force_refresh": true
+        "force_refresh": false
       },
       {
         "name": "GooglePlaces",
-        "force_refresh": true
+        "force_refresh": false
       },
       {
         "name": "GooglePlacesDetailed",
-        "force_refresh": true
+        "force_refresh": false
       },
       {
         "name": "GPTReviewSentimentAnalyzer",
-        "force_refresh": true
+        "force_refresh": false
       },
       {
         "name": "GPTSummarizer",
-        "force_refresh": true
+        "force_refresh": false
       },
       {
         "name": "SmartReviewInsightsEnhancer",
-        "force_refresh": true
+        "force_refresh": false
       },
       {
         "name": "RegionalAtlas",
-        "force_refresh": true
+        "force_refresh": false
       }
     ]
   }
3 changes: 0 additions & 3 deletions src/demo/pipeline_utils.py
@@ -17,7 +17,6 @@
     HashGenerator,
     PreprocessPhonenumbers,
     RegionalAtlas,
-    ScrapeAddress,
     SearchOffeneRegister,
     SmartReviewInsightsEnhancer,
 )
@@ -33,14 +32,12 @@
     "GPTSummarizer": GPTSummarizer,
     "PreprocessPhonenumbers": PreprocessPhonenumbers,
     "RegionalAtlas": RegionalAtlas,
-    "ScrapeAddress": ScrapeAddress,
     "SearchOffeneRegister": SearchOffeneRegister,
     "SmartReviewInsightsEnhancer": SmartReviewInsightsEnhancer,
 }

 # Please do not write following lists! Use the functions below instead.
 _additional_pipeline_steps = [
-    (ScrapeAddress, "Scrape Address", "(will take a long time)"),
     (SearchOffeneRegister, "Search OffeneRegister", "(will take a long time)"),
     (PreprocessPhonenumbers, "Phone Number Validation", ""),
     (
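
For context on why the `ScrapeAddress` entries had to be removed everywhere at once: pipeline configs are resolved to step classes through this dictionary, so a config that still names a removed step would fail at lookup time. A hedged sketch of how such a config might be instantiated; the loop is illustrative, and the assumption that step classes accept a `force_refresh` keyword is not confirmed by this diff:

```python
def build_pipeline(config: dict) -> list:
    """Turn a parsed pipeline config into a list of step instances."""
    steps = []
    for entry in config["config"]["steps"]:
        step_cls = STEP_STR_TO_CLASS[entry["name"]]  # KeyError for removed/unknown steps
        steps.append(step_cls(force_refresh=entry["force_refresh"]))
    return steps
```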
3 changes: 0 additions & 3 deletions tests/test_pipeline_utils.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: MIT
 # SPDX-FileCopyrightText: 2024 Felix Zailskas <felixzailskas@gmail.com>

-import os
 import unittest
 from unittest.mock import MagicMock, mock_open, patch

@@ -22,7 +21,6 @@ def test_get_pipeline_steps(self):
             [
                 (HashGenerator, "Hash Generator", ""),
                 (AnalyzeEmails, "Analyze Emails", ""),
-                (ScrapeAddress, "Scrape Address", "(will take a long time)"),
                 (
                     SearchOffeneRegister,
                     "Search OffeneRegister",
@@ -73,7 +71,6 @@ def test_get_pipeline_additional_steps(self):
         additional_steps = get_pipeline_additional_steps()
         self.assertEqual(
             [
-                (ScrapeAddress, "Scrape Address", "(will take a long time)"),
                 (
                     SearchOffeneRegister,
                     "Search OffeneRegister",
Expand Down
Loading