From a20c71d0dae36835776ea002d296d4ccc95227a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lucca=20Baumg=C3=A4rtner?= Date: Mon, 5 Feb 2024 16:18:03 +0100 Subject: [PATCH 1/7] fix minor error in regionalatlas step + moved scrape address step to deprecated + added new bdc config MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Lucca Baumgärtner --- .../steps/scrape_address.py | 0 src/bdc/steps/__init__.py | 1 - src/bdc/steps/regionalatlas.py | 5 +-- src/database/leads/local_repository.py | 3 +- .../config_sprint09_release.json | 4 -- src/demo/pipeline_configs/config_template | 4 -- .../force_refresh_all_steps.json | 43 +++++++++++++++++++ .../force_refresh_all_steps.json.license | 2 + .../pipeline_configs/regionalatlas_only.json | 27 ++++++++++++ .../regionalatlas_only.json.license | 2 + src/demo/pipeline_configs/run_all_steps.json | 24 +++++------ src/demo/pipeline_utils.py | 3 -- 12 files changed, 87 insertions(+), 31 deletions(-) rename {src/bdc => deprecated}/steps/scrape_address.py (100%) create mode 100644 src/demo/pipeline_configs/force_refresh_all_steps.json create mode 100644 src/demo/pipeline_configs/force_refresh_all_steps.json.license create mode 100644 src/demo/pipeline_configs/regionalatlas_only.json create mode 100644 src/demo/pipeline_configs/regionalatlas_only.json.license diff --git a/src/bdc/steps/scrape_address.py b/deprecated/steps/scrape_address.py similarity index 100% rename from src/bdc/steps/scrape_address.py rename to deprecated/steps/scrape_address.py diff --git a/src/bdc/steps/__init__.py b/src/bdc/steps/__init__.py index 9b0533b..5736e45 100644 --- a/src/bdc/steps/__init__.py +++ b/src/bdc/steps/__init__.py @@ -9,6 +9,5 @@ from .hash_generator import * from .preprocess_phonenumbers import * from .regionalatlas import * -from .scrape_address import * from .search_offeneregister import * from .step import * diff --git a/src/bdc/steps/regionalatlas.py b/src/bdc/steps/regionalatlas.py 
index a461198..b810072 100644 --- a/src/bdc/steps/regionalatlas.py +++ b/src/bdc/steps/regionalatlas.py @@ -6,7 +6,6 @@ import geopandas as gpd import osmnx import pandas as pd -from geopandas.tools import sjoin from pandas import DataFrame from tqdm import tqdm @@ -118,13 +117,13 @@ def run(self) -> DataFrame: tqdm.pandas(desc="Computing Regional Score") - self.df[self.added_cols[:-1]] = self.df.progress_apply( + self.df[self.added_cols[-1:]] = self.df.progress_apply( lambda lead: pd.Series( get_lead_hash_generator().hash_check( lead, self.calculate_regional_score, self.name + "_Regional-Score", - self.added_cols[:-1], + self.added_cols[-1:], lead, ) ), diff --git a/src/database/leads/local_repository.py b/src/database/leads/local_repository.py index ebeb90b..20cbfe2 100644 --- a/src/database/leads/local_repository.py +++ b/src/database/leads/local_repository.py @@ -4,7 +4,6 @@ import csv import json import os -from datetime import datetime from pathlib import Path import joblib @@ -20,7 +19,7 @@ class LocalRepository(Repository): BASE_PATH = os.path.dirname(__file__) DF_INPUT = os.path.abspath( - os.path.join(BASE_PATH, "../../data/sumup_leads_email.csv") + os.path.join(BASE_PATH, "../../data/demo_leads_email.csv") ) DF_OUTPUT = os.path.abspath( os.path.join(BASE_PATH, "../../data/leads_enriched.csv") diff --git a/src/demo/pipeline_configs/config_sprint09_release.json b/src/demo/pipeline_configs/config_sprint09_release.json index 080cd4b..f726661 100644 --- a/src/demo/pipeline_configs/config_sprint09_release.json +++ b/src/demo/pipeline_configs/config_sprint09_release.json @@ -6,10 +6,6 @@ "name": "AnalyzeEmails", "force_refresh": true }, - { - "name": "ScrapeAddress", - "force_refresh": true - }, { "name": "PreprocessPhonenumbers", "force_refresh": true diff --git a/src/demo/pipeline_configs/config_template b/src/demo/pipeline_configs/config_template index c48e5e8..9fc5eb1 100644 --- a/src/demo/pipeline_configs/config_template +++ 
b/src/demo/pipeline_configs/config_template @@ -7,10 +7,6 @@ "name": "AnalyzeEmails", "force_refresh": true }, - { - "name": "ScrapeAddress", - "force_refresh": true - }, { "name": "PreprocessPhonenumbers", "force_refresh": true diff --git a/src/demo/pipeline_configs/force_refresh_all_steps.json b/src/demo/pipeline_configs/force_refresh_all_steps.json new file mode 100644 index 0000000..8356533 --- /dev/null +++ b/src/demo/pipeline_configs/force_refresh_all_steps.json @@ -0,0 +1,43 @@ +{ + "description": "This config runs all steps with force_refresh set to true.", + "config": { + "steps": [ + { + "name": "HashGenerator", + "force_refresh": true + }, + { + "name": "AnalyzeEmails", + "force_refresh": true + }, + { + "name": "PreprocessPhonenumbers", + "force_refresh": true + }, + { + "name": "GooglePlaces", + "force_refresh": true + }, + { + "name": "GooglePlacesDetailed", + "force_refresh": true + }, + { + "name": "GPTReviewSentimentAnalyzer", + "force_refresh": true + }, + { + "name": "GPTSummarizer", + "force_refresh": true + }, + { + "name": "SmartReviewInsightsEnhancer", + "force_refresh": true + }, + { + "name": "RegionalAtlas", + "force_refresh": true + } + ] + } +} diff --git a/src/demo/pipeline_configs/force_refresh_all_steps.json.license b/src/demo/pipeline_configs/force_refresh_all_steps.json.license new file mode 100644 index 0000000..f079a3f --- /dev/null +++ b/src/demo/pipeline_configs/force_refresh_all_steps.json.license @@ -0,0 +1,2 @@ +SPDX-License-Identifier: MIT +SPDX-FileCopyrightText: 2023 Berkay Bozkurt diff --git a/src/demo/pipeline_configs/regionalatlas_only.json b/src/demo/pipeline_configs/regionalatlas_only.json new file mode 100644 index 0000000..16c15eb --- /dev/null +++ b/src/demo/pipeline_configs/regionalatlas_only.json @@ -0,0 +1,27 @@ +{ + "description": "This config only runs the steps needed for the RegionalAtlas step, with force_refresh set to true.", + "config": { + "steps": [ + { + "name": "HashGenerator", + "force_refresh": true + }, + { + "name": "AnalyzeEmails", + 
"force_refresh": true + }, + { + "name": "PreprocessPhonenumbers", + "force_refresh": true + }, + { + "name": "GooglePlaces", + "force_refresh": true + }, + { + "name": "RegionalAtlas", + "force_refresh": true + } + ] + } +} diff --git a/src/demo/pipeline_configs/regionalatlas_only.json.license b/src/demo/pipeline_configs/regionalatlas_only.json.license new file mode 100644 index 0000000..4ff3a64 --- /dev/null +++ b/src/demo/pipeline_configs/regionalatlas_only.json.license @@ -0,0 +1,2 @@ +SPDX-License-Identifier: MIT +SPDX-FileCopyrightText: 2023 Lucca Baumgärtner diff --git a/src/demo/pipeline_configs/run_all_steps.json b/src/demo/pipeline_configs/run_all_steps.json index 1e442f0..f694adb 100644 --- a/src/demo/pipeline_configs/run_all_steps.json +++ b/src/demo/pipeline_configs/run_all_steps.json @@ -1,46 +1,42 @@ { - "description": "This config runs all steps with force_refresh set to true.", + "description": "This config runs all steps with force_refresh set to false.", "config": { "steps": [ { "name": "HashGenerator", - "force_refresh": true + "force_refresh": false }, { "name": "AnalyzeEmails", - "force_refresh": true - }, - { - "name": "ScrapeAddress", - "force_refresh": true + "force_refresh": false }, { "name": "PreprocessPhonenumbers", - "force_refresh": true + "force_refresh": false }, { "name": "GooglePlaces", - "force_refresh": true + "force_refresh": false }, { "name": "GooglePlacesDetailed", - "force_refresh": true + "force_refresh": false }, { "name": "GPTReviewSentimentAnalyzer", - "force_refresh": true + "force_refresh": false }, { "name": "GPTSummarizer", - "force_refresh": true + "force_refresh": false }, { "name": "SmartReviewInsightsEnhancer", - "force_refresh": true + "force_refresh": false }, { "name": "RegionalAtlas", - "force_refresh": true + "force_refresh": false } ] } diff --git a/src/demo/pipeline_utils.py b/src/demo/pipeline_utils.py index 23c77b5..d95435a 100644 --- a/src/demo/pipeline_utils.py +++ b/src/demo/pipeline_utils.py @@ 
-17,7 +17,6 @@ HashGenerator, PreprocessPhonenumbers, RegionalAtlas, - ScrapeAddress, SearchOffeneRegister, SmartReviewInsightsEnhancer, ) @@ -33,14 +32,12 @@ "GPTSummarizer": GPTSummarizer, "PreprocessPhonenumbers": PreprocessPhonenumbers, "RegionalAtlas": RegionalAtlas, - "ScrapeAddress": ScrapeAddress, "SearchOffeneRegister": SearchOffeneRegister, "SmartReviewInsightsEnhancer": SmartReviewInsightsEnhancer, } # Please do not write following lists! Use the functions below instead. _additional_pipeline_steps = [ - (ScrapeAddress, "Scrape Address", "(will take a long time)"), (SearchOffeneRegister, "Search OffeneRegister", "(will take a long time)"), (PreprocessPhonenumbers, "Phone Number Validation", ""), ( From e228a82bcf49164a2a944015a2929d9f02c88808 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lucca=20Baumg=C3=A4rtner?= Date: Mon, 5 Feb 2024 16:19:19 +0100 Subject: [PATCH 2/7] undo change to input filename MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Lucca Baumgärtner --- src/database/leads/local_repository.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/database/leads/local_repository.py b/src/database/leads/local_repository.py index 20cbfe2..ae02a5c 100644 --- a/src/database/leads/local_repository.py +++ b/src/database/leads/local_repository.py @@ -19,7 +19,7 @@ class LocalRepository(Repository): BASE_PATH = os.path.dirname(__file__) DF_INPUT = os.path.abspath( - os.path.join(BASE_PATH, "../../data/demo_leads_email.csv") + os.path.join(BASE_PATH, "../../data/sumup_leads_email.csv") ) DF_OUTPUT = os.path.abspath( os.path.join(BASE_PATH, "../../data/leads_enriched.csv") From ce0140f266e8374403fac1800c8d05c8e86bf544 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lucca=20Baumg=C3=A4rtner?= Date: Mon, 5 Feb 2024 16:44:29 +0100 Subject: [PATCH 3/7] Adjust gpt caching error logging MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Lucca 
Baumgärtner --- src/database/leads/local_repository.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/database/leads/local_repository.py b/src/database/leads/local_repository.py index ae02a5c..3b50c75 100644 --- a/src/database/leads/local_repository.py +++ b/src/database/leads/local_repository.py @@ -19,7 +19,7 @@ class LocalRepository(Repository): BASE_PATH = os.path.dirname(__file__) DF_INPUT = os.path.abspath( - os.path.join(BASE_PATH, "../../data/sumup_leads_email.csv") + os.path.join(BASE_PATH, "../../data/demo_leads_email.csv") ) DF_OUTPUT = os.path.abspath( os.path.join(BASE_PATH, "../../data/leads_enriched.csv") @@ -196,6 +196,11 @@ def fetch_gpt_result(self, file_id, operation_name): try: with open(json_file_path, "r", encoding="utf-8") as json_file: data = json.load(json_file) + if operation_name not in data: + log.info( + f"Data for operation {operation_name} was not found in {json_file_path}" + ) + return "" return data[operation_name] except: log.warning(f"Error loading GPT results from path {json_file_path}.") From 67ddaaa483230f0d8fd556e4db3f25d940fb8eae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lucca=20Baumg=C3=A4rtner?= Date: Mon, 5 Feb 2024 17:06:59 +0100 Subject: [PATCH 4/7] change input file location back to original MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Lucca Baumgärtner --- src/database/leads/local_repository.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/database/leads/local_repository.py b/src/database/leads/local_repository.py index 3b50c75..b80e063 100644 --- a/src/database/leads/local_repository.py +++ b/src/database/leads/local_repository.py @@ -19,7 +19,7 @@ class LocalRepository(Repository): BASE_PATH = os.path.dirname(__file__) DF_INPUT = os.path.abspath( - os.path.join(BASE_PATH, "../../data/demo_leads_email.csv") + os.path.join(BASE_PATH, "../../data/sumup_leads_email.csv") ) DF_OUTPUT = os.path.abspath( 
os.path.join(BASE_PATH, "../../data/leads_enriched.csv") From 4977ae7aa6eef6ff2ea06f22a617f8c9ca20108c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lucca=20Baumg=C3=A4rtner?= Date: Tue, 6 Feb 2024 10:15:05 +0100 Subject: [PATCH 5/7] Add explanation for deprecation of scrape_address.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Lucca Baumgärtner --- Documentation/ideas.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Documentation/ideas.md b/Documentation/ideas.md index 12a9868..c9eb7bc 100644 --- a/Documentation/ideas.md +++ b/Documentation/ideas.md @@ -25,6 +25,13 @@ The current implementation of the module supports queueing messages from the BDC This step was supposed to be used for querying lead data from the facebook by using either the business owner's name or the company name. The attempt was deprecated as the cost for the needed API token was evaluated too high and because the usage permissions of the facebook API were changed. Furthermore, it is paramount to check the legal ramifications of querying facebook for this kind of data as there might be legal consequences of searching for individuals on facebook instead of their businesses due to data privacy regulations in the EU. +### ScrapeAddresses + +This step was an early experiment, using only the custom domain from an email address. We check if there's a live website running +for the domain, and then try to parse the main site for a business address using a RegEx pattern. The pattern is not very precise +and calling the website, as well as parsing it, takes quite some time, which accumulates for a lot of entries. The Google places +step yields better results for the business address and is faster, that's why `scrape_address.py` was deprecated. 
+ ## Possible ML improvements ### Creating data subsets From 302136c9620d7f09373ee36e925308c4d8583a7d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lucca=20Baumg=C3=A4rtner?= Date: Tue, 6 Feb 2024 11:03:08 +0100 Subject: [PATCH 6/7] Adjust error logging for review analysis MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Lucca Baumgärtner --- src/database/leads/s3_repository.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/database/leads/s3_repository.py b/src/database/leads/s3_repository.py index 2e11ed5..dbdf620 100644 --- a/src/database/leads/s3_repository.py +++ b/src/database/leads/s3_repository.py @@ -95,7 +95,7 @@ def _fetch_object_s3(self, bucket, obj_key): obj = s3.get_object(Bucket=bucket, Key=obj_key) except botocore.exceptions.ClientError as e: log.warning( - f"{e.response['Error']['Code']}: {e.response['Error']['Message']}" + f"{e.response['Error']['Code']}: {e.response['Error']['Message']} (s3://{bucket}/{obj_key})" if "Error" in e.response else f"Error while getting object s3://{bucket}/{obj_key}" ) @@ -209,8 +209,8 @@ def fetch_review(self, place_id): json_content = json.loads(file_content) return json_content except Exception as e: - log.error( - f"Error loading review from S3 with id {place_id}. Error: {str(e)}" + log.info( + f"No reviews in S3 for place at s3://{bucket}/{key}. 
Error: {str(e)}" ) return [] From 91b83ae41ab4feb5502df4db50114a6baecaa29a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lucca=20Baumg=C3=A4rtner?= Date: Tue, 6 Feb 2024 11:09:49 +0100 Subject: [PATCH 7/7] remove deprecated steps from tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Lucca Baumgärtner --- tests/test_pipeline_utils.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/test_pipeline_utils.py b/tests/test_pipeline_utils.py index 434fb6c..966415d 100644 --- a/tests/test_pipeline_utils.py +++ b/tests/test_pipeline_utils.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: MIT # SPDX-FileCopyrightText: 2024 Felix Zailskas -import os import unittest from unittest.mock import MagicMock, mock_open, patch @@ -22,7 +21,6 @@ def test_get_pipeline_steps(self): [ (HashGenerator, "Hash Generator", ""), (AnalyzeEmails, "Analyze Emails", ""), - (ScrapeAddress, "Scrape Address", "(will take a long time)"), ( SearchOffeneRegister, "Search OffeneRegister", @@ -73,7 +71,6 @@ def test_get_pipeline_additional_steps(self): additional_steps = get_pipeline_additional_steps() self.assertEqual( [ - (ScrapeAddress, "Scrape Address", "(will take a long time)"), ( SearchOffeneRegister, "Search OffeneRegister",