Bugfix/235 all steps bdc errors #239

Merged (9 commits), Feb 6, 2024
7 changes: 7 additions & 0 deletions Documentation/ideas.md
@@ -25,6 +25,13 @@ The current implementation of the module supports queueing messages from the BDC

This step was supposed to be used for querying lead data from Facebook, using either the business owner's name or the company name. The attempt was deprecated because the cost of the required API token was deemed too high and because the usage permissions of the Facebook API had changed. Furthermore, it is paramount to check the legal ramifications of querying Facebook for this kind of data: searching for individuals on Facebook instead of their businesses may have legal consequences under EU data privacy regulations.

### ScrapeAddresses

This step was an early experiment that used only the custom domain from an email address. We check whether a live website is running for the domain and then try to parse the main site for a business address using a RegEx pattern. The pattern is not very precise, and calling the website, as well as parsing it, takes quite some time, which accumulates over many entries. The Google Places step yields better results for the business address and is faster, which is why `scrape_addresses.py` was deprecated. A rough sketch of the approach follows.
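
A rough sketch of what this deprecated step did; the function name, the pattern, and the URL handling below are illustrative rather than the original implementation:

```python
import re

import requests

# Naive pattern for a German street address such as "Musterstraße 12".
# Purely illustrative; the deprecated step used its own (similarly imprecise) RegEx.
ADDRESS_PATTERN = re.compile(
    r"[A-ZÄÖÜ][a-zäöüß]+(?:stra(?:ß|ss)e|weg|platz|gasse)\s+\d+[a-z]?"
)


def scrape_address(email_domain: str, timeout: float = 5.0) -> str | None:
    """Fetch the domain's landing page and return the first address-like match."""
    try:
        # One network round-trip per lead; this latency is what made the step slow.
        resp = requests.get(f"https://{email_domain}", timeout=timeout)
        resp.raise_for_status()
    except requests.RequestException:
        return None  # no live website behind the domain
    match = ADDRESS_PATTERN.search(resp.text)
    return match.group(0) if match else None
```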

## Possible ML improvements

### Creating data subsets
File renamed without changes.
1 change: 0 additions & 1 deletion src/bdc/steps/__init__.py
@@ -9,6 +9,5 @@
 from .hash_generator import *
 from .preprocess_phonenumbers import *
 from .regionalatlas import *
-from .scrape_address import *
 from .search_offeneregister import *
 from .step import *
5 changes: 2 additions & 3 deletions src/bdc/steps/regionalatlas.py
@@ -6,7 +6,6 @@
 import geopandas as gpd
 import osmnx
 import pandas as pd
-from geopandas.tools import sjoin
 from pandas import DataFrame
 from tqdm import tqdm

@@ -144,13 +143,13 @@ def run(self) -> DataFrame:

         tqdm.pandas(desc="Computing Regional Score")

-        self.df[self.added_cols[:-1]] = self.df.progress_apply(
+        self.df[self.added_cols[-1:]] = self.df.progress_apply(
             lambda lead: pd.Series(
                 get_lead_hash_generator().hash_check(
                     lead,
                     self.calculate_regional_score,
                     self.name + "_Regional-Score",
-                    self.added_cols[:-1],
+                    self.added_cols[-1:],
                     lead,
                 )
             ),
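
The heart of this fix is the slice change: `added_cols[:-1]` selects every added column except the last, while `added_cols[-1:]` is a one-element list holding only the last column, the regional score that `calculate_regional_score` produces. A minimal illustration with made-up column names:

```python
added_cols = ["pop_density", "employment_rate", "regional_score"]  # hypothetical names

print(added_cols[:-1])  # ['pop_density', 'employment_rate'] -- everything but the score
print(added_cols[-1:])  # ['regional_score'] -- only the score, still wrapped in a list
```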
6 changes: 5 additions & 1 deletion src/database/leads/local_repository.py
@@ -4,7 +4,6 @@
 import csv
 import json
 import os
-from datetime import datetime
 from pathlib import Path

 import joblib
@@ -197,6 +196,11 @@ def fetch_gpt_result(self, file_id, operation_name):
         try:
             with open(json_file_path, "r", encoding="utf-8") as json_file:
                 data = json.load(json_file)
+                if operation_name not in data:
+                    log.info(
+                        f"Data for operation {operation_name} was not found in {json_file_path}"
+                    )
+                    return ""
                 return data[operation_name]
         except:
             log.warning(f"Error loading GPT results from path {json_file_path}.")
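
Before this guard, a cached results file that lacked the requested operation fell straight through to `data[operation_name]`, raised a `KeyError`, and was reported by the bare `except:` as a generic loading error. The guard turns an expected cache miss into an info-level log and a well-defined empty result. A self-contained sketch of the pattern, with simplified names rather than the repository class itself:

```python
import json
import logging

log = logging.getLogger(__name__)


def fetch_cached_result(json_file_path: str, operation_name: str) -> str:
    """Return the cached value for operation_name, or "" on a cache miss."""
    try:
        with open(json_file_path, "r", encoding="utf-8") as json_file:
            data = json.load(json_file)
        if operation_name not in data:
            # An absent key is an expected miss, not an error; the caller recomputes.
            log.info(f"Data for operation {operation_name} was not found in {json_file_path}")
            return ""
        return data[operation_name]
    except (OSError, json.JSONDecodeError):
        # Catching specific exceptions (unlike the bare except above) keeps real bugs visible.
        log.warning(f"Error loading GPT results from path {json_file_path}.")
        return ""
```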
6 changes: 3 additions & 3 deletions src/database/leads/s3_repository.py
@@ -95,7 +95,7 @@ def _fetch_object_s3(self, bucket, obj_key):
             obj = s3.get_object(Bucket=bucket, Key=obj_key)
         except botocore.exceptions.ClientError as e:
             log.warning(
-                f"{e.response['Error']['Code']}: {e.response['Error']['Message']}"
+                f"{e.response['Error']['Code']}: {e.response['Error']['Message']} (s3://{bucket}/{obj_key})"
                 if "Error" in e.response
                 else f"Error while getting object s3://{bucket}/{obj_key}"
             )
@@ -209,8 +209,8 @@ def fetch_review(self, place_id):
             json_content = json.loads(file_content)
             return json_content
         except Exception as e:
-            log.error(
-                f"Error loading review from S3 with id {place_id}. Error: {str(e)}"
+            log.info(
+                f"No reviews in S3 for place at s3://{bucket}/{key}. Error: {str(e)}"
             )
             return []

4 changes: 0 additions & 4 deletions src/demo/pipeline_configs/config_sprint09_release.json
@@ -6,10 +6,6 @@
         "name": "AnalyzeEmails",
         "force_refresh": true
       },
-      {
-        "name": "ScrapeAddress",
-        "force_refresh": true
-      },
       {
         "name": "PreprocessPhonenumbers",
         "force_refresh": true
4 changes: 0 additions & 4 deletions src/demo/pipeline_configs/config_template
@@ -7,10 +7,6 @@
         "name": "AnalyzeEmails",
         "force_refresh": true
       },
-      {
-        "name": "ScrapeAddress",
-        "force_refresh": true
-      },
       {
         "name": "PreprocessPhonenumbers",
         "force_refresh": true
43 changes: 43 additions & 0 deletions src/demo/pipeline_configs/force_refresh_all_steps.json
@@ -0,0 +1,43 @@
{
  "description": "This config runs all steps with force_refresh set to true.",
  "config": {
    "steps": [
      {
        "name": "HashGenerator",
        "force_refresh": true
      },
      {
        "name": "AnalyzeEmails",
        "force_refresh": true
      },
      {
        "name": "PreprocessPhonenumbers",
        "force_refresh": true
      },
      {
        "name": "GooglePlaces",
        "force_refresh": true
      },
      {
        "name": "GooglePlacesDetailed",
        "force_refresh": true
      },
      {
        "name": "GPTReviewSentimentAnalyzer",
        "force_refresh": true
      },
      {
        "name": "GPTSummarizer",
        "force_refresh": true
      },
      {
        "name": "SmartReviewInsightsEnhancer",
        "force_refresh": true
      },
      {
        "name": "RegionalAtlas",
        "force_refresh": true
      }
    ]
  }
}
2 changes: 2 additions & 0 deletions src/demo/pipeline_configs/force_refresh_all_steps.json.license
@@ -0,0 +1,2 @@
SPDX-License-Identifier: MIT
SPDX-FileCopyrightText: 2023 Berkay Bozkurt <resitberkaybozkurt@gmail.com>
27 changes: 27 additions & 0 deletions src/demo/pipeline_configs/regionalatlas_only.json
**Review comment (Contributor):** Do we need force_refresh on all the earlier steps if we only want to run RegionalAtlas?

**Reply (Collaborator, author):** Depends on whether you expect that newer data can be found than what you have locally. I don't expect the addresses to change much, and the RegionalAtlas data is local anyway, so it can't change.

**Reply (Collaborator, author, @luccalb, Feb 6, 2024):** In any case, it's ensured that the data is present, it's just not refreshed :)
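
To make the thread concrete: `force_refresh` decides whether a step recomputes a lead's data or reuses the value already stored under the lead's hash. A simplified sketch of the idea; the names are illustrative, and the project's actual logic lives in the `hash_check` mechanism seen in `regionalatlas.py` above:

```python
def run_step(step, lead: dict, cache: dict, force_refresh: bool):
    """Reuse cached step output unless a refresh is forced."""
    key = (step.name, hash(frozenset(lead.items())))  # illustrative cache key
    if not force_refresh and key in cache:
        return cache[key]        # the data is present, it's just not refreshed
    result = step.compute(lead)  # fetch or compute fresh data
    cache[key] = result
    return result
```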

@@ -0,0 +1,27 @@
{
  "description": "This config runs only the steps needed for RegionalAtlas, with force_refresh set to true.",
  "config": {
    "steps": [
      {
        "name": "HashGenerator",
        "force_refresh": true
      },
      {
        "name": "AnalyzeEmails",
        "force_refresh": true
      },
      {
        "name": "PreprocessPhonenumbers",
        "force_refresh": true
      },
      {
        "name": "GooglePlaces",
        "force_refresh": true
      },
      {
        "name": "RegionalAtlas",
        "force_refresh": true
      }
    ]
  }
}
2 changes: 2 additions & 0 deletions src/demo/pipeline_configs/regionalatlas_only.json.license
@@ -0,0 +1,2 @@
SPDX-License-Identifier: MIT
SPDX-FileCopyrightText: 2023 Lucca Baumgärtner <lucca.baumgaertner@fau.de>
24 changes: 10 additions & 14 deletions src/demo/pipeline_configs/run_all_steps.json
@@ -1,46 +1,42 @@
 {
-  "description": "This config runs all steps with force_refresh set to true.",
+  "description": "This config runs all steps with force_refresh set to false.",
   "config": {
     "steps": [
       {
         "name": "HashGenerator",
-        "force_refresh": true
+        "force_refresh": false
       },
       {
         "name": "AnalyzeEmails",
-        "force_refresh": true
-      },
-      {
-        "name": "ScrapeAddress",
-        "force_refresh": true
+        "force_refresh": false
       },
       {
         "name": "PreprocessPhonenumbers",
-        "force_refresh": true
+        "force_refresh": false
       },
       {
         "name": "GooglePlaces",
-        "force_refresh": true
+        "force_refresh": false
       },
       {
         "name": "GooglePlacesDetailed",
-        "force_refresh": true
+        "force_refresh": false
       },
       {
         "name": "GPTReviewSentimentAnalyzer",
-        "force_refresh": true
+        "force_refresh": false
       },
       {
         "name": "GPTSummarizer",
-        "force_refresh": true
+        "force_refresh": false
       },
       {
         "name": "SmartReviewInsightsEnhancer",
-        "force_refresh": true
+        "force_refresh": false
       },
       {
         "name": "RegionalAtlas",
-        "force_refresh": true
+        "force_refresh": false
       }
     ]
   }
3 changes: 0 additions & 3 deletions src/demo/pipeline_utils.py
@@ -17,7 +17,6 @@
     HashGenerator,
     PreprocessPhonenumbers,
     RegionalAtlas,
-    ScrapeAddress,
     SearchOffeneRegister,
     SmartReviewInsightsEnhancer,
 )
@@ -33,14 +32,12 @@
     "GPTSummarizer": GPTSummarizer,
     "PreprocessPhonenumbers": PreprocessPhonenumbers,
     "RegionalAtlas": RegionalAtlas,
-    "ScrapeAddress": ScrapeAddress,
     "SearchOffeneRegister": SearchOffeneRegister,
     "SmartReviewInsightsEnhancer": SmartReviewInsightsEnhancer,
 }

 # Please do not write following lists! Use the functions below instead.
 _additional_pipeline_steps = [
-    (ScrapeAddress, "Scrape Address", "(will take a long time)"),
     (SearchOffeneRegister, "Search OffeneRegister", "(will take a long time)"),
     (PreprocessPhonenumbers, "Phone Number Validation", ""),
     (
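
For context on why the `ScrapeAddress` entries had to be removed everywhere at once: pipeline configs are resolved to step classes through this dictionary, so a config that still names a removed step would fail at lookup time. A hedged sketch of how such a config might be instantiated; the loop is illustrative, and the assumption that step classes accept a `force_refresh` keyword is not confirmed by this diff:

```python
def build_pipeline(config: dict) -> list:
    """Turn a parsed pipeline config into a list of step instances."""
    steps = []
    for entry in config["config"]["steps"]:
        step_cls = STEP_STR_TO_CLASS[entry["name"]]  # KeyError for removed/unknown steps
        steps.append(step_cls(force_refresh=entry["force_refresh"]))
    return steps
```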
3 changes: 0 additions & 3 deletions tests/test_pipeline_utils.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: MIT
 # SPDX-FileCopyrightText: 2024 Felix Zailskas <felixzailskas@gmail.com>

-import os
 import unittest
 from unittest.mock import MagicMock, mock_open, patch

@@ -22,7 +21,6 @@ def test_get_pipeline_steps(self):
             [
                 (HashGenerator, "Hash Generator", ""),
                 (AnalyzeEmails, "Analyze Emails", ""),
-                (ScrapeAddress, "Scrape Address", "(will take a long time)"),
                 (
                     SearchOffeneRegister,
                     "Search OffeneRegister",
@@ -73,7 +71,6 @@ def test_get_pipeline_additional_steps(self):
         additional_steps = get_pipeline_additional_steps()
         self.assertEqual(
             [
-                (ScrapeAddress, "Scrape Address", "(will take a long time)"),
                 (
                     SearchOffeneRegister,
                     "Search OffeneRegister",
Expand Down
Loading