Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bugfix/235 all steps bdc errors #239

Merged
merged 9 commits into from
Feb 6, 2024
File renamed without changes.
1 change: 0 additions & 1 deletion src/bdc/steps/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,5 @@
from .hash_generator import *
from .preprocess_phonenumbers import *
from .regionalatlas import *
from .scrape_address import *
from .search_offeneregister import *
from .step import *
5 changes: 2 additions & 3 deletions src/bdc/steps/regionalatlas.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
import geopandas as gpd
import osmnx
import pandas as pd
from geopandas.tools import sjoin
from pandas import DataFrame
from tqdm import tqdm

Expand Down Expand Up @@ -118,13 +117,13 @@ def run(self) -> DataFrame:

tqdm.pandas(desc="Computing Regional Score")

self.df[self.added_cols[:-1]] = self.df.progress_apply(
self.df[self.added_cols[-1:]] = self.df.progress_apply(
lambda lead: pd.Series(
get_lead_hash_generator().hash_check(
lead,
self.calculate_regional_score,
self.name + "_Regional-Score",
self.added_cols[:-1],
self.added_cols[-1:],
lead,
)
),
Expand Down
6 changes: 5 additions & 1 deletion src/database/leads/local_repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
import csv
import json
import os
from datetime import datetime
from pathlib import Path

import joblib
Expand Down Expand Up @@ -197,6 +196,11 @@ def fetch_gpt_result(self, file_id, operation_name):
try:
with open(json_file_path, "r", encoding="utf-8") as json_file:
data = json.load(json_file)
if operation_name not in data:
log.info(
f"Data for operation {operation_name} was not found in {json_file_path}"
)
return ""
return data[operation_name]
except:
log.warning(f"Error loading GPT results from path {json_file_path}.")
Expand Down
4 changes: 0 additions & 4 deletions src/demo/pipeline_configs/config_sprint09_release.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,6 @@
"name": "AnalyzeEmails",
"force_refresh": true
},
{
"name": "ScrapeAddress",
"force_refresh": true
},
{
"name": "PreprocessPhonenumbers",
"force_refresh": true
Expand Down
4 changes: 0 additions & 4 deletions src/demo/pipeline_configs/config_template
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,6 @@
"name": "AnalyzeEmails",
"force_refresh": true
},
{
"name": "ScrapeAddress",
"force_refresh": true
},
{
"name": "PreprocessPhonenumbers",
"force_refresh": true
Expand Down
43 changes: 43 additions & 0 deletions src/demo/pipeline_configs/force_refresh_all_steps.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
{
"description": "This config runs all steps with force_refresh set to true.",
"config": {
"steps": [
{
"name": "HashGenerator",
"force_refresh": true
},
{
"name": "AnalyzeEmails",
"force_refresh": true
},
{
"name": "PreprocessPhonenumbers",
"force_refresh": true
},
{
"name": "GooglePlaces",
"force_refresh": true
},
{
"name": "GooglePlacesDetailed",
"force_refresh": true
},
{
"name": "GPTReviewSentimentAnalyzer",
"force_refresh": true
},
{
"name": "GPTSummarizer",
"force_refresh": true
},
{
"name": "SmartReviewInsightsEnhancer",
"force_refresh": true
},
{
"name": "RegionalAtlas",
"force_refresh": true
}
]
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
SPDX-License-Identifier: MIT
SPDX-FileCopyrightText: 2023 Berkay Bozkurt <resitberkaybozkurt@gmail.com>
27 changes: 27 additions & 0 deletions src/demo/pipeline_configs/regionalatlas_only.json
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need force_refresh on all the earlier steps if we only want to run RegionalAtlas?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It depends on whether you expect that newer data can be found than what you have locally. I don't expect the addresses to change much, and the regionalatlas data is local anyway, so it can't change.

Copy link
Collaborator Author

@luccalb luccalb Feb 6, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In any case, it's ensured that the data is present; it's just not refreshed :)

Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
{
"description": "This config runs only RegionalAtlas and the steps preceding it, with force_refresh set to true.",
"config": {
"steps": [
{
"name": "HashGenerator",
"force_refresh": true
},
{
"name": "AnalyzeEmails",
"force_refresh": true
},
{
"name": "PreprocessPhonenumbers",
"force_refresh": true
},
{
"name": "GooglePlaces",
"force_refresh": true
},
{
"name": "RegionalAtlas",
"force_refresh": true
}
]
}
}
2 changes: 2 additions & 0 deletions src/demo/pipeline_configs/regionalatlas_only.json.license
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
SPDX-License-Identifier: MIT
SPDX-FileCopyrightText: 2023 Lucca Baumgärtner <lucca.baumgaertner@fau.de>
24 changes: 10 additions & 14 deletions src/demo/pipeline_configs/run_all_steps.json
Original file line number Diff line number Diff line change
@@ -1,46 +1,42 @@
{
"description": "This config runs all steps with force_refresh set to true.",
"description": "This config runs all steps with force_refresh set to false.",
"config": {
"steps": [
{
"name": "HashGenerator",
"force_refresh": true
"force_refresh": false
},
{
"name": "AnalyzeEmails",
"force_refresh": true
},
{
"name": "ScrapeAddress",
"force_refresh": true
"force_refresh": false
},
{
"name": "PreprocessPhonenumbers",
"force_refresh": true
"force_refresh": false
},
{
"name": "GooglePlaces",
"force_refresh": true
"force_refresh": false
},
{
"name": "GooglePlacesDetailed",
"force_refresh": true
"force_refresh": false
},
{
"name": "GPTReviewSentimentAnalyzer",
"force_refresh": true
"force_refresh": false
},
{
"name": "GPTSummarizer",
"force_refresh": true
"force_refresh": false
},
{
"name": "SmartReviewInsightsEnhancer",
"force_refresh": true
"force_refresh": false
},
{
"name": "RegionalAtlas",
"force_refresh": true
"force_refresh": false
}
]
}
Expand Down
3 changes: 0 additions & 3 deletions src/demo/pipeline_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
HashGenerator,
PreprocessPhonenumbers,
RegionalAtlas,
ScrapeAddress,
SearchOffeneRegister,
SmartReviewInsightsEnhancer,
)
Expand All @@ -33,14 +32,12 @@
"GPTSummarizer": GPTSummarizer,
"PreprocessPhonenumbers": PreprocessPhonenumbers,
"RegionalAtlas": RegionalAtlas,
"ScrapeAddress": ScrapeAddress,
"SearchOffeneRegister": SearchOffeneRegister,
"SmartReviewInsightsEnhancer": SmartReviewInsightsEnhancer,
}

# Please do not write following lists! Use the functions below instead.
_additional_pipeline_steps = [
(ScrapeAddress, "Scrape Address", "(will take a long time)"),
(SearchOffeneRegister, "Search OffeneRegister", "(will take a long time)"),
(PreprocessPhonenumbers, "Phone Number Validation", ""),
(
Expand Down
Loading