From c230aeb9f567e2807e68156a7f6a203f3ca756f2 Mon Sep 17 00:00:00 2001 From: Jesse Mortenson Date: Wed, 14 Aug 2024 12:47:53 -0500 Subject: [PATCH 1/2] Scrape archiving: handle exceptions so archiving doesn't fail scrape --- openstates/cli/update.py | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/openstates/cli/update.py b/openstates/cli/update.py index b1902d83..1e30bf25 100644 --- a/openstates/cli/update.py +++ b/openstates/cli/update.py @@ -217,22 +217,28 @@ def archive_to_cloud_storage( return logger.info("Beginning archive of scraped files to google cloud storage.") logger.info(f"GCP Project is {GCP_PROJECT} and bucket is {BUCKET_NAME}") - cloud_storage_client = storage.Client(project=GCP_PROJECT) - bucket = cloud_storage_client.bucket(BUCKET_NAME) - jurisdiction_id = juris.jurisdiction_id.replace("ocd-jurisdiction/", "") - destination_prefx = ( - f"{SCRAPE_LAKE_PREFIX}/{jurisdiction_id}/{last_scrape_end_datetime.isoformat()}" - ) - # read files in directory and upload - files_count = 0 - for file_path in glob.glob(datadir + "/*.json"): - files_count += 1 - blob_name = os.path.join(destination_prefx, os.path.basename(file_path)) - blob = bucket.blob(blob_name) - blob.upload_from_filename(file_path) + # Catch exceptions so that we do not fail the scrape if transient GCS error occurs + try: + cloud_storage_client = storage.Client(project=GCP_PROJECT) + bucket = cloud_storage_client.bucket(BUCKET_NAME) + jurisdiction_id = juris.jurisdiction_id.replace("ocd-jurisdiction/", "") + destination_prefx = ( + f"{SCRAPE_LAKE_PREFIX}/{jurisdiction_id}/{last_scrape_end_datetime.isoformat()}" + ) + + # read files in directory and upload + files_count = 0 + for file_path in glob.glob(datadir + "/*.json"): + files_count += 1 + blob_name = os.path.join(destination_prefx, os.path.basename(file_path)) + blob = bucket.blob(blob_name) + blob.upload_from_filename(file_path) + + logger.info(f"Completed archive to Google Cloud Storage, {files_count} files were uploaded.") - logger.info(f"Completed archive to Google Cloud Storage, {files_count} files were uploaded.") + except Exception as e: + logger.warning(f"An error occurred during the attempt to archive files to Google Cloud Storage: {e}") def do_import(juris: State, args: argparse.Namespace) -> dict[str, typing.Any]: From 4af2454140d1f0480147476d5ae359ff00971e13 Mon Sep 17 00:00:00 2001 From: Jesse Mortenson Date: Wed, 14 Aug 2024 12:50:56 -0500 Subject: [PATCH 2/2] Mark minor release for archiving exception handling --- CHANGELOG.md | 4 ++++ pyproject.toml | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2d44dbac..e704b83f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,9 @@ # Changelog +## 6.20.2 - Aug 14, 2024 + +* Prevent failure in Google Cloud Storage archiving from failing a scrape/update operation + ## 6.20.1 - Aug 2, 2024 * Fix permissions issue caused by slightly wrong usage of GCP storage client code diff --git a/pyproject.toml b/pyproject.toml index b3d59ea1..f71ceb50 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "openstates" -version = "6.20.1" +version = "6.20.2" description = "core infrastructure for the openstates project" authors = ["James Turk "] license = "MIT"