Skip to content

Commit

Permalink
Add option to provide mapping
Browse files Browse the repository at this point in the history
  • Loading branch information
BinamB committed Dec 6, 2024
1 parent 25d5709 commit 7ca2fae
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 4 deletions.
32 changes: 32 additions & 0 deletions scripts/utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import os
import boto3
import csv
import json
import time
import random
from google.cloud import storage
Expand Down Expand Up @@ -483,3 +484,34 @@ def get_record_with_retry(guid, max_retries=5, base_delay=1, backoff_factor=2):
f"Found {len(errored_list)} guids that weren't found in indexd. Here are all the guids: {errored_list}"
)
return result


def download_and_parse_map_file(file_location):
    """Download a JSON map file from S3 and return its parsed contents.

    Args:
        file_location (str): S3 URL of the map file, for example
            ``s3://bucket/path/to/map_file.json``. Leading and trailing
            whitespace is ignored.

    Returns:
        dict: The parsed JSON content of the map file. The top-level JSON
            value is expected to be an object; this is not enforced here.

    Raises:
        ValueError: If ``file_location`` is empty or is not an ``s3://`` URL
            containing both a bucket name and an object key.
    """
    # Imported locally so this function works regardless of which names the
    # module-level import block provides.
    import tempfile
    from urllib.parse import urlparse

    if not file_location or not file_location.strip():
        raise ValueError("Map file location must be a non-empty S3 URL")

    # Strip any extra spaces (config values are often comma/space separated)
    file_location = file_location.strip()

    # Parse the S3 URL into bucket and key, and fail early with a clear
    # message instead of letting boto3 reject an empty bucket/key later.
    parsed_url = urlparse(file_location)
    bucket_name = parsed_url.netloc
    object_key = parsed_url.path.lstrip("/")
    if parsed_url.scheme != "s3" or not bucket_name or not object_key:
        raise ValueError(
            f"Invalid S3 location for map file: {file_location!r}. "
            "Expected format: s3://<bucket>/<key>"
        )

    # Initialize S3 resource only after the input has been validated
    s3 = boto3.resource("s3")

    # Download into a private temporary directory so that concurrent runs do
    # not overwrite each other's file and nothing is left behind in the
    # current working directory.
    with tempfile.TemporaryDirectory() as tmp_dir:
        local_file_path = os.path.join(tmp_dir, "map_file.json")
        s3.meta.client.download_file(bucket_name, object_key, local_file_path)

        # Load the JSON file as a dictionary
        with open(local_file_path, "r", encoding="utf-8") as file:
            data = json.load(file)

    return data
15 changes: 11 additions & 4 deletions scripts/validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def resume_logger(filename=None):
logger = get_logger("Validation", filename)


def run(global_config):
def run(global_config, map_file={}):
"""
Given manifests run validation process to check if all the objects exist and are indexed correctly
Args:
Expand All @@ -28,6 +28,7 @@ def run(global_config):
'manifest_files': 's3://input/active_manifest.tsv, s3://input/legacy_manifest.tsv'
'out_manifests': 'active_manifest_aug.tsv, legacy_manifest_aug.tsv'
'FORCE_CREATE_MANIFEST': 'True' 'False'
'map_file': 's3://location/to/map_file.json'
}
Returns:
Expand Down Expand Up @@ -76,9 +77,15 @@ def run(global_config):
logger.info("scan all copied objects")

indexd_records = {}
for manifest_file in manifest_files:
records = utils.get_indexd_record_from_GDC_files(manifest_file, logger)
indexd_records.update(records)

if global_config.get("map_file"):
indexd_records = utils.download_and_parse_map_file(
global_config.get("map_file")
)
else:
for manifest_file in manifest_files:
records = utils.get_indexd_record_from_GDC_files(manifest_file, logger)
indexd_records.update(records)
aws_copied_objects, _ = build_object_dataset_aws(PROJECT_ACL, logger)
gs_copied_objects = utils.build_object_dataset_gs(PROJECT_ACL)

Expand Down

0 comments on commit 7ca2fae

Please sign in to comment.