fix: Removed unnecessary try-except statement
da-the-dev committed Jul 25, 2024
1 parent dc0e462 commit 4225b16
Showing 1 changed file with 51 additions and 57 deletions.
108 changes: 51 additions & 57 deletions src/data.py
@@ -24,63 +24,57 @@ def sample_data(cfg: DictConfig):
     The function to sample the data from the given URL and save it to the sample path.
     Returns both the sampled data and the updated configuration settings without updating the real config files.
     """
-    try:
-        datastore_path = cfg.data.datastore_path
-
-        # Create datastore directory if not exists
-        Path(datastore_path).parent.mkdir(exist_ok=True, parents=True)
-
-        # Check if the source data is available, if not download it.
-        if not os.path.exists(datastore_path):
-            print("Downloading data from: ", cfg.data.url)
-            gdown.download(cfg.data.url, datastore_path, quiet=False, use_cookies=False)
-
-        # Determine the total number of rows in the file without loading it entirely
-        total_rows = sum(1 for row in open(datastore_path, "r")) - 1  # Exclude header
-
-        # Calculate the sample size
-        sample_size = math.ceil(total_rows * cfg.data.sample_size)
-
-        # Determine the start row for sampling
-        start_row = (
-            0
-            if cfg.data.last_included_row_number < 0
-            else (cfg.data.last_included_row_number + 1) % total_rows
-        )
-
-        # If the start_row + sample_size exceeds total_rows, adjust the sample size
-        if start_row + sample_size > total_rows:
-            sample_size = total_rows - start_row
-
-        # Load only the necessary rows into memory
-        skiprows = range(
-            1, start_row + 1
-        )  # Skip rows before the start_row, keeping header
-        nrows = sample_size  # Number of rows to read
-        data = pd.read_csv(datastore_path, skiprows=skiprows, nrows=nrows)
-
-        print("Sampling data...")
-        resulted_sample = data
-
-        # Create a deep copy of cfg to modify without affecting the original
-        updated_cfg = copy.deepcopy(cfg)
-
-        # Update the configuration for last included row number in the copy
-        new_last_included_row_number = start_row + sample_size - 1
-        updated_cfg.data.last_included_row_number = (
-            new_last_included_row_number % total_rows
-        )
-
-        # Increment and update the data version in the copy
-        new_version = f"v{updated_cfg.data.version_number + 1}.0"
-        updated_cfg.data.data_version = new_version
-        updated_cfg.data.version_number = updated_cfg.data.version_number + 1
-
-        # Return both the sampled data and the updated configuration
-        return resulted_sample, updated_cfg
-    except Exception as e:
-        print("Error in loading or sampling the data: ", e)
-        return None, cfg
+    datastore_path = cfg.data.datastore_path
+
+    # Create datastore directory if not exists
+    Path(datastore_path).parent.mkdir(exist_ok=True, parents=True)
+
+    # Check if the source data is available, if not download it.
+    if not os.path.exists(datastore_path):
+        print("Downloading data from: ", cfg.data.url)
+        gdown.download(cfg.data.url, datastore_path, quiet=False, use_cookies=False)
+
+    # Determine the total number of rows in the file without loading it entirely
+    total_rows = sum(1 for row in open(datastore_path, "r")) - 1  # Exclude header
+
+    # Calculate the sample size
+    sample_size = math.ceil(total_rows * cfg.data.sample_size)
+
+    # Determine the start row for sampling
+    start_row = (
+        0
+        if cfg.data.last_included_row_number < 0
+        else (cfg.data.last_included_row_number + 1) % total_rows
+    )
+
+    # If the start_row + sample_size exceeds total_rows, adjust the sample size
+    if start_row + sample_size > total_rows:
+        sample_size = total_rows - start_row
+
+    # Load only the necessary rows into memory
+    skiprows = range(1, start_row + 1)  # Skip rows before the start_row, keeping header
+    nrows = sample_size  # Number of rows to read
+    data = pd.read_csv(datastore_path, skiprows=skiprows, nrows=nrows)
+
+    print("Sampling data...")
+    resulted_sample = data
+
+    # Create a deep copy of cfg to modify without affecting the original
+    updated_cfg = copy.deepcopy(cfg)
+
+    # Update the configuration for last included row number in the copy
+    new_last_included_row_number = start_row + sample_size - 1
+    updated_cfg.data.last_included_row_number = (
+        new_last_included_row_number % total_rows
+    )
+
+    # Increment and update the data version in the copy
+    new_version = f"v{updated_cfg.data.version_number + 1}.0"
+    updated_cfg.data.data_version = new_version
+    updated_cfg.data.version_number = updated_cfg.data.version_number + 1
+
+    # Return both the sampled data and the updated configuration
+    return resulted_sample, updated_cfg
 
 
 def validate_initial_data(cfg: DictConfig, df: pd.DataFrame):
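
What this commit changes in practice: sample_data no longer swallows failures and returns (None, cfg); a failed download or read now propagates to the caller. A minimal caller-side sketch of handling that, assuming an OmegaConf config file (the filename config.yaml is hypothetical; only the cfg.data.* fields used above are required):

# Hypothetical caller, not part of this commit. With the try-except
# removed, error handling (if desired) moves to the call site.
from omegaconf import OmegaConf

from src.data import sample_data

cfg = OmegaConf.load("config.yaml")  # assumed config; must define cfg.data.*

try:
    sample, updated_cfg = sample_data(cfg)
except Exception as e:
    # Roughly what the deleted except-branch used to do, but now at the
    # call site and re-raising instead of hiding the error.
    print("Error in loading or sampling the data: ", e)
    raise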

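For intuition on the wraparound arithmetic in the new function body, here is a short worked example with invented numbers:

# Worked example of the sampling-window math above (numbers invented).
import math

total_rows = 100     # rows in the CSV, excluding the header
fraction = 0.3       # stands in for cfg.data.sample_size
last_included = 89   # stands in for cfg.data.last_included_row_number

sample_size = math.ceil(total_rows * fraction)  # 30
start_row = 0 if last_included < 0 else (last_included + 1) % total_rows  # 90
if start_row + sample_size > total_rows:
    sample_size = total_rows - start_row  # clipped to 10
new_last = (start_row + sample_size - 1) % total_rows  # 99
# The next call starts at (99 + 1) % 100 = 0, i.e. the window wraps around.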