From 4841bd6db6c96655674c8e1a8cd83f5177e4fd7a Mon Sep 17 00:00:00 2001 From: Andrew Ridden-Harper Date: Fri, 19 Jul 2024 08:59:21 +1200 Subject: [PATCH] improved documentation --- README.md | 14 ++++++++ config.yaml | 20 ++++-------- examine_calculated_vs30.py | 14 ++++++++ filtering.py | 10 ++++-- load_sql_db.py | 15 ++------- main.py | 65 ++++++-------------------------------- requirements.txt | 5 +++ run_calculations.py | 61 +++++++++++++++++++++++++++++++++++ 8 files changed, 121 insertions(+), 83 deletions(-) create mode 100644 README.md create mode 100644 examine_calculated_vs30.py create mode 100644 run_calculations.py diff --git a/README.md b/README.md new file mode 100644 index 0000000..e529518 --- /dev/null +++ b/README.md @@ -0,0 +1,14 @@ +This package loads CPT data, filters out unusable data, +and calculates Vs30 values for each CPT using several +correlations. + +The calculations and CPT class were implemented by Joel Ridden +in the `vs_calc` package. + +The loading and filtering of input data was adapted from +earlier work by Sung Bae in the `cpt2vs30` package. + +To run this package, first configure the input parameters +by editing the `config.yaml` file. Then run the `main.py` script as + +```python main.py``` \ No newline at end of file diff --git a/config.yaml b/config.yaml index fc15c0a..9f60333 100644 --- a/config.yaml +++ b/config.yaml @@ -1,25 +1,17 @@ -#input_data_dir: "/home/arr65/vs30_data_input_data/csv" -#input_data_format : "csv" - -n_procs : 7 - +## Set the input data directory and data format (csv or sql) input_data_dir : "/home/arr65/vs30_data_input_data/sql" input_data_format : "sql" +## Set the output directory output_dir : "/home/arr65/vs30_data_output/" -# data filtering -min_CPT_separation_dist_m : 0.1 +## Set the number of processors to use +n_procs : 7 +## Set input data filtering parameters +min_CPT_separation_dist_m : 0.1 max_num_same_depth_values : 1 - min_allowed_data_value : -0.2 - max_num_allowed_repeated_digits : 3 - min_allowed_max_depth_m : 5 - min_allowed_depth_span_m : 5 - - - diff --git a/examine_calculated_vs30.py b/examine_calculated_vs30.py new file mode 100644 index 0000000..005ed36 --- /dev/null +++ b/examine_calculated_vs30.py @@ -0,0 +1,14 @@ +import pandas as pd +import matplotlib.pyplot as plt +import numpy as np + +from pathlib import Path + +import config as cfg + +config = cfg.Config() + +output_dir = Path(config.get_value("output_dir")) + +summary_df = pd.read_csv(output_dir / "summary.csv") + diff --git a/filtering.py b/filtering.py index ef067e0..38853d6 100644 --- a/filtering.py +++ b/filtering.py @@ -1,3 +1,9 @@ +""" +This module contains functions to filter out unusable CPT data. +These functions are adapted from earlier work by Sung Bae in +the cpt2vs30 package. +""" + import functools import multiprocessing from collections import Counter @@ -60,9 +66,9 @@ def filtered_out_entry( ) -def no_data_in_cpt(cpt_name: str, cpt_record: np.array) -> Optional[pd.DataFrame]: +def identify_no_data_in_cpt(cpt_name: str, cpt_record: np.array) -> Optional[pd.DataFrame]: """ - Check if there is data in the CPT record. + Identify CPT records that contain no data. Parameters ---------- diff --git a/load_sql_db.py b/load_sql_db.py index 9ff2f07..b179a4f 100644 --- a/load_sql_db.py +++ b/load_sql_db.py @@ -1,4 +1,6 @@ -from collections import Counter +""" +Functions developed by Sung Bae to load CPT data from a SQL database. +""" import numpy as np from sqlalchemy import ( @@ -10,19 +12,8 @@ ) from sqlalchemy.ext.declarative import declarative_base - -def log_error(skipped_fp, cpt_name, error): - skipped_fp.write(f"{cpt_name} - {error}\n") - - -def count_digits(arr): - stringified = str(arr).replace("0", "").replace(".", "") - return Counter(stringified) - - Base = declarative_base() - class CPTLocation(Base): __tablename__ = "cpt_location" id = Column(Integer, primary_key=True) diff --git a/main.py b/main.py index 485051a..3f6a0bb 100644 --- a/main.py +++ b/main.py @@ -1,10 +1,10 @@ """ -The main script to run the Vs30 estimation workflow +The main script for calculating Vs30 values from CPT data. """ -import functools + import glob -import multiprocessing + import time from pathlib import Path @@ -14,6 +14,7 @@ import config as cfg import filtering +import run_calculations import load_sql_db from vs_calc import ( CPT, @@ -23,52 +24,6 @@ ) -def calculate_vs30_from_single_cpt(cpt: CPT, cpt_vs_correlations, vs30_correlations): - - results_df_list = [] - - for cpt_vs_correlation in cpt_vs_correlations: - - for vs30_correlation in vs30_correlations: - - cpt_vs_profile = VsProfile.from_cpt(cpt, cpt_vs_correlation) - - cpt_vs_profile.vs30_correlation = vs30_correlation - - results_df_list.append( - pd.DataFrame( - { - "cpt_name": [cpt.name], - "nztm_x": [cpt.nztm_x], - "nztm_y": [cpt.nztm_y], - "cpt_correlation": [cpt_vs_correlation], - "vs30_correlation": [cpt_vs_profile.vs30_correlation], - "vs30": [cpt_vs_profile.vs30], - "vs30_sd": [cpt_vs_profile.vs30_sd], - } - ) - ) - - return pd.concat(results_df_list, ignore_index=True) - - -def calculate_vs30_from_all_cpts( - cpts, cpt_vs_correlations, vs30_correlations, n_procs=1 -): - - with multiprocessing.Pool(processes=n_procs) as pool: - - results_df_list = pool.map( - functools.partial( - calculate_vs30_from_single_cpt, - cpt_vs_correlations=cpt_vs_correlations, - vs30_correlations=vs30_correlations, - ), - cpts, - ) - - return pd.concat(results_df_list, ignore_index=True) - start_time = time.time() @@ -107,11 +62,11 @@ def calculate_vs30_from_all_cpts( cpt_records = load_sql_db.get_cpt_data(session, cpt_loc.name, columnwise=False) - skipped_record_from_filter = filtering.no_data_in_cpt(cpt_loc.name, cpt_records) + filtered_out_entry = filtering.identify_no_data_in_cpt(cpt_loc.name, cpt_records) - if skipped_record_from_filter is not None: + if filtered_out_entry is not None: filtered_out_df = pd.concat( - [filtered_out_df, skipped_record_from_filter], ignore_index=True + [filtered_out_df, filtered_out_entry], ignore_index=True ) continue @@ -166,7 +121,7 @@ def calculate_vs30_from_all_cpts( print(f"time taken for filtering: {(time.time() - start_time)/60.0} minutes") vs_calc_start_time = time.time() -vs30_results_df = calculate_vs30_from_all_cpts( +vs30_results_df = run_calculations.calculate_vs30_from_all_cpts( cpts=cpts, cpt_vs_correlations=list(cpt_vs_correlations.CPT_CORRELATIONS.keys()), vs30_correlations=list(vs30_correlations.VS30_CORRELATIONS.keys()), @@ -180,10 +135,10 @@ def calculate_vs30_from_all_cpts( # Write output files filtered_out_df.to_csv(output_dir / "filtered_out_with_all_reasons.csv", index=False) vs30_results_df.to_csv(output_dir / "vs30_results.csv", index=False) - - filtered_out_df.drop_duplicates(subset="cpt_name", keep="first").to_csv( output_dir / "filtered_out_only_first_reason.csv", index=False ) +summary_df = pd.DataFrame({"num_remaining": num_cpts_remaining, "num_skipped": num_skipped_for_reason, "reason": reasons }) + print(f"total taken: {(time.time() - start_time)/60.0} minutes") diff --git a/requirements.txt b/requirements.txt index e69de29..0559322 100644 --- a/requirements.txt +++ b/requirements.txt @@ -0,0 +1,5 @@ +pandas +SQLAlchemy +numpy +qcore @ git+https://github.com/ucgmsim/qcore.git +PyYAML \ No newline at end of file diff --git a/run_calculations.py b/run_calculations.py new file mode 100644 index 0000000..5fc7f7c --- /dev/null +++ b/run_calculations.py @@ -0,0 +1,61 @@ +""" +Functions to run the calculations for Vs30 estimation. +The Vs30 calculations were implemented by Joel Ridden +in the vs_calc package. +""" +import functools +import multiprocessing + +import pandas as pd + + +from vs_calc import ( + CPT, + VsProfile) + +def calculate_vs30_from_single_cpt(cpt: CPT, cpt_vs_correlations, vs30_correlations): + + results_df_list = [] + + for cpt_vs_correlation in cpt_vs_correlations: + + for vs30_correlation in vs30_correlations: + + cpt_vs_profile = VsProfile.from_cpt(cpt, cpt_vs_correlation) + + cpt_vs_profile.vs30_correlation = vs30_correlation + + results_df_list.append( + pd.DataFrame( + { + "cpt_name": [cpt.name], + "nztm_x": [cpt.nztm_x], + "nztm_y": [cpt.nztm_y], + "cpt_correlation": [cpt_vs_correlation], + "vs30_correlation": [cpt_vs_profile.vs30_correlation], + "vs30": [cpt_vs_profile.vs30], + "vs30_sd": [cpt_vs_profile.vs30_sd], + } + ) + ) + + return pd.concat(results_df_list, ignore_index=True) + + +def calculate_vs30_from_all_cpts( + cpts, cpt_vs_correlations, vs30_correlations, n_procs=1 +): + + with multiprocessing.Pool(processes=n_procs) as pool: + + results_df_list = pool.map( + functools.partial( + calculate_vs30_from_single_cpt, + cpt_vs_correlations=cpt_vs_correlations, + vs30_correlations=vs30_correlations, + ), + cpts, + ) + + return pd.concat(results_df_list, ignore_index=True) +