
Commit

improved documentation
AndrewRidden-Harper committed Jul 18, 2024
1 parent 714445f commit 4841bd6
Showing 8 changed files with 121 additions and 83 deletions.
14 changes: 14 additions & 0 deletions README.md
@@ -0,0 +1,14 @@
This package loads cone penetration test (CPT) data, filters out
unusable records, and calculates Vs30 (the time-averaged shear-wave
velocity over the top 30 m) for each CPT using several correlations.

The calculations and CPT class were implemented by Joel Ridden
in the `vs_calc` package.

The loading and filtering of input data was adapted from
earlier work by Sung Bae in the `cpt2vs30` package.

To run this package, first configure the input parameters
by editing the `config.yaml` file. Then run the `main.py` script as

```
python main.py
```
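
After the run finishes, the Vs30 estimates and the filtering log are written as CSV files to the configured output directory. A minimal sketch (not part of the package) for loading the results, assuming the `output_dir` set in `config.yaml` and the column names written by `run_calculations.py`:

```python
from pathlib import Path

import pandas as pd

# output_dir as set in config.yaml (adjust to your own path)
output_dir = Path("/home/arr65/vs30_data_output/")

# One row per CPT per (CPT-to-Vs correlation, Vs30 correlation) combination
vs30_results = pd.read_csv(output_dir / "vs30_results.csv")
print(vs30_results[["cpt_name", "cpt_correlation", "vs30_correlation", "vs30", "vs30_sd"]].head())

# CPTs that were filtered out, with every reason that applied
filtered_out = pd.read_csv(output_dir / "filtered_out_with_all_reasons.csv")
print(f"{len(filtered_out)} CPT entries were filtered out")
```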
20 changes: 6 additions & 14 deletions config.yaml
@@ -1,25 +1,17 @@
#input_data_dir: "/home/arr65/vs30_data_input_data/csv"
#input_data_format : "csv"

n_procs : 7

## Set the input data directory and data format (csv or sql)
input_data_dir : "/home/arr65/vs30_data_input_data/sql"
input_data_format : "sql"

## Set the output directory
output_dir : "/home/arr65/vs30_data_output/"

# data filtering
min_CPT_separation_dist_m : 0.1
## Set the number of processors to use
n_procs : 7

## Set input data filtering parameters
min_CPT_separation_dist_m : 0.1
max_num_same_depth_values : 1

min_allowed_data_value : -0.2

max_num_allowed_repeated_digits : 3

min_allowed_max_depth_m : 5

min_allowed_depth_span_m : 5
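
These values are read at run time through the `Config` class in `config.py` (not shown in this commit). A minimal sketch of how the rest of the code accesses them, based on the usage in `main.py` and `examine_calculated_vs30.py`; keys other than `output_dir` are assumed to be read the same way:

```python
from pathlib import Path

import config as cfg

config = cfg.Config()

# Values come straight from config.yaml via get_value()
output_dir = Path(config.get_value("output_dir"))
n_procs = config.get_value("n_procs")
min_allowed_max_depth_m = config.get_value("min_allowed_max_depth_m")
```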



14 changes: 14 additions & 0 deletions examine_calculated_vs30.py
@@ -0,0 +1,14 @@
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from pathlib import Path

import config as cfg

config = cfg.Config()

output_dir = Path(config.get_value("output_dir"))

summary_df = pd.read_csv(output_dir / "summary.csv")
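
An illustrative continuation of the script above (not part of the committed file): summarise and plot how many CPTs were skipped for each filtering reason, assuming `summary.csv` contains the columns built for `summary_df` in `main.py` (`num_remaining`, `num_skipped`, `reason`):

```python
# Total number of CPT entries filtered out across all reasons
total_skipped = int(np.sum(summary_df["num_skipped"]))
print(f"{total_skipped} CPT entries were filtered out in total")

# Bar chart of skipped CPTs per filtering reason
fig, ax = plt.subplots()
ax.bar(summary_df["reason"], summary_df["num_skipped"])
ax.set_xlabel("Filtering reason")
ax.set_ylabel("Number of CPTs skipped")
ax.tick_params(axis="x", rotation=45)
fig.tight_layout()
plt.show()
```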

10 changes: 8 additions & 2 deletions filtering.py
@@ -1,3 +1,9 @@
"""
This module contains functions to filter out unusable CPT data.
These functions are adapted from earlier work by Sung Bae in
the cpt2vs30 package.
"""

import functools
import multiprocessing
from collections import Counter
@@ -60,9 +66,9 @@ def filtered_out_entry(
)


def no_data_in_cpt(cpt_name: str, cpt_record: np.array) -> Optional[pd.DataFrame]:
def identify_no_data_in_cpt(cpt_name: str, cpt_record: np.array) -> Optional[pd.DataFrame]:
"""
Check if there is data in the CPT record.
Identify CPT records that contain no data.
Parameters
----------
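
A minimal usage sketch, mirroring how `main.py` calls this module: each filter function returns a small DataFrame describing why a CPT was filtered out, or `None` if the record passes the check. The CPT name and the empty record shape below are hypothetical:

```python
import numpy as np

import filtering

cpt_name = "CPT_12345"          # hypothetical CPT name
cpt_records = np.empty((0, 4))  # empty record (shape is a guess), so the check should trigger

filtered_out_entry = filtering.identify_no_data_in_cpt(cpt_name, cpt_records)

if filtered_out_entry is not None:
    # A one-row DataFrame recording the CPT name and the reason it was skipped
    print(filtered_out_entry)
```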
15 changes: 3 additions & 12 deletions load_sql_db.py
@@ -1,4 +1,6 @@
from collections import Counter
"""
Functions developed by Sung Bae to load CPT data from a SQL database.
"""

import numpy as np
from sqlalchemy import (
@@ -10,19 +12,8 @@
)
from sqlalchemy.ext.declarative import declarative_base


def log_error(skipped_fp, cpt_name, error):
skipped_fp.write(f"{cpt_name} - {error}\n")


def count_digits(arr):
stringified = str(arr).replace("0", "").replace(".", "")
return Counter(stringified)


Base = declarative_base()


class CPTLocation(Base):
__tablename__ = "cpt_location"
id = Column(Integer, primary_key=True)
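
A minimal sketch of how this module is used, based on the calls visible in `main.py`. The database path and the query for CPT locations are assumptions; the real session setup lives in the parts of the code not shown in this diff:

```python
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

import load_sql_db

# Hypothetical path to an SQLite CPT database under input_data_dir
engine = create_engine("sqlite:////home/arr65/vs30_data_input_data/sql/cpt.db")
session = sessionmaker(bind=engine)()

# One standard SQLAlchemy way to iterate over the stored CPT locations
for cpt_loc in session.query(load_sql_db.CPTLocation).all():
    # Raw measurements for this CPT as a numpy array (as consumed by filtering.py)
    cpt_records = load_sql_db.get_cpt_data(session, cpt_loc.name, columnwise=False)
```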
65 changes: 10 additions & 55 deletions main.py
@@ -1,10 +1,10 @@
"""
The main script to run the Vs30 estimation workflow
The main script for calculating Vs30 values from CPT data.
"""

import functools

import glob
import multiprocessing

import time
from pathlib import Path

@@ -14,6 +14,7 @@

import config as cfg
import filtering
import run_calculations
import load_sql_db
from vs_calc import (
CPT,
@@ -23,52 +24,6 @@
)


def calculate_vs30_from_single_cpt(cpt: CPT, cpt_vs_correlations, vs30_correlations):

results_df_list = []

for cpt_vs_correlation in cpt_vs_correlations:

for vs30_correlation in vs30_correlations:

cpt_vs_profile = VsProfile.from_cpt(cpt, cpt_vs_correlation)

cpt_vs_profile.vs30_correlation = vs30_correlation

results_df_list.append(
pd.DataFrame(
{
"cpt_name": [cpt.name],
"nztm_x": [cpt.nztm_x],
"nztm_y": [cpt.nztm_y],
"cpt_correlation": [cpt_vs_correlation],
"vs30_correlation": [cpt_vs_profile.vs30_correlation],
"vs30": [cpt_vs_profile.vs30],
"vs30_sd": [cpt_vs_profile.vs30_sd],
}
)
)

return pd.concat(results_df_list, ignore_index=True)


def calculate_vs30_from_all_cpts(
cpts, cpt_vs_correlations, vs30_correlations, n_procs=1
):

with multiprocessing.Pool(processes=n_procs) as pool:

results_df_list = pool.map(
functools.partial(
calculate_vs30_from_single_cpt,
cpt_vs_correlations=cpt_vs_correlations,
vs30_correlations=vs30_correlations,
),
cpts,
)

return pd.concat(results_df_list, ignore_index=True)


start_time = time.time()

@@ -107,11 +62,11 @@ def calculate_vs30_from_all_cpts(

cpt_records = load_sql_db.get_cpt_data(session, cpt_loc.name, columnwise=False)

skipped_record_from_filter = filtering.no_data_in_cpt(cpt_loc.name, cpt_records)
filtered_out_entry = filtering.identify_no_data_in_cpt(cpt_loc.name, cpt_records)

if skipped_record_from_filter is not None:
if filtered_out_entry is not None:
filtered_out_df = pd.concat(
[filtered_out_df, skipped_record_from_filter], ignore_index=True
[filtered_out_df, filtered_out_entry], ignore_index=True
)
continue

@@ -166,7 +121,7 @@ def calculate_vs30_from_all_cpts(
print(f"time taken for filtering: {(time.time() - start_time)/60.0} minutes")

vs_calc_start_time = time.time()
vs30_results_df = calculate_vs30_from_all_cpts(
vs30_results_df = run_calculations.calculate_vs30_from_all_cpts(
cpts=cpts,
cpt_vs_correlations=list(cpt_vs_correlations.CPT_CORRELATIONS.keys()),
vs30_correlations=list(vs30_correlations.VS30_CORRELATIONS.keys()),
@@ -180,10 +135,10 @@ def calculate_vs30_from_all_cpts(
# Write output files
filtered_out_df.to_csv(output_dir / "filtered_out_with_all_reasons.csv", index=False)
vs30_results_df.to_csv(output_dir / "vs30_results.csv", index=False)


filtered_out_df.drop_duplicates(subset="cpt_name", keep="first").to_csv(
output_dir / "filtered_out_only_first_reason.csv", index=False
)
summary_df = pd.DataFrame(
    {
        "num_remaining": num_cpts_remaining,
        "num_skipped": num_skipped_for_reason,
        "reason": reasons,
    }
)


print(f"total taken: {(time.time() - start_time)/60.0} minutes")
5 changes: 5 additions & 0 deletions requirements.txt
@@ -0,0 +1,5 @@
pandas
SQLAlchemy
numpy
qcore @ git+https://github.com/ucgmsim/qcore.git
PyYAML
61 changes: 61 additions & 0 deletions run_calculations.py
@@ -0,0 +1,61 @@
"""
Functions to run the calculations for Vs30 estimation.
The Vs30 calculations were implemented by Joel Ridden
in the vs_calc package.
"""
import functools
import multiprocessing

import pandas as pd


from vs_calc import (
    CPT,
    VsProfile,
)

def calculate_vs30_from_single_cpt(cpt: CPT, cpt_vs_correlations, vs30_correlations):
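    """
    Calculate Vs30 for one CPT using every combination of the given
    CPT-to-Vs and Vs30 correlations, returning one result row per combination.
    """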

results_df_list = []

for cpt_vs_correlation in cpt_vs_correlations:

for vs30_correlation in vs30_correlations:

cpt_vs_profile = VsProfile.from_cpt(cpt, cpt_vs_correlation)

cpt_vs_profile.vs30_correlation = vs30_correlation

results_df_list.append(
pd.DataFrame(
{
"cpt_name": [cpt.name],
"nztm_x": [cpt.nztm_x],
"nztm_y": [cpt.nztm_y],
"cpt_correlation": [cpt_vs_correlation],
"vs30_correlation": [cpt_vs_profile.vs30_correlation],
"vs30": [cpt_vs_profile.vs30],
"vs30_sd": [cpt_vs_profile.vs30_sd],
}
)
)

return pd.concat(results_df_list, ignore_index=True)


def calculate_vs30_from_all_cpts(
cpts, cpt_vs_correlations, vs30_correlations, n_procs=1
):
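    """
    Calculate Vs30 for every CPT in `cpts`, spreading the per-CPT
    calculations over `n_procs` worker processes.
    """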

with multiprocessing.Pool(processes=n_procs) as pool:

results_df_list = pool.map(
functools.partial(
calculate_vs30_from_single_cpt,
cpt_vs_correlations=cpt_vs_correlations,
vs30_correlations=vs30_correlations,
),
cpts,
)

return pd.concat(results_df_list, ignore_index=True)
