From 4841bd6db6c96655674c8e1a8cd83f5177e4fd7a Mon Sep 17 00:00:00 2001
From: Andrew Ridden-Harper <andrew.ridden-harper@canterbury.ac.nz>
Date: Fri, 19 Jul 2024 08:59:21 +1200
Subject: [PATCH] Add docs; move Vs30 calculations to run_calculations.py

---
 README.md                  | 14 ++++++++
 config.yaml                | 20 ++++--------
 examine_calculated_vs30.py | 14 ++++++++
 filtering.py               | 10 ++++--
 load_sql_db.py             | 15 ++-------
 main.py                    | 65 ++++++--------------------------------
 requirements.txt           |  5 +++
 run_calculations.py        | 61 +++++++++++++++++++++++++++++++++++
 8 files changed, 121 insertions(+), 83 deletions(-)
 create mode 100644 README.md
 create mode 100644 examine_calculated_vs30.py
 create mode 100644 run_calculations.py

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..e529518
--- /dev/null
+++ b/README.md
@@ -0,0 +1,14 @@
+This package loads CPT data, filters out unusable data,
+and calculates Vs30 values for each CPT using several
+correlations.
+
+The calculations and CPT class were implemented by Joel Ridden
+in the `vs_calc` package.
+
+The loading and filtering of input data were adapted from
+earlier work by Sung Bae in the `cpt2vs30` package.
+
+To run this package, first configure the input parameters
+by editing the `config.yaml` file. Then run the `main.py` script with:
+
+`python main.py`
\ No newline at end of file
diff --git a/config.yaml b/config.yaml
index fc15c0a..9f60333 100644
--- a/config.yaml
+++ b/config.yaml
@@ -1,25 +1,17 @@
-#input_data_dir: "/home/arr65/vs30_data_input_data/csv"
-#input_data_format : "csv"
-
-n_procs : 7
-
+## Set the input data directory and data format (csv or sql)
 input_data_dir : "/home/arr65/vs30_data_input_data/sql"
 input_data_format : "sql"
 
+## Set the output directory
 output_dir : "/home/arr65/vs30_data_output/"
 
-# data filtering
-min_CPT_separation_dist_m : 0.1
+## Set the number of processors to use
+n_procs : 7
 
+## Set input data filtering parameters
+min_CPT_separation_dist_m : 0.1
 max_num_same_depth_values : 1
-
 min_allowed_data_value : -0.2
-
 max_num_allowed_repeated_digits : 3
-
 min_allowed_max_depth_m : 5
-
 min_allowed_depth_span_m : 5
-
-
-
diff --git a/examine_calculated_vs30.py b/examine_calculated_vs30.py
new file mode 100644
index 0000000..005ed36
--- /dev/null
+++ b/examine_calculated_vs30.py
@@ -0,0 +1,14 @@
+import pandas as pd
+import matplotlib.pyplot as plt
+import numpy as np
+
+from pathlib import Path
+
+import config as cfg
+
+config = cfg.Config()
+# Output directory from the "output_dir" setting (presumably config.yaml - confirm).
+output_dir = Path(config.get_value("output_dir"))
+# Expects summary.csv in output_dir; TODO confirm main.py actually writes this file.
+summary_df = pd.read_csv(output_dir / "summary.csv")
+
diff --git a/filtering.py b/filtering.py
index ef067e0..38853d6 100644
--- a/filtering.py
+++ b/filtering.py
@@ -1,3 +1,9 @@
+"""
+This module contains functions to filter out unusable CPT data.
+These functions are adapted from earlier work by Sung Bae in
+the cpt2vs30 package.
+"""
+
 import functools
 import multiprocessing
 from collections import Counter
@@ -60,9 +66,9 @@ def filtered_out_entry(
     )
 
 
-def no_data_in_cpt(cpt_name: str, cpt_record: np.array) -> Optional[pd.DataFrame]:
+def identify_no_data_in_cpt(cpt_name: str, cpt_record: np.array) -> Optional[pd.DataFrame]:
     """
-    Check if there is data in the CPT record.
+    Identify CPT records that contain no data.
 
     Parameters
     ----------
diff --git a/load_sql_db.py b/load_sql_db.py
index 9ff2f07..b179a4f 100644
--- a/load_sql_db.py
+++ b/load_sql_db.py
@@ -1,4 +1,6 @@
-from collections import Counter
+"""
+Functions developed by Sung Bae to load CPT data from a SQL database.
+"""
 
 import numpy as np
 from sqlalchemy import (
@@ -10,19 +12,8 @@
 )
 from sqlalchemy.ext.declarative import declarative_base
 
-
-def log_error(skipped_fp, cpt_name, error):
-    skipped_fp.write(f"{cpt_name} - {error}\n")
-
-
-def count_digits(arr):
-    stringified = str(arr).replace("0", "").replace(".", "")
-    return Counter(stringified)
-
-
 Base = declarative_base()
 
-
 class CPTLocation(Base):
     __tablename__ = "cpt_location"
     id = Column(Integer, primary_key=True)
diff --git a/main.py b/main.py
index 485051a..3f6a0bb 100644
--- a/main.py
+++ b/main.py
@@ -1,10 +1,10 @@
 """
-The main script to run the Vs30 estimation workflow
+The main script for calculating Vs30 values from CPT data.
 """
 
-import functools
+
 import glob
-import multiprocessing
+
 import time
 from pathlib import Path
 
@@ -14,6 +14,7 @@
 
 import config as cfg
 import filtering
+import run_calculations
 import load_sql_db
 from vs_calc import (
     CPT,
@@ -23,52 +24,6 @@
 )
 
 
-def calculate_vs30_from_single_cpt(cpt: CPT, cpt_vs_correlations, vs30_correlations):
-
-    results_df_list = []
-
-    for cpt_vs_correlation in cpt_vs_correlations:
-
-        for vs30_correlation in vs30_correlations:
-
-            cpt_vs_profile = VsProfile.from_cpt(cpt, cpt_vs_correlation)
-
-            cpt_vs_profile.vs30_correlation = vs30_correlation
-
-            results_df_list.append(
-                pd.DataFrame(
-                    {
-                        "cpt_name": [cpt.name],
-                        "nztm_x": [cpt.nztm_x],
-                        "nztm_y": [cpt.nztm_y],
-                        "cpt_correlation": [cpt_vs_correlation],
-                        "vs30_correlation": [cpt_vs_profile.vs30_correlation],
-                        "vs30": [cpt_vs_profile.vs30],
-                        "vs30_sd": [cpt_vs_profile.vs30_sd],
-                    }
-                )
-            )
-
-    return pd.concat(results_df_list, ignore_index=True)
-
-
-def calculate_vs30_from_all_cpts(
-    cpts, cpt_vs_correlations, vs30_correlations, n_procs=1
-):
-
-    with multiprocessing.Pool(processes=n_procs) as pool:
-
-        results_df_list = pool.map(
-            functools.partial(
-                calculate_vs30_from_single_cpt,
-                cpt_vs_correlations=cpt_vs_correlations,
-                vs30_correlations=vs30_correlations,
-            ),
-            cpts,
-        )
-
-    return pd.concat(results_df_list, ignore_index=True)
-
 
 start_time = time.time()
 
@@ -107,11 +62,11 @@ def calculate_vs30_from_all_cpts(
 
         cpt_records = load_sql_db.get_cpt_data(session, cpt_loc.name, columnwise=False)
 
-        skipped_record_from_filter = filtering.no_data_in_cpt(cpt_loc.name, cpt_records)
+        filtered_out_entry = filtering.identify_no_data_in_cpt(cpt_loc.name, cpt_records)
 
-        if skipped_record_from_filter is not None:
+        if filtered_out_entry is not None:
             filtered_out_df = pd.concat(
-                [filtered_out_df, skipped_record_from_filter], ignore_index=True
+                [filtered_out_df, filtered_out_entry], ignore_index=True
             )
             continue
 
@@ -166,7 +121,7 @@ def calculate_vs30_from_all_cpts(
 print(f"time taken for filtering: {(time.time() - start_time)/60.0} minutes")
 
 vs_calc_start_time = time.time()
-vs30_results_df = calculate_vs30_from_all_cpts(
+vs30_results_df = run_calculations.calculate_vs30_from_all_cpts(
     cpts=cpts,
     cpt_vs_correlations=list(cpt_vs_correlations.CPT_CORRELATIONS.keys()),
     vs30_correlations=list(vs30_correlations.VS30_CORRELATIONS.keys()),
@@ -180,10 +135,10 @@ def calculate_vs30_from_all_cpts(
 # Write output files
 filtered_out_df.to_csv(output_dir / "filtered_out_with_all_reasons.csv", index=False)
 vs30_results_df.to_csv(output_dir / "vs30_results.csv", index=False)
-
-
 filtered_out_df.drop_duplicates(subset="cpt_name", keep="first").to_csv(
     output_dir / "filtered_out_only_first_reason.csv", index=False
 )
+summary_df = pd.DataFrame({"num_remaining": num_cpts_remaining, "num_skipped": num_skipped_for_reason, "reason": reasons })
+summary_df.to_csv(output_dir / "summary.csv", index=False)  # read by examine_calculated_vs30.py
 
 print(f"total taken: {(time.time() - start_time)/60.0} minutes")
diff --git a/requirements.txt b/requirements.txt
index e69de29..0559322 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1,6 @@
+pandas
+SQLAlchemy
+numpy
+matplotlib
+qcore @ git+https://github.com/ucgmsim/qcore.git
+PyYAML
\ No newline at end of file
diff --git a/run_calculations.py b/run_calculations.py
new file mode 100644
index 0000000..5fc7f7c
--- /dev/null
+++ b/run_calculations.py
@@ -0,0 +1,61 @@
+"""
+Functions to run the calculations for Vs30 estimation.
+The Vs30 calculations were implemented by Joel Ridden
+in the vs_calc package.
+"""
+import functools
+import multiprocessing
+
+import pandas as pd
+
+
+from vs_calc import (
+    CPT,
+    VsProfile)
+
+def calculate_vs30_from_single_cpt(cpt: CPT, cpt_vs_correlations, vs30_correlations):
+    """
+    Calculate Vs30 for one CPT with every pairing of the given correlations.
+
+    A new ``VsProfile`` is built from ``cpt`` for each
+    (CPT-Vs correlation, Vs30 correlation) pair, in that nesting order.
+
+    Returns a DataFrame with one row per pair and the columns cpt_name,
+    nztm_x, nztm_y, cpt_correlation, vs30_correlation, vs30 and vs30_sd.
+    """
+    rows = []
+    for cpt_correlation in cpt_vs_correlations:
+        for vs30_name in vs30_correlations:
+            profile = VsProfile.from_cpt(cpt, cpt_correlation)
+            profile.vs30_correlation = vs30_name
+            row = {
+                "cpt_name": [cpt.name],
+                "nztm_x": [cpt.nztm_x],
+                "nztm_y": [cpt.nztm_y],
+                "cpt_correlation": [cpt_correlation],
+                "vs30_correlation": [profile.vs30_correlation],
+                "vs30": [profile.vs30],
+                "vs30_sd": [profile.vs30_sd],
+            }
+            rows.append(pd.DataFrame(row))
+
+    return pd.concat(rows, ignore_index=True)
+
+
+def calculate_vs30_from_all_cpts(
+    cpts, cpt_vs_correlations, vs30_correlations, n_procs=1
+):
+    """
+    Calculate Vs30 for every CPT in ``cpts`` using a pool of ``n_procs``
+    worker processes, and return all results in a single DataFrame.
+    """
+    # Fix the correlation arguments so the pool only has to supply the CPT.
+    worker = functools.partial(
+        calculate_vs30_from_single_cpt,
+        cpt_vs_correlations=cpt_vs_correlations,
+        vs30_correlations=vs30_correlations,
+    )
+    with multiprocessing.Pool(processes=n_procs) as pool:
+        per_cpt_results = pool.map(worker, cpts)
+    return pd.concat(per_cpt_results, ignore_index=True)
+