Skip to content

Commit

Permalink
Backend PHRMA Cancer: Adds _pct_share cols (SatcherInstitute#3584)
Browse files Browse the repository at this point in the history
# Description and Motivation
<!--- bulleted, high level items. use keywords (eg "closes SatcherInstitute#144" or
"fixes #4323") -->

- part of SatcherInstitute#3579
- extracts `build_bq_col_types()` into a util function that scans the df and
automatically identifies the metric cols based on their suffixes
- calculates `_pct_share` cols with or without "unknowns", depending on the
demographic breakdown and what's reflected in the source data
- updates golden data with new cols

## Has this been tested? How?

- tests passing

## Types of changes

(leave all that apply)

- New content or feature

## New frontend preview link is below in the Netlify comment 😎

---------

Co-authored-by: Eric M Warren II <eric.m.warren1@gmail.com>
  • Loading branch information
2 people authored and kccrtv committed Aug 27, 2024
1 parent 6dd293c commit 5c77f18
Show file tree
Hide file tree
Showing 12 changed files with 2,288 additions and 2,262 deletions.
43 changes: 33 additions & 10 deletions python/datasources/phrma_brfss.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
import pandas as pd
from datasources.data_source import DataSource
from ingestion.constants import NATIONAL_LEVEL, ALL_VALUE, US_NAME

from ingestion.constants import NATIONAL_LEVEL, ALL_VALUE, US_NAME, UNKNOWN
from ingestion import gcs_to_bq_util, standardized_columns as std_col
from ingestion.merge_utils import merge_state_ids
from ingestion.dataset_utils import (
generate_pct_share_col_without_unknowns,
generate_pct_share_col_with_unknowns,
build_bq_col_types,
)
from ingestion.het_types import (
GEO_TYPE,
PHRMA_BREAKDOWN_TYPE,
Expand Down Expand Up @@ -48,9 +52,8 @@ def write_to_bq(self, dataset, gcs_bucket, **attrs):

df = self.generate_breakdown_df(demo_type, geo_level)

float_cols = []
col_types = gcs_to_bq_util.get_bq_column_types(df, float_cols)
gcs_to_bq_util.add_df_to_bq(df, dataset, table_name, column_types=col_types)
bq_col_types = build_bq_col_types(df)
gcs_to_bq_util.add_df_to_bq(df, dataset, table_name, column_types=bq_col_types)

def generate_breakdown_df(
self,
Expand Down Expand Up @@ -88,18 +91,38 @@ def generate_breakdown_df(

# rename count cols
rename_col_map = {}
count_to_pct_share_map = {}
for condition in PHRMA_CANCER_PCT_CONDITIONS:
cancer_type = condition.lower()
rate_numerator = f'{condition}_{COUNT_YES_LOWER}'
rate_denominator = f'{condition}_{COUNT_TOTAL_LOWER}'
rename_col_map[rate_numerator] = f'{cancer_type}_{SCREENED}_{std_col.RAW_SUFFIX}'
rename_col_map[rate_denominator] = f'{cancer_type}_{SCREENING_ELIGIBLE}_{std_col.RAW_SUFFIX}'

source_rate_numerator = f'{condition}_{COUNT_YES_LOWER}'
source_rate_denominator = f'{condition}_{COUNT_TOTAL_LOWER}'
het_rate_numerator = f'{cancer_type}_{SCREENED}_{std_col.RAW_SUFFIX}'
het_rate_denominator = f'{cancer_type}_{SCREENING_ELIGIBLE}_{std_col.RAW_SUFFIX}'
het_pct_share = f'{cancer_type}_{SCREENED}_{std_col.PCT_SHARE_SUFFIX}'
rename_col_map[source_rate_numerator] = het_rate_numerator
count_to_pct_share_map[het_rate_numerator] = het_pct_share
rename_col_map[source_rate_denominator] = het_rate_denominator
df = df.rename(columns=rename_col_map)

if demo_breakdown == std_col.RACE_OR_HISPANIC_COL:
std_col.add_race_columns_from_category_id(df)

if demo_breakdown in [std_col.RACE_OR_HISPANIC_COL, std_col.AGE_COL]:
df = generate_pct_share_col_without_unknowns(
df,
count_to_pct_share_map,
demo_breakdown,
ALL_VALUE,
)
else:
df = generate_pct_share_col_with_unknowns(
df,
count_to_pct_share_map,
demo_breakdown,
ALL_VALUE,
UNKNOWN,
)

df = df.sort_values(by=[std_col.STATE_FIPS_COL, demo_col]).reset_index(drop=True)

return df
15 changes: 9 additions & 6 deletions python/ingestion/dataset_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -789,16 +789,19 @@ def get_timeview_df_and_cols(
if time_view == 'current':
df = preserve_most_recent_year_rows_per_topic(df, topic_prefixes)

# build BigQuery types dict
bq_col_types: Dict[str, str] = {}
for kept_col in df.columns:
bq_col_types[kept_col] = (
BQ_FLOAT if std_col.ends_with_suffix_from_list(kept_col, std_col.SUFFIXES) else BQ_STRING
)
bq_col_types = build_bq_col_types(df)

return (df, bq_col_types)


def build_bq_col_types(df: pd.DataFrame) -> Dict[str, str]:
    """Build the column-name -> BigQuery-type mapping for every column in `df`.

    A column is typed as BQ_FLOAT when
    `std_col.ends_with_suffix_from_list(col, std_col.SUFFIXES)` is truthy for
    its name; every other column is typed as BQ_STRING.

    Parameters:
        df: the dataframe whose columns should be typed

    Returns:
        A dict keyed by column name, with the BigQuery type as the value.
    """
    return {
        name: (BQ_FLOAT if std_col.ends_with_suffix_from_list(name, std_col.SUFFIXES) else BQ_STRING)
        for name in df.columns
    }


# TODO: Remove in favor of new function get_timeview_df_and_cols() above
def generate_time_df_with_cols_and_types(
df: pd.DataFrame,
Expand Down
32 changes: 16 additions & 16 deletions python/tests/data/phrma_brfss/golden_data/expected_age_national.csv
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
age,breast_screened_estimated_total,breast_screening_eligible_estimated_total,state_fips,cervical_screened_estimated_total,cervical_screening_eligible_estimated_total,colorectal_screened_estimated_total,colorectal_screening_eligible_estimated_total,lung_screened_estimated_total,lung_screening_eligible_estimated_total,prostate_screened_estimated_total,prostate_screening_eligible_estimated_total,breast_screened_pct_rate,cervical_screened_pct_rate,colorectal_screened_pct_rate,lung_screened_pct_rate,prostate_screened_pct_rate,state_name
21-24,,,00,1868.0,6170.0,,,,,,,,26.0,,,,United States
25-29,,,00,3653.0,8534.0,,,,,,,,40.0,,,,United States
30-34,,,00,4841.0,10474.0,,,,,,,,44.0,,,,United States
35-39,,,00,5778.0,11870.0,,,,,,,,46.0,,,,United States
40-44,,,00,6911.0,12770.0,,,,,,,,52.0,,,,United States
45-49,,,00,6741.0,12324.0,7430.0,25200.0,,,,,,52.0,30.0,,,United States
50-54,11932.0,16118.0,00,7668.0,14828.0,16788.0,29794.0,215.0,1701.0,,,74.0,51.0,54.0,13.0,,United States
55-59,13224.0,17541.0,00,7980.0,16110.0,23976.0,32699.0,594.0,2595.0,284.0,884.0,76.0,50.0,71.0,22.0,30.0,United States
60-64,16850.0,21889.0,00,,,30420.0,39805.0,1092.0,3604.0,435.0,1059.0,76.0,,75.0,29.0,35.0,United States
60-65,,,00,11135.0,24842.0,,,,,,,,44.0,,,,United States
65-69,18526.0,23217.0,00,,,34570.0,42506.0,1274.0,3669.0,563.0,1116.0,79.0,,80.0,34.0,49.0,United States
70-74,17273.0,21512.0,00,,,,,1090.0,2995.0,,,79.0,,,35.0,,United States
70-75,,,00,,,39433.0,47189.0,,,,,,,83.0,,,United States
75-79,,,00,,,,,546.0,1642.0,,,,,,38.0,,United States
All,77805.0,100277.0,00,56575.0,117922.0,152617.0,217193.0,4811.0,16206.0,1282.0,3059.0,77.0,45.0,66.0,28.0,37.0,United States
age,breast_screened_estimated_total,breast_screening_eligible_estimated_total,state_fips,cervical_screened_estimated_total,cervical_screening_eligible_estimated_total,colorectal_screened_estimated_total,colorectal_screening_eligible_estimated_total,lung_screened_estimated_total,lung_screening_eligible_estimated_total,prostate_screened_estimated_total,prostate_screening_eligible_estimated_total,breast_screened_pct_rate,cervical_screened_pct_rate,colorectal_screened_pct_rate,lung_screened_pct_rate,prostate_screened_pct_rate,state_name,breast_screened_pct_share,cervical_screened_pct_share,colorectal_screened_pct_share,lung_screened_pct_share,prostate_screened_pct_share
21-24,,,00,1868.0,6170.0,,,,,,,,26.0,,,,United States,,3.3,,,
25-29,,,00,3653.0,8534.0,,,,,,,,40.0,,,,United States,,6.5,,,
30-34,,,00,4841.0,10474.0,,,,,,,,44.0,,,,United States,,8.6,,,
35-39,,,00,5778.0,11870.0,,,,,,,,46.0,,,,United States,,10.2,,,
40-44,,,00,6911.0,12770.0,,,,,,,,52.0,,,,United States,,12.2,,,
45-49,,,00,6741.0,12324.0,7430.0,25200.0,,,,,,52.0,30.0,,,United States,,11.9,4.9,,
50-54,11932.0,16118.0,00,7668.0,14828.0,16788.0,29794.0,215.0,1701.0,,,74.0,51.0,54.0,13.0,,United States,15.3,13.6,11.0,4.5,
55-59,13224.0,17541.0,00,7980.0,16110.0,23976.0,32699.0,594.0,2595.0,284.0,884.0,76.0,50.0,71.0,22.0,30.0,United States,17.0,14.1,15.7,12.3,22.2
60-64,16850.0,21889.0,00,,,30420.0,39805.0,1092.0,3604.0,435.0,1059.0,76.0,,75.0,29.0,35.0,United States,21.7,,19.9,22.7,33.9
60-65,,,00,11135.0,24842.0,,,,,,,,44.0,,,,United States,,19.7,,,
65-69,18526.0,23217.0,00,,,34570.0,42506.0,1274.0,3669.0,563.0,1116.0,79.0,,80.0,34.0,49.0,United States,23.8,,22.7,26.5,43.9
70-74,17273.0,21512.0,00,,,,,1090.0,2995.0,,,79.0,,,35.0,,United States,22.2,,,22.7,
70-75,,,00,,,39433.0,47189.0,,,,,,,83.0,,,United States,,,25.8,,
75-79,,,00,,,,,546.0,1642.0,,,,,,38.0,,United States,,,,11.3,
All,77805.0,100277.0,00,56575.0,117922.0,152617.0,217193.0,4811.0,16206.0,1282.0,3059.0,77.0,45.0,66.0,28.0,37.0,United States,100.0,100.0,100.0,100.0,100.0
Loading

0 comments on commit 5c77f18

Please sign in to comment.