Skip to content

Commit

Permalink
add USHD test with draws
Browse files Browse the repository at this point in the history
  • Loading branch information
ADucellierIHME committed Oct 29, 2024
1 parent e8f1449 commit 1377a23
Show file tree
Hide file tree
Showing 6 changed files with 274 additions and 4 deletions.
Binary file added docs/user_guide/figures/raking_USHD.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
1 change: 1 addition & 0 deletions docs/user_guide/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ User guide
what_is_raking
raking_without_u
raking_with_u
special_cases

This user guide introduces and explains some key concepts of raking.
If you are a new user starting out with raking, we recommend you to start with the
Expand Down
46 changes: 46 additions & 0 deletions docs/user_guide/special_cases.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
Special cases
=======================

We included other versions of the raking with specific constraints that are more complex than the 1D, 2D or 3D cases.

USHD raking
-----------

.. figure:: figures/raking_USHD.png

Raking the deaths count over cause, race and county.

In that case, the only known margins are the GBD values for the number of deaths at the state level for each cause of death.

The observations data frame looks like this:

===== ====== ====== ======
value cause race county
===== ====== ====== ======
float string string string
float string string string
float string string string
float string string string
===== ====== ====== ======

It should include the all causes value (denoted by cause = '_all') for each race (including all races) and county, and the all races value (denoted by race = 0) for each cause (including all causes) and county. You must make sure that all other causes, when sorted in alphabetical order, rank after '_all' (e.g. '_comm', '_inj' and '_ncd' will work), and that all other races, when sorted in ascending order, rank after 0 (e.g. 1, 2, 3, 4 and 7 will work).

The margins data frame looks like this:

===== ==========================
cause value_agg_over_race_county
===== ==========================
_all float
_comm float
_inj float
_ncd float
===== ==========================

It contains the GBD values.

The inputs of the raking function are similar to the 1D, 2D and 3D cases:

* dim: Enter 'USHD'.
* df_obs: Enter the pandas data frame containing the observations. It must be formatted as explained above.
* df_margins: Enter a list containing the margins data frame. It must be formatted as explained above.
* var_names: Enter None.
106 changes: 106 additions & 0 deletions src/raking/compute_covariance.py
Original file line number Diff line number Diff line change
Expand Up @@ -675,6 +675,112 @@ def compute_covariance_margins_3D(
return sigma_ss


def check_margins_USHD(
    df_margins: pd.DataFrame, var_names: list, draws: str
) -> None:
    """Check whether the margin data frame is valid for the USHD case.

    Verifies that the margins are a pandas data frame with at least two
    rows, that the aggregated-value and draws columns are present and
    contain no missing values, and that each (cause, draw) pair appears
    exactly once (no duplicates, no missing draws).

    Parameters
    ----------
    df_margins : pd.DataFrame
        Margins data (sums over the first variable)
    var_names : list of strings
        Names of the variables over which we rake (e.g. cause)
    draws : string
        Name of the column containing the indices of the draws

    Returns
    -------
    None
    """
    assert isinstance(
        df_margins, pd.DataFrame
    ), "The margins should be a pandas data frame."
    assert (
        len(df_margins) >= 2
    ), "There should be at least 2 data points for the margins."

    columns = df_margins.columns.tolist()
    assert (
        "value_agg_over_race_county" in columns
    ), "The column for the aggregated value over race and county is missing from the margins data frame."
    assert (
        not df_margins["value_agg_over_race_county"].isna().any()
    ), "There are missing values in the value_agg_over_race_county column of the margins."

    assert isinstance(
        draws, str
    ), "The name of the column containing the draws should be a string."
    assert (
        draws in columns
    ), "The column containing the draws is missing from the margins data frame."
    assert (
        not df_margins[draws].isna().any()
    ), "There are missing values in the draws column of the margins."
    assert (
        not df_margins.duplicated(["cause", draws]).any()
    ), "There are duplicated rows in the margins."
    # Every (cause, draw) combination must occur exactly once.
    counts = df_margins[["cause", draws]].value_counts()
    assert (
        counts == 1
    ).all(), "There are missing draws in the margins."


def compute_covariance_margins_USHD(
    df_margins: pd.DataFrame,
    var_names: list,
    draws: str,
    I: int,
    J: int,
    K: int,
) -> np.ndarray:
    """Compute the covariance matrix of the margins for the USHD case.

    The covariance of the cause-specific GBD margins (the all-causes
    margin is excluded) is estimated empirically from the draws and then
    padded with zeros for the remaining, deterministic constraints.

    Parameters
    ----------
    df_margins : pd.DataFrame
        Margins data (sums over the first variable); must contain the
        columns cause, value_agg_over_race_county and the draws column
    var_names : list of strings
        Names of the variables over which we rake (e.g. cause)
    draws : string
        Name of the column containing the indices of the draws
    I : int
        Number of causes of deaths (excluding the all-causes value)
    J : int
        Number of races and ethnicities (excluding the all-races value)
    K : int
        Number of counties

    Returns
    -------
    sigma_ss : np.ndarray
        Square covariance matrix with I + 2 * K + J * K + (I - 1) * K
        rows and columns; only the upper-left I * I block (the GBD
        margins) is non-zero.
    """
    check_margins_USHD(df_margins, var_names, draws)

    nsamples = df_margins[draws].nunique()
    # Keep only the cause-specific margins; select and sort in one shot
    # (avoids an in-place sort on a chained-indexing slice, which can
    # trigger pandas' SettingWithCopyWarning).
    df = df_margins.loc[
        df_margins.cause != "_all",
        ["cause", "value_agg_over_race_county", draws],
    ].sort_values(by=["cause", draws])
    value = df["value_agg_over_race_county"].to_numpy()
    # One row per draw, one column per cause. Column-major fill because
    # the values are sorted by cause first, then by draw. Pass the shape
    # positionally: the `shape` keyword only exists in NumPy >= 2.1
    # (older versions call it `newshape`).
    X = np.reshape(value, (nsamples, -1), order="F")
    # Empirical covariance over the draws (normalized by nsamples).
    Xc = X - np.mean(X, axis=0)
    sigma_cc = np.matmul(np.transpose(Xc), Xc) / nsamples
    # Pad with zero blocks for the deterministic constraints.
    ncons = 2 * K + J * K + (I - 1) * K
    sigma_12 = np.zeros((I, ncons))
    sigma_22 = np.zeros((ncons, ncons))
    sigma_ss = np.concatenate(
        (
            np.concatenate((sigma_cc, sigma_12), axis=1),
            np.concatenate((np.transpose(sigma_12), sigma_22), axis=1),
        ),
        axis=0,
    )
    return sigma_ss


def check_obs_margins_1D(
df_obs: pd.DataFrame, df_margins: pd.DataFrame, draws: str
) -> None:
Expand Down
74 changes: 70 additions & 4 deletions src/raking/run_raking.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
compute_covariance_margins_1D,
compute_covariance_margins_2D,
compute_covariance_margins_3D,
compute_covariance_margins_USHD,
)
from raking.compute_covariance import (
compute_covariance_obs_margins_1D,
Expand Down Expand Up @@ -184,7 +185,15 @@ def run_raking(
sigma_ys,
)
elif dim == "USHD":
pass
(sigma_yy, sigma_ss, sigma_ys) = compute_covariance_USHD(
df_obs,
df_margins,
var_names,
draws,
sigma_yy,
sigma_ss,
sigma_ys,
)
else:
pass
# Check if matrix is definite positive
Expand All @@ -194,9 +203,14 @@ def run_raking(

# Compute the mean (if we have draws)
if cov_mat:
(df_obs, df_margins) = compute_mean(
df_obs, df_margins, var_names, draws
)
if dim in [1, 2, 3]:
(df_obs, df_margins) = compute_mean(
df_obs, df_margins, var_names, draws
)
else:
(df_obs, df_margins) = compute_mean(
df_obs, df_margins, ["race_county"], draws
)

# Get the input variables for the raking
if dim == 1:
Expand Down Expand Up @@ -669,6 +683,58 @@ def compute_covariance_3D(
return (sigma_yy, sigma_ss, sigma_ys)


def compute_covariance_USHD(
    df_obs: pd.DataFrame,
    df_margins: pd.DataFrame,
    var_names: list,
    draws: str,
    sigma_yy: np.ndarray,
    sigma_ss: np.ndarray,
    sigma_ys: np.ndarray,
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Compute the covariance matrix of observations and margins for the USHD case.

    Any covariance matrix that is passed as None is computed from the
    draws; matrices supplied by the caller are returned unchanged.

    Parameters
    ----------
    df_obs : pd.DataFrame
        Observations data
    df_margins : list of pd.DataFrame
        List of data frames containing the margins data
    var_names : list of strings
        Names of the variables over which we rake (cause, race, county)
    draws: string
        Name of the column that contains the samples.
    sigma_yy : np.ndarray
        Covariance matrix of the observations
    sigma_ss : np.ndarray
        Covariance matrix of the margins
    sigma_ys : np.ndarray
        Covariance matrix of the observations and margins

    Returns
    -------
    sigma_yy : np.ndarray
        Covariance matrix of the observations
    sigma_ss : np.ndarray
        Covariance matrix of the margins
    sigma_ys : np.ndarray
        Covariance matrix of the observations and margins
    """
    # Only one margins data frame is used in the USHD case.
    df_margin = df_margins[0]
    if sigma_yy is None:
        sigma_yy = compute_covariance_obs(df_obs, var_names, draws)
    if sigma_ss is None:
        # Exclude the all-causes and all-races values from the counts.
        ncauses = df_obs["cause"].nunique() - 1
        nraces = df_obs["race"].nunique() - 1
        ncounties = df_obs["county"].nunique()
        sigma_ss = compute_covariance_margins_USHD(
            df_margin, ["cause"], draws, ncauses, nraces, ncounties
        )
    if sigma_ys is None:
        # Observations and margins are assumed uncorrelated.
        sigma_ys = np.zeros((sigma_yy.shape[0], sigma_ss.shape[0]))
    return (sigma_yy, sigma_ss, sigma_ys)


def compute_mean(
df_obs: pd.DataFrame, df_margins: list, var_names: list, draws: str
) -> tuple[pd.DataFrame, list]:
Expand Down
51 changes: 51 additions & 0 deletions tests/test_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,3 +244,54 @@ def test_run_raking_3D_draws(example_3D_draws):
assert np.allclose(
sum_over_var3["raked_value"], sum_over_var3["value_agg_over_var3"]
), "The sums over the third variable must match the third margins."


def test_run_raking_USHD_draws(example_USHD_draws):
    """Check that the USHD raking with draws satisfies all the constraints."""
    (df_obs, Dphi_y, Dphi_s, sigma) = run_raking(
        dim="USHD",
        df_obs=example_USHD_draws.df_obs,
        df_margins=[example_USHD_draws.df_margins],
        var_names=None,
        cov_mat=True,
    )
    # Raked cause-specific values must add up to the all-causes value
    # within each race * county cell.
    cause_sums = (
        df_obs[df_obs.cause != "_all"]
        .groupby(["race", "county"])
        .agg({"raked_value": "sum"})
        .reset_index()
    )
    sum_over_cause = cause_sums.merge(
        df_obs[df_obs.cause == "_all"], on=["race", "county"]
    )
    assert np.allclose(
        sum_over_cause["raked_value_x"],
        sum_over_cause["raked_value_y"],
        atol=1.0e-4,
    ), "The sums over the cause must match the all causes deaths."
    # Raked race-specific values must add up to the all-races value
    # within each cause * county cell.
    race_sums = (
        df_obs[df_obs.race != 0]
        .groupby(["cause", "county"])
        .agg({"raked_value": "sum"})
        .reset_index()
    )
    sum_over_race = race_sums.merge(
        df_obs[df_obs.race == 0], on=["cause", "county"]
    )
    assert np.allclose(
        sum_over_race["raked_value_x"],
        sum_over_race["raked_value_y"],
        atol=1.0e-4,
    ), "The sums over the race must match the all races deaths."
    # Raked values summed over race and county must match the GBD
    # margins (averaged over the draws).
    margin_means = (
        example_USHD_draws.df_margins.groupby(["cause"])
        .agg({"value_agg_over_race_county": "mean"})
        .reset_index()
    )
    sum_over_race_county = (
        df_obs[df_obs.race != 0]
        .groupby(["cause"])
        .agg({"raked_value": "sum"})
        .reset_index()
        .merge(margin_means, on=["cause"])
    )
    assert np.allclose(
        sum_over_race_county["raked_value"],
        sum_over_race_county["value_agg_over_race_county"],
        atol=1.0e-5,
    ), "The sums over race and county must match the GBD values."

0 comments on commit 1377a23

Please sign in to comment.