Skip to content

Commit

Permalink
add USHD test with draws
Browse files Browse the repository at this point in the history
  • Loading branch information
ADucellierIHME committed Oct 29, 2024
1 parent e8f1449 commit 1377a23
Show file tree
Hide file tree
Showing 6 changed files with 274 additions and 4 deletions.
Binary file added docs/user_guide/figures/raking_USHD.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
1 change: 1 addition & 0 deletions docs/user_guide/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ User guide
what_is_raking
raking_without_u
raking_with_u
special_cases

This user guide introduces and explains some key concepts of raking.
If you are a new user starting out with raking, we recommend you to start with the
Expand Down
46 changes: 46 additions & 0 deletions docs/user_guide/special_cases.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
Special cases
=======================

We included other versions of the raking with specific constraints that are more complex than the 1D, 2D or 3D cases.

USHD raking
-----------

.. figure:: figures/raking_USHD.png

Raking the deaths count over cause, race and county.

In that case, the only known margins are the GBD values for the number of deaths at the state level for each cause of death.

The observations data frame looks like this:

===== ====== ====== ======
value cause race county
===== ====== ====== ======
float string string string
float string string string
float string string string
float string string string
===== ====== ====== ======

It should include the all causes value (denoted by cause = '_all') for each race (including all races) and county, and the all races value (denoted by race = 0) for each cause (including all causes) and county. You must make sure that all other causes, when sorted in alphabetical order, rank after '_all' (e.g. '_comm', '_inj' and '_ncd' will work), and that all other races, when sorted in ascending order, rank after 0 (e.g. 1, 2, 3, 4 and 7 will work).

The margins data frame looks like this:

===== ==========================
cause value_agg_over_race_county
===== ==========================
_all float
_comm float
_inj float
_ncd float
===== ==========================

It contains the GBD values.

The inputs of the raking function are similar to the 1D, 2D and 3D cases:

* dim: Enter 'USHD'.
* df_obs: Enter the pandas data frame containing the observations. It must be formatted as explained above.
* df_margins: Enter a list containing the margins data frame. It must be formatted as explained above.
* var_names: Enter None.
106 changes: 106 additions & 0 deletions src/raking/compute_covariance.py
Original file line number Diff line number Diff line change
Expand Up @@ -675,6 +675,112 @@ def compute_covariance_margins_3D(
return sigma_ss


def check_margins_USHD(
    df_margins: pd.DataFrame, var_names: list, draws: str
) -> None:
    """Check whether the margin data frame is valid for the USHD case.

    Verifies that the margins are a pandas data frame with at least two
    rows, that the aggregated-value and draws columns are present and
    contain no missing values, and that each (cause, draw) pair appears
    exactly once (no duplicates, no missing draws).

    Parameters
    ----------
    df_margins : pd.DataFrame
        Margins data (sums over the first variable)
    var_names : list of strings
        Names of the variables over which we rake (e.g. cause)
    draws : string
        Name of the column containing the indices of the draws

    Returns
    -------
    None
    """
    assert isinstance(
        df_margins, pd.DataFrame
    ), "The margins should be a pandas data frame."
    assert (
        len(df_margins) >= 2
    ), "There should be at least 2 data points for the margins."

    columns = df_margins.columns.tolist()
    assert (
        "value_agg_over_race_county" in columns
    ), "The column for the aggregated value over race and county is missing from the margins data frame."
    assert (
        not df_margins["value_agg_over_race_county"].isna().any()
    ), "There are missing values in the value_agg_over_race_county column of the margins."

    assert isinstance(
        draws, str
    ), "The name of the column containing the draws should be a string."
    assert (
        draws in columns
    ), "The column containing the draws is missing from the margins data frame."
    assert (
        not df_margins[draws].isna().any()
    ), "There are missing values in the draws column of the margins."
    assert (
        not df_margins.duplicated(["cause", draws]).any()
    ), "There are duplicated rows in the margins."
    # Every (cause, draw) combination must occur exactly once.
    counts = df_margins[["cause", draws]].value_counts()
    assert (
        counts == 1
    ).all(), "There are missing draws in the margins."


def compute_covariance_margins_USHD(
    df_margins: pd.DataFrame,
    var_names: list,
    draws: str,
    I: int,
    J: int,
    K: int,
) -> np.ndarray:
    """Compute the covariance matrix of the margins for the USHD case.

    The covariance of the cause-specific GBD margins (the all-causes
    margin is excluded) is estimated empirically from the draws and then
    padded with zeros for the remaining, deterministic constraints.

    Parameters
    ----------
    df_margins : pd.DataFrame
        Margins data (sums over the first variable); must contain the
        columns cause, value_agg_over_race_county and the draws column
    var_names : list of strings
        Names of the variables over which we rake (e.g. cause)
    draws : string
        Name of the column containing the indices of the draws
    I : int
        Number of causes of deaths (excluding the all-causes value)
    J : int
        Number of races and ethnicities (excluding the all-races value)
    K : int
        Number of counties

    Returns
    -------
    sigma_ss : np.ndarray
        Square covariance matrix with I + 2 * K + J * K + (I - 1) * K
        rows and columns; only the upper-left I * I block (the GBD
        margins) is non-zero.
    """
    check_margins_USHD(df_margins, var_names, draws)

    nsamples = df_margins[draws].nunique()
    # Keep only the cause-specific margins; select and sort in one shot
    # (avoids an in-place sort on a chained-indexing slice, which can
    # trigger pandas' SettingWithCopyWarning).
    df = df_margins.loc[
        df_margins.cause != "_all",
        ["cause", "value_agg_over_race_county", draws],
    ].sort_values(by=["cause", draws])
    value = df["value_agg_over_race_county"].to_numpy()
    # One row per draw, one column per cause. Column-major fill because
    # the values are sorted by cause first, then by draw. Pass the shape
    # positionally: the `shape` keyword only exists in NumPy >= 2.1
    # (older versions call it `newshape`).
    X = np.reshape(value, (nsamples, -1), order="F")
    # Empirical covariance over the draws (normalized by nsamples).
    Xc = X - np.mean(X, axis=0)
    sigma_cc = np.matmul(np.transpose(Xc), Xc) / nsamples
    # Pad with zero blocks for the deterministic constraints.
    ncons = 2 * K + J * K + (I - 1) * K
    sigma_12 = np.zeros((I, ncons))
    sigma_22 = np.zeros((ncons, ncons))
    sigma_ss = np.concatenate(
        (
            np.concatenate((sigma_cc, sigma_12), axis=1),
            np.concatenate((np.transpose(sigma_12), sigma_22), axis=1),
        ),
        axis=0,
    )
    return sigma_ss


def check_obs_margins_1D(
df_obs: pd.DataFrame, df_margins: pd.DataFrame, draws: str
) -> None:
Expand Down
74 changes: 70 additions & 4 deletions src/raking/run_raking.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
compute_covariance_margins_1D,
compute_covariance_margins_2D,
compute_covariance_margins_3D,
compute_covariance_margins_USHD,
)
from raking.compute_covariance import (
compute_covariance_obs_margins_1D,
Expand Down Expand Up @@ -184,7 +185,15 @@ def run_raking(
sigma_ys,
)
elif dim == "USHD":
pass
(sigma_yy, sigma_ss, sigma_ys) = compute_covariance_USHD(
df_obs,
df_margins,
var_names,
draws,
sigma_yy,
sigma_ss,
sigma_ys,
)
else:
pass
# Check if matrix is definite positive
Expand All @@ -194,9 +203,14 @@ def run_raking(

# Compute the mean (if we have draws)
if cov_mat:
(df_obs, df_margins) = compute_mean(
df_obs, df_margins, var_names, draws
)
if dim in [1, 2, 3]:
(df_obs, df_margins) = compute_mean(
df_obs, df_margins, var_names, draws
)
else:
(df_obs, df_margins) = compute_mean(
df_obs, df_margins, ["race_county"], draws
)

# Get the input variables for the raking
if dim == 1:
Expand Down Expand Up @@ -669,6 +683,58 @@ def compute_covariance_3D(
return (sigma_yy, sigma_ss, sigma_ys)


def compute_covariance_USHD(
    df_obs: pd.DataFrame,
    df_margins: pd.DataFrame,
    var_names: list,
    draws: str,
    sigma_yy: np.ndarray,
    sigma_ss: np.ndarray,
    sigma_ys: np.ndarray,
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Compute the covariance matrix of observations and margins for the USHD case.

    Any covariance matrix that is passed as None is computed from the
    draws; matrices supplied by the caller are returned unchanged.

    Parameters
    ----------
    df_obs : pd.DataFrame
        Observations data
    df_margins : list of pd.DataFrame
        List of data frames containing the margins data
    var_names : list of strings
        Names of the variables over which we rake (cause, race, county)
    draws: string
        Name of the column that contains the samples.
    sigma_yy : np.ndarray
        Covariance matrix of the observations
    sigma_ss : np.ndarray
        Covariance matrix of the margins
    sigma_ys : np.ndarray
        Covariance matrix of the observations and margins

    Returns
    -------
    sigma_yy : np.ndarray
        Covariance matrix of the observations
    sigma_ss : np.ndarray
        Covariance matrix of the margins
    sigma_ys : np.ndarray
        Covariance matrix of the observations and margins
    """
    # Only one margins data frame is used in the USHD case.
    df_margin = df_margins[0]
    if sigma_yy is None:
        sigma_yy = compute_covariance_obs(df_obs, var_names, draws)
    if sigma_ss is None:
        # Exclude the all-causes and all-races values from the counts.
        ncauses = df_obs["cause"].nunique() - 1
        nraces = df_obs["race"].nunique() - 1
        ncounties = df_obs["county"].nunique()
        sigma_ss = compute_covariance_margins_USHD(
            df_margin, ["cause"], draws, ncauses, nraces, ncounties
        )
    if sigma_ys is None:
        # Observations and margins are assumed uncorrelated.
        sigma_ys = np.zeros((sigma_yy.shape[0], sigma_ss.shape[0]))
    return (sigma_yy, sigma_ss, sigma_ys)


def compute_mean(
df_obs: pd.DataFrame, df_margins: list, var_names: list, draws: str
) -> tuple[pd.DataFrame, list]:
Expand Down
51 changes: 51 additions & 0 deletions tests/test_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,3 +244,54 @@ def test_run_raking_3D_draws(example_3D_draws):
assert np.allclose(
sum_over_var3["raked_value"], sum_over_var3["value_agg_over_var3"]
), "The sums over the third variable must match the third margins."


def test_run_raking_USHD_draws(example_USHD_draws):
    """Check that the USHD raking with draws satisfies all the constraints."""
    (df_obs, Dphi_y, Dphi_s, sigma) = run_raking(
        dim="USHD",
        df_obs=example_USHD_draws.df_obs,
        df_margins=[example_USHD_draws.df_margins],
        var_names=None,
        cov_mat=True,
    )
    # Raked cause-specific values must add up to the all-causes value
    # within each race * county cell.
    cause_sums = (
        df_obs[df_obs.cause != "_all"]
        .groupby(["race", "county"])
        .agg({"raked_value": "sum"})
        .reset_index()
    )
    sum_over_cause = cause_sums.merge(
        df_obs[df_obs.cause == "_all"], on=["race", "county"]
    )
    assert np.allclose(
        sum_over_cause["raked_value_x"],
        sum_over_cause["raked_value_y"],
        atol=1.0e-4,
    ), "The sums over the cause must match the all causes deaths."
    # Raked race-specific values must add up to the all-races value
    # within each cause * county cell.
    race_sums = (
        df_obs[df_obs.race != 0]
        .groupby(["cause", "county"])
        .agg({"raked_value": "sum"})
        .reset_index()
    )
    sum_over_race = race_sums.merge(
        df_obs[df_obs.race == 0], on=["cause", "county"]
    )
    assert np.allclose(
        sum_over_race["raked_value_x"],
        sum_over_race["raked_value_y"],
        atol=1.0e-4,
    ), "The sums over the race must match the all races deaths."
    # Raked values summed over race and county must match the GBD
    # margins (averaged over the draws).
    margin_means = (
        example_USHD_draws.df_margins.groupby(["cause"])
        .agg({"value_agg_over_race_county": "mean"})
        .reset_index()
    )
    sum_over_race_county = (
        df_obs[df_obs.race != 0]
        .groupby(["cause"])
        .agg({"raked_value": "sum"})
        .reset_index()
        .merge(margin_means, on=["cause"])
    )
    assert np.allclose(
        sum_over_race_county["raked_value"],
        sum_over_race_county["value_agg_over_race_county"],
        atol=1.0e-5,
    ), "The sums over race and county must match the GBD values."

0 comments on commit 1377a23

Please sign in to comment.