Skip to content

Commit

Permalink
add USHD example without draws
Browse files Browse the repository at this point in the history
  • Loading branch information
ADucellierIHME committed Oct 29, 2024
1 parent c0a1a69 commit e8f1449
Show file tree
Hide file tree
Showing 6 changed files with 96 additions and 58 deletions.
72 changes: 33 additions & 39 deletions src/raking/formatting_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -881,19 +881,19 @@ def format_data_USHD(
len(df_margins) >= 3
), "There should be at least 3 data points for the margins."

for var_name in ['value', 'cause', 'race' 'county']:
for var_name in ["value", "cause", "race", "county"]:
assert var_name in df_obs.columns.tolist(), (
"The column for the categorical variable "
+ var_name
+ " is missing from the observations data frame."
)

assert 'cause' in df_margins.columns.tolist(), (
"The cause column is missing from the margins data frame."
)
assert "value_agg_over_race_county" in df_margins.columns.tolist(), (
"The column for the aggregated value over races and counties is missing from the margins data frame."
)
assert (
"cause" in df_margins.columns.tolist()
), "The cause column is missing from the margins data frame."
assert (
"value_agg_over_race_county" in df_margins.columns.tolist()
), "The column for the aggregated value over races and counties is missing from the margins data frame."

if weights is not None:
assert isinstance(
Expand All @@ -918,53 +918,48 @@ def format_data_USHD(
), "The column containing the upper_boundaries is missing from the data frame."

# Check the observations data
for var_name in ['value', 'cause', 'race' 'county']:
for var_name in ["value", "cause", "race", "county"]:
assert df_obs[var_name].isna().sum() == 0, (
"There are missing values in the "
+ var_name
+ " column of the observations."
)
assert (
len(df_obs[df_obs.duplicated(['cause', 'race' 'county'])]) == 0
len(df_obs[df_obs.duplicated(["cause", "race", "county"])]) == 0
), "There are duplicated rows in the observations."
count_obs = df_obs[['cause', 'race' 'county']].value_counts()
assert (len(count_obs.unique()) == 1) and (count_obs.unique()[0] == 1), (
"There are missing combinations of cause, race and county in the observations."
)
count_obs = df_obs[["cause", "race", "county"]].value_counts()
assert (
(len(count_obs.unique()) == 1) and (count_obs.unique()[0] == 1)
), "There are missing combinations of cause, race and county in the observations."

# Check the margins data
assert df_margins['cause'].isna().sum() == 0, (
"There are missing values in the cause column of the margins."
)
assert df_margins["value_agg_over_race_county"].isna().sum() == 0, (
"There are missing values in the value_agg_over_race_county column of the margins."
)
assert (
len(df_margins[df_margins.duplicated(['cause'])]) == 0
df_margins["cause"].isna().sum() == 0
), "There are missing values in the cause column of the margins."
assert (
df_margins["value_agg_over_race_county"].isna().sum() == 0
), "There are missing values in the value_agg_over_race_county column of the margins."
assert (
len(df_margins[df_margins.duplicated(["cause"])]) == 0
), "There are duplicated rows in the margins data frame."

# Check consistency between observations and margins
assert len(df_obs['cause'].unique()) == len(
df_margins['cause'].unique()
), (
"The number of categories for cause should be the same in the observations and margins data frames."
)
assert set(df_obs['cause'].unique().tolist()) == set(
df_margins['cause'].unique().tolist()
), (
"The names of the categories for cause should be the same in the observations and margins data frames."
)
assert (
len(df_obs["cause"].unique()) == len(df_margins["cause"].unique())
), "The number of categories for cause should be the same in the observations and margins data frames."
assert (
set(df_obs["cause"].unique().tolist())
== set(df_margins["cause"].unique().tolist())
), "The names of the categories for cause should be the same in the observations and margins data frames."

# Create input variables for the raking functions
df_obs.sort_values(
by=['county', 'race', 'cause'], inplace=True
)
df_margins.sort_values(by=['cause'], inplace=True)
I = len(df_obs['cause'].unique()) - 1
J = len(df_obs['race'].unique()) - 1
K = len(df_obs['county'].unique())
df_obs.sort_values(by=["county", "race", "cause"], inplace=True)
df_margins.sort_values(by=["cause"], inplace=True)
I = len(df_obs["cause"].unique()) - 1
J = len(df_obs["race"].unique()) - 1
K = len(df_obs["county"].unique())
y = df_obs.value.to_numpy()
s = df_margins["value_agg_over_race-county"].to_numpy()
s = df_margins["value_agg_over_race_county"].to_numpy()
if weights is not None:
q = df_obs[weights].to_numpy()
else:
Expand All @@ -978,4 +973,3 @@ def format_data_USHD(
else:
h = None
return (y, s, I, J, K, q, l, h)

14 changes: 9 additions & 5 deletions src/raking/run_raking.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,13 +109,14 @@ def run_raking(
df_obs : pd.DataFrame
The initial observations data frame with an additional column for the raked values
"""
assert isinstance(dim, int) or isinstance(dim, str), \
"The dimension of the raking problem must be an integer or string."
assert isinstance(dim, int) or isinstance(
dim, str
), "The dimension of the raking problem must be an integer or string."
assert dim in [
1,
2,
3,
"USHD"
"USHD",
], "The dimension of the raking problem must be 1, 2, 3 or USHD."
assert isinstance(
cov_mat, bool
Expand All @@ -127,6 +128,8 @@ def run_raking(
assert (
dim == len(var_names)
), "The number of variables over which we rake must be equal to the dimension of the problem."
else:
var_names = ["cause", "race", "county"]
assert isinstance(
df_margins, list
), "The margins data frames must be entered as a list."
Expand All @@ -135,8 +138,9 @@ def run_raking(
dim == len(df_margins)
), "The number of margins data frames must be equal to the dimension of the problem."
else:
assert len(df_margins) == 1, \
"There should be only one margins data frame in the list."
assert (
len(df_margins) == 1
), "There should be only one margins data frame in the list."
assert isinstance(
method, str
), "The name of the distance function used for the raking must be a string."
Expand Down
7 changes: 3 additions & 4 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,10 +78,10 @@ def __init__(self):

class ExampleUSHD:
def __init__(self):
self.df_obs = pd.read_csv(EXAMPLES / "example_USHD" / "observations.csv")
self.df_margins = pd.read_csv(
EXAMPLES / "example_USHD" / "margins.csv"
self.df_obs = pd.read_csv(
EXAMPLES / "example_USHD" / "observations.csv"
)
self.df_margins = pd.read_csv(EXAMPLES / "example_USHD" / "margins.csv")


class ExampleUSHD_draws:
Expand Down Expand Up @@ -132,4 +132,3 @@ def example_USHD():
@pytest.fixture
def example_USHD_draws():
return ExampleUSHD_draws()

6 changes: 1 addition & 5 deletions tests/test_formatting.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,14 +153,10 @@ def test_format_data_USHD():
{"cause": cause, "value_agg_over_race_county": s_cause}
)
# Get the formatted data
(y, s, I, J, K, q, l, h) = format_data_USHD(
df_obs,
df_margins
)
(y, s, I, J, K, q, l, h) = format_data_USHD(df_obs, df_margins)
# Generate the constraints
(A, s) = constraints_USHD(s, I, J, K)
# Verify that the constraint A beta = s is respected
assert np.allclose(
np.matmul(A, y), s
), "For the format_data_USHD function, the constraint A y = s is not respected."

9 changes: 4 additions & 5 deletions tests/test_raking.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ def test_chi2_raking_USHD():
(beta_star, lambda_star) = raking_chi2(y, A, s)
# Verify that the constraint A beta_star = s is respected
assert np.allclose(
np.matmul(A, beta_star), s
np.matmul(A, beta_star), s, atol=1.0e-5
), "For the USHD raking with the chi2 distance, the constraint A beta_star = s is not respected."


Expand Down Expand Up @@ -196,7 +196,7 @@ def test_entropic_raking_USHD():
(beta_star, lambda_star, iter_eps) = raking_entropic(y, A, s)
# Verify that the constraint A beta_star = s is respected
assert np.allclose(
np.matmul(A, beta_star), s
np.matmul(A, beta_star), s, atol=1.0e-6
), "For the USHD raking with the entropic distance, the constraint A beta_star = s is not respected."


Expand Down Expand Up @@ -289,7 +289,7 @@ def test_general_raking_USHD():
(beta_star, lambda_star, iter_eps) = raking_general(y, A, s, -2.0)
# Verify that the constraint A beta_star = s is respected
assert np.allclose(
np.matmul(A, beta_star), s
np.matmul(A, beta_star), s, atol=1.0e-6
), "For the USHD raking with the general distance, the constraint A beta_star = s is not respected."


Expand Down Expand Up @@ -414,7 +414,7 @@ def test_logit_raking_USHD():
(beta_star, lambda_star, iter_eps) = raking_logit(y, A, s, l, h)
# Verify that the constraint A beta_star = s is respected
assert np.allclose(
np.matmul(A, beta_star), s
np.matmul(A, beta_star), s, atol=1.0e-6
), "For the USHD raking with the logit distance, the constraint A beta_star = s is not respected."
# Verify that the lower bound is respected
assert np.all(
Expand All @@ -424,4 +424,3 @@ def test_logit_raking_USHD():
assert np.all(
h - beta_star > -1.0e-5
), "For the USHD raking with the logit distance, some raked values are higher than the upper bound."

46 changes: 46 additions & 0 deletions tests/test_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,52 @@ def test_run_raking_3D(example_3D):
), "The sums over the third variable must match the third margins."


def test_run_raking_USHD(example_USHD):
    """Check that the USHD raking (no draws, no covariance) satisfies all
    three aggregation constraints on the example data."""
    # Rake the example observations against the single margins frame;
    # var_names=None lets run_raking fall back to its USHD defaults.
    (df_obs, Dphi_y, Dphi_s, sigma) = run_raking(
        dim="USHD",
        df_obs=example_USHD.df_obs,
        df_margins=[example_USHD.df_margins],
        var_names=None,
        cov_mat=False,
    )
    # Constraint 1: for each (race, county), the raked values summed over
    # the individual causes must equal the raked all-causes ("_all") row.
    # After the merge, raked_value_x is the sum over causes and
    # raked_value_y is the "_all" row's raked value.
    sum_over_cause = (
        df_obs.loc[df_obs.cause != "_all"]
        .groupby(["race", "county"])
        .agg({"raked_value": "sum"})
        .reset_index()
        .merge(df_obs.loc[df_obs.cause == "_all"], on=["race", "county"])
    )
    assert np.allclose(
        sum_over_cause["raked_value_x"],
        sum_over_cause["raked_value_y"],
        atol=1.0e-4,
    ), "The sums over the cause must match the all causes deaths."
    # Constraint 2: for each (cause, county), the raked values summed over
    # the individual races must equal the raked race == 0 row
    # (race 0 presumably encodes "all races" — confirm against the data).
    sum_over_race = (
        df_obs.loc[df_obs.race != 0]
        .groupby(["cause", "county"])
        .agg({"raked_value": "sum"})
        .reset_index()
        .merge(df_obs.loc[df_obs.race == 0], on=["cause", "county"])
    )
    assert np.allclose(
        sum_over_race["raked_value_x"],
        sum_over_race["raked_value_y"],
        atol=1.0e-4,
    ), "The sums over the race must match the all races deaths."
    # Constraint 3: for each cause, summing the raked values over race and
    # county (excluding the race == 0 aggregate rows) must reproduce the
    # margin column value_agg_over_race_county. Tighter tolerance here
    # because the margins are the raking targets themselves.
    sum_over_race_county = (
        df_obs.loc[df_obs.race != 0]
        .groupby(["cause"])
        .agg({"raked_value": "sum"})
        .reset_index()
        .merge(example_USHD.df_margins, on=["cause"])
    )
    assert np.allclose(
        sum_over_race_county["raked_value"],
        sum_over_race_county["value_agg_over_race_county"],
        atol=1.0e-5,
    ), "The sums over race and county must match the GBD values."


def test_run_raking_1D_draws(example_1D_draws):
(df_obs, Dphi_y, Dphi_s, sigma) = run_raking(
dim=1,
Expand Down

0 comments on commit e8f1449

Please sign in to comment.