Skip to content

Commit

Permalink
add USHD example without draws
Browse files Browse the repository at this point in the history
  • Loading branch information
ADucellierIHME committed Oct 29, 2024
1 parent c0a1a69 commit e8f1449
Show file tree
Hide file tree
Showing 6 changed files with 96 additions and 58 deletions.
72 changes: 33 additions & 39 deletions src/raking/formatting_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -881,19 +881,19 @@ def format_data_USHD(
len(df_margins) >= 3
), "There should be at least 3 data points for the margins."

for var_name in ['value', 'cause', 'race' 'county']:
for var_name in ["value", "cause", "race", "county"]:
assert var_name in df_obs.columns.tolist(), (
"The column for the categorical variable "
+ var_name
+ " is missing from the observations data frame."
)

assert 'cause' in df_margins.columns.tolist(), (
"The cause column is missing from the margins data frame."
)
assert "value_agg_over_race_county" in df_margins.columns.tolist(), (
"The column for the aggregated value over races and counties is missing from the margins data frame."
)
assert (
"cause" in df_margins.columns.tolist()
), "The cause column is missing from the margins data frame."
assert (
"value_agg_over_race_county" in df_margins.columns.tolist()
), "The column for the aggregated value over races and counties is missing from the margins data frame."

if weights is not None:
assert isinstance(
Expand All @@ -918,53 +918,48 @@ def format_data_USHD(
), "The column containing the upper_boundaries is missing from the data frame."

# Check the observations data
for var_name in ['value', 'cause', 'race' 'county']:
for var_name in ["value", "cause", "race", "county"]:
assert df_obs[var_name].isna().sum() == 0, (
"There are missing values in the "
+ var_name
+ " column of the observations."
)
assert (
len(df_obs[df_obs.duplicated(['cause', 'race' 'county'])]) == 0
len(df_obs[df_obs.duplicated(["cause", "race", "county"])]) == 0
), "There are duplicated rows in the observations."
count_obs = df_obs[['cause', 'race' 'county']].value_counts()
assert (len(count_obs.unique()) == 1) and (count_obs.unique()[0] == 1), (
"There are missing combinations of cause, race and county in the observations."
)
count_obs = df_obs[["cause", "race", "county"]].value_counts()
assert (
(len(count_obs.unique()) == 1) and (count_obs.unique()[0] == 1)
), "There are missing combinations of cause, race and county in the observations."

# Check the margins data
assert df_margins['cause'].isna().sum() == 0, (
"There are missing values in the cause column of the margins."
)
assert df_margins["value_agg_over_race_county"].isna().sum() == 0, (
"There are missing values in the value_agg_over_race_county column of the margins."
)
assert (
len(df_margins[df_margins.duplicated(['cause'])]) == 0
df_margins["cause"].isna().sum() == 0
), "There are missing values in the cause column of the margins."
assert (
df_margins["value_agg_over_race_county"].isna().sum() == 0
), "There are missing values in the value_agg_over_race_county column of the margins."
assert (
len(df_margins[df_margins.duplicated(["cause"])]) == 0
), "There are duplicated rows in the margins data frame."

# Check consistency between observations and margins
assert len(df_obs['cause'].unique()) == len(
df_margins['cause'].unique()
), (
"The number of categories for cause should be the same in the observations and margins data frames."
)
assert set(df_obs['cause'].unique().tolist()) == set(
df_margins['cause'].unique().tolist()
), (
"The names of the categories for cause should be the same in the observations and margins data frames."
)
assert (
len(df_obs["cause"].unique()) == len(df_margins["cause"].unique())
), "The number of categories for cause should be the same in the observations and margins data frames."
assert (
set(df_obs["cause"].unique().tolist())
== set(df_margins["cause"].unique().tolist())
), "The names of the categories for cause should be the same in the observations and margins data frames."

# Create input variables for the raking functions
df_obs.sort_values(
by=['county', 'race', 'cause'], inplace=True
)
df_margins.sort_values(by=['cause'], inplace=True)
I = len(df_obs['cause'].unique()) - 1
J = len(df_obs['race'].unique()) - 1
K = len(df_obs['county'].unique())
df_obs.sort_values(by=["county", "race", "cause"], inplace=True)
df_margins.sort_values(by=["cause"], inplace=True)
I = len(df_obs["cause"].unique()) - 1
J = len(df_obs["race"].unique()) - 1
K = len(df_obs["county"].unique())
y = df_obs.value.to_numpy()
s = df_margins["value_agg_over_race-county"].to_numpy()
s = df_margins["value_agg_over_race_county"].to_numpy()
if weights is not None:
q = df_obs[weights].to_numpy()
else:
Expand All @@ -978,4 +973,3 @@ def format_data_USHD(
else:
h = None
return (y, s, I, J, K, q, l, h)

14 changes: 9 additions & 5 deletions src/raking/run_raking.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,13 +109,14 @@ def run_raking(
df_obs : pd.DataFrame
The initial observations data frame with an additional column for the raked values
"""
assert isinstance(dim, int) or isinstance(dim, str), \
"The dimension of the raking problem must be an integer or string."
assert isinstance(dim, int) or isinstance(
dim, str
), "The dimension of the raking problem must be an integer or string."
assert dim in [
1,
2,
3,
"USHD"
"USHD",
], "The dimension of the raking problem must be 1, 2, 3 or USHD."
assert isinstance(
cov_mat, bool
Expand All @@ -127,6 +128,8 @@ def run_raking(
assert (
dim == len(var_names)
), "The number of variables over which we rake must be equal to the dimension of the problem."
else:
var_names = ["cause", "race", "county"]
assert isinstance(
df_margins, list
), "The margins data frames must be entered as a list."
Expand All @@ -135,8 +138,9 @@ def run_raking(
dim == len(df_margins)
), "The number of margins data frames must be equal to the dimension of the problem."
else:
assert len(df_margins) == 1, \
"There should be only one margins data frame in the list."
assert (
len(df_margins) == 1
), "There should be only one margins data frame in the list."
assert isinstance(
method, str
), "The name of the distance function used for the raking must be a string."
Expand Down
7 changes: 3 additions & 4 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,10 +78,10 @@ def __init__(self):

class ExampleUSHD:
def __init__(self):
self.df_obs = pd.read_csv(EXAMPLES / "example_USHD" / "observations.csv")
self.df_margins = pd.read_csv(
EXAMPLES / "example_USHD" / "margins.csv"
self.df_obs = pd.read_csv(
EXAMPLES / "example_USHD" / "observations.csv"
)
self.df_margins = pd.read_csv(EXAMPLES / "example_USHD" / "margins.csv")


class ExampleUSHD_draws:
Expand Down Expand Up @@ -132,4 +132,3 @@ def example_USHD():
@pytest.fixture
def example_USHD_draws():
return ExampleUSHD_draws()

6 changes: 1 addition & 5 deletions tests/test_formatting.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,14 +153,10 @@ def test_format_data_USHD():
{"cause": cause, "value_agg_over_race_county": s_cause}
)
# Get the formatted data
(y, s, I, J, K, q, l, h) = format_data_USHD(
df_obs,
df_margins
)
(y, s, I, J, K, q, l, h) = format_data_USHD(df_obs, df_margins)
# Generate the constraints
(A, s) = constraints_USHD(s, I, J, K)
# Verify that the constraint A beta = s is respected
assert np.allclose(
np.matmul(A, y), s
), "For the format_data_USHD function, the constraint A y = s is not respected."

9 changes: 4 additions & 5 deletions tests/test_raking.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ def test_chi2_raking_USHD():
(beta_star, lambda_star) = raking_chi2(y, A, s)
# Verify that the constraint A beta_star = s is respected
assert np.allclose(
np.matmul(A, beta_star), s
np.matmul(A, beta_star), s, atol=1.0e-5
), "For the USHD raking with the chi2 distance, the constraint A beta_star = s is not respected."


Expand Down Expand Up @@ -196,7 +196,7 @@ def test_entropic_raking_USHD():
(beta_star, lambda_star, iter_eps) = raking_entropic(y, A, s)
# Verify that the constraint A beta_star = s is respected
assert np.allclose(
np.matmul(A, beta_star), s
np.matmul(A, beta_star), s, atol=1.0e-6
), "For the USHD raking with the entropic distance, the constraint A beta_star = s is not respected."


Expand Down Expand Up @@ -289,7 +289,7 @@ def test_general_raking_USHD():
(beta_star, lambda_star, iter_eps) = raking_general(y, A, s, -2.0)
# Verify that the constraint A beta_star = s is respected
assert np.allclose(
np.matmul(A, beta_star), s
np.matmul(A, beta_star), s, atol=1.0e-6
), "For the USHD raking with the general distance, the constraint A beta_star = s is not respected."


Expand Down Expand Up @@ -414,7 +414,7 @@ def test_logit_raking_USHD():
(beta_star, lambda_star, iter_eps) = raking_logit(y, A, s, l, h)
# Verify that the constraint A beta_star = s is respected
assert np.allclose(
np.matmul(A, beta_star), s
np.matmul(A, beta_star), s, atol=1.0e-6
), "For the USHD raking with the logit distance, the constraint A beta_star = s is not respected."
# Verify that the lower bound is respected
assert np.all(
Expand All @@ -424,4 +424,3 @@ def test_logit_raking_USHD():
assert np.all(
h - beta_star > -1.0e-5
), "For the USHD raking with the logit distance, some raked values are higher than the upper bound."

46 changes: 46 additions & 0 deletions tests/test_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,52 @@ def test_run_raking_3D(example_3D):
), "The sums over the third variable must match the third margins."


def test_run_raking_USHD(example_USHD):
    """Check that the USHD raking (no draws, no covariance) satisfies all
    three aggregation constraints on the example data."""
    # Rake the example observations against the single margins frame;
    # var_names=None lets run_raking fall back to its USHD defaults.
    (df_obs, Dphi_y, Dphi_s, sigma) = run_raking(
        dim="USHD",
        df_obs=example_USHD.df_obs,
        df_margins=[example_USHD.df_margins],
        var_names=None,
        cov_mat=False,
    )
    # Constraint 1: for each (race, county), the raked values summed over
    # the individual causes must equal the raked all-causes ("_all") row.
    # After the merge, raked_value_x is the sum over causes and
    # raked_value_y is the "_all" row's raked value.
    sum_over_cause = (
        df_obs.loc[df_obs.cause != "_all"]
        .groupby(["race", "county"])
        .agg({"raked_value": "sum"})
        .reset_index()
        .merge(df_obs.loc[df_obs.cause == "_all"], on=["race", "county"])
    )
    assert np.allclose(
        sum_over_cause["raked_value_x"],
        sum_over_cause["raked_value_y"],
        atol=1.0e-4,
    ), "The sums over the cause must match the all causes deaths."
    # Constraint 2: for each (cause, county), the raked values summed over
    # the individual races must equal the raked race == 0 row
    # (race 0 presumably encodes "all races" — confirm against the data).
    sum_over_race = (
        df_obs.loc[df_obs.race != 0]
        .groupby(["cause", "county"])
        .agg({"raked_value": "sum"})
        .reset_index()
        .merge(df_obs.loc[df_obs.race == 0], on=["cause", "county"])
    )
    assert np.allclose(
        sum_over_race["raked_value_x"],
        sum_over_race["raked_value_y"],
        atol=1.0e-4,
    ), "The sums over the race must match the all races deaths."
    # Constraint 3: for each cause, summing the raked values over race and
    # county (excluding the race == 0 aggregate rows) must reproduce the
    # margin column value_agg_over_race_county. Tighter tolerance here
    # because the margins are the raking targets themselves.
    sum_over_race_county = (
        df_obs.loc[df_obs.race != 0]
        .groupby(["cause"])
        .agg({"raked_value": "sum"})
        .reset_index()
        .merge(example_USHD.df_margins, on=["cause"])
    )
    assert np.allclose(
        sum_over_race_county["raked_value"],
        sum_over_race_county["value_agg_over_race_county"],
        atol=1.0e-5,
    ), "The sums over race and county must match the GBD values."


def test_run_raking_1D_draws(example_1D_draws):
(df_obs, Dphi_y, Dphi_s, sigma) = run_raking(
dim=1,
Expand Down

0 comments on commit e8f1449

Please sign in to comment.