finish synthetics with draws

ihmeuw-msca · Oct 14, 2024 · afdbd44 · afdbd44
1 parent 192d160
commit afdbd44
Show file tree

Hide file tree

Showing 11 changed files with 134,276 additions and 8 deletions.
diff --git a/src/raking/compute_covariance.py b/src/raking/compute_covariance.py
@@ -0,0 +1,167 @@
+
+import numpy as np
+import pandas as pd
+
+pd.options.mode.chained_assignment = None
+
+def compute_covariance_obs(df_obs, var_names, draws):
+
+    nsamples = len(df_obs[draws].unique())
+    var_names.reverse()
+    df = df_obs[['value'] + var_names + [draws]]
+    df.sort_values(by=var_names + [draws], inplace=True)
+    value = df['value'].to_numpy()
+    X = np.reshape(value, shape=(nsamples, -1), order='F')
+    Xmean = np.mean(X, axis=0)
+    Xc = X - Xmean
+    sigma_yy = np.matmul(np.transpose(Xc), Xc) / nsamples
+    return sigma_yy
+
+def compute_covariance_margins_1D(df_margins, var_names, draws):
+
+    nsamples = len(df_margins[draws].unique())
+    df = df_margins[['value_agg_over_' + var_names[0]] + [draws]]
+    df.sort_values(by=[draws], inplace=True)
+    value = df['value_agg_over_' + var_names[0]].to_numpy()
+    X = np.reshape(value, shape=(nsamples, -1), order='F')
+    Xmean = np.mean(X, axis=0)
+    Xc = X - Xmean
+    sigma_ss = np.matmul(np.transpose(Xc), Xc) / nsamples
+    return sigma_ss
+
+def compute_covariance_margins_2D(df_margins_1, df_margins_2, var_names, draws):
+
+    nsamples = len(df_margins_1[draws].unique())
+    df1 = df_margins_1[[var_names[1], 'value_agg_over_' + var_names[0], draws]]
+    df1.sort_values(by=[var_names[1], draws], inplace=True)
+    df2 = df_margins_2[[var_names[0], 'value_agg_over_' + var_names[1], draws]]
+    df2.sort_values(by=[var_names[0], draws], inplace=True)
+    value1 = df1['value_agg_over_' + var_names[0]].to_numpy()
+    value2 = df2['value_agg_over_' + var_names[1]].to_numpy()
+    value = np.concatenate((value1, value2))
+    X = np.reshape(value, shape=(nsamples, -1), order='F')
+    X = X[:, 0:-1]
+    Xmean = np.mean(X, axis=0)
+    Xc = X - Xmean
+    sigma_ss = np.matmul(np.transpose(Xc), Xc) / nsamples
+    return sigma_ss
+
+def compute_covariance_margins_3D(df_margins_1, df_margins_2, df_margins_3, var_names, draws):
+
+    nsamples = len(df_margins_1[draws].unique())
+    var1 = df_margins_2[var_names[0]].unique().tolist()
+    var2 = df_margins_1[var_names[1]].unique().tolist()
+    var3 = df_margins_1[var_names[2]].unique().tolist()
+    var1.sort()
+    var2.sort()
+    var3.sort()
+    df1 = df_margins_1[[var_names[1], var_names[2], 'value_agg_over_' + var_names[0], draws]]
+    df1 = df1.loc[(df1[var_names[1]].isin(var2[0:-1]) | ((df1[var_names[1]]==var2[-1]) & (df1[var_names[2]]==var3[-1]))]
+    df1.sort_values(by=[var_names[2], var_names[1], draws], inplace=True)
+    df2 = df_margins_2[[var_names[0], var_names[2], 'value_agg_over_' + var_names[1], draws]]
+    df2 = df2.loc[df2[var_names[2]].isin(var3[0:-1])]
+    df2.sort_values(by=[var_names[0], var_names[2], draws, inplace=True)
+    df3 = df_margins_3[[var_names[0], var_names[1], 'value_agg_over_' + var_names[2], draws]]
+    df3 = df3.loc[df3[var_names[0]].isin(var1[0:-1])]
+    df3.sort_values(by=[var_names[1], var_names[0], draws, inplace=True)
+    value1 = df1['value_agg_over_' + var_names[0]].to_numpy()
+    value2 = df2['value_agg_over_' + var_names[1]].to_numpy()
+    value3 = df3['value_agg_over_' + var_names[2]].to_numpy()
+    value = np.concatenate((value1, value2, value3))
+    X = np.reshape(value, shape=(nsamples, -1), order='F')
+    Xmean = np.mean(X, axis=0)
+    Xc = X - Xmean
+    sigma_ss = np.matmul(np.transpose(Xc), Xc) / nsamples
+    return sigma_ss
+
+def compute_covariance_obs_margins_1D(df_obs, df_margins, var_names, draws):
+
+    nsamples = len(df_obs[draws].unique())
+    var_names.reverse()
+    df_obs = df_obs[['value'] + var_names + [draws]]
+    df_obs.sort_values(by=var_names + [draws], inplace=True)
+    df_margins = df_margins[['value_agg_over_' + var_names[0]] + [draws]]
+    df_margins.sort_values(by=[draws], inplace=True)
+    value_obs = df_obs['value'].to_numpy()
+    X = np.reshape(value_obs, shape=(nsamples, -1), order='F')
+    value_margins = df_margins['value_agg_over_' + var_names[0]].to_numpy()
+    Y = np.reshape(value_margins, shape=(nsamples, -1), order='F')
+    Xmean = np.mean(X, axis=0)
+    Ymean = np.mean(Y, axis=0)
+    Xc = X - Xmean
+    Yc = Y - Ymean
+    sigma_ys = np.matmul(np.transpose(Xc), Yc) / nsamples
+    return sigma_ys
+
+def compute_covariance_obs_margins_2D(df_obs, df_margins_1, df_margins_2, var_names, draws):
+
+    nsamples = len(df_obs[draws].unique())
+    var_names.reverse()
+    df_obs = df_obs[var_names + [draws]]
+    df_obs.sort_values(by=var_names + [draws], inplace=True)
+    df_margins_1 = df_margins_1[[var_names[1], 'value_agg_over_' + var_names[0], draws]]
+    df_margins_1.sort_values(by=[var_names[1], draws], inplace=True)
+    df_margins_2 = df_margins_2[[var_names[0], 'value_agg_over_' + var_names[1], draws]]
+    df_margins_2.sort_values(by=[var_names[0], draws], inplace=True)
+    value_obs = df_obs['value'].to_numpy()
+    X = np.reshape(value_obs, shape=(nsamples, -1), order='F')
+    value_margins_1 = df_margins_1['value_agg_over_' + var_names[0]].to_numpy()
+    value_margins_2 = df_margins_2['value_agg_over_' + var_names[1]].to_numpy()
+    value_margins = np.concatenate((value_margins_1, value_margins_2))
+    Y = np.reshape(value_margins, shape=(nsamples, -1), order='F')
+    Xmean = np.mean(X, axis=0)
+    Ymean = np.mean(Y, axis=0)
+    Xc = X - Xmean
+    Yc = Y - Ymean
+    sigma_ys = np.matmul(np.transpose(Xc), Yc) / nsamples
+    return sigma_ys
+
+def compute_covariance_obs_margins_3D(df_obs, df_margins_1, df_margins_2, df_margins_3, var_names, draws):
+
+    nsamples = len(df_obs[draws].unique())
+    var_names.reverse()
+    df_obs = df_obs[var_names + [draws]]
+    df_obs.sort_values(by=var_names + [draws], inplace=True)
+    var1 = df_margins_2[var_names[0]].unique().tolist()
+    var2 = df_margins_1[var_names[1]].unique().tolist()
+    var3 = df_margins_1[var_names[2]].unique().tolist()
+    var1.sort()
+    var2.sort()
+    var3.sort()
+    df_margins_1 = df_margins_1[[var_names[1], var_names[2], 'value_agg_over_' + var_names[0], draws]]
+    df_margins_1 = df_margins_1.loc[(df_margins_1[var_names[1]].isin(var2[0:-1]) | ((df_margins_1[var_names[1]]==var2[-1]) & (df_margins_1[var_names[2]]==var3[-1]))]
+    df_margins_1.sort_values(by=[var_names[2], var_names[1], draws], inplace=True)
+    df_margins_2 = df_margins_2[[var_names[0], var_names[2], 'value_agg_over_' + var_names[1], draws]]
+    df_margins_2 = df_margins_2.loc[df_margins_2[var_names[2]].isin(var3[0:-1])]
+    df_margins_2.sort_values(by=[var_names[0], var_names[2], draws, inplace=True)
+    df_margins_3 = df_margins_3[[var_names[0], var_names[1], 'value_agg_over_' + var_names[2], draws]]
+    df_margins_3 = df_margins_3.loc[df_margins_3[var_names[0]].isin(var1[0:-1])]
+    df_margins_3.sort_values(by=[var_names[1], var_names[0], draws, inplace=True)
+    value_obs = df_obs['value'].to_numpy()
+    value_margins_1 = df_margins_1['value_agg_over_' + var_names[0]].to_numpy()
+    value_margins_2 = df_margins_2['value_agg_over_' + var_names[1]].to_numpy()
+    value_margins_3 = df_margins_3['value_agg_over_' + var_names[2]].to_numpy()
+    value = np.concatenate((value_obs, value_margins_1, df_margins_2, df_margins_3))
+    X = np.reshape(value, shape=(nsamples, -1), order='F')
+    Xmean = np.mean(X, axis=0)
+    Xc = X - Xmean
+    sigma_ys = np.matmul(np.transpose(Xc), Xc) / nsamples
+    return sigma_ys
+
+def check_covariance(sigma_yy, sigma_ss, sigma_ys):
+    """
+    """
+    sigma = np.concatenate(( \
+        np.concatenate((sigma_yy, sigma_ys), axis=1), \
+        np.concatenate((np.transpose(sigma_ys), sigma_ss), axis=1)), axis=0)
+    valid = True
+    if np.allclose(np.transpose(sigma), sigma, rtol, atol):
+        valid = False
+    if np.any(np.linalg.eig(sigma)[0] < 0.0):
+        valid = False
+    if not valid:
+        sigma_yy = np.diag(np.diag(sigma_yy))
+        sigma_ss = np.diag(np.diag(sigma_ss))
+        sigma_ys = np.zeros(sigma_ys.shape)
+    return sigma_yy, sigma_ss, sigma_ys
+
diff --git a/src/raking/run_raking.py b/src/raking/run_raking.py
@@ -4,15 +4,23 @@
 import pandas as pd
 
 from raking.compute_constraints import constraints_1D, constraints_2D, constraints_3D
+from raking.compute_covariance import compute_covariance_obs
+from raking.compute_covariance import compute_covariance_margins_1D, compute_covariance_margins_2D, compute_covariance_margins_3D
+from raking.compute_covariance import compute_covariance_obs_margins_1D, compute_covariance_obs_margins_2D, compute_covariance_obs_margins_3D
 from raking.formatting_methods import format_data_1D, format_data_2D, format_data_3D
 from raking.raking_methods import raking_chi2, raking_entropic, raking_general, raking_logit
+from raking.uncertainty_methods import compute_covariance, compute_gradient
 
 def run_raking(
     dim: int,
-    directory_name: str,
     df_obs: pd.DataFrame,
     df_margins: list,
     var_names: list,
+    draws: str = 'draws',
+    cov_mat: bool = True,
+    sigma_yy: np.ndarray = None,
+    sigma_ss: np.ndarray = None,
+    sigma_ys: np.ndarray = None,
     method: str = 'chi2',
     alpha: float = 1,
     weights: str = None,
@@ -22,22 +30,33 @@ def run_raking(
     atol:float = 1e-08,
     gamma0: float = 1.0,
     max_iter: int = 500
-) -> None:
+) -> np.ndarray:
     """
     This function allows the user to run the raking problem.
 
     Parameters
     ----------
     dim : integer
         Dimension of the raking problem (1, 2, 3)
-    directory_name: string
-        Name of the directory where we write the results
     df_obs : pd.DataFrame
         Observations data
     df_margins : list of pd.DataFrame
         list of data frames contatining the margins data
     var_names : list of strings
         Names of the variables over which we rake (e.g. cause, race, county)
+    draws: string
+        Name of the column that contains the samples.
+    cov_mat : boolean
+        If True, compute the covariance matrix of the raked values
+    sigma_yy: np.ndarray
+        Covariance matrix of the observations. We assume that there are sorted by var3, var2, var1.
+        If None, the observation data frame must contain samples and we compte the sample covariance matrix.
+    sigma_ss: np.ndarray
+        Covariance matrix of the margins.
+        If None, the margins data frames must contain samples and we compte the sample covariance matrix.
+    sigma_ys: np.ndarray
+        Covariance matrix of the observations and the margins.
+        If None, the observations and margins data frames must contain samples and we compte the sample covariance matrix.
     method : string
         Name of the distance function used for the raking.
         Possible values are chi2, entropic, general, logit
@@ -60,7 +79,8 @@ def run_raking(
 
     Returns
     -------
-    None
+    df_obs : pd.DataFrame
+        The initial observations data frame with an additional column for the raked values
     """
     assert isinstance(dim, int), \
         'The dimension of the raking problem must be an integer.'
@@ -78,7 +98,20 @@ def run_raking(
         'The name of the distance function used for the raking must be a string.'
     assert method in ['chi2', 'entropic', 'general', 'logit'], \
         'The distance function must be chi2, entropic, general or logit.'
-
+
+    # Compute the covariance matrix
+    if cov_mat:
+        if dim == 1:
+            (sigma_yy, sigma_ss, sigma_ys) = compute_covariance_1D(df_obs, df_margins, var_names, draws, sigma_yy, sigma_ss, sigma_ys)
+        elif dim == 2:
+            (sigma_yy, sigma_ss, sigma_ys) = compute_covariance_2D(df_obs, df_margins, var_names, draws, sigma_yy, sigma_ss, sigma_ys)
+        elif dim == 2:
+            (sigma_yy, sigma_ss, sigma_ys) = compute_covariance_3D(df_obs, df_margins, var_names, draws, sigma_yy, sigma_ss, sigma_ys)
+        else:
+            pass
+    # Check if matrix is definite positive
+        (sigma_yy, sigma_ss, sigma_ys) = check_covariance(sigma_yy, sigma_ss, sigma_ys)
+
     # Get the input variables for the raking
     if dim == 1:
         (y, s, q, l, h, A) = run_raking_1D(df_obs, df_margins, var_names, weights, lower, upper, rtol, atol)
@@ -88,6 +121,7 @@ def run_raking(
         (y, s, q, l, h, A) = run_raking_3D(df_obs, df_margins, var_names, weights, lower, upper, rtol, atol)
     else:
         pass
+
     # Rake
     if method == 'chi2':
         (beta, lambda_k) = raking_chi2(y, A, s, q)
@@ -99,11 +133,21 @@ def run_raking(
         (beta, lambda_k, iter_eps) = raking_logit(y, A, s, l, h, q, gamma0, max_iter)
     else:
         pass
-    # Write output file
+
+    # Create data frame for the raked values
     var_names.reverse()
     df_obs.sort_values(by=var_names, inplace=True)
     df_obs['raked_value'] = beta
-    df_obs.to_csv(directory_name + '/raked_observations.csv', index=False)
+
+    # Compute the covariance matrix of the raked values
+    if cov_mat:
+        (Dphi_y, Dphi_s) = compute_gradient(beta, lambda_k, y, A, method, alpha, l, h, q)
+        sigma = compute_covariance(Dphi_y, Dphi_s, sigma_yy, sigma_ss, sigma_ys)
+    else:
+        Dphi_y = None
+        Dphi_s = None
+        sigma = None
+    return (df_obs, Dphi_y, Dphi_s, sigma)
 
 def run_raking_1D(
     df_obs: pd.DataFrame,
@@ -265,3 +309,51 @@ def run_raking_3D(
     (A, s) = constraints_3D(s1, s2, s3, I, J, K, rtol, atol)
     return (y, s, q, l, h, A)
 
+def compute_covariance_1D(df_obs, df_margins, var_names, draws, sigma_yy, sigma_ss, sigma_ys):
+    """
+    """
+    df_margins = df_margins[0]
+    if sigma_yy is None:
+        sigma_yy = compute_covariance_obs(df_obs, var_names, draws)
+    if sigma_ss is None:
+        sigma_ss = compute_covariance_margins_1D(df_margins, var_names, draws)
+    if sigma_ys is None:
+        sigma_ys = compute_covariance_obs_margins_1D(df_obs, df_margins, var_names, draws)
+    sigma = np.concatenate(( \
+        np.concatenate((sigma_yy, sigma_ys), axis=1), \
+        np.concatenate((np.transpose(sigma_ys), sigma_ss), axis=1)), axis=0)
+    return (sigma_yy, sigma_ss, sigma_ys)
+
+def compute_covariance_2D(df_obs, df_margins, var_names, draws, sigma_yy, sigma_ss, sigma_ys):
+    """
+    """
+    df_margins_1 = df_margins[0]
+    df_margins_2 = df_margins[1]
+    if sigma_yy is None:
+        sigma_yy = compute_covariance_obs(df_obs, var_names, draws)
+    if sigma_ss is None:
+        sigma_ss = compute_covariance_margins_2D(df_margins_1, df_margins_2, var_names, draws)
+    if sigma_ys is None:
+        sigma_ys = compute_covariance_obs_margins_2D(df_obs, df_margins_1, df_margins_2, var_names, draws)
+    sigma = np.concatenate(( \
+        np.concatenate((sigma_yy, sigma_ys), axis=1), \
+        np.concatenate((np.transpose(sigma_ys), sigma_ss), axis=1)), axis=0)
+    return (sigma_yy, sigma_ss, sigma_ys)
+
+def compute_covariance_3D(df_obs, df_margins, var_names, draws, sigma_yy, sigma_ss, sigma_ys):
+    """
+    """
+    df_margins_1 = df_margins[0]
+    df_margins_2 = df_margins[1]
+    df_margins_3 = df_margins[2]
+    if sigma_yy is None:
+        sigma_yy = compute_covariance_obs(df_obs, var_names, draws)
+    if sigma_ss is None:
+        sigma_ss = compute_covariance_margins_3D(df_margins_1, df_margins_2, df_margins_3, var_names, draws)
+    if sigma_ys is None:
+        sigma_ys = compute_covariance_obs_margins_3D(df_obs, df_margins_1, df_margins_2, df_margins_3, var_names, draws)
+    sigma = np.concatenate(( \
+        np.concatenate((sigma_yy, sigma_ys), axis=1), \
+        np.concatenate((np.transpose(sigma_ys), sigma_ss), axis=1)), axis=0)
+    return (sigma_yy, sigma_ss, sigma_ys)
+