Skip to content

Commit

Permalink
work on covariance matrix
Browse files Browse the repository at this point in the history
  • Loading branch information
ADucellierIHME committed Oct 14, 2024
1 parent afdbd44 commit e996089
Show file tree
Hide file tree
Showing 4 changed files with 79 additions and 5 deletions.
78 changes: 73 additions & 5 deletions src/raking/compute_covariance.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,35 @@
"""Module with methods to compute the covariance matrices of observations and margins"""

import numpy as np
import pandas as pd

pd.options.mode.chained_assignment = None

def compute_covariance_obs(df_obs, var_names, draws):
def compute_covariance_obs(
df_obs: pd.DataFrame,
var_names: list,
draws: str
) -> np.ndarray:
"""Compute the covariance matrix of the observations.
The observations will be sorted by var3, var2, var1, meaning that
sigma_yy contains on its diagonal in this order the variances of
y_111, ... , y_I11, y_121, ... , y_IJ1, y_112, ... , y_IJK.
Parameters
----------
df_obs : pd.DataFrame
Observations data
var_names : list of strings
Names of the variables over which we rake (e.g. cause, race, county)
draws : string
Name of the column containing the indices of the draws
Returns
-------
sigma_yy : np.ndarray
(IJK) * (IJK) covariance matrix
"""
nsamples = len(df_obs[draws].unique())
var_names.reverse()
df = df_obs[['value'] + var_names + [draws]]
Expand All @@ -17,8 +41,27 @@ def compute_covariance_obs(df_obs, var_names, draws):
sigma_yy = np.matmul(np.transpose(Xc), Xc) / nsamples
return sigma_yy

def compute_covariance_margins_1D(df_margins, var_names, draws):

def compute_covariance_margins_1D(
df_margins: pd.DataFrame,
var_names: list,
draws: str
) -> np.ndarray:
"""Compute the covariance matrix of the margins in 1D.
Parameters
----------
df_margins : pd.DataFrame
Margins data (sums over the first variable)
var_names : list of strings
Names of the variables over which we rake (e.g. cause, race, county)
draws : string
Name of the column containing the indices of the draws
Returns
-------
sigma_ss : np.ndarray
1 * 1 covariance matrix
"""
nsamples = len(df_margins[draws].unique())
df = df_margins[['value_agg_over_' + var_names[0]] + [draws]]
df.sort_values(by=[draws], inplace=True)
Expand All @@ -29,8 +72,33 @@ def compute_covariance_margins_1D(df_margins, var_names, draws):
sigma_ss = np.matmul(np.transpose(Xc), Xc) / nsamples
return sigma_ss

def compute_covariance_margins_2D(df_margins_1, df_margins_2, var_names, draws):

def compute_covariance_margins_2D(
df_margins_1: pd.DataFrame,
df_margins_2: pd.DataFrame,
var_names: list,
draws: str
) -> np.ndarray:
"""Compute the covariance matrix of the margins in 2D.
The margins are sorted in the same order as what is done
when computing the constraint matrix.
Parameters
----------
df_margins_1 : pd.DataFrame
Margins data (sums over the first variable)
df_margins_2 : pd.DataFrame
Margins data (sums over the second variable)
var_names : list of strings
Names of the variables over which we rake (e.g. cause, race, county)
draws : string
Name of the column containing the indices of the draws
Returns
-------
sigma_ss : np.ndarray
(I + J - 1) * (I + J - 1) covariance matrix
"""
nsamples = len(df_margins_1[draws].unique())
df1 = df_margins_1[[var_names[1], 'value_agg_over_' + var_names[0], draws]]
df1.sort_values(by=[var_names[1], draws], inplace=True)
Expand Down
2 changes: 2 additions & 0 deletions src/raking/formatting_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
import numpy as np
import pandas as pd

pd.options.mode.chained_assignment = None

def format_data_1D(
df_obs: pd.DataFrame,
df_margins: pd.DataFrame,
Expand Down
2 changes: 2 additions & 0 deletions src/raking/run_raking.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
from raking.raking_methods import raking_chi2, raking_entropic, raking_general, raking_logit
from raking.uncertainty_methods import compute_covariance, compute_gradient

pd.options.mode.chained_assignment = None

def run_raking(
dim: int,
df_obs: pd.DataFrame,
Expand Down
2 changes: 2 additions & 0 deletions src/raking/uncertainty_methods.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
"""Module with methods to propagate the uncertainties through the raking process"""

import numpy as np

def compute_covariance(
Dphi_y: np.ndarray,
Dphi_s: np.ndarray,
Expand Down

0 comments on commit e996089

Please sign in to comment.