Added more numpy docstrings
This should be all of them (for user facing methods).
AHsu98 committed Feb 3, 2023
1 parent 9b6e258 commit 4ba2157
Showing 2 changed files with 71 additions and 27 deletions.
4 changes: 2 additions & 2 deletions src/pydisagg/DisaggModel.py
@@ -493,9 +493,9 @@ def split_groups(
that we want to rescale. If given, replaces the model's attribute rate pattern, by default None
None default uses model's rate pattern attribute
CI_method : Optional[str], optional
_description_, by default 'delta-wald'
method to use for standard errors, by default 'delta-wald'
alpha : Optional[float], optional
_description_, by default 0.05
1 - (confidence level) for confidence intervals, by default 0.05
Returns
-------
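The two parameters documented in this hunk, CI_method and alpha, control how DisaggModel.split_groups reports uncertainty. Below is a minimal sketch of how they might be passed; the first two positional arguments follow the call visible in disaggregate.py further down, while the observed_total_se and rate_pattern keywords and the tuple return shape are assumptions inferred from the surrounding docstrings rather than confirmed by this diff.

```python
# Hypothetical sketch only: argument names beyond bucket_populations and
# observed_total are assumed, not confirmed by this diff.
import numpy as np
from pydisagg.models import LogOdds_model

model = LogOdds_model()
estimate, se, (ci_lower, ci_upper) = model.split_groups(
    np.array([1000.0, 2000.0, 1500.0]),         # bucket_populations
    120.0,                                      # observed_total, the value to split
    observed_total_se=10.0,                     # assumed keyword, mirrors split_datapoint
    rate_pattern=np.array([0.01, 0.03, 0.05]),  # assumed keyword; overrides the model's pattern
    CI_method='delta-wald',                     # standard-error method documented in this hunk
    alpha=0.05,                                 # 1 - (confidence level) documented in this hunk
)
```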
94 changes: 69 additions & 25 deletions src/pydisagg/disaggregate.py
@@ -1,29 +1,60 @@
"""Module containing high level api for splitting"""
from typing import Optional
from typing import Optional, Union

import pandas as pd
from numpy.typing import NDArray
from pandas import DataFrame

from pydisagg.models import LMO_model, DisaggModel
from pydisagg.models import LMO_model, LogOdds_model
from pydisagg.DisaggModel import DisaggModel


def split_datapoint(
observed_total: float,
bucket_populations: NDArray,
rate_pattern: NDArray,
observed_total_se: Optional[float] = None,
model: Optional[DisaggModel] = LMO_model(1),
model: Optional[DisaggModel] = LogOdds_model(),
CI_method: Optional[str] = 'delta-wald'
):
'''
Disaggregates a datapoint using the model given as input.
) -> Union[tuple,NDArray]:
"""Disaggregate a datapoint using the model given as input.
Defaults to assuming multiplicativity in the odds ratio
If no observed_total_se is given, returns scalar point estimate
Parameters
----------
observed_total : float
aggregated observed_total across all buckets, value to be split
bucket_populations : NDArray
population size in each bucket
rate_pattern : NDArray
Rate Pattern to use, should be an estimate of the rates in each bucket
that we want to rescale
observed_total_se : Optional[float], optional
standard error of observed_total, by default None
model : Optional[DisaggModel], optional
DisaggModel to use, by default LMO_model(1)
CI_method : Optional[str], optional
method to use for confidence intervals,
see documentation for standard error methods in DisaggModel, by default 'delta-wald'
Returns
-------
Union[Tuple,NDArray]
If standard errors are available, this will return the tuple
(
estimate_in_each_bucket,
se_of_estimate_bucket,
(CI_lower_in_each_bucket,CI_upper_in_each_bucket)
)
Otherwise, if standard errors are not available,
this will return a numpy array of the disaggregated estimates
Notes
-----
If no observed_total_se is given, returns point estimates
If observed_total_se is given, then returns a tuple
(point_estimate,standard_error,(CI_lower,CI_upper))
'''
"""
return model.split_groups(
bucket_populations,
observed_total,
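A short usage sketch of split_datapoint as documented above; the numbers are purely illustrative. With observed_total_se omitted it returns the point estimates as an array, and with a standard error supplied it returns the (estimate, se, CI) tuple described in the docstring.

```python
import numpy as np
from pydisagg.disaggregate import split_datapoint

populations = np.array([1000.0, 2000.0, 1500.0])   # bucket_populations
pattern = np.array([0.01, 0.03, 0.05])             # rate_pattern estimate per bucket

# No standard error: returns an NDArray of per-bucket point estimates
estimates = split_datapoint(120.0, populations, pattern)

# With a standard error: returns (estimates, standard_errors, (ci_lower, ci_upper))
estimates, ses, (ci_lower, ci_upper) = split_datapoint(
    120.0,
    populations,
    pattern,
    observed_total_se=10.0,
    CI_method='delta-wald',
)
```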
@@ -37,40 +68,53 @@ def split_dataframe(
groups_to_split_into: list,
observation_group_membership_df: DataFrame,
population_sizes: DataFrame,
baseline_patterns: DataFrame,
rate_patterns: DataFrame,
use_se: Optional[bool] = False,
model: Optional[DisaggModel] = LMO_model(1),
):
'''
Disaggregates datapoints and pivots observations into estimates for each group per pop id
groups_to_split_into: list of groups to disaggregate observations into
) -> DataFrame:
"""Disaggregate datapoints and pivots observations into estimates for each group per pop id
observation_group_membership_df: dataframe with columns location_id, pattern_id, obs,
Parameters
----------
groups_to_split_into : list
list of groups to disaggregate observations into
observation_group_membership_df : DataFrame
Dataframe with columns location_id, pattern_id, obs,
and columns for each of the groups_to_split_into
with dummy variables that represent whether or not
each group is included in the observations for that row.
This also optionally contains an obs_se column which will be used if use_se is True
location_id represents the population that the observation comes from
pattern_id gives the baseline that should be used for splitting
population_sizes: dataframe with location_id as the index containing the
population_sizes : DataFrame
Dataframe with location_id as the index containing the
size of each group within each population (given the location_id)
rate_patterns: dataframe with pattern_id as the index, and columns
rate_patterns : DataFrame
dataframe with pattern_id as the index, and columns
for each of the groups_to_split where the entries represent the rate pattern
in the given group to use for pydisagg.
use_se: Boolean, whether or not to report standard errors along with estimates
use_se : Optional[bool], optional
whether or not to report standard errors along with estimates
if set to True, then observation_group_membership_df must have an obs_se column, by default False
'''
model : Optional[DisaggModel], optional
DisaggModel to use for splitting, by default LMO_model(1)
Returns
-------
DataFrame
Dataframe where each row corresponds to one of obs, with one or
two columns for each of the groups_to_split_into, giving the estimate
If use_se==True, then has nested column indexing, where both the
point estimate and the standard error of the estimate for each group are given.
"""
splitting_df = observation_group_membership_df.copy()
if use_se is False:
def split_row(x):
return split_datapoint(
x['obs'],
population_sizes.loc[x.name]*x[groups_to_split_into],
baseline_patterns.loc[x['pattern_id']],
rate_patterns.loc[x['pattern_id']],
model=model
)
result = (
@@ -88,14 +132,14 @@ def split_row(x):
raw_split_result = split_datapoint(
x['obs'],
population_sizes.loc[x.name]*x[groups_to_split_into],
baseline_patterns.loc[x['pattern_id']],
rate_patterns.loc[x['pattern_id']],
model=model,
observed_total_se=x['obs_se']
)
return pd.Series(
[
(estimate, se) for estimate, se in zip(raw_split_result[0], raw_split_result[1])
],
],
index=groups_to_split_into)
result_raw = (
splitting_df
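For reference, a sketch of the inputs split_dataframe expects, assembled from the docstring above. The column layout and all values are illustrative assumptions; in particular, placing location_id on the index is inferred from the population_sizes.loc[x.name] lookup in split_row rather than stated explicitly in the diff.

```python
import pandas as pd
from pydisagg.disaggregate import split_dataframe

groups = ['young', 'old']

# One row per observation; dummy columns mark which groups the observation covers.
# location_id is used as the row index here so population_sizes.loc[x.name] in
# split_row can find the matching population (an assumption, not shown in full above).
observations = pd.DataFrame(
    {
        'pattern_id': ['flu', 'flu', 'flu'],
        'obs': [120.0, 80.0, 200.0],
        'obs_se': [10.0, 8.0, 15.0],   # only required when use_se=True
        'young': [1, 1, 0],
        'old': [1, 0, 1],
    },
    index=pd.Index([1, 1, 2], name='location_id'),
)

# Group sizes per location, indexed by location_id
population_sizes = pd.DataFrame(
    {'young': [1000.0, 3000.0], 'old': [2000.0, 1500.0]},
    index=pd.Index([1, 2], name='location_id'),
)

# Rate pattern per group, indexed by pattern_id
rate_patterns = pd.DataFrame(
    {'young': [0.01], 'old': [0.04]},
    index=pd.Index(['flu'], name='pattern_id'),
)

result = split_dataframe(
    groups_to_split_into=groups,
    observation_group_membership_df=observations,
    population_sizes=population_sizes,
    rate_patterns=rate_patterns,
    use_se=True,   # needs the obs_se column
)
```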
