Skip to content

Commit

Permalink
modified: dsm/datasets.py
Browse files Browse the repository at this point in the history
  • Loading branch information
chiragnagpal committed Oct 30, 2020
1 parent cf85ed2 commit 1ee7790
Showing 1 changed file with 52 additions and 0 deletions.
52 changes: 52 additions & 0 deletions dsm/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,58 @@ def increase_censoring(e, t, p):

return e, t

def _load_framingham_dataset(sequential):
"""Helper function to load and preprocess the Framingham dataset.
The Framingham Dataset is a subset of 4,434 participants of the well known,
ongoing Framingham Heart study [1] for studying epidemiology for
hypertensive and arteriosclerotic cardiovascular disease. It is a popular
dataset for longitudinal survival analysis with time dependent covariates.
Parameters
----------
sequential: bool
If True returns a list of np.arrays for each individual.
else, returns collapsed results for each time step. To train
recurrent neural models you would typically use True.
References
----------
[1] Dawber, Thomas R., Gilcin F. Meadors, and Felix E. Moore Jr.
"Epidemiological approaches to heart disease: the Framingham Study."
American Journal of Public Health and the Nations Health 41.3 (1951).
"""

data = pkgutil.get_data(__name__, 'datasets/framingham.csv')
data = pd.read_csv(io.BytesIO(data))

dat_cat = data[['SEX', 'CURSMOKE', 'DIABETES', 'BPMEDS',
'educ', 'PREVCHD', 'PREVAP', 'PREVMI',
'PREVSTRK', 'PREVHYP']]
dat_num = data[['TOTCHOL', 'AGE', 'SYSBP', 'DIABP',
'CIGPDAY', 'BMI', 'HEARTRTE', 'GLUCOSE']]

x1 = pd.get_dummies(dat_cat).values
x2 = dat_num.values
x = np.hstack([x1, x2])

time = (data['TIMEDTH'] - data['TIME']).values
event = data['DEATH'].values

x = SimpleImputer(missing_values=np.nan, strategy='mean').fit_transform(x)
x_ = StandardScaler().fit_transform(x)

if not sequential:
return x_, time, event
else:
x, t, e = [], [], []
for id_ in sorted(list(set(data['RANDID']))):
x.append(x_[data['RANDID'] == id_])
t.append(time[data['RANDID'] == id_])
e.append(event[data['RANDID'] == id_])
return x, t, e

def _load_pbc_dataset(sequential):
"""Helper function to load and preprocess the PBC dataset
Expand Down

0 comments on commit 1ee7790

Please sign in to comment.