modified: dsm/datasets.py

autonlab · Oct 30, 2020 · 1ee7790 · 1ee7790
1 parent cf85ed2
commit 1ee7790
Showing 1 changed file with 52 additions and 0 deletions.
diff --git a/dsm/datasets.py b/dsm/datasets.py
@@ -48,6 +48,58 @@ def increase_censoring(e, t, p):
 
   return e, t
 
+def _load_framingham_dataset(sequential):
+  """Helper function to load and preprocess the Framingham dataset.
+
+  The Framingham dataset is a subset of 4,434 participants of the well-known,
+  ongoing Framingham Heart Study [1] on the epidemiology of hypertensive and
+  arteriosclerotic cardiovascular disease. It is a popular dataset for
+  longitudinal survival analysis with time-dependent covariates.
+
+  Parameters
+  ----------
+  sequential: bool
+    If True, returns lists holding one np.array per individual (grouped
+    by `RANDID`); to train recurrent neural models you would typically
+    use True. Otherwise, returns flat arrays with one entry per row of
+    the underlying CSV.
+
+  Returns
+  -------
+  tuple
+    `(x, t, e)` where `x` holds the mean-imputed, standardized covariates,
+    `t` is `TIMEDTH - TIME` and `e` is the `DEATH` column. When
+    `sequential` is True each element is a list with one array per
+    individual, ordered by ascending `RANDID`.
+
+  References
+  ----------
+  [1] Dawber, Thomas R., Gilcin F. Meadors, and Felix E. Moore Jr.
+  "Epidemiological approaches to heart disease: the Framingham Study."
+  American Journal of Public Health and the Nations Health 41.3 (1951).
+
+  """
+
+  data = pkgutil.get_data(__name__, 'datasets/framingham.csv')
+  data = pd.read_csv(io.BytesIO(data))
+
+  dat_cat = data[['SEX', 'CURSMOKE', 'DIABETES', 'BPMEDS',
+                  'educ', 'PREVCHD', 'PREVAP', 'PREVMI',
+                  'PREVSTRK', 'PREVHYP']]
+  dat_num = data[['TOTCHOL', 'AGE', 'SYSBP', 'DIABP',
+                  'CIGPDAY', 'BMI', 'HEARTRTE', 'GLUCOSE']]
+
+  # NOTE(review): pd.get_dummies only expands object/category columns by
+  # default; if these columns are parsed as numeric they pass through
+  # unchanged -- confirm that this is the intended encoding.
+  x1 = pd.get_dummies(dat_cat).values
+  x2 = dat_num.values
+  x = np.hstack([x1, x2])
+
+  time = (data['TIMEDTH'] - data['TIME']).values
+  event = data['DEATH'].values
+
+  # Fill missing covariates with the column mean before standardizing, so
+  # that the scaler never sees NaNs.
+  x = SimpleImputer(missing_values=np.nan, strategy='mean').fit_transform(x)
+  x_ = StandardScaler().fit_transform(x)
+
+  if not sequential:
+    return x_, time, event
+
+  # Split the row-aligned arrays into one chunk per individual.
+  x, t, e = [], [], []
+  subject_ids = data['RANDID']
+  for id_ in sorted(set(subject_ids)):
+    # Build the row mask once and reuse it for all three arrays.
+    mask = (subject_ids == id_).values
+    x.append(x_[mask])
+    t.append(time[mask])
+    e.append(event[mask])
+  return x, t, e
+
 def _load_pbc_dataset(sequential):
   """Helper function to load and preprocess the PBC dataset