From 41cd52e425835fe9abf0a00f990d1b397fe87c92 Mon Sep 17 00:00:00 2001
From: Chirag Nagpal
Date: Sun, 1 Nov 2020 14:51:48 -0500
Subject: [PATCH] modified:   dsm/__init__.py
 modified:   dsm/datasets.py
 modified:   dsm/dsm_api.py
 modified:   dsm/dsm_torch.py
 modified:   dsm/losses.py
 modified:   dsm/utilities.py

---
 dsm/__init__.py  |  45 ++++++---
 dsm/datasets.py  |  31 ++++++-
 dsm/dsm_api.py   | 234 +++++++++++++++++++++++++++++------------------
 dsm/dsm_torch.py |  20 +++-
 dsm/losses.py    |  11 ++-
 dsm/utilities.py |  37 +++++---
 6 files changed, 262 insertions(+), 116 deletions(-)

diff --git a/dsm/__init__.py b/dsm/__init__.py
index bb0605d..ad59601 100644
--- a/dsm/__init__.py
+++ b/dsm/__init__.py
@@ -18,6 +18,17 @@
 # If not, see <https://www.gnu.org/licenses/>.
 
 """
+[![Build Status](https://travis-ci.org/chiragnagpal/DeepSurvivalMachines.svg?\
+branch=master)](https://travis-ci.org/chiragnagpal/DeepSurvivalMachines)
+   \
+[![License: GPL v3](https://img.shields.io/badge/License-GPLv3-blue.svg)]\
+(https://www.gnu.org/licenses/gpl-3.0)
+   \
+[![GitHub Repo stars](https://img.shields.io/github/stars/autonlab/Deep\
+SurvivalMachines?style=social)](https://github.com/autonlab/DeepSurvival\
+Machines)
+
+
 Python package `dsm` provides an API to train the Deep Survival Machines
 and associated models for problems in survival analysis. The underlying model
 is implemented in `pytorch`.
@@ -54,11 +65,6 @@
 parametric distributions. The parameters of these mixture distributions as
 well as the mixing weights are modelled using Neural Networks.
 
-#### Usage Example
-    >>> from dsm import DeepSurvivalMachines
-    >>> model = DeepSurvivalMachines()
-    >>> model.fit()
-    >>> model.predict_risk()
 
 Deep Recurrent Survival Machines
 --------------------------------
@@ -71,9 +77,10 @@
 data like vital signs, degradation monitoring signals in predictive
 maintenance. **DRSM** allows the learnt representations at each time step
 to involve historical context from previous time steps. **DRSM** implementation in
-`dsm` is carried out through an easy to use API that accepts lists of data
-streams and corresponding failure times. The module automatically takes care of
-appropriate batching and padding of variable length sequences.
+`dsm` is carried out through an easy-to-use API,
+`DeepRecurrentSurvivalMachines`, that accepts lists of data streams and
+corresponding failure times. The module automatically takes care of appropriate
+batching and padding of variable-length sequences.
 
 .. warning:: Not Implemented Yet!
 
@@ -90,6 +97,21 @@
 
 .. warning:: Not Implemented Yet!
 
+
+Example Usage
+-------------
+
+>>> from dsm import DeepSurvivalMachines
+>>> from dsm import datasets
+>>> # load the SUPPORT dataset.
+>>> x, t, e = datasets.load_dataset('SUPPORT')
+>>> # instantiate a DeepSurvivalMachines model.
+>>> model = DeepSurvivalMachines()
+>>> # fit the model to the dataset.
+>>> model.fit(x, t, e)
+>>> # estimate the predicted risks at time 10.
+>>> model.predict_risk(x, 10)
+
 References
 ----------
@@ -143,13 +165,14 @@
 [contributor-table HTML markup stripped during extraction]
 """
 
 from dsm.dsm_api import DeepSurvivalMachines, DeepRecurrentSurvivalMachines
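The `DeepRecurrentSurvivalMachines` interface documented above takes a ragged collection of time series rather than one rectangular array. A minimal sketch of the intended call pattern with synthetic data; wrapping the ragged lists as object arrays is an assumption here, since the recurrent `_prepocess_training_data` reads `x.shape[0]` before padding:

>>> import numpy as np
>>> from dsm import DeepRecurrentSurvivalMachines
>>> lengths = (4, 7, 5)
>>> # three data streams of different lengths, each with 5 features.
>>> x = np.array([np.random.randn(n, 5) for n in lengths], dtype=object)
>>> t = np.array([np.random.uniform(1, 10, n) for n in lengths], dtype=object)
>>> e = np.array([np.random.binomial(1, 0.5, n) for n in lengths], dtype=object)
>>> model = DeepRecurrentSurvivalMachines()
>>> model.fit(x, t, e)  # padding and batching are handled internally.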
diff --git a/dsm/datasets.py b/dsm/datasets.py
index bfceb34..50c6273 100644
--- a/dsm/datasets.py
+++ b/dsm/datasets.py
@@ -114,7 +114,6 @@ def _load_pbc_dataset(sequential):
       else, returns collapsed results for each time step. To train
       recurrent neural models you would typically use True.
 
-
   References
   ----------
   [1] Fleming, Thomas R., and David P. Harrington. Counting processes and
@@ -193,6 +192,36 @@ def _load_support_dataset():
 def load_dataset(dataset='SUPPORT', **kwargs):
   """Helper function to load datasets to test Survival Analysis models.
 
+  Currently implemented datasets include:
+
+  **SUPPORT**: This dataset comes from the Vanderbilt University study
+  to estimate survival for seriously ill hospitalized adults [1].
+  (Refer to http://biostat.mc.vanderbilt.edu/wiki/Main/SupportDesc
+  for the original data source.)
+
+  **PBC**: The Primary Biliary Cirrhosis dataset [2] is a well-known
+  dataset for evaluating survival analysis models with time-dependent
+  covariates.
+
+  **FRAMINGHAM**: This dataset is a subset of 4,434 participants of the
+  well-known, ongoing Framingham Heart Study [3] for studying the
+  epidemiology of hypertensive and arteriosclerotic cardiovascular disease.
+  It is a popular dataset for longitudinal survival analysis with
+  time-dependent covariates.
+
+  References
+  ----------
+
+  [1] Knaus WA, Harrell FE, Lynn J et al. (1995): The SUPPORT prognostic
+  model: Objective estimates of survival for seriously ill hospitalized
+  adults. Annals of Internal Medicine 122:191-203.
+
+  [2] Fleming, Thomas R., and David P. Harrington. Counting processes and
+  survival analysis. Vol. 169. John Wiley & Sons, 2011.
+
+  [3] Dawber, Thomas R., Gilcin F. Meadors, and Felix E. Moore Jr.
+  "Epidemiological approaches to heart disease: the Framingham Study."
+  American Journal of Public Health and the Nations Health 41.3 (1951).
+
   Parameters
   ----------
   dataset: str
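The loader documented above can be smoke-tested directly. A hedged sketch: the returned shapes are indicative, and forwarding of the `sequential` flag through `**kwargs` to `_load_pbc_dataset` is assumed from the signatures shown in this diff:

>>> from dsm import datasets
>>> x, t, e = datasets.load_dataset('SUPPORT')
>>> len(x) == len(t) == len(e)
True
>>> # PBC with one covariate sequence per patient, for recurrent models.
>>> x, t, e = datasets.load_dataset('PBC', sequential=True)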
diff --git a/dsm/dsm_api.py b/dsm/dsm_api.py
index 856b9d5..7c14487 100644
--- a/dsm/dsm_api.py
+++ b/dsm/dsm_api.py
@@ -24,60 +24,23 @@
 """
 
 from dsm.dsm_torch import DeepSurvivalMachinesTorch
+from dsm.dsm_torch import DeepRecurrentSurvivalMachinesTorch
 from dsm.losses import predict_cdf
-from dsm.utilities import train_dsm
+from dsm.utilities import train_dsm, _get_padded_features, _get_padded_targets
 
 import torch
 import numpy as np
 
-class DeepSurvivalMachines():
-  """A Deep Survival Machines model.
-
-  This is the main interface to a Deep Survival Machines model.
-  A model is instantiated with approporiate set of hyperparameters and
-  fit on numpy arrays consisting of the features, event/censoring times
-  and the event/censoring indicators.
-
-  For full details on Deep Survival Machines, refer to our paper [1].
-
-  References
-  ----------
-  [1] Deep Survival Machines:
-  Fully Parametric Survival Regression and
-  Representation Learning for Censored Data with Competing Risks."
-  arXiv preprint arXiv:2003.01176 (2020)
-
-  Parameters
-  ----------
-  k: int
-      The number of underlying parametric distributions.
-  layers: list
-      A list of integers consisting of the number of neurons in each
-      hidden layer.
-  distribution: str
-      Choice of the underlying survival distributions.
-      One of 'Weibull', 'LogNormal'.
-      Default is 'Weibull'.
-  temp: float
-      The logits for the gate are rescaled with this value.
-      Default is 1000.
-  discount: float
-      a float in [0,1] that determines how to discount the tail bias
-      from the uncensored instances.
-      Default is 1.
+__pdoc__ = {}
+__pdoc__["DSMBase"] = False
+__pdoc__["DeepSurvivalMachines.fit"] = True
 
-  Example
-  -------
-  >>> from dsm import DeepSurvivalMachines
-  >>> model = DeepSurvivalMachines()
-  >>> model.fit(x, t, e)
-  """
+class DSMBase():
+  """Base class for all DSM models."""
 
   def __init__(self, k=3, layers=None, distribution="Weibull",
                temp=1000., discount=1.0):
-    super(DeepSurvivalMachines, self).__init__()
-
     self.k = k
     self.layers = layers
     self.dist = distribution
@@ -85,16 +48,15 @@ def __init__(self, k=3, layers=None, distribution="Weibull",
     self.discount = discount
     self.fitted = False
 
-  def __call__(self):
-    if self.fitted:
-      print("A fitted instance of the Deep Survival Machines model")
-    else:
-      print("An unfitted instance of the Deep Survival Machines model")
-
-    print("Number of underlying distributions (k):", self.k)
-    print("Hidden Layers:", self.layers)
-    print("Distribution Choice:", self.dist)
-
+  def _gen_torch_model(self, inputdim, optimizer):
+    """Helper function to return a torch model."""
+    return DeepSurvivalMachinesTorch(inputdim,
+                                     k=self.k,
+                                     layers=self.layers,
+                                     dist=self.dist,
+                                     temp=self.temp,
+                                     discount=self.discount,
+                                     optimizer=optimizer)
 
   def fit(self, x, t, e, vsize=0.15,
           iters=1, learning_rate=1e-3, batch_size=100,
@@ -128,8 +90,32 @@ def fit(self, x, t, e, vsize=0.15,
           'Adam', 'RMSProp' or 'SGD'.
       random_state: int
           random seed that determines how the validation set is chosen.
+
     """
+    processed_data = self._prepocess_training_data(x, t, e, vsize,
+                                                   random_state)
+    x_train, t_train, e_train, x_val, t_val, e_val = processed_data
+
+    inputdim = x_train.shape[-1]
+
+    model = self._gen_torch_model(inputdim, optimizer)
+    model, _ = train_dsm(model,
+                         x_train, t_train, e_train,
+                         x_val, t_val, e_val,
+                         n_iter=iters,
+                         lr=learning_rate,
+                         elbo=elbo,
+                         bs=batch_size)
+
+    self.torch_model = model.eval()
+    self.fitted = True
+
+  def _prepocess_test_data(self, x):
+    return torch.from_numpy(x)
+
+  def _prepocess_training_data(self, x, t, e, vsize, random_state):
+
     idx = list(range(x.shape[0]))
     np.random.seed(random_state)
     np.random.shuffle(idx)
@@ -146,31 +132,8 @@ def fit(self, x, t, e, vsize=0.15,
     t_train = t_train[:-vsize]
     e_train = e_train[:-vsize]
 
-    inputdim = x_train.shape[1]
-
-    if type(self).__name__ == "DeepSurvivalMachines":
-
-      model = DeepSurvivalMachinesTorch(inputdim,
-                                        k=self.k,
-                                        layers=self.layers,
-                                        dist=self.dist,
-                                        temp=self.temp,
-                                        discount=self.discount,
-                                        optimizer=optimizer)
-
-      model, _ = train_dsm(model, x_train, t_train, e_train,
-                           x_val, t_val, e_val,
-                           n_iter=iters,
-                           lr=learning_rate,
-                           elbo=elbo,
-                           bs=batch_size)
-
-      self.torch_model = model.eval()
-      self.fitted = True
-
-    else:
-      raise NotImplementedError("`fit` nethod not implemented for "+
-                                type(self).__name__)
+    return (x_train, t_train, e_train,
+            x_val, t_val, e_val)
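From the caller's side this refactor is behaviour-preserving: `fit` still takes flat numpy arrays, and `vsize` controls the validation split now carved out inside `_prepocess_training_data`. A sketch with synthetic data (the hyperparameter values are arbitrary, chosen only for illustration):

>>> import numpy as np
>>> from dsm import DeepSurvivalMachines
>>> x = np.random.randn(1000, 10)
>>> t = np.random.uniform(1, 100, size=1000)
>>> e = np.random.binomial(1, 0.7, size=1000)
>>> model = DeepSurvivalMachines(k=4, layers=[100, 100])
>>> # 15% of the shuffled data is held out for validation internally.
>>> model.fit(x, t, e, vsize=0.15, iters=50, learning_rate=1e-3)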
 
 
   def predict_risk(self, x, t):
@@ -186,11 +149,11 @@ def predict_risk(self, x, t):
           to be computed
     Returns:
       np.array: numpy array of the risks at each time in t.
+
     """
     if self.fitted:
       return 1-self.predict_survival(x, t)
-
     else:
       raise Exception("The model has not been fitted yet. Please fit the " +
                       "model using the `fit` method on some training data " +
@@ -210,25 +173,122 @@ def predict_survival(self, x, t):
           to be computed
     Returns:
       np.array: numpy array of the survival probabilities at each time in t.
-    """
+
+    """
+    x = self._prepocess_test_data(x)
 
     if not isinstance(t, list):
       t = [t]
-
     if self.fitted:
-      x = torch.from_numpy(x)
       scores = predict_cdf(self.torch_model, x, t)
       return np.exp(np.array(scores)).T
-
     else:
       raise Exception("The model has not been fitted yet. Please fit the " +
                       "model using the `fit` method on some training data " +
                       "before calling `predict_survival`.")
 
 
-class DeepRecurrentSurvivalMachines(DeepSurvivalMachines):
-  __doc__ = "..warning:: Not Implemented"
-  pass
+class DeepSurvivalMachines(DSMBase):
+  """A Deep Survival Machines model.
+
+  This is the main interface to a Deep Survival Machines model.
+  A model is instantiated with an appropriate set of hyperparameters and
+  fit on numpy arrays consisting of the features, event/censoring times
+  and the event/censoring indicators.
+
+  For full details on Deep Survival Machines, refer to our paper [1].
+
+  References
+  ----------
+  [1] "Deep Survival Machines:
+  Fully Parametric Survival Regression and
+  Representation Learning for Censored Data with Competing Risks."
+  arXiv preprint arXiv:2003.01176 (2020)
+
+  Parameters
+  ----------
+  k: int
+      The number of underlying parametric distributions.
+  layers: list
+      A list of integers consisting of the number of neurons in each
+      hidden layer.
+  distribution: str
+      Choice of the underlying survival distributions.
+      One of 'Weibull', 'LogNormal'.
+      Default is 'Weibull'.
+  temp: float
+      The logits for the gate are rescaled with this value.
+      Default is 1000.
+  discount: float
+      a float in [0,1] that determines how to discount the tail bias
+      from the uncensored instances.
+      Default is 1.
+
+  Example
+  -------
+  >>> from dsm import DeepSurvivalMachines
+  >>> model = DeepSurvivalMachines()
+  >>> model.fit(x, t, e)
+
+  """
+
+  def __call__(self):
+    if self.fitted:
+      print("A fitted instance of the Deep Survival Machines model")
+    else:
+      print("An unfitted instance of the Deep Survival Machines model")
+
+    print("Number of underlying distributions (k):", self.k)
+    print("Hidden Layers:", self.layers)
+    print("Distribution Choice:", self.dist)
+
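Continuing the `fit` sketch above on a fitted model: `predict_survival` accepts a scalar horizon or a list of horizons, and, per the `predict_risk` body shown here, the two outputs are complements:

>>> s = model.predict_survival(x, [10, 25, 50])  # shape (n_samples, 3)
>>> r = model.predict_risk(x, [10, 25, 50])
>>> np.allclose(r, 1 - s)
True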
+
+class DeepRecurrentSurvivalMachines(DSMBase):
+
+  """The Deep Recurrent Survival Machines model to handle data with
+  time-dependent covariates.
+
+  """
+
+  def _gen_torch_model(self, inputdim, optimizer):
+    """Helper function to return a torch model."""
+    return DeepRecurrentSurvivalMachinesTorch(inputdim,
+                                              k=self.k,
+                                              layers=self.layers,
+                                              dist=self.dist,
+                                              temp=self.temp,
+                                              discount=self.discount,
+                                              optimizer=optimizer)
+
+  def _prepocess_test_data(self, x):
+    return torch.from_numpy(_get_padded_features(x))
+
+  def _prepocess_training_data(self, x, t, e, vsize, random_state):
+    """RNNs require different preprocessing for variable-length sequences."""
+
+    idx = list(range(x.shape[0]))
+    np.random.seed(random_state)
+    np.random.shuffle(idx)
+
+    x = _get_padded_features(x)
+    t = _get_padded_targets(t)
+    e = _get_padded_targets(e)
+
+    x_train, t_train, e_train = x[idx], t[idx], e[idx]
+
+    x_train = torch.from_numpy(x_train).double()
+    t_train = torch.from_numpy(t_train).double()
+    e_train = torch.from_numpy(e_train).double()
+
+    vsize = int(vsize*x_train.shape[0])
+    x_val, t_val, e_val = x_train[-vsize:], t_train[-vsize:], e_train[-vsize:]
+
+    x_train = x_train[:-vsize]
+    t_train = t_train[:-vsize]
+    e_train = e_train[:-vsize]
+
+    return (x_train, t_train, e_train,
+            x_val, t_val, e_val)
+
 
 class DeepConvolutionalSurvivalMachines(DeepRecurrentSurvivalMachines):
   __doc__ = ".. warning:: Not Implemented"
diff --git a/dsm/dsm_torch.py b/dsm/dsm_torch.py
index 58439ea..d0b9e83 100644
--- a/dsm/dsm_torch.py
+++ b/dsm/dsm_torch.py
@@ -25,11 +25,21 @@
 from torch.nn.Module.
 
 Note: NOT DESIGNED TO BE CALLED DIRECTLY!!!
+
 """
+
 import torch.nn as nn
 import torch
 
+__pdoc__ = {}
+
+for clsn in ['DeepSurvivalMachinesTorch',
+             'DeepRecurrentSurvivalMachinesTorch']:
+  for membr in ['training', 'dump_patches']:
+
+    __pdoc__[clsn+'.'+membr] = False
+
 
 def create_representation(inputdim, layers, activation):
   """Helper function to generate the representation function for DSM.
@@ -54,6 +64,7 @@ def create_representation(inputdim, layers, activation):
   Returns
   ----------
   an MLP with torch.nn.Module with the specified structure.
+
   """
 
   if activation == 'ReLU6':
@@ -114,6 +125,7 @@ class DeepSurvivalMachinesTorch(nn.Module):
       a float in [0,1] that determines how to discount the tail bias
       from the uncensored instances.
       Default is 1.
+
   """
 
   def __init__(self, inputdim, k, layers=None, dist='Weibull',
@@ -148,7 +160,6 @@ def __init__(self, inputdim, k, layers=None, dist='Weibull',
       self.gate = nn.Sequential(nn.Linear(inputdim, k, bias=False))
       self.scaleg = nn.Sequential(nn.Linear(inputdim, k, bias=True))
       self.shapeg = nn.Sequential(nn.Linear(inputdim, k, bias=True))
-
     else:
       self.gate = nn.Sequential(nn.Linear(layers[-1], k, bias=False))
       self.scaleg = nn.Sequential(nn.Linear(layers[-1], k, bias=True))
@@ -160,6 +171,7 @@ def forward(self, x):
 
     Args:
       x: a torch.tensor of the input features.
+
     """
     xrep = self.embedding(x)
     return(self.act(self.shapeg(xrep))+self.shape.expand(x.shape[0], -1),
@@ -170,7 +182,7 @@ def get_shape_scale(self):
     return(self.shape, self.scale)
 
 
-class DeepRecurrentSurvivalMachinesTorch(DeepSurvivalMachinesTorch):
+class DeepRecurrentSurvivalMachinesTorch(nn.Module):
   """A Torch implementation of Deep Recurrent Survival Machines model.
 
   This is an implementation of Deep Recurrent Survival Machines model
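`create_representation`, documented earlier in this file, is easy to sanity-check on its own. A sketch of its shape contract; the layer widths below are arbitrary:

>>> import torch
>>> from dsm.dsm_torch import create_representation
>>> # a 2-hidden-layer MLP mapping 10 input features to a 32-dim embedding.
>>> mlp = create_representation(inputdim=10, layers=[64, 32],
...                             activation='ReLU6')
>>> mlp(torch.randn(8, 10)).shape
torch.Size([8, 32])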
@@ -208,12 +220,13 @@ class DeepRecurrentSurvivalMachinesTorch(DeepSurvivalMachinesTorch):
       a float in [0,1] that determines how to discount the tail bias
       from the uncensored instances.
       Default is 1.
+
   """
 
   def __init__(self, inputdim, k, typ='LSTM', layers=1,
                hidden=None, dist='Weibull',
                temp=1000., discount=1.0, optimizer='Adam'):
-    super(DeepSurvivalMachinesTorch, self).__init__()
+    super(DeepRecurrentSurvivalMachinesTorch, self).__init__()
 
     self.k = k
     self.dist = dist
@@ -259,6 +272,7 @@ def forward(self, x):
 
     Args:
       x: a torch.tensor of the input features.
+
     """
     x = x.detach().clone()
     inputmask = ~torch.isnan(x[:, :, 0]).reshape(-1)
diff --git a/dsm/losses.py b/dsm/losses.py
index 3a95a7a..26bb174 100644
--- a/dsm/losses.py
+++ b/dsm/losses.py
@@ -89,9 +89,11 @@ def unconditional_loss(model, t, e):
 
   if model.dist == 'Weibull':
     return _weibull_loss(model, t, e)
-
   elif model.dist == 'LogNormal':
     return _lognormal_loss(model, t, e)
+  else:
+    raise NotImplementedError('Distribution: '+model.dist+
+                              ' not implemented yet.')
 
 
 def _conditional_lognormal_loss(model, x, t, e, elbo=True):
@@ -199,9 +201,11 @@ def conditional_loss(model, x, t, e, elbo=True):
 
   if model.dist == 'Weibull':
     return _conditional_weibull_loss(model, x, t, e, elbo)
-
   elif model.dist == 'LogNormal':
     return _conditional_lognormal_loss(model, x, t, e, elbo)
+  else:
+    raise NotImplementedError('Distribution: '+model.dist+
+                              ' not implemented yet.')
 
 
 def _weibull_cdf(model, x, t_horizon):
@@ -282,3 +286,6 @@ def predict_cdf(model, x, t_horizon):
     return _weibull_cdf(model, x, t_horizon)
   if model.dist == 'LogNormal':
     return _lognormal_cdf(model, x, t_horizon)
+  else:
+    raise NotImplementedError('Distribution: '+model.dist+
+                              ' not implemented yet.')
\ No newline at end of file
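The three new `else` branches make the distribution dispatch total, so an unsupported distribution string now fails loudly instead of falling off the end of the function and returning `None`. A sketch of the new behaviour; `torch_model`, `x`, `t` and `e` are assumed from earlier context, and only the `dist` attribute is read before the raise:

>>> from dsm.losses import conditional_loss
>>> torch_model.dist = 'Exponential'  # any unsupported distribution string
>>> conditional_loss(torch_model, x, t, e)
Traceback (most recent call last):
  ...
NotImplementedError: Distribution: Exponential not implemented yet.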
diff --git a/dsm/utilities.py b/dsm/utilities.py
index 4c6da14..60687ea 100644
--- a/dsm/utilities.py
+++ b/dsm/utilities.py
@@ -29,6 +29,7 @@
 import numpy as np
 
 import gc
+import logging
 
 def get_optimizer(model, lr):
@@ -51,21 +52,19 @@ def pretrain_dsm(model, t_train, e_train, t_valid, e_valid,
   premodel.double()
 
   optimizer = torch.optim.Adam(premodel.parameters(), lr=lr)
+
   oldcost = float('inf')
   patience = 0
-
   costs = []
   for _ in tqdm(range(n_iter)):
+
     optimizer.zero_grad()
-
     loss = unconditional_loss(premodel, t_train, e_train)
     loss.backward()
     optimizer.step()
 
     valid_loss = unconditional_loss(premodel, t_valid, e_valid)
     valid_loss = valid_loss.detach().cpu().numpy()
-
     costs.append(valid_loss)
 
     if np.abs(costs[-1] - oldcost) < thres:
@@ -77,27 +76,43 @@ def pretrain_dsm(model, t_train, e_train, t_valid, e_valid,
   return premodel
 
 def _reshape_tensor_with_nans(data):
-  """Helper function to unroll padded RNN inputs"""
+  """Helper function to unroll padded RNN inputs."""
   data = data.reshape(-1)
   return data[~torch.isnan(data)]
 
+def _get_padded_features(x):
+  """Helper function to pad variable-length RNN inputs with NaNs."""
+  d = max([len(x_) for x_ in x])
+  padx = []
+  for i in range(len(x)):
+    pads = np.nan*np.ones((d - len(x[i]), x[i].shape[1]))
+    padx.append(np.concatenate([x[i], pads]))
+  return np.array(padx)
+
+def _get_padded_targets(t):
+  """Helper function to pad variable-length RNN targets with NaNs."""
+  d = max([len(t_) for t_ in t])
+  padt = []
+  for i in range(len(t)):
+    pads = np.nan*np.ones(d - len(t[i]))
+    padt.append(np.concatenate([t[i], pads]))
+  return np.array(padt)[:, :, np.newaxis]
+
 def train_dsm(model,
               x_train, t_train, e_train,
               x_valid, t_valid, e_valid,
               n_iter=10000, lr=1e-3, elbo=True,
               bs=100):
+  """Function to train the torch instance of the model."""
 
-  print('Pretraining the Underlying Distributions...')
-
-  print(t_train.shape, e_train.shape)
-
+  logging.info('Pretraining the Underlying Distributions...')
+  # For padded variable-length sequences, we first unroll the input and
+  # mask out the padded NaNs.
   t_train_ = _reshape_tensor_with_nans(t_train)
   e_train_ = _reshape_tensor_with_nans(e_train)
   t_valid_ = _reshape_tensor_with_nans(t_valid)
   e_valid_ = _reshape_tensor_with_nans(e_valid)
 
-  print(t_train_.shape, e_train_.shape)
-
   premodel = pretrain_dsm(model,
                           t_train_,
                           e_train_,
@@ -109,8 +124,6 @@ def train_dsm(model,
   model.shape.data.fill_(float(premodel.shape))
   model.scale.data.fill_(float(premodel.scale))
 
-  print(float(premodel.shape), float(premodel.scale))
-
   model.double()
 
   optimizer = torch.optim.Adam(model.parameters(), lr=lr)
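The two padding helpers added above define a simple shape contract, with NaN as the padding sentinel that `_reshape_tensor_with_nans` later strips. A self-contained check of that contract:

>>> import numpy as np
>>> from dsm.utilities import _get_padded_features, _get_padded_targets
>>> x = [np.ones((4, 3)), np.ones((2, 3))]
>>> _get_padded_features(x).shape   # (n_sequences, max_len, n_features)
(2, 4, 3)
>>> t = [np.ones(4), np.ones(2)]
>>> _get_padded_targets(t).shape    # a trailing singleton axis is appended
(2, 4, 1)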