Merge pull request neuralhydrology#87 from neuralhydrology/staging

Version update 1.3.0
grey-nearing · Apr 29, 2022 · a4c284b · a4c284b
2 parents 6d0b722 + 5141399
commit a4c284b
Show file tree

Hide file tree

Showing 26 changed files with 1,250 additions and 40 deletions.
diff --git a/docs/source/api/neuralhydrology.datasetzoo.camelsaus.rst b/docs/source/api/neuralhydrology.datasetzoo.camelsaus.rst
@@ -0,0 +1,7 @@
+CamelsAUS
+=========
+
+.. automodule:: neuralhydrology.datasetzoo.camelsaus
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/docs/source/api/neuralhydrology.datasetzoo.camelsbr.rst b/docs/source/api/neuralhydrology.datasetzoo.camelsbr.rst
@@ -0,0 +1,7 @@
+CamelsBR
+========
+
+.. automodule:: neuralhydrology.datasetzoo.camelsbr
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/docs/source/api/neuralhydrology.datasetzoo.lamah.rst b/docs/source/api/neuralhydrology.datasetzoo.lamah.rst
@@ -0,0 +1,7 @@
+LamaH
+=====
+
+.. automodule:: neuralhydrology.datasetzoo.lamah
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/docs/source/api/neuralhydrology.datasetzoo.rst b/docs/source/api/neuralhydrology.datasetzoo.rst
@@ -10,9 +10,12 @@ nh.datasetzoo
    :maxdepth: 4
 
    neuralhydrology.datasetzoo.basedataset
+   neuralhydrology.datasetzoo.camelsaus
+   neuralhydrology.datasetzoo.camelsbr
    neuralhydrology.datasetzoo.camelscl
    neuralhydrology.datasetzoo.camelsgb
    neuralhydrology.datasetzoo.camelsus
    neuralhydrology.datasetzoo.genericdataset
    neuralhydrology.datasetzoo.hourlycamelsus
+   neuralhydrology.datasetzoo.lamah
    neuralhydrology.datasetzoo.template
diff --git a/docs/source/api/neuralhydrology.modelzoo.arlstm.rst b/docs/source/api/neuralhydrology.modelzoo.arlstm.rst
@@ -0,0 +1,7 @@
+ARLSTM
+======
+
+.. automodule:: neuralhydrology.modelzoo.arlstm
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/docs/source/api/neuralhydrology.modelzoo.rst b/docs/source/api/neuralhydrology.modelzoo.rst
@@ -9,6 +9,7 @@ nh.modelzoo
 .. toctree::
    :maxdepth: 4
 
+   neuralhydrology.modelzoo.arlstm
    neuralhydrology.modelzoo.basemodel
    neuralhydrology.modelzoo.cudalstm
    neuralhydrology.modelzoo.customlstm

diff --git a/docs/source/api/neuralhydrology.utils.rst b/docs/source/api/neuralhydrology.utils.rst
@@ -13,3 +13,4 @@ nh.utils
    neuralhydrology.utils.config
    neuralhydrology.utils.configutils
    neuralhydrology.utils.errors
+   neuralhydrology.utils.samplingutils
diff --git a/docs/source/api/neuralhydrology.utils.samplingutils.rst b/docs/source/api/neuralhydrology.utils.samplingutils.rst
@@ -0,0 +1,7 @@
+samplingutils
+=============
+
+.. automodule:: neuralhydrology.utils.samplingutils
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/docs/source/usage/config.rst b/docs/source/usage/config.rst
@@ -388,10 +388,16 @@ Data settings
 -------------
 
 -  ``dataset``: Defines which data set will be used. Currently supported
-   are ``camels_us`` (CAMELS data set by Newman et al.), ``CAMELS_GB``
-   (the GB version of CAMELS by Coxon et al.), ``CAMELS_CL`` (the CL
-   version of CAMELS by Alvarez-Garreton et al.), and 
-   ``hourly_camels_us`` (hourly data for 516 CAMELS basins).
+   are ``camels_us`` (`CAMELS (US) data set by Newman et al. <https://hess.copernicus.org/articles/19/209/2015/>`__), 
+   ``camels_gb`` (`CAMELS-GB by Coxon et al. <https://essd.copernicus.org/articles/12/2459/2020/>`__), 
+   ``camels_cl`` (`CAMELS-CL by Alvarez-Garreton et al. <https://hess.copernicus.org/articles/22/5817/2018/>`__), 
+   ``camels_br`` (`CAMELS-BR by Chagas et al. <https://essd.copernicus.org/articles/12/2075/2020>`__),
+   ``camels_aus`` (`CAMELS-AUS by Fowler et al. <https://essd.copernicus.org/articles/13/3847/2021/>`__),  
+   ``lamah_{a,b,c}`` (`LamaH-CE by Klingler et al. <https://essd.copernicus.org/articles/13/4529/2021/>`__), 
+   ``hourly_camels_us`` (hourly forcing and streamflow data for 516 CAMELS (US) basins, published 
+   by `Gauch et al. <https://hess.copernicus.org/articles/25/2045/2021/>`__), 
+   and ``generic`` (can be used with any dataset that is stored in a specific format, 
+   see :py:class:`documentation <neuralhydrology.datasetzoo.genericdataset.GenericDataset>` for further information).
 
 -  ``data_dir``: Full or relative path to the root directory of the data set.
 
@@ -451,6 +457,29 @@ Data settings
    `pandas shift <https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.shift.html>`__
    for details). If a list of integers is provided, only unique values are considered.
    We append ``_shiftN`` to each lagged feature, where `N` is the shift count.
+
+-  ``autoregressive_inputs``: Currently, only one autoregressive input is allowed,
+   and only one output feature is allowed in an autoregressive model.
+   This is a list of target feature(s) to be used as model inputs. These 
+   will be lagged by some number of timesteps > 0, and therefore must appear in the list
+   of ``lagged_features``. Autoregressive inputs are appended to the end of the dynamic 
+   features list when building the dataset(s). Missing data is supported in autoregressive 
+   inputs. During runtime, autoregressive models append binary flags as inputs to indicate
+   missing data. Autoregressive inputs only work with models that support autoregression
+   and will throw an error if they are included in a config file for a model that does
+   not support autoregression. Leave empty if none should be used. 
+
+-  ``random_holdout_from_dynamic_features``: Dictionary to define timeseries
+   features to remove random sections of data from. This allows for conducting
+   certain types of missing data analyses. Keys of this dictionary must match 
+   exact names of dynamic inputs as defined in the data set. Values are a dict
+   with keys "missing_fraction" and "mean_missing_length", and values that are 
+   float and float, respectively, representing (1) the long-term
+   fraction of data to be randomly removed from a given feature, and (2) the
+   expected value of the length of continuous subsequences removed from the 
+   timeseries. These two distribution parameters do not consider whether there
+   are any NaN's in the original timeseries. Only works for timeseries features
+   (inputs and targets). Leave empty if none should be used. 
 
 -  ``custom_normalization``: Has to be a dictionary, mapping from
    time series feature names to ``centering`` and/or ``scaling``. Using

diff --git a/docs/source/usage/models.rst b/docs/source/usage/models.rst
@@ -41,6 +41,18 @@ BaseModel
 ^^^^^^^^^
 Abstract base class from which all models derive. Do not use this class for model training.
 
+ARLSTM
+^^^^^^
+:py:class:`neuralhydrology.modelzoo.arlstm.ARLSTM` is an autoregressive long short term memory network (LSTM)
+that assumes one input is a time-lagged version of the output. All features (``x_d``, ``x_s``, ``x_one_hot``) 
+are concatenated and passed to the timeseries network at each time step, along with a binary flag that indicates 
+whether the autoregressive input (i.e., lagged target data) is missing (False) or present (True). The length of
+the autoregressive lag can be specified in the config file by specifying the lag on the autoregressive input.
+Any missing data in the autoregressive inputs is imputed with appropriately lagged model output, and gradients are 
+calculated through this imputation during backpropagation. Only one autoregressive input is currently supported, 
+and it is assumed that this is the last variable in the ``x_d`` vector. This model uses a standard pytorch LSTM 
+cell, but only runs the optimized LSTM one timestep at a time, and is therefore significantly slower than the CudaLSTM.  
+
 CudaLSTM
 ^^^^^^^^
 :py:class:`neuralhydrology.modelzoo.cudalstm.CudaLSTM` is a network using the standard PyTorch LSTM implementation.

diff --git a/neuralhydrology/__about__.py b/neuralhydrology/__about__.py
@@ -1 +1 @@
-__version__ = "1.2.5"
+__version__ = "1.3.0"
diff --git a/neuralhydrology/datasetzoo/__init__.py b/neuralhydrology/datasetzoo/__init__.py
@@ -1,9 +1,12 @@
 from neuralhydrology.datasetzoo.basedataset import BaseDataset
+from neuralhydrology.datasetzoo.camelsaus import CamelsAUS
+from neuralhydrology.datasetzoo.camelsbr import CamelsBR
 from neuralhydrology.datasetzoo.camelscl import CamelsCL
 from neuralhydrology.datasetzoo.camelsgb import CamelsGB
 from neuralhydrology.datasetzoo.camelsus import CamelsUS
 from neuralhydrology.datasetzoo.genericdataset import GenericDataset
 from neuralhydrology.datasetzoo.hourlycamelsus import HourlyCamelsUS
+from neuralhydrology.datasetzoo.lamah import LamaH
 from neuralhydrology.utils.config import Config
 
 
@@ -16,7 +19,9 @@ def get_dataset(cfg: Config,
                 scaler: dict = {}) -> BaseDataset:
     """Get data set instance, depending on the run configuration.
 
-    Currently implemented datasets are 'camels_cl', 'camels_gb', 'camels_us' and 'hourly_camels_us'
+    Currently implemented datasets are 'camels_aus', 'camels_br', 'camels_cl', 'camels_gb', 'camels_us',
+    'hourly_camels_us', and 'lamah_{a,b,c}', as well as the 'generic' dataset class that can be used for any kind of
+    dataset as long as it is in the correct format.
 
     Parameters
     ----------
@@ -57,12 +62,18 @@ def get_dataset(cfg: Config,
         Dataset = CamelsUS
     elif cfg.dataset.lower() == "camels_gb":
         Dataset = CamelsGB
+    elif cfg.dataset.lower() == "camels_aus":
+        Dataset = CamelsAUS
+    elif cfg.dataset.lower() == "camels_br":
+        Dataset = CamelsBR
     elif cfg.dataset.lower() == "hourly_camels_us":
         Dataset = HourlyCamelsUS
     elif cfg.dataset.lower() == "camels_cl":
         Dataset = CamelsCL
     elif cfg.dataset.lower() == "generic":
         Dataset = GenericDataset
+    elif cfg.dataset.lower() in ["lamah_a", "lamah_b", "lamah_c"]:
+        Dataset = LamaH
     else:
         raise NotImplementedError(f"No dataset class implemented for dataset {cfg.dataset}")
 

diff --git a/neuralhydrology/datasetzoo/basedataset.py b/neuralhydrology/datasetzoo/basedataset.py
@@ -1,5 +1,6 @@
 import logging
 import pickle
+import re
 import sys
 import warnings
 from collections import defaultdict
@@ -19,6 +20,7 @@
 
 from neuralhydrology.datautils import utils
 from neuralhydrology.utils.config import Config
+from neuralhydrology.utils import samplingutils
 
 LOGGER = logging.getLogger(__name__)
 
@@ -236,6 +238,11 @@ def _duplicate_features(self, df: pd.DataFrame) -> pd.DataFrame:
         return df
 
     def _add_lagged_features(self, df: pd.DataFrame) -> pd.DataFrame:
+
+        # check that all autoregressive inputs are contained in the list of shifted variables
+        self._check_autoregressive_inputs()
+
+        # create the shifted variables, as requested
         for feature, shift in self.cfg.lagged_features.items():
             if isinstance(shift, list):
                 # only consider unique shift values, otherwise we have columns with identical names
@@ -245,15 +252,30 @@ def _add_lagged_features(self, df: pd.DataFrame) -> pd.DataFrame:
                 df[f"{feature}_shift{shift}"] = df[feature].shift(periods=shift, freq="infer")
             else:
                 raise ValueError("The value of the 'lagged_features' arg must be either an int or a list of ints")
+
         return df
 
+    def _check_autoregressive_inputs(self):
+        """Check that every autoregressive input refers to a configured lagged feature.
+
+        Each entry of ``cfg.autoregressive_inputs`` must have the form ``<variable>_shift<lag>``, and ``<lag>``
+        must be one of the shifts configured for ``<variable>`` in ``cfg.lagged_features``.
+
+        Raises
+        ------
+        ValueError
+            If an autoregressive input does not have the form ``<variable>_shift<lag>``, or if the variable/lag
+            combination is not contained in ``cfg.lagged_features``.
+        """
+        # The dataset requires that AR inputs be lagged features, however in general when constructing the dataset
+        # we do not care whether these are lagged targets, specifically. The requirement that AR inputs be lagged
+        # targets, although typical for AR models, is not strictly required and depends on how these features are
+        # used in any particular model.
+        shift_pattern = re.compile(r'^(.*)_shift(\d+)$')
+        for ar_input in self.cfg.autoregressive_inputs:
+            capture = shift_pattern.search(ar_input)
+            if not capture:
+                # adjacent string literals are joined into one message string
+                raise ValueError('Autoregressive inputs must be a shifted variable with form <variable>_shift<lag> '
+                                 f'where <lag> is an integer. Instead got: {ar_input}.')
+            feature, lag = capture[1], int(capture[2])
+            if feature not in self.cfg.lagged_features:
+                raise ValueError(f'Autoregressive inputs must be in the list of "lagged_features". Got: {ar_input}.')
+            # a lagged feature may be configured with a single int shift or with a list of int shifts
+            shifts = self.cfg.lagged_features[feature]
+            if isinstance(shifts, int):
+                shifts = [shifts]
+            if lag not in shifts:
+                raise ValueError(f'Autoregressive inputs must be in the list of "lagged_features". Got: {ar_input}.')
+
+
     def _load_or_create_xarray_dataset(self) -> xarray.Dataset:
         # if no netCDF file is passed, data set is created from raw basin files
         if (self.cfg.train_data_file is None) or (not self.is_train):
             data_list = []
 
             # list of columns to keep, everything else will be removed to reduce memory footprint
-            keep_cols = self.cfg.target_variables + self.cfg.evolving_attributes + self.cfg.mass_inputs
+            keep_cols = self.cfg.target_variables + self.cfg.evolving_attributes + self.cfg.mass_inputs + self.cfg.autoregressive_inputs
 
             if isinstance(self.cfg.dynamic_inputs, list):
                 keep_cols += self.cfg.dynamic_inputs
@@ -289,7 +311,15 @@ def _load_or_create_xarray_dataset(self) -> xarray.Dataset:
                     ]
                     raise KeyError("".join(msg))
 
-                # make end_date the last second of the specified day, such that the
+                # remove random portions of the timeseries of dynamic features
+                for holdout_variable,  holdout_dict in self.cfg.random_holdout_from_dynamic_features.items():
+                    df[holdout_variable] = samplingutils.bernoulli_subseries_sampler(
+                        data=df[holdout_variable].values,
+                        missing_fraction=holdout_dict['missing_fraction'], 
+                        mean_missing_length=holdout_dict['mean_missing_length'],
+                    )
+
+                # Make end_date the last second of the specified day, such that the
                 # dataset will include all hours of the last day, not just 00:00.
                 start_dates = self.dates[basin]["start_dates"]
                 end_dates = [date + pd.Timedelta(days=1, seconds=-1) for date in self.dates[basin]["end_dates"]]
@@ -427,8 +457,12 @@ def _create_lookup_table(self, xr: xarray.Dataset):
                 else:
                     dynamic_cols = self.cfg.mass_inputs + self.cfg.dynamic_inputs[freq]
 
-                df_resampled = df_native[dynamic_cols + self.cfg.target_variables +
-                                         self.cfg.evolving_attributes].resample(freq).mean()
+                df_resampled = df_native[
+                    dynamic_cols + self.cfg.target_variables + 
+                    self.cfg.evolving_attributes + self.cfg.autoregressive_inputs
+                ].resample(freq).mean()
+
+                # pull all of the data that needs to be validated
                 x_d[freq] = df_resampled[dynamic_cols].values
                 y[freq] = df_resampled[self.cfg.target_variables].values
                 if self.cfg.evolving_attributes:
@@ -459,6 +493,14 @@ def _create_lookup_table(self, xr: xarray.Dataset):
                                         frequency_maps=[frequency_maps[freq] for freq in self.frequencies],
                                         seq_length=self.seq_len,
                                         predict_last_n=self._predict_last_n)
+
+            # Concatenate autoregressive columns to dynamic inputs *after* validation, so as to not remove
+            # samples with missing autoregressive inputs. 
+            # AR inputs must go at the end of the df/array (this is assumed by the AR model).
+            if self.cfg.autoregressive_inputs:
+                for freq in self.frequencies:
+                    x_d[freq] = np.concatenate([x_d[freq], df_resampled[self.cfg.autoregressive_inputs].values], axis=1)
+
             valid_samples = np.argwhere(flag == 1)
             for f in valid_samples:
                 # store pointer to basin and the sample's index in each frequency