Skip to content

Commit

Permalink
Simplify simulation and prepare for #310. (#312)
Browse files Browse the repository at this point in the history
  • Loading branch information
tobiasraabe authored Jan 15, 2020
1 parent 3de71f1 commit 84863f0
Show file tree
Hide file tree
Showing 9 changed files with 278 additions and 293 deletions.
6 changes: 3 additions & 3 deletions .azure-pipelines.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ jobs:

- bash: |
source activate respy
tox
tox -- -m "not slow"
displayName: Run all tests.
- job:
Expand All @@ -54,7 +54,7 @@ jobs:

- script: |
call activate respy
tox -e pytest
tox -e pytest -- -m "not slow"
displayName: Run pytest.
- job:
Expand Down Expand Up @@ -94,5 +94,5 @@ jobs:

- bash: |
source activate respy
tox -e pytest
tox -e pytest -- -m "not slow"
displayName: Run pytest.
22 changes: 11 additions & 11 deletions docs/getting_started/tutorial-simulation.ipynb

Large diffs are not rendered by default.

9 changes: 6 additions & 3 deletions respy/likelihood.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
from respy.shared import create_base_draws
from respy.shared import downcast_to_smallest_dtype
from respy.shared import generate_column_dtype_dict_for_estimation
from respy.shared import rename_labels
from respy.shared import rename_labels_to_internal
from respy.solve import solve_with_backward_induction
from respy.state_space import StateSpace

Expand Down Expand Up @@ -499,8 +499,11 @@ def _process_estimation_data(df, state_space, optim_paras, options):
"""
col_dtype = generate_column_dtype_dict_for_estimation(optim_paras)

df = df.sort_index()[list(col_dtype)[2:]]
df = df.rename(columns=rename_labels).rename_axis(index=rename_labels)
df = (
df.sort_index()[list(col_dtype)[2:]]
.rename(columns=rename_labels_to_internal)
.rename_axis(index=rename_labels_to_internal)
)
df = convert_labeled_variables_to_codes(df, optim_paras)

# Get indices of states in the state space corresponding to all observations for all
Expand Down
99 changes: 56 additions & 43 deletions respy/shared.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
import from respy itself. This is to prevent circular imports.
"""
import copy

import chaospy as cp
import numba as nb
import numpy as np
Expand Down Expand Up @@ -107,8 +109,8 @@ def create_base_draws(shape, seed, monte_carlo_sequence):
University Press.*
"""
n_choices = shape[2]
n_points = shape[0] * shape[1]
n_choices = shape[-1]
n_points = np.prod(shape[:-1])

np.random.seed(seed)

Expand Down Expand Up @@ -151,8 +153,8 @@ def transform_base_draws_with_cholesky_factor(draws, shocks_cholesky, n_wages):
"""
draws_transformed = draws.dot(shocks_cholesky.T)
draws_transformed[:, :, :n_wages] = np.exp(
np.clip(draws_transformed[:, :, :n_wages], MIN_LOG_FLOAT, MAX_LOG_FLOAT)
draws_transformed[..., :n_wages] = np.exp(
np.clip(draws_transformed[..., :n_wages], MIN_LOG_FLOAT, MAX_LOG_FLOAT)
)

return draws_transformed
Expand All @@ -179,25 +181,6 @@ def generate_column_dtype_dict_for_estimation(optim_paras):
return column_dtype_dict


def generate_column_dtype_dict_for_simulation(optim_paras):
    """Generate column labels for simulation output."""
    # The simulation output is a superset of the estimation columns.
    column_dtypes = dict(generate_column_dtype_dict_for_estimation(optim_paras))

    extra_labels = []
    # A "Type" column only exists when the model has more than one type.
    if optim_paras["n_types"] >= 2:
        extra_labels.append("Type")

    choices = optim_paras["choices"]
    for prefix, subset in [
        ("Nonpecuniary_Reward", choices),
        ("Wage", optim_paras["choices_w_wage"]),
        ("Flow_Utility", choices),
        ("Value_Function", choices),
        ("Shock_Reward", choices),
    ]:
        extra_labels.extend(f"{prefix}_{choice.title()}" for choice in subset)
    extra_labels.append("Discount_Rate")

    # Simulation columns override estimation dtypes on overlap; "Type" is the
    # only integer column among the additions.
    for label in extra_labels:
        column_dtypes[label] = int if label == "Type" else float

    return column_dtypes


@nb.njit
def clip(x, minimum=None, maximum=None):
"""Clip input array at minimum and maximum."""
Expand All @@ -214,37 +197,48 @@ def clip(x, minimum=None, maximum=None):
return out


def downcast_to_smallest_dtype(series):
def downcast_to_smallest_dtype(series, downcast_options=None):
"""Downcast the dtype of a :class:`pandas.Series` to the lowest possible dtype.
By default, variables are converted to signed or unsigned integers. Use ``"float"``
to cast variables from ``float64`` to ``float32``.
Be aware that NumPy integers silently overflow which is why conversion to low dtypes
should be done after calculations. For example, using :class:`np.uint8` for an array
and squaring the elements leads to silent overflows for numbers higher than 255.
should be done after calculations. For example, using :class:`numpy.uint8` for an
array and squaring the elements leads to silent overflows for numbers higher than
255.
For more information on the boundaries the NumPy documentation under
For more information on the dtype boundaries see the NumPy documentation under
https://docs.scipy.org/doc/numpy-1.17.0/user/basics.types.html.
"""
# We can skip integer as "unsigned" and "signed" will find the same dtypes.
_downcast_options = ["unsigned", "signed", "float"]
if downcast_options is None:
downcast_options = ["unsigned", "signed"]

if series.dtype.name == "category":
out = series

# Convert bools to integers because they turn the dot product in
# `create_choice_rewards` to the object dtype.
elif series.dtype == np.bool:
out = series.astype(np.dtype("uint8"))

else:
min_dtype = np.dtype("float64")
min_dtype = series.dtype

for dc_opt in _downcast_options:
dtype = pd.to_numeric(series, downcast=dc_opt).dtype
for dc_opt in downcast_options:
try:
dtype = pd.to_numeric(series, downcast=dc_opt).dtype
# A ValueError happens if strings are found in the series.
except ValueError:
min_dtype = "category"
break

if dtype.itemsize == 1 and dtype.name.startswith("u"):
# If we can convert the series to an unsigned integer, we can stop.
if dtype.name.startswith("u"):
min_dtype = dtype
break
elif dtype.itemsize == min_dtype.itemsize and dtype.name.startswith("u"):
min_dtype = dtype
elif dtype.itemsize < min_dtype.itemsize:
min_dtype = dtype
else:
Expand Down Expand Up @@ -282,15 +276,29 @@ def create_base_covariates(states, covariates_spec, raise_errors=True):
"""
covariates = states.copy()

for covariate, definition in covariates_spec.items():
if covariate not in states.columns:
try:
covariates[covariate] = covariates.eval(definition)
except pd.core.computation.ops.UndefinedVariableError as e:
if raise_errors:
raise e
else:
has_covariates_left_changed = True
covariates_left = list(covariates_spec)

while has_covariates_left_changed:
n_covariates_left = len(covariates_left)

# Create a copy of `covariates_left` to remove elements without side-effects.
for covariate in copy.copy(covariates_left):
# Check if the covariate does not exist and needs to be computed.
is_covariate_missing = covariate not in covariates.columns

if is_covariate_missing:
try:
covariates[covariate] = covariates.eval(covariates_spec[covariate])
except pd.core.computation.ops.UndefinedVariableError:
pass
else:
covariates_left.remove(covariate)

has_covariates_left_changed = n_covariates_left != len(covariates_left)

if covariates_left and raise_errors:
raise Exception(f"Cannot compute all covariates: {covariates_left}.")

covariates = covariates.drop(columns=states.columns)

Expand Down Expand Up @@ -323,11 +331,16 @@ def convert_labeled_variables_to_codes(df, optim_paras):
return df


def rename_labels_to_internal(x):
    """Shorten a label and convert it to lower-case.

    For example, ``"Experience_A"`` becomes ``"exp_a"``. This is the inverse
    of :func:`rename_labels_from_internal`.

    """
    return x.replace("Experience", "exp").lower()

def rename_labels_from_internal(x):
    """Expand a label and convert it to title-case.

    For example, ``"exp_a"`` becomes ``"Experience_A"``. This is the inverse
    of :func:`rename_labels_to_internal`.

    """
    return x.replace("exp", "Experience").title()


def normalize_probabilities(probabilities):
"""Normalize probabilities such that their sum equals one.
Expand Down
Loading

0 comments on commit 84863f0

Please sign in to comment.