diff --git a/README.md b/README.md
index b688c969c..c3856f4cc 100644
--- a/README.md
+++ b/README.md
@@ -88,6 +88,8 @@ Such questions arise frequently in customer segmentation (what is the effect of
 
 # News
 
+**January 10, 2020:** Release v0.6.1, see release notes [here](https://github.com/Microsoft/EconML/releases/tag/v0.6.1)
+
 **December 6, 2019:** Release v0.6, see release notes [here](https://github.com/Microsoft/EconML/releases/tag/v0.6)
 
 **November 21, 2019:** Release v0.5, see release notes [here](https://github.com/Microsoft/EconML/releases/tag/v0.5).
diff --git a/econml/drlearner.py b/econml/drlearner.py
index 829cf4e13..e30a0f416 100644
--- a/econml/drlearner.py
+++ b/econml/drlearner.py
@@ -29,15 +29,14 @@
 import numpy as np
 from warnings import warn
+
+from sklearn.base import clone
 from sklearn.linear_model import LogisticRegressionCV, LinearRegression, LassoCV
-from econml.utilities import inverse_onehot, check_high_dimensional
-from econml.sklearn_extensions.linear_model import WeightedLassoCV, DebiasedLasso
+from econml.utilities import inverse_onehot, check_high_dimensional, StatsModelsLinearRegression
+from econml.sklearn_extensions.linear_model import WeightedLassoCVWrapper, DebiasedLasso
 from econml.sklearn_extensions.ensemble import SubsampledHonestForest
-from sklearn.base import clone
 from econml._ortho_learner import _OrthoLearner
 from econml.cate_estimator import StatsModelsCateEstimatorDiscreteMixin, DebiasedLassoCateEstimatorDiscreteMixin
-from econml.utilities import StatsModelsLinearRegression
-from sklearn.preprocessing import PolynomialFeatures
 from econml.inference import GenericModelFinalInferenceDiscrete
@@ -243,7 +242,7 @@ class takes as input the parameter ``model_regressor``, which is an arbitrary sc
     """
     def __init__(self,
                  model_propensity=LogisticRegressionCV(cv=3, solver='lbfgs', multi_class='auto'),
-                 model_regression=WeightedLassoCV(cv=3),
+                 model_regression=WeightedLassoCVWrapper(cv=3),
                  model_final=StatsModelsLinearRegression(),
                  multitask_model_final=False,
                  featurizer=None,
@@ -258,8 +257,9 @@ def _combine(self, X, W):
                 return np.hstack([arr for arr in [X, W] if arr is not None])
 
             def fit(self, Y, T, X=None, W=None, *, sample_weight=None):
-                # TODO Allow for non-vector y, i.e. of shape (n, 1)
-                assert np.ndim(Y) == 1, "Can only accept single dimensional outcomes Y! Use Y.ravel()."
+                if Y.ndim != 1 and (Y.ndim != 2 or Y.shape[1] != 1):
+                    raise ValueError("The outcome matrix must be of shape ({0}, ) or ({0}, 1), "
+                                     "instead got {1}.".format(len(Y), Y.shape))
                 if (X is None) and (W is None):
                     raise AttributeError("At least one of X or W has to not be None!")
                 if np.any(np.all(T == 0, axis=0)) or (not np.any(np.all(T == 0, axis=1))):
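Note: the hunk above replaces a hard assert with an explicit ValueError and, at the same time, widens the accepted outcome shapes from strictly flat (n,) vectors to column vectors (n, 1). A minimal standalone sketch of that shape contract; the `_check_outcome_shape` helper is hypothetical and only mirrors the new validation:

```python
import numpy as np

def _check_outcome_shape(Y):
    # Hypothetical helper mirroring the new check: accept (n,) or (n, 1), reject wider Y.
    if Y.ndim != 1 and (Y.ndim != 2 or Y.shape[1] != 1):
        raise ValueError("The outcome matrix must be of shape ({0}, ) or ({0}, 1), "
                         "instead got {1}.".format(len(Y), Y.shape))

_check_outcome_shape(np.zeros(5))       # accepted: flat vector
_check_outcome_shape(np.zeros((5, 1)))  # accepted: singleton second dimension (new in this change)
try:
    _check_outcome_shape(np.zeros((5, 2)))
except ValueError as err:
    print(err)                          # truly multi-output Y is still rejected
```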
@@ -274,16 +274,17 @@ def fit(self, Y, T, X=None, W=None, *, sample_weight=None):
 
             def predict(self, Y, T, X=None, W=None, *, sample_weight=None):
                 XW = self._combine(X, W)
                 propensities = self._model_propensity.predict_proba(XW)
+                n = T.shape[0]
                 Y_pred = np.zeros((T.shape[0], T.shape[1] + 1))
                 T_counter = np.zeros(T.shape)
-                Y_pred[:, 0] = self._model_regression.predict(np.hstack([XW, T_counter]))
-                Y_pred[:, 0] += (Y - Y_pred[:, 0]) * np.all(T == 0, axis=1) / propensities[:, 0]
+                Y_pred[:, 0] = self._model_regression.predict(np.hstack([XW, T_counter])).reshape(n)
+                Y_pred[:, 0] += (Y.reshape(n) - Y_pred[:, 0]) * np.all(T == 0, axis=1) / propensities[:, 0]
                 for t in np.arange(T.shape[1]):
                     T_counter = np.zeros(T.shape)
                     T_counter[:, t] = 1
-                    Y_pred[:, t + 1] = self._model_regression.predict(np.hstack([XW, T_counter]))
-                    Y_pred[:, t + 1] += (Y - Y_pred[:, t + 1]) * (T[:, t] == 1) / propensities[:, t + 1]
-                return Y_pred
+                    Y_pred[:, t + 1] = self._model_regression.predict(np.hstack([XW, T_counter])).reshape(n)
+                    Y_pred[:, t + 1] += (Y.reshape(n) - Y_pred[:, t + 1]) * (T[:, t] == 1) / propensities[:, t + 1]
+                return Y_pred.reshape(Y.shape + (T.shape[1] + 1,))
 
         class ModelFinal:
             # Coding Remark: The reasoning around the multitask_model_final could have been simplified if
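Note: `ModelNuisance.predict` above builds, for every arm `t` (including the baseline in column 0), the doubly robust pseudo-outcome: the regression prediction under `t`, plus an inverse-propensity-weighted residual for the units actually assigned to `t`. A small numpy sketch of that construction for a single arm, with `y_hat` and `propensity` as stand-ins for the fitted nuisance models:

```python
import numpy as np

rng = np.random.default_rng(0)
n = 8
y = rng.normal(size=n)          # observed outcome
treated = rng.random(n) < 0.5   # indicator that the unit received arm t
y_hat = rng.normal(size=n)      # stand-in for the regression prediction under arm t
propensity = np.full(n, 0.5)    # stand-in for P[T = t | X, W]

# Doubly robust pseudo-outcome, as computed line by line in the hunk above:
y_dr = y_hat + (y - y_hat) * treated / propensity

# Units not assigned to arm t keep the pure regression prediction:
assert np.allclose(y_dr[~treated], y_hat[~treated])
```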
@@ -299,36 +300,45 @@ def __init__(self, model_final, featurizer, multitask_model_final):
 
             def fit(self, Y, T, X=None, W=None, *, nuisances, sample_weight=None, sample_var=None):
                 Y_pred, = nuisances
+                self.d_y = Y_pred.shape[1:-1]  # track whether there's a Y dimension (must be a singleton)
                 if (X is not None) and (self._featurizer is not None):
                     X = self._featurizer.fit_transform(X)
                 filtered_kwargs = _filter_none_kwargs(sample_weight=sample_weight, sample_var=sample_var)
                 if self._multitask_model_final:
-                    self.model_cate = self._model_final.fit(X, Y_pred[:, 1:] - Y_pred[:, [0]], **filtered_kwargs)
+                    ys = Y_pred[..., 1:] - Y_pred[..., [0]]  # subtract control results from each other arm
+                    if self.d_y:  # need to squeeze out singleton so that we fit on 2D array
+                        ys = ys.squeeze(1)
+                    self.model_cate = self._model_final.fit(X, ys, **filtered_kwargs)
                 else:
-                    self.models_cate = [clone(self._model_final, safe=False).fit(X, Y_pred[:, t] - Y_pred[:, 0],
+                    self.models_cate = [clone(self._model_final, safe=False).fit(X, Y_pred[..., t] - Y_pred[..., 0],
                                                                                  **filtered_kwargs)
-                                        for t in np.arange(1, Y_pred.shape[1])]
+                                        for t in np.arange(1, Y_pred.shape[-1])]
                 return self
 
             def predict(self, X=None):
                 if (X is not None) and (self._featurizer is not None):
                     X = self._featurizer.transform(X)
                 if self._multitask_model_final:
-                    return self.model_cate.predict(X)
+                    pred = self.model_cate.predict(X)
+                    if self.d_y:  # need to reintroduce singleton Y dimension
+                        return pred[:, np.newaxis, :]
+                    return pred
                 else:
-                    return np.array([mdl.predict(X) for mdl in self.models_cate]).T
+                    preds = np.array([mdl.predict(X) for mdl in self.models_cate])
+                    return np.moveaxis(preds, 0, -1)  # move treatment dim to end
 
             def score(self, Y, T, X=None, W=None, *, nuisances, sample_weight=None, sample_var=None):
                 if (X is not None) and (self._featurizer is not None):
                     X = self._featurizer.transform(X)
                 Y_pred, = nuisances
                 if self._multitask_model_final:
-                    return np.mean(np.average((Y_pred[:, 1:] - Y_pred[:, [0]] - self.model_cate.predict(X))**2,
+                    return np.mean(np.average((Y_pred[..., 1:] - Y_pred[..., [0]] - self.model_cate.predict(X))**2,
                                               weights=sample_weight, axis=0))
                 else:
-                    return np.mean([np.average((Y_pred[:, t] - Y_pred[:, 0] - self.models_cate[t - 1].predict(X))**2,
+                    return np.mean([np.average((Y_pred[..., t] - Y_pred[..., 0] -
+                                                self.models_cate[t - 1].predict(X))**2,
                                                weights=sample_weight, axis=0)
-                                    for t in np.arange(1, Y_pred.shape[1])])
+                                    for t in np.arange(1, Y_pred.shape[-1])])
 
         self._multitask_model_final = multitask_model_final
         super().__init__(ModelNuisance(model_propensity, model_regression),
@@ -615,7 +625,7 @@ class LinearDRLearner(StatsModelsCateEstimatorDiscreteMixin, DRLearner):
 
     def __init__(self,
                  model_propensity=LogisticRegressionCV(cv=3, solver='lbfgs', multi_class='auto'),
-                 model_regression=WeightedLassoCV(cv=3),
+                 model_regression=WeightedLassoCVWrapper(cv=3),
                  featurizer=None,
                  fit_cate_intercept=True,
                  n_splits=2, random_state=None):
@@ -806,7 +816,7 @@ class SparseLinearDRLearner(DebiasedLassoCateEstimatorDiscreteMixin, DRLearner):
 
     def __init__(self,
                  model_propensity=LogisticRegressionCV(cv=3, solver='lbfgs', multi_class='auto'),
-                 model_regression=WeightedLassoCV(cv=3),
+                 model_regression=WeightedLassoCVWrapper(cv=3),
                  featurizer=None,
                  fit_cate_intercept=True,
                  alpha='auto',
diff --git a/econml/inference.py b/econml/inference.py
index 597107dc7..50e8bc283 100644
--- a/econml/inference.py
+++ b/econml/inference.py
@@ -221,19 +221,19 @@ def const_marginal_effect_interval(self, X, *, alpha=0.1):
         if (X is not None) and (self.featurizer is not None):
             X = self.featurizer.fit_transform(X)
         preds = np.array([mdl.predict_interval(X, alpha=alpha) for mdl in self.fitted_models_final])
-        return tuple([preds[:, 0, :].T, preds[:, 1, :].T])
+        return tuple(np.moveaxis(preds, [0, 1], [-1, 0]))  # send treatment to the end, pull bounds to the front
 
     def effect_interval(self, X, *, T0, T1, alpha=0.1):
         X, T0, T1 = self._est._expand_treatments(X, T0, T1)
         if np.any(np.any(T0 > 0, axis=1)):
             raise AttributeError("Can only calculate intervals of effects with respect to baseline treatment!")
-        ind = (T1 @ np.arange(1, T1.shape[1] + 1)).astype(int)
+        ind = inverse_onehot(T1)
         lower, upper = self.const_marginal_effect_interval(X, alpha=alpha)
-        lower = np.hstack([np.zeros((lower.shape[0], 1)), lower])
-        upper = np.hstack([np.zeros((upper.shape[0], 1)), upper])
+        lower = np.concatenate([np.zeros(lower.shape[0:-1] + (1,)), lower], -1)
+        upper = np.concatenate([np.zeros(upper.shape[0:-1] + (1,)), upper], -1)
         if X is None:  # Then const_marginal_effect_interval will return a single row
-            lower, upper = np.tile(lower, (T0.shape[0], 1)), np.tile(upper, (T0.shape[0], 1))
-        return lower[np.arange(T0.shape[0]), ind], upper[np.arange(T0.shape[0]), ind]
+            lower, upper = np.repeat(lower, T0.shape[0], axis=0), np.repeat(upper, T0.shape[0], axis=0)
+        return lower[np.arange(T0.shape[0]), ..., ind], upper[np.arange(T0.shape[0]), ..., ind]
 
 
 class LinearModelFinalInferenceDiscrete(GenericModelFinalInferenceDiscrete):
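Note: the inference.py changes are axis bookkeeping to accommodate the optional singleton outcome dimension, plus a refactor of the index computation onto `inverse_onehot` (the econml.utilities helper visible in the drlearner.py imports above). A sketch of the two key steps under assumed shapes (`d_t` treatment arms, `n` rows, `d_y` outcome columns):

```python
import numpy as np

# Each fitted final model contributes a (2, n, d_y) array of (lower, upper) bounds;
# stacking over the d_t arms gives (d_t, 2, n, d_y). moveaxis pulls the bound axis
# to the front and sends the treatment axis to the end, exactly as in the diff:
d_t, n, d_y = 3, 5, 1
preds = np.zeros((d_t, 2, n, d_y))
lower, upper = np.moveaxis(preds, [0, 1], [-1, 0])
assert lower.shape == (n, d_y, d_t) and upper.shape == (n, d_y, d_t)

# inverse_onehot recovers integer arm indices from a one-hot treatment encoding,
# with 0 meaning the all-zeros baseline row; the matrix product it replaces
# computes the same thing:
T1 = np.array([[0, 0], [1, 0], [0, 1]])
ind = (T1 @ np.arange(1, T1.shape[1] + 1)).astype(int)
print(ind)  # [0 1 2]
```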
diff --git a/econml/tests/test_drlearner.py b/econml/tests/test_drlearner.py
index 99068ed9a..97fe1f2e5 100644
--- a/econml/tests/test_drlearner.py
+++ b/econml/tests/test_drlearner.py
@@ -70,12 +70,11 @@ def make_random(is_discrete, d):
             else:
                 return np.random.normal(size=sz)
 
-        d_y = 0
-        is_discrete = True
-        for d_t in [0, 1]:
-            for d_x in [2, None]:
-                for d_w in [2, None]:
-                    with self.subTest(d_t=d_t, d_x=d_x, d_w=d_w):
+        for d_y in [0, 1]:
+            is_discrete = True
+            for d_t in [0, 1]:
+                for d_x in [2, None]:
+                    for d_w in [2, None]:
                         W, X, Y, T = [make_random(is_discrete, d)
                                       for is_discrete, d in [(False, d_w),
                                                              (False, d_x),
@@ -96,51 +95,56 @@ def make_random(is_discrete, d):
                             ((d_y,) if d_y > 0 else ()) +
                             ((d_t_final,) if d_t_final > 0 else()))
 
-                        # TODO: add stratification to bootstrap so that we can use it even with discrete treatments
-                        infs = [None, 'statsmodels']
-
-                        est = LinearDRLearner(model_regression=Lasso(),
-                                              model_propensity=LogisticRegression(C=1000, solver='lbfgs',
-                                                                                  multi_class='auto'))
-
-                        for inf in infs:
-                            with self.subTest(d_w=d_w, d_x=d_x, d_y=d_y, d_t=d_t,
-                                              is_discrete=is_discrete, est=est, inf=inf):
-                                est.fit(Y, T, X, W, inference=inf)
-                                # make sure we can call the marginal_effect and effect methods
-                                const_marg_eff = est.const_marginal_effect(X)
-                                marg_eff = est.marginal_effect(T, X)
-                                self.assertEqual(shape(marg_eff), marginal_effect_shape)
-                                self.assertEqual(shape(const_marg_eff), const_marginal_effect_shape)
-
-                                np.testing.assert_array_equal(
-                                    marg_eff if d_x else marg_eff[0:1], const_marg_eff)
-
-                                T0 = np.full_like(T, 'a')
-                                eff = est.effect(X, T0=T0, T1=T)
-                                self.assertEqual(shape(eff), effect_shape)
-                                if inf is not None:
-                                    const_marg_eff_int = est.const_marginal_effect_interval(X)
-                                    marg_eff_int = est.marginal_effect_interval(T, X)
-                                    self.assertEqual(shape(marg_eff_int),
-                                                     (2,) + marginal_effect_shape)
-                                    self.assertEqual(shape(const_marg_eff_int),
-                                                     (2,) + const_marginal_effect_shape)
-                                    self.assertEqual(shape(est.effect_interval(X, T0=T0, T1=T)),
-                                                     (2,) + effect_shape)
-
-                                est.score(Y, T, X, W)
-
-                                # make sure we can call effect with implied scalar treatments, no matter the
-                                # dimensions of T, and also that we warn when there are multiple treatments
-                                if d_t > 1:
-                                    cm = self.assertWarns(Warning)
-                                else:
-                                    cm = ExitStack()  # ExitStack can be used as a "do nothing" ContextManager
-                                with cm:
-                                    effect_shape2 = (n if d_x else 1,) + ((d_y,) if d_y > 0 else())
-                                    eff = est.effect(X, T0='a', T1='b')
-                                    self.assertEqual(shape(eff), effect_shape2)
+                        for est in [LinearDRLearner(model_propensity=LogisticRegression(C=1000, solver='lbfgs',
+                                                                                        multi_class='auto')),
+                                    DRLearner(model_propensity=LogisticRegression(multi_class='auto'),
+                                              model_regression=LinearRegression(),
+                                              model_final=StatsModelsLinearRegression(),
+                                              multitask_model_final=True)]:
+
+                            # TODO: add stratification to bootstrap so that we can use it even with discrete treatments
+                            infs = [None]
+                            if isinstance(est, LinearDRLearner):
+                                infs.append('statsmodels')
+
+                            for inf in infs:
+                                with self.subTest(d_w=d_w, d_x=d_x, d_y=d_y, d_t=d_t,
+                                                  is_discrete=is_discrete, est=est, inf=inf):
+                                    est.fit(Y, T, X, W, inference=inf)
+                                    # make sure we can call the marginal_effect and effect methods
+                                    const_marg_eff = est.const_marginal_effect(X)
+                                    marg_eff = est.marginal_effect(T, X)
+                                    self.assertEqual(shape(marg_eff), marginal_effect_shape)
+                                    self.assertEqual(shape(const_marg_eff), const_marginal_effect_shape)
+
+                                    np.testing.assert_array_equal(
+                                        marg_eff if d_x else marg_eff[0:1], const_marg_eff)
+
+                                    T0 = np.full_like(T, 'a')
+                                    eff = est.effect(X, T0=T0, T1=T)
+                                    self.assertEqual(shape(eff), effect_shape)
+                                    if inf is not None:
+                                        const_marg_eff_int = est.const_marginal_effect_interval(X)
+                                        marg_eff_int = est.marginal_effect_interval(T, X)
+                                        self.assertEqual(shape(marg_eff_int),
+                                                         (2,) + marginal_effect_shape)
+                                        self.assertEqual(shape(const_marg_eff_int),
+                                                         (2,) + const_marginal_effect_shape)
+                                        self.assertEqual(shape(est.effect_interval(X, T0=T0, T1=T)),
+                                                         (2,) + effect_shape)
+
+                                    est.score(Y, T, X, W)
+
+                                    # make sure we can call effect with implied scalar treatments, no matter the
+                                    # dimensions of T, and also that we warn when there are multiple treatments
+                                    if d_t > 1:
+                                        cm = self.assertWarns(Warning)
+                                    else:
+                                        cm = ExitStack()  # ExitStack can be used as a "do nothing" ContextManager
+                                    with cm:
+                                        effect_shape2 = (n if d_x else 1,) + ((d_y,) if d_y > 0 else())
+                                        eff = est.effect(X, T0='a', T1='b')
+                                        self.assertEqual(shape(eff), effect_shape2)
 
     def test_can_use_vectors(self):
         """
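Note: the test now sweeps `d_y in [0, 1]` (flat versus column outcomes) and runs both `LinearDRLearner` and a multitask `DRLearner` through the same shape assertions. The expected shapes follow the convention asserted in the hunk above; a hypothetical helper spelling it out for the plain `effect` call:

```python
def expected_effect_shape(n, d_x, d_y):
    # One row per unit when X is provided, a single row otherwise, with a
    # trailing singleton outcome dimension only when Y was passed as 2-D.
    return (n if d_x else 1,) + ((d_y,) if d_y > 0 else ())

assert expected_effect_shape(n=20, d_x=2, d_y=0) == (20,)
assert expected_effect_shape(n=20, d_x=2, d_y=1) == (20, 1)
assert expected_effect_shape(n=20, d_x=None, d_y=1) == (1, 1)
```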
diff --git a/setup.cfg b/setup.cfg
index 9d509ad23..8eea03541 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -7,11 +7,11 @@ max-line-length=119
 convention=numpy
 
 [build_sphinx]
-version = 0.6
+version = 0.6.1
 
 [metadata]
 name = econml
-version = 0.6
+version = 0.6.1
 author = Microsoft Corporation
 description = This package contains several methods for calculating Conditional Average Treatment Effects
 long_description = file: README.md
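Note: to close the loop on the version bump, a minimal end-to-end sketch of the behavior this release enables, on synthetic data; it assumes an environment with econml 0.6.1 installed, and the data-generating process is made up for illustration:

```python
import numpy as np
from econml.drlearner import LinearDRLearner

np.random.seed(123)
n = 500
X = np.random.normal(size=(n, 2))
T = np.random.binomial(1, 0.5, size=n)            # binary treatment, 0 is the baseline
Y = (1 + X[:, 0]) * T + np.random.normal(size=n)  # effect heterogeneous in X[:, 0]

est = LinearDRLearner()
est.fit(Y.reshape(-1, 1), T, X)      # a column-vector outcome of shape (n, 1) now works
effects = est.effect(X, T0=0, T1=1)  # per-unit effect of arm 1 versus the baseline
print(effects.shape)                 # (n, 1), keeping the singleton outcome dimension
```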