Reduce test flakiness (#575)
xrowan authored Apr 6, 2022
1 parent 415cb80 commit 3228508
Showing 9 changed files with 261 additions and 201 deletions.
47 changes: 35 additions & 12 deletions azure-pipelines.yml
@@ -98,8 +98,8 @@ jobs:
      # Work around https://github.com/pypa/pip/issues/9542
      - script: 'pip install -U numpy~=1.21.0'
        displayName: 'Upgrade numpy'

-     - script: 'pip install pytest pytest-runner jupyter jupyter-client nbconvert nbformat seaborn xgboost tqdm && python setup.py pytest'
+     - script: 'pip install pytest pytest-runner jupyter jupyter-client nbconvert nbformat seaborn xgboost tqdm && pip list && python setup.py pytest'
        displayName: 'Unit tests'
        env:
          PYTEST_ADDOPTS: '-m "notebook"'
@@ -126,12 +126,6 @@ jobs:
      # Work around https://github.com/pypa/pip/issues/9542
      - script: 'pip install -U numpy~=1.21.0'
        displayName: 'Upgrade numpy'

-     # shap 0.39 and sklearn 1.0 interact badly in these notebooks
-     # shap 0.40 has a bug in waterfall (https://github.com/slundberg/shap/issues/2283) that breaks our main tests
-     # but fixes the interaction here...
-     - script: 'pip install -U shap~=0.40.0'
-       displayName: 'Upgrade shap'
-
      - script: 'pip install pytest pytest-runner jupyter jupyter-client nbconvert nbformat seaborn xgboost tqdm && python setup.py pytest'
        displayName: 'Unit tests'
@@ -207,7 +201,7 @@ jobs:
      - script: 'pip install pytest pytest-runner && python setup.py pytest'
        displayName: 'Unit tests'
        env:
-         PYTEST_ADDOPTS: '-m "not (notebook or automl or dml or causal)" -n 2'
+         PYTEST_ADDOPTS: '-m "not (notebook or automl or dml or serial or cate_api)" -n 2'
          COVERAGE_PROCESS_START: 'setup.cfg'
      - task: PublishTestResults@2
        displayName: 'Publish Test Results **/test-results.xml'
@@ -253,15 +247,44 @@ jobs:
  parameters:
    package: '-e .[tf,plt]'
    job:
-     job: Tests_causal
+     job: Tests_serial
      dependsOn: 'EvalChanges'
      condition: eq(dependencies.EvalChanges.outputs['output.testCode'], 'True')
-     displayName: 'Run tests (Causal)'
+     displayName: 'Run tests (Serial)'
      steps:
      - script: 'pip install pytest pytest-runner && python setup.py pytest'
        displayName: 'Unit tests'
        env:
-         PYTEST_ADDOPTS: '-m "causal" -n 1'
+         PYTEST_ADDOPTS: '-m "serial" -n 1'
          COVERAGE_PROCESS_START: 'setup.cfg'
      - task: PublishTestResults@2
        displayName: 'Publish Test Results **/test-results.xml'
        inputs:
          testResultsFiles: '**/test-results.xml'
          testRunTitle: 'Python $(python.version), image $(imageName)'
        condition: succeededOrFailed()

      - task: PublishCodeCoverageResults@1
        displayName: 'Publish Code Coverage Results'
        inputs:
          codeCoverageTool: Cobertura
          summaryFileLocation: '$(System.DefaultWorkingDirectory)/**/coverage.xml'

+- template: azure-pipelines-steps.yml
+  parameters:
+    package: '-e .[tf,plt]'
+    job:
+      job: Tests_CATE_API
+      dependsOn: 'EvalChanges'
+      condition: eq(dependencies.EvalChanges.outputs['output.testCode'], 'True')
+      displayName: 'Run tests (Other)'
+      steps:
+      - script: 'pip install pytest pytest-runner'
+        displayName: 'Install pytest'
+      - script: 'python setup.py pytest'
+        displayName: 'CATE Unit tests'
+        env:
+          PYTEST_ADDOPTS: '-m "cate_api" -n auto'
+          COVERAGE_PROCESS_START: 'setup.cfg'
+      - task: PublishTestResults@2
+        displayName: 'Publish Test Results **/test-results.xml'
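The "serial" and "cate_api" marks used in the PYTEST_ADDOPTS filters above must be registered with pytest for -m selection to work without unknown-mark warnings. A minimal sketch of such a registration, assuming a hypothetical conftest.py (the repository's actual registration is not part of this diff and may live in setup.cfg instead):

# conftest.py (hypothetical sketch, not from this commit)
# Registers the custom marks so `pytest -m "serial"` and
# `pytest -m "cate_api"` can select tests cleanly.
def pytest_configure(config):
    config.addinivalue_line(
        "markers", "serial: tests that must run on a single xdist worker (-n 1)")
    config.addinivalue_line(
        "markers", "cate_api: CATE API tests, parallelizable with -n auto")

With the marks registered, the jobs partition the suite: '-m "serial" -n 1' runs the serialized tests alone, '-m "cate_api" -n auto' fans the CATE API tests across all cores, and '-m "not (notebook or automl or dml or serial or cate_api)" -n 2' runs everything else on two workers.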
37 changes: 22 additions & 15 deletions econml/tests/test_causal_analysis.py
@@ -2,11 +2,14 @@
# Licensed under the MIT License.

import unittest
+
+from contextlib import ExitStack
+import itertools
import numpy as np
from numpy.core.fromnumeric import squeeze
import pandas as pd
-from contextlib import ExitStack
import pytest
+
from econml.solutions.causal_analysis import CausalAnalysis
from econml.solutions.causal_analysis._causal_analysis import _CausalInsightsConstants
@@ -15,7 +18,7 @@ def assert_less_close(arr1, arr2):
    assert np.all(np.logical_or(arr1 <= arr2, np.isclose(arr1, arr2)))


-@pytest.mark.causal
+@pytest.mark.serial
class TestCausalAnalysis(unittest.TestCase):

    def test_basic_array(self):
@@ -670,21 +673,24 @@ def test_random_state(self):
        inds = [0, 1, 2, 3]
        cats = [2, 3]
        hinds = [0, 3]
-        for n_model in ['linear', 'automl']:
-            for h_model in ['linear', 'forest']:
-                for classification in [True, False]:
-                    ca = CausalAnalysis(inds, cats, hinds, classification=classification,
-                                        nuisance_models=n_model, heterogeneity_model=h_model, random_state=123)
-                    ca.fit(X, y)
-                    glo = ca.global_causal_effect()
-
-                    ca2 = CausalAnalysis(inds, cats, hinds, classification=classification,
-                                         nuisance_models=n_model, heterogeneity_model=h_model, random_state=123)
-                    ca2.fit(X, y)
-                    glo2 = ca.global_causal_effect()
-
-                    np.testing.assert_equal(glo.point.values, glo2.point.values)
-                    np.testing.assert_equal(glo.stderr.values, glo2.stderr.values)
+        for n_model, h_model, classification in\
+                itertools.product(['linear', 'automl'],
+                                  ['linear', 'forest'],
+                                  [True, False]):
+
+            ca = CausalAnalysis(inds, cats, hinds, classification=classification,
+                                nuisance_models=n_model, heterogeneity_model=h_model, random_state=123)
+            ca.fit(X, y)
+            glo = ca.global_causal_effect()
+
+            ca2 = CausalAnalysis(inds, cats, hinds, classification=classification,
+                                 nuisance_models=n_model, heterogeneity_model=h_model, random_state=123)
+            ca2.fit(X, y)
+            glo2 = ca.global_causal_effect()
+
+            np.testing.assert_equal(glo.point.values, glo2.point.values)
+            np.testing.assert_equal(glo.stderr.values, glo2.stderr.values)

    def test_can_set_categories(self):
        y = pd.Series(np.random.choice([0, 1], size=(500,)))
@@ -784,6 +790,7 @@ def test_invalid_inds(self):
        # Pass an example where W is irrelevant and X is confounder
        # As long as DML doesnt change the order of the inputs, then things should be good. Otherwise X would be
        # zeroed out and the test will fail
+
    def test_scaling_transforms(self):
        # shouldn't matter if X is scaled much larger or much smaller than W, we should still get good estimates
        n = 2000
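The test_random_state rewrite above depends on itertools.product yielding exactly the combinations the three nested loops produced. A small self-contained illustration (example values only, not from the test suite):

import itertools

# product(A, B, C) yields tuples in the same order as the nested loops
# `for a in A: for b in B: for c in C:`, so the loop body keeps its
# behavior while dropping two indentation levels.
for n_model, h_model, classification in itertools.product(
        ['linear', 'automl'], ['linear', 'forest'], [True, False]):
    print(n_model, h_model, classification)  # 2 * 2 * 2 = 8 combinations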
13 changes: 8 additions & 5 deletions econml/tests/test_dmliv.py
@@ -1,20 +1,23 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

-import unittest
-import pytest
import pickle
+import unittest
+
import numpy as np
+import pytest
from scipy import special
-from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor
+from sklearn.linear_model import LinearRegression, LogisticRegression
+from sklearn.preprocessing import PolynomialFeatures
+
+from econml.iv.dml import OrthoIV, DMLIV, NonParamDMLIV
from econml.iv.dr._dr import _DummyCATE
from econml.sklearn_extensions.linear_model import StatsModelsLinearRegression
-from sklearn.preprocessing import PolynomialFeatures
from econml.utilities import shape
-from econml.iv.dml import OrthoIV, DMLIV, NonParamDMLIV


+@pytest.mark.cate_api
class TestDMLIV(unittest.TestCase):
    def test_cate_api(self):
        def const_marg_eff_shape(n, d_x, d_y, binary_T):
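Applying @pytest.mark.cate_api at class level marks every test method in the class, which is what moves all of TestDMLIV into the new Tests_CATE_API job. A toy sketch of the mechanics (illustrative class, not from the repository):

import unittest
import pytest

# A mark placed on a unittest.TestCase is inherited by each test method:
# `pytest -m cate_api` collects test_example, while the default job's
# `-m "not (... cate_api)"` filter skips the entire class.
@pytest.mark.cate_api
class TestExample(unittest.TestCase):
    def test_example(self):
        self.assertTrue(True)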