[fbsync] Better logic for ignoring CPU tests on GPU CI machines (#4025)
Reviewed By: fmassa

Differential Revision: D29105975

fbshipit-source-id: 0f3446a61934e6b5ee3151c390e604e5b858d355
NicolasHug authored and facebook-github-bot committed Jun 15, 2021
1 parent 7b21f69 commit 5865648
Showing 6 changed files with 67 additions and 99 deletions.
50 changes: 2 additions & 48 deletions test/common_utils.py
@@ -259,58 +259,12 @@ def call_args_to_kwargs_only(call_args, *callable_or_arg_names):

def cpu_and_gpu():
import pytest # noqa

# ignore CPU tests in RE as they're already covered by another contbuild
# also ignore CPU tests in CircleCI machines that have a GPU: these tests
# are run on CPU-only machines already.
if IN_RE_WORKER:
devices = []
else:
if IN_CIRCLE_CI and torch.cuda.is_available():
mark = pytest.mark.skip(reason=CIRCLECI_GPU_NO_CUDA_MSG)
else:
mark = ()
devices = [pytest.param('cpu', marks=mark)]

if torch.cuda.is_available():
cuda_marks = ()
elif IN_FBCODE:
# Dont collect cuda tests on fbcode if the machine doesnt have a GPU
# This avoids skipping the tests. More robust would be to detect if
# we're in sancastle instead of fbcode?
cuda_marks = pytest.mark.dont_collect()
else:
cuda_marks = pytest.mark.skip(reason=CUDA_NOT_AVAILABLE_MSG)

devices.append(pytest.param('cuda', marks=cuda_marks))

return devices
return ('cpu', pytest.param('cuda', marks=pytest.mark.needs_cuda))


def needs_cuda(test_func):
import pytest # noqa

if IN_FBCODE and not IN_RE_WORKER:
# We don't want to skip in fbcode, so we just don't collect
# TODO: slightly more robust way would be to detect if we're in a sandcastle instance
# so that the test will still be collected (and skipped) in the devvms.
return pytest.mark.dont_collect(test_func)
elif torch.cuda.is_available():
return test_func
else:
return pytest.mark.skip(reason=CUDA_NOT_AVAILABLE_MSG)(test_func)


def cpu_only(test_func):
import pytest # noqa

if IN_RE_WORKER:
# The assumption is that all RE workers have GPUs.
return pytest.mark.dont_collect(test_func)
elif IN_CIRCLE_CI and torch.cuda.is_available():
return pytest.mark.skip(reason=CIRCLECI_GPU_NO_CUDA_MSG)(test_func)
else:
return test_func
return pytest.mark.needs_cuda(test_func)


def _create_data(height=3, width=3, channels=3, device="cpu"):
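For orientation, here is a minimal usage sketch (not part of this commit) of how a test module consumes the slimmed-down helpers above; the test names and tensor shapes are made up for illustration:

import pytest
import torch
from common_utils import cpu_and_gpu, needs_cuda


@pytest.mark.parametrize('device', cpu_and_gpu())
def test_some_op(device):
    # Hypothetical test: the 'cuda' instance carries the needs_cuda mark via
    # pytest.param, while the 'cpu' instance carries no mark at all.
    x = torch.rand(3, 3, device=device)
    assert x.device.type == device


@needs_cuda
def test_some_cuda_only_op():
    # Hypothetical test: the decorator now only attaches the needs_cuda mark;
    # whether the test is skipped or deselected is decided later, in conftest.py.
    assert torch.cuda.is_available()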
57 changes: 51 additions & 6 deletions test/conftest.py
@@ -1,14 +1,59 @@
from common_utils import IN_CIRCLE_CI, CIRCLECI_GPU_NO_CUDA_MSG, IN_FBCODE, IN_RE_WORKER, CUDA_NOT_AVAILABLE_MSG
import torch
import pytest


def pytest_configure(config):
# register an additional marker (see pytest_collection_modifyitems)
config.addinivalue_line(
"markers", "dont_collect: marks a test that should not be collected (avoids skipping it)"
"markers", "needs_cuda: mark for tests that rely on a CUDA device"
)
config.addinivalue_line(
"markers", "dont_collect: mark for tests that should not be collected"
)


def pytest_collection_modifyitems(items):
# This hook is called by pytest after it has collected the tests (google its name!)
# We can ignore some tests as we see fit here. In particular we ignore the tests that
# we have marked with the custom 'dont_collect' mark. This avoids skipping the tests,
# since the internal fb infra doesn't like skipping tests.
to_keep = [item for item in items if item.get_closest_marker('dont_collect') is None]
items[:] = to_keep
# We can ignore some tests as we see fit here, or add marks, such as a skip mark.

out_items = []
for item in items:
# The needs_cuda mark will exist if the test was explicitly decorated with
# the @needs_cuda decorator. It will also exist if it was parametrized with a
# parameter that has the mark: for example if a test is parametrized with
# @pytest.mark.parametrize('device', cpu_and_gpu())
# the "instances" of the tests where device == 'cuda' will have the 'needs_cuda' mark,
# and the ones with device == 'cpu' won't have the mark.
needs_cuda = item.get_closest_marker('needs_cuda') is not None

if needs_cuda and not torch.cuda.is_available():
# In general, we skip cuda tests on machines without a GPU
# There are special cases though, see below
item.add_marker(pytest.mark.skip(reason=CUDA_NOT_AVAILABLE_MSG))

if IN_FBCODE:
# fbcode doesn't like skipping tests, so instead we just don't collect them
# so that they don't even "exist", hence the continue statements.
if not needs_cuda and IN_RE_WORKER:
# The RE workers are the machines with GPUs; we don't want them to run CPU-only tests.
continue
if needs_cuda and not torch.cuda.is_available():
# On the test machines without a GPU, we want to ignore the tests that need cuda.
# TODO: something more robust would be to do that only in a sandcastle instance,
# so that we can still see the test being skipped when testing locally from a devvm
continue
elif IN_CIRCLE_CI:
# Here we're not in fbcode, so we can safely collect and skip tests.
if not needs_cuda and torch.cuda.is_available():
# Similar to what happens in RE workers: we don't need the CircleCI GPU machines
# to run the CPU-only tests.
item.add_marker(pytest.mark.skip(reason=CIRCLECI_GPU_NO_CUDA_MSG))

if item.get_closest_marker('dont_collect') is not None:
# currently, this is only used for some tests we're sure we don't want to run on fbcode
continue

out_items.append(item)

items[:] = out_items
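The hook above relies on flags imported from common_utils (IN_CIRCLE_CI, IN_RE_WORKER, IN_FBCODE, and the two skip messages), whose definitions are not part of this diff. A plausible sketch, assuming they are derived from environment variables:

# Assumed sketch of the flags conftest.py imports from common_utils; the
# exact environment variable names are a guess and are not shown in this commit.
import os

IN_CIRCLE_CI = os.getenv('CIRCLECI', 'false') == 'true'
IN_RE_WORKER = os.getenv('INSIDE_RE_WORKER') is not None
IN_FBCODE = os.getenv('IN_FBCODE_TORCHVISION') == '1'
CUDA_NOT_AVAILABLE_MSG = 'CUDA device not available'
CIRCLECI_GPU_NO_CUDA_MSG = (
    'CircleCI GPU machines do not need to run CPU-only tests; '
    'those are already covered by the CPU-only jobs.'
)

A practical side effect of registering needs_cuda as a mark is that subsets can be selected from the command line, e.g. pytest -m needs_cuda or pytest -m "not needs_cuda".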
25 changes: 12 additions & 13 deletions test/test_image.py
@@ -9,7 +9,7 @@
import torch
from PIL import Image
import torchvision.transforms.functional as F
from common_utils import get_tmp_dir, needs_cuda, cpu_only
from common_utils import get_tmp_dir, needs_cuda
from _assert_utils import assert_equal

from torchvision.io.image import (
@@ -335,7 +335,6 @@ def test_decode_jpeg_cuda_errors():
torch.ops.image.decode_jpeg_cuda(data, ImageReadMode.UNCHANGED.value, 'cpu')


@cpu_only
def test_encode_jpeg_errors():

with pytest.raises(RuntimeError, match="Input tensor dtype should be uint8"):
@@ -360,7 +359,7 @@ def test_encode_jpeg_errors():


def _collect_if(cond):
# TODO: remove this once test_encode_jpeg_windows and test_write_jpeg_windows
# TODO: remove this once test_encode_jpeg_reference and test_write_jpeg_reference
# are removed
def _inner(test_func):
if cond:
@@ -370,15 +369,14 @@ def _inner(test_func):
return _inner


@cpu_only
@_collect_if(cond=IS_WINDOWS)
@pytest.mark.parametrize('img_path', [
pytest.param(jpeg_path, id=_get_safe_image_name(jpeg_path))
for jpeg_path in get_images(ENCODE_JPEG, ".jpg")
])
def test_encode_jpeg_windows(img_path):
def test_encode_jpeg_reference(img_path):
# This test is *wrong*.
# It compares a torchvision-encoded jpeg with a PIL-encoded jpeg, but it
# It compares a torchvision-encoded jpeg with a PIL-encoded jpeg (the reference), but it
# starts encoding the torchvision version from an image that comes from
# decode_jpeg, which can yield different results from pil.decode (see
# test_decode... which uses a high tolerance).
@@ -403,14 +401,13 @@ def test_encode_jpeg_windows(img_path):
assert_equal(jpeg_bytes, pil_bytes)


@cpu_only
@_collect_if(cond=IS_WINDOWS)
@pytest.mark.parametrize('img_path', [
pytest.param(jpeg_path, id=_get_safe_image_name(jpeg_path))
for jpeg_path in get_images(ENCODE_JPEG, ".jpg")
])
def test_write_jpeg_windows(img_path):
# FIXME: Remove this eventually, see test_encode_jpeg_windows
def test_write_jpeg_reference(img_path):
# FIXME: Remove this eventually, see test_encode_jpeg_reference
with get_tmp_dir() as d:
data = read_file(img_path)
img = decode_jpeg(data)
@@ -433,8 +430,9 @@ def test_write_jpeg_windows(img_path):
assert_equal(torch_bytes, pil_bytes)


@cpu_only
@_collect_if(cond=not IS_WINDOWS)
@pytest.mark.skipif(IS_WINDOWS, reason=(
'this test fails on windows because PIL uses libjpeg-turbo on windows'
))
@pytest.mark.parametrize('img_path', [
pytest.param(jpeg_path, id=_get_safe_image_name(jpeg_path))
for jpeg_path in get_images(ENCODE_JPEG, ".jpg")
@@ -455,8 +453,9 @@ def test_encode_jpeg(img_path):
assert_equal(encoded_jpeg_torch, encoded_jpeg_pil)


@cpu_only
@_collect_if(cond=not IS_WINDOWS)
@pytest.mark.skipif(IS_WINDOWS, reason=(
'this test fails on windows because PIL uses libjpeg-turbo on windows'
))
@pytest.mark.parametrize('img_path', [
pytest.param(jpeg_path, id=_get_safe_image_name(jpeg_path))
for jpeg_path in get_images(ENCODE_JPEG, ".jpg")
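The body of _collect_if is collapsed in this view; based on its TODO and the dont_collect mark registered in conftest.py, it plausibly has roughly the following shape (a sketch, not the verbatim source):

import pytest

def _collect_if(cond):
    # Attach the dont_collect mark when the condition does not hold, so the
    # test is deselected by the conftest.py hook instead of being skipped.
    def _inner(test_func):
        if cond:
            return test_func
        return pytest.mark.dont_collect(test_func)
    return _inner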
11 changes: 1 addition & 10 deletions test/test_models.py
@@ -1,7 +1,7 @@
import os
import io
import sys
from common_utils import map_nested_tensor_object, freeze_rng_state, set_rng_seed, cpu_and_gpu, needs_cuda, cpu_only
from common_utils import map_nested_tensor_object, freeze_rng_state, set_rng_seed, cpu_and_gpu, needs_cuda
from _utils_internal import get_relative_path
from collections import OrderedDict
import functools
@@ -234,7 +234,6 @@ def _make_sliced_model(model, stop_layer):
return new_model


@cpu_only
@pytest.mark.parametrize('model_name', ['densenet121', 'densenet169', 'densenet201', 'densenet161'])
def test_memory_efficient_densenet(model_name):
input_shape = (1, 3, 300, 300)
@@ -257,7 +256,6 @@ def test_memory_efficient_densenet(model_name):
torch.testing.assert_close(out1, out2, rtol=0.0, atol=1e-5)


@cpu_only
@pytest.mark.parametrize('dilate_layer_2', (True, False))
@pytest.mark.parametrize('dilate_layer_3', (True, False))
@pytest.mark.parametrize('dilate_layer_4', (True, False))
@@ -272,7 +270,6 @@ def test_resnet_dilation(dilate_layer_2, dilate_layer_3, dilate_layer_4):
assert out.shape == (1, 2048, 7 * f, 7 * f)


@cpu_only
def test_mobilenet_v2_residual_setting():
model = models.__dict__["mobilenet_v2"](inverted_residual_setting=[[1, 16, 1, 1], [6, 24, 2, 2]])
model.eval()
@@ -281,7 +278,6 @@ def test_mobilenet_v2_residual_setting():
assert out.shape[-1] == 1000


@cpu_only
@pytest.mark.parametrize('model_name', ["mobilenet_v2", "mobilenet_v3_large", "mobilenet_v3_small"])
def test_mobilenet_norm_layer(model_name):
model = models.__dict__[model_name]()
@@ -295,7 +291,6 @@ def get_gn(num_channels):
assert any(isinstance(x, nn.GroupNorm) for x in model.modules())


@cpu_only
def test_inception_v3_eval():
# replacement for models.inception_v3(pretrained=True) that does not download weights
kwargs = {}
@@ -311,7 +306,6 @@ def test_inception_v3_eval():
_check_jit_scriptable(model, (x,), unwrapper=script_model_unwrapper.get(name, None))


@cpu_only
def test_fasterrcnn_double():
model = models.detection.fasterrcnn_resnet50_fpn(num_classes=50, pretrained_backbone=False)
model.double()
@@ -327,7 +321,6 @@ def test_fasterrcnn_double():
assert "labels" in out[0]


@cpu_only
def test_googlenet_eval():
# replacement for models.googlenet(pretrained=True) that does not download weights
kwargs = {}
@@ -376,7 +369,6 @@ def checkOut(out):
checkOut(out_cpu)


@cpu_only
def test_generalizedrcnn_transform_repr():

min_size, max_size = 224, 299
@@ -573,7 +565,6 @@ def compute_mean_std(tensor):
pytest.skip(msg)


@cpu_only
@pytest.mark.parametrize('model_name', get_available_detection_models())
def test_detection_model_validation(model_name):
set_rng_seed(0)
(Diffs for the remaining two changed files are not shown here.)
