From a9132a659ed5dab2312867dabe6812cc9655b8da Mon Sep 17 00:00:00 2001 From: Mathias Louboutin Date: Fri, 6 Oct 2023 10:25:48 -0400 Subject: [PATCH 1/6] compiler: prevent reduction clause for perfect-enough outer loops --- devito/passes/iet/parpragma.py | 10 ++++++++++ tests/test_dle.py | 23 ++++++++++++++++++++++- tests/test_gpu_openacc.py | 24 +++++++++++++++++++++++- tests/test_gpu_openmp.py | 22 ++++++++++++++++++++++ 4 files changed, 77 insertions(+), 2 deletions(-) diff --git a/devito/passes/iet/parpragma.py b/devito/passes/iet/parpragma.py index b6476192b2..5d75e002db 100644 --- a/devito/passes/iet/parpragma.py +++ b/devito/passes/iet/parpragma.py @@ -318,6 +318,16 @@ def _make_reductions(self, partree): if not any(i.is_ParallelAtomic for i in partree.collapsed): return partree + # We bypass the corner case where a reduction might not be optimal, mainly: + # - Only the most inner loop is atomic + # In which case we can parallelize the perfect nest + # The opposite corner case (most outer loop atomic) + # should be detected before this pass + nc = len(partree.collapsed) + if all(i.is_ParallelNoAtomic for i in partree.collapsed[:nc-1]): + mapper = {partree.root: partree.root._rebuild(ncollapsed=nc-1)} + return Transformer(mapper).visit(partree) + exprs = [i for i in FindNodes(Expression).visit(partree) if i.is_reduction] reductions = [] diff --git a/tests/test_dle.py b/tests/test_dle.py index ec24128983..ecd5644e41 100644 --- a/tests/test_dle.py +++ b/tests/test_dle.py @@ -863,7 +863,6 @@ def test_incs_no_atomic(self): op0 = Operator(Inc(uf, 1), opt=('advanced', {'openmp': True, 'par-collapse-ncores': 1, 'par-collapse-work': 0})) - assert 'collapse(3)' in str(op0) assert 'atomic' in str(op0) @@ -875,6 +874,28 @@ def test_incs_no_atomic(self): assert 'collapse' not in str(op1) assert 'atomic' not in str(op1) + def test_incr_perfect_outer(self): + grid = Grid((5, 5)) + d = Dimension(name="d") + u = Function(name="u", dimensions=(*grid.dimensions, d), + grid=grid, shape=(*grid.shape, 5), ) + v = Function(name="v", dimensions=(*grid.dimensions, d), + grid=grid, shape=(*grid.shape, 5)) + u.data.fill(1) + v.data.fill(2) + + w = Function(name="w", grid=grid) + + summation = Inc(w, u*v) + + op0 = Operator([summation]) + assert 'reduction' not in str(op0) + assert 'collapse(2)' in str(op0) + assert 'omp for' in str(op0) + + op0() + assert np.all(w.data == 10) + @pytest.mark.parametrize('exprs,simd_level,expected', [ (['Eq(y.symbolic_max, g[0, x], implicit_dims=(t, x))', 'Inc(h1[0, 0], 1, implicit_dims=(t, x, y))'], diff --git a/tests/test_gpu_openacc.py b/tests/test_gpu_openacc.py index 3085ad85c9..526ddca59d 100644 --- a/tests/test_gpu_openacc.py +++ b/tests/test_gpu_openacc.py @@ -2,7 +2,7 @@ import numpy as np from devito import (Grid, Function, TimeFunction, SparseTimeFunction, Eq, Operator, - norm, solve) + norm, solve, Dimension, Inc) from conftest import skipif, assert_blocking, opts_device_tiling from devito.data import LEFT from devito.exceptions import InvalidOperator @@ -168,6 +168,28 @@ def test_multi_tile_blocking_structure(self): assert len(iters) == len(v) assert all(i.step == j for i, j in zip(iters, v)) + def test_incr_perfect_outer(self): + grid = Grid((5, 5)) + d = Dimension(name="d") + u = Function(name="u", dimensions=(*grid.dimensions, d), + grid=grid, shape=(*grid.shape, 5), ) + v = Function(name="v", dimensions=(*grid.dimensions, d), + grid=grid, shape=(*grid.shape, 5)) + u.data.fill(1) + v.data.fill(2) + + w = Function(name="w", grid=grid) + + summation = Inc(w, u*v) + + op0 = Operator([summation]) + assert 'reduction' not in str(op0) + assert 'collapse(2)' in str(op0) + assert 'acc parallel loop' in str(op0) + + op0() + assert np.all(w.data == 10) + class TestOperator(object): diff --git a/tests/test_gpu_openmp.py b/tests/test_gpu_openmp.py index 29866508d8..38157f0962 100644 --- a/tests/test_gpu_openmp.py +++ b/tests/test_gpu_openmp.py @@ -268,6 +268,28 @@ def test_timeparallel_reduction(self): ('omp target teams distribute parallel for collapse(3)' ' reduction(+:f[0])') + def test_incr_perfect_outer(self): + grid = Grid((5, 5)) + d = Dimension(name="d") + u = Function(name="u", dimensions=(*grid.dimensions, d), + grid=grid, shape=(*grid.shape, 5), ) + v = Function(name="v", dimensions=(*grid.dimensions, d), + grid=grid, shape=(*grid.shape, 5)) + u.data.fill(1) + v.data.fill(2) + + w = Function(name="w", grid=grid) + + summation = Inc(w, u*v) + + op0 = Operator([summation]) + assert 'reduction' not in str(op0) + assert 'collapse(2)' in str(op0) + assert 'omp target teams distribute parallel' in str(op0) + + op0() + assert np.all(w.data == 10) + class TestOperator(object): From 5f18b8d7275dc97b90fbd19583baf05dc9ba9e2d Mon Sep 17 00:00:00 2001 From: Mathias Louboutin Date: Fri, 6 Oct 2023 11:16:02 -0400 Subject: [PATCH 2/6] ci: switch innerproduct tests to inner --- devito/passes/iet/parpragma.py | 2 +- tests/test_adjoint.py | 8 ++++---- tests/test_dle.py | 3 +-- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/devito/passes/iet/parpragma.py b/devito/passes/iet/parpragma.py index 5d75e002db..a4513ef31f 100644 --- a/devito/passes/iet/parpragma.py +++ b/devito/passes/iet/parpragma.py @@ -324,7 +324,7 @@ def _make_reductions(self, partree): # The opposite corner case (most outer loop atomic) # should be detected before this pass nc = len(partree.collapsed) - if all(i.is_ParallelNoAtomic for i in partree.collapsed[:nc-1]): + if nc > 1 and all(i.is_ParallelNoAtomic for i in partree.collapsed[:nc-1]): mapper = {partree.root: partree.root._rebuild(ncollapsed=nc-1)} return Transformer(mapper).visit(partree) diff --git a/tests/test_adjoint.py b/tests/test_adjoint.py index 473e484c0e..759b86a3ae 100644 --- a/tests/test_adjoint.py +++ b/tests/test_adjoint.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from devito import Operator, norm, Function, Grid, SparseFunction +from devito import Operator, norm, Function, Grid, SparseFunction, inner from devito.logger import info from examples.seismic import demo_model, Receiver from examples.seismic.acoustic import acoustic_setup @@ -114,7 +114,7 @@ def test_adjoint_F(self, mkey, shape, kernel, space_order, time_order, setup_fun solver.adjoint(rec=rec, srca=srca) # Adjoint test: Verify matches closely - term1 = np.dot(srca.data.reshape(-1), solver.geometry.src.data) + term1 = inner(srca, solver.geometry.src) term2 = norm(rec) ** 2 info(': %f, : %f, difference: %4.4e, ratio: %f' % (term1, term2, (term1 - term2)/term1, term1 / term2)) @@ -231,6 +231,6 @@ def test_adjoint_inject_interpolate(self, shape, coords, npoints=19): # y => p # x => c # P^T y => a - term1 = np.dot(p2.data.reshape(-1), p.data.reshape(-1)) - term2 = np.dot(c.data.reshape(-1), a.data.reshape(-1)) + term1 = inner(p2, p) + term2 = inner(c, a) assert np.isclose((term1-term2) / term1, 0., atol=1.e-6) diff --git a/tests/test_dle.py b/tests/test_dle.py index ecd5644e41..e7935dd45d 100644 --- a/tests/test_dle.py +++ b/tests/test_dle.py @@ -888,9 +888,8 @@ def test_incr_perfect_outer(self): summation = Inc(w, u*v) - op0 = Operator([summation]) + op0 = Operator([summation], opt=('advanced', {'openmp': True})) assert 'reduction' not in str(op0) - assert 'collapse(2)' in str(op0) assert 'omp for' in str(op0) op0() From c807e2b4dfcf1960183d4b8016f885098ab164f4 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Tue, 10 Oct 2023 07:33:58 +0000 Subject: [PATCH 3/6] tests: Factorize test_incr_perfect_outer --- tests/test_gpu_common.py | 30 +++++++++++++++++++++++++++--- tests/test_gpu_openacc.py | 24 +----------------------- tests/test_gpu_openmp.py | 22 ---------------------- 3 files changed, 28 insertions(+), 48 deletions(-) diff --git a/tests/test_gpu_common.py b/tests/test_gpu_common.py index 031bd9181b..071005464d 100644 --- a/tests/test_gpu_common.py +++ b/tests/test_gpu_common.py @@ -6,9 +6,9 @@ from conftest import assert_structure from devito import (Constant, Eq, Inc, Grid, Function, ConditionalDimension, - MatrixSparseTimeFunction, SparseTimeFunction, SubDimension, - SubDomain, SubDomainSet, TimeFunction, Operator, configuration, - switchconfig) + Dimension, MatrixSparseTimeFunction, SparseTimeFunction, + SubDimension, SubDomain, SubDomainSet, TimeFunction, + Operator, configuration, switchconfig) from devito.arch import get_gpu_info from devito.exceptions import InvalidArgument from devito.ir import (Conditional, Expression, Section, FindNodes, FindSymbols, @@ -110,6 +110,30 @@ def test_fission(self): assert np.all(usave.data[5:] == expected[5:]) assert np.all(vsave.data[:5] == expected[:5]) + def test_incr_perfect_outer(self): + grid = Grid((5, 5)) + d = Dimension(name="d") + + u = Function(name="u", dimensions=(*grid.dimensions, d), + grid=grid, shape=(*grid.shape, 5), ) + v = Function(name="v", dimensions=(*grid.dimensions, d), + grid=grid, shape=(*grid.shape, 5)) + w = Function(name="w", grid=grid) + + u.data.fill(1) + v.data.fill(2) + + summation = Inc(w, u*v) + + op = Operator([summation]) + + assert 'reduction' not in str(op) + assert 'collapse(2)' in str(op) + assert 'parallel' in str(op) + + op() + assert np.all(w.data == 10) + class Bundle(SubDomain): """ diff --git a/tests/test_gpu_openacc.py b/tests/test_gpu_openacc.py index 526ddca59d..3085ad85c9 100644 --- a/tests/test_gpu_openacc.py +++ b/tests/test_gpu_openacc.py @@ -2,7 +2,7 @@ import numpy as np from devito import (Grid, Function, TimeFunction, SparseTimeFunction, Eq, Operator, - norm, solve, Dimension, Inc) + norm, solve) from conftest import skipif, assert_blocking, opts_device_tiling from devito.data import LEFT from devito.exceptions import InvalidOperator @@ -168,28 +168,6 @@ def test_multi_tile_blocking_structure(self): assert len(iters) == len(v) assert all(i.step == j for i, j in zip(iters, v)) - def test_incr_perfect_outer(self): - grid = Grid((5, 5)) - d = Dimension(name="d") - u = Function(name="u", dimensions=(*grid.dimensions, d), - grid=grid, shape=(*grid.shape, 5), ) - v = Function(name="v", dimensions=(*grid.dimensions, d), - grid=grid, shape=(*grid.shape, 5)) - u.data.fill(1) - v.data.fill(2) - - w = Function(name="w", grid=grid) - - summation = Inc(w, u*v) - - op0 = Operator([summation]) - assert 'reduction' not in str(op0) - assert 'collapse(2)' in str(op0) - assert 'acc parallel loop' in str(op0) - - op0() - assert np.all(w.data == 10) - class TestOperator(object): diff --git a/tests/test_gpu_openmp.py b/tests/test_gpu_openmp.py index 38157f0962..29866508d8 100644 --- a/tests/test_gpu_openmp.py +++ b/tests/test_gpu_openmp.py @@ -268,28 +268,6 @@ def test_timeparallel_reduction(self): ('omp target teams distribute parallel for collapse(3)' ' reduction(+:f[0])') - def test_incr_perfect_outer(self): - grid = Grid((5, 5)) - d = Dimension(name="d") - u = Function(name="u", dimensions=(*grid.dimensions, d), - grid=grid, shape=(*grid.shape, 5), ) - v = Function(name="v", dimensions=(*grid.dimensions, d), - grid=grid, shape=(*grid.shape, 5)) - u.data.fill(1) - v.data.fill(2) - - w = Function(name="w", grid=grid) - - summation = Inc(w, u*v) - - op0 = Operator([summation]) - assert 'reduction' not in str(op0) - assert 'collapse(2)' in str(op0) - assert 'omp target teams distribute parallel' in str(op0) - - op0() - assert np.all(w.data == 10) - class TestOperator(object): From a2faf422ffa1956ec56dd60b934b665edc2e67d4 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Tue, 10 Oct 2023 08:07:33 +0000 Subject: [PATCH 4/6] compiler: Revamp Parizer scoring function --- devito/passes/iet/langbase.py | 3 -- devito/passes/iet/parpragma.py | 74 ++++++++++++++++++++++------------ tests/test_dle.py | 18 +++++---- 3 files changed, 59 insertions(+), 36 deletions(-) diff --git a/devito/passes/iet/langbase.py b/devito/passes/iet/langbase.py index 2acccba648..91e68fc02b 100644 --- a/devito/passes/iet/langbase.py +++ b/devito/passes/iet/langbase.py @@ -214,9 +214,6 @@ def DeviceIteration(self): def Prodder(self): return self.lang.Prodder - def _device_pointers(self, *args, **kwargs): - return {} - class DeviceAwareMixin(object): diff --git a/devito/passes/iet/parpragma.py b/devito/passes/iet/parpragma.py index a4513ef31f..1deb4f3f8b 100644 --- a/devito/passes/iet/parpragma.py +++ b/devito/passes/iet/parpragma.py @@ -1,3 +1,5 @@ +from itertools import takewhile + import numpy as np import cgen as c from cached_property import cached_property @@ -254,6 +256,36 @@ def nthreads_nonaffine(self): def threadid(self): return self.sregistry.threadid + def _score_candidate(self, n0, root, collapsable=()): + """ + The score of a collapsable nest depends on the number of fully-parallel + Iterations and their position in the nest (the outer, the better). + """ + nest = [root] + list(collapsable) + n = len(nest) + + # Number of fully-parallel collapsable Iterations + key = lambda i: i.is_ParallelNoAtomic + fpiters = list(takewhile(key, nest)) + nfpiters = len(fpiters) + + # Prioritize the Dimensions that are more likely to define larger + # iteration spaces + fpdims = [i.dim for i in fpiters] + key = lambda d: (not d.is_Derived or + d.is_Custom or # NOTE: might use a refinement + (d.is_Block and d._depth == 1)) + nfpiters_large = len([d for d in fpdims if key(d)]) + + return ( + int(nfpiters == n), # Fully-parallel nest + int(nfpiters == 0 and n), # Fully-atomic nest + nfpiters_large, + -(n0 + 1), # The outer, the better + nfpiters, + n, + ) + def _select_candidates(self, candidates): assert candidates @@ -263,15 +295,18 @@ def _select_candidates(self, candidates): mapper = {} for n0, root in enumerate(candidates): + # Score `root` in isolation + mapper[(root, ())] = self._score_candidate(n0, root) + collapsable = [] for n, i in enumerate(candidates[n0+1:], n0+1): # The Iteration nest [root, ..., i] must be perfect if not IsPerfectIteration(depth=i).visit(root): break - # Loops are collapsable only if none of the iteration variables appear - # in initializer expressions. For example, the following two loops - # cannot be collapsed + # Loops are collapsable only if none of the iteration variables + # appear in initializer expressions. For example, the following + # two loops cannot be collapsed # # for (i = ... ) # for (j = i ...) @@ -281,7 +316,7 @@ def _select_candidates(self, candidates): if any(j.dim in i.symbolic_min.free_symbols for j in candidates[n0:n]): break - # Also, we do not want to collapse SIMD-vectorized Iterations + # Can't collapse SIMD-vectorized Iterations if i.is_Vectorized: break @@ -297,17 +332,9 @@ def _select_candidates(self, candidates): collapsable.append(i) - # Give a score to this candidate, based on the number of fully-parallel - # Iterations and their position (i.e. outermost to innermost) in the nest - score = ( - int(root.is_ParallelNoAtomic), - len(self._device_pointers(root)), # Outermost offloadable - int(len([i for i in collapsable if i.is_ParallelNoAtomic]) >= 1), - int(len([i for i in collapsable if i.is_ParallelRelaxed]) >= 1), - -(n0 + 1) # The outermost, the better - ) - - mapper[(root, tuple(collapsable))] = score + # Score `root + collapsable` + v = tuple(collapsable) + mapper[(root, v)] = self._score_candidate(n0, root, v) # Retrieve the candidates with highest score root, collapsable = max(mapper, key=mapper.get) @@ -318,16 +345,6 @@ def _make_reductions(self, partree): if not any(i.is_ParallelAtomic for i in partree.collapsed): return partree - # We bypass the corner case where a reduction might not be optimal, mainly: - # - Only the most inner loop is atomic - # In which case we can parallelize the perfect nest - # The opposite corner case (most outer loop atomic) - # should be detected before this pass - nc = len(partree.collapsed) - if nc > 1 and all(i.is_ParallelNoAtomic for i in partree.collapsed[:nc-1]): - mapper = {partree.root: partree.root._rebuild(ncollapsed=nc-1)} - return Transformer(mapper).visit(partree) - exprs = [i for i in FindNodes(Expression).visit(partree) if i.is_reduction] reductions = [] @@ -586,6 +603,13 @@ def __init__(self, sregistry, options, platform, compiler): self.par_tile = UnboundTuple(options['par-tile']) self.par_disabled = options['par-disabled'] + def _score_candidate(self, n0, root, collapsable=()): + # `ndptrs`, the number of device pointers, part of the score too to + # ensure the outermost loop is offloaded + ndptrs = len(self._device_pointers(root)) + + return (ndptrs,) + super()._score_candidate(n0, root, collapsable) + def _make_threaded_prodders(self, partree): if isinstance(partree.root, self.DeviceIteration): # no-op for now diff --git a/tests/test_dle.py b/tests/test_dle.py index e7935dd45d..3a94f46a9d 100644 --- a/tests/test_dle.py +++ b/tests/test_dle.py @@ -863,8 +863,9 @@ def test_incs_no_atomic(self): op0 = Operator(Inc(uf, 1), opt=('advanced', {'openmp': True, 'par-collapse-ncores': 1, 'par-collapse-work': 0})) - assert 'collapse(3)' in str(op0) - assert 'atomic' in str(op0) + assert 'omp for schedule' in str(op0) + assert 'collapse' not in str(op0) + assert 'atomic' not in str(op0) # Now only `x` is parallelized op1 = Operator([Eq(v[t, x, 0, 0], v[t, x, 0, 0] + 1), Inc(uf, 1)], @@ -877,22 +878,23 @@ def test_incs_no_atomic(self): def test_incr_perfect_outer(self): grid = Grid((5, 5)) d = Dimension(name="d") + u = Function(name="u", dimensions=(*grid.dimensions, d), grid=grid, shape=(*grid.shape, 5), ) v = Function(name="v", dimensions=(*grid.dimensions, d), grid=grid, shape=(*grid.shape, 5)) + w = Function(name="w", grid=grid) + u.data.fill(1) v.data.fill(2) - w = Function(name="w", grid=grid) - summation = Inc(w, u*v) - op0 = Operator([summation], opt=('advanced', {'openmp': True})) - assert 'reduction' not in str(op0) - assert 'omp for' in str(op0) + op = Operator([summation], opt=('advanced', {'openmp': True})) + assert 'reduction' not in str(op) + assert 'omp for' in str(op) - op0() + op() assert np.all(w.data == 10) @pytest.mark.parametrize('exprs,simd_level,expected', [ From 1b6cd885eb6d13a13876a8b5236316605377f2d0 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Tue, 10 Oct 2023 14:29:57 +0000 Subject: [PATCH 5/6] compiler: Prioritize large sparse loops over tiny ones --- devito/passes/iet/parpragma.py | 28 +++++++++++++++++++--------- tests/test_dle.py | 19 ++++++++++++++++++- 2 files changed, 37 insertions(+), 10 deletions(-) diff --git a/devito/passes/iet/parpragma.py b/devito/passes/iet/parpragma.py index 1deb4f3f8b..6bfe2a4c42 100644 --- a/devito/passes/iet/parpragma.py +++ b/devito/passes/iet/parpragma.py @@ -266,23 +266,33 @@ def _score_candidate(self, n0, root, collapsable=()): # Number of fully-parallel collapsable Iterations key = lambda i: i.is_ParallelNoAtomic - fpiters = list(takewhile(key, nest)) - nfpiters = len(fpiters) + fp_iters = list(takewhile(key, nest)) + n_fp_iters = len(fp_iters) + + # Number of parallel-if-atomic collapsable Iterations + key = lambda i: i.is_ParallelAtomic + pia_iters = list(takewhile(key, nest)) + n_pia_iters = len(pia_iters) # Prioritize the Dimensions that are more likely to define larger # iteration spaces - fpdims = [i.dim for i in fpiters] key = lambda d: (not d.is_Derived or - d.is_Custom or # NOTE: might use a refinement + (d.is_Custom and not is_integer(d.symbolic_size)) or (d.is_Block and d._depth == 1)) - nfpiters_large = len([d for d in fpdims if key(d)]) + + fpdims = [i.dim for i in fp_iters] + n_fp_iters_large = len([d for d in fpdims if key(d)]) + + piadims = [i.dim for i in pia_iters] + n_pia_iters_large = len([d for d in piadims if key(d)]) return ( - int(nfpiters == n), # Fully-parallel nest - int(nfpiters == 0 and n), # Fully-atomic nest - nfpiters_large, + int(n_fp_iters == n), # Fully-parallel nest + n_fp_iters_large, + n_pia_iters_large, + n_pia_iters, -(n0 + 1), # The outer, the better - nfpiters, + n_fp_iters, n, ) diff --git a/tests/test_dle.py b/tests/test_dle.py index 3a94f46a9d..520405f839 100644 --- a/tests/test_dle.py +++ b/tests/test_dle.py @@ -284,7 +284,7 @@ def test_cache_blocking_structure_optrelax_prec_inject(): 'openmp': True, 'par-collapse-ncores': 1})) - assert_structure(op, ['t,p_s0_blk0,p_s', 't,p_s0_blk0,p_s,rsx,rsy'], + assert_structure(op, ['t', 't,p_s0_blk0,p_s,rsx,rsy'], 't,p_s0_blk0,p_s,rsx,rsy') @@ -897,6 +897,23 @@ def test_incr_perfect_outer(self): op() assert np.all(w.data == 10) + def test_incr_perfect_sparse_outer(self): + grid = Grid(shape=(3, 3, 3)) + + u = TimeFunction(name='u', grid=grid) + s = SparseTimeFunction(name='u', grid=grid, npoint=1, nt=11) + + eqns = s.inject(u, expr=s) + + op = Operator(eqns, opt=('advanced', {'par-collapse-ncores': 0})) + + iters = FindNodes(Iteration).visit(op) + assert len(iters) == 5 + assert iters[0].is_Sequential + assert all(i.is_ParallelAtomic for i in iters[1:]) + assert iters[1].pragmas[0].value == 'omp for schedule(dynamic,chunk_size)' + assert all(not i.pragmas for i in iters[2:]) + @pytest.mark.parametrize('exprs,simd_level,expected', [ (['Eq(y.symbolic_max, g[0, x], implicit_dims=(t, x))', 'Inc(h1[0, 0], 1, implicit_dims=(t, x, y))'], From daf671c3d08a63d0a4b8e65fb1e12d2fa8c349c2 Mon Sep 17 00:00:00 2001 From: mloubout Date: Tue, 10 Oct 2023 10:53:58 -0400 Subject: [PATCH 6/6] compiler: prioritize smaller perfect loop over atomics --- devito/passes/iet/parpragma.py | 2 +- tests/test_dle.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/devito/passes/iet/parpragma.py b/devito/passes/iet/parpragma.py index 6bfe2a4c42..29ba9a986b 100644 --- a/devito/passes/iet/parpragma.py +++ b/devito/passes/iet/parpragma.py @@ -289,10 +289,10 @@ def _score_candidate(self, n0, root, collapsable=()): return ( int(n_fp_iters == n), # Fully-parallel nest n_fp_iters_large, + n_fp_iters, n_pia_iters_large, n_pia_iters, -(n0 + 1), # The outer, the better - n_fp_iters, n, ) diff --git a/tests/test_dle.py b/tests/test_dle.py index 520405f839..03c0b533c9 100644 --- a/tests/test_dle.py +++ b/tests/test_dle.py @@ -905,7 +905,8 @@ def test_incr_perfect_sparse_outer(self): eqns = s.inject(u, expr=s) - op = Operator(eqns, opt=('advanced', {'par-collapse-ncores': 0})) + op = Operator(eqns, opt=('advanced', {'par-collapse-ncores': 0, + 'openmp': True})) iters = FindNodes(Iteration).visit(op) assert len(iters) == 5