Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

compiler: prevent reduction clause for perfect-enough outer loops #2226

Merged
merged 6 commits into from
Oct 10, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 0 additions & 3 deletions devito/passes/iet/langbase.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,9 +214,6 @@ def DeviceIteration(self):
def Prodder(self):
return self.lang.Prodder

def _device_pointers(self, *args, **kwargs):
return {}


class DeviceAwareMixin(object):

Expand Down
74 changes: 59 additions & 15 deletions devito/passes/iet/parpragma.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from itertools import takewhile

import numpy as np
import cgen as c
from cached_property import cached_property
Expand Down Expand Up @@ -254,6 +256,46 @@ def nthreads_nonaffine(self):
def threadid(self):
return self.sregistry.threadid

def _score_candidate(self, n0, root, collapsable=()):
"""
The score of a collapsable nest depends on the number of fully-parallel
Iterations and their position in the nest (the outer, the better).
"""
nest = [root] + list(collapsable)
n = len(nest)

# Number of fully-parallel collapsable Iterations
key = lambda i: i.is_ParallelNoAtomic
fp_iters = list(takewhile(key, nest))
n_fp_iters = len(fp_iters)

# Number of parallel-if-atomic collapsable Iterations
key = lambda i: i.is_ParallelAtomic
pia_iters = list(takewhile(key, nest))
n_pia_iters = len(pia_iters)

# Prioritize the Dimensions that are more likely to define larger
# iteration spaces
key = lambda d: (not d.is_Derived or
(d.is_Custom and not is_integer(d.symbolic_size)) or
(d.is_Block and d._depth == 1))

fpdims = [i.dim for i in fp_iters]
n_fp_iters_large = len([d for d in fpdims if key(d)])

piadims = [i.dim for i in pia_iters]
n_pia_iters_large = len([d for d in piadims if key(d)])

return (
int(n_fp_iters == n), # Fully-parallel nest
n_fp_iters_large,
n_fp_iters,
n_pia_iters_large,
n_pia_iters,
-(n0 + 1), # The outer, the better
n,
)

def _select_candidates(self, candidates):
assert candidates

Expand All @@ -263,15 +305,18 @@ def _select_candidates(self, candidates):
mapper = {}
for n0, root in enumerate(candidates):

# Score `root` in isolation
mapper[(root, ())] = self._score_candidate(n0, root)

collapsable = []
for n, i in enumerate(candidates[n0+1:], n0+1):
# The Iteration nest [root, ..., i] must be perfect
if not IsPerfectIteration(depth=i).visit(root):
break

# Loops are collapsable only if none of the iteration variables appear
# in initializer expressions. For example, the following two loops
# cannot be collapsed
# Loops are collapsable only if none of the iteration variables
# appear in initializer expressions. For example, the following
# two loops cannot be collapsed
#
# for (i = ... )
# for (j = i ...)
Expand All @@ -281,7 +326,7 @@ def _select_candidates(self, candidates):
if any(j.dim in i.symbolic_min.free_symbols for j in candidates[n0:n]):
break

# Also, we do not want to collapse SIMD-vectorized Iterations
# Can't collapse SIMD-vectorized Iterations
if i.is_Vectorized:
break

Expand All @@ -297,17 +342,9 @@ def _select_candidates(self, candidates):

collapsable.append(i)

# Give a score to this candidate, based on the number of fully-parallel
# Iterations and their position (i.e. outermost to innermost) in the nest
score = (
int(root.is_ParallelNoAtomic),
len(self._device_pointers(root)), # Outermost offloadable
int(len([i for i in collapsable if i.is_ParallelNoAtomic]) >= 1),
int(len([i for i in collapsable if i.is_ParallelRelaxed]) >= 1),
-(n0 + 1) # The outermost, the better
)

mapper[(root, tuple(collapsable))] = score
# Score `root + collapsable`
v = tuple(collapsable)
mapper[(root, v)] = self._score_candidate(n0, root, v)

# Retrieve the candidates with highest score
root, collapsable = max(mapper, key=mapper.get)
Expand Down Expand Up @@ -576,6 +613,13 @@ def __init__(self, sregistry, options, platform, compiler):
self.par_tile = UnboundTuple(options['par-tile'])
self.par_disabled = options['par-disabled']

def _score_candidate(self, n0, root, collapsable=()):
# `ndptrs`, the number of device pointers, part of the score too to
# ensure the outermost loop is offloaded
ndptrs = len(self._device_pointers(root))

return (ndptrs,) + super()._score_candidate(n0, root, collapsable)

def _make_threaded_prodders(self, partree):
if isinstance(partree.root, self.DeviceIteration):
# no-op for now
Expand Down
8 changes: 4 additions & 4 deletions tests/test_adjoint.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import numpy as np
import pytest

from devito import Operator, norm, Function, Grid, SparseFunction
from devito import Operator, norm, Function, Grid, SparseFunction, inner
from devito.logger import info
from examples.seismic import demo_model, Receiver
from examples.seismic.acoustic import acoustic_setup
Expand Down Expand Up @@ -114,7 +114,7 @@ def test_adjoint_F(self, mkey, shape, kernel, space_order, time_order, setup_fun
solver.adjoint(rec=rec, srca=srca)

# Adjoint test: Verify <Ax,y> matches <x, A^Ty> closely
term1 = np.dot(srca.data.reshape(-1), solver.geometry.src.data)
term1 = inner(srca, solver.geometry.src)
term2 = norm(rec) ** 2
info('<x, A^Ty>: %f, <Ax,y>: %f, difference: %4.4e, ratio: %f'
% (term1, term2, (term1 - term2)/term1, term1 / term2))
Expand Down Expand Up @@ -231,6 +231,6 @@ def test_adjoint_inject_interpolate(self, shape, coords, npoints=19):
# y => p
# x => c
# P^T y => a
term1 = np.dot(p2.data.reshape(-1), p.data.reshape(-1))
term2 = np.dot(c.data.reshape(-1), a.data.reshape(-1))
term1 = inner(p2, p)
term2 = inner(c, a)
assert np.isclose((term1-term2) / term1, 0., atol=1.e-6)
48 changes: 44 additions & 4 deletions tests/test_dle.py
Original file line number Diff line number Diff line change
Expand Up @@ -284,7 +284,7 @@ def test_cache_blocking_structure_optrelax_prec_inject():
'openmp': True,
'par-collapse-ncores': 1}))

assert_structure(op, ['t,p_s0_blk0,p_s', 't,p_s0_blk0,p_s,rsx,rsy'],
assert_structure(op, ['t', 't,p_s0_blk0,p_s,rsx,rsy'],
't,p_s0_blk0,p_s,rsx,rsy')


Expand Down Expand Up @@ -863,9 +863,9 @@ def test_incs_no_atomic(self):
op0 = Operator(Inc(uf, 1), opt=('advanced', {'openmp': True,
'par-collapse-ncores': 1,
'par-collapse-work': 0}))

assert 'collapse(3)' in str(op0)
assert 'atomic' in str(op0)
assert 'omp for schedule' in str(op0)
assert 'collapse' not in str(op0)
assert 'atomic' not in str(op0)

# Now only `x` is parallelized
op1 = Operator([Eq(v[t, x, 0, 0], v[t, x, 0, 0] + 1), Inc(uf, 1)],
Expand All @@ -875,6 +875,46 @@ def test_incs_no_atomic(self):
assert 'collapse' not in str(op1)
assert 'atomic' not in str(op1)

def test_incr_perfect_outer(self):
grid = Grid((5, 5))
d = Dimension(name="d")

u = Function(name="u", dimensions=(*grid.dimensions, d),
grid=grid, shape=(*grid.shape, 5), )
v = Function(name="v", dimensions=(*grid.dimensions, d),
grid=grid, shape=(*grid.shape, 5))
w = Function(name="w", grid=grid)

u.data.fill(1)
v.data.fill(2)

summation = Inc(w, u*v)

op = Operator([summation], opt=('advanced', {'openmp': True}))
assert 'reduction' not in str(op)
assert 'omp for' in str(op)

op()
assert np.all(w.data == 10)

def test_incr_perfect_sparse_outer(self):
grid = Grid(shape=(3, 3, 3))

u = TimeFunction(name='u', grid=grid)
s = SparseTimeFunction(name='u', grid=grid, npoint=1, nt=11)

eqns = s.inject(u, expr=s)

op = Operator(eqns, opt=('advanced', {'par-collapse-ncores': 0,
'openmp': True}))

iters = FindNodes(Iteration).visit(op)
assert len(iters) == 5
assert iters[0].is_Sequential
assert all(i.is_ParallelAtomic for i in iters[1:])
assert iters[1].pragmas[0].value == 'omp for schedule(dynamic,chunk_size)'
assert all(not i.pragmas for i in iters[2:])

@pytest.mark.parametrize('exprs,simd_level,expected', [
(['Eq(y.symbolic_max, g[0, x], implicit_dims=(t, x))',
'Inc(h1[0, 0], 1, implicit_dims=(t, x, y))'],
Expand Down
30 changes: 27 additions & 3 deletions tests/test_gpu_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,9 @@

from conftest import assert_structure
from devito import (Constant, Eq, Inc, Grid, Function, ConditionalDimension,
MatrixSparseTimeFunction, SparseTimeFunction, SubDimension,
SubDomain, SubDomainSet, TimeFunction, Operator, configuration,
switchconfig)
Dimension, MatrixSparseTimeFunction, SparseTimeFunction,
SubDimension, SubDomain, SubDomainSet, TimeFunction,
Operator, configuration, switchconfig)
from devito.arch import get_gpu_info
from devito.exceptions import InvalidArgument
from devito.ir import (Conditional, Expression, Section, FindNodes, FindSymbols,
Expand Down Expand Up @@ -110,6 +110,30 @@ def test_fission(self):
assert np.all(usave.data[5:] == expected[5:])
assert np.all(vsave.data[:5] == expected[:5])

def test_incr_perfect_outer(self):
grid = Grid((5, 5))
d = Dimension(name="d")

u = Function(name="u", dimensions=(*grid.dimensions, d),
grid=grid, shape=(*grid.shape, 5), )
v = Function(name="v", dimensions=(*grid.dimensions, d),
grid=grid, shape=(*grid.shape, 5))
w = Function(name="w", grid=grid)

u.data.fill(1)
v.data.fill(2)

summation = Inc(w, u*v)

op = Operator([summation])

assert 'reduction' not in str(op)
assert 'collapse(2)' in str(op)
assert 'parallel' in str(op)

op()
assert np.all(w.data == 10)


class Bundle(SubDomain):
"""
Expand Down