From 76a3c4b30512fe67ad265fae32efa0faa95e1a82 Mon Sep 17 00:00:00 2001 From: mloubout Date: Mon, 11 Sep 2023 09:35:15 -0400 Subject: [PATCH 1/7] api: always use conditional dimension for interpolation radius dim --- devito/core/autotuning.py | 7 +++++-- devito/operations/interpolators.py | 28 +++++++++++++++---------- devito/operator/operator.py | 5 +++-- devito/tools/data_structures.py | 9 ++++++++ devito/tools/utils.py | 9 +++++++- devito/types/basic.py | 4 +++- devito/types/dense.py | 2 +- devito/types/dimension.py | 33 +++++++++++++++++++----------- tests/test_operator.py | 6 ++++-- 9 files changed, 71 insertions(+), 32 deletions(-) diff --git a/devito/core/autotuning.py b/devito/core/autotuning.py index 603a78efd6..16b146721a 100644 --- a/devito/core/autotuning.py +++ b/devito/core/autotuning.py @@ -209,8 +209,11 @@ def init_time_bounds(stepper, at_args, args): else: at_args[dim.max_name] = at_args[dim.min_name] + options['squeezer'] if dim.size_name in args: - # May need to shrink to avoid OOB accesses - at_args[dim.max_name] = min(at_args[dim.max_name], args[dim.max_name]) + if isinstance(args[dim.size_name], range): + pass + else: + # May need to shrink to avoid OOB accesses + at_args[dim.max_name] = min(at_args[dim.max_name], args[dim.max_name]) if at_args[dim.min_name] > at_args[dim.max_name]: warning("too few time iterations; skipping") return False diff --git a/devito/operations/interpolators.py b/devito/operations/interpolators.py index ab758ba982..92bc392392 100644 --- a/devito/operations/interpolators.py +++ b/devito/operations/interpolators.py @@ -155,7 +155,19 @@ def _rdim(self): -self.r+1, self.r, 2*self.r, parent) for d in self._gdims] - return DimensionTuple(*dims, getters=self._gdims) + # Make radius dimension conditional to avoid OOB + rdims = [] + pos = self.sfunction._position_map.values() + + for (d, rd, p) in zip(self._gdims, dims, pos): + # Add conditional to avoid OOB + lb = sympy.And(rd + p >= d.symbolic_min - self.r, evaluate=False) + 
ub = sympy.And(rd + p <= d.symbolic_max + self.r, evaluate=False) + cond = sympy.And(lb, ub, evaluate=False) + rdims.append(ConditionalDimension(rd.name, rd, condition=cond, + indirect=True)) + + return DimensionTuple(*rdims, getters=self._gdims) def _augment_implicit_dims(self, implicit_dims): if self.sfunction._sparse_position == -1: @@ -177,13 +189,6 @@ def _interp_idx(self, variables, implicit_dims=None): mapper = {} pos = self.sfunction._position_map.values() - for ((di, d), rd, p) in zip(enumerate(self._gdims), self._rdim, pos): - # Add conditional to avoid OOB - lb = sympy.And(rd + p >= d.symbolic_min - self.r, evaluate=False) - ub = sympy.And(rd + p <= d.symbolic_max + self.r, evaluate=False) - cond = sympy.And(lb, ub, evaluate=False) - mapper[d] = ConditionalDimension(rd.name, rd, condition=cond, indirect=True) - # Temporaries for the position temps = self._positions(implicit_dims) @@ -191,10 +196,10 @@ def _interp_idx(self, variables, implicit_dims=None): temps.extend(self._coeff_temps(implicit_dims)) # Substitution mapper for variables + mapper = self._rdim._getters idx_subs = {v: v.subs({k: c + p for ((k, c), p) in zip(mapper.items(), pos)}) for v in variables} - idx_subs.update(dict(zip(self._rdim, mapper.values()))) return idx_subs, temps @@ -290,7 +295,7 @@ def _inject(self, field, expr, implicit_dims=None): injection expression, but that should be honored when constructing the operator. 
""" - implicit_dims = self._augment_implicit_dims(implicit_dims) + self._rdim + implicit_dims = self._augment_implicit_dims(implicit_dims) # Make iterable to support inject((u, v), expr=expr) # or inject((u, v), expr=(expr1, expr2)) @@ -380,5 +385,6 @@ def interpolation_coeffs(self): @property def _weights(self): ddim, cdim = self.interpolation_coeffs.dimensions[1:] - return Mul(*[self.interpolation_coeffs.subs({ddim: ri, cdim: rd-rd.symbolic_min}) + return Mul(*[self.interpolation_coeffs.subs({ddim: ri, + cdim: rd-rd.parent.symbolic_min}) for (ri, rd) in enumerate(self._rdim)]) diff --git a/devito/operator/operator.py b/devito/operator/operator.py index d1bee9fa66..b91d51727f 100644 --- a/devito/operator/operator.py +++ b/devito/operator/operator.py @@ -24,7 +24,7 @@ from devito.symbolics import estimate_cost from devito.tools import (DAG, OrderedSet, Signer, ReducerMap, as_tuple, flatten, filter_sorted, frozendict, is_integer, split, timed_pass, - timed_region) + timed_region, contains_val) from devito.types import Grid, Evaluable __all__ = ['Operator'] @@ -526,6 +526,7 @@ def _prepare_arguments(self, autotune=None, **kwargs): edges = [(i, i.parent) for i in self.dimensions if i.is_Derived and i.parent in set(nodes)] toposort = DAG(nodes, edges).topological_sort() + futures = {} for d in reversed(toposort): if set(d._arg_names).intersection(kwargs): @@ -560,7 +561,7 @@ def _prepare_arguments(self, autotune=None, **kwargs): # a TimeFunction `usave(t_sub, x, y)`, an override for `fact` is # supplied w/o overriding `usave`; that's legal pass - elif is_integer(args[k]) and args[k] not in as_tuple(v): + elif is_integer(args[k]) and not contains_val(args[k], v): raise ValueError("Default `%s` is incompatible with other args as " "`%s=%s`, while `%s=%s` is expected. Perhaps you " "forgot to override `%s`?" 
% diff --git a/devito/tools/data_structures.py b/devito/tools/data_structures.py index 539f75d593..3afe7197eb 100644 --- a/devito/tools/data_structures.py +++ b/devito/tools/data_structures.py @@ -110,6 +110,7 @@ def unique(self, key): Key for which to retrieve a unique value. """ candidates = self.getall(key) + candidates = [c for c in candidates if c is not None] def compare_to_first(v): first = candidates[0] @@ -122,12 +123,20 @@ def compare_to_first(v): return first in v elif isinstance(first, Set): return v in first + elif isinstance(v, range): + if isinstance(first, range): + return first.stop > v.start or v.stop > first.start + else: + return first >= v.start and first < v.stop + elif isinstance(first, range): + return v >= first.start and v < first.stop else: return first == v if len(candidates) == 1: return candidates[0] elif all(map(compare_to_first, candidates)): + # return first non-range return candidates[0] else: raise ValueError("Unable to find unique value for key %s, candidates: %s" diff --git a/devito/tools/utils.py b/devito/tools/utils.py index 16f0987930..d99bf34b25 100644 --- a/devito/tools/utils.py +++ b/devito/tools/utils.py @@ -12,7 +12,7 @@ 'roundm', 'powerset', 'invert', 'flatten', 'single_or', 'filter_ordered', 'as_mapper', 'filter_sorted', 'pprint', 'sweep', 'all_equal', 'as_list', 'indices_to_slices', 'indices_to_sections', 'transitive_closure', - 'humanbytes'] + 'humanbytes', 'contains_val'] def prod(iterable, initial=1): @@ -75,6 +75,13 @@ def is_integer(value): return isinstance(value, (int, np.integer, sympy.Integer)) +def contains_val(val, items): + try: + return val in items + except TypeError: + return val == items + + def generator(): """ Return a function ``f`` that generates integer numbers starting at 0 diff --git a/devito/types/basic.py b/devito/types/basic.py index c36496d195..400232faed 100644 --- a/devito/types/basic.py +++ b/devito/types/basic.py @@ -838,13 +838,15 @@ def __new__(cls, *args, **kwargs): newobj._dimensions 
= dimensions newobj._shape = cls.__shape_setup__(**kwargs) newobj._dtype = cls.__dtype_setup__(**kwargs) - newobj.__init_finalize__(*args, **kwargs) # All objects created off an existing AbstractFunction `f` (e.g., # via .func, or .subs, such as `f(x + 1)`) keep a reference to `f` # through the `function` field newobj.function = function or newobj + # Initialization + newobj.__init_finalize__(*args, **kwargs) + return newobj def __init__(self, *args, **kwargs): diff --git a/devito/types/dense.py b/devito/types/dense.py index ec371b662c..2a13fa91af 100644 --- a/devito/types/dense.py +++ b/devito/types/dense.py @@ -94,7 +94,7 @@ def __init_finalize__(self, *args, function=None, **kwargs): # a reference to the user-provided buffer self._initializer = None if len(initializer) > 0: - self.data_with_halo[:] = initializer + self.data_with_halo[:] = initializer[:] else: # This is a corner case -- we might get here, for example, when # running with MPI and some processes get 0-size arrays after diff --git a/devito/types/dimension.py b/devito/types/dimension.py index 76d9d9e60a..6044f01469 100644 --- a/devito/types/dimension.py +++ b/devito/types/dimension.py @@ -298,13 +298,19 @@ def _arg_values(self, interval, grid=None, args=None, **kwargs): # may represent sets of legal values. If that's the case, here we just # pick one. 
Note that we sort for determinism try: - loc_minv = sorted(loc_minv).pop(0) - except TypeError: - pass + loc_minv = loc_minv.start + except AttributeError: + try: + loc_minv = sorted(loc_minv).pop(0) + except TypeError: + pass try: - loc_maxv = sorted(loc_maxv).pop(0) - except TypeError: - pass + loc_maxv = loc_maxv.start + except AttributeError: + try: + loc_maxv = sorted(loc_maxv).pop(0) + except TypeError: + pass return {self.min_name: loc_minv, self.max_name: loc_maxv} @@ -853,8 +859,7 @@ def _arg_defaults(self, _min=None, size=None, alias=None): factor = defaults[dim._factor.name] = dim._factor.data except AttributeError: factor = dim._factor - defaults[dim.parent.max_name] = \ - frozenset(range(factor*(size - 1), factor*(size))) + defaults[dim.parent.max_name] = range(1, factor*(size)) return defaults @@ -977,8 +982,9 @@ def symbolic_incr(self): def bound_symbols(self): return set(self.parent.bound_symbols) - def _arg_defaults(self, **kwargs): - return {} + def _arg_defaults(self, alias=None, **kwargs): + dim = alias or self + return {dim.parent.size_name: range(self.symbolic_size, np.iinfo(np.int64).max)} def _arg_values(self, *args, **kwargs): return {} @@ -1446,7 +1452,7 @@ def symbolic_max(self): def _arg_names(self): return (self.min_name, self.max_name, self.name) + self.parent._arg_names - def _arg_defaults(self, _min=None, **kwargs): + def _arg_defaults(self, _min=None, size=None, **kwargs): """ A map of default argument values defined by this dimension. @@ -1460,7 +1466,10 @@ def _arg_defaults(self, _min=None, **kwargs): A SteppingDimension does not know its max point and therefore does not have a size argument. 
""" - return {self.parent.min_name: _min} + args = {self.parent.min_name: _min} + if size: + args[self.parent.size_name] = range(size-1, np.iinfo(np.int32).max) + return args def _arg_values(self, *args, **kwargs): """ diff --git a/tests/test_operator.py b/tests/test_operator.py index f38ac01942..4f8228bc24 100644 --- a/tests/test_operator.py +++ b/tests/test_operator.py @@ -707,6 +707,8 @@ def verify_arguments(self, arguments, expected): if isinstance(v, (Function, SparseFunction)): condition = v._C_as_ndarray(arguments[name])[v._mask_domain] == v.data condition = condition.all() + elif isinstance(arguments[name], range): + condition = arguments[name].start <= v < arguments[name].stop else: condition = arguments[name] == v @@ -1803,7 +1805,7 @@ def test_scheduling_sparse_functions(self): # `trees` than 6 op = Operator([eqn1] + eqn2 + [eqn3] + eqn4, opt=('noop', {'openmp': False})) trees = retrieve_iteration_tree(op) - assert len(trees) == 5 + assert len(trees) == 6 # Time loop not shared due to the WAR assert trees[0][0].dim is time and trees[0][0] is trees[1][0] # this IS shared assert trees[1][0] is not trees[3][0] @@ -1813,7 +1815,7 @@ def test_scheduling_sparse_functions(self): eqn2 = sf1.inject(u1.forward, expr=sf1) op = Operator([eqn1] + eqn2 + [eqn3] + eqn4, opt=('noop', {'openmp': False})) trees = retrieve_iteration_tree(op) - assert len(trees) == 5 + assert len(trees) == 6 assert all(trees[0][0] is i[0] for i in trees) def test_scheduling_with_free_dims(self): From fa1a9b7eb772676f67fe452f6afa4d4451fbadc4 Mon Sep 17 00:00:00 2001 From: mloubout Date: Fri, 15 Sep 2023 09:19:22 -0400 Subject: [PATCH 2/7] api: process injected expression dimensions in case it's not the sparse function --- devito/builtins/initializers.py | 7 ++++++- devito/core/autotuning.py | 4 +--- devito/operations/interpolators.py | 30 ++++++++++++++++++++---------- devito/operator/operator.py | 8 ++++++++ devito/passes/iet/langbase.py | 3 +++ devito/passes/iet/parpragma.py | 1 + 
devito/tools/data_structures.py | 3 +++ devito/types/dimension.py | 15 ++++++--------- tests/test_buffering.py | 2 +- tests/test_dimension.py | 3 +-- tests/test_dle.py | 11 ++++++++--- tests/test_dse.py | 28 +++++++++++++++------------- tests/test_interpolation.py | 26 ++++++++++++++++++++++++++ tests/test_mpi.py | 4 ++-- 14 files changed, 101 insertions(+), 44 deletions(-) diff --git a/devito/builtins/initializers.py b/devito/builtins/initializers.py index f338e194e1..83bad735fa 100644 --- a/devito/builtins/initializers.py +++ b/devito/builtins/initializers.py @@ -77,7 +77,12 @@ def assign(f, rhs=0, options=None, name='assign', assign_halo=False, **kwargs): symbolic_max=d.symbolic_max + h.right) eqs = [eq.xreplace(subs) for eq in eqs] - dv.Operator(eqs, name=name, **kwargs)() + op = dv.Operator(eqs, name=name, **kwargs) + try: + op() + except ValueError: + # Corner case such as assign(u, v) with v a Buffered TimeFunction + op(time_M=f._time_size) def smooth(f, g, axis=None): diff --git a/devito/core/autotuning.py b/devito/core/autotuning.py index 16b146721a..f30c020ef7 100644 --- a/devito/core/autotuning.py +++ b/devito/core/autotuning.py @@ -209,9 +209,7 @@ def init_time_bounds(stepper, at_args, args): else: at_args[dim.max_name] = at_args[dim.min_name] + options['squeezer'] if dim.size_name in args: - if isinstance(args[dim.size_name], range): - pass - else: + if not isinstance(args[dim.size_name], range): # May need to shrink to avoid OOB accesses at_args[dim.max_name] = min(at_args[dim.max_name], args[dim.max_name]) if at_args[dim.min_name] > at_args[dim.max_name]: diff --git a/devito/operations/interpolators.py b/devito/operations/interpolators.py index 92bc392392..dae96d8dfe 100644 --- a/devito/operations/interpolators.py +++ b/devito/operations/interpolators.py @@ -169,11 +169,17 @@ def _rdim(self): return DimensionTuple(*rdims, getters=self._gdims) - def _augment_implicit_dims(self, implicit_dims): + def _augment_implicit_dims(self, implicit_dims, 
extras=None): + if extras is not None: + extra = set([i for v in extras for i in v.dimensions]) - set(self._gdims) + extra = tuple(extra) + else: + extra = tuple() + if self.sfunction._sparse_position == -1: - return self.sfunction.dimensions + as_tuple(implicit_dims) + return self.sfunction.dimensions + as_tuple(implicit_dims) + extra else: - return as_tuple(implicit_dims) + self.sfunction.dimensions + return as_tuple(implicit_dims) + self.sfunction.dimensions + extra def _coeff_temps(self, implicit_dims): return [] @@ -252,8 +258,6 @@ def _interpolate(self, expr, increment=False, self_subs={}, implicit_dims=None): interpolation expression, but that should be honored when constructing the operator. """ - implicit_dims = self._augment_implicit_dims(implicit_dims) - # Derivatives must be evaluated before the introduction of indirect accesses try: _expr = expr.evaluate @@ -263,6 +267,9 @@ def _interpolate(self, expr, increment=False, self_subs={}, implicit_dims=None): variables = list(retrieve_function_carriers(_expr)) + # Implicit dimensions + implicit_dims = self._augment_implicit_dims(implicit_dims) + # List of indirection indices for all adjacent grid points idx_subs, temps = self._interp_idx(variables, implicit_dims=implicit_dims) @@ -295,8 +302,6 @@ def _inject(self, field, expr, implicit_dims=None): injection expression, but that should be honored when constructing the operator. 
""" - implicit_dims = self._augment_implicit_dims(implicit_dims) - # Make iterable to support inject((u, v), expr=expr) # or inject((u, v), expr=(expr1, expr2)) fields, exprs = as_tuple(field), as_tuple(expr) @@ -315,6 +320,10 @@ def _inject(self, field, expr, implicit_dims=None): _exprs = exprs variables = list(v for e in _exprs for v in retrieve_function_carriers(e)) + + # Implicit dimensions + implicit_dims = self._augment_implicit_dims(implicit_dims, variables) + variables = variables + list(fields) # List of indirection indices for all adjacent grid points @@ -385,6 +394,7 @@ def interpolation_coeffs(self): @property def _weights(self): ddim, cdim = self.interpolation_coeffs.dimensions[1:] - return Mul(*[self.interpolation_coeffs.subs({ddim: ri, - cdim: rd-rd.parent.symbolic_min}) - for (ri, rd) in enumerate(self._rdim)]) + mappers = [{ddim: ri, cdim: rd-rd.parent.symbolic_min} + for (ri, rd) in enumerate(self._rdim)] + return Mul(*[self.interpolation_coeffs.subs(mapper) + for mapper in mappers]) diff --git a/devito/operator/operator.py b/devito/operator/operator.py index b91d51727f..609c69295f 100644 --- a/devito/operator/operator.py +++ b/devito/operator/operator.py @@ -566,6 +566,7 @@ def _prepare_arguments(self, autotune=None, **kwargs): "`%s=%s`, while `%s=%s` is expected. Perhaps you " "forgot to override `%s`?" 
% (p, k, v, k, args[k], p)) + args = kwargs['args'] = args.reduce_all() # DiscreteFunctions may be created from CartesianDiscretizations, which in @@ -573,6 +574,10 @@ def _prepare_arguments(self, autotune=None, **kwargs): discretizations = {getattr(kwargs[p.name], 'grid', None) for p in overrides} discretizations.update({getattr(p, 'grid', None) for p in defaults}) discretizations.discard(None) + # Remove subgrids if multiple grids + if len(discretizations) > 1: + discretizations = {g for g in discretizations + if not any(d.is_Derived for d in g.dimensions)} for i in discretizations: args.update(i._arg_values(**kwargs)) @@ -585,6 +590,9 @@ def _prepare_arguments(self, autotune=None, **kwargs): if configuration['mpi']: raise ValueError("Multiple Grids found") try: + # Take biggest grid, i.e discard grids with subdimensions + grids = {g for g in grids if not any(d.is_Sub for d in g.dimensions)} + # First grid as there is no heuristic on how to choose from the leftovers grid = grids.pop() except KeyError: grid = None diff --git a/devito/passes/iet/langbase.py b/devito/passes/iet/langbase.py index 36ce348ac4..4a4f6ac465 100644 --- a/devito/passes/iet/langbase.py +++ b/devito/passes/iet/langbase.py @@ -214,6 +214,9 @@ def DeviceIteration(self): def Prodder(self): return self.lang.Prodder + def _is_offloadable(self, *args, **kwargs): + return False + class DeviceAwareMixin(object): diff --git a/devito/passes/iet/parpragma.py b/devito/passes/iet/parpragma.py index 44ee6afd6c..4af585f86a 100644 --- a/devito/passes/iet/parpragma.py +++ b/devito/passes/iet/parpragma.py @@ -304,6 +304,7 @@ def _select_candidates(self, candidates): # Iterations and their position (i.e. 
outermost to innermost) in the nest score = ( int(root.is_ParallelNoAtomic), + -int(self._is_offloadable(root))*(n0 + 1), # Outermost offloadable int(len([i for i in collapsable if i.is_ParallelNoAtomic]) >= 1), int(len([i for i in collapsable if i.is_ParallelRelaxed]) >= 1), -(n0 + 1) # The outermost, the better diff --git a/devito/tools/data_structures.py b/devito/tools/data_structures.py index 3afe7197eb..01a9a3f4bd 100644 --- a/devito/tools/data_structures.py +++ b/devito/tools/data_structures.py @@ -137,6 +137,9 @@ def compare_to_first(v): return candidates[0] elif all(map(compare_to_first, candidates)): # return first non-range + for c in candidates: + if not isinstance(c, range): + return c return candidates[0] else: raise ValueError("Unable to find unique value for key %s, candidates: %s" diff --git a/devito/types/dimension.py b/devito/types/dimension.py index 6044f01469..2d11ccb220 100644 --- a/devito/types/dimension.py +++ b/devito/types/dimension.py @@ -298,14 +298,14 @@ def _arg_values(self, interval, grid=None, args=None, **kwargs): # may represent sets of legal values. If that's the case, here we just # pick one. 
Note that we sort for determinism try: - loc_minv = loc_minv.start + loc_minv = loc_minv.stop except AttributeError: try: loc_minv = sorted(loc_minv).pop(0) except TypeError: pass try: - loc_maxv = loc_maxv.start + loc_maxv = loc_maxv.stop except AttributeError: try: loc_maxv = sorted(loc_maxv).pop(0) @@ -859,7 +859,8 @@ def _arg_defaults(self, _min=None, size=None, alias=None): factor = defaults[dim._factor.name] = dim._factor.data except AttributeError: factor = dim._factor - defaults[dim.parent.max_name] = range(1, factor*(size)) + + defaults[dim.parent.max_name] = range(1, factor*size - 1) return defaults @@ -983,8 +984,7 @@ def bound_symbols(self): return set(self.parent.bound_symbols) def _arg_defaults(self, alias=None, **kwargs): - dim = alias or self - return {dim.parent.size_name: range(self.symbolic_size, np.iinfo(np.int64).max)} + return {} def _arg_values(self, *args, **kwargs): return {} @@ -1466,10 +1466,7 @@ def _arg_defaults(self, _min=None, size=None, **kwargs): A SteppingDimension does not know its max point and therefore does not have a size argument. 
""" - args = {self.parent.min_name: _min} - if size: - args[self.parent.size_name] = range(size-1, np.iinfo(np.int32).max) - return args + return {self.parent.min_name: _min} def _arg_values(self, *args, **kwargs): """ diff --git a/tests/test_buffering.py b/tests/test_buffering.py index 16f98b4f94..ba200d220c 100644 --- a/tests/test_buffering.py +++ b/tests/test_buffering.py @@ -272,7 +272,7 @@ def test_over_injection(): # Check generated code assert len(retrieve_iteration_tree(op1)) == \ - 7 + int(configuration['language'] != 'C') + 8 + int(configuration['language'] != 'C') buffers = [i for i in FindSymbols().visit(op1) if i.is_Array] assert len(buffers) == 1 diff --git a/tests/test_dimension.py b/tests/test_dimension.py index 32da3b22e3..9d41dddf48 100644 --- a/tests/test_dimension.py +++ b/tests/test_dimension.py @@ -1515,7 +1515,7 @@ def test_issue_1927(self, factor): op = Operator(Eq(f, 1)) - assert op.arguments()['time_M'] == 4*(save-1) # == min legal endpoint + assert op.arguments()['time_M'] == 4*save-1 # == min legal endpoint # Also no issues when supplying an override assert op.arguments(time_M=10)['time_M'] == 10 @@ -1530,7 +1530,6 @@ def test_issue_1927_v2(self): i = Dimension(name='i') ci = ConditionalDimension(name='ci', parent=i, factor=factor) - g = Function(name='g', shape=(size,), dimensions=(i,)) f = Function(name='f', shape=(int(size/factor),), dimensions=(ci,)) diff --git a/tests/test_dle.py b/tests/test_dle.py index 86a288ac00..3b9883e665 100644 --- a/tests/test_dle.py +++ b/tests/test_dle.py @@ -187,9 +187,14 @@ def test_cache_blocking_structure_optrelax(): op = Operator(eqns, opt=('advanced', {'blockrelax': True})) - bns, _ = assert_blocking(op, {'x0_blk0', 'p_src0_blk0'}) + bns, _ = assert_blocking(op, {'x0_blk0', 'p_src0_blk0', 'p_src1_blk0'}) iters = FindNodes(Iteration).visit(bns['p_src0_blk0']) + assert len(iters) == 2 + assert iters[0].dim.is_Block + assert iters[1].dim.is_Block + + iters = 
FindNodes(Iteration).visit(bns['p_src1_blk0']) assert len(iters) == 5 assert iters[0].dim.is_Block assert iters[1].dim.is_Block @@ -286,7 +291,7 @@ def test_cache_blocking_structure_optrelax_prec_inject(): 'openmp': True, 'par-collapse-ncores': 1})) - assert_structure(op, ['t', 't,p_s0_blk0,p_s,rsx,rsy'], + assert_structure(op, ['t', 't,p_s0_blk0,p_s', 't,p_s0_blk0,p_s,rsx,rsy'], 't,p_s0_blk0,p_s,rsx,rsy') @@ -958,7 +963,7 @@ def test_parallel_prec_inject(self): iterations = FindNodes(Iteration).visit(op0) assert not iterations[0].pragmas - assert 'omp for collapse(2)' in iterations[1].pragmas[0].value + assert 'omp for' in iterations[1].pragmas[0].value class TestNestedParallelism(object): diff --git a/tests/test_dse.py b/tests/test_dse.py index 2aefe69ed4..1e18157c77 100644 --- a/tests/test_dse.py +++ b/tests/test_dse.py @@ -48,9 +48,9 @@ def test_scheduling_after_rewrite(): trees = retrieve_iteration_tree(op) # Check loop nest structure - assert all(i.dim is j for i, j in zip(trees[0], grid.dimensions)) # time invariant - assert trees[1].root.dim is grid.time_dim - assert all(trees[1].root.dim is tree.root.dim for tree in trees[1:]) + assert all(i.dim is j for i, j in zip(trees[1], grid.dimensions)) # time invariant + assert trees[2].root.dim is grid.time_dim + assert all(trees[2].root.dim is tree.root.dim for tree in trees[2:]) @pytest.mark.parametrize('exprs,expected,min_cost', [ @@ -1687,7 +1687,7 @@ def test_drop_redundants_after_fusion(self, rotate): op = Operator(eqns, opt=('advanced', {'cire-rotate': rotate})) arrays = [i for i in FindSymbols().visit(op) if i.is_Array] - assert len(arrays) == 2 + assert len(arrays) == 4 assert all(i._mem_heap and not i._mem_external for i in arrays) def test_full_shape_big_temporaries(self): @@ -2711,9 +2711,11 @@ def test_fullopt(self): assert np.isclose(summary0[('section0', None)].oi, 2.851, atol=0.001) assert summary1[('section0', None)].ops == 9 - assert summary1[('section1', None)].ops == 31 - assert 
summary1[('section2', None)].ops == 88 - assert np.isclose(summary1[('section1', None)].oi, 1.767, atol=0.001) + assert summary1[('section1', None)].ops == 9 + assert summary1[('section2', None)].ops == 31 + assert summary1[('section3', None)].ops == 26 + assert summary1[('section4', None)].ops == 22 + assert np.isclose(summary1[('section2', None)].oi, 1.767, atol=0.001) assert np.allclose(u0.data, u1.data, atol=10e-5) assert np.allclose(rec0.data, rec1.data, atol=10e-5) @@ -2773,8 +2775,8 @@ def test_fullopt(self): assert np.allclose(self.tti_noopt[1].data, rec.data, atol=10e-1) # Check expected opcount/oi - assert summary[('section2', None)].ops == 92 - assert np.isclose(summary[('section2', None)].oi, 2.074, atol=0.001) + assert summary[('section3', None)].ops == 92 + assert np.isclose(summary[('section3', None)].oi, 2.074, atol=0.001) # With optimizations enabled, there should be exactly four BlockDimensions op = wavesolver.op_fwd() @@ -2792,7 +2794,7 @@ def test_fullopt(self): # 3 Arrays are defined globally for the sparse positions temporaries # and two additional bock-sized Arrays are defined locally arrays = [i for i in FindSymbols().visit(op) if i.is_Array] - extra_arrays = 2+3 + extra_arrays = 2+3+3 assert len(arrays) == 4 + extra_arrays assert all(i._mem_heap and not i._mem_external for i in arrays) bns, pbs = assert_blocking(op, {'x0_blk0'}) @@ -2828,7 +2830,7 @@ def test_fullopt_w_mpi(self): def test_opcounts(self, space_order, expected): op = self.tti_operator(opt='advanced', space_order=space_order) sections = list(op.op_fwd()._profiler._sections.values()) - assert sections[2].sops == expected + assert sections[3].sops == expected @switchconfig(profiling='advanced') @pytest.mark.parametrize('space_order,expected', [ @@ -2838,8 +2840,8 @@ def test_opcounts_adjoint(self, space_order, expected): wavesolver = self.tti_operator(opt=('advanced', {'openmp': False})) op = wavesolver.op_adj() - assert op._profiler._sections['section2'].sops == expected - 
assert len([i for i in FindSymbols().visit(op) if i.is_Array]) == 7+3 + assert op._profiler._sections['section3'].sops == expected + assert len([i for i in FindSymbols().visit(op) if i.is_Array]) == 7+3+3 class TestTTIv2(object): diff --git a/tests/test_interpolation.py b/tests/test_interpolation.py index 3a22ca1db7..c7a15665a4 100644 --- a/tests/test_interpolation.py +++ b/tests/test_interpolation.py @@ -734,3 +734,29 @@ class SparseFirst(SparseFunction): op(time_M=10) expected = 10*11/2 # n (n+1)/2 assert np.allclose(s.data, expected) + + +def test_inject_function(): + nt = 11 + + grid = Grid(shape=(5, 5)) + u = TimeFunction(name="u", grid=grid, time_order=2) + src = SparseTimeFunction(name="src", grid=grid, nt=nt, npoint=1, + coordinates=[[0.5, 0.5]]) + + nfreq = 5 + freq_dim = DefaultDimension(name="freq", default_value=nfreq) + omega = Function(name="omega", dimensions=(freq_dim,), shape=(nfreq,), grid=grid) + omega.data.fill(1.) + + inj = src.inject(field=u.forward, expr=omega) + + op = Operator([inj]) + + op(time_M=0) + assert u.data[1, 2, 2] == nfreq + assert np.all(u.data[0] == 0) + assert np.all(u.data[2] == 0) + for i in [0, 1, 3, 4]: + for j in [0, 1, 3, 4]: + assert u.data[1, i, j] == 0 diff --git a/tests/test_mpi.py b/tests/test_mpi.py index 14ddbec249..2860fc726e 100644 --- a/tests/test_mpi.py +++ b/tests/test_mpi.py @@ -2499,8 +2499,8 @@ def test_adjoint_codegen(self, shape, kernel, space_order, save): op_adj = solver.op_adj() adj_calls = FindNodes(Call).visit(op_adj) - # one halo, ndim memalign and free (pos temp rec) - sf_calls = 2 * len(shape) + # one halo, ndim memalign and free (pos temp rec/src) + sf_calls = 2 * len(shape) * 2 assert len(fwd_calls) == 1 + sf_calls assert len(adj_calls) == 1 + sf_calls From 87d8d0e1f25c7dc4f6d75d0af2d92c5163f6a05c Mon Sep 17 00:00:00 2001 From: Mathias Louboutin Date: Mon, 18 Sep 2023 08:25:17 -0400 Subject: [PATCH 3/7] compiler: remove atomic collapse hack --- devito/passes/iet/parpragma.py | 8 -------- 1 file 
changed, 8 deletions(-) diff --git a/devito/passes/iet/parpragma.py b/devito/passes/iet/parpragma.py index 4af585f86a..9d69e12df7 100644 --- a/devito/passes/iet/parpragma.py +++ b/devito/passes/iet/parpragma.py @@ -295,9 +295,6 @@ def _select_candidates(self, candidates): except TypeError: pass - # At least one inner loop (nested) or - # we do not collapse most inner loop if it is an atomic reduction - if not i.is_ParallelAtomic or nested: collapsable.append(i) # Give a score to this candidate, based on the number of fully-parallel @@ -429,11 +426,6 @@ def _make_nested_partree(self, partree): if self.nhyperthreads <= self.nested: return partree - # Loop nest with atomic reductions are more likely to have less latency - # keep outer loop parallel - if partree.root.is_ParallelAtomic: - return partree - # Note: there might be multiple sub-trees amenable to nested parallelism, # hence we loop over all of them # From f7ab007e0a735952ca9d40509fe7ac9ec222f55c Mon Sep 17 00:00:00 2001 From: Mathias Louboutin Date: Mon, 18 Sep 2023 11:56:45 -0400 Subject: [PATCH 4/7] compiler: prevent halo to be moved outside their iteration space --- devito/ir/stree/algorithms.py | 6 ++++++ devito/mpi/halo_scheme.py | 4 ++++ devito/passes/iet/langbase.py | 13 ++++++++++--- devito/passes/iet/parpragma.py | 10 ++++++++-- tests/test_dle.py | 10 ++++++---- tests/test_gpu_openacc.py | 16 ++++++++-------- tests/test_gpu_openmp.py | 2 +- tests/test_mpi.py | 3 ++- 8 files changed, 45 insertions(+), 19 deletions(-) diff --git a/devito/ir/stree/algorithms.py b/devito/ir/stree/algorithms.py index 58e8e844e6..d8bbb4958a 100644 --- a/devito/ir/stree/algorithms.py +++ b/devito/ir/stree/algorithms.py @@ -147,6 +147,12 @@ def preprocess(clusters, options=None, **kwargs): found = [] for c1 in list(queue): distributed_aindices = c1.halo_scheme.distributed_aindices + h_indices = set().union(*[(d, d.root) + for d in c1.halo_scheme.loc_indices]) + + # Skip if the halo exchange would end up outside its needed 
iteration space + if h_indices and not h_indices & dims: + continue diff --git a/devito/mpi/halo_scheme.py b/devito/mpi/halo_scheme.py index 0204c171e6..970e84633d 100644 --- a/devito/mpi/halo_scheme.py +++ b/devito/mpi/halo_scheme.py @@ -361,6 +361,10 @@ def distributed(self): def distributed_aindices(self): return set().union(*[i.dims for i in self.fmapper.values()]) + @cached_property + def loc_indices(self): + return set().union(*[i.loc_indices.keys() for i in self.fmapper.values()]) + @cached_property def arguments(self): return self.dimensions | set(flatten(self.honored.values())) diff --git a/devito/passes/iet/langbase.py b/devito/passes/iet/langbase.py index 4a4f6ac465..457d8476c3 100644 --- a/devito/passes/iet/langbase.py +++ b/devito/passes/iet/langbase.py @@ -214,8 +214,8 @@ def DeviceIteration(self): def Prodder(self): return self.lang.Prodder - def _is_offloadable(self, *args, **kwargs): - return False + def _n_device_pointers(self, *args, **kwargs): + return 0 class DeviceAwareMixin(object): @@ -328,6 +328,12 @@ def _(iet): return _initialize(iet) + def _n_device_pointers(self, iet): + functions = FindSymbols().visit(iet) + devfuncs = [f for f in functions if f.is_Array and f._mem_local] + + return len(devfuncs) + def _is_offloadable(self, iet): """ True if the IET computation is offloadable to device, False otherwise. 
@@ -339,7 +345,8 @@ def _is_offloadable(self, iet): functions = FindSymbols().visit(iet) buffers = [f for f in functions if f.is_Array and f._mem_mapped] hostfuncs = [f for f in functions if not is_on_device(f, self.gpu_fit)] - return not (buffers and hostfuncs) + + return not (hostfuncs and buffers) class Sections(tuple): diff --git a/devito/passes/iet/parpragma.py b/devito/passes/iet/parpragma.py index 9d69e12df7..34ca370a60 100644 --- a/devito/passes/iet/parpragma.py +++ b/devito/passes/iet/parpragma.py @@ -295,13 +295,13 @@ def _select_candidates(self, candidates): except TypeError: pass - collapsable.append(i) + collapsable.append(i) # Give a score to this candidate, based on the number of fully-parallel # Iterations and their position (i.e. outermost to innermost) in the nest score = ( int(root.is_ParallelNoAtomic), - -int(self._is_offloadable(root))*(n0 + 1), # Outermost offloadable + self._n_device_pointers(root), # Outermost offloadable int(len([i for i in collapsable if i.is_ParallelNoAtomic]) >= 1), int(len([i for i in collapsable if i.is_ParallelRelaxed]) >= 1), -(n0 + 1) # The outermost, the better @@ -375,6 +375,12 @@ def _make_partree(self, candidates, nthreads=None): ncollapsed=ncollapsed, nthreads=nthreads, **root.args) prefix = [] + elif all(i.is_ParallelRelaxed for i in candidates) and nthreads is not None: + body = self.HostIteration(schedule='static', + parallel=nthreads is not self.nthreads_nested, + ncollapsed=ncollapsed, nthreads=nthreads, + **root.args) + prefix = [] else: # pragma ... for ... 
schedule(..., expr) assert nthreads is None diff --git a/tests/test_dle.py b/tests/test_dle.py index 3b9883e665..df3c4adfa5 100644 --- a/tests/test_dle.py +++ b/tests/test_dle.py @@ -291,7 +291,7 @@ def test_cache_blocking_structure_optrelax_prec_inject(): 'openmp': True, 'par-collapse-ncores': 1})) - assert_structure(op, ['t', 't,p_s0_blk0,p_s', 't,p_s0_blk0,p_s,rsx,rsy'], + assert_structure(op, ['t,p_s0_blk0,p_s', 't,p_s0_blk0,p_s,rsx,rsy'], 't,p_s0_blk0,p_s,rsx,rsy') @@ -821,12 +821,13 @@ def test_incs_no_atomic(self): 'par-collapse-ncores': 1, 'par-collapse-work': 0})) - assert 'collapse(2)' in str(op0) + assert 'collapse(3)' in str(op0) assert 'atomic' in str(op0) # Now only `x` is parallelized op1 = Operator([Eq(v[t, x, 0, 0], v[t, x, 0, 0] + 1), Inc(uf, 1)], opt=('advanced', {'openmp': True, 'par-collapse-ncores': 1})) + assert 'omp for' in str(op1) assert 'collapse' not in str(op1) assert 'atomic' not in str(op1) @@ -951,11 +952,12 @@ def test_parallel_prec_inject(self): eqns = sf.inject(field=u.forward, expr=sf * dt**2) op0 = Operator(eqns, opt=('advanced', {'openmp': True, - 'par-collapse-ncores': 1})) + 'par-collapse-ncores': 20})) iterations = FindNodes(Iteration).visit(op0) assert not iterations[0].pragmas assert 'omp for' in iterations[1].pragmas[0].value + assert 'collapse' not in iterations[1].pragmas[0].value op0 = Operator(eqns, opt=('advanced', {'openmp': True, 'par-collapse-ncores': 1, @@ -963,7 +965,7 @@ def test_parallel_prec_inject(self): iterations = FindNodes(Iteration).visit(op0) assert not iterations[0].pragmas - assert 'omp for' in iterations[1].pragmas[0].value + assert 'omp for collapse' in iterations[2].pragmas[0].value class TestNestedParallelism(object): diff --git a/tests/test_gpu_openacc.py b/tests/test_gpu_openacc.py index 823d11854d..db92db3c83 100644 --- a/tests/test_gpu_openacc.py +++ b/tests/test_gpu_openacc.py @@ -102,15 +102,15 @@ def test_tile_insteadof_collapse(self, par_tile): opt=('advanced', {'par-tile': par_tile})) 
trees = retrieve_iteration_tree(op) - assert len(trees) == 4 + assert len(trees) == 6 - assert trees[0][1].pragmas[0].value ==\ - 'acc parallel loop tile(32,4,4) present(u)' assert trees[1][1].pragmas[0].value ==\ + 'acc parallel loop tile(32,4,4) present(u)' + assert trees[2][1].pragmas[0].value ==\ 'acc parallel loop tile(32,4) present(u)' # Only the AFFINE Iterations are tiled - assert trees[3][1].pragmas[0].value ==\ - 'acc parallel loop collapse(3) present(src,src_coords,u)' + assert trees[4][1].pragmas[0].value ==\ + 'acc parallel loop present(src,src_coords,u) deviceptr(r1,r2,r3)' @pytest.mark.parametrize('par_tile', [((32, 4, 4), (8, 8)), ((32, 4), (8, 8)), ((32, 4, 4), (8, 8, 8))]) @@ -130,11 +130,11 @@ def test_multiple_tile_sizes(self, par_tile): opt=('advanced', {'par-tile': par_tile})) trees = retrieve_iteration_tree(op) - assert len(trees) == 4 + assert len(trees) == 6 - assert trees[0][1].pragmas[0].value ==\ - 'acc parallel loop tile(32,4,4) present(u)' assert trees[1][1].pragmas[0].value ==\ + 'acc parallel loop tile(32,4,4) present(u)' + assert trees[2][1].pragmas[0].value ==\ 'acc parallel loop tile(8,8) present(u)' def test_multi_tile_blocking_structure(self): diff --git a/tests/test_gpu_openmp.py b/tests/test_gpu_openmp.py index bc2de71708..29866508d8 100644 --- a/tests/test_gpu_openmp.py +++ b/tests/test_gpu_openmp.py @@ -265,7 +265,7 @@ def test_timeparallel_reduction(self): assert not tree.root.pragmas assert len(tree[1].pragmas) == 1 assert tree[1].pragmas[0].value ==\ - ('omp target teams distribute parallel for collapse(2)' + ('omp target teams distribute parallel for collapse(3)' ' reduction(+:f[0])') diff --git a/tests/test_mpi.py b/tests/test_mpi.py index 2860fc726e..51facd7a7c 100644 --- a/tests/test_mpi.py +++ b/tests/test_mpi.py @@ -2558,7 +2558,8 @@ def test_adjoint_F_no_omp(self): # TestDecomposition().test_reshape_left_right() # TestOperatorSimple().test_trivial_eq_2d() # TestFunction().test_halo_exchange_bilateral() - 
TestSparseFunction().test_sparse_coords() + # TestSparseFunction().test_sparse_coords() # TestSparseFunction().test_precomputed_sparse(2) # TestOperatorAdvanced().test_fission_due_to_antidep() + TestOperatorAdvanced().test_injection_wodup_wtime() # TestIsotropicAcoustic().test_adjoint_F(1) From 441de0f148a57619d61ec80468965e289f2cc0cd Mon Sep 17 00:00:00 2001 From: mloubout Date: Tue, 19 Sep 2023 09:51:08 -0400 Subject: [PATCH 5/7] compiler: improve interpolation parallelism --- devito/operations/interpolators.py | 5 +++++ tests/test_buffering.py | 2 +- tests/test_dle.py | 11 ++--------- tests/test_dse.py | 29 ++++++++++++++--------------- tests/test_gpu_openacc.py | 18 ++++++++++-------- tests/test_interpolation.py | 3 ++- tests/test_mpi.py | 2 +- tests/test_operator.py | 4 ++-- 8 files changed, 37 insertions(+), 37 deletions(-) diff --git a/devito/operations/interpolators.py b/devito/operations/interpolators.py index dae96d8dfe..3d2dcb7466 100644 --- a/devito/operations/interpolators.py +++ b/devito/operations/interpolators.py @@ -305,6 +305,7 @@ def _inject(self, field, expr, implicit_dims=None): # Make iterable to support inject((u, v), expr=expr) # or inject((u, v), expr=(expr1, expr2)) fields, exprs = as_tuple(field), as_tuple(expr) + # Provide either one expr per field or on expr for all fields if len(fields) > 1: if len(exprs) == 1: @@ -323,6 +324,10 @@ def _inject(self, field, expr, implicit_dims=None): # Implicit dimensions implicit_dims = self._augment_implicit_dims(implicit_dims, variables) + # Move all temporaries inside inner loop to improve parallelism + # Can only be done for inject as interpolation need a temporary + # summing temp that wouldn't allow collapsing + implicit_dims = implicit_dims + tuple(r.parent for r in self._rdim) variables = variables + list(fields) diff --git a/tests/test_buffering.py b/tests/test_buffering.py index ba200d220c..16f98b4f94 100644 --- a/tests/test_buffering.py +++ b/tests/test_buffering.py @@ -272,7 +272,7 @@ def 
test_over_injection(): # Check generated code assert len(retrieve_iteration_tree(op1)) == \ - 8 + int(configuration['language'] != 'C') + 7 + int(configuration['language'] != 'C') buffers = [i for i in FindSymbols().visit(op1) if i.is_Array] assert len(buffers) == 1 diff --git a/tests/test_dle.py b/tests/test_dle.py index df3c4adfa5..8d58827a61 100644 --- a/tests/test_dle.py +++ b/tests/test_dle.py @@ -187,19 +187,12 @@ def test_cache_blocking_structure_optrelax(): op = Operator(eqns, opt=('advanced', {'blockrelax': True})) - bns, _ = assert_blocking(op, {'x0_blk0', 'p_src0_blk0', 'p_src1_blk0'}) + bns, _ = assert_blocking(op, {'x0_blk0', 'p_src0_blk0'}) iters = FindNodes(Iteration).visit(bns['p_src0_blk0']) - assert len(iters) == 2 - assert iters[0].dim.is_Block - assert iters[1].dim.is_Block - - iters = FindNodes(Iteration).visit(bns['p_src1_blk0']) assert len(iters) == 5 assert iters[0].dim.is_Block assert iters[1].dim.is_Block - for i in range(2, 5): - assert not iters[i].dim.is_Block def test_cache_blocking_structure_optrelax_customdim(): @@ -965,7 +958,7 @@ def test_parallel_prec_inject(self): iterations = FindNodes(Iteration).visit(op0) assert not iterations[0].pragmas - assert 'omp for collapse' in iterations[2].pragmas[0].value + assert 'omp for collapse' in iterations[1].pragmas[0].value class TestNestedParallelism(object): diff --git a/tests/test_dse.py b/tests/test_dse.py index 1e18157c77..728f8f9357 100644 --- a/tests/test_dse.py +++ b/tests/test_dse.py @@ -48,9 +48,9 @@ def test_scheduling_after_rewrite(): trees = retrieve_iteration_tree(op) # Check loop nest structure - assert all(i.dim is j for i, j in zip(trees[1], grid.dimensions)) # time invariant - assert trees[2].root.dim is grid.time_dim - assert all(trees[2].root.dim is tree.root.dim for tree in trees[2:]) + assert all(i.dim is j for i, j in zip(trees[0], grid.dimensions)) # time invariant + assert trees[1].root.dim is grid.time_dim + assert all(trees[1].root.dim is tree.root.dim for tree in 
trees[1:]) @pytest.mark.parametrize('exprs,expected,min_cost', [ @@ -1687,7 +1687,7 @@ def test_drop_redundants_after_fusion(self, rotate): op = Operator(eqns, opt=('advanced', {'cire-rotate': rotate})) arrays = [i for i in FindSymbols().visit(op) if i.is_Array] - assert len(arrays) == 4 + assert len(arrays) == 2 assert all(i._mem_heap and not i._mem_external for i in arrays) def test_full_shape_big_temporaries(self): @@ -2711,11 +2711,10 @@ def test_fullopt(self): assert np.isclose(summary0[('section0', None)].oi, 2.851, atol=0.001) assert summary1[('section0', None)].ops == 9 - assert summary1[('section1', None)].ops == 9 - assert summary1[('section2', None)].ops == 31 - assert summary1[('section3', None)].ops == 26 - assert summary1[('section4', None)].ops == 22 - assert np.isclose(summary1[('section2', None)].oi, 1.767, atol=0.001) + assert summary1[('section1', None)].ops == 31 + assert summary1[('section2', None)].ops == 88 + assert summary1[('section3', None)].ops == 22 + assert np.isclose(summary1[('section1', None)].oi, 1.767, atol=0.001) assert np.allclose(u0.data, u1.data, atol=10e-5) assert np.allclose(rec0.data, rec1.data, atol=10e-5) @@ -2775,8 +2774,8 @@ def test_fullopt(self): assert np.allclose(self.tti_noopt[1].data, rec.data, atol=10e-1) # Check expected opcount/oi - assert summary[('section3', None)].ops == 92 - assert np.isclose(summary[('section3', None)].oi, 2.074, atol=0.001) + assert summary[('section2', None)].ops == 92 + assert np.isclose(summary[('section2', None)].oi, 2.074, atol=0.001) # With optimizations enabled, there should be exactly four BlockDimensions op = wavesolver.op_fwd() @@ -2794,7 +2793,7 @@ def test_fullopt(self): # 3 Arrays are defined globally for the sparse positions temporaries # and two additional bock-sized Arrays are defined locally arrays = [i for i in FindSymbols().visit(op) if i.is_Array] - extra_arrays = 2+3+3 + extra_arrays = 2+3 assert len(arrays) == 4 + extra_arrays assert all(i._mem_heap and not 
i._mem_external for i in arrays) bns, pbs = assert_blocking(op, {'x0_blk0'}) @@ -2830,7 +2829,7 @@ def test_fullopt_w_mpi(self): def test_opcounts(self, space_order, expected): op = self.tti_operator(opt='advanced', space_order=space_order) sections = list(op.op_fwd()._profiler._sections.values()) - assert sections[3].sops == expected + assert sections[2].sops == expected @switchconfig(profiling='advanced') @pytest.mark.parametrize('space_order,expected', [ @@ -2840,8 +2839,8 @@ def test_opcounts_adjoint(self, space_order, expected): wavesolver = self.tti_operator(opt=('advanced', {'openmp': False})) op = wavesolver.op_adj() - assert op._profiler._sections['section3'].sops == expected - assert len([i for i in FindSymbols().visit(op) if i.is_Array]) == 7+3+3 + assert op._profiler._sections['section2'].sops == expected + assert len([i for i in FindSymbols().visit(op) if i.is_Array]) == 7+3 class TestTTIv2(object): diff --git a/tests/test_gpu_openacc.py b/tests/test_gpu_openacc.py index db92db3c83..3085ad85c9 100644 --- a/tests/test_gpu_openacc.py +++ b/tests/test_gpu_openacc.py @@ -102,15 +102,15 @@ def test_tile_insteadof_collapse(self, par_tile): opt=('advanced', {'par-tile': par_tile})) trees = retrieve_iteration_tree(op) - assert len(trees) == 6 + assert len(trees) == 4 - assert trees[1][1].pragmas[0].value ==\ + assert trees[0][1].pragmas[0].value ==\ 'acc parallel loop tile(32,4,4) present(u)' - assert trees[2][1].pragmas[0].value ==\ + assert trees[1][1].pragmas[0].value ==\ 'acc parallel loop tile(32,4) present(u)' # Only the AFFINE Iterations are tiled - assert trees[4][1].pragmas[0].value ==\ - 'acc parallel loop present(src,src_coords,u) deviceptr(r1,r2,r3)' + assert trees[3][1].pragmas[0].value ==\ + 'acc parallel loop collapse(4) present(src,src_coords,u)' @pytest.mark.parametrize('par_tile', [((32, 4, 4), (8, 8)), ((32, 4), (8, 8)), ((32, 4, 4), (8, 8, 8))]) @@ -130,12 +130,14 @@ def test_multiple_tile_sizes(self, par_tile): opt=('advanced', 
{'par-tile': par_tile})) trees = retrieve_iteration_tree(op) - assert len(trees) == 6 + assert len(trees) == 4 - assert trees[1][1].pragmas[0].value ==\ + assert trees[0][1].pragmas[0].value ==\ 'acc parallel loop tile(32,4,4) present(u)' - assert trees[2][1].pragmas[0].value ==\ + assert trees[1][1].pragmas[0].value ==\ 'acc parallel loop tile(8,8) present(u)' + assert trees[3][1].pragmas[0].value ==\ + 'acc parallel loop collapse(4) present(src,src_coords,u)' def test_multi_tile_blocking_structure(self): grid = Grid(shape=(8, 8, 8)) diff --git a/tests/test_interpolation.py b/tests/test_interpolation.py index c7a15665a4..97d86c1759 100644 --- a/tests/test_interpolation.py +++ b/tests/test_interpolation.py @@ -5,7 +5,7 @@ from sympy import Float from devito import (Grid, Operator, Dimension, SparseFunction, SparseTimeFunction, - Function, TimeFunction, DefaultDimension, Eq, + Function, TimeFunction, DefaultDimension, Eq, switchconfig, PrecomputedSparseFunction, PrecomputedSparseTimeFunction, MatrixSparseTimeFunction) from examples.seismic import (demo_model, TimeAxis, RickerSource, Receiver, @@ -736,6 +736,7 @@ class SparseFirst(SparseFunction): assert np.allclose(s.data, expected) +@switchconfig(safe_math=True) def test_inject_function(): nt = 11 diff --git a/tests/test_mpi.py b/tests/test_mpi.py index 51facd7a7c..ab7092ba1a 100644 --- a/tests/test_mpi.py +++ b/tests/test_mpi.py @@ -2500,7 +2500,7 @@ def test_adjoint_codegen(self, shape, kernel, space_order, save): adj_calls = FindNodes(Call).visit(op_adj) # one halo, ndim memalign and free (pos temp rec/src) - sf_calls = 2 * len(shape) * 2 + sf_calls = 2 * len(shape) assert len(fwd_calls) == 1 + sf_calls assert len(adj_calls) == 1 + sf_calls diff --git a/tests/test_operator.py b/tests/test_operator.py index 4f8228bc24..3064565e3c 100644 --- a/tests/test_operator.py +++ b/tests/test_operator.py @@ -1805,7 +1805,7 @@ def test_scheduling_sparse_functions(self): # `trees` than 6 op = Operator([eqn1] + eqn2 + [eqn3] + 
eqn4, opt=('noop', {'openmp': False})) trees = retrieve_iteration_tree(op) - assert len(trees) == 6 + assert len(trees) == 5 # Time loop not shared due to the WAR assert trees[0][0].dim is time and trees[0][0] is trees[1][0] # this IS shared assert trees[1][0] is not trees[3][0] @@ -1815,7 +1815,7 @@ def test_scheduling_sparse_functions(self): eqn2 = sf1.inject(u1.forward, expr=sf1) op = Operator([eqn1] + eqn2 + [eqn3] + eqn4, opt=('noop', {'openmp': False})) trees = retrieve_iteration_tree(op) - assert len(trees) == 6 + assert len(trees) == 5 assert all(trees[0][0] is i[0] for i in trees) def test_scheduling_with_free_dims(self): From ab160ddd1b3ec6810d77f8296704744967d6854f Mon Sep 17 00:00:00 2001 From: mloubout Date: Wed, 20 Sep 2023 08:08:00 -0400 Subject: [PATCH 6/7] compiler: remove redundant subgrid check and cleanup --- devito/ir/stree/algorithms.py | 6 +++--- devito/operator/operator.py | 4 +--- devito/passes/iet/langbase.py | 12 +++++------- devito/passes/iet/parpragma.py | 5 ++--- devito/types/dimension.py | 4 ++-- 5 files changed, 13 insertions(+), 18 deletions(-) diff --git a/devito/ir/stree/algorithms.py b/devito/ir/stree/algorithms.py index d8bbb4958a..0f7e46bcfd 100644 --- a/devito/ir/stree/algorithms.py +++ b/devito/ir/stree/algorithms.py @@ -147,10 +147,10 @@ def preprocess(clusters, options=None, **kwargs): found = [] for c1 in list(queue): distributed_aindices = c1.halo_scheme.distributed_aindices - h_indices = set().union(*[(d, d.root) - for d in c1.halo_scheme.loc_indices]) + h_indices = set().union(*[d._defines for d in c1.halo_scheme.loc_indices]) - # Skip if the Halo echange would end up outside its need iteration space + # Skip if the halo exchange would end up outside + # its iteration space if h_indices and not h_indices & dims: continue diff --git a/devito/operator/operator.py b/devito/operator/operator.py index 609c69295f..eb8b793f22 100644 --- a/devito/operator/operator.py +++ b/devito/operator/operator.py @@ -578,6 +578,7 @@ def 
_prepare_arguments(self, autotune=None, **kwargs): if len(discretizations) > 1: discretizations = {g for g in discretizations if not any(d.is_Derived for d in g.dimensions)} + for i in discretizations: args.update(i._arg_values(**kwargs)) @@ -590,9 +591,6 @@ def _prepare_arguments(self, autotune=None, **kwargs): if configuration['mpi']: raise ValueError("Multiple Grids found") try: - # Take biggest grid, i.e discard grids with subdimensions - grids = {g for g in grids if not any(d.is_Sub for d in g.dimensions)} - # First grid as there is no heuristic on how to choose from the leftovers grid = grids.pop() except KeyError: grid = None diff --git a/devito/passes/iet/langbase.py b/devito/passes/iet/langbase.py index 457d8476c3..2acccba648 100644 --- a/devito/passes/iet/langbase.py +++ b/devito/passes/iet/langbase.py @@ -214,8 +214,8 @@ def DeviceIteration(self): def Prodder(self): return self.lang.Prodder - def _n_device_pointers(self, *args, **kwargs): - return 0 + def _device_pointers(self, *args, **kwargs): + return {} class DeviceAwareMixin(object): @@ -328,11 +328,10 @@ def _(iet): return _initialize(iet) - def _n_device_pointers(self, iet): + def _device_pointers(self, iet): functions = FindSymbols().visit(iet) devfuncs = [f for f in functions if f.is_Array and f._mem_local] - - return len(devfuncs) + return set(devfuncs) def _is_offloadable(self, iet): """ @@ -345,8 +344,7 @@ def _is_offloadable(self, iet): functions = FindSymbols().visit(iet) buffers = [f for f in functions if f.is_Array and f._mem_mapped] hostfuncs = [f for f in functions if not is_on_device(f, self.gpu_fit)] - - return not (hostfuncs and buffers) + return not (buffers and hostfuncs) class Sections(tuple): diff --git a/devito/passes/iet/parpragma.py b/devito/passes/iet/parpragma.py index 34ca370a60..b6476192b2 100644 --- a/devito/passes/iet/parpragma.py +++ b/devito/passes/iet/parpragma.py @@ -301,7 +301,7 @@ def _select_candidates(self, candidates): # Iterations and their position (i.e. 
outermost to innermost) in the nest score = ( int(root.is_ParallelNoAtomic), - self._n_device_pointers(root), # Outermost offloadable + len(self._device_pointers(root)), # Outermost offloadable int(len([i for i in collapsable if i.is_ParallelNoAtomic]) >= 1), int(len([i for i in collapsable if i.is_ParallelRelaxed]) >= 1), -(n0 + 1) # The outermost, the better @@ -375,7 +375,7 @@ def _make_partree(self, candidates, nthreads=None): ncollapsed=ncollapsed, nthreads=nthreads, **root.args) prefix = [] - elif all(i.is_ParallelRelaxed for i in candidates) and nthreads is not None: + elif nthreads is not None: body = self.HostIteration(schedule='static', parallel=nthreads is not self.nthreads_nested, ncollapsed=ncollapsed, nthreads=nthreads, @@ -383,7 +383,6 @@ def _make_partree(self, candidates, nthreads=None): prefix = [] else: # pragma ... for ... schedule(..., expr) - assert nthreads is None nthreads = self.nthreads_nonaffine chunk_size = Symbol(name='chunk_size') body = self.HostIteration(ncollapsed=ncollapsed, chunk_size=chunk_size, diff --git a/devito/types/dimension.py b/devito/types/dimension.py index 2d11ccb220..43865c72f5 100644 --- a/devito/types/dimension.py +++ b/devito/types/dimension.py @@ -983,7 +983,7 @@ def symbolic_incr(self): def bound_symbols(self): return set(self.parent.bound_symbols) - def _arg_defaults(self, alias=None, **kwargs): + def _arg_defaults(self, **kwargs): return {} def _arg_values(self, *args, **kwargs): @@ -1452,7 +1452,7 @@ def symbolic_max(self): def _arg_names(self): return (self.min_name, self.max_name, self.name) + self.parent._arg_names - def _arg_defaults(self, _min=None, size=None, **kwargs): + def _arg_defaults(self, _min=None, **kwargs): """ A map of default argument values defined by this dimension. 
From 97f1cc876d063a5a3aba9ce41678b013e1529be4 Mon Sep 17 00:00:00 2001 From: mloubout Date: Wed, 20 Sep 2023 09:43:26 -0400 Subject: [PATCH 7/7] api: make SubDimension's side easy to check --- devito/types/dimension.py | 43 +++++++++++++++++++++++++++------------ tests/test_dimension.py | 9 +++++++- 2 files changed, 38 insertions(+), 14 deletions(-) diff --git a/devito/types/dimension.py b/devito/types/dimension.py index 43865c72f5..d7b25e382d 100644 --- a/devito/types/dimension.py +++ b/devito/types/dimension.py @@ -586,7 +586,7 @@ def left(cls, name, parent, thickness, local=True): return cls(name, parent, left=parent.symbolic_min, right=parent.symbolic_min+lst-1, - thickness=((lst, thickness), (rst, 0)), + thickness=((lst, thickness), (rst, None)), local=local) @classmethod @@ -595,7 +595,7 @@ def right(cls, name, parent, thickness, local=True): return cls(name, parent, left=parent.symbolic_max-rst+1, right=parent.symbolic_max, - thickness=((lst, 0), (rst, thickness)), + thickness=((lst, None), (rst, thickness)), local=local) @classmethod @@ -628,6 +628,18 @@ def local(self): def thickness(self): return self._thickness + @property + def is_left(self): + return self.thickness.right[1] is None + + @property + def is_right(self): + return self.thickness.left[1] is None + + @property + def is_middle(self): + return not self.is_left and not self.is_right + @cached_property def bound_symbols(self): # Add thickness symbols @@ -701,7 +713,7 @@ def _arg_values(self, interval, grid=None, **kwargs): # However, arguments from the user are considered global # So overriding the thickness to a nonzero value should not cause # boundaries to exist between ranks where they did not before - requested_ltkn, requested_rtkn = ( + r_ltkn, r_rtkn = ( kwargs.get(k.name, v) for k, v in self.thickness ) @@ -710,19 +722,24 @@ def _arg_values(self, interval, grid=None, **kwargs): if self.local: # dimension is of type ``left``/right`` - compute the 'offset' # and then add 1 to get the 
appropriate thickness - ltkn = grid.distributor.glb_to_loc(self.root, requested_ltkn-1, LEFT) - rtkn = grid.distributor.glb_to_loc(self.root, requested_rtkn-1, RIGHT) - ltkn = ltkn+1 if ltkn is not None else 0 - rtkn = rtkn+1 if rtkn is not None else 0 + if r_ltkn is not None: + ltkn = grid.distributor.glb_to_loc(self.root, r_ltkn-1, LEFT) + ltkn = ltkn+1 if ltkn is not None else 0 + else: + ltkn = 0 + + if r_rtkn is not None: + rtkn = grid.distributor.glb_to_loc(self.root, r_rtkn-1, RIGHT) + rtkn = rtkn+1 if rtkn is not None else 0 + else: + rtkn = 0 else: # dimension is of type ``middle`` - ltkn = grid.distributor.glb_to_loc(self.root, requested_ltkn, - LEFT) or 0 - rtkn = grid.distributor.glb_to_loc(self.root, requested_rtkn, - RIGHT) or 0 + ltkn = grid.distributor.glb_to_loc(self.root, r_ltkn, LEFT) or 0 + rtkn = grid.distributor.glb_to_loc(self.root, r_rtkn, RIGHT) or 0 else: - ltkn = requested_ltkn - rtkn = requested_rtkn + ltkn = r_ltkn or 0 + rtkn = r_rtkn or 0 return {i.name: v for i, v in zip(self._thickness_map, (ltkn, rtkn))} diff --git a/tests/test_dimension.py b/tests/test_dimension.py index 9d41dddf48..a54e160b38 100644 --- a/tests/test_dimension.py +++ b/tests/test_dimension.py @@ -243,6 +243,10 @@ def test_subdim_middle(self, opt): xi = SubDimension.middle(name='xi', parent=x, thickness_left=1, thickness_right=1) + assert xi.is_middle + assert not xi.is_left + assert not xi.is_right + eqs = [Eq(u.forward, u + 1)] eqs = [e.subs(x, xi) for e in eqs] @@ -261,6 +265,8 @@ def test_symbolic_size(self): thickness = 4 xleft = SubDimension.left(name='xleft', parent=x, thickness=thickness) + assert xleft.is_left + assert not xleft.is_middle assert xleft.symbolic_size == xleft.thickness.left[0] xi = SubDimension.middle(name='xi', parent=x, @@ -289,7 +295,8 @@ def test_bcs(self, opt): xi = SubDimension.middle(name='xi', parent=x, thickness_left=thickness, thickness_right=thickness) xright = SubDimension.right(name='xright', parent=x, thickness=thickness) - + 
assert xright.is_right + assert not xright.is_middle yi = SubDimension.middle(name='yi', parent=y, thickness_left=thickness, thickness_right=thickness)