From 2c21b092d9c691010d4c9d751f7cfe1184d7b3b5 Mon Sep 17 00:00:00 2001
From: Fabio Luporini <fabio@devitocodes.com>
Date: Fri, 5 Jan 2024 15:19:00 +0000
Subject: [PATCH 01/23] compiler: Start revamping autopadding

---
 devito/arch/archinfo.py           | 59 ++++++++++++++++------
 devito/data/allocators.py         | 82 ++++++++++++++++++-------------
 devito/data/data.py               | 14 ++++--
 devito/operator/operator.py       |  3 +-
 devito/passes/clusters/aliases.py | 13 +++--
 devito/passes/iet/parpragma.py    |  6 +--
 devito/types/array.py             | 33 -------------
 devito/types/basic.py             | 22 +++++++--
 devito/types/dense.py             | 72 +++++++++++++++------------
 devito/types/misc.py              | 39 ++++++++++++++-
 10 files changed, 211 insertions(+), 132 deletions(-)

diff --git a/devito/arch/archinfo.py b/devito/arch/archinfo.py
index 3b5e647c50..aa796f2f70 100644
--- a/devito/arch/archinfo.py
+++ b/devito/arch/archinfo.py
@@ -598,6 +598,14 @@ def get_platform():
 class Platform(object):
 
     registry = {}
+    """
+    The Platform registry.
+
+    Each new Platform instance is automatically added to the registry.
+    """
+
+    max_mem_trans_nbytes = None
+    """Maximum memory transaction size in bytes."""
 
     def __init__(self, name):
         self.name = name
@@ -630,16 +638,6 @@ def _detect_isa(self):
     def threads_per_core(self):
         return self.cores_logical // self.cores_physical
 
-    @property
-    def simd_reg_size(self):
-        """Size in bytes of a SIMD register."""
-        return isa_registry.get(self.isa, 0)
-
-    def simd_items_per_reg(self, dtype):
-        """Number of items of type ``dtype`` that can fit in a SIMD register."""
-        assert self.simd_reg_size % np.dtype(dtype).itemsize == 0
-        return int(self.simd_reg_size / np.dtype(dtype).itemsize)
-
     @property
     def memtotal(self):
         """Physical memory size in bytes, or None if unknown."""
@@ -649,9 +647,23 @@ def memavail(self, *args, **kwargs):
         """Available physical memory in bytes, or None if unknown."""
         return None
 
+    def max_mem_trans_size(self, dtype):
+        """
+        Number of items of type `dtype` that can be transferred in a single
+        memory transaction.
+        """
+        assert self.max_mem_trans_nbytes % np.dtype(dtype).itemsize == 0
+        return int(self.max_mem_trans_nbytes / np.dtype(dtype).itemsize)
+
 
 class Cpu64(Platform):
 
+    # The vast majority of CPUs have a 64-byte cache line
+    max_mem_trans_nbytes = 64
+
+    # The known ISAs are to be provided by the subclasses
+    known_isas = ()
+
     def __init__(self, name, cores_logical=None, cores_physical=None, isa=None):
         super().__init__(name)
 
@@ -661,9 +673,6 @@ def __init__(self, name, cores_logical=None, cores_physical=None, isa=None):
         self.cores_physical = cores_physical or cpu_info['physical']
         self.isa = isa or self._detect_isa()
 
-    # The known ISAs are to be provided by the subclasses
-    known_isas = ()
-
     @classmethod
     def _mro(cls):
         # Retain only the CPU Platforms
@@ -683,6 +692,20 @@ def _detect_isa(self):
                 return i
         return 'cpp'
 
+    @property
+    def simd_reg_nbytes(self):
+        """
+        Size in bytes of a SIMD register.
+        """
+        return isa_registry.get(self.isa, 0)
+
+    def simd_items_per_reg(self, dtype):
+        """
+        Number of items of type `dtype` that fit in a SIMD register.
+        """
+        assert self.simd_reg_nbytes % np.dtype(dtype).itemsize == 0
+        return int(self.simd_reg_nbytes / np.dtype(dtype).itemsize)
+
     @cached_property
     def memtotal(self):
         return psutil.virtual_memory().total
@@ -758,7 +781,7 @@ def _mro(cls):
                 break
         return retval
 
-    @cached_property
+    @property
     def march(self):
         return None
 
@@ -783,13 +806,17 @@ def memavail(self, deviceid=0):
 
 class IntelDevice(Device):
 
-    @cached_property
+    max_mem_trans_nbytes = 64
+
+    @property
     def march(self):
         return ''
 
 
 class NvidiaDevice(Device):
 
+    max_mem_trans_nbytes = 128
+
     @cached_property
     def march(self):
         info = get_gpu_info()
@@ -802,6 +829,8 @@ def march(self):
 
 class AmdDevice(Device):
 
+    max_mem_trans_nbytes = 256
+
     @cached_property
     def march(cls):
         # TODO: this corresponds to Vega, which acts as the fallback `march`
diff --git a/devito/data/allocators.py b/devito/data/allocators.py
index d8626fd153..b051ff02cf 100644
--- a/devito/data/allocators.py
+++ b/devito/data/allocators.py
@@ -1,13 +1,13 @@
 import abc
 from functools import reduce
 from operator import mul
+import ctypes
+from ctypes.util import find_library
 import mmap
 import os
 import sys
 
 import numpy as np
-import ctypes
-from ctypes.util import find_library
 
 from devito.logger import logger
 from devito.parameters import configuration
@@ -24,9 +24,6 @@ class MemoryAllocator(object):
 
     __metaclass__ = abc.ABCMeta
 
-    is_Posix = False
-    is_Numa = False
-
     _attempted_init = False
     lib = None
 
@@ -51,7 +48,7 @@ def initialize(cls):
         """
         return
 
-    def alloc(self, shape, dtype):
+    def alloc(self, shape, dtype, padding=0):
         """
         Allocate memory.
 
@@ -61,6 +58,9 @@ def alloc(self, shape, dtype):
             Shape of the allocated array.
         dtype : numpy.dtype
             The data type of the raw data.
+        padding : int or 2-tuple of ints, optional
+            The number of points that are allocated before and after the data,
+            that is in addition to the requested shape. Defaults to 0.
 
         Returns
         -------
@@ -69,25 +69,41 @@ def alloc(self, shape, dtype):
             access the data as a ctypes object. The second element is an opaque
             object that is needed only for the "memfree" call.
         """
-        size = int(reduce(mul, shape))
+        datasize = int(reduce(mul, shape))
         ctype = dtype_to_ctype(dtype)
 
-        c_pointer, memfree_args = self._alloc_C_libcall(size, ctype)
-        if c_pointer is None:
-            raise RuntimeError("Unable to allocate %d elements in memory", str(size))
-
-        # cast to 1D array of the specified size
-        ctype_1d = ctype * size
+        # Add padding, if any (a scalar applies to both the left and right side)
+        try:
+            padleft, padright = padding
+        except TypeError:
+            padleft, padright = padding, padding
+        if not (isinstance(padleft, int) and isinstance(padright, int)):
+            raise TypeError("padding must be an int or a 2-tuple of ints")
+        size = datasize + padleft + padright
+
+        padleft_pointer, memfree_args = self._alloc_C_libcall(size, ctype)
+        if padleft_pointer is None:
+            raise RuntimeError("Unable to allocate %d elements in memory" % size)
+
+        # Compute the pointer to the user data
+        padleft_bytes = padleft * ctypes.sizeof(ctype)
+        c_pointer = ctypes.c_void_p(padleft_pointer.value + padleft_bytes)
+
+        # Cast to 1D array of the specified `datasize`
+        ctype_1d = ctype * datasize
         buf = ctypes.cast(c_pointer, ctypes.POINTER(ctype_1d)).contents
-        pointer = np.frombuffer(buf, dtype=dtype)
-        # pointer.reshape should not be used here because it may introduce a copy
-        # From https://docs.scipy.org/doc/numpy/reference/generated/numpy.reshape.html:
-        # It is not always possible to change the shape of an array without copying the
-        # data. If you want an error to be raised when the data is copied, you should
-        # assign the new shape to the shape attribute of the array:
-        pointer.shape = shape
+        array = np.frombuffer(buf, dtype=dtype)
+
+        # `array.reshape` should not be used here because it may introduce
+        # a copy. From `docs.scipy.org/doc/numpy/reference/generated/numpy.reshape`:
+        #   It is not always possible to change the shape of an array without
+        #   copying the data. If you want an error to be raised when the data
+        #   is copied, you should assign the new shape to the shape attribute
+        #   of the array:
+        array.shape = shape
+        ndarray = array  # At this point it's interpreted as an ndarray
 
-        return (pointer, memfree_args)
+        return (ndarray, memfree_args)
 
     @abc.abstractmethod
     def _alloc_C_libcall(self, size, ctype):
@@ -124,8 +140,6 @@ class PosixAllocator(MemoryAllocator):
     aligned to page boundaries.
     """
 
-    is_Posix = True
-
     @classmethod
     def initialize(cls):
         handle = find_library('c')
@@ -162,7 +176,7 @@ def free(self, c_pointer):
 class GuardAllocator(PosixAllocator):
 
     """
-    Memory allocator based on ``posix`` functions. The allocated memory is
+    Memory allocator based on `posix` functions. The allocated memory is
     aligned to page boundaries.  Additionally, it allocates extra memory
     before and after the data, and configures it so that an SEGV is thrown
     immediately if an out-of-bounds access occurs.
@@ -195,20 +209,20 @@ def _alloc_C_libcall(self, size, ctype):
         if ret != 0:
             return None, None
 
-        # generate pointers to the left padding, the user data, and the right pad
+        # Generate pointers to the left padding, the user data, and the right pad
         padleft_pointer = c_pointer
         c_pointer = ctypes.c_void_p(c_pointer.value + self.padding_bytes)
         padright_pointer = ctypes.c_void_p(c_pointer.value + npages_user * pagesize)
 
-        # and set the permissions on the pad memory to 0 (no access)
-        # if these fail, don't worry about failing the entire allocation
+        # And set the permissions on the pad memory to 0 (no access)
+        # If these fail, don't worry about failing the entire allocation
         c_padsize = ctypes.c_ulong(self.padding_bytes)
         if self.lib.mprotect(padleft_pointer, c_padsize, ctypes.c_int(0)):
             logger.warning("couldn't protect memory")
         if self.lib.mprotect(padright_pointer, c_padsize, ctypes.c_int(0)):
             logger.warning("couldn't protect memory")
 
-        # if there is a multiple of 4 bytes left, use the code below to poison
+        # If there is a multiple of 4 bytes left, use the code below to poison
         # the memory
         if nbytes_user % 4 == 0:
             poison_size = npages_user*pagesize - nbytes_user
@@ -216,16 +230,16 @@ def _alloc_C_libcall(self, size, ctype):
             poison_ptr = ctypes.cast(ctypes.c_void_p(c_pointer.value + nbytes_user),
                                      intp_type)
 
-            # for both float32 and float64, a sequence of -100 int32s represents NaNs,
-            # at least on little-endian architectures.  It shouldn't matter what we
-            # put in there, anyway
+            # For both float32 and float64, a sequence of -100 int32s
+            # represents NaNs, at least on little-endian architectures;
+            # it shouldn't matter what we put in there, anyway
             for i in range(poison_size // 4):
                 poison_ptr[i] = -100
 
         return c_pointer, (padleft_pointer, c_bytesize)
 
     def free(self, c_pointer, total_size):
-        # unprotect it, since free() accesses it, I think...
+        # Unprotect it, since free() accesses it, I think...
         self.lib.mprotect(c_pointer, total_size,
                           ctypes.c_int(mmap.PROT_READ | mmap.PROT_WRITE))
         self.lib.free(c_pointer)
@@ -247,8 +261,6 @@ class NumaAllocator(MemoryAllocator):
         ("allocate on any NUMA node with sufficient free memory") are accepted.
     """
 
-    is_Numa = True
-
     @classmethod
     def initialize(cls):
         handle = find_library('numa')
@@ -429,4 +441,4 @@ def default_allocator(name=None):
           infer_knl_mode() == 'flat'):
         return ALLOC_KNL_MCDRAM
     else:
-        return ALLOC_ALIGNED
+        return custom_allocators.get('default', ALLOC_ALIGNED)
diff --git a/devito/data/data.py b/devito/data/data.py
index a7d5dc17f7..31db578cd1 100644
--- a/devito/data/data.py
+++ b/devito/data/data.py
@@ -26,12 +26,16 @@ class Data(np.ndarray):
     decomposition : tuple of Decomposition, optional
         The data decomposition, for each dimension.
     modulo : tuple of bool, optional
-        If the i-th entry is True, then the i-th array dimension uses modulo indexing.
+        If the i-th entry is True, then the i-th array dimension uses modulo
+        indexing.
     allocator : MemoryAllocator, optional
         Used to allocate memory. Defaults to `ALLOC_ALIGNED`.
     distributor : Distributor, optional
-        The distributor from which the original decomposition was produced. Note that
-        the decomposition Parameter above may be different to distributor.decomposition.
+        The distributor from which the original decomposition was produced.
+        Note that `decomposition` may differ from `distributor.decomposition`.
+    padding : int or 2-tuple of ints, optional
+        The number of points that are allocated before and after the data,
+        that is in addition to the requested shape. Defaults to 0.
 
     Notes
     -----
@@ -45,9 +49,9 @@ class Data(np.ndarray):
     """
 
     def __new__(cls, shape, dtype, decomposition=None, modulo=None,
-                allocator=ALLOC_ALIGNED, distributor=None):
+                allocator=ALLOC_ALIGNED, distributor=None, padding=0):
         assert len(shape) == len(modulo)
-        ndarray, memfree_args = allocator.alloc(shape, dtype)
+        ndarray, memfree_args = allocator.alloc(shape, dtype, padding=padding)
         obj = ndarray.view(cls)
         obj._allocator = allocator
         obj._memfree_args = memfree_args
diff --git a/devito/operator/operator.py b/devito/operator/operator.py
index 4d5208235a..ac91069f93 100644
--- a/devito/operator/operator.py
+++ b/devito/operator/operator.py
@@ -624,7 +624,8 @@ def _prepare_arguments(self, autotune=None, **kwargs):
 
         # Sanity check
         for p in self.parameters:
-            p._arg_check(args, self._dspace[p], am=self._access_modes.get(p))
+            p._arg_check(args, self._dspace[p], am=self._access_modes.get(p),
+                         **kwargs)
         for d in self.dimensions:
             if d.is_Derived:
                 d._arg_check(args, self._dspace[p])
diff --git a/devito/passes/clusters/aliases.py b/devito/passes/clusters/aliases.py
index 7a8321a7f9..ed749113bf 100644
--- a/devito/passes/clusters/aliases.py
+++ b/devito/passes/clusters/aliases.py
@@ -6,6 +6,7 @@
 import numpy as np
 import sympy
 
+from devito.arch import Cpu64
 from devito.finite_differences import EvalDerivative, IndexDerivative, Weights
 from devito.ir import (SEQUENTIAL, PARALLEL_IF_PVT, ROUNDABLE, SEPARABLE, Forward,
                        IterationSpace, Interval, Cluster, ExprGeometry, Queue,
@@ -830,17 +831,21 @@ def optimize_schedule_rotations(schedule, sregistry):
 
 def optimize_schedule_padding(schedule, meta, platform):
     """
-    Round up the innermost IterationInterval of the tensor temporaries IterationSpace
-    to a multiple of the SIMD vector length. This is not always possible though (it
-    depends on how much halo is safely accessible in all read Functions).
+    Attempt rounding up the innermost IterationInterval of the tensor temporaries
+    IterationSpace to a multiple of the SIMD vector length.
     """
+    if not isinstance(platform, Cpu64):
+        return schedule
+
+    # TODO: REMOVE ROUNDABLE AND PERFORM ANALYSIS HERE
+
     processed = []
     for i in schedule:
         try:
             it = i.ispace.itintervals[-1]
             if it.dim is i.writeto[-1].dim and ROUNDABLE in meta.properties[it.dim]:
                 vl = platform.simd_items_per_reg(meta.dtype)
-                ispace = i.ispace.add(Interval(it.dim, 0, it.size % vl))
+                ispace = i.ispace.add(Interval(it.dim, 0, vl - it.size % vl))
             else:
                 ispace = i.ispace
             processed.append(ScheduledAlias(
diff --git a/devito/passes/iet/parpragma.py b/devito/passes/iet/parpragma.py
index b41b871b55..252cd9ddd6 100644
--- a/devito/passes/iet/parpragma.py
+++ b/devito/passes/iet/parpragma.py
@@ -41,15 +41,15 @@ def _support_array_reduction(cls, compiler):
         return True
 
     @property
-    def simd_reg_size(self):
-        return self.platform.simd_reg_size
+    def simd_reg_nbytes(self):
+        return self.platform.simd_reg_nbytes
 
     def _make_simd_pragma(self, iet):
         indexeds = FindSymbols('indexeds').visit(iet)
         aligned = {i.name for i in indexeds if i.function.is_DiscreteFunction}
         if aligned:
             simd = self.lang['simd-for-aligned']
-            simd = as_tuple(simd(','.join(sorted(aligned)), self.simd_reg_size))
+            simd = as_tuple(simd(','.join(sorted(aligned)), self.simd_reg_nbytes))
         else:
             simd = as_tuple(self.lang['simd-for'])
 
diff --git a/devito/types/array.py b/devito/types/array.py
index e51edd27ed..0ada7e6ad5 100644
--- a/devito/types/array.py
+++ b/devito/types/array.py
@@ -127,39 +127,6 @@ def __init_finalize__(self, *args, **kwargs):
         self._initvalue = kwargs.get('initvalue')
         assert self._initvalue is None or self._scope != 'heap'
 
-    def __padding_setup__(self, **kwargs):
-        padding = kwargs.get('padding')
-        if padding is None:
-            padding = [(0, 0) for _ in range(self.ndim)]
-            if kwargs.get('autopadding', configuration['autopadding']):
-                # Heuristic 1; Arrays are typically introduced for temporaries
-                # introduced during compilation, and are almost always used together
-                # with loop blocking.  Since the typical block size is a multiple of
-                # the SIMD vector length, `vl`, padding is made such that the
-                # NODOMAIN size is a multiple of `vl` too
-
-                # Heuristic 2: the right-NODOMAIN size is not only a multiple of
-                # `vl`, but also guaranteed to be *at least* greater or equal than
-                # `vl`, so that the compiler can tweak loop trip counts to maximize
-                # the effectiveness of SIMD vectorization
-
-                # Let UB be a function that rounds up a value `x` to the nearest
-                # multiple of the SIMD vector length
-                vl = configuration['platform'].simd_items_per_reg(self.dtype)
-                ub = lambda x: int(ceil(x / vl)) * vl
-
-                fvd_halo_size = sum(self.halo[-1])
-                fvd_pad_size = (ub(fvd_halo_size) - fvd_halo_size) + vl
-
-                padding[-1] = (0, fvd_pad_size)
-            return tuple(padding)
-        elif isinstance(padding, int):
-            return tuple((0, padding) for _ in range(self.ndim))
-        elif isinstance(padding, tuple) and len(padding) == self.ndim:
-            return tuple((0, i) if isinstance(i, int) else i for i in padding)
-        else:
-            raise TypeError("`padding` must be int or %d-tuple of ints" % self.ndim)
-
     @classmethod
     def __dtype_setup__(cls, **kwargs):
         return kwargs.get('dtype', np.float32)
diff --git a/devito/types/basic.py b/devito/types/basic.py
index 17835933e3..c8c7760ad6 100644
--- a/devito/types/basic.py
+++ b/devito/types/basic.py
@@ -839,8 +839,8 @@ class AbstractFunction(sympy.Function, Basic, Pickable, Evaluable):
     True if data is allocated as a single, contiguous chunk of memory.
     """
 
-    __rkwargs__ = ('name', 'dtype', 'grid', 'halo', 'padding', 'alias',
-                   'space', 'function')
+    __rkwargs__ = ('name', 'dtype', 'grid', 'halo', 'padding', 'ghost',
+                   'alias', 'space', 'function')
 
     def __new__(cls, *args, **kwargs):
         # Preprocess arguments
@@ -930,10 +930,11 @@ def sort_key(self, order=None):
         return class_key, args, exp, coeff
 
     def __init_finalize__(self, *args, **kwargs):
-        # Setup halo and padding regions
+        # Setup halo, padding, and ghost regions
         self._is_halo_dirty = False
         self._halo = self.__halo_setup__(**kwargs)
         self._padding = self.__padding_setup__(**kwargs)
+        self._ghost = self.__ghost_setup__(**kwargs)
 
         # There may or may not be a `Grid`
         self._grid = kwargs.get('grid')
@@ -988,6 +989,9 @@ def __padding_setup__(self, **kwargs):
         padding = tuple(kwargs.get('padding', [(0, 0) for i in range(self.ndim)]))
         return DimensionTuple(*padding, getters=self.dimensions)
 
+    def __ghost_setup__(self, **kwargs):
+        return (0, 0)
+
     def __distributor_setup__(self, **kwargs):
         # There may or may not be a `Distributor`. In the latter case, the
         # AbstractFunction is to be considered "local" to each MPI rank
@@ -1171,6 +1175,10 @@ def halo(self):
     def padding(self):
         return self._padding
 
+    @property
+    def ghost(self):
+        return self._ghost
+
     @property
     def is_const(self):
         return False
@@ -1266,6 +1274,14 @@ def _size_nodomain(self):
 
         return DimensionTuple(*sizes, getters=self.dimensions, left=left, right=right)
 
+    @cached_property
+    def _size_ghost(self):
+        """
+        Number of points in the ghost region, that is the two areas before
+        and after the allocated data.
+        """
+        return Size(*self._ghost)
+
     @cached_property
     def _offset_domain(self):
         """Number of points before the first domain element."""
diff --git a/devito/types/dense.py b/devito/types/dense.py
index 9d33777551..c9ad5dcb98 100644
--- a/devito/types/dense.py
+++ b/devito/types/dense.py
@@ -126,7 +126,8 @@ def wrapper(self):
                 self._data = self._DataType(self.shape_allocated, self.dtype,
                                             modulo=self._mask_modulo,
                                             allocator=self._allocator,
-                                            distributor=self._distributor)
+                                            distributor=self._distributor,
+                                            padding=self._size_ghost)
 
                 # Initialize data
                 if self._first_touch:
@@ -836,32 +837,40 @@ def _arg_check(self, args, intervals, **kwargs):
         Raises
         ------
         InvalidArgument
-            If, given the runtime values `args`, an out-of-bounds array
-            access would be performed, or if shape/dtype don't match with
-            self's shape/dtype.
+            If an incompatibility is detected.
         """
         if self.name not in args:
             raise InvalidArgument("No runtime value for `%s`" % self.name)
 
-        key = args[self.name]
-        if len(key.shape) != self.ndim:
+        obj = kwargs.get(self.name, self)
+        data = args[self.name]
+
+        if len(data.shape) != self.ndim:
             raise InvalidArgument("Shape %s of runtime value `%s` does not match "
                                   "dimensions %s" %
-                                  (key.shape, self.name, self.dimensions))
-        if key.dtype != self.dtype:
+                                  (data.shape, self.name, self.dimensions))
+        if data.dtype != self.dtype:
             warning("Data type %s of runtime value `%s` does not match the "
-                    "Function data type %s" % (key.dtype, self.name, self.dtype))
+                    "Function data type %s" % (data.dtype, self.name, self.dtype))
+
+        if self._honors_autopadding and not obj._honors_autopadding:
+            raise InvalidArgument("Runtime override `%s` does not honour "
+                                  "`autopadding`" % obj.name)
 
-        for i, s in zip(self.dimensions, key.shape):
-            i._arg_check(args, s, intervals[i])
+        # Check each Dimension for potential OOB accesses
+        # NOTE: The contiguous dimension is special in that it can rely on the
+        # `ghost` region too
+        safezone = [0]*(self.ndim - 1) + [obj._size_ghost.right]
+        for i, s, z in zip(self.dimensions, data.shape, safezone):
+            i._arg_check(args, s + z, intervals[i])
 
         if args.options['index-mode'] == 'int32' and \
            args.options['linearize'] and \
-           self.size - 1 >= np.iinfo(np.int32).max:
+           data.size - 1 >= np.iinfo(np.int32).max:
             raise InvalidArgument("`%s`, with its %d elements, may be too big for "
                                   "int32 pointer arithmetic, which might cause an "
                                   "overflow. Use the 'index-mode=int64' option"
-                                  % (self, self.size))
+                                  % (self, data.size))
 
     def _arg_finalize(self, args, alias=None):
         key = alias or self
@@ -1119,37 +1128,38 @@ def __halo_setup__(self, **kwargs):
 
     def __padding_setup__(self, **kwargs):
         padding = kwargs.get('padding')
+
         if padding is None:
             if kwargs.get('autopadding', configuration['autopadding']):
-                # Auto-padding
-                # 0-padding in all Dimensions except in the Fastest Varying Dimension,
-                # `fvd`, which is the innermost one
-                padding = [(0, 0) for i in self.dimensions[:-1]]
-                fvd = self.dimensions[-1]
-                # Let UB be a function that rounds up a value `x` to the nearest
-                # multiple of the SIMD vector length, `vl`
-                vl = configuration['platform'].simd_items_per_reg(self.dtype)
-                ub = lambda x: int(ceil(x / vl)) * vl
-                # Given the HALO and DOMAIN sizes, the right-PADDING is such that:
-                # * the `fvd` size is a multiple of `vl`
-                # * it contains *at least* `vl` points
-                # This way:
-                # * all first grid points along the `fvd` will be cache-aligned
-                # * there is enough room to round up the loop trip counts to maximize
-                #   the effectiveness SIMD vectorization
-                fvd_pad_size = (ub(self._size_nopad[fvd]) - self._size_nopad[fvd]) + vl
-                padding.append((0, fvd_pad_size))
+                # Auto-padding is to maximize the efficiency of memory accesses:
+                # * Perform as many aligned accesses as possible
+                # * Ensure there's enough room to perform memory transactions
+                #   of maximum size. This essentially means that the remainder
+                #   DOMAIN region (i.e., the last block of elements that is not
+                #   a multiple of the maximum memory transaction size) plus the
+                #   NODOMAIN size has to be as large as the maximum memory
+                #   transaction size.
+                mmts = configuration['platform'].max_mem_trans_size(self.dtype)
+
+                d = self.dimensions[-1]
+                pad_size = mmts - self._size_nopad[d] % mmts
+                padding = [(0, 0) for i in self.dimensions[:-1]] + [(0, pad_size)]
             else:
                 padding = tuple((0, 0) for d in self.dimensions)
+
         elif isinstance(padding, DimensionTuple):
             padding = tuple(padding[d] for d in self.dimensions)
+
         elif isinstance(padding, int):
             padding = tuple((0, padding) if d.is_Space else (0, 0)
                             for d in self.dimensions)
+
         elif isinstance(padding, tuple) and len(padding) == self.ndim:
             padding = tuple((0, i) if isinstance(i, int) else i for i in padding)
+
         else:
             raise TypeError("`padding` must be int or %d-tuple of ints" % self.ndim)
+
         return DimensionTuple(*padding, getters=self.dimensions)
 
     @property
diff --git a/devito/types/misc.py b/devito/types/misc.py
index 88d5dcae13..0297b0f781 100644
--- a/devito/types/misc.py
+++ b/devito/types/misc.py
@@ -4,7 +4,8 @@
 import sympy
 from sympy.core.core import ordering_of_classes
 
-from devito.types import Array, CompositeObject, Indexed, Symbol
+from devito.parameters import configuration
+from devito.types import Array, CompositeObject, DimensionTuple, Indexed, Symbol
 from devito.types.basic import IndexedData
 from devito.tools import Pickable, as_tuple
 
@@ -193,7 +194,41 @@ class TempArray(Array):
     sub-expressions.
     """
 
-    pass
+    def __padding_setup__(self, **kwargs):
+        # TODO: CURRENTLY IDENTICAL TO dense.__padding_setup__
+        # TODO: MUST BE FACTORED OUT INTO A COMMON BASE CLASS
+        padding = kwargs.get('padding')
+        if padding is None:
+            if kwargs.get('autopadding', configuration['autopadding']):
+                # Auto-padding is to maximize the efficiency of memory accesses:
+                # * Perform as many aligned accesses as possible
+                # * Ensure there's enough room to perform memory transactions
+                #   of maximum size. This essentially means that the remainder
+                #   DOMAIN region (i.e., the last block of elements that is not
+                #   a multiple of the maximum memory transaction size) plus the
+                #   NODOMAIN size has to be as large as the maximum memory
+                #   transaction size.
+                mmts = configuration['platform'].max_mem_trans_size(self.dtype)
+
+                d = self.dimensions[-1]
+                pad_size = mmts - self._size_nopad[d] % mmts
+                padding = [(0, 0) for i in self.dimensions[:-1]] + [(0, pad_size)]
+            else:
+                padding = tuple((0, 0) for d in self.dimensions)
+
+        elif isinstance(padding, DimensionTuple):
+            padding = tuple(padding[d] for d in self.dimensions)
+
+        elif isinstance(padding, int):
+            padding = tuple((0, padding) for _ in range(self.ndim))
+
+        elif isinstance(padding, tuple) and len(padding) == self.ndim:
+            padding = tuple((0, i) if isinstance(i, int) else i for i in padding)
+
+        else:
+            raise TypeError("`padding` must be int or %d-tuple of ints" % self.ndim)
+
+        return DimensionTuple(*padding, getters=self.dimensions)
 
 
 class Fence(object):

From c7b601e779495b3c4b1442dc1e4c6f9b644a6c64 Mon Sep 17 00:00:00 2001
From: Fabio Luporini <fabio@devitocodes.com>
Date: Mon, 8 Jan 2024 09:53:43 +0000
Subject: [PATCH 02/23] compiler: Extirpate decrepit loop-rounding optimization

---
 devito/ir/clusters/analysis.py    | 38 +------------------------
 devito/ir/support/properties.py   |  8 +-----
 devito/passes/clusters/aliases.py | 34 ++--------------------
 devito/types/basic.py             | 10 -------
 tests/test_dse.py                 | 47 -------------------------------
 5 files changed, 4 insertions(+), 133 deletions(-)

diff --git a/devito/ir/clusters/analysis.py b/devito/ir/clusters/analysis.py
index 4778f2b2b9..7e94796971 100644
--- a/devito/ir/clusters/analysis.py
+++ b/devito/ir/clusters/analysis.py
@@ -1,6 +1,6 @@
 from devito.ir.clusters.visitors import QueueStateful
 from devito.ir.support import (AFFINE, PARALLEL, PARALLEL_INDEP, PARALLEL_IF_ATOMIC,
-                               ROUNDABLE, SEQUENTIAL, Forward)
+                               SEQUENTIAL, Forward)
 from devito.tools import as_tuple, flatten, timed_pass
 
 __all__ = ['analyze']
@@ -13,7 +13,6 @@ def analyze(clusters):
     # Collect properties
     clusters = Parallelism(state).process(clusters)
     clusters = Affiness(state).process(clusters)
-    clusters = Rounding(state).process(clusters)
 
     # Reconstruct Clusters attaching the discovered properties
     processed = [c.rebuild(properties=state.properties.get(c)) for c in clusters]
@@ -114,41 +113,6 @@ def _callback(self, clusters, d, prefix):
             return PARALLEL
 
 
-class Rounding(Detector):
-
-    def _callback(self, clusters, d, prefix):
-        itinterval = prefix[-1]
-
-        # The iteration direction must be Forward -- ROUNDABLE is for rounding *up*
-        if itinterval.direction is not Forward:
-            return
-
-        properties = self._fetch_properties(clusters, prefix)
-        if PARALLEL not in properties[d]:
-            return
-
-        scope = self._fetch_scope(clusters)
-
-        # All accessed Functions must have enough room in the PADDING region
-        # so that `i`'s trip count can safely be rounded up
-        # Note: autopadding guarantees that the padding size along the
-        # Fastest Varying Dimension is a multiple of the SIMD vector length
-        functions = [f for f in scope.functions if f.is_AbstractFunction]
-        try:
-            if any(not f._honors_autopadding for f in functions):
-                return
-        except ValueError:
-            # E.g., lazily allocated Functions don't have an accessible
-            # `.shape` until after the first call to `f._arg_values`
-            return
-
-        # Mixed data types (e.g., float and double) is unsupported
-        if len({f.dtype for f in functions}) > 1:
-            return
-
-        return ROUNDABLE
-
-
 class Affiness(Detector):
 
     """
diff --git a/devito/ir/support/properties.py b/devito/ir/support/properties.py
index 1b0e5fc872..0cb8cb5231 100644
--- a/devito/ir/support/properties.py
+++ b/devito/ir/support/properties.py
@@ -48,12 +48,6 @@ def __init__(self, name, val=None):
 SKEWABLE = Property('skewable')
 """A fully parallel Dimension that would benefit from wavefront/skewed tiling."""
 
-ROUNDABLE = Property('roundable')
-"""
-A Dimension whose upper limit may be rounded up to a multiple of the SIMD
-vector length thanks to the presence of enough padding.
-"""
-
 AFFINE = Property('affine')
 """
 A Dimension used to index into tensor objects only through affine and regular
@@ -130,7 +124,7 @@ def normalize_properties(*args):
 
 
 def relax_properties(properties):
-    return frozenset(properties - {PARALLEL_INDEP, ROUNDABLE})
+    return frozenset(properties - {PARALLEL_INDEP})
 
 
 class Properties(frozendict):
diff --git a/devito/passes/clusters/aliases.py b/devito/passes/clusters/aliases.py
index ed749113bf..dd4eaaf7e0 100644
--- a/devito/passes/clusters/aliases.py
+++ b/devito/passes/clusters/aliases.py
@@ -8,7 +8,7 @@
 
 from devito.arch import Cpu64
 from devito.finite_differences import EvalDerivative, IndexDerivative, Weights
-from devito.ir import (SEQUENTIAL, PARALLEL_IF_PVT, ROUNDABLE, SEPARABLE, Forward,
+from devito.ir import (SEQUENTIAL, PARALLEL_IF_PVT, SEPARABLE, Forward,
                        IterationSpace, Interval, Cluster, ExprGeometry, Queue,
                        IntervalGroup, LabeledVector, Vector, normalize_properties,
                        relax_properties, unbounded, minimum, maximum, extrema,
@@ -141,7 +141,6 @@ def _aliases_from_clusters(self, clusters, exclude, meta):
         # Schedule -> Schedule (optimization)
         if self.opt_rotate:
             schedule = optimize_schedule_rotations(schedule, self.sregistry)
-        schedule = optimize_schedule_padding(schedule, meta, self.platform)
 
         # Schedule -> [Clusters]_k
         processed, subs = lower_schedule(schedule, meta, self.sregistry,
@@ -829,34 +828,6 @@ def optimize_schedule_rotations(schedule, sregistry):
     return schedule.rebuild(*processed, rmapper=rmapper)
 
 
-def optimize_schedule_padding(schedule, meta, platform):
-    """
-    Attempt roundin up the innermost IterationInterval of the tensor temporaries
-    IterationSpace to a multiple of the SIMD vector length.
-    """
-    if not isinstance(platform, Cpu64):
-        return schedule
-
-    #TODO: REMOVE ROUNDABLE AND PERFORM ANALYSIS HERE
-
-    processed = []
-    for i in schedule:
-        try:
-            it = i.ispace.itintervals[-1]
-            if it.dim is i.writeto[-1].dim and ROUNDABLE in meta.properties[it.dim]:
-                vl = platform.simd_items_per_reg(meta.dtype)
-                ispace = i.ispace.add(Interval(it.dim, 0, vl - it.size % vl))
-            else:
-                ispace = i.ispace
-            processed.append(ScheduledAlias(
-                i.pivot, i.writeto, ispace, i.aliaseds, i.indicess,
-            ))
-        except (TypeError, KeyError, IndexError):
-            processed.append(i)
-
-    return schedule.rebuild(*processed)
-
-
 def lower_schedule(schedule, meta, sregistry, ftemps):
     """
     Turn a Schedule into a sequence of Clusters.
@@ -931,8 +902,7 @@ def lower_schedule(schedule, meta, sregistry, ftemps):
                 if any(i.is_Modulo for i in ispace.sub_iterators[d]):
                     properties[d] = normalize_properties(v, {SEQUENTIAL})
                 elif d not in writeto.itdims:
-                    properties[d] = normalize_properties(v, {PARALLEL_IF_PVT}) - \
-                        {ROUNDABLE}
+                    properties[d] = normalize_properties(v, {PARALLEL_IF_PVT})
             except KeyError:
                 # Non-dimension key such as (x, y) for diagonal stencil u(x+i hx, y+i hy)
                 pass
diff --git a/devito/types/basic.py b/devito/types/basic.py
index c8c7760ad6..5b678ef91d 100644
--- a/devito/types/basic.py
+++ b/devito/types/basic.py
@@ -1000,16 +1000,6 @@ def __distributor_setup__(self, **kwargs):
         except AttributeError:
             return kwargs.get('distributor')
 
-    @cached_property
-    def _honors_autopadding(self):
-        """
-        True if the actual padding is greater or equal than whatever autopadding
-        would produce, False otherwise.
-        """
-        autopadding = self.__padding_setup__(autopadding=True)
-        return all(l0 >= l1 and r0 >= r1
-                   for (l0, r0), (l1, r1) in zip(self.padding, autopadding))
-
     @property
     def name(self):
         """The name of the object."""
diff --git a/tests/test_dse.py b/tests/test_dse.py
index 5341bb010c..7e7956c314 100644
--- a/tests/test_dse.py
+++ b/tests/test_dse.py
@@ -1289,53 +1289,6 @@ def test_from_different_nests(self, rotate):
         assert np.all(u.data == u1.data)
         assert np.all(v.data == v1.data)
 
-    @skipif('cpu64-arm')
-    @pytest.mark.parametrize('rotate', [False, True])
-    @switchconfig(autopadding=True, platform='knl7210')  # Platform is to fix pad value
-    def test_minimize_remainders_due_to_autopadding(self, rotate):
-        """
-        Check that the bounds of the Iteration computing an aliasing expression are
-        relaxed (i.e., slightly larger) so that backend-compiler-generated remainder
-        loops are avoided.
-        """
-        grid = Grid(shape=(3, 3, 3))
-        x, y, z = grid.dimensions
-        t = grid.stepping_dim
-
-        u = TimeFunction(name='u', grid=grid, space_order=3)
-        u1 = TimeFunction(name='u1', grid=grid, space_order=3)
-
-        u.data_with_halo[:] = 0.5
-        u1.data_with_halo[:] = 0.5
-
-        # Leads to 3D aliases
-        eqn = Eq(u.forward, _R(_R(u[t, x, y, z] + u[t, x+1, y+1, z+1])*3. +
-                               _R(u[t, x+2, y+2, z+2] + u[t, x+3, y+3, z+3])*3. + 1.))
-
-        op0 = Operator(eqn, opt=('noop', {'openmp': False}))
-        op1 = Operator(eqn, opt=('advanced', {'openmp': False, 'cire-mingain': 0,
-                                              'cire-rotate': rotate}))
-
-        # Check code generation
-        bns, pbs = assert_blocking(op1, {'x0_blk0'})
-        xs, ys, zs = get_params(op1, 'x0_blk0_size', 'y0_blk0_size', 'z_size')
-        arrays = [i for i in FindSymbols().visit(bns['x0_blk0']) if i.is_Array]
-        assert len(arrays) == 1
-        assert len(FindNodes(VExpanded).visit(pbs['x0_blk0'])) == 0
-        assert arrays[0].padding == ((0, 0), (0, 0), (0, 30))
-        check_array(arrays[0], ((1, 1), (1, 1), (1, 1)), (xs+2, ys+2, zs+32), rotate)
-        # Check loop bounds
-        trees = retrieve_iteration_tree(bns['x0_blk0'])
-        assert len(trees) == 2
-        expected_rounded = trees[0].inner
-        assert expected_rounded.symbolic_max ==\
-            z.symbolic_max + (z.symbolic_max - z.symbolic_min + 3) % 16 + 1
-
-        # Check numerical output
-        op0(time_M=1)
-        op1(time_M=1, u=u1)
-        assert np.all(u.data == u1.data)
-
     def test_catch_best_invariant_v1(self):
         """
         Make sure the best time-invariant sub-expressions are extracted.

From 060e0dedf9a90957075dd466700f033b4a8bab4c Mon Sep 17 00:00:00 2001
From: Fabio Luporini <fabio@devitocodes.com>
Date: Mon, 8 Jan 2024 10:11:15 +0000
Subject: [PATCH 03/23] compiler: Remove useless runtime checks

---
 devito/types/dense.py | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/devito/types/dense.py b/devito/types/dense.py
index c9ad5dcb98..400c7cc256 100644
--- a/devito/types/dense.py
+++ b/devito/types/dense.py
@@ -853,16 +853,9 @@ def _arg_check(self, args, intervals, **kwargs):
             warning("Data type %s of runtime value `%s` does not match the "
                     "Function data type %s" % (data.dtype, self.name, self.dtype))
 
-        if self._honors_autopadding and not obj._honors_autopadding:
-            raise InvalidArgument("Runtime override `%s` does not honour "
-                                  "`autopadding`" % new.name)
-
         # Check each Dimension for potential OOB accesses
-        # NOTE: The contiguous dimension is special in that it can rely on the
-        # `ghost` region too
-        safezone = [0]*(self.ndim - 1) + [obj._size_ghost.right]
-        for i, s, z in zip(self.dimensions, data.shape, safezone):
-            i._arg_check(args, s + z, intervals[i])
+        for i, s in zip(self.dimensions, data.shape):
+            i._arg_check(args, s, intervals[i])
 
         if args.options['index-mode'] == 'int32' and \
            args.options['linearize'] and \

From e2a10772c9842f3528a66f3f4005afa7dcac43a4 Mon Sep 17 00:00:00 2001
From: Fabio Luporini <fabio@devitocodes.com>
Date: Mon, 8 Jan 2024 10:36:44 +0000
Subject: [PATCH 04/23] compiler: Refactor __padding_setup__

---
 devito/types/array.py | 19 +++++++++++++++++++
 devito/types/basic.py | 14 ++++++++++++--
 devito/types/dense.py | 17 +----------------
 devito/types/misc.py  | 37 +------------------------------------
 4 files changed, 33 insertions(+), 54 deletions(-)

diff --git a/devito/types/array.py b/devito/types/array.py
index 0ada7e6ad5..66bcf12a3e 100644
--- a/devito/types/array.py
+++ b/devito/types/array.py
@@ -131,6 +131,25 @@ def __init_finalize__(self, *args, **kwargs):
     def __dtype_setup__(cls, **kwargs):
         return kwargs.get('dtype', np.float32)
 
+    def __padding_setup__(self, **kwargs):
+        padding = kwargs.get('padding')
+        if padding is None:
+            padding = self.__padding_auto_setup__(**kwargs)
+
+        elif isinstance(padding, DimensionTuple):
+            padding = tuple(padding[d] for d in self.dimensions)
+
+        elif isinstance(padding, int):
+            padding = tuple((0, padding) for _ in range(self.ndim))
+
+        elif isinstance(padding, tuple) and len(padding) == self.ndim:
+            padding = tuple((0, i) if isinstance(i, int) else i for i in padding)
+
+        else:
+            raise TypeError("`padding` must be int or %d-tuple of ints" % self.ndim)
+
+        return DimensionTuple(*padding, getters=self.dimensions)
+
     @property
     def liveness(self):
         return self._liveness
diff --git a/devito/types/basic.py b/devito/types/basic.py
index 5b678ef91d..5d2a32addd 100644
--- a/devito/types/basic.py
+++ b/devito/types/basic.py
@@ -12,6 +12,7 @@
 from cached_property import cached_property
 
 from devito.data import default_allocator
+from devito.parameters import configuration
 from devito.tools import (Pickable, as_tuple, ctypes_to_cstr, dtype_to_ctype,
                           frozendict, memoized_meth, sympy_mutex)
 from devito.types.args import ArgProvider
@@ -982,13 +983,22 @@ def __dtype_setup__(cls, **kwargs):
         return None
 
     def __halo_setup__(self, **kwargs):
-        halo = tuple(kwargs.get('halo', [(0, 0) for i in range(self.ndim)]))
+        halo = tuple(kwargs.get('halo', ((0, 0),)*self.ndim))
         return DimensionTuple(*halo, getters=self.dimensions)
 
     def __padding_setup__(self, **kwargs):
-        padding = tuple(kwargs.get('padding', [(0, 0) for i in range(self.ndim)]))
+        padding = tuple(kwargs.get('padding', ((0, 0),)*self.ndim))
         return DimensionTuple(*padding, getters=self.dimensions)
 
+    def __padding_auto_setup__(self, **kwargs):
+        if kwargs.get('autopadding', configuration['autopadding']):
+            mmts = configuration['platform'].max_mem_trans_size(self.dtype)
+            d = self.dimensions[-1]
+            pad_size = mmts - self._size_nopad[d] % mmts
+            return ((0, 0),)*(self.ndim - 1) + ((0, pad_size),)
+        else:
+            return ((0, 0),)*self.ndim
+
     def __ghost_setup__(self, **kwargs):
         return (0, 0)
 
diff --git a/devito/types/dense.py b/devito/types/dense.py
index 400c7cc256..8cfdb92dfa 100644
--- a/devito/types/dense.py
+++ b/devito/types/dense.py
@@ -1123,22 +1123,7 @@ def __padding_setup__(self, **kwargs):
         padding = kwargs.get('padding')
 
         if padding is None:
-            if kwargs.get('autopadding', configuration['autopadding']):
-                # Auto-padding is to maximize the efficiency of memory accesses:
-                # * Perform as many aligned accesses as possible
-                # * Ensure there's enough room to perform memory transactions
-                #   of maximum size. This essentially means that the remainder
-                #   DOMAIN region (i.e., the last block of elements that is not
-                #   a multiple of the maximum memory transaction size) plus the
-                #   NODOMAIN size has to be as large as the maximum memory
-                #   transaction size.
-                mmts = configuration['platform'].max_mem_trans_size(self.dtype)
-
-                d = self.dimensions[-1]
-                pad_size = mmts - self._size_nopad[d] % mmts
-                padding = [(0, 0) for i in self.dimensions[:-1]] + [(0, pad_size)]
-            else:
-                padding = tuple((0, 0) for d in self.dimensions)
+            padding = self.__padding_auto_setup__(**kwargs)
 
         elif isinstance(padding, DimensionTuple):
             padding = tuple(padding[d] for d in self.dimensions)
diff --git a/devito/types/misc.py b/devito/types/misc.py
index 0297b0f781..06ba23d50a 100644
--- a/devito/types/misc.py
+++ b/devito/types/misc.py
@@ -4,7 +4,6 @@
 import sympy
 from sympy.core.core import ordering_of_classes
 
-from devito.parameters import configuration
 from devito.types import Array, CompositeObject, DimensionTuple, Indexed, Symbol
 from devito.types.basic import IndexedData
 from devito.tools import Pickable, as_tuple
@@ -194,41 +193,7 @@ class TempArray(Array):
     sub-expressions.
     """
 
-    def __padding_setup__(self, **kwargs):
-        #TODO: CURRENTLY IDENTICAL TO dense.__padding_setup__
-        #TODO: MUST BE FACTORED OUT INTO A COMMON BASE CLASS
-        padding = kwargs.get('padding')
-        if padding is None:
-            if kwargs.get('autopadding', configuration['autopadding']):
-                # Auto-padding is to maximize the efficiency of memory accesses:
-                # * Perform as many aligned accesses as possible
-                # * Ensure there's enough room to perform memory transactions
-                #   of maximum size. This essentially means that the remainder
-                #   DOMAIN region (i.e., the last block of elements that is not
-                #   a multiple of the maximum memory transaction size) plus the
-                #   NODOMAIN size has to be as large as the maximum memory
-                #   transaction size.
-                mmts = configuration['platform'].max_mem_trans_size(self.dtype)
-
-                d = self.dimensions[-1]
-                pad_size = mmts - self._size_nopad[d] % mmts
-                padding = [(0, 0) for i in self.dimensions[:-1]] + [(0, pad_size)]
-            else:
-                padding = tuple((0, 0) for d in self.dimensions)
-
-        elif isinstance(padding, DimensionTuple):
-            padding = tuple(padding[d] for d in self.dimensions)
-
-        elif isinstance(padding, int):
-            padding = tuple((0, padding) for _ in range(self.ndim))
-
-        elif isinstance(padding, tuple) and len(padding) == self.ndim:
-            padding = tuple((0, i) if isinstance(i, int) else i for i in padding)
-
-        else:
-            raise TypeError("`padding` must be int or %d-tuple of ints" % self.ndim)
-
-        return DimensionTuple(*padding, getters=self.dimensions)
+    pass
 
 
 class Fence(object):

From 9233a8866a3100dfbd5cad475e0c2e44133d448d Mon Sep 17 00:00:00 2001
From: Fabio Luporini <fabio@devitocodes.com>
Date: Mon, 8 Jan 2024 13:23:33 +0000
Subject: [PATCH 05/23] compiler: Add platform to DataManager

---
 devito/passes/iet/definitions.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/devito/passes/iet/definitions.py b/devito/passes/iet/definitions.py
index 913432da8e..22f4ae8ae4 100644
--- a/devito/passes/iet/definitions.py
+++ b/devito/passes/iet/definitions.py
@@ -73,9 +73,10 @@ class DataManager(object):
     The language used to express data allocations, deletions, and host-device transfers.
     """
 
-    def __init__(self, rcompile=None, sregistry=None, **kwargs):
+    def __init__(self, rcompile=None, sregistry=None, platform=None, **kwargs):
         self.rcompile = rcompile
         self.sregistry = sregistry
+        self.platform = platform
 
     def _alloc_object_on_low_lat_mem(self, site, obj, storage):
         """

From c5c058c50ecf7d47a31816b1869164b1c0f7e308 Mon Sep 17 00:00:00 2001
From: Fabio Luporini <fabio@devitocodes.com>
Date: Mon, 8 Jan 2024 16:29:43 +0000
Subject: [PATCH 06/23] arch: Add Platform.suggested_alignment

---
 devito/arch/archinfo.py | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/devito/arch/archinfo.py b/devito/arch/archinfo.py
index aa796f2f70..70f674654b 100644
--- a/devito/arch/archinfo.py
+++ b/devito/arch/archinfo.py
@@ -607,6 +607,11 @@ class Platform(object):
     max_mem_trans_nbytes = None
     """Maximum memory transaction size in bytes."""
 
+    suggested_alignment = None
+    """
+    The dimension along which data should be aligned for optimal performance.
+    """
+
     def __init__(self, name):
         self.name = name
 
@@ -658,9 +663,13 @@ def max_mem_trans_size(self, dtype):
 
 class Cpu64(Platform):
 
-    # The vast majority of CPUs have a 64-byte cache line
+    # The vast majority of CPUs have a 64-byte cache line size
     max_mem_trans_nbytes = 64
 
+    # The vast majority of CPUs expect the data's contiguous dimension to be
+    # aligned to the cache line size
+    suggested_alignment = '1-stride'  # AKA contiguous dimension
+
     # The known ISAs are to be provided by the subclasses
     known_isas = ()
 
@@ -808,6 +817,9 @@ class IntelDevice(Device):
 
     max_mem_trans_nbytes = 64
 
+    #TODO
+    suggested_alignment = None
+
     @property
     def march(self):
         return ''
@@ -816,6 +828,7 @@ def march(self):
 class NvidiaDevice(Device):
 
     max_mem_trans_nbytes = 128
+    suggested_alignment = 'max-stride'
 
     @cached_property
     def march(self):
@@ -830,6 +843,7 @@ def march(self):
 class AmdDevice(Device):
 
     max_mem_trans_nbytes = 256
+    suggested_alignment = 'max-stride'
 
     @cached_property
     def march(cls):

From 1f840385e5a6dd96ae760e857e03847e1f40bec6 Mon Sep 17 00:00:00 2001
From: Fabio Luporini <fabio@devitocodes.com>
Date: Mon, 8 Jan 2024 16:30:03 +0000
Subject: [PATCH 07/23] compiler: Rework __padding_setup__

---
 devito/types/array.py |  7 +------
 devito/types/basic.py | 33 ++++++++++++++++++++++++++++-----
 devito/types/dense.py | 17 +----------------
 devito/types/misc.py  |  6 +++++-
 4 files changed, 35 insertions(+), 28 deletions(-)

diff --git a/devito/types/array.py b/devito/types/array.py
index 66bcf12a3e..307cc41cfd 100644
--- a/devito/types/array.py
+++ b/devito/types/array.py
@@ -134,20 +134,15 @@ def __dtype_setup__(cls, **kwargs):
     def __padding_setup__(self, **kwargs):
         padding = kwargs.get('padding')
         if padding is None:
-            padding = self.__padding_auto_setup__(**kwargs)
-
+            padding = ((0, 0),)*self.ndim
         elif isinstance(padding, DimensionTuple):
             padding = tuple(padding[d] for d in self.dimensions)
-
         elif isinstance(padding, int):
             padding = tuple((0, padding) for _ in range(self.ndim))
-
         elif isinstance(padding, tuple) and len(padding) == self.ndim:
             padding = tuple((0, i) if isinstance(i, int) else i for i in padding)
-
         else:
             raise TypeError("`padding` must be int or %d-tuple of ints" % self.ndim)
-
         return DimensionTuple(*padding, getters=self.dimensions)
 
     @property
diff --git a/devito/types/basic.py b/devito/types/basic.py
index 5d2a32addd..724e7ce1c8 100644
--- a/devito/types/basic.py
+++ b/devito/types/basic.py
@@ -990,14 +990,32 @@ def __padding_setup__(self, **kwargs):
         padding = tuple(kwargs.get('padding', ((0, 0),)*self.ndim))
         return DimensionTuple(*padding, getters=self.dimensions)
 
-    def __padding_auto_setup__(self, **kwargs):
+    def __padding_setup_smart__(self, **kwargs):
+        nopadding = ((0, 0),)*self.ndim
+
         if kwargs.get('autopadding', configuration['autopadding']):
+            candidates = self.space_dimensions
+            if not candidates:
+                return nopadding
+
             mmts = configuration['platform'].max_mem_trans_size(self.dtype)
-            d = self.dimensions[-1]
-            pad_size = mmts - self._size_nopad[d] % mmts
-            return ((0, 0),)*(self.ndim - 1) + ((0, pad_size),)
+            hint = configuration['platform'].suggested_alignment
+
+            if hint == '1-stride':
+                d = candidates[-1]
+            elif hint == 'max-stride':
+                d = candidates[0]
+            else:
+                assert False, 'Unknown platform hint `%s`' % str(hint)
+
+            dpadding = (0, mmts - self._size_nopad[d] % mmts)
+
+            padding = [(0, 0)]*self.ndim
+            padding[self.dimensions.index(d)] = dpadding
+
+            return tuple(padding)
         else:
-            return ((0, 0),)*self.ndim
+            return nopadding
 
     def __ghost_setup__(self, **kwargs):
         return (0, 0)
@@ -1040,6 +1058,11 @@ def dimensions(self):
         """Tuple of Dimensions representing the object indices."""
         return self._dimensions
 
+    @cached_property
+    def space_dimensions(self):
+        """Tuple of Dimensions defining the physical space."""
+        return tuple(d for d in self.dimensions if d.is_Space)
+
     @property
     def base(self):
         return self.indexed
diff --git a/devito/types/dense.py b/devito/types/dense.py
index 8cfdb92dfa..a5df263e8f 100644
--- a/devito/types/dense.py
+++ b/devito/types/dense.py
@@ -628,11 +628,6 @@ def local_indices(self):
             return tuple(self._distributor.glb_slices.get(d, slice(0, s))
                          for s, d in zip(self.shape, self.dimensions))
 
-    @cached_property
-    def space_dimensions(self):
-        """Tuple of Dimensions defining the physical space."""
-        return tuple(d for d in self.dimensions if d.is_Space)
-
     @property
     def initializer(self):
         if isinstance(self._data, np.ndarray):
@@ -921,8 +916,6 @@ class Function(DiscreteFunction):
         to take advantage of the memory hierarchy in a NUMA architecture. Refer to
         `default_allocator.__doc__` for more information.
     padding : int or tuple of ints, optional
-        .. deprecated:: shouldn't be used; padding is now automatically inserted.
-
         Allocate extra grid points to maximize data access alignment. When a tuple
         of ints, one int per Dimension should be provided.
 
@@ -1121,23 +1114,17 @@ def __halo_setup__(self, **kwargs):
 
     def __padding_setup__(self, **kwargs):
         padding = kwargs.get('padding')
-
         if padding is None:
-            padding = self.__padding_auto_setup__(**kwargs)
-
+            padding = self.__padding_setup_smart__(**kwargs)
         elif isinstance(padding, DimensionTuple):
             padding = tuple(padding[d] for d in self.dimensions)
-
         elif isinstance(padding, int):
             padding = tuple((0, padding) if d.is_Space else (0, 0)
                             for d in self.dimensions)
-
         elif isinstance(padding, tuple) and len(padding) == self.ndim:
             padding = tuple((0, i) if isinstance(i, int) else i for i in padding)
-
         else:
             raise TypeError("`padding` must be int or %d-tuple of ints" % self.ndim)
-
         return DimensionTuple(*padding, getters=self.dimensions)
 
     @property
@@ -1246,8 +1233,6 @@ class TimeFunction(Function):
         to take advantage of the memory hierarchy in a NUMA architecture. Refer to
         `default_allocator.__doc__` for more information.
     padding : int or tuple of ints, optional
-        .. deprecated:: shouldn't be used; padding is now automatically inserted.
-
         Allocate extra grid points to maximize data access alignment. When a tuple
         of ints, one int per Dimension should be provided.
 
diff --git a/devito/types/misc.py b/devito/types/misc.py
index 06ba23d50a..61566e3f89 100644
--- a/devito/types/misc.py
+++ b/devito/types/misc.py
@@ -193,7 +193,11 @@ class TempArray(Array):
     sub-expressions.
     """
 
-    pass
+    def __padding_setup__(self, **kwargs):
+        padding = kwargs.pop('padding', None)
+        if padding is None:
+            padding = self.__padding_setup_smart__(**kwargs)
+        return super().__padding_setup__(padding=padding, **kwargs)
 
 
 class Fence(object):

From 2c6439b58581ddf645e80f2875cecd65f5e2671b Mon Sep 17 00:00:00 2001
From: Fabio Luporini <fabio@devitocodes.com>
Date: Tue, 9 Jan 2024 16:43:18 +0000
Subject: [PATCH 08/23] compiler: Unpick suggested_alignment

---
 devito/arch/archinfo.py | 14 --------------
 devito/types/basic.py   | 12 ++----------
 2 files changed, 2 insertions(+), 24 deletions(-)

diff --git a/devito/arch/archinfo.py b/devito/arch/archinfo.py
index 70f674654b..e679d7a785 100644
--- a/devito/arch/archinfo.py
+++ b/devito/arch/archinfo.py
@@ -607,11 +607,6 @@ class Platform(object):
     max_mem_trans_nbytes = None
     """Maximum memory transaction size in bytes."""
 
-    suggested_alignment = None
-    """
-    The dimension along which data should be aligned for optimal performance.
-    """
-
     def __init__(self, name):
         self.name = name
 
@@ -666,10 +661,6 @@ class Cpu64(Platform):
     # The vast majority of CPUs have a 64-byte cache line size
     max_mem_trans_nbytes = 64
 
-    # The vast majority of CPUs expect the data's contiguous dimension to be
-    # aligned to the cache line size
-    suggested_alignment = '1-stride'  # AKA contiguous dimension
-
     # The known ISAs are to be provided by the subclasses
     known_isas = ()
 
@@ -817,9 +808,6 @@ class IntelDevice(Device):
 
     max_mem_trans_nbytes = 64
 
-    #TODO
-    suggested_alignment = None
-
     @property
     def march(self):
         return ''
@@ -828,7 +816,6 @@ def march(self):
 class NvidiaDevice(Device):
 
     max_mem_trans_nbytes = 128
-    suggested_alignment = 'max-stride'
 
     @cached_property
     def march(self):
@@ -843,7 +830,6 @@ def march(self):
 class AmdDevice(Device):
 
     max_mem_trans_nbytes = 256
-    suggested_alignment = 'max-stride'
 
     @cached_property
     def march(cls):
diff --git a/devito/types/basic.py b/devito/types/basic.py
index 724e7ce1c8..55351a44b9 100644
--- a/devito/types/basic.py
+++ b/devito/types/basic.py
@@ -994,22 +994,14 @@ def __padding_setup_smart__(self, **kwargs):
         nopadding = ((0, 0),)*self.ndim
 
         if kwargs.get('autopadding', configuration['autopadding']):
+            # The padded Dimension
             candidates = self.space_dimensions
             if not candidates:
                 return nopadding
+            d = candidates[-1]
 
             mmts = configuration['platform'].max_mem_trans_size(self.dtype)
-            hint = configuration['platform'].suggested_alignment
-
-            if hint == '1-stride':
-                d = candidates[-1]
-            elif hint == 'max-stride':
-                d = candidates[0]
-            else:
-                assert False, 'Unknown platform hint `%s`' % str(hint)
-
             dpadding = (0, mmts - self._size_nopad[d] % mmts)
-
             padding = [(0, 0)]*self.ndim
             padding[self.dimensions.index(d)] = dpadding
 

From 3ba34035c8986e7f6a031bcd42abd2257b3da44a Mon Sep 17 00:00:00 2001
From: Fabio Luporini <fabio@devitocodes.com>
Date: Wed, 10 Jan 2024 10:34:50 +0000
Subject: [PATCH 09/23] misc: Logging's perf_adv -> hint

---
 devito/logger.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/devito/logger.py b/devito/logger.py
index 0ca9ec6aea..92ede2e8dc 100644
--- a/devito/logger.py
+++ b/devito/logger.py
@@ -5,7 +5,7 @@
 from contextlib import contextmanager
 
 __all__ = ('set_log_level', 'set_log_noperf', 'is_log_enabled_for',
-           'log', 'warning', 'error', 'perf', 'perf_adv',
+           'log', 'warning', 'error', 'perf', 'hint',
            'RED', 'GREEN', 'BLUE')
 
 
@@ -124,8 +124,8 @@ def perf(msg, *args, **kwargs):
     log(msg, PERF, *args, **kwargs)
 
 
-def perf_adv(msg, *args, **kwargs):
-    log("Potential optimisation missed: %s" % msg, PERF, *args, **kwargs)
+def hint(msg, *args, **kwargs):
+    log("Hint: %s" % msg, PERF, *args, **kwargs)
 
 
 def warning(msg, *args, **kwargs):

From feb1f18480b5245c1eea98478300a4a11e091cab Mon Sep 17 00:00:00 2001
From: Fabio Luporini <fabio@devitocodes.com>
Date: Wed, 10 Jan 2024 16:15:31 +0000
Subject: [PATCH 10/23] api: Enable halo customization

---
 devito/types/dense.py | 31 +++++++++++++++++++++++--------
 1 file changed, 23 insertions(+), 8 deletions(-)

diff --git a/devito/types/dense.py b/devito/types/dense.py
index a5df263e8f..3cadb863ba 100644
--- a/devito/types/dense.py
+++ b/devito/types/dense.py
@@ -983,10 +983,10 @@ def __init_finalize__(self, *args, **kwargs):
         space_order = kwargs.get('space_order', 1)
         if isinstance(space_order, int):
             self._space_order = space_order
-        elif isinstance(space_order, tuple) and len(space_order) == 3:
-            self._space_order, _, _ = space_order
+        elif isinstance(space_order, tuple) and len(space_order) >= 2:
+            self._space_order = space_order[0]
         else:
-            raise TypeError("`space_order` must be int or 3-tuple of ints")
+            raise TypeError("Invalid `space_order`")
 
         # Acquire derivative shortcuts
         if self is self.function:
@@ -1103,26 +1103,41 @@ def __halo_setup__(self, **kwargs):
         else:
             space_order = kwargs.get('space_order', 1)
             if isinstance(space_order, int):
-                halo = (space_order, space_order)
+                v = (space_order, space_order)
+                halo = [v if i.is_Space else (0, 0) for i in self.dimensions]
+
             elif isinstance(space_order, tuple) and len(space_order) == 3:
-                _, left_points, right_points = space_order
-                halo = (left_points, right_points)
+                _, l, r = space_order
+                halo = [(l, r) if i.is_Space else (0, 0) for i in self.dimensions]
+
+            elif isinstance(space_order, tuple) and len(space_order) == 2:
+                _, space_halo = space_order
+                if not isinstance(space_halo, tuple) or \
+                   len(space_halo) != len(self.space_dimensions):
+                    raise TypeError("Invalid `space_order`")
+                v = list(space_halo)
+                halo = [v.pop(0) if i.is_Space else (0, 0)
+                        for i in self.dimensions]
+
             else:
-                raise TypeError("`space_order` must be int or 3-tuple of ints")
-            halo = tuple(halo if i.is_Space else (0, 0) for i in self.dimensions)
+                raise TypeError("Invalid `space_order`")
         return DimensionTuple(*halo, getters=self.dimensions)
 
     def __padding_setup__(self, **kwargs):
         padding = kwargs.get('padding')
         if padding is None:
             padding = self.__padding_setup_smart__(**kwargs)
+
         elif isinstance(padding, DimensionTuple):
             padding = tuple(padding[d] for d in self.dimensions)
+
         elif isinstance(padding, int):
             padding = tuple((0, padding) if d.is_Space else (0, 0)
                             for d in self.dimensions)
+
         elif isinstance(padding, tuple) and len(padding) == self.ndim:
             padding = tuple((0, i) if isinstance(i, int) else i for i in padding)
+
         else:
             raise TypeError("`padding` must be int or %d-tuple of ints" % self.ndim)
         return DimensionTuple(*padding, getters=self.dimensions)

From 4e53c7aea9e0f68d49932acce67c447ff7ee63e8 Mon Sep 17 00:00:00 2001
From: Fabio Luporini <fabio@devitocodes.com>
Date: Fri, 12 Jan 2024 14:57:32 +0000
Subject: [PATCH 11/23] compiler: Tweak autopadding

---
 devito/types/basic.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/devito/types/basic.py b/devito/types/basic.py
index 55351a44b9..c31bf583b8 100644
--- a/devito/types/basic.py
+++ b/devito/types/basic.py
@@ -1001,7 +1001,7 @@ def __padding_setup_smart__(self, **kwargs):
             d = candidates[-1]
 
             mmts = configuration['platform'].max_mem_trans_size(self.dtype)
-            dpadding = (0, mmts - self._size_nopad[d] % mmts)
+            dpadding = (0, (mmts - self._size_nopad[d] % mmts) % mmts)
             padding = [(0, 0)]*self.ndim
             padding[self.dimensions.index(d)] = dpadding
 

From 7268c9cfea4c57fe94a49007627f970dc02de43d Mon Sep 17 00:00:00 2001
From: Fabio Luporini <fabio@devitocodes.com>
Date: Sat, 13 Jan 2024 11:03:24 +0000
Subject: [PATCH 12/23] compiler: Fix autopadding

---
 devito/types/basic.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/devito/types/basic.py b/devito/types/basic.py
index c31bf583b8..bbaef97784 100644
--- a/devito/types/basic.py
+++ b/devito/types/basic.py
@@ -1001,7 +1001,12 @@ def __padding_setup_smart__(self, **kwargs):
             d = candidates[-1]
 
             mmts = configuration['platform'].max_mem_trans_size(self.dtype)
-            dpadding = (0, (mmts - self._size_nopad[d] % mmts) % mmts)
+            remainder = self._size_nopad[d] % mmts
+            if remainder == 0:
+                # Already a multiple of `mmts`, no need to pad
+                return nopadding
+
+            dpadding = (0, (mmts - remainder))
             padding = [(0, 0)]*self.ndim
             padding[self.dimensions.index(d)] = dpadding
 

From a812bb30b41d846c03de45722418df1740c2c636 Mon Sep 17 00:00:00 2001
From: Fabio Luporini <fabio@devitocodes.com>
Date: Mon, 15 Jan 2024 15:47:02 +0000
Subject: [PATCH 13/23] compiler: Remove useless is_compact attribute

---
 devito/types/basic.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/devito/types/basic.py b/devito/types/basic.py
index bbaef97784..1f6617d44c 100644
--- a/devito/types/basic.py
+++ b/devito/types/basic.py
@@ -835,11 +835,6 @@ class AbstractFunction(sympy.Function, Basic, Pickable, Evaluable):
     Functions; etc.
     """
 
-    is_compact = True
-    """
-    True if data is allocated as a single, contiguous chunk of memory.
-    """
-
     __rkwargs__ = ('name', 'dtype', 'grid', 'halo', 'padding', 'ghost',
                    'alias', 'space', 'function')
 

From 3cb062d13942919cc679925a7cc4ade3392bc0a2 Mon Sep 17 00:00:00 2001
From: Fabio Luporini <fabio@devitocodes.com>
Date: Mon, 15 Jan 2024 16:08:15 +0000
Subject: [PATCH 14/23] compiler: Fix linearization in presence of autopadding

---
 devito/passes/iet/linearization.py | 11 ++++++-----
 devito/types/array.py              |  4 ++++
 devito/types/basic.py              |  7 +++++++
 devito/types/dense.py              |  7 ++++++-
 devito/types/misc.py               |  2 ++
 5 files changed, 25 insertions(+), 6 deletions(-)

diff --git a/devito/passes/iet/linearization.py b/devito/passes/iet/linearization.py
index 9cd16dc416..8847351444 100644
--- a/devito/passes/iet/linearization.py
+++ b/devito/passes/iet/linearization.py
@@ -13,9 +13,10 @@
 from devito.passes.iet.parpragma import PragmaIteration
 from devito.symbolics import DefFunction, MacroArgument, ccode
 from devito.tools import Bunch, filter_ordered, prod
-from devito.types import Array, Bundle, Symbol, FIndexed, Indexed, Wildcard
+from devito.types import (Array, Bundle, Symbol, FIndexed, Indexed, TempArray,
+                          Wildcard)
 from devito.types.basic import IndexedData
-from devito.types.dense import DiscreteFunction
+from devito.types.dense import DiscreteFunction, Function
 
 
 __all__ = ['linearize']
@@ -81,9 +82,9 @@ def key1(f, d):
         * A 3-tuple `(Dimension, halo size, grid)` otherwise.
     """
     if f.is_regular:
-        # TODO: same grid + same halo => same padding, however this is not asserted
-        # during compilation... so maybe we should do it at `prepare_args` time?
-        return (d, f._size_halo[d], getattr(f, 'grid', None))
+        # For pad-dable objects (Function and TempArray), the following holds:
+        # `same dim + same halo => same (auto-)padding`
+        return (d, f._size_halo[d], f.is_autopaddable)
     else:
         return False
 
diff --git a/devito/types/array.py b/devito/types/array.py
index 307cc41cfd..258e0497a2 100644
--- a/devito/types/array.py
+++ b/devito/types/array.py
@@ -416,6 +416,10 @@ def is_TimeFunction(self):
     def is_Input(self):
         return all(i.is_Input for i in self.components)
 
+    @property
+    def is_autopaddable(self):
+        return all(i.is_autopaddable for i in self.components)
+
     # Other properties and methods
 
     @property
diff --git a/devito/types/basic.py b/devito/types/basic.py
index 1f6617d44c..745d6cd0de 100644
--- a/devito/types/basic.py
+++ b/devito/types/basic.py
@@ -835,6 +835,13 @@ class AbstractFunction(sympy.Function, Basic, Pickable, Evaluable):
     Functions; etc.
     """
 
+    is_autopaddable = False
+    """
+    True if the Function can be padded automatically by the Devito runtime,
+    thus increasing its size, False otherwise. Note that this property has no
+    effect if autopadding is disabled, which is the default behavior.
+    """
+
     __rkwargs__ = ('name', 'dtype', 'grid', 'halo', 'padding', 'ghost',
                    'alias', 'space', 'function')
 
diff --git a/devito/types/dense.py b/devito/types/dense.py
index 3cadb863ba..78166002f8 100644
--- a/devito/types/dense.py
+++ b/devito/types/dense.py
@@ -969,6 +969,8 @@ class Function(DiscreteFunction):
 
     is_Function = True
 
+    is_autopaddable = True
+
     __rkwargs__ = (DiscreteFunction.__rkwargs__ +
                    ('space_order', 'shape_global', 'dimensions'))
 
@@ -1126,7 +1128,10 @@ def __halo_setup__(self, **kwargs):
     def __padding_setup__(self, **kwargs):
         padding = kwargs.get('padding')
         if padding is None:
-            padding = self.__padding_setup_smart__(**kwargs)
+            if self.is_autopaddable:
+                padding = self.__padding_setup_smart__(**kwargs)
+            else:
+                padding = super().__padding_setup__(**kwargs)
 
         elif isinstance(padding, DimensionTuple):
             padding = tuple(padding[d] for d in self.dimensions)
diff --git a/devito/types/misc.py b/devito/types/misc.py
index 61566e3f89..fb10e7d3b6 100644
--- a/devito/types/misc.py
+++ b/devito/types/misc.py
@@ -193,6 +193,8 @@ class TempArray(Array):
     sub-expressions.
     """
 
+    is_autopaddable = True
+
     def __padding_setup__(self, **kwargs):
         padding = kwargs.pop('padding', None)
         if padding is None:

From f8f13345b445b167027478538db72d04279f7dce Mon Sep 17 00:00:00 2001
From: Fabio Luporini <fabio@devitocodes.com>
Date: Mon, 15 Jan 2024 16:10:36 +0000
Subject: [PATCH 15/23] pep8 happiness

---
 devito/ir/clusters/analysis.py     | 2 +-
 devito/passes/clusters/aliases.py  | 1 -
 devito/passes/iet/linearization.py | 7 +++----
 devito/types/array.py              | 2 --
 devito/types/dense.py              | 2 --
 devito/types/misc.py               | 2 +-
 6 files changed, 5 insertions(+), 11 deletions(-)

diff --git a/devito/ir/clusters/analysis.py b/devito/ir/clusters/analysis.py
index 7e94796971..6c360effd7 100644
--- a/devito/ir/clusters/analysis.py
+++ b/devito/ir/clusters/analysis.py
@@ -1,6 +1,6 @@
 from devito.ir.clusters.visitors import QueueStateful
 from devito.ir.support import (AFFINE, PARALLEL, PARALLEL_INDEP, PARALLEL_IF_ATOMIC,
-                               SEQUENTIAL, Forward)
+                               SEQUENTIAL)
 from devito.tools import as_tuple, flatten, timed_pass
 
 __all__ = ['analyze']
diff --git a/devito/passes/clusters/aliases.py b/devito/passes/clusters/aliases.py
index dd4eaaf7e0..30f21516fb 100644
--- a/devito/passes/clusters/aliases.py
+++ b/devito/passes/clusters/aliases.py
@@ -6,7 +6,6 @@
 import numpy as np
 import sympy
 
-from devito.arch import Cpu64
 from devito.finite_differences import EvalDerivative, IndexDerivative, Weights
 from devito.ir import (SEQUENTIAL, PARALLEL_IF_PVT, SEPARABLE, Forward,
                        IterationSpace, Interval, Cluster, ExprGeometry, Queue,
diff --git a/devito/passes/iet/linearization.py b/devito/passes/iet/linearization.py
index 8847351444..5ec2a41a9d 100644
--- a/devito/passes/iet/linearization.py
+++ b/devito/passes/iet/linearization.py
@@ -13,10 +13,9 @@
 from devito.passes.iet.parpragma import PragmaIteration
 from devito.symbolics import DefFunction, MacroArgument, ccode
 from devito.tools import Bunch, filter_ordered, prod
-from devito.types import (Array, Bundle, Symbol, FIndexed, Indexed, TempArray,
-                          Wildcard)
+from devito.types import Array, Bundle, Symbol, FIndexed, Indexed, Wildcard
 from devito.types.basic import IndexedData
-from devito.types.dense import DiscreteFunction, Function
+from devito.types.dense import DiscreteFunction
 
 
 __all__ = ['linearize']
@@ -82,7 +81,7 @@ def key1(f, d):
         * A 3-tuple `(Dimension, halo size, grid)` otherwise.
     """
     if f.is_regular:
-        # For pad-dable objects (Function and TempArray), the following holds:
+        # For paddable objects the following holds:
         # `same dim + same halo => same (auto-)padding`
         return (d, f._size_halo[d], f.is_autopaddable)
     else:
diff --git a/devito/types/array.py b/devito/types/array.py
index 258e0497a2..d9ca2bafd2 100644
--- a/devito/types/array.py
+++ b/devito/types/array.py
@@ -1,11 +1,9 @@
 from ctypes import POINTER, Structure, c_void_p, c_ulong
-from math import ceil
 
 import numpy as np
 from cached_property import cached_property
 from sympy import Expr
 
-from devito.parameters import configuration
 from devito.tools import (Reconstructable, as_tuple, c_restrict_void_p,
                           dtype_to_ctype, dtypes_vector_mapper)
 from devito.types.basic import AbstractFunction
diff --git a/devito/types/dense.py b/devito/types/dense.py
index 78166002f8..3cb6725554 100644
--- a/devito/types/dense.py
+++ b/devito/types/dense.py
@@ -1,7 +1,6 @@
 from collections import namedtuple
 from ctypes import POINTER, Structure, c_int, c_ulong, c_void_p, cast, byref
 from functools import wraps, reduce
-from math import ceil
 from operator import mul
 
 import numpy as np
@@ -837,7 +836,6 @@ def _arg_check(self, args, intervals, **kwargs):
         if self.name not in args:
             raise InvalidArgument("No runtime value for `%s`" % self.name)
 
-        obj = kwargs.get(self.name, self)
         data = args[self.name]
 
         if len(data.shape) != self.ndim:
diff --git a/devito/types/misc.py b/devito/types/misc.py
index fb10e7d3b6..bbf2ed5137 100644
--- a/devito/types/misc.py
+++ b/devito/types/misc.py
@@ -4,7 +4,7 @@
 import sympy
 from sympy.core.core import ordering_of_classes
 
-from devito.types import Array, CompositeObject, DimensionTuple, Indexed, Symbol
+from devito.types import Array, CompositeObject, Indexed, Symbol
 from devito.types.basic import IndexedData
 from devito.tools import Pickable, as_tuple
 

From cfa50279d4f984e794672ab6dd1adbd18a2aebce Mon Sep 17 00:00:00 2001
From: Fabio Luporini <fabio@devitocodes.com>
Date: Tue, 16 Jan 2024 09:42:52 +0000
Subject: [PATCH 16/23] tests: Update expected output after tweaking
 autopadding

---
 devito/data/allocators.py | 2 +-
 tests/test_data.py        | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/devito/data/allocators.py b/devito/data/allocators.py
index b051ff02cf..c54950cf54 100644
--- a/devito/data/allocators.py
+++ b/devito/data/allocators.py
@@ -368,7 +368,7 @@ class ExternalAllocator(MemoryAllocator):
     def __init__(self, numpy_array):
         self.numpy_array = numpy_array
 
-    def alloc(self, shape, dtype):
+    def alloc(self, shape, dtype, padding=0):
         assert shape == self.numpy_array.shape, \
             "Provided array has shape %s. Expected %s" %\
             (str(self.numpy_array.shape), str(shape))
diff --git a/tests/test_data.py b/tests/test_data.py
index d8988f21a1..b6df686118 100644
--- a/tests/test_data.py
+++ b/tests/test_data.py
@@ -316,9 +316,9 @@ def test_w_halo_w_autopadding(self):
         assert u0.shape_allocated == (4, 4, 16)
 
         assert u1._size_halo == ((3, 3), (3, 3), (3, 3))
-        assert u1._size_padding == ((0, 0), (0, 0), (0, 14))  # 14 stems from 6 + 8
-        assert u1._size_nodomain == ((3, 3), (3, 3), (3, 17))
-        assert u1.shape_allocated == (10, 10, 24)
+        assert u1._size_padding == ((0, 0), (0, 0), (0, 6))  # 6 rounds 10 up to 16
+        assert u1._size_nodomain == ((3, 3), (3, 3), (3, 9))
+        assert u1.shape_allocated == (10, 10, 16)
 
 
 class TestDecomposition(object):

From d335eb81ffb5854f71a315cef84b0e684445e018 Mon Sep 17 00:00:00 2001
From: Fabio Luporini <fabio@devitocodes.com>
Date: Wed, 17 Jan 2024 08:26:36 +0000
Subject: [PATCH 17/23] examples: Update expected output

---
 devito/types/tensor.py        | 1 -
 examples/userapi/01_dsl.ipynb | 2 --
 2 files changed, 3 deletions(-)

diff --git a/devito/types/tensor.py b/devito/types/tensor.py
index 9be56fd093..9d08bfe77b 100644
--- a/devito/types/tensor.py
+++ b/devito/types/tensor.py
@@ -54,7 +54,6 @@ class TensorFunction(AbstractTensor):
         to take advantage of the memory hierarchy in a NUMA architecture. Refer to
         `default_allocator.__doc__` for more information.
     padding : int or tuple of ints, optional
-        .. deprecated:: shouldn't be used; padding is now automatically inserted.
         Allocate extra grid points to maximize data access alignment. When a tuple
         of ints, one int per Dimension should be provided.
     symmetric : bool, optional
diff --git a/examples/userapi/01_dsl.ipynb b/examples/userapi/01_dsl.ipynb
index 65182c0b74..eab415e355 100644
--- a/examples/userapi/01_dsl.ipynb
+++ b/examples/userapi/01_dsl.ipynb
@@ -122,8 +122,6 @@
       "        to take advantage of the memory hierarchy in a NUMA architecture. Refer to\n",
       "        `default_allocator.__doc__` for more information.\n",
       "    padding : int or tuple of ints, optional\n",
-      "        .. deprecated:: shouldn't be used; padding is now automatically inserted.\n",
-      "\n",
       "        Allocate extra grid points to maximize data access alignment. When a tuple\n",
       "        of ints, one int per Dimension should be provided.\n",
       "\n",

From 6fb992eae7ca2d01c8776d5486ec99e9e3db4479 Mon Sep 17 00:00:00 2001
From: Fabio Luporini <fabio@devitocodes.com>
Date: Wed, 17 Jan 2024 08:40:59 +0000
Subject: [PATCH 18/23] compiler: Use is_integer where possible

---
 devito/data/allocators.py | 7 +++----
 devito/types/array.py     | 8 ++++----
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/devito/data/allocators.py b/devito/data/allocators.py
index c54950cf54..3d3817f56a 100644
--- a/devito/data/allocators.py
+++ b/devito/data/allocators.py
@@ -11,7 +11,7 @@
 
 from devito.logger import logger
 from devito.parameters import configuration
-from devito.tools import dtype_to_ctype
+from devito.tools import dtype_to_ctype, is_integer
 
 __all__ = ['ALLOC_ALIGNED', 'ALLOC_NUMA_LOCAL', 'ALLOC_NUMA_ANY',
            'ALLOC_KNL_MCDRAM', 'ALLOC_KNL_DRAM', 'ALLOC_GUARD',
@@ -77,7 +77,7 @@ def alloc(self, shape, dtype, padding=0):
             padleft, padright = padding
         except TypeError:
             padleft, padright = padding, padding
-        if not isinstance(padleft, int) and not isinstance(padright, int):
+        if not is_integer(padleft) or not is_integer(padright):
             raise TypeError("padding must be an int or a 2-tuple of ints")
         size = datasize + padleft + padright
 
@@ -101,9 +101,8 @@ def alloc(self, shape, dtype, padding=0):
         #   is copied, you should assign the new shape to the shape attribute
         #   of the array:
         array.shape = shape
-        ndarray = array  # At this point it's interpreted as an ndarray
 
-        return (ndarray, memfree_args)
+        return (array, memfree_args)
 
     @abc.abstractmethod
     def _alloc_C_libcall(self, size, ctype):
diff --git a/devito/types/array.py b/devito/types/array.py
index d9ca2bafd2..0cfc754bd6 100644
--- a/devito/types/array.py
+++ b/devito/types/array.py
@@ -5,7 +5,7 @@
 from sympy import Expr
 
 from devito.tools import (Reconstructable, as_tuple, c_restrict_void_p,
-                          dtype_to_ctype, dtypes_vector_mapper)
+                          dtype_to_ctype, dtypes_vector_mapper, is_integer)
 from devito.types.basic import AbstractFunction
 from devito.types.utils import CtypesFactory, DimensionTuple
 
@@ -135,10 +135,10 @@ def __padding_setup__(self, **kwargs):
             padding = ((0, 0),)*self.ndim
         elif isinstance(padding, DimensionTuple):
             padding = tuple(padding[d] for d in self.dimensions)
-        elif isinstance(padding, int):
+        elif is_integer(padding):
             padding = tuple((0, padding) for _ in range(self.ndim))
         elif isinstance(padding, tuple) and len(padding) == self.ndim:
-            padding = tuple((0, i) if isinstance(i, int) else i for i in padding)
+            padding = tuple((0, i) if is_integer(i) else i for i in padding)
         else:
             raise TypeError("`padding` must be int or %d-tuple of ints" % self.ndim)
         return DimensionTuple(*padding, getters=self.dimensions)
@@ -504,7 +504,7 @@ class ComponentAccess(Expr, Reconstructable):
     def __new__(cls, arg, index=0, **kwargs):
         if not arg.is_Indexed:
             raise ValueError("Expected Indexed, got `%s` instead" % type(arg))
-        if not isinstance(index, int) or index > 3:
+        if not is_integer(index) or index > 3:
             raise ValueError("Expected 0 <= index < 4")
 
         obj = Expr.__new__(cls, arg)

From 74792381fd07ace517daf811709655e6c799102f Mon Sep 17 00:00:00 2001
From: Fabio Luporini <fabio@devitocodes.com>
Date: Wed, 17 Jan 2024 09:14:36 +0000
Subject: [PATCH 19/23] compiler: Fix mapify_reduce

---
 devito/ir/clusters/algorithms.py | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/devito/ir/clusters/algorithms.py b/devito/ir/clusters/algorithms.py
index 075f7275bd..6da5f3ddb8 100644
--- a/devito/ir/clusters/algorithms.py
+++ b/devito/ir/clusters/algorithms.py
@@ -490,7 +490,7 @@ def normalize_reductions_dense(cluster, sregistry, options):
 
     processed = []
     for e in cluster.exprs:
-        if e.is_Reduction and e.lhs.is_Symbol and opt_mapify_reduce:
+        if e.is_Reduction and opt_mapify_reduce:
             # Transform `e` into what is in essence an explicit map-reduce
             # For example, turn:
             # `s += f(u[x], v[x], ...)`
@@ -499,10 +499,20 @@ def normalize_reductions_dense(cluster, sregistry, options):
             # `s += r[x]`
             # This makes it much easier to parallelize the map part regardless
             # of the target backend
-            name = sregistry.make_name()
-            a = Array(name=name, dtype=e.dtype, dimensions=dims)
-            processed.extend([Eq(a.indexify(), e.rhs),
-                              e.func(e.lhs, a.indexify())])
+
+            if e.lhs.function.is_Array:
+                # Probably a compiler-generated reduction, e.g. via
+                # recursive compilation; it's an Array already, so nothing to do
+                processed.append(e)
+            else:
+                # Here the LHS could be a Symbol or a user-level Function
+                # In the latter case we copy the data into a temporary Array
+                # because the Function might be padded, and reduction operations
+                # require, in general, the data values to be contiguous
+                name = sregistry.make_name()
+                a = Array(name=name, dtype=e.dtype, dimensions=dims)
+                processed.extend([Eq(a.indexify(), e.rhs),
+                                  e.func(e.lhs, a.indexify())])
         else:
             processed.append(e)
 

From d85d4025b9ad5df4546a073d38131989a03c33d8 Mon Sep 17 00:00:00 2001
From: Fabio Luporini <fabio@devitocodes.com>
Date: Wed, 17 Jan 2024 09:32:47 +0000
Subject: [PATCH 20/23] misc: Homogenize docstrings to use single backticks

---
 devito/types/dense.py | 77 +++++++++++++++++++++++--------------------
 1 file changed, 41 insertions(+), 36 deletions(-)

diff --git a/devito/types/dense.py b/devito/types/dense.py
index 3cb6725554..b9ee56c0be 100644
--- a/devito/types/dense.py
+++ b/devito/types/dense.py
@@ -98,7 +98,7 @@ def __init_finalize__(self, *args, function=None, **kwargs):
                 # This is a corner case -- we might get here, for example, when
                 # running with MPI and some processes get 0-size arrays after
                 # domain decomposition. We touch the data anyway to avoid the
-                # case ``self._data is None``
+                # case `self._data is None`
                 self.data
         else:
             raise ValueError("`initializer` must be callable or buffer, not %s"
@@ -219,7 +219,7 @@ def shape_domain(self):
         Notes
         -----
         In an MPI context, this is the *local* domain region shape.
-        Alias to ``self.shape``.
+        Alias to `self.shape`.
         """
         return self.shape
 
@@ -436,7 +436,7 @@ def data_gather(self, start=None, stop=None, step=1, rank=0):
 
         Notes
         -----
-        Alias to ``self.data._gather``.
+        Alias to `self.data._gather`.
 
         Note that gathering data from large simulations onto a single rank may
         result in memory blow-up and hence should use this method judiciously.
@@ -453,7 +453,7 @@ def data_domain(self):
 
         Notes
         -----
-        Alias to ``self.data``.
+        Alias to `self.data`.
 
         With this accessor you are claiming that you will modify the values you
         get back. If you only need to look at the values, use
@@ -616,10 +616,10 @@ def local_indices(self):
 
         Notes
         -----
-        Given a Function ``f(x, y)`` with shape ``(nx, ny)``, when *not* using
-        MPI this property will return ``(slice(0, nx-1), slice(0, ny-1))``. On
+        Given a Function `f(x, y)` with shape `(nx, ny)`, when *not* using
+        MPI this property will return `(slice(0, nx), slice(0, ny))`. On
         the other hand, when MPI is used, the local ranges depend on the domain
-        decomposition, which is carried by ``self.grid``.
+        decomposition, which is carried by `self.grid`.
         """
         if self._distributor is None:
             return tuple(slice(0, s) for s in self.shape)
@@ -899,12 +899,14 @@ class Function(DiscreteFunction):
         order (``o``) as well as the number of points on the left (``lp``) and right
         (``rp``) sides of a generic point of interest.
     shape : tuple of ints, optional
-        Shape of the domain region in grid points. Only necessary if ``grid`` isn't given.
+        Shape of the domain region in grid points. Only necessary if `grid`
+        isn't given.
     dimensions : tuple of Dimension, optional
-        Dimensions associated with the object. Only necessary if ``grid`` isn't given.
+        Dimensions associated with the object. Only necessary if `grid` isn't
+        given.
     dtype : data-type, optional
         Any object that can be interpreted as a numpy data type. Defaults
-        to ``np.float32``.
+        to `np.float32`.
     staggered : Dimension or tuple of Dimension or Stagger, optional
         Define how the Function is staggered.
     initializer : callable or any object exposing the buffer interface, optional
@@ -962,7 +964,7 @@ class Function(DiscreteFunction):
     Notes
     -----
     The parameters must always be given as keyword arguments, since SymPy
-    uses ``*args`` to (re-)create the dimension arguments of the symbolic object.
+    uses `*args` to (re-)create the dimension arguments of the symbolic object.
     """
 
     is_Function = True
@@ -1152,8 +1154,8 @@ def space_order(self):
 
     def sum(self, p=None, dims=None):
         """
-        Generate a symbolic expression computing the sum of ``p`` points
-        along the spatial dimensions ``dims``.
+        Generate a symbolic expression computing the sum of `p` points
+        along the spatial dimensions `dims`.
 
         Parameters
         ----------
@@ -1161,7 +1163,7 @@ def sum(self, p=None, dims=None):
             The number of summands. Defaults to the halo size.
         dims : tuple of Dimension, optional
             The Dimensions along which the sum is computed. Defaults to
-            ``self``'s spatial dimensions.
+            `self`'s spatial dimensions.
         """
         points = []
         for d in (as_tuple(dims) or self.space_dimensions):
@@ -1178,8 +1180,8 @@ def sum(self, p=None, dims=None):
 
     def avg(self, p=None, dims=None):
         """
-        Generate a symbolic expression computing the average of ``p`` points
-        along the spatial dimensions ``dims``.
+        Generate a symbolic expression computing the average of `p` points
+        along the spatial dimensions `dims`.
 
         Parameters
         ----------
@@ -1187,7 +1189,7 @@ def avg(self, p=None, dims=None):
             The number of summands. Defaults to the halo size.
         dims : tuple of Dimension, optional
             The Dimensions along which the average is computed. Defaults to
-            ``self``'s spatial dimensions.
+            `self`'s spatial dimensions.
         """
         tot = self.sum(p, dims)
         return tot / len(tot.args)
@@ -1223,25 +1225,28 @@ class TimeFunction(Function):
     time_order : int, optional
         Discretization order for time derivatives. Defaults to 1.
     shape : tuple of ints, optional
-        Shape of the domain region in grid points. Only necessary if `grid` isn't given.
+        Shape of the domain region in grid points. Only necessary if `grid`
+        isn't given.
     dimensions : tuple of Dimension, optional
-        Dimensions associated with the object. Only necessary if `grid` isn't given.
+        Dimensions associated with the object. Only necessary if `grid` isn't
+        given.
     dtype : data-type, optional
         Any object that can be interpreted as a numpy data type. Defaults
         to `np.float32`.
     save : int or Buffer, optional
-        By default, ``save=None``, which indicates the use of alternating buffers. This
-        enables cyclic writes to the TimeFunction. For example, if the TimeFunction
-        ``u(t, x)`` has shape (3, 100), then, in an Operator, ``t`` will assume the
-        values ``1, 2, 0, 1, 2, 0, 1, ...`` (note that the very first value depends
-        on the stencil equation in which ``u`` is written.). The default size of the time
-        buffer when ``save=None`` is ``time_order + 1``.  To specify a different size for
-        the time buffer, one should use the syntax ``save=Buffer(mysize)``.
-        Alternatively, if all of the intermediate results are required (or, simply, to
-        avoid using an alternating buffer), an explicit value for ``save`` ( an integer)
-        must be provided.
+        By default, `save=None`, which indicates the use of alternating
+        buffers. This enables cyclic writes to the TimeFunction. For example,
+        if the TimeFunction `u(t, x)` has shape (3, 100), then, in an Operator,
+        `t` will assume the values `1, 2, 0, 1, 2, 0, 1, ...` (note that the
+        very first value depends on the stencil equation in which `u` is
+        written). The default size of the time buffer when `save=None` is
+        `time_order + 1`. To specify a different size for the time buffer, one
+        should use the syntax `save=Buffer(mysize)`. Alternatively, if all of
+        the intermediate results are required (or, simply, to avoid using an
+        alternating buffer), an explicit value for `save` (an integer) must be
+        provided.
     time_dim : Dimension, optional
-        TimeDimension to be used in the TimeFunction. Defaults to ``grid.time_dim``.
+        TimeDimension to be used in the TimeFunction. Defaults to `grid.time_dim`.
     staggered : Dimension or tuple of Dimension or Stagger, optional
         Define how the Function is staggered.
     initializer : callable or any object exposing the buffer interface, optional
@@ -1278,14 +1283,14 @@ class TimeFunction(Function):
     Derivative(g(t, x, y), t)
 
     When using the alternating buffer protocol, the size of the time dimension
-    is given by ``time_order + 1``
+    is given by `time_order + 1`
 
     >>> f.shape
     (2, 4, 4)
     >>> g.shape
     (3, 4, 4)
 
-    One can drop the alternating buffer protocol specifying a value for ``save``
+    One can drop the alternating buffer protocol specifying a value for `save`
 
     >>> h = TimeFunction(name='h', grid=grid, save=20)
     >>> h
@@ -1296,10 +1301,10 @@ class TimeFunction(Function):
     Notes
     -----
     The parameters must always be given as keyword arguments, since SymPy uses
-    ``*args`` to (re-)create the dimension arguments of the symbolic object.
-    If the parameter ``grid`` is provided, the values for ``shape``,
-    ``dimensions`` and ``dtype`` will be derived from it. When present, the
-    parameter ``shape`` should only define the spatial shape of the grid. The
+    `*args` to (re-)create the dimension arguments of the symbolic object.
+    If the parameter `grid` is provided, the values for `shape`,
+    `dimensions` and `dtype` will be derived from it. When present, the
+    parameter `shape` should only define the spatial shape of the grid. The
     temporal dimension will be inserted automatically as the leading dimension.
     """
 

From 733d7bc3a5f041283b5c5ef323b435028e1d4f22 Mon Sep 17 00:00:00 2001
From: Fabio Luporini <fabio@devitocodes.com>
Date: Wed, 17 Jan 2024 09:33:12 +0000
Subject: [PATCH 21/23] api: Improve halo setup API and docs

---
 devito/types/dense.py | 45 ++++++++++++++++++++++++++++---------------
 tests/test_data.py    | 14 +++++++++++++-
 2 files changed, 42 insertions(+), 17 deletions(-)

diff --git a/devito/types/dense.py b/devito/types/dense.py
index b9ee56c0be..0aa16d2435 100644
--- a/devito/types/dense.py
+++ b/devito/types/dense.py
@@ -890,14 +890,20 @@ class Function(DiscreteFunction):
         provided, shape and dimensions must be given. For MPI execution, a
         Grid is compulsory.
     space_order : int or 3-tuple of ints, optional
-        Discretisation order for space derivatives. Defaults to 1. ``space_order`` also
-        impacts the number of points available around a generic point of interest.  By
-        default, ``space_order`` points are available on both sides of a generic point of
-        interest, including those nearby the grid boundary. Sometimes, fewer points
-        suffice; in other scenarios, more points are necessary. In such cases, instead of
-        an integer, one can pass a 3-tuple ``(o, lp, rp)`` indicating the discretization
-        order (``o``) as well as the number of points on the left (``lp``) and right
-        (``rp``) sides of a generic point of interest.
+        Discretisation order for space derivatives. Defaults to 1.
+        `space_order` also impacts the number of points available around a
+        generic point of interest.  By default, `space_order` points are
+        available on both sides of a generic point of interest, including those
+        nearby the grid boundary. Sometimes, fewer points suffice; in other
+        scenarios, more points are necessary. In such cases, instead of an
+        integer, one can pass:
+          * a 3-tuple `(o, lp, rp)` indicating the discretization order
+            (`o`) as well as the number of points on the left (`lp`) and
+            right (`rp`) sides of a generic point of interest;
+          * a 2-tuple `(o, ((lp0, rp0), (lp1, rp1), ...))` indicating the
+            discretization order (`o`) as well as the number of points on
+            the left/right sides of a generic point of interest for each
+            SpaceDimension.
     shape : tuple of ints, optional
         Shape of the domain region in grid points. Only necessary if `grid`
         isn't given.
@@ -1115,6 +1121,7 @@ def __halo_setup__(self, **kwargs):
             elif isinstance(space_order, tuple) and len(space_order) == 2:
                 _, space_halo = space_order
                 if not isinstance(space_halo, tuple) or \
+                   not all(isinstance(i, tuple) for i in space_halo) or \
                    len(space_halo) != len(self.space_dimensions):
                     raise TypeError("Invalid `space_order`")
                 v = list(space_halo)
@@ -1214,14 +1221,20 @@ class TimeFunction(Function):
         provided, shape and dimensions must be given. For MPI execution, a
         Grid is compulsory.
     space_order : int or 3-tuple of ints, optional
-        Discretisation order for space derivatives. Defaults to 1. ``space_order`` also
-        impacts the number of points available around a generic point of interest.  By
-        default, ``space_order`` points are available on both sides of a generic point of
-        interest, including those nearby the grid boundary. Sometimes, fewer points
-        suffice; in other scenarios, more points are necessary. In such cases, instead of
-        an integer, one can pass a 3-tuple ``(o, lp, rp)`` indicating the discretization
-        order (``o``) as well as the number of points on the left (``lp``) and right
-        (``rp``) sides of a generic point of interest.
+        Discretisation order for space derivatives. Defaults to 1.
+        `space_order` also impacts the number of points available around a
+        generic point of interest.  By default, `space_order` points are
+        available on both sides of a generic point of interest, including those
+        nearby the grid boundary. Sometimes, fewer points suffice; in other
+        scenarios, more points are necessary. In such cases, instead of an
+        integer, one can pass:
+          * a 3-tuple `(o, lp, rp)` indicating the discretization order
+            (`o`) as well as the number of points on the left (`lp`) and
+            right (`rp`) sides of a generic point of interest;
+          * a 2-tuple `(o, ((lp0, rp0), (lp1, rp1), ...))` indicating the
+            discretization order (`o`) as well as the number of points on
+            the left/right sides of a generic point of interest for each
+            SpaceDimension.
     time_order : int, optional
         Discretization order for time derivatives. Defaults to 1.
     shape : tuple of ints, optional
diff --git a/tests/test_data.py b/tests/test_data.py
index b6df686118..232aff9c97 100644
--- a/tests/test_data.py
+++ b/tests/test_data.py
@@ -316,10 +316,22 @@ def test_w_halo_w_autopadding(self):
         assert u0.shape_allocated == (4, 4, 16)
 
         assert u1._size_halo == ((3, 3), (3, 3), (3, 3))
-        assert u1._size_padding == ((0, 0), (0, 0), (0, 6))  # 14 stems from 6 + 8
+        assert u1._size_padding == ((0, 0), (0, 0), (0, 6))  # 6 stems from 16-(3+4+3)
         assert u1._size_nodomain == ((3, 3), (3, 3), (3, 9))
         assert u1.shape_allocated == (10, 10, 16)
 
+    def test_w_halo_custom(self):
+        grid = Grid(shape=(4, 4))
+
+        # Custom halo with not enough entries raises an exception
+        with pytest.raises(TypeError):
+            Function(name='u', grid=grid, space_order=(8, (4, 3)))
+
+        u = TimeFunction(name='u', grid=grid, space_order=(8, ((4, 3), (1, 1))))
+
+        assert u._size_halo == ((0, 0), (4, 3), (1, 1))
+        assert u.shape_allocated == (2, 11, 6)
+
 
 class TestDecomposition(object):
 

From 3cd612711d223f52af338831f6ee59c6575b6c9c Mon Sep 17 00:00:00 2001
From: FabioLuporini <fabilupo@gmail.com>
Date: Wed, 17 Jan 2024 11:04:14 +0100
Subject: [PATCH 22/23] examples: Update expected output

---
 examples/userapi/01_dsl.ipynb | 34 +++++++++++++++++++++-------------
 1 file changed, 21 insertions(+), 13 deletions(-)

diff --git a/examples/userapi/01_dsl.ipynb b/examples/userapi/01_dsl.ipynb
index eab415e355..e3653b7eb7 100644
--- a/examples/userapi/01_dsl.ipynb
+++ b/examples/userapi/01_dsl.ipynb
@@ -98,21 +98,29 @@
       "        provided, shape and dimensions must be given. For MPI execution, a\n",
       "        Grid is compulsory.\n",
       "    space_order : int or 3-tuple of ints, optional\n",
-      "        Discretisation order for space derivatives. Defaults to 1. ``space_order`` also\n",
-      "        impacts the number of points available around a generic point of interest.  By\n",
-      "        default, ``space_order`` points are available on both sides of a generic point of\n",
-      "        interest, including those nearby the grid boundary. Sometimes, fewer points\n",
-      "        suffice; in other scenarios, more points are necessary. In such cases, instead of\n",
-      "        an integer, one can pass a 3-tuple ``(o, lp, rp)`` indicating the discretization\n",
-      "        order (``o``) as well as the number of points on the left (``lp``) and right\n",
-      "        (``rp``) sides of a generic point of interest.\n",
+      "        Discretisation order for space derivatives. Defaults to 1.\n",
+      "        `space_order` also impacts the number of points available around a\n",
+      "        generic point of interest.  By default, `space_order` points are\n",
+      "        available on both sides of a generic point of interest, including those\n",
+      "        nearby the grid boundary. Sometimes, fewer points suffice; in other\n",
+      "        scenarios, more points are necessary. In such cases, instead of an\n",
+      "        integer, one can pass:\n",
+      "          * a 3-tuple `(o, lp, rp)` indicating the discretization order\n",
+      "            (`o`) as well as the number of points on the left (`lp`) and\n",
+      "            right (`rp`) sides of a generic point of interest;\n",
+      "          * a 2-tuple `(o, ((lp0, rp0), (lp1, rp1), ...))` indicating the\n",
+      "            discretization order (`o`) as well as the number of points on\n",
+      "            the left/right sides of a generic point of interest for each\n",
+      "            SpaceDimension.\n",
       "    shape : tuple of ints, optional\n",
-      "        Shape of the domain region in grid points. Only necessary if ``grid`` isn't given.\n",
+      "        Shape of the domain region in grid points. Only necessary if `grid`\n",
+      "        isn't given.\n",
       "    dimensions : tuple of Dimension, optional\n",
-      "        Dimensions associated with the object. Only necessary if ``grid`` isn't given.\n",
+      "        Dimensions associated with the object. Only necessary if `grid` isn't\n",
+      "        given.\n",
       "    dtype : data-type, optional\n",
       "        Any object that can be interpreted as a numpy data type. Defaults\n",
-      "        to ``np.float32``.\n",
+      "        to `np.float32`.\n",
       "    staggered : Dimension or tuple of Dimension or Stagger, optional\n",
       "        Define how the Function is staggered.\n",
       "    initializer : callable or any object exposing the buffer interface, optional\n",
@@ -170,7 +178,7 @@
       "    Notes\n",
       "    -----\n",
       "    The parameters must always be given as keyword arguments, since SymPy\n",
-      "    uses ``*args`` to (re-)create the dimension arguments of the symbolic object.\n",
+      "    uses `*args` to (re-)create the dimension arguments of the symbolic object.\n",
       "    \n"
      ]
     }
@@ -680,7 +688,7 @@
       "} ;\n",
       "\n",
       "\n",
-      "int Kernel(const float dt, const float h_x, const float h_y, struct dataobj *restrict u_vec, const int time_M, const int time_m, const int x_M, const int x_m, const int y_M, const int y_m, struct profiler * timers)\n",
+      "int Kernel(struct dataobj *restrict u_vec, const float dt, const float h_x, const float h_y, const int time_M, const int time_m, const int x_M, const int x_m, const int y_M, const int y_m, struct profiler * timers)\n",
       "{\n",
       "  float (*restrict u)[u_vec->size[1]][u_vec->size[2]] __attribute__ ((aligned (64))) = (float (*)[u_vec->size[1]][u_vec->size[2]]) u_vec->data;\n",
       "\n",

From e58dc5306e4ccad867111f69ed43b4dc6e30989b Mon Sep 17 00:00:00 2001
From: Fabio Luporini <fabio@devitocodes.com>
Date: Thu, 25 Jan 2024 09:43:51 +0000
Subject: [PATCH 23/23] compiler: Skip avoid_denormals upon recursive
 compilation

---
 devito/core/cpu.py          | 4 ++--
 devito/operator/operator.py | 1 +
 devito/passes/iet/engine.py | 6 ++++++
 devito/passes/iet/misc.py   | 2 +-
 4 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/devito/core/cpu.py b/devito/core/cpu.py
index 7137c08b06..956258a91e 100644
--- a/devito/core/cpu.py
+++ b/devito/core/cpu.py
@@ -166,7 +166,7 @@ def _specialize_iet(cls, graph, **kwargs):
         sregistry = kwargs['sregistry']
 
         # Flush denormal numbers
-        avoid_denormals(graph, platform=platform)
+        avoid_denormals(graph, **kwargs)
 
         # Distributed-memory parallelism
         mpiize(graph, **kwargs)
@@ -260,7 +260,7 @@ def _make_iet_passes_mapper(cls, **kwargs):
         parizer = cls._Target.Parizer(sregistry, options, platform, compiler)
 
         return {
-            'denormals': avoid_denormals,
+            'denormals': partial(avoid_denormals, **kwargs),
             'blocking': partial(relax_incr_dimensions, **kwargs),
             'parallel': parizer.make_parallel,
             'openmp': parizer.make_parallel,
diff --git a/devito/operator/operator.py b/devito/operator/operator.py
index ac91069f93..84644a8c83 100644
--- a/devito/operator/operator.py
+++ b/devito/operator/operator.py
@@ -1038,6 +1038,7 @@ def __setstate__(self, state):
 # dangerous as some of them (the minority) might break in some circumstances
 # if applied in cascade (e.g., `linearization` on top of `linearization`)
 rcompile_registry = {
+    'avoid_denormals': False,
     'mpi': False,
     'linearize': False,
     'place-transfers': False
diff --git a/devito/passes/iet/engine.py b/devito/passes/iet/engine.py
index c4e98e715c..8a665445c6 100644
--- a/devito/passes/iet/engine.py
+++ b/devito/passes/iet/engine.py
@@ -153,6 +153,12 @@ def wrapper(*args, **kwargs):
             maybe_timed = timed_pass
         else:
             maybe_timed = lambda func, name: func
+        try:
+            # If the pass has been disabled, skip it
+            if not kwargs['options'][func.__name__]:
+                return
+        except KeyError:
+            pass
         try:
             # Pure function case
             graph, = args
diff --git a/devito/passes/iet/misc.py b/devito/passes/iet/misc.py
index 95a30c93c0..e4db2c41c9 100644
--- a/devito/passes/iet/misc.py
+++ b/devito/passes/iet/misc.py
@@ -17,7 +17,7 @@
 
 
 @iet_pass
-def avoid_denormals(iet, platform=None):
+def avoid_denormals(iet, platform=None, **kwargs):
     """
     Introduce nodes in the Iteration/Expression tree that will expand to C
     macros telling the CPU to flush denormal numbers in hardware. Denormals