diff --git a/devito/arch/archinfo.py b/devito/arch/archinfo.py index 3b5e647c50..e679d7a785 100644 --- a/devito/arch/archinfo.py +++ b/devito/arch/archinfo.py @@ -598,6 +598,14 @@ def get_platform(): class Platform(object): registry = {} + """ + The Platform registry. + + Each new Platform instance is automatically added to the registry. + """ + + max_mem_trans_nbytes = None + """Maximum memory transaction size in bytes.""" def __init__(self, name): self.name = name @@ -630,16 +638,6 @@ def _detect_isa(self): def threads_per_core(self): return self.cores_logical // self.cores_physical - @property - def simd_reg_size(self): - """Size in bytes of a SIMD register.""" - return isa_registry.get(self.isa, 0) - - def simd_items_per_reg(self, dtype): - """Number of items of type ``dtype`` that can fit in a SIMD register.""" - assert self.simd_reg_size % np.dtype(dtype).itemsize == 0 - return int(self.simd_reg_size / np.dtype(dtype).itemsize) - @property def memtotal(self): """Physical memory size in bytes, or None if unknown.""" @@ -649,9 +647,23 @@ def memavail(self, *args, **kwargs): """Available physical memory in bytes, or None if unknown.""" return None + def max_mem_trans_size(self, dtype): + """ + Number of items of type `dtype` that can be transferred in a single + memory transaction. + """ + assert self.max_mem_trans_nbytes % np.dtype(dtype).itemsize == 0 + return int(self.max_mem_trans_nbytes / np.dtype(dtype).itemsize) + class Cpu64(Platform): + # The vast majority of CPUs have a 64-byte cache line size + max_mem_trans_nbytes = 64 + + # The known ISAs are to be provided by the subclasses + known_isas = () + def __init__(self, name, cores_logical=None, cores_physical=None, isa=None): super().__init__(name) @@ -661,9 +673,6 @@ def __init__(self, name, cores_logical=None, cores_physical=None, isa=None): self.cores_physical = cores_physical or cpu_info['physical'] self.isa = isa or self._detect_isa() - # The known ISAs are to be provided by the subclasses - known_isas = () - @classmethod def _mro(cls): # Retain only the CPU Platforms @@ -683,6 +692,20 @@ def _detect_isa(self): return i return 'cpp' + @property + def simd_reg_nbytes(self): + """ + Size in bytes of a SIMD register. + """ + return isa_registry.get(self.isa, 0) + + def simd_items_per_reg(self, dtype): + """ + Number of items of type `dtype` that fit in a SIMD register. 
+ """ + assert self.simd_reg_nbytes % np.dtype(dtype).itemsize == 0 + return int(self.simd_reg_nbytes / np.dtype(dtype).itemsize) + @cached_property def memtotal(self): return psutil.virtual_memory().total @@ -758,7 +781,7 @@ def _mro(cls): break return retval - @cached_property + @property def march(self): return None @@ -783,13 +806,17 @@ def memavail(self, deviceid=0): class IntelDevice(Device): - @cached_property + max_mem_trans_nbytes = 64 + + @property def march(self): return '' class NvidiaDevice(Device): + max_mem_trans_nbytes = 128 + @cached_property def march(self): info = get_gpu_info() @@ -802,6 +829,8 @@ def march(self): class AmdDevice(Device): + max_mem_trans_nbytes = 256 + @cached_property def march(cls): # TODO: this corresponds to Vega, which acts as the fallback `march` diff --git a/devito/core/cpu.py b/devito/core/cpu.py index 7137c08b06..956258a91e 100644 --- a/devito/core/cpu.py +++ b/devito/core/cpu.py @@ -166,7 +166,7 @@ def _specialize_iet(cls, graph, **kwargs): sregistry = kwargs['sregistry'] # Flush denormal numbers - avoid_denormals(graph, platform=platform) + avoid_denormals(graph, **kwargs) # Distributed-memory parallelism mpiize(graph, **kwargs) @@ -260,7 +260,7 @@ def _make_iet_passes_mapper(cls, **kwargs): parizer = cls._Target.Parizer(sregistry, options, platform, compiler) return { - 'denormals': avoid_denormals, + 'denormals': partial(avoid_denormals, **kwargs), 'blocking': partial(relax_incr_dimensions, **kwargs), 'parallel': parizer.make_parallel, 'openmp': parizer.make_parallel, diff --git a/devito/data/allocators.py b/devito/data/allocators.py index d8626fd153..3d3817f56a 100644 --- a/devito/data/allocators.py +++ b/devito/data/allocators.py @@ -1,17 +1,17 @@ import abc from functools import reduce from operator import mul +import ctypes +from ctypes.util import find_library import mmap import os import sys import numpy as np -import ctypes -from ctypes.util import find_library from devito.logger import logger from devito.parameters import configuration -from devito.tools import dtype_to_ctype +from devito.tools import dtype_to_ctype, is_integer __all__ = ['ALLOC_ALIGNED', 'ALLOC_NUMA_LOCAL', 'ALLOC_NUMA_ANY', 'ALLOC_KNL_MCDRAM', 'ALLOC_KNL_DRAM', 'ALLOC_GUARD', @@ -24,9 +24,6 @@ class MemoryAllocator(object): __metaclass__ = abc.ABCMeta - is_Posix = False - is_Numa = False - _attempted_init = False lib = None @@ -51,7 +48,7 @@ def initialize(cls): """ return - def alloc(self, shape, dtype): + def alloc(self, shape, dtype, padding=0): """ Allocate memory. @@ -61,6 +58,9 @@ def alloc(self, shape, dtype): Shape of the allocated array. dtype : numpy.dtype The data type of the raw data. + padding : int or 2-tuple of ints, optional + The number of points that are allocated before and after the data, + that is in addition to the requested shape. Defaults to 0. Returns ------- @@ -69,25 +69,40 @@ def alloc(self, shape, dtype): access the data as a ctypes object. The second element is an opaque object that is needed only for the "memfree" call. 
""" - size = int(reduce(mul, shape)) + datasize = int(reduce(mul, shape)) ctype = dtype_to_ctype(dtype) - c_pointer, memfree_args = self._alloc_C_libcall(size, ctype) - if c_pointer is None: - raise RuntimeError("Unable to allocate %d elements in memory", str(size)) - - # cast to 1D array of the specified size - ctype_1d = ctype * size + # Add padding, if any + try: + padleft, padright = padding + except TypeError: + padleft, padright = padding, padding + if not is_integer(padleft) and not is_integer(padright): + raise TypeError("padding must be an int or a 2-tuple of ints") + size = datasize + padleft + padright + + padleft_pointer, memfree_args = self._alloc_C_libcall(size, ctype) + if padleft_pointer is None: + raise RuntimeError("Unable to allocate %d elements in memory" % size) + + # Compute the pointer to the user data + padleft_bytes = padleft * ctypes.sizeof(ctype) + c_pointer = ctypes.c_void_p(padleft_pointer.value + padleft_bytes) + + # Cast to 1D array of the specified `datasize` + ctype_1d = ctype * datasize buf = ctypes.cast(c_pointer, ctypes.POINTER(ctype_1d)).contents - pointer = np.frombuffer(buf, dtype=dtype) - # pointer.reshape should not be used here because it may introduce a copy - # From https://docs.scipy.org/doc/numpy/reference/generated/numpy.reshape.html: - # It is not always possible to change the shape of an array without copying the - # data. If you want an error to be raised when the data is copied, you should - # assign the new shape to the shape attribute of the array: - pointer.shape = shape + array = np.frombuffer(buf, dtype=dtype) + + # `array.reshape` should not be used here because it may introduce + # a copy. From `docs.scipy.org/doc/numpy/reference/generated/numpy.reshape`: + # It is not always possible to change the shape of an array without + # copying the data. If you want an error to be raised when the data + # is copied, you should assign the new shape to the shape attribute + # of the array: + array.shape = shape - return (pointer, memfree_args) + return (array, memfree_args) @abc.abstractmethod def _alloc_C_libcall(self, size, ctype): @@ -124,8 +139,6 @@ class PosixAllocator(MemoryAllocator): aligned to page boundaries. """ - is_Posix = True - @classmethod def initialize(cls): handle = find_library('c') @@ -162,7 +175,7 @@ def free(self, c_pointer): class GuardAllocator(PosixAllocator): """ - Memory allocator based on ``posix`` functions. The allocated memory is + Memory allocator based on `posix` functions. The allocated memory is aligned to page boundaries. Additionally, it allocates extra memory before and after the data, and configures it so that an SEGV is thrown immediately if an out-of-bounds access occurs. 
@@ -195,20 +208,20 @@ def _alloc_C_libcall(self, size, ctype): if ret != 0: return None, None - # generate pointers to the left padding, the user data, and the right pad + # Generate pointers to the left padding, the user data, and the right pad padleft_pointer = c_pointer c_pointer = ctypes.c_void_p(c_pointer.value + self.padding_bytes) padright_pointer = ctypes.c_void_p(c_pointer.value + npages_user * pagesize) - # and set the permissions on the pad memory to 0 (no access) - # if these fail, don't worry about failing the entire allocation + # And set the permissions on the pad memory to 0 (no access) + # If these fail, don't worry about failing the entire allocation c_padsize = ctypes.c_ulong(self.padding_bytes) if self.lib.mprotect(padleft_pointer, c_padsize, ctypes.c_int(0)): logger.warning("couldn't protect memory") if self.lib.mprotect(padright_pointer, c_padsize, ctypes.c_int(0)): logger.warning("couldn't protect memory") - # if there is a multiple of 4 bytes left, use the code below to poison + # If there is a multiple of 4 bytes left, use the code below to poison # the memory if nbytes_user % 4 == 0: poison_size = npages_user*pagesize - nbytes_user @@ -216,16 +229,16 @@ def _alloc_C_libcall(self, size, ctype): poison_ptr = ctypes.cast(ctypes.c_void_p(c_pointer.value + nbytes_user), intp_type) - # for both float32 and float64, a sequence of -100 int32s represents NaNs, - # at least on little-endian architectures. It shouldn't matter what we - # put in there, anyway + # For both float32 and float64, a sequence of -100 int32s + # represents NaNs, at least on little-endian architectures; + # it shouldn't matter what we put in there, anyway for i in range(poison_size // 4): poison_ptr[i] = -100 return c_pointer, (padleft_pointer, c_bytesize) def free(self, c_pointer, total_size): - # unprotect it, since free() accesses it, I think... + # Unprotect it, since free() accesses it, I think... self.lib.mprotect(c_pointer, total_size, ctypes.c_int(mmap.PROT_READ | mmap.PROT_WRITE)) self.lib.free(c_pointer) @@ -247,8 +260,6 @@ class NumaAllocator(MemoryAllocator): ("allocate on any NUMA node with sufficient free memory") are accepted. """ - is_Numa = True - @classmethod def initialize(cls): handle = find_library('numa') @@ -356,7 +367,7 @@ class ExternalAllocator(MemoryAllocator): def __init__(self, numpy_array): self.numpy_array = numpy_array - def alloc(self, shape, dtype): + def alloc(self, shape, dtype, padding=0): assert shape == self.numpy_array.shape, \ "Provided array has shape %s. Expected %s" %\ (str(self.numpy_array.shape), str(shape)) @@ -429,4 +440,4 @@ def default_allocator(name=None): infer_knl_mode() == 'flat'): return ALLOC_KNL_MCDRAM else: - return ALLOC_ALIGNED + return custom_allocators.get('default', ALLOC_ALIGNED) diff --git a/devito/data/data.py b/devito/data/data.py index a7d5dc17f7..31db578cd1 100644 --- a/devito/data/data.py +++ b/devito/data/data.py @@ -26,12 +26,16 @@ class Data(np.ndarray): decomposition : tuple of Decomposition, optional The data decomposition, for each dimension. modulo : tuple of bool, optional - If the i-th entry is True, then the i-th array dimension uses modulo indexing. + If the i-th entry is True, then the i-th array dimension uses modulo + indexing. allocator : MemoryAllocator, optional Used to allocate memory. Defaults to `ALLOC_ALIGNED`. distributor : Distributor, optional - The distributor from which the original decomposition was produced. Note that - the decomposition Parameter above may be different to distributor.decomposition. 
+ The distributor from which the original decomposition was produced. + Note that `decomposition` may differ from `distributor.decomposition`. + padding : int or 2-tuple of ints, optional + The number of points that are allocated before and after the data, + that is in addition to the requested shape. Defaults to 0. Notes ----- @@ -45,9 +49,9 @@ class Data(np.ndarray): """ def __new__(cls, shape, dtype, decomposition=None, modulo=None, - allocator=ALLOC_ALIGNED, distributor=None): + allocator=ALLOC_ALIGNED, distributor=None, padding=0): assert len(shape) == len(modulo) - ndarray, memfree_args = allocator.alloc(shape, dtype) + ndarray, memfree_args = allocator.alloc(shape, dtype, padding=padding) obj = ndarray.view(cls) obj._allocator = allocator obj._memfree_args = memfree_args diff --git a/devito/ir/clusters/algorithms.py b/devito/ir/clusters/algorithms.py index 075f7275bd..6da5f3ddb8 100644 --- a/devito/ir/clusters/algorithms.py +++ b/devito/ir/clusters/algorithms.py @@ -490,7 +490,7 @@ def normalize_reductions_dense(cluster, sregistry, options): processed = [] for e in cluster.exprs: - if e.is_Reduction and e.lhs.is_Symbol and opt_mapify_reduce: + if e.is_Reduction and opt_mapify_reduce: # Transform `e` into what is in essence an explicit map-reduce # For example, turn: # `s += f(u[x], v[x], ...)` @@ -499,10 +499,20 @@ def normalize_reductions_dense(cluster, sregistry, options): # `s += r[x]` # This makes it much easier to parallelize the map part regardless # of the target backend - name = sregistry.make_name() - a = Array(name=name, dtype=e.dtype, dimensions=dims) - processed.extend([Eq(a.indexify(), e.rhs), - e.func(e.lhs, a.indexify())]) + + if e.lhs.function.is_Array: + # Probably a compiler-generated reduction, e.g. via + # recursive compilation; it's an Array already, so nothing to do + processed.append(e) + else: + # Here the LHS could be a Symbol or a user-level Function + # In the latter case we copy the data into a temporary Array + # because the Function might be padded, and reduction operations + # require, in general, the data values to be contiguous + name = sregistry.make_name() + a = Array(name=name, dtype=e.dtype, dimensions=dims) + processed.extend([Eq(a.indexify(), e.rhs), + e.func(e.lhs, a.indexify())]) else: processed.append(e) diff --git a/devito/ir/clusters/analysis.py b/devito/ir/clusters/analysis.py index 4778f2b2b9..6c360effd7 100644 --- a/devito/ir/clusters/analysis.py +++ b/devito/ir/clusters/analysis.py @@ -1,6 +1,6 @@ from devito.ir.clusters.visitors import QueueStateful from devito.ir.support import (AFFINE, PARALLEL, PARALLEL_INDEP, PARALLEL_IF_ATOMIC, - ROUNDABLE, SEQUENTIAL, Forward) + SEQUENTIAL) from devito.tools import as_tuple, flatten, timed_pass __all__ = ['analyze'] @@ -13,7 +13,6 @@ def analyze(clusters): # Collect properties clusters = Parallelism(state).process(clusters) clusters = Affiness(state).process(clusters) - clusters = Rounding(state).process(clusters) # Reconstruct Clusters attaching the discovered properties processed = [c.rebuild(properties=state.properties.get(c)) for c in clusters] @@ -114,41 +113,6 @@ def _callback(self, clusters, d, prefix): return PARALLEL -class Rounding(Detector): - - def _callback(self, clusters, d, prefix): - itinterval = prefix[-1] - - # The iteration direction must be Forward -- ROUNDABLE is for rounding *up* - if itinterval.direction is not Forward: - return - - properties = self._fetch_properties(clusters, prefix) - if PARALLEL not in properties[d]: - return - - scope = self._fetch_scope(clusters) - - 
# All accessed Functions must have enough room in the PADDING region - # so that `i`'s trip count can safely be rounded up - # Note: autopadding guarantees that the padding size along the - # Fastest Varying Dimension is a multiple of the SIMD vector length - functions = [f for f in scope.functions if f.is_AbstractFunction] - try: - if any(not f._honors_autopadding for f in functions): - return - except ValueError: - # E.g., lazily allocated Functions don't have an accessible - # `.shape` until after the first call to `f._arg_values` - return - - # Mixed data types (e.g., float and double) is unsupported - if len({f.dtype for f in functions}) > 1: - return - - return ROUNDABLE - - class Affiness(Detector): """ diff --git a/devito/ir/support/properties.py b/devito/ir/support/properties.py index 1b0e5fc872..0cb8cb5231 100644 --- a/devito/ir/support/properties.py +++ b/devito/ir/support/properties.py @@ -48,12 +48,6 @@ def __init__(self, name, val=None): SKEWABLE = Property('skewable') """A fully parallel Dimension that would benefit from wavefront/skewed tiling.""" -ROUNDABLE = Property('roundable') -""" -A Dimension whose upper limit may be rounded up to a multiple of the SIMD -vector length thanks to the presence of enough padding. -""" - AFFINE = Property('affine') """ A Dimension used to index into tensor objects only through affine and regular @@ -130,7 +124,7 @@ def normalize_properties(*args): def relax_properties(properties): - return frozenset(properties - {PARALLEL_INDEP, ROUNDABLE}) + return frozenset(properties - {PARALLEL_INDEP}) class Properties(frozendict): diff --git a/devito/logger.py b/devito/logger.py index 0ca9ec6aea..92ede2e8dc 100644 --- a/devito/logger.py +++ b/devito/logger.py @@ -5,7 +5,7 @@ from contextlib import contextmanager __all__ = ('set_log_level', 'set_log_noperf', 'is_log_enabled_for', - 'log', 'warning', 'error', 'perf', 'perf_adv', + 'log', 'warning', 'error', 'perf', 'hint', 'RED', 'GREEN', 'BLUE') @@ -124,8 +124,8 @@ def perf(msg, *args, **kwargs): log(msg, PERF, *args, **kwargs) -def perf_adv(msg, *args, **kwargs): - log("Potential optimisation missed: %s" % msg, PERF, *args, **kwargs) +def hint(msg, *args, **kwargs): + log("Hint: %s" % msg, PERF, *args, **kwargs) def warning(msg, *args, **kwargs): diff --git a/devito/operator/operator.py b/devito/operator/operator.py index 4d5208235a..84644a8c83 100644 --- a/devito/operator/operator.py +++ b/devito/operator/operator.py @@ -624,7 +624,8 @@ def _prepare_arguments(self, autotune=None, **kwargs): # Sanity check for p in self.parameters: - p._arg_check(args, self._dspace[p], am=self._access_modes.get(p)) + p._arg_check(args, self._dspace[p], am=self._access_modes.get(p), + **kwargs) for d in self.dimensions: if d.is_Derived: d._arg_check(args, self._dspace[p]) @@ -1037,6 +1038,7 @@ def __setstate__(self, state): # dangerous as some of them (the minority) might break in some circumstances # if applied in cascade (e.g., `linearization` on top of `linearization`) rcompile_registry = { + 'avoid_denormals': False, 'mpi': False, 'linearize': False, 'place-transfers': False diff --git a/devito/passes/clusters/aliases.py b/devito/passes/clusters/aliases.py index 7a8321a7f9..30f21516fb 100644 --- a/devito/passes/clusters/aliases.py +++ b/devito/passes/clusters/aliases.py @@ -7,7 +7,7 @@ import sympy from devito.finite_differences import EvalDerivative, IndexDerivative, Weights -from devito.ir import (SEQUENTIAL, PARALLEL_IF_PVT, ROUNDABLE, SEPARABLE, Forward, +from devito.ir import (SEQUENTIAL, PARALLEL_IF_PVT, 
SEPARABLE, Forward, IterationSpace, Interval, Cluster, ExprGeometry, Queue, IntervalGroup, LabeledVector, Vector, normalize_properties, relax_properties, unbounded, minimum, maximum, extrema, @@ -140,7 +140,6 @@ def _aliases_from_clusters(self, clusters, exclude, meta): # Schedule -> Schedule (optimization) if self.opt_rotate: schedule = optimize_schedule_rotations(schedule, self.sregistry) - schedule = optimize_schedule_padding(schedule, meta, self.platform) # Schedule -> [Clusters]_k processed, subs = lower_schedule(schedule, meta, self.sregistry, @@ -828,30 +827,6 @@ def optimize_schedule_rotations(schedule, sregistry): return schedule.rebuild(*processed, rmapper=rmapper) -def optimize_schedule_padding(schedule, meta, platform): - """ - Round up the innermost IterationInterval of the tensor temporaries IterationSpace - to a multiple of the SIMD vector length. This is not always possible though (it - depends on how much halo is safely accessible in all read Functions). - """ - processed = [] - for i in schedule: - try: - it = i.ispace.itintervals[-1] - if it.dim is i.writeto[-1].dim and ROUNDABLE in meta.properties[it.dim]: - vl = platform.simd_items_per_reg(meta.dtype) - ispace = i.ispace.add(Interval(it.dim, 0, it.size % vl)) - else: - ispace = i.ispace - processed.append(ScheduledAlias( - i.pivot, i.writeto, ispace, i.aliaseds, i.indicess, - )) - except (TypeError, KeyError, IndexError): - processed.append(i) - - return schedule.rebuild(*processed) - - def lower_schedule(schedule, meta, sregistry, ftemps): """ Turn a Schedule into a sequence of Clusters. @@ -926,8 +901,7 @@ def lower_schedule(schedule, meta, sregistry, ftemps): if any(i.is_Modulo for i in ispace.sub_iterators[d]): properties[d] = normalize_properties(v, {SEQUENTIAL}) elif d not in writeto.itdims: - properties[d] = normalize_properties(v, {PARALLEL_IF_PVT}) - \ - {ROUNDABLE} + properties[d] = normalize_properties(v, {PARALLEL_IF_PVT}) except KeyError: # Non-dimension key such as (x, y) for diagonal stencil u(x+i hx, y+i hy) pass diff --git a/devito/passes/iet/definitions.py b/devito/passes/iet/definitions.py index 913432da8e..22f4ae8ae4 100644 --- a/devito/passes/iet/definitions.py +++ b/devito/passes/iet/definitions.py @@ -73,9 +73,10 @@ class DataManager(object): The language used to express data allocations, deletions, and host-device transfers. """ - def __init__(self, rcompile=None, sregistry=None, **kwargs): + def __init__(self, rcompile=None, sregistry=None, platform=None, **kwargs): self.rcompile = rcompile self.sregistry = sregistry + self.platform = platform def _alloc_object_on_low_lat_mem(self, site, obj, storage): """ diff --git a/devito/passes/iet/engine.py b/devito/passes/iet/engine.py index c4e98e715c..8a665445c6 100644 --- a/devito/passes/iet/engine.py +++ b/devito/passes/iet/engine.py @@ -153,6 +153,12 @@ def wrapper(*args, **kwargs): maybe_timed = timed_pass else: maybe_timed = lambda func, name: func + try: + # If the pass has been disabled, skip it + if not kwargs['options'][func.__name__]: + return + except KeyError: + pass try: # Pure function case graph, = args diff --git a/devito/passes/iet/linearization.py b/devito/passes/iet/linearization.py index 9cd16dc416..5ec2a41a9d 100644 --- a/devito/passes/iet/linearization.py +++ b/devito/passes/iet/linearization.py @@ -81,9 +81,9 @@ def key1(f, d): * A 3-tuple `(Dimension, halo size, grid)` otherwise. """ if f.is_regular: - # TODO: same grid + same halo => same padding, however this is not asserted - # during compilation... 
so maybe we should do it at `prepare_args` time? - return (d, f._size_halo[d], getattr(f, 'grid', None)) + # For paddable objects the following holds: + # `same dim + same halo => same (auto-)padding` + return (d, f._size_halo[d], f.is_autopaddable) else: return False diff --git a/devito/passes/iet/misc.py b/devito/passes/iet/misc.py index 95a30c93c0..e4db2c41c9 100644 --- a/devito/passes/iet/misc.py +++ b/devito/passes/iet/misc.py @@ -17,7 +17,7 @@ @iet_pass -def avoid_denormals(iet, platform=None): +def avoid_denormals(iet, platform=None, **kwargs): """ Introduce nodes in the Iteration/Expression tree that will expand to C macros telling the CPU to flush denormal numbers in hardware. Denormals diff --git a/devito/passes/iet/parpragma.py b/devito/passes/iet/parpragma.py index b41b871b55..252cd9ddd6 100644 --- a/devito/passes/iet/parpragma.py +++ b/devito/passes/iet/parpragma.py @@ -41,15 +41,15 @@ def _support_array_reduction(cls, compiler): return True @property - def simd_reg_size(self): - return self.platform.simd_reg_size + def simd_reg_nbytes(self): + return self.platform.simd_reg_nbytes def _make_simd_pragma(self, iet): indexeds = FindSymbols('indexeds').visit(iet) aligned = {i.name for i in indexeds if i.function.is_DiscreteFunction} if aligned: simd = self.lang['simd-for-aligned'] - simd = as_tuple(simd(','.join(sorted(aligned)), self.simd_reg_size)) + simd = as_tuple(simd(','.join(sorted(aligned)), self.simd_reg_nbytes)) else: simd = as_tuple(self.lang['simd-for']) diff --git a/devito/types/array.py b/devito/types/array.py index e51edd27ed..0cfc754bd6 100644 --- a/devito/types/array.py +++ b/devito/types/array.py @@ -1,13 +1,11 @@ from ctypes import POINTER, Structure, c_void_p, c_ulong -from math import ceil import numpy as np from cached_property import cached_property from sympy import Expr -from devito.parameters import configuration from devito.tools import (Reconstructable, as_tuple, c_restrict_void_p, - dtype_to_ctype, dtypes_vector_mapper) + dtype_to_ctype, dtypes_vector_mapper, is_integer) from devito.types.basic import AbstractFunction from devito.types.utils import CtypesFactory, DimensionTuple @@ -127,42 +125,23 @@ def __init_finalize__(self, *args, **kwargs): self._initvalue = kwargs.get('initvalue') assert self._initvalue is None or self._scope != 'heap' + @classmethod + def __dtype_setup__(cls, **kwargs): + return kwargs.get('dtype', np.float32) + def __padding_setup__(self, **kwargs): padding = kwargs.get('padding') if padding is None: - padding = [(0, 0) for _ in range(self.ndim)] - if kwargs.get('autopadding', configuration['autopadding']): - # Heuristic 1; Arrays are typically introduced for temporaries - # introduced during compilation, and are almost always used together - # with loop blocking. 
Since the typical block size is a multiple of - # the SIMD vector length, `vl`, padding is made such that the - # NODOMAIN size is a multiple of `vl` too - - # Heuristic 2: the right-NODOMAIN size is not only a multiple of - # `vl`, but also guaranteed to be *at least* greater or equal than - # `vl`, so that the compiler can tweak loop trip counts to maximize - # the effectiveness of SIMD vectorization - - # Let UB be a function that rounds up a value `x` to the nearest - # multiple of the SIMD vector length - vl = configuration['platform'].simd_items_per_reg(self.dtype) - ub = lambda x: int(ceil(x / vl)) * vl - - fvd_halo_size = sum(self.halo[-1]) - fvd_pad_size = (ub(fvd_halo_size) - fvd_halo_size) + vl - - padding[-1] = (0, fvd_pad_size) - return tuple(padding) - elif isinstance(padding, int): - return tuple((0, padding) for _ in range(self.ndim)) + padding = ((0, 0),)*self.ndim + elif isinstance(padding, DimensionTuple): + padding = tuple(padding[d] for d in self.dimensions) + elif is_integer(padding): + padding = tuple((0, padding) for _ in range(self.ndim)) elif isinstance(padding, tuple) and len(padding) == self.ndim: - return tuple((0, i) if isinstance(i, int) else i for i in padding) + padding = tuple((0, i) if is_integer(i) else i for i in padding) else: raise TypeError("`padding` must be int or %d-tuple of ints" % self.ndim) - - @classmethod - def __dtype_setup__(cls, **kwargs): - return kwargs.get('dtype', np.float32) + return DimensionTuple(*padding, getters=self.dimensions) @property def liveness(self): @@ -435,6 +414,10 @@ def is_TimeFunction(self): def is_Input(self): return all(i.is_Input for i in self.components) + @property + def is_autopaddable(self): + return all(i.is_autopaddable for i in self.components) + # Other properties and methods @property @@ -521,7 +504,7 @@ class ComponentAccess(Expr, Reconstructable): def __new__(cls, arg, index=0, **kwargs): if not arg.is_Indexed: raise ValueError("Expected Indexed, got `%s` instead" % type(arg)) - if not isinstance(index, int) or index > 3: + if not is_integer(index) or index > 3: raise ValueError("Expected 0 <= index < 4") obj = Expr.__new__(cls, arg) diff --git a/devito/types/basic.py b/devito/types/basic.py index 17835933e3..745d6cd0de 100644 --- a/devito/types/basic.py +++ b/devito/types/basic.py @@ -12,6 +12,7 @@ from cached_property import cached_property from devito.data import default_allocator +from devito.parameters import configuration from devito.tools import (Pickable, as_tuple, ctypes_to_cstr, dtype_to_ctype, frozendict, memoized_meth, sympy_mutex) from devito.types.args import ArgProvider @@ -834,13 +835,15 @@ class AbstractFunction(sympy.Function, Basic, Pickable, Evaluable): Functions; etc. """ - is_compact = True + is_autopaddable = False """ - True if data is allocated as a single, contiguous chunk of memory. + True if the Function can be padded automatically by the Devito runtime, + thus increasing its size, False otherwise. Note that this property has no + effect if autopadding is disabled, which is the default behavior. 
""" - __rkwargs__ = ('name', 'dtype', 'grid', 'halo', 'padding', 'alias', - 'space', 'function') + __rkwargs__ = ('name', 'dtype', 'grid', 'halo', 'padding', 'ghost', + 'alias', 'space', 'function') def __new__(cls, *args, **kwargs): # Preprocess arguments @@ -930,10 +933,11 @@ def sort_key(self, order=None): return class_key, args, exp, coeff def __init_finalize__(self, *args, **kwargs): - # Setup halo and padding regions + # Setup halo, padding, and ghost regions self._is_halo_dirty = False self._halo = self.__halo_setup__(**kwargs) self._padding = self.__padding_setup__(**kwargs) + self._ghost = self.__ghost_setup__(**kwargs) # There may or may not be a `Grid` self._grid = kwargs.get('grid') @@ -981,13 +985,40 @@ def __dtype_setup__(cls, **kwargs): return None def __halo_setup__(self, **kwargs): - halo = tuple(kwargs.get('halo', [(0, 0) for i in range(self.ndim)])) + halo = tuple(kwargs.get('halo', ((0, 0),)*self.ndim)) return DimensionTuple(*halo, getters=self.dimensions) def __padding_setup__(self, **kwargs): - padding = tuple(kwargs.get('padding', [(0, 0) for i in range(self.ndim)])) + padding = tuple(kwargs.get('padding', ((0, 0),)*self.ndim)) return DimensionTuple(*padding, getters=self.dimensions) + def __padding_setup_smart__(self, **kwargs): + nopadding = ((0, 0),)*self.ndim + + if kwargs.get('autopadding', configuration['autopadding']): + # The padded Dimension + candidates = self.space_dimensions + if not candidates: + return nopadding + d = candidates[-1] + + mmts = configuration['platform'].max_mem_trans_size(self.dtype) + remainder = self._size_nopad[d] % mmts + if remainder == 0: + # Already a multiple of `mmts`, no need to pad + return nopadding + + dpadding = (0, (mmts - remainder)) + padding = [(0, 0)]*self.ndim + padding[self.dimensions.index(d)] = dpadding + + return tuple(padding) + else: + return nopadding + + def __ghost_setup__(self, **kwargs): + return (0, 0) + def __distributor_setup__(self, **kwargs): # There may or may not be a `Distributor`. In the latter case, the # AbstractFunction is to be considered "local" to each MPI rank @@ -996,16 +1027,6 @@ def __distributor_setup__(self, **kwargs): except AttributeError: return kwargs.get('distributor') - @cached_property - def _honors_autopadding(self): - """ - True if the actual padding is greater or equal than whatever autopadding - would produce, False otherwise. - """ - autopadding = self.__padding_setup__(autopadding=True) - return all(l0 >= l1 and r0 >= r1 - for (l0, r0), (l1, r1) in zip(self.padding, autopadding)) - @property def name(self): """The name of the object.""" @@ -1036,6 +1057,11 @@ def dimensions(self): """Tuple of Dimensions representing the object indices.""" return self._dimensions + @cached_property + def space_dimensions(self): + """Tuple of Dimensions defining the physical space.""" + return tuple(d for d in self.dimensions if d.is_Space) + @property def base(self): return self.indexed @@ -1171,6 +1197,10 @@ def halo(self): def padding(self): return self._padding + @property + def ghost(self): + return self._ghost + @property def is_const(self): return False @@ -1266,6 +1296,14 @@ def _size_nodomain(self): return DimensionTuple(*sizes, getters=self.dimensions, left=left, right=right) + @cached_property + def _size_ghost(self): + """ + Number of points in the ghost region, that is the two areas before + and after the allocated data. 
+ """ + return Size(*self._ghost) + @cached_property def _offset_domain(self): """Number of points before the first domain element.""" diff --git a/devito/types/dense.py b/devito/types/dense.py index 9d33777551..0aa16d2435 100644 --- a/devito/types/dense.py +++ b/devito/types/dense.py @@ -1,7 +1,6 @@ from collections import namedtuple from ctypes import POINTER, Structure, c_int, c_ulong, c_void_p, cast, byref from functools import wraps, reduce -from math import ceil from operator import mul import numpy as np @@ -99,7 +98,7 @@ def __init_finalize__(self, *args, function=None, **kwargs): # This is a corner case -- we might get here, for example, when # running with MPI and some processes get 0-size arrays after # domain decomposition. We touch the data anyway to avoid the - # case ``self._data is None`` + # case `self._data is None` self.data else: raise ValueError("`initializer` must be callable or buffer, not %s" @@ -126,7 +125,8 @@ def wrapper(self): self._data = self._DataType(self.shape_allocated, self.dtype, modulo=self._mask_modulo, allocator=self._allocator, - distributor=self._distributor) + distributor=self._distributor, + padding=self._size_ghost) # Initialize data if self._first_touch: @@ -219,7 +219,7 @@ def shape_domain(self): Notes ----- In an MPI context, this is the *local* domain region shape. - Alias to ``self.shape``. + Alias to `self.shape`. """ return self.shape @@ -436,7 +436,7 @@ def data_gather(self, start=None, stop=None, step=1, rank=0): Notes ----- - Alias to ``self.data._gather``. + Alias to `self.data._gather`. Note that gathering data from large simulations onto a single rank may result in memory blow-up and hence should use this method judiciously. @@ -453,7 +453,7 @@ def data_domain(self): Notes ----- - Alias to ``self.data``. + Alias to `self.data`. With this accessor you are claiming that you will modify the values you get back. If you only need to look at the values, use @@ -616,10 +616,10 @@ def local_indices(self): Notes ----- - Given a Function ``f(x, y)`` with shape ``(nx, ny)``, when *not* using - MPI this property will return ``(slice(0, nx-1), slice(0, ny-1))``. On + Given a Function `f(x, y)` with shape `(nx, ny)`, when *not* using + MPI this property will return `(slice(0, nx-1), slice(0, ny-1))`. On the other hand, when MPI is used, the local ranges depend on the domain - decomposition, which is carried by ``self.grid``. + decomposition, which is carried by `self.grid`. """ if self._distributor is None: return tuple(slice(0, s) for s in self.shape) @@ -627,11 +627,6 @@ def local_indices(self): return tuple(self._distributor.glb_slices.get(d, slice(0, s)) for s, d in zip(self.shape, self.dimensions)) - @cached_property - def space_dimensions(self): - """Tuple of Dimensions defining the physical space.""" - return tuple(d for d in self.dimensions if d.is_Space) - @property def initializer(self): if isinstance(self._data, np.ndarray): @@ -836,32 +831,32 @@ def _arg_check(self, args, intervals, **kwargs): Raises ------ InvalidArgument - If, given the runtime values `args`, an out-of-bounds array - access would be performed, or if shape/dtype don't match with - self's shape/dtype. + If an incompatibility is detected. 
""" if self.name not in args: raise InvalidArgument("No runtime value for `%s`" % self.name) - key = args[self.name] - if len(key.shape) != self.ndim: + data = args[self.name] + + if len(data.shape) != self.ndim: raise InvalidArgument("Shape %s of runtime value `%s` does not match " "dimensions %s" % - (key.shape, self.name, self.dimensions)) - if key.dtype != self.dtype: + (data.shape, self.name, self.dimensions)) + if data.dtype != self.dtype: warning("Data type %s of runtime value `%s` does not match the " - "Function data type %s" % (key.dtype, self.name, self.dtype)) + "Function data type %s" % (data.dtype, self.name, self.dtype)) - for i, s in zip(self.dimensions, key.shape): + # Check each Dimension for potential OOB accesses + for i, s in zip(self.dimensions, data.shape): i._arg_check(args, s, intervals[i]) if args.options['index-mode'] == 'int32' and \ args.options['linearize'] and \ - self.size - 1 >= np.iinfo(np.int32).max: + data.size - 1 >= np.iinfo(np.int32).max: raise InvalidArgument("`%s`, with its %d elements, may be too big for " "int32 pointer arithmetic, which might cause an " "overflow. Use the 'index-mode=int64' option" - % (self, self.size)) + % (self, data.size)) def _arg_finalize(self, args, alias=None): key = alias or self @@ -895,21 +890,29 @@ class Function(DiscreteFunction): provided, shape and dimensions must be given. For MPI execution, a Grid is compulsory. space_order : int or 3-tuple of ints, optional - Discretisation order for space derivatives. Defaults to 1. ``space_order`` also - impacts the number of points available around a generic point of interest. By - default, ``space_order`` points are available on both sides of a generic point of - interest, including those nearby the grid boundary. Sometimes, fewer points - suffice; in other scenarios, more points are necessary. In such cases, instead of - an integer, one can pass a 3-tuple ``(o, lp, rp)`` indicating the discretization - order (``o``) as well as the number of points on the left (``lp``) and right - (``rp``) sides of a generic point of interest. + Discretisation order for space derivatives. Defaults to 1. + `space_order` also impacts the number of points available around a + generic point of interest. By default, `space_order` points are + available on both sides of a generic point of interest, including those + nearby the grid boundary. Sometimes, fewer points suffice; in other + scenarios, more points are necessary. In such cases, instead of an + integer, one can pass: + * a 3-tuple `(o, lp, rp)` indicating the discretization order + (`o`) as well as the number of points on the left (`lp`) and + right (`rp`) sides of a generic point of interest; + * a 2-tuple `(o, ((lp0, rp0), (lp1, rp1), ...))` indicating the + discretization order (`o`) as well as the number of points on + the left/right sides of a generic point of interest for each + SpaceDimension. shape : tuple of ints, optional - Shape of the domain region in grid points. Only necessary if ``grid`` isn't given. + Shape of the domain region in grid points. Only necessary if `grid` + isn't given. dimensions : tuple of Dimension, optional - Dimensions associated with the object. Only necessary if ``grid`` isn't given. + Dimensions associated with the object. Only necessary if `grid` isn't + given. dtype : data-type, optional Any object that can be interpreted as a numpy data type. Defaults - to ``np.float32``. + to `np.float32`. staggered : Dimension or tuple of Dimension or Stagger, optional Define how the Function is staggered. 
initializer : callable or any object exposing the buffer interface, optional @@ -919,8 +922,6 @@ class Function(DiscreteFunction): to take advantage of the memory hierarchy in a NUMA architecture. Refer to `default_allocator.__doc__` for more information. padding : int or tuple of ints, optional - .. deprecated:: shouldn't be used; padding is now automatically inserted. - Allocate extra grid points to maximize data access alignment. When a tuple of ints, one int per Dimension should be provided. @@ -969,11 +970,13 @@ class Function(DiscreteFunction): Notes ----- The parameters must always be given as keyword arguments, since SymPy - uses ``*args`` to (re-)create the dimension arguments of the symbolic object. + uses `*args` to (re-)create the dimension arguments of the symbolic object. """ is_Function = True + is_autopaddable = True + __rkwargs__ = (DiscreteFunction.__rkwargs__ + ('space_order', 'shape_global', 'dimensions')) @@ -988,10 +991,10 @@ def __init_finalize__(self, *args, **kwargs): space_order = kwargs.get('space_order', 1) if isinstance(space_order, int): self._space_order = space_order - elif isinstance(space_order, tuple) and len(space_order) == 3: - self._space_order, _, _ = space_order + elif isinstance(space_order, tuple) and len(space_order) >= 2: + self._space_order = space_order[0] else: - raise TypeError("`space_order` must be int or 3-tuple of ints") + raise TypeError("Invalid `space_order`") # Acquire derivative shortcuts if self is self.function: @@ -1108,46 +1111,45 @@ def __halo_setup__(self, **kwargs): else: space_order = kwargs.get('space_order', 1) if isinstance(space_order, int): - halo = (space_order, space_order) + v = (space_order, space_order) + halo = [v if i.is_Space else (0, 0) for i in self.dimensions] + elif isinstance(space_order, tuple) and len(space_order) == 3: - _, left_points, right_points = space_order - halo = (left_points, right_points) + _, l, r = space_order + halo = [(l, r) if i.is_Space else (0, 0) for i in self.dimensions] + + elif isinstance(space_order, tuple) and len(space_order) == 2: + _, space_halo = space_order + if not isinstance(space_halo, tuple) or \ + not all(isinstance(i, tuple) for i in space_halo) or \ + len(space_halo) != len(self.space_dimensions): + raise TypeError("Invalid `space_order`") + v = list(space_halo) + halo = [v.pop(0) if i.is_Space else (0, 0) + for i in self.dimensions] + else: - raise TypeError("`space_order` must be int or 3-tuple of ints") - halo = tuple(halo if i.is_Space else (0, 0) for i in self.dimensions) + raise TypeError("Invalid `space_order`") return DimensionTuple(*halo, getters=self.dimensions) def __padding_setup__(self, **kwargs): padding = kwargs.get('padding') if padding is None: - if kwargs.get('autopadding', configuration['autopadding']): - # Auto-padding - # 0-padding in all Dimensions except in the Fastest Varying Dimension, - # `fvd`, which is the innermost one - padding = [(0, 0) for i in self.dimensions[:-1]] - fvd = self.dimensions[-1] - # Let UB be a function that rounds up a value `x` to the nearest - # multiple of the SIMD vector length, `vl` - vl = configuration['platform'].simd_items_per_reg(self.dtype) - ub = lambda x: int(ceil(x / vl)) * vl - # Given the HALO and DOMAIN sizes, the right-PADDING is such that: - # * the `fvd` size is a multiple of `vl` - # * it contains *at least* `vl` points - # This way: - # * all first grid points along the `fvd` will be cache-aligned - # * there is enough room to round up the loop trip counts to maximize - # the effectiveness SIMD 
vectorization - fvd_pad_size = (ub(self._size_nopad[fvd]) - self._size_nopad[fvd]) + vl - padding.append((0, fvd_pad_size)) + if self.is_autopaddable: + padding = self.__padding_setup_smart__(**kwargs) else: - padding = tuple((0, 0) for d in self.dimensions) + padding = super().__padding_setup__(**kwargs) + elif isinstance(padding, DimensionTuple): padding = tuple(padding[d] for d in self.dimensions) + elif isinstance(padding, int): padding = tuple((0, padding) if d.is_Space else (0, 0) for d in self.dimensions) + elif isinstance(padding, tuple) and len(padding) == self.ndim: padding = tuple((0, i) if isinstance(i, int) else i for i in padding) + else: raise TypeError("`padding` must be int or %d-tuple of ints" % self.ndim) return DimensionTuple(*padding, getters=self.dimensions) @@ -1159,8 +1161,8 @@ def space_order(self): def sum(self, p=None, dims=None): """ - Generate a symbolic expression computing the sum of ``p`` points - along the spatial dimensions ``dims``. + Generate a symbolic expression computing the sum of `p` points + along the spatial dimensions `dims`. Parameters ---------- @@ -1168,7 +1170,7 @@ def sum(self, p=None, dims=None): The number of summands. Defaults to the halo size. dims : tuple of Dimension, optional The Dimensions along which the sum is computed. Defaults to - ``self``'s spatial dimensions. + `self`'s spatial dimensions. """ points = [] for d in (as_tuple(dims) or self.space_dimensions): @@ -1185,8 +1187,8 @@ def sum(self, p=None, dims=None): def avg(self, p=None, dims=None): """ - Generate a symbolic expression computing the average of ``p`` points - along the spatial dimensions ``dims``. + Generate a symbolic expression computing the average of `p` points + along the spatial dimensions `dims`. Parameters ---------- @@ -1194,7 +1196,7 @@ def avg(self, p=None, dims=None): The number of summands. Defaults to the halo size. dims : tuple of Dimension, optional The Dimensions along which the average is computed. Defaults to - ``self``'s spatial dimensions. + `self`'s spatial dimensions. """ tot = self.sum(p, dims) return tot / len(tot.args) @@ -1219,36 +1221,45 @@ class TimeFunction(Function): provided, shape and dimensions must be given. For MPI execution, a Grid is compulsory. space_order : int or 3-tuple of ints, optional - Discretisation order for space derivatives. Defaults to 1. ``space_order`` also - impacts the number of points available around a generic point of interest. By - default, ``space_order`` points are available on both sides of a generic point of - interest, including those nearby the grid boundary. Sometimes, fewer points - suffice; in other scenarios, more points are necessary. In such cases, instead of - an integer, one can pass a 3-tuple ``(o, lp, rp)`` indicating the discretization - order (``o``) as well as the number of points on the left (``lp``) and right - (``rp``) sides of a generic point of interest. + Discretisation order for space derivatives. Defaults to 1. + `space_order` also impacts the number of points available around a + generic point of interest. By default, `space_order` points are + available on both sides of a generic point of interest, including those + nearby the grid boundary. Sometimes, fewer points suffice; in other + scenarios, more points are necessary. 
In such cases, instead of an + integer, one can pass: + * a 3-tuple `(o, lp, rp)` indicating the discretization order + (`o`) as well as the number of points on the left (`lp`) and + right (`rp`) sides of a generic point of interest; + * a 2-tuple `(o, ((lp0, rp0), (lp1, rp1), ...))` indicating the + discretization order (`o`) as well as the number of points on + the left/right sides of a generic point of interest for each + SpaceDimension. time_order : int, optional Discretization order for time derivatives. Defaults to 1. shape : tuple of ints, optional - Shape of the domain region in grid points. Only necessary if `grid` isn't given. + Shape of the domain region in grid points. Only necessary if `grid` + isn't given. dimensions : tuple of Dimension, optional - Dimensions associated with the object. Only necessary if `grid` isn't given. + Dimensions associated with the object. Only necessary if `grid` isn't + given. dtype : data-type, optional Any object that can be interpreted as a numpy data type. Defaults to `np.float32`. save : int or Buffer, optional - By default, ``save=None``, which indicates the use of alternating buffers. This - enables cyclic writes to the TimeFunction. For example, if the TimeFunction - ``u(t, x)`` has shape (3, 100), then, in an Operator, ``t`` will assume the - values ``1, 2, 0, 1, 2, 0, 1, ...`` (note that the very first value depends - on the stencil equation in which ``u`` is written.). The default size of the time - buffer when ``save=None`` is ``time_order + 1``. To specify a different size for - the time buffer, one should use the syntax ``save=Buffer(mysize)``. - Alternatively, if all of the intermediate results are required (or, simply, to - avoid using an alternating buffer), an explicit value for ``save`` ( an integer) - must be provided. + By default, `save=None`, which indicates the use of alternating + buffers. This enables cyclic writes to the TimeFunction. For example, + if the TimeFunction `u(t, x)` has shape (3, 100), then, in an Operator, + `t` will assume the values `1, 2, 0, 1, 2, 0, 1, ...` (note that the + very first value depends on the stencil equation in which `u` is + written.). The default size of the time buffer when `save=None` is + `time_order + 1`. To specify a different size for the time buffer, one + should use the syntax `save=Buffer(mysize)`. Alternatively, if all of + the intermediate results are required (or, simply, to avoid using an + alternating buffer), an explicit value for `save` ( an integer) must be + provided. time_dim : Dimension, optional - TimeDimension to be used in the TimeFunction. Defaults to ``grid.time_dim``. + TimeDimension to be used in the TimeFunction. Defaults to `grid.time_dim`. staggered : Dimension or tuple of Dimension or Stagger, optional Define how the Function is staggered. initializer : callable or any object exposing the buffer interface, optional @@ -1258,8 +1269,6 @@ class TimeFunction(Function): to take advantage of the memory hierarchy in a NUMA architecture. Refer to `default_allocator.__doc__` for more information. padding : int or tuple of ints, optional - .. deprecated:: shouldn't be used; padding is now automatically inserted. - Allocate extra grid points to maximize data access alignment. When a tuple of ints, one int per Dimension should be provided. 
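As a quick illustration of the two-tuple `space_order` syntax documented above, a minimal sketch mirroring the `test_w_halo_custom` case added later in this diff:

from devito import Grid, TimeFunction

grid = Grid(shape=(4, 4))

# Order 8, with per-dimension halo extents: (4, 3) points along x, (1, 1) along y
u = TimeFunction(name='u', grid=grid, space_order=(8, ((4, 3), (1, 1))))
assert u._size_halo == ((0, 0), (4, 3), (1, 1))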
@@ -1287,14 +1296,14 @@ class TimeFunction(Function): Derivative(g(t, x, y), t) When using the alternating buffer protocol, the size of the time dimension - is given by ``time_order + 1`` + is given by `time_order + 1` >>> f.shape (2, 4, 4) >>> g.shape (3, 4, 4) - One can drop the alternating buffer protocol specifying a value for ``save`` + One can drop the alternating buffer protocol specifying a value for `save` >>> h = TimeFunction(name='h', grid=grid, save=20) >>> h @@ -1305,10 +1314,10 @@ class TimeFunction(Function): Notes ----- The parameters must always be given as keyword arguments, since SymPy uses - ``*args`` to (re-)create the dimension arguments of the symbolic object. - If the parameter ``grid`` is provided, the values for ``shape``, - ``dimensions`` and ``dtype`` will be derived from it. When present, the - parameter ``shape`` should only define the spatial shape of the grid. The + `*args` to (re-)create the dimension arguments of the symbolic object. + If the parameter `grid` is provided, the values for `shape`, + `dimensions` and `dtype` will be derived from it. When present, the + parameter `shape` should only define the spatial shape of the grid. The temporal dimension will be inserted automatically as the leading dimension. """ diff --git a/devito/types/misc.py b/devito/types/misc.py index 88d5dcae13..bbf2ed5137 100644 --- a/devito/types/misc.py +++ b/devito/types/misc.py @@ -193,7 +193,13 @@ class TempArray(Array): sub-expressions. """ - pass + is_autopaddable = True + + def __padding_setup__(self, **kwargs): + padding = kwargs.pop('padding', None) + if padding is None: + padding = self.__padding_setup_smart__(**kwargs) + return super().__padding_setup__(padding=padding, **kwargs) class Fence(object): diff --git a/devito/types/tensor.py b/devito/types/tensor.py index 9be56fd093..9d08bfe77b 100644 --- a/devito/types/tensor.py +++ b/devito/types/tensor.py @@ -54,7 +54,6 @@ class TensorFunction(AbstractTensor): to take advantage of the memory hierarchy in a NUMA architecture. Refer to `default_allocator.__doc__` for more information. padding : int or tuple of ints, optional - .. deprecated:: shouldn't be used; padding is now automatically inserted. Allocate extra grid points to maximize data access alignment. When a tuple of ints, one int per Dimension should be provided. symmetric : bool, optional diff --git a/examples/userapi/01_dsl.ipynb b/examples/userapi/01_dsl.ipynb index 65182c0b74..e3653b7eb7 100644 --- a/examples/userapi/01_dsl.ipynb +++ b/examples/userapi/01_dsl.ipynb @@ -98,21 +98,29 @@ " provided, shape and dimensions must be given. For MPI execution, a\n", " Grid is compulsory.\n", " space_order : int or 3-tuple of ints, optional\n", - " Discretisation order for space derivatives. Defaults to 1. ``space_order`` also\n", - " impacts the number of points available around a generic point of interest. By\n", - " default, ``space_order`` points are available on both sides of a generic point of\n", - " interest, including those nearby the grid boundary. Sometimes, fewer points\n", - " suffice; in other scenarios, more points are necessary. In such cases, instead of\n", - " an integer, one can pass a 3-tuple ``(o, lp, rp)`` indicating the discretization\n", - " order (``o``) as well as the number of points on the left (``lp``) and right\n", - " (``rp``) sides of a generic point of interest.\n", + " Discretisation order for space derivatives. Defaults to 1.\n", + " `space_order` also impacts the number of points available around a\n", + " generic point of interest. 
By default, `space_order` points are\n", + " available on both sides of a generic point of interest, including those\n", + " nearby the grid boundary. Sometimes, fewer points suffice; in other\n", + " scenarios, more points are necessary. In such cases, instead of an\n", + " integer, one can pass:\n", + " * a 3-tuple `(o, lp, rp)` indicating the discretization order\n", + " (`o`) as well as the number of points on the left (`lp`) and\n", + " right (`rp`) sides of a generic point of interest;\n", + " * a 2-tuple `(o, ((lp0, rp0), (lp1, rp1), ...))` indicating the\n", + " discretization order (`o`) as well as the number of points on\n", + " the left/right sides of a generic point of interest for each\n", + " SpaceDimension.\n", " shape : tuple of ints, optional\n", - " Shape of the domain region in grid points. Only necessary if ``grid`` isn't given.\n", + " Shape of the domain region in grid points. Only necessary if `grid`\n", + " isn't given.\n", " dimensions : tuple of Dimension, optional\n", - " Dimensions associated with the object. Only necessary if ``grid`` isn't given.\n", + " Dimensions associated with the object. Only necessary if `grid` isn't\n", + " given.\n", " dtype : data-type, optional\n", " Any object that can be interpreted as a numpy data type. Defaults\n", - " to ``np.float32``.\n", + " to `np.float32`.\n", " staggered : Dimension or tuple of Dimension or Stagger, optional\n", " Define how the Function is staggered.\n", " initializer : callable or any object exposing the buffer interface, optional\n", @@ -122,8 +130,6 @@ " to take advantage of the memory hierarchy in a NUMA architecture. Refer to\n", " `default_allocator.__doc__` for more information.\n", " padding : int or tuple of ints, optional\n", - " .. deprecated:: shouldn't be used; padding is now automatically inserted.\n", - "\n", " Allocate extra grid points to maximize data access alignment. 
When a tuple\n", " of ints, one int per Dimension should be provided.\n", "\n", @@ -172,7 +178,7 @@ " Notes\n", " -----\n", " The parameters must always be given as keyword arguments, since SymPy\n", - " uses ``*args`` to (re-)create the dimension arguments of the symbolic object.\n", + " uses `*args` to (re-)create the dimension arguments of the symbolic object.\n", " \n" ] } @@ -682,7 +688,7 @@ "} ;\n", "\n", "\n", - "int Kernel(const float dt, const float h_x, const float h_y, struct dataobj *restrict u_vec, const int time_M, const int time_m, const int x_M, const int x_m, const int y_M, const int y_m, struct profiler * timers)\n", + "int Kernel(struct dataobj *restrict u_vec, const float dt, const float h_x, const float h_y, const int time_M, const int time_m, const int x_M, const int x_m, const int y_M, const int y_m, struct profiler * timers)\n", "{\n", " float (*restrict u)[u_vec->size[1]][u_vec->size[2]] __attribute__ ((aligned (64))) = (float (*)[u_vec->size[1]][u_vec->size[2]]) u_vec->data;\n", "\n", diff --git a/tests/test_data.py b/tests/test_data.py index d8988f21a1..232aff9c97 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -316,9 +316,21 @@ def test_w_halo_w_autopadding(self): assert u0.shape_allocated == (4, 4, 16) assert u1._size_halo == ((3, 3), (3, 3), (3, 3)) - assert u1._size_padding == ((0, 0), (0, 0), (0, 14)) # 14 stems from 6 + 8 - assert u1._size_nodomain == ((3, 3), (3, 3), (3, 17)) - assert u1.shape_allocated == (10, 10, 24) + assert u1._size_padding == ((0, 0), (0, 0), (0, 6)) # 6 stems from 16-(3+4+3) + assert u1._size_nodomain == ((3, 3), (3, 3), (3, 9)) + assert u1.shape_allocated == (10, 10, 16) + + def test_w_halo_custom(self): + grid = Grid(shape=(4, 4)) + + # Custom halo with not enougn entries raises an exception + with pytest.raises(TypeError): + Function(name='u', grid=grid, space_order=(8, (4, 3))) + + u = TimeFunction(name='u', grid=grid, space_order=(8, ((4, 3), (1, 1)))) + + assert u._size_halo == ((0, 0), (4, 3), (1, 1)) + assert u.shape_allocated == (2, 11, 6) class TestDecomposition(object): diff --git a/tests/test_dse.py b/tests/test_dse.py index 5341bb010c..7e7956c314 100644 --- a/tests/test_dse.py +++ b/tests/test_dse.py @@ -1289,53 +1289,6 @@ def test_from_different_nests(self, rotate): assert np.all(u.data == u1.data) assert np.all(v.data == v1.data) - @skipif('cpu64-arm') - @pytest.mark.parametrize('rotate', [False, True]) - @switchconfig(autopadding=True, platform='knl7210') # Platform is to fix pad value - def test_minimize_remainders_due_to_autopadding(self, rotate): - """ - Check that the bounds of the Iteration computing an aliasing expression are - relaxed (i.e., slightly larger) so that backend-compiler-generated remainder - loops are avoided. - """ - grid = Grid(shape=(3, 3, 3)) - x, y, z = grid.dimensions - t = grid.stepping_dim - - u = TimeFunction(name='u', grid=grid, space_order=3) - u1 = TimeFunction(name='u1', grid=grid, space_order=3) - - u.data_with_halo[:] = 0.5 - u1.data_with_halo[:] = 0.5 - - # Leads to 3D aliases - eqn = Eq(u.forward, _R(_R(u[t, x, y, z] + u[t, x+1, y+1, z+1])*3. + - _R(u[t, x+2, y+2, z+2] + u[t, x+3, y+3, z+3])*3. 
+ 1.)) - - op0 = Operator(eqn, opt=('noop', {'openmp': False})) - op1 = Operator(eqn, opt=('advanced', {'openmp': False, 'cire-mingain': 0, - 'cire-rotate': rotate})) - - # Check code generation - bns, pbs = assert_blocking(op1, {'x0_blk0'}) - xs, ys, zs = get_params(op1, 'x0_blk0_size', 'y0_blk0_size', 'z_size') - arrays = [i for i in FindSymbols().visit(bns['x0_blk0']) if i.is_Array] - assert len(arrays) == 1 - assert len(FindNodes(VExpanded).visit(pbs['x0_blk0'])) == 0 - assert arrays[0].padding == ((0, 0), (0, 0), (0, 30)) - check_array(arrays[0], ((1, 1), (1, 1), (1, 1)), (xs+2, ys+2, zs+32), rotate) - # Check loop bounds - trees = retrieve_iteration_tree(bns['x0_blk0']) - assert len(trees) == 2 - expected_rounded = trees[0].inner - assert expected_rounded.symbolic_max ==\ - z.symbolic_max + (z.symbolic_max - z.symbolic_min + 3) % 16 + 1 - - # Check numerical output - op0(time_M=1) - op1(time_M=1, u=u1) - assert np.all(u.data == u1.data) - def test_catch_best_invariant_v1(self): """ Make sure the best time-invariant sub-expressions are extracted.
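To recap the padding arithmetic that replaces the old SIMD-driven autopadding: with `Cpu64.max_mem_trans_nbytes = 64` and float32 data, `max_mem_trans_size` yields 16 items, so the `(0, 6)` right pad asserted in the updated `test_w_halo_w_autopadding` follows from `16 - (3 + 4 + 3)`. A minimal sketch replaying that computation, assuming the default float32 dtype and the Cpu64 value above:

import numpy as np

max_mem_trans_nbytes = 64  # Cpu64 default, i.e. the typical cache line size
mmts = max_mem_trans_nbytes // np.dtype(np.float32).itemsize  # -> 16 items

size_nopad = 3 + 4 + 3     # left halo + domain + right halo along the padded dim
remainder = size_nopad % mmts                         # -> 10
padright = 0 if remainder == 0 else mmts - remainder  # -> 6
assert padright == 6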