Merge pull request #2296 from devitocodes/enhance-halo-setup

compiler: Revamp data alignment
devitocodes · Jan 30, 2024 · 8bb140d · 8bb140d
2 parents 735eaa5 + e58dc53
commit 8bb140d
Show file tree

Hide file tree

Showing 23 changed files with 369 additions and 368 deletions.
diff --git a/devito/arch/archinfo.py b/devito/arch/archinfo.py
@@ -598,6 +598,14 @@ def get_platform():
 class Platform(object):
 
     registry = {}
+    """
+    The Platform registry.
+
+    Each new Platform instance is automatically added to the registry.
+    """
+
+    max_mem_trans_nbytes = None
+    """Maximum memory transaction size in bytes."""
 
     def __init__(self, name):
         self.name = name
@@ -630,16 +638,6 @@ def _detect_isa(self):
     def threads_per_core(self):
         return self.cores_logical // self.cores_physical
 
-    @property
-    def simd_reg_size(self):
-        """Size in bytes of a SIMD register."""
-        return isa_registry.get(self.isa, 0)
-
-    def simd_items_per_reg(self, dtype):
-        """Number of items of type ``dtype`` that can fit in a SIMD register."""
-        assert self.simd_reg_size % np.dtype(dtype).itemsize == 0
-        return int(self.simd_reg_size / np.dtype(dtype).itemsize)
-
     @property
     def memtotal(self):
         """Physical memory size in bytes, or None if unknown."""
@@ -649,9 +647,23 @@ def memavail(self, *args, **kwargs):
         """Available physical memory in bytes, or None if unknown."""
         return None
 
+    def max_mem_trans_size(self, dtype):
+        """
+        Number of items of type `dtype` that can be transferred in a single
+        memory transaction.
+        """
+        assert self.max_mem_trans_nbytes % np.dtype(dtype).itemsize == 0
+        return int(self.max_mem_trans_nbytes / np.dtype(dtype).itemsize)
+
 
 class Cpu64(Platform):
 
+    # The vast majority of CPUs have a 64-byte cache line size
+    max_mem_trans_nbytes = 64
+
+    # The known ISAs are to be provided by the subclasses
+    known_isas = ()
+
     def __init__(self, name, cores_logical=None, cores_physical=None, isa=None):
         super().__init__(name)
 
@@ -661,9 +673,6 @@ def __init__(self, name, cores_logical=None, cores_physical=None, isa=None):
         self.cores_physical = cores_physical or cpu_info['physical']
         self.isa = isa or self._detect_isa()
 
-    # The known ISAs are to be provided by the subclasses
-    known_isas = ()
-
     @classmethod
     def _mro(cls):
         # Retain only the CPU Platforms
@@ -683,6 +692,20 @@ def _detect_isa(self):
                 return i
         return 'cpp'
 
+    @property
+    def simd_reg_nbytes(self):
+        """
+        Size in bytes of a SIMD register.
+        """
+        return isa_registry.get(self.isa, 0)
+
+    def simd_items_per_reg(self, dtype):
+        """
+        Number of items of type `dtype` that fit in a SIMD register.
+        """
+        assert self.simd_reg_nbytes % np.dtype(dtype).itemsize == 0
+        return int(self.simd_reg_nbytes / np.dtype(dtype).itemsize)
+
     @cached_property
     def memtotal(self):
         return psutil.virtual_memory().total
@@ -758,7 +781,7 @@ def _mro(cls):
                 break
         return retval
 
-    @cached_property
+    @property
     def march(self):
         return None
 
@@ -783,13 +806,17 @@ def memavail(self, deviceid=0):
 
 class IntelDevice(Device):
 
-    @cached_property
+    max_mem_trans_nbytes = 64
+
+    @property
     def march(self):
         return ''
 
 
 class NvidiaDevice(Device):
 
+    max_mem_trans_nbytes = 128
+
     @cached_property
     def march(self):
         info = get_gpu_info()
@@ -802,6 +829,8 @@ def march(self):
 
 class AmdDevice(Device):
 
+    max_mem_trans_nbytes = 256
+
     @cached_property
     def march(cls):
         # TODO: this corresponds to Vega, which acts as the fallback `march`

diff --git a/devito/core/cpu.py b/devito/core/cpu.py
@@ -166,7 +166,7 @@ def _specialize_iet(cls, graph, **kwargs):
         sregistry = kwargs['sregistry']
 
         # Flush denormal numbers
-        avoid_denormals(graph, platform=platform)
+        avoid_denormals(graph, **kwargs)
 
         # Distributed-memory parallelism
         mpiize(graph, **kwargs)
@@ -260,7 +260,7 @@ def _make_iet_passes_mapper(cls, **kwargs):
         parizer = cls._Target.Parizer(sregistry, options, platform, compiler)
 
         return {
-            'denormals': avoid_denormals,
+            'denormals': partial(avoid_denormals, **kwargs),
             'blocking': partial(relax_incr_dimensions, **kwargs),
             'parallel': parizer.make_parallel,
             'openmp': parizer.make_parallel,

diff --git a/devito/data/allocators.py b/devito/data/allocators.py
@@ -1,17 +1,17 @@
 import abc
 from functools import reduce
 from operator import mul
+import ctypes
+from ctypes.util import find_library
 import mmap
 import os
 import sys
 
 import numpy as np
-import ctypes
-from ctypes.util import find_library
 
 from devito.logger import logger
 from devito.parameters import configuration
-from devito.tools import dtype_to_ctype
+from devito.tools import dtype_to_ctype, is_integer
 
 __all__ = ['ALLOC_ALIGNED', 'ALLOC_NUMA_LOCAL', 'ALLOC_NUMA_ANY',
            'ALLOC_KNL_MCDRAM', 'ALLOC_KNL_DRAM', 'ALLOC_GUARD',
@@ -24,9 +24,6 @@ class MemoryAllocator(object):
 
     __metaclass__ = abc.ABCMeta
 
-    is_Posix = False
-    is_Numa = False
-
     _attempted_init = False
     lib = None
 
@@ -51,7 +48,7 @@ def initialize(cls):
         """
         return
 
-    def alloc(self, shape, dtype):
+    def alloc(self, shape, dtype, padding=0):
         """
         Allocate memory.
 
@@ -61,6 +58,9 @@ def alloc(self, shape, dtype):
             Shape of the allocated array.
         dtype : numpy.dtype
             The data type of the raw data.
+        padding : int or 2-tuple of ints, optional
+            Number of extra points allocated (before, after) the data, on top
+            of the requested shape; an int applies to both. Defaults to 0.
 
         Returns
         -------
@@ -69,25 +69,40 @@ def alloc(self, shape, dtype):
             access the data as a ctypes object. The second element is an opaque
             object that is needed only for the "memfree" call.
         """
-        size = int(reduce(mul, shape))
+        datasize = int(reduce(mul, shape))
         ctype = dtype_to_ctype(dtype)
 
-        c_pointer, memfree_args = self._alloc_C_libcall(size, ctype)
-        if c_pointer is None:
-            raise RuntimeError("Unable to allocate %d elements in memory", str(size))
-
-        # cast to 1D array of the specified size
-        ctype_1d = ctype * size
+        # Add padding, if any; a scalar applies to both sides
+        try:
+            padleft, padright = padding
+        except TypeError:
+            padleft, padright = padding, padding
+        if not is_integer(padleft) or not is_integer(padright):
+            raise TypeError("padding must be an int or a 2-tuple of ints")
+        size = datasize + padleft + padright
+
+        padleft_pointer, memfree_args = self._alloc_C_libcall(size, ctype)
+        if padleft_pointer is None:
+            raise RuntimeError("Unable to allocate %d elements in memory" % size)
+
+        # Compute the pointer to the user data
+        padleft_bytes = padleft * ctypes.sizeof(ctype)
+        c_pointer = ctypes.c_void_p(padleft_pointer.value + padleft_bytes)
+
+        # Cast to 1D array of the specified `datasize`
+        ctype_1d = ctype * datasize
         buf = ctypes.cast(c_pointer, ctypes.POINTER(ctype_1d)).contents
-        pointer = np.frombuffer(buf, dtype=dtype)
-        # pointer.reshape should not be used here because it may introduce a copy
-        # From https://docs.scipy.org/doc/numpy/reference/generated/numpy.reshape.html:
-        # It is not always possible to change the shape of an array without copying the
-        # data. If you want an error to be raised when the data is copied, you should
-        # assign the new shape to the shape attribute of the array:
-        pointer.shape = shape
+        array = np.frombuffer(buf, dtype=dtype)
+
+        # `array.reshape` should not be used here because it may introduce
+        # a copy. From `docs.scipy.org/doc/numpy/reference/generated/numpy.reshape`:
+        #   It is not always possible to change the shape of an array without
+        #   copying the data. If you want an error to be raised when the data
+        #   is copied, you should assign the new shape to the shape attribute
+        #   of the array:
+        array.shape = shape
 
-        return (pointer, memfree_args)
+        return (array, memfree_args)
 
     @abc.abstractmethod
     def _alloc_C_libcall(self, size, ctype):
@@ -124,8 +139,6 @@ class PosixAllocator(MemoryAllocator):
     aligned to page boundaries.
     """
 
-    is_Posix = True
-
     @classmethod
     def initialize(cls):
         handle = find_library('c')
@@ -162,7 +175,7 @@ def free(self, c_pointer):
 class GuardAllocator(PosixAllocator):
 
     """
-    Memory allocator based on ``posix`` functions. The allocated memory is
+    Memory allocator based on `posix` functions. The allocated memory is
     aligned to page boundaries.  Additionally, it allocates extra memory
     before and after the data, and configures it so that an SEGV is thrown
     immediately if an out-of-bounds access occurs.
@@ -195,37 +208,37 @@ def _alloc_C_libcall(self, size, ctype):
         if ret != 0:
             return None, None
 
-        # generate pointers to the left padding, the user data, and the right pad
+        # Generate pointers to the left padding, the user data, and the right pad
         padleft_pointer = c_pointer
         c_pointer = ctypes.c_void_p(c_pointer.value + self.padding_bytes)
         padright_pointer = ctypes.c_void_p(c_pointer.value + npages_user * pagesize)
 
-        # and set the permissions on the pad memory to 0 (no access)
-        # if these fail, don't worry about failing the entire allocation
+        # And set the permissions on the pad memory to 0 (no access)
+        # If these fail, don't worry about failing the entire allocation
         c_padsize = ctypes.c_ulong(self.padding_bytes)
         if self.lib.mprotect(padleft_pointer, c_padsize, ctypes.c_int(0)):
             logger.warning("couldn't protect memory")
         if self.lib.mprotect(padright_pointer, c_padsize, ctypes.c_int(0)):
             logger.warning("couldn't protect memory")
 
-        # if there is a multiple of 4 bytes left, use the code below to poison
+        # If there is a multiple of 4 bytes left, use the code below to poison
         # the memory
         if nbytes_user % 4 == 0:
             poison_size = npages_user*pagesize - nbytes_user
             intp_type = ctypes.POINTER(ctypes.c_int)
             poison_ptr = ctypes.cast(ctypes.c_void_p(c_pointer.value + nbytes_user),
                                      intp_type)
 
-            # for both float32 and float64, a sequence of -100 int32s represents NaNs,
-            # at least on little-endian architectures.  It shouldn't matter what we
-            # put in there, anyway
+            # For both float32 and float64, a sequence of -100 int32s
+            # represents NaNs, at least on little-endian architectures;
+            # it shouldn't matter what we put in there, anyway
             for i in range(poison_size // 4):
                 poison_ptr[i] = -100
 
         return c_pointer, (padleft_pointer, c_bytesize)
 
     def free(self, c_pointer, total_size):
-        # unprotect it, since free() accesses it, I think...
+        # Unprotect it, since free() accesses it, I think...
         self.lib.mprotect(c_pointer, total_size,
                           ctypes.c_int(mmap.PROT_READ | mmap.PROT_WRITE))
         self.lib.free(c_pointer)
@@ -247,8 +260,6 @@ class NumaAllocator(MemoryAllocator):
         ("allocate on any NUMA node with sufficient free memory") are accepted.
     """
 
-    is_Numa = True
-
     @classmethod
     def initialize(cls):
         handle = find_library('numa')
@@ -356,7 +367,7 @@ class ExternalAllocator(MemoryAllocator):
     def __init__(self, numpy_array):
         self.numpy_array = numpy_array
 
-    def alloc(self, shape, dtype):
+    def alloc(self, shape, dtype, padding=0):
         assert shape == self.numpy_array.shape, \
             "Provided array has shape %s. Expected %s" %\
             (str(self.numpy_array.shape), str(shape))
@@ -429,4 +440,4 @@ def default_allocator(name=None):
           infer_knl_mode() == 'flat'):
         return ALLOC_KNL_MCDRAM
     else:
-        return ALLOC_ALIGNED
+        return custom_allocators.get('default', ALLOC_ALIGNED)
diff --git a/devito/data/data.py b/devito/data/data.py
@@ -26,12 +26,16 @@ class Data(np.ndarray):
     decomposition : tuple of Decomposition, optional
         The data decomposition, for each dimension.
     modulo : tuple of bool, optional
-        If the i-th entry is True, then the i-th array dimension uses modulo indexing.
+        If the i-th entry is True, then the i-th array dimension uses modulo
+        indexing.
     allocator : MemoryAllocator, optional
         Used to allocate memory. Defaults to `ALLOC_ALIGNED`.
     distributor : Distributor, optional
-        The distributor from which the original decomposition was produced. Note that
-        the decomposition Parameter above may be different to distributor.decomposition.
+        The distributor from which the original decomposition was produced.
+        Note that `decomposition` may differ from `distributor.decomposition`.
+    padding : int or 2-tuple of ints, optional
+        The number of points that are allocated before and after the data,
+        that is in addition to the requested shape. Defaults to 0.
 
     Notes
     -----
@@ -45,9 +49,9 @@ class Data(np.ndarray):
     """
 
     def __new__(cls, shape, dtype, decomposition=None, modulo=None,
-                allocator=ALLOC_ALIGNED, distributor=None):
+                allocator=ALLOC_ALIGNED, distributor=None, padding=0):
         assert len(shape) == len(modulo)
-        ndarray, memfree_args = allocator.alloc(shape, dtype)
+        ndarray, memfree_args = allocator.alloc(shape, dtype, padding=padding)
         obj = ndarray.view(cls)
         obj._allocator = allocator
         obj._memfree_args = memfree_args