Skip to content

Commit

Permalink
Merge pull request #2296 from devitocodes/enhance-halo-setup
Browse files Browse the repository at this point in the history
compiler: Revamp data alignment
  • Loading branch information
FabioLuporini authored Jan 30, 2024
2 parents 735eaa5 + e58dc53 commit 8bb140d
Show file tree
Hide file tree
Showing 23 changed files with 369 additions and 368 deletions.
59 changes: 44 additions & 15 deletions devito/arch/archinfo.py
Original file line number Diff line number Diff line change
Expand Up @@ -598,6 +598,14 @@ def get_platform():
class Platform(object):

registry = {}
"""
The Platform registry.
Each new Platform instance is automatically added to the registry.
"""

max_mem_trans_nbytes = None
"""Maximum memory transaction size in bytes."""

def __init__(self, name):
self.name = name
Expand Down Expand Up @@ -630,16 +638,6 @@ def _detect_isa(self):
def threads_per_core(self):
return self.cores_logical // self.cores_physical

@property
def simd_reg_size(self):
"""Size in bytes of a SIMD register."""
return isa_registry.get(self.isa, 0)

def simd_items_per_reg(self, dtype):
"""Number of items of type ``dtype`` that can fit in a SIMD register."""
assert self.simd_reg_size % np.dtype(dtype).itemsize == 0
return int(self.simd_reg_size / np.dtype(dtype).itemsize)

@property
def memtotal(self):
"""Physical memory size in bytes, or None if unknown."""
Expand All @@ -649,9 +647,23 @@ def memavail(self, *args, **kwargs):
"""Available physical memory in bytes, or None if unknown."""
return None

def max_mem_trans_size(self, dtype):
    """
    Number of items of type `dtype` that can be transferred in a single
    memory transaction.

    Parameters
    ----------
    dtype : data-type
        Any object that `numpy.dtype` can interpret.

    Returns
    -------
    int
        `max_mem_trans_nbytes` divided by the size in bytes of one item
        of type `dtype`.

    Raises
    ------
    TypeError
        If `max_mem_trans_nbytes` is None, that is, no maximum memory
        transaction size has been set.
    AssertionError
        If `max_mem_trans_nbytes` is not a whole multiple of the item size.
    """
    nbytes = self.max_mem_trans_nbytes
    if nbytes is None:
        # Fail with an explicit message instead of the opaque
        # "unsupported operand type(s) for %" that `None % int` produces
        raise TypeError("`max_mem_trans_nbytes` is not set for `%s`"
                        % type(self).__name__)

    itemsize = np.dtype(dtype).itemsize
    assert nbytes % itemsize == 0

    # Exact integer arithmetic; no round trip through a float
    return int(nbytes // itemsize)


class Cpu64(Platform):

# The vast majority of CPUs have a 64-byte cache line size
max_mem_trans_nbytes = 64

# The known ISAs are to be provided by the subclasses
known_isas = ()

def __init__(self, name, cores_logical=None, cores_physical=None, isa=None):
super().__init__(name)

Expand All @@ -661,9 +673,6 @@ def __init__(self, name, cores_logical=None, cores_physical=None, isa=None):
self.cores_physical = cores_physical or cpu_info['physical']
self.isa = isa or self._detect_isa()

# The known ISAs are to be provided by the subclasses
known_isas = ()

@classmethod
def _mro(cls):
# Retain only the CPU Platforms
Expand All @@ -683,6 +692,20 @@ def _detect_isa(self):
return i
return 'cpp'

@property
def simd_reg_nbytes(self):
    """
    Size in bytes of a SIMD register for this platform's ISA.

    Returns
    -------
    The `isa_registry` entry for `self.isa`, or 0 if `self.isa` has no
    entry in the registry.
    """
    nbytes = isa_registry.get(self.isa, 0)
    return nbytes

def simd_items_per_reg(self, dtype):
    """
    How many items of type `dtype` fit in one SIMD register.

    Parameters
    ----------
    dtype : data-type
        Any object that `numpy.dtype` can interpret.

    Returns
    -------
    int
        `self.simd_reg_nbytes` divided by the size in bytes of one item.
    """
    itemsize = np.dtype(dtype).itemsize
    nitems, leftover = divmod(self.simd_reg_nbytes, itemsize)

    # The register size must be a whole multiple of the item size
    assert leftover == 0

    return int(nitems)

@cached_property
def memtotal(self):
return psutil.virtual_memory().total
Expand Down Expand Up @@ -758,7 +781,7 @@ def _mro(cls):
break
return retval

@cached_property
@property
def march(self):
return None

Expand All @@ -783,13 +806,17 @@ def memavail(self, deviceid=0):

class IntelDevice(Device):

@cached_property
max_mem_trans_nbytes = 64

@property
def march(self):
return ''


class NvidiaDevice(Device):

max_mem_trans_nbytes = 128

@cached_property
def march(self):
info = get_gpu_info()
Expand All @@ -802,6 +829,8 @@ def march(self):

class AmdDevice(Device):

max_mem_trans_nbytes = 256

@cached_property
def march(cls):
# TODO: this corresponds to Vega, which acts as the fallback `march`
Expand Down
4 changes: 2 additions & 2 deletions devito/core/cpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,7 @@ def _specialize_iet(cls, graph, **kwargs):
sregistry = kwargs['sregistry']

# Flush denormal numbers
avoid_denormals(graph, platform=platform)
avoid_denormals(graph, **kwargs)

# Distributed-memory parallelism
mpiize(graph, **kwargs)
Expand Down Expand Up @@ -260,7 +260,7 @@ def _make_iet_passes_mapper(cls, **kwargs):
parizer = cls._Target.Parizer(sregistry, options, platform, compiler)

return {
'denormals': avoid_denormals,
'denormals': partial(avoid_denormals, **kwargs),
'blocking': partial(relax_incr_dimensions, **kwargs),
'parallel': parizer.make_parallel,
'openmp': parizer.make_parallel,
Expand Down
85 changes: 48 additions & 37 deletions devito/data/allocators.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,17 @@
import abc
from functools import reduce
from operator import mul
import ctypes
from ctypes.util import find_library
import mmap
import os
import sys

import numpy as np
import ctypes
from ctypes.util import find_library

from devito.logger import logger
from devito.parameters import configuration
from devito.tools import dtype_to_ctype
from devito.tools import dtype_to_ctype, is_integer

__all__ = ['ALLOC_ALIGNED', 'ALLOC_NUMA_LOCAL', 'ALLOC_NUMA_ANY',
'ALLOC_KNL_MCDRAM', 'ALLOC_KNL_DRAM', 'ALLOC_GUARD',
Expand All @@ -24,9 +24,6 @@ class MemoryAllocator(object):

__metaclass__ = abc.ABCMeta

is_Posix = False
is_Numa = False

_attempted_init = False
lib = None

Expand All @@ -51,7 +48,7 @@ def initialize(cls):
"""
return

def alloc(self, shape, dtype):
def alloc(self, shape, dtype, padding=0):
"""
Allocate memory.
Expand All @@ -61,6 +58,9 @@ def alloc(self, shape, dtype):
Shape of the allocated array.
dtype : numpy.dtype
The data type of the raw data.
padding : int or 2-tuple of ints, optional
The number of points that are allocated before and after the data,
that is in addition to the requested shape. Defaults to 0.
Returns
-------
Expand All @@ -69,25 +69,40 @@ def alloc(self, shape, dtype):
access the data as a ctypes object. The second element is an opaque
object that is needed only for the "memfree" call.
"""
size = int(reduce(mul, shape))
datasize = int(reduce(mul, shape))
ctype = dtype_to_ctype(dtype)

c_pointer, memfree_args = self._alloc_C_libcall(size, ctype)
if c_pointer is None:
raise RuntimeError("Unable to allocate %d elements in memory", str(size))

# cast to 1D array of the specified size
ctype_1d = ctype * size
# Add padding, if any
try:
padleft, padright = padding
except TypeError:
padleft, padright = padding, padding
if not is_integer(padleft) and not is_integer(padright):
raise TypeError("padding must be an int or a 2-tuple of ints")
size = datasize + padleft + padright

padleft_pointer, memfree_args = self._alloc_C_libcall(size, ctype)
if padleft_pointer is None:
raise RuntimeError("Unable to allocate %d elements in memory" % size)

# Compute the pointer to the user data
padleft_bytes = padleft * ctypes.sizeof(ctype)
c_pointer = ctypes.c_void_p(padleft_pointer.value + padleft_bytes)

# Cast to 1D array of the specified `datasize`
ctype_1d = ctype * datasize
buf = ctypes.cast(c_pointer, ctypes.POINTER(ctype_1d)).contents
pointer = np.frombuffer(buf, dtype=dtype)
# pointer.reshape should not be used here because it may introduce a copy
# From https://docs.scipy.org/doc/numpy/reference/generated/numpy.reshape.html:
# It is not always possible to change the shape of an array without copying the
# data. If you want an error to be raised when the data is copied, you should
# assign the new shape to the shape attribute of the array:
pointer.shape = shape
array = np.frombuffer(buf, dtype=dtype)

# `array.reshape` should not be used here because it may introduce
# a copy. From `docs.scipy.org/doc/numpy/reference/generated/numpy.reshape`:
# It is not always possible to change the shape of an array without
# copying the data. If you want an error to be raised when the data
# is copied, you should assign the new shape to the shape attribute
# of the array:
array.shape = shape

return (pointer, memfree_args)
return (array, memfree_args)

@abc.abstractmethod
def _alloc_C_libcall(self, size, ctype):
Expand Down Expand Up @@ -124,8 +139,6 @@ class PosixAllocator(MemoryAllocator):
aligned to page boundaries.
"""

is_Posix = True

@classmethod
def initialize(cls):
handle = find_library('c')
Expand Down Expand Up @@ -162,7 +175,7 @@ def free(self, c_pointer):
class GuardAllocator(PosixAllocator):

"""
Memory allocator based on ``posix`` functions. The allocated memory is
Memory allocator based on `posix` functions. The allocated memory is
aligned to page boundaries. Additionally, it allocates extra memory
before and after the data, and configures it so that an SEGV is thrown
immediately if an out-of-bounds access occurs.
Expand Down Expand Up @@ -195,37 +208,37 @@ def _alloc_C_libcall(self, size, ctype):
if ret != 0:
return None, None

# generate pointers to the left padding, the user data, and the right pad
# Generate pointers to the left padding, the user data, and the right pad
padleft_pointer = c_pointer
c_pointer = ctypes.c_void_p(c_pointer.value + self.padding_bytes)
padright_pointer = ctypes.c_void_p(c_pointer.value + npages_user * pagesize)

# and set the permissions on the pad memory to 0 (no access)
# if these fail, don't worry about failing the entire allocation
# And set the permissions on the pad memory to 0 (no access)
# If these fail, don't worry about failing the entire allocation
c_padsize = ctypes.c_ulong(self.padding_bytes)
if self.lib.mprotect(padleft_pointer, c_padsize, ctypes.c_int(0)):
logger.warning("couldn't protect memory")
if self.lib.mprotect(padright_pointer, c_padsize, ctypes.c_int(0)):
logger.warning("couldn't protect memory")

# if there is a multiple of 4 bytes left, use the code below to poison
# If there is a multiple of 4 bytes left, use the code below to poison
# the memory
if nbytes_user % 4 == 0:
poison_size = npages_user*pagesize - nbytes_user
intp_type = ctypes.POINTER(ctypes.c_int)
poison_ptr = ctypes.cast(ctypes.c_void_p(c_pointer.value + nbytes_user),
intp_type)

# for both float32 and float64, a sequence of -100 int32s represents NaNs,
# at least on little-endian architectures. It shouldn't matter what we
# put in there, anyway
# For both float32 and float64, a sequence of -100 int32s
# represents NaNs, at least on little-endian architectures;
# it shouldn't matter what we put in there, anyway
for i in range(poison_size // 4):
poison_ptr[i] = -100

return c_pointer, (padleft_pointer, c_bytesize)

def free(self, c_pointer, total_size):
# unprotect it, since free() accesses it, I think...
# Unprotect it, since free() accesses it, I think...
self.lib.mprotect(c_pointer, total_size,
ctypes.c_int(mmap.PROT_READ | mmap.PROT_WRITE))
self.lib.free(c_pointer)
Expand All @@ -247,8 +260,6 @@ class NumaAllocator(MemoryAllocator):
("allocate on any NUMA node with sufficient free memory") are accepted.
"""

is_Numa = True

@classmethod
def initialize(cls):
handle = find_library('numa')
Expand Down Expand Up @@ -356,7 +367,7 @@ class ExternalAllocator(MemoryAllocator):
def __init__(self, numpy_array):
self.numpy_array = numpy_array

def alloc(self, shape, dtype):
def alloc(self, shape, dtype, padding=0):
assert shape == self.numpy_array.shape, \
"Provided array has shape %s. Expected %s" %\
(str(self.numpy_array.shape), str(shape))
Expand Down Expand Up @@ -429,4 +440,4 @@ def default_allocator(name=None):
infer_knl_mode() == 'flat'):
return ALLOC_KNL_MCDRAM
else:
return ALLOC_ALIGNED
return custom_allocators.get('default', ALLOC_ALIGNED)
14 changes: 9 additions & 5 deletions devito/data/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,16 @@ class Data(np.ndarray):
decomposition : tuple of Decomposition, optional
The data decomposition, for each dimension.
modulo : tuple of bool, optional
If the i-th entry is True, then the i-th array dimension uses modulo indexing.
If the i-th entry is True, then the i-th array dimension uses modulo
indexing.
allocator : MemoryAllocator, optional
Used to allocate memory. Defaults to `ALLOC_ALIGNED`.
distributor : Distributor, optional
The distributor from which the original decomposition was produced. Note that
the decomposition Parameter above may be different to distributor.decomposition.
The distributor from which the original decomposition was produced.
Note that `decomposition` may differ from `distributor.decomposition`.
padding : int or 2-tuple of ints, optional
The number of points that are allocated before and after the data,
that is in addition to the requested shape. Defaults to 0.
Notes
-----
Expand All @@ -45,9 +49,9 @@ class Data(np.ndarray):
"""

def __new__(cls, shape, dtype, decomposition=None, modulo=None,
allocator=ALLOC_ALIGNED, distributor=None):
allocator=ALLOC_ALIGNED, distributor=None, padding=0):
assert len(shape) == len(modulo)
ndarray, memfree_args = allocator.alloc(shape, dtype)
ndarray, memfree_args = allocator.alloc(shape, dtype, padding=padding)
obj = ndarray.view(cls)
obj._allocator = allocator
obj._memfree_args = memfree_args
Expand Down
Loading

0 comments on commit 8bb140d

Please sign in to comment.