compiler: Revamp data alignment #2296

Merged (23 commits) on Jan 30, 2024

Commits
2c21b09
compiler: Start revamping autopadding
FabioLuporini Jan 5, 2024
c7b601e
compiler: Extirpate decrepit loop-rounding optimization
FabioLuporini Jan 8, 2024
060e0de
compiler: Remove useless runtime checks
FabioLuporini Jan 8, 2024
e2a1077
compiler: Refactor __padding_setup__
FabioLuporini Jan 8, 2024
9233a88
compiler: Add platform to DataManager
FabioLuporini Jan 8, 2024
c5c058c
arch: Add Platform.suggested_alignment
FabioLuporini Jan 8, 2024
1f84038
compiler: Rework __padding_setup__
FabioLuporini Jan 8, 2024
2c6439b
compiler: Unpick suggested_alignment
FabioLuporini Jan 9, 2024
3ba3403
misc: Logging's perf_adv -> hint
FabioLuporini Jan 10, 2024
feb1f18
api: Enable halo customization
FabioLuporini Jan 10, 2024
4e53c7a
compiler: Tweak autopadding
FabioLuporini Jan 12, 2024
7268c9c
compiler: Fix autopadding
FabioLuporini Jan 13, 2024
a812bb3
compiler: Remove useless is_compact attribute
FabioLuporini Jan 15, 2024
3cb062d
compiler: Fix linearization in presence of autopadding
FabioLuporini Jan 15, 2024
f8f1334
pep8 happiness
FabioLuporini Jan 15, 2024
cfa5027
tests: Update expected output after tweaking autopadding
FabioLuporini Jan 16, 2024
d335eb8
examples: Update expected output
FabioLuporini Jan 17, 2024
6fb992e
compiler: Use is_integer where possible
FabioLuporini Jan 17, 2024
7479238
compiler: Fix mapify_reduce
FabioLuporini Jan 17, 2024
d85d402
misc: Homogenize docstrings to use single quotes
FabioLuporini Jan 17, 2024
733d7bc
api: Improve halo setup API and docs
FabioLuporini Jan 17, 2024
3cd6127
examples: Update expected output
FabioLuporini Jan 17, 2024
e58dc53
compiler: Skip avoid_denormals upon recursive compilation
FabioLuporini Jan 25, 2024
59 changes: 44 additions & 15 deletions devito/arch/archinfo.py
@@ -598,6 +598,14 @@ def get_platform():
class Platform(object):

registry = {}
"""
The Platform registry.

Each new Platform instance is automatically added to the registry.
"""

max_mem_trans_nbytes = None
"""Maximum memory transaction size in bytes."""

def __init__(self, name):
self.name = name
@@ -630,16 +638,6 @@ def _detect_isa(self):
def threads_per_core(self):
return self.cores_logical // self.cores_physical

@property
def simd_reg_size(self):
"""Size in bytes of a SIMD register."""
return isa_registry.get(self.isa, 0)

def simd_items_per_reg(self, dtype):
"""Number of items of type ``dtype`` that can fit in a SIMD register."""
assert self.simd_reg_size % np.dtype(dtype).itemsize == 0
return int(self.simd_reg_size / np.dtype(dtype).itemsize)

@property
def memtotal(self):
"""Physical memory size in bytes, or None if unknown."""
@@ -649,9 +647,23 @@ def memavail(self, *args, **kwargs):
"""Available physical memory in bytes, or None if unknown."""
return None

def max_mem_trans_size(self, dtype):
"""
Number of items of type `dtype` that can be transferred in a single
memory transaction.
"""
assert self.max_mem_trans_nbytes % np.dtype(dtype).itemsize == 0

Review comment (Contributor): Would it be worth raising an exception with a message here if this assertion is False? Not sure how easily a user could bump into this, but I could imagine it being quite cryptic to a novice user if they did.

Review comment (Contributor): == 0 ?

Reply (Author): @EdCaunt No, because we developers write these classes, and when testing a new one we would hit the assertion right away if that were the case.

@mloubout Yes, == 0: the memory transaction nbytes is always a multiple of 2, just like a C base type size.

return int(self.max_mem_trans_nbytes / np.dtype(dtype).itemsize)
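
For context, the snippet below is a standalone sketch of the computation performed by max_mem_trans_size, using the per-platform max_mem_trans_nbytes values introduced further down in this diff (64 for Cpu64 and IntelDevice, 128 for NvidiaDevice, 256 for AmdDevice). It does not import Devito, and the dictionary keys are illustrative labels only.

import numpy as np

# Transaction widths (in bytes) set by this PR for each platform family
MAX_MEM_TRANS_NBYTES = {'cpu64': 64, 'intel-gpu': 64,
                        'nvidia-gpu': 128, 'amd-gpu': 256}

def max_mem_trans_size(nbytes, dtype):
    # Number of `dtype` items moved in a single memory transaction; the
    # assertion fires only if the item size does not divide the width
    assert nbytes % np.dtype(dtype).itemsize == 0
    return nbytes // np.dtype(dtype).itemsize

for name, nbytes in MAX_MEM_TRANS_NBYTES.items():
    print(name, max_mem_trans_size(nbytes, np.float32))
# Prints: cpu64 16, intel-gpu 16, nvidia-gpu 32, amd-gpu 64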


class Cpu64(Platform):

# The vast majority of CPUs have a 64-byte cache line size
max_mem_trans_nbytes = 64

# The known ISAs are to be provided by the subclasses
known_isas = ()

def __init__(self, name, cores_logical=None, cores_physical=None, isa=None):
super().__init__(name)

@@ -661,9 +673,6 @@ def __init__(self, name, cores_logical=None, cores_physical=None, isa=None):
self.cores_physical = cores_physical or cpu_info['physical']
self.isa = isa or self._detect_isa()

# The known ISAs are to be provided by the subclasses
known_isas = ()

@classmethod
def _mro(cls):
# Retain only the CPU Platforms
@@ -683,6 +692,20 @@ def _detect_isa(self):
return i
return 'cpp'

@property
def simd_reg_nbytes(self):
"""
Size in bytes of a SIMD register.
"""
return isa_registry.get(self.isa, 0)

def simd_items_per_reg(self, dtype):
"""
Number of items of type `dtype` that fit in a SIMD register.
"""
assert self.simd_reg_nbytes % np.dtype(dtype).itemsize == 0
return int(self.simd_reg_nbytes / np.dtype(dtype).itemsize)
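
Similarly, a minimal sketch of the renamed simd_reg_nbytes / simd_items_per_reg pair, assuming an AVX-512 machine (512-bit, i.e. 64-byte, SIMD registers); the exact isa_registry values are not shown in this diff.

import numpy as np

simd_reg_nbytes = 64  # assumed value for 'avx512' (512-bit registers)

def simd_items_per_reg(dtype, nbytes=simd_reg_nbytes):
    # Number of `dtype` items that fit in one SIMD register
    assert nbytes % np.dtype(dtype).itemsize == 0
    return nbytes // np.dtype(dtype).itemsize

print(simd_items_per_reg(np.float32))  # 16
print(simd_items_per_reg(np.float64))  # 8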

@cached_property
def memtotal(self):
return psutil.virtual_memory().total
@@ -758,7 +781,7 @@ def _mro(cls):
break
return retval

@cached_property
@property
def march(self):
return None

@@ -783,13 +806,17 @@ def memavail(self, deviceid=0):

class IntelDevice(Device):

@cached_property
max_mem_trans_nbytes = 64

@property
def march(self):
return ''


class NvidiaDevice(Device):

max_mem_trans_nbytes = 128

@cached_property
def march(self):
info = get_gpu_info()
@@ -802,6 +829,8 @@ def march(self):

class AmdDevice(Device):

max_mem_trans_nbytes = 256

@cached_property
def march(cls):
# TODO: this corresponds to Vega, which acts as the fallback `march`
4 changes: 2 additions & 2 deletions devito/core/cpu.py
@@ -166,7 +166,7 @@ def _specialize_iet(cls, graph, **kwargs):
sregistry = kwargs['sregistry']

# Flush denormal numbers
avoid_denormals(graph, platform=platform)
avoid_denormals(graph, **kwargs)

# Distributed-memory parallelism
mpiize(graph, **kwargs)
@@ -260,7 +260,7 @@ def _make_iet_passes_mapper(cls, **kwargs):
parizer = cls._Target.Parizer(sregistry, options, platform, compiler)

return {
'denormals': avoid_denormals,
'denormals': partial(avoid_denormals, **kwargs),
'blocking': partial(relax_incr_dimensions, **kwargs),
'parallel': parizer.make_parallel,
'openmp': parizer.make_parallel,
85 changes: 48 additions & 37 deletions devito/data/allocators.py
@@ -1,17 +1,17 @@
import abc
from functools import reduce
from operator import mul
import ctypes
from ctypes.util import find_library
import mmap
import os
import sys

import numpy as np
import ctypes
from ctypes.util import find_library

from devito.logger import logger
from devito.parameters import configuration
from devito.tools import dtype_to_ctype
from devito.tools import dtype_to_ctype, is_integer

__all__ = ['ALLOC_ALIGNED', 'ALLOC_NUMA_LOCAL', 'ALLOC_NUMA_ANY',
'ALLOC_KNL_MCDRAM', 'ALLOC_KNL_DRAM', 'ALLOC_GUARD',
@@ -24,9 +24,6 @@ class MemoryAllocator(object):

__metaclass__ = abc.ABCMeta

is_Posix = False
is_Numa = False

_attempted_init = False
lib = None

@@ -51,7 +48,7 @@ def initialize(cls):
"""
return

def alloc(self, shape, dtype):
def alloc(self, shape, dtype, padding=0):
"""
Allocate memory.

@@ -61,6 +58,9 @@ def alloc(self, shape, dtype):
Shape of the allocated array.
dtype : numpy.dtype
The data type of the raw data.
padding : int or 2-tuple of ints, optional
The number of points that are allocated before and after the data,
that is in addition to the requested shape. Defaults to 0.

Returns
-------
@@ -69,25 +69,40 @@ def alloc(self, shape, dtype):
access the data as a ctypes object. The second element is an opaque
object that is needed only for the "memfree" call.
"""
size = int(reduce(mul, shape))
datasize = int(reduce(mul, shape))
ctype = dtype_to_ctype(dtype)

c_pointer, memfree_args = self._alloc_C_libcall(size, ctype)
if c_pointer is None:
raise RuntimeError("Unable to allocate %d elements in memory", str(size))

# cast to 1D array of the specified size
ctype_1d = ctype * size
# Add padding, if any
try:
padleft, padright = padding
except TypeError:
padleft, padright = padding, padding
if not is_integer(padleft) and not is_integer(padright):
raise TypeError("padding must be an int or a 2-tuple of ints")
size = datasize + padleft + padright

padleft_pointer, memfree_args = self._alloc_C_libcall(size, ctype)
if padleft_pointer is None:
raise RuntimeError("Unable to allocate %d elements in memory" % size)

# Compute the pointer to the user data
padleft_bytes = padleft * ctypes.sizeof(ctype)
c_pointer = ctypes.c_void_p(padleft_pointer.value + padleft_bytes)

# Cast to 1D array of the specified `datasize`
ctype_1d = ctype * datasize
buf = ctypes.cast(c_pointer, ctypes.POINTER(ctype_1d)).contents
pointer = np.frombuffer(buf, dtype=dtype)
# pointer.reshape should not be used here because it may introduce a copy
# From https://docs.scipy.org/doc/numpy/reference/generated/numpy.reshape.html:
# It is not always possible to change the shape of an array without copying the
# data. If you want an error to be raised when the data is copied, you should
# assign the new shape to the shape attribute of the array:
pointer.shape = shape
array = np.frombuffer(buf, dtype=dtype)

# `array.reshape` should not be used here because it may introduce
# a copy. From `docs.scipy.org/doc/numpy/reference/generated/numpy.reshape`:
# It is not always possible to change the shape of an array without
# copying the data. If you want an error to be raised when the data
# is copied, you should assign the new shape to the shape attribute
# of the array:
array.shape = shape

return (pointer, memfree_args)
return (array, memfree_args)
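
To make the new layout explicit: padleft items are reserved before the user data and padright items after it, and the returned array is a view that starts at the user data. The sketch below mirrors that arithmetic with plain ctypes/NumPy; the shape and padding values are hypothetical and the allocator itself is not called.

import ctypes

import numpy as np

shape, dtype, padding = (4, 4), np.float32, (2, 6)   # hypothetical inputs

# Normalize `padding` the same way alloc() does
try:
    padleft, padright = padding
except TypeError:
    padleft = padright = padding

datasize = int(np.prod(shape))
size = datasize + padleft + padright                 # items actually allocated

# Stand-in for the C-level allocation: a flat buffer of `size` items
buf = (ctypes.c_float * size)()
base = ctypes.addressof(buf)

# User data begins `padleft` items past the base pointer
c_pointer = ctypes.c_void_p(base + padleft * ctypes.sizeof(ctypes.c_float))
view = ctypes.cast(c_pointer, ctypes.POINTER(ctypes.c_float * datasize)).contents

array = np.frombuffer(view, dtype=dtype)
array.shape = shape                                  # no-copy reshape

print(size, array.shape)                             # 24 (4, 4)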

@abc.abstractmethod
def _alloc_C_libcall(self, size, ctype):
@@ -124,8 +139,6 @@ class PosixAllocator(MemoryAllocator):
aligned to page boundaries.
"""

is_Posix = True

@classmethod
def initialize(cls):
handle = find_library('c')
@@ -162,7 +175,7 @@ def free(self, c_pointer):
class GuardAllocator(PosixAllocator):

"""
Memory allocator based on ``posix`` functions. The allocated memory is
Memory allocator based on `posix` functions. The allocated memory is
aligned to page boundaries. Additionally, it allocates extra memory
before and after the data, and configures it so that an SEGV is thrown
immediately if an out-of-bounds access occurs.
@@ -195,37 +208,37 @@ def _alloc_C_libcall(self, size, ctype):
if ret != 0:
return None, None

# generate pointers to the left padding, the user data, and the right pad
# Generate pointers to the left padding, the user data, and the right pad
padleft_pointer = c_pointer
c_pointer = ctypes.c_void_p(c_pointer.value + self.padding_bytes)
padright_pointer = ctypes.c_void_p(c_pointer.value + npages_user * pagesize)

# and set the permissions on the pad memory to 0 (no access)
# if these fail, don't worry about failing the entire allocation
# And set the permissions on the pad memory to 0 (no access)
# If these fail, don't worry about failing the entire allocation
c_padsize = ctypes.c_ulong(self.padding_bytes)
if self.lib.mprotect(padleft_pointer, c_padsize, ctypes.c_int(0)):
logger.warning("couldn't protect memory")
if self.lib.mprotect(padright_pointer, c_padsize, ctypes.c_int(0)):
logger.warning("couldn't protect memory")

# if there is a multiple of 4 bytes left, use the code below to poison
# If there is a multiple of 4 bytes left, use the code below to poison
# the memory
if nbytes_user % 4 == 0:
poison_size = npages_user*pagesize - nbytes_user
intp_type = ctypes.POINTER(ctypes.c_int)
poison_ptr = ctypes.cast(ctypes.c_void_p(c_pointer.value + nbytes_user),
intp_type)

# for both float32 and float64, a sequence of -100 int32s represents NaNs,
# at least on little-endian architectures. It shouldn't matter what we
# put in there, anyway
# For both float32 and float64, a sequence of -100 int32s
# represents NaNs, at least on little-endian architectures;
# it shouldn't matter what we put in there, anyway
for i in range(poison_size // 4):
poison_ptr[i] = -100

return c_pointer, (padleft_pointer, c_bytesize)

def free(self, c_pointer, total_size):
# unprotect it, since free() accesses it, I think...
# Unprotect it, since free() accesses it, I think...
self.lib.mprotect(c_pointer, total_size,
ctypes.c_int(mmap.PROT_READ | mmap.PROT_WRITE))
self.lib.free(c_pointer)
@@ -247,8 +260,6 @@ class NumaAllocator(MemoryAllocator):
("allocate on any NUMA node with sufficient free memory") are accepted.
"""

is_Numa = True

@classmethod
def initialize(cls):
handle = find_library('numa')
@@ -356,7 +367,7 @@ class ExternalAllocator(MemoryAllocator):
def __init__(self, numpy_array):
self.numpy_array = numpy_array

def alloc(self, shape, dtype):
def alloc(self, shape, dtype, padding=0):
assert shape == self.numpy_array.shape, \
"Provided array has shape %s. Expected %s" %\
(str(self.numpy_array.shape), str(shape))
Expand Down Expand Up @@ -429,4 +440,4 @@ def default_allocator(name=None):
infer_knl_mode() == 'flat'):
return ALLOC_KNL_MCDRAM
else:
return ALLOC_ALIGNED
return custom_allocators.get('default', ALLOC_ALIGNED)
14 changes: 9 additions & 5 deletions devito/data/data.py
@@ -26,12 +26,16 @@ class Data(np.ndarray):
decomposition : tuple of Decomposition, optional
The data decomposition, for each dimension.
modulo : tuple of bool, optional
If the i-th entry is True, then the i-th array dimension uses modulo indexing.
If the i-th entry is True, then the i-th array dimension uses modulo
indexing.
allocator : MemoryAllocator, optional
Used to allocate memory. Defaults to `ALLOC_ALIGNED`.
distributor : Distributor, optional
The distributor from which the original decomposition was produced. Note that
the decomposition Parameter above may be different to distributor.decomposition.
The distributor from which the original decomposition was produced.
Note that `decomposition` may differ from `distributor.decomposition`.
padding : int or 2-tuple of ints, optional
The number of points that are allocated before and after the data,
that is in addition to the requested shape. Defaults to 0.

Notes
-----
@@ -45,9 +49,9 @@ class Data(np.ndarray):
"""

def __new__(cls, shape, dtype, decomposition=None, modulo=None,
allocator=ALLOC_ALIGNED, distributor=None):
allocator=ALLOC_ALIGNED, distributor=None, padding=0):
assert len(shape) == len(modulo)
ndarray, memfree_args = allocator.alloc(shape, dtype)
ndarray, memfree_args = allocator.alloc(shape, dtype, padding=padding)
obj = ndarray.view(cls)
obj._allocator = allocator
obj._memfree_args = memfree_args
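
Finally, a hedged end-to-end usage sketch: Data simply forwards the new keyword to the allocator, so the effect is easiest to see by calling the default allocator directly. This assumes ALLOC_ALIGNED is importable from devito.data.allocators (the file touched above), a POSIX system, and illustrative padding values.

import numpy as np

from devito.data.allocators import ALLOC_ALIGNED

# Ask for a 10x10 float32 block with 2 extra points before the data and
# 4 after it; the padding is allocated but not part of the visible shape
array, memfree_args = ALLOC_ALIGNED.alloc((10, 10), np.float32, padding=(2, 4))

print(array.shape)   # (10, 10)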