Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

arch: support rocm for gpu info #2261

Merged
merged 2 commits into from
Nov 10, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 2 additions & 4 deletions .github/workflows/pytest-core-nompi.yml
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ jobs:
- name: pytest-ubuntu-py39-gcc9-omp
python-version: '3.9'
os: ubuntu-20.04
arch: "gcc-9"
arch: "custom"
language: "openmp"
sympy: "1.9"

Expand Down Expand Up @@ -140,7 +140,7 @@ jobs:
id: set-run

- name: Install ${{ matrix.arch }} compiler
if: "runner.os == 'linux' && !contains(matrix.name, 'docker')"
if: "runner.os == 'linux' && !contains(matrix.name, 'docker') && matrix.arch !='custom' "
run : |
sudo apt-get install -y ${{ matrix.arch }}

Expand All @@ -166,8 +166,6 @@ jobs:

- name: Test with pytest
run: |
${{ env.RUN_CMD }} ${{ matrix.arch }} --version
${{ env.RUN_CMD }} python3 --version
${{ env.RUN_CMD }} pytest -k "${{ matrix.test-set }}" -m "not parallel" --cov --cov-config=.coveragerc --cov-report=xml ${{ env.TESTS }}

- name: Upload coverage to Codecov
Expand Down
72 changes: 72 additions & 0 deletions devito/arch/archinfo.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import re
import os
import sys
import json

from devito.logger import warning
from devito.tools import as_tuple, all_equal, memoized_func
Expand Down Expand Up @@ -249,6 +250,77 @@ def cbk(deviceid=0):
except OSError:
pass

# *** Second try: `rocm-smi`, clearly only works with AMD cards
try:
gpu_infos = {}

# Base gpu info
info_cmd = ['rocm-smi', '--showproductname']
proc = Popen(info_cmd, stdout=PIPE, stderr=DEVNULL)
raw_info = str(proc.stdout.read())

lines = raw_info.replace('\\n', '\n').replace('b\'', '').replace('\\t', '')
lines = lines.splitlines()

for line in lines:
if 'GPU' in line:
# Product
pattern = r'GPU\[(\d+)\].*?Card series:\s*(.*?)\s*$'
match1 = re.match(pattern, line)

if match1:
gid = match1.group(1)
gpu_infos.setdefault(gid, dict())
gpu_infos[gid]['physicalid'] = gid
gpu_infos[gid]['product'] = match1.group(2)

# Model
pattern = r'GPU\[(\d+)\].*?Card model:\s*(.*?)\s*$'
match2 = re.match(pattern, line)

if match2:
gid = match2.group(1)
gpu_infos.setdefault(gid, dict())
gpu_infos[gid]['physicalid'] = match2.group(1)
gpu_infos[gid]['model'] = match2.group(2)

gpu_info = homogenise_gpus(list(gpu_infos.values()))

# Also attach callbacks to retrieve instantaneous memory info
info_cmd = ['rocm-smi', '--showmeminfo', 'vram', '--json']
proc = Popen(info_cmd, stdout=PIPE, stderr=DEVNULL)
raw_info = str(proc.stdout.read())
lines = raw_info.replace('\\n', '').replace('b\'', '').replace('\'', '')
info = json.loads(lines)

for i in ['total', 'free', 'used']:
def make_cbk(i):
def cbk(deviceid=0):
try:
# Should only contain Used and total
assert len(info['card%s' % deviceid]) == 2
used = [int(v) for k, v in info['card%s' % deviceid].items()
if 'Used' in k][0]
total = [int(v) for k, v in info['card%s' % deviceid].items()
if 'Used' not in k][0]
free = total - used
return {'total': total, 'free': free, 'used': used}[i]
except:
# We shouldn't really end up here, unless nvidia-smi changes
# the output format (though we still have tests in place that
# will catch this)
return None

return cbk

gpu_info['mem.%s' % i] = make_cbk(i)

gpu_infos['architecture'] = 'AMD'
return gpu_info

except OSError:
pass

# *** Third try: `lshw`
try:
info_cmd = ['lshw', '-C', 'video']
Expand Down
23 changes: 17 additions & 6 deletions devito/arch/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -862,27 +862,38 @@ def __new__(cls, *args, **kwargs):

obj = super().__new__(cls)
# Keep base to initialize accordingly
obj._base = _base
obj._cpp = _base._cpp
obj._base = kwargs.pop('base', _base)
obj._cpp = obj._base._cpp

return obj

def __init_finalize__(self, **kwargs):
    """
    Finalize initialization through the base compiler, then extend the
    compilation and linking flags with the ``CFLAGS`` and ``LDFLAGS``
    environment variables.

    Parameters
    ----------
    **kwargs
        Forwarded untouched to the base compiler's ``__init_finalize__``.

    Notes
    -----
    The user-provided flags are appended exactly once and in the order
    given. An unset or blank variable leaves the corresponding flags
    untouched.
    """
    self._base.__init_finalize__(self, **kwargs)

    # `split()` without a separator collapses runs of whitespace, so neither
    # an empty variable nor doubled spaces produce empty-string flags

    # Update cflags
    extrac = environ.get('CFLAGS', '').split()
    if extrac:
        self.cflags = self.cflags + extrac

    # Update ldflags
    extrald = environ.get('LDFLAGS', '').split()
    if extrald:
        self.ldflags = self.ldflags + extrald

def __lookup_cmds__(self):
    """
    Resolve the compiler commands through the base compiler, then let the
    ``CC``, ``CXX``, ``MPICC`` and ``MPICXX`` environment variables
    override the corresponding attributes when they are set.
    """
    self._base.__lookup_cmds__(self)
    # TODO: check for conflicts, for example using the nvhpc module file
    # will set CXX to nvc++ breaking the cuda backend
    for cmd in ('CC', 'CXX', 'MPICC', 'MPICXX'):
        # Fall back to whatever the base compiler picked
        setattr(self, cmd, environ.get(cmd, getattr(self, cmd)))

def __new_with__(self, **kwargs):
    """
    Delegate to the parent's ``__new_with__``, passing this object's
    ``_base`` as the ``base`` keyword alongside the caller's ``kwargs``.
    """
    rebuilt = super().__new_with__(base=self._base, **kwargs)
    return rebuilt


compiler_registry = {
'custom': CustomCompiler,
Expand Down
2 changes: 2 additions & 0 deletions devito/mpi/distributed.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
from devito.types.utils import DimensionTuple


__all__ = ['CustomTopology']

# Do not prematurely initialize MPI
# This allows launching a Devito program from within another Python program
# that has *already* initialized MPI
Expand Down
17 changes: 15 additions & 2 deletions tests/test_gpu_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,26 @@ class TestGPUInfo(object):

def test_get_gpu_info(self):
    """
    Check that `get_gpu_info` reports one of the known architectures;
    xfail when no 'architecture' entry is reported at all.
    """
    info = get_gpu_info()
    known = ['nvidia', 'tesla', 'geforce', 'quadro', 'amd', 'unspecified']
    try:
        # Only the lookup can raise KeyError, so only the lookup is guarded
        architecture = info['architecture']
    except KeyError:
        # There might be more than one GPU, but for now we don't care
        # as we're not really exploiting this info yet...
        pytest.xfail("Unsupported platform for get_gpu_info")
    assert architecture.lower() in known

def custom_compiler(self):
    """
    Build and run a trivial Operator with the 'custom' compiler selected,
    checking both jit-compilation and the computed values.
    """
    # NOTE(review): without a `test_` prefix this method is presumably not
    # collected by pytest's default discovery -- confirm whether intended
    grid = Grid(shape=(4, 4))

    u = TimeFunction(name='u', grid=grid)

    eqn = Eq(u.forward, u + 1)

    with switchconfig(compiler='custom'):
        # Keep the Operator itself: calling it here would bind `op` to the
        # result of a run rather than to the Operator
        op = Operator(eqn)
        # Check jit-compilation and correct execution
        op.apply(time_M=10)
        assert np.all(u.data[1] == 11)


class TestCodeGeneration(object):
Expand Down
Loading