Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

https://docs.rapids.ai/deployment/nightly/platforms/databricks/ #361

Open
Tracked by #354
jacobtomlinson opened this issue Apr 10, 2024 · 1 comment
Open
Tracked by #354
Assignees

Comments

@jacobtomlinson
Copy link
Member

jacobtomlinson commented Apr 10, 2024

When following the multi-node instructions I'm seeing an error.

Reproducer

  1. Create an init script with the following content
#!/bin/bash
# Databricks cluster init script: install nightly RAPIDS wheels into the
# cluster's Python environment, then start a Dask cluster with CUDA workers.
# Abort immediately if any command fails (a partial install would leave the
# cluster in a broken state).
set -e

# The Databricks Python directory isn't on the path in
# databricksruntime/gpu-tensorflow:cuda11.8 for some reason
export PATH="/databricks/python/bin:$PATH"

# Install RAPIDS (cudf & dask-cudf) and dask-databricks
# The extra index serves RAPIDS nightly wheels; the ">=24.4.0a0,<=24.4" pins
# allow a 24.4 pre-release while capping at the 24.4 final release.
# NOTE(review): every RAPIDS wheel below carries the "-cu12" suffix (built
# against CUDA 12), but the reproduction steps use cuda11.8 container images —
# likely why CuPy later fails to load libnvrtc.so.12; confirm the wheel CUDA
# suffix matches the container's CUDA toolkit (use "-cu11" wheels or a CUDA 12
# image).
/databricks/python/bin/pip install --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple \
      "cudf-cu12>=24.4.0a0,<=24.4" "dask-cudf-cu12>=24.4.0a0,<=24.4" \
      "cuml-cu12>=24.4.0a0,<=24.4" "cugraph-cu12>=24.4.0a0,<=24.4" \
      "cuspatial-cu12>=24.4.0a0,<=24.4" "cuproj-cu12>=24.4.0a0,<=24.4" \
      "cuxfilter-cu12>=24.4.0a0,<=24.4" "cucim-cu12>=24.4.0a0,<=24.4" \
      "pylibraft-cu12>=24.4.0a0,<=24.4" "raft-dask-cu12>=24.4.0a0,<=24.4" \
      "dask-cuda>=24.4.0a0,<=24.4" \
      dask[complete] \
      dask-databricks

# Start the Dask cluster with CUDA workers
dask databricks run --cuda
  2. Choose the 14.2 (Scala 2.12, Spark 3.5.0) runtime

  3. Choose the databricksruntime/gpu-pytorch:cuda11.8 container image

  4. Run the example task cudf code

RuntimeError: CuPy failed to load libnvrtc.so.12: OSError: libnvrtc.so.12: cannot open shared object file: No such file or directory
---------------------------------------------------------------------------
OSError                                   Traceback (most recent call last)
File cupy_backends/cuda/_softlink.pyx:25, in cupy_backends.cuda._softlink.SoftLink.__init__()

File /usr/lib/python3.10/ctypes/__init__.py:374, in CDLL.__init__(self, name, mode, handle, use_errno, use_last_error, winmode)
    373 if handle is None:
--> 374     self._handle = _dlopen(self._name, mode)
    375 else:

OSError: libnvrtc.so.12: cannot open shared object file: No such file or directory

The above exception was the direct cause of the following exception:

RuntimeError                              Traceback (most recent call last)
File <command-2671937974741064>, line 6
      2 import dask
      5 df = dask.datasets.timeseries().map_partitions(cudf.from_pandas)
----> 6 df.x.mean().compute()

File /databricks/python/lib/python3.10/site-packages/nvtx/nvtx.py:116, in annotate.__call__.<locals>.inner(*args, **kwargs)
    113 @wraps(func)
    114 def inner(*args, **kwargs):
    115     libnvtx_push_range(self.attributes, self.domain.handle)
--> 116     result = func(*args, **kwargs)
    117     libnvtx_pop_range(self.domain.handle)
    118     return result

File /databricks/python/lib/python3.10/site-packages/dask_cudf/core.py:367, in Series.mean(self, split_every)
    365 @_dask_cudf_nvtx_annotate
    366 def mean(self, split_every=False):
--> 367     sum = self.sum(split_every=split_every)
    368     n = self.count(split_every=split_every)
    369     return sum / n

File /databricks/python/lib/python3.10/site-packages/dask/dataframe/core.py:256, in _dummy_numpy_dispatcher.<locals>.decorator.<locals>.wrapper(*args, **kwargs)
    253         if kwargs.pop(name, None) is not None:
    254             raise ValueError(f"the '{name}' keyword is not supported")
--> 256 return func(*args, **kwargs)

File /databricks/python/lib/python3.10/site-packages/dask/dataframe/core.py:2369, in _Frame.sum(self, axis, skipna, split_every, dtype, out, min_count, numeric_only)
   2357 @_dummy_numpy_dispatcher("dtype", "out", deprecated=True)
   2358 @derived_from(pd.DataFrame)
   2359 def sum(
   (...)
   2367     numeric_only=None,
   2368 ):
-> 2369     result = self._reduction_agg(
   2370         "sum",
   2371         axis=axis,
   2372         skipna=skipna,
   2373         split_every=split_every,
   2374         out=out,
   2375         numeric_only=numeric_only,
   2376     )
   2377     if min_count:
   2378         cond = self.notnull().sum(axis=axis) >= min_count

File /databricks/python/lib/python3.10/site-packages/dask/dataframe/core.py:2286, in _Frame._reduction_agg(self, name, axis, skipna, split_every, out, numeric_only, none_is_zero)
   2274 def _reduction_agg(
   2275     self,
   2276     name,
   (...)
   2282     none_is_zero=True,
   2283 ):
   2284     axis = self._validate_axis(axis, none_is_zero=none_is_zero)
-> 2286     if has_keyword(getattr(self._meta_nonempty, name), "numeric_only"):
   2287         numeric_only_kwargs = {"numeric_only": numeric_only}
   2288     else:

File /databricks/python/lib/python3.10/site-packages/dask/dataframe/core.py:635, in _Frame._meta_nonempty(self)
    632 @property
    633 def _meta_nonempty(self):
    634     """A non-empty version of `_meta` with fake data."""
--> 635     return meta_nonempty(self._meta)

File /databricks/python/lib/python3.10/site-packages/dask/utils.py:767, in Dispatch.__call__(self, arg, *args, **kwargs)
    763 """
    764 Call the corresponding method based on type of argument.
    765 """
    766 meth = self.dispatch(type(arg))
--> 767 return meth(arg, *args, **kwargs)

File /databricks/python/lib/python3.10/site-packages/nvtx/nvtx.py:116, in annotate.__call__.<locals>.inner(*args, **kwargs)
    113 @wraps(func)
    114 def inner(*args, **kwargs):
    115     libnvtx_push_range(self.attributes, self.domain.handle)
--> 116     result = func(*args, **kwargs)
    117     libnvtx_pop_range(self.domain.handle)
    118     return result

File /databricks/python/lib/python3.10/site-packages/dask_cudf/backends.py:153, in _nonempty_series(s, idx)
    151 if idx is None:
    152     idx = _nonempty_index(s.index)
--> 153 data = _get_non_empty_data(s._column)
    155 return cudf.Series(data, name=s.name, index=idx)

File /databricks/python/lib/python3.10/site-packages/nvtx/nvtx.py:116, in annotate.__call__.<locals>.inner(*args, **kwargs)
    113 @wraps(func)
    114 def inner(*args, **kwargs):
    115     libnvtx_push_range(self.attributes, self.domain.handle)
--> 116     result = func(*args, **kwargs)
    117     libnvtx_pop_range(self.domain.handle)
    118     return result

File /databricks/python/lib/python3.10/site-packages/dask_cudf/backends.py:139, in _get_non_empty_data(s)
    136 else:
    137     if pd.api.types.is_numeric_dtype(s.dtype):
    138         data = cudf.core.column.as_column(
--> 139             cp.arange(start=0, stop=2, dtype=s.dtype)
    140         )
    141     else:
    142         data = cudf.core.column.as_column(
    143             cp.arange(start=0, stop=2, dtype="int64")
    144         ).astype(s.dtype)

File /databricks/python/lib/python3.10/site-packages/cupy/_creation/ranges.py:60, in arange(start, stop, step, dtype)
     58 ret = cupy.empty((size,), dtype=dtype)
     59 typ = numpy.dtype(dtype).type
---> 60 _arange_ufunc(typ(start), typ(step), ret, dtype=dtype)
     61 return ret

File cupy/_core/_kernel.pyx:1375, in cupy._core._kernel.ufunc.__call__()

File cupy/_core/_kernel.pyx:1402, in cupy._core._kernel.ufunc._get_ufunc_kernel()

File cupy/_core/_kernel.pyx:1082, in cupy._core._kernel._get_ufunc_kernel()

File cupy/_core/_kernel.pyx:94, in cupy._core._kernel._get_simple_elementwise_kernel()

File cupy/_core/_kernel.pyx:82, in cupy._core._kernel._get_simple_elementwise_kernel_from_code()

File cupy/_core/core.pyx:2254, in cupy._core.core.compile_with_cache()

File /databricks/python/lib/python3.10/site-packages/cupy/cuda/compiler.py:484, in _compile_module_with_cache(source, options, arch, cache_dir, extra_source, backend, enable_cooperative_groups, name_expressions, log_stream, jitify)
    480     return _compile_with_cache_hip(
    481         source, options, arch, cache_dir, extra_source, backend,
    482         name_expressions, log_stream, cache_in_memory)
    483 else:
--> 484     return _compile_with_cache_cuda(
    485         source, options, arch, cache_dir, extra_source, backend,
    486         enable_cooperative_groups, name_expressions, log_stream,
    487         cache_in_memory, jitify)

File /databricks/python/lib/python3.10/site-packages/cupy/cuda/compiler.py:499, in _compile_with_cache_cuda(source, options, arch, cache_dir, extra_source, backend, enable_cooperative_groups, name_expressions, log_stream, cache_in_memory, jitify)
    497     cache_dir = get_cache_dir()
    498 if arch is None:
--> 499     arch = _get_arch()
    501 options += ('-ftz=true',)
    503 if enable_cooperative_groups:
    504     # `cooperative_groups` requires relocatable device code.

File cupy/_util.pyx:64, in cupy._util.memoize.decorator.ret()

File /databricks/python/lib/python3.10/site-packages/cupy/cuda/compiler.py:148, in _get_arch()
    144 @_util.memoize(for_each_device=True)
    145 def _get_arch():
    146     # See Supported Compile Options section of NVRTC User Guide for
    147     # the maximum value allowed for `--gpu-architecture`.
--> 148     nvrtc_max_compute_capability = _get_max_compute_capability()
    150     arch = device.Device().compute_capability
    151     if arch in _tegra_archs:

File cupy/_util.pyx:64, in cupy._util.memoize.decorator.ret()

File /databricks/python/lib/python3.10/site-packages/cupy/cuda/compiler.py:126, in _get_max_compute_capability()
    124 @_util.memoize()
    125 def _get_max_compute_capability():
--> 126     major, minor = _get_nvrtc_version()
    127     if major < 11:
    128         # CUDA 10.2
    129         nvrtc_max_compute_capability = '75'

File /databricks/python/lib/python3.10/site-packages/cupy/cuda/compiler.py:115, in _get_nvrtc_version()
    113 global _nvrtc_version
    114 if _nvrtc_version is None:
--> 115     _nvrtc_version = nvrtc.getVersion()
    117 return _nvrtc_version

File cupy_backends/cuda/libs/nvrtc.pyx:56, in cupy_backends.cuda.libs.nvrtc.getVersion()

File cupy_backends/cuda/libs/nvrtc.pyx:57, in cupy_backends.cuda.libs.nvrtc.getVersion()

File cupy_backends/cuda/libs/_cnvrtc.pxi:72, in cupy_backends.cuda.libs.nvrtc.initialize()

File cupy_backends/cuda/libs/_cnvrtc.pxi:76, in cupy_backends.cuda.libs.nvrtc._initialize()

File cupy_backends/cuda/libs/_cnvrtc.pxi:143, in cupy_backends.cuda.libs.nvrtc._get_softlink()

File cupy_backends/cuda/_softlink.pyx:32, in cupy_backends.cuda._softlink.SoftLink.__init__()

RuntimeError: CuPy failed to load libnvrtc.so.12: OSError: libnvrtc.so.12: cannot open shared object file: No such file or directory
@jacobtomlinson
Copy link
Member Author

Same thing happens with the databricksruntime/gpu-tensorflow:cuda11.8 image.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant