Skip to content

Commit

Permalink
Add support to ArrowDataSource in SourceInfo (rapidsai#16050)
Browse files Browse the repository at this point in the history
ArrowDataSources weren't previously supported in SourceInfo.
(since we didn't need it for Avro).

Adding it now so we can pass tests for orc reader and co. 
(even though ArrowDataSource may potentially be removed in the future)

Authors:
  - Thomas Li (https://github.com/lithomas1)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: rapidsai#16050
  • Loading branch information
lithomas1 authored Jun 25, 2024
1 parent bc08662 commit e4bd9e8
Show file tree
Hide file tree
Showing 13 changed files with 53 additions and 16 deletions.
2 changes: 1 addition & 1 deletion python/cudf/cudf/_lib/csv.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ from libcpp.utility cimport move
from libcpp.vector cimport vector

cimport cudf._lib.pylibcudf.libcudf.types as libcudf_types
from cudf._lib.io.datasource cimport Datasource, NativeFileDatasource
from cudf._lib.pylibcudf.io.datasource cimport Datasource, NativeFileDatasource
from cudf._lib.pylibcudf.libcudf.types cimport data_type
from cudf._lib.types cimport dtype_to_data_type

Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/_lib/io/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# =============================================================================
# Copyright (c) 2022-2023, NVIDIA CORPORATION.
# Copyright (c) 2022-2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
Expand All @@ -12,7 +12,7 @@
# the License.
# =============================================================================

set(cython_sources datasource.pyx utils.pyx)
set(cython_sources utils.pyx)
set(linked_libraries cudf::cudf)
rapids_cython_create_modules(
CXX
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/_lib/io/utils.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ from libcpp.utility cimport move
from libcpp.vector cimport vector

from cudf._lib.column cimport Column
from cudf._lib.io.datasource cimport Datasource
from cudf._lib.pylibcudf.io.datasource cimport Datasource
from cudf._lib.pylibcudf.libcudf.io.data_sink cimport data_sink
from cudf._lib.pylibcudf.libcudf.io.datasource cimport datasource
from cudf._lib.pylibcudf.libcudf.io.types cimport (
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/_lib/orc.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,12 @@ except ImportError:

cimport cudf._lib.pylibcudf.libcudf.io.types as cudf_io_types
from cudf._lib.column cimport Column
from cudf._lib.io.datasource cimport NativeFileDatasource
from cudf._lib.io.utils cimport (
make_sink_info,
make_source_info,
update_column_struct_field_names,
)
from cudf._lib.pylibcudf.io.datasource cimport NativeFileDatasource
from cudf._lib.pylibcudf.libcudf.io.data_sink cimport data_sink
from cudf._lib.pylibcudf.libcudf.io.orc cimport (
chunked_orc_writer_options,
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/_lib/parquet.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -37,12 +37,12 @@ cimport cudf._lib.pylibcudf.libcudf.io.types as cudf_io_types
cimport cudf._lib.pylibcudf.libcudf.types as cudf_types
from cudf._lib.column cimport Column
from cudf._lib.expressions cimport Expression
from cudf._lib.io.datasource cimport NativeFileDatasource
from cudf._lib.io.utils cimport (
make_sinks_info,
make_source_info,
update_struct_field_names,
)
from cudf._lib.pylibcudf.io.datasource cimport NativeFileDatasource
from cudf._lib.pylibcudf.libcudf.expressions cimport expression
from cudf._lib.pylibcudf.libcudf.io.parquet cimport (
chunked_parquet_reader as cpp_chunked_parquet_reader,
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
# the License.
# =============================================================================

set(cython_sources avro.pyx types.pyx)
set(cython_sources avro.pyx datasource.pyx types.pyx)

set(linked_libraries cudf::cudf)
rapids_cython_create_modules(
Expand All @@ -21,5 +21,5 @@ rapids_cython_create_modules(
LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_io_ ASSOCIATED_TARGETS cudf
)

set(targets_using_arrow_headers pylibcudf_io_avro pylibcudf_io_types)
set(targets_using_arrow_headers pylibcudf_io_avro pylibcudf_io_datasource pylibcudf_io_types)
link_to_pyarrow_headers("${targets_using_arrow_headers}")
2 changes: 1 addition & 1 deletion python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from . cimport avro, types
from . cimport avro, datasource, types
from .types cimport SourceInfo, TableWithMetadata
2 changes: 1 addition & 1 deletion python/cudf/cudf/_lib/pylibcudf/io/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from . import avro, types
from . import avro, datasource, types
from .types import SourceInfo, TableWithMetadata
File renamed without changes.
File renamed without changes.
23 changes: 19 additions & 4 deletions python/cudf/cudf/_lib/pylibcudf/io/types.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ from libcpp.string cimport string
from libcpp.utility cimport move
from libcpp.vector cimport vector

from cudf._lib.pylibcudf.io.datasource cimport Datasource
from cudf._lib.pylibcudf.libcudf.io.datasource cimport datasource
from cudf._lib.pylibcudf.libcudf.io.types cimport (
host_buffer,
source_info,
Expand Down Expand Up @@ -56,9 +58,8 @@ cdef class SourceInfo:
Parameters
----------
sources : List[Union[str, os.PathLike, bytes, io.BytesIO]]
A homogeneous list of sources (this can be a string filename,
an os.PathLike, bytes, or an io.BytesIO) to read from.
sources : List[Union[str, os.PathLike, bytes, io.BytesIO, DataSource]]
A homogeneous list of sources to read from.
Mixing different types of sources will raise a `ValueError`.
"""
Expand All @@ -68,6 +69,7 @@ cdef class SourceInfo:
raise ValueError("Need to pass at least one source")

cdef vector[string] c_files
cdef vector[datasource*] c_datasources

if isinstance(sources[0], (os.PathLike, str)):
c_files.reserve(len(sources))
Expand All @@ -84,6 +86,13 @@ cdef class SourceInfo:

self.c_obj = move(source_info(c_files))
return
elif isinstance(sources[0], Datasource):
for csrc in sources:
if not isinstance(csrc, Datasource):
raise ValueError("All sources must be of the same type!")
c_datasources.push_back((<Datasource>csrc).get_datasource())
self.c_obj = move(source_info(c_datasources))
return

# TODO: host_buffer is deprecated API, use host_span instead
cdef vector[host_buffer] c_host_buffers
Expand All @@ -106,5 +115,11 @@ cdef class SourceInfo:
c_buffer = bio.getbuffer() # check if empty?
c_host_buffers.push_back(host_buffer(<char*>&c_buffer[0],
c_buffer.shape[0]))
else:
raise ValueError("Sources must be a list of str/paths, "
"bytes, io.BytesIO, or a Datasource")

if empty_buffer is True:
c_host_buffers.push_back(host_buffer(<char*>NULL, 0))

self.c_obj = source_info(c_host_buffers)
self.c_obj = move(source_info(c_host_buffers))
24 changes: 23 additions & 1 deletion python/cudf/cudf/pylibcudf_tests/test_source_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,21 @@

import io

import pyarrow as pa
import pytest

import cudf._lib.pylibcudf as plc
from cudf._lib.pylibcudf.io.datasource import NativeFileDatasource


@pytest.mark.parametrize(
"source", ["a.txt", b"hello world", io.BytesIO(b"hello world")]
"source",
[
"a.txt",
b"hello world",
io.BytesIO(b"hello world"),
NativeFileDatasource(pa.PythonFile(io.BytesIO(), mode="r")),
],
)
def test_source_info_ctor(source, tmp_path):
if isinstance(source, str):
Expand All @@ -28,6 +36,10 @@ def test_source_info_ctor(source, tmp_path):
["a.txt", "a.txt"],
[b"hello world", b"hello there"],
[io.BytesIO(b"hello world"), io.BytesIO(b"hello there")],
[
NativeFileDatasource(pa.PythonFile(io.BytesIO(), mode="r")),
NativeFileDatasource(pa.PythonFile(io.BytesIO(), mode="r")),
],
],
)
def test_source_info_ctor_multiple(sources, tmp_path):
Expand All @@ -54,6 +66,11 @@ def test_source_info_ctor_multiple(sources, tmp_path):
io.BytesIO(b"hello there"),
b"hello world",
],
[
NativeFileDatasource(pa.PythonFile(io.BytesIO(), mode="r")),
"awef.txt",
b"hello world",
],
],
)
def test_source_info_ctor_mixing_invalid(sources, tmp_path):
Expand All @@ -67,3 +84,8 @@ def test_source_info_ctor_mixing_invalid(sources, tmp_path):
sources[i] = str(file)
with pytest.raises(ValueError):
plc.io.SourceInfo(sources)


def test_source_info_invalid():
with pytest.raises(ValueError):
plc.io.SourceInfo([123])
2 changes: 1 addition & 1 deletion python/cudf_kafka/cudf_kafka/_lib/kafka.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ from libcpp.memory cimport unique_ptr
from libcpp.string cimport string
from libcpp.vector cimport vector

from cudf._lib.io.datasource cimport Datasource
from cudf._lib.pylibcudf.io.datasource cimport Datasource
from cudf._lib.pylibcudf.libcudf.io.datasource cimport datasource


Expand Down

0 comments on commit e4bd9e8

Please sign in to comment.