Skip to content

Commit

Permalink
fixed_point + cudf::binary_operation API Changes (#7435)
Browse files Browse the repository at this point in the history
This resolves #7442

Recently while working with @razajafri on `fixed_point` binary ops, it became clear that the `cudf::binary_operation` is breaking the "easy to use, **hard to misuse**" # 1 design guideline. I knew about this but I slotted it as technical debt to be cleaned up later. Long story short, after discussions with both @razajafri, @jrhemstad and comments on the #7442, we will implement the following:

* [x] For `fixed_point` + `cudf::binary_operation` + `DIV` always **use** the `cudf::data_type output_type` parameter
* [x] ~~For `fixed_point` + `cudf::binary_operation` + `TRUE_DIV`, require that the columns/scalars provided as arguments (`lhs` and `rhs`) will result in the specified `data_type`/`scale`~~
* [x] Provide a convenience function (something like `binary_operation_fixed_point_scale()`) that will compute the "expected" scale given two input columns/scalars and a `binary_operator`
* [x] Remove `TRUE_DIV`
* [x] Add unit tests for different output data_types
* [x] Update Python/Cython 

**This will be a breaking change for all `fixed_point` + `cudf::binary_operation`.**

Authors:
  - Conor Hoekstra (@codereport)

Approvers:
  - Keith Kraus (@kkraus14)
  - Mike Wilson (@hyperbolic2346)

URL: #7435
  • Loading branch information
codereport authored Mar 12, 2021
1 parent 5c4fa28 commit 04f9021
Show file tree
Hide file tree
Showing 12 changed files with 420 additions and 293 deletions.
24 changes: 24 additions & 0 deletions cpp/include/cudf/binaryop.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -178,5 +178,29 @@ std::unique_ptr<column> binary_operation(
data_type output_type,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Computes the `scale` for a `fixed_point` number based on given binary operator `op`
*
* @param op The binary_operator used for two `fixed_point` numbers
* @param left_scale Scale of left `fixed_point` number
* @param right_scale Scale of right `fixed_point` number
* @return The resulting `scale` of the computed `fixed_point` number
*/
int32_t binary_operation_fixed_point_scale(binary_operator op,
int32_t left_scale,
int32_t right_scale);

/**
* @brief Computes the `data_type` for a `fixed_point` number based on given binary operator `op`
*
* @param op The binary_operator used for two `fixed_point` numbers
* @param lhs `cudf::data_type` of left `fixed_point` number
* @param rhs `cudf::data_type` of right `fixed_point` number
* @return The resulting `cudf::data_type` of the computed `fixed_point` number
*/
cudf::data_type binary_operation_fixed_point_output_type(binary_operator op,
cudf::data_type const& lhs,
cudf::data_type const& rhs);

/** @} */ // end of group
} // namespace cudf
260 changes: 87 additions & 173 deletions cpp/src/binaryop/binaryop.cpp

Large diffs are not rendered by default.

6 changes: 4 additions & 2 deletions cpp/src/unary/cast_ops.cu
Original file line number Diff line number Diff line change
Expand Up @@ -178,11 +178,13 @@ std::unique_ptr<column> rescale(column_view input,

if (input.type().scale() > scale) {
auto const scalar = make_fixed_point_scalar<T>(0, scale_type{scale});
return detail::binary_operation(input, *scalar, binary_operator::ADD, {}, stream, mr);
auto const type = cudf::data_type{cudf::type_to_id<T>(), scale};
return detail::binary_operation(input, *scalar, binary_operator::ADD, type, stream, mr);
} else {
auto const diff = input.type().scale() - scale;
auto const scalar = make_fixed_point_scalar<T>(std::pow(10, -diff), scale_type{diff});
return detail::binary_operation(input, *scalar, binary_operator::DIV, {}, stream, mr);
auto const type = cudf::data_type{cudf::type_to_id<T>(), scale};
return detail::binary_operation(input, *scalar, binary_operator::DIV, type, stream, mr);
}
};

Expand Down
312 changes: 239 additions & 73 deletions cpp/tests/binaryop/binop-integration-test.cpp

Large diffs are not rendered by default.

18 changes: 12 additions & 6 deletions cpp/tests/fixed_point/fixed_point_tests.cu
Original file line number Diff line number Diff line change
Expand Up @@ -596,8 +596,9 @@ TEST_F(FixedPointTest, PositiveScaleWithValuesOutsideUnderlyingType32)
auto const expected1 = fp_wrapper{{150000000}, scale_type{6}};
auto const expected2 = fp_wrapper{{50000000}, scale_type{6}};

auto const result1 = cudf::binary_operation(a, b, cudf::binary_operator::ADD, {});
auto const result2 = cudf::binary_operation(a, c, cudf::binary_operator::DIV, {});
auto const type = cudf::data_type{cudf::type_id::DECIMAL32, 6};
auto const result1 = cudf::binary_operation(a, b, cudf::binary_operator::ADD, type);
auto const result2 = cudf::binary_operation(a, c, cudf::binary_operator::DIV, type);

CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected1, result1->view());
CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected2, result2->view());
Expand All @@ -618,8 +619,9 @@ TEST_F(FixedPointTest, PositiveScaleWithValuesOutsideUnderlyingType64)
auto const expected1 = fp_wrapper{{150000000}, scale_type{100}};
auto const expected2 = fp_wrapper{{50000000}, scale_type{100}};

auto const result1 = cudf::binary_operation(a, b, cudf::binary_operator::ADD, {});
auto const result2 = cudf::binary_operation(a, c, cudf::binary_operator::DIV, {});
auto const type = cudf::data_type{cudf::type_id::DECIMAL64, 100};
auto const result1 = cudf::binary_operation(a, b, cudf::binary_operator::ADD, type);
auto const result2 = cudf::binary_operation(a, c, cudf::binary_operator::DIV, type);

CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected1, result1->view());
CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected2, result2->view());
Expand All @@ -630,6 +632,7 @@ TYPED_TEST(FixedPointTestBothReps, ExtremelyLargeNegativeScale)
// This is testing fixed_point values with an extremely large negative scale. The fixed_point
// implementation should be able to handle any scale representable by an int32_t

using decimalXX = fixed_point<TypeParam, Radix::BASE_10>;
using fp_wrapper = cudf::test::fixed_point_column_wrapper<TypeParam>;

auto const a = fp_wrapper{{10}, scale_type{-201}};
Expand All @@ -639,8 +642,11 @@ TYPED_TEST(FixedPointTestBothReps, ExtremelyLargeNegativeScale)
auto const expected1 = fp_wrapper{{150}, scale_type{-202}};
auto const expected2 = fp_wrapper{{5}, scale_type{-201}};

auto const result1 = cudf::binary_operation(a, b, cudf::binary_operator::ADD, {});
auto const result2 = cudf::binary_operation(a, c, cudf::binary_operator::DIV, {});
auto const type1 = cudf::data_type{cudf::type_to_id<decimalXX>(), -202};
auto const result1 = cudf::binary_operation(a, b, cudf::binary_operator::ADD, type1);

auto const type2 = cudf::data_type{cudf::type_to_id<decimalXX>(), -201};
auto const result2 = cudf::binary_operation(a, c, cudf::binary_operator::DIV, type2);

CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected1, result1->view());
CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected2, result2->view());
Expand Down
11 changes: 2 additions & 9 deletions python/cudf/cudf/_lib/binaryop.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ from cudf._lib.replace import replace_nulls
from cudf._lib.scalar import as_device_scalar
from cudf._lib.scalar cimport DeviceScalar
from cudf._lib.types import np_to_cudf_types
from cudf._lib.types cimport underlying_type_t_type_id
from cudf._lib.types cimport underlying_type_t_type_id, dtype_to_data_type

from cudf._lib.cpp.column.column cimport column
from cudf._lib.cpp.scalar.scalar cimport scalar
Expand Down Expand Up @@ -174,15 +174,8 @@ def binaryop(lhs, rhs, op, dtype):
cdef binary_operator c_op = <binary_operator> (
<underlying_type_t_binary_operator> op
)
cdef type_id tid = (
<type_id> (
<underlying_type_t_type_id> (
np_to_cudf_types[np.dtype(dtype)]
)
)
)

cdef data_type c_dtype = data_type(tid)
cdef data_type c_dtype = dtype_to_data_type(dtype)

if is_scalar(lhs) or lhs is None:
is_string_col = is_string_dtype(rhs.dtype)
Expand Down
26 changes: 5 additions & 21 deletions python/cudf/cudf/_lib/column.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,8 @@ from rmm._lib.device_buffer cimport DeviceBuffer
from cudf._lib.types import np_to_cudf_types, cudf_to_np_types
from cudf._lib.types cimport (
underlying_type_t_type_id,
dtype_from_column_view
dtype_from_column_view,
dtype_to_data_type
)
from cudf._lib.null_mask import bitmask_allocation_size_bytes

Expand Down Expand Up @@ -378,29 +379,12 @@ cdef class Column:
cdef column_view _view(self, libcudf_types.size_type null_count) except *:
if is_categorical_dtype(self.dtype):
col = self.base_children[0]
data_dtype = col.dtype
else:
col = self
data_dtype = self.dtype

data_dtype = col.dtype
cdef libcudf_types.type_id tid

if is_list_dtype(self.dtype):
tid = libcudf_types.type_id.LIST
elif is_struct_dtype(self.dtype):
tid = libcudf_types.type_id.STRUCT
elif is_decimal_dtype(self.dtype):
tid = libcudf_types.type_id.DECIMAL64
else:
tid = <libcudf_types.type_id> (
<underlying_type_t_type_id> (
np_to_cudf_types[np.dtype(data_dtype)]
)
)
cdef libcudf_types.data_type dtype = (
libcudf_types.data_type(tid, -self.dtype.scale)
if tid == libcudf_types.type_id.DECIMAL64
else libcudf_types.data_type(tid)
)
cdef libcudf_types.data_type dtype = dtype_to_data_type(data_dtype)
cdef libcudf_types.size_type offset = self.offset
cdef vector[column_view] children
cdef void* data
Expand Down
4 changes: 3 additions & 1 deletion python/cudf/cudf/_lib/types.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ from libc.stdint cimport int32_t
from libcpp cimport bool
from cudf._lib.cpp.column.column_view cimport column_view
from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view

cimport cudf._lib.cpp.types as libcudf_types

ctypedef bool underlying_type_t_order
ctypedef bool underlying_type_t_null_order
Expand All @@ -14,3 +14,5 @@ ctypedef int32_t underlying_type_t_type_id
ctypedef bool underlying_type_t_null_policy

cdef dtype_from_column_view(column_view cv)

cdef libcudf_types.data_type dtype_to_data_type(dtype) except *
21 changes: 19 additions & 2 deletions python/cudf/cudf/_lib/types.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ from cudf._lib.types cimport (
from cudf._lib.cpp.column.column_view cimport column_view
from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view
from cudf.core.dtypes import ListDtype, StructDtype, Decimal64Dtype
from cudf.utils.dtypes import is_decimal_dtype, is_list_dtype, is_struct_dtype

cimport cudf._lib.cpp.types as libcudf_types

Expand Down Expand Up @@ -192,8 +193,7 @@ cdef dtype_from_structs_column_view(column_view cv):

cdef dtype_from_decimal_column_view(column_view cv):
scale = -cv.type().scale()
precision = 18 # max of 64 bit integer
return Decimal64Dtype(precision=precision, scale=scale)
return Decimal64Dtype(precision=Decimal64Dtype.MAX_PRECISION, scale=scale)

cdef dtype_from_column_view(column_view cv):
cdef libcudf_types.type_id tid = cv.type().id()
Expand All @@ -208,3 +208,20 @@ cdef dtype_from_column_view(column_view cv):
"Use decimal64 instead")
else:
return cudf_to_np_types[<underlying_type_t_type_id>(tid)]

cdef libcudf_types.data_type dtype_to_data_type(dtype) except *:
if is_list_dtype(dtype):
tid = libcudf_types.type_id.LIST
elif is_struct_dtype(dtype):
tid = libcudf_types.type_id.STRUCT
elif is_decimal_dtype(dtype):
tid = libcudf_types.type_id.DECIMAL64
else:
tid = <libcudf_types.type_id> (
<underlying_type_t_type_id> (
np_to_cudf_types[np.dtype(dtype)]))

if tid == libcudf_types.type_id.DECIMAL64:
return libcudf_types.data_type(tid, -dtype.scale)
else:
return libcudf_types.data_type(tid)
18 changes: 17 additions & 1 deletion python/cudf/cudf/core/column/decimal.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,11 @@ def to_arrow(self):
def binary_operator(self, op, other, reflect=False):
if reflect:
self, other = other, self
result = libcudf.binaryop.binaryop(self, other, op, "int32")
scale = _binop_scale(self.dtype, other.dtype, op)
output_type = Decimal64Dtype(
scale=scale, precision=Decimal64Dtype.MAX_PRECISION
) # precision will be ignored, libcudf has no notion of precision
result = libcudf.binaryop.binaryop(self, other, op, output_type)
result.dtype.precision = _binop_precision(self.dtype, other.dtype, op)
return result

Expand Down Expand Up @@ -99,6 +103,18 @@ def as_string_column(
)


def _binop_scale(l_dtype, r_dtype, op):
# This should at some point be hooked up to libcudf's
# binary_operation_fixed_point_scale
s1, s2 = l_dtype.scale, r_dtype.scale
if op in ("add", "sub"):
return max(s1, s2)
elif op == "mul":
return s1 + s2
else:
raise NotImplementedError()


def _binop_precision(l_dtype, r_dtype, op):
"""
Returns the result precision when performing the
Expand Down
6 changes: 3 additions & 3 deletions python/cudf/cudf/core/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,7 @@ class Decimal64Dtype(ExtensionDtype):

name = "decimal"
_metadata = ("precision", "scale")
_MAX_PRECISION = np.floor(np.log10(np.iinfo("int64").max))
MAX_PRECISION = np.floor(np.log10(np.iinfo("int64").max))

def __init__(self, precision, scale=0):
"""
Expand Down Expand Up @@ -303,10 +303,10 @@ def __hash__(self):

@classmethod
def _validate(cls, precision, scale=0):
if precision > Decimal64Dtype._MAX_PRECISION:
if precision > Decimal64Dtype.MAX_PRECISION:
raise ValueError(
f"Cannot construct a {cls.__name__}"
f" with precision > {cls._MAX_PRECISION}"
f" with precision > {cls.MAX_PRECISION}"
)
if abs(scale) > precision:
raise ValueError(f"scale={scale} exceeds precision={precision}")
Expand Down
7 changes: 5 additions & 2 deletions python/cudf/cudf/utils/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from pandas.core.dtypes.dtypes import CategoricalDtype, CategoricalDtypeType

import cudf
from cudf._lib.scalar import DeviceScalar, _is_null_host_scalar
from cudf._lib.scalar import DeviceScalar
from cudf.core._compat import PANDAS_GE_120

_NA_REP = "<NA>"
Expand Down Expand Up @@ -331,7 +331,10 @@ def to_cudf_compatible_scalar(val, dtype=None):
If `val` is None, returns None.
"""
if _is_null_host_scalar(val) or isinstance(val, cudf.Scalar):

if cudf._lib.scalar._is_null_host_scalar(val) or isinstance(
val, cudf.Scalar
):
return val

if not is_scalar(val):
Expand Down

0 comments on commit 04f9021

Please sign in to comment.