[Draft] Preview of new extended floating point types #3582
Conversation
//===----------------------------------------------------------------------===//

#ifndef _CUDA___FLOATING_POINT_TYPE_TRAITS_H
#define _CUDA___FLOATING_POINT_TYPE_TRAITS_H

#include <cuda/std/detail/__config>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header

#if _CCCL_STD_VER >= 2017

# include <cuda/__fwd/fp.h>
# include <cuda/std/__type_traits/remove_cv.h>

# if _CCCL_HAS_INCLUDE(<stdfloat>)
# include <stdfloat>
# endif

_LIBCUDACXX_BEGIN_NAMESPACE_CUDA

template <size_t _NBits>
_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr bool __always_false()
{
  return false;
}

template <class _Tp>
_CCCL_INLINE_VAR constexpr bool __is_standard_floating_point_impl_v = false;

template <>
_CCCL_INLINE_VAR constexpr bool __is_standard_floating_point_impl_v<float> = true;

template <>
_CCCL_INLINE_VAR constexpr bool __is_standard_floating_point_impl_v<double> = true;

template <>
_CCCL_INLINE_VAR constexpr bool __is_standard_floating_point_impl_v<long double> = true;

template <class _Tp>
_CCCL_INLINE_VAR constexpr bool __is_standard_floating_point_v =
  __is_standard_floating_point_impl_v<_CUDA_VSTD::remove_cv_t<_Tp>>;

template <class _Tp>
_CCCL_INLINE_VAR constexpr bool __is_std_extended_floating_point_impl_v = false;

# if __STDCPP_FLOAT16_T__ == 1
template <>
_CCCL_INLINE_VAR constexpr bool __is_std_extended_floating_point_impl_v<::std::float16_t> = true;
# endif // __STDCPP_FLOAT16_T__ == 1

# if __STDCPP_BFLOAT16_T__ == 1
template <>
_CCCL_INLINE_VAR constexpr bool __is_std_extended_floating_point_impl_v<::std::bfloat16_t> = true;
# endif // __STDCPP_BFLOAT16_T__ == 1

# if __STDCPP_FLOAT32_T__ == 1
template <>
_CCCL_INLINE_VAR constexpr bool __is_std_extended_floating_point_impl_v<::std::float32_t> = true;
# endif // __STDCPP_FLOAT32_T__ == 1

# if __STDCPP_FLOAT64_T__ == 1
template <>
_CCCL_INLINE_VAR constexpr bool __is_std_extended_floating_point_impl_v<::std::float64_t> = true;
# endif // __STDCPP_FLOAT64_T__ == 1

# if __STDCPP_FLOAT128_T__ == 1
template <>
_CCCL_INLINE_VAR constexpr bool __is_std_extended_floating_point_impl_v<::std::float128_t> = true;
# endif // __STDCPP_FLOAT128_T__ == 1

template <class _Tp>
_CCCL_INLINE_VAR constexpr bool __is_std_extended_floating_point_v =
  __is_std_extended_floating_point_impl_v<_CUDA_VSTD::remove_cv_t<_Tp>>;

template <class _Tp>
_CCCL_INLINE_VAR constexpr bool __is_cuda_extended_floating_point_impl_v = false;

template <class _Config>
_CCCL_INLINE_VAR constexpr bool __is_cuda_extended_floating_point_impl_v<__fp<_Config>> = true;

template <class _Tp>
_CCCL_INLINE_VAR constexpr bool __is_cuda_extended_floating_point_v =
  __is_cuda_extended_floating_point_impl_v<_CUDA_VSTD::remove_cv_t<_Tp>>;

template <class _Tp>
_CCCL_INLINE_VAR constexpr bool __fp_is_floating_point_v =
  __is_standard_floating_point_v<_Tp> || __is_std_extended_floating_point_v<_Tp>
  || __is_cuda_extended_floating_point_v<_Tp>;

_LIBCUDACXX_END_NAMESPACE_CUDA

#endif // _CCCL_STD_VER >= 2017

#endif // _CUDA___FLOATING_POINT_TYPE_TRAITS_H
This module is temporary.
template <class _Tp, class _Up>
_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr _Tp __cast(const _Up& __src) noexcept
{
  if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated())
  {
    NV_IF_TARGET(NV_IS_DEVICE, (return __cast_impl_device<_Tp>(__src);))
  }

  return __cast_generic<_Tp>(__src);
}
};
Currently, the implementation uses rounding to nearest. My idea is that we could provide cuda::fp_cast, where the user could specify more arguments, such as different rounding modes or saturation.
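A rough sketch of what such an API could look like (purely illustrative; the enumerators, defaults, and signature are placeholders and not part of this PR):

namespace cuda
{
enum class fp_round
{
  nearest_even, // what the current implementation does
  toward_zero,
  toward_pos_inf,
  toward_neg_inf
};

template <class _To, class _From>
_CCCL_NODISCARD _CCCL_HOST_DEVICE constexpr _To
fp_cast(const _From& __src, fp_round __rounding = fp_round::nearest_even, bool __saturate = false) noexcept
{
  // placeholder body: ignores __rounding and __saturate, just to make the sketch complete
  (void) __rounding;
  (void) __saturate;
  return static_cast<_To>(__src);
}
} // namespace cuda

// Possible usage:
// auto __x = cuda::fp_cast<cuda::fp16>(3.14f, cuda::fp_round::toward_zero, /*saturate=*/true);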
using __host_native_type = typename __config_type::__host_native_type;
using __device_native_type = typename __config_type::__device_native_type;
Splitting host and device type is necessary for nvc++ -cuda compilation.
if constexpr (!_CUDA_VSTD::is_same_v<_FromConfig, __fp_invalid_config>
              && !_CUDA_VSTD::is_same_v<_ToConfig, __fp_invalid_config>)
{
I believe we could simplify this logic a bit by inverting the condition here and returning __fp_conv_rank_order::__unordered early.
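Roughly (a sketch, assuming the rest of the function body stays the same):

if constexpr (_CUDA_VSTD::is_same_v<_FromConfig, __fp_invalid_config>
              || _CUDA_VSTD::is_same_v<_ToConfig, __fp_invalid_config>)
{
  return __fp_conv_rank_order::__unordered;
}
else
{
  // the existing comparison logic, now one nesting level shallower
}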
# if defined(_CCCL_NO_CONDITIONAL_EXPLICIT)
_CCCL_TEMPLATE(class _Tp)
_CCCL_REQUIRES(__fp_is_floating_point_v<_Tp> _CCCL_AND __fp_cast_is_implicit<_Tp, __fp>())
_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr operator _Tp() const noexcept
{
  return __cast_to<_Tp>();
}

_CCCL_TEMPLATE(class _Tp)
_CCCL_REQUIRES(__fp_is_floating_point_v<_Tp> _CCCL_AND(!__fp_cast_is_implicit<_Tp, __fp>()))
_LIBCUDACXX_HIDE_FROM_ABI explicit constexpr operator _Tp() const noexcept
{
  return __cast_to<_Tp>();
}
# else // ^^^ _CCCL_NO_CONDITIONAL_EXPLICIT ^^^ / vvv !_CCCL_NO_CONDITIONAL_EXPLICIT vvv
Strong preference for just dropping the else. C++17 is not going anywhere soon.
if constexpr (__has_host_native_type)
{
Ditto, I'm a strong proponent of inverting the condition and keeping the if/else tree small. But I am fearful that some combination of toolchains will do the stupid thing.
/ok to test
simplify fp literals definitions, add missing attributes (d6b92fa to 3557c16)
/ok to test
A lot of this seems to generally land on the same structure as I've had on my branch, which I think at least partially validates both :) I've left some comments on some parts of the structure, though - a lot of things seem to be templateable to reduce hand-written type selection/branching logic, and the inline asms really need to go elsewhere. My gut feeling is still that having them in codegen is correct, but if they go into some other place out-of-line from the main arithmetic logic here, that'll also be fine.
Overall, with some things you've noticed that I haven't (like the real need to split the native type detection logic between host and device), this is a really good base for moving forward.
NV_IF_TARGET(NV_IS_DEVICE,
             (asm("cvt.f32.f16 %0, %1;" : "=r"(__ret.__storage_) : "h"(__fp_val.__storage_)); return __ret;))
We really need to move these (and that applies to basically all of the inline asms, which should also be volatile) into codegen headers. For the beginning of me doing that (though for arithmetic and not conversions, but also including some cmake cleanup related to codegens), please see griwes@c81edb4 in libcudacxx/codegen.
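For illustration, the cvt above could become a small device-only helper living in a generated header, roughly like this (the header path and function name are placeholders, not from this PR):

// e.g. in a generated cuda/__floating_point/codegen/cvt.h (hypothetical path)
_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI _CCCL_DEVICE inline float __fp_cvt_f16_to_f32(_CUDA_VSTD::uint16_t __v) noexcept
{
  float __ret;
  asm volatile("cvt.f32.f16 %0, %1;" : "=f"(__ret) : "h"(__v));
  return __ret;
}

The call site in the conversion logic would then just invoke the helper instead of carrying the asm inline.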
_CCCL_INLINE_VAR constexpr __fp_from_storage_t __fp_from_storage{};

struct __fp_iec559_config_base
This should really be a template. There's no reason not to support arbitrary configurations here (other than the limits of available unsigned integer types), and a lot of the code below just repeats over and over.
My suggestion for a plan here is as follows:
- Make this a template that accepts the number of mantissa and exponent bits.
- Move the "has native type" logic out of the types deriving from this into traits (see griwes@c81edb4#diff-0c3dd6fef1b5e163c5549aa21479e2698be89899baffd111199bdc5bd6633645R49-R70 for reference of how I was approaching that; you are right that it should be split into a host and a device variant).
- In this struct, now a template, use that trait directly to find a native representation, if it exists (and the trait can default to your "no native representation" tag).
- Turn the IEEE configs below into just aliases to specializations of this template.
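A minimal sketch of that direction (all names below are illustrative, including the hypothetical host/device native-type traits):

template <size_t _NMantBits, size_t _NExpBits>
struct __fp_iec559_config
{
  static constexpr size_t __mant_nbits = _NMantBits;
  static constexpr size_t __exp_nbits  = _NExpBits;
  static constexpr size_t __nbits      = 1 + _NExpBits + _NMantBits; // sign + exponent + mantissa

  // hypothetical traits that default to the "no native representation" tag:
  using __host_native_type   = __fp_native_host_type_t<__nbits, __exp_nbits>;
  using __device_native_type = __fp_native_device_type_t<__nbits, __exp_nbits>;
};

// the IEEE configs would then become aliases:
using __fp16_config = __fp_iec559_config</*mantissa*/ 10, /*exponent*/ 5>;
using __fp32_config = __fp_iec559_config</*mantissa*/ 23, /*exponent*/ 8>;
using __fp64_config = __fp_iec559_config</*mantissa*/ 52, /*exponent*/ 11>;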
}
};

struct __fp4_e2m1_config
If possible, I'd love to fold these into a template too, because it seems to me that most of the code is shared, just with specific constants being different - is that right?
}

template <class _Tp>
using __fp_make_config_from_t = decltype(::cuda::__fp_make_config_from<_Tp>());
...and when all the above config types are templates, this could get turned into a simple computation of "how many bits of X do you have"; see griwes@c81edb4#diff-0c3dd6fef1b5e163c5549aa21479e2698be89899baffd111199bdc5bd6633645 for a general reference for how I envisioned this idea working.
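Roughly, the mapping from a builtin type could then be computed instead of hand-written; a sketch, reusing the hypothetical templated config from above and a hypothetical exponent-bits trait:

template <class _Tp>
using __fp_make_config_from_t =
  __fp_iec559_config<_CUDA_VSTD::numeric_limits<_Tp>::digits - 1, // mantissa bits, without the implicit bit
                     __fp_exp_nbits_v<_Tp>>;                      // hypothetical trait: exponent bits of _Tp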
static constexpr bool __has_nans = __config_type::__has_nans;
static constexpr bool __has_denorm = __config_type::__has_denorm;

using __storage_type = __fp_storage_t<__nbits>;
This could be cuda::std::bitset - it means there's some extra logic for extracting the bits at times, but it would allow for some operations (that don't need to extract mantissas, really) to still work on floating point types larger than the largest integer type available. (Once again, you can see how I dealt with some of the operations on my branch, in representation.h.)
This would need to be a specialization for types larger than the largest integer type. The reason is that we really want to keep compile times down, and bitset is not helping with that.
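A sketch of what that could look like: an unsigned integer storage type whenever one is wide enough, and the bitset fallback only above 64 bits (names illustrative):

template <size_t _NBits>
using __fp_storage_t = _CUDA_VSTD::conditional_t<
  (_NBits <= 8),
  _CUDA_VSTD::uint8_t,
  _CUDA_VSTD::conditional_t<
    (_NBits <= 16),
    _CUDA_VSTD::uint16_t,
    _CUDA_VSTD::conditional_t<
      (_NBits <= 32),
      _CUDA_VSTD::uint32_t,
      _CUDA_VSTD::conditional_t<(_NBits <= 64),
                                _CUDA_VSTD::uint64_t,
                                // fall back to a bitset only when no integer type is wide enough
                                _CUDA_VSTD::bitset<_NBits>>>>>;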
_LIBCUDACXX_HIDE_FROM_ABI explicit constexpr __fp(__fp_from_storage_t, const _Tp& __v) noexcept
    : __storage_{__v}
{
  static_assert(_CUDA_VSTD::is_same_v<_Tp, __storage_type>);
Could this just be not a template, since we are requiring the types to be the same?
The reason I implemented it like this is that I wanted to avoid unintentional implicit casts between integers, e.g. accidental assignment of another fp type's storage. I would like to leave it like this for now; it may be changed later.
The same can be achieved by making _Tp non-deducible, for instance laundering it through type_identity.
if constexpr (_CUDA_VSTD::_CCCL_TRAIT(is_same, _Tp, fp16))
{
  NV_IF_TARGET(NV_PROVIDES_SM_53,
               (asm("neg.f16 %0, %1;" : "=h"(__ret.__storage_) : "h"(__src.__storage_)); return __ret;))
Once again - this really should go into codegen. And you can use a "trick" of having the default implementation (__neg_impl_constexpr, in this case) and the specialized implementations (containing the inline asm) be in the same overload set, with the specialized ones being better matches. This way, a lot of the inline logic goes away (turning the selection of the implementation into a much more declarative thing).
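A sketch of that dispatch shape (illustrative only; __neg_dispatch is a placeholder name, and it keeps the constexpr/device guard the PR already uses elsewhere):

// generic, constexpr-friendly fallback that works for any __fp instantiation:
template <class _Tp>
_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp __neg_dispatch(const _Tp& __src) noexcept
{
  return __neg_impl_constexpr(__src);
}

// better match for fp16; the asm body would live in a generated codegen header:
_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr fp16 __neg_dispatch(const fp16& __src) noexcept
{
  if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated())
  {
    NV_IF_TARGET(NV_PROVIDES_SM_53,
                 (fp16 __ret;
                  asm volatile("neg.f16 %0, %1;" : "=h"(__ret.__storage_) : "h"(__src.__storage_));
                  return __ret;))
  }
  return __neg_impl_constexpr(__src);
}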
if constexpr ((_CUDA_VSTD::is_same_v<_Lhs, fp16> && _CUDA_VSTD::is_same_v<_Rhs, fp16>)
              || (_CUDA_VSTD::is_same_v<_Lhs, bf16> && _CUDA_VSTD::is_same_v<_Rhs, bf16>))
{
  return _Lhs{static_cast<float>(__lhs) + static_cast<float>(__rhs)};
This needs to take care that the rounding from the round trip matches native arithmetic. Also, this code should really be more generic in how it looks for the native type to use, but I think you know that :)
};

/**********************************************************************************************************************/
// Unary operators
All of these operators should be hidden friends of the __fp template. This is creating a whole lot of operator overloads, for some very common operators, and we should do our best to hide them unless they are in any way relevant.
I agree with Michal on this one. Making them hidden friends is important.
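A sketch of the hidden-friend shape (illustrative only; only two operators shown, and the __fp_ops helper names are assumed from the surrounding code):

template <class _Config>
class __fp
{
  // ...

  // hidden friends: found only via ADL when an argument is (convertible to) __fp,
  // so they do not enlarge the overload set seen by unrelated types
  _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI friend constexpr __fp operator-(const __fp& __src) noexcept
  {
    return __fp_ops::__neg(__src);
  }

  _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI friend constexpr __fp operator+(const __fp& __lhs, const __fp& __rhs) noexcept
  {
    return __fp_ops::__add(__lhs, __rhs);
  }
};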
  return __fp_ops::__ge(__fp{__lhs}, __rhs); \
}

_LIBCUDACXX_FP_DEFINE_BINARY_OPERATORS_FOR(float, _CCCL_HOST_DEVICE)
Instead of invoking this macro per each builtin type, it should be possible to make these operators also templated on the builtin type, and constrained on cuda::std::is_floating_point. That will significantly cut down on the sizes of the overload sets for them.
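A sketch of the constrained template form, reusing the same body as the macro above (illustrative; shown as a hidden friend inside __fp so it also follows the earlier suggestion):

_CCCL_TEMPLATE(class _Tp)
_CCCL_REQUIRES(_CUDA_VSTD::is_floating_point_v<_Tp>)
_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI friend constexpr bool operator>=(const _Tp& __lhs, const __fp& __rhs) noexcept
{
  return __fp_ops::__ge(__fp{__lhs}, __rhs);
}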
# include <nv/target>

// Silence the warning about the use of long double in device code
_CCCL_NV_DIAG_SUPPRESS(20208)
The use of long double in device code is an error, not just a warning, with NVC++. The error message from the front end can be suppressed with the correct pragma, but that's not very useful because the GPU code generator will choke on the code later in the compilation process. (I haven't looked to see if long double is actually used in device code when compiling with NVC++.)
_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI static constexpr bool __has_native_type() noexcept
{
  NV_IF_ELSE_TARGET(NV_IS_HOST, (return __has_host_native_type;), (return __has_device_native_type;))
This function can't be constexpr with NVC++ because it contains NV_IF_ELSE_TARGET. With NVC++ it is impossible for a constexpr function to return different values on host and device.
# endif // _LIBCUDACXX_HAS_NO_LONG_DOUBLE

// todo: improve the implementation
NV_IF_ELSE_TARGET(NV_IS_HOST, (return __construct_from_host<_Tp>(__v);), (return __construct_from_device<_Tp>(__v);))
Same comment about constexpr and NV_IF_ELSE_TARGET. (Though this function is less likely to be used where a compile-time value is required.)
# else
using __host_native_type = __fp_no_native_type_tag;
# endif
# if _CCCL_CUDA_COMPILER(CLANG, >=, 19) || _CCCL_CUDA_COMPILER(NVHPC, >=, 24, 9)
In NVHPC the GPU support for _Float16 didn't happen until 25.1. The name _Float16 was there in 24.9, but the device code generator had problems. And the same caveat applies about the GCC version that NVC++ is building against. So I think the correct condition for NVHPC is
(_CCCL_CUDA_COMPILER(NVHPC, >=, 25, 1) && __STDCPP_FLOAT16_T__ == 1)
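Applied to the guard above, that would read roughly (an assumption, not verified against every toolchain combination):

# if _CCCL_CUDA_COMPILER(CLANG, >=, 19) || (_CCCL_CUDA_COMPILER(NVHPC, >=, 25, 1) && __STDCPP_FLOAT16_T__ == 1)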
# if __STDCPP_FLOAT16_T__ == 1
using __host_native_type = ::std::float16_t;
# elif _CCCL_COMPILER(GCC, >=, 7) || _CCCL_COMPILER(CLANG) || _CCCL_COMPILER(NVHPC, >=, 24, 9)
using __host_native_type = _Float16;
The situation for NVHPC is more complicated than that. The name _Float16 is available only when NVC++ is >= 24.9 and when NVC++ is using GCC >= 13 for its standard library. But rather than try to get the condition correct, just remove NVHPC from the check entirely. The conditions when _Float16 is a builtin type are exactly the same conditions where __STDCPP_FLOAT16_T__ is defined. So NVHPC can just use ::std::float16_t or nothing.
The condition for GCC is wrong. The GCC C++ compiler defines _Float16 starting in GCC 13. (The C compiler on Arm had _Float16 in earlier versions, but the C++ compiler did not.)
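Putting both points together, the host branch would then read roughly (a sketch based on this comment, not a tested condition):

# if __STDCPP_FLOAT16_T__ == 1
using __host_native_type = ::std::float16_t;
# elif _CCCL_COMPILER(GCC, >=, 13) || _CCCL_COMPILER(CLANG)
using __host_native_type = _Float16;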
};

template <class _To, class _From>
_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr __fp_conv_rank_order __fp_make_conv_rank_order()
I would find this easier to read if the template parameters were named _Lhs and _Rhs rather than _To and _From. This is a comparison, not a conversion.
// Extended floating point types have higher subrank, prefer cuda extended types over std extended types
// Fixme: potentially will not work correctly for long double
// auto val = 1.0f64 + 1.0l; // val will be of type long double, is this right?
if constexpr (__is_standard_floating_point_v<_Lhs> || __is_std_extended_floating_point_v<_Lhs>)
It is true that the type of 1.0f64 + 1.0L is always long double in C++. See https://cplusplus.github.io/CWG/issues/2836.html
If __fp_make_conv_rank_order is fixed to handle long double correctly, then long double won't be a problem here.
If you want to prefer cuda extended types over standard extended types and standard types, and prefer standard extended types over standard types, then the correct if expression is
if constexpr ((__is_standard_floating_point_v<_Lhs> || __is_std_extended_floating_point_v<_Lhs>) &&
              !__is_standard_floating_point_v<_Rhs>) {
if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated())
{
  NV_IF_TARGET(NV_IS_DEVICE, (return __neg_impl_device(__src);))
}
I had to test if this would work with NVC++. It does. NV_IF_TARGET doesn't prevent the function from being constexpr if it is inside a !is_constant_evaluated() block.
(
  if constexpr (_Tp::__has_device_native_type) { return __fp{__fp_from_native, -__src.__device_native()}; } else {
    return __neg_impl(__src);
  }))
But this won't work as expected. The NV_IF_ELSE_TARGET prevents the function from being constexpr with NVC++. So __neg_impl will never be called in a constexpr context, and __neg_impl_constexpr will never do what it is supposed to do.
@@ -281,12 +281,12 @@ struct __fp16_config : __fp_iec559_config_base

 # if __STDCPP_FLOAT16_T__ == 1
 using __host_native_type = ::std::float16_t;
-# elif _CCCL_COMPILER(GCC, >=, 7) || _CCCL_COMPILER(CLANG) || _CCCL_COMPILER(NVHPC, >=, 24, 9)
+# elif _CCCL_COMPILER(GCC, >=, 12) || _CCCL_COMPILER(CLANG) || _CCCL_COMPILER(NVHPC, >=, 24, 9)
Shouldn't this be _CCCL_COMPILER(GCC, >=, 13)?
Also, should we remove NVHPC from the check, following the comment from David?
This is a draft PR of the new floating point types. Should partially implement #31.
The implementation should meet the P1467R9 requirements.
Currently, cuda::fp16, cuda::bf16, cuda::fp32, and cuda::fp64 are being implemented. The design should allow easily adding other custom types such as cuda::fp8_e4m3 and others. The testing is very naive, far from finished.