Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add CAST(varchar as decimal) #5307

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 41 additions & 1 deletion velox/docs/functions/presto/conversion.rst
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ supported conversions to/from JSON are listed in :doc:`json`.
- Y
-
- Y
-
- Y
* - timestamp
-
-
Expand Down Expand Up @@ -744,3 +744,43 @@ Invalid example

SELECT cast(decimal '-1000.000' as decimal(6, 4)); -- Out of range
SELECT cast(decimal '123456789' as decimal(9, 1)); -- Out of range

From varchar
^^^^^^^^^^^^

Casting varchar to a decimal of given precision and scale is allowed
if the input value can be represented by the precision and scale. When casting from
a larger scale to a smaller one, the fraction part is rounded. Casting an invalid input value throws an error.

Valid example

::

SELECT cast('9999999999.99' as decimal(12, 2)); -- decimal '9999999999.99'
SELECT cast('1.556' as decimal(12, 2)); -- decimal '1.56'
mbasmanova marked this conversation as resolved.
Show resolved Hide resolved
SELECT cast('1.554' as decimal(12, 2)); -- decimal '1.55'
SELECT cast('-1.554' as decimal(12, 2)); -- decimal '-1.55'
SELECT cast('+09' as decimal(12, 2)); -- decimal '9.00'
SELECT cast('9.' as decimal(12, 2)); -- decimal '9.00'
SELECT cast('.9' as decimal(12, 2)); -- decimal '0.90'
SELECT cast('3E+2' as decimal(12, 2)); -- decimal '300.00'
SELECT cast('3E+00002' as decimal(12, 2)); -- decimal '300.00'
SELECT cast('3e+2' as decimal(12, 2)); -- decimal '300.00'
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's also add examples like '12.', '.12', '1.2e-5'

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I still think it would be helpful to add an example like 1.2e-5.

SELECT cast('31.423e+2' as decimal(12, 2)); -- decimal '3142.30'
SELECT cast('1.2e-2' as decimal(12, 2)); -- decimal '0.01'
SELECT cast('1.2e-5' as decimal(12, 2)); -- decimal '0.00'
SELECT cast('0000.123' as decimal(12, 2)); -- decimal '0.12'
SELECT cast('.123000000' as decimal(12, 2)); -- decimal '0.12'

Invalid example

::

SELECT cast('1.23e67' as decimal(38, 0)); -- Value too large
SELECT cast('0.0446a' as decimal(9, 1)); -- Value is not a number
SELECT cast('' as decimal(9, 1)); -- Value is not a number
SELECT cast('23e-5d' as decimal(9, 1)); -- Value is not a number
SELECT cast('1.23 ' as decimal(38, 0)); -- Value is not a number
SELECT cast(' -3E+2' as decimal(12, 2)); -- Value is not a number
SELECT cast('-3E+2.1' as decimal(12, 2)); -- Value is not a number
SELECT cast('3E+' as decimal(12, 2)); -- Value is not a number
20 changes: 20 additions & 0 deletions velox/docs/functions/spark/conversion.rst
Original file line number Diff line number Diff line change
Expand Up @@ -162,3 +162,23 @@ Invalid examples
SELECT cast('2012-Oct-23' as date); -- Invalid argument
SELECT cast('2012/10/23' as date); -- Invalid argument
SELECT cast('2012.10.23' as date); -- Invalid argument

Cast to Decimal
---------------

From varchar
^^^^^^^^^^^^

Casting varchar to a decimal of given precision and scale is allowed.
The behavior is similar to Presto's, except that Spark allows leading and trailing whitespace in the input varchar.

Valid example

::

SELECT cast(' 1.23' as decimal(38, 0)); -- 1
SELECT cast('1.23 ' as decimal(38, 0)); -- 1
SELECT cast(' 1.23 ' as decimal(38, 0)); -- 1
SELECT cast(' -3E+2' as decimal(12, 2)); -- -300.00
SELECT cast('-3E+2 ' as decimal(12, 2)); -- -300.00
SELECT cast(' -3E+2 ' as decimal(12, 2)); -- -300.00
175 changes: 175 additions & 0 deletions velox/expression/CastExpr-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,150 @@ StringView convertToStringView(

} // namespace

namespace detail {

/// The pieces of a decimal literal, sliced out of the input characters. The
/// digit fields are non-owning views.
///
/// Examples:
/// | value         | wholeDigits | fractionalDigits | exponent | sign |
/// | 9999999999.99 | 9999999999  | 99               | nullopt  |  1   |
/// | 15            | 15          |                  | nullopt  |  1   |
/// | 1.5           | 1           | 5                | nullopt  |  1   |
/// | -1.5          | 1           | 5                | nullopt  | -1   |
/// | 31.523e-2     | 31          | 523              | -2       |  1   |
struct DecimalComponents {
  /// Digits before the decimal point. May be empty.
  std::string_view wholeDigits;
  /// Digits after the decimal point. May be empty.
  std::string_view fractionalDigits;
  /// Value of the exponent, or nullopt when the input has none.
  std::optional<int32_t> exponent;
  /// 1 for non-negative input, -1 for negative input.
  int8_t sign{1};
};

// Extract a string view of continuous digits. The view starts at index
// 'start' of 's' and ends before the first non-digit character or at index
// 'size', whichever comes first. The view is empty when there is no digit at
// 'start'.
std::string_view extractDigits(const char* s, size_t start, size_t size);

/// Parse decimal components, including whole digits, fractional digits,
/// exponent and sign, from input chars. Returns error status if input chars
/// do not represent a valid value.
///
/// The accepted form is: an optional '+' or '-', whole digits, an optional
/// '.' followed by fractional digits, and an optional 'e' or 'E' followed by
/// an optionally signed integer exponent. At least one whole or fractional
/// digit is required. Any other character, including whitespace, is reported
/// as an error.
Status
parseDecimalComponents(const char* s, size_t size, DecimalComponents& out);

/// Parse huge int from decimal components. The fractional part is scaled up by
/// required power of 10, and added with the whole part. Returns error status if
/// overflows.
///
/// Only the whole and fractional digits are read; the sign and the exponent of
/// 'decimalComponents' are not applied here. The caller in this file passes
/// 'out' initialized to zero.
Status parseHugeInt(const DecimalComponents& decimalComponents, int128_t& out);

/// Converts string view to decimal value of given precision and scale.
/// Derives from Arrow function DecimalFromString. Arrow implementation:
/// https://github.com/apache/arrow/blob/main/cpp/src/arrow/util/decimal.cc#L637.
///
/// Steps:
///  1. Parse the input into DecimalComponents (sign, whole digits, fractional
///     digits, optional exponent).
///  2. Combine the exponent with the number of fractional digits to get the
///     scale of the parsed value. Fractional digits beyond 'toScale' are cut
///     off and remembered as a pending round-up of the magnitude.
///  3. Build the unscaled integer and rescale it to 'toScale'.
///
/// @param s Input characters. Whitespace is not skipped here; it is rejected
/// by the parser.
/// @param toPrecision Precision of the target decimal type.
/// @param toScale Scale of the target decimal type.
/// @param decimalValue Receives the unscaled result when the returned status
/// is OK.
/// @return OK on success. A user error starting with "Value is not a number."
/// when the input is malformed, or "Value too large." when the value does not
/// fit.
template <typename T>
Status toDecimalValue(
    const StringView s,
    int toPrecision,
    int toScale,
    T& decimalValue) {
  DecimalComponents decimalComponents;
  if (auto status =
          parseDecimalComponents(s.data(), s.size(), decimalComponents);
      !status.ok()) {
    return Status::UserError("Value is not a number. " + status.message());
  }

  // Count number of significant digits (without leading zeros).
  const size_t firstNonZero =
      decimalComponents.wholeDigits.find_first_not_of('0');
  size_t significantDigits = decimalComponents.fractionalDigits.size();
  if (firstNonZero != std::string_view::npos) {
    significantDigits += decimalComponents.wholeDigits.size() - firstNonZero;
  }
  // Clamp while still in size_t. The precision is capped at kMaxPrecision
  // before use anyway, and narrowing an unclamped count (for example 256
  // digits into uint8_t) would wrap around.
  int32_t parsedPrecision = static_cast<int32_t>(
      std::min<size_t>(significantDigits, LongDecimalType::kMaxPrecision));

  // Scale arithmetic is done in 64 bits: the exponent can be any int32_t, so
  // negating it or adding the fraction length must not overflow.
  int64_t parsedScale = 0;
  bool roundUp = false;
  const int64_t fractionSize =
      static_cast<int64_t>(decimalComponents.fractionalDigits.size());
  if (!decimalComponents.exponent.has_value()) {
    if (fractionSize > toScale) {
      // The first dropped digit decides the rounding.
      roundUp = decimalComponents.fractionalDigits[toScale] >= '5';
      parsedScale = toScale;
      decimalComponents.fractionalDigits =
          decimalComponents.fractionalDigits.substr(0, toScale);
    } else {
      parsedScale = fractionSize;
    }
  } else {
    const int64_t exponent = decimalComponents.exponent.value();
    parsedScale = fractionSize - exponent;
    // Truncate the fractionalDigits.
    if (parsedScale > toScale) {
      if (-exponent >= toScale) {
        // Every fractional digit lies beyond the target scale and is dropped.
        // The first fractional digit is the rounding digit only when
        // -exponent == toScale. Otherwise the rounding digit is one of the
        // whole digits and the rescale below takes care of it; rounding here
        // as well would round twice ('4.5e-3' to scale 2 would yield 0.01
        // instead of 0.00).
        if (-exponent == toScale && fractionSize > 0 &&
            decimalComponents.fractionalDigits[0] >= '5') {
          roundUp = true;
        }
        decimalComponents.fractionalDigits = std::string_view();
        parsedScale -= fractionSize;
      } else {
        // Keep the leading fractional digits that fit into toScale.
        const int64_t keptDigits = exponent + toScale;
        if (fractionSize > keptDigits &&
            decimalComponents.fractionalDigits[keptDigits] >= '5') {
          roundUp = true;
        }
        decimalComponents.fractionalDigits =
            decimalComponents.fractionalDigits.substr(
                0, std::min(keptDigits, fractionSize));
        parsedScale -= fractionSize -
            static_cast<int64_t>(decimalComponents.fractionalDigits.size());
      }
    }
  }

  int128_t out = 0;
  if (auto status = parseHugeInt(decimalComponents, out); !status.ok()) {
    return status;
  }

  // 'out' is still a magnitude here, so adding one rounds away from zero.
  if (roundUp) {
    bool overflow = __builtin_add_overflow(out, 1, &out);
    if (UNLIKELY(overflow)) {
      return Status::UserError("Value too large.");
    }
  }
  out *= decimalComponents.sign;

  // More than kMaxPrecision digits would have to be dropped to reach toScale.
  // |out| is below 10^39, so the result rounds to zero. Return it directly
  // rather than asking the rescale for a power of ten that is presumably
  // outside the kPowersOfTen table.
  if (parsedScale - toScale > LongDecimalType::kMaxPrecision) {
    decimalValue = 0;
    return Status::OK();
  }

  if (parsedScale < 0) {
    // A negative scale means trailing zeros have to be appended. Scale the
    // value up to toScale right away so that no negative scale is passed on.
    const int64_t scaleUp = toScale - parsedScale;
    if (scaleUp > LongDecimalType::kMaxPrecision) {
      return Status::UserError("Value too large.");
    }

    bool overflow =
        __builtin_mul_overflow(out, DecimalUtil::kPowersOfTen[scaleUp], &out);
    if (UNLIKELY(overflow)) {
      return Status::UserError("Value too large.");
    }
    parsedPrecision = static_cast<int32_t>(parsedPrecision - parsedScale);
    parsedScale = toScale;
  }
  const auto status = DecimalUtil::rescaleWithRoundUp<int128_t, T>(
      out,
      static_cast<uint8_t>(
          std::min<int32_t>(parsedPrecision, LongDecimalType::kMaxPrecision)),
      static_cast<int32_t>(parsedScale),
      toPrecision,
      toScale,
      decimalValue);
  if (!status.ok()) {
    return Status::UserError("Value too large.");
  }
  return status;
}
} // namespace detail

template <bool adjustForTimeZone>
void CastExpr::castTimestampToDate(
const SelectivityVector& rows,
Expand Down Expand Up @@ -309,6 +453,37 @@ void CastExpr::applyIntToDecimalCastKernel(
});
}

/// Casts every selected row of a varchar vector to a decimal with native type
/// T and writes the unscaled values into 'result'. Each input string is passed
/// through hooks_->removeWhiteSpaces() before parsing. A row that fails to
/// convert is either set to null or recorded as a bad-cast error on 'context',
/// depending on setNullInResultAtError().
template <typename T>
void CastExpr::applyVarcharToDecimalCastKernel(
    const SelectivityVector& rows,
    const BaseVector& input,
    exec::EvalCtx& context,
    const TypePtr& toType,
    VectorPtr& result) {
  auto strings = input.as<SimpleVector<StringView>>();
  auto rawValues = result->asUnchecked<FlatVector<T>>()->mutableRawValues();
  const auto precisionAndScale = getDecimalPrecisionScale(*toType);
  const auto toPrecision = precisionAndScale.first;
  const auto toScale = precisionAndScale.second;

  rows.applyToSelected([&](auto row) {
    T converted;
    const auto status = detail::toDecimalValue<T>(
        hooks_->removeWhiteSpaces(strings->valueAt(row)),
        toPrecision,
        toScale,
        converted);
    if (status.ok()) {
      rawValues[row] = converted;
      return;
    }
    // Conversion failed: produce a null or report the error for this row.
    if (setNullInResultAtError()) {
      result->setNull(row, true);
      return;
    }
    context.setVeloxExceptionError(
        row, makeBadCastException(toType, input, row, status.message()));
  });
}

template <typename FromNativeType, TypeKind ToKind>
VectorPtr CastExpr::applyDecimalToFloatCast(
const SelectivityVector& rows,
Expand Down
115 changes: 115 additions & 0 deletions velox/expression/CastExpr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,117 @@

namespace facebook::velox::exec {

// Returns a view of the consecutive ASCII digits of 's' that start at index
// 'start'. Scanning stops at the first non-digit character or at index 'size'.
// The view is empty when there is no digit at 'start'.
//
// Digits are tested with a plain range comparison instead of std::isdigit:
// std::isdigit has undefined behavior for negative 'char' values (any byte
// >= 0x80 where 'char' is signed, e.g. UTF-8 input) and depends on the
// current locale.
std::string_view
detail::extractDigits(const char* s, size_t start, size_t size) {
  size_t pos = start;
  while (pos < size && s[pos] >= '0' && s[pos] <= '9') {
    ++pos;
  }
  return std::string_view(s + start, pos - start);
}

// Splits the 'size' characters at 's' into sign, whole digits, fractional
// digits and exponent, and stores them in 'out'. The digit views in 'out'
// point into 's'.
//
// Accepted form: [+-] digits [. digits] [(e|E) [+-] digits], with at least one
// whole or fractional digit. Any other character, including whitespace, is an
// error.
//
// Returns OK on success and a user error that describes the problem otherwise.
// A missing or out-of-range exponent is reported through the returned status
// rather than by an exception from the integer conversion.
Status detail::parseDecimalComponents(
    const char* s,
    size_t size,
    detail::DecimalComponents& out) {
  if (size == 0) {
    return Status::UserError("Input is empty.");
  }

  // ASCII-only digit test. std::isdigit has undefined behavior for negative
  // 'char' values, which occur for bytes >= 0x80 when 'char' is signed.
  const auto isAsciiDigit = [](char c) { return c >= '0' && c <= '9'; };

  size_t pos = 0;

  // Sign of the number.
  if (s[pos] == '-') {
    out.sign = -1;
    ++pos;
  } else if (s[pos] == '+') {
    out.sign = 1;
    ++pos;
  }

  // Extract the whole digits.
  out.wholeDigits = detail::extractDigits(s, pos, size);
  pos += out.wholeDigits.size();
  if (pos == size) {
    return out.wholeDigits.empty()
        ? Status::UserError("Extracted digits are empty.")
        : Status::OK();
  }

  // Optional dot (if given in fractional form).
  if (s[pos] == '.') {
    // Extract the fractional digits.
    ++pos;
    out.fractionalDigits = detail::extractDigits(s, pos, size);
    pos += out.fractionalDigits.size();
  }

  if (out.wholeDigits.empty() && out.fractionalDigits.empty()) {
    return Status::UserError("Extracted digits are empty.");
  }
  if (pos == size) {
    return Status::OK();
  }

  // Anything left must be an exponent.
  if (s[pos] != 'e' && s[pos] != 'E') {
    return Status::UserError(
        "Chars '{}' are invalid.", std::string(s + pos, size - pos));
  }
  ++pos;
  if (pos == size) {
    return Status::UserError("The exponent part is empty.");
  }

  const bool withSign = s[pos] == '+' || s[pos] == '-';
  if (withSign && pos == size - 1) {
    return Status::UserError("The exponent part only contains sign.");
  }
  // Make sure all chars after the sign are digits, as folly::tryTo allows
  // leading and trailing whitespaces.
  for (size_t i = pos + (withSign ? 1 : 0); i < size; ++i) {
    if (!isAsciiDigit(s[i])) {
      return Status::UserError(
          "Non-digit character '{}' is not allowed in the exponent part.",
          s[i]);
    }
  }

  // The only remaining failure is an exponent that does not fit into int32_t.
  const auto exponent =
      folly::tryTo<int32_t>(folly::StringPiece(s + pos, size - pos));
  if (exponent.hasError()) {
    return Status::UserError("The exponent part is out of range.");
  }
  out.exponent = exponent.value();
  return Status::OK();
}

// Computes the unscaled magnitude
//   wholeDigits * 10^(number of fractional digits) + fractionalDigits
// and stores it in 'out'. The sign and the exponent of 'decimalComponents' are
// not applied. 'out' is reset to zero first, so its incoming value does not
// matter.
//
// Returns a "Value too large." user error when a digit string cannot be
// converted to int128_t or when the combined value overflows int128_t.
Status detail::parseHugeInt(
    const DecimalComponents& decimalComponents,
    int128_t& out) {
  out = 0;

  // Parse the whole digits.
  if (!decimalComponents.wholeDigits.empty()) {
    const auto wholeValue = folly::tryTo<int128_t>(folly::StringPiece(
        decimalComponents.wholeDigits.data(),
        decimalComponents.wholeDigits.size()));
    if (wholeValue.hasError()) {
      return Status::UserError("Value too large.");
    }
    out = wholeValue.value();
  }

  const auto length = decimalComponents.fractionalDigits.size();
  if (length == 0) {
    return Status::OK();
  }

  // Make room for the fractional digits. A zero whole part needs no shift,
  // which also keeps a long run of fractional digits from indexing past the
  // powers-of-ten table.
  if (out != 0) {
    if (length > LongDecimalType::kMaxPrecision) {
      // A non-zero value times 10^length cannot fit into int128_t.
      return Status::UserError("Value too large.");
    }
    const bool overflow =
        __builtin_mul_overflow(out, DecimalUtil::kPowersOfTen[length], &out);
    if (overflow) {
      return Status::UserError("Value too large.");
    }
  }

  // Parse the fractional digits.
  const auto fractionValue = folly::tryTo<int128_t>(
      folly::StringPiece(decimalComponents.fractionalDigits.data(), length));
  if (fractionValue.hasError()) {
    return Status::UserError("Value too large.");
  }
  // The sum can exceed int128_t when the shifted whole part is close to the
  // maximum, so report it instead of only asserting in debug builds.
  if (__builtin_add_overflow(out, fractionValue.value(), &out)) {
    return Status::UserError("Value too large.");
  }
  return Status::OK();
}

VectorPtr CastExpr::castFromDate(
const SelectivityVector& rows,
const BaseVector& input,
Expand Down Expand Up @@ -483,6 +594,10 @@ VectorPtr CastExpr::applyDecimal(
}
[[fallthrough]];
}
case TypeKind::VARCHAR:
applyVarcharToDecimalCastKernel<toDecimalType>(
rows, input, context, toType, castResult);
break;
default:
VELOX_UNSUPPORTED(
"Cast from {} to {} is not supported",
Expand Down
Loading