facebookincubator · jinchengchenghh · Aug 28, 2023 · Jan 19, 2024 · mbasmanova · Aug 22, 2023
diff --git a/velox/docs/functions/presto/conversion.rst b/velox/docs/functions/presto/conversion.rst
@@ -149,7 +149,7 @@ supported conversions to/from JSON are listed in :doc:`json`.
      - Y
      -
      - Y
-     -
+     - Y
    * - timestamp
      -
      -
@@ -744,3 +744,43 @@ Invalid example
 
   SELECT cast(decimal '-1000.000' as decimal(6, 4)); -- Out of range
   SELECT cast(decimal '123456789' as decimal(9, 1)); -- Out of range
+
+From varchar
+^^^^^^^^^^^^
+
+Casting varchar to a decimal of a given precision and scale is allowed
+if the input value can be represented with that precision and scale. When the input
+has more fractional digits than the target scale, the fractional part is rounded. Casting an invalid input value throws.
+
+Valid example
+
+::
+
+  SELECT cast('9999999999.99' as decimal(12, 2)); -- decimal '9999999999.99'
+  SELECT cast('1.556' as decimal(12, 2)); -- decimal '1.56'
+  SELECT cast('1.554' as decimal(12, 2)); -- decimal '1.55'
+  SELECT cast('-1.554' as decimal(12, 2)); -- decimal '-1.55'
+  SELECT cast('+09' as decimal(12, 2)); -- decimal '9.00'
+  SELECT cast('9.' as decimal(12, 2)); -- decimal '9.00'
+  SELECT cast('.9' as decimal(12, 2)); -- decimal '0.90'
+  SELECT cast('3E+2' as decimal(12, 2)); -- decimal '300.00'
+  SELECT cast('3E+00002' as decimal(12, 2)); -- decimal '300.00'
+  SELECT cast('3e+2' as decimal(12, 2)); -- decimal '300.00'
+  SELECT cast('31.423e+2' as decimal(12, 2)); -- decimal '3142.30'
+  SELECT cast('1.2e-2' as decimal(12, 2)); -- decimal '0.01'
+  SELECT cast('1.2e-5' as decimal(12, 2)); -- decimal '0.00'
+  SELECT cast('0000.123' as decimal(12, 2)); -- decimal '0.12'
+  SELECT cast('.123000000' as decimal(12, 2)); -- decimal '0.12'
+
+Invalid example
+
+::
+
+  SELECT cast('1.23e67' as decimal(38, 0)); -- Value too large
+  SELECT cast('0.0446a' as decimal(9, 1)); -- Value is not a number
+  SELECT cast('' as decimal(9, 1)); -- Value is not a number
+  SELECT cast('23e-5d' as decimal(9, 1)); -- Value is not a number
+  SELECT cast('1.23 ' as decimal(38, 0)); -- Value is not a number
+  SELECT cast(' -3E+2' as decimal(12, 2)); -- Value is not a number
+  SELECT cast('-3E+2.1' as decimal(12, 2)); -- Value is not a number
+  SELECT cast('3E+' as decimal(12, 2)); -- Value is not a number
diff --git a/velox/docs/functions/spark/conversion.rst b/velox/docs/functions/spark/conversion.rst
@@ -162,3 +162,23 @@ Invalid examples
   SELECT cast('2012-Oct-23' as date); -- Invalid argument
   SELECT cast('2012/10/23' as date); -- Invalid argument
   SELECT cast('2012.10.23' as date); -- Invalid argument
+
+Cast to Decimal
+---------------
+
+From varchar
+^^^^^^^^^^^^
+
+Casting varchar to a decimal of a given precision and scale is allowed.
+The behavior is similar to Presto's, except that Spark allows leading and trailing whitespace in the input varchar.
+
+Valid examples
+
+::
+
+  SELECT cast(' 1.23' as decimal(38, 0)); -- 1
+  SELECT cast('1.23 ' as decimal(38, 0)); -- 1
+  SELECT cast('  1.23  ' as decimal(38, 0)); -- 1
+  SELECT cast(' -3E+2' as decimal(12, 2)); -- -300.00
+  SELECT cast('-3E+2 ' as decimal(12, 2)); -- -300.00
+  SELECT cast('  -3E+2  ' as decimal(12, 2)); -- -300.00
diff --git a/velox/expression/CastExpr-inl.h b/velox/expression/CastExpr-inl.h
@@ -114,6 +114,150 @@ StringView convertToStringView(
 
 } // namespace
 
+namespace detail {
+
+/// The pieces of a decimal literal after it has been split apart. Both digit
+/// views contain digits only: the sign, the decimal point and the exponent
+/// marker are never part of them.
+///
+/// Sample decompositions:
+/// | value         | wholeDigits | fractionalDigits | exponent | sign |
+/// | 9999999999.99 | 9999999999  | 99               | nullopt  | 1    |
+/// | 15            | 15          |                  | nullopt  | 1    |
+/// | 1.5           | 1           | 5                | nullopt  | 1    |
+/// | -1.5          | 1           | 5                | nullopt  | -1   |
+/// | 31.523e-2     | 31          | 523              | -2       | 1    |
+struct DecimalComponents {
+  /// Digits in front of the decimal point; empty for inputs such as ".9".
+  std::string_view wholeDigits{};
+  /// Digits behind the decimal point; empty when there are none.
+  std::string_view fractionalDigits{};
+  /// Power of ten given after 'e' / 'E'; unset when the input has no exponent.
+  std::optional<int32_t> exponent{};
+  /// -1 when the input starts with '-', otherwise 1.
+  int8_t sign{1};
+};
+
+/// Returns a view over the run of consecutive digit characters of 's' that
+/// begins at offset 'start'. The run ends at the first non-digit character or
+/// at offset 'size', whichever comes first, so the view is empty when there is
+/// no digit at 'start'. The view points into 's' and does not own the chars.
+std::string_view extractDigits(const char* s, size_t start, size_t size);
+
+/// Parse decimal components, including whole digits, fractional digits,
+/// exponent and sign, from the 'size' chars starting at 's', and store them in
+/// 'out'. The accepted layout is an optional '+' or '-', the whole digits, an
+/// optional '.' followed by the fractional digits, and an optional 'e' or 'E'
+/// followed by an optionally signed exponent. At least one whole or fractional
+/// digit is required. Returns error status if input chars do not represent a
+/// valid value; 'out' may be partially filled in that case.
+Status
+parseDecimalComponents(const char* s, size_t size, DecimalComponents& out);
+
+/// Parse huge int from decimal components. The whole part is scaled up by
+/// 10 ^ (number of fractional digits), and the fractional part is added to it,
+/// i.e. the digits are read as one integer with the decimal point removed.
+/// Neither the sign nor the exponent is applied. Returns error status if
+/// overflows.
+Status parseHugeInt(const DecimalComponents& decimalComponents, int128_t& out);
+
+/// Converts string view to decimal value of given precision and scale.
+/// Derives from Arrow function DecimalFromString. Arrow implementation:
+/// https://github.com/apache/arrow/blob/main/cpp/src/arrow/util/decimal.cc#L637.
+///
+/// The conversion runs in three steps:
+///  1. Split the input into DecimalComponents (sign, whole digits, fractional
+///     digits and optional exponent).
+///  2. Derive the scale of the parsed value from the exponent and the number
+///     of fractional digits, dropping the fractional digits that lie beyond
+///     'toScale' and remembering whether the dropped part rounds up.
+///  3. Assemble the unscaled 128-bit value and rescale it to
+///     ('toPrecision', 'toScale').
+///
+/// @param s Input chars. Surrounding whitespace is not accepted here.
+/// @param toPrecision Precision of the target decimal type.
+/// @param toScale Scale of the target decimal type.
+/// @param decimalValue Receives the unscaled result on success.
+/// @return OK on success. A user error starting with "Value is not a number."
+/// if the input is malformed, or "Value too large." if the value does not fit.
+template <typename T>
+Status toDecimalValue(
+    const StringView s,
+    int toPrecision,
+    int toScale,
+    T& decimalValue) {
+  DecimalComponents decimalComponents;
+  if (auto status =
+          parseDecimalComponents(s.data(), s.size(), decimalComponents);
+      !status.ok()) {
+    return Status::UserError("Value is not a number. " + status.message());
+  }
+
+  // Count number of significant digits (without leading zeros).
+  const size_t firstNonZero =
+      decimalComponents.wholeDigits.find_first_not_of('0');
+  size_t significantDigits = decimalComponents.fractionalDigits.size();
+  if (firstNonZero != std::string_view::npos) {
+    significantDigits += decimalComponents.wholeDigits.size() - firstNonZero;
+  }
+  int64_t parsedPrecision = static_cast<int64_t>(significantDigits);
+
+  // Scale arithmetic is done in 64 bits. The exponent may be any int32_t, so
+  // negating it or adding a digit count to it can overflow 32 bits.
+  int64_t parsedScale = 0;
+  bool roundUp = false;
+  const int64_t fractionSize =
+      static_cast<int64_t>(decimalComponents.fractionalDigits.size());
+  if (!decimalComponents.exponent.has_value()) {
+    if (fractionSize > toScale) {
+      // The first dropped digit decides the rounding.
+      if (decimalComponents.fractionalDigits[toScale] >= '5') {
+        roundUp = true;
+      }
+      parsedScale = toScale;
+      decimalComponents.fractionalDigits =
+          decimalComponents.fractionalDigits.substr(0, toScale);
+    } else {
+      parsedScale = fractionSize;
+    }
+  } else {
+    const int64_t exponent = decimalComponents.exponent.value();
+    parsedScale = fractionSize - exponent;
+    // Truncate the fractionalDigits.
+    if (parsedScale > toScale) {
+      if (-exponent >= toScale) {
+        // All fractional digits lie beyond 'toScale' and are dropped. The
+        // first of them decides the rounding only when it directly follows
+        // the target scale (-exponent == toScale). Otherwise the rounding
+        // position is inside the whole digits and is resolved by the rescale
+        // at the end; rounding here as well would round twice, e.g. turn
+        // '14.5e-3' at scale 2 into 0.02 instead of 0.01.
+        if (-exponent == toScale && fractionSize > 0 &&
+            decimalComponents.fractionalDigits[0] >= '5') {
+          roundUp = true;
+        }
+        decimalComponents.fractionalDigits = "";
+        parsedScale -= fractionSize;
+      } else {
+        // Keep only the fractional digits that still fit into 'toScale' once
+        // the exponent has been applied.
+        const int64_t reduceDigits = exponent + toScale;
+        if (fractionSize > reduceDigits &&
+            decimalComponents.fractionalDigits[reduceDigits] >= '5') {
+          roundUp = true;
+        }
+        const int64_t keptDigits = std::min(reduceDigits, fractionSize);
+        decimalComponents.fractionalDigits =
+            decimalComponents.fractionalDigits.substr(0, keptDigits);
+        parsedScale -= fractionSize - keptDigits;
+      }
+    }
+  }
+
+  int128_t out = 0;
+  if (auto status = parseHugeInt(decimalComponents, out); !status.ok()) {
+    return status;
+  }
+
+  if (roundUp) {
+    bool overflow = __builtin_add_overflow(out, 1, &out);
+    if (UNLIKELY(overflow)) {
+      return Status::UserError("Value too large.");
+    }
+  }
+  out *= decimalComponents.sign;
+
+  if (parsedScale - toScale > LongDecimalType::kMaxPrecision) {
+    // More than 38 digits would have to be dropped, which is more than a
+    // 128-bit integer holds, so the value rounds to zero. Answer directly:
+    // such a scale may not even fit the int passed to the rescale below.
+    decimalValue = 0;
+    return Status::OK();
+  }
+
+  if (parsedScale < 0) {
+    // A negative scale is not passed on (due to compatibility issues with
+    // external systems such as databases). The value is multiplied up to
+    // 'toScale' right away instead.
+    const int64_t scaleUp = toScale - parsedScale;
+    if (scaleUp > LongDecimalType::kMaxPrecision) {
+      return Status::UserError("Value too large.");
+    }
+
+    bool overflow =
+        __builtin_mul_overflow(out, DecimalUtil::kPowersOfTen[scaleUp], &out);
+    if (UNLIKELY(overflow)) {
+      return Status::UserError("Value too large.");
+    }
+    parsedPrecision -= parsedScale;
+    parsedScale = toScale;
+  }
+
+  // Clamp before narrowing so that very long inputs cannot wrap around.
+  const auto fromPrecision = static_cast<uint8_t>(std::min<int64_t>(
+      parsedPrecision, LongDecimalType::kMaxPrecision));
+  const auto status = DecimalUtil::rescaleWithRoundUp<int128_t, T>(
+      out,
+      fromPrecision,
+      static_cast<int32_t>(parsedScale),
+      toPrecision,
+      toScale,
+      decimalValue);
+  if (!status.ok()) {
+    return Status::UserError("Value too large.");
+  }
+  return status;
+}
+} // namespace detail
+
 template <bool adjustForTimeZone>
 void CastExpr::castTimestampToDate(
     const SelectivityVector& rows,
@@ -309,6 +453,37 @@ void CastExpr::applyIntToDecimalCastKernel(
       });
 }
 
+/// Casts the selected 'rows' of the varchar vector 'input' to the decimal type
+/// 'toType' and writes the values into 'result'. Each string first goes
+/// through hooks_->removeWhiteSpaces(). A row that cannot be converted is set
+/// to null when setNullInResultAtError() is true; otherwise a bad-cast error
+/// carrying the conversion message is recorded for it in 'context'.
+template <typename T>
+void CastExpr::applyVarcharToDecimalCastKernel(
+    const SelectivityVector& rows,
+    const BaseVector& input,
+    exec::EvalCtx& context,
+    const TypePtr& toType,
+    VectorPtr& result) {
+  const auto* strings = input.as<SimpleVector<StringView>>();
+  auto* rawResults = result->asUnchecked<FlatVector<T>>()->mutableRawValues();
+  const auto precisionAndScale = getDecimalPrecisionScale(*toType);
+
+  rows.applyToSelected([&](auto row) {
+    T parsed;
+    const auto status = detail::toDecimalValue<T>(
+        hooks_->removeWhiteSpaces(strings->valueAt(row)),
+        precisionAndScale.first,
+        precisionAndScale.second,
+        parsed);
+    if (status.ok()) {
+      rawResults[row] = parsed;
+      return;
+    }
+
+    // Conversion failed: null out the row or report the error.
+    if (setNullInResultAtError()) {
+      result->setNull(row, true);
+      return;
+    }
+    context.setVeloxExceptionError(
+        row, makeBadCastException(toType, input, row, status.message()));
+  });
+}
+
 template <typename FromNativeType, TypeKind ToKind>
 VectorPtr CastExpr::applyDecimalToFloatCast(
     const SelectivityVector& rows,

diff --git a/velox/expression/CastExpr.cpp b/velox/expression/CastExpr.cpp
@@ -33,6 +33,117 @@
 
 namespace facebook::velox::exec {
 
+// Returns a view over the consecutive digit characters of 's' that start at
+// offset 'start' and end before the first non-digit character or before
+// offset 'size'. The view is empty if there is no digit at 'start'.
+std::string_view
+detail::extractDigits(const char* s, size_t start, size_t size) {
+  size_t pos = start;
+  for (; pos < size; ++pos) {
+    // std::isdigit is undefined for negative arguments, which plain char
+    // yields for bytes >= 0x80 (e.g. UTF-8 input) where char is signed.
+    if (!std::isdigit(static_cast<unsigned char>(s[pos]))) {
+      break;
+    }
+  }
+  return std::string_view(s + start, pos - start);
+}
+
+// Splits the 'size' chars at 's' into sign, whole digits, fractional digits
+// and exponent, and stores them in 'out'. The accepted layout is
+//   [+-] digits* [. digits*] [(e|E) [+-] digits+]
+// with at least one whole or fractional digit. Whitespace is not accepted.
+// Every malformed input is reported through the returned status; this
+// function does not throw for bad input.
+Status detail::parseDecimalComponents(
+    const char* s,
+    size_t size,
+    detail::DecimalComponents& out) {
+  if (size == 0) {
+    return Status::UserError("Input is empty.");
+  }
+
+  size_t pos = 0;
+
+  // Sign of the number.
+  if (s[pos] == '-') {
+    out.sign = -1;
+    ++pos;
+  } else if (s[pos] == '+') {
+    out.sign = 1;
+    ++pos;
+  }
+
+  // Extract the whole digits.
+  out.wholeDigits = detail::extractDigits(s, pos, size);
+  pos += out.wholeDigits.size();
+  if (pos == size) {
+    return out.wholeDigits.empty()
+        ? Status::UserError("Extracted digits are empty.")
+        : Status::OK();
+  }
+
+  // Optional dot (if given in fractional form).
+  if (s[pos] == '.') {
+    // Extract the fractional digits.
+    ++pos;
+    out.fractionalDigits = detail::extractDigits(s, pos, size);
+    pos += out.fractionalDigits.size();
+  }
+
+  if (out.wholeDigits.empty() && out.fractionalDigits.empty()) {
+    return Status::UserError("Extracted digits are empty.");
+  }
+  if (pos == size) {
+    return Status::OK();
+  }
+
+  // Optional exponent.
+  if (s[pos] == 'e' || s[pos] == 'E') {
+    ++pos;
+    const bool withSign = pos < size && (s[pos] == '+' || s[pos] == '-');
+    const size_t firstDigit = pos + (withSign ? 1 : 0);
+    if (firstDigit == size) {
+      // Nothing follows the marker (e.g. "3E") or only a sign does ("3E+").
+      return withSign
+          ? Status::UserError("The exponent part only contains sign.")
+          : Status::UserError("The exponent part is empty.");
+    }
+    // Make sure all chars after the sign are digits, as the folly conversion
+    // below allows leading and trailing whitespaces.
+    for (size_t i = firstDigit; i < size; ++i) {
+      // Cast first: std::isdigit is undefined for negative char values.
+      if (!std::isdigit(static_cast<unsigned char>(s[i]))) {
+        return Status::UserError(
+            "Non-digit character '{}' is not allowed in the exponent part.",
+            s[i]);
+      }
+    }
+    // tryTo reports an exponent that does not fit into int32_t as an error
+    // value instead of throwing.
+    const auto exponent =
+        folly::tryTo<int32_t>(folly::StringPiece(s + pos, size - pos));
+    if (exponent.hasError()) {
+      return Status::UserError("The exponent part is out of range.");
+    }
+    out.exponent = exponent.value();
+    return Status::OK();
+  }
+
+  // Anything else after the digits is invalid.
+  return Status::UserError(
+      "Chars '{}' are invalid.", std::string(s + pos, size - pos));
+}
+
+// Reads the whole and fractional digits of 'decimalComponents' as a single
+// integer with the decimal point removed: whole * 10 ^ (number of fractional
+// digits) + fraction. Sign and exponent are not applied. 'out' is not reset
+// here, so callers pass it in as zero. Returns "Value too large." if the
+// result does not fit into 128 bits.
+Status detail::parseHugeInt(
+    const DecimalComponents& decimalComponents,
+    int128_t& out) {
+  // Parse the whole digits.
+  if (decimalComponents.wholeDigits.size() > 0) {
+    const auto tryValue = folly::tryTo<int128_t>(folly::StringPiece(
+        decimalComponents.wholeDigits.data(),
+        decimalComponents.wholeDigits.size()));
+    if (tryValue.hasError()) {
+      return Status::UserError("Value too large.");
+    }
+    out = tryValue.value();
+  }
+
+  // Parse the fractional digits.
+  if (decimalComponents.fractionalDigits.size() > 0) {
+    const auto length = decimalComponents.fractionalDigits.size();
+    // Shifting zero is a no-op, so only a non-zero whole part is scaled.
+    if (out != 0) {
+      // Guard the table lookup: elsewhere in this change kPowersOfTen is
+      // never indexed above kMaxPrecision. A non-zero value times 10^39 or
+      // more cannot fit into 128 bits anyway.
+      if (length > LongDecimalType::kMaxPrecision) {
+        return Status::UserError("Value too large.");
+      }
+      if (__builtin_mul_overflow(
+              out, DecimalUtil::kPowersOfTen[length], &out)) {
+        return Status::UserError("Value too large.");
+      }
+    }
+    const auto tryValue = folly::tryTo<int128_t>(
+        folly::StringPiece(decimalComponents.fractionalDigits.data(), length));
+    if (tryValue.hasError()) {
+      return Status::UserError("Value too large.");
+    }
+    // The sum can overflow even if the product did not, e.g. for "1."
+    // followed by 38 nines, so it is checked in release builds as well.
+    if (__builtin_add_overflow(out, tryValue.value(), &out)) {
+      return Status::UserError("Value too large.");
+    }
+  }
+  return Status::OK();
+}
+
 VectorPtr CastExpr::castFromDate(
     const SelectivityVector& rows,
     const BaseVector& input,
@@ -483,6 +594,10 @@ VectorPtr CastExpr::applyDecimal(
       }
       [[fallthrough]];
     }
+    case TypeKind::VARCHAR:
+      applyVarcharToDecimalCastKernel<toDecimalType>(
+          rows, input, context, toType, castResult);
+      break;
     default:
       VELOX_UNSUPPORTED(
           "Cast from {} to {} is not supported",