From 1ab4ac97b4fb1685085d1a39912b16207aa96a43 Mon Sep 17 00:00:00 2001
From: PHILO-HE <feilong.he@intel.com>
Date: Wed, 17 Nov 2021 13:58:03 +0800
Subject: [PATCH] Support "return null" if there is an exception in casting
 string to int, bigint, float or double (#44)

* Initial commit

* Add global mapping for new funcs

* Add to header

* Fix args issue

* Check in_valid
---
 cpp/src/gandiva/function_registry_string.cc |  21 ++++
 cpp/src/gandiva/gdv_function_stubs.cc       | 106 ++++++++++++++++++++
 cpp/src/gandiva/gdv_function_stubs.h        |  12 +++
 3 files changed, 139 insertions(+)
diff --git a/cpp/src/gandiva/function_registry_string.cc b/cpp/src/gandiva/function_registry_string.cc
index eae6fc7b51a0b..b4a3157520ddb 100644
--- a/cpp/src/gandiva/function_registry_string.cc
+++ b/cpp/src/gandiva/function_registry_string.cc
@@ -134,18 +134,39 @@ std::vector<NativeFunction> GetStringFunctionRegistry() {
       NativeFunction("castINT", {}, DataTypeVector{utf8()}, int32(), kResultNullIfNull,
                      "gdv_fn_castINT_utf8",
                      NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors),
+      
+      // return null if fail to cast
+      NativeFunction("castINTOrNull", {}, DataTypeVector{utf8()}, int32(), kResultNullInternal,
+                     "gdv_fn_castINT_or_null_utf8",
+                     NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors),
 
       NativeFunction("castBIGINT", {}, DataTypeVector{utf8()}, int64(), kResultNullIfNull,
                      "gdv_fn_castBIGINT_utf8",
                      NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors),
+      
+      // return null if fail to cast
+      NativeFunction("castBIGINTOrNull", {}, DataTypeVector{utf8()}, int64(), kResultNullInternal,
+                     "gdv_fn_castBIGINT_or_null_utf8",
+                     NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors),
 
       NativeFunction("castFLOAT4", {}, DataTypeVector{utf8()}, float32(),
                      kResultNullIfNull, "gdv_fn_castFLOAT4_utf8",
                      NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors),
 
+      // return null if fail to cast
+      NativeFunction("castFLOAT4OrNull", {}, DataTypeVector{utf8()}, float32(),
+                     kResultNullInternal, "gdv_fn_castFLOAT4_or_null_utf8",
+                     NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors),
+
       NativeFunction("castFLOAT8", {}, DataTypeVector{utf8()}, float64(),
                      kResultNullIfNull, "gdv_fn_castFLOAT8_utf8",
                      NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors),
+      
+      // return null if fail to cast
+      NativeFunction("castFLOAT8OrNull", {}, DataTypeVector{utf8()}, float64(),
+                     kResultNullInternal, "gdv_fn_castFLOAT8_or_null_utf8",
+                     NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors),
+
       NativeFunction("castVARCHAR", {}, DataTypeVector{int8(), int64()}, utf8(),
                      kResultNullIfNull, "castVARCHAR_int8_int64",
                      NativeFunction::kNeedsContext),
diff --git a/cpp/src/gandiva/gdv_function_stubs.cc b/cpp/src/gandiva/gdv_function_stubs.cc
index 0886acb10e93d..ed31c55fd5c4c 100644
--- a/cpp/src/gandiva/gdv_function_stubs.cc
+++ b/cpp/src/gandiva/gdv_function_stubs.cc
@@ -498,6 +498,40 @@ CAST_NUMERIC_FROM_VARBINARY(double, arrow::DoubleType, FLOAT8)
 
 #undef CAST_NUMERIC_STRING
 
+#define CAST_NUMERIC_OR_NULL_FROM_STRING(OUT_TYPE, ARROW_TYPE, TYPE_NAME)            \
+  GANDIVA_EXPORT                                                                     \
+  OUT_TYPE gdv_fn_cast##TYPE_NAME##_or_null_utf8(int64_t context, const char* data,  \
+                                    int32_t len, bool in_valid, bool* out_valid) {   \
+    OUT_TYPE val = 0;                                                                \
+    *out_valid = true;                                                               \
+    if (!in_valid) {                                                                 \
+      *out_valid = false;                                                            \
+      return val;                                                                    \
+    }                                                                                \
+    /* trim leading and trailing spaces */                                           \
+    int32_t trimmed_len;                                                             \
+    int32_t start = 0, end = len - 1;                                                \
+    while (start <= end && data[start] == ' ') {                                     \
+      ++start;                                                                       \
+    }                                                                                \
+    while (end >= start && data[end] == ' ') {                                       \
+      --end;                                                                         \
+    }                                                                                \
+    trimmed_len = end - start + 1;                                                   \
+    const char* trimmed_data = data + start;                                         \
+    if (!arrow::internal::ParseValue<ARROW_TYPE>(trimmed_data, trimmed_len, &val)) { \
+      *out_valid = false;                                                            \
+    }                                                                                \
+    return val;                                                                      \
+  }
+
+CAST_NUMERIC_OR_NULL_FROM_STRING(int32_t, arrow::Int32Type, INT)
+CAST_NUMERIC_OR_NULL_FROM_STRING(int64_t, arrow::Int64Type, BIGINT)
+CAST_NUMERIC_OR_NULL_FROM_STRING(float, arrow::FloatType, FLOAT4)
+CAST_NUMERIC_OR_NULL_FROM_STRING(double, arrow::DoubleType, FLOAT8)
+
+#undef CAST_NUMERIC_OR_NULL_FROM_STRING
+
 #define GDV_FN_CAST_VARLEN_TYPE_FROM_TYPE(IN_TYPE, CAST_NAME, ARROW_TYPE)         \
   GANDIVA_EXPORT                                                                  \
   const char* gdv_fn_cast##CAST_NAME##_##IN_TYPE##_int64(                         \
@@ -534,6 +568,42 @@ CAST_NUMERIC_FROM_VARBINARY(double, arrow::DoubleType, FLOAT8)
     return ret;                                                                   \
   }
 
+#define GDV_FN_CAST_VARCHAR_INTEGER(IN_TYPE, ARROW_TYPE)                                 \
+  GANDIVA_EXPORT                                                                         \
+  const char* gdv_fn_castVARCHAR_##IN_TYPE##_int64(int64_t context, gdv_##IN_TYPE value, \
+                                                   int64_t len, int32_t * out_len) {     \
+    if (len < 0) {                                                                       \
+      gdv_fn_context_set_error_msg(context, "Buffer length can not be negative");        \
+      *out_len = 0;                                                                      \
+      return "";                                                                         \
+    }                                                                                    \
+    if (len == 0) {                                                                      \
+      *out_len = 0;                                                                      \
+      return "";                                                                         \
+    }                                                                                    \
+    arrow::internal::StringFormatter<arrow::ARROW_TYPE> formatter;                       \
+    char* ret = reinterpret_cast<char*>(                                                 \
+        gdv_fn_context_arena_malloc(context, static_cast<int32_t>(len)));                \
+    if (ret == nullptr) {                                                                \
+      gdv_fn_context_set_error_msg(context, "Could not allocate memory");                \
+      *out_len = 0;                                                                      \
+      return "";                                                                         \
+    }                                                                                    \
+    arrow::Status status = formatter(value, [&](arrow::util::string_view v) {            \
+      int64_t size = static_cast<int64_t>(v.size());                                     \
+      *out_len = static_cast<int32_t>(len < size ? len : size);                          \
+      memcpy(ret, v.data(), *out_len);                                                   \
+      return arrow::Status::OK();                                                        \
+    });                                                                                  \
+    if (!status.ok()) {                                                                  \
+      std::string err = "Could not cast " + std::to_string(value) + " to string";        \
+      gdv_fn_context_set_error_msg(context, err.c_str());                                \
+      *out_len = 0;                                                                      \
+      return "";                                                                         \
+    }                                                                                    \
+    return ret;                                                                          \
+  }
+
 #define GDV_FN_CAST_VARLEN_TYPE_FROM_REAL(IN_TYPE, CAST_NAME, ARROW_TYPE)         \
   GANDIVA_EXPORT                                                                  \
   const char* gdv_fn_cast##CAST_NAME##_##IN_TYPE##_int64(                         \
@@ -1379,6 +1449,15 @@ void ExportedStubFunctions::AddMappings(Engine* engine) const {
   engine->AddGlobalMappingForFunc("gdv_fn_castINT_utf8", types->i32_type(), args,
                                   reinterpret_cast<void*>(gdv_fn_castINT_utf8));
 
+  args = {types->i64_type(),     // int64_t context_ptr
+          types->i8_ptr_type(),  // const char* data
+          types->i32_type(),    // int32_t lenr
+          types->i1_type(),    // bool in2_validity
+          types->ptr_type(types->i8_type())};  // bool* out_valid
+
+  engine->AddGlobalMappingForFunc("gdv_fn_castINT_or_null_utf8", types->i32_type(), args,
+                                  reinterpret_cast<void*>(gdv_fn_castINT_or_null_utf8));
+
   args = {types->i64_type(),     // int64_t context_ptr
           types->i8_ptr_type(),  // const char* data
           types->i32_type()};    // int32_t lenr
@@ -1386,6 +1465,15 @@ void ExportedStubFunctions::AddMappings(Engine* engine) const {
   engine->AddGlobalMappingForFunc("gdv_fn_castBIGINT_utf8", types->i64_type(), args,
                                   reinterpret_cast<void*>(gdv_fn_castBIGINT_utf8));
 
+  args = {types->i64_type(),     // int64_t context_ptr
+          types->i8_ptr_type(),  // const char* data
+          types->i32_type(),    // int32_t lenr
+          types->i1_type(),    // bool in2_validity
+          types->ptr_type(types->i8_type())};  // bool* out_valid
+
+  engine->AddGlobalMappingForFunc("gdv_fn_castBIGINT_or_null_utf8", types->i64_type(), args,
+                                  reinterpret_cast<void*>(gdv_fn_castBIGINT_or_null_utf8));
+
   args = {types->i64_type(),     // int64_t context_ptr
           types->i8_ptr_type(),  // const char* data
           types->i32_type()};    // int32_t lenr
@@ -1393,6 +1481,15 @@ void ExportedStubFunctions::AddMappings(Engine* engine) const {
   engine->AddGlobalMappingForFunc("gdv_fn_castFLOAT4_utf8", types->float_type(), args,
                                   reinterpret_cast<void*>(gdv_fn_castFLOAT4_utf8));
 
+  args = {types->i64_type(),     // int64_t context_ptr
+          types->i8_ptr_type(),  // const char* data
+          types->i32_type(),    // int32_t lenr
+          types->i1_type(),    // bool in2_validity
+          types->ptr_type(types->i8_type())};  // bool* out_valid
+
+  engine->AddGlobalMappingForFunc("gdv_fn_castFLOAT4_or_null_utf8", types->float_type(), args,
+                                  reinterpret_cast<void*>(gdv_fn_castFLOAT4_or_null_utf8));
+
   args = {types->i64_type(),     // int64_t context_ptr
           types->i8_ptr_type(),  // const char* data
           types->i32_type()};    // int32_t lenr
@@ -1400,6 +1497,15 @@ void ExportedStubFunctions::AddMappings(Engine* engine) const {
   engine->AddGlobalMappingForFunc("gdv_fn_castFLOAT8_utf8", types->double_type(), args,
                                   reinterpret_cast<void*>(gdv_fn_castFLOAT8_utf8));
 
+  args = {types->i64_type(),     // int64_t context_ptr
+          types->i8_ptr_type(),  // const char* data
+          types->i32_type(),    // int32_t lenr
+          types->i1_type(),    // bool in2_validity
+          types->ptr_type(types->i8_type())};  // bool* out_valid
+
+  engine->AddGlobalMappingForFunc("gdv_fn_castFLOAT8_or_null_utf8", types->double_type(), args,
+                                  reinterpret_cast<void*>(gdv_fn_castFLOAT8_or_null_utf8));
+
   // gdv_fn_castVARCHAR_int32_int64
   args = {types->i64_type(),       // int64_t execution_context
           types->i32_type(),       // int32_t value
diff --git a/cpp/src/gandiva/gdv_function_stubs.h b/cpp/src/gandiva/gdv_function_stubs.h
index 5d52fb8fe4083..e652e5c5e97f2 100644
--- a/cpp/src/gandiva/gdv_function_stubs.h
+++ b/cpp/src/gandiva/gdv_function_stubs.h
@@ -131,15 +131,27 @@ char* gdv_fn_dec_to_string(int64_t context, int64_t x_high, uint64_t x_low,
 GANDIVA_EXPORT
 int32_t gdv_fn_castINT_utf8(int64_t context, const char* data, int32_t data_len);
 
+GANDIVA_EXPORT
+int32_t gdv_fn_castINT_or_null_utf8(int64_t context, const char* data, int32_t data_len, bool in_valid, bool* out_valid);
+
 GANDIVA_EXPORT
 int64_t gdv_fn_castBIGINT_utf8(int64_t context, const char* data, int32_t data_len);
 
+GANDIVA_EXPORT
+int64_t gdv_fn_castBIGINT_or_null_utf8(int64_t context, const char* data, int32_t data_len, bool in_valid, bool* out_valid);
+
 GANDIVA_EXPORT
 float gdv_fn_castFLOAT4_utf8(int64_t context, const char* data, int32_t data_len);
 
+GANDIVA_EXPORT
+float gdv_fn_castFLOAT4_or_null_utf8(int64_t context, const char* data, int32_t data_len, bool in_valid, bool* out_valid);
+
 GANDIVA_EXPORT
 double gdv_fn_castFLOAT8_utf8(int64_t context, const char* data, int32_t data_len);
 
+GANDIVA_EXPORT
+double gdv_fn_castFLOAT8_or_null_utf8(int64_t context, const char* data, int32_t data_len, bool in_valid, bool* out_valid);
+
 GANDIVA_EXPORT
 const char* gdv_fn_castVARCHAR_int32_int64(int64_t context, int32_t value, int64_t len,
                                            int32_t* out_len);