Skip to content

Commit 00fa2d9

Browse files
qidayedataroaring
authored andcommitted
[fix](ES Catalog)Support parse single value for array column (#40614)
Follow up #39104, when the field has one value and we map it as array type in Doris, we parse the single value to a single element array to make them queryable. close #40406
1 parent 9c18947 commit 00fa2d9

File tree

8 files changed

+264
-165
lines changed

8 files changed

+264
-165
lines changed

be/src/exec/es/es_scroll_parser.cpp

+154-119
Original file line numberDiff line numberDiff line change
@@ -418,6 +418,159 @@ Status insert_int_value(const rapidjson::Value& col, PrimitiveType type,
418418
return parse_and_insert_data(col);
419419
}
420420

421+
template <typename T>
422+
Status handle_value(const rapidjson::Value& col, PrimitiveType sub_type, bool pure_doc_value,
423+
T& val) {
424+
RETURN_IF_ERROR(get_int_value<T>(col, sub_type, &val, pure_doc_value));
425+
return Status::OK();
426+
}
427+
428+
template <>
429+
Status handle_value<float>(const rapidjson::Value& col, PrimitiveType sub_type, bool pure_doc_value,
430+
float& val) {
431+
RETURN_IF_ERROR(get_float_value<float>(col, sub_type, &val, pure_doc_value));
432+
return Status::OK();
433+
}
434+
435+
template <>
436+
Status handle_value<double>(const rapidjson::Value& col, PrimitiveType sub_type,
437+
bool pure_doc_value, double& val) {
438+
RETURN_IF_ERROR(get_float_value<double>(col, sub_type, &val, pure_doc_value));
439+
return Status::OK();
440+
}
441+
442+
template <>
443+
Status handle_value<std::string>(const rapidjson::Value& col, PrimitiveType sub_type,
444+
bool pure_doc_value, std::string& val) {
445+
RETURN_ERROR_IF_COL_IS_ARRAY(col, sub_type, true);
446+
if (!col.IsString()) {
447+
val = json_value_to_string(col);
448+
} else {
449+
val = col.GetString();
450+
}
451+
return Status::OK();
452+
}
453+
454+
template <>
455+
Status handle_value<bool>(const rapidjson::Value& col, PrimitiveType sub_type, bool pure_doc_value,
456+
bool& val) {
457+
if (col.IsBool()) {
458+
val = col.GetBool();
459+
return Status::OK();
460+
}
461+
462+
if (col.IsNumber()) {
463+
val = col.GetInt();
464+
return Status::OK();
465+
}
466+
467+
bool is_nested_str = false;
468+
if (pure_doc_value && col.IsArray() && !col.Empty() && col[0].IsBool()) {
469+
val = col[0].GetBool();
470+
return Status::OK();
471+
} else if (pure_doc_value && col.IsArray() && !col.Empty() && col[0].IsString()) {
472+
is_nested_str = true;
473+
} else if (pure_doc_value && col.IsArray()) {
474+
return Status::InternalError(ERROR_INVALID_COL_DATA, "BOOLEAN");
475+
}
476+
477+
const rapidjson::Value& str_col = is_nested_str ? col[0] : col;
478+
const std::string& str_val = str_col.GetString();
479+
size_t val_size = str_col.GetStringLength();
480+
StringParser::ParseResult result;
481+
val = StringParser::string_to_bool(str_val.c_str(), val_size, &result);
482+
RETURN_ERROR_IF_PARSING_FAILED(result, str_col, sub_type);
483+
return Status::OK();
484+
}
485+
486+
template <typename T>
487+
Status process_single_column(const rapidjson::Value& col, PrimitiveType sub_type,
488+
bool pure_doc_value, vectorized::Array& array) {
489+
T val;
490+
RETURN_IF_ERROR(handle_value<T>(col, sub_type, pure_doc_value, val));
491+
array.push_back(val);
492+
return Status::OK();
493+
}
494+
495+
template <typename T>
496+
Status process_column_array(const rapidjson::Value& col, PrimitiveType sub_type,
497+
bool pure_doc_value, vectorized::Array& array) {
498+
for (const auto& sub_col : col.GetArray()) {
499+
RETURN_IF_ERROR(process_single_column<T>(sub_col, sub_type, pure_doc_value, array));
500+
}
501+
return Status::OK();
502+
}
503+
504+
template <typename T>
505+
Status process_column(const rapidjson::Value& col, PrimitiveType sub_type, bool pure_doc_value,
506+
vectorized::Array& array) {
507+
if (!col.IsArray()) {
508+
return process_single_column<T>(col, sub_type, pure_doc_value, array);
509+
} else {
510+
return process_column_array<T>(col, sub_type, pure_doc_value, array);
511+
}
512+
}
513+
514+
template <typename DateType, typename RT>
515+
Status process_date_column(const rapidjson::Value& col, PrimitiveType sub_type, bool pure_doc_value,
516+
vectorized::Array& array, const cctz::time_zone& time_zone) {
517+
if (!col.IsArray()) {
518+
RT data;
519+
RETURN_IF_ERROR(
520+
(get_date_int<DateType, RT>(col, sub_type, pure_doc_value, &data, time_zone)));
521+
array.push_back(data);
522+
} else {
523+
for (const auto& sub_col : col.GetArray()) {
524+
RT data;
525+
RETURN_IF_ERROR((get_date_int<DateType, RT>(sub_col, sub_type, pure_doc_value, &data,
526+
time_zone)));
527+
array.push_back(data);
528+
}
529+
}
530+
return Status::OK();
531+
}
532+
533+
Status ScrollParser::parse_column(const rapidjson::Value& col, PrimitiveType sub_type,
534+
bool pure_doc_value, vectorized::Array& array,
535+
const cctz::time_zone& time_zone) {
536+
switch (sub_type) {
537+
case TYPE_CHAR:
538+
case TYPE_VARCHAR:
539+
case TYPE_STRING:
540+
return process_column<std::string>(col, sub_type, pure_doc_value, array);
541+
case TYPE_TINYINT:
542+
return process_column<int8_t>(col, sub_type, pure_doc_value, array);
543+
case TYPE_SMALLINT:
544+
return process_column<int16_t>(col, sub_type, pure_doc_value, array);
545+
case TYPE_INT:
546+
return process_column<int32>(col, sub_type, pure_doc_value, array);
547+
case TYPE_BIGINT:
548+
return process_column<int64_t>(col, sub_type, pure_doc_value, array);
549+
case TYPE_LARGEINT:
550+
return process_column<__int128>(col, sub_type, pure_doc_value, array);
551+
case TYPE_FLOAT:
552+
return process_column<float>(col, sub_type, pure_doc_value, array);
553+
case TYPE_DOUBLE:
554+
return process_column<double>(col, sub_type, pure_doc_value, array);
555+
case TYPE_BOOLEAN:
556+
return process_column<bool>(col, sub_type, pure_doc_value, array);
557+
// date/datetime v2 is the default type for catalog table,
558+
// see https://github.com/apache/doris/pull/16304
559+
// No need to support date and datetime types.
560+
case TYPE_DATEV2: {
561+
return process_date_column<DateV2Value<DateV2ValueType>, uint32_t>(
562+
col, sub_type, pure_doc_value, array, time_zone);
563+
}
564+
case TYPE_DATETIMEV2: {
565+
return process_date_column<DateV2Value<DateTimeV2ValueType>, uint64_t>(
566+
col, sub_type, pure_doc_value, array, time_zone);
567+
}
568+
default:
569+
LOG(ERROR) << "Do not support Array type: " << sub_type;
570+
return Status::InternalError("Unsupported type");
571+
}
572+
}
573+
421574
ScrollParser::ScrollParser(bool doc_value_mode) : _size(0), _line_index(0) {}
422575

423576
ScrollParser::~ScrollParser() = default;
@@ -687,125 +840,7 @@ Status ScrollParser::fill_columns(const TupleDescriptor* tuple_desc,
687840
case TYPE_ARRAY: {
688841
vectorized::Array array;
689842
const auto& sub_type = tuple_desc->slots()[i]->type().children[0].type;
690-
RETURN_ERROR_IF_COL_IS_ARRAY(col, type, false);
691-
for (const auto& sub_col : col.GetArray()) {
692-
switch (sub_type) {
693-
case TYPE_CHAR:
694-
case TYPE_VARCHAR:
695-
case TYPE_STRING: {
696-
std::string val;
697-
RETURN_ERROR_IF_COL_IS_ARRAY(sub_col, sub_type, true);
698-
if (!sub_col.IsString()) {
699-
val = json_value_to_string(sub_col);
700-
} else {
701-
val = sub_col.GetString();
702-
}
703-
array.push_back(val);
704-
break;
705-
}
706-
case TYPE_TINYINT: {
707-
int8_t val;
708-
RETURN_IF_ERROR(get_int_value<int8_t>(sub_col, sub_type, &val, pure_doc_value));
709-
array.push_back(val);
710-
break;
711-
}
712-
case TYPE_SMALLINT: {
713-
int16_t val;
714-
RETURN_IF_ERROR(
715-
get_int_value<int16_t>(sub_col, sub_type, &val, pure_doc_value));
716-
array.push_back(val);
717-
break;
718-
}
719-
case TYPE_INT: {
720-
int32 val;
721-
RETURN_IF_ERROR(get_int_value<int32>(sub_col, sub_type, &val, pure_doc_value));
722-
array.push_back(val);
723-
break;
724-
}
725-
case TYPE_BIGINT: {
726-
int64_t val;
727-
RETURN_IF_ERROR(
728-
get_int_value<int64_t>(sub_col, sub_type, &val, pure_doc_value));
729-
array.push_back(val);
730-
break;
731-
}
732-
case TYPE_LARGEINT: {
733-
__int128 val;
734-
RETURN_IF_ERROR(
735-
get_int_value<__int128>(sub_col, sub_type, &val, pure_doc_value));
736-
array.push_back(val);
737-
break;
738-
}
739-
case TYPE_FLOAT: {
740-
float val {};
741-
RETURN_IF_ERROR(
742-
get_float_value<float>(sub_col, sub_type, &val, pure_doc_value));
743-
array.push_back(val);
744-
break;
745-
}
746-
case TYPE_DOUBLE: {
747-
double val {};
748-
RETURN_IF_ERROR(
749-
get_float_value<double>(sub_col, sub_type, &val, pure_doc_value));
750-
array.push_back(val);
751-
break;
752-
}
753-
case TYPE_BOOLEAN: {
754-
if (sub_col.IsBool()) {
755-
array.push_back(sub_col.GetBool());
756-
break;
757-
}
758-
759-
if (sub_col.IsNumber()) {
760-
array.push_back(sub_col.GetInt());
761-
break;
762-
}
763-
764-
bool is_nested_str = false;
765-
if (pure_doc_value && sub_col.IsArray() && !sub_col.Empty() &&
766-
sub_col[0].IsBool()) {
767-
array.push_back(sub_col[0].GetBool());
768-
break;
769-
} else if (pure_doc_value && sub_col.IsArray() && !sub_col.Empty() &&
770-
sub_col[0].IsString()) {
771-
is_nested_str = true;
772-
} else if (pure_doc_value && sub_col.IsArray()) {
773-
return Status::InternalError(ERROR_INVALID_COL_DATA, "BOOLEAN");
774-
}
775-
776-
const rapidjson::Value& str_col = is_nested_str ? sub_col[0] : sub_col;
777-
778-
const std::string& val = str_col.GetString();
779-
size_t val_size = str_col.GetStringLength();
780-
StringParser::ParseResult result;
781-
bool b = StringParser::string_to_bool(val.c_str(), val_size, &result);
782-
RETURN_ERROR_IF_PARSING_FAILED(result, str_col, type);
783-
array.push_back(b);
784-
break;
785-
}
786-
// date/datetime v2 is the default type for catalog table,
787-
// see https://github.com/apache/doris/pull/16304
788-
// No need to support date and datetime types.
789-
case TYPE_DATEV2: {
790-
uint32_t data;
791-
RETURN_IF_ERROR((get_date_int<DateV2Value<DateV2ValueType>, uint32_t>(
792-
sub_col, sub_type, pure_doc_value, &data, time_zone)));
793-
array.push_back(data);
794-
break;
795-
}
796-
case TYPE_DATETIMEV2: {
797-
uint64_t data;
798-
RETURN_IF_ERROR((get_date_int<DateV2Value<DateTimeV2ValueType>, uint64_t>(
799-
sub_col, sub_type, pure_doc_value, &data, time_zone)));
800-
array.push_back(data);
801-
break;
802-
}
803-
default: {
804-
LOG(ERROR) << "Do not support Array type: " << sub_type;
805-
break;
806-
}
807-
}
808-
}
843+
RETURN_IF_ERROR(parse_column(col, sub_type, pure_doc_value, array, time_zone));
809844
col_ptr->insert(array);
810845
break;
811846
}

be/src/exec/es/es_scroll_parser.h

+2
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,8 @@ class ScrollParser {
4747
int get_size() const;
4848

4949
private:
50+
Status parse_column(const rapidjson::Value& col, PrimitiveType sub_type, bool pure_doc_value,
51+
vectorized::Array& array, const cctz::time_zone& time_zone);
5052
std::string _scroll_id;
5153
int _size;
5254
rapidjson::SizeType _line_index;
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
1-
{"name": "Andy", "sports": "soccer"}
2-
{"name": "Betty", "sports": "pingpong ball"}
3-
{"name": "Cindy", "sports": "武术"}
4-
{"name": "David", "sports": ["volleyball"]}
5-
{"name": "Emily", "sports": ["baseball", "golf", "hockey"]}
6-
{"name": "Frank", "sports": ["rugby", "cricket", "boxing"]}
7-
{"name": "Grace", "sports": ["table tennis", "badminton", "athletics"]}
8-
{"name": "Henry", "sports": ["archery", "fencing", "weightlifting"]}
9-
{"name": "Ivy", "sports": ["judo", "karate", "taekwondo"]}
10-
{"name": "Jack", "sports": ["wrestling", "gymnastics", "surfing"]}
1+
{"name": "Andy", "sports": "soccer", "scores": 100}
2+
{"name": "Betty", "sports": "pingpong ball", "scores": 90}
3+
{"name": "Cindy", "sports": "武术", "scores": 89}
4+
{"name": "David", "sports": ["volleyball"], "scores": [77]}
5+
{"name": "Emily", "sports": ["baseball", "golf", "hockey"], "scores": [56, 78, 99]}
6+
{"name": "Frank", "sports": ["rugby", "cricket", "boxing"], "scores": [45, 67, 88]}
7+
{"name": "Grace", "sports": ["table tennis", "badminton", "athletics"], "scores": [34, 56, 78]}
8+
{"name": "Henry", "sports": ["archery", "fencing", "weightlifting"], "scores": [23, 45, 67]}
9+
{"name": "Ivy", "sports": ["judo", "karate", "taekwondo"], "scores": [12, 34, 56]}
10+
{"name": "Jack", "sports": ["wrestling", "gymnastics", "surfing"], "scores": [1, 23, 45]}

docker/thirdparties/docker-compose/elasticsearch/scripts/index/array_meta_composite_type_array.json

+2-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,8 @@
22
"_meta": {
33
"doris":{
44
"array_fields":[
5-
"sports"
5+
"sports",
6+
"scores"
67
]
78
}
89
}

docker/thirdparties/docker-compose/elasticsearch/scripts/index/es6_composite_type_array.json

+2-1
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,8 @@
77
"doc": {
88
"properties": {
99
"name": { "type": "keyword" },
10-
"sports": { "type": "keyword", "doc_values": false}
10+
"sports": { "type": "keyword", "doc_values": false},
11+
"scores": { "type": "integer", "doc_values": false}
1112
}
1213
}
1314
}

docker/thirdparties/docker-compose/elasticsearch/scripts/index/es7_composite_type_array.json

+2-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,8 @@
66
"mappings": {
77
"properties": {
88
"name": { "type": "keyword" },
9-
"sports": { "type": "keyword", "doc_values": false}
9+
"sports": { "type": "keyword", "doc_values": false},
10+
"scores": { "type": "integer", "doc_values": false}
1011
}
1112
}
1213
}

0 commit comments

Comments
 (0)