From daccfb69a741055d4383576943c5a2333c5a5414 Mon Sep 17 00:00:00 2001 From: Semyon Danilov Date: Mon, 9 Sep 2024 13:45:13 +0000 Subject: [PATCH 1/3] Add integrity trails helper --- ydb/tools/integrity_trails_helper/main.cpp | 262 +++++++++++++++++++++ ydb/tools/integrity_trails_helper/ya.make | 14 ++ 2 files changed, 276 insertions(+) create mode 100644 ydb/tools/integrity_trails_helper/main.cpp create mode 100644 ydb/tools/integrity_trails_helper/ya.make diff --git a/ydb/tools/integrity_trails_helper/main.cpp b/ydb/tools/integrity_trails_helper/main.cpp new file mode 100644 index 000000000000..cedacf04fd1f --- /dev/null +++ b/ydb/tools/integrity_trails_helper/main.cpp @@ -0,0 +1,262 @@ +#include +#include +#include +#include +#include +#include +#include +#define USE_CURRENT_UDF_ABI_VERSION true +#include +#include +#include + +using namespace NKikimr; + +#define IF_TYPE(typeName) \ + else if (type == #typeName) { \ + resolved = NScheme::NTypeIds::typeName; \ + } + +std::optional ResolveType(std::string typeAlias) { + auto type = NYql::LookupSimpleTypeBySqlAlias(typeAlias, true); + + if (!type) { + return {}; + } + + std::optional resolved = {}; + + if (false) {} + IF_TYPE(Bool) + IF_TYPE(Int8) + IF_TYPE(Uint8) + IF_TYPE(Int16) + IF_TYPE(Uint16) + IF_TYPE(Int32) + IF_TYPE(Uint32) + IF_TYPE(Int64) + IF_TYPE(Uint64) + IF_TYPE(Double) + IF_TYPE(Float) + IF_TYPE(String) + IF_TYPE(Utf8) + IF_TYPE(Yson) + IF_TYPE(Json) + IF_TYPE(Uuid) + IF_TYPE(Date) + IF_TYPE(Datetime) + IF_TYPE(Timestamp) + IF_TYPE(Interval) + IF_TYPE(Decimal) + IF_TYPE(DyNumber) + IF_TYPE(JsonDocument) + IF_TYPE(Date32) + IF_TYPE(Datetime64) + IF_TYPE(Timestamp64) + IF_TYPE(Interval64) + + return resolved; +} + +#define EXTRACT_VAL(cellType, protoType, cppType) \ + case NScheme::NTypeIds::cellType : { \ + cppType v = FromString(val); \ + cell = TCell((const char*)&v, sizeof(v)); \ + break; \ + } + +std::optional ParseCell(std::string type, std::string val) { + auto typeId = ResolveType(type); + + std::optional cell = {}; + + if (!typeId) { + return {}; + } + + switch (*typeId) { + EXTRACT_VAL(Bool, bool, ui8); + EXTRACT_VAL(Int8, int32, i8); + EXTRACT_VAL(Uint8, uint32, ui8); + EXTRACT_VAL(Int16, int32, i16); + EXTRACT_VAL(Uint16, uint32, ui16); + EXTRACT_VAL(Int32, int32, i32); + EXTRACT_VAL(Uint32, uint32, ui32); + EXTRACT_VAL(Int64, int64, i64); + EXTRACT_VAL(Uint64, uint64, ui64); + EXTRACT_VAL(Float, float, float); + EXTRACT_VAL(Double, double, double); + EXTRACT_VAL(Date, uint32, ui16); + EXTRACT_VAL(Datetime, uint32, ui32); + EXTRACT_VAL(Timestamp, uint64, ui64); + EXTRACT_VAL(Interval, int64, i64); + EXTRACT_VAL(Date32, int32, i32); + EXTRACT_VAL(Datetime64, int64, i64); + EXTRACT_VAL(Timestamp64, int64, i64); + EXTRACT_VAL(Interval64, int64, i64); + case NScheme::NTypeIds::Json : + case NScheme::NTypeIds::Utf8 : { + cell = TCell(val.data(), val.size()); + break; + } + case NScheme::NTypeIds::DyNumber : { + const auto dyNumber = NDyNumber::ParseDyNumberString(val); + if (!dyNumber.Defined()) { + return {}; + } + cell = TCell(dyNumber->data(), dyNumber->size()); + break; + } + case NScheme::NTypeIds::Yson : + case NScheme::NTypeIds::String : { + cell = TCell(val.data(), val.size()); + break; + } + case NScheme::NTypeIds::Decimal : + case NScheme::NTypeIds::Uuid : { + char uuid[16]; + cell = TCell(uuid, sizeof(uuid)); + break; + } + default: + return {}; + }; + + return cell; +} + +std::vector ReadPK(NJson::TJsonValue& jsonValue) { + auto &pkField = jsonValue["primary_key"]; + + if (!pkField.IsArray()) { + Cerr << "Scheme parsing error, primary key is not an array" << Endl; + return {}; + } + + std::vector pk; + + if (pkField.IsArray()) { + auto &pkArray = pkField.GetArray(); + + for (size_t i = 0; i < pkArray.size(); ++i) { + if (!pkArray[i].IsString()) { + Cerr << "Scheme parsing error, primary key array element is not a string" << Endl; + return {}; + } + pk.push_back(pkArray[i].GetString()); + } + } + + return pk; +} + +std::map ReadColumnMapping(NJson::TJsonValue& jsonValue) { + auto &columnsField = jsonValue["columns"]; + + if (!columnsField.IsArray()) { + Cerr << "Scheme parsing error, columns is not an array"; + return {}; + } + + auto &columnsArray = columnsField.GetArray(); + + std::map colToType; + + for (size_t i = 0; i < columnsArray.size(); ++i) { + auto &column = columnsArray[i]; + + if (!column.IsMap()) { + Cerr << "Scheme parsing error, column is not an object"; + return {}; + } + + auto &nameField = column["name"].GetString(); + auto &typeField = column["type"]; + + std::string typeId = ""; + + if (typeField.Has("type_id")) { + typeId = typeField["type_id"].GetString(); + } else if (typeField.Has("optional_type")) { + typeId = typeField["optional_type"]["item"]["type_id"].GetString(); + } + + colToType[nameField] = typeId; + } + + return colToType; +} + +int main(int argc, char* argv[]) { + if (argc < 3) { + Cerr << "Usage: path-to-scheme.json key-column1-value ... key-columnN-value" << Endl; + return 1; + } + + std::string path = argv[1]; + + std::vector values; + + for (int i = 2; i < argc; ++i) { + values.push_back(argv[i]); + } + + std::stringstream buffer; + + std::ifstream fileStream(path); + buffer << fileStream.rdbuf(); + + std::string json = buffer.str(); + + NJson::TJsonValue jsonValue; + + NJson::ReadJsonTree(json, &jsonValue); + + std::vector pk = ReadPK(jsonValue); + + if (pk.empty()) { + Cerr << "Primary key is empty" << Endl; + return 1; + } + + std::map colToType = ReadColumnMapping(jsonValue); + + if (colToType.empty()) { + Cerr << "Column mapping is empty" << Endl; + return 1; + } + + if (values.size() != pk.size()) { + Cerr << "Key's columns count doesn't match scheme" << Endl; + + return 1; + } + + TVector arr(pk.size()); + + for (size_t i = 0; i < values.size(); ++i) { + auto col = pk[i]; + auto type = colToType[col]; + auto cell = ParseCell(colToType[pk[i]], values[i]); + + if (!cell) { + Cerr << "Unexpected type " << type << " of column " << col << Endl; + + return 1; + } + + arr[i] = *cell; + } + + TSerializedCellVec vec(arr); + + Cout << "Obfuscated key: " << Endl; + + TStringStream output; + + NDataIntegrity::WriteTablePoint(vec.GetCells(), output); + + Cout << output.Str() << Endl; + + return 0; +} diff --git a/ydb/tools/integrity_trails_helper/ya.make b/ydb/tools/integrity_trails_helper/ya.make new file mode 100644 index 000000000000..cbb11fa0a8ff --- /dev/null +++ b/ydb/tools/integrity_trails_helper/ya.make @@ -0,0 +1,14 @@ +PROGRAM() + +SRCS( + main.cpp +) + +PEERDIR( + ydb/core/engine + ydb/core/scheme + ydb/library/yql/public/udf/service/stub + ydb/library/yql/sql/pg_dummy +) + +END() From 75614da9e04d5cf48e5688e6e82e907cf988a56c Mon Sep 17 00:00:00 2001 From: Semyon Danilov Date: Tue, 10 Sep 2024 12:09:59 +0000 Subject: [PATCH 2/3] Check return value of the JSON parser --- ydb/tools/integrity_trails_helper/main.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ydb/tools/integrity_trails_helper/main.cpp b/ydb/tools/integrity_trails_helper/main.cpp index cedacf04fd1f..7f3cc009ad2b 100644 --- a/ydb/tools/integrity_trails_helper/main.cpp +++ b/ydb/tools/integrity_trails_helper/main.cpp @@ -210,7 +210,10 @@ int main(int argc, char* argv[]) { NJson::TJsonValue jsonValue; - NJson::ReadJsonTree(json, &jsonValue); + if (!NJson::ReadJsonTree(json, &jsonValue)) { + Cerr << "Failed to parse JSON" << Endl; + return 1; + } std::vector pk = ReadPK(jsonValue); From 8bc39d8b3f8ef936b38abc35d7171c5756283c8c Mon Sep 17 00:00:00 2001 From: Semyon Danilov Date: Tue, 10 Sep 2024 12:36:01 +0000 Subject: [PATCH 3/3] Review fixes --- ydb/tools/integrity_trails_helper/main.cpp | 25 ++++++++++++---------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/ydb/tools/integrity_trails_helper/main.cpp b/ydb/tools/integrity_trails_helper/main.cpp index 7f3cc009ad2b..2c0514c7c5ef 100644 --- a/ydb/tools/integrity_trails_helper/main.cpp +++ b/ydb/tools/integrity_trails_helper/main.cpp @@ -129,22 +129,20 @@ std::vector ReadPK(NJson::TJsonValue& jsonValue) { auto &pkField = jsonValue["primary_key"]; if (!pkField.IsArray()) { - Cerr << "Scheme parsing error, primary key is not an array" << Endl; + Cerr << "Scheme parsing error, primary_key is not an array" << Endl; return {}; } std::vector pk; - if (pkField.IsArray()) { - auto &pkArray = pkField.GetArray(); + auto &pkArray = pkField.GetArray(); - for (size_t i = 0; i < pkArray.size(); ++i) { - if (!pkArray[i].IsString()) { - Cerr << "Scheme parsing error, primary key array element is not a string" << Endl; - return {}; - } - pk.push_back(pkArray[i].GetString()); + for (size_t i = 0; i < pkArray.size(); ++i) { + if (!pkArray[i].IsString()) { + Cerr << "Scheme parsing error, primary key array element is not a string" << Endl; + return {}; } + pk.push_back(pkArray[i].GetString()); } return pk; @@ -154,7 +152,7 @@ std::map ReadColumnMapping(NJson::TJsonValue& jsonValu auto &columnsField = jsonValue["columns"]; if (!columnsField.IsArray()) { - Cerr << "Scheme parsing error, columns is not an array"; + Cerr << "Scheme parsing error, columns is not an array" << Endl; return {}; } @@ -166,7 +164,7 @@ std::map ReadColumnMapping(NJson::TJsonValue& jsonValu auto &column = columnsArray[i]; if (!column.IsMap()) { - Cerr << "Scheme parsing error, column is not an object"; + Cerr << "Scheme parsing error, column is not an object" << Endl; return {}; } @@ -181,6 +179,11 @@ std::map ReadColumnMapping(NJson::TJsonValue& jsonValu typeId = typeField["optional_type"]["item"]["type_id"].GetString(); } + if (typeId.empty()) { + Cerr << "Scheme parsing error, type_id is not found" << Endl; + return {}; + } + colToType[nameField] = typeId; }