diff --git a/CMakeLists.txt b/CMakeLists.txt
index f6f05a13..2c4e7de1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,7 +1,7 @@
 cmake_minimum_required(VERSION 3.13)
 project(tentris
         LANGUAGES CXX
-        VERSION 1.0.7)
+        VERSION 1.1.0)
 set(CMAKE_CXX_STANDARD 20)
 
 configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cmake/version.hpp.in ${CMAKE_CURRENT_SOURCE_DIR}/src/lib/tentris/tentris_version.hpp)
diff --git a/conanfile.txt b/conanfile.txt
index 81d0ec9c..db0f0479 100644
--- a/conanfile.txt
+++ b/conanfile.txt
@@ -2,7 +2,7 @@
 boost/1.75.0
 fmt/7.1.3
 restinio/0.6.12
-hypertrie/0.5.3@dice-group/stable
+hypertrie/0.6.0@dice-group/stable
 rdf-parser/0.13.0@dice-group/stable
 sparql-parser-base/0.2.2@dice-group/stable
 
diff --git a/src/exec/tools/IDs2Hypertrie.cpp b/src/exec/tools/IDs2Hypertrie.cpp
index 36a75dac..4152003e 100644
--- a/src/exec/tools/IDs2Hypertrie.cpp
+++ b/src/exec/tools/IDs2Hypertrie.cpp
@@ -12,6 +12,13 @@
 #include
 namespace tentris::IDs2Hypertrie {
+    void writeNodeStatsTSVs(const auto &storage_3_uncompressed, const auto &storage_2_uncompressed,
+                            const auto &storage_2_compressed, const auto &storage_1_uncompressed,
+                            const auto &storage_1_compressed);
+
+    void writeNodeCountComparisonTSVs(const auto &storage_2_uncompressed, const auto &storage_2_compressed,
+                                      const auto &storage_1_uncompressed, const auto &storage_1_compressed);
+
     void loadIDsAndWriteOutStats(const std::string &csv_file_path);
 }
 
 int main(int argc, char *argv[]) {
@@ -81,6 +88,24 @@ namespace tentris::IDs2Hypertrie {
         auto end = steady_clock::now();
         auto duration = end - start;
 
+        constexpr static auto uncompressed = hypertrie::internal::raw::NodeCompression::uncompressed;
+        constexpr static auto compressed = hypertrie::internal::raw::NodeCompression::compressed;
+
+        auto &storage = trie.context()->rawContext().storage;
+
+        const auto &storage_3_uncompressed = storage.getNodeStorage<3UL, uncompressed>();
+        const auto &storage_2_uncompressed = storage.getNodeStorage<2UL, uncompressed>();
+        const auto &storage_2_compressed = storage.getNodeStorage<2UL, compressed>();
+        const auto &storage_1_uncompressed = storage.getNodeStorage<1UL, uncompressed>();
+        const auto &storage_1_compressed = storage.getNodeStorage<1UL, compressed>();
+
+        writeNodeStatsTSVs(storage_3_uncompressed, storage_2_uncompressed, storage_2_compressed, storage_1_uncompressed,
+                           storage_1_compressed);
+
+        writeNodeCountComparisonTSVs(storage_2_uncompressed, storage_2_compressed, storage_1_uncompressed,
+                                     storage_1_compressed);
+
         std::cerr << "## total ## \n"
                   << "triples processed: {}\n"_format(count)
                   << "triples loaded: {}\n"_format(trie.size())
@@ -90,7 +115,222 @@ namespace tentris::IDs2Hypertrie {
                       (std::chrono::duration_cast(duration) % 60).count(),
                       (std::chrono::duration_cast(duration) % 60).count(),
                       (std::chrono::duration_cast(duration) % 1000).count(),
-                      std::chrono::duration_cast(duration).count());
+                      std::chrono::duration_cast(duration).count())
+                  << "# hypertrie stats #\n"
+                  << "depth 3 uncompressed nodes: {}\n"_format(storage_3_uncompressed.size())
+                  << "depth 2 uncompressed nodes: {}\n"_format(storage.getNodeStorage<2UL, uncompressed>().size())
+                  << "depth 2 compressed nodes: {}\n"_format(storage.getNodeStorage<2UL, compressed>().size())
+                  << "depth 1 uncompressed nodes: {}\n"_format(storage.getNodeStorage<1UL, uncompressed>().size())
+                  << "depth 1 compressed nodes: {}\n"_format(storage.getNodeStorage<1UL, compressed>().size());
     }
 
+    void writeNodeCountComparisonTSVs(const auto &storage_2_uncompressed, const auto &storage_2_compressed,
+                                      const auto &storage_1_uncompressed, const auto &storage_1_compressed) {
+        {
+            std::ofstream tsv_depth_2_comp("depth_2_node_count_comparison.tsv");
+            auto csv_writer = csv::make_tsv_writer(tsv_depth_2_comp);
+
+            csv_writer << std::make_tuple("hypertrie_type", "uncompressed_nodes", "compressed_nodes");
+
+            { // baseline
+                size_t uc_nodes = [&]() {
+                    size_t old_uc = 0;
+                    for (auto[hash, node] : storage_2_uncompressed)
+                        old_uc += node->ref_count();
+                    for (auto[hash, node] : storage_2_compressed)
+                        old_uc += node->ref_count();
+                    return old_uc;
+                }();
+
+                size_t c_nodes = 0;
+
+                csv_writer << std::make_tuple("baseline", uc_nodes, c_nodes);
+            }
+
+            { // compression
+                size_t uc_nodes = [&]() {
+                    size_t old_uc = 0;
+                    for (auto[hash, node] : storage_2_uncompressed)
+                        old_uc += node->ref_count();
+                    return old_uc;
+                }();
+
+                size_t c_nodes = [&]() {
+                    size_t c_nodes = 0;
+                    for (auto[hash, node] : storage_2_compressed)
+                        c_nodes += node->ref_count();
+                    return c_nodes;
+                }();
+
+                csv_writer << std::make_tuple("compression", uc_nodes, c_nodes);
+            }
+
+            { // hash
+                size_t uc_nodes = storage_2_uncompressed.size() + storage_2_compressed.size();
+
+                size_t c_nodes = 0;
+
+                csv_writer << std::make_tuple("hash", uc_nodes, c_nodes);
+            }
+
+            { // hash+compression and hash+compression+inline
+                size_t uc_nodes = storage_2_uncompressed.size();
+
+                size_t c_nodes = storage_2_compressed.size();
+
+                csv_writer << std::make_tuple("hash+compression", uc_nodes, c_nodes);
+                csv_writer << std::make_tuple("hash+compression+inline", uc_nodes, c_nodes);
+            }
+        }
+
+        {
+            std::ofstream tsv_depth_1_comp("depth_1_node_count_comparison.tsv");
+            auto csv_writer = csv::make_tsv_writer(tsv_depth_1_comp);
+
+            csv_writer << std::make_tuple("hypertrie_type", "uncompressed_nodes", "compressed_nodes");
+
+            { // baseline
+                size_t c_depth2_nodes = [&]() {
+                    size_t depth2nodes = 0;
+                    for (auto[hash, node] : storage_2_uncompressed)
+                        depth2nodes += node->ref_count();
+                    return depth2nodes;
+                }();
+
+                size_t depth1_nodes = [&]() {
+                    size_t uc_nodes = 0;
+                    for (auto[hash, node] : storage_1_uncompressed)
+                        uc_nodes += node->ref_count();
+                    for (auto[hash, node] : storage_1_compressed)
+                        uc_nodes += node->ref_count();
+                    return uc_nodes;
+                }();
+
+                size_t uc_nodes = c_depth2_nodes + (depth1_nodes / 2);
+
+                size_t c_nodes = 0;
+
+                csv_writer << std::make_tuple("baseline", uc_nodes, c_nodes);
+            }
+
+            { // compression
+                size_t uc_nodes = [&]() {
+                    size_t uc_nodes = 0;
+                    for (auto[hash, node] : storage_1_uncompressed)
+                        uc_nodes += node->ref_count();
+                    return uc_nodes;
+                }() / 2;
+
+                size_t c_nodes = [&]() {
+                    size_t x = 0;
+                    for (auto[hash, node] : storage_1_compressed)
+                        x += node->ref_count();
+                    return x;
+                }();
+                c_nodes = c_nodes / 2;
+
+                csv_writer << std::make_tuple("compression", uc_nodes, c_nodes);
+            }
+
+            { // hash
+                using TensorHash = hypertrie::internal::raw::TensorHash;
+
+                size_t compressed_nodes_count = [&]() {
+                    robin_hood::unordered_set<TensorHash> c_d1_hashes;
+
+                    // add the hashes of the depth 1 compressed nodes
+                    for (auto[hash, node] : storage_1_compressed)
+                        c_d1_hashes.insert(hash);
+
+                    // break the depth 2 compressed nodes apart and add a hash for each of their two key parts
+                    for (auto[hash, node] : storage_2_compressed) {
+                        c_d1_hashes.insert(
+                                TensorHash::getCompressedNodeHash<1, key_part_type>(
+                                        {node->key()[0]}, true));
+                        c_d1_hashes.insert(
+                                TensorHash::getCompressedNodeHash<1, key_part_type>(
+                                        {node->key()[1]}, true));
+                    }
+
+                    return c_d1_hashes.size();
+                }();
+
+                size_t uc_nodes = compressed_nodes_count + storage_1_uncompressed.size();
+
+                size_t c_nodes = 0;
+
+                csv_writer << std::make_tuple("hash", uc_nodes, c_nodes);
+            }
+
+            { // hash+compression and hash+compression+inline
+                size_t uc_nodes = storage_1_uncompressed.size();
+
+                size_t c_nodes = storage_1_compressed.size();
+
+                csv_writer << std::make_tuple("hash+compression", uc_nodes, c_nodes);
+                csv_writer << std::make_tuple("hash+compression+inline", uc_nodes, 0);
+            }
+
+        }
+    }
+
+    void writeNodeStatsTSVs(const auto &storage_3_uncompressed, const auto &storage_2_uncompressed,
+                            const auto &storage_2_compressed, const auto &storage_1_uncompressed,
+                            const auto &storage_1_compressed) {
+        auto extractCompressionTag = [](const auto &hash) { return (hash.isCompressed()) ? "c" : "u"; };
+
+        {
+            std::ofstream tsv_depth_3("depth_3_nodes_stats.tsv");
+            auto csv_writer = csv::make_tsv_writer(tsv_depth_3);
+
+            csv_writer
+                    << std::make_tuple("node_type", "node_size", "dimension_1_size", "dimension_2_size",
+                                       "dimension_3_size",
+                                       "reference_count");
+            for (auto[hash, node] : storage_3_uncompressed) {
+                csv_writer << std::make_tuple(extractCompressionTag(hash), node->size(), node->edges(0).size(),
+                                              node->edges(1).size(),
+                                              node->edges(2).size(),
+                                              node->ref_count());
+            }
+        }
+
+        {
+            std::ofstream tsv_depth_2("depth_2_nodes_stats.tsv");
+            auto csv_writer = csv::make_tsv_writer(tsv_depth_2);
+
+            csv_writer
+                    << std::make_tuple("node_type", "node_size", "dimension_1_size", "dimension_2_size",
+                                       "reference_count");
+
+            for (auto[hash, node] : storage_2_compressed) {
+                csv_writer << std::make_tuple(extractCompressionTag(hash), node->size(), 1, 1, node->ref_count());
+            }
+
+            for (auto[hash, node] : storage_2_uncompressed) {
+                csv_writer
+                        << std::make_tuple(extractCompressionTag(hash), node->size(), node->edges(0).size(),
+                                           node->edges(1).size(), node->ref_count());
+            }
+        }
+
+        {
+            std::ofstream tsv_depth_1("depth_1_nodes_stats.tsv");
+            auto csv_writer = csv::make_tsv_writer(tsv_depth_1);
+
+            csv_writer << std::make_tuple("node_type", "node_size", "dimension_1_size", "reference_count");
+
+            for (auto[hash, node] : storage_1_compressed) {
+                csv_writer << std::make_tuple(extractCompressionTag(hash), node->size(), 1, node->ref_count());
+            }
+
+            for (auto[hash, node] : storage_1_uncompressed) {
+                csv_writer << std::make_tuple(extractCompressionTag(hash), node->size(), node->edges(0).size(),
+                                              node->ref_count());
+            }
+        }
+    }
+
+}
\ No newline at end of file
diff --git a/src/lib/tentris/store/TripleStore.hpp b/src/lib/tentris/store/TripleStore.hpp
index 10410617..8023b936 100644
--- a/src/lib/tentris/store/TripleStore.hpp
+++ b/src/lib/tentris/store/TripleStore.hpp
@@ -71,7 +71,7 @@ namespace tentris::store {
                     "Subject or predicate of the triple have a term type that is not allowed there."};
             ++count;
 
-            if (trie.size() % bulk_size == 0) {
+            if (bulk_inserter.size() == bulk_size) {
                 bulk_inserter.flush();
                 logDebug("{:>10.3} mio triples processed."_format(double(count)/1'000'000));
                 logDebug("{:>10.3} mio triples loaded."_format(double(trie.size())/1'000'000));