diff --git a/include/silo/preprocessing/metadata_info.h b/include/silo/preprocessing/metadata_info.h
index 6457def10..d85c51090 100644
--- a/include/silo/preprocessing/metadata_info.h
+++ b/include/silo/preprocessing/metadata_info.h
@@ -15,6 +15,7 @@ class MetadataInfo {
       const silo::config::DatabaseConfig& database_config
    );
 
+   static bool isNdjsonFileEmpty(const std::filesystem::path& ndjson_file);
    static void validateNdjsonFile(
       const std::filesystem::path& ndjson_file,
       const silo::config::DatabaseConfig& database_config
diff --git a/src/silo/preprocessing/metadata_info.cpp b/src/silo/preprocessing/metadata_info.cpp
index dd3c59db1..e2993ac43 100644
--- a/src/silo/preprocessing/metadata_info.cpp
+++ b/src/silo/preprocessing/metadata_info.cpp
@@ -85,6 +85,38 @@ void MetadataInfo::validateMetadataFile(
    }
 }
 
+/// Checks whether an ndjson input contains any records.
+///
+/// The decision is based on the rows DuckDB's JSON reader yields rather than on
+/// the file size, so an input without any JSON objects is reported as empty even
+/// when it is not zero bytes long (e.g. a lone newline).
+///
+/// @param ndjson_file path of the ndjson file to inspect
+/// @return true if no row can be read from the file
+/// @throws the DuckDB error, prefixed with the file name, if the query fails
+bool MetadataInfo::isNdjsonFileEmpty(const std::filesystem::path& ndjson_file) {
+   duckdb::DuckDB duck_db(nullptr);
+   duckdb::Connection connection(duck_db);
+
+   // One row is enough to decide emptiness; LIMIT 1 keeps DuckDB from counting
+   // every record of a potentially large input.
+   const auto result = connection.Query(fmt::format(
+      "SELECT COUNT(*) "
+      "FROM (SELECT * FROM read_json_auto(\"{}\") LIMIT 1);",
+      ndjson_file.string()
+   ));
+
+   // Never read rows from a failed query; surface DuckDB's error instead.
+   if (result->HasError()) {
+      result->ThrowError(
+         fmt::format("Could not read the ndjson file '{}': ", ndjson_file.string())
+      );
+   }
+
+   const int64_t row_count = duckdb::BigIntValue::Get(result->GetValue(0, 0));
+   return row_count == 0;
+}
+
 void MetadataInfo::validateNdjsonFile(
    const std::filesystem::path& ndjson_file,
    const silo::config::DatabaseConfig& database_config
diff --git a/src/silo/preprocessing/preprocessor.cpp b/src/silo/preprocessing/preprocessor.cpp
index cc9aca88b..953286ce1 100644
--- a/src/silo/preprocessing/preprocessor.cpp
+++ b/src/silo/preprocessing/preprocessor.cpp
@@ -134,7 +134,7 @@ void Preprocessor::buildTablesFromNdjsonInput(const std::filesystem::path& file_
       );
    }
 
-   if (std::filesystem::is_empty(file_name)) {
+   if (MetadataInfo::isNdjsonFileEmpty(file_name)) {
       SPDLOG_WARN(
          "The specified input file {} is empty. Ignoring its content.", file_name.string()
       );
@@ -325,7 +325,7 @@ void Preprocessor::createPartitionedSequenceTablesFromNdjson(const std::filesyst
 
 void Preprocessor::createAlignedPartitionedSequenceViews(const std::filesystem::path& file_name) {
    std::string file_reader_sql;
-   if (std::filesystem::is_empty(file_name)) {
+   if (MetadataInfo::isNdjsonFileEmpty(file_name)) {
       file_reader_sql = fmt::format(
          "SELECT ''::VARCHAR AS key, 'NULL'::VARCHAR AS partition_key {} {} {} {} {} LIMIT 0",
          boost::join(silo::prepend(", ''::VARCHAR AS ", prefixed_nuc_sequences), ""),
@@ -419,7 +419,7 @@ void Preprocessor::createAlignedPartitionedSequenceViews(const std::filesystem::
 void Preprocessor::createUnalignedPartitionedSequenceFiles(const std::filesystem::path& file_name) {
    for (const auto& [seq_name, _] : reference_genomes_.raw_nucleotide_sequences) {
       const std::string file_reader_sql =
-         std::filesystem::is_empty(file_name)
+         MetadataInfo::isNdjsonFileEmpty(file_name)
             ? fmt::format(
                  "SELECT ''::VARCHAR AS key, 'NULL'::VARCHAR as partition_key,"
                  " ''::VARCHAR AS unaligned_nuc_{} LIMIT 0",
diff --git a/src/silo/preprocessing/sequence_info.cpp b/src/silo/preprocessing/sequence_info.cpp
index 42397aba1..a4e6f93cf 100644
--- a/src/silo/preprocessing/sequence_info.cpp
+++ b/src/silo/preprocessing/sequence_info.cpp
@@ -2,6 +2,7 @@
 
 #include
 
+#include "silo/preprocessing/metadata_info.h"
 #include "silo/preprocessing/preprocessing_database.h"
 #include "silo/preprocessing/preprocessing_exception.h"
 #include "silo/preprocessing/sql_function.h"
@@ -106,7 +107,7 @@ void SequenceInfo::validateNdjsonFile(
    duckdb::Connection& connection,
    const std::filesystem::path& input_filename
 ) {
-   if (std::filesystem::is_empty(input_filename)) {
+   if (MetadataInfo::isNdjsonFileEmpty(input_filename)) {
       return;
    }
 
diff --git a/testBaseData/emptyInputNdjson/input_file.ndjson b/testBaseData/emptyInputNdjson/input_file.ndjson
index e69de29bb..8b1378917 100644
--- a/testBaseData/emptyInputNdjson/input_file.ndjson
+++ b/testBaseData/emptyInputNdjson/input_file.ndjson
@@ -0,0 +1 @@
+