From e35ea5794509c0ab6fa73cac2c859db8e268d975 Mon Sep 17 00:00:00 2001 From: Luc Grosheintz Date: Wed, 3 May 2023 21:28:56 +0200 Subject: [PATCH] Implement flexible strings. This commit implements serializing `std::string` to both variable (the default) and fixed length HDF5 strings. --- include/highfive/bits/H5Attribute_misc.hpp | 4 +- include/highfive/bits/H5Converter_misc.hpp | 259 +++++++++++++++++- include/highfive/bits/H5DataType_misc.hpp | 2 +- include/highfive/bits/H5Inspector_misc.hpp | 27 +- include/highfive/bits/H5ReadWrite_misc.hpp | 30 +- include/highfive/bits/H5Slice_traits_misc.hpp | 3 +- src/examples/read_write_std_strings.cpp | 114 ++++++++ tests/unit/tests_high_five_base.cpp | 224 +++++++++++++++ 8 files changed, 634 insertions(+), 29 deletions(-) create mode 100644 src/examples/read_write_std_strings.cpp diff --git a/include/highfive/bits/H5Attribute_misc.hpp b/include/highfive/bits/H5Attribute_misc.hpp index 5c52f7bbb..651678829 100644 --- a/include/highfive/bits/H5Attribute_misc.hpp +++ b/include/highfive/bits/H5Attribute_misc.hpp @@ -87,8 +87,10 @@ inline void Attribute::read(T& array) const { read(r.getPointer(), buffer_info.data_type); // re-arrange results r.unserialize(array); - auto t = create_datatype::base_type>(); + + auto t = buffer_info.data_type; auto c = t.getClass(); + if (c == DataTypeClass::VarLen || t.isVariableStr()) { #if H5_VERSION_GE(1, 12, 0) // This one have been created in 1.12.0 diff --git a/include/highfive/bits/H5Converter_misc.hpp b/include/highfive/bits/H5Converter_misc.hpp index 8f4411118..cdcfd246f 100644 --- a/include/highfive/bits/H5Converter_misc.hpp +++ b/include/highfive/bits/H5Converter_misc.hpp @@ -16,11 +16,23 @@ namespace HighFive { namespace details { +template +struct is_std_string { + static constexpr bool value = + std::is_same::base_type, std::string>::value; +}; + template -struct enable_shallow_copy: public std::enable_if::is_trivially_copyable, V> {}; +struct enable_shallow_copy + : public 
std::enable_if::value && inspector::is_trivially_copyable, V> {}; template -struct enable_deep_copy: public std::enable_if::is_trivially_copyable, V> {}; +struct enable_deep_copy + : public std::enable_if::value && !inspector::is_trivially_copyable, V> {}; + +template +struct enable_string_copy: public std::enable_if::value, V> {}; + template struct ShallowCopyBuffer { @@ -85,6 +97,234 @@ struct DeepCopyBuffer { std::vector dims; }; +enum class BufferMode { Read, Write }; + + +/// +/// \brief String length in bytes excluding the `\0`. +/// +inline size_t char_buffer_size(char const* const str, size_t max_string_length) { + for (size_t i = 0; i <= max_string_length; ++i) { + if (str[i] == '\0') { + return i; + } + } + + return max_string_length; +} + + +/// +/// \brief A buffer for reading/writing strings. +/// +/// A string in HDF5 can be represented as a fixed or variable length string. +/// The important difference for this buffer is that `H5D{read,write}` expects +/// different input depending on if the strings are fixed or variable length. +/// For fixed length strings, it expects an array of chars, i.e. one string +/// packed after the other contiguously. While for variable length strings it +/// expects a list of pointers to the beginning of each string. Variable length +/// strings must be null-terminated; otherwise their length is unknown. +/// +/// This buffer hides the difference between fixed and variable length strings +/// by having internal datastructures available for both cases at compile time. +/// The choice which internal buffer to use is made at runtime. +/// +/// In a dataset with N fixed-length strings which each are M characters long, +/// the in-memory strings are copied into an internal buffer of size N*M. If +/// null- or space-padded the buffer should be filled with the appropriate +/// character. This is important if the in-memory strings are less than M +/// characters long. 
+/// +/// A dataset with N variable-length strings (all null-terminated) uses +/// the internal list of pointers to the beginning of each string. Those +/// pointers can either point to the in-memory strings themselves, if those +/// strings are known to be null-terminated. Otherwise the in-memory strings are +/// copied to an internal buffer of null-terminated strings; and the pointers +/// point to the start of the string in the internal buffer. +/// +/// This class is responsible for arranging the strings properly before passing +/// the buffers to HDF5. To keep this class generic, it provides a generic +/// read/write interface to the internal strings, i.e. a pointer with a size. +/// For reading from the buffer the proxy is called `StringConstView`. This +/// proxy object is to be used by the `inspector` to copy from the buffer into +/// the final destination, e.g. an `std::string`. Similarly, there's a proxy +/// object for serializing into the buffer, i.e. the `StringView`. Again the +/// `inspector` is responsible for obtaining the pointer, size and padding of +/// the string. +/// +/// Nomenclature: +/// - size of a string is the number of bytes required to store the string, +/// including the null character for null-terminated strings. +/// +/// - length of a string is the number of bytes without the null character. +/// +/// Note: both 'length' and 'size' are counted in number of bytes, not number +/// of symbols or characters. Even for UTF8 strings. +template +struct StringBuffer { + using type = unqualified_t; + using hdf5_type = typename inspector::hdf5_type; + + class StringView { + public: + StringView(StringBuffer& _buffer, size_t _i) + : buffer(_buffer) + , i(_i) {} + + /// + /// \brief Assign the in-memory string to the buffer. + /// + /// This method copies the in-memory string to the appropriate + /// internal buffer as needed. + /// + /// The `length` is the length of the string in bytes. 
+ void assign(char const* data, size_t length, StringPadding padding) { + if (buffer.isVariableLengthString()) { + if (padding == StringPadding::NullTerminated) { + buffer.variable_length_pointers[i] = data; + } else { + buffer.variable_length_buffer[i] = std::string(data, length); + buffer.variable_length_pointers[i] = buffer.variable_length_buffer[i].data(); + } + } else if (buffer.isFixedLengthString()) { + // If the buffer is fixed-length and null-terminated, then + // `buffer.string_length` doesn't include the null-character. + if (length > buffer.string_length) { + throw std::invalid_argument("String length too big."); + } + + memcpy(&buffer.fixed_length_buffer[i * buffer.string_size], data, length); + } + } + + private: + StringBuffer& buffer; + size_t i; + }; + + + class StringConstView { + public: + StringConstView(const StringBuffer& _buffer, size_t _i) + : buffer(_buffer) + , i(_i) {} + + char const* data() const { + if (buffer.isVariableLengthString()) { + return buffer.variable_length_pointers[i]; + } else { + return &buffer.fixed_length_buffer[i * buffer.string_size]; + } + } + + // Length of the string in bytes without null for null-terminated strings. 
+ size_t length() const { + if (buffer.isNullTerminated()) { + return char_buffer_size(data(), buffer.string_length); + } else { + return buffer.string_length; + } + } + + private: + const StringBuffer& buffer; + size_t i; + }; + + + class Iterator { + public: + Iterator(StringBuffer& _buffer, size_t _pos) + : buffer(_buffer) + , pos(_pos) {} + + Iterator operator+(size_t n_strings) const { + return Iterator(buffer, pos + n_strings); + } + + void operator+=(size_t n_strings) { + pos += n_strings; + } + + StringView operator*() { + return StringView(buffer, pos); + } + + StringConstView operator*() const { + return StringConstView(buffer, pos); + } + + private: + StringBuffer& buffer; + size_t pos; + }; + + StringBuffer(std::vector _dims, const DataType& _file_datatype) + : file_datatype(_file_datatype.asStringType()) + , padding(file_datatype.getPadding()) + , string_size(file_datatype.isVariableStr() ? size_t(-1) : file_datatype.getSize()) + , string_length(string_size - size_t(isNullTerminated())) + , dims(_dims) { + if (string_size == 0 && isNullTerminated()) { + throw DataTypeException( + "Fixed-length, null-terminated need at least one byte to store the " + "null-character."); + } + + auto n_strings = compute_total_size(dims); + if (isVariableLengthString()) { + variable_length_buffer.resize(n_strings); + variable_length_pointers.resize(n_strings); + } else { + char pad = padding == StringPadding::SpacePadded ? 
' ' : '\0'; + fixed_length_buffer.assign(n_strings * string_size, pad); + } + } + + bool isVariableLengthString() const { + return file_datatype.isVariableStr(); + } + + bool isFixedLengthString() const { + return file_datatype.isFixedLenStr(); + } + + bool isNullTerminated() const { + return file_datatype.getPadding() == StringPadding::NullTerminated; + } + + + void* getPointer() { + if (file_datatype.isVariableStr()) { + return variable_length_pointers.data(); + } else { + return fixed_length_buffer.data(); + } + } + + Iterator begin() { + return Iterator(*this, 0ul); + } + + void unserialize(T& val) { + inspector::unserialize(begin(), dims, val); + } + + private: + StringType file_datatype; + StringPadding padding; + size_t string_size; // Size of buffer required to store the string. + // Meaningful for fixed length strings only. + size_t string_length; // Semantic length of string. + std::vector dims; + + std::vector fixed_length_buffer; + std::vector variable_length_buffer; + std::vector< + typename std::conditional::type*> + variable_length_pointers; +}; + template struct Writer; @@ -107,6 +347,14 @@ struct Writer::type>: public DeepCopyBuffer { } }; +template +struct Writer::type>: public StringBuffer { + explicit Writer(const T& val, const DataType& _file_datatype) + : StringBuffer(inspector::getDimensions(val), _file_datatype) { + inspector::serialize(val, this->begin()); + } +}; + template struct Reader; @@ -133,6 +381,13 @@ struct Reader::type>: public DeepCopyBuffer { }; +template +struct Reader::type>: public StringBuffer { + public: + explicit Reader(const std::vector& _dims, const T& /* val */, const DataType& _file_datatype) + : StringBuffer(_dims, _file_datatype) {} +}; + struct data_converter { template static Writer serialize(const typename inspector::type& val, diff --git a/include/highfive/bits/H5DataType_misc.hpp b/include/highfive/bits/H5DataType_misc.hpp index 934d5f5e5..66ab02ecb 100644 --- a/include/highfive/bits/H5DataType_misc.hpp +++ 
b/include/highfive/bits/H5DataType_misc.hpp @@ -22,7 +22,7 @@ #include #endif -#include "H5Converter_misc.hpp" +#include "H5Inspector_misc.hpp" namespace HighFive { diff --git a/include/highfive/bits/H5Inspector_misc.hpp b/include/highfive/bits/H5Inspector_misc.hpp index c25bcfbd5..91c0719c0 100644 --- a/include/highfive/bits/H5Inspector_misc.hpp +++ b/include/highfive/bits/H5Inspector_misc.hpp @@ -1,17 +1,11 @@ -/* - * Copyright (c) 2022 Blue Brain Project - * - * Distributed under the Boost Software License, Version 1.0. - * (See accompanying file LICENSE_1_0.txt or copy at - * http://www.boost.org/LICENSE_1_0.txt) - * - */ - #pragma once #include #include #include +#include +#include +#include #include #include "../H5Reference.hpp" @@ -28,7 +22,9 @@ #include #endif + namespace HighFive { + namespace details { inline bool checkDimensions(const std::vector& dims, size_t n_dim_requested) { @@ -260,14 +256,15 @@ struct inspector: type_helper { throw DataSpaceException("A std::string cannot be written directly."); } - static void serialize(const type& val, hdf5_type* m) { - *m = val.c_str(); + template + static void serialize(const type& val, It m) { + (*m).assign(val.data(), val.size(), StringPadding::NullTerminated); } - static void unserialize(const hdf5_type* vec, - const std::vector& /* dims */, - type& val) { - val = vec[0]; + template + static void unserialize(const It& vec, const std::vector& /* dims */, type& val) { + const auto& view = *vec; + val.assign(view.data(), view.length()); } }; diff --git a/include/highfive/bits/H5ReadWrite_misc.hpp b/include/highfive/bits/H5ReadWrite_misc.hpp index 9da473b05..c8e736174 100644 --- a/include/highfive/bits/H5ReadWrite_misc.hpp +++ b/include/highfive/bits/H5ReadWrite_misc.hpp @@ -19,9 +19,13 @@ template using unqualified_t = typename std::remove_const::type>::type; // Find the type of an eventual char array, otherwise void -template +template struct type_char_array { - using type = void; + using type = typename 
std::conditional< + std::is_same::base_type, std::string>::value, + std::string, + void>::type; + static constexpr bool is_char_array = false; }; template @@ -29,6 +33,7 @@ struct type_char_array { using type = typename std::conditional, char>::value, char*, typename type_char_array::type>::type; + static constexpr bool is_char_array = true; }; template @@ -36,6 +41,7 @@ struct type_char_array { using type = typename std::conditional, char>::value, char[N], typename type_char_array::type>::type; + static constexpr bool is_char_array = true; }; template @@ -43,7 +49,7 @@ struct BufferInfo { using type_no_const = typename std::remove_const::type; using elem_type = typename details::inspector::base_type; using char_array_t = typename details::type_char_array::type; - static constexpr bool is_char_array = !std::is_same::value; + static constexpr bool is_char_array = details::type_char_array::is_char_array; enum Operation { read, write }; const Operation op; @@ -85,6 +91,16 @@ struct string_type_checker { } }; +template <> +struct string_type_checker { + inline static DataType getDataType(const DataType&, const DataType& file_datatype) { + // The StringBuffer ensures that the data is transformed such that it + // matches the datatype of the dataset, i.e. `file_datatype` and + // `mem_datatype` are the same. 
+ return file_datatype; + } +}; + template struct string_type_checker { inline static DataType getDataType(const DataType& element_type, const DataType& dtype) { @@ -98,8 +114,9 @@ struct string_type_checker { template <> struct string_type_checker { inline static DataType getDataType(const DataType&, const DataType& dtype) { - if (dtype.isFixedLenStr()) + if (dtype.isFixedLenStr()) { throw DataSetException("Can't output variable-length to fixed-length strings"); + } DataType return_type = AtomicType(); enforce_ascii_hack(return_type, dtype); return return_type; @@ -116,11 +133,6 @@ BufferInfo::BufferInfo(const DataType& dtype, F getName, Operation _op) ((is_fixed_len_string && is_char_array) ? 1 : 0)) , data_type( string_type_checker::getDataType(create_datatype(), dtype)) { - if (is_fixed_len_string && std::is_same::value) { - throw DataSetException( - "Can't output std::string as fixed-length. " - "Use raw arrays or FixedLenStringArray"); - } // We warn. In case they are really not convertible an exception will rise on read/write if (dtype.getClass() != data_type.getClass()) { HIGHFIVE_LOG_WARN(getName() + "\": data and hdf5 dataset have different types: " + diff --git a/include/highfive/bits/H5Slice_traits_misc.hpp b/include/highfive/bits/H5Slice_traits_misc.hpp index a5dc9ee37..7b07c9abf 100644 --- a/include/highfive/bits/H5Slice_traits_misc.hpp +++ b/include/highfive/bits/H5Slice_traits_misc.hpp @@ -199,7 +199,8 @@ inline void SliceTraits::read(T& array, const DataTransferProps& xfer_ read(r.getPointer(), buffer_info.data_type, xfer_props); // re-arrange results r.unserialize(array); - auto t = create_datatype::base_type>(); + + auto t = buffer_info.data_type; auto c = t.getClass(); if (c == DataTypeClass::VarLen || t.isVariableStr()) { #if H5_VERSION_GE(1, 12, 0) diff --git a/src/examples/read_write_std_strings.cpp b/src/examples/read_write_std_strings.cpp new file mode 100644 index 000000000..7699e0c0c --- /dev/null +++ 
b/src/examples/read_write_std_strings.cpp @@ -0,0 +1,114 @@ +/* + * Copyright (c), 2023, Blue Brain Project, EPFL + * + * Distributed under the Boost Software License, Version 1.0. + * (See accompanying file LICENSE_1_0.txt or copy at + * http://www.boost.org/LICENSE_1_0.txt) + * + */ +#include +#include +#include + +#include +#include +#include + +using namespace HighFive; + +// This example shows how to write (containers of) `std::string` +// to a dataset either as fixed or variable length HDF5 strings. +// The feature is available from 2.8.0 onwards. +int main(void) { + auto file = File("read_write_std_string.h5", File::Truncate); + + // A string of length 3 in a buffer of size 4 bytes. We'll use "length" for + // the semantic length of the string, i.e. excluding the '\0' character and + // "size" to refer to the length of the buffer in which the string is stored. + // For null-terminated strings, the `size == length + 1`. + std::string ascii_string = "foo"; + auto scalar_dataspace = DataSpace(DataSpace::dataspace_scalar); + + // Just write the string: + file.createDataSet("single_automatic", ascii_string); + + // The above results in writing the string as an HDF5 variable length UTF8 + // string. In HDF5 a variable length string doesn't specify the length of + // the string. Variable length strings are always null-terminated. + auto variable_stringtype = VariableLengthStringType(); + file.createDataSet("single_variable", scalar_dataspace, variable_stringtype) + .write(ascii_string); + + // HDF5 also has the concept of fixed length strings. In fixed length strings + // the size of the string, in bytes, is part of the datatype. The HDF5 API + // for fixed and variable length strings is distinct. Hence, when writing + // strings that need to be read by other programs, it can matter if the string + // is stored as fixed or variable length. + // + // Important: The HDF5 string size is the size of the buffer required to + // store the string. 
+ // + // We know that ascii_string requires 4 bytes to store, but want to store + // it in fixed length strings of size 8. Additionally, we promise that + // the strings are null-terminated. The character set defaults to ASCII. + auto fixed_stringtype = FixedLengthStringType(8, StringPadding::NullTerminated); + file.createDataSet("single_fixed_nullterm", scalar_dataspace, fixed_stringtype) + .write(ascii_string); + + // When reading into an `std::string` it doesn't matter if the HDF5 datatype + // is fixed or variable length. HighFive will internally read into a buffer + // and then write to the final destination. + auto from_variable = file.getDataSet("single_variable").read(); + auto from_fixed = file.getDataSet("single_fixed_nullterm").read(); + + // Note that because the fixed length string is null-terminated, + // `from_fixed.size() == ascii_string.size()` despite it being stored as a string of + // size 8. + std::cout << "from_variable = '" << from_variable << "' size = " << from_variable.size() + << "\n"; + std::cout << "from_fixed = '" << from_fixed << "' size = " << from_fixed.size() << "\n"; + + // Fixed-length strings don't have to be null-terminated. Their length could + // be defined simply by the known size of the buffer required to store the + // string. To deal with the situation where the string is shorter than the + // buffer, one defines a padding character. This must be either the null or + // space character. We'll show null-padded; space-padded works the same way. + auto fixed_nullpad = FixedLengthStringType(8, StringPadding::NullPadded); + file.createDataSet("single_fixed_nullpad", scalar_dataspace, fixed_nullpad).write(ascii_string); + + // Note that we only know that the string is padded with nulls; we + // don't know if those nulls were part of the string to begin with. Hence the full + // size of the buffer is read into the `std::string`. The length of the + // `std::string` is the size of the string type. 
+ auto from_nullpad = file.getDataSet("single_fixed_nullpad").read(); + std::cout << "from_nullpad = '" << from_nullpad << "' size = " << from_nullpad.size() << "\n"; + + // Let's look at UTF8 strings. In HDF5 the size of a string is the size in + // bytes of the buffer required to store the string. A UTF8 symbol/character + // requires 1 to 4 bytes. + // + // The 'a' is 1 byte, the 'α' 2 bytes, therefore a total of 3 bytes (same + // as `utf8_string.size()`). Which including the null character fits into + // 8 bytes. However, 8 bytes would, in general not be enough to store 2 + // UTF8 characters and the null character. Which would require 9 bytes. + std::string utf8_string = "aα"; + auto fixed_utf8_type = + FixedLengthStringType(8, StringPadding::NullTerminated, CharacterSet::Utf8); + file.createDataSet("single_fixed_utf8", scalar_dataspace, fixed_utf8_type).write(utf8_string); + + auto from_utf8 = file.getDataSet("single_fixed_utf8").read(); + std::cout << "from_utf8 = '" << from_utf8 << "' size = " << from_utf8.size() << "\n"; + + // Finally, containers of `std::string`s work analogously: + auto ascii_strings = std::vector{"123", "456"}; + file.createDataSet("multi_fixed_nullterm", DataSpace::From(ascii_strings), fixed_stringtype) + .write(ascii_strings); + + auto ascii_strings_from_fixed = + file.getDataSet("multi_fixed_nullterm").read>(); + + // In order to see details of how each is stored in the HDF5 file use: + // h5dump read_write_std_string.h5 + + return 0; +} diff --git a/tests/unit/tests_high_five_base.cpp b/tests/unit/tests_high_five_base.cpp index bba8ded88..a433b3f56 100644 --- a/tests/unit/tests_high_five_base.cpp +++ b/tests/unit/tests_high_five_base.cpp @@ -2928,6 +2928,229 @@ TEST_CASE("HighFiveReadType") { CHECK(t4 == t3); } +class ForwardToAttribute { + public: + ForwardToAttribute(const HighFive::File& file) + : _file(file) {} + + template + HighFive::Attribute create(const std::string& name, const T& value) { + return 
_file.createAttribute(name, value); + } + + HighFive::Attribute create(const std::string& name, + const HighFive::DataSpace filespace, + const HighFive::DataType& datatype) { + return _file.createAttribute(name, filespace, datatype); + } + + HighFive::Attribute get(const std::string& name) { + return _file.getAttribute(name); + } + + private: + HighFive::File _file; +}; + +class ForwardToDataSet { + public: + ForwardToDataSet(const HighFive::File& file) + : _file(file) {} + + template + HighFive::DataSet create(const std::string& name, const T& value) { + return _file.createDataSet(name, value); + } + + HighFive::DataSet create(const std::string& name, + const HighFive::DataSpace filespace, + const HighFive::DataType& datatype) { + return _file.createDataSet(name, filespace, datatype); + } + + HighFive::DataSet get(const std::string& name) { + return _file.getDataSet(name); + } + + private: + HighFive::File _file; +}; + +template +void check_single_string(Proxy proxy, size_t string_length) { + auto value = std::string(string_length, 'o'); + auto dataspace = DataSpace::From(value); + + auto n_chars = value.size() + 1; + auto n_chars_overlength = n_chars + 10; + auto fixed_length = FixedLengthStringType(n_chars, StringPadding::NullTerminated); + auto overlength_nullterm = FixedLengthStringType(n_chars_overlength, + StringPadding::NullTerminated); + auto overlength_nullpad = FixedLengthStringType(n_chars_overlength, StringPadding::NullPadded); + auto overlength_spacepad = FixedLengthStringType(n_chars_overlength, + StringPadding::SpacePadded); + auto variable_length = VariableLengthStringType(); + + SECTION("automatic") { + proxy.create("auto", value); + REQUIRE(proxy.get("auto").template read() == value); + } + + SECTION("fixed length") { + proxy.create("fixed", dataspace, fixed_length).write(value); + REQUIRE(proxy.get("fixed").template read() == value); + } + + SECTION("overlength null-terminated") { + proxy.create("overlength_nullterm", dataspace, 
overlength_nullterm).write(value); + REQUIRE(proxy.get("overlength_nullterm").template read() == value); + } + + SECTION("overlength null-padded") { + proxy.create("overlength_nullterm", dataspace, overlength_nullpad).write(value); + auto expected = std::string(n_chars_overlength, '\0'); + expected.replace(0, value.size(), value.data()); + REQUIRE(proxy.get("overlength_nullterm").template read() == expected); + } + + SECTION("overlength space-padded") { + proxy.create("overlength_nullterm", dataspace, overlength_spacepad).write(value); + auto expected = std::string(n_chars_overlength, ' '); + expected.replace(0, value.size(), value.data()); + REQUIRE(proxy.get("overlength_nullterm").template read() == expected); + } + + SECTION("variable length") { + proxy.create("variable", dataspace, variable_length).write(value); + REQUIRE(proxy.get("variable").template read() == value); + } +} + +template +void check_multiple_string(Proxy proxy, size_t string_length) { + using value_t = std::vector; + auto value = value_t{std::string(string_length, 'o'), std::string(string_length, 'x')}; + + auto dataspace = DataSpace::From(value); + + auto string_overlength = string_length + 10; + auto onpoint_nullpad = FixedLengthStringType(string_length, StringPadding::NullPadded); + auto onpoint_spacepad = FixedLengthStringType(string_length, StringPadding::SpacePadded); + + auto overlength_nullterm = FixedLengthStringType(string_overlength, + StringPadding::NullTerminated); + auto overlength_nullpad = FixedLengthStringType(string_overlength, StringPadding::NullPadded); + auto overlength_spacepad = FixedLengthStringType(string_overlength, StringPadding::SpacePadded); + auto variable_length = VariableLengthStringType(); + + auto check = [](const value_t actual, const value_t& expected) { + REQUIRE(actual.size() == expected.size()); + for (size_t i = 0; i < actual.size(); ++i) { + REQUIRE(actual[i] == expected[i]); + } + }; + + SECTION("automatic") { + proxy.create("auto", value); + 
check(proxy.get("auto").template read(), value); + } + + SECTION("variable length") { + proxy.create("variable", dataspace, variable_length).write(value); + check(proxy.get("variable").template read(), value); + } + + auto make_padded_reference = [&](char pad, size_t n) { + auto expected = std::vector(value.size(), std::string(n, pad)); + for (size_t i = 0; i < value.size(); ++i) { + expected[i].replace(0, value[i].size(), value[i].data()); + } + + return expected; + }; + + auto check_fixed_length = [&](const std::string& label, size_t length) { + SECTION(label + " null-terminated") { + auto datatype = FixedLengthStringType(length + 1, StringPadding::NullTerminated); + proxy.create(label + "_nullterm", dataspace, datatype).write(value); + check(proxy.get(label + "_nullterm").template read(), value); + } + + SECTION(label + " null-padded") { + auto datatype = FixedLengthStringType(length, StringPadding::NullPadded); + proxy.create(label + "_nullpad", dataspace, datatype).write(value); + auto expected = make_padded_reference('\0', length); + check(proxy.get(label + "_nullpad").template read(), expected); + } + + SECTION(label + " space-padded") { + auto datatype = FixedLengthStringType(length, StringPadding::SpacePadded); + proxy.create(label + "_spacepad", dataspace, datatype).write(value); + auto expected = make_padded_reference(' ', length); + check(proxy.get(label + "_spacepad").template read(), expected); + } + }; + + check_fixed_length("onpoint", string_length); + check_fixed_length("overlength", string_length + 5); + + + SECTION("underlength null-terminated") { + auto datatype = FixedLengthStringType(string_length, StringPadding::NullTerminated); + REQUIRE_THROWS(proxy.create("underlength_nullterm", dataspace, datatype).write(value)); + } + + SECTION("underlength nullpad") { + auto datatype = FixedLengthStringType(string_length - 1, StringPadding::NullPadded); + REQUIRE_THROWS(proxy.create("underlength_nullpad", dataspace, datatype).write(value)); + } + + 
SECTION("underlength spacepad") { + auto datatype = FixedLengthStringType(string_length - 1, StringPadding::NullTerminated); + REQUIRE_THROWS(proxy.create("underlength_spacepad", dataspace, datatype).write(value)); + } +} + +TEST_CASE("HighFiveSTDString (dataset, single, short)") { + File file("std_string_dataset_single_short.h5", File::Truncate); + check_single_string(ForwardToDataSet(file), 3); +} + +TEST_CASE("HighFiveSTDString (attribute, single, short)") { + File file("std_string_attribute_single_short.h5", File::Truncate); + check_single_string(ForwardToAttribute(file), 3); +} + +TEST_CASE("HighFiveSTDString (dataset, single, long)") { + File file("std_string_dataset_single_long.h5", File::Truncate); + check_single_string(ForwardToDataSet(file), 256); +} + +TEST_CASE("HighFiveSTDString (attribute, single, long)") { + File file("std_string_attribute_single_long.h5", File::Truncate); + check_single_string(ForwardToAttribute(file), 256); +} + +TEST_CASE("HighFiveSTDString (dataset, multiple, short)") { + File file("std_string_dataset_multiple_short.h5", File::Truncate); + check_multiple_string(ForwardToDataSet(file), 3); +} + +TEST_CASE("HighFiveSTDString (attribute, multiple, short)") { + File file("std_string_attribute_multiple_short.h5", File::Truncate); + check_multiple_string(ForwardToAttribute(file), 3); +} + +TEST_CASE("HighFiveSTDString (dataset, multiple, long)") { + File file("std_string_dataset_multiple_short.h5", File::Truncate); + check_multiple_string(ForwardToDataSet(file), 256); +} + +TEST_CASE("HighFiveSTDString (attribute, multiple, long)") { + File file("std_string_attribute_multiple_short.h5", File::Truncate); + check_multiple_string(ForwardToAttribute(file), 256); +} + TEST_CASE("HighFiveFixedString") { const std::string file_name("array_atomic_types.h5"); const std::string group_1("group1"); @@ -2962,6 +3185,7 @@ TEST_CASE("HighFiveFixedString") { file.createDataSet("ds4", DataSpace(2)).write(strings_fixed); } + { // Cant convert 
flex-length to fixed-length const char* buffer[] = {"abcd", "1234"}; SilenceHDF5 silencer;