-
Notifications
You must be signed in to change notification settings - Fork 901
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix errors in chunked ORC writer when no tables were (successfully) written #15393
Changes from all commits
e929f60
98d5668
1a86089
d92ae7e
47a6e38
c572430
633ed55
04fb8a4
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -227,6 +227,14 @@ struct encoded_footer_statistics { | |
std::vector<ColStatsBlob> file_level; | ||
}; | ||
|
||
enum class writer_state { | ||
NO_DATA_WRITTEN, // No table data has been written to the sink; if the writer is closed or | ||
// destroyed in this state, it should not write the footer. | ||
Comment on lines
+231
to
+232
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. But does it write the magic? Please clarify. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. we can't guarantee that the magic header is not written if the writer throws after the encode, e.g. while writing to the sink. In this case both the writer and the file are in invalid states (and thus we should not try to write the footer). Your strong exception guarantee only covers the encode. |
||
DATA_WRITTEN, // At least one table has been written to the sink; when the writer is closed, | ||
// it should write the footer. | ||
CLOSED // Writer has been closed; no further writes are allowed. | ||
}; | ||
|
||
/** | ||
* @brief Implementation for ORC writer | ||
*/ | ||
|
@@ -266,11 +274,6 @@ class writer::impl { | |
*/ | ||
~impl(); | ||
|
||
/** | ||
* @brief Begins the chunked/streamed write process. | ||
*/ | ||
void init_state(); | ||
|
||
/** | ||
* @brief Writes a single subtable as part of a larger ORC file/table write. | ||
* | ||
|
@@ -283,11 +286,6 @@ class writer::impl { | |
*/ | ||
void close(); | ||
|
||
/** | ||
* @brief Skip writing the footer when closing/deleting the writer. | ||
*/ | ||
void skip_close() { _closed = true; } | ||
|
||
private: | ||
/** | ||
* @brief Write the intermediate ORC data into the data sink. | ||
|
@@ -363,7 +361,7 @@ class writer::impl { | |
Footer _footer; | ||
Metadata _orc_meta; | ||
persisted_statistics _persisted_stripe_statistics; // Statistics data saved between calls. | ||
bool _closed = false; // To track if the output has been written to sink. | ||
writer_state _state = writer_state::NO_DATA_WRITTEN; | ||
}; | ||
|
||
} // namespace cudf::io::orc::detail |
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -28,6 +28,7 @@ | |||||
#include <cudf/concatenate.hpp> | ||||||
#include <cudf/copying.hpp> | ||||||
#include <cudf/detail/iterator.cuh> | ||||||
#include <cudf/io/data_sink.hpp> | ||||||
#include <cudf/io/orc.hpp> | ||||||
#include <cudf/io/orc_metadata.hpp> | ||||||
#include <cudf/strings/strings_column_view.hpp> | ||||||
|
@@ -2100,8 +2101,7 @@ TEST_F(OrcWriterTest, BounceBufferBug) | |||||
auto sequence = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 100; }); | ||||||
|
||||||
constexpr auto num_rows = 150000; | ||||||
column_wrapper<int8_t, typename decltype(sequence)::value_type> col(sequence, | ||||||
sequence + num_rows); | ||||||
column_wrapper<int8_t> col(sequence, sequence + num_rows); | ||||||
table_view expected({col}); | ||||||
|
||||||
auto filepath = temp_env->get_temp_filepath("BounceBufferBug.orc"); | ||||||
|
@@ -2120,8 +2120,7 @@ TEST_F(OrcReaderTest, SizeTypeRowsOverflow) | |||||
static_assert(total_rows > std::numeric_limits<cudf::size_type>::max()); | ||||||
|
||||||
auto sequence = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 127; }); | ||||||
column_wrapper<int8_t, typename decltype(sequence)::value_type> col(sequence, | ||||||
sequence + num_rows); | ||||||
column_wrapper<int8_t> col(sequence, sequence + num_rows); | ||||||
table_view chunk_table({col}); | ||||||
|
||||||
std::vector<char> out_buffer; | ||||||
|
@@ -2169,4 +2168,55 @@ TEST_F(OrcReaderTest, SizeTypeRowsOverflow) | |||||
CUDF_TEST_EXPECT_TABLES_EQUAL(expected, got_with_stripe_selection->view()); | ||||||
} | ||||||
|
||||||
TEST_F(OrcChunkedWriterTest, NoWriteCloseNotThrow) | ||||||
{ | ||||||
std::vector<char> out_buffer; | ||||||
|
||||||
cudf::io::chunked_orc_writer_options write_opts = | ||||||
cudf::io::chunked_orc_writer_options::builder(cudf::io::sink_info{&out_buffer}); | ||||||
auto writer = cudf::io::orc_chunked_writer(write_opts); | ||||||
|
||||||
EXPECT_NO_THROW(writer.close()); | ||||||
} | ||||||
|
||||||
TEST_F(OrcChunkedWriterTest, FailedWriteCloseNotThrow) | ||||||
{ | ||||||
// A sink that throws on write() | ||||||
class throw_sink : public cudf::io::data_sink { | ||||||
public: | ||||||
void host_write(void const* data, size_t size) override { throw std::runtime_error("write"); } | ||||||
void flush() override {} | ||||||
size_t bytes_written() override { return 0; } | ||||||
}; | ||||||
|
||||||
auto sequence = thrust::make_counting_iterator(0); | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
column_wrapper<int8_t> col(sequence, sequence + 10); | ||||||
table_view table({col}); | ||||||
|
||||||
throw_sink sink; | ||||||
cudf::io::chunked_orc_writer_options write_opts = | ||||||
cudf::io::chunked_orc_writer_options::builder(cudf::io::sink_info{&sink}); | ||||||
auto writer = cudf::io::orc_chunked_writer(write_opts); | ||||||
|
||||||
try { | ||||||
writer.write(table); | ||||||
} catch (...) { | ||||||
// ignore the exception; we're testing that close() doesn't throw when the only write() fails | ||||||
} | ||||||
|
||||||
EXPECT_NO_THROW(writer.close()); | ||||||
} | ||||||
|
||||||
TEST_F(OrcChunkedWriterTest, NoDataInSinkWhenNoWrite) | ||||||
{ | ||||||
std::vector<char> out_buffer; | ||||||
|
||||||
cudf::io::chunked_orc_writer_options write_opts = | ||||||
cudf::io::chunked_orc_writer_options::builder(cudf::io::sink_info{&out_buffer}); | ||||||
auto writer = cudf::io::orc_chunked_writer(write_opts); | ||||||
|
||||||
EXPECT_NO_THROW(writer.close()); | ||||||
EXPECT_EQ(out_buffer.size(), 0); | ||||||
} | ||||||
|
||||||
CUDF_TEST_PROGRAM_MAIN() |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
So no more exception throwing right?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this fixes the exception (i.e. terminate) in close when footer is empty. this happens when no write calls are made with a chunked writer, and when all write calls threw.
Not sure if this answers the question.