From b2458bd686e5bc75397fde4a25f3a8b6c42ab064 Mon Sep 17 00:00:00 2001 From: Yongting You <2010youy01@gmail.com> Date: Wed, 17 Jul 2024 06:29:34 +0800 Subject: [PATCH] StringView support in arrow-csv (#6062) * StringView support in arrow-csv * review and micro-benches --- arrow-csv/src/reader/mod.rs | 94 +++++++++++++++++++++++++++++++++---- arrow/benches/csv_reader.rs | 42 +++++++++++++++++ 2 files changed, 128 insertions(+), 8 deletions(-) diff --git a/arrow-csv/src/reader/mod.rs b/arrow-csv/src/reader/mod.rs index 9721349b0184..c5057599b822 100644 --- a/arrow-csv/src/reader/mod.rs +++ b/arrow-csv/src/reader/mod.rs @@ -795,6 +795,14 @@ fn parse( }) .collect::<StringArray>(), ) as ArrayRef), + DataType::Utf8View => Ok(Arc::new( + rows.iter() + .map(|row| { + let s = row.get(i); + (!null_regex.is_null(s)).then_some(s) + }) + .collect::<StringViewArray>(), + ) as ArrayRef), DataType::Dictionary(key_type, value_type) if value_type.as_ref() == &DataType::Utf8 => { @@ -2380,17 +2388,27 @@ mod tests { } fn err_test(csv: &[u8], expected: &str) { - let schema = Arc::new(Schema::new(vec![ + fn err_test_with_schema(csv: &[u8], expected: &str, schema: Arc<Schema>) { + let buffer = std::io::BufReader::with_capacity(2, Cursor::new(csv)); + let b = ReaderBuilder::new(schema) + .with_batch_size(2) + .build_buffered(buffer) + .unwrap(); + let err = b.collect::<Result<Vec<_>, _>>().unwrap_err().to_string(); + assert_eq!(err, expected) + } + + let schema_utf8 = Arc::new(Schema::new(vec![ Field::new("text1", DataType::Utf8, true), Field::new("text2", DataType::Utf8, true), ])); - let buffer = std::io::BufReader::with_capacity(2, Cursor::new(csv)); - let b = ReaderBuilder::new(schema) - .with_batch_size(2) - .build_buffered(buffer) - .unwrap(); - let err = b.collect::<Result<Vec<_>, _>>().unwrap_err().to_string(); - assert_eq!(err, expected) + err_test_with_schema(csv, expected, schema_utf8); + + let schema_utf8view = Arc::new(Schema::new(vec![ + Field::new("text1", DataType::Utf8View, true), + Field::new("text2", DataType::Utf8View, true), + ]));
+ err_test_with_schema(csv, expected, schema_utf8view); } #[test] @@ -2587,4 +2605,64 @@ mod tests { &vec![2, 22] ); } + + #[test] + fn test_parse_string_view_single_column() { + let csv = ["foo", "something_cannot_be_inlined", "foobar"].join("\n"); + let schema = Arc::new(Schema::new(vec![Field::new( + "c1", + DataType::Utf8View, + true, + )])); + + let mut decoder = ReaderBuilder::new(schema).build_decoder(); + + let decoded = decoder.decode(csv.as_bytes()).unwrap(); + assert_eq!(decoded, csv.len()); + decoder.decode(&[]).unwrap(); + + let batch = decoder.flush().unwrap().unwrap(); + assert_eq!(batch.num_columns(), 1); + assert_eq!(batch.num_rows(), 3); + let col = batch.column(0).as_string_view(); + assert_eq!(col.data_type(), &DataType::Utf8View); + assert_eq!(col.value(0), "foo"); + assert_eq!(col.value(1), "something_cannot_be_inlined"); + assert_eq!(col.value(2), "foobar"); + } + + #[test] + fn test_parse_string_view_multi_column() { + let csv = ["foo,", ",something_cannot_be_inlined", "foobarfoobar,bar"].join("\n"); + let schema = Arc::new(Schema::new(vec![ + Field::new("c1", DataType::Utf8View, true), + Field::new("c2", DataType::Utf8View, true), + ])); + + let mut decoder = ReaderBuilder::new(schema).build_decoder(); + + let decoded = decoder.decode(csv.as_bytes()).unwrap(); + assert_eq!(decoded, csv.len()); + decoder.decode(&[]).unwrap(); + + let batch = decoder.flush().unwrap().unwrap(); + assert_eq!(batch.num_columns(), 2); + assert_eq!(batch.num_rows(), 3); + let c1 = batch.column(0).as_string_view(); + let c2 = batch.column(1).as_string_view(); + assert_eq!(c1.data_type(), &DataType::Utf8View); + assert_eq!(c2.data_type(), &DataType::Utf8View); + + assert!(!c1.is_null(0)); + assert!(c1.is_null(1)); + assert!(!c1.is_null(2)); + assert_eq!(c1.value(0), "foo"); + assert_eq!(c1.value(2), "foobarfoobar"); + + assert!(c2.is_null(0)); + assert!(!c2.is_null(1)); + assert!(!c2.is_null(2)); + assert_eq!(c2.value(1), "something_cannot_be_inlined"); + 
assert_eq!(c2.value(2), "bar"); + } } diff --git a/arrow/benches/csv_reader.rs b/arrow/benches/csv_reader.rs index 38e091548be0..74a47ef892e0 100644 --- a/arrow/benches/csv_reader.rs +++ b/arrow/benches/csv_reader.rs @@ -21,6 +21,7 @@ extern crate criterion; use std::io::Cursor; use std::sync::Arc; +use arrow::util::bench_util::create_string_view_array_with_len; use criterion::*; use rand::Rng; @@ -59,6 +60,7 @@ fn do_bench(c: &mut Criterion, name: &str, cols: Vec<ArrayRef>) { fn criterion_benchmark(c: &mut Criterion) { let mut rng = seedable_rng(); + // Single Primitive Column tests let values = Int32Array::from_iter_values((0..4096).map(|_| rng.gen_range(0..1024))); let cols = vec![Arc::new(values) as ArrayRef]; do_bench(c, "4096 i32_small(0)", cols); @@ -101,6 +103,7 @@ fn criterion_benchmark(c: &mut Criterion) { let cols = vec![Arc::new(values) as ArrayRef]; do_bench(c, "4096 f64(0)", cols); + // Single String Column tests let cols = vec![Arc::new(create_string_array_with_len::<i32>(4096, 0., 10)) as ArrayRef]; do_bench(c, "4096 string(10, 0)", cols); @@ -113,6 +116,20 @@ fn criterion_benchmark(c: &mut Criterion) { let cols = vec![Arc::new(create_string_array_with_len::<i32>(4096, 0.5, 100)) as ArrayRef]; do_bench(c, "4096 string(100, 0.5)", cols); + // Single StringView Column tests + let cols = vec![Arc::new(create_string_view_array_with_len(4096, 0., 10, false)) as ArrayRef]; + do_bench(c, "4096 StringView(10, 0)", cols); + + let cols = vec![Arc::new(create_string_view_array_with_len(4096, 0., 30, false)) as ArrayRef]; + do_bench(c, "4096 StringView(30, 0)", cols); + + let cols = vec![Arc::new(create_string_view_array_with_len(4096, 0., 100, false)) as ArrayRef]; + do_bench(c, "4096 StringView(100, 0)", cols); + + let cols = vec![Arc::new(create_string_view_array_with_len(4096, 0.5, 100, false)) as ArrayRef]; + do_bench(c, "4096 StringView(100, 0.5)", cols); + + // Multi-Column(with String) tests let cols = vec![ Arc::new(create_string_array_with_len::<i32>(4096, 0.5, 20)) as
ArrayRef, Arc::new(create_string_array_with_len::<i32>(4096, 0., 30)) as ArrayRef, @@ -136,6 +153,31 @@ fn criterion_benchmark(c: &mut Criterion) { "4096 string(20, 0.5), string(30, 0), f64(0), i64(0)", cols, ); + + // Multi-Column(with StringView) tests + let cols = vec![ + Arc::new(create_string_view_array_with_len(4096, 0.5, 20, false)) as ArrayRef, + Arc::new(create_string_view_array_with_len(4096, 0., 30, false)) as ArrayRef, + Arc::new(create_string_view_array_with_len(4096, 0., 100, false)) as ArrayRef, + Arc::new(create_primitive_array::<Int64Type>(4096, 0.)) as ArrayRef, + ]; + do_bench( + c, + "4096 StringView(20, 0.5), StringView(30, 0), StringView(100, 0), i64(0)", + cols, + ); + + let cols = vec![ + Arc::new(create_string_view_array_with_len(4096, 0.5, 20, false)) as ArrayRef, + Arc::new(create_string_view_array_with_len(4096, 0., 30, false)) as ArrayRef, + Arc::new(create_primitive_array::<Float64Type>(4096, 0.)) as ArrayRef, + Arc::new(create_primitive_array::<Int64Type>(4096, 0.)) as ArrayRef, + ]; + do_bench( + c, + "4096 StringView(20, 0.5), StringView(30, 0), f64(0), i64(0)", + cols, + ); } criterion_group!(benches, criterion_benchmark);