From 0742edddce26df26c10bddf53f9450a420e4e77c Mon Sep 17 00:00:00 2001 From: Jorge Leitao Date: Mon, 16 Aug 2021 10:47:04 +0100 Subject: [PATCH] Moved CSV and JSON testing away from using files. (#290) --- .github/workflows/test.yml | 4 +- src/io/json/read/infer_schema.rs | 29 +- src/io/json/read/reader.rs | 20 +- test/data/arrays.json | 3 - test/data/basic.json | 12 - test/data/basic_nulls.json | 12 - test/data/integration.json | 808 ------------------- test/data/list_string_dict_nested_nulls.json | 3 - test/data/mixed_arrays.json | 4 - test/data/mixed_arrays.json.gz | Bin 141 -> 0 bytes test/data/nested_structs.json | 4 - test/data/null_test.csv | 6 - test/data/uk_cities.csv | 37 - test/data/uk_cities_with_headers.csv | 38 - test/data/various_types.csv | 6 - test/data/various_types_invalid.csv | 6 - tests/it/io/csv/read.rs | 19 +- tests/it/io/json/mod.rs | 197 ++++- tests/it/io/json/read.rs | 587 +++----------- 19 files changed, 335 insertions(+), 1460 deletions(-) delete mode 100644 test/data/arrays.json delete mode 100644 test/data/basic.json delete mode 100644 test/data/basic_nulls.json delete mode 100644 test/data/integration.json delete mode 100644 test/data/list_string_dict_nested_nulls.json delete mode 100644 test/data/mixed_arrays.json delete mode 100644 test/data/mixed_arrays.json.gz delete mode 100644 test/data/nested_structs.json delete mode 100644 test/data/null_test.csv delete mode 100644 test/data/uk_cities.csv delete mode 100644 test/data/uk_cities_with_headers.csv delete mode 100644 test/data/various_types.csv delete mode 100644 test/data/various_types_invalid.csv diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 987589d9f30..d31a4cc3697 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -159,11 +159,11 @@ jobs: env: RUST_BACKTRACE: full RUST_LOG: 'trace' - # --skip io: miri does not handle IO very well, unfortunately. + # --skip io: miri can't handle opening of files, so we skip those run: | cargo miri setup cargo clean - cargo miri test -- --skip io + cargo miri test -- --skip io::parquet --skip io::ipc coverage: name: Coverage diff --git a/src/io/json/read/infer_schema.rs b/src/io/json/read/infer_schema.rs index 38082ded789..0ae6a8c6681 100644 --- a/src/io/json/read/infer_schema.rs +++ b/src/io/json/read/infer_schema.rs @@ -170,20 +170,18 @@ fn generate_schema(spec: HashMap>) -> Result { /// /// # Examples /// ``` -/// use std::fs::File; -/// use std::io::{BufReader, SeekFrom, Seek}; -/// use flate2::read::GzDecoder; +/// use std::io::{BufReader, Cursor, SeekFrom, Seek}; /// use arrow2::io::json::infer_json_schema; /// -/// let mut file = File::open("test/data/mixed_arrays.json.gz").unwrap(); +/// let data = r#"{"a":1, "b":[2.0, 1.3, -6.1], "c":[false, true], "d":4.1} +/// {"a":-10, "b":[2.0, 1.3, -6.1], "c":null, "d":null} +/// {"a":2, "b":[2.0, null, -6.1], "c":[false, null], "d":"text"} +/// {"a":3, "b":4, "c": true, "d":[1, false, "array", 2.4]} +/// "#; /// /// // file's cursor's offset at 0 -/// let mut reader = BufReader::new(GzDecoder::new(&file)); +/// let mut reader = BufReader::new(Cursor::new(data)); /// let inferred_schema = infer_json_schema(&mut reader, None).unwrap(); -/// // cursor's offset at end of file -/// -/// // seek back to start so that the original file is usable again -/// file.seek(SeekFrom::Start(0)).unwrap(); /// ``` pub fn infer_json_schema( reader: &mut BufReader, @@ -345,14 +343,17 @@ where /// # Examples /// ``` /// use std::fs::File; -/// use std::io::BufReader; +/// use std::io::{BufReader, Cursor}; /// use arrow2::io::json::infer_json_schema_from_seekable; /// -/// let file = File::open("test/data/mixed_arrays.json").unwrap(); -/// // file's cursor's offset at 0 -/// let mut reader = BufReader::new(file); +/// let data = r#"{"a":1, "b":[2.0, 1.3, -6.1], "c":[false, true], "d":4.1} +/// {"a":-10, "b":[2.0, 1.3, -6.1], "c":null, "d":null} +/// {"a":2, "b":[2.0, null, -6.1], "c":[false, null], "d":"text"} +/// {"a":3, "b":4, "c": true, "d":[1, false, "array", 2.4]} +/// "#; +/// let mut reader = BufReader::new(Cursor::new(data)); /// let inferred_schema = infer_json_schema_from_seekable(&mut reader, None).unwrap(); -/// // file's cursor's offset automatically set at 0 +/// // cursor's position automatically set at 0 /// ``` pub fn infer_json_schema_from_seekable( reader: &mut BufReader, diff --git a/src/io/json/read/reader.rs b/src/io/json/read/reader.rs index 0fe8fed271e..ff51c818220 100644 --- a/src/io/json/read/reader.rs +++ b/src/io/json/read/reader.rs @@ -133,19 +133,21 @@ impl Decoder { /// use std::sync::Arc; /// use arrow2::datatypes::{DataType, Field, Schema}; /// use arrow2::io::json; -/// use std::fs::File; -/// use std::io::BufReader; +/// use std::io::{Cursor, BufReader}; /// /// let schema = Arc::new(Schema::new(vec![ -/// Field::new("a", DataType::Float64, false), -/// Field::new("b", DataType::Float64, false), -/// Field::new("c", DataType::Float64, false), +/// Field::new("a", DataType::Int64, true), +/// Field::new("b", DataType::Float32, true), +/// Field::new("c", DataType::Boolean, true), +/// Field::new("d", DataType::Utf8, true), /// ])); /// -/// let file = File::open("test/data/basic.json").unwrap(); -/// -/// let mut json = json::Reader::new(BufReader::new(file), schema, 1024, None); -/// let batch = json.next().unwrap().unwrap(); +/// let data = r#"{"a":1, "b":2.0, "c":false, "d":"4"} +/// {"a":-10, "b":-3.5, "c":true, "d":null} +/// {"a":100000000, "b":0.6, "d":"text"}"#; +/// let mut reader = BufReader::new(Cursor::new(data)); +/// let mut reader = json::Reader::new(&mut reader, schema, 1024, None); +/// let batch = reader.next().unwrap().unwrap(); /// ``` #[derive(Debug)] pub struct Reader { diff --git a/test/data/arrays.json b/test/data/arrays.json deleted file mode 100644 index 5dbdd19ffc0..00000000000 --- a/test/data/arrays.json +++ /dev/null @@ -1,3 +0,0 @@ -{"a":1, "b":[2.0, 1.3, -6.1], "c":[false, true], "d":"4"} -{"a":-10, "b":[2.0, 1.3, -6.1], "c":[true, true], "d":"4"} -{"a":2, "b":[2.0, null, -6.1], "c":[false, null], "d":"text"} diff --git a/test/data/basic.json b/test/data/basic.json deleted file mode 100644 index dafd2dd2e42..00000000000 --- a/test/data/basic.json +++ /dev/null @@ -1,12 +0,0 @@ -{"a":1, "b":2.0, "c":false, "d":"4"} -{"a":-10, "b":-3.5, "c":true, "d":"4"} -{"a":2, "b":0.6, "c":false, "d":"text"} -{"a":1, "b":2.0, "c":false, "d":"4"} -{"a":7, "b":-3.5, "c":true, "d":"4"} -{"a":1, "b":0.6, "c":false, "d":"text"} -{"a":1, "b":2.0, "c":false, "d":"4"} -{"a":5, "b":-3.5, "c":true, "d":"4"} -{"a":1, "b":0.6, "c":false, "d":"text"} -{"a":1, "b":2.0, "c":false, "d":"4"} -{"a":1, "b":-3.5, "c":true, "d":"4"} -{"a":100000000000000, "b":0.6, "c":false, "d":"text"} \ No newline at end of file diff --git a/test/data/basic_nulls.json b/test/data/basic_nulls.json deleted file mode 100644 index 1451df7f57f..00000000000 --- a/test/data/basic_nulls.json +++ /dev/null @@ -1,12 +0,0 @@ -{"a":1, "b":2.0, "c":false} -{"a":null, "b":-3.5, "c":true, "d":"4"} -{"c":false, "d":"text"} -{"a":1, "b":2.0, "c":false, "d":"4"} -{"a":7, "b":-3.5, "c":null, "d":null} -{"a":1, "b":0.6, "c":false} -{"a":1, "b":2.0, "d":"4"} -{"a":5, "c":true} -{"a":1, "b":0.6, "c":false, "d":"text"} -{"a":1, "b":2.0, "c":false, "d":"4"} -{"a":1, "b":-3.5, "c":true, "d":"4"} -{} \ No newline at end of file diff --git a/test/data/integration.json b/test/data/integration.json deleted file mode 100644 index 7e4a22cddba..00000000000 --- a/test/data/integration.json +++ /dev/null @@ -1,808 +0,0 @@ -{ - "schema": { - "fields": [ - { - "name": "bools-with-metadata-map", - "type": { - "name": "bool" - }, - "nullable": true, - "metadata": { - "k": "v" - }, - "children": [] - }, - { - "name": "bools-with-metadata-vec", - "type": { - "name": "bool" - }, - "nullable": true, - "metadata": [ - { - "key": "k2", - "value": "v2" - } - ], - "children": [] - }, - { - "name": "bools", - "type": { - "name": "bool" - }, - "nullable": true, - "children": [] - }, - { - "name": "int8s", - "type": { - "name": "int", - "isSigned": true, - "bitWidth": 8 - }, - "nullable": true, - "children": [] - }, - { - "name": "int16s", - "type": { - "name": "int", - "isSigned": true, - "bitWidth": 16 - }, - "nullable": true, - "children": [] - }, - { - "name": "int32s", - "type": { - "name": "int", - "isSigned": true, - "bitWidth": 32 - }, - "nullable": true, - "children": [] - }, - { - "name": "int64s", - "type": { - "name": "int", - "isSigned": true, - "bitWidth": 64 - }, - "nullable": true, - "children": [] - }, - { - "name": "uint8s", - "type": { - "name": "int", - "isSigned": false, - "bitWidth": 8 - }, - "nullable": true, - "children": [] - }, - { - "name": "uint16s", - "type": { - "name": "int", - "isSigned": false, - "bitWidth": 16 - }, - "nullable": true, - "children": [] - }, - { - "name": "uint32s", - "type": { - "name": "int", - "isSigned": false, - "bitWidth": 32 - }, - "nullable": true, - "children": [] - }, - { - "name": "uint64s", - "type": { - "name": "int", - "isSigned": false, - "bitWidth": 64 - }, - "nullable": true, - "children": [] - }, - { - "name": "float32s", - "type": { - "name": "floatingpoint", - "precision": "SINGLE" - }, - "nullable": true, - "children": [] - }, - { - "name": "float64s", - "type": { - "name": "floatingpoint", - "precision": "DOUBLE" - }, - "nullable": true, - "children": [] - }, - { - "name": "date_days", - "type": { - "name": "date", - "unit": "DAY" - }, - "nullable": true, - "children": [] - }, - { - "name": "date_millis", - "type": { - "name": "date", - "unit": "MILLISECOND" - }, - "nullable": true, - "children": [] - }, - { - "name": "time_secs", - "type": { - "name": "time", - "unit": "SECOND", - "bitWidth": 32 - }, - "nullable": true, - "children": [] - }, - { - "name": "time_millis", - "type": { - "name": "time", - "unit": "MILLISECOND", - "bitWidth": 32 - }, - "nullable": true, - "children": [] - }, - { - "name": "time_micros", - "type": { - "name": "time", - "unit": "MICROSECOND", - "bitWidth": 64 - }, - "nullable": true, - "children": [] - }, - { - "name": "time_nanos", - "type": { - "name": "time", - "unit": "NANOSECOND", - "bitWidth": 64 - }, - "nullable": true, - "children": [] - }, - { - "name": "ts_secs", - "type": { - "name": "timestamp", - "unit": "SECOND" - }, - "nullable": true, - "children": [] - }, - { - "name": "ts_millis", - "type": { - "name": "timestamp", - "unit": "MILLISECOND" - }, - "nullable": true, - "children": [] - }, - { - "name": "ts_micros", - "type": { - "name": "timestamp", - "unit": "MICROSECOND" - }, - "nullable": true, - "children": [] - }, - { - "name": "ts_nanos", - "type": { - "name": "timestamp", - "unit": "NANOSECOND" - }, - "nullable": true, - "children": [] - }, - { - "name": "ts_secs_tz", - "type": { - "name": "timestamp", - "unit": "SECOND", - "timezone": "Europe/Budapest" - }, - "nullable": true, - "children": [] - }, - { - "name": "ts_millis_tz", - "type": { - "name": "timestamp", - "unit": "MILLISECOND", - "timezone": "America/New_York" - }, - "nullable": true, - "children": [] - }, - { - "name": "ts_micros_tz", - "type": { - "name": "timestamp", - "unit": "MICROSECOND", - "timezone": "UTC" - }, - "nullable": true, - "children": [] - }, - { - "name": "ts_nanos_tz", - "type": { - "name": "timestamp", - "unit": "NANOSECOND", - "timezone": "Africa/Johannesburg" - }, - "nullable": true, - "children": [] - }, - { - "name": "utf8s", - "type": { - "name": "utf8" - }, - "nullable": true, - "children": [] - }, - { - "name": "lists", - "nullable": true, - "type": { - "name": "list" - }, - "children": [ - { - "name": "item", - "nullable": true, - "type": { - "name": "int", - "bitWidth": 32, - "isSigned": true - }, - "children": [] - } - ] - }, - { - "name": "structs", - "type": { - "name": "struct" - }, - "nullable": true, - "children": [ - { - "name": "int32s", - "type": { - "name": "int", - "isSigned": true, - "bitWidth": 32 - }, - "nullable": true, - "children": [] - }, - { - "name": "utf8s", - "type": { - "name": "utf8" - }, - "nullable": true, - "children": [] - } - ] - } - ] - }, - "batches": [ - { - "count": 3, - "columns": [ - { - "name": "bools-with-metadata-map", - "count": 3, - "VALIDITY": [ - 1, - 0, - 1 - ], - "DATA": [ - true, - true, - false - ] - }, - { - "name": "bools-with-metadata-vec", - "count": 3, - "VALIDITY": [ - 1, - 0, - 1 - ], - "DATA": [ - true, - true, - false - ] - }, - { - "name": "bools", - "count": 3, - "VALIDITY": [ - 1, - 0, - 1 - ], - "DATA": [ - true, - true, - false - ] - }, - { - "name": "int8s", - "count": 3, - "VALIDITY": [ - 1, - 0, - 1 - ], - "DATA": [ - 1, - 2, - 3 - ] - }, - { - "name": "int16s", - "count": 3, - "VALIDITY": [ - 1, - 0, - 1 - ], - "DATA": [ - 1, - 2, - 3 - ] - }, - { - "name": "int32s", - "count": 3, - "VALIDITY": [ - 1, - 0, - 1 - ], - "DATA": [ - 1, - 2, - 3 - ] - }, - { - "name": "int64s", - "count": 3, - "VALIDITY": [ - 1, - 0, - 1 - ], - "DATA": [ - 1, - 2, - 3 - ] - }, - { - "name": "uint8s", - "count": 3, - "VALIDITY": [ - 1, - 0, - 1 - ], - "DATA": [ - 1, - 2, - 3 - ] - }, - { - "name": "uint16s", - "count": 5, - "VALIDITY": [ - 1, - 0, - 1 - ], - "DATA": [ - 1, - 2, - 3 - ] - }, - { - "name": "uint32s", - "count": 3, - "VALIDITY": [ - 1, - 0, - 1 - ], - "DATA": [ - 1, - 2, - 3 - ] - }, - { - "name": "uint64s", - "count": 3, - "VALIDITY": [ - 1, - 0, - 1 - ], - "DATA": [ - 1, - 2, - 3 - ] - }, - { - "name": "float32s", - "count": 3, - "VALIDITY": [ - 1, - 0, - 1 - ], - "DATA": [ - 1.0, - 2.0, - 3.0 - ] - }, - { - "name": "float64s", - "count": 3, - "VALIDITY": [ - 1, - 0, - 1 - ], - "DATA": [ - 1.0, - 2.0, - 3.0 - ] - }, - { - "name": "date_days", - "count": 3, - "VALIDITY": [ - 1, - 0, - 0 - ], - "DATA": [ - 1196848, - 2319603, - 2755982 - ] - }, - { - "name": "date_millis", - "count": 3, - "VALIDITY": [ - 1, - 1, - 1 - ], - "DATA": [ - 167903550396207, - 29923997007884, - 30612271819236 - ] - }, - { - "name": "time_secs", - "count": 3, - "VALIDITY": [ - 1, - 1, - 1 - ], - "DATA": [ - 27974, - 78592, - 43207 - ] - }, - { - "name": "time_millis", - "count": 3, - "VALIDITY": [ - 1, - 1, - 1 - ], - "DATA": [ - 6613125, - 74667230, - 52260079 - ] - }, - { - "name": "time_micros", - "count": 3, - "VALIDITY": [ - 1, - 0, - 0 - ], - "DATA": [ - 62522958593, - 13470380050, - 50797036705 - ] - }, - { - "name": "time_nanos", - "count": 3, - "VALIDITY": [ - 1, - 0, - 1 - ], - "DATA": [ - 73380123595985, - 52520995325145, - 16584393546415 - ] - }, - { - "name": "ts_secs", - "count": 3, - "VALIDITY": [ - 0, - 1, - 0 - ], - "DATA": [ - 209869064422, - 193438817552, - 51757838205 - ] - }, - { - "name": "ts_millis", - "count": 3, - "VALIDITY": [ - 0, - 1, - 1 - ], - "DATA": [ - 228315043570185, - 38606916383008, - 58113709376587 - ] - }, - { - "name": "ts_micros", - "count": 3, - "VALIDITY": [ - 0, - 0, - 0 - ], - "DATA": [ - 133457416537791415, - 129522736067409280, - 177110451066832967 - ] - }, - { - "name": "ts_nanos", - "count": 3, - "VALIDITY": [ - 0, - 0, - 1 - ], - "DATA": [ - -804525722984600007, - 8166038652634779458, - -6473623571954960143 - ] - }, - { - "name": "ts_secs_tz", - "count": 3, - "VALIDITY": [ - 0, - 1, - 0 - ], - "DATA": [ - 209869064422, - 193438817552, - 51757838205 - ] - }, - { - "name": "ts_millis_tz", - "count": 3, - "VALIDITY": [ - 0, - 1, - 1 - ], - "DATA": [ - 228315043570185, - 38606916383008, - 58113709376587 - ] - }, - { - "name": "ts_micros_tz", - "count": 3, - "VALIDITY": [ - 0, - 0, - 0 - ], - "DATA": [ - 133457416537791415, - 129522736067409280, - 177110451066832967 - ] - }, - { - "name": "ts_nanos_tz", - "count": 3, - "VALIDITY": [ - 0, - 0, - 1 - ], - "DATA": [ - -804525722984600007, - 8166038652634779458, - -6473623571954960143 - ] - }, - { - "name": "utf8s", - "count": 3, - "VALIDITY": [ - 1, - 0, - 1 - ], - "OFFSET": [ - 0, - 2, - 2, - 5 - ], - "DATA": [ - "aa", - "", - "bbb" - ] - }, - { - "name": "lists", - "count": 3, - "VALIDITY": [ - 1, - 1, - 0 - ], - "OFFSET": [ - 0, - 3, - 4, - 4 - ], - "children": [ - { - "name": "item", - "count": 4, - "VALIDITY": [ - 0, - 1, - 0, - 0 - ], - "DATA": [ - 1, - 2, - 3, - 4 - ] - } - ] - }, - { - "name": "structs", - "count": 3, - "VALIDITY": [ - 1, - 1, - 0 - ], - "children": [ - { - "name": "int32s", - "count": 3, - "VALIDITY": [ - 0, - 1, - 0 - ], - "DATA": [ - -1, - -2, - -3 - ] - }, - { - "name": "utf8s", - "count": 3, - "VALIDITY": [ - 0, - 0, - 1 - ], - "OFFSET": [ - 0, - 0, - 0, - 7 - ], - "DATA": [ - "", - "", - "aaaaaa" - ] - } - ] - } - ] - } - ] -} \ No newline at end of file diff --git a/test/data/list_string_dict_nested_nulls.json b/test/data/list_string_dict_nested_nulls.json deleted file mode 100644 index 9300b14ce27..00000000000 --- a/test/data/list_string_dict_nested_nulls.json +++ /dev/null @@ -1,3 +0,0 @@ -{"machine": "a", "events": [null, "Elect Leader", "Do Ballot"]} -{"machine": "b", "events": ["Do Ballot", null, "Send Data", "Elect Leader"]} -{"machine": "c", "events": ["Send Data"]} diff --git a/test/data/mixed_arrays.json b/test/data/mixed_arrays.json deleted file mode 100644 index 18987284a5b..00000000000 --- a/test/data/mixed_arrays.json +++ /dev/null @@ -1,4 +0,0 @@ -{"a":1, "b":[2.0, 1.3, -6.1], "c":[false, true], "d":4.1} -{"a":-10, "b":[2.0, 1.3, -6.1], "c":null, "d":null} -{"a":2, "b":[2.0, null, -6.1], "c":[false, null], "d":"text"} -{"a":3, "b":4, "c": true, "d":[1, false, "array", 2.4]} diff --git a/test/data/mixed_arrays.json.gz b/test/data/mixed_arrays.json.gz deleted file mode 100644 index 0f6040092ff1277ab28be57795e0d1ad17aa74c7..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 141 zcmb2|=3oE==G9?dA)Ab@1O|8oZR#?+WPBme;EM6Jzze3AjeA(T)R Result<()> { - let mut reader = ReaderBuilder::new().from_path("test/data/uk_cities_with_headers.csv")?; + let data = r#"city,lat,lng +"Elgin, Scotland, the UK",57.653484,-3.335724 +"Stoke-on-Trent, Staffordshire, the UK",53.002666,-2.179404 +"Solihull, Birmingham, UK",52.412811,-1.778197 +"Cardiff, Cardiff county, UK",51.481583,-3.179090 +"Eastbourne, East Sussex, UK",50.768036,0.290472 +"Oxford, Oxfordshire, UK",51.752022,-1.257677 +"London, UK",51.509865,-0.118092 +"Swindon, Swindon, UK",51.568535,-1.772232 +"Gravesend, Kent, UK",51.441883,0.370759 +"Northampton, Northamptonshire, UK",52.240479,-0.902656 +"Rugby, Warwickshire, UK",52.370876,-1.265032 +"Sutton Coldfield, West Midlands, UK",52.570385,-1.824042 +"Harlow, Essex, UK",51.772938,0.102310 +"Aberdeen, Aberdeen City, UK",57.149651,-2.099075"#; + let mut reader = ReaderBuilder::new().from_reader(Cursor::new(data)); let schema = Arc::new(infer_schema(&mut reader, None, true, &infer)?); @@ -26,7 +41,7 @@ fn read() -> Result<()> { let batch_schema = batch.schema(); assert_eq!(&schema, batch_schema); - assert_eq!(37, batch.num_rows()); + assert_eq!(14, batch.num_rows()); assert_eq!(3, batch.num_columns()); let lat = batch diff --git a/tests/it/io/json/mod.rs b/tests/it/io/json/mod.rs index 63c11ec1647..37810e9ec39 100644 --- a/tests/it/io/json/mod.rs +++ b/tests/it/io/json/mod.rs @@ -1,20 +1,20 @@ mod read; mod write; +use std::io::Cursor; +use std::sync::Arc; + use serde_json::Value; -use std::fs::{read_to_string, File}; -use arrow2::io::json::LineDelimitedWriter; -use arrow2::io::json::Reader; -use arrow2::io::json::ReaderBuilder; +use arrow2::array::*; +use arrow2::datatypes::*; +use arrow2::io::json::{LineDelimitedWriter, ReaderBuilder}; -fn test_write_for_file(test_file: &str) { +fn round_trip(data: String) { let builder = ReaderBuilder::new() .infer_schema(None) .with_batch_size(1024); - let mut reader: Reader = builder - .build::(File::open(test_file).unwrap()) - .unwrap(); + let mut reader = builder.build(Cursor::new(data.clone())).unwrap(); let batch = reader.next().unwrap().unwrap(); let mut buf = Vec::new(); @@ -24,12 +24,11 @@ fn test_write_for_file(test_file: &str) { } let result = String::from_utf8(buf).unwrap(); - let expected = read_to_string(test_file).unwrap(); - for (r, e) in result.lines().zip(expected.lines()) { + for (r, e) in result.lines().zip(data.lines()) { let mut result_json = serde_json::from_str::(r).unwrap(); let expected_json = serde_json::from_str::(e).unwrap(); if let Value::Object(e) = &expected_json { - // remove null value from object to make comparision consistent: + // remove null value from object to make comparison consistent: if let Value::Object(r) = result_json { result_json = Value::Object( r.into_iter() @@ -43,16 +42,178 @@ fn test_write_for_file(test_file: &str) { } #[test] -fn write_basic_rows() { - test_write_for_file("test/data/basic.json"); +fn round_trip_basics() { + let (data, _, _) = case_basics(); + round_trip(data); } #[test] -fn write_arrays() { - test_write_for_file("test/data/arrays.json"); +fn round_trip_list() { + let (data, _, _) = case_list(); + round_trip(data); } -#[test] -fn write_basic_nulls() { - test_write_for_file("test/data/basic_nulls.json"); +fn case_list() -> (String, Schema, Vec>) { + let data = r#"{"a":1, "b":[2.0, 1.3, -6.1], "c":[false, true], "d":"4"} + {"a":-10, "b":null, "c":[true, true]} + {"a":null, "b":[2.1, null, -6.2], "c":[false, null], "d":"text"} + "# + .to_string(); + + let schema = Schema::new(vec![ + Field::new("a", DataType::Int64, true), + Field::new( + "b", + DataType::List(Box::new(Field::new("item", DataType::Float64, true))), + true, + ), + Field::new( + "c", + DataType::List(Box::new(Field::new("item", DataType::Boolean, true))), + true, + ), + Field::new("d", DataType::Utf8, true), + ]); + let a = Int64Array::from(&[Some(1), Some(-10), None]); + + let mut b = MutableListArray::>::new(); + b.try_extend(vec![ + Some(vec![Some(2.0), Some(1.3), Some(-6.1)]), + None, + Some(vec![Some(2.1), None, Some(-6.2)]), + ]) + .unwrap(); + let b: ListArray = b.into(); + + let mut c = MutableListArray::::new(); + c.try_extend(vec![ + Some(vec![Some(false), Some(true)]), + Some(vec![Some(true), Some(true)]), + Some(vec![Some(false), None]), + ]) + .unwrap(); + let c: ListArray = c.into(); + + let d = Utf8Array::::from(&[Some("4"), None, Some("text")]); + + let columns = vec![ + Box::new(a) as Box, + Box::new(b), + Box::new(c), + Box::new(d), + ]; + + (data, schema, columns) +} + +fn case_dict() -> (String, Schema, Vec>) { + let data = r#"{"machine": "a", "events": [null, "Elect Leader", "Do Ballot"]} + {"machine": "b", "events": ["Do Ballot", null, "Send Data", "Elect Leader"]} + {"machine": "c", "events": ["Send Data"]} + {"machine": "c"} + {"machine": "c", "events": null} + "# + .to_string(); + + let data_type = DataType::List(Box::new(Field::new( + "item", + DataType::Dictionary(Box::new(DataType::UInt64), Box::new(DataType::Utf8)), + true, + ))); + + let schema = Schema::new(vec![Field::new("events", data_type, true)]); + + type A = MutableDictionaryArray>; + + let mut array = MutableListArray::::new(); + array + .try_extend(vec![ + Some(vec![None, Some("Elect Leader"), Some("Do Ballot")]), + Some(vec![ + Some("Do Ballot"), + None, + Some("Send Data"), + Some("Elect Leader"), + ]), + Some(vec![Some("Send Data")]), + None, + None, + ]) + .unwrap(); + + let array: ListArray = array.into(); + + (data, schema, vec![Box::new(array) as Box]) +} + +fn case_basics() -> (String, Schema, Vec>) { + let data = r#"{"a":1, "b":2.0, "c":false, "d":"4"} + {"a":-10, "b":-3.5, "c":true, "d":null} + {"a":100000000, "b":0.6, "d":"text"}"# + .to_string(); + let schema = Schema::new(vec![ + Field::new("a", DataType::Int64, true), + Field::new("b", DataType::Float64, true), + Field::new("c", DataType::Boolean, true), + Field::new("d", DataType::Utf8, true), + ]); + let columns = vec![ + Box::new(Int64Array::from_slice(&[1, -10, 100000000])) as Box, + Box::new(Float64Array::from_slice(&[2.0, -3.5, 0.6])), + Box::new(BooleanArray::from(&[Some(false), Some(true), None])), + Box::new(Utf8Array::::from(&[Some("4"), None, Some("text")])), + ]; + (data, schema, columns) +} + +fn case_basics_schema() -> (String, Schema, Vec>) { + let data = r#"{"a":1, "b":2.0, "c":false, "d":"4"} + {"a":10, "b":-3.5, "c":true, "d":null} + {"a":100000000, "b":0.6, "d":"text"}"# + .to_string(); + let schema = Schema::new(vec![ + Field::new("a", DataType::UInt32, true), + Field::new("b", DataType::Float32, true), + Field::new("c", DataType::Boolean, true), + // note how "d" is not here + ]); + let columns = vec![ + Box::new(UInt32Array::from_slice(&[1, 10, 100000000])) as Box, + Box::new(Float32Array::from_slice(&[2.0, -3.5, 0.6])), + Box::new(BooleanArray::from(&[Some(false), Some(true), None])), + ]; + (data, schema, columns) +} + +fn case_struct() -> (String, Schema, Vec>) { + let data = r#"{"a": {"b": true, "c": {"d": "text"}}} + {"a": {"b": false, "c": null}} + {"a": {"b": true, "c": {"d": "text"}}} + {"a": 1}"# + .to_string(); + + let d_field = Field::new("d", DataType::Utf8, true); + let c_field = Field::new("c", DataType::Struct(vec![d_field.clone()]), true); + let a_field = Field::new( + "a", + DataType::Struct(vec![ + Field::new("b", DataType::Boolean, true), + c_field.clone(), + ]), + true, + ); + let schema = Schema::new(vec![a_field]); + + // build expected output + let d = Utf8Array::::from(&vec![Some("text"), None, Some("text"), None]); + let c = StructArray::from_data(vec![d_field], vec![Arc::new(d)], None); + + let b = BooleanArray::from(vec![Some(true), Some(false), Some(true), None]); + let expected = StructArray::from_data( + vec![Field::new("b", DataType::Boolean, true), c_field], + vec![Arc::new(b), Arc::new(c)], + None, + ); + + (data, schema, vec![Box::new(expected) as Box]) } diff --git a/tests/it/io/json/read.rs b/tests/it/io/json/read.rs index de3bd737623..89dc10e39b4 100644 --- a/tests/it/io/json/read.rs +++ b/tests/it/io/json/read.rs @@ -1,423 +1,128 @@ -use flate2::read::GzDecoder; use std::io::BufReader; -use std::{ - fs::File, - io::{Seek, SeekFrom}, -}; use std::{io::Cursor, sync::Arc}; use arrow2::array::*; use arrow2::datatypes::*; use arrow2::{bitmap::Bitmap, buffer::Buffer, error::Result, io::json::*}; +use crate::io::json::*; + #[test] -fn json_basic() { - let builder = ReaderBuilder::new().infer_schema(None).with_batch_size(64); - let mut reader: Reader = builder - .build::(File::open("test/data/basic.json").unwrap()) - .unwrap(); - let batch = reader.next().unwrap().unwrap(); +fn basic() -> Result<()> { + let (data, schema, columns) = case_basics(); - assert_eq!(4, batch.num_columns()); - assert_eq!(12, batch.num_rows()); + let mut reader = ReaderBuilder::new().build(Cursor::new(data))?; + let batch = reader.next()?.unwrap(); - let schema = reader.schema(); - let batch_schema = batch.schema(); - assert_eq!(schema, batch_schema); - - let a = schema.column_with_name("a").unwrap(); - assert_eq!(0, a.0); - assert_eq!(&DataType::Int64, a.1.data_type()); - let b = schema.column_with_name("b").unwrap(); - assert_eq!(1, b.0); - assert_eq!(&DataType::Float64, b.1.data_type()); - let c = schema.column_with_name("c").unwrap(); - assert_eq!(2, c.0); - assert_eq!(&DataType::Boolean, c.1.data_type()); - let d = schema.column_with_name("d").unwrap(); - assert_eq!(3, d.0); - assert_eq!(&DataType::Utf8, d.1.data_type()); - - let aa = batch - .column(a.0) - .as_any() - .downcast_ref::>() - .unwrap(); - assert_eq!(1, aa.value(0)); - assert_eq!(-10, aa.value(1)); - let bb = batch - .column(b.0) - .as_any() - .downcast_ref::>() - .unwrap(); - assert!((2.0 - bb.value(0)).abs() < f64::EPSILON); - assert!((-3.5 - bb.value(1)).abs() < f64::EPSILON); - let cc = batch - .column(c.0) - .as_any() - .downcast_ref::() - .unwrap(); - assert!(!cc.value(0)); - assert!(cc.value(10)); - let dd = batch - .column(d.0) - .as_any() - .downcast_ref::>() - .unwrap(); - assert_eq!("4", dd.value(0)); - assert_eq!("text", dd.value(8)); + assert_eq!(&schema, batch.schema().as_ref()); + + columns + .iter() + .zip(batch.columns()) + .for_each(|(expected, result)| assert_eq!(expected.as_ref(), result.as_ref())); + Ok(()) } #[test] -fn json_basic_with_nulls() { - let builder = ReaderBuilder::new().infer_schema(None).with_batch_size(64); - let mut reader: Reader = builder - .build::(File::open("test/data/basic_nulls.json").unwrap()) - .unwrap(); - let batch = reader.next().unwrap().unwrap(); +fn basics_with_schema_projection() -> Result<()> { + let (data, schema, columns) = case_basics_schema(); - assert_eq!(4, batch.num_columns()); - assert_eq!(12, batch.num_rows()); + let mut reader = ReaderBuilder::new() + .with_schema(Arc::new(schema.clone())) + .build(Cursor::new(data))?; + let batch = reader.next()?.unwrap(); - let schema = reader.schema(); - let batch_schema = batch.schema(); - assert_eq!(schema, batch_schema); - - let a = schema.column_with_name("a").unwrap(); - assert_eq!(&DataType::Int64, a.1.data_type()); - let b = schema.column_with_name("b").unwrap(); - assert_eq!(&DataType::Float64, b.1.data_type()); - let c = schema.column_with_name("c").unwrap(); - assert_eq!(&DataType::Boolean, c.1.data_type()); - let d = schema.column_with_name("d").unwrap(); - assert_eq!(&DataType::Utf8, d.1.data_type()); - - let aa = batch - .column(a.0) - .as_any() - .downcast_ref::>() - .unwrap(); - assert!(aa.is_valid(0)); - assert!(!aa.is_valid(1)); - assert!(!aa.is_valid(11)); - let bb = batch - .column(b.0) - .as_any() - .downcast_ref::>() - .unwrap(); - assert!(bb.is_valid(0)); - assert!(!bb.is_valid(2)); - assert!(!bb.is_valid(11)); - let cc = batch - .column(c.0) - .as_any() - .downcast_ref::() - .unwrap(); - assert!(cc.is_valid(0)); - assert!(!cc.is_valid(4)); - assert!(!cc.is_valid(11)); - let dd = batch - .column(d.0) - .as_any() - .downcast_ref::>() - .unwrap(); - assert!(!dd.is_valid(0)); - assert!(dd.is_valid(1)); - assert!(!dd.is_valid(4)); - assert!(!dd.is_valid(11)); -} + assert_eq!(&schema, batch.schema().as_ref()); -#[test] -fn json_basic_schema() { - let schema = Arc::new(Schema::new(vec![ - Field::new("a", DataType::Int32, false), - Field::new("b", DataType::Float32, false), - Field::new("c", DataType::Boolean, false), - Field::new("d", DataType::Utf8, false), - ])); - - let mut reader: Reader = Reader::new( - File::open("test/data/basic.json").unwrap(), - schema.clone(), - 1024, - None, - ); - let reader_schema = reader.schema(); - assert_eq!(reader_schema, &schema); - let batch = reader.next().unwrap().unwrap(); - - assert_eq!(4, batch.num_columns()); - assert_eq!(12, batch.num_rows()); - - let schema = batch.schema(); - - let a = schema.column_with_name("a").unwrap(); - assert_eq!(&DataType::Int32, a.1.data_type()); - let b = schema.column_with_name("b").unwrap(); - assert_eq!(&DataType::Float32, b.1.data_type()); - let c = schema.column_with_name("c").unwrap(); - assert_eq!(&DataType::Boolean, c.1.data_type()); - let d = schema.column_with_name("d").unwrap(); - assert_eq!(&DataType::Utf8, d.1.data_type()); - - let aa = batch - .column(a.0) - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(1, aa.value(0)); - // test that a 64bit value is returned as null due to overflowing - assert!(!aa.is_valid(11)); - let bb = batch - .column(b.0) - .as_any() - .downcast_ref::() - .unwrap(); - assert!((2.0 - bb.value(0)).abs() < f32::EPSILON); - assert!((-3.5 - bb.value(1)).abs() < f32::EPSILON); + columns + .iter() + .zip(batch.columns()) + .for_each(|(expected, result)| assert_eq!(expected.as_ref(), result.as_ref())); + Ok(()) } #[test] -fn json_basic_schema_projection() { - // We test implicit and explicit projection: - // Implicit: omitting fields from a schema - // Explicit: supplying a vec of fields to take - let schema = Arc::new(Schema::new(vec![ - Field::new("a", DataType::Int32, false), - Field::new("b", DataType::Float32, false), - Field::new("c", DataType::Boolean, false), - ])); - - let mut reader: Reader = Reader::new( - File::open("test/data/basic.json").unwrap(), - schema, - 1024, - Some(vec!["a".to_string(), "c".to_string()]), - ); - let reader_schema = reader.schema().clone(); - let expected_schema = Schema::new(vec![ - Field::new("a", DataType::Int32, false), - Field::new("c", DataType::Boolean, false), - ]); - assert_eq!(reader_schema.as_ref(), &expected_schema); +fn lists() -> Result<()> { + let (data, schema, columns) = case_list(); - let batch = reader.next().unwrap().unwrap(); - - assert_eq!(2, batch.num_columns()); - assert_eq!(12, batch.num_rows()); + let builder = ReaderBuilder::new().infer_schema(None).with_batch_size(64); + let mut reader = builder.build(Cursor::new(data))?; + let batch = reader.next()?.unwrap(); - let batch_schema = batch.schema(); - assert_eq!(&reader_schema, batch_schema); + assert_eq!(&schema, batch.schema().as_ref()); - let a = batch_schema.column_with_name("a").unwrap(); - assert_eq!(0, a.0); - assert_eq!(&DataType::Int32, a.1.data_type()); - let c = batch_schema.column_with_name("c").unwrap(); - assert_eq!(1, c.0); - assert_eq!(&DataType::Boolean, c.1.data_type()); + columns + .iter() + .zip(batch.columns()) + .for_each(|(expected, result)| assert_eq!(expected.as_ref(), result.as_ref())); + Ok(()) } #[test] -fn json_arrays() { - let builder = ReaderBuilder::new().infer_schema(None).with_batch_size(64); - let mut reader: Reader = builder - .build::(File::open("test/data/arrays.json").unwrap()) - .unwrap(); - let batch = reader.next().unwrap().unwrap(); +fn line_break_in_values() -> Result<()> { + let data = r#" + {"a":"aa\n\n"} + {"a":"aa\n"} + {"a":null} + "#; - assert_eq!(4, batch.num_columns()); - assert_eq!(3, batch.num_rows()); + let builder = ReaderBuilder::new().infer_schema(None).with_batch_size(64); + let mut reader = builder.build(Cursor::new(data))?; + let batch = reader.next()?.unwrap(); - let schema = batch.schema(); + let expected = Utf8Array::::from(&[Some("aa\n\n"), Some("aa\n"), None]); - let a = schema.column_with_name("a").unwrap(); - assert_eq!(&DataType::Int64, a.1.data_type()); - let b = schema.column_with_name("b").unwrap(); - assert_eq!( - &DataType::List(Box::new(Field::new("item", DataType::Float64, true))), - b.1.data_type() - ); - let c = schema.column_with_name("c").unwrap(); - assert_eq!( - &DataType::List(Box::new(Field::new("item", DataType::Boolean, true))), - c.1.data_type() - ); - let d = schema.column_with_name("d").unwrap(); - assert_eq!(&DataType::Utf8, d.1.data_type()); - - let aa = batch - .column(a.0) - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(1, aa.value(0)); - assert_eq!(-10, aa.value(1)); - let bb = batch - .column(b.0) - .as_any() - .downcast_ref::>() - .unwrap(); - let bb = bb.values(); - let bb = bb.as_any().downcast_ref::().unwrap(); - assert_eq!(9, bb.len()); - assert!((2.0 - bb.value(0)).abs() < f64::EPSILON); - assert!((-6.1 - bb.value(5)).abs() < f64::EPSILON); - assert!(!bb.is_valid(7)); - - let cc = batch - .column(c.0) - .as_any() - .downcast_ref::>() - .unwrap(); - let cc = cc.values(); - let cc = cc.as_any().downcast_ref::().unwrap(); - assert_eq!(6, cc.len()); - assert!(!cc.value(0)); - assert!(!cc.value(4)); - assert!(!cc.is_valid(5)); + assert_eq!(expected, batch.columns()[0].as_ref()); + Ok(()) } #[test] -fn invalid_json_infer_schema() { - let re = infer_json_schema_from_seekable( - &mut BufReader::new(File::open("test/data/uk_cities_with_headers.csv").unwrap()), - None, - ); +fn invalid_infer_schema() -> Result<()> { + let re = + infer_json_schema_from_seekable(&mut BufReader::new(Cursor::new("city,lat,lng")), None); assert_eq!( re.err().unwrap().to_string(), "External error: expected value at line 1 column 1", ); + Ok(()) } #[test] -fn invalid_json_read_record() { +fn invalid_read_record() -> Result<()> { let schema = Arc::new(Schema::new(vec![Field::new( "a", DataType::Struct(vec![Field::new("a", DataType::Utf8, true)]), true, )])); let builder = ReaderBuilder::new().with_schema(schema).with_batch_size(64); - let mut reader: Reader = builder - .build::(File::open("test/data/uk_cities_with_headers.csv").unwrap()) - .unwrap(); + let mut data = Cursor::new("city,lat,lng"); + let mut reader = builder.build(&mut data)?; assert_eq!( reader.next().err().unwrap().to_string(), "External error: expected value at line 1 column 1", ); + Ok(()) } #[test] -fn mixed_json_arrays() { - let builder = ReaderBuilder::new().infer_schema(None).with_batch_size(64); - let mut reader: Reader = builder - .build::(File::open("test/data/mixed_arrays.json").unwrap()) - .unwrap(); - let batch = reader.next().unwrap().unwrap(); - - let mut file = File::open("test/data/mixed_arrays.json.gz").unwrap(); - let mut reader = BufReader::new(GzDecoder::new(&file)); - let schema = Arc::new(infer_json_schema(&mut reader, None).unwrap()); - file.seek(SeekFrom::Start(0)).unwrap(); - - let reader = BufReader::new(GzDecoder::new(&file)); - let mut reader = Reader::from_buf_reader(reader, schema, 64, None); - let batch_gz = reader.next().unwrap().unwrap(); - - for batch in vec![batch, batch_gz] { - assert_eq!(4, batch.num_columns()); - assert_eq!(4, batch.num_rows()); - - let schema = batch.schema(); - - let a = schema.column_with_name("a").unwrap(); - assert_eq!(&DataType::Int64, a.1.data_type()); - let b = schema.column_with_name("b").unwrap(); - assert_eq!( - &DataType::List(Box::new(Field::new("item", DataType::Float64, true))), - b.1.data_type() - ); - let c = schema.column_with_name("c").unwrap(); - assert_eq!( - &DataType::List(Box::new(Field::new("item", DataType::Boolean, true))), - c.1.data_type() - ); - let d = schema.column_with_name("d").unwrap(); - assert_eq!( - &DataType::List(Box::new(Field::new("item", DataType::Utf8, true))), - d.1.data_type() - ); - - let bb = batch - .column(b.0) - .as_any() - .downcast_ref::>() - .unwrap(); - let bb = bb.values(); - let bb = bb.as_any().downcast_ref::().unwrap(); - assert_eq!(9, bb.len()); - assert!((-6.1 - bb.value(8)).abs() < f64::EPSILON); - - let cc = batch - .column(c.0) - .as_any() - .downcast_ref::>() - .unwrap(); - let cc = cc.values(); - let cc = cc.as_any().downcast_ref::().unwrap(); - let cc_expected = BooleanArray::from(vec![Some(false), Some(true), Some(false), None]); - assert_eq!(cc, &cc_expected); - - let dd = batch - .column(d.0) - .as_any() - .downcast_ref::>() - .unwrap(); - let dd = dd.values(); - let dd = dd.as_any().downcast_ref::>().unwrap(); - assert_eq!( - dd, - &Utf8Array::::from_slice(&["1", "false", "array", "2.4"]) - ); - } -} - -#[test] -fn nested_struct_json_arrays() { - let d_field = Field::new("d", DataType::Utf8, true); - let c_field = Field::new("c", DataType::Struct(vec![d_field.clone()]), true); - let a_field = Field::new( - "a", - DataType::Struct(vec![ - Field::new("b", DataType::Boolean, true), - c_field.clone(), - ]), - true, - ); - let schema = Arc::new(Schema::new(vec![a_field])); - let builder = ReaderBuilder::new().with_schema(schema).with_batch_size(64); - let mut reader: Reader = builder - .build::(File::open("test/data/nested_structs.json").unwrap()) - .unwrap(); +fn nested_struct_arrays() -> Result<()> { + let (data, schema, columns) = case_struct(); - // build expected output - let d = Utf8Array::::from(&vec![Some("text"), None, Some("text"), None]); - let c = StructArray::from_data(vec![d_field], vec![Arc::new(d)], None); + let builder = ReaderBuilder::new().with_schema(Arc::new(schema.clone())); + let mut reader = builder.build(Cursor::new(data))?; + let batch = reader.next()?.unwrap(); - let b = BooleanArray::from(vec![Some(true), Some(false), Some(true), None]); - let expected = StructArray::from_data( - vec![Field::new("b", DataType::Boolean, true), c_field], - vec![Arc::new(b), Arc::new(c)], - None, - ); + assert_eq!(&schema, batch.schema().as_ref()); - // compare `a` with result from json reader - let batch = reader.next().unwrap().unwrap(); - let read = batch.column(0); - assert_eq!(expected, read.as_ref()); + columns + .iter() + .zip(batch.columns()) + .for_each(|(expected, result)| assert_eq!(expected.as_ref(), result.as_ref())); + Ok(()) } #[test] -fn nested_list_json_arrays() { +fn nested_list_arrays() { let d_field = Field::new("d", DataType::Utf8, true); let c_field = Field::new("c", DataType::Struct(vec![d_field.clone()]), true); let b_field = Field::new("b", DataType::Boolean, true); @@ -430,14 +135,14 @@ fn nested_list_json_arrays() { let a_field = Field::new("a", a_list_data_type.clone(), true); let schema = Arc::new(Schema::new(vec![a_field])); let builder = ReaderBuilder::new().with_schema(schema).with_batch_size(64); - let json_content = r#" + let content = r#" {"a": [{"b": true, "c": {"d": "a_text"}}, {"b": false, "c": {"d": "b_text"}}]} {"a": [{"b": false, "c": null}]} {"a": [{"b": true, "c": {"d": "c_text"}}, {"b": null, "c": {"d": "d_text"}}, {"b": true, "c": {"d": null}}]} {"a": null} {"a": []} "#; - let mut reader = builder.build(Cursor::new(json_content)).unwrap(); + let mut reader = builder.build(Cursor::new(content)).unwrap(); // build expected output let d = Utf8Array::::from(&vec![ @@ -477,65 +182,16 @@ fn nested_list_json_arrays() { assert_eq!(expected, read.as_ref()); } -#[test] -fn dictionary_from_json_basic_with_nulls() -> Result<()> { - let schema = Arc::new(Schema::new(vec![Field::new( - "d", - DataType::Dictionary(Box::new(DataType::Int16), Box::new(DataType::Utf8)), - true, - )])); - let builder = ReaderBuilder::new().with_schema(schema).with_batch_size(64); - let mut reader: Reader = builder - .build::(File::open("test/data/basic_nulls.json").unwrap()) - .unwrap(); - let batch = reader.next().unwrap().unwrap(); - - assert_eq!(1, batch.num_columns()); - assert_eq!(12, batch.num_rows()); - - let schema = reader.schema(); - let batch_schema = batch.schema(); - assert_eq!(schema, batch_schema); - - let d = schema.column_with_name("d").unwrap(); - let data_type = DataType::Dictionary(Box::new(DataType::Int16), Box::new(DataType::Utf8)); - assert_eq!(&data_type, d.1.data_type()); - - let result = batch.column(d.0); - - let values = vec![ - None, - Some("4"), - Some("text"), - Some("4"), - None, - None, - Some("4"), - None, - Some("text"), - Some("4"), - Some("4"), - None, - ]; - - let mut expected = MutableDictionaryArray::>::new(); - expected.try_extend(values)?; - let expected: DictionaryArray = expected.into(); - - assert_eq!(expected, result.as_ref()); - Ok(()) -} - #[test] fn skip_empty_lines() { let builder = ReaderBuilder::new().infer_schema(None).with_batch_size(64); - let json_content = " + let content = " {\"a\": 1} {\"a\": 2} {\"a\": 3}"; - let mut reader = builder.build(Cursor::new(json_content)).unwrap(); + let mut reader = builder.build(Cursor::new(content)).unwrap(); let batch = reader.next().unwrap().unwrap(); assert_eq!(1, batch.num_columns()); @@ -549,10 +205,10 @@ fn skip_empty_lines() { #[test] fn row_type_validation() { let builder = ReaderBuilder::new().infer_schema(None).with_batch_size(64); - let json_content = " + let content = " [1, \"hello\"] \"world\""; - let re = builder.build(Cursor::new(json_content)); + let re = builder.build(Cursor::new(content)); assert_eq!( re.err().unwrap().to_string(), r#"Expected JSON record to be an object, found Array([Number(1), String("hello")])"#, @@ -560,75 +216,60 @@ fn row_type_validation() { } #[test] -fn list_of_string_dictionary_from_json_with_nulls() -> Result<()> { - let data_type = DataType::List(Box::new(Field::new( - "item", - DataType::Dictionary(Box::new(DataType::UInt64), Box::new(DataType::Utf8)), - true, - ))); - - let schema = Arc::new(Schema::new(vec![Field::new( - "events", - data_type.clone(), - true, - )])); - let builder = ReaderBuilder::new().with_schema(schema).with_batch_size(64); - let mut reader: Reader = builder - .build::(File::open("test/data/list_string_dict_nested_nulls.json").unwrap()) - .unwrap(); - let batch = reader.next().unwrap().unwrap(); - - assert_eq!(1, batch.num_columns()); - assert_eq!(3, batch.num_rows()); - - let schema = reader.schema(); - let batch_schema = batch.schema(); - assert_eq!(schema, batch_schema); +fn list_of_string_dictionary_from_with_nulls() -> Result<()> { + let (data, schema, columns) = case_dict(); - let events = schema.column_with_name("events").unwrap(); - assert_eq!(&data_type, events.1.data_type()); - - let expected = vec![ - Some(vec![None, Some("Elect Leader"), Some("Do Ballot")]), - Some(vec![ - Some("Do Ballot"), - None, - Some("Send Data"), - Some("Elect Leader"), - ]), - Some(vec![Some("Send Data")]), - ]; - - type A = MutableDictionaryArray>; - - let mut array = MutableListArray::::new(); - array.try_extend(expected)?; + let builder = ReaderBuilder::new() + .with_schema(Arc::new(schema)) + .with_batch_size(64); + let mut reader = builder.build(Cursor::new(data))?; + let batch = reader.next()?.unwrap(); - let expected: ListArray = array.into(); + assert_eq!(reader.schema(), batch.schema()); - assert_eq!(expected, batch.column(0).as_ref()); + assert_eq!(columns[0].as_ref(), batch.columns()[0].as_ref()); Ok(()) } #[test] -fn with_multiple_batches() { +fn with_multiple_batches() -> Result<()> { + let data = r#" + {"a":1} + {"a":null} + {} + {"a":1} + {"a":7} + {"a":1} + {"a":1} + {"a":5} + {"a":1} + {"a":1} + {"a":1} + {} + "#; + let builder = ReaderBuilder::new() .infer_schema(Some(4)) .with_batch_size(5); - let mut reader: Reader = builder - .build::(File::open("test/data/basic_nulls.json").unwrap()) - .unwrap(); + let mut reader = builder.build(Cursor::new(data))?; let mut num_records = Vec::new(); - while let Some(rb) = reader.next().unwrap() { + while let Some(rb) = reader.next()? { num_records.push(rb.num_rows()); } assert_eq!(vec![5, 5, 2], num_records); + Ok(()) } #[test] -fn json_infer_schema() { +fn infer_schema_mixed_list() -> Result<()> { + let data = r#"{"a":1, "b":[2.0, 1.3, -6.1], "c":[false, true], "d":4.1} + {"a":-10, "b":[2.0, 1.3, -6.1], "c":null, "d":null} + {"a":2, "b":[2.0, null, -6.1], "c":[false, null], "d":"text"} + {"a":3, "b":4, "c": true, "d":[1, false, "array", 2.4]} + "#; + let schema = Schema::new(vec![ Field::new("a", DataType::Int64, true), Field::new( @@ -648,14 +289,8 @@ fn json_infer_schema() { ), ]); - let mut reader = BufReader::new(File::open("test/data/mixed_arrays.json").unwrap()); - let inferred_schema = infer_json_schema_from_seekable(&mut reader, None).unwrap(); - - assert_eq!(inferred_schema, schema); - - let file = File::open("test/data/mixed_arrays.json.gz").unwrap(); - let mut reader = BufReader::new(GzDecoder::new(&file)); - let inferred_schema = infer_json_schema(&mut reader, None).unwrap(); + let inferred_schema = infer_json_schema(&mut BufReader::new(Cursor::new(data)), None)?; assert_eq!(inferred_schema, schema); + Ok(()) }