Skip to content

Commit

Permalink
make_read_json_accept_empty_list
Browse files Browse the repository at this point in the history
  • Loading branch information
deanm0000 committed Sep 6, 2024
1 parent e09cd0b commit b07dae2
Show file tree
Hide file tree
Showing 3 changed files with 62 additions and 20 deletions.
21 changes: 17 additions & 4 deletions crates/polars-io/src/json/infer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,24 +2,37 @@ use std::num::NonZeroUsize;

use polars_core::prelude::DataType;
use polars_core::utils::try_get_supertype;
use polars_error::{polars_bail, PolarsResult};
use polars_error::{polars_bail, PolarsError, PolarsResult};
use simd_json::BorrowedValue;

pub(crate) fn json_values_to_supertype(
values: &[BorrowedValue],
infer_schema_len: NonZeroUsize,
) -> PolarsResult<DataType> {
// struct types may have missing fields so find supertype
values
let out_opt: Option<Result<DataType, PolarsError>> = values
.iter()
.take(infer_schema_len.into())
.map(|value| polars_json::json::infer(value).map(|dt| DataType::from(&dt)))
.reduce(|l, r| {
let l = l?;
let r = r?;
try_get_supertype(&l, &r)
})
.unwrap_or_else(|| polars_bail!(ComputeError: "could not infer data-type"))
});
match (out_opt, values.len()==0) {
(Some(out), true) => {
match out {
Ok(out)=>Ok(out),
_=>Err(PolarsError::NoData("no data".into()))
}
},
(Some(out), false) => {
out
},
(None, true) => Err(PolarsError::NoData("no data".into())),
(None, false) => polars_bail!(ComputeError: "could not infer data-type")

}
}

pub(crate) fn dtypes_to_supertype<I: Iterator<Item = DataType>>(
Expand Down
49 changes: 34 additions & 15 deletions crates/polars-io/src/json/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ use std::ops::Deref;
use arrow::legacy::conversion::chunk_to_struct;
use polars_core::error::to_compute_err;
use polars_core::prelude::*;
use polars_error::{polars_bail, PolarsResult};
use polars_error::{polars_bail, PolarsError, PolarsResult};
use polars_json::json::write::FallibleStreamingIterator;
#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};
Expand Down Expand Up @@ -274,27 +274,41 @@ where
let mut bytes = rb.deref().to_vec();
let json_value =
simd_json::to_borrowed_value(&mut bytes).map_err(to_compute_err)?;

// struct type
let dtype = if let Some(mut schema) = self.schema {
let dtype_result = if let Some(mut schema) = self.schema {
if let Some(overwrite) = self.schema_overwrite {
let mut_schema = Arc::make_mut(&mut schema);
overwrite_schema(mut_schema, overwrite)?;
}

DataType::Struct(schema.iter_fields().collect()).to_arrow(CompatLevel::newest())
Ok(DataType::Struct(schema.iter_fields().collect())
.to_arrow(CompatLevel::newest()))
} else {
// infer
let inner_dtype = if let BorrowedValue::Array(values) = &json_value {
infer::json_values_to_supertype(
let inner_dtype_result = if let BorrowedValue::Array(values) = &json_value {
let supertype = infer::json_values_to_supertype(
values,
self.infer_schema_len
.unwrap_or(NonZeroUsize::new(usize::MAX).unwrap()),
)?
.to_arrow(CompatLevel::newest())
);
match supertype {
Ok(supertype) => Ok(supertype.to_arrow(CompatLevel::newest())),
Err(e) => Err(e),
}
} else {
polars_json::json::infer(&json_value)?
polars_json::json::infer(&json_value)
};
if inner_dtype_result.is_err() {
match &json_value {
BorrowedValue::Array(array) => {
if array.is_empty() {
return Ok(DataFrame::empty());
}
},
_ => {
polars_bail!(ComputeError: "could not infer data-type")
},
}
}
let inner_dtype = inner_dtype_result?;

if let Some(overwrite) = self.schema_overwrite {
let ArrowDataType::Struct(fields) = inner_dtype else {
Expand All @@ -304,18 +318,23 @@ where
let mut schema = Schema::from_iter(fields.iter().map(Into::<Field>::into));
overwrite_schema(&mut schema, overwrite)?;

DataType::Struct(
Ok(DataType::Struct(
schema
.into_iter()
.map(|(name, dt)| Field::new(name, dt))
.collect(),
)
.to_arrow(CompatLevel::newest())
.to_arrow(CompatLevel::newest()))
} else {
inner_dtype
Ok(inner_dtype)
}
};

if let Err(e) = &dtype_result {
if let PolarsError::NoData(_) = e {
return Ok(DataFrame::empty());
}
};
let dtype = dtype_result?;
let dtype = if let BorrowedValue::Array(_) = &json_value {
ArrowDataType::LargeList(Box::new(arrow::datatypes::Field::new(
PlSmallStr::from_static("item"),
Expand Down
12 changes: 11 additions & 1 deletion py-polars/tests/unit/io/test_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -306,7 +306,7 @@ def test_ndjson_null_inference_13183() -> None:
}


@pytest.mark.write_disk
@pytest.mark.write_disk()
@typing.no_type_check
def test_json_wrong_input_handle_textio(tmp_path: Path) -> None:
# this shouldn't be passed, but still we test if we can handle it gracefully
Expand Down Expand Up @@ -385,3 +385,13 @@ def test_empty_json() -> None:
df = pl.read_json(b'{"j":{}}')
assert df.dtypes == [pl.Struct([])]
assert df.shape == (0, 1)


def test_empty_list_json() -> None:
    """An empty JSON array, passed as text or as bytes, reads as a 0x0 DataFrame."""
    for source in (io.StringIO("[]"), b"[]"):
        df = pl.read_json(source)
        assert df.shape == (0, 0)
        assert isinstance(df, pl.DataFrame)

0 comments on commit b07dae2

Please sign in to comment.