Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: upgrade to Arrow 37 and Datafusion 23 #1314

Merged
merged 10 commits
Apr 29, 2023
12 changes: 0 additions & 12 deletions python/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -321,18 +321,6 @@ impl RawDeltaTable {
.collect())
}

pub fn arrow_schema_json(&self) -> PyResult<String> {
let schema = self
._table
.get_schema()
.map_err(PyDeltaTableError::from_raw)?;
serde_json::to_string(
&<ArrowSchema as TryFrom<&deltalake::Schema>>::try_from(schema)
.map_err(PyDeltaTableError::from_arrow)?,
)
.map_err(|_| PyDeltaTableError::new_err("Got invalid table schema"))
}
Comment on lines -324 to -334
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think this was used in our public API, and I know the arrow-rs folks wanted to remove the unofficial JSON schema serialization from their public API as well. I think it should be fine to drop this.


pub fn update_incremental(&mut self) -> PyResult<()> {
rt()?
.block_on(self._table.update_incremental(None))
Expand Down
44 changes: 25 additions & 19 deletions python/src/schema.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
extern crate pyo3;

use deltalake::arrow::datatypes::{
DataType as ArrowDataType, Field as ArrowField, Schema as ArrowSchema,
DataType as ArrowDataType, Field as ArrowField, FieldRef as ArrowFieldRef,
Schema as ArrowSchema,
};
use deltalake::arrow::error::ArrowError;
use deltalake::arrow::pyarrow::PyArrowType;
Expand Down Expand Up @@ -1023,42 +1024,49 @@ impl PySchema {
.try_into()
.map_err(|err: ArrowError| PyException::new_err(err.to_string()))?;

fn convert_to_large_type(field: ArrowField, dt: ArrowDataType) -> ArrowField {
fn convert_to_large_type(field: ArrowFieldRef, dt: ArrowDataType) -> ArrowFieldRef {
let field = field.as_ref().clone();
match dt {
ArrowDataType::Utf8 => field.with_data_type(ArrowDataType::LargeUtf8),
ArrowDataType::Utf8 => field.with_data_type(ArrowDataType::LargeUtf8).into(),

ArrowDataType::Binary => field.with_data_type(ArrowDataType::LargeBinary),
ArrowDataType::Binary => field.with_data_type(ArrowDataType::LargeBinary).into(),

ArrowDataType::List(f) => {
let sub_field = convert_to_large_type(*f.clone(), f.data_type().clone());
field.with_data_type(ArrowDataType::LargeList(Box::from(sub_field)))
let sub_field = convert_to_large_type(f.clone(), f.data_type().clone());
field
.with_data_type(ArrowDataType::LargeList(sub_field))
.into()
}

ArrowDataType::FixedSizeList(f, size) => {
let sub_field = convert_to_large_type(*f.clone(), f.data_type().clone());
field.with_data_type(ArrowDataType::FixedSizeList(Box::from(sub_field), size))
let sub_field = convert_to_large_type(f.clone(), f.data_type().clone());
field
.with_data_type(ArrowDataType::FixedSizeList(sub_field, size))
.into()
}

ArrowDataType::Map(f, sorted) => {
let sub_field = convert_to_large_type(*f.clone(), f.data_type().clone());
field.with_data_type(ArrowDataType::Map(Box::from(sub_field), sorted))
let sub_field = convert_to_large_type(f.clone(), f.data_type().clone());
field
.with_data_type(ArrowDataType::Map(sub_field, sorted))
.into()
}

ArrowDataType::Struct(fields) => {
let sub_fields = fields
.iter()
.map(|f| {
let dt: ArrowDataType = f.data_type().clone();
let f: ArrowField = f.clone();

convert_to_large_type(f, dt)
convert_to_large_type(f.clone(), dt)
})
.collect();

field.with_data_type(ArrowDataType::Struct(sub_fields))
field
.with_data_type(ArrowDataType::Struct(sub_fields))
.into()
}

_ => field,
_ => field.into(),
}
}

Expand All @@ -1068,11 +1076,9 @@ impl PySchema {
.iter()
.map(|f| {
let dt: ArrowDataType = f.data_type().clone();
let f: ArrowField = f.clone();

convert_to_large_type(f, dt)
convert_to_large_type(f.clone(), dt)
})
.collect(),
.collect::<Vec<ArrowFieldRef>>(),
);

Ok(PyArrowType(schema))
Expand Down
20 changes: 10 additions & 10 deletions rust/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "deltalake"
version = "0.9.1"
version = "0.10.0"
rust-version = "1.64"
authors = ["Qingping Hou <dave2008713@gmail.com>"]
homepage = "https://github.com/delta-io/delta.rs"
Expand All @@ -13,7 +13,7 @@ readme = "README.md"
edition = "2021"

[dependencies]
arrow = { version = "36.0.0", optional = true }
arrow = { version = "37.0.0", optional = true }
async-trait = "0.1"
bytes = "1"
chrono = { version = "0.4.22", default-features = false, features = ["clock"] }
Expand All @@ -30,7 +30,7 @@ num-traits = "0.2.15"
object_store = "0.5.6"
once_cell = "1.16.0"
parking_lot = "0.12"
parquet = { version = "36", features = [
parquet = { version = "37", features = [
"async",
"object_store",
], optional = true }
Expand All @@ -54,14 +54,14 @@ rusoto_dynamodb = { version = "0.47", default-features = false, optional = true
rusoto_glue = { version = "0.47", default-features = false, optional = true }

# Datafusion
datafusion = { version = "22", optional = true }
datafusion-expr = { version = "22", optional = true }
datafusion-common = { version = "22", optional = true }
datafusion-proto = { version = "22", optional = true }
datafusion-sql = { version = "22", optional = true }
datafusion-physical-expr = { version = "22", optional = true }
datafusion = { version = "23", optional = true }
datafusion-expr = { version = "23", optional = true }
datafusion-common = { version = "23", optional = true }
datafusion-proto = { version = "23", optional = true }
datafusion-sql = { version = "23", optional = true }
datafusion-physical-expr = { version = "23", optional = true }

sqlparser = { version = "0.32", optional = true }
sqlparser = { version = "0.33", optional = true }

# NOTE dependencies only for integration tests
fs_extra = { version = "1.2.0", optional = true }
Expand Down
Loading