Skip to content
This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Treat empty timezone string as no-timezone #1009

Merged
merged 6 commits into from
May 26, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions src/array/primitive/fmt.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@ use crate::array::Array;
use crate::datatypes::{IntervalUnit, TimeUnit};
use crate::types::{days_ms, months_days_ns};

use super::super::super::temporal_conversions;
use super::super::super::types::NativeType;
use super::super::fmt::write_vec;
use super::PrimitiveArray;
use crate::array::fmt::write_vec;
use crate::temporal_conversions;
use crate::types::NativeType;

macro_rules! dyn_primitive {
($array:expr, $ty:ty, $expr:expr) => {{
Expand Down
233 changes: 170 additions & 63 deletions src/ffi/schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -288,74 +288,102 @@ unsafe fn to_data_type(schema: &ArrowSchema) -> Result<DataType> {
DataType::Struct(children)
}
other => {
let parts = other.split(':').collect::<Vec<_>>();
if parts.len() == 2 && parts[0] == "tss" {
DataType::Timestamp(TimeUnit::Second, Some(parts[1].to_string()))
} else if parts.len() == 2 && parts[0] == "tsm" {
DataType::Timestamp(TimeUnit::Millisecond, Some(parts[1].to_string()))
} else if parts.len() == 2 && parts[0] == "tsu" {
DataType::Timestamp(TimeUnit::Microsecond, Some(parts[1].to_string()))
} else if parts.len() == 2 && parts[0] == "tsn" {
DataType::Timestamp(TimeUnit::Nanosecond, Some(parts[1].to_string()))
} else if parts.len() == 2 && parts[0] == "w" {
let size = parts[1].parse::<usize>().map_err(|_| {
ArrowError::OutOfSpec("size is not a valid integer".to_string())
})?;
DataType::FixedSizeBinary(size)
} else if parts.len() == 2 && parts[0] == "+w" {
let size = parts[1].parse::<usize>().map_err(|_| {
ArrowError::OutOfSpec("size is not a valid integer".to_string())
})?;
let child = to_field(schema.child(0))?;
DataType::FixedSizeList(Box::new(child), size)
} else if parts.len() == 2 && parts[0] == "d" {
let parts = parts[1].split(',').collect::<Vec<_>>();
if parts.len() < 2 || parts.len() > 3 {
return Err(ArrowError::OutOfSpec(
"Decimal must contain 2 or 3 comma-separated values".to_string(),
));
};
if parts.len() == 3 {
let bit_width = parts[0].parse::<usize>().map_err(|_| {
ArrowError::OutOfSpec(
"Decimal bit width is not a valid integer".to_string(),
)
match other.split(':').collect::<Vec<_>>()[..] {
// Timestamps with no timezone
["tss", ""] => DataType::Timestamp(TimeUnit::Second, None),
["tsm", ""] => DataType::Timestamp(TimeUnit::Millisecond, None),
["tsu", ""] => DataType::Timestamp(TimeUnit::Microsecond, None),
["tsn", ""] => DataType::Timestamp(TimeUnit::Nanosecond, None),

// Timestamps with timezone
["tss", tz] => DataType::Timestamp(TimeUnit::Second, Some(tz.to_string())),
["tsm", tz] => DataType::Timestamp(TimeUnit::Millisecond, Some(tz.to_string())),
["tsu", tz] => DataType::Timestamp(TimeUnit::Microsecond, Some(tz.to_string())),
["tsn", tz] => DataType::Timestamp(TimeUnit::Nanosecond, Some(tz.to_string())),

["w", size_raw] => {
// Example: "w:42" fixed-width binary [42 bytes]
let size = size_raw.parse::<usize>().map_err(|_| {
ArrowError::OutOfSpec("size is not a valid integer".to_string())
})?;
if bit_width != 128 {
return Err(ArrowError::OutOfSpec(
"Decimal256 is not supported".to_string(),
));
}
DataType::FixedSizeBinary(size)
}
let precision = parts[0].parse::<usize>().map_err(|_| {
ArrowError::OutOfSpec("Decimal precision is not a valid integer".to_string())
})?;
let scale = parts[1].parse::<usize>().map_err(|_| {
ArrowError::OutOfSpec("Decimal scale is not a valid integer".to_string())
})?;
DataType::Decimal(precision, scale)
} else if !parts.is_empty() && ((parts[0] == "+us") || (parts[0] == "+ud")) {
// union
let mode = UnionMode::sparse(parts[0] == "+us");
let type_ids = parts[1]
.split(',')
.map(|x| {
x.parse::<i32>().map_err(|_| {
["+w", size_raw] => {
// Example: "+w:123" fixed-sized list [123 items]
let size = size_raw.parse::<usize>().map_err(|_| {
ArrowError::OutOfSpec("size is not a valid integer".to_string())
})?;
let child = to_field(schema.child(0))?;
DataType::FixedSizeList(Box::new(child), size)
}
["d", raw] => {
// Decimal
let (precision, scale) = match raw.split(',').collect::<Vec<_>>()[..] {
[precision_raw, scale_raw] => {
// Example: "d:19,10" decimal128 [precision 19, scale 10]
(precision_raw, scale_raw)
}
[precision_raw, scale_raw, width_raw] => {
// Example: "d:19,10,NNN" decimal bitwidth = NNN [precision 19, scale 10]
// Only bitwdth of 128 currently supported
let bit_width = width_raw.parse::<usize>().map_err(|_| {
ArrowError::OutOfSpec(
"Decimal bit width is not a valid integer".to_string(),
)
})?;
if bit_width != 128 {
return Err(ArrowError::OutOfSpec(
"Decimal256 is not supported".to_string(),
));
}
(precision_raw, scale_raw)
}
_ => {
return Err(ArrowError::OutOfSpec(
"Decimal must contain 2 or 3 comma-separated values".to_string(),
));
}
};

DataType::Decimal(
precision.parse::<usize>().map_err(|_| {
ArrowError::OutOfSpec(
"Decimal precision is not a valid integer".to_string(),
)
})?,
scale.parse::<usize>().map_err(|_| {
ArrowError::OutOfSpec(
"Union type id is not a valid integer".to_string(),
"Decimal scale is not a valid integer".to_string(),
)
})?,
)
}
[union_type @ "+us", union_parts] | [union_type @ "+ud", union_parts] => {
// union, sparse
// Example "+us:I,J,..." sparse union with type ids I,J...
// Example: "+ud:I,J,..." dense union with type ids I,J...
let mode = UnionMode::sparse(union_type == "+us");
let type_ids = union_parts
.split(',')
.map(|x| {
x.parse::<i32>().map_err(|_| {
ArrowError::OutOfSpec(
"Union type id is not a valid integer".to_string(),
)
})
})
})
.collect::<Result<Vec<_>>>()?;
let fields = (0..schema.n_children as usize)
.map(|x| to_field(schema.child(x)))
.collect::<Result<Vec<_>>>()?;
DataType::Union(fields, Some(type_ids), mode)
} else {
return Err(ArrowError::OutOfSpec(format!(
"The datatype \"{}\" is still not supported in Rust implementation",
other
)));
.collect::<Result<Vec<_>>>()?;
let fields = (0..schema.n_children as usize)
.map(|x| to_field(schema.child(x)))
.collect::<Result<Vec<_>>>()?;
DataType::Union(fields, Some(type_ids), mode)
}
_ => {
return Err(ArrowError::OutOfSpec(format!(
"The datatype \"{}\" is still not supported in Rust implementation",
other
)));
}
}
}
})
Expand Down Expand Up @@ -511,3 +539,82 @@ unsafe fn metadata_from_bytes(data: *const ::std::os::raw::c_char) -> (Metadata,
let extension = extension_name.map(|name| (name, extension_metadata));
(result, extension)
}

#[cfg(test)]
mod tests {
use crate::datatypes::{DataType, Field, TimeUnit};

macro_rules! test_dt_conv {
($name:ident, $field:expr, $expected:expr,) => {
#[test]
fn $name() {
use crate::ffi::ArrowSchema;

let field = $field;
let schema = ArrowSchema::new(&field);
let dt = unsafe { super::to_data_type(&schema).unwrap() };
let result = super::to_format(&dt);
assert_eq!(result, $expected);
}
};
}

test_dt_conv!(
test_bool,
Field::new("example", DataType::Boolean, false),
"b",
);

// Timezone handling
test_dt_conv!(
test_timestamp_none,
Field::new(
"example",
DataType::Timestamp(TimeUnit::Second, None),
false
),
"tss:",
);

test_dt_conv!(
test_timestamp_something,
Field::new(
"example",
DataType::Timestamp(TimeUnit::Second, Some("Nonsense timezone!!".to_string())),
false
),
"tss:Nonsense timezone!!",
);

test_dt_conv!(
test_timestamp_empty,
Field::new(
"example",
DataType::Timestamp(TimeUnit::Second, Some(" ".to_string())),
false
),
"tss: ",
);

// Other time stamp types
test_dt_conv!(
test_timestamp_nanoseconds,
Field::new(
"example",
DataType::Timestamp(TimeUnit::Nanosecond, None),
false
),
"tsn:",
);

// Other time stamp types
test_dt_conv!(
test_list,
Field::new(
"example",
DataType::List(Box::new(Field::new("example", DataType::Boolean, false))),
false
),
"+l",
);
}