Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Port generic_placeholder_for_datatype to arrow-rs #8628

Merged
merged 5 commits into from
Jan 9, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions Cargo.lock
Original file line number Diff line number Diff line change
Expand Up @@ -6533,13 +6533,15 @@ dependencies = [
"bytemuck",
"criterion",
"document-features",
"half",
"itertools 0.13.0",
"nohash-hasher",
"once_cell",
"re_arrow2",
"re_byte_size",
"re_case",
"re_error",
"re_log",
"re_string_interner",
"re_tracing",
"re_tuid",
Expand Down
4 changes: 3 additions & 1 deletion crates/store/re_types_core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,10 @@ serde = ["dep:serde", "re_string_interner/serde"]

[dependencies]
# Rerun
re_byte_size.workspace = true
re_case.workspace = true
re_error.workspace = true
re_byte_size.workspace = true
re_log.workspace = true
re_string_interner.workspace = true
re_tracing.workspace = true
re_tuid.workspace = true
Expand All @@ -53,6 +54,7 @@ arrow2 = { workspace = true, features = [
backtrace.workspace = true
bytemuck.workspace = true
document-features.workspace = true
half.workspace = true
itertools.workspace = true
nohash-hasher.workspace = true
once_cell.workspace = true
Expand Down
23 changes: 15 additions & 8 deletions crates/store/re_types_core/src/arrow_string.rs
Original file line number Diff line number Diff line change
@@ -1,15 +1,22 @@
use arrow2::buffer::Buffer;
use arrow::buffer::Buffer;

/// Convenience-wrapper around an arrow [`Buffer`] that is known to contain a
/// UTF-8 encoded string.
///
/// The arrow2 [`Buffer`] object is internally reference-counted and can be
/// The arrow [`Buffer`] object is internally reference-counted and can be
/// easily converted back to a `&str` referencing the underlying storage.
/// This avoids some of the lifetime complexities that would otherwise
/// arise from returning a `&str` directly, but is significantly more
/// performant than doing the full allocation necessary to return a `String`.
#[derive(Clone, Debug, Default)]
pub struct ArrowString(Buffer<u8>);
#[derive(Clone, Debug)]
pub struct ArrowString(Buffer);

impl Default for ArrowString {
#[inline]
fn default() -> Self {
Self(Buffer::from_vec::<u8>(vec![]))
}
}

impl re_byte_size::SizeBytes for ArrowString {
#[inline]
Expand Down Expand Up @@ -57,26 +64,26 @@ impl ArrowString {

#[inline]
pub fn into_arrow_buffer(self) -> arrow::buffer::Buffer {
self.0.into()
self.0
}

#[inline]
pub fn into_arrow2_buffer(self) -> arrow2::buffer::Buffer<u8> {
self.0
self.0.into()
}
}

impl From<arrow::buffer::Buffer> for ArrowString {
#[inline]
fn from(buf: arrow::buffer::Buffer) -> Self {
Self(buf.into())
Self(buf)
}
}

impl From<arrow2::buffer::Buffer<u8>> for ArrowString {
#[inline]
fn from(buf: arrow2::buffer::Buffer<u8>) -> Self {
Self(buf)
Self(buf.into())
}
}

Expand Down
194 changes: 102 additions & 92 deletions crates/store/re_types_core/src/reflection.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
//! Run-time reflection for reading meta-data about components and archetypes.

use arrow::array::ArrayRef;
use std::sync::Arc;

use arrow::array::{Array, ArrayRef};

use crate::{ArchetypeName, ComponentName};

Expand Down Expand Up @@ -67,145 +69,153 @@ impl Reflection {
pub fn generic_placeholder_for_datatype(
datatype: &arrow::datatypes::DataType,
) -> arrow::array::ArrayRef {
generic_placeholder_for_datatype_arrow2(&datatype.clone().into()).into()
}

fn generic_placeholder_for_datatype_arrow2(
datatype: &arrow2::datatypes::DataType,
) -> Box<dyn arrow2::array::Array> {
use arrow2::{
array,
use arrow::{
array::{self, types},
datatypes::{DataType, IntervalUnit},
types,
};

match datatype {
DataType::Null => Box::new(array::NullArray::new(datatype.clone(), 1)),
DataType::Boolean => Box::new(array::BooleanArray::from_slice([false])),
DataType::Int8 => Box::new(array::Int8Array::from_slice([0])),
DataType::Int16 => Box::new(array::Int16Array::from_slice([0])),
DataType::Null => Arc::new(array::NullArray::new(1)),
DataType::Boolean => Arc::new(array::BooleanArray::from_iter([Some(false)])),
DataType::Int8 => Arc::new(array::Int8Array::from_iter([0])),
DataType::Int16 => Arc::new(array::Int16Array::from_iter([0])),

DataType::Int32
| DataType::Date32
| DataType::Time32(_)
| DataType::Interval(IntervalUnit::YearMonth) => {
// TODO(andreas): Do we have to further distinguish these types? They do share the physical type.
Box::new(array::Int32Array::from_slice([0]))
Arc::new(array::Int32Array::from_iter([0]))
}
DataType::Int64
| DataType::Date64
| DataType::Timestamp(_, _)
| DataType::Time64(_)
| DataType::Duration(_) => {
// TODO(andreas): Do we have to further distinguish these types? They do share the physical type.
Box::new(array::Int64Array::from_slice([0]))
Arc::new(array::Int64Array::from_iter([0]))
}

DataType::UInt8 => Box::new(array::UInt8Array::from_slice([0])),
DataType::UInt16 => Box::new(array::UInt16Array::from_slice([0])),
DataType::UInt32 => Box::new(array::UInt32Array::from_slice([0])),
DataType::UInt64 => Box::new(array::UInt64Array::from_slice([0])),
DataType::Float16 => Box::new(array::Float16Array::from_slice([types::f16::from_f32(0.0)])),
DataType::Float32 => Box::new(array::Float32Array::from_slice([0.0])),
DataType::Float64 => Box::new(array::Float64Array::from_slice([0.0])),
DataType::UInt8 => Arc::new(array::UInt8Array::from_iter([0])),
DataType::UInt16 => Arc::new(array::UInt16Array::from_iter([0])),
DataType::UInt32 => Arc::new(array::UInt32Array::from_iter([0])),
DataType::UInt64 => Arc::new(array::UInt64Array::from_iter([0])),
DataType::Float16 => Arc::new(array::Float16Array::from_iter([half::f16::ZERO])),
DataType::Float32 => Arc::new(array::Float32Array::from_iter([0.0])),
DataType::Float64 => Arc::new(array::Float64Array::from_iter([0.0])),

DataType::Interval(IntervalUnit::DayTime) => {
Box::new(array::DaysMsArray::from_slice([types::days_ms::new(0, 0)]))
Arc::new(array::IntervalDayTimeArray::from(vec![
types::IntervalDayTime::new(0, 0),
]))
}
DataType::Interval(IntervalUnit::MonthDayNano) => {
Box::new(array::MonthsDaysNsArray::from_slice([
types::months_days_ns::new(0, 0, 0),
Arc::new(array::IntervalMonthDayNanoArray::from(vec![
types::IntervalMonthDayNano::new(0, 0, 0),
]))
}

DataType::Binary => Box::new(array::BinaryArray::<i32>::from_slice([[]])),
DataType::FixedSizeBinary(size) => Box::new(array::FixedSizeBinaryArray::from_iter(
std::iter::once(Some(vec![0; *size])),
*size,
)),
DataType::LargeBinary => Box::new(array::BinaryArray::<i64>::from_slice([[]])),
DataType::Utf8 => Box::new(array::Utf8Array::<i32>::from_slice([""])),
DataType::LargeUtf8 => Box::new(array::Utf8Array::<i64>::from_slice([""])),
DataType::Binary => Arc::new(array::GenericBinaryArray::<i32>::from_vec(vec![&[]])),
DataType::LargeBinary => Arc::new(array::GenericBinaryArray::<i64>::from_vec(vec![&[]])),

DataType::Utf8 => Arc::new(array::StringArray::from(vec![""])),
DataType::LargeUtf8 => Arc::new(array::LargeStringArray::from(vec![""])),

DataType::List(field) => {
let inner = generic_placeholder_for_datatype_arrow2(field.data_type());
let offsets = arrow2::offset::Offsets::try_from_lengths(std::iter::once(inner.len()))
.expect("failed to create offsets buffer");
Box::new(array::ListArray::<i32>::new(
datatype.clone(),
offsets.into(),
let inner = generic_placeholder_for_datatype(field.data_type());
let offsets = arrow::buffer::OffsetBuffer::from_lengths(std::iter::once(inner.len()));
Arc::new(array::GenericListArray::<i32>::new(
field.clone(),
offsets,
inner,
None,
))
}

// TODO(andreas): Unsupported type.
// What we actually want here is an array containing a single array of size `size`.
// But it's a bit tricky to build, because it doesn't look like we can concatenate `size` many arrays.
DataType::FixedSizeList(_field, size) => {
Box::new(array::FixedSizeListArray::new_null(datatype.clone(), *size))
DataType::FixedSizeList(field, size) => {
let size = *size as usize;
let value_data: ArrayRef = {
match field.data_type() {
DataType::Boolean => Arc::new(array::BooleanArray::from(vec![false; size])),

DataType::Int8 => Arc::new(array::Int8Array::from(vec![0; size])),
DataType::Int16 => Arc::new(array::Int16Array::from(vec![0; size])),
DataType::Int32 => Arc::new(array::Int32Array::from(vec![0; size])),
DataType::Int64 => Arc::new(array::Int64Array::from(vec![0; size])),

DataType::UInt8 => Arc::new(array::UInt8Array::from(vec![0; size])),
DataType::UInt16 => Arc::new(array::UInt16Array::from(vec![0; size])),
DataType::UInt32 => Arc::new(array::UInt32Array::from(vec![0; size])),
DataType::UInt64 => Arc::new(array::UInt64Array::from(vec![0; size])),

DataType::Float16 => {
Arc::new(array::Float16Array::from(vec![half::f16::ZERO; size]))
}
DataType::Float32 => Arc::new(array::Float32Array::from(vec![0.0; size])),
DataType::Float64 => Arc::new(array::Float64Array::from(vec![0.0; size])),

_ => {
// TODO(emilk)
re_log::debug_once!(
"Unimplemented: placeholder value for FixedSizeListArray of {:?}",
field.data_type()
);
return array::new_empty_array(datatype);
}
}
};
if let Ok(list_data) = array::ArrayData::builder(datatype.clone())
.len(1)
.add_child_data(value_data.into_data())
.build()
{
Arc::new(array::FixedSizeListArray::from(list_data))
} else {
re_log::warn_once!("Bug in FixedSizeListArray of {:?}", field.data_type());
array::new_empty_array(datatype)
}
}

DataType::LargeList(field) => {
let inner = generic_placeholder_for_datatype_arrow2(field.data_type());
let offsets = arrow2::offset::Offsets::try_from_lengths(std::iter::once(inner.len()))
.expect("failed to create offsets buffer");
Box::new(array::ListArray::<i64>::new(
datatype.clone(),
offsets.into(),
let inner = generic_placeholder_for_datatype(field.data_type());
let offsets = arrow::buffer::OffsetBuffer::from_lengths(std::iter::once(inner.len()));
Arc::new(array::GenericListArray::<i64>::new(
field.clone(),
offsets,
inner,
None,
))
}
DataType::Struct(fields) => {
let inners = fields
.iter()
.map(|field| generic_placeholder_for_datatype_arrow2(field.data_type()));
Box::new(array::StructArray::new(
datatype.clone(),
.map(|field| generic_placeholder_for_datatype(field.data_type()));
Arc::new(array::StructArray::new(
fields.clone(),
inners.collect(),
None,
))
}
DataType::Union(fields, _types, _union_mode) => {
if let Some(first_field) = fields.first() {
let first_field = generic_placeholder_for_datatype_arrow2(first_field.data_type());
let first_field_len = first_field.len(); // Should be 1, but let's play this safe!
let other_fields = fields
.iter()
.skip(1)
.map(|field| array::new_empty_array(field.data_type().clone()));
let fields = std::iter::once(first_field).chain(other_fields);

Box::new(array::UnionArray::new(
datatype.clone(),
std::iter::once(0).collect(), // Single element of type 0.
fields.collect(),
Some(std::iter::once(first_field_len as i32).collect()),
))
} else {
// Pathological case: a union with no fields can't have a placeholder with a single element?
array::new_empty_array(datatype.clone())
}
}

// TODO(andreas): Unsupported types. Fairly complex to build and we don't use it so far.
DataType::Map(_field, _) => Box::new(array::MapArray::new_empty(datatype.clone())),

// TODO(andreas): Unsupported type. Has only `try_new` meaning we'd have to handle all error cases.
// But also we don't use this today anyways.
DataType::Dictionary(_integer_type, _arc, _sorted) => {
array::new_empty_array(datatype.clone()) // Rust type varies per integer type, use utility instead.
DataType::Decimal128(_, _) => Arc::new(array::Decimal128Array::from_iter([0])),

DataType::Decimal256(_, _) => Arc::new(array::Decimal256Array::from_iter([
arrow::datatypes::i256::ZERO,
])),

DataType::FixedSizeBinary { .. }
| DataType::Dictionary { .. }
| DataType::Union { .. }
| DataType::Map { .. }
| DataType::BinaryView
| DataType::Utf8View
| DataType::ListView { .. }
| DataType::LargeListView { .. }
| DataType::RunEndEncoded { .. } => {
// TODO(emilk)
re_log::debug_once!("Unimplemented: placeholder value for: {datatype:?}");
array::new_empty_array(datatype) // TODO(emilk)
}

DataType::Decimal(_, _) => Box::new(array::Int128Array::from_slice([0])),

DataType::Decimal256(_, _) => {
Box::new(array::Int256Array::from_slice([types::i256::from_words(
0, 0,
)]))
}

DataType::Extension(_, datatype, _) => generic_placeholder_for_datatype_arrow2(datatype),
}
}

Expand Down
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Original file line number Diff line number Diff line change
Expand Up @@ -289,8 +289,9 @@ fn check_for_unused_snapshots(test_cases: &[TestCase], snapshot_options: &Snapsh
let file_name = path.file_name().unwrap().to_string_lossy().to_string();

if file_name.ends_with(".png")
&& !file_name.ends_with(".new.png")
&& !file_name.ends_with(".diff.png")
&& !file_name.ends_with(".new.png")
&& !file_name.ends_with(".old.png")
&& !ok_file_names.contains(file_name.strip_suffix(".png").unwrap())
{
panic!(
Expand Down
Loading