Make the choice to dictionary-encode data results explicit (#7405)
### What
- When requesting a column explicitly, you can now use the new
`ColumnDescriptor::DictionaryEncoded(descriptor)` to indicate that you
want a dictionary-encoded view of that column (see the sketch after this
list).
- Components that are neither part of the chunk nor dictionary-encoded
result in row-slices instead.
- If there are no row-slices, we preserve the old full-chunk-based join
mechanism, yielding large record batches.
- If there are row-slices, you get single rows.
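
A minimal sketch of how a caller opts into this, based on the diff below; the entity path and the `Position3D` component are illustrative assumptions, and the choice itself is carried by the new `JoinEncoding` enum and `ComponentColumnDescriptor::with_join_encoding`:

```rust
// Hedged sketch based on the diff in this commit; the entity path and component
// type are made up for illustration.
use re_chunk_store::{ComponentColumnDescriptor, JoinEncoding};

fn dictionary_encoded_column() -> ComponentColumnDescriptor {
    // Ask for a dictionary-encoded view of `Position3D` at "points": repeated
    // joined-in values are then shared between rows instead of re-emitted per row.
    ComponentColumnDescriptor::new::<re_types::components::Position3D>("points".into())
        .with_join_encoding(JoinEncoding::DictionaryEncode)
}
```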

Future work:
- #7414

### Checklist
* [x] I have read and agree to the [Contributor
Guide](https://github.com/rerun-io/rerun/blob/main/CONTRIBUTING.md) and
the [Code of
Conduct](https://github.com/rerun-io/rerun/blob/main/CODE_OF_CONDUCT.md)
* [x] I've included a screenshot or gif (if applicable)
* [x] I have tested the web demo (if applicable):
* Using examples from latest `main` build:
[rerun.io/viewer](https://rerun.io/viewer/pr/7405?manifest_url=https://app.rerun.io/version/main/examples_manifest.json)
* Using full set of examples from `nightly` build:
[rerun.io/viewer](https://rerun.io/viewer/pr/7405?manifest_url=https://app.rerun.io/version/nightly/examples_manifest.json)
* [x] The PR title and labels are set so as to maximize their
usefulness for the next release's CHANGELOG
* [x] If applicable, add a new check to the [release
checklist](https://github.com/rerun-io/rerun/blob/main/tests/python/release_checklist)!
* [x] I have noted any breaking changes to the log API in
`CHANGELOG.md` and the migration guide

- [PR Build Summary](https://build.rerun.io/pr/7405)
- [Recent benchmark results](https://build.rerun.io/graphs/crates.html)
- [Wasm size tracking](https://build.rerun.io/graphs/sizes.html)

To run all checks from `main`, comment on the PR with `@rerun-bot
full-check`.
jleibs authored Sep 18, 2024
1 parent ee50fab commit 7449e38
Showing 7 changed files with 447 additions and 179 deletions.
18 changes: 7 additions & 11 deletions crates/store/re_chunk/src/util.rs
@@ -82,7 +82,7 @@ pub fn arrays_to_list_array(
// TODO(cmc): A possible improvement would be to pick the smallest key datatype possible based
// on the cardinality of the input arrays.
pub fn arrays_to_dictionary<Idx: Copy + Eq>(
array_datatype: ArrowDatatype,
array_datatype: &ArrowDatatype,
arrays: &[Option<(Idx, &dyn ArrowArray)>],
) -> Option<ArrowDictionaryArray<i32>> {
// Dedupe the input arrays based on the given primary key.
@@ -115,33 +115,29 @@ };
};

// Concatenate the underlying data as usual, except only the _unique_ values!
// We still need the underlying data to be a list-array, so the dictionary's keys can index
// into this list-array.
let data = if arrays_dense_deduped.is_empty() {
arrow2::array::new_empty_array(array_datatype.clone())
} else {
arrow2::compute::concatenate::concatenate(&arrays_dense_deduped)
let values = arrow2::compute::concatenate::concatenate(&arrays_dense_deduped)
.map_err(|err| {
re_log::warn_once!("failed to concatenate arrays: {err}");
err
})
.ok()?
};

// We still need the underlying data to be a list-array, so the dictionary's keys can index
// into this list-array.
let data = {
let datatype = ArrowListArray::<i32>::default_datatype(array_datatype);
.ok()?;

#[allow(clippy::unwrap_used)] // yes, these are indeed lengths
let offsets =
ArrowOffsets::try_from_lengths(arrays_dense_deduped.iter().map(|array| array.len()))
.unwrap();

ArrowListArray::<i32>::new(datatype, offsets.into(), data, None)
ArrowListArray::<i32>::new(array_datatype.clone(), offsets.into(), values, None).to_boxed()
};

let datatype = ArrowDatatype::Dictionary(
arrow2::datatypes::IntegerType::Int32,
std::sync::Arc::new(data.data_type().clone()),
std::sync::Arc::new(array_datatype.clone()),
true, // is_sorted
);

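For intuition about what `arrays_to_dictionary` builds above, here is a simplified, self-contained sketch of the same idea over plain `Vec`s instead of arrow arrays (illustration only, not the crate's code): rows are deduplicated by their primary key, and each row keeps an `i32` key into the unique values, just as the dictionary's keys index into the concatenated list-array.

```rust
use std::collections::HashMap;

/// Simplified illustration (plain `Vec`s instead of arrow arrays): deduplicate
/// rows by a primary key and emit a per-row i32 key into the unique values --
/// the same shape `arrays_to_dictionary` builds as a dictionary over a list-array.
fn dictionary_encode<Idx, T>(rows: &[Option<(Idx, T)>]) -> (Vec<Option<i32>>, Vec<T>)
where
    Idx: Copy + Eq + std::hash::Hash,
    T: Clone,
{
    let mut unique: Vec<T> = Vec::new();
    let mut key_for: HashMap<Idx, i32> = HashMap::new();

    let keys = rows
        .iter()
        .map(|row| {
            row.as_ref().map(|(idx, value)| {
                *key_for.entry(*idx).or_insert_with(|| {
                    unique.push(value.clone());
                    (unique.len() - 1) as i32
                })
            })
        })
        .collect();

    (keys, unique)
}
```
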
171 changes: 127 additions & 44 deletions crates/store/re_chunk_store/src/dataframe.rs
@@ -22,6 +22,47 @@ use crate::RowId;

// --- Descriptors ---

/// When selecting secondary component columns, specify how the joined data should be encoded.
///
/// Because range-queries often involve repeating the same joined-in data multiple times,
/// the strategy we choose for joining can have a significant impact on the size and memory
/// overhead of the `RecordBatch`.
#[derive(Debug, Default, Clone, PartialEq, Eq, Hash)]
pub enum JoinEncoding {
/// Slice the `RecordBatch` to minimal overlapping sub-ranges.
///
/// This is the default, and should always be used for the POV component, which defines
/// the optimal size for the `RecordBatch`.
///
/// This minimizes the need for allocation, but at the cost of `RecordBatch`es that are
/// almost always smaller than the optimal size. In the common worst case, this will result
/// in single-row `RecordBatch`es.
#[default]
OverlappingSlice,

/// Dictionary-encode the joined column.
///
/// Using dictionary-encoding allows any repeated data to be shared between rows,
/// but comes with the cost of an extra dictionary-lookup indirection.
///
/// Note that this changes the physical type of the returned column.
///
/// Using this encoding for complex types is incompatible with some arrow libraries.
DictionaryEncode,
//
// TODO(jleibs):
// RepeatCopy,
//
// Repeat the joined column by physically copying the data.
//
// This will always allocate a new column in the `RecordBatch`, matching the size of the
// POV component.
//
// This is the most expensive option, but can make working with the data more efficient,
// especially when the copied column is small.
//
}
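
As a rough, back-of-the-envelope illustration of the size trade-off this enum is about (the numbers are assumptions for illustration, not measurements from this PR):

```rust
/// Assumed numbers: a range query that joins the same 4 KiB component value
/// into 10_000 result rows.
fn main() {
    let rows = 10_000_usize;
    let value_bytes = 4_096_usize;

    // Physically repeating the value once per row (the commented-out `RepeatCopy`
    // idea above) would cost roughly:
    let repeated = rows * value_bytes; // ~41 MB

    // Dictionary-encoding stores the value once, plus an i32 key per row:
    let dictionary = value_bytes + rows * std::mem::size_of::<i32>(); // ~44 kB

    println!("repeat-copy: {repeated} B, dictionary-encoded: {dictionary} B");
}
```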

// TODO(#6889): At some point all these descriptors need to be interned and have handles or
// something. And of course they need to be codegen'd. But we'll get there once we're back to
// natively tagged components.
@@ -43,17 +84,17 @@ impl ColumnDescriptor {
#[inline]
pub fn entity_path(&self) -> Option<&EntityPath> {
match self {
Self::Component(descr) => Some(&descr.entity_path),
Self::Control(_) | Self::Time(_) => None,
Self::Component(descr) => Some(&descr.entity_path),
}
}

#[inline]
pub fn datatype(&self) -> &ArrowDatatype {
pub fn datatype(&self) -> ArrowDatatype {
match self {
Self::Control(descr) => &descr.datatype,
Self::Component(descr) => &descr.datatype,
Self::Time(descr) => &descr.datatype,
Self::Control(descr) => descr.datatype.clone(),
Self::Time(descr) => descr.datatype.clone(),
Self::Component(descr) => descr.returned_datatype(),
}
}

@@ -193,8 +234,15 @@ pub struct ComponentColumnDescriptor {
/// Example: `rerun.components.Position3D`.
pub component_name: ComponentName,

/// The Arrow datatype of the column.
pub datatype: ArrowDatatype,
/// The Arrow datatype of the stored column.
///
/// This is the log-time datatype corresponding to how this data is encoded
/// in a chunk. Currently this will always be an [`ArrowListArray`], but as
/// we introduce mono-type optimization, this might be a native type instead.
pub store_datatype: ArrowDatatype,

/// How the data will be joined into the resulting `RecordBatch`.
pub join_encoding: JoinEncoding,

/// Whether this column represents static data.
pub is_static: bool,
@@ -215,7 +263,8 @@ impl Ord for ComponentColumnDescriptor {
archetype_name,
archetype_field_name,
component_name,
datatype: _,
join_encoding: _,
store_datatype: _,
is_static: _,
} = self;

@@ -234,7 +283,8 @@ impl std::fmt::Display for ComponentColumnDescriptor {
archetype_name,
archetype_field_name,
component_name,
datatype: _,
join_encoding: _,
store_datatype: _,
is_static,
} = self;

@@ -267,59 +317,84 @@ impl std::fmt::Display for ComponentColumnDescriptor {
impl ComponentColumnDescriptor {
#[inline]
pub fn new<C: re_types_core::Component>(entity_path: EntityPath) -> Self {
let join_encoding = JoinEncoding::default();

// NOTE: The data is always at least a list, whether it's latest-at or range.
// It might be wrapped further in e.g. a dict, but at the very least
// it's a list.
let store_datatype = ArrowListArray::<i32>::default_datatype(C::arrow_datatype());

Self {
entity_path,
archetype_name: None,
archetype_field_name: None,
component_name: C::name(),
// NOTE: The data is always at least a list, whether it's latest-at or range.
// It might be wrapped further in e.g. a dict, but at the very least
// it's a list.
// TODO(#7365): user-specified datatypes have got to go.
datatype: ArrowListArray::<i32>::default_datatype(C::arrow_datatype()),
join_encoding,
store_datatype,
is_static: false,
}
}

#[inline]
pub fn to_arrow_field(&self) -> ArrowField {
fn metadata(&self) -> arrow2::datatypes::Metadata {
let Self {
entity_path,
archetype_name,
archetype_field_name,
component_name,
datatype,
join_encoding: _,
store_datatype: _,
is_static,
} = self;

[
(*is_static).then_some(("sorbet.is_static".to_owned(), "yes".to_owned())),
Some(("sorbet.path".to_owned(), entity_path.to_string())),
Some((
"sorbet.semantic_type".to_owned(),
component_name.short_name().to_owned(),
)),
archetype_name.map(|name| {
(
"sorbet.semantic_family".to_owned(),
name.short_name().to_owned(),
)
}),
archetype_field_name
.as_ref()
.map(|name| ("sorbet.logical_type".to_owned(), name.to_owned())),
]
.into_iter()
.flatten()
.collect()
}
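
As an illustration only (hypothetical column, key names taken from the code above): for a static `Position3D` column logged at entity path `points`, the metadata would contain roughly the following key/value pairs.

```rust
// Illustration of the `sorbet.*` metadata for a hypothetical static `Position3D`
// column at entity path "points"; archetype keys are omitted since they are unset here.
use std::collections::BTreeMap;

fn example_metadata() -> BTreeMap<String, String> {
    [
        ("sorbet.is_static".to_owned(), "yes".to_owned()),
        ("sorbet.path".to_owned(), "points".to_owned()),
        ("sorbet.semantic_type".to_owned(), "Position3D".to_owned()),
    ]
    .into_iter()
    .collect()
}
```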

#[inline]
pub fn returned_datatype(&self) -> ArrowDatatype {
match self.join_encoding {
JoinEncoding::OverlappingSlice => self.store_datatype.clone(),
JoinEncoding::DictionaryEncode => ArrowDatatype::Dictionary(
arrow2::datatypes::IntegerType::Int32,
std::sync::Arc::new(self.store_datatype.clone()),
true,
),
}
}

#[inline]
pub fn to_arrow_field(&self) -> ArrowField {
ArrowField::new(
component_name.short_name().to_owned(),
datatype.clone(),
false, /* nullable */
self.component_name.short_name().to_owned(),
self.returned_datatype(),
true, /* nullable */
)
// TODO(#6889): This needs some proper sorbetization -- I just threw these names randomly.
.with_metadata(
[
(*is_static).then_some(("sorbet.is_static".to_owned(), "yes".to_owned())),
Some(("sorbet.path".to_owned(), entity_path.to_string())),
Some((
"sorbet.semantic_type".to_owned(),
component_name.short_name().to_owned(),
)),
archetype_name.map(|name| {
(
"sorbet.semantic_family".to_owned(),
name.short_name().to_owned(),
)
}),
archetype_field_name
.as_ref()
.map(|name| ("sorbet.logical_type".to_owned(), name.to_owned())),
]
.into_iter()
.flatten()
.collect(),
)
.with_metadata(self.metadata())
}

#[inline]
pub fn with_join_encoding(mut self, join_encoding: JoinEncoding) -> Self {
self.join_encoding = join_encoding;
self
}
}

@@ -491,7 +566,10 @@ impl ChunkStore {
archetype_name: None,
archetype_field_name: None,
component_name: *component_name,
datatype: ArrowListArray::<i32>::default_datatype(datatype.clone()),
store_datatype: ArrowListArray::<i32>::default_datatype(
datatype.clone(),
),
join_encoding: JoinEncoding::default(),
is_static: true,
})
})
@@ -520,7 +598,10 @@ impl ChunkStore {
// NOTE: The data is always at least a list, whether it's latest-at or range.
// It might be wrapped further in e.g. a dict, but at the very least
// it's a list.
datatype: ArrowListArray::<i32>::default_datatype(datatype.clone()),
store_datatype: ArrowListArray::<i32>::default_datatype(
datatype.clone(),
),
join_encoding: JoinEncoding::default(),
// NOTE: This will make it so shadowed temporal data automatically gets
// discarded from the schema.
is_static: self
@@ -570,6 +651,8 @@ impl ChunkStore {
// Then, discard any column descriptor which cannot possibly have data for the given query.
//
// TODO(cmc): Opportunities for parallelization, if it proves to be a net positive in practice.
// TODO(jleibs): This filtering actually seems incorrect. This operation should be based solely
// on the timeline,
let mut filtered_out = HashSet::default();
for column_descr in &schema {
let ColumnDescriptor::Component(descr) = column_descr else {
4 changes: 2 additions & 2 deletions crates/store/re_chunk_store/src/lib.rs
@@ -24,8 +24,8 @@ mod subscribers;
mod writes;

pub use self::dataframe::{
ColumnDescriptor, ComponentColumnDescriptor, ControlColumnDescriptor, LatestAtQueryExpression,
QueryExpression, RangeQueryExpression, TimeColumnDescriptor,
ColumnDescriptor, ComponentColumnDescriptor, ControlColumnDescriptor, JoinEncoding,
LatestAtQueryExpression, QueryExpression, RangeQueryExpression, TimeColumnDescriptor,
};
pub use self::events::{ChunkStoreDiff, ChunkStoreDiffKind, ChunkStoreEvent};
pub use self::gc::{GarbageCollectionOptions, GarbageCollectionTarget};
2 changes: 0 additions & 2 deletions crates/store/re_dataframe/src/engine.rs
@@ -111,7 +111,6 @@ impl QueryEngine<'_> {
/// Creating a handle is very cheap as it doesn't perform any kind of querying.
///
/// If `columns` is specified, the schema of the result will strictly follow this specification.
/// [`ComponentColumnDescriptor::datatype`] and [`ComponentColumnDescriptor::is_static`] are ignored.
///
/// Any provided [`ColumnDescriptor`]s that don't match a column in the result will still be included, but the
/// data will be null for the entire column.
@@ -144,7 +143,6 @@
/// Creating a handle is very cheap as it doesn't perform any kind of querying.
///
/// If `columns` is specified, the schema of the result will strictly follow this specification.
/// [`ComponentColumnDescriptor::datatype`] and [`ComponentColumnDescriptor::is_static`] are ignored.
///
/// Any provided [`ColumnDescriptor`]s that don't match a column in the result will still be included, but the
/// data will be null for the entire column.