From 5dca0be0b2194ae2443361953103ae8e78bb5c5b Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 1 Aug 2023 07:50:41 -0400 Subject: [PATCH 1/2] Minor: improve docs and add example for lexicographical_partition_ranges --- arrow-ord/src/partition.rs | 78 ++++++++++++++++++++++++++++++++++---- 1 file changed, 71 insertions(+), 7 deletions(-) diff --git a/arrow-ord/src/partition.rs b/arrow-ord/src/partition.rs index 26a030beb35e..9c35eedc0904 100644 --- a/arrow-ord/src/partition.rs +++ b/arrow-ord/src/partition.rs @@ -22,15 +22,79 @@ use arrow_schema::ArrowError; use std::cmp::Ordering; use std::ops::Range; -/// Given a list of already sorted columns, find partition ranges that would partition -/// lexicographically equal values across columns. +/// Given a list of already sorted columns, returns [`Range`]es that +/// partition the input such that each partition has equal values +/// across sort columns. /// -/// Here LexicographicalComparator is used in conjunction with binary -/// search so the columns *MUST* be pre-sorted already. +/// Note that the columns *MUST* be pre-sorted otherwise the result is +/// undefined. /// -/// The returned vec would be of size k where k is cardinality of the sorted values; Consecutive -/// values will be connected: (a, b) and (b, c), where start = 0 and end = n for the first and last -/// range. +/// Returns an iterator with `k` items where `k` is cardinality of the +/// sort values: Consecutive values will be connected: `(a, b)` and `(b, +/// c)`, where `start = 0` and `end = n` for the first and last range. +/// +/// # Example: +/// +/// For example, given columns `x`, `y` and `z`, calling +/// `lexicographical_partition_ranges(values, (x, y))` will divide the +/// rows into ranges where the values of `(x, y)` are equal: +/// +/// ```text +/// ┌ ─ ┬───┬ ─ ─┌───┐─ ─ ┬───┬ ─ ─ ┐ +/// │ 1 │ │ 1 │ │ A │ Range: 0..1 (x=1, y=1) +/// ├ ─ ┼───┼ ─ ─├───┤─ ─ ┼───┼ ─ ─ ┤ +/// │ 1 │ │ 2 │ │ B │ +/// │ ├───┤ ├───┤ ├───┤ │ +/// │ 1 │ │ 2 │ │ C │ Range: 1..4 (x=1, y=2) +/// │ ├───┤ ├───┤ ├───┤ │ +/// │ 1 │ │ 2 │ │ D │ +/// ├ ─ ┼───┼ ─ ─├───┤─ ─ ┼───┼ ─ ─ ┤ +/// │ 2 │ │ 1 │ │ E │ Range: 4..5 (x=2, y=1) +/// ├ ─ ┼───┼ ─ ─├───┤─ ─ ┼───┼ ─ ─ ┤ +/// │ 3 │ │ 1 │ │ F │ Range: 5..6 (x=3, y=1) +/// └ ─ ┴───┴ ─ ─└───┘─ ─ ┴───┴ ─ ─ ┘ +/// +/// x y z lexicographical_partition_ranges +/// by (x,y) +/// ``` +/// +/// # Example Code +/// +/// ``` +/// # use std::{sync::Arc, ops::Range}; +/// # use arrow_array::{RecordBatch, Int64Array, StringArray, ArrayRef}; +/// # use arrow_ord::sort::{SortColumn, SortOptions}; +/// # use arrow_ord::partition::lexicographical_partition_ranges; +/// let batch = RecordBatch::try_from_iter(vec![ +/// ("x", Arc::new(Int64Array::from(vec![1, 1, 1, 1, 2, 3])) as ArrayRef), +/// ("y", Arc::new(Int64Array::from(vec![1, 2, 2, 2, 1, 1])) as ArrayRef), +/// ("z", Arc::new(StringArray::from(vec!["A", "B", "C", "D", "E", "F"])) as ArrayRef), +/// ]).unwrap(); +/// +/// // Lexographically partition on (x, y) +/// let sort_columns = vec![ +/// SortColumn { +/// values: batch.column(0).clone(), +/// options: Some(SortOptions::default()), +/// }, +/// SortColumn { +/// values: batch.column(1).clone(), +/// options: Some(SortOptions::default()), +/// }, +/// ]; +/// let ranges:Vec> = lexicographical_partition_ranges(&sort_columns) +/// .unwrap() +/// .collect(); +/// +/// let expected = vec![ +/// (0..1), +/// (1..4), +/// (4..5), +/// (5..6), +/// ]; +/// +/// assert_eq!(ranges, expected); +/// ``` pub fn lexicographical_partition_ranges( columns: &[SortColumn], ) -> Result> + '_, ArrowError> { From d28efcb410aef72fe7d1d70ed25f728f5aa9a7da Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 1 Aug 2023 07:58:11 -0400 Subject: [PATCH 2/2] improve wording --- arrow-ord/src/partition.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arrow-ord/src/partition.rs b/arrow-ord/src/partition.rs index 9c35eedc0904..4411a0f0ab31 100644 --- a/arrow-ord/src/partition.rs +++ b/arrow-ord/src/partition.rs @@ -26,8 +26,8 @@ use std::ops::Range; /// partition the input such that each partition has equal values /// across sort columns. /// -/// Note that the columns *MUST* be pre-sorted otherwise the result is -/// undefined. +/// Returns an error if no columns are specified or all columns do not +/// have the same number of rows. /// /// Returns an iterator with `k` items where `k` is cardinality of the /// sort values: Consecutive values will be connected: `(a, b)` and `(b,