Skip to content

Commit

Permalink
Docstrings
Browse files Browse the repository at this point in the history
  • Loading branch information
Jay Chia committed Dec 20, 2024
1 parent 511d996 commit 28f2083
Show file tree
Hide file tree
Showing 4 changed files with 42 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,17 @@ use common_error::DaftResult;
use super::{split_parquet_decision, split_parquet_file};
use crate::ScanTaskRef;

/// Retrieves Parquet metadata for the incoming "Decisions".
///
/// # Returns
///
/// Returns [`ParquetSplitScanTaskGenerator`] instances which are themselves iterators that can yield:
/// - A single [`ScanTaskRef`] if no split was needed
/// - Multiple [`ScanTaskRef`]s if the task was split
///
/// # Implementation Details
///
/// Retrieval of Parquet metadata is performed in batches using a windowed approach for efficiency.
pub(super) struct RetrieveParquetMetadataIterator<'cfg> {
decider: split_parquet_decision::DecideSplitIterator<'cfg>,
_cfg: &'cfg DaftExecutionConfig,
Expand Down
20 changes: 20 additions & 0 deletions src/daft-scan/src/scan_task_iters/split_parquet/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,26 @@ mod fetch_parquet_metadata;
mod split_parquet_decision;
mod split_parquet_file;

/// Splits input scan tasks into smaller scan tasks based on Parquet file metadata.
///
/// This struct provides functionality to split scan tasks by:
/// 1. Deciding whether or not to split each input ScanTask
/// 2. Fetching the Parquet metadata of the ScanTask (if deemed necessary to split)
/// 3. Performing the splitting of the ScanTask into smaller ScanTasks
///
/// Note that this may be expensive if the incoming stream has many large ScanTasks, incurring a
/// higher cost at planning-time.
///
/// # Examples
///
/// ```
/// # use daft_scan::scan_task_iters::BoxScanTaskIter;
/// # use common_daft_config::DaftExecutionConfig;
/// # let input_tasks: BoxScanTaskIter = unimplemented!();
/// # let config = DaftExecutionConfig::default();
/// let splitter = SplitParquetScanTasks::new(input_tasks, &config);
/// let split_tasks = splitter.into_iter();
/// ```
pub struct SplitParquetScanTasks<'cfg> {
retriever: fetch_parquet_metadata::RetrieveParquetMetadataIterator<'cfg>,
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,11 @@ use common_daft_config::DaftExecutionConfig;

use crate::scan_task_iters::BoxScanTaskIter;

/// An iterator that determines whether incoming ScanTasks should be split by Parquet rowgroups.
///
/// # Returns
///
/// Returns an iterator of [`Decision`] objects indicating whether and how to split each task.
pub(super) struct DecideSplitIterator<'cfg> {
inputs: BoxScanTaskIter<'cfg>,
_cfg: &'cfg DaftExecutionConfig,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,12 @@ use common_error::DaftResult;

use crate::ScanTaskRef;

/// Splits its internal ScanTask into smaller ScanTasks based on certain criteria, including
/// the size of the Parquet file and available rowgroups.
///
/// # Implementation Details
///
/// This type implements [`Iterator`] to produce [`ScanTaskRef`]s representing the split tasks.
pub(super) struct ParquetFileSplitter {}

impl Iterator for ParquetFileSplitter {
Expand Down

0 comments on commit 28f2083

Please sign in to comment.