From ea7d4cde3e799376bc6ae7c5c25a2727fc4e72b4 Mon Sep 17 00:00:00 2001 From: Daniel Falbel Date: Wed, 31 Jul 2024 19:53:04 -0300 Subject: [PATCH] Data Explorer: Support for getting multiple profiles per column (#456) * Adapt to vectorization of profiles per column. * Mark Histograms and Frequency tables as unsupported --- .../amalthea/src/comm/data_explorer_comm.rs | 74 ++++++++--- .../ark/src/data_explorer/r_data_explorer.rs | 115 ++++++++++-------- crates/ark/tests/data_explorer.rs | 16 ++- 3 files changed, 130 insertions(+), 75 deletions(-) diff --git a/crates/amalthea/src/comm/data_explorer_comm.rs b/crates/amalthea/src/comm/data_explorer_comm.rs index be945481b..5f1f0e43b 100644 --- a/crates/amalthea/src/comm/data_explorer_comm.rs +++ b/crates/amalthea/src/comm/data_explorer_comm.rs @@ -260,8 +260,18 @@ pub struct ColumnProfileRequest { /// The ordinal column index to profile pub column_index: i64, - /// The type of analytical column profile - pub profile_type: ColumnProfileType + /// Column profiles needed + pub profiles: Vec +} + +/// Parameters for a single column profile for a request for profiles +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] +pub struct ColumnProfileSpec { + /// Type of column profile + pub profile_type: ColumnProfileType, + + /// Extra parameters for different profile types + pub params: Option } /// Support status for a given column profile type @@ -393,40 +403,56 @@ pub struct SummaryStatsDatetime { pub timezone: Option } +/// Parameters for a column histogram profile request +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] +pub struct ColumnHistogramParams { + /// Number of bins in the computed histogram + pub num_bins: i64, + + /// Sample quantiles (numbers between 0 and 1) to compute along with the + /// histogram + pub quantiles: Option> +} + /// Result from a histogram profile request #[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] pub struct ColumnHistogram { + /// String-formatted versions of the bin edges, there are N + 1 where N is + /// the number of bins + pub bin_edges: Vec, + /// Absolute count of values in each histogram bin - pub bin_sizes: Vec, + pub bin_counts: Vec, - /// Absolute floating-point width of a histogram bin - pub bin_width: f64 + /// Sample quantiles that were also requested + pub quantiles: Vec } -/// Result from a frequency_table profile request +/// Parameters for a frequency_table profile request #[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] -pub struct ColumnFrequencyTable { - /// Counts of distinct values in column - pub counts: Vec, - - /// Number of other values not accounted for in counts. May be 0 - pub other_count: i64 +pub struct ColumnFrequencyTableParams { + /// Number of most frequently-occurring values to return. The K in TopK + pub limit: i64 } -/// Entry in a column's frequency table +/// Result from a frequency_table profile request #[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] -pub struct ColumnFrequencyTableItem { - /// Stringified value - pub value: String, +pub struct ColumnFrequencyTable { + /// The formatted top values + pub values: Vec, - /// Number of occurrences of value - pub count: i64 + /// Counts of top values + pub counts: Vec, + + /// Number of other values not accounted for in counts, excluding nulls/NA + /// values. May be omitted + pub other_count: Option } /// An exact or approximate quantile value from a column #[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] pub struct ColumnQuantileValue { - /// Quantile number (percentile). E.g. 1 for 1%, 50 for median + /// Quantile number; a number between 0 and 1 pub q: f64, /// Stringified quantile value @@ -847,6 +873,16 @@ pub enum ColumnFilterParams { MatchDataTypes(FilterMatchDataTypes) } +/// Union type ColumnProfileParams +/// Extra parameters for different profile types +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] +#[serde(untagged)] +pub enum ColumnProfileParams { + Histogram(ColumnHistogramParams), + + FrequencyTable(ColumnFrequencyTableParams) +} + /// Union type Selection in Properties #[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] #[serde(untagged)] diff --git a/crates/ark/src/data_explorer/r_data_explorer.rs b/crates/ark/src/data_explorer/r_data_explorer.rs index df931f9e6..9a3ea2da4 100644 --- a/crates/ark/src/data_explorer/r_data_explorer.rs +++ b/crates/ark/src/data_explorer/r_data_explorer.rs @@ -11,6 +11,7 @@ use std::collections::HashMap; use amalthea::comm::comm_channel::CommMsg; use amalthea::comm::data_explorer_comm::BackendState; use amalthea::comm::data_explorer_comm::ColumnDisplayType; +use amalthea::comm::data_explorer_comm::ColumnProfileRequest; use amalthea::comm::data_explorer_comm::ColumnProfileResult; use amalthea::comm::data_explorer_comm::ColumnProfileType; use amalthea::comm::data_explorer_comm::ColumnProfileTypeSupportStatus; @@ -499,59 +500,7 @@ impl RDataExplorer { }) => { let profiles = requests .into_iter() - .map(|request| match request.profile_type { - ColumnProfileType::NullCount => { - let null_count = - r_task(|| self.r_null_count(request.column_index as i32)); - ColumnProfileResult { - null_count: match null_count { - Err(err) => { - log::error!( - "Error getting null count for column {}: {}", - request.column_index, - err - ); - None - }, - Ok(count) => Some(count as i64), - }, - summary_stats: None, - histogram: None, - frequency_table: None, - } - }, - ColumnProfileType::SummaryStats => { - let summary_stats = r_task(|| { - self.r_summary_stats(request.column_index as i32, &format_options) - }); - ColumnProfileResult { - null_count: None, - summary_stats: match summary_stats { - Err(err) => { - log::error!( - "Error getting summary stats for column {}: {}", - request.column_index, - err - ); - None - }, - Ok(stats) => Some(stats), - }, - histogram: None, - frequency_table: None, - } - }, - _ => { - // Other kinds of column profiles are not yet - // implemented in R - ColumnProfileResult { - null_count: None, - summary_stats: None, - histogram: None, - frequency_table: None, - } - }, - }) + .map(|request| self.r_get_column_profile(request, &format_options)) .collect::>(); Ok(DataExplorerBackendReply::GetColumnProfilesReply(profiles)) }, @@ -630,6 +579,58 @@ impl RDataExplorer { } } + fn r_get_column_profile( + &self, + request: ColumnProfileRequest, + format_options: &FormatOptions, + ) -> ColumnProfileResult { + let mut output = ColumnProfileResult { + null_count: None, + summary_stats: None, + histogram: None, + frequency_table: None, + }; + + for profile_req in request.profiles { + match profile_req.profile_type { + ColumnProfileType::NullCount => { + let null_count = r_task(|| self.r_null_count(request.column_index as i32)); + output.null_count = match null_count { + Err(err) => { + log::error!( + "Error getting null count for column {}: {}", + request.column_index, + err + ); + None + }, + Ok(count) => Some(count as i64), + }; + }, + ColumnProfileType::SummaryStats => { + let summary_stats = r_task(|| { + self.r_summary_stats(request.column_index as i32, &format_options) + }); + output.summary_stats = match summary_stats { + Err(err) => { + log::error!( + "Error getting summary stats for column {}: {}", + request.column_index, + err + ); + None + }, + Ok(stats) => Some(stats), + }; + }, + _ => { + // Other types are not supported yet + }, + }; + } + output + } + /// Counts the number of nulls in a column. As the intent is to provide an /// idea of how complete the data is, NA values are considered to be null /// for the purposes of these stats. @@ -940,6 +941,14 @@ impl RDataExplorer { profile_type: ColumnProfileType::SummaryStats, support_status: SupportStatus::Experimental, }, + ColumnProfileTypeSupportStatus { + profile_type: ColumnProfileType::Histogram, + support_status: SupportStatus::Unsupported, + }, + ColumnProfileTypeSupportStatus { + profile_type: ColumnProfileType::FrequencyTable, + support_status: SupportStatus::Unsupported, + }, ], }, search_schema: SearchSchemaFeatures { diff --git a/crates/ark/tests/data_explorer.rs b/crates/ark/tests/data_explorer.rs index 964658bd9..73d14b1af 100644 --- a/crates/ark/tests/data_explorer.rs +++ b/crates/ark/tests/data_explorer.rs @@ -7,6 +7,7 @@ use amalthea::comm::comm_channel::CommMsg; use amalthea::comm::data_explorer_comm::ColumnProfileRequest; +use amalthea::comm::data_explorer_comm::ColumnProfileSpec; use amalthea::comm::data_explorer_comm::ColumnProfileType; use amalthea::comm::data_explorer_comm::ColumnSortKey; use amalthea::comm::data_explorer_comm::ColumnValue; @@ -546,7 +547,10 @@ fn test_null_counts() { let req = DataExplorerBackendRequest::GetColumnProfiles(GetColumnProfilesParams { profiles: vec![ColumnProfileRequest { column_index: 0, - profile_type: ColumnProfileType::NullCount, + profiles: vec![ColumnProfileSpec { + profile_type: ColumnProfileType::NullCount, + params: None, + }], }], format_options: default_format_options(), }); @@ -586,7 +590,10 @@ fn test_null_counts() { let req = DataExplorerBackendRequest::GetColumnProfiles(GetColumnProfilesParams { profiles: vec![ColumnProfileRequest { column_index: 0, - profile_type: ColumnProfileType::NullCount, + profiles: vec![ColumnProfileSpec { + profile_type: ColumnProfileType::NullCount, + params: None, + }], }], format_options: default_format_options(), }); @@ -645,7 +652,10 @@ fn test_summary_stats() { profiles: (0..3) .map(|i| ColumnProfileRequest { column_index: i, - profile_type: ColumnProfileType::SummaryStats, + profiles: vec![ColumnProfileSpec { + profile_type: ColumnProfileType::SummaryStats, + params: None, + }], }) .collect(), format_options: default_format_options(),