Skip to content

Commit

Permalink
Data Explorer: Support for getting multiple profiles per column (#456)
Browse files Browse the repository at this point in the history
* Adapt to vectorization of profiles per column.
* Mark Histograms and Frequency tables as unsupported
  • Loading branch information
dfalbel authored Jul 31, 2024
1 parent 70552b8 commit ea7d4cd
Show file tree
Hide file tree
Showing 3 changed files with 130 additions and 75 deletions.
74 changes: 55 additions & 19 deletions crates/amalthea/src/comm/data_explorer_comm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -260,8 +260,18 @@ pub struct ColumnProfileRequest {
/// The ordinal column index to profile
pub column_index: i64,

/// The type of analytical column profile
pub profile_type: ColumnProfileType
/// Column profiles needed
pub profiles: Vec<ColumnProfileSpec>
}

/// One profile to compute for a column: the kind of profile plus any
/// parameters that kind accepts. A column profile request carries a list of
/// these so that several profiles can be requested for the same column.
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
pub struct ColumnProfileSpec {
/// Type of column profile to compute
pub profile_type: ColumnProfileType,

/// Extra parameters for the requested profile type; `None` when no
/// parameters are supplied.
// NOTE(review): presumably the variant is expected to correspond to
// `profile_type`; nothing in this type enforces that. Confirm with callers.
pub params: Option<ColumnProfileParams>
}

/// Support status for a given column profile type
Expand Down Expand Up @@ -393,40 +403,56 @@ pub struct SummaryStatsDatetime {
pub timezone: Option<String>
}

/// Parameters for a column histogram profile request: how many bins to
/// compute and, optionally, which sample quantiles to compute alongside them.
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
pub struct ColumnHistogramParams {
/// Number of bins in the computed histogram
// NOTE(review): signed on the wire; presumably expected to be positive.
// Confirm against the schema / backend validation.
pub num_bins: i64,

/// Sample quantiles (numbers between 0 and 1) to compute along with the
/// histogram. `None` when no quantiles are requested.
pub quantiles: Option<Vec<f64>>
}

/// Result from a histogram profile request
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
pub struct ColumnHistogram {
/// String-formatted versions of the bin edges, there are N + 1 where N is
/// the number of bins
pub bin_edges: Vec<String>,

/// Absolute count of values in each histogram bin
pub bin_sizes: Vec<i64>,
pub bin_counts: Vec<i64>,

/// Absolute floating-point width of a histogram bin
pub bin_width: f64
/// Sample quantiles that were also requested
pub quantiles: Vec<ColumnQuantileValue>
}

/// Result from a frequency_table profile request
/// Parameters for a frequency_table profile request
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
pub struct ColumnFrequencyTable {
/// Counts of distinct values in column
pub counts: Vec<ColumnFrequencyTableItem>,

/// Number of other values not accounted for in counts. May be 0
pub other_count: i64
pub struct ColumnFrequencyTableParams {
/// Number of most frequently-occurring values to return. The K in TopK
pub limit: i64
}

/// Entry in a column's frequency table
/// Result from a frequency_table profile request
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
pub struct ColumnFrequencyTableItem {
/// Stringified value
pub value: String,
pub struct ColumnFrequencyTable {
/// The formatted top values
pub values: Vec<String>,

/// Number of occurrences of value
pub count: i64
/// Counts of top values
pub counts: Vec<i64>,

/// Number of other values not accounted for in counts, excluding nulls/NA
/// values. May be omitted
pub other_count: Option<i64>
}

/// An exact or approximate quantile value from a column
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
pub struct ColumnQuantileValue {
/// Quantile number (percentile). E.g. 1 for 1%, 50 for median
/// Quantile number; a number between 0 and 1
pub q: f64,

/// Stringified quantile value
Expand Down Expand Up @@ -847,6 +873,16 @@ pub enum ColumnFilterParams {
MatchDataTypes(FilterMatchDataTypes)
}

/// Union type ColumnProfileParams
/// Extra parameters for different profile types
///
/// The enum is `#[serde(untagged)]`: no variant name is written when
/// serializing, and when deserializing serde tries the variants in the order
/// they are declared here and keeps the first one that matches. Variant order
/// is therefore part of the wire contract.
// NOTE(review): the two payloads are told apart only by their fields
// (`num_bins` vs `limit`); re-check for ambiguity before adding or reordering
// variants.
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
#[serde(untagged)]
pub enum ColumnProfileParams {
/// Parameters for a histogram profile
Histogram(ColumnHistogramParams),

/// Parameters for a frequency table profile
FrequencyTable(ColumnFrequencyTableParams)
}

/// Union type Selection in Properties
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
#[serde(untagged)]
Expand Down
115 changes: 62 additions & 53 deletions crates/ark/src/data_explorer/r_data_explorer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ use std::collections::HashMap;
use amalthea::comm::comm_channel::CommMsg;
use amalthea::comm::data_explorer_comm::BackendState;
use amalthea::comm::data_explorer_comm::ColumnDisplayType;
use amalthea::comm::data_explorer_comm::ColumnProfileRequest;
use amalthea::comm::data_explorer_comm::ColumnProfileResult;
use amalthea::comm::data_explorer_comm::ColumnProfileType;
use amalthea::comm::data_explorer_comm::ColumnProfileTypeSupportStatus;
Expand Down Expand Up @@ -499,59 +500,7 @@ impl RDataExplorer {
}) => {
let profiles = requests
.into_iter()
.map(|request| match request.profile_type {
ColumnProfileType::NullCount => {
let null_count =
r_task(|| self.r_null_count(request.column_index as i32));
ColumnProfileResult {
null_count: match null_count {
Err(err) => {
log::error!(
"Error getting null count for column {}: {}",
request.column_index,
err
);
None
},
Ok(count) => Some(count as i64),
},
summary_stats: None,
histogram: None,
frequency_table: None,
}
},
ColumnProfileType::SummaryStats => {
let summary_stats = r_task(|| {
self.r_summary_stats(request.column_index as i32, &format_options)
});
ColumnProfileResult {
null_count: None,
summary_stats: match summary_stats {
Err(err) => {
log::error!(
"Error getting summary stats for column {}: {}",
request.column_index,
err
);
None
},
Ok(stats) => Some(stats),
},
histogram: None,
frequency_table: None,
}
},
_ => {
// Other kinds of column profiles are not yet
// implemented in R
ColumnProfileResult {
null_count: None,
summary_stats: None,
histogram: None,
frequency_table: None,
}
},
})
.map(|request| self.r_get_column_profile(request, &format_options))
.collect::<Vec<ColumnProfileResult>>();
Ok(DataExplorerBackendReply::GetColumnProfilesReply(profiles))
},
Expand Down Expand Up @@ -630,6 +579,58 @@ impl RDataExplorer {
}
}

fn r_get_column_profile(
&self,
request: ColumnProfileRequest,
format_options: &FormatOptions,
) -> ColumnProfileResult {
let mut output = ColumnProfileResult {
null_count: None,
summary_stats: None,
histogram: None,
frequency_table: None,
};

for profile_req in request.profiles {
match profile_req.profile_type {
ColumnProfileType::NullCount => {
let null_count = r_task(|| self.r_null_count(request.column_index as i32));
output.null_count = match null_count {
Err(err) => {
log::error!(
"Error getting null count for column {}: {}",
request.column_index,
err
);
None
},
Ok(count) => Some(count as i64),
};
},
ColumnProfileType::SummaryStats => {
let summary_stats = r_task(|| {
self.r_summary_stats(request.column_index as i32, &format_options)
});
output.summary_stats = match summary_stats {
Err(err) => {
log::error!(
"Error getting summary stats for column {}: {}",
request.column_index,
err
);
None
},
Ok(stats) => Some(stats),
};
},
_ => {
// Other types are not supported yet
},
};
}
output
}

/// Counts the number of nulls in a column. As the intent is to provide an
/// idea of how complete the data is, NA values are considered to be null
/// for the purposes of these stats.
Expand Down Expand Up @@ -940,6 +941,14 @@ impl RDataExplorer {
profile_type: ColumnProfileType::SummaryStats,
support_status: SupportStatus::Experimental,
},
ColumnProfileTypeSupportStatus {
profile_type: ColumnProfileType::Histogram,
support_status: SupportStatus::Unsupported,
},
ColumnProfileTypeSupportStatus {
profile_type: ColumnProfileType::FrequencyTable,
support_status: SupportStatus::Unsupported,
},
],
},
search_schema: SearchSchemaFeatures {
Expand Down
16 changes: 13 additions & 3 deletions crates/ark/tests/data_explorer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

use amalthea::comm::comm_channel::CommMsg;
use amalthea::comm::data_explorer_comm::ColumnProfileRequest;
use amalthea::comm::data_explorer_comm::ColumnProfileSpec;
use amalthea::comm::data_explorer_comm::ColumnProfileType;
use amalthea::comm::data_explorer_comm::ColumnSortKey;
use amalthea::comm::data_explorer_comm::ColumnValue;
Expand Down Expand Up @@ -546,7 +547,10 @@ fn test_null_counts() {
let req = DataExplorerBackendRequest::GetColumnProfiles(GetColumnProfilesParams {
profiles: vec![ColumnProfileRequest {
column_index: 0,
profile_type: ColumnProfileType::NullCount,
profiles: vec![ColumnProfileSpec {
profile_type: ColumnProfileType::NullCount,
params: None,
}],
}],
format_options: default_format_options(),
});
Expand Down Expand Up @@ -586,7 +590,10 @@ fn test_null_counts() {
let req = DataExplorerBackendRequest::GetColumnProfiles(GetColumnProfilesParams {
profiles: vec![ColumnProfileRequest {
column_index: 0,
profile_type: ColumnProfileType::NullCount,
profiles: vec![ColumnProfileSpec {
profile_type: ColumnProfileType::NullCount,
params: None,
}],
}],
format_options: default_format_options(),
});
Expand Down Expand Up @@ -645,7 +652,10 @@ fn test_summary_stats() {
profiles: (0..3)
.map(|i| ColumnProfileRequest {
column_index: i,
profile_type: ColumnProfileType::SummaryStats,
profiles: vec![ColumnProfileSpec {
profile_type: ColumnProfileType::SummaryStats,
params: None,
}],
})
.collect(),
format_options: default_format_options(),
Expand Down

0 comments on commit ea7d4cd

Please sign in to comment.