Skip to content

Commit

Permalink
Merge pull request #16 from FAIRChemistry/dataset-validation
Browse files Browse the repository at this point in the history
JSON Dataset validation
  • Loading branch information
JR-1991 authored Dec 22, 2024
2 parents d4da04e + 7f172f4 commit 58c7028
Show file tree
Hide file tree
Showing 15 changed files with 874 additions and 29 deletions.
527 changes: 520 additions & 7 deletions Cargo.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ wasm-bindgen = { version = "0.2.95", optional = true }
serde-wasm-bindgen = { version = "0.6.5", optional = true }
tokio = { version = "1.42.0", features = ["rt"] }
openai-api-rs = { version = "5.2.3", optional = true }
jsonschema = { version = "0.26.2", default-features = false }

[features]
default = ["openai"]
Expand Down
48 changes: 48 additions & 0 deletions src/bin/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,11 @@

use clap::{Parser, Subcommand};
use colored::Colorize;
use log::error;
use mdmodels::{
datamodel::DataModel,
exporters::{render_jinja_template, Templates},
json::validation::validate_json,
llm::extraction::query_openai,
pipeline::process_pipeline,
};
Expand Down Expand Up @@ -53,6 +55,8 @@ enum Commands {
Pipeline(PipelineArgs),
/// Large Language Model Extraction
Extract(ExtractArgs),
/// Validate a dataset against a markdown model.
Dataset(DatasetArgs),
}

/// Arguments for the validate subcommand.
Expand Down Expand Up @@ -141,6 +145,33 @@ struct ExtractArgs {
multiple: bool,
}

/// Arguments for the dataset subcommand.
// NOTE: with clap's derive API the `///` doc comments on CLI types double as
// help text, so maintainer-only remarks here are written as plain `//` comments.
#[derive(Parser, Debug)]
struct DatasetArgs {
// The nested dataset operation to run; `main` dispatches on it by matching
// on the `DatasetCommands` variant.
#[command(subcommand)]
command: DatasetCommands,
}

/// Subcommands for dataset operations
// NOTE: the `///` comments on this enum and its variants are picked up by
// clap's derive API as help text; keep maintainer remarks in `//` comments.
#[derive(Subcommand, Debug)]
enum DatasetCommands {
/// Validate a dataset against a markdown model.
Validate(ValidateDatasetArgs),
// Add more dataset subcommands here as needed
// (each new variant also needs an arm in the `Commands::Dataset` match in `main`).
}

/// Arguments for the validate dataset subcommand.
// NOTE: `///` comments on this struct are consumed by clap's derive API, so
// maintainer remarks are kept in `//` comments. Both fields are `InputType`
// values that `validate_ds` turns into paths via `resolve_input_path`.
#[derive(Parser, Debug)]
struct ValidateDatasetArgs {
/// Path to the dataset file.
// `short, long` without explicit values derive the flags from the field
// name, i.e. `-i` / `--input`.
#[arg(short, long, help = "Path to the dataset file")]
input: InputType,

/// Path to the markdown model.
// Derived flags: `-m` / `--model`.
#[arg(short, long, help = "Path to the markdown model")]
model: InputType,
}

/// Represents the input type, either remote URL or local file path.
#[derive(Deserialize, Serialize, Clone, Debug)]
enum InputType {
Expand Down Expand Up @@ -186,6 +217,9 @@ fn main() -> Result<(), Box<dyn Error>> {
Commands::Convert(args) => convert(args),
Commands::Pipeline(args) => process_pipeline(&args.input),
Commands::Extract(args) => query_llm(args),
Commands::Dataset(args) => match args.command {
DatasetCommands::Validate(args) => validate_ds(args),
},
}
}

Expand Down Expand Up @@ -355,6 +389,20 @@ fn render_all_json_schemes(
Ok(())
}

/// Validates a dataset against a markdown model.
fn validate_ds(args: ValidateDatasetArgs) -> Result<(), Box<dyn Error>> {
let model_path = resolve_input_path(&args.model);
let model = DataModel::from_markdown(&model_path)?;
let dataset_path = resolve_input_path(&args.input);
let result = validate_json(dataset_path, &model, None)?;

for error in result {
error!("{}", error);
}

Ok(())
}

#[cfg(test)]
mod tests {
use super::*;
Expand Down
2 changes: 1 addition & 1 deletion src/bindings/python.rs
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ impl DataModel {
///
/// A string that represents the `DataModel` instance.
fn __repr__(&self) -> String {
self.model.sdrdm_schema()
self.model.internal_schema()
}

/// Converts the `DataModel` instance to a specified template format.
Expand Down
45 changes: 34 additions & 11 deletions src/datamodel.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ use serde::{Deserialize, Serialize};

use crate::exporters::{render_jinja_template, Templates};
use crate::json::export::to_json_schema;
use crate::json::validation::{validate_json, ValidationError};
use crate::markdown::frontmatter::FrontMatter;
use crate::markdown::parser::parse_markdown;
use crate::object::{Enumeration, Object};
Expand Down Expand Up @@ -60,7 +61,7 @@ use pyo3::pyclass;
// * `parse` - Parse a markdown file and create a data model
// * `json_schema` - Generate a JSON schema from the data model
// * `json_schema_all` - Generate JSON schemas for all objects in the data model
// * `sdrdm_schema` - Generate a SDRDM schema from the data model
// * `internal_schema` - Generate an internal schema from the data model
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq)]
#[cfg_attr(feature = "python", pyclass(get_all))]
pub struct DataModel {
Expand All @@ -82,6 +83,28 @@ impl DataModel {
}
}

/// Checks a JSON dataset on disk against this data model.
///
/// Convenience wrapper that hands an owned copy of `path`, this model and
/// `root` to [`crate::json::validation::validate_json`].
///
/// # Arguments
///
/// * `path` - Location of the dataset file to validate.
/// * `root` - An optional root for the schema. According to the existing
///   documentation, the first object is used when it is not provided.
///
/// # Returns
///
/// `Ok` with the collected `ValidationError`s (an empty vector when the
/// dataset conforms), or `Err` when the underlying validation routine fails.
pub fn validate_json(
    &self,
    path: &Path,
    root: Option<String>,
) -> Result<Vec<ValidationError>, Box<dyn Error>> {
    let dataset = path.to_owned();
    validate_json(dataset, self, root)
}

// Get the JSON schema for an object
//
// * `obj_name` - Name of the object
Expand Down Expand Up @@ -166,7 +189,7 @@ impl DataModel {
Ok(())
}

// Get the SDRDM schema for the markdown file
// Get the internal schema for the markdown file
//
// # Panics
//
Expand All @@ -177,18 +200,18 @@ impl DataModel {
// ```
// let model = DataModel::new();
// model.parse("path/to/file.md".to_string());
// let schema = model.sdrdm_schema();
// let schema = model.internal_schema();
// ```
//
// # Returns
//
// A SDRDM schema string
pub fn sdrdm_schema(&self) -> String {
// An internal schema string
pub fn internal_schema(&self) -> String {
if self.objects.is_empty() {
panic!("No objects found in the markdown file");
}

serde_json::to_string_pretty(&self).expect("Could not serialize to sdRDM schema")
serde_json::to_string_pretty(&self).expect("Could not serialize to internal schema")
}

// Parse a markdown file and create a data model
Expand All @@ -199,14 +222,14 @@ impl DataModel {
//
// ```
// let path = Path::new("path/to/file.md");
// let model = DataModel::from_sdrdm_schema(path);
// let model = DataModel::from_internal_schema(path);
// ```
//
// # Returns
//
// A data model
//
pub fn from_sdrdm_schema(path: &Path) -> Result<Self, Box<dyn Error>> {
pub fn from_internal_schema(path: &Path) -> Result<Self, Box<dyn Error>> {
if !path.exists() {
return Err("File does not exist".into());
}
Expand Down Expand Up @@ -457,12 +480,12 @@ mod tests {
}

#[test]
fn test_from_sdrdm_schema() {
fn test_from_internal_schema() {
// Arrange
let path = Path::new("tests/data/expected_sdrdm_schema.json");
let path = Path::new("tests/data/expected_internal_schema.json");

// Act
let model = DataModel::from_sdrdm_schema(path).expect("Failed to parse SDRDM schema");
let model = DataModel::from_internal_schema(path).expect("Failed to parse internal schema");

// Assert
assert_eq!(model.objects.len(), 2);
Expand Down
151 changes: 151 additions & 0 deletions src/json/validation.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
/*
* Copyright (c) 2024 Jan Range
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*/

use std::error::Error;
use std::path::PathBuf;

use colored::Colorize;
use jsonschema::error::ValidationErrorKind;
use serde_json::Value;
use std::convert::TryFrom;

use crate::datamodel::DataModel;
use jsonschema::validator_for;

/// Represents a validation error that occurs during dataset validation.
///
/// This is an owned counterpart of `jsonschema::ValidationError`: the `From`
/// impl in this module stringifies the paths and the message and moves the
/// error kind over.
#[derive(Debug)]
pub struct ValidationError {
/// String form of the `instance_path` reported by `jsonschema`.
pub instance_path: String,
/// String form of the `schema_path` reported by `jsonschema`.
pub schema_path: String,
/// The `Display` rendering of the original `jsonschema` error.
pub message: String,
/// The `jsonschema` error kind, taken over unchanged from the source error.
pub kind: ValidationErrorKind,
}

impl std::fmt::Display for ValidationError {
    /// Writes the error as one line, styling the instance path red, the
    /// schema path green and the message yellow (all bold).
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let instance = self.instance_path.red().bold();
        let schema = self.schema_path.green().bold();
        let detail = self.message.yellow().bold();

        write!(
            f,
            "Validation Error: Instance {} violates schema at {}: {}",
            instance, schema, detail
        )
    }
}

impl From<jsonschema::ValidationError<'_>> for ValidationError {
    /// Converts a `jsonschema` error into the owned error type of this module.
    fn from(source: jsonschema::ValidationError) -> Self {
        // Everything that only borrows `source` is rendered first; `kind` is
        // moved out last.
        let instance_path = source.instance_path.to_string();
        let schema_path = source.schema_path.to_string();
        let message = source.to_string();

        Self {
            instance_path,
            schema_path,
            message,
            kind: source.kind,
        }
    }
}

/// Validates a dataset against a given DataModel.
///
/// # Arguments
///
/// * `dataset` - The dataset to validate, which can be provided in various forms.
/// * `model` - A reference to the DataModel against which the dataset will be validated.
/// * `root` - An optional root path for the schema.
///
/// # Returns
///
/// A Result containing a vector of ValidationErrors if validation fails, or an empty vector if successful.
pub fn validate_json<T: Into<DatasetInput>>(
dataset: T,
model: &DataModel,
root: Option<String>,
) -> Result<Vec<ValidationError>, Box<dyn Error>> {
// Convert the dataset input to a Value
let dataset_input: DatasetInput = dataset.into();
let value: Value = dataset_input.try_into()?;

// Get the JSON Schema from the model
let schema = model.json_schema(root, false)?;
let schema_value: Value = serde_json::from_str(&schema)?;

// Create a validator for the schema
let validator = validator_for(&schema_value)?;

// Validate the dataset against the schema
let result = validator.iter_errors(&value);
let mut errors: Vec<ValidationError> = Vec::new();

for err in result {
errors.push(ValidationError::from(err));
}

Ok(errors)
}

/// Enum representing the different types of dataset inputs.
///
/// Each variant is turned into a `serde_json::Value` by the
/// `TryFrom<DatasetInput> for Value` impl in this module.
pub enum DatasetInput {
/// A file path; the file is read as text and parsed as JSON.
Path(PathBuf),
/// An already parsed JSON value, used as is.
Value(Value),
/// A string that is parsed as JSON.
String(String),
}

impl From<PathBuf> for DatasetInput {
/// Converts a PathBuf into a DatasetInput.
fn from(path: PathBuf) -> Self {
DatasetInput::Path(path)
}
}

impl From<Value> for DatasetInput {
/// Converts a Value into a DatasetInput.
fn from(value: Value) -> Self {
DatasetInput::Value(value)
}
}

impl From<String> for DatasetInput {
/// Converts a String into a DatasetInput.
fn from(string: String) -> Self {
DatasetInput::String(string)
}
}

impl TryFrom<DatasetInput> for Value {
    type Error = Box<dyn Error>;

    /// Resolves a `DatasetInput` into a JSON value.
    ///
    /// A `Value` is returned untouched, a `Path` is read from disk first, and
    /// the resulting text (or the `String` variant's text) is parsed as JSON.
    /// Read and parse failures are returned as boxed errors.
    fn try_from(input: DatasetInput) -> Result<Self, Self::Error> {
        let text = match input {
            // Already parsed: nothing left to do
            DatasetInput::Value(json) => return Ok(json),
            DatasetInput::Path(location) => std::fs::read_to_string(location)?,
            DatasetInput::String(raw) => raw,
        };

        Ok(serde_json::from_str(&text)?)
    }
}
1 change: 1 addition & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ pub mod json {
mod datatype;
pub mod export;
pub mod schema;
pub mod validation;
}

pub(crate) mod markdown {
Expand Down
2 changes: 1 addition & 1 deletion src/pipeline.rs
Original file line number Diff line number Diff line change
Expand Up @@ -340,7 +340,7 @@ fn serialize_to_internal_schema(
) -> Result<(), Box<dyn Error>> {
match merge_state {
MergeState::Merge => {
let schema = model.sdrdm_schema();
let schema = model.internal_schema();
save_to_file(out, &schema)?;
print_render_msg(out, &Templates::Internal);
Ok(())
Expand Down
File renamed without changes.
File renamed without changes.
Loading

0 comments on commit 58c7028

Please sign in to comment.