Skip to content

Commit

Permalink
Add in contribution guide, allow for selectExpr to happen before vali…
Browse files Browse the repository at this point in the history
…dation
  • Loading branch information
pflooky committed Dec 4, 2023
1 parent d830b95 commit 9eb092f
Show file tree
Hide file tree
Showing 13 changed files with 133 additions and 62 deletions.
35 changes: 35 additions & 0 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# Contributing

## Overview

Thanks for visiting and considering contributing to Data Caterer 🎉. Any contributions, whether they are suggestions,
bug fixes, features and general improvements or discussion points, are welcomed. To make things simple for everyone,
please follow the below guidelines when contributing.

## Feature Request

1. [Check if it already exists on the roadmap here](https://data.catering/use-case/roadmap/)
2. [For further discussion, join the Slack channel to discuss](https://join.slack.com/t/data-catering/shared_invite/zt-2664ylbpi-w3n7lWAO~PHeOG9Ujpm~~w)
3. Once accepted, it will be added and tracked via the Github Project
4. Sub-tasks will be created and implemented when contributors have time
5. When all sub-tasks completed, it can be promoted to the next release once all documentation is completed
in [data-caterer-docs](https://github.com/data-catering/data-caterer-docs)

## Bug Fixes

1. Search [Issues](https://github.com/data-catering/data-caterer/issues) in the project to see if the bug has already
been raised
1. If it already exists, you can upvote or add to the discussion if you feel it is warranted (e.g. same bug, but you
get different output to others)
2. If none exist, please create a new issue with the following format:
```shell
version: 0.5.3 (Version of Data Caterer)
description: When generating data to Postgres, error inserting due to record with same primary key. (High level description of bug)
reproduction: (Code that could be used to reproduce the bug)
expected_behaviour: (What you are expecting to occur)
```
2. Depending on the severity of the issue, the team will be able to pick up the issue when contributors have time

## General Improvements/Discussion

[Come join the Slack channel to discuss anything you want!](https://join.slack.com/t/data-catering/shared_invite/zt-2664ylbpi-w3n7lWAO~PHeOG9Ujpm~~w)
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,10 @@ Sponsors have access to the following features:
This is inspired by the [mkdocs-material project](https://github.com/squidfunk/mkdocs-material) which
[follows the same model](https://squidfunk.github.io/mkdocs-material/insiders/).

## Contributing

[View details here about how you can contribute to the project.](CONTRIBUTING.md)

## Additional Details

### Design
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ case class ValidationBuilder(validation: Validation = ExpressionValidation()) {
* required to be boolean. Can use any columns in the validation logic.
*
* For example,
* {{{validation.expr("CASE WHEN status == 'open' THEN balance > 0 ELSE balance == 0 END}}}
* {{{validation.expr("CASE WHEN status == 'open' THEN balance > 0 ELSE balance == 0 END")}}}
*
* @param expr SQL expression which returns a boolean
* @return ValidationBuilder
Expand All @@ -126,14 +126,28 @@ case class ValidationBuilder(validation: Validation = ExpressionValidation()) {
validation match {
case GroupByValidation(grpCols, aggCol, aggType, _) =>
val grpWithExpr = GroupByValidation(grpCols, aggCol, aggType, expr)
grpWithExpr.description = this.validation.description
grpWithExpr.errorThreshold = this.validation.errorThreshold
this.modify(_.validation).setTo(grpWithExpr)
copyWithDescAndThreshold(grpWithExpr)
case expressionValidation: ExpressionValidation =>
val withExpr = expressionValidation.modify(_.expr).setTo(expr)
withExpr.description = this.validation.description
withExpr.errorThreshold = this.validation.errorThreshold
this.modify(_.validation).setTo(withExpr)
val withExpr = expressionValidation.modify(_.whereExpr).setTo(expr)
copyWithDescAndThreshold(withExpr)
}
}

/**
* SQL expression used to apply to columns before running validations.
*
* For example,
* {{{validation.selectExpr("PERCENTILE(amount, 0.5) AS median_amount, *")}}}
*
* @param expr SQL expression
* @return ValidationBuilder
* @see <a href="https://spark.apache.org/docs/latest/api/sql/">SQL expressions</a>
*/
def selectExpr(expr: String): ValidationBuilder = {
validation match {
case expressionValidation: ExpressionValidation =>
val withExpr = expressionValidation.modify(_.selectExpr).setTo(expr)
copyWithDescAndThreshold(withExpr)
}
}

Expand Down Expand Up @@ -195,6 +209,12 @@ case class ValidationBuilder(validation: Validation = ExpressionValidation()) {
def columnNames: ColumnNamesValidationBuilder = {
ColumnNamesValidationBuilder()
}

private def copyWithDescAndThreshold(newValidation: Validation): ValidationBuilder = {
newValidation.description = this.validation.description
newValidation.errorThreshold = this.validation.errorThreshold
this.modify(_.validation).setTo(newValidation)
}
}

case class ColumnValidationBuilder(validationBuilder: ValidationBuilder = ValidationBuilder(), column: String = "") {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@ trait Validation {
}

case class ExpressionValidation(
expr: String = "true"
whereExpr: String = "true",
selectExpr: String = "*"
) extends Validation

case class GroupByValidation(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,9 @@ class ValidationBuilderSerializer extends JsonSerializer[ValidationBuilder] {
val validation = value.validation
gen.writeStartObject()
validation match {
case ExpressionValidation(expr) =>
gen.writeStringField("expr", expr)
case ExpressionValidation(expr, selectExpr) =>
gen.writeStringField("whereExpr", expr)
gen.writeStringField("selectExpr", selectExpr)
case GroupByValidation(groupByCols, aggCol, aggType, expr) =>
gen.writeArrayFieldStart("groupByCols")
groupByCols.foreach(gen.writeObject)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ class PlanBuilderTest extends AnyFunSuite {
assert(validationHead.description.contains("name is equal to Peter"))
assert(validationHead.errorThreshold.contains(0.1))
assert(validationHead.isInstanceOf[ExpressionValidation])
assert(validationHead.asInstanceOf[ExpressionValidation].expr == "name == 'Peter'")
assert(validationHead.asInstanceOf[ExpressionValidation].whereExpr == "name == 'Peter'")
assert(dataSourceHead._2.head.options == Map("path" -> "test/path/json"))
assert(dataSourceHead._2.head.waitCondition == PauseWaitCondition())
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ class PlanRunTest extends AnyFunSuite {
assert(dsValidation._2.head.validations.size == 1)
assert(dsValidation._2.head.validations.head.validation.isInstanceOf[ExpressionValidation])
val expressionValidation = dsValidation._2.head.validations.head.validation.asInstanceOf[ExpressionValidation]
assert(expressionValidation.expr == "account_id != ''")
assert(expressionValidation.whereExpr == "account_id != ''")
}

test("Can create plan with multiple validations for one data source") {
Expand All @@ -105,10 +105,10 @@ class PlanRunTest extends AnyFunSuite {
assert(dsValidation._1 == "my_postgres")
val accountValid = dsValidation._2.filter(_.options.get(JDBC_TABLE).contains("account.accounts")).head
assert(accountValid.validations.size == 1)
assert(accountValid.validations.exists(v => v.validation.asInstanceOf[ExpressionValidation].expr == "account_id != ''"))
assert(accountValid.validations.exists(v => v.validation.asInstanceOf[ExpressionValidation].whereExpr == "account_id != ''"))
val txnValid = dsValidation._2.filter(_.options.get(JDBC_TABLE).contains("account.transactions")).head
assert(txnValid.validations.size == 1)
assert(txnValid.validations.exists(v => v.validation.asInstanceOf[ExpressionValidation].expr == "txn_id IS NOT NULL"))
assert(txnValid.validations.exists(v => v.validation.asInstanceOf[ExpressionValidation].whereExpr == "txn_id IS NOT NULL"))
}

test("Can create plan with validations only defined") {
Expand All @@ -124,7 +124,7 @@ class PlanRunTest extends AnyFunSuite {
assert(result._validations.head.dataSources.contains("my_csv"))
val validRes = result._validations.head.dataSources("my_csv").head
assert(validRes.validations.size == 1)
assert(validRes.validations.head.validation.asInstanceOf[ExpressionValidation].expr == "account_id != 'acc123'")
assert(validRes.validations.head.validation.asInstanceOf[ExpressionValidation].whereExpr == "account_id != 'acc123'")
assert(validRes.options.nonEmpty)
assert(validRes.options == Map(FORMAT -> "csv", PATH -> "/my/csv"))
}
Expand Down
Loading

0 comments on commit 9eb092f

Please sign in to comment.