Add ability to round generated numbers, set metadata statistics and record tracking log from info to debug
pflooky committed Dec 4, 2024
1 parent 4a5ff34 commit 2375ac1
Showing 8 changed files with 44 additions and 9 deletions.
@@ -904,6 +904,15 @@ case class FieldBuilder(field: Field = Field()) {
def numericScale(scale: Int): FieldBuilder =
this.modify(_.field.generator).setTo(Some(getGenBuilder.numericScale(scale).generator))

/**
* Sets the rounding for the field.
*
* @param round Number of decimal places to round to
* @return the updated `FieldBuilder` instance
*/
def round(round: Int): FieldBuilder =
this.modify(_.field.generator).setTo(Some(getGenBuilder.round(round).generator))

/**
* Sets whether the field should be omitted from the generated output.
*
@@ -1225,6 +1234,15 @@ case class GeneratorBuilder(generator: Generator = Generator()) {
def numericScale(scale: Int): GeneratorBuilder =
this.modify(_.generator.options)(_ ++ Map(NUMERIC_SCALE -> scale.toString))

/**
* Rounding to decimal places for numeric data types
*
* @param round Number of decimal places to round to
* @return GeneratorBuilder
*/
def round(round: Int): GeneratorBuilder =
this.modify(_.generator.options)(_ ++ Map(ROUND -> round.toString))

/**
* Enable/disable including the value in the final output to the data source. Allows you to define intermediate values
* that can be used to generate other columns
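The new option chains like any other field setting. A minimal usage sketch, mirroring the PlanProcessorTest change further below (it assumes the existing `field` entry point and sits inside the `.schema(...)` call shown in that test):

    // Round generated balance values to 2 decimal places; round(2) stores the
    // "round" -> "2" entry in the field's generator options via GeneratorBuilder.round above.
    field.name("balance").`type`(DoubleType).min(10).max(1000).round(2)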
@@ -83,6 +83,7 @@ object Constants {
lazy val MAXIMUM = "max"
lazy val STANDARD_DEVIATION = "stddev"
lazy val MEAN = "mean"
lazy val ROUND = "round"
lazy val DISTRIBUTION = "distribution"
lazy val DISTRIBUTION_RATE_PARAMETER = "distributionRateParam"
lazy val DISTRIBUTION_UNIFORM = "uniform"
@@ -1,6 +1,6 @@
package io.github.datacatering.datacaterer.core.generator.provider

-import io.github.datacatering.datacaterer.api.model.Constants.{ARRAY_MAXIMUM_LENGTH, ARRAY_MINIMUM_LENGTH, DEFAULT_VALUE, DISTINCT_COUNT, DISTRIBUTION, DISTRIBUTION_EXPONENTIAL, DISTRIBUTION_NORMAL, DISTRIBUTION_RATE_PARAMETER, EXPRESSION, IS_UNIQUE, MAXIMUM, MAXIMUM_LENGTH, MEAN, MINIMUM, MINIMUM_LENGTH, NUMERIC_PRECISION, NUMERIC_SCALE, ROW_COUNT, STANDARD_DEVIATION}
+import io.github.datacatering.datacaterer.api.model.Constants.{ARRAY_MAXIMUM_LENGTH, ARRAY_MINIMUM_LENGTH, DEFAULT_VALUE, DISTINCT_COUNT, DISTRIBUTION, DISTRIBUTION_EXPONENTIAL, DISTRIBUTION_NORMAL, DISTRIBUTION_RATE_PARAMETER, EXPRESSION, IS_UNIQUE, MAXIMUM, MAXIMUM_LENGTH, MEAN, MINIMUM, MINIMUM_LENGTH, NUMERIC_PRECISION, NUMERIC_SCALE, ROUND, ROW_COUNT, STANDARD_DEVIATION}
import io.github.datacatering.datacaterer.core.exception.UnsupportedDataGeneratorType
import io.github.datacatering.datacaterer.core.model.Constants._
import io.github.datacatering.datacaterer.core.util.GeneratorUtil
@@ -386,10 +386,15 @@ object RandomDataGenerator {
s"$sqlRand * $diff + $min"
}

-if (!baseFormula.contains(INDEX_INC_COL) && (typeName == "INT" || typeName == "SHORT" || typeName == "LONG")) {
-s"CAST(ROUND($baseFormula, 0) AS $typeName)"
+val rounded = if (metadata.contains(ROUND)) {
+val roundValue = metadata.getString(ROUND)
+s"ROUND($baseFormula, $roundValue)"
+} else baseFormula
+
+if (!rounded.contains(INDEX_INC_COL) && (typeName == "INT" || typeName == "SHORT" || typeName == "LONG")) {
+s"CAST(ROUND($rounded, 0) AS $typeName)"
} else {
-s"CAST($baseFormula AS $typeName)"
+s"CAST($rounded AS $typeName)"
}
}

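Concretely, when the ROUND option is present the base random formula is wrapped in ROUND before the final cast. A minimal sketch of the effect, assuming the same imports used by the generator provider and its tests (the resulting expressions are the ones asserted in RandomDataGeneratorTest below):

    // Double field with min=5.0 and max=10.0, rounded to 2 decimal places.
    val metadata = new MetadataBuilder().putString(MINIMUM, "5.0").putString(MAXIMUM, "10.0").putString(ROUND, "2").build()
    val generator = new RandomDoubleDataGenerator(StructField("random_double", DoubleType, false, metadata))
    generator.generateSqlExpression
    // yields: CAST(ROUND(RAND() * 5.0 + 5.0, 2) AS DOUBLE)
    // without the ROUND option it would be: CAST(RAND() * 5.0 + 5.0 AS DOUBLE)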
@@ -13,7 +13,7 @@ class RecordTrackingProcessor(recordTrackingFolderPath: String) {

def trackRecords(df: DataFrame, dataSourceName: String, planName: String, step: Step): Unit = {
val subDataSourcePath = getSubDataSourcePath(dataSourceName, planName, step, recordTrackingFolderPath)
LOGGER.info(s"Generated record tracking is enabled, data-source-name=$dataSourceName, plan-name=$planName, save-path=$subDataSourcePath")
LOGGER.debug(s"Generated record tracking is enabled, data-source-name=$dataSourceName, plan-name=$planName, save-path=$subDataSourcePath")
if (df.isEmpty || df.schema.isEmpty) {
LOGGER.debug("Unable to save records for record tracking due to 0 records found or empty schema")
} else {
@@ -111,7 +111,7 @@ object MetadataUtil {
computeColumnStatistics(sourceData, dataSourceReadOptions, dataSourceMetadata.name, dataSourceMetadata.format)
val columnLevelStatistics = sparkSession.sharedState.cacheManager.lookupCachedData(sourceData).get.cachedRepresentation.stats
val rowCount = columnLevelStatistics.rowCount.getOrElse(BigInt(0))
LOGGER.info(s"Computed metadata statistics for data source, name=${dataSourceMetadata.name}, format=$dataSourceFormat, " +
LOGGER.debug(s"Computed metadata statistics for data source, name=${dataSourceMetadata.name}, format=$dataSourceFormat, " +
s"details=${ConfigUtil.cleanseOptions(dataSourceReadOptions)}, rows-analysed=$rowCount, size-in-bytes=${columnLevelStatistics.sizeInBytes}, " +
s"num-columns-analysed=${columnLevelStatistics.attributeStats.size}")

@@ -1,6 +1,6 @@
package io.github.datacatering.datacaterer.core.generator.provider

-import io.github.datacatering.datacaterer.api.model.Constants.{ARRAY_MINIMUM_LENGTH, DISTINCT_COUNT, DISTRIBUTION, DISTRIBUTION_EXPONENTIAL, DISTRIBUTION_NORMAL, DISTRIBUTION_RATE_PARAMETER, ENABLED_EDGE_CASE, ENABLED_NULL, EXPRESSION, IS_UNIQUE, MAXIMUM, MAXIMUM_LENGTH, MEAN, MINIMUM, MINIMUM_LENGTH, PROBABILITY_OF_EDGE_CASE, PROBABILITY_OF_NULL, ROW_COUNT, STANDARD_DEVIATION}
+import io.github.datacatering.datacaterer.api.model.Constants.{ARRAY_MINIMUM_LENGTH, DISTINCT_COUNT, DISTRIBUTION, DISTRIBUTION_EXPONENTIAL, DISTRIBUTION_NORMAL, DISTRIBUTION_RATE_PARAMETER, ENABLED_EDGE_CASE, ENABLED_NULL, EXPRESSION, IS_UNIQUE, MAXIMUM, MAXIMUM_LENGTH, MEAN, MINIMUM, MINIMUM_LENGTH, PROBABILITY_OF_EDGE_CASE, PROBABILITY_OF_NULL, ROUND, ROW_COUNT, STANDARD_DEVIATION}
import io.github.datacatering.datacaterer.core.generator.provider.RandomDataGenerator._
import io.github.datacatering.datacaterer.core.model.Constants.INDEX_INC_COL
import org.apache.spark.sql.types._
@@ -189,6 +189,17 @@ class RandomDataGeneratorTest extends AnyFunSuite {
assert(doubleGenerator.generateSqlExpression == "CAST(RAND() * 5.0 + 5.0 AS DOUBLE)")
}

test("Can create random double generator with custom min, max and rounding") {
val metadata = new MetadataBuilder().putString(MAXIMUM, "10.0").putString(MINIMUM, "5.0").putString(ROUND, "2").build()
val doubleGenerator = new RandomDoubleDataGenerator(StructField("random_double", DoubleType, false, metadata))
val sampleData = doubleGenerator.generate

assert(doubleGenerator.edgeCases.nonEmpty)
assert(sampleData >= 5.0)
assert(sampleData <= 10.0)
assert(doubleGenerator.generateSqlExpression == "CAST(ROUND(RAND() * 5.0 + 5.0, 2) AS DOUBLE)")
}

test("Can create random float generator") {
val floatGenerator = new RandomFloatDataGenerator(StructField("random_float", FloatType, false))
val sampleData = floatGenerator.generate
@@ -24,7 +24,7 @@ class PlanProcessorTest extends SparkSuite {
.schema(
field.name("account_id").regex("ACC[0-9]{8}"),
field.name("year").`type`(IntegerType).sql("YEAR(date)"),
field.name("balance").`type`(DoubleType).min(10).max(1000),
field.name("balance").`type`(DoubleType).min(10).max(1000).round(2),
field.name("date").`type`(DateType).min(Date.valueOf("2022-01-01")),
field.name("status").oneOf(accountStatus: _*),
field.name("update_history")
2 changes: 1 addition & 1 deletion gradle.properties
@@ -1,5 +1,5 @@
groupId=io.github.data-catering
-version=0.12.2
+version=0.12.3

scalaVersion=2.12
scalaSpecificVersion=2.12.19
