diff --git a/api/src/main/scala/io/github/datacatering/datacaterer/api/TaskBuilder.scala b/api/src/main/scala/io/github/datacatering/datacaterer/api/TaskBuilder.scala
index 7bcb7ff..f194145 100644
--- a/api/src/main/scala/io/github/datacatering/datacaterer/api/TaskBuilder.scala
+++ b/api/src/main/scala/io/github/datacatering/datacaterer/api/TaskBuilder.scala
@@ -904,6 +904,15 @@ case class FieldBuilder(field: Field = Field()) {
   def numericScale(scale: Int): FieldBuilder =
     this.modify(_.field.generator).setTo(Some(getGenBuilder.numericScale(scale).generator))
 
+  /**
+   * Sets the rounding for the field.
+   *
+   * @param round Number of decimal places to round to
+   * @return the updated `FieldBuilder` instance
+   */
+  def round(round: Int): FieldBuilder =
+    this.modify(_.field.generator).setTo(Some(getGenBuilder.round(round).generator))
+
   /**
    * Sets whether the field should be omitted from the generated output.
    *
@@ -1225,6 +1234,15 @@ case class GeneratorBuilder(generator: Generator = Generator()) {
   def numericScale(scale: Int): GeneratorBuilder =
     this.modify(_.generator.options)(_ ++ Map(NUMERIC_SCALE -> scale.toString))
 
+  /**
+   * Rounding to decimal places for numeric data types
+   *
+   * @param round Number of decimal places to round to
+   * @return GeneratorBuilder
+   */
+  def round(round: Int): GeneratorBuilder =
+    this.modify(_.generator.options)(_ ++ Map(ROUND -> round.toString))
+
   /**
    * Enable/disable including the value in the final output to the data source. Allows you to define intermediate values
    * that can be used to generate other columns
diff --git a/api/src/main/scala/io/github/datacatering/datacaterer/api/model/Constants.scala b/api/src/main/scala/io/github/datacatering/datacaterer/api/model/Constants.scala
index 344655d..3ef95ce 100644
--- a/api/src/main/scala/io/github/datacatering/datacaterer/api/model/Constants.scala
+++ b/api/src/main/scala/io/github/datacatering/datacaterer/api/model/Constants.scala
@@ -83,6 +83,7 @@ object Constants {
   lazy val MAXIMUM = "max"
   lazy val STANDARD_DEVIATION = "stddev"
   lazy val MEAN = "mean"
+  lazy val ROUND = "round"
   lazy val DISTRIBUTION = "distribution"
   lazy val DISTRIBUTION_RATE_PARAMETER = "distributionRateParam"
   lazy val DISTRIBUTION_UNIFORM = "uniform"
diff --git a/app/src/main/scala/io/github/datacatering/datacaterer/core/generator/provider/RandomDataGenerator.scala b/app/src/main/scala/io/github/datacatering/datacaterer/core/generator/provider/RandomDataGenerator.scala
index 446d087..315dcac 100644
--- a/app/src/main/scala/io/github/datacatering/datacaterer/core/generator/provider/RandomDataGenerator.scala
+++ b/app/src/main/scala/io/github/datacatering/datacaterer/core/generator/provider/RandomDataGenerator.scala
@@ -1,6 +1,6 @@
 package io.github.datacatering.datacaterer.core.generator.provider
 
-import io.github.datacatering.datacaterer.api.model.Constants.{ARRAY_MAXIMUM_LENGTH, ARRAY_MINIMUM_LENGTH, DEFAULT_VALUE, DISTINCT_COUNT, DISTRIBUTION, DISTRIBUTION_EXPONENTIAL, DISTRIBUTION_NORMAL, DISTRIBUTION_RATE_PARAMETER, EXPRESSION, IS_UNIQUE, MAXIMUM, MAXIMUM_LENGTH, MEAN, MINIMUM, MINIMUM_LENGTH, NUMERIC_PRECISION, NUMERIC_SCALE, ROW_COUNT, STANDARD_DEVIATION}
+import io.github.datacatering.datacaterer.api.model.Constants.{ARRAY_MAXIMUM_LENGTH, ARRAY_MINIMUM_LENGTH, DEFAULT_VALUE, DISTINCT_COUNT, DISTRIBUTION, DISTRIBUTION_EXPONENTIAL, DISTRIBUTION_NORMAL, DISTRIBUTION_RATE_PARAMETER, EXPRESSION, IS_UNIQUE, MAXIMUM, MAXIMUM_LENGTH, MEAN, MINIMUM, MINIMUM_LENGTH, NUMERIC_PRECISION, NUMERIC_SCALE, ROUND, ROW_COUNT, STANDARD_DEVIATION}
 import io.github.datacatering.datacaterer.core.exception.UnsupportedDataGeneratorType
 import io.github.datacatering.datacaterer.core.model.Constants._
 import io.github.datacatering.datacaterer.core.util.GeneratorUtil
@@ -386,10 +386,15 @@ object RandomDataGenerator {
       s"$sqlRand * $diff + $min"
     }
 
-    if (!baseFormula.contains(INDEX_INC_COL) && (typeName == "INT" || typeName == "SHORT" || typeName == "LONG")) {
-      s"CAST(ROUND($baseFormula, 0) AS $typeName)"
+    val rounded = if (metadata.contains(ROUND)) {
+      val roundValue = metadata.getString(ROUND)
+      s"ROUND($baseFormula, $roundValue)"
+    } else baseFormula
+
+    if (!rounded.contains(INDEX_INC_COL) && (typeName == "INT" || typeName == "SHORT" || typeName == "LONG")) {
+      s"CAST(ROUND($rounded, 0) AS $typeName)"
     } else {
-      s"CAST($baseFormula AS $typeName)"
+      s"CAST($rounded AS $typeName)"
     }
   }
 
diff --git a/app/src/main/scala/io/github/datacatering/datacaterer/core/generator/track/RecordTrackingProcessor.scala b/app/src/main/scala/io/github/datacatering/datacaterer/core/generator/track/RecordTrackingProcessor.scala
index 619a526..7139f4d 100644
--- a/app/src/main/scala/io/github/datacatering/datacaterer/core/generator/track/RecordTrackingProcessor.scala
+++ b/app/src/main/scala/io/github/datacatering/datacaterer/core/generator/track/RecordTrackingProcessor.scala
@@ -13,7 +13,7 @@ class RecordTrackingProcessor(recordTrackingFolderPath: String) {
 
   def trackRecords(df: DataFrame, dataSourceName: String, planName: String, step: Step): Unit = {
     val subDataSourcePath = getSubDataSourcePath(dataSourceName, planName, step, recordTrackingFolderPath)
-    LOGGER.info(s"Generated record tracking is enabled, data-source-name=$dataSourceName, plan-name=$planName, save-path=$subDataSourcePath")
+    LOGGER.debug(s"Generated record tracking is enabled, data-source-name=$dataSourceName, plan-name=$planName, save-path=$subDataSourcePath")
     if (df.isEmpty || df.schema.isEmpty) {
       LOGGER.debug("Unable to save records for record tracking due to 0 records found or empty schema")
     } else {
diff --git a/app/src/main/scala/io/github/datacatering/datacaterer/core/util/MetadataUtil.scala b/app/src/main/scala/io/github/datacatering/datacaterer/core/util/MetadataUtil.scala
index 6e71fd1..ed17f4c 100644
--- a/app/src/main/scala/io/github/datacatering/datacaterer/core/util/MetadataUtil.scala
+++ b/app/src/main/scala/io/github/datacatering/datacaterer/core/util/MetadataUtil.scala
@@ -111,7 +111,7 @@ object MetadataUtil {
     computeColumnStatistics(sourceData, dataSourceReadOptions, dataSourceMetadata.name, dataSourceMetadata.format)
     val columnLevelStatistics = sparkSession.sharedState.cacheManager.lookupCachedData(sourceData).get.cachedRepresentation.stats
     val rowCount = columnLevelStatistics.rowCount.getOrElse(BigInt(0))
-    LOGGER.info(s"Computed metadata statistics for data source, name=${dataSourceMetadata.name}, format=$dataSourceFormat, " +
+    LOGGER.debug(s"Computed metadata statistics for data source, name=${dataSourceMetadata.name}, format=$dataSourceFormat, " +
       s"details=${ConfigUtil.cleanseOptions(dataSourceReadOptions)}, rows-analysed=$rowCount, size-in-bytes=${columnLevelStatistics.sizeInBytes}, " +
       s"num-columns-analysed=${columnLevelStatistics.attributeStats.size}")
 
diff --git a/app/src/test/scala/io/github/datacatering/datacaterer/core/generator/provider/RandomDataGeneratorTest.scala b/app/src/test/scala/io/github/datacatering/datacaterer/core/generator/provider/RandomDataGeneratorTest.scala
index d988ffd..3a166cc 100644
--- a/app/src/test/scala/io/github/datacatering/datacaterer/core/generator/provider/RandomDataGeneratorTest.scala
+++ b/app/src/test/scala/io/github/datacatering/datacaterer/core/generator/provider/RandomDataGeneratorTest.scala
@@ -1,6 +1,6 @@
 package io.github.datacatering.datacaterer.core.generator.provider
 
-import io.github.datacatering.datacaterer.api.model.Constants.{ARRAY_MINIMUM_LENGTH, DISTINCT_COUNT, DISTRIBUTION, DISTRIBUTION_EXPONENTIAL, DISTRIBUTION_NORMAL, DISTRIBUTION_RATE_PARAMETER, ENABLED_EDGE_CASE, ENABLED_NULL, EXPRESSION, IS_UNIQUE, MAXIMUM, MAXIMUM_LENGTH, MEAN, MINIMUM, MINIMUM_LENGTH, PROBABILITY_OF_EDGE_CASE, PROBABILITY_OF_NULL, ROW_COUNT, STANDARD_DEVIATION}
+import io.github.datacatering.datacaterer.api.model.Constants.{ARRAY_MINIMUM_LENGTH, DISTINCT_COUNT, DISTRIBUTION, DISTRIBUTION_EXPONENTIAL, DISTRIBUTION_NORMAL, DISTRIBUTION_RATE_PARAMETER, ENABLED_EDGE_CASE, ENABLED_NULL, EXPRESSION, IS_UNIQUE, MAXIMUM, MAXIMUM_LENGTH, MEAN, MINIMUM, MINIMUM_LENGTH, PROBABILITY_OF_EDGE_CASE, PROBABILITY_OF_NULL, ROUND, ROW_COUNT, STANDARD_DEVIATION}
 import io.github.datacatering.datacaterer.core.generator.provider.RandomDataGenerator._
 import io.github.datacatering.datacaterer.core.model.Constants.INDEX_INC_COL
 import org.apache.spark.sql.types._
@@ -189,6 +189,17 @@ class RandomDataGeneratorTest extends AnyFunSuite {
     assert(doubleGenerator.generateSqlExpression == "CAST(RAND() * 5.0 + 5.0 AS DOUBLE)")
   }
 
+  test("Can create random double generator with custom min, max and rounding") {
+    val metadata = new MetadataBuilder().putString(MAXIMUM, "10.0").putString(MINIMUM, "5.0").putString(ROUND, "2").build()
+    val doubleGenerator = new RandomDoubleDataGenerator(StructField("random_double", DoubleType, false, metadata))
+    val sampleData = doubleGenerator.generate
+
+    assert(doubleGenerator.edgeCases.nonEmpty)
+    assert(sampleData >= 5.0)
+    assert(sampleData <= 10.0)
+    assert(doubleGenerator.generateSqlExpression == "CAST(ROUND(RAND() * 5.0 + 5.0, 2) AS DOUBLE)")
+  }
+
   test("Can create random float generator") {
     val floatGenerator = new RandomFloatDataGenerator(StructField("random_float", FloatType, false))
     val sampleData = floatGenerator.generate
diff --git a/app/src/test/scala/io/github/datacatering/datacaterer/core/plan/PlanProcessorTest.scala b/app/src/test/scala/io/github/datacatering/datacaterer/core/plan/PlanProcessorTest.scala
index add1998..e46f2a5 100644
--- a/app/src/test/scala/io/github/datacatering/datacaterer/core/plan/PlanProcessorTest.scala
+++ b/app/src/test/scala/io/github/datacatering/datacaterer/core/plan/PlanProcessorTest.scala
@@ -24,7 +24,7 @@ class PlanProcessorTest extends SparkSuite {
       .schema(
         field.name("account_id").regex("ACC[0-9]{8}"),
         field.name("year").`type`(IntegerType).sql("YEAR(date)"),
-        field.name("balance").`type`(DoubleType).min(10).max(1000),
+        field.name("balance").`type`(DoubleType).min(10).max(1000).round(2),
         field.name("date").`type`(DateType).min(Date.valueOf("2022-01-01")),
         field.name("status").oneOf(accountStatus: _*),
         field.name("update_history")
diff --git a/gradle.properties b/gradle.properties
index 18550c9..44b1247 100644
--- a/gradle.properties
+++ b/gradle.properties
@@ -1,5 +1,5 @@
 groupId=io.github.data-catering
-version=0.12.2
+version=0.12.3
 scalaVersion=2.12
 scalaSpecificVersion=2.12.19