Skip to content

Commit

Permalink
Feature struct changes (#219)
Browse files Browse the repository at this point in the history
* wip

* wip

* added support for inferred types

* added support for inferred types

* wip

* wip

* wip

* updates and fixes to unit tests

* wip

* updated pipfile due to upstream changes in pipenv

* additional tests

* wip

* wip

* wip

* wip

* removed old code
  • Loading branch information
ronanstokes-db committed Feb 17, 2024
1 parent b2e09ca commit 9b0847b
Show file tree
Hide file tree
Showing 7 changed files with 609 additions and 29 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ All notable changes to the Databricks Labs Data Generator will be documented in

#### Changed
* Added formatting of generated code as HTML for script methods
* Allow use of inferred types on `withColumn` method when `expr` attribute is used
* Added `withStructColumn` method to allow simplified generation of struct and JSON columns
* Modified Pipfile to use newer version of package specifications

### Version 0.3.4 Post 3
Expand Down
3 changes: 2 additions & 1 deletion dbldatagen/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@

from .data_generator import DataGenerator
from .datagen_constants import DEFAULT_RANDOM_SEED, RANDOM_SEED_RANDOM, RANDOM_SEED_FIXED, \
RANDOM_SEED_HASH_FIELD_NAME, MIN_PYTHON_VERSION, MIN_SPARK_VERSION
RANDOM_SEED_HASH_FIELD_NAME, MIN_PYTHON_VERSION, MIN_SPARK_VERSION, \
INFER_DATATYPE
from .utils import ensure, topologicalSort, mkBoundsList, coalesce_values, \
deprecated, parse_time_interval, DataGenError, split_list_matching_condition, strip_margins, \
json_value_from_path, system_time_millis
Expand Down
48 changes: 43 additions & 5 deletions dbldatagen/column_generation_spec.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,18 +20,20 @@

from .column_spec_options import ColumnSpecOptions
from .datagen_constants import RANDOM_SEED_FIXED, RANDOM_SEED_HASH_FIELD_NAME, RANDOM_SEED_RANDOM, \
DEFAULT_SEED_COLUMN, OPTION_RANDOM, OPTION_RANDOM_SEED, OPTION_RANDOM_SEED_METHOD
DEFAULT_SEED_COLUMN, OPTION_RANDOM, OPTION_RANDOM_SEED, OPTION_RANDOM_SEED_METHOD, INFER_DATATYPE

from .daterange import DateRange
from .distributions import Normal, DataDistribution
from .nrange import NRange
from .text_generators import TemplateGenerator
from .utils import ensure, coalesce_values
from .schema_parser import SchemaParser

HASH_COMPUTE_METHOD = "hash"
VALUES_COMPUTE_METHOD = "values"
RAW_VALUES_COMPUTE_METHOD = "raw_values"
AUTO_COMPUTE_METHOD = "auto"
EXPR_OPTION = "expr"
COMPUTE_METHOD_VALID_VALUES = [HASH_COMPUTE_METHOD,
AUTO_COMPUTE_METHOD,
VALUES_COMPUTE_METHOD,
Expand Down Expand Up @@ -107,8 +109,18 @@ def __init__(self, name, colType=None, minValue=0, maxValue=None, step=1, prefix
# set up default range and type for column
self._dataRange = NRange(None, None, None) # by default range of values for column is unconstrained

self._inferDataType = False
if colType is None: # default to integer field if none specified
colType = IntegerType()
elif colType == INFER_DATATYPE:
colType = StringType() # default inferred data type to string until exact type is known
self._inferDataType = True

if EXPR_OPTION not in kwargs:
raise ValueError("Column generation spec must have `expr` attribute specified if datatype is inferred")

elif type(colType) == str:
colType = SchemaParser.columnTypeFromString(colType)

assert isinstance(colType, DataType), f"colType `{colType}` is not instance of DataType"

Expand Down Expand Up @@ -399,6 +411,12 @@ def textGenerator(self):
""" Get the text generator for the column spec"""
return self._textGenerator

@property
def inferDatatype(self):
    """ Whether the datatype of this column is inferred rather than declared

    :return: value of the ``_inferDataType`` flag; when True, the column's datatype is taken
             to be the type of the result of computing its SQL expression
    """
    return self._inferDataType

@property
def baseColumns(self):
""" Return base columns as list of strings"""
Expand Down Expand Up @@ -1030,11 +1048,12 @@ def _makeSingleGenerationExpression(self, index=None, use_pandas_optimizations=T
# TODO: add full support for date value generation
if self.expr is not None:
# note use of SQL expression ignores range specifications
new_def = expr(self.expr).astype(self.datatype)

# record execution history
new_def = expr(self.expr)
self.executionHistory.append(f".. using SQL expression `{self.expr}` as base")
self.executionHistory.append(f".. casting to `{self.datatype}`")

if not self._inferDataType:
new_def = new_def.astype(self.datatype)
self.executionHistory.append(f".. casting to `{self.datatype}`")
elif type(self.datatype) in [ArrayType, MapType, StructType] and self.values is None:
new_def = expr("NULL")
elif self._dataRange is not None and self._dataRange.isFullyPopulated():
Expand Down Expand Up @@ -1083,6 +1102,22 @@ def _makeSingleGenerationExpression(self, index=None, use_pandas_optimizations=T
new_def = self._applyComputePercentNullsExpression(new_def, percent_nulls)
return new_def

def _onSelect(self, df):
"""
The _onSelect method is called when the column specifications expression as produced by the
method ``_makeSingleGenerationExpression`` is used in a select statement.
:param df: Dataframe in which expression is used
:return: nothing
.. note:: The purpose of this method is to allow for introspection of information such as datatype
which can only be determined when column specifications expression is used.
"""
if self._inferDataType:
inferred_type = df.schema[self.name].dataType
self.logger.info("Inferred datatype for column %s as %s", self.name, str(inferred_type))
self._csOptions.options['type'] = inferred_type

def _applyTextFormatExpression(self, new_def, sformat):
# note :
# while it seems like this could use a shared instance, this does not work if initialized
Expand Down Expand Up @@ -1141,6 +1176,9 @@ def _applyFinalCastExpression(self, col_type, new_def):
# cast the result to the appropriate type. For dates, cast first to timestamp, then to date
if type(col_type) is DateType:
new_def = new_def.astype(TimestampType()).astype(col_type)
elif self._inferDataType:
# don't apply cast when column has an inferred data type
pass
else:
new_def = new_def.astype(col_type)

Expand Down
Loading

0 comments on commit 9b0847b

Please sign in to comment.