Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature struct changes #219

Merged
merged 30 commits into from
Jul 8, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
aebbb06
wip
ronanstokes-db Mar 26, 2023
5f0ffc0
merge from origin
ronanstokes-db Mar 27, 2023
1eda552
wip
ronanstokes-db Apr 7, 2023
7de014c
Merge branch 'master' of https://github.com/databrickslabs/dbldatagen
ronanstokes-db Apr 7, 2023
c859475
Merge branch 'master' of https://github.com/databrickslabs/dbldatagen
ronanstokes-db Apr 9, 2023
3094e96
Merge branch 'master' of https://github.com/databrickslabs/dbldatagen
ronanstokes-db Apr 13, 2023
3bf6e9b
Merge branch 'master' of https://github.com/databrickslabs/dbldatagen
ronanstokes-db Apr 17, 2023
caaff18
Merge branch 'master' of https://github.com/databrickslabs/dbldatagen
ronanstokes-db Apr 18, 2023
87d5c50
Merge branch 'master' of https://github.com/databrickslabs/dbldatagen
ronanstokes-db Apr 18, 2023
4536794
Merge branch 'master' of https://github.com/databrickslabs/dbldatagen
ronanstokes-db Apr 19, 2023
eba6193
Merge branch 'master' of https://github.com/databrickslabs/dbldatagen
ronanstokes-db Apr 21, 2023
c4fdc3b
wip
ronanstokes-db May 9, 2023
8734b19
Merge branch 'master' of https://github.com/databrickslabs/dbldatagen
ronanstokes-db May 30, 2023
a1b03aa
added support for inferred types
ronanstokes-db Jun 14, 2023
b3cd644
added support for inferred types
ronanstokes-db Jun 14, 2023
be7acca
Merge branch 'master' into feature_struct_changes
ronanstokes-db Jun 15, 2023
61e1044
wip
ronanstokes-db Jun 26, 2023
713e1bb
wip
ronanstokes-db Jun 26, 2023
9762f39
wip
ronanstokes-db Jun 28, 2023
c5b3da6
updates and fixes to unit tests
ronanstokes-db Jun 28, 2023
7a1cfe0
wip
ronanstokes-db Jun 28, 2023
72db0b2
updated pipfile due to upstream changes in pipenv
ronanstokes-db Jun 28, 2023
0ccd114
additional tests
ronanstokes-db Jun 28, 2023
1cb14c8
wip
ronanstokes-db Jun 28, 2023
03c823f
wip
ronanstokes-db Jun 28, 2023
0ee24f9
wip
ronanstokes-db Jun 28, 2023
2a05255
Merge branch 'master' into feature_struct_changes
ronanstokes-db Jun 30, 2023
bb256f6
wip
ronanstokes-db Jul 1, 2023
6376f6b
removed old code
ronanstokes-db Jul 8, 2023
9fcbd29
Merge branch 'feature_struct_changes' of https://github.com/databrick…
ronanstokes-db Jul 8, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ All notable changes to the Databricks Labs Data Generator will be documented in

#### Changed
* Added formatting of generated code as HTML for script methods
* Allow use of inferred types on `withColumn` method when `expr` attribute is used
* Added `withStructColumn` method to allow simplified generation of struct and JSON columns
* Modified pipfile to use newer version of package specifications

### Version 0.3.4 Post 3
Expand Down
3 changes: 2 additions & 1 deletion dbldatagen/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@

from .data_generator import DataGenerator
from .datagen_constants import DEFAULT_RANDOM_SEED, RANDOM_SEED_RANDOM, RANDOM_SEED_FIXED, \
RANDOM_SEED_HASH_FIELD_NAME, MIN_PYTHON_VERSION, MIN_SPARK_VERSION
RANDOM_SEED_HASH_FIELD_NAME, MIN_PYTHON_VERSION, MIN_SPARK_VERSION, \
INFER_DATATYPE
from .utils import ensure, topologicalSort, mkBoundsList, coalesce_values, \
deprecated, parse_time_interval, DataGenError, split_list_matching_condition, strip_margins, \
json_value_from_path, system_time_millis
Expand Down
48 changes: 43 additions & 5 deletions dbldatagen/column_generation_spec.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,18 +20,20 @@

from .column_spec_options import ColumnSpecOptions
from .datagen_constants import RANDOM_SEED_FIXED, RANDOM_SEED_HASH_FIELD_NAME, RANDOM_SEED_RANDOM, \
DEFAULT_SEED_COLUMN, OPTION_RANDOM, OPTION_RANDOM_SEED, OPTION_RANDOM_SEED_METHOD
DEFAULT_SEED_COLUMN, OPTION_RANDOM, OPTION_RANDOM_SEED, OPTION_RANDOM_SEED_METHOD, INFER_DATATYPE

from .daterange import DateRange
from .distributions import Normal, DataDistribution
from .nrange import NRange
from .text_generators import TemplateGenerator
from .utils import ensure, coalesce_values
from .schema_parser import SchemaParser

HASH_COMPUTE_METHOD = "hash"
VALUES_COMPUTE_METHOD = "values"
RAW_VALUES_COMPUTE_METHOD = "raw_values"
AUTO_COMPUTE_METHOD = "auto"
EXPR_OPTION = "expr"
COMPUTE_METHOD_VALID_VALUES = [HASH_COMPUTE_METHOD,
AUTO_COMPUTE_METHOD,
VALUES_COMPUTE_METHOD,
Expand Down Expand Up @@ -107,8 +109,18 @@ def __init__(self, name, colType=None, minValue=0, maxValue=None, step=1, prefix
# set up default range and type for column
self._dataRange = NRange(None, None, None) # by default range of values for column is unconstrained

self._inferDataType = False
if colType is None: # default to integer field if none specified
colType = IntegerType()
elif colType == INFER_DATATYPE:
colType = StringType() # default inferred data type to string until exact type is known
self._inferDataType = True

if EXPR_OPTION not in kwargs:
raise ValueError("Column generation spec must have `expr` attribute specified if datatype is inferred")

elif type(colType) == str:
colType = SchemaParser.columnTypeFromString(colType)

assert isinstance(colType, DataType), f"colType `{colType}` is not instance of DataType"

Expand Down Expand Up @@ -399,6 +411,12 @@ def textGenerator(self):
""" Get the text generator for the column spec"""
return self._textGenerator

@property
def inferDatatype(self):
    """ Flag indicating whether the column datatype is inferred rather than explicitly declared.

    :return: value of the internal ``_inferDataType`` flag. If True, the datatype should be inferred to be
             the result of computing the SQL expression.
    """
    infer_flag = self._inferDataType
    return infer_flag

@property
def baseColumns(self):
""" Return base columns as list of strings"""
Expand Down Expand Up @@ -1030,11 +1048,12 @@ def _makeSingleGenerationExpression(self, index=None, use_pandas_optimizations=T
# TODO: add full support for date value generation
if self.expr is not None:
# note use of SQL expression ignores range specifications
new_def = expr(self.expr).astype(self.datatype)

# record execution history
new_def = expr(self.expr)
self.executionHistory.append(f".. using SQL expression `{self.expr}` as base")
self.executionHistory.append(f".. casting to `{self.datatype}`")

if not self._inferDataType:
new_def = new_def.astype(self.datatype)
self.executionHistory.append(f".. casting to `{self.datatype}`")
elif type(self.datatype) in [ArrayType, MapType, StructType] and self.values is None:
new_def = expr("NULL")
elif self._dataRange is not None and self._dataRange.isFullyPopulated():
Expand Down Expand Up @@ -1083,6 +1102,22 @@ def _makeSingleGenerationExpression(self, index=None, use_pandas_optimizations=T
new_def = self._applyComputePercentNullsExpression(new_def, percent_nulls)
return new_def

def _onSelect(self, df):
"""
The _onSelect method is called when the column specifications expression as produced by the
method ``_makeSingleGenerationExpression`` is used in a select statement.

:param df: Dataframe in which expression is used
:return: nothing

.. note:: The purpose of this method is to allow for introspection of information such as datatype
which can only be determined when column specifications expression is used.
"""
if self._inferDataType:
inferred_type = df.schema[self.name].dataType
self.logger.info("Inferred datatype for column %s as %s", self.name, str(inferred_type))
self._csOptions.options['type'] = inferred_type

def _applyTextFormatExpression(self, new_def, sformat):
# note :
# while it seems like this could use a shared instance, this does not work if initialized
Expand Down Expand Up @@ -1141,6 +1176,9 @@ def _applyFinalCastExpression(self, col_type, new_def):
# cast the result to the appropriate type. For dates, cast first to timestamp, then to date
if type(col_type) is DateType:
new_def = new_def.astype(TimestampType()).astype(col_type)
elif self._inferDataType:
# don't apply cast when column has an inferred data type
pass
ronanstokes-db marked this conversation as resolved.
Show resolved Hide resolved
else:
new_def = new_def.astype(col_type)

Expand Down
Loading