Skip to content

Commit

Permalink
datetime format and database expression on column level (#652)
Browse files Browse the repository at this point in the history
* time format minor features added

* add description for datetime format input

* db version bug workaround

* removed unnecessary comments and fixed minor bug

* fixed code style

* minor fix

* fixed missing time format column in DruidDatasource

* Update models.py

Minor style fix

* Revert "Update models.py"

This reverts commit 6897c38.

* removed timestamp_format from druid and removed try catch in migration

* Using spaces, not tabs

* get the most updated migration and add the migration on the head of it

* remove vscode setting file

* use column based dttm_format

* modify dttm_converter

* modify datetime viz

* added comments and documents

* fixed some description and removed unnecessary import

* fix migration head

* minor style

* minor style

* deleted empty lines

* delete print statement

* add epoch converter

* error fixed

* fixed epoch parsing issue

* delete unnecessary lines

* fixed typo

* fix minor error

* fix styling issues

* fix styling error

* fixed typo

* support epoch_ms and did some refactoring

* fixed styling error

* fixed styling error

* add one more dataset to test dttm_format and db_expr

* add more slices

* styling

* specified String() length
  • Loading branch information
yxjames authored and mistercrunch committed Jun 28, 2016
1 parent 3e742c7 commit 7a7f61a
Show file tree
Hide file tree
Showing 7 changed files with 203 additions and 28 deletions.
3 changes: 3 additions & 0 deletions caravel/bin/caravel
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,9 @@ def load_examples(load_test_data):
print("Loading [Random long/lat data]")
data.load_long_lat_data()

print("Loading [Multiformat time series]")
data.load_multiformat_time_series_data()

if load_test_data:
print("Loading [Unicode test data]")
data.load_unicode_test_data()
Expand Down
83 changes: 82 additions & 1 deletion caravel/data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
import random

import pandas as pd
from sqlalchemy import String, DateTime, Date, Float
from sqlalchemy import String, DateTime, Date, Float, BigInteger

from caravel import app, db, models, utils

Expand Down Expand Up @@ -1020,3 +1020,84 @@ def load_long_lat_data():
params=get_slice_json(slice_data),
)
merge_slice(slc)


def load_multiformat_time_series_data():
    """Load the multi-format time series example table and its slices.

    Reads ``multiformat_time_series.json.gz`` from ``DATA_FOLDER``, writes it
    to the ``multiformat_time_series`` table (replacing any existing one),
    registers a table reference with per-column datetime settings, and
    creates one calendar-heatmap slice per table column.
    """
    data_path = os.path.join(DATA_FOLDER, 'multiformat_time_series.json.gz')
    with gzip.open(data_path) as f:
        pdf = pd.read_json(f)
    # Both date columns are converted from epoch seconds.
    pdf.ds = pd.to_datetime(pdf.ds, unit='s')
    pdf.ds2 = pd.to_datetime(pdf.ds2, unit='s')
    pdf.to_sql(
        'multiformat_time_series',
        db.engine,
        if_exists='replace',
        chunksize=500,
        dtype={
            "ds": Date,
            'ds2': DateTime,
            "epoch_s": BigInteger,
            "epoch_ms": BigInteger,
            "string0": String(100),
            "string1": String(100),
            "string2": String(100),
            "string3": String(100),
        },
        index=False)
    print("Done loading table!")
    print("-" * 80)
    print("Creating table [multiformat_time_series] reference")
    obj = db.session.query(TBL).filter_by(
        table_name='multiformat_time_series').first()
    if not obj:
        obj = TBL(table_name='multiformat_time_series')
    obj.main_dttm_col = 'ds'
    obj.database = get_or_create_db(db.session)
    obj.is_featured = False
    # column name -> [python_date_format, database_expression]
    dttm_and_expr_dict = {
        'ds': [None, None],
        'ds2': [None, None],
        'epoch_s': ['epoch_s', None],
        'epoch_ms': ['epoch_ms', None],
        'string2': ['%Y%m%d-%H%M%S', None],
        'string1': ['%Y-%m-%d^%H:%M:%S', None],
        'string0': ['%Y-%m-%d %H:%M:%S.%f', None],
        'string3': ['%Y/%m/%d%H:%M:%S.%f', None],
    }
    # NOTE(review): the columns are configured before fetch_metadata() runs;
    # for a freshly created table reference this loop may see no columns
    # until a later run -- confirm against fetch_metadata().
    for col in obj.table_columns:
        print(col.column_name)
        python_date_format, database_expression = (
            dttm_and_expr_dict[col.column_name])
        col.python_date_format = python_date_format
        # Previously assigned to the misspelled ``dbatabase_expr``, which is
        # not the model's ``database_expression`` column.
        col.database_expression = database_expression
    db.session.merge(obj)
    db.session.commit()
    obj.fetch_metadata()
    tbl = obj

    print("Creating some slices")
    for i, col in enumerate(tbl.table_columns):
        slice_data = {
            "granularity_sqla": col.column_name,
            # NOTE(review): hard-coded id; presumably should follow the
            # table's real id -- verify how slice params are consumed.
            "datasource_id": "8",
            "datasource_name": "multiformat_time_series",
            "datasource_type": "table",
            "granularity": "day",
            "row_limit": config.get("ROW_LIMIT"),
            "since": "1 year ago",
            "until": "now",
            "where": "",
            "viz_type": "cal_heatmap",
            "domain_granularity": "month",
            "subdomain_granularity": "day",
        }

        slc = Slice(
            slice_name="Calendar Heatmap multiformat" + str(i),
            viz_type='cal_heatmap',
            datasource_type='table',
            table=tbl,
            params=get_slice_json(slice_data),
        )
        merge_slice(slc)
Binary file added caravel/data/multiformat_time_series.json.gz
Binary file not shown.
24 changes: 24 additions & 0 deletions caravel/migrations/versions/960c69cb1f5b_.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
"""add dttm_format related fields in table_columns
Revision ID: 960c69cb1f5b
Revises: d8bc074f7aad
Create Date: 2016-06-16 14:15:19.573183
"""

# revision identifiers, used by Alembic.
revision = '960c69cb1f5b'
down_revision = 'd8bc074f7aad'

from alembic import op
import sqlalchemy as sa


def upgrade():
    """Add the nullable datetime-format columns to ``table_columns``."""
    for column_name in ('python_date_format', 'database_expression'):
        new_column = sa.Column(
            column_name, sa.String(length=255), nullable=True)
        op.add_column('table_columns', new_column)


def downgrade():
    """Drop the datetime-format columns added by ``upgrade``."""
    for column_name in ('python_date_format', 'database_expression'):
        op.drop_column('table_columns', column_name)
75 changes: 51 additions & 24 deletions caravel/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -445,24 +445,6 @@ def grains(self):
if self.sqlalchemy_uri.startswith(db_type):
return grains

def dttm_converter(self, dttm):
    """Return ``dttm`` as a SQL date literal for this database flavor.

    The flavor is picked from the prefix of ``self.sqlalchemy_uri``:
    ``mssql`` and ``oracle`` get dedicated conversion expressions built
    from the ISO representation; every other URI receives the timestamp
    formatted as ``%Y-%m-%d %H:%M:%S.%f`` wrapped in single quotes.
    """
    quoted = "'{}'".format(dttm.strftime('%Y-%m-%d %H:%M:%S.%f'))
    iso_text = dttm.isoformat()
    uri = self.sqlalchemy_uri
    if uri.startswith('mssql'):
        # untested
        return "CONVERT(DATETIME, '{}', 126)".format(iso_text)
    if uri.startswith('oracle'):
        oracle_template = (
            """TO_TIMESTAMP('{}', 'YYYY-MM-DD"T"HH24:MI:SS.ff6')""")
        return oracle_template.format(iso_text)
    # mysql, presto, sqlite and unrecognized backends share the plain literal.
    return quoted

def grains_dict(self):
    """Return the grains from ``self.grains()`` keyed by their ``name``."""
    by_name = {}
    for grain in self.grains():
        by_name[grain.name] = grain
    return by_name

Expand Down Expand Up @@ -525,6 +507,7 @@ class SqlaTable(Model, Queryable, AuditMixinNullable):
offset = Column(Integer, default=0)
cache_timeout = Column(Integer)
schema = Column(String(255))
table_columns = relationship("TableColumn", back_populates="table")

baselink = "tablemodelview"

Expand Down Expand Up @@ -607,6 +590,12 @@ def sql_url(self):
def sql_link(self):
return '<a href="{}">SQL</a>'.format(self.sql_url)

def get_col(self, col_name):
    """Return the first table column named ``col_name``, or None if absent."""
    matching = (
        candidate for candidate in self.table_columns
        if col_name == candidate.column_name
    )
    return next(matching, None)

def query( # sqla
self, groupby, metrics,
granularity,
Expand Down Expand Up @@ -661,7 +650,8 @@ def query( # sqla
metrics_exprs = []

if granularity:
dttm_expr = cols[granularity].sqla_col.label('timestamp')
dttm_col = cols[granularity]
dttm_expr = dttm_col.sqla_col.label('timestamp')
timestamp = dttm_expr

# Transforming time grain into an expression based on configuration
Expand All @@ -677,18 +667,20 @@ def query( # sqla
select_exprs += [timestamp_grain]
groupby_exprs += [timestamp_grain]

tf = '%Y-%m-%d %H:%M:%S.%f'
outer_from = text(dttm_col.dttm_sql_literal(from_dttm))
outer_to = text(dttm_col.dttm_sql_literal(to_dttm))

time_filter = [
timestamp >= text(self.database.dttm_converter(from_dttm)),
timestamp <= text(self.database.dttm_converter(to_dttm)),
timestamp >= outer_from,
timestamp <= outer_to,
]
inner_time_filter = copy(time_filter)
if inner_from_dttm:
inner_time_filter[0] = timestamp >= text(
self.database.dttm_converter(inner_from_dttm))
dttm_col.dttm_sql_literal(inner_from_dttm))
if inner_to_dttm:
inner_time_filter[1] = timestamp <= text(
self.database.dttm_converter(inner_to_dttm))
dttm_col.dttm_sql_literal(inner_to_dttm))
else:
inner_time_filter = []

Expand Down Expand Up @@ -909,6 +901,8 @@ class TableColumn(Model, AuditMixinNullable):
filterable = Column(Boolean, default=False)
expression = Column(Text, default='')
description = Column(Text, default='')
python_date_format = Column(String(255))
database_expression = Column(String(255))

num_types = ('DOUBLE', 'FLOAT', 'INT', 'BIGINT', 'LONG')
date_types = ('DATE', 'TIME')
Expand Down Expand Up @@ -938,6 +932,39 @@ def sqla_col(self):
col = literal_column(self.expression).label(name)
return col

def dttm_sql_literal(self, dttm):
    """Render ``dttm`` as text for use in a SQL time filter on this column.

    Resolution order:

    1. If ``database_expression`` is set, it is used as a ``str.format``
       template filled with ``dttm`` formatted as ``%Y-%m-%d %H:%M:%S``.
    2. If ``python_date_format`` is ``epoch_s`` or ``epoch_ms``, the
       seconds or milliseconds elapsed since 1970-01-01 are returned
       as a string.
    3. Otherwise the literal depends on the prefix of the table's database
       ``sqlalchemy_uri``: ``mssql`` and ``oracle`` get dedicated
       conversion expressions built from the ISO representation; any
       other backend receives ``dttm`` formatted with
       ``python_date_format`` (default ``%Y-%m-%d %H:%M:%S.%f``) wrapped
       in single quotes.
    """
    fmt = self.python_date_format or '%Y-%m-%d %H:%M:%S.%f'
    if self.database_expression:
        stamp = dttm.strftime('%Y-%m-%d %H:%M:%S')
        return self.database_expression.format(stamp)

    if fmt in ('epoch_s', 'epoch_ms'):
        seconds = (dttm - datetime(1970, 1, 1)).total_seconds()
        return str(seconds if fmt == 'epoch_s' else seconds * 1000.0)

    quoted = "'{}'".format(dttm.strftime(fmt))
    iso_text = dttm.isoformat()
    uri = self.table.database.sqlalchemy_uri
    if uri.startswith('mssql'):
        # untested
        return "CONVERT(DATETIME, '{}', 126)".format(iso_text)
    if uri.startswith('oracle'):
        oracle_template = (
            """TO_TIMESTAMP('{}', 'YYYY-MM-DD"T"HH24:MI:SS.ff6')""")
        return oracle_template.format(iso_text)
    # mysql, presto, sqlite and unrecognized backends share the plain literal.
    return quoted


class DruidCluster(Model, AuditMixinNullable):

Expand Down
25 changes: 23 additions & 2 deletions caravel/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,7 @@ class TableColumnInlineView(CompactCRUDMixin, CaravelModelView): # noqa
edit_columns = [
'column_name', 'verbose_name', 'description', 'groupby', 'filterable',
'table', 'count_distinct', 'sum', 'min', 'max', 'expression',
'is_dttm', ]
'is_dttm', 'python_date_format', 'database_expression']
add_columns = edit_columns
list_columns = [
'column_name', 'type', 'groupby', 'filterable', 'count_distinct',
Expand All @@ -201,6 +201,24 @@ class TableColumnInlineView(CompactCRUDMixin, CaravelModelView): # noqa
'expression': utils.markdown(
"a valid SQL expression as supported by the underlying backend. "
"Example: `substr(name, 1, 1)`", True),
'python_date_format': utils.markdown(Markup(
"The pattern of timestamp format, use "
"<a href='https://docs.python.org/2/library/"
"datetime.html#strftime-strptime-behavior'>"
"python datetime string pattern</a> "
"expression. If time is stored in epoch "
"format, put `epoch_s` or `epoch_ms`. Leave `Database Expression` "
"below empty if timestamp is stored in "
"String or Integer(epoch) type"), True),
'database_expression': utils.markdown(
"The database expression to cast internal datetime "
"constants to database date/timestamp type according to the DBAPI. "
"The expression should follow the pattern of "
"%Y-%m-%d %H:%M:%S, based on different DBAPI. "
"The string should be a python string formatter \n"
"`Ex: TO_DATE('{}', 'YYYY-MM-DD HH24:MI:SS')` for Oracle"
"Caravel uses default expression based on DB URI if this "
"field is blank.", True),
}
label_columns = {
'column_name': _("Column"),
Expand All @@ -215,6 +233,8 @@ class TableColumnInlineView(CompactCRUDMixin, CaravelModelView): # noqa
'max': _("Max"),
'expression': _("Expression"),
'is_dttm': _("Is temporal"),
'python_date_format': _("Datetime Format"),
'database_expression': _("Database Expression")
}
appbuilder.add_view_no_menu(TableColumnInlineView)

Expand Down Expand Up @@ -388,7 +408,8 @@ class TableModelView(CaravelModelView, DeleteMixin): # noqa
'table_name', 'database', 'schema',
'default_endpoint', 'offset', 'cache_timeout']
edit_columns = [
'table_name', 'is_featured', 'database', 'schema', 'description', 'owner',
'table_name', 'is_featured', 'database', 'schema',
'description', 'owner',
'main_dttm_col', 'default_endpoint', 'offset', 'cache_timeout']
related_views = [TableColumnInlineView, SqlMetricInlineView]
base_order = ('changed_on', 'desc')
Expand Down
21 changes: 20 additions & 1 deletion caravel/viz.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,15 +146,34 @@ def get_df(self, query_obj=None):
self.error_msg = ""
self.results = None

timestamp_format = None
if self.datasource.type == 'table':
dttm_col = self.datasource.get_col(query_obj['granularity'])
if dttm_col:
timestamp_format = dttm_col.python_date_format

# The datasource here can be different backend but the interface is common
self.results = self.datasource.query(**query_obj)
self.query = self.results.query
df = self.results.df
# Transform the timestamp we received from database to pandas supported
# datetime format. If no python_date_format is specified, the pattern will
# be considered as the default ISO date format
# If the datetime format is unix, the parse will use the corresponding
# parsing logic.
if df is None or df.empty:
raise Exception("No data, review your incantations!")
else:
if 'timestamp' in df.columns:
df.timestamp = pd.to_datetime(df.timestamp, utc=False)
if timestamp_format == "epoch_s":
df.timestamp = pd.to_datetime(
df.timestamp, utc=False, unit="s")
elif timestamp_format == "epoch_ms":
df.timestamp = pd.to_datetime(
df.timestamp, utc=False, unit="ms")
else:
df.timestamp = pd.to_datetime(
df.timestamp, utc=False, format=timestamp_format)
if self.datasource.offset:
df.timestamp += timedelta(hours=self.datasource.offset)
df.replace([np.inf, -np.inf], np.nan)
Expand Down

0 comments on commit 7a7f61a

Please sign in to comment.