Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[sqllab] assign types for visualize flow #2458

Merged
merged 5 commits into from
Mar 24, 2017
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ def get_git_sha():
'flask-script==2.0.5',
'flask-sqlalchemy==2.0',
'flask-testing==0.6.1',
'future>=0.16.0, <0.17',
'humanize==0.5.1',
'gunicorn==19.6.0',
'markdown==2.6.8',
Expand Down
123 changes: 75 additions & 48 deletions superset/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@
from __future__ import print_function
from __future__ import unicode_literals

from datetime import datetime, date
from past.builtins import basestring

import pandas as pd
import numpy as np

Expand All @@ -19,6 +22,21 @@


class SupersetDataFrame(object):
# Mapping numpy dtype.char to generic database types
type_map = {
'b': 'BOOL', # boolean
'i': 'INT', # (signed) integer
'u': 'INT', # unsigned integer
'f': 'FLOAT', # floating-point
'c': 'FLOAT', # complex-floating point
'm': None, # timedelta
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why is the value here None but the comment says timedelta?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wanted to cover the entire scope of numpy types to be explicit, but it really doesn't map to a classic database type. We could handle / convert those in the future but most likely database drivers will never return that type (though they are supported in numpy...)

'M': 'DATETIME', # datetime
'O': 'OBJECT', # (Python) objects
'S': 'BYTE', # (byte-)string
'U': 'STRING', # Unicode
'V': None, # raw data (void)
}

def __init__(self, df):
self.__df = df.where((pd.notnull(df)), None)

Expand All @@ -30,6 +48,47 @@ def size(self):
def data(self):
return self.__df.to_dict(orient='records')

@classmethod
def db_type(cls, dtype):
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

shouldn't this be col_type rather than db_type here?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes to be explicit I should differentiate between col_numpy_type and col_db_type, which I'm converting here...

"""Given a numpy dtype, Returns a generic database type"""
return cls.type_map.get(dtype.char)

@classmethod
def datetime_conversion_rate(cls, data_series):
success = 0
total = 0
for value in data_series:
total += 1
try:
pd.to_datetime(value)
success += 1
except Exception:
continue
return 100 * success / total

@classmethod
def is_date(cls, dtype):
if dtype.name:
return dtype.name.startswith('datetime')

@classmethod
def is_dimension(cls, dtype, column_name):
if cls.is_id(column_name):
return False
return dtype.name in ('object', 'bool')

@classmethod
def is_id(cls, column_name):
return column_name.startswith('id') or column_name.endswith('id')

@classmethod
def agg_func(cls, dtype, column_name):
# consider checking for key substring too.
if cls.is_id(column_name):
return 'count_distinct'
if np.issubdtype(dtype, np.number):
return 'sum'

@property
def columns(self):
"""Provides metadata about columns for data visualization.
Expand All @@ -45,22 +104,29 @@ def columns(self):
if sample_size:
sample = self.__df.sample(sample_size)
for col in self.__df.dtypes.keys():
db_type = self.db_type(self.__df.dtypes[col])
column = {
'name': col,
'type': self.__df.dtypes[col].name,
'is_date': is_date(self.__df.dtypes[col]),
'is_dim': is_dimension(self.__df.dtypes[col], col),
'agg': self.agg_func(self.__df.dtypes[col], col),
'type': db_type,
'is_date': self.is_date(self.__df.dtypes[col]),
'is_dim': self.is_dimension(self.__df.dtypes[col], col),
}
agg = agg_func(self.__df.dtypes[col], col)
if agg_func:
column['agg'] = agg

if column['type'] == 'object':
if column['type'] in ('OBJECT', None):
v = sample[col][0]
if isinstance(v, basestring):
column['type'] = 'STRING'
elif isinstance(v, int):
column['type'] = 'INT'
elif isinstance(v, float):
column['type'] = 'FLOAT'
elif isinstance(v, (datetime, date)):
column['type'] = 'DATETIME'
# check if encoded datetime
if (datetime_conversion_rate(sample[col]) >
if (self.datetime_conversion_rate(sample[col]) >
INFER_COL_TYPES_THRESHOLD):
column.update({
'type': 'datetime_string',
'is_date': True,
'is_dim': False,
'agg': None
Expand All @@ -70,42 +136,3 @@ def columns(self):
column.pop('agg', None)
columns.append(column)
return columns


# It will give false positives on the numbers that are stored as strings.
# It is hard to distinguish integer numbers and timestamps
def datetime_conversion_rate(data_series):
success = 0
total = 0
for value in data_series:
total += 1
try:
pd.to_datetime(value)
success += 1
except Exception:
continue
return 100 * success / total


def is_date(dtype):
if dtype.name:
return dtype.name.startswith('datetime')


def is_dimension(dtype, column_name):
if is_id(column_name):
return False
return dtype.name in ('object', 'bool')


def is_id(column_name):
return column_name.startswith('id') or column_name.endswith('id')


def agg_func(dtype, column_name):
# consider checking for key substring too.
if is_id(column_name):
return 'count_distinct'
if np.issubdtype(dtype, np.number):
return 'sum'
return None
1 change: 1 addition & 0 deletions superset/views/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -1784,6 +1784,7 @@ def sqllab_viz(self):
filterable=is_dim,
groupby=is_dim,
is_dttm=config.get('is_date', False),
type=config.get('type', False),
)
cols.append(col)
if is_dim:
Expand Down