-
Notifications
You must be signed in to change notification settings - Fork 14.2k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[sqllab] assign types for visualize flow #2458
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -10,6 +10,9 @@ | |
from __future__ import print_function | ||
from __future__ import unicode_literals | ||
|
||
from datetime import datetime, date | ||
from past.builtins import basestring | ||
|
||
import pandas as pd | ||
import numpy as np | ||
|
||
|
@@ -19,6 +22,21 @@ | |
|
||
|
||
class SupersetDataFrame(object):
    """Wrapper around a pandas DataFrame that exposes visualization metadata."""

    # Mapping numpy dtype.char to generic database types.
    # Entries mapped to None have no classic database equivalent
    # (per PR discussion: drivers are unlikely to ever return them).
    type_map = {
        'b': 'BOOL',      # boolean
        'i': 'INT',       # (signed) integer
        'u': 'INT',       # unsigned integer
        'f': 'FLOAT',     # floating-point
        'c': 'FLOAT',     # complex-floating point
        'm': None,        # timedelta
        'M': 'DATETIME',  # datetime
        'O': 'OBJECT',    # (Python) objects
        'S': 'BYTE',      # (byte-)string
        'U': 'STRING',    # Unicode
        'V': None,        # raw data (void)
    }
|
||
def __init__(self, df):
    """Wrap *df*, normalizing missing values.

    :param df: the pandas DataFrame to wrap
    """
    # Replace NaN/NaT with None — presumably so missing values serialize
    # as null downstream (TODO confirm against callers).
    self.__df = df.where((pd.notnull(df)), None)
|
||
|
@@ -30,6 +48,47 @@ def size(self): | |
def data(self): | ||
return self.__df.to_dict(orient='records') | ||
|
||
@classmethod
def db_type(cls, dtype):
    """Given a numpy dtype, return the matching generic database type.

    Returns None for dtype kinds with no database equivalent.
    """
    # NOTE(review): naming could better distinguish the numpy dtype from
    # the resulting db type (col_numpy_type vs col_db_type).
    kind_code = dtype.char
    return cls.type_map.get(kind_code)
|
||
@classmethod
def datetime_conversion_rate(cls, data_series):
    """Return the percentage (0-100) of values parseable as datetimes.

    :param data_series: iterable of candidate datetime values
    :returns: numeric percentage; 0 for an empty series
    """
    success = 0
    total = 0
    for value in data_series:
        total += 1
        try:
            pd.to_datetime(value)
            success += 1
        except Exception:
            continue
    # Guard against ZeroDivisionError on an empty series.
    if not total:
        return 0
    return 100 * success / total
|
||
@classmethod
def is_date(cls, dtype):
    """Return True if the numpy *dtype* is a datetime type, else False.

    dtype.name is e.g. 'datetime64[ns]' for datetime columns.
    """
    # Always return a bool; the previous truthiness guard could fall
    # through and implicitly return None.
    return bool(dtype.name) and dtype.name.startswith('datetime')
|
||
@classmethod
def is_dimension(cls, dtype, column_name):
    """Return True when the column can serve as a groupable dimension.

    Identifier-looking columns are never dimensions; otherwise object
    and bool dtypes qualify.
    """
    return (not cls.is_id(column_name)) and dtype.name in ('object', 'bool')
|
||
@classmethod
def is_id(cls, column_name):
    """Heuristic: names starting or ending with 'id' look like identifiers."""
    marker = 'id'
    return column_name.startswith(marker) or column_name.endswith(marker)
|
||
@classmethod
def agg_func(cls, dtype, column_name):
    """Suggest a default aggregation for the column, or None.

    Identifier columns get 'count_distinct'; numeric columns get 'sum'.
    """
    # consider checking for key substring too.
    if cls.is_id(column_name):
        return 'count_distinct'
    return 'sum' if np.issubdtype(dtype, np.number) else None
|
||
@property | ||
def columns(self): | ||
"""Provides metadata about columns for data visualization. | ||
|
@@ -45,22 +104,29 @@ def columns(self): | |
if sample_size: | ||
sample = self.__df.sample(sample_size) | ||
for col in self.__df.dtypes.keys(): | ||
db_type = self.db_type(self.__df.dtypes[col]) | ||
column = { | ||
'name': col, | ||
'type': self.__df.dtypes[col].name, | ||
'is_date': is_date(self.__df.dtypes[col]), | ||
'is_dim': is_dimension(self.__df.dtypes[col], col), | ||
'agg': self.agg_func(self.__df.dtypes[col], col), | ||
'type': db_type, | ||
'is_date': self.is_date(self.__df.dtypes[col]), | ||
'is_dim': self.is_dimension(self.__df.dtypes[col], col), | ||
} | ||
agg = agg_func(self.__df.dtypes[col], col) | ||
if agg_func: | ||
column['agg'] = agg | ||
|
||
if column['type'] == 'object': | ||
if column['type'] in ('OBJECT', None): | ||
v = sample[col][0] | ||
if isinstance(v, basestring): | ||
column['type'] = 'STRING' | ||
elif isinstance(v, int): | ||
column['type'] = 'INT' | ||
elif isinstance(v, float): | ||
column['type'] = 'FLOAT' | ||
elif isinstance(v, (datetime, date)): | ||
column['type'] = 'DATETIME' | ||
# check if encoded datetime | ||
if (datetime_conversion_rate(sample[col]) > | ||
if (self.datetime_conversion_rate(sample[col]) > | ||
INFER_COL_TYPES_THRESHOLD): | ||
column.update({ | ||
'type': 'datetime_string', | ||
'is_date': True, | ||
'is_dim': False, | ||
'agg': None | ||
|
@@ -70,42 +136,3 @@ def columns(self): | |
column.pop('agg', None) | ||
columns.append(column) | ||
return columns | ||
|
||
|
||
# It will give false positives on the numbers that are stored as strings.
# It is hard to distinguish integer numbers and timestamps
def datetime_conversion_rate(data_series):
    """Return the percentage (0-100) of values parseable as datetimes.

    :param data_series: iterable of candidate datetime values
    :returns: numeric percentage; 0 for an empty series
    """
    success = 0
    total = 0
    for value in data_series:
        total += 1
        try:
            pd.to_datetime(value)
            success += 1
        except Exception:
            continue
    # Guard against ZeroDivisionError on an empty series.
    if not total:
        return 0
    return 100 * success / total
|
||
|
||
def is_date(dtype):
    """Return True if the numpy *dtype* is a datetime type, else False.

    dtype.name is e.g. 'datetime64[ns]' for datetime columns.
    """
    # Always return a bool; the previous truthiness guard could fall
    # through and implicitly return None.
    return bool(dtype.name) and dtype.name.startswith('datetime')
|
||
|
||
def is_dimension(dtype, column_name):
    """Return True when the column can serve as a groupable dimension.

    Identifier-looking columns are never dimensions; otherwise object
    and bool dtypes qualify.
    """
    return (not is_id(column_name)) and dtype.name in ('object', 'bool')
|
||
|
||
def is_id(column_name):
    """Heuristic: names starting or ending with 'id' look like identifiers."""
    marker = 'id'
    return column_name.startswith(marker) or column_name.endswith(marker)
|
||
|
||
def agg_func(dtype, column_name):
    """Suggest a default aggregation for the column, or None.

    Identifier columns get 'count_distinct'; numeric columns get 'sum'.
    """
    # consider checking for key substring too.
    if is_id(column_name):
        return 'count_distinct'
    return 'sum' if np.issubdtype(dtype, np.number) else None
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
why is the value here
None
but the comment says timedelta?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I wanted to cover the entire scope of numpy types to be explicit, but it really doesn't map to a classic database type. We could handle / convert those in the future but most likely database drivers will never return that type (though they are supported in numpy...)