-
Notifications
You must be signed in to change notification settings - Fork 14.2k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[sqllab] assign types for visualize flow #2458
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -10,6 +10,9 @@ | |
from __future__ import print_function | ||
from __future__ import unicode_literals | ||
|
||
from datetime import datetime, date | ||
from past.builtins import basestring | ||
|
||
import pandas as pd | ||
import numpy as np | ||
|
||
|
@@ -19,6 +22,21 @@ | |
|
||
|
||
class SupersetDataFrame(object):
    """Wrapper around a pandas DataFrame that exposes visualization metadata."""

    # Mapping numpy dtype.char to generic database types.
    # Entries mapped to None have no classic database equivalent
    # (per PR discussion: drivers are unlikely to ever return them).
    type_map = {
        'b': 'BOOL',      # boolean
        'i': 'INT',       # (signed) integer
        'u': 'INT',       # unsigned integer
        'f': 'FLOAT',     # floating-point
        'c': 'FLOAT',     # complex-floating point
        'm': None,        # timedelta
        'M': 'DATETIME',  # datetime
        'O': 'OBJECT',    # (Python) objects
        'S': 'BYTE',      # (byte-)string
        'U': 'STRING',    # Unicode
        'V': None,        # raw data (void)
    }
|
||
def __init__(self, df):
    """Wrap *df*, normalizing missing values.

    :param df: the pandas DataFrame to wrap
    """
    # Replace NaN/NaT with None — presumably so missing values serialize
    # as null downstream (TODO confirm against callers).
    self.__df = df.where((pd.notnull(df)), None)
|
||
|
@@ -30,6 +48,47 @@ def size(self): | |
def data(self): | ||
return self.__df.to_dict(orient='records') | ||
|
||
@classmethod
def db_type(cls, dtype):
    """Given a numpy dtype, return the matching generic database type.

    Returns None for dtype kinds with no database equivalent.
    """
    # NOTE(review): naming could better distinguish the numpy dtype from
    # the resulting db type (col_numpy_type vs col_db_type).
    kind_code = dtype.char
    return cls.type_map.get(kind_code)
|
||
@classmethod
def datetime_conversion_rate(cls, data_series):
    """Return the percentage (0-100) of values parseable as datetimes.

    :param data_series: iterable of candidate datetime values
    :returns: numeric percentage; 0 for an empty series
    """
    success = 0
    total = 0
    for value in data_series:
        total += 1
        try:
            pd.to_datetime(value)
            success += 1
        except Exception:
            continue
    # Guard against ZeroDivisionError on an empty series.
    if not total:
        return 0
    return 100 * success / total
|
||
@classmethod
def is_date(cls, dtype):
    """Return True if the numpy *dtype* is a datetime type, else False.

    dtype.name is e.g. 'datetime64[ns]' for datetime columns.
    """
    # Always return a bool; the previous truthiness guard could fall
    # through and implicitly return None.
    return bool(dtype.name) and dtype.name.startswith('datetime')
|
||
@classmethod
def is_dimension(cls, dtype, column_name):
    """Return True when the column can serve as a groupable dimension.

    Identifier-looking columns are never dimensions; otherwise object
    and bool dtypes qualify.
    """
    return (not cls.is_id(column_name)) and dtype.name in ('object', 'bool')
|
||
@classmethod
def is_id(cls, column_name):
    """Heuristic: names starting or ending with 'id' look like identifiers."""
    marker = 'id'
    return column_name.startswith(marker) or column_name.endswith(marker)
|
||
@classmethod
def agg_func(cls, dtype, column_name):
    """Suggest a default aggregation for the column, or None.

    Identifier columns get 'count_distinct'; numeric columns get 'sum'.
    """
    # consider checking for key substring too.
    if cls.is_id(column_name):
        return 'count_distinct'
    return 'sum' if np.issubdtype(dtype, np.number) else None
|
||
@property | ||
def columns(self): | ||
"""Provides metadata about columns for data visualization. | ||
|
@@ -45,22 +104,29 @@ def columns(self): | |
if sample_size: | ||
sample = self.__df.sample(sample_size) | ||
for col in self.__df.dtypes.keys(): | ||
db_type = self.db_type(self.__df.dtypes[col]) | ||
column = { | ||
'name': col, | ||
'type': self.__df.dtypes[col].name, | ||
'is_date': is_date(self.__df.dtypes[col]), | ||
'is_dim': is_dimension(self.__df.dtypes[col], col), | ||
'agg': self.agg_func(self.__df.dtypes[col], col), | ||
'type': db_type, | ||
'is_date': self.is_date(self.__df.dtypes[col]), | ||
'is_dim': self.is_dimension(self.__df.dtypes[col], col), | ||
} | ||
agg = agg_func(self.__df.dtypes[col], col) | ||
if agg_func: | ||
column['agg'] = agg | ||
|
||
if column['type'] == 'object': | ||
if column['type'] in ('OBJECT', None): | ||
v = sample[col][0] | ||
if isinstance(v, basestring): | ||
column['type'] = 'STRING' | ||
elif isinstance(v, int): | ||
column['type'] = 'INT' | ||
elif isinstance(v, float): | ||
column['type'] = 'FLOAT' | ||
elif isinstance(v, (datetime, date)): | ||
column['type'] = 'DATETIME' | ||
# check if encoded datetime | ||
if (datetime_conversion_rate(sample[col]) > | ||
if (self.datetime_conversion_rate(sample[col]) > | ||
INFER_COL_TYPES_THRESHOLD): | ||
column.update({ | ||
'type': 'datetime_string', | ||
'is_date': True, | ||
'is_dim': False, | ||
'agg': None | ||
|
@@ -70,42 +136,3 @@ def columns(self): | |
column.pop('agg', None) | ||
columns.append(column) | ||
return columns | ||
|
||
|
||
# It will give false positives on the numbers that are stored as strings.
# It is hard to distinguish integer numbers and timestamps
def datetime_conversion_rate(data_series):
    """Return the percentage (0-100) of values parseable as datetimes.

    :param data_series: iterable of candidate datetime values
    :returns: numeric percentage; 0 for an empty series
    """
    success = 0
    total = 0
    for value in data_series:
        total += 1
        try:
            pd.to_datetime(value)
            success += 1
        except Exception:
            continue
    # Guard against ZeroDivisionError on an empty series.
    if not total:
        return 0
    return 100 * success / total
|
||
|
||
def is_date(dtype):
    """Return True if the numpy *dtype* is a datetime type, else False.

    dtype.name is e.g. 'datetime64[ns]' for datetime columns.
    """
    # Always return a bool; the previous truthiness guard could fall
    # through and implicitly return None.
    return bool(dtype.name) and dtype.name.startswith('datetime')
|
||
|
||
def is_dimension(dtype, column_name):
    """Return True when the column can serve as a groupable dimension.

    Identifier-looking columns are never dimensions; otherwise object
    and bool dtypes qualify.
    """
    return (not is_id(column_name)) and dtype.name in ('object', 'bool')
|
||
|
||
def is_id(column_name):
    """Heuristic: names starting or ending with 'id' look like identifiers."""
    marker = 'id'
    return column_name.startswith(marker) or column_name.endswith(marker)
|
||
|
||
def agg_func(dtype, column_name):
    """Suggest a default aggregation for the column, or None.

    Identifier columns get 'count_distinct'; numeric columns get 'sum'.
    """
    # consider checking for key substring too.
    if is_id(column_name):
        return 'count_distinct'
    return 'sum' if np.issubdtype(dtype, np.number) else None
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
why is the value here
None
but the comment says timedelta?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I wanted to cover the entire scope of numpy types to be explicit, but it really doesn't map to a classic database type. We could handle / convert those in the future but most likely database drivers will never return that type (though they are supported in numpy...)