chore: Support Python 3.10 and bump pandas 1.4 and pyarrow 6 #21002

Merged · 13 commits · Aug 17, 2022
2 changes: 1 addition & 1 deletion CONTRIBUTING.md
@@ -420,7 +420,7 @@ Commits to `master` trigger a rebuild and redeploy of the documentation site. Su
Make sure your machine meets the [OS dependencies](https://superset.apache.org/docs/installation/installing-superset-from-scratch#os-dependencies) before following these steps.
You also need to install MySQL or [MariaDB](https://mariadb.com/downloads).

-Ensure that you are using Python version 3.8 or 3.9, then proceed with:
+Ensure that you are using Python version 3.8, 3.9 or 3.10, then proceed with:

```bash
# Create a virtual environment and activate it (recommended)
```
6 changes: 3 additions & 3 deletions Makefile
@@ -15,8 +15,8 @@
# limitations under the License.
#

-# Python version installed; we need 3.8-3.9
-PYTHON=`command -v python3.9 || command -v python3.8`
+# Python version installed; we need 3.8-3.10
+PYTHON=`command -v python3.10 || command -v python3.9 || command -v python3.8`

.PHONY: install superset venv pre-commit

@@ -70,7 +70,7 @@ update-js:

venv:
# Create a virtual environment and activate it (recommended)
-if ! [ -x "${PYTHON}" ]; then echo "You need Python 3.8 or 3.9 installed"; exit 1; fi
+if ! [ -x "${PYTHON}" ]; then echo "You need Python 3.8, 3.9 or 3.10 installed"; exit 1; fi
test -d venv || ${PYTHON} -m venv venv # setup a python3 virtualenv
. venv/bin/activate

1 change: 1 addition & 0 deletions UPDATING.md
@@ -26,6 +26,7 @@ assists people when migrating to a new version.

- [20606](https://github.com/apache/superset/pull/20606): When user clicks on chart title or "Edit chart" button in Dashboard page, Explore opens in the same tab. Clicking while holding cmd/ctrl opens Explore in a new tab. To bring back the old behaviour (always opening Explore in a new tab), flip feature flag `DASHBOARD_EDIT_CHART_IN_NEW_TAB` to `True`.
- [20799](https://github.com/apache/superset/pull/20799): The Presto and Trino engines will now display a tracking URL for running queries in SQL Lab. If you don't want to show the tracking URL (for example, when your data warehouse hasn't enabled access to the Presto or Trino UI), update `TRACKING_URL_TRANSFORMER` in `config.py` to return `None`.
+- [21002](https://github.com/apache/superset/pull/21002): Support Python 3.10 and bump pandas 1.4 and pyarrow 6.

### Breaking Changes

4 changes: 2 additions & 2 deletions requirements/base.txt
@@ -185,7 +185,7 @@ packaging==21.3
# via
# bleach
# deprecation
-pandas==1.3.4
+pandas==1.4.3
# via apache-superset
parsedatetime==2.6
# via apache-superset
@@ -197,7 +197,7 @@ prison==0.2.1
# via flask-appbuilder
prompt-toolkit==3.0.28
# via click-repl
-pyarrow==5.0.0
+pyarrow==6.0.1
# via apache-superset
pycparser==2.20
# via cffi
5 changes: 3 additions & 2 deletions setup.py
@@ -99,15 +99,15 @@ def get_git_sha() -> str:
"markdown>=3.0",
"msgpack>=1.0.0, <1.1",
"numpy==1.22.1",
"pandas>=1.3.0, <1.4",
"pandas>=1.4.3, <1.5",
"parsedatetime",
"pgsanity",
"polyline",
"pyparsing>=3.0.6, <4",
"python-dateutil",
"python-dotenv",
"python-geohash",
"pyarrow>=5.0.0, <6.0",
"pyarrow>=6.0.1, <7",
"pyyaml>=5.4",
"PyJWT>=2.4.0, <3.0",
"redis",
@@ -182,5 +182,6 @@ def get_git_sha() -> str:
classifiers=[
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
],
)
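Both requirements/base.txt and setup.py move in lockstep here. A quick sanity check that a local environment satisfies the new bounds, sketched with `packaging` (already pinned above in requirements/base.txt):

```python
from packaging.version import Version

import pandas as pd
import pyarrow as pa

# Mirror the setup.py pins: pandas>=1.4.3,<1.5 and pyarrow>=6.0.1,<7
assert Version("1.4.3") <= Version(pd.__version__) < Version("1.5"), pd.__version__
assert Version("6.0.1") <= Version(pa.__version__) < Version("7"), pa.__version__
print(f"pandas {pd.__version__} / pyarrow {pa.__version__} satisfy the pins")
```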
6 changes: 3 additions & 3 deletions superset/examples/bart_lines.py
@@ -23,7 +23,7 @@
from superset import db

from ..utils.database import get_example_database
-from .helpers import get_example_data, get_table_connector_registry
+from .helpers import get_example_url, get_table_connector_registry


def load_bart_lines(only_metadata: bool = False, force: bool = False) -> None:
@@ -34,8 +34,8 @@ def load_bart_lines(only_metadata: bool = False, force: bool = False) -> None:
table_exists = database.has_table_by_name(tbl_name)

if not only_metadata and (not table_exists or force):
-content = get_example_data("bart-lines.json.gz")
-df = pd.read_json(content, encoding="latin-1")
+url = get_example_url("bart-lines.json.gz")
+df = pd.read_json(url, encoding="latin-1", compression="gzip")
df["path_json"] = df.path.map(json.dumps)
df["polyline"] = df.path.map(polyline.encode)
del df["path"]
5 changes: 3 additions & 2 deletions superset/examples/birth_names.py
@@ -33,7 +33,7 @@

from ..utils.database import get_example_database
from .helpers import (
-get_example_data,
+get_example_url,
get_slice_json,
get_table_connector_registry,
merge_slice,
@@ -66,7 +66,8 @@ def gen_filter(


def load_data(tbl_name: str, database: Database, sample: bool = False) -> None:
-pdf = pd.read_json(get_example_data("birth_names2.json.gz"))
+url = get_example_url("birth_names2.json.gz")
+pdf = pd.read_json(url, compression="gzip")
# TODO(bkyryliuk): move load examples data into the pytest fixture
if database.backend == "presto":
pdf.ds = pd.to_datetime(pdf.ds, unit="ms")
8 changes: 3 additions & 5 deletions superset/examples/country_map.py
@@ -27,7 +27,7 @@
from superset.utils.core import DatasourceType

from .helpers import (
-get_example_data,
+get_example_url,
get_slice_json,
get_table_connector_registry,
merge_slice,
@@ -44,10 +44,8 @@ def load_country_map_data(only_metadata: bool = False, force: bool = False) -> N
table_exists = database.has_table_by_name(tbl_name)

if not only_metadata and (not table_exists or force):
-csv_bytes = get_example_data(
-    "birth_france_data_for_country_map.csv", is_gzip=False, make_bytes=True
-)
-data = pd.read_csv(csv_bytes, encoding="utf-8")
+url = get_example_url("birth_france_data_for_country_map.csv")
+data = pd.read_csv(url, encoding="utf-8")
data["dttm"] = datetime.datetime.now().date()
data.to_sql(
tbl_name,
6 changes: 3 additions & 3 deletions superset/examples/energy.py
@@ -28,7 +28,7 @@
from superset.utils.core import DatasourceType

from .helpers import (
-get_example_data,
+get_example_url,
get_table_connector_registry,
merge_slice,
misc_dash_slices,
@@ -46,8 +46,8 @@ def load_energy(
table_exists = database.has_table_by_name(tbl_name)

if not only_metadata and (not table_exists or force):
-data = get_example_data("energy.json.gz")
-pdf = pd.read_json(data)
+url = get_example_url("energy.json.gz")
+pdf = pd.read_json(url, compression="gzip")
pdf = pdf.head(100) if sample else pdf
pdf.to_sql(
tbl_name,
10 changes: 5 additions & 5 deletions superset/examples/flights.py
@@ -20,7 +20,7 @@
import superset.utils.database as database_utils
from superset import db

-from .helpers import get_example_data, get_table_connector_registry
+from .helpers import get_example_url, get_table_connector_registry


def load_flights(only_metadata: bool = False, force: bool = False) -> None:
@@ -32,12 +32,12 @@ def load_flights(only_metadata: bool = False, force: bool = False) -> None:
table_exists = database.has_table_by_name(tbl_name)

if not only_metadata and (not table_exists or force):
-data = get_example_data("flight_data.csv.gz", make_bytes=True)
-pdf = pd.read_csv(data, encoding="latin-1")
+flight_data_url = get_example_url("flight_data.csv.gz")
+pdf = pd.read_csv(flight_data_url, encoding="latin-1", compression="gzip")

# Loading airports info to join and get lat/long
-airports_bytes = get_example_data("airports.csv.gz", make_bytes=True)
-airports = pd.read_csv(airports_bytes, encoding="latin-1")
+airports_url = get_example_url("airports.csv.gz")
+airports = pd.read_csv(airports_url, encoding="latin-1", compression="gzip")
airports = airports.set_index("IATA_CODE")

pdf[ # pylint: disable=unsupported-assignment-operation,useless-suppression
16 changes: 2 additions & 14 deletions superset/examples/helpers.py
@@ -17,10 +17,7 @@
"""Loads datasets, dashboards and slices in a new superset instance"""
import json
import os
-import zlib
-from io import BytesIO
from typing import Any, Dict, List, Set
-from urllib import request

from superset import app, db
from superset.connectors.sqla.models import SqlaTable
@@ -73,14 +70,5 @@ def get_slice_json(defaults: Dict[Any, Any], **kwargs: Any) -> str:
return json.dumps(defaults_copy, indent=4, sort_keys=True)


-def get_example_data(
-    filepath: str, is_gzip: bool = True, make_bytes: bool = False
-) -> BytesIO:
-    content = request.urlopen(  # pylint: disable=consider-using-with
-        f"{BASE_URL}{filepath}?raw=true"
-    ).read()
-    if is_gzip:
-        content = zlib.decompress(content, zlib.MAX_WBITS | 16)
-    if make_bytes:
-        content = BytesIO(content)
-    return content
+def get_example_url(filepath: str) -> str:
+    return f"{BASE_URL}{filepath}?raw=true"
Review comment on lines +73 to +74 (Member): Nice!

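The swap from `get_example_data` to `get_example_url` works because pandas can fetch and decompress remote files itself: `read_json` and `read_csv` accept a URL directly, and an explicit `compression="gzip"` replaces the manual `zlib.decompress` step (explicit presumably because the trailing `?raw=true` defeats pandas' inference of compression from the file extension). A minimal sketch of the new call pattern; the URL below is illustrative:

```python
import pandas as pd

# Illustrative URL, in the shape get_example_url() returns
url = "https://example.com/examples-data/bart-lines.json.gz?raw=true"

# pandas downloads and gunzips in one step, replacing the old
# urlopen -> zlib.decompress -> BytesIO pipeline in get_example_data()
df = pd.read_json(url, encoding="latin-1", compression="gzip")
print(df.head())
```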
6 changes: 3 additions & 3 deletions superset/examples/long_lat.py
@@ -27,7 +27,7 @@
from superset.utils.core import DatasourceType

from .helpers import (
-get_example_data,
+get_example_url,
get_slice_json,
get_table_connector_registry,
merge_slice,
@@ -44,8 +44,8 @@ def load_long_lat_data(only_metadata: bool = False, force: bool = False) -> None
table_exists = database.has_table_by_name(tbl_name)

if not only_metadata and (not table_exists or force):
-data = get_example_data("san_francisco.csv.gz", make_bytes=True)
-pdf = pd.read_csv(data, encoding="utf-8")
+url = get_example_url("san_francisco.csv.gz")
+pdf = pd.read_csv(url, encoding="utf-8", compression="gzip")
start = datetime.datetime.now().replace(
hour=0, minute=0, second=0, microsecond=0
)
6 changes: 3 additions & 3 deletions superset/examples/multiformat_time_series.py
@@ -25,7 +25,7 @@

from ..utils.database import get_example_database
from .helpers import (
-get_example_data,
+get_example_url,
get_slice_json,
get_table_connector_registry,
merge_slice,
@@ -44,8 +44,8 @@ def load_multiformat_time_series(  # pylint: disable=too-many-locals
table_exists = database.has_table_by_name(tbl_name)

if not only_metadata and (not table_exists or force):
-data = get_example_data("multiformat_time_series.json.gz")
-pdf = pd.read_json(data)
+url = get_example_url("multiformat_time_series.json.gz")
+pdf = pd.read_json(url, compression="gzip")
# TODO(bkyryliuk): move load examples data into the pytest fixture
if database.backend == "presto":
pdf.ds = pd.to_datetime(pdf.ds, unit="s")
6 changes: 3 additions & 3 deletions superset/examples/paris.py
@@ -22,7 +22,7 @@
import superset.utils.database as database_utils
from superset import db

-from .helpers import get_example_data, get_table_connector_registry
+from .helpers import get_example_url, get_table_connector_registry


def load_paris_iris_geojson(only_metadata: bool = False, force: bool = False) -> None:
@@ -33,8 +33,8 @@ def load_paris_iris_geojson(only_metadata: bool = False, force: bool = False) ->
table_exists = database.has_table_by_name(tbl_name)

if not only_metadata and (not table_exists or force):
-data = get_example_data("paris_iris.json.gz")
-df = pd.read_json(data)
+url = get_example_url("paris_iris.json.gz")
+df = pd.read_json(url, compression="gzip")
df["features"] = df.features.map(json.dumps)

df.to_sql(
6 changes: 3 additions & 3 deletions superset/examples/random_time_series.py
@@ -24,7 +24,7 @@
from superset.utils.core import DatasourceType

from .helpers import (
-get_example_data,
+get_example_url,
get_slice_json,
get_table_connector_registry,
merge_slice,
@@ -42,8 +42,8 @@ def load_random_time_series_data(
table_exists = database.has_table_by_name(tbl_name)

if not only_metadata and (not table_exists or force):
-data = get_example_data("random_time_series.json.gz")
-pdf = pd.read_json(data)
+url = get_example_url("random_time_series.json.gz")
+pdf = pd.read_json(url, compression="gzip")
if database.backend == "presto":
pdf.ds = pd.to_datetime(pdf.ds, unit="s")
pdf.ds = pdf.ds.dt.strftime("%Y-%m-%d %H:%M:%S")
6 changes: 3 additions & 3 deletions superset/examples/sf_population_polygons.py
@@ -22,7 +22,7 @@
import superset.utils.database as database_utils
from superset import db

-from .helpers import get_example_data, get_table_connector_registry
+from .helpers import get_example_url, get_table_connector_registry


def load_sf_population_polygons(
@@ -35,8 +35,8 @@ def load_sf_population_polygons(
table_exists = database.has_table_by_name(tbl_name)

if not only_metadata and (not table_exists or force):
-data = get_example_data("sf_population.json.gz")
-df = pd.read_json(data)
+url = get_example_url("sf_population.json.gz")
+df = pd.read_json(url, compression="gzip")
df["contour"] = df.contour.map(json.dumps)

df.to_sql(
6 changes: 3 additions & 3 deletions superset/examples/world_bank.py
@@ -33,7 +33,7 @@

from ..connectors.base.models import BaseDatasource
from .helpers import (
-get_example_data,
+get_example_url,
get_examples_folder,
get_slice_json,
get_table_connector_registry,
@@ -56,8 +56,8 @@ def load_world_bank_health_n_pop(  # pylint: disable=too-many-locals, too-many-s
table_exists = database.has_table_by_name(tbl_name)

if not only_metadata and (not table_exists or force):
-data = get_example_data("countries.json.gz")
-pdf = pd.read_json(data)
+url = get_example_url("countries.json.gz")
+pdf = pd.read_json(url, compression="gzip")
pdf.columns = [col.replace(".", "_") for col in pdf.columns]
if database.backend == "presto":
pdf.year = pd.to_datetime(pdf.year)
3 changes: 3 additions & 0 deletions superset/result_set.py
@@ -161,6 +161,9 @@ def __init__(  # pylint: disable=too-many-locals
except Exception as ex: # pylint: disable=broad-except
logger.exception(ex)

+if not pa_data:
+    column_names = []

self.table = pa.Table.from_arrays(pa_data, names=column_names)
self._type_dict: Dict[str, Any] = {}
try:
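This guard matters under pyarrow 6 because `pa.Table.from_arrays` requires the length of `names` to match the number of arrays, so an empty result set with leftover column names would raise. A minimal sketch of the failure mode it guards against, using only the public pyarrow API:

```python
import pyarrow as pa

pa_data = []               # no Arrow arrays were built (e.g., empty result set)
column_names = ["a", "b"]  # but cursor metadata still named columns

# Without this guard, from_arrays raises a names/arrays length mismatch
if not pa_data:
    column_names = []

table = pa.Table.from_arrays(pa_data, names=column_names)
print(table.num_columns, table.num_rows)  # 0 0 -- a valid empty table
```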
3 changes: 3 additions & 0 deletions superset/utils/pandas_postprocessing/contribution.py
@@ -49,6 +49,9 @@
"""
contribution_df = df.copy()
numeric_df = contribution_df.select_dtypes(include=["number", Decimal])
+# TODO: copy needed due to following regression in 1.4, remove if not needed:
+# https://github.com/pandas-dev/pandas/issues/48090
+numeric_df = numeric_df.copy()
Review comment (Member): Awesome!

numeric_df.fillna(0, inplace=True)
# verify column selections
if columns:
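The explicit `copy()` sidesteps the pandas 1.4 regression linked in the TODO: a frame returned by `select_dtypes` may share buffers with its parent, so mutating it with `fillna(..., inplace=True)` was unsafe. A minimal sketch of the workaround on hypothetical data:

```python
import pandas as pd

df = pd.DataFrame({"a": [1.0, None, 3.0], "label": ["x", "y", "z"]})

numeric_df = df.select_dtypes(include=["number"])
# Under pandas 1.4 (pandas-dev/pandas#48090) this selection can share
# memory with df, so copy before filling in place:
numeric_df = numeric_df.copy()
numeric_df.fillna(0, inplace=True)
print(numeric_df)
```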
4 changes: 2 additions & 2 deletions superset/viz.py
@@ -2172,14 +2172,14 @@ def get_data(self, df: pd.DataFrame) -> VizData:
if df is not None and not df.empty:
if metric:
df = df.sort_values(
-    utils.get_metric_name(metric), ascending=flt.get("asc")
+    utils.get_metric_name(metric), ascending=flt.get("asc", False)
)
data[col] = [
{"id": row[0], "text": row[0], "metric": row[1]}
for row in df.itertuples(index=False)
]
else:
-df = df.sort_values(col, ascending=flt.get("asc"))
+df = df.sort_values(col, ascending=flt.get("asc", False))
data[col] = [
{"id": row[0], "text": row[0]}
for row in df.itertuples(index=False)
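The added `False` default matters because `flt.get("asc")` returns `None` when the filter omits the key, and pandas 1.4 validates `ascending` strictly rather than tolerating `None`. A minimal sketch with a hypothetical filter dict:

```python
import pandas as pd

df = pd.DataFrame({"m": [3, 1, 2]})
flt = {}  # filter spec with no explicit "asc" key

# pandas 1.4 rejects ascending=None, so fall back to a real bool
df = df.sort_values("m", ascending=flt.get("asc", False))
print(df)  # sorted descending by default
```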