Skip to content

Commit

Permalink
fix(hive): Update CSV to Hive upload prefix (apache#14255)
Browse files Browse the repository at this point in the history
* fix(hive): Update CSV to Hive upload prefix

* Trigger notification

Co-authored-by: John Bodley <john.bodley@airbnb.com>
john-bodley and John Bodley authored Apr 24, 2021

Verified

This commit was created on GitHub.com and signed with GitHub’s verified signature.
1 parent 16f62c9 commit 31711b3
Showing 3 changed files with 32 additions and 7 deletions.
2 changes: 2 additions & 0 deletions UPDATING.md
Original file line number Diff line number Diff line change
@@ -27,6 +27,8 @@ assists people when migrating to a new version.

- [13980](https://github.com/apache/superset/pull/13980): Data health checks no longer use the metadata database as an interim cache. Though non-breaking, deployments which implement complex logic should likely memoize the callback function. Refer to documentation in the confg.py file for more detail.

- [14255](https://github.com/apache/superset/pull/14255): The default `CSV_TO_HIVE_UPLOAD_DIRECTORY_FUNC` callable logic has been updated to leverage the specified database and schema to ensure the upload S3 key prefix is unique. Previously tables generated via upload from CSV with the same name but differ schema and/or cluster would use the same S3 key prefix. Note this change does not impact previously imported tables.

### Breaking Changes
### Potential Downtime
### Deprecations
13 changes: 10 additions & 3 deletions superset/config.py
Original file line number Diff line number Diff line change
@@ -791,9 +791,16 @@ class CeleryConfig: # pylint: disable=too-few-public-methods
CSV_TO_HIVE_UPLOAD_DIRECTORY = "EXTERNAL_HIVE_TABLES/"
# Function that creates upload directory dynamically based on the
# database used, user and schema provided.
CSV_TO_HIVE_UPLOAD_DIRECTORY_FUNC: Callable[
["Database", "models.User", str], Optional[str]
] = lambda database, user, schema: CSV_TO_HIVE_UPLOAD_DIRECTORY
def CSV_TO_HIVE_UPLOAD_DIRECTORY_FUNC(
database: "Database",
user: "models.User", # pylint: disable=unused-argument
schema: Optional[str],
) -> str:
# Note the final empty path enforces a trailing slash.
return os.path.join(
CSV_TO_HIVE_UPLOAD_DIRECTORY, str(database.id), schema or "", ""
)


# The namespace within hive where the tables created from
# uploading CSVs will be stored.
24 changes: 20 additions & 4 deletions tests/db_engine_specs/hive_tests.py
Original file line number Diff line number Diff line change
@@ -173,7 +173,7 @@ def test_create_table_from_csv_append() -> None:

@mock.patch(
"superset.db_engine_specs.hive.config",
{**app.config, "CSV_TO_HIVE_UPLOAD_DIRECTORY_FUNC": lambda *args: True},
{**app.config, "CSV_TO_HIVE_UPLOAD_DIRECTORY_FUNC": lambda *args: ""},
)
@mock.patch("superset.db_engine_specs.hive.g", spec={})
@mock.patch("tableschema.Table")
@@ -190,7 +190,7 @@ def test_create_table_from_csv_if_exists_fail(mock_table, mock_g):

@mock.patch(
"superset.db_engine_specs.hive.config",
{**app.config, "CSV_TO_HIVE_UPLOAD_DIRECTORY_FUNC": lambda *args: True},
{**app.config, "CSV_TO_HIVE_UPLOAD_DIRECTORY_FUNC": lambda *args: ""},
)
@mock.patch("superset.db_engine_specs.hive.g", spec={})
@mock.patch("tableschema.Table")
@@ -211,7 +211,7 @@ def test_create_table_from_csv_if_exists_fail_with_schema(mock_table, mock_g):

@mock.patch(
"superset.db_engine_specs.hive.config",
{**app.config, "CSV_TO_HIVE_UPLOAD_DIRECTORY_FUNC": lambda *args: True},
{**app.config, "CSV_TO_HIVE_UPLOAD_DIRECTORY_FUNC": lambda *args: ""},
)
@mock.patch("superset.db_engine_specs.hive.g", spec={})
@mock.patch("tableschema.Table")
@@ -239,7 +239,7 @@ def test_create_table_from_csv_if_exists_replace(mock_upload_to_s3, mock_table,

@mock.patch(
"superset.db_engine_specs.hive.config",
{**app.config, "CSV_TO_HIVE_UPLOAD_DIRECTORY_FUNC": lambda *args: True},
{**app.config, "CSV_TO_HIVE_UPLOAD_DIRECTORY_FUNC": lambda *args: ""},
)
@mock.patch("superset.db_engine_specs.hive.g", spec={})
@mock.patch("tableschema.Table")
@@ -347,6 +347,22 @@ def is_readonly(sql: str) -> bool:
assert is_readonly("WITH (SELECT 1) bla SELECT * from bla")


@pytest.mark.parametrize(
"schema,upload_prefix",
[("foo", "EXTERNAL_HIVE_TABLES/1/foo/"), (None, "EXTERNAL_HIVE_TABLES/1/")],
)
def test_s3_upload_prefix(schema: str, upload_prefix: str) -> None:
mock_database = mock.MagicMock()
mock_database.id = 1

assert (
app.config["CSV_TO_HIVE_UPLOAD_DIRECTORY_FUNC"](
database=mock_database, user=mock.MagicMock(), schema=schema
)
== upload_prefix
)


def test_upload_to_s3_no_bucket_path():
with pytest.raises(
Exception,

0 comments on commit 31711b3

Please sign in to comment.