From 9d7f32c1393b49cce3be1cf3251372a01b952157 Mon Sep 17 00:00:00 2001 From: Daniel Gaspar Date: Wed, 1 Feb 2023 16:26:09 +0000 Subject: [PATCH 1/3] fix: add new config to allow for specific import data urls --- superset/config.py | 7 ++ superset/datasets/commands/exceptions.py | 4 + .../datasets/commands/importers/v1/utils.py | 32 ++++- .../commands/importers/v1/import_test.py | 112 +++++++++++++++++- 4 files changed, 149 insertions(+), 6 deletions(-) diff --git a/superset/config.py b/superset/config.py index 3aac5947189da..fe5b5825febfb 100644 --- a/superset/config.py +++ b/superset/config.py @@ -1406,6 +1406,13 @@ def EMAIL_HEADER_MUTATOR( # pylint: disable=invalid-name,unused-argument # Prevents unsafe default endpoints to be registered on datasets. PREVENT_UNSAFE_DEFAULT_URLS_ON_DATASET = True +# Define a list of allowed URLs for dataset data imports (v1). +# Simple example to only allow URLs that belong to certain domains: +# ALLOWED_IMPORT_URL_DOMAINS = [ +# r"^https://.+\.domain1\.com\/?.*", r"^https://.+\.domain2\.com\/?.*" +# ] +DATASET_IMPORT_ALLOWED_DATA_URLS = [r".*"] + # Path used to store SSL certificates that are generated when using custom certs. # Defaults to temporary directory. # Example: SSL_CERT_PATH = "/certs" diff --git a/superset/datasets/commands/exceptions.py b/superset/datasets/commands/exceptions.py index c76b7b3ad53dc..6b80aa3d6563a 100644 --- a/superset/datasets/commands/exceptions.py +++ b/superset/datasets/commands/exceptions.py @@ -191,3 +191,7 @@ class DatasetAccessDeniedError(ForbiddenError): class DatasetDuplicateFailedError(CreateFailedError): message = _("Dataset could not be duplicated.") + + +class DatasetUnAllowedDataURI(ForbiddenError): + message = _("Data URI is not allowed.") diff --git a/superset/datasets/commands/importers/v1/utils.py b/superset/datasets/commands/importers/v1/utils.py index d04763c7a8996..221abe0bb6d59 100644 --- a/superset/datasets/commands/importers/v1/utils.py +++ b/superset/datasets/commands/importers/v1/utils.py @@ -22,6 +22,7 @@ from urllib import request import pandas as pd +from datasets.commands.exceptions import DatasetUnAllowedDataURI from flask import current_app, g from sqlalchemy import BigInteger, Boolean, Date, DateTime, Float, String, Text from sqlalchemy.orm import Session @@ -75,6 +76,28 @@ def get_dtype(df: pd.DataFrame, dataset: SqlaTable) -> Dict[str, VisitableType]: } +def validate_data_uri(data_uri: str) -> None: + """ + Validate that the data URI is configured on DATASET_IMPORT_ALLOWED_URLS + has a valid URL. + + :param data_uri: + :return: + """ + allowed_urls = current_app.config["DATASET_IMPORT_ALLOWED_DATA_URLS"] + for allowed_url in allowed_urls: + try: + match = re.match(allowed_url, data_uri) + except re.error: + logger.exception( + "Invalid regular expression on DATASET_IMPORT_ALLOWED_URLS" + ) + raise + if match: + return + raise DatasetUnAllowedDataURI() + + def import_dataset( session: Session, config: Dict[str, Any], @@ -139,7 +162,6 @@ def import_dataset( table_exists = True if data_uri and (not table_exists or force_data): - logger.info("Downloading data from %s", data_uri) load_data(data_uri, dataset, dataset.database, session) if hasattr(g, "user") and g.user: @@ -151,6 +173,14 @@ def import_dataset( def load_data( data_uri: str, dataset: SqlaTable, database: Database, session: Session ) -> None: + """ + Load data from a data URI into a dataset. + + :raises DatasetUnAllowedDataURI: If a dataset is trying + to load data from a URI that is not allowed. + """ + validate_data_uri(data_uri) + logger.info("Downloading data from %s", data_uri) data = request.urlopen(data_uri) # pylint: disable=consider-using-with if data_uri.endswith(".gz"): data = gzip.open(data) diff --git a/tests/unit_tests/datasets/commands/importers/v1/import_test.py b/tests/unit_tests/datasets/commands/importers/v1/import_test.py index 934712b8c9b7e..1b3972d30719d 100644 --- a/tests/unit_tests/datasets/commands/importers/v1/import_test.py +++ b/tests/unit_tests/datasets/commands/importers/v1/import_test.py @@ -18,19 +18,25 @@ import copy import json +import re import uuid +import pytest from typing import Any, Dict +from unittest.mock import patch, Mock +from flask import current_app from sqlalchemy.orm.session import Session +from datasets.commands.exceptions import DatasetUnAllowedDataURI +from superset.datasets.commands.importers.v1.utils import validate_data_uri + def test_import_dataset(session: Session) -> None: """ Test importing a dataset. """ - from superset.connectors.sqla.models import SqlaTable, SqlMetric, TableColumn + from superset.connectors.sqla.models import SqlaTable from superset.datasets.commands.importers.v1.utils import import_dataset - from superset.datasets.schemas import ImportV1DatasetSchema from superset.models.core import Database engine = session.get_bind() @@ -340,13 +346,84 @@ def test_import_column_extra_is_string(session: Session) -> None: assert sqla_table.extra == '{"warning_markdown": "*WARNING*"}' +@patch("superset.datasets.commands.importers.v1.utils.request") +def test_import_column_allowed_data_url(request: Mock, session: Session) -> None: + """ + Test importing a dataset when using data key to fetch data from a URL. + """ + import io + from superset.connectors.sqla.models import SqlaTable + from superset.datasets.commands.importers.v1.utils import import_dataset + from superset.datasets.schemas import ImportV1DatasetSchema + from superset.models.core import Database + + + request.urlopen.return_value = io.StringIO("col1\nvalue1\nvalue2\n") + + engine = session.get_bind() + SqlaTable.metadata.create_all(engine) # pylint: disable=no-member + + database = Database(database_name="my_database", sqlalchemy_uri="sqlite://") + session.add(database) + session.flush() + + dataset_uuid = uuid.uuid4() + yaml_config: Dict[str, Any] = { + "version": "1.0.0", + "table_name": "my_table", + "main_dttm_col": "ds", + "description": "This is the description", + "default_endpoint": None, + "offset": -8, + "cache_timeout": 3600, + "schema": None, + "sql": None, + "params": { + "remote_id": 64, + "database_name": "examples", + "import_time": 1606677834, + }, + "template_params": None, + "filter_select_enabled": True, + "fetch_values_predicate": None, + "extra": None, + "uuid": dataset_uuid, + "metrics": [], + "columns": [ + { + "column_name": "col1", + "verbose_name": None, + "is_dttm": False, + "is_active": True, + "type": "TEXT", + "groupby": False, + "filterable": False, + "expression": None, + "description": None, + "python_date_format": None, + "extra": None, + } + ], + "database_uuid": database.uuid, + "data": "https://some-external-url.com/data.csv", + } + + # the Marshmallow schema should convert strings to objects + schema = ImportV1DatasetSchema() + dataset_config = schema.load(yaml_config) + dataset_config["database_id"] = database.id + _ = import_dataset(session, dataset_config, force_data=True) + session.connection() + assert [("value1",), ("value2",)] == session.execute( + "SELECT * FROM my_table" + ).fetchall() + def test_import_dataset_managed_externally(session: Session) -> None: """ Test importing a dataset that is managed externally. """ - from superset.connectors.sqla.models import SqlaTable, SqlMetric, TableColumn + from superset.connectors.sqla.models import SqlaTable from superset.datasets.commands.importers.v1.utils import import_dataset - from superset.datasets.schemas import ImportV1DatasetSchema from superset.models.core import Database from tests.integration_tests.fixtures.importexport import dataset_config @@ -357,7 +434,6 @@ def test_import_dataset_managed_externally(session: Session) -> None: session.add(database) session.flush() - dataset_uuid = uuid.uuid4() config = copy.deepcopy(dataset_config) config["is_managed_externally"] = True config["external_url"] = "https://example.org/my_table" @@ -366,3 +442,29 @@ def test_import_dataset_managed_externally(session: Session) -> None: sqla_table = import_dataset(session, config) assert sqla_table.is_managed_externally is True assert sqla_table.external_url == "https://example.org/my_table" + + + +@pytest.mark.parametrize( + "allowed_urls, data_uri, expected, exception_class", + [ + ([r".*"], "https://some-url/data.csv", True, None), + ([r"^https://.+\.domain1\.com\/?.*", r"^https://.+\.domain2\.com\/?.*"], + "https://host1.domain1.com/data.csv", True, None), + ([r"^https://.+\.domain1\.com\/?.*", r"^https://.+\.domain2\.com\/?.*"], + "https://host2.domain1.com/data.csv", True, None), + ([r"^https://.+\.domain1\.com\/?.*", r"^https://.+\.domain2\.com\/?.*"], + "https://host1.domain2.com/data.csv", True, None), + ([r"^https://.+\.domain1\.com\/?.*", r"^https://.+\.domain2\.com\/?.*"], + "https://host1.domain3.com/data.csv", False, DatasetUnAllowedDataURI), + ([], "https://host1.domain3.com/data.csv", False, DatasetUnAllowedDataURI), + (["*"], "https://host1.domain3.com/data.csv", False, re.error), + ], +) +def test_validate_data_uri(allowed_urls, data_uri, expected, exception_class): + current_app.config["DATASET_IMPORT_ALLOWED_DATA_URLS"] = allowed_urls + if expected: + validate_data_uri(data_uri) + else: + with pytest.raises(exception_class): + validate_data_uri(data_uri) From 1e502200af4696058e9d72669cc2ff58e1f8ec56 Mon Sep 17 00:00:00 2001 From: Daniel Gaspar Date: Wed, 1 Feb 2023 16:35:50 +0000 Subject: [PATCH 2/3] fix --- superset/datasets/commands/importers/v1/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/superset/datasets/commands/importers/v1/utils.py b/superset/datasets/commands/importers/v1/utils.py index 221abe0bb6d59..983bb7fee5e2c 100644 --- a/superset/datasets/commands/importers/v1/utils.py +++ b/superset/datasets/commands/importers/v1/utils.py @@ -22,7 +22,6 @@ from urllib import request import pandas as pd -from datasets.commands.exceptions import DatasetUnAllowedDataURI from flask import current_app, g from sqlalchemy import BigInteger, Boolean, Date, DateTime, Float, String, Text from sqlalchemy.orm import Session @@ -30,6 +29,7 @@ from sqlalchemy.sql.visitors import VisitableType from superset.connectors.sqla.models import SqlaTable +from superset.datasets.commands.exceptions import DatasetUnAllowedDataURI from superset.models.core import Database logger = logging.getLogger(__name__) From 858d11f933e42567c66c8fa125049ce90c7904a3 Mon Sep 17 00:00:00 2001 From: Daniel Gaspar Date: Wed, 1 Feb 2023 16:50:54 +0000 Subject: [PATCH 3/3] rename exception, fix imports --- superset/datasets/commands/exceptions.py | 2 +- .../datasets/commands/importers/v1/utils.py | 4 +- .../commands/importers/v1/import_test.py | 44 +++++++++++++------ 3 files changed, 33 insertions(+), 17 deletions(-) diff --git a/superset/datasets/commands/exceptions.py b/superset/datasets/commands/exceptions.py index 6b80aa3d6563a..96ed7823b0a04 100644 --- a/superset/datasets/commands/exceptions.py +++ b/superset/datasets/commands/exceptions.py @@ -193,5 +193,5 @@ class DatasetDuplicateFailedError(CreateFailedError): message = _("Dataset could not be duplicated.") -class DatasetUnAllowedDataURI(ForbiddenError): +class DatasetForbiddenDataURI(ForbiddenError): message = _("Data URI is not allowed.") diff --git a/superset/datasets/commands/importers/v1/utils.py b/superset/datasets/commands/importers/v1/utils.py index 983bb7fee5e2c..b3fb2a804127b 100644 --- a/superset/datasets/commands/importers/v1/utils.py +++ b/superset/datasets/commands/importers/v1/utils.py @@ -29,7 +29,7 @@ from sqlalchemy.sql.visitors import VisitableType from superset.connectors.sqla.models import SqlaTable -from superset.datasets.commands.exceptions import DatasetUnAllowedDataURI +from superset.datasets.commands.exceptions import DatasetForbiddenDataURI from superset.models.core import Database logger = logging.getLogger(__name__) @@ -95,7 +95,7 @@ def validate_data_uri(data_uri: str) -> None: raise if match: return - raise DatasetUnAllowedDataURI() + raise DatasetForbiddenDataURI() def import_dataset( diff --git a/tests/unit_tests/datasets/commands/importers/v1/import_test.py b/tests/unit_tests/datasets/commands/importers/v1/import_test.py index 1b3972d30719d..5b52ac7f1da99 100644 --- a/tests/unit_tests/datasets/commands/importers/v1/import_test.py +++ b/tests/unit_tests/datasets/commands/importers/v1/import_test.py @@ -20,14 +20,14 @@ import json import re import uuid -import pytest from typing import Any, Dict -from unittest.mock import patch, Mock +from unittest.mock import Mock, patch +import pytest from flask import current_app from sqlalchemy.orm.session import Session -from datasets.commands.exceptions import DatasetUnAllowedDataURI +from superset.datasets.commands.exceptions import DatasetForbiddenDataURI from superset.datasets.commands.importers.v1.utils import validate_data_uri @@ -352,12 +352,12 @@ def test_import_column_allowed_data_url(request: Mock, session: Session) -> None Test importing a dataset when using data key to fetch data from a URL. """ import io + from superset.connectors.sqla.models import SqlaTable from superset.datasets.commands.importers.v1.utils import import_dataset from superset.datasets.schemas import ImportV1DatasetSchema from superset.models.core import Database - request.urlopen.return_value = io.StringIO("col1\nvalue1\nvalue2\n") engine = session.get_bind() @@ -418,6 +418,7 @@ def test_import_column_allowed_data_url(request: Mock, session: Session) -> None "SELECT * FROM my_table" ).fetchall() + def test_import_dataset_managed_externally(session: Session) -> None: """ Test importing a dataset that is managed externally. @@ -444,20 +445,35 @@ def test_import_dataset_managed_externally(session: Session) -> None: assert sqla_table.external_url == "https://example.org/my_table" - @pytest.mark.parametrize( "allowed_urls, data_uri, expected, exception_class", [ ([r".*"], "https://some-url/data.csv", True, None), - ([r"^https://.+\.domain1\.com\/?.*", r"^https://.+\.domain2\.com\/?.*"], - "https://host1.domain1.com/data.csv", True, None), - ([r"^https://.+\.domain1\.com\/?.*", r"^https://.+\.domain2\.com\/?.*"], - "https://host2.domain1.com/data.csv", True, None), - ([r"^https://.+\.domain1\.com\/?.*", r"^https://.+\.domain2\.com\/?.*"], - "https://host1.domain2.com/data.csv", True, None), - ([r"^https://.+\.domain1\.com\/?.*", r"^https://.+\.domain2\.com\/?.*"], - "https://host1.domain3.com/data.csv", False, DatasetUnAllowedDataURI), - ([], "https://host1.domain3.com/data.csv", False, DatasetUnAllowedDataURI), + ( + [r"^https://.+\.domain1\.com\/?.*", r"^https://.+\.domain2\.com\/?.*"], + "https://host1.domain1.com/data.csv", + True, + None, + ), + ( + [r"^https://.+\.domain1\.com\/?.*", r"^https://.+\.domain2\.com\/?.*"], + "https://host2.domain1.com/data.csv", + True, + None, + ), + ( + [r"^https://.+\.domain1\.com\/?.*", r"^https://.+\.domain2\.com\/?.*"], + "https://host1.domain2.com/data.csv", + True, + None, + ), + ( + [r"^https://.+\.domain1\.com\/?.*", r"^https://.+\.domain2\.com\/?.*"], + "https://host1.domain3.com/data.csv", + False, + DatasetForbiddenDataURI, + ), + ([], "https://host1.domain3.com/data.csv", False, DatasetForbiddenDataURI), (["*"], "https://host1.domain3.com/data.csv", False, re.error), ], )