def is_url_valid(url: Optional[str]) -> bool:
    """Return True when *url* is a string with both a scheme and a network location.

    This is a structural check only: the value is parsed with
    ``urllib.parse.urlparse`` and accepted when both the ``scheme`` and
    ``netloc`` components are non-empty. No network access is performed.

    Args:
        url: Candidate URL. ``None`` and any non-``str`` value are rejected.

    Returns:
        ``True`` if *url* parses into a non-empty scheme and netloc,
        ``False`` otherwise (including when parsing fails).
    """
    # Reject None and anything that is not text up front: urlparse would
    # otherwise accept bytes and report them as valid.
    if not isinstance(url, str):
        return False

    try:
        parsed = urlparse(url)
    except ValueError:
        # urlparse raises ValueError for malformed network locations,
        # e.g. an unbalanced IPv6 bracket.
        return False

    return bool(parsed.scheme and parsed.netloc)