From 9dae0986f9256ea8727dfc0273c02cbb904833c6 Mon Sep 17 00:00:00 2001 From: Catherine Noll Date: Mon, 12 Dec 2022 23:44:11 -0500 Subject: [PATCH] Enable low-code CDK users to specify schema in the manifest (#20375) Enable low-code CDK users to specify schema in the manifest Also update documentation: * Add inline schema loader info to yaml-overview.md * Include inline schema info in tutorial --- airbyte-cdk/python/CHANGELOG.md | 3 +++ .../declarative/config_component_schema.json | 11 +++++++++++ .../manifest_declarative_source.py | 2 +- .../parsers/class_types_registry.py | 2 ++ .../sources/declarative/schema/__init__.py | 3 ++- .../schema/inline_schema_loader.py | 19 +++++++++++++++++++ airbyte-cdk/python/setup.py | 2 +- .../schema/test_inline_schema_loader.py | 19 +++++++++++++++++++ .../config-based/source_schema.yaml | 9 +++++++++ .../config-based/tutorial/4-reading-data.md | 2 ++ .../yaml-overview.md | 11 ++++++++--- 11 files changed, 77 insertions(+), 6 deletions(-) create mode 100644 airbyte-cdk/python/airbyte_cdk/sources/declarative/schema/inline_schema_loader.py create mode 100644 airbyte-cdk/python/unit_tests/sources/declarative/schema/test_inline_schema_loader.py diff --git a/airbyte-cdk/python/CHANGELOG.md b/airbyte-cdk/python/CHANGELOG.md index 116c818a2e09..1a7357cc95d3 100644 --- a/airbyte-cdk/python/CHANGELOG.md +++ b/airbyte-cdk/python/CHANGELOG.md @@ -1,5 +1,8 @@ # Changelog +## 0.13.2 +Low-code: Enable low-code CDK users to specify schema inline in the manifest + ## 0.13.1 Low-code: Add `SessionTokenAuthenticator` diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/config_component_schema.json b/airbyte-cdk/python/airbyte_cdk/sources/declarative/config_component_schema.json index 607cfacd930b..bcac98666570 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/declarative/config_component_schema.json +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/config_component_schema.json @@ -75,6 +75,9 @@ }, { "$ref": "#/definitions/DefaultSchemaLoader" + }, + { + "$ref": "#/definitions/InlineSchemaLoader" } ] }, @@ -93,6 +96,9 @@ }, { "$ref": "#/definitions/DefaultSchemaLoader" + }, + { + "$ref": "#/definitions/InlineSchemaLoader" } ] }, @@ -1609,6 +1615,11 @@ ], "description": "\n Loads a schema from the default location or returns an empty schema for streams that have not defined their schema file yet.\n\n Attributes:\n config (Config): The user-provided configuration as specified by the source's spec\n options (Mapping[str, Any]): Additional arguments to pass to the string interpolation if needed\n " }, + "InlineSchemaLoader": { + "type": "object", + "properties": {}, + "description": "Loads a schema from the manifest, if provided." + }, "AddFields": { "allOf": [ { diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/manifest_declarative_source.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/manifest_declarative_source.py index a8ca572b42b6..88fe7294c5fa 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/declarative/manifest_declarative_source.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/manifest_declarative_source.py @@ -40,7 +40,7 @@ class ConcreteDeclarativeSource(JsonSchemaMixin): class ManifestDeclarativeSource(DeclarativeSource): """Declarative source defined by a manifest of low-code components that define source connector behavior""" - VALID_TOP_LEVEL_FIELDS = {"check", "definitions", "spec", "streams", "version"} + VALID_TOP_LEVEL_FIELDS = {"check", "definitions", "schemas", "spec", "streams", "version"} def __init__(self, source_config: ConnectionDefinition, debug: bool = False): """ diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/parsers/class_types_registry.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/parsers/class_types_registry.py index f26287fa671f..7e151efa1eab 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/declarative/parsers/class_types_registry.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/parsers/class_types_registry.py @@ -37,6 +37,7 @@ from airbyte_cdk.sources.declarative.requesters.paginators.strategies.offset_increment import OffsetIncrement from airbyte_cdk.sources.declarative.requesters.paginators.strategies.page_increment import PageIncrement from airbyte_cdk.sources.declarative.retrievers.simple_retriever import SimpleRetriever +from airbyte_cdk.sources.declarative.schema.inline_schema_loader import InlineSchemaLoader from airbyte_cdk.sources.declarative.schema.json_file_schema_loader import JsonFileSchemaLoader from airbyte_cdk.sources.declarative.spec import Spec from airbyte_cdk.sources.declarative.stream_slicers.cartesian_product_stream_slicer import CartesianProductStreamSlicer @@ -66,6 +67,7 @@ "DpathExtractor": DpathExtractor, "ExponentialBackoffStrategy": ExponentialBackoffStrategy, "HttpRequester": HttpRequester, + "InlineSchemaLoader": InlineSchemaLoader, "InterpolatedBoolean": InterpolatedBoolean, "InterpolatedString": InterpolatedString, "JsonSchema": JsonFileSchemaLoader, # todo remove after hacktoberfest and update connectors to use JsonFileSchemaLoader diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/schema/__init__.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/schema/__init__.py index ce2281d59f0c..db5349305dfa 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/declarative/schema/__init__.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/schema/__init__.py @@ -3,7 +3,8 @@ # from airbyte_cdk.sources.declarative.schema.default_schema_loader import DefaultSchemaLoader +from airbyte_cdk.sources.declarative.schema.inline_schema_loader import InlineSchemaLoader from airbyte_cdk.sources.declarative.schema.json_file_schema_loader import JsonFileSchemaLoader from airbyte_cdk.sources.declarative.schema.schema_loader import SchemaLoader -__all__ = ["JsonFileSchemaLoader", "DefaultSchemaLoader", "SchemaLoader"] +__all__ = ["JsonFileSchemaLoader", "DefaultSchemaLoader", "SchemaLoader", "InlineSchemaLoader"] diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/schema/inline_schema_loader.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/schema/inline_schema_loader.py new file mode 100644 index 000000000000..2161db3bd809 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/schema/inline_schema_loader.py @@ -0,0 +1,19 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +from dataclasses import InitVar, dataclass +from typing import Any, Dict, Mapping + +from airbyte_cdk.sources.declarative.schema.schema_loader import SchemaLoader + + +@dataclass +class InlineSchemaLoader(SchemaLoader): + """Describes a stream's schema""" + + schema: Dict[str, Any] + options: InitVar[Mapping[str, Any]] + + def get_json_schema(self) -> Mapping[str, Any]: + return self.schema diff --git a/airbyte-cdk/python/setup.py b/airbyte-cdk/python/setup.py index c4065d6fa8e1..91bf0d7b69ee 100644 --- a/airbyte-cdk/python/setup.py +++ b/airbyte-cdk/python/setup.py @@ -15,7 +15,7 @@ setup( name="airbyte-cdk", - version="0.13.1", + version="0.13.2", description="A framework for writing Airbyte Connectors.", long_description=README, long_description_content_type="text/markdown", diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/schema/test_inline_schema_loader.py b/airbyte-cdk/python/unit_tests/sources/declarative/schema/test_inline_schema_loader.py new file mode 100644 index 000000000000..8379a1b3c8c8 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/schema/test_inline_schema_loader.py @@ -0,0 +1,19 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +import pytest +from airbyte_cdk.sources.declarative.schema import InlineSchemaLoader + + +@pytest.mark.parametrize( + "test_name, input_schema, expected_schema", + [ + ("schema", {"k": "string"}, {"k": "string"}), + ("empty_schema", {}, {}), + ], +) +def test_static_schema_loads(test_name, input_schema, expected_schema): + schema_loader = InlineSchemaLoader(input_schema, {}) + + assert schema_loader.get_json_schema() == expected_schema diff --git a/docs/connector-development/config-based/source_schema.yaml b/docs/connector-development/config-based/source_schema.yaml index 1be85be20a56..3c5f353cb65c 100644 --- a/docs/connector-development/config-based/source_schema.yaml +++ b/docs/connector-development/config-based/source_schema.yaml @@ -58,6 +58,15 @@ definitions: "$ref": "#/definitions/RecordTransformation" checkpoint_interval: type: integer + InlineSchemaLoader: + type: object + required: + - schema + properties: + "$options": + "$ref": "#/definitions/$options" + schema: + type: object PrimaryKey: type: string Retriever: diff --git a/docs/connector-development/config-based/tutorial/4-reading-data.md b/docs/connector-development/config-based/tutorial/4-reading-data.md index a39a8071f0ca..bcbce8a78669 100644 --- a/docs/connector-development/config-based/tutorial/4-reading-data.md +++ b/docs/connector-development/config-based/tutorial/4-reading-data.md @@ -39,6 +39,8 @@ rm source_exchange_rates_tutorial/schemas/customers.json rm source_exchange_rates_tutorial/schemas/employees.json ``` +As an alternative to storing the stream's data schema to the `schemas/` directory, we can store it inline in the YAML file, by including the optional `schema_loader` key and associated schema in the entry for each stream. More information on how to define a stream's schema in the YAML file can be found [here](../understanding-the-yaml-file/yaml-overview.md). + Reading from the source can be done by running the `read` operation ```bash diff --git a/docs/connector-development/config-based/understanding-the-yaml-file/yaml-overview.md b/docs/connector-development/config-based/understanding-the-yaml-file/yaml-overview.md index 95b5059c8574..be64d9e3d1db 100644 --- a/docs/connector-development/config-based/understanding-the-yaml-file/yaml-overview.md +++ b/docs/connector-development/config-based/understanding-the-yaml-file/yaml-overview.md @@ -7,10 +7,13 @@ The low-code framework involves editing a boilerplate [YAML file](../low-code-cd Streams define the schema of the data to sync, as well as how to read it from the underlying API source. A stream generally corresponds to a resource within the API. They are analogous to tables for a relational database source. -A stream's schema will can defined as a [JSONSchema](https://json-schema.org/) file in `/schemas/.json`. +By default, the schema of a stream's data is defined as a [JSONSchema](https://json-schema.org/) file in `/schemas/.json`. + +Alternately, the stream's data schema can be stored in YAML format inline in the YAML file, by including the optional `schema_loader` key. If the data schema is provided inline, any schema on disk for that stream will be ignored. + More information on how to define a stream's schema can be found [here](../source_schema.yaml) -The schema of a stream object is: +The stream object is represented in the YAML file as: ```yaml Stream: @@ -34,6 +37,8 @@ The schema of a stream object is: "$ref": "#/definitions/RecordTransformation" checkpoint_interval: type: integer + schema_loader: + "$ref": "#/definitions/InlineSchemaLoader" ``` More details on streams and sources can be found in the [basic concepts section](../../cdk-python/basic-concepts.md). @@ -99,4 +104,4 @@ More information on `DatetimeStreamSlicer` can be found in the [stream slicers]( ## More readings - [Requester](./requester.md) -- [Stream slicers](./stream-slicers.md) \ No newline at end of file +- [Stream slicers](./stream-slicers.md)