From 0a0aafb2bbb5b77cb9026145b50e84a45fd2df71 Mon Sep 17 00:00:00 2001 From: maxim-lixakov Date: Fri, 12 Apr 2024 11:51:49 +0300 Subject: [PATCH 01/10] [DOP-13855] - update clickhouse driver --- docs/changelog/next_release/249.breaking.rst | 1 + onetl/VERSION | 2 +- .../db_connection/clickhouse/connection.py | 51 ++++++++++++++----- .../test_clickhouse_integration.py | 4 -- .../test_strategy_increment_clickhouse.py | 2 +- .../test_clickhouse_unit.py | 27 +++++++--- 6 files changed, 61 insertions(+), 26 deletions(-) create mode 100644 docs/changelog/next_release/249.breaking.rst diff --git a/docs/changelog/next_release/249.breaking.rst b/docs/changelog/next_release/249.breaking.rst new file mode 100644 index 000000000..aa7bcd2e2 --- /dev/null +++ b/docs/changelog/next_release/249.breaking.rst @@ -0,0 +1 @@ +Updated the Clickhouse JDBC driver from ``ru.yandex.clickhouse:clickhouse-jdbc:0.3.2`` to `com.clickhouse:clickhouse-jdbc:0.6.0 `_. Added support for dynamic package versioning and dependencies on the `Apache HTTP client `_ as needed. diff --git a/onetl/VERSION b/onetl/VERSION index a3f5a8ed4..d9df1bbc0 100644 --- a/onetl/VERSION +++ b/onetl/VERSION @@ -1 +1 @@ -0.10.3 +0.11.0 diff --git a/onetl/connection/db_connection/clickhouse/connection.py b/onetl/connection/db_connection/clickhouse/connection.py index 88de5645a..efa6c7edc 100644 --- a/onetl/connection/db_connection/clickhouse/connection.py +++ b/onetl/connection/db_connection/clickhouse/connection.py @@ -3,7 +3,6 @@ from __future__ import annotations import logging -import warnings from typing import ClassVar, Optional from onetl._util.classproperty import classproperty @@ -28,8 +27,8 @@ class Config: class Clickhouse(JDBCConnection): """Clickhouse JDBC connection. |support_hooks| - Based on Maven package ``ru.yandex.clickhouse:clickhouse-jdbc:0.3.2`` - (`official Clickhouse JDBC driver `_). + Based on Maven package `com.clickhouse:clickhouse-jdbc:0.6.0 `_ and additional dependencies + for `Apache HTTP client `_ when necessary (`official Clickhouse JDBC driver `_). .. warning:: @@ -104,14 +103,26 @@ class Clickhouse(JDBCConnection): Extra = ClickhouseExtra Dialect = ClickhouseDialect - DRIVER: ClassVar[str] = "ru.yandex.clickhouse.ClickHouseDriver" + DRIVER: ClassVar[str] = "com.clickhouse.jdbc.ClickHouseDriver" @slot @classmethod - def get_packages(cls) -> list[str]: + def get_packages( + cls, + package_version: str | None = None, + apache_http_client_version: str | None = None, + ) -> list[str]: """ Get package names to be downloaded by Spark. |support_hooks| + Parameters + ---------- + package_version : str , optional + ClickHouse JDBC version client packages. Defaults to ``0.6.0``. + + apache_http_client_version : str, optional + Apache HTTP Client version package. Defaults to ``5.3.1``. + Examples -------- @@ -119,17 +130,33 @@ def get_packages(cls) -> list[str]: from onetl.connection import Clickhouse - Clickhouse.get_packages() + Clickhouse.get_packages("0.7.1", "5.4") + + .. note:: + + Spark does not support ``.jar`` classifiers, so it is not possible to pass + ``com.clickhouse:clickhouse-jdbc:0.6.0-all`` to install all required packages. + Dependencies are listed manually. """ - return ["ru.yandex.clickhouse:clickhouse-jdbc:0.3.2"] + package_version = package_version or "0.6.0" + apache_http_client_version = apache_http_client_version or "5.3.1" + + result = [ + f"com.clickhouse:clickhouse-jdbc:{package_version}", + f"com.clickhouse:clickhouse-http-client:{package_version}", + ] + + if package_version >= "0.5.0": + # before 0.5.0 builtin Java HTTP Client was used + result.append(f"org.apache.httpcomponents.client5:httpclient5:{apache_http_client_version}") + + return result @classproperty - def package(cls) -> str: - """Get package name to be downloaded by Spark.""" - msg = "`Clickhouse.package` will be removed in 1.0.0, use `Clickhouse.get_packages()` instead" - warnings.warn(msg, UserWarning, stacklevel=3) - return "ru.yandex.clickhouse:clickhouse-jdbc:0.3.2" + def package(self) -> str: + """Get a single string of package names to be downloaded by Spark for establishing a Clickhouse connection.""" + return "com.clickhouse:clickhouse-jdbc:0.6.0,com.clickhouse:clickhouse-http-client:0.6.0,org.apache.httpcomponents.client5:httpclient5:5.3.1" @property def jdbc_url(self) -> str: diff --git a/tests/tests_integration/tests_db_connection_integration/test_clickhouse_integration.py b/tests/tests_integration/tests_db_connection_integration/test_clickhouse_integration.py index 73d8abfb2..27052a9f1 100644 --- a/tests/tests_integration/tests_db_connection_integration/test_clickhouse_integration.py +++ b/tests/tests_integration/tests_db_connection_integration/test_clickhouse_integration.py @@ -227,10 +227,6 @@ def table_finalizer(): updated_df = pandas.concat([updated_rows, unchanged_rows]) processing.assert_equal_df(df=df, other_frame=updated_df, order_by="id_int") - # not supported by Clickhouse - with pytest.raises(Exception): - clickhouse.execute(f"UPDATE {temp_table} SET hwm_int = 1 WHERE id_int < 50{suffix}") - clickhouse.execute(f"ALTER TABLE {temp_table} DELETE WHERE id_int < 70{suffix}") df = clickhouse.fetch(f"SELECT * FROM {temp_table}{suffix}") assert df.count() diff --git a/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_clickhouse.py b/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_clickhouse.py index 77ec071b3..478a8e380 100644 --- a/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_clickhouse.py +++ b/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_clickhouse.py @@ -304,7 +304,7 @@ def test_clickhouse_strategy_incremental_explicit_hwm_type( ), ( "hwm_datetime", - "CAST(text_string AS DateTime)", + "CAST(text_string AS DateTime64(6))", ColumnDateTimeHWM, lambda x: x.isoformat(), ), diff --git a/tests/tests_unit/tests_db_connection_unit/test_clickhouse_unit.py b/tests/tests_unit/tests_db_connection_unit/test_clickhouse_unit.py index 42b5582ae..708388417 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_clickhouse_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_clickhouse_unit.py @@ -1,5 +1,3 @@ -import re - import pytest from onetl.connection import Clickhouse @@ -8,21 +6,34 @@ def test_clickhouse_driver(): - assert Clickhouse.DRIVER == "ru.yandex.clickhouse.ClickHouseDriver" + assert Clickhouse.DRIVER == "com.clickhouse.jdbc.ClickHouseDriver" def test_clickhouse_package(): - warning_msg = re.escape("will be removed in 1.0.0, use `Clickhouse.get_packages()` instead") - with pytest.warns(UserWarning, match=warning_msg): - assert Clickhouse.package == "ru.yandex.clickhouse:clickhouse-jdbc:0.3.2" + expected_packages = "com.clickhouse:clickhouse-jdbc:0.6.0,com.clickhouse:clickhouse-http-client:0.6.0,org.apache.httpcomponents.client5:httpclient5:5.3.1" + assert Clickhouse.package == expected_packages def test_clickhouse_get_packages(): - assert Clickhouse.get_packages() == ["ru.yandex.clickhouse:clickhouse-jdbc:0.3.2"] + expected_packages = [ + "com.clickhouse:clickhouse-jdbc:0.6.0", + "com.clickhouse:clickhouse-http-client:0.6.0", + "org.apache.httpcomponents.client5:httpclient5:5.3.1", + ] + assert Clickhouse.get_packages() == expected_packages + + +def test_clickhouse_get_packages_custom(): + expected_packages = [ + "com.clickhouse:clickhouse-jdbc:0.7.1", + "com.clickhouse:clickhouse-http-client:0.7.1", + "org.apache.httpcomponents.client5:httpclient5:5.4", + ] + assert Clickhouse.get_packages("0.7.1", "5.4") == expected_packages def test_clickhouse_missing_package(spark_no_packages): - msg = "Cannot import Java class 'ru.yandex.clickhouse.ClickHouseDriver'" + msg = "Cannot import Java class 'com.clickhouse.jdbc.ClickHouseDriver'" with pytest.raises(ValueError, match=msg): Clickhouse( host="some_host", From 0aae52a9867e58af7dad2838f49ab94398e10fd4 Mon Sep 17 00:00:00 2001 From: maxim-lixakov Date: Fri, 12 Apr 2024 13:59:18 +0300 Subject: [PATCH 02/10] [DOP-13855] - update clickhouse get_packages() tests --- .../db_connection/clickhouse/connection.py | 24 ++++--- .../test_clickhouse_integration.py | 2 + .../test_strategy_increment_clickhouse.py | 2 +- .../test_clickhouse_unit.py | 69 ++++++++++++++----- tests/util/assert_df.py | 2 +- 5 files changed, 71 insertions(+), 28 deletions(-) diff --git a/onetl/connection/db_connection/clickhouse/connection.py b/onetl/connection/db_connection/clickhouse/connection.py index efa6c7edc..f46a2e176 100644 --- a/onetl/connection/db_connection/clickhouse/connection.py +++ b/onetl/connection/db_connection/clickhouse/connection.py @@ -6,6 +6,7 @@ from typing import ClassVar, Optional from onetl._util.classproperty import classproperty +from onetl._util.version import Version from onetl.connection.db_connection.clickhouse.dialect import ClickhouseDialect from onetl.connection.db_connection.jdbc_connection import JDBCConnection from onetl.connection.db_connection.jdbc_mixin import JDBCStatementType @@ -130,26 +131,29 @@ def get_packages( from onetl.connection import Clickhouse - Clickhouse.get_packages("0.7.1", "5.4") + Clickhouse.get_packages(package_version="0.7.1", apache_http_client_version="5.4") .. note:: - Spark does not support ``.jar`` classifiers, so it is not possible to pass - ``com.clickhouse:clickhouse-jdbc:0.6.0-all`` to install all required packages. - Dependencies are listed manually. + Spark does not support ``.jar`` classifiers, so it is not possible to pass + ``com.clickhouse:clickhouse-jdbc:0.6.0:all`` to install all required packages. """ - package_version = package_version or "0.6.0" - apache_http_client_version = apache_http_client_version or "5.3.1" + package_version_obj = Version(package_version) if package_version else Version("0.6.0") + apache_http_client_version_obj = ( + Version(apache_http_client_version) if apache_http_client_version else Version("5.3.1") + ) + if len(package_version_obj) != 3 or len(apache_http_client_version_obj) != 3: + raise ValueError("Version should consist of exactly three numeric_parts (major.minor.patch)") result = [ - f"com.clickhouse:clickhouse-jdbc:{package_version}", - f"com.clickhouse:clickhouse-http-client:{package_version}", + f"com.clickhouse:clickhouse-jdbc:{package_version_obj}", + f"com.clickhouse:clickhouse-http-client:{package_version_obj}", ] - if package_version >= "0.5.0": + if package_version_obj >= Version("0.5.0"): # before 0.5.0 builtin Java HTTP Client was used - result.append(f"org.apache.httpcomponents.client5:httpclient5:{apache_http_client_version}") + result.append(f"org.apache.httpcomponents.client5:httpclient5:{apache_http_client_version_obj}") return result diff --git a/tests/tests_integration/tests_db_connection_integration/test_clickhouse_integration.py b/tests/tests_integration/tests_db_connection_integration/test_clickhouse_integration.py index 27052a9f1..c786b1fe0 100644 --- a/tests/tests_integration/tests_db_connection_integration/test_clickhouse_integration.py +++ b/tests/tests_integration/tests_db_connection_integration/test_clickhouse_integration.py @@ -227,6 +227,8 @@ def table_finalizer(): updated_df = pandas.concat([updated_rows, unchanged_rows]) processing.assert_equal_df(df=df, other_frame=updated_df, order_by="id_int") + clickhouse.execute(f"UPDATE {temp_table} SET hwm_int = 1 WHERE id_int < 50{suffix}") + clickhouse.execute(f"ALTER TABLE {temp_table} DELETE WHERE id_int < 70{suffix}") df = clickhouse.fetch(f"SELECT * FROM {temp_table}{suffix}") assert df.count() diff --git a/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_clickhouse.py b/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_clickhouse.py index 478a8e380..77ec071b3 100644 --- a/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_clickhouse.py +++ b/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_clickhouse.py @@ -304,7 +304,7 @@ def test_clickhouse_strategy_incremental_explicit_hwm_type( ), ( "hwm_datetime", - "CAST(text_string AS DateTime64(6))", + "CAST(text_string AS DateTime)", ColumnDateTimeHWM, lambda x: x.isoformat(), ), diff --git a/tests/tests_unit/tests_db_connection_unit/test_clickhouse_unit.py b/tests/tests_unit/tests_db_connection_unit/test_clickhouse_unit.py index 708388417..00442f179 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_clickhouse_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_clickhouse_unit.py @@ -14,22 +14,59 @@ def test_clickhouse_package(): assert Clickhouse.package == expected_packages -def test_clickhouse_get_packages(): - expected_packages = [ - "com.clickhouse:clickhouse-jdbc:0.6.0", - "com.clickhouse:clickhouse-http-client:0.6.0", - "org.apache.httpcomponents.client5:httpclient5:5.3.1", - ] - assert Clickhouse.get_packages() == expected_packages - - -def test_clickhouse_get_packages_custom(): - expected_packages = [ - "com.clickhouse:clickhouse-jdbc:0.7.1", - "com.clickhouse:clickhouse-http-client:0.7.1", - "org.apache.httpcomponents.client5:httpclient5:5.4", - ] - assert Clickhouse.get_packages("0.7.1", "5.4") == expected_packages +@pytest.mark.parametrize( + "package_version, apache_http_client_version, expected_packages", + [ + ( + None, + None, + [ + "com.clickhouse:clickhouse-jdbc:0.6.0", + "com.clickhouse:clickhouse-http-client:0.6.0", + "org.apache.httpcomponents.client5:httpclient5:5.3.1", + ], + ), + ( + "0.7.1", + "0.6.0", + [ + "com.clickhouse:clickhouse-jdbc:0.7.1", + "com.clickhouse:clickhouse-http-client:0.7.1", + "org.apache.httpcomponents.client5:httpclient5:0.6.0", + ], + ), + ( + "0.6.0-patch3", + "5.3.1", + [ + "com.clickhouse:clickhouse-jdbc:0.6.0-patch3", + "com.clickhouse:clickhouse-http-client:0.6.0-patch3", + "org.apache.httpcomponents.client5:httpclient5:5.3.1", + ], + ), + ], +) +def test_clickhouse_get_packages(package_version, apache_http_client_version, expected_packages): + assert ( + Clickhouse.get_packages(package_version=package_version, apache_http_client_version=apache_http_client_version) + == expected_packages + ) + + +@pytest.mark.parametrize( + "package_version, apache_http_client_version", + [ + ("0.7", "5.3.1"), + ("1", "5.4.0"), + ("a.b.c", "5.3.1"), + ], +) +def test_invalid_versions_raise_error(package_version, apache_http_client_version): + with pytest.raises( + ValueError, + match=r"Version should consist of exactly three numeric_parts \(major\.minor\.patch\)", + ): + Clickhouse.get_packages(package_version=package_version, apache_http_client_version=apache_http_client_version) def test_clickhouse_missing_package(spark_no_packages): diff --git a/tests/util/assert_df.py b/tests/util/assert_df.py index f7adad032..e36c9af73 100644 --- a/tests/util/assert_df.py +++ b/tests/util/assert_df.py @@ -67,4 +67,4 @@ def assert_subset_df( for column in columns: # noqa: WPS528 difference = ~small_pdf[column].isin(large_pdf[column]) - assert not difference.all(), large_pdf[difference] + assert not difference.all(), small_pdf[difference] From a21bface2d80d924e2fb7a7e85ab9e583153eb4b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Fri, 12 Apr 2024 12:02:06 +0000 Subject: [PATCH 03/10] [DOP-13855] Fix Clickhouse HWM tests --- tests/fixtures/processing/clickhouse.py | 16 ++++++++++++++++ .../test_strategy_increment_clickhouse.py | 18 ++++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/tests/fixtures/processing/clickhouse.py b/tests/fixtures/processing/clickhouse.py index 1205fad6a..2b3e4cec1 100644 --- a/tests/fixtures/processing/clickhouse.py +++ b/tests/fixtures/processing/clickhouse.py @@ -152,3 +152,19 @@ def get_expected_dataframe( order_by: str | None = None, ) -> pandas.DataFrame: return self.connection.query_dataframe(self.get_expected_dataframe_ddl(schema, table, order_by)) + + def fix_pandas_df( + self, + df: pandas.DataFrame, + ) -> pandas.DataFrame: + df = super().fix_pandas_df(df) + + for column in df.columns: + column_name = column.lower() + + if "float" in column_name: + # somethere in chain Clickhouse -> Spark -> Pandas Float32 column is being converted to Float64, + # causing tests to fail. Convert it back to original type + df[column] = df[column].astype("float32") + + return df diff --git a/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_clickhouse.py b/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_clickhouse.py index 77ec071b3..30c040a39 100644 --- a/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_clickhouse.py +++ b/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_clickhouse.py @@ -308,6 +308,24 @@ def test_clickhouse_strategy_incremental_explicit_hwm_type( ColumnDateTimeHWM, lambda x: x.isoformat(), ), + ( + "hwm_datetime", + "CAST(text_string AS DateTime32)", + ColumnDateTimeHWM, + lambda x: x.isoformat(), + ), + ( + "hwm_datetime", + "CAST(text_string AS DateTime64)", + ColumnDateTimeHWM, + lambda x: x.isoformat(), + ), + ( + "hwm_datetime", + "CAST(text_string AS DateTime64(6))", + ColumnDateTimeHWM, + lambda x: x.isoformat(), + ), ], ) def test_clickhouse_strategy_incremental_with_hwm_expr( From 1029e7a25a16316b63562df5c9bf3bad0df8ca88 Mon Sep 17 00:00:00 2001 From: maxim-lixakov Date: Fri, 12 Apr 2024 13:59:18 +0300 Subject: [PATCH 04/10] [DOP-13855] - update clickhouse get_packages() tests --- onetl/connection/db_connection/clickhouse/connection.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/onetl/connection/db_connection/clickhouse/connection.py b/onetl/connection/db_connection/clickhouse/connection.py index f46a2e176..9fb2a254b 100644 --- a/onetl/connection/db_connection/clickhouse/connection.py +++ b/onetl/connection/db_connection/clickhouse/connection.py @@ -28,8 +28,8 @@ class Config: class Clickhouse(JDBCConnection): """Clickhouse JDBC connection. |support_hooks| - Based on Maven package `com.clickhouse:clickhouse-jdbc:0.6.0 `_ and additional dependencies - for `Apache HTTP client `_ when necessary (`official Clickhouse JDBC driver `_). + Based on Maven package `com.clickhouse:clickhouse-jdbc:0.6.0 `_ + (`official Clickhouse JDBC driver `_). .. warning:: @@ -131,7 +131,7 @@ def get_packages( from onetl.connection import Clickhouse - Clickhouse.get_packages(package_version="0.7.1", apache_http_client_version="5.4") + Clickhouse.get_packages(package_version="0.6.0", apache_http_client_version="5.3.1") .. note:: From 4ff10c80a656b142ca1915b3bedf028ddcd7e814 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Fri, 12 Apr 2024 12:10:47 +0000 Subject: [PATCH 05/10] [DOP-13855] Fix Clickhouse HWM tests --- .../test_strategy_increment_clickhouse.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_clickhouse.py b/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_clickhouse.py index 30c040a39..7b5deaa06 100644 --- a/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_clickhouse.py +++ b/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_clickhouse.py @@ -308,12 +308,6 @@ def test_clickhouse_strategy_incremental_explicit_hwm_type( ColumnDateTimeHWM, lambda x: x.isoformat(), ), - ( - "hwm_datetime", - "CAST(text_string AS DateTime32)", - ColumnDateTimeHWM, - lambda x: x.isoformat(), - ), ( "hwm_datetime", "CAST(text_string AS DateTime64)", From b8360a28caa28e22438b2aaab1a2ba3ad99ef30f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Fri, 12 Apr 2024 12:22:58 +0000 Subject: [PATCH 06/10] [DOP-13855] Fix Clickhouse HWM tests --- .../test_strategy_increment_clickhouse.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_clickhouse.py b/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_clickhouse.py index 7b5deaa06..77ec071b3 100644 --- a/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_clickhouse.py +++ b/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_clickhouse.py @@ -308,18 +308,6 @@ def test_clickhouse_strategy_incremental_explicit_hwm_type( ColumnDateTimeHWM, lambda x: x.isoformat(), ), - ( - "hwm_datetime", - "CAST(text_string AS DateTime64)", - ColumnDateTimeHWM, - lambda x: x.isoformat(), - ), - ( - "hwm_datetime", - "CAST(text_string AS DateTime64(6))", - ColumnDateTimeHWM, - lambda x: x.isoformat(), - ), ], ) def test_clickhouse_strategy_incremental_with_hwm_expr( From 1cce3d8c2dc6c1cd20f1153f908dbdb425f54b5d Mon Sep 17 00:00:00 2001 From: maxim-lixakov Date: Fri, 12 Apr 2024 15:51:41 +0300 Subject: [PATCH 07/10] [DOP-13855] - update clickhouse get_packages() tests --- .../test_clickhouse_unit.py | 32 +++++++++++++------ 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/tests/tests_unit/tests_db_connection_unit/test_clickhouse_unit.py b/tests/tests_unit/tests_db_connection_unit/test_clickhouse_unit.py index 00442f179..aaae7a0cf 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_clickhouse_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_clickhouse_unit.py @@ -26,15 +26,6 @@ def test_clickhouse_package(): "org.apache.httpcomponents.client5:httpclient5:5.3.1", ], ), - ( - "0.7.1", - "0.6.0", - [ - "com.clickhouse:clickhouse-jdbc:0.7.1", - "com.clickhouse:clickhouse-http-client:0.7.1", - "org.apache.httpcomponents.client5:httpclient5:0.6.0", - ], - ), ( "0.6.0-patch3", "5.3.1", @@ -44,6 +35,29 @@ def test_clickhouse_package(): "org.apache.httpcomponents.client5:httpclient5:5.3.1", ], ), + ( + "0.4.0", + "4.5.14", + ["com.clickhouse:clickhouse-jdbc:0.4.0", "com.clickhouse:clickhouse-http-client:0.4.0"], + ), # No HTTP client should be included + ( + "0.5.0", + "4.5.14", + [ + "com.clickhouse:clickhouse-jdbc:0.5.0", + "com.clickhouse:clickhouse-http-client:0.5.0", + "org.apache.httpcomponents.client5:httpclient5:4.5.14", + ], + ), + ( + "0.6.0", + "4.5.14", + [ + "com.clickhouse:clickhouse-jdbc:0.6.0", + "com.clickhouse:clickhouse-http-client:0.6.0", + "org.apache.httpcomponents.client5:httpclient5:4.5.14", + ], + ), ], ) def test_clickhouse_get_packages(package_version, apache_http_client_version, expected_packages): From 58771d13957db97545496912985a9ecfa7df403e Mon Sep 17 00:00:00 2001 From: maxim-lixakov Date: Fri, 12 Apr 2024 16:18:48 +0300 Subject: [PATCH 08/10] [DOP-13855] - fix Version.min_digits() method --- docs/changelog/next_release/249.breaking.rst | 2 +- docs/changelog/next_release/249.feature.rst | 1 + onetl/_util/version.py | 12 ++++++------ .../db_connection/clickhouse/connection.py | 6 ++---- .../connection/db_connection/greenplum/connection.py | 2 +- .../tests_db_connection_unit/test_clickhouse_unit.py | 2 +- .../tests_db_connection_unit/test_greenplum_unit.py | 2 +- 7 files changed, 13 insertions(+), 14 deletions(-) create mode 100644 docs/changelog/next_release/249.feature.rst diff --git a/docs/changelog/next_release/249.breaking.rst b/docs/changelog/next_release/249.breaking.rst index aa7bcd2e2..04764a1e6 100644 --- a/docs/changelog/next_release/249.breaking.rst +++ b/docs/changelog/next_release/249.breaking.rst @@ -1 +1 @@ -Updated the Clickhouse JDBC driver from ``ru.yandex.clickhouse:clickhouse-jdbc:0.3.2`` to `com.clickhouse:clickhouse-jdbc:0.6.0 `_. Added support for dynamic package versioning and dependencies on the `Apache HTTP client `_ as needed. +Updated the Clickhouse JDBC driver from ``ru.yandex.clickhouse:clickhouse-jdbc:0.3.2`` to `com.clickhouse:clickhouse-jdbc:0.6.0 `_. diff --git a/docs/changelog/next_release/249.feature.rst b/docs/changelog/next_release/249.feature.rst new file mode 100644 index 000000000..7d79a6c87 --- /dev/null +++ b/docs/changelog/next_release/249.feature.rst @@ -0,0 +1 @@ +Allow passing custom JDBC driver version to ``Clickhouse.get_packages(package_version=...)``. diff --git a/onetl/_util/version.py b/onetl/_util/version.py index cb9bce7af..24b924da2 100644 --- a/onetl/_util/version.py +++ b/onetl/_util/version.py @@ -195,17 +195,17 @@ def min_digits(self, num_parts: int) -> Version: >>> Version("5.6.7").min_digits(3) Version('5.6.7') >>> Version("5.6.7").min_digits(2) - Version('5.6') + Version('5.6.7') >>> Version("5.6").min_digits(3) Traceback (most recent call last): ... - ValueError: Version '5.6' does not have enough numeric components for requested format. + ValueError: Version '5.6' does not have enough numeric components for requested format (expected at least 3). """ if len(self._numeric_parts) < num_parts: - raise ValueError(f"Version '{self}' does not have enough numeric components for requested format.") - truncated_parts = self._numeric_parts[:num_parts] - truncated_str = ".".join(str(part) for part in truncated_parts) - return Version(truncated_str) + raise ValueError( + f"Version '{self}' does not have enough numeric components for requested format (expected at least {num_parts}).", + ) + return self def format(self, format_string: str) -> str: """ diff --git a/onetl/connection/db_connection/clickhouse/connection.py b/onetl/connection/db_connection/clickhouse/connection.py index 9fb2a254b..fa06d4daa 100644 --- a/onetl/connection/db_connection/clickhouse/connection.py +++ b/onetl/connection/db_connection/clickhouse/connection.py @@ -139,12 +139,10 @@ def get_packages( ``com.clickhouse:clickhouse-jdbc:0.6.0:all`` to install all required packages. """ - package_version_obj = Version(package_version) if package_version else Version("0.6.0") + package_version_obj = Version(package_version).min_digits(3) if package_version else Version("0.6.0") apache_http_client_version_obj = ( - Version(apache_http_client_version) if apache_http_client_version else Version("5.3.1") + Version(apache_http_client_version).min_digits(3) if apache_http_client_version else Version("5.3.1") ) - if len(package_version_obj) != 3 or len(apache_http_client_version_obj) != 3: - raise ValueError("Version should consist of exactly three numeric_parts (major.minor.patch)") result = [ f"com.clickhouse:clickhouse-jdbc:{package_version_obj}", diff --git a/onetl/connection/db_connection/greenplum/connection.py b/onetl/connection/db_connection/greenplum/connection.py index 431fd6022..6744816f8 100644 --- a/onetl/connection/db_connection/greenplum/connection.py +++ b/onetl/connection/db_connection/greenplum/connection.py @@ -212,7 +212,7 @@ def get_packages( scala_ver = Version(scala_version).min_digits(2) elif spark_version: spark_ver = Version(spark_version).min_digits(2) - if spark_ver > Version("3.2") or spark_ver < Version("2.3"): + if spark_ver >= Version("3.3") or spark_ver < Version("2.3"): raise ValueError(f"Spark version must be 2.3.x - 3.2.x, got {spark_ver}") scala_ver = get_default_scala_version(spark_ver) else: diff --git a/tests/tests_unit/tests_db_connection_unit/test_clickhouse_unit.py b/tests/tests_unit/tests_db_connection_unit/test_clickhouse_unit.py index aaae7a0cf..15388ee7e 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_clickhouse_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_clickhouse_unit.py @@ -78,7 +78,7 @@ def test_clickhouse_get_packages(package_version, apache_http_client_version, ex def test_invalid_versions_raise_error(package_version, apache_http_client_version): with pytest.raises( ValueError, - match=r"Version should consist of exactly three numeric_parts \(major\.minor\.patch\)", + match=rf"Version '{package_version}' does not have enough numeric components for requested format \(expected at least 3\).", ): Clickhouse.get_packages(package_version=package_version, apache_http_client_version=apache_http_client_version) diff --git a/tests/tests_unit/tests_db_connection_unit/test_greenplum_unit.py b/tests/tests_unit/tests_db_connection_unit/test_greenplum_unit.py index c1cb13804..de24e5ce2 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_greenplum_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_greenplum_unit.py @@ -66,7 +66,7 @@ def test_greenplum_get_packages_scala_version_not_supported(scala_version): ("2.3", "2.11", "io.pivotal:greenplum-spark_2.11:2.2.0"), ("2.4", "2.12", "io.pivotal:greenplum-spark_2.12:2.2.0"), # Scala version contain three digits when only two needed - ("3.2.4", "2.12.1", "io.pivotal:greenplum-spark_2.12:2.2.0"), + ("3.2.4", "2.11.1", "io.pivotal:greenplum-spark_2.11:2.2.0"), ], ) def test_greenplum_get_packages(spark_version, scala_version, package): From 14474957ac9c82923cdc6e2a99ad1a1b05fd08ce Mon Sep 17 00:00:00 2001 From: maxim-lixakov Date: Mon, 15 Apr 2024 14:32:53 +0300 Subject: [PATCH 09/10] [DOP-13855] - update numeric mapping --- .../db_connection/clickhouse/types.rst | 28 +++++++++---------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/docs/connection/db_connection/clickhouse/types.rst b/docs/connection/db_connection/clickhouse/types.rst index 1df369dd4..cbdaf7863 100644 --- a/docs/connection/db_connection/clickhouse/types.rst +++ b/docs/connection/db_connection/clickhouse/types.rst @@ -125,11 +125,9 @@ Numeric types ~~~~~~~~~~~~~ +--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ -| Clickhouse type (read) | Spark type | Clickhousetype (write) | Clickhouse type (create) | +| Clickhouse type (read) | Spark type | Clickhouse type (write) | Clickhouse type (create) | +================================+===================================+===============================+===============================+ -| ``Bool`` | ``IntegerType()`` | ``Int32`` | ``Int32`` | -+--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ -| ``-`` | ``BooleanType()`` | ``UInt64`` | ``UInt64`` | +| ``Bool`` | ``BooleanType()`` | ``UInt64`` | ``UInt64`` | +--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ | ``Decimal`` | ``DecimalType(P=10, S=0)`` | ``Decimal(P=10, S=0)`` | ``Decimal(P=10, S=0)`` | +--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ @@ -145,13 +143,13 @@ Numeric types +--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ | ``Decimal128(S=0..38)`` | ``DecimalType(P=38, S=0..38)`` | ``Decimal(P=38, S=0..38)`` | ``Decimal(P=38, S=0..38)`` | +--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ -| ``Decimal256(S=0..76)`` | unsupported [3]_ | | | +| ``Decimal256(S=0..75)`` | unsupported [3]_ | | | +--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ -| ``Float32`` | ``DoubleType()`` | ``Float64`` | ``Float64`` | -+--------------------------------+ | | | -| ``Float64`` | | | | +| ``Decimal256(76)`` | ``DecimalType(P=38, S=38)`` | ``DecimalType(P=38, S=38)`` | ``DecimalType(P=38, S=38)`` | +--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ -| ``-`` | ``FloatType()`` | ``Float32`` | ``Float32`` | +| ``Float32`` | ``FloatType()`` | ``Float32`` | ``Float32`` | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``Float64`` | ``DoubleType()`` | ``Float64`` | ``Float64`` | +--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ | ``Int8`` | ``IntegerType()`` | ``Int32`` | ``Int32`` | +--------------------------------+ | | | @@ -161,7 +159,7 @@ Numeric types +--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ | ``Int64`` | ``LongType()`` | ``Int64`` | ``Int64`` | +--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ -| ``Int128`` | ``DecimalType(20,0)`` | ``Decimal(20,0)`` | ``Decimal(20,0)`` | +| ``Int128`` | unsupported [3]_ | | | +--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ | ``Int256`` | unsupported [3]_ | | | +--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ @@ -170,16 +168,16 @@ Numeric types | ``-`` | ``ShortType()`` | ``Int32`` | ``Int32`` | +--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ | ``UInt8`` | ``IntegerType()`` | ``Int32`` | ``Int32`` | -+--------------------------------+ | | | -| ``UInt16`` | | | | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``UInt16`` | ``LongType()`` | ``Int64`` | ``Int64`` | +--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ | ``UInt32`` | ``DecimalType(20,0)`` | ``Decimal(20,0)`` | ``Decimal(20,0)`` | +--------------------------------+ | | | | ``UInt64`` | | | | -+--------------------------------+ | | | -| ``UInt128`` | | | | +--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ -| ``UInt256`` | unsupported [3]_ | | | +| ``UInt128`` | unsupported [3]_ | | | ++--------------------------------+ | | | +| ``UInt256`` | | | | +--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ .. [3] From a4f1678e1f37a3b3e7ad29a3a7682dda2a9b9118 Mon Sep 17 00:00:00 2001 From: maxim-lixakov Date: Mon, 15 Apr 2024 14:52:40 +0300 Subject: [PATCH 10/10] [DOP-13855] - update temporal mapping --- .../db_connection/clickhouse/types.rst | 39 +++++++++---------- 1 file changed, 18 insertions(+), 21 deletions(-) diff --git a/docs/connection/db_connection/clickhouse/types.rst b/docs/connection/db_connection/clickhouse/types.rst index cbdaf7863..b0171b503 100644 --- a/docs/connection/db_connection/clickhouse/types.rst +++ b/docs/connection/db_connection/clickhouse/types.rst @@ -127,7 +127,7 @@ Numeric types +--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ | Clickhouse type (read) | Spark type | Clickhouse type (write) | Clickhouse type (create) | +================================+===================================+===============================+===============================+ -| ``Bool`` | ``BooleanType()`` | ``UInt64`` | ``UInt64`` | +| ``Bool`` | ``BooleanType()`` | ``UInt64`` | ``UInt64`` | +--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ | ``Decimal`` | ``DecimalType(P=10, S=0)`` | ``Decimal(P=10, S=0)`` | ``Decimal(P=10, S=0)`` | +--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ @@ -143,9 +143,7 @@ Numeric types +--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ | ``Decimal128(S=0..38)`` | ``DecimalType(P=38, S=0..38)`` | ``Decimal(P=38, S=0..38)`` | ``Decimal(P=38, S=0..38)`` | +--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ -| ``Decimal256(S=0..75)`` | unsupported [3]_ | | | -+--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ -| ``Decimal256(76)`` | ``DecimalType(P=38, S=38)`` | ``DecimalType(P=38, S=38)`` | ``DecimalType(P=38, S=38)`` | +| ``Decimal256(S=0..76)`` | unsupported [3]_ | | | +--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ | ``Float32`` | ``FloatType()`` | ``Float32`` | ``Float32`` | +--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ @@ -196,33 +194,32 @@ Notes: * ``TIMESTAMP`` is alias for ``DateTime32``, but ``TIMESTAMP(N)`` is alias for ``DateTime64(N)`` +-----------------------------------+--------------------------------------+----------------------------------+-------------------------------+ -| Clickhouse type (read) | Spark type | Clickhousetype (write) | Clickhouse type (create) | +| Clickhouse type (read) | Spark type | Clickhouse type (write) | Clickhouse type (create) | +===================================+======================================+==================================+===============================+ | ``Date`` | ``DateType()`` | ``Date`` | ``Date`` | +-----------------------------------+--------------------------------------+----------------------------------+-------------------------------+ -| ``Date32`` | unsupported | | | -+-----------------------------------+--------------------------------------+----------------------------------+-------------------------------+ -| ``DateTime32``, seconds | ``TimestampType()``, microseconds | ``DateTime64(6)``, microseconds | ``DateTime32``, seconds, | +| ``Date32`` | ``DateType()`` | ``Date`` | ``Date`` | +| | | | **cannot be inserted** [6]_ | +-----------------------------------+--------------------------------------+----------------------------------+-------------------------------+ -| ``DateTime64(3)``, milliseconds | ``TimestampType()``, microseconds | ``DateTime64(6)``, microseconds | ``DateTime32``, seconds, | -| | | | **precision loss** [5]_ | -| | | | | -+-----------------------------------+--------------------------------------+----------------------------------+-------------------------------+ -| ``DateTime64(6)``, microseconds | ``TimestampType()``, microseconds | ``DateTime64(6)``, microseconds | ``DateTime32``, seconds, | -+-----------------------------------+--------------------------------------+----------------------------------+ **cannot be inserted** [6]_ | -| ``DateTime64(7..9)``, nanoseconds | ``TimestampType()``, microseconds, | ``DateTime64(6)``, microseconds, | | -| | **precision loss** [4]_ | **precision loss** [4]_ | | -| | | | | +| ``DateTime32``, seconds | ``TimestampType()`` | ``DateTime64(6)``, microseconds | ``DateTime32`` | ++-----------------------------------+--------------------------------------+----------------------------------+ seconds | +| ``DateTime64(3)``, milliseconds | ``TimestampType()`` | ``DateTime64(6)``, microseconds | **precision loss** [4]_ | ++-----------------------------------+--------------------------------------+----------------------------------+ | +| ``DateTime64(6)``, microseconds | ``TimestampType()`` | ``DateTime64(6)``, microseconds | | +-----------------------------------+--------------------------------------+----------------------------------+ | -| ``-`` | ``TimestampNTZType()``, microseconds | ``DateTime64(6)`` | | +| ``DateTime64(7..9)``, nanoseconds | ``TimestampType()`` | ``DateTime64(6)`` | | +| | | microseconds | | +| | | **precision loss** [4]_ | | +-----------------------------------+--------------------------------------+----------------------------------+-------------------------------+ -| ``IntervalNanosecond`` | unsupported | | | +| ``-`` | ``TimestampNTZType()`` | ``DateTime64(6)`` | | ++-----------------------------------+--------------------------------------+----------------------------------+-------------------------------+ +| ``IntervalNanosecond`` | ``LongType()`` | ```Int64`` | ``Int64`` | +-----------------------------------+ | | | | ``IntervalMicrosecond`` | | | | +-----------------------------------+ | | | | ``IntervalMillisecond`` | | | | -+-----------------------------------+--------------------------------------+----------------------------------+-------------------------------+ -| ``IntervalSecond`` | ``IntegerType()`` | ``Int32`` | ``Int32`` | ++-----------------------------------+ | | | +| ``IntervalSecond`` | | | | +-----------------------------------+ | | | | ``IntervalMinute`` | | | | +-----------------------------------+ | | |