
Commit

Merge branch 'develop'
dolfinus committed Oct 28, 2024
2 parents: 5a0fead + 493f8af, commit 92349e2
Showing 60 changed files with 403 additions and 135 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/data/clickhouse/matrix.yml
@@ -11,7 +11,7 @@ min: &min
max: &max
clickhouse-image: clickhouse/clickhouse-server
clickhouse-version: 24.8.2.3-alpine
- spark-version: 3.5.2
+ spark-version: 3.5.3
pydantic-version: 2
python-version: '3.12'
java-version: 20
2 changes: 1 addition & 1 deletion .github/workflows/data/core/matrix.yml
@@ -6,7 +6,7 @@ min: &min
os: ubuntu-latest

max: &max
- spark-version: 3.5.2
+ spark-version: 3.5.3
pydantic-version: 2
python-version: '3.12'
java-version: 20
2 changes: 1 addition & 1 deletion .github/workflows/data/hdfs/matrix.yml
@@ -8,7 +8,7 @@ min: &min

max: &max
hadoop-version: hadoop3-hdfs
- spark-version: 3.5.2
+ spark-version: 3.5.3
pydantic-version: 2
python-version: '3.12'
java-version: 20
2 changes: 1 addition & 1 deletion .github/workflows/data/hive/matrix.yml
@@ -6,7 +6,7 @@ min: &min
os: ubuntu-latest

max: &max
- spark-version: 3.5.2
+ spark-version: 3.5.3
pydantic-version: 2
python-version: '3.12'
java-version: 20
2 changes: 1 addition & 1 deletion .github/workflows/data/kafka/matrix.yml
@@ -12,7 +12,7 @@ min: &min
max: &max
kafka-version: 3.7.1
pydantic-version: 2
- spark-version: 3.5.2
+ spark-version: 3.5.3
python-version: '3.12'
java-version: 20
os: ubuntu-latest
2 changes: 1 addition & 1 deletion .github/workflows/data/mongodb/matrix.yml
@@ -9,7 +9,7 @@ min: &min

max: &max
mongodb-version: 7.0.14
- spark-version: 3.5.2
+ spark-version: 3.5.3
pydantic-version: 2
python-version: '3.12'
java-version: 20
8 changes: 4 additions & 4 deletions .github/workflows/data/mssql/matrix.yml
@@ -1,14 +1,14 @@
min: &min
- mssql-version: 2017-GA-ubuntu
+ mssql-version: 2017-latest
spark-version: 2.3.1
pydantic-version: 1
python-version: '3.7'
java-version: 8
- os: ubuntu-latest
+ os: ubuntu-20.04

max: &max
- mssql-version: 2022-CU14-ubuntu-22.04
- spark-version: 3.5.2
+ mssql-version: 2022-latest
+ spark-version: 3.5.3
pydantic-version: 2
python-version: '3.12'
java-version: 20
2 changes: 1 addition & 1 deletion .github/workflows/data/mysql/matrix.yml
@@ -10,7 +10,7 @@ min: &min

max: &max
mysql-version: 9.0.1
- spark-version: 3.5.2
+ spark-version: 3.5.3
pydantic-version: 2
python-version: '3.12'
java-version: 20
2 changes: 1 addition & 1 deletion .github/workflows/data/oracle/matrix.yml
@@ -12,7 +12,7 @@ max: &max
oracle-image: gvenzl/oracle-free
oracle-version: 23.4-slim-faststart
db-name: FREEPDB1
- spark-version: 3.5.2
+ spark-version: 3.5.3
pydantic-version: 2
python-version: '3.12'
java-version: 20
2 changes: 1 addition & 1 deletion .github/workflows/data/postgres/matrix.yml
@@ -9,7 +9,7 @@ min: &min

max: &max
postgres-version: 16.4-alpine
- spark-version: 3.5.2
+ spark-version: 3.5.3
pydantic-version: 2
python-version: '3.12'
java-version: 20
2 changes: 1 addition & 1 deletion .github/workflows/data/s3/matrix.yml
@@ -10,7 +10,7 @@ min: &min

max: &max
minio-version: 2024.8.29
- spark-version: 3.5.2
+ spark-version: 3.5.3
pydantic-version: 2
python-version: '3.12'
java-version: 20
2 changes: 1 addition & 1 deletion .github/workflows/data/teradata/matrix.yml
@@ -1,5 +1,5 @@
max: &max
- spark-version: 3.5.2
+ spark-version: 3.5.3
pydantic-version: 2
python-version: '3.12'
java-version: 20
10 changes: 5 additions & 5 deletions .pre-commit-config.yaml
@@ -3,7 +3,7 @@ default_language_version:

repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
- rev: v4.6.0
+ rev: v5.0.0
hooks:
- id: check-ast
- id: check-case-conflict
@@ -90,7 +90,7 @@ repos:
- id: text-unicode-replacement-char

- repo: https://github.com/asottile/pyupgrade
- rev: v3.17.0
+ rev: v3.18.0
hooks:
- id: pyupgrade
args: [--py37-plus, --keep-runtime-typing]
@@ -101,20 +101,20 @@ repos:
- id: add-trailing-comma

- repo: https://github.com/psf/black
- rev: 24.8.0
+ rev: 24.10.0
hooks:
- id: black
language_version: python3

- repo: https://github.com/asottile/blacken-docs
- rev: 1.18.0
+ rev: 1.19.0
hooks:
- id: blacken-docs
additional_dependencies:
- black==24.4.2

- repo: https://github.com/pycqa/bandit
- rev: 1.7.9
+ rev: 1.7.10
hooks:
- id: bandit
args:
2 changes: 1 addition & 1 deletion CONTRIBUTING.rst
@@ -71,7 +71,7 @@ Create virtualenv and install dependencies:
-r requirements/tests/postgres.txt \
-r requirements/tests/oracle.txt \
-r requirements/tests/pydantic-2.txt \
- -r requirements/tests/spark-3.5.2.txt
+ -r requirements/tests/spark-3.5.3.txt
# TODO: remove after https://github.com/zqmillet/sphinx-plantuml/pull/4
pip install sphinx-plantuml --no-deps
6 changes: 3 additions & 3 deletions README.rst
@@ -184,7 +184,7 @@ Compatibility matrix
+--------------------------------------------------------------+-------------+-------------+-------+
| `3.4.x <https://spark.apache.org/docs/3.4.3/#downloading>`_ | 3.7 - 3.12 | 8u362 - 20 | 2.12 |
+--------------------------------------------------------------+-------------+-------------+-------+
- | `3.5.x <https://spark.apache.org/docs/3.5.2/#downloading>`_  | 3.8 - 3.12  | 8u371 - 20  | 2.12  |
+ | `3.5.x <https://spark.apache.org/docs/3.5.3/#downloading>`_  | 3.8 - 3.12  | 8u371 - 20  | 2.12  |
+--------------------------------------------------------------+-------------+-------------+-------+

.. _pyspark-install:
@@ -199,7 +199,7 @@ or install PySpark explicitly:

.. code:: bash
- pip install onetl pyspark==3.5.2 # install a specific PySpark version
+ pip install onetl pyspark==3.5.3 # install a specific PySpark version
or inject PySpark to ``sys.path`` in some other way BEFORE creating a class instance.
**Otherwise connection object cannot be created.**
@@ -540,7 +540,7 @@ Read files directly from S3 path, convert them to dataframe, transform it and th
setup_logging()
# Initialize new SparkSession with Hadoop AWS libraries and Postgres driver loaded
- maven_packages = SparkS3.get_packages(spark_version="3.5.2") + Postgres.get_packages()
+ maven_packages = SparkS3.get_packages(spark_version="3.5.3") + Postgres.get_packages()
spark = (
SparkSession.builder.appName("spark_app_onetl_demo")
.config("spark.jars.packages", ",".join(maven_packages))
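The README hunk above also mentions injecting PySpark into ``sys.path`` instead of installing it. As an editorial illustration only (not part of this commit), a minimal sketch of that route, assuming a local Spark distribution reachable via ``SPARK_HOME`` and the optional ``findspark`` helper package:

.. code:: python

# Hedged sketch: make an existing Spark distribution importable BEFORE any
# onETL connection class is instantiated. Assumes SPARK_HOME is set and the
# `findspark` package is installed; otherwise adjust sys.path manually.
import findspark

findspark.init()  # prepends $SPARK_HOME/python (and its bundled py4j) to sys.path

import pyspark  # now importable without `pip install pyspark`

print(pyspark.__version__)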
2 changes: 1 addition & 1 deletion docker-compose.yml
@@ -9,7 +9,7 @@ services:
context: .
target: base
args:
- SPARK_VERSION: 3.5.2
+ SPARK_VERSION: 3.5.3
env_file: .env.docker
volumes:
- ./:/app/
2 changes: 1 addition & 1 deletion docker/Dockerfile
@@ -44,7 +44,7 @@ ENV PATH=${ONETL_USER_HOME}/.local/bin:${PATH}
COPY --chown=onetl:onetl ./run_tests.sh ./pytest_runner.sh ./combine_coverage.sh /app/
RUN chmod +x /app/run_tests.sh /app/pytest_runner.sh /app/combine_coverage.sh

- ARG SPARK_VERSION=3.5.2
+ ARG SPARK_VERSION=3.5.3
# Spark is heavy, and version change is quite rare
COPY --chown=onetl:onetl ./requirements/tests/spark-${SPARK_VERSION}.txt /app/requirements/tests/
RUN pip install -r /app/requirements/tests/spark-${SPARK_VERSION}.txt
23 changes: 23 additions & 0 deletions docs/changelog/0.12.1.rst
@@ -0,0 +1,23 @@
0.12.1 (2024-10-28)
===================

Features
--------

- Log detected JDBC dialect while using ``DBWriter``.


Bug Fixes
---------

- Fix ``SparkMetricsRecorder`` failing when receiving ``SparkListenerTaskEnd`` without ``taskMetrics`` (e.g. executor was killed by OOM). (:github:pull:`313`)
- Call ``kinit`` before checking for HDFS active namenode.
- Wrap ``kinit`` with ``threading.Lock`` to avoid multithreading issues.
- Immediately show ``kinit`` errors to user, instead of hiding them.
- Use ``AttributeError`` instead of ``ImportError`` in module's ``__getattr__`` method, to make code compliant with Python spec.


Doc only Changes
----------------

- Add note about `spark-dialect-extension <https://github.com/MobileTeleSystems/spark-dialect-extension>`_ package to Clickhouse connector documentation. (:github:pull:`310`)
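The last bug-fix entry in the new changelog refers to Python's module-level ``__getattr__`` hook (PEP 562), which is expected to raise ``AttributeError`` for unknown names so that ``getattr``/``hasattr`` behave per the language spec. A minimal, illustrative sketch of that pattern (editorial addition; names are hypothetical, not onETL's actual code):

.. code:: python

from importlib import import_module

# Hypothetical mapping of lazily exposed names to the modules that provide them.
_LAZY_ATTRS = {"DBWriter": "onetl.db"}

def __getattr__(name: str):
    if name in _LAZY_ATTRS:
        return getattr(import_module(_LAZY_ATTRS[name]), name)
    # Unknown names must raise AttributeError (not ImportError),
    # so hasattr()/getattr() with a default keep working as expected.
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")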
1 change: 1 addition & 0 deletions docs/changelog/index.rst
@@ -3,6 +3,7 @@
:caption: Changelog

DRAFT
0.12.1
0.12.0
0.11.2
0.11.1
14 changes: 12 additions & 2 deletions docs/connection/db_connection/clickhouse/types.rst
@@ -3,6 +3,16 @@
Clickhouse <-> Spark type mapping
=================================

.. note::

The results below are valid for Spark 3.5.3, and may differ on other Spark versions.

.. note::

It is recommended to use `spark-dialect-extension <https://github.com/MobileTeleSystems/spark-dialect-extension>`_ package,
which implements writing Arrays from Spark to Clickhouse, fixes dropping fractions of seconds in ``TimestampType``,
and fixes other type conversion issues.

Type detection & casting
------------------------

@@ -106,8 +116,8 @@ References
Here you can find source code with type conversions:

* `Clickhouse -> JDBC <https://github.com/ClickHouse/clickhouse-java/blob/0.3.2/clickhouse-jdbc/src/main/java/com/clickhouse/jdbc/JdbcTypeMapping.java#L39-L176>`_
- * `JDBC -> Spark <https://github.com/apache/spark/blob/v3.5.2/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala#L307>`_
- * `Spark -> JDBC <https://github.com/apache/spark/blob/v3.5.2/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala#L141-L164>`_
+ * `JDBC -> Spark <https://github.com/apache/spark/blob/v3.5.3/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala#L307>`_
+ * `Spark -> JDBC <https://github.com/apache/spark/blob/v3.5.3/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala#L141-L164>`_
* `JDBC -> Clickhouse <https://github.com/ClickHouse/clickhouse-java/blob/0.3.2/clickhouse-jdbc/src/main/java/com/clickhouse/jdbc/JdbcTypeMapping.java#L185-L311>`_

Supported types
2 changes: 1 addition & 1 deletion docs/connection/db_connection/greenplum/prerequisites.rst
@@ -93,7 +93,7 @@ Number of connections can be limited by 2 ways:

spark = (
SparkSession.builder
- # Spark will start EXACTLY 5 executors with 1 core each, so max number of parallel jobs is 10
+ # Spark will run with 5 threads in local mode, allowing up to 5 parallel tasks
.config("spark.master", "local[5]")
.config("spark.executor.cores", 1)
).getOrCreate()
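The corrected comment above reflects that ``local[5]`` gives the driver 5 task threads rather than 5 separate executors, and each concurrently running Greenplum task holds one connection, so parallel connections stay at 5 or fewer. For comparison, a hedged editorial sketch (not from this commit) of capping parallelism the same way on a cluster by pinning executor counts, assuming dynamic allocation is disabled:

.. code:: python

from pyspark.sql import SparkSession

# Hedged sketch: 5 executors x 1 core each => at most 5 tasks run at once,
# which also caps the number of simultaneous Greenplum connections at 5.
spark = (
    SparkSession.builder
    .config("spark.master", "yarn")
    .config("spark.dynamicAllocation.enabled", "false")
    .config("spark.executor.instances", 5)
    .config("spark.executor.cores", 1)
).getOrCreate()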
4 changes: 4 additions & 0 deletions docs/connection/db_connection/greenplum/types.rst
@@ -3,6 +3,10 @@
Greenplum <-> Spark type mapping
=================================

.. note::

The results below are valid for Spark 3.2.4, and may differ on other Spark versions.

Type detection & casting
------------------------

24 changes: 11 additions & 13 deletions docs/connection/db_connection/hive/write.rst
@@ -19,27 +19,25 @@ Examples
df = ... # data is here
# Create dataframe with specific number of Spark partitions.
- # Use the Hive partitioning columns to group the data. Create only 20 files per Hive partition.
- # Also sort the data by column which most data is correlated with, reducing files size.
+ # Use the Hive partitioning columns to group the data. Create max 20 files per Hive partition.
+ # Also sort the data by column which most data is correlated with (e.g. user_id), reducing files size.
+ num_files_per_partition = 20
+ partition_columns = ["country", "business_date"]
+ sort_columns = ["user_id"]
write_df = df.repartition(
- 20,
- "country",
- "business_date",
- "user_id",
- ).sortWithinPartitions(
- "country",
- "business_date",
- "user_id",
- )
+ num_files_per_partition,
+ *partition_columns,
+ *sort_columns,
+ ).sortWithinPartitions(*partition_columns, *sort_columns)
writer = DBWriter(
connection=hive,
target="schema.table",
options=Hive.WriteOptions(
if_exists="append",
# Hive partitioning columns.
# ``user_id`` column is not included, as it has a lot of distinct values.
- partitionBy=["country", "business_date"],
+ partitionBy=partition_columns,
),
)
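The rewritten example above relies on ``repartition(numPartitions, *cols)``: rows are hash-partitioned by the listed columns into ``numPartitions`` Spark partitions, so each Hive partition's rows end up in at most that many Spark partitions and hence at most that many files on write. A short editorial sketch (not part of the commit) making the cap visible, reusing ``df`` from the example:

.. code:: python

# Hedged sketch: with 20 target partitions, rows of any single
# (country, business_date) Hive partition are spread over at most
# 20 Spark partitions => at most 20 files per Hive partition on write.
write_df = df.repartition(20, "country", "business_date", "user_id")

print(write_df.rdd.getNumPartitions())  # 20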
4 changes: 4 additions & 0 deletions docs/connection/db_connection/mongodb/types.rst
@@ -3,6 +3,10 @@
MongoDB <-> Spark type mapping
==============================

.. note::

The results below are valid for Spark 3.5.3, and may differ on other Spark versions.

Type detection & casting
------------------------

10 changes: 7 additions & 3 deletions docs/connection/db_connection/mssql/types.rst
@@ -1,7 +1,11 @@
.. _mssql-types:

MSSQL <-> Spark type mapping
- =================================
+ ============================

.. note::

The results below are valid for Spark 3.5.3, and may differ on other Spark versions.

Type detection & casting
------------------------
@@ -101,8 +105,8 @@ References
Here you can find source code with type conversions:

* `MSSQL -> JDBC <https://github.com/microsoft/mssql-jdbc/blob/v12.2.0/src/main/java/com/microsoft/sqlserver/jdbc/SQLServerResultSetMetaData.java#L117-L170>`_
- * `JDBC -> Spark <https://github.com/apache/spark/blob/v3.5.2/sql/core/src/main/scala/org/apache/spark/sql/jdbc/MsSqlServerDialect.scala#L117-L134>`_
- * `Spark -> JDBC <https://github.com/apache/spark/blob/v3.5.2/sql/core/src/main/scala/org/apache/spark/sql/jdbc/MsSqlServerDialect.scala#L136-L145>`_
+ * `JDBC -> Spark <https://github.com/apache/spark/blob/v3.5.3/sql/core/src/main/scala/org/apache/spark/sql/jdbc/MsSqlServerDialect.scala#L117-L134>`_
+ * `Spark -> JDBC <https://github.com/apache/spark/blob/v3.5.3/sql/core/src/main/scala/org/apache/spark/sql/jdbc/MsSqlServerDialect.scala#L136-L145>`_
* `JDBC -> MSSQL <https://github.com/microsoft/mssql-jdbc/blob/v12.2.0/src/main/java/com/microsoft/sqlserver/jdbc/DataTypes.java#L625-L676>`_

Supported types
10 changes: 7 additions & 3 deletions docs/connection/db_connection/mysql/types.rst
@@ -1,7 +1,11 @@
.. _mysql-types:

MySQL <-> Spark type mapping
- =================================
+ ============================

.. note::

The results below are valid for Spark 3.5.3, and may differ on other Spark versions.

Type detection & casting
------------------------
@@ -97,8 +101,8 @@ References
Here you can find source code with type conversions:

* `MySQL -> JDBC <https://github.com/mysql/mysql-connector-j/blob/8.0.33/src/main/core-api/java/com/mysql/cj/MysqlType.java#L44-L623>`_
- * `JDBC -> Spark <https://github.com/apache/spark/blob/v3.5.2/sql/core/src/main/scala/org/apache/spark/sql/jdbc/MySQLDialect.scala#L104-L132>`_
- * `Spark -> JDBC <https://github.com/apache/spark/blob/v3.5.2/sql/core/src/main/scala/org/apache/spark/sql/jdbc/MySQLDialect.scala#L204-L211>`_
+ * `JDBC -> Spark <https://github.com/apache/spark/blob/v3.5.3/sql/core/src/main/scala/org/apache/spark/sql/jdbc/MySQLDialect.scala#L104-L132>`_
+ * `Spark -> JDBC <https://github.com/apache/spark/blob/v3.5.3/sql/core/src/main/scala/org/apache/spark/sql/jdbc/MySQLDialect.scala#L204-L211>`_
* `JDBC -> MySQL <https://github.com/mysql/mysql-connector-j/blob/8.0.33/src/main/core-api/java/com/mysql/cj/MysqlType.java#L625-L867>`_

Supported types
(Diff truncated; the remaining changed files are not shown here.)
