diff --git a/.github/ISSUE_TEMPLATE/bug-report.yml b/.github/ISSUE_TEMPLATE/bug-report.yml
index 39f5b87900e5..5892a2677fca 100644
--- a/.github/ISSUE_TEMPLATE/bug-report.yml
+++ b/.github/ISSUE_TEMPLATE/bug-report.yml
@@ -90,10 +90,10 @@ body:
- type: textarea
attributes:
- label: Flink or Spark Version
- description: Provide Flink or Spark Version.
+ label: Zeta or Flink or Spark Version
+ description: Provide Zeta or Flink or Spark Version.
placeholder: >
- Please provide the version of Flink or Spark.
+ Please provide the version of Zeta or Flink or Spark.
validations:
required: false
diff --git a/.github/workflows/backend.yml b/.github/workflows/backend.yml
index fbe37acece53..13a4d4b52d9d 100644
--- a/.github/workflows/backend.yml
+++ b/.github/workflows/backend.yml
@@ -564,7 +564,7 @@ jobs:
matrix:
java: [ '8', '11' ]
os: [ 'ubuntu-latest' ]
- timeout-minutes: 90
+ timeout-minutes: 150
steps:
- uses: actions/checkout@v2
- name: Set up JDK ${{ matrix.java }}
@@ -736,6 +736,30 @@ jobs:
env:
MAVEN_OPTS: -Xmx4096m
+ jdbc-connectors-it-part-4:
+ needs: [ changes, sanity-check ]
+ if: needs.changes.outputs.api == 'true'
+ runs-on: ${{ matrix.os }}
+ strategy:
+ matrix:
+ java: [ '8', '11' ]
+ os: [ 'ubuntu-latest' ]
+ timeout-minutes: 90
+ steps:
+ - uses: actions/checkout@v2
+ - name: Set up JDK ${{ matrix.java }}
+ uses: actions/setup-java@v3
+ with:
+ java-version: ${{ matrix.java }}
+ distribution: 'temurin'
+ cache: 'maven'
+ - name: run jdbc connectors integration test (part-4)
+ if: needs.changes.outputs.api == 'true'
+ run: |
+ ./mvnw -B -T 1C verify -DskipUT=true -DskipIT=false -D"license.skipAddThirdParty"=true --no-snapshot-updates -pl :connector-jdbc-e2e-part-4 -am -Pci
+ env:
+ MAVEN_OPTS: -Xmx4096m
+
kafka-connector-it:
needs: [ changes, sanity-check ]
if: needs.changes.outputs.api == 'true'
diff --git a/.github/workflows/documents.yml b/.github/workflows/documents.yml
index 3a97a3a7de28..7c3c56cf07b7 100644
--- a/.github/workflows/documents.yml
+++ b/.github/workflows/documents.yml
@@ -52,3 +52,14 @@ jobs:
cd seatunnel-website
npm install
npm run build
+
+ code-style:
+ name: Code style
+ runs-on: ubuntu-latest
+ timeout-minutes: 10
+ steps:
+ - uses: actions/checkout@v3
+ with:
+ submodules: true
+ - name: Check code style
+ run: ./mvnw --batch-mode --quiet --no-snapshot-updates clean spotless:check
diff --git a/LICENSE b/LICENSE
index bd06a03806b3..adabba50de63 100644
--- a/LICENSE
+++ b/LICENSE
@@ -219,6 +219,7 @@ seatunnel-connectors-v2/connector-cdc/connector-base/src/main/java/org/apache/se
seatunnel-connectors-v2/connector-cdc/connector-cdc-mysql/src/main/java/org/apache/seatunnel/connectors/seatunnel/cdc/mysql from https://github.com/ververica/flink-cdc-connectors
seatunnel-connectors-v2/connector-cdc/connector-base/src/main/java/org/apache/seatunnel/connectors/cdc/debezium from https://github.com/ververica/flink-cdc-connectors
seatunnel-connectors-v2/connector-cdc/connector-cdc-sqlserver/src/main/java/io/debezium/connector/sqlserver/SqlServerStreamingChangeEventSource.java from https://github.com/debezium/debezium
+seatunnel-connectors-v2/connector-cdc/connector-cdc-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/cdc/mongodb from https://github.com/ververica/flink-cdc-connectors
generate_client_protocol.sh from https://github.com/hazelcast/hazelcast
seatunnel-engine/seatunnel-engine-common/src/main/java/org/apache/seatunnel/engine/common/utils/ExceptionUtil.java from https://github.com/hazelcast/hazelcast
seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/protocol/task/AbstractSeaTunnelMessageTask.java from https://github.com/hazelcast/hazelcast
@@ -239,4 +240,4 @@ seatunnel-api/src/main/java/org/apache/seatunnel/api/common/metrics
seatunnel-transforms-v2/src/main/java/org/apache/seatunnel/transform/sqlengine/zeta/ZetaSQLEngine.java from https://github.com/JSQLParser/JSqlParser
seatunnel-transforms-v2/src/main/java/org/apache/seatunnel/transform/sqlengine/zeta/ZetaSQLType.java from https://github.com/JSQLParser/JSqlParser
seatunnel-transforms-v2/src/main/java/org/apache/seatunnel/transform/sqlengine/zeta/ZetaSQLFilter.java from https://github.com/JSQLParser/JSqlParser
-seatunnel-transforms-v2/src/main/java/org/apache/seatunnel/transform/sqlengine/zeta/ZetaSQLFunction.java from https://github.com/JSQLParser/JSqlParser
\ No newline at end of file
+seatunnel-transforms-v2/src/main/java/org/apache/seatunnel/transform/sqlengine/zeta/ZetaSQLFunction.java from https://github.com/JSQLParser/JSqlParser
diff --git a/config/plugin_config b/config/plugin_config
index 98c9fa8c2c69..95b952b31bf1 100644
--- a/config/plugin_config
+++ b/config/plugin_config
@@ -24,6 +24,7 @@ connector-amazondynamodb
connector-assert
connector-cassandra
connector-cdc-mysql
+connector-cdc-mongodb
connector-cdc-sqlserver
connector-clickhouse
connector-datahub
diff --git a/docs/en/connector-v2/formats/debezium-json.md b/docs/en/connector-v2/formats/debezium-json.md
new file mode 100644
index 000000000000..4c40a0298e49
--- /dev/null
+++ b/docs/en/connector-v2/formats/debezium-json.md
@@ -0,0 +1,107 @@
+# Debezium Format
+
+Changelog-Data-Capture Format: Serialization Schema & Deserialization Schema
+
+Debezium is a set of distributed services to capture changes in your databases so that your applications can see those changes and respond to them. Debezium records all row-level changes within each database table in a *change event stream*, and applications simply read these streams to see the change events in the same order in which they occurred.
+
+SeaTunnel supports interpreting Debezium JSON messages as INSERT/UPDATE/DELETE messages into the SeaTunnel system. This is useful in many cases, such as
+
+- synchronizing incremental data from databases to other systems
+- auditing logs
+- real-time materialized views on databases
+- temporal join of the changing history of a database table, and so on.
+
+SeaTunnel also supports encoding the INSERT/UPDATE/DELETE messages in SeaTunnel as Debezium JSON messages and emitting them to storage such as Kafka.
+
+# Format Options
+
+| option | default | required | Description |
+|-----------------------------------|---------|----------|------------------------------------------------------------------------------------------------------|
+| format | (none) | yes | Specify what format to use, here should be 'debezium_json'. |
+| debezium-json.ignore-parse-errors | false | no | Skip fields and rows with parse errors instead of failing. Fields are set to null in case of errors. |
+
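+A minimal sketch of how these options might look in practice, assuming `debezium-json.ignore-parse-errors` can be set alongside `format` in the connector block (all other connector options omitted):
+
+```hocon
+format = debezium_json
+# Assumption: tolerate malformed records instead of failing the job
+debezium-json.ignore-parse-errors = true
+```
+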
+# How to use Debezium format
+
+## Kafka Usage Example
+
+Debezium provides a unified format for changelogs. Here is a simple example of an update operation captured from a MySQL products table:
+
+```json
+{
+ "before": {
+ "id": 111,
+ "name": "scooter",
+ "description": "Big 2-wheel scooter ",
+ "weight": 5.18
+ },
+ "after": {
+ "id": 111,
+ "name": "scooter",
+ "description": "Big 2-wheel scooter ",
+ "weight": 5.17
+ },
+ "source": {
+ "version": "1.1.1.Final",
+ "connector": "mysql",
+ "name": "dbserver1",
+ "ts_ms": 1589362330000,
+ "snapshot": "false",
+ "db": "inventory",
+ "table": "products",
+ "server_id": 223344,
+ "gtid": null,
+ "file": "mysql-bin.000003",
+ "pos": 2090,
+ "row": 0,
+ "thread": 2,
+ "query": null
+ },
+ "op": "u",
+ "ts_ms": 1589362330904,
+ "transaction": null
+}
+```
+
+Note: please refer to the Debezium documentation for the meaning of each field.
+
+The MySQL products table has 4 columns (id, name, description and weight).
+The above JSON message is an update change event on the products table where the weight value of the row with id = 111 is changed from 5.18 to 5.17.
+Assuming the messages have been synchronized to the Kafka topic products_binlog, we can use the following SeaTunnel configuration to consume this topic and interpret the change events in the Debezium format.
+
+```hocon
+env {
+ execution.parallelism = 1
+ job.mode = "BATCH"
+}
+
+source {
+ Kafka {
+ bootstrap.servers = "kafkaCluster:9092"
+ topic = "products_binlog"
+ result_table_name = "kafka_name"
+ start_mode = earliest
+ schema = {
+ fields {
+ id = "int"
+ name = "string"
+ description = "string"
+ weight = "string"
+ }
+ }
+ format = debezium_json
+ }
+
+}
+
+transform {
+}
+
+sink {
+ Kafka {
+ bootstrap.servers = "kafkaCluster:9092"
+ topic = "consume-binlog"
+ format = debezium_json
+ }
+}
+```
+
diff --git a/docs/en/connector-v2/sink/Clickhouse.md b/docs/en/connector-v2/sink/Clickhouse.md
index 7c4bab991ba4..27bf274c77fe 100644
--- a/docs/en/connector-v2/sink/Clickhouse.md
+++ b/docs/en/connector-v2/sink/Clickhouse.md
@@ -2,95 +2,110 @@
> Clickhouse sink connector
-## Description
+## Support Those Engines
-Used to write data to Clickhouse.
+> Spark
+> Flink
+> SeaTunnel Zeta
-## Key features
+## Key Features
- [ ] [exactly-once](../../concept/connector-v2-features.md)
-
-The Clickhouse sink plug-in can achieve accuracy once by implementing idempotent writing, and needs to cooperate with aggregatingmergetree and other engines that support deduplication.
-
- [x] [cdc](../../concept/connector-v2-features.md)
-## Options
-
-| name | type | required | default value |
-|---------------------------------------|---------|----------|---------------|
-| host | string | yes | - |
-| database | string | yes | - |
-| table | string | yes | - |
-| username | string | yes | - |
-| password | string | yes | - |
-| clickhouse.config | map | no | |
-| bulk_size | string | no | 20000 |
-| split_mode | string | no | false |
-| sharding_key | string | no | - |
-| primary_key | string | no | - |
-| support_upsert | boolean | no | false |
-| allow_experimental_lightweight_delete | boolean | no | false |
-| common-options | | no | - |
-
-### host [string]
-
-`ClickHouse` cluster address, the format is `host:port` , allowing multiple `hosts` to be specified. Such as `"host1:8123,host2:8123"` .
-
-### database [string]
-
-The `ClickHouse` database
-
-### table [string]
-
-The table name
-
-### username [string]
-
-`ClickHouse` user username
-
-### password [string]
-
-`ClickHouse` user password
-
-### clickhouse.config [map]
-
-In addition to the above mandatory parameters that must be specified by `clickhouse-jdbc` , users can also specify multiple optional parameters, which cover all the [parameters](https://github.com/ClickHouse/clickhouse-jdbc/tree/master/clickhouse-client#configuration) provided by `clickhouse-jdbc` .
-
-### bulk_size [number]
-
-The number of rows written through [Clickhouse-jdbc](https://github.com/ClickHouse/clickhouse-jdbc) each time, the `default is 20000`, if checkpoints are enabled, writing will also occur at the times when the checkpoints are satisfied .
-
-### split_mode [boolean]
-
-This mode only support clickhouse table which engine is 'Distributed'.And `internal_replication` option
-should be `true`. They will split distributed table data in seatunnel and perform write directly on each shard. The shard weight define is clickhouse will be
-counted.
-
-### sharding_key [string]
+> The Clickhouse sink plug-in can achieve exactly-once by implementing idempotent writing, and it needs to cooperate with AggregatingMergeTree and other engines that support deduplication.
-When use split_mode, which node to send data to is a problem, the default is random selection, but the
-'sharding_key' parameter can be used to specify the field for the sharding algorithm. This option only
-worked when 'split_mode' is true.
-
-### primary_key [string]
-
-Mark the primary key column from clickhouse table, and based on primary key execute INSERT/UPDATE/DELETE to clickhouse table
-
-### support_upsert [boolean]
+## Description
-Support upsert row by query primary key
+Used to write data to Clickhouse.
-### allow_experimental_lightweight_delete [boolean]
+## Supported DataSource Info
+
+In order to use the Clickhouse connector, the following dependencies are required.
+They can be downloaded via install-plugin.sh or from the Maven central repository.
+
+| Datasource | Supported Versions | Dependency |
+|------------|--------------------|------------------------------------------------------------------------------------------------------------------|
+| Clickhouse | universal | [Download](https://mvnrepository.com/artifact/org.apache.seatunnel/seatunnel-connectors-v2/connector-clickhouse) |
+
+## Data Type Mapping
+
+| SeaTunnel Data type | Clickhouse Data type |
+|---------------------|-----------------------------------------------------------------------------------------------------------------------------------------------|
+| STRING              | String / Int128 / UInt128 / Int256 / UInt256 / Point / Ring / Polygon / MultiPolygon                                                           |
+| INT | Int8 / UInt8 / Int16 / UInt16 / Int32 |
+| BIGINT | UInt64 / Int64 / IntervalYear / IntervalQuarter / IntervalMonth / IntervalWeek / IntervalDay / IntervalHour / IntervalMinute / IntervalSecond |
+| DOUBLE | Float64 |
+| DECIMAL | Decimal |
+| FLOAT | Float32 |
+| DATE | Date |
+| TIME | DateTime |
+| ARRAY | Array |
+| MAP | Map |
+
+## Sink Options
+
+| Name | Type | Required | Default | Description |
+|---------------------------------------|---------|----------|---------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| host | String | Yes | - | `ClickHouse` cluster address, the format is `host:port` , allowing multiple `hosts` to be specified. Such as `"host1:8123,host2:8123"`. |
+| database | String | Yes | - | The `ClickHouse` database. |
+| table | String | Yes | - | The table name. |
+| username | String | Yes | - | `ClickHouse` user username. |
+| password | String | Yes | - | `ClickHouse` user password. |
+| clickhouse.config | Map | No | | In addition to the above mandatory parameters that must be specified by `clickhouse-jdbc` , users can also specify multiple optional parameters, which cover all the [parameters](https://github.com/ClickHouse/clickhouse-jdbc/tree/master/clickhouse-client#configuration) provided by `clickhouse-jdbc`. |
+| bulk_size                             | String  | No       | 20000   | The number of rows written through [Clickhouse-jdbc](https://github.com/ClickHouse/clickhouse-jdbc) each time; the default is `20000`. |
+| split_mode                            | String  | No       | false   | This mode only supports a clickhouse table whose engine is 'Distributed', and the `internal_replication` option should be `true`. It will split the distributed table data in seatunnel and write directly on each shard. The shard weight defined in clickhouse will be counted. |
+| sharding_key                          | String  | No       | -       | When using split_mode, the node to which data is sent is selected randomly by default; the 'sharding_key' parameter can be used to specify the field for the sharding algorithm. This option only works when 'split_mode' is true. |
+| primary_key                           | String  | No       | -       | Mark the primary key column from the clickhouse table, and based on the primary key execute INSERT/UPDATE/DELETE to the clickhouse table. |
+| support_upsert                        | Boolean | No       | false   | Support upsert rows by querying the primary key. |
+| allow_experimental_lightweight_delete | Boolean | No | false | Allow experimental lightweight delete based on `*MergeTree` table engine. |
+| common-options | | No | - | Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details. |
+
+## How to Create a Clickhouse Data Synchronization Job
+
+The following example demonstrates how to create a data synchronization job that writes randomly generated data to a Clickhouse database:
+
+```hocon
+# Set the basic configuration of the task to be performed
+env {
+ execution.parallelism = 1
+ job.mode = "BATCH"
+ checkpoint.interval = 1000
+}
-Allow experimental lightweight delete based on `*MergeTree` table engine
+source {
+ FakeSource {
+ row.num = 2
+ bigint.min = 0
+ bigint.max = 10000000
+ split.num = 1
+ split.read-interval = 300
+ schema {
+ fields {
+ c_bigint = bigint
+ }
+ }
+ }
+}
-### common options
+sink {
+ Clickhouse {
+    host = "127.0.0.1:8123"
+ database = "default"
+ table = "test"
+ username = "xxxxx"
+ password = "xxxxx"
+ }
+}
+```
-Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details
+### Tips
-## Examples
+> 1. If you have not yet installed and deployed SeaTunnel, please refer to the [SeaTunnel Deployment Document](../../start-v2/locally/deployment.md).
+> 2. The table to be written to needs to be created in advance, before synchronization.
+> 3. When the sink writes to the ClickHouse table, you don't need to set its schema, because the connector will query ClickHouse for the current table's schema information before writing.
-Simple
+## Clickhouse Sink Config
```hocon
sink {
@@ -98,9 +113,9 @@ sink {
host = "localhost:8123"
database = "default"
table = "fake_all"
- username = "default"
- password = ""
- clickhouse.confg = {
+ username = "xxxxx"
+ password = "xxxxx"
+ clickhouse.config = {
max_rows_to_read = "100"
read_overflow_mode = "throw"
}
@@ -108,7 +123,7 @@ sink {
}
```
-Split mode
+## Split Mode
```hocon
sink {
@@ -116,8 +131,8 @@ sink {
host = "localhost:8123"
database = "default"
table = "fake_all"
- username = "default"
- password = ""
+ username = "xxxxx"
+ password = "xxxxx"
# split mode options
split_mode = true
@@ -126,7 +141,7 @@ sink {
}
```
-CDC(Change data capture)
+## CDC(Change data capture) Sink
```hocon
sink {
@@ -134,8 +149,8 @@ sink {
host = "localhost:8123"
database = "default"
table = "fake_all"
- username = "default"
- password = ""
+ username = "xxxxx"
+ password = "xxxxx"
# cdc options
primary_key = "id"
@@ -144,7 +159,7 @@ sink {
}
```
-CDC(Change data capture) for *MergeTree engine
+## CDC(Change data capture) for *MergeTree engine
```hocon
sink {
@@ -152,8 +167,8 @@ sink {
host = "localhost:8123"
database = "default"
table = "fake_all"
- username = "default"
- password = ""
+ username = "xxxxx"
+ password = "xxxxx"
# cdc options
primary_key = "id"
@@ -163,21 +178,3 @@ sink {
}
```
-## Changelog
-
-### 2.2.0-beta 2022-09-26
-
-- Add ClickHouse Sink Connector
-
-### 2.3.0-beta 2022-10-20
-
-- [Improve] Clickhouse Support Int128,Int256 Type ([3067](https://github.com/apache/seatunnel/pull/3067))
-
-### next version
-
-- [Improve] Clickhouse Sink support nest type and array type([3047](https://github.com/apache/seatunnel/pull/3047))
-- [Improve] Clickhouse Sink support geo type([3141](https://github.com/apache/seatunnel/pull/3141))
-- [Feature] Support CDC write DELETE/UPDATE/INSERT events ([3653](https://github.com/apache/seatunnel/pull/3653))
-- [Improve] Remove Clickhouse Fields Config ([3826](https://github.com/apache/seatunnel/pull/3826))
-- [Improve] Change Connector Custom Config Prefix To Map [3719](https://github.com/apache/seatunnel/pull/3719)
-
diff --git a/docs/en/connector-v2/sink/CosFile.md b/docs/en/connector-v2/sink/CosFile.md
new file mode 100644
index 000000000000..563b174c3c82
--- /dev/null
+++ b/docs/en/connector-v2/sink/CosFile.md
@@ -0,0 +1,259 @@
+# CosFile
+
+> Cos file sink connector
+
+## Description
+
+Output data to cos file system.
+
+:::tip
+
+If you use spark/flink, in order to use this connector, you must ensure your spark/flink cluster has already integrated hadoop. The tested hadoop version is 2.x.
+
+If you use SeaTunnel Engine, it automatically integrates the hadoop jar when you download and install SeaTunnel Engine. You can check the jar package under ${SEATUNNEL_HOME}/lib to confirm this.
+
+To use this connector you need to put hadoop-cos-{hadoop.version}-{version}.jar and cos_api-bundle-{version}.jar in the ${SEATUNNEL_HOME}/lib dir; download them from [Hadoop-Cos-release](https://github.com/tencentyun/hadoop-cos/releases). It only supports hadoop version 2.6.5+ and version 8.0.2+.
+
+:::
+
+## Key features
+
+- [x] [exactly-once](../../concept/connector-v2-features.md)
+
+By default, we use 2PC commit to ensure `exactly-once`
+
+- [x] file format type
+ - [x] text
+ - [x] csv
+ - [x] parquet
+ - [x] orc
+ - [x] json
+ - [x] excel
+
+## Options
+
+| name | type | required | default value | remarks |
+|----------------------------------|---------|----------|--------------------------------------------|-----------------------------------------------------------|
+| path | string | yes | - | |
+| bucket | string | yes | - | |
+| secret_id | string | yes | - | |
+| secret_key | string | yes | - | |
+| region | string | yes | - | |
+| custom_filename | boolean | no | false | Whether you need custom the filename |
+| file_name_expression | string | no | "${transactionId}" | Only used when custom_filename is true |
+| filename_time_format | string | no | "yyyy.MM.dd" | Only used when custom_filename is true |
+| file_format_type | string | no | "csv" | |
+| field_delimiter                  | string  | no       | '\001'                                     | Only used when file_format_type is text                    |
+| row_delimiter                    | string  | no       | "\n"                                       | Only used when file_format_type is text                    |
+| have_partition                   | boolean | no       | false                                      | Whether you need to process partitions.                    |
+| partition_by                     | array   | no       | -                                          | Only used when have_partition is true                      |
+| partition_dir_expression         | string  | no       | "${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/" | Only used when have_partition is true                      |
+| is_partition_field_write_in_file | boolean | no       | false                                      | Only used when have_partition is true                      |
+| sink_columns | array | no | | When this parameter is empty, all fields are sink columns |
+| is_enable_transaction | boolean | no | true | |
+| batch_size | int | no | 1000000 | |
+| compress_codec | string | no | none | |
+| common-options | object | no | - | |
+| max_rows_in_memory               | int     | no       | -                                          | Only used when file_format_type is excel.                  |
+| sheet_name                       | string  | no       | Sheet${Random number}                      | Only used when file_format_type is excel.                  |
+
+### path [string]
+
+The target dir path is required.
+
+### bucket [string]
+
+The bucket address of cos file system, for example: `cosn://seatunnel-test-1259587829`
+
+### secret_id [string]
+
+The secret id of cos file system.
+
+### secret_key [string]
+
+The secret key of cos file system.
+
+### region [string]
+
+The region of cos file system.
+
+### custom_filename [boolean]
+
+Whether to customize the filename.
+
+### file_name_expression [string]
+
+Only used when `custom_filename` is `true`
+
+`file_name_expression` describes the file expression which will be created into the `path`. We can add the variable `${now}` or `${uuid}` in the `file_name_expression`, like `test_${uuid}_${now}`,
+`${now}` represents the current time, and its format can be defined by specifying the option `filename_time_format`.
+
+Please note that, if `is_enable_transaction` is `true`, we will automatically add `${transactionId}_` at the head of the file name.
+
+### filename_time_format [string]
+
+Only used when `custom_filename` is `true`
+
+When the format in the `file_name_expression` parameter is `xxxx-${now}` , `filename_time_format` can specify the time format of the path, and the default value is `yyyy.MM.dd` . The commonly used time formats are listed as follows:
+
+| Symbol | Description |
+|--------|--------------------|
+| y | Year |
+| M | Month |
+| d | Day of month |
+| H | Hour in day (0-23) |
+| m | Minute in hour |
+| s | Second in minute |
+
+### file_format_type [string]
+
+We support the following file types:
+
+`text` `json` `csv` `orc` `parquet` `excel`
+
+Please note that the final file name will end with the file_format_type's suffix; the suffix of the text file is `txt`.
+
+### field_delimiter [string]
+
+The separator between columns in a row of data. Only needed by `text` file format.
+
+### row_delimiter [string]
+
+The separator between rows in a file. Only needed by `text` file format.
+
+### have_partition [boolean]
+
+Whether you need to process partitions.
+
+### partition_by [array]
+
+Only used when `have_partition` is `true`.
+
+Partition data based on selected fields.
+
+### partition_dir_expression [string]
+
+Only used when `have_partition` is `true`.
+
+If the `partition_by` is specified, we will generate the corresponding partition directory based on the partition information, and the final file will be placed in the partition directory.
+
+Default `partition_dir_expression` is `${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/`. `k0` is the first partition field and `v0` is the value of the first partition field.
+
+### is_partition_field_write_in_file [boolean]
+
+Only used when `have_partition` is `true`.
+
+If `is_partition_field_write_in_file` is `true`, the partition field and its value will be written into the data file.
+
+For example, if you want to write a Hive Data File, its value should be `false`.
+
+### sink_columns [array]
+
+Which columns need to be written to the file; the default value is all the columns obtained from `Transform` or `Source`.
+The order of the fields determines the order in which the file is actually written.
+
+### is_enable_transaction [boolean]
+
+If `is_enable_transaction` is true, we will ensure that data will not be lost or duplicated when it is written to the target directory.
+
+Please note that, if `is_enable_transaction` is `true`, we will automatically add `${transactionId}_` at the head of the file name.
+
+Only `true` is supported now.
+
+### batch_size [int]
+
+The maximum number of rows in a file. For SeaTunnel Engine, the number of lines in the file is jointly decided by `batch_size` and `checkpoint.interval`. If the value of `checkpoint.interval` is large enough, the sink writer will write rows into a file until the number of rows in the file is larger than `batch_size`. If `checkpoint.interval` is small, the sink writer will create a new file when a new checkpoint triggers.
+
+### compress_codec [string]
+
+The compress codec of files; the supported options are shown below:
+
+- txt: `lzo` `none`
+- json: `lzo` `none`
+- csv: `lzo` `none`
+- orc: `lzo` `snappy` `lz4` `zlib` `none`
+- parquet: `lzo` `snappy` `lz4` `gzip` `brotli` `zstd` `none`
+
+Tips: excel type does not support any compression format
+
+### common options
+
+Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details.
+
+### max_rows_in_memory [int]
+
+When the file format is Excel, the maximum number of data items that can be cached in the memory.
+
+### sheet_name [string]
+
+Write the sheet of the workbook.
+
+## Example
+
+For text file format with `have_partition` and `custom_filename` and `sink_columns`
+
+```hocon
+
+ CosFile {
+ path="/sink"
+ bucket = "cosn://seatunnel-test-1259587829"
+ secret_id = "xxxxxxxxxxxxxxxxxxx"
+ secret_key = "xxxxxxxxxxxxxxxxxxx"
+ region = "ap-chengdu"
+ file_format_type = "text"
+ field_delimiter = "\t"
+ row_delimiter = "\n"
+ have_partition = true
+ partition_by = ["age"]
+ partition_dir_expression = "${k0}=${v0}"
+ is_partition_field_write_in_file = true
+ custom_filename = true
+ file_name_expression = "${transactionId}_${now}"
+ filename_time_format = "yyyy.MM.dd"
+ sink_columns = ["name","age"]
+ is_enable_transaction = true
+ }
+
+```
+
+For parquet file format with `have_partition` and `sink_columns`
+
+```hocon
+
+ CosFile {
+ path="/sink"
+ bucket = "cosn://seatunnel-test-1259587829"
+ secret_id = "xxxxxxxxxxxxxxxxxxx"
+ secret_key = "xxxxxxxxxxxxxxxxxxx"
+ region = "ap-chengdu"
+ have_partition = true
+ partition_by = ["age"]
+ partition_dir_expression = "${k0}=${v0}"
+ is_partition_field_write_in_file = true
+ file_format_type = "parquet"
+ sink_columns = ["name","age"]
+ }
+
+```
+
+For orc file format simple config
+
+```hocon
+
+ CosFile {
+ path="/sink"
+ bucket = "cosn://seatunnel-test-1259587829"
+ secret_id = "xxxxxxxxxxxxxxxxxxx"
+ secret_key = "xxxxxxxxxxxxxxxxxxx"
+ region = "ap-chengdu"
+ file_format_type = "orc"
+ }
+
+```
+
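+For excel file format with `sheet_name` and `max_rows_in_memory`, a minimal sketch (the two excel-specific values below are illustrative):
+
+```hocon
+
+  CosFile {
+    path="/sink"
+    bucket = "cosn://seatunnel-test-1259587829"
+    secret_id = "xxxxxxxxxxxxxxxxxxx"
+    secret_key = "xxxxxxxxxxxxxxxxxxx"
+    region = "ap-chengdu"
+    file_format_type = "excel"
+    # Illustrative values: cap the rows cached in memory and name the workbook sheet
+    max_rows_in_memory = 1024
+    sheet_name = "SeaTunnelSheet"
+  }
+
+```
+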
+## Changelog
+
+### next version
+
+- Add file cos sink connector ([4979](https://github.com/apache/seatunnel/pull/4979))
+
diff --git a/docs/en/connector-v2/sink/DB2.md b/docs/en/connector-v2/sink/DB2.md
new file mode 100644
index 000000000000..8f5a7285e35d
--- /dev/null
+++ b/docs/en/connector-v2/sink/DB2.md
@@ -0,0 +1,171 @@
+# DB2
+
+> JDBC DB2 Sink Connector
+
+## Support Those Engines
+
+> Spark
+> Flink
+> SeaTunnel Zeta
+
+## Key Features
+
+- [x] [exactly-once](../../concept/connector-v2-features.md)
+- [ ] [cdc](../../concept/connector-v2-features.md)
+
+> Use `Xa transactions` to ensure `exactly-once`. So `exactly-once` is only supported for databases that
+> support `Xa transactions`. You can set `is_exactly_once=true` to enable it.
+
+## Description
+
+Write data through jdbc. Support Batch mode and Streaming mode, support concurrent writing, support exactly-once
+semantics (using XA transaction guarantee).
+
+## Supported DataSource Info
+
+| Datasource | Supported Versions | Driver | Url | Maven |
+|------------|----------------------------------------------------------|--------------------------------|-----------------------------------|-----------------------------------------------------------------------|
+| DB2 | Different dependency version has different driver class. | com.ibm.db2.jdbc.app.DB2Driver | jdbc:db2://127.0.0.1:50000/dbname | [Download](https://mvnrepository.com/artifact/com.ibm.db2.jcc/db2jcc) |
+
+## Database Dependency
+
+> Please download the driver corresponding to the 'Maven' link and copy it to the '$SEATUNNEL_HOME/plugins/jdbc/lib/' working directory.
+> For example, for the DB2 datasource: cp db2-connector-java-xxx.jar $SEATUNNEL_HOME/plugins/jdbc/lib/
+
+## Data Type Mapping
+
+| DB2 Data type                                                                                        | SeaTunnel Data type |
+|------------------------------------------------------------------------------------------------------|---------------------|
+| BOOLEAN                                                                                                | BOOLEAN             |
+| SMALLINT                                                                                               | SHORT               |
+| INT<br/>INTEGER                                                                                        | INTEGER             |
+| BIGINT                                                                                                 | LONG                |
+| DECIMAL<br/>DEC<br/>NUMERIC<br/>NUM                                                                    | DECIMAL(38,18)      |
+| REAL                                                                                                   | FLOAT               |
+| FLOAT<br/>DOUBLE<br/>DOUBLE PRECISION<br/>DECFLOAT                                                     | DOUBLE              |
+| CHAR<br/>VARCHAR<br/>LONG VARCHAR<br/>CLOB<br/>GRAPHIC<br/>VARGRAPHIC<br/>LONG VARGRAPHIC<br/>DBCLOB   | STRING              |
+| BLOB                                                                                                   | BYTES               |
+| DATE                                                                                                   | DATE                |
+| TIME                                                                                                   | TIME                |
+| TIMESTAMP                                                                                              | TIMESTAMP           |
+| ROWID<br/>XML                                                                                          | Not supported yet   |
+
+## Sink Options
+
+| Name | Type | Required | Default | Description |
+|-------------------------------------------|---------|----------|---------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| url                                        | String  | Yes      | -       | The URL of the JDBC connection. Refer to a case: jdbc:db2://127.0.0.1:50000/dbname |
+| driver                                     | String  | Yes      | -       | The jdbc class name used to connect to the remote data source; if you use DB2 the value is `com.ibm.db2.jdbc.app.DB2Driver`. |
+| user                                       | String  | No       | -       | Connection instance user name |
+| password                                   | String  | No       | -       | Connection instance password |
+| query                                      | String  | No       | -       | Use this sql to write upstream input data to the database, e.g. `INSERT ...`. `query` has the higher priority |
+| database                                   | String  | No       | -       | Use this `database` and `table-name` to auto-generate sql and receive upstream input data to write to the database. This option is mutually exclusive with `query` and has a higher priority. |
+| table                                      | String  | No       | -       | Use database and this table-name to auto-generate sql and receive upstream input data to write to the database. This option is mutually exclusive with `query` and has a higher priority. |
+| primary_keys                               | Array   | No       | -       | This option is used to support operations such as `insert`, `delete`, and `update` when automatically generating sql. |
+| support_upsert_by_query_primary_key_exist  | Boolean | No       | false   | Choose to use INSERT sql or UPDATE sql to process update events (INSERT, UPDATE_AFTER) based on whether the query primary key exists. This configuration is only used when the database does not support upsert syntax. **Note**: this method has low performance |
+| connection_check_timeout_sec               | Int     | No       | 30      | The time in seconds to wait for the database operation used to validate the connection to complete. |
+| max_retries                                | Int     | No       | 0       | The number of retries to submit failed (executeBatch) |
+| batch_size                                 | Int     | No       | 1000    | For batch writing, when the number of buffered records reaches the number of `batch_size` or the time reaches `batch_interval_ms`, the data will be flushed into the database |
+| batch_interval_ms                          | Int     | No       | 1000    | For batch writing, when the number of buffers reaches the number of `batch_size` or the time reaches `batch_interval_ms`, the data will be flushed into the database |
+| is_exactly_once                            | Boolean | No       | false   | Whether to enable exactly-once semantics, which will use Xa transactions. If on, you need to set `xa_data_source_class_name`. |
+| generate_sink_sql                          | Boolean | No       | false   | Generate sql statements based on the database table you want to write to |
+| xa_data_source_class_name                  | String  | No       | -       | The xa data source class name of the database Driver; for example, DB2 is `com.db2.cj.jdbc.Db2XADataSource`. Please refer to the appendix for other data sources |
+| max_commit_attempts                        | Int     | No       | 3       | The number of retries for transaction commit failures |
+| transaction_timeout_sec                    | Int     | No       | -1      | The timeout after the transaction is opened; the default is -1 (never timeout). Note that setting the timeout may affect exactly-once semantics |
+| auto_commit                                | Boolean | No       | true    | Automatic transaction commit is enabled by default |
+| common-options                             |         | No       | -       | Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details |
+
+### Tips
+
+> If partition_column is not set, it will run in single concurrency, and if partition_column is set, it will be executed in parallel according to the concurrency of tasks.
+
+## Task Example
+
+### Simple:
+
+> This example defines a SeaTunnel synchronization task that automatically generates data through FakeSource and sends it to the JDBC sink. FakeSource generates a total of 16 rows of data (row.num=16), with each row having two fields, name (string type) and age (int type). The final target table, test_table, will also contain 16 rows of data. Before running this job, you need to create the database test and the table test_table in your DB2. And if you have not yet installed and deployed SeaTunnel, you need to follow the instructions in [Install SeaTunnel](../../start-v2/locally/deployment.md) to install and deploy SeaTunnel. And then follow the instructions in [Quick Start With SeaTunnel Engine](../../start-v2/locally/quick-start-seatunnel-engine.md) to run this job.
+
+```
+# Defining the runtime environment
+env {
+ # You can set flink configuration here
+ execution.parallelism = 1
+ job.mode = "BATCH"
+}
+
+source {
+ # This is a example source plugin **only for test and demonstrate the feature source plugin**
+ FakeSource {
+ parallelism = 1
+ result_table_name = "fake"
+ row.num = 16
+ schema = {
+ fields {
+ name = "string"
+ age = "int"
+ }
+ }
+ }
+ # If you would like to get more information about how to configure seatunnel and see full list of source plugins,
+ # please go to https://seatunnel.apache.org/docs/category/source-v2
+}
+
+transform {
+ # If you would like to get more information about how to configure seatunnel and see full list of transform plugins,
+ # please go to https://seatunnel.apache.org/docs/category/transform-v2
+}
+
+sink {
+ jdbc {
+ url = "jdbc:db2://127.0.0.1:50000/dbname"
+ driver = "com.ibm.db2.jdbc.app.DB2Driver"
+ user = "root"
+ password = "123456"
+ query = "insert into test_table(name,age) values(?,?)"
+ }
+ # If you would like to get more information about how to configure seatunnel and see full list of sink plugins,
+ # please go to https://seatunnel.apache.org/docs/category/sink-v2
+}
+```
+
+### Generate Sink SQL
+
+> This example does not require writing complex sql statements; you can configure the database name and table name to automatically generate insert statements for you.
+
+```
+sink {
+ jdbc {
+ url = "jdbc:db2://127.0.0.1:50000/dbname"
+ driver = "com.ibm.db2.jdbc.app.DB2Driver"
+ user = "root"
+ password = "123456"
+ # Automatically generate sql statements based on database table names
+ generate_sink_sql = true
+ database = test
+ table = test_table
+ }
+}
+```
+
+### Exactly-once :
+
+> For scenarios that require accurate writes, we guarantee exactly-once.
+
+```
+sink {
+ jdbc {
+ url = "jdbc:db2://127.0.0.1:50000/dbname"
+ driver = "com.ibm.db2.jdbc.app.DB2Driver"
+
+ max_retries = 0
+ user = "root"
+ password = "123456"
+ query = "insert into test_table(name,age) values(?,?)"
+
+ is_exactly_once = "true"
+
+ xa_data_source_class_name = "com.db2.cj.jdbc.Db2XADataSource"
+ }
+}
+```
+
diff --git a/docs/en/connector-v2/sink/Doris.md b/docs/en/connector-v2/sink/Doris.md
index f586ac3bcca0..6bf8dc5369c9 100644
--- a/docs/en/connector-v2/sink/Doris.md
+++ b/docs/en/connector-v2/sink/Doris.md
@@ -2,11 +2,24 @@
> Doris sink connector
+## Support Those Engines
+
+> Spark
+> Flink
+> SeaTunnel Zeta
+
+## Key Features
+
+- [x] [exactly-once](../../concept/connector-v2-features.md)
+- [x] [cdc](../../concept/connector-v2-features.md)
+
## Description
Used to send data to Doris. Both support streaming and batch mode.
The internal implementation of Doris sink connector is cached and imported by stream load in batches.
+## Supported DataSource Info
+
:::tip
Version Supported
@@ -17,67 +30,186 @@ Version Supported
:::
-## Key features
-
-- [x] [exactly-once](../../concept/connector-v2-features.md)
-- [x] [cdc](../../concept/connector-v2-features.md)
-
-## Options
-
-| name | type | required | default value |
-|--------------------|--------|----------|---------------|
-| fenodes | string | yes | - |
-| username | string | yes | - |
-| password | string | yes | - |
-| table.identifier | string | yes | - |
-| sink.label-prefix | string | yes | - |
-| sink.enable-2pc | bool | no | true |
-| sink.enable-delete | bool | no | false |
-| doris.config | map | yes | - |
-
-### fenodes [string]
-
-`Doris` cluster fenodes address, the format is `"fe_ip:fe_http_port, ..."`
-
-### username [string]
-
-`Doris` user username
-
-### password [string]
-
-`Doris` user password
+## Sink Options
+
+| Name | Type | Required | Default | Description |
+|---------------------|--------|----------|------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| fenodes | String | Yes | - | `Doris` cluster fenodes address, the format is `"fe_ip:fe_http_port, ..."` |
+| username | String | Yes | - | `Doris` user username |
+| password | String | Yes | - | `Doris` user password |
+| table.identifier | String | Yes | - | The name of `Doris` table |
+| sink.label-prefix | String | Yes | - | The label prefix used by stream load imports. In the 2pc scenario, global uniqueness is required to ensure the EOS semantics of SeaTunnel. |
+| sink.enable-2pc     | bool   | No       | true       | Whether to enable two-phase commit (2pc), the default is true, to ensure Exactly-Once semantics. For two-phase commit, please refer to [here](https://doris.apache.org/docs/dev/sql-manual/sql-reference/Data-Manipulation-Statements/Load/STREAM-LOAD). |
+| sink.enable-delete  | bool   | No       | false      | Whether to enable deletion. This option requires the Doris table to enable the batch delete function (enabled by default since version 0.15), and only supports the Unique model. You can get more detail at this [link](https://doris.apache.org/docs/dev/data-operate/update-delete/batch-delete-manual) |
+| sink.check-interval | int    | No       | 10000      | Check exceptions at this interval while loading. |
+| sink.max-retries    | int    | No       | 3          | The max retry times if writing records to the database failed. |
+| sink.buffer-size    | int    | No       | 256 * 1024 | The buffer size to cache data for stream load. |
+| sink.buffer-count   | int    | No       | 3          | The buffer count to cache data for stream load. |
+| doris.config        | map    | Yes      | -          | The parameter of the stream load `data_desc`; you can get more detail at this [link](https://doris.apache.org/docs/dev/sql-manual/sql-reference/Data-Manipulation-Statements/Load/STREAM-LOAD). The supported import formats include CSV and JSON. |
+
+## Data Type Mapping
+
+| Doris Data type | SeaTunnel Data type |
+|-----------------|-----------------------------------------|
+| BOOLEAN | BOOLEAN |
+| TINYINT | TINYINT |
+| SMALLINT        | SMALLINT<br/>TINYINT                    |
+| INT             | INT<br/>SMALLINT<br/>TINYINT            |
+| BIGINT          | BIGINT<br/>INT<br/>SMALLINT<br/>TINYINT |
+| LARGEINT        | BIGINT<br/>INT<br/>SMALLINT<br/>TINYINT |
+| FLOAT           | FLOAT                                   |
+| DOUBLE          | DOUBLE<br/>FLOAT                        |
+| DECIMAL         | DECIMAL<br/>DOUBLE<br/>FLOAT            |
+| DATE | DATE |
+| DATETIME | TIMESTAMP |
+| CHAR | STRING |
+| VARCHAR | STRING |
+| STRING | STRING |
+| ARRAY | ARRAY |
+| MAP | MAP |
+| JSON | STRING |
+| HLL | Not supported yet |
+| BITMAP | Not supported yet |
+| QUANTILE_STATE | Not supported yet |
+| STRUCT | Not supported yet |
-### table.identifier [string]
-
-The name of `Doris` table
+#### Supported import data formats
-### sink.label-prefix [string]
+The supported formats include CSV and JSON
-The label prefix used by stream load imports. In the 2pc scenario, global uniqueness is required to ensure the EOS semantics of SeaTunnel.
+## Task Example
-### sink.enable-2pc [bool]
+### Simple:
-Whether to enable two-phase commit (2pc), the default is true, to ensure Exactly-Once semantics. For two-phase commit, please refer to [here](https://doris.apache.org/docs/dev/sql-manual/sql-reference/Data-Manipulation-Statements/Load/STREAM-LOAD).
+> The following example describes writing multiple data types to Doris, and users need to create corresponding tables downstream
-### sink.enable-delete [bool]
+```hocon
+env {
+ parallelism = 1
+ job.mode = "BATCH"
+ checkpoint.interval = 10000
+}
-Whether to enable deletion. This option requires Doris table to enable batch delete function (0.15+ version is enabled by default), and only supports Unique model. you can get more detail at this link:
+source {
+ FakeSource {
+ row.num = 10
+ map.size = 10
+ array.size = 10
+ bytes.length = 10
+ string.length = 10
+ schema = {
+ fields {
+        c_map = "map<string, array<int>>"
+        c_array = "array<int>"
+ c_string = string
+ c_boolean = boolean
+ c_tinyint = tinyint
+ c_smallint = smallint
+ c_int = int
+ c_bigint = bigint
+ c_float = float
+ c_double = double
+ c_decimal = "decimal(16, 1)"
+ c_null = "null"
+ c_bytes = bytes
+ c_date = date
+ c_timestamp = timestamp
+ }
+ }
+ }
+}
-https://doris.apache.org/docs/dev/data-operate/update-delete/batch-delete-manual
+sink {
+ Doris {
+ fenodes = "doris_cdc_e2e:8030"
+ username = root
+ password = ""
+ table.identifier = "test.e2e_table_sink"
+ sink.label-prefix = "test-cdc"
+ sink.enable-2pc = "true"
+ sink.enable-delete = "true"
+ doris.config {
+ format = "json"
+ read_json_by_line = "true"
+ }
+ }
+}
+```
-### doris.config [map]
+### CDC(Change Data Capture) Event:
-The parameter of the stream load `data_desc`, you can get more detail at this link:
+> This example defines a SeaTunnel synchronization task that automatically generates data through FakeSource and sends it to the Doris sink. FakeSource simulates CDC data with a schema that includes score (int type). You need to create a sink table named test.e2e_table_sink in Doris for it in advance.
-https://doris.apache.org/docs/dev/sql-manual/sql-reference/Data-Manipulation-Statements/Load/STREAM-LOAD
+```hocon
+env {
+ parallelism = 1
+ job.mode = "BATCH"
+ checkpoint.interval = 10000
+}
-#### Supported import data formats
+source {
+ FakeSource {
+ schema = {
+ fields {
+ pk_id = bigint
+ name = string
+ score = int
+ sex = boolean
+ number = tinyint
+ height = float
+ sight = double
+ create_time = date
+ update_time = timestamp
+ }
+ }
+ rows = [
+ {
+ kind = INSERT
+ fields = [1, "A", 100, true, 1, 170.0, 4.3, "2020-02-02", "2020-02-02T02:02:02"]
+ },
+ {
+ kind = INSERT
+ fields = [2, "B", 100, true, 1, 170.0, 4.3, "2020-02-02", "2020-02-02T02:02:02"]
+ },
+ {
+ kind = INSERT
+ fields = [3, "C", 100, true, 1, 170.0, 4.3, "2020-02-02", "2020-02-02T02:02:02"]
+ },
+ {
+ kind = UPDATE_BEFORE
+ fields = [1, "A", 100, true, 1, 170.0, 4.3, "2020-02-02", "2020-02-02T02:02:02"]
+ },
+ {
+ kind = UPDATE_AFTER
+ fields = [1, "A_1", 100, true, 1, 170.0, 4.3, "2020-02-02", "2020-02-02T02:02:02"]
+ },
+ {
+ kind = DELETE
+ fields = [2, "B", 100, true, 1, 170.0, 4.3, "2020-02-02", "2020-02-02T02:02:02"]
+ }
+ ]
+ }
+}
-The supported formats include CSV and JSON. Default value: CSV
+sink {
+ Doris {
+ fenodes = "doris_cdc_e2e:8030"
+ username = root
+ password = ""
+ table.identifier = "test.e2e_table_sink"
+ sink.label-prefix = "test-cdc"
+ sink.enable-2pc = "true"
+ sink.enable-delete = "true"
+ doris.config {
+ format = "json"
+ read_json_by_line = "true"
+ }
+ }
+}
-## Example
+```
-Use JSON format to import data
+### Use JSON format to import data
```
sink {
@@ -97,7 +229,7 @@ sink {
```
-Use CSV format to import data
+### Use CSV format to import data
```
sink {
diff --git a/docs/en/connector-v2/sink/FtpFile.md b/docs/en/connector-v2/sink/FtpFile.md
index b92bcd7fcc37..8b3214e44b3c 100644
--- a/docs/en/connector-v2/sink/FtpFile.md
+++ b/docs/en/connector-v2/sink/FtpFile.md
@@ -40,9 +40,9 @@ By default, we use 2PC commit to ensure `exactly-once`
| custom_filename | boolean | no | false | Whether you need custom the filename |
| file_name_expression | string | no | "${transactionId}" | Only used when custom_filename is true |
| filename_time_format | string | no | "yyyy.MM.dd" | Only used when custom_filename is true |
-| file_format | string | no | "csv" | |
-| field_delimiter | string | no | '\001' | Only used when file_format is text |
-| row_delimiter | string | no | "\n" | Only used when file_format is text |
+| file_format_type | string | no | "csv" | |
+| field_delimiter | string | no | '\001' | Only used when file_format_type is text |
+| row_delimiter | string | no | "\n" | Only used when file_format_type is text |
| have_partition | boolean | no | false | Whether you need processing partitions. |
| partition_by | array | no | - | Only used then have_partition is true |
| partition_dir_expression | string | no | "${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/" | Only used then have_partition is true |
@@ -52,8 +52,8 @@ By default, we use 2PC commit to ensure `exactly-once`
| batch_size | int | no | 1000000 | |
| compress_codec | string | no | none | |
| common-options | object | no | - | |
-| max_rows_in_memory | int | no | - | Only used when file_format is excel. |
-| sheet_name | string | no | Sheet${Random number} | Only used when file_format is excel. |
+| max_rows_in_memory | int | no | - | Only used when file_format_type is excel. |
+| sheet_name | string | no | Sheet${Random number} | Only used when file_format_type is excel. |
### host [string]
@@ -103,13 +103,13 @@ When the format in the `file_name_expression` parameter is `xxxx-${now}` , `file
| m | Minute in hour |
| s | Second in minute |
-### file_format [string]
+### file_format_type [string]
We supported as the following file types:
`text` `json` `csv` `orc` `parquet` `excel`
-Please note that, The final file name will end with the file_format's suffix, the suffix of the text file is `txt`.
+Please note that, The final file name will end with the file_format_type's suffix, the suffix of the text file is `txt`.
### field_delimiter [string]
@@ -198,7 +198,7 @@ FtpFile {
username = "username"
password = "password"
path = "/data/ftp"
- file_format = "text"
+ file_format_type = "text"
field_delimiter = "\t"
row_delimiter = "\n"
sink_columns = ["name","age"]
@@ -216,7 +216,7 @@ FtpFile {
username = "username"
password = "password"
path = "/data/ftp"
- file_format = "text"
+ file_format_type = "text"
field_delimiter = "\t"
row_delimiter = "\n"
have_partition = true
diff --git a/docs/en/connector-v2/sink/HdfsFile.md b/docs/en/connector-v2/sink/HdfsFile.md
index 1e094a5e573c..34ce19714b4d 100644
--- a/docs/en/connector-v2/sink/HdfsFile.md
+++ b/docs/en/connector-v2/sink/HdfsFile.md
@@ -41,8 +41,8 @@ By default, we use 2PC commit to ensure `exactly-once`
| file_name_expression | string | no | "${transactionId}" | Only used when custom_filename is true |
| filename_time_format | string | no | "yyyy.MM.dd" | Only used when custom_filename is true |
| file_format_type | string | no | "csv" | |
-| field_delimiter | string | no | '\001' | Only used when file_format is text |
-| row_delimiter | string | no | "\n" | Only used when file_format is text |
+| field_delimiter | string | no | '\001' | Only used when file_format_type is text |
+| row_delimiter | string | no | "\n" | Only used when file_format_type is text |
| have_partition | boolean | no | false | Whether you need processing partitions. |
| partition_by | array | no | - | Only used then have_partition is true |
| partition_dir_expression | string | no | "${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/" | Only used then have_partition is true |
@@ -55,8 +55,8 @@ By default, we use 2PC commit to ensure `exactly-once`
| kerberos_keytab_path | string | no | - | |
| compress_codec | string | no | none | |
| common-options | object | no | - | |
-| max_rows_in_memory | int | no | - | Only used when file_format is excel. |
-| sheet_name | string | no | Sheet${Random number} | Only used when file_format is excel. |
+| max_rows_in_memory | int | no | - | Only used when file_format_type is excel. |
+| sheet_name | string | no | Sheet${Random number} | Only used when file_format_type is excel. |
### fs.defaultFS [string]
@@ -104,7 +104,7 @@ We supported as the following file types:
`text` `json` `csv` `orc` `parquet` `excel`
-Please note that, The final file name will end with the file_format's suffix, the suffix of the text file is `txt`.
+Please note that, The final file name will end with the file_format_type's suffix, the suffix of the text file is `txt`.
### field_delimiter [string]
@@ -198,7 +198,7 @@ For orc file format simple config
HdfsFile {
fs.defaultFS = "hdfs://hadoopcluster"
path = "/tmp/hive/warehouse/test2"
- file_format = "orc"
+ file_format_type = "orc"
}
```
diff --git a/docs/en/connector-v2/sink/Jdbc.md b/docs/en/connector-v2/sink/Jdbc.md
index f128f6b4b218..9d68278cf51e 100644
--- a/docs/en/connector-v2/sink/Jdbc.md
+++ b/docs/en/connector-v2/sink/Jdbc.md
@@ -74,6 +74,8 @@ Use this sql write upstream input datas to database. e.g `INSERT ...`
The compatible mode of database, required when the database supports multiple compatible modes. For example, when using OceanBase database, you need to set it to 'mysql' or 'oracle'.
+For Postgres version 9.5 or below, please set it to `postgresLow` to support CDC.
+
### database [string]
Use this `database` and `table-name` auto-generate sql and receive upstream input datas write to database.
@@ -226,6 +228,26 @@ sink {
}
```
+For Postgresql 9.5 or below, CDC (Change data capture) events are supported with the following configuration:
+
+```
+sink {
+ jdbc {
+ url = "jdbc:postgresql://localhost:5432"
+ driver = "org.postgresql.Driver"
+ user = "root"
+ password = "123456"
+ compatible_mode="postgresLow"
+ database = "sink_database"
+ table = "sink_table"
+ support_upsert_by_query_primary_key_exist = true
+ generate_sink_sql = true
+ primary_keys = ["key1", "key2", ...]
+ }
+}
+
+```
+
## Changelog
### 2.2.0-beta 2022-09-26
diff --git a/docs/en/connector-v2/sink/Kafka.md b/docs/en/connector-v2/sink/Kafka.md
index 4dbd3a84ce7f..1e258a058adb 100644
--- a/docs/en/connector-v2/sink/Kafka.md
+++ b/docs/en/connector-v2/sink/Kafka.md
@@ -1,36 +1,52 @@
# Kafka
> Kafka sink connector
->
- ## Description
-Write Rows to a Kafka topic.
+## Support Those Engines
+
+> Spark
+> Flink
+> SeaTunnel Zeta
-## Key features
+## Key Features
- [x] [exactly-once](../../concept/connector-v2-features.md)
+- [ ] [cdc](../../concept/connector-v2-features.md)
+
+> By default, we will use 2pc to guarantee the message is sent to kafka exactly once.
+
+## Description
+
+Write Rows to a Kafka topic.
-By default, we will use 2pc to guarantee the message is sent to kafka exactly once.
+## Supported DataSource Info
-## Options
+In order to use the Kafka connector, the following dependencies are required.
+They can be downloaded via install-plugin.sh or from the Maven central repository.
-| name | type | required | default value |
-|----------------------|--------|----------|---------------|
-| topic | string | yes | - |
-| bootstrap.servers | string | yes | - |
-| kafka.config | map | no | - |
-| semantics | string | no | NON |
-| partition_key_fields | array | no | - |
-| partition | int | no | - |
-| assign_partitions | array | no | - |
-| transaction_prefix | string | no | - |
-| format | String | no | json |
-| field_delimiter | String | no | , |
-| common-options | config | no | - |
+| Datasource | Supported Versions | Maven |
+|------------|--------------------|-------------------------------------------------------------------------------------------------------------|
+| Kafka | Universal | [Download](https://mvnrepository.com/artifact/org.apache.seatunnel/seatunnel-connectors-v2/connector-kafka) |
-### topic [string]
+## Sink Options
-Kafka Topic.
+| Name | Type | Required | Default | Description |
+|----------------------|--------|----------|---------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| topic | String | Yes | - | When the table is used as sink, the topic name is the topic to write data to. |
+| bootstrap.servers | String | Yes | - | Comma separated list of Kafka brokers. |
+| kafka.config | Map | No | - | In addition to the above parameters that must be specified by the `Kafka producer` client, the user can also specify multiple non-mandatory parameters for the `producer` client, covering [all the producer parameters specified in the official Kafka document](https://kafka.apache.org/documentation.html#producerconfigs). |
+| semantics | String | No | NON | Semantics that can be chosen EXACTLY_ONCE/AT_LEAST_ONCE/NON, default NON. |
+| partition_key_fields | Array | No | - | Configure which fields are used as the key of the kafka message. |
+| partition | Int | No | - | We can specify the partition, all messages will be sent to this partition. |
+| assign_partitions | Array | No | - | We can decide which partition to send based on the content of the message. The function of this parameter is to distribute information. |
+| transaction_prefix   | String | No       | -       | If semantic is specified as EXACTLY_ONCE, the producer will write all messages in a Kafka transaction. Kafka distinguishes different transactions by different transactionId. This parameter is a prefix of the kafka transactionId; make sure different jobs use different prefixes. |
+| format               | String | No       | json    | Data format. The default format is json. Optional formats are text, canal-json and debezium-json. If you use json or text format, the default field delimiter is ", "; if you customize the delimiter, add the "field_delimiter" option. If you use canal format, please refer to [canal-json](../formats/canal-json.md) for details. If you use debezium format, please refer to [debezium-json](../formats/debezium-json.md) for details. |
+| field_delimiter      | String | No       | ,       | Customize the field delimiter for data format.                                                                                                                                                                                                                                        |
+| common-options       |        | No       | -       | Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details                                                                                                                                                                                   |
+
+## Parameter Interpretation
+
+### Topic Formats
Currently two formats are supported:
@@ -47,27 +63,13 @@ Currently two formats are supported:
If `${name}` is set as the topic. So the first row is sent to Jack topic, and the second row is sent to Mary topic.
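+
+A sketch of the dynamic-topic form, assuming the upstream rows contain a `name` field as described above (the other options are placeholders):
+
+```hocon
+sink {
+  kafka {
+    # Each row is routed to the topic named by its upstream `name` field
+    topic = "${name}"
+    bootstrap.servers = "localhost:9092"
+    format = json
+  }
+}
+```
+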
-### bootstrap.servers [string]
-
-Kafka Brokers List.
-
-### kafka.config [kafka producer config]
-
-In addition to the above parameters that must be specified by the `Kafka producer` client, the user can also specify multiple non-mandatory parameters for the `producer` client, covering [all the producer parameters specified in the official Kafka document](https://kafka.apache.org/documentation.html#producerconfigs).
-
-### semantics [string]
-
-Semantics that can be chosen EXACTLY_ONCE/AT_LEAST_ONCE/NON, default NON.
+### Semantics
In EXACTLY_ONCE, producer will write all messages in a Kafka transaction that will be committed to Kafka on a checkpoint.
-
In AT_LEAST_ONCE, producer will wait for all outstanding messages in the Kafka buffers to be acknowledged by the Kafka producer on a checkpoint.
-
NON does not provide any guarantees: messages may be lost in case of issues on the Kafka broker and messages may be duplicated.
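+
+A minimal sketch of an exactly-once sink block under these semantics (the topic, servers and transaction prefix values are placeholders):
+
+```hocon
+sink {
+  kafka {
+    topic = "test_topic"
+    bootstrap.servers = "localhost:9092"
+    # EXACTLY_ONCE writes each checkpoint's messages inside a Kafka transaction
+    semantics = EXACTLY_ONCE
+    # Use a job-unique prefix so transactions from different jobs do not collide
+    transaction_prefix = "seatunnel_job_1_"
+    format = json
+  }
+}
+```
+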
-### partition_key_fields [array]
-
-Configure which fields are used as the key of the kafka message.
+### Partition Key Fields
For example, if you want to use value of fields from upstream data as key, you can assign field names to this property.
@@ -79,53 +81,48 @@ Upstream data is the following:
| Mary | 23 | data-example2 |
If name is set as the key, then the hash value of the name column will determine which partition the message is sent to.
-
If the partition key fields are not set, a null message key will be sent.
-
The format of the message key is json. If name is set as the key, the key would be, for example, '{"name":"Jack"}'.
-
The selected field must be an existing field in the upstream.
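+
+A minimal sketch that uses the `name` field from the example above as the message key (broker address is illustrative):
+
+```hocon
+sink {
+  kafka {
+    topic = "test_topic"
+    bootstrap.servers = "localhost:9092"
+    # The hash of the json key, e.g. {"name":"Jack"}, decides the target partition
+    partition_key_fields = ["name"]
+  }
+}
+```
+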
-### partition [int]
-
-We can specify the partition, all messages will be sent to this partition.
-
-### assign_partitions [array]
-
-We can decide which partition to send based on the content of the message. The function of this parameter is to distribute information.
+### Assign Partitions
For example, there are five partitions in total, and the assign_partitions field in config is as follows:
assign_partitions = ["shoe", "clothing"]
-
Then the message containing "shoe" will be sent to partition zero, because "shoe" is subscribed as zero in assign_partitions, and the message containing "clothing" will be sent to partition one. For other messages, the hash algorithm will be used to divide them into the remaining partitions.
-
This function is implemented by the `MessageContentPartitioner` class, which implements the `org.apache.kafka.clients.producer.Partitioner` interface. If we need custom partitions, we need to implement this interface as well.
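+
+A minimal sketch of the example above (broker address is illustrative):
+
+```hocon
+sink {
+  kafka {
+    topic = "test_topic"
+    bootstrap.servers = "localhost:9092"
+    # Messages containing "shoe" go to partition 0 and messages containing "clothing" to partition 1;
+    # other messages are hashed across the remaining partitions
+    assign_partitions = ["shoe", "clothing"]
+  }
+}
+```
+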
-### transaction_prefix [string]
-
-If semantic is specified as EXACTLY_ONCE, the producer will write all messages in a Kafka transaction.
-Kafka distinguishes different transactions by different transactionId. This parameter is prefix of kafka transactionId, make sure different job use different prefix.
-
-### format
+## Task Example
-Data format. The default format is json. Optional text format. The default field separator is ",".
-If you customize the delimiter, add the "field_delimiter" option.
+### Simple:
-### field_delimiter
+> This example defines a SeaTunnel synchronization task that automatically generates data through FakeSource and sends it to Kafka Sink. FakeSource generates a total of 16 rows of data (row.num=16), with each row having two fields, name (string type) and age (int type). The final target topic is test_topic, and there will also be 16 rows of data in the topic. If you have not yet installed and deployed SeaTunnel, you need to follow the instructions in [Install SeaTunnel](../../start-v2/locally/deployment.md) to install and deploy SeaTunnel. And then follow the instructions in [Quick Start With SeaTunnel Engine](../../start-v2/locally/quick-start-seatunnel-engine.md) to run this job.
-Customize the field delimiter for data format.
-
-### common options [config]
-
-Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details.
+```hocon
+# Defining the runtime environment
+env {
+ # You can set flink configuration here
+ execution.parallelism = 1
+ job.mode = "BATCH"
+}
-## Examples
+source {
+ FakeSource {
+ parallelism = 1
+ result_table_name = "fake"
+ row.num = 16
+ schema = {
+ fields {
+ name = "string"
+ age = "int"
+ }
+ }
+ }
+}
-```hocon
sink {
-
kafka {
- topic = "seatunnel"
+ topic = "test_topic"
bootstrap.servers = "localhost:9092"
partition = 3
format = json
@@ -137,7 +134,6 @@ sink {
buffer.memory = 33554432
}
}
-
}
```
@@ -160,7 +156,6 @@ sink {
sasl.jaas.config="org.apache.kafka.common.security.scram.ScramLoginModule required \nusername=${username}\npassword=${password};"
}
}
-
}
```
@@ -197,20 +192,6 @@ sink {
sasl.client.callback.handler.class="software.amazon.msk.auth.iam.IAMClientCallbackHandler"
}
}
-
}
```
-## Changelog
-
-### 2.3.0-beta 2022-10-20
-
-- Add Kafka Sink Connector
-
-### next version
-
-- [Improve] Support to specify multiple partition keys [3230](https://github.com/apache/seatunnel/pull/3230)
-- [Improve] Add text format for kafka sink connector [3711](https://github.com/apache/seatunnel/pull/3711)
-- [Improve] Support extract topic from SeaTunnelRow fields [3742](https://github.com/apache/seatunnel/pull/3742)
-- [Improve] Change Connector Custom Config Prefix To Map [3719](https://github.com/apache/seatunnel/pull/3719)
-
diff --git a/docs/en/connector-v2/sink/LocalFile.md b/docs/en/connector-v2/sink/LocalFile.md
index fb008e909a9c..8e2c1526e907 100644
--- a/docs/en/connector-v2/sink/LocalFile.md
+++ b/docs/en/connector-v2/sink/LocalFile.md
@@ -20,7 +20,7 @@ If you use SeaTunnel Engine, It automatically integrated the hadoop jar when you
By default, we use 2PC commit to ensure `exactly-once`
-- [x] file format
+- [x] file format type
- [x] text
- [x] csv
- [x] parquet
@@ -36,9 +36,9 @@ By default, we use 2PC commit to ensure `exactly-once`
| custom_filename | boolean | no | false | Whether you need custom the filename |
| file_name_expression | string | no | "${transactionId}" | Only used when custom_filename is true |
| filename_time_format | string | no | "yyyy.MM.dd" | Only used when custom_filename is true |
-| file_format | string | no | "csv" | |
-| field_delimiter | string | no | '\001' | Only used when file_format is text |
-| row_delimiter | string | no | "\n" | Only used when file_format is text |
+| file_format_type | string | no | "csv" | |
+| field_delimiter | string | no | '\001' | Only used when file_format_type is text |
+| row_delimiter | string | no | "\n" | Only used when file_format_type is text |
| have_partition | boolean | no | false | Whether you need processing partitions. |
| partition_by | array | no | - | Only used then have_partition is true |
| partition_dir_expression | string | no | "${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/" | Only used then have_partition is true |
@@ -48,8 +48,8 @@ By default, we use 2PC commit to ensure `exactly-once`
| batch_size | int | no | 1000000 | |
| compress_codec | string | no | none | |
| common-options | object | no | - | |
-| max_rows_in_memory | int | no | - | Only used when file_format is excel. |
-| sheet_name | string | no | Sheet${Random number} | Only used when file_format is excel. |
+| max_rows_in_memory | int | no | - | Only used when file_format_type is excel. |
+| sheet_name | string | no | Sheet${Random number} | Only used when file_format_type is excel. |
### path [string]
@@ -83,13 +83,13 @@ When the format in the `file_name_expression` parameter is `xxxx-${now}` , `file
| m | Minute in hour |
| s | Second in minute |
-### file_format [string]
+### file_format_type [string]
We supported as the following file types:
`text` `json` `csv` `orc` `parquet` `excel`
-Please note that, The final file name will end with the file_format's suffix, the suffix of the text file is `txt`.
+Please note that, The final file name will end with the file_format_type's suffix, the suffix of the text file is `txt`.
### field_delimiter [string]
@@ -174,7 +174,7 @@ For orc file format simple config
LocalFile {
path = "/tmp/hive/warehouse/test2"
- file_format = "orc"
+ file_format_type = "orc"
}
```
@@ -185,7 +185,7 @@ For parquet file format with `sink_columns`
LocalFile {
path = "/tmp/hive/warehouse/test2"
- file_format = "parquet"
+ file_format_type = "parquet"
sink_columns = ["name","age"]
}
@@ -197,7 +197,7 @@ For text file format with `have_partition` and `custom_filename` and `sink_colum
LocalFile {
path = "/tmp/hive/warehouse/test2"
- file_format = "text"
+ file_format_type = "text"
field_delimiter = "\t"
row_delimiter = "\n"
have_partition = true
@@ -224,7 +224,7 @@ LocalFile {
partition_dir_expression="${k0}=${v0}"
is_partition_field_write_in_file=true
file_name_expression="${transactionId}_${now}"
- file_format="excel"
+ file_format_type="excel"
filename_time_format="yyyy.MM.dd"
is_enable_transaction=true
}
diff --git a/docs/en/connector-v2/sink/OssFile.md b/docs/en/connector-v2/sink/OssFile.md
index d40cf4bf958c..a3095ecfd1a4 100644
--- a/docs/en/connector-v2/sink/OssFile.md
+++ b/docs/en/connector-v2/sink/OssFile.md
@@ -44,8 +44,8 @@ By default, we use 2PC commit to ensure `exactly-once`
| file_name_expression | string | no | "${transactionId}" | Only used when custom_filename is true |
| filename_time_format | string | no | "yyyy.MM.dd" | Only used when custom_filename is true |
| file_format_type | string | no | "csv" | |
-| field_delimiter | string | no | '\001' | Only used when file_format is text |
-| row_delimiter | string | no | "\n" | Only used when file_format is text |
+| field_delimiter | string | no | '\001' | Only used when file_format_type is text |
+| row_delimiter | string | no | "\n" | Only used when file_format_type is text |
| have_partition | boolean | no | false | Whether you need processing partitions. |
| partition_by | array | no | - | Only used then have_partition is true |
| partition_dir_expression | string | no | "${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/" | Only used then have_partition is true |
@@ -55,8 +55,8 @@ By default, we use 2PC commit to ensure `exactly-once`
| batch_size | int | no | 1000000 | |
| compress_codec | string | no | none | |
| common-options | object | no | - | |
-| max_rows_in_memory | int | no | - | Only used when file_format is excel. |
-| sheet_name | string | no | Sheet${Random number} | Only used when file_format is excel. |
+| max_rows_in_memory | int | no | - | Only used when file_format_type is excel. |
+| sheet_name | string | no | Sheet${Random number} | Only used when file_format_type is excel. |
### path [string]
@@ -112,7 +112,7 @@ We supported as the following file types:
`text` `json` `csv` `orc` `parquet` `excel`
-Please note that, The final file name will end with the file_format's suffix, the suffix of the text file is `txt`.
+Please note that, The final file name will end with the file_format_type's suffix, the suffix of the text file is `txt`.
### field_delimiter [string]
diff --git a/docs/en/connector-v2/sink/OssJindoFile.md b/docs/en/connector-v2/sink/OssJindoFile.md
index 02547f3aa6a7..1d098da009c4 100644
--- a/docs/en/connector-v2/sink/OssJindoFile.md
+++ b/docs/en/connector-v2/sink/OssJindoFile.md
@@ -44,8 +44,8 @@ By default, we use 2PC commit to ensure `exactly-once`
| file_name_expression | string | no | "${transactionId}" | Only used when custom_filename is true |
| filename_time_format | string | no | "yyyy.MM.dd" | Only used when custom_filename is true |
| file_format_type | string | no | "csv" | |
-| field_delimiter | string | no | '\001' | Only used when file_format is text |
-| row_delimiter | string | no | "\n" | Only used when file_format is text |
+| field_delimiter | string | no | '\001' | Only used when file_format_type is text |
+| row_delimiter | string | no | "\n" | Only used when file_format_type is text |
| have_partition | boolean | no | false | Whether you need processing partitions. |
| partition_by | array | no | - | Only used then have_partition is true |
| partition_dir_expression | string | no | "${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/" | Only used then have_partition is true |
@@ -55,8 +55,8 @@ By default, we use 2PC commit to ensure `exactly-once`
| batch_size | int | no | 1000000 | |
| compress_codec | string | no | none | |
| common-options | object | no | - | |
-| max_rows_in_memory | int | no | - | Only used when file_format is excel. |
-| sheet_name | string | no | Sheet${Random number} | Only used when file_format is excel. |
+| max_rows_in_memory | int | no | - | Only used when file_format_type is excel. |
+| sheet_name | string | no | Sheet${Random number} | Only used when file_format_type is excel. |
### path [string]
@@ -112,7 +112,7 @@ We supported as the following file types:
`text` `json` `csv` `orc` `parquet` `excel`
-Please note that, The final file name will end with the file_format's suffix, the suffix of the text file is `txt`.
+Please note that, The final file name will end with the file_format_type's suffix, the suffix of the text file is `txt`.
### field_delimiter [string]
diff --git a/docs/en/connector-v2/sink/S3-Redshift.md b/docs/en/connector-v2/sink/S3-Redshift.md
index 978ffc7c94fc..2e02e2f446a7 100644
--- a/docs/en/connector-v2/sink/S3-Redshift.md
+++ b/docs/en/connector-v2/sink/S3-Redshift.md
@@ -124,7 +124,7 @@ We supported as the following file types:
`text` `csv` `parquet` `orc` `json`
-Please note that, The final file name will end with the file_format's suffix, the suffix of the text file is `txt`.
+Please note that, The final file name will end with the file_format_type's suffix, the suffix of the text file is `txt`.
### filename_time_format [string]
diff --git a/docs/en/connector-v2/sink/S3File.md b/docs/en/connector-v2/sink/S3File.md
index 0892dc6a48a3..f7da9016c720 100644
--- a/docs/en/connector-v2/sink/S3File.md
+++ b/docs/en/connector-v2/sink/S3File.md
@@ -121,7 +121,6 @@ If write to `csv`, `text` file type, All column will be string.
| max_rows_in_memory | int | no | - | Only used when file_format is excel. |
| sheet_name | string | no | Sheet${Random number} | Only used when file_format is excel. |
| hadoop_s3_properties | map | no | | If you need to add a other option, you could add it here and refer to this [link](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html) |
-
|
### hadoop_s3_properties [map]
@@ -169,7 +168,7 @@ We supported as the following file types:
`text` `json` `csv` `orc` `parquet` `excel`
-Please note that, The final file name will end with the file_format's suffix, the suffix of the text file is `txt`.
+Please note that, The final file name will end with the file_format_type's suffix, the suffix of the text file is `txt`.
### field_delimiter [string]
diff --git a/docs/en/connector-v2/sink/SftpFile.md b/docs/en/connector-v2/sink/SftpFile.md
index 79643b8c8aa2..b6460f39e398 100644
--- a/docs/en/connector-v2/sink/SftpFile.md
+++ b/docs/en/connector-v2/sink/SftpFile.md
@@ -41,8 +41,8 @@ By default, we use 2PC commit to ensure `exactly-once`
| file_name_expression | string | no | "${transactionId}" | Only used when custom_filename is true |
| filename_time_format | string | no | "yyyy.MM.dd" | Only used when custom_filename is true |
| file_format_type | string | no | "csv" | |
-| field_delimiter | string | no | '\001' | Only used when file_format is text |
-| row_delimiter | string | no | "\n" | Only used when file_format is text |
+| field_delimiter | string | no | '\001' | Only used when file_format_type is text |
+| row_delimiter | string | no | "\n" | Only used when file_format_type is text |
| have_partition | boolean | no | false | Whether you need processing partitions. |
| partition_by | array | no | - | Only used then have_partition is true |
| partition_dir_expression | string | no | "${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/" | Only used then have_partition is true |
@@ -52,8 +52,8 @@ By default, we use 2PC commit to ensure `exactly-once`
| batch_size | int | no | 1000000 | |
| compress_codec | string | no | none | |
| common-options | object | no | - | |
-| max_rows_in_memory | int | no | - | Only used when file_format is excel. |
-| sheet_name | string | no | Sheet${Random number} | Only used when file_format is excel. |
+| max_rows_in_memory | int | no | - | Only used when file_format_type is excel. |
+| sheet_name | string | no | Sheet${Random number} | Only used when file_format_type is excel. |
### host [string]
@@ -109,7 +109,7 @@ We supported as the following file types:
`text` `json` `csv` `orc` `parquet` `excel`
-Please note that, The final file name will end with the file_format's suffix, the suffix of the text file is `txt`.
+Please note that, The final file name will end with the file_format_type's suffix, the suffix of the text file is `txt`.
### field_delimiter [string]
diff --git a/docs/en/connector-v2/sink/Vertica.md b/docs/en/connector-v2/sink/Vertica.md
new file mode 100644
index 000000000000..0db8571d55f2
--- /dev/null
+++ b/docs/en/connector-v2/sink/Vertica.md
@@ -0,0 +1,173 @@
+# Vertica
+
+> JDBC Vertica Sink Connector
+
+## Support Those Engines
+
+> Spark
+> Flink
+> SeaTunnel Zeta
+
+## Key Features
+
+- [x] [exactly-once](../../concept/connector-v2-features.md)
+- [ ] [cdc](../../concept/connector-v2-features.md)
+
+> Use `Xa transactions` to ensure `exactly-once`. So `exactly-once` is only supported for databases that
+> support `Xa transactions`. You can set `is_exactly_once=true` to enable it.
+
+## Description
+
+Write data through jdbc. Support Batch mode and Streaming mode, support concurrent writing, support exactly-once
+semantics (using XA transaction guarantee).
+
+## Supported DataSource Info
+
+| Datasource | Supported Versions | Driver | Url | Maven |
+|------------|----------------------------------------------------------|-------------------------|---------------------------------------|----------------------------------------------------------------------|
+| Vertica | Different dependency version has different driver class. | com.vertica.jdbc.Driver | jdbc:vertica://localhost:5433/vertica | [Download](https://www.vertica.com/download/vertica/client-drivers/) |
+
+## Database Dependency
+
+> Please download the support list corresponding to 'Maven' and copy it to the '$SEATUNNEL_HOME/plugins/jdbc/lib/' working directory
+> For example Vertica datasource: cp vertica-jdbc-xxx.jar $SEATUNNEL_HOME/plugins/jdbc/lib/
+
+## Data Type Mapping
+
+| Vertica Data type | SeaTunnel Data type |
+|-----------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------|
+| BIT(1)<br/>INT UNSIGNED | BOOLEAN |
+| TINYINT<br/>TINYINT UNSIGNED<br/>SMALLINT<br/>SMALLINT UNSIGNED<br/>MEDIUMINT<br/>MEDIUMINT UNSIGNED<br/>INT<br/>INTEGER<br/>YEAR | INT |
+| INT UNSIGNED<br/>INTEGER UNSIGNED<br/>BIGINT | BIGINT |
+| BIGINT UNSIGNED | DECIMAL(20,0) |
+| DECIMAL(x,y)(Get the designated column's specified column size.<38) | DECIMAL(x,y) |
+| DECIMAL(x,y)(Get the designated column's specified column size.>38) | DECIMAL(38,18) |
+| DECIMAL UNSIGNED | DECIMAL((Get the designated column's specified column size)+1, (Gets the designated column's number of digits to right of the decimal point.)) |
+| FLOAT<br/>FLOAT UNSIGNED | FLOAT |
+| DOUBLE<br/>DOUBLE UNSIGNED | DOUBLE |
+| CHAR<br/>VARCHAR<br/>TINYTEXT<br/>MEDIUMTEXT<br/>TEXT<br/>LONGTEXT<br/>JSON | STRING |
+| DATE | DATE |
+| TIME | TIME |
+| DATETIME<br/>TIMESTAMP | TIMESTAMP |
+| TINYBLOB<br/>MEDIUMBLOB<br/>BLOB<br/>LONGBLOB<br/>BINARY<br/>VARBINARY<br/>BIT(n) | BYTES |
+| GEOMETRY<br/>UNKNOWN | Not supported yet |
+
+## Sink Options
+
+| Name | Type | Required | Default | Description |
+|-------------------------------------------|---------|----------|---------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| url                                        | String  | Yes      | -       | The URL of the JDBC connection. Refer to a case: jdbc:vertica://localhost:5433/vertica |
+| driver                                     | String  | Yes      | -       | The jdbc class name used to connect to the remote data source, if you use Vertica the value is `com.vertica.jdbc.Driver`. |
+| user                                       | String  | No       | -       | Connection instance user name |
+| password                                   | String  | No       | -       | Connection instance password |
+| query                                      | String  | No       | -       | Use this sql to write upstream input data to the database, e.g. `INSERT ...`. `query` has the higher priority |
+| database                                   | String  | No       | -       | Use this `database` and `table-name` to auto-generate sql and receive upstream input data to write to the database. This option is mutually exclusive with `query` and has a higher priority. |
+| table                                      | String  | No       | -       | Use database and this table-name to auto-generate sql and receive upstream input data to write to the database. This option is mutually exclusive with `query` and has a higher priority. |
+| primary_keys                               | Array   | No       | -       | This option is used to support operations such as `insert`, `delete`, and `update` when the sql is automatically generated. |
+| support_upsert_by_query_primary_key_exist  | Boolean | No       | false   | Choose to use INSERT sql, UPDATE sql to process update events(INSERT, UPDATE_AFTER) based on query primary key exists. This configuration is only used when the database does not support upsert syntax. **Note**: this method has low performance |
+| connection_check_timeout_sec               | Int     | No       | 30      | The time in seconds to wait for the database operation used to validate the connection to complete. |
+| max_retries                                | Int     | No       | 0       | The number of retries to submit failed (executeBatch) |
+| batch_size                                 | Int     | No       | 1000    | For batch writing, when the number of buffered records reaches the number of `batch_size` or the time reaches `batch_interval_ms`, the data will be flushed into the database |
+| batch_interval_ms                          | Int     | No       | 1000    | For batch writing, when the number of buffers reaches the number of `batch_size` or the time reaches `batch_interval_ms`, the data will be flushed into the database |
+| is_exactly_once                            | Boolean | No       | false   | Whether to enable exactly-once semantics, which will use Xa transactions. If on, you need to set `xa_data_source_class_name`. |
+| generate_sink_sql                          | Boolean | No       | false   | Generate sql statements based on the database table you want to write to |
+| xa_data_source_class_name                  | String  | No       | -       | The xa data source class name of the database Driver, for example, for Vertica it is `com.vertical.cj.jdbc.VerticalXADataSource`, and please refer to appendix for other data sources |
+| max_commit_attempts                        | Int     | No       | 3       | The number of retries for transaction commit failures |
+| transaction_timeout_sec                    | Int     | No       | -1      | The timeout after the transaction is opened, the default is -1 (never timeout). Note that setting the timeout may affect exactly-once semantics |
+| auto_commit                                | Boolean | No       | true    | Automatic transaction commit is enabled by default |
+| common-options                             |         | No       | -       | Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details |
+
+### Tips
+
+> If partition_column is not set, it will run in single concurrency, and if partition_column is set, it will be executed in parallel according to the concurrency of tasks.
+
+## Task Example
+
+### Simple:
+
+> This example defines a SeaTunnel synchronization task that automatically generates data through FakeSource and sends it to JDBC Sink. FakeSource generates a total of 16 rows of data (row.num=16), with each row having two fields, name (string type) and age (int type). The final target table is test_table, and there will also be 16 rows of data in the table. Before running this job, you need to create database test and table test_table in your Vertica. And if you have not yet installed and deployed SeaTunnel, you need to follow the instructions in [Install SeaTunnel](../../start-v2/locally/deployment.md) to install and deploy SeaTunnel. And then follow the instructions in [Quick Start With SeaTunnel Engine](../../start-v2/locally/quick-start-seatunnel-engine.md) to run this job.
+
+```
+# Defining the runtime environment
+env {
+ # You can set flink configuration here
+ execution.parallelism = 1
+ job.mode = "BATCH"
+}
+
+source {
+ # This is a example source plugin **only for test and demonstrate the feature source plugin**
+ FakeSource {
+ parallelism = 1
+ result_table_name = "fake"
+ row.num = 16
+ schema = {
+ fields {
+ name = "string"
+ age = "int"
+ }
+ }
+ }
+ # If you would like to get more information about how to configure seatunnel and see full list of source plugins,
+ # please go to https://seatunnel.apache.org/docs/category/source-v2
+}
+
+transform {
+ # If you would like to get more information about how to configure seatunnel and see full list of transform plugins,
+ # please go to https://seatunnel.apache.org/docs/category/transform-v2
+}
+
+sink {
+ jdbc {
+ url = "jdbc:vertica://localhost:5433/vertica"
+ driver = "com.vertica.jdbc.Driver"
+ user = "root"
+ password = "123456"
+ query = "insert into test_table(name,age) values(?,?)"
+ }
+ # If you would like to get more information about how to configure seatunnel and see full list of sink plugins,
+ # please go to https://seatunnel.apache.org/docs/category/sink-v2
+}
+```
+
+### Generate Sink SQL
+
+> This example does not need to write complex sql statements; you can configure the database name and table name to automatically generate insert statements for you
+
+```
+sink {
+ jdbc {
+ url = "jdbc:vertica://localhost:5433/vertica"
+ driver = "com.vertica.jdbc.Driver"
+ user = "root"
+ password = "123456"
+ # Automatically generate sql statements based on database table names
+ generate_sink_sql = true
+ database = test
+ table = test_table
+ }
+}
+```
+
+### Exactly-once :
+
+> For scenarios that require accurate writes, we guarantee exactly-once
+
+```
+sink {
+ jdbc {
+ url = "jdbc:vertica://localhost:5433/vertica"
+ driver = "com.vertica.jdbc.Driver"
+
+ max_retries = 0
+ user = "root"
+ password = "123456"
+ query = "insert into test_table(name,age) values(?,?)"
+
+ is_exactly_once = "true"
+
+ xa_data_source_class_name = "com.vertical.cj.jdbc.VerticalXADataSource"
+ }
+}
+```
+
diff --git a/docs/en/connector-v2/source/Clickhouse.md b/docs/en/connector-v2/source/Clickhouse.md
index 07384875cb0d..7596bf72a8f0 100644
--- a/docs/en/connector-v2/source/Clickhouse.md
+++ b/docs/en/connector-v2/source/Clickhouse.md
@@ -2,93 +2,96 @@
> Clickhouse source connector
-## Description
+## Support Those Engines
-Used to read data from Clickhouse.
+> Spark
+> Flink
+> SeaTunnel Zeta
-## Key features
+## Key Features
- [x] [batch](../../concept/connector-v2-features.md)
- [ ] [stream](../../concept/connector-v2-features.md)
- [ ] [exactly-once](../../concept/connector-v2-features.md)
- [x] [column projection](../../concept/connector-v2-features.md)
-
-supports query SQL and can achieve projection effect.
-
- [ ] [parallelism](../../concept/connector-v2-features.md)
- [ ] [support user-defined split](../../concept/connector-v2-features.md)
-## Options
-
-| name | type | required | default value |
-|------------------|--------|----------|------------------------|
-| host | string | yes | - |
-| database | string | yes | - |
-| sql | string | yes | - |
-| username | string | yes | - |
-| password | string | yes | - |
-| server_time_zone | string | no | ZoneId.systemDefault() |
-| common-options | | no | - |
-
-### host [string]
-
-`ClickHouse` cluster address, the format is `host:port` , allowing multiple `hosts` to be specified. Such as `"host1:8123,host2:8123"` .
-
-### database [string]
-
-The `ClickHouse` database
-
-### sql [string]
-
-The query sql used to search data though Clickhouse server
-
-### username [string]
-
-`ClickHouse` user username
-
-### password [string]
-
-`ClickHouse` user password
+> supports query SQL and can achieve projection effect.
-### server_time_zone [string]
-
-The session time zone in database server. If not set, then ZoneId.systemDefault() is used to determine the server time zone.
-
-### common options
+## Description
-Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details
+Used to read data from Clickhouse.
-## Examples
+## Supported DataSource Info
+
+In order to use the Clickhouse connector, the following dependencies are required.
+They can be downloaded via install-plugin.sh or from the Maven central repository.
+
+| Datasource | Supported Versions | Dependency |
+|------------|--------------------|------------------------------------------------------------------------------------------------------------------|
+| Clickhouse | universal | [Download](https://mvnrepository.com/artifact/org.apache.seatunnel/seatunnel-connectors-v2/connector-clickhouse) |
+
+## Data Type Mapping
+
+| Clickhouse Data type | SeaTunnel Data type |
+|-----------------------------------------------------------------------------------------------------------------------------------------------|---------------------|
+| String / Int128 / UInt128 / Int256 / UInt256 / Point / Ring / Polygon MultiPolygon | STRING |
+| Int8 / UInt8 / Int16 / UInt16 / Int32 | INT |
+| UInt64 / Int64 / IntervalYear / IntervalQuarter / IntervalMonth / IntervalWeek / IntervalDay / IntervalHour / IntervalMinute / IntervalSecond | BIGINT |
+| Float64 | DOUBLE |
+| Decimal | DECIMAL |
+| Float32 | FLOAT |
+| Date | DATE |
+| DateTime | TIME |
+| Array | ARRAY |
+| Map | MAP |
+
+## Source Options
+
+| Name | Type | Required | Default | Description |
+|------------------|--------|----------|------------------------|------------------------------------------------------------------------------------------------------------------------------------------|
+| host | String | Yes | - | `ClickHouse` cluster address, the format is `host:port` , allowing multiple `hosts` to be specified. Such as `"host1:8123,host2:8123"` . |
+| database | String | Yes | - | The `ClickHouse` database. |
+| sql | String | Yes | - | The query sql used to search data though Clickhouse server. |
+| username | String | Yes | - | `ClickHouse` user username. |
+| password | String | Yes | - | `ClickHouse` user password. |
+| server_time_zone | String | No | ZoneId.systemDefault() | The session time zone in database server. If not set, then ZoneId.systemDefault() is used to determine the server time zone. |
+| common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details. |
+
+## How to Create a Clickhouse Data Synchronization Job
+
+The following example demonstrates how to create a data synchronization job that reads data from Clickhouse and prints it on the local client:
+
+```hocon
+# Set the basic configuration of the task to be performed
+env {
+ execution.parallelism = 1
+ job.mode = "BATCH"
+}
-```hocon
+# Create a source to connect to Clickhouse
source {
-
Clickhouse {
host = "localhost:8123"
database = "default"
sql = "select * from test where age = 20 limit 100"
- username = "default"
- password = ""
+ username = "xxxxx"
+ password = "xxxxx"
server_time_zone = "UTC"
result_table_name = "test"
}
-
}
-```
-
-## Changelog
-### 2.2.0-beta 2022-09-26
-
-- Add ClickHouse Source Connector
-
-### 2.3.0-beta 2022-10-20
-
-- [Improve] Clickhouse Source random use host when config multi-host ([3108](https://github.com/apache/seatunnel/pull/3108))
-
-### next version
+# Console printing of the read Clickhouse data
+sink {
+ Console {
+ parallelism = 1
+ }
+}
+```
-- [Improve] Clickhouse Source support nest type and array type([3047](https://github.com/apache/seatunnel/pull/3047))
+### Tips
-- [Improve] Clickhouse Source support geo type([3141](https://github.com/apache/seatunnel/pull/3141))
+> 1. [SeaTunnel Deployment Document](../../start-v2/locally/deployment.md).
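+> 2. Column projection can be achieved through the `sql` option; a minimal sketch (table and column names are illustrative):
+
+```hocon
+source {
+  Clickhouse {
+    host = "localhost:8123"
+    database = "default"
+    # Only the projected columns are read from the Clickhouse server
+    sql = "select name, age from test where age = 20 limit 100"
+    username = "xxxxx"
+    password = "xxxxx"
+    result_table_name = "test"
+  }
+}
+```
+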
diff --git a/docs/en/connector-v2/source/CosFile.md b/docs/en/connector-v2/source/CosFile.md
new file mode 100644
index 000000000000..dd1e77ebcfd0
--- /dev/null
+++ b/docs/en/connector-v2/source/CosFile.md
@@ -0,0 +1,294 @@
+# CosFile
+
+> Cos file source connector
+
+## Description
+
+Read data from Tencent Cloud COS file system.
+
+:::tip
+
+If you use spark/flink, In order to use this connector, You must ensure your spark/flink cluster already integrated hadoop. The tested hadoop version is 2.x.
+
+If you use SeaTunnel Engine, It automatically integrated the hadoop jar when you download and install SeaTunnel Engine. You can check the jar package under ${SEATUNNEL_HOME}/lib to confirm this.
+
+To use this connector you need put hadoop-cos-{hadoop.version}-{version}.jar and cos_api-bundle-{version}.jar in ${SEATUNNEL_HOME}/lib dir, download: [Hadoop-Cos-release](https://github.com/tencentyun/hadoop-cos/releases). It only supports hadoop version 2.6.5+ and version 8.0.2+.
+
+:::
+
+## Key features
+
+- [x] [batch](../../concept/connector-v2-features.md)
+- [ ] [stream](../../concept/connector-v2-features.md)
+- [x] [exactly-once](../../concept/connector-v2-features.md)
+
+Read all the data in a split in a pollNext call. What splits are read will be saved in snapshot.
+
+- [x] [column projection](../../concept/connector-v2-features.md)
+- [x] [parallelism](../../concept/connector-v2-features.md)
+- [ ] [support user-defined split](../../concept/connector-v2-features.md)
+- [x] file format type
+ - [x] text
+ - [x] csv
+ - [x] parquet
+ - [x] orc
+ - [x] json
+ - [x] excel
+
+## Options
+
+| name | type | required | default value |
+|---------------------------|---------|----------|---------------------|
+| path | string | yes | - |
+| file_format_type | string | yes | - |
+| bucket | string | yes | - |
+| secret_id | string | yes | - |
+| secret_key | string | yes | - |
+| region | string | yes | - |
+| read_columns | list | yes | - |
+| delimiter | string | no | \001 |
+| parse_partition_from_path | boolean | no | true |
+| skip_header_row_number | long | no | 0 |
+| date_format | string | no | yyyy-MM-dd |
+| datetime_format | string | no | yyyy-MM-dd HH:mm:ss |
+| time_format | string | no | HH:mm:ss |
+| schema | config | no | - |
+| common-options | | no | - |
+| sheet_name | string | no | - |
+| file_filter_pattern | string | no | - |
+
+### path [string]
+
+The source file path.
+
+### delimiter [string]
+
+Field delimiter, used to tell connector how to slice and dice fields when reading text files
+
+default `\001`, the same as hive's default delimiter
+
+### parse_partition_from_path [boolean]
+
+Control whether parse the partition keys and values from file path
+
+For example if you read a file from path `cosn://hadoop-cluster/tmp/seatunnel/parquet/name=tyrantlucifer/age=26`
+
+Every record data from file will be added these two fields:
+
+| name | age |
+|---------------|-----|
+| tyrantlucifer | 26 |
+
+Tips: **Do not define partition fields in schema option**
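+
+A minimal sketch (bucket, credentials and region are illustrative) that reads the partitioned path above:
+
+```hocon
+CosFile {
+  path = "/tmp/seatunnel/parquet"
+  bucket = "cosn://seatunnel-test-1259587829"
+  secret_id = "xxxxxxxxxxxxxxxxxxx"
+  secret_key = "xxxxxxxxxxxxxxxxxxx"
+  region = "ap-chengdu"
+  file_format_type = "parquet"
+  # name and age are parsed from .../name=tyrantlucifer/age=26 and appended to every record
+  parse_partition_from_path = true
+}
+```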
+
+### date_format [string]
+
+Date type format, used to tell connector how to convert string to date, supported as the following formats:
+
+`yyyy-MM-dd` `yyyy.MM.dd` `yyyy/MM/dd`
+
+default `yyyy-MM-dd`
+
+### datetime_format [string]
+
+Datetime type format, used to tell connector how to convert string to datetime, supported as the following formats:
+
+`yyyy-MM-dd HH:mm:ss` `yyyy.MM.dd HH:mm:ss` `yyyy/MM/dd HH:mm:ss` `yyyyMMddHHmmss`
+
+default `yyyy-MM-dd HH:mm:ss`
+
+### time_format [string]
+
+Time type format, used to tell connector how to convert string to time, supported as the following formats:
+
+`HH:mm:ss` `HH:mm:ss.SSS`
+
+default `HH:mm:ss`
+
+### skip_header_row_number [long]
+
+Skip the first few lines, but only for the txt and csv.
+
+For example, set like following:
+
+`skip_header_row_number = 2`
+
+then SeaTunnel will skip the first 2 lines from source files
+
+### file_format_type [string]
+
+File type, supported as the following file types:
+
+`text` `csv` `parquet` `orc` `json` `excel`
+
+If you assign file type to `json`, you should also assign schema option to tell connector how to parse data to the row you want.
+
+For example:
+
+upstream data is the following:
+
+```json
+
+{"code": 200, "data": "get success", "success": true}
+
+```
+
+You can also save multiple pieces of data in one file and split them by newline:
+
+```json lines
+
+{"code": 200, "data": "get success", "success": true}
+{"code": 300, "data": "get failed", "success": false}
+
+```
+
+you should assign schema as the following:
+
+```hocon
+
+schema {
+ fields {
+ code = int
+ data = string
+ success = boolean
+ }
+}
+
+```
+
+connector will generate data as the following:
+
+| code | data | success |
+|------|-------------|---------|
+| 200 | get success | true |
+
+If you assign file type to `parquet` `orc`, schema option not required, connector can find the schema of upstream data automatically.
+
+If you assign file type to `text` `csv`, you can choose to specify the schema information or not.
+
+For example, upstream data is the following:
+
+```text
+
+tyrantlucifer#26#male
+
+```
+
+If you do not assign data schema connector will treat the upstream data as the following:
+
+| content |
+|-----------------------|
+| tyrantlucifer#26#male |
+
+If you assign data schema, you should also assign the option `delimiter`, except for the CSV file type
+
+you should assign schema and delimiter as the following:
+
+```hocon
+
+delimiter = "#"
+schema {
+ fields {
+ name = string
+ age = int
+ gender = string
+ }
+}
+
+```
+
+connector will generate data as the following:
+
+| name | age | gender |
+|---------------|-----|--------|
+| tyrantlucifer | 26 | male |
+
+### bucket [string]
+
+The bucket address of Cos file system, for example: `Cos://tyrantlucifer-image-bed`
+
+### secret_id [string]
+
+The secret id of Cos file system.
+
+### secret_key [string]
+
+The secret key of Cos file system.
+
+### region [string]
+
+The region of cos file system.
+
+### schema [config]
+
+#### fields [Config]
+
+The schema of upstream data.
+
+### read_columns [list]
+
+The read column list of the data source, user can use it to implement field projection.
+
+The file type supported column projection as the following shown:
+
+- text
+- json
+- csv
+- orc
+- parquet
+- excel
+
+**Tips: If the user wants to use this feature when reading `text` `json` `csv` files, the schema option must be configured**
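+
+For example, a hedged sketch (bucket and credentials are illustrative) that only emits two of the three text columns:
+
+```hocon
+CosFile {
+  path = "/seatunnel/text"
+  bucket = "cosn://seatunnel-test-1259587829"
+  secret_id = "xxxxxxxxxxxxxxxxxxx"
+  secret_key = "xxxxxxxxxxxxxxxxxxx"
+  region = "ap-chengdu"
+  file_format_type = "text"
+  delimiter = "#"
+  schema {
+    fields {
+      name = string
+      age = int
+      gender = string
+    }
+  }
+  # Only name and age are read and emitted downstream
+  read_columns = ["name", "age"]
+}
+```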
+
+### common options
+
+Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details.
+
+### sheet_name [string]
+
+Read the sheet of the workbook, only used when file_format_type is excel.
+
+### file_filter_pattern [string]
+
+Filter pattern, which is used for filtering files.
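+
+For example, a hypothetical sketch (assuming a regex-style pattern) that only picks up files whose names end with `.orc`:
+
+```hocon
+CosFile {
+  path = "/seatunnel/orc"
+  bucket = "cosn://seatunnel-test-1259587829"
+  secret_id = "xxxxxxxxxxxxxxxxxxx"
+  secret_key = "xxxxxxxxxxxxxxxxxxx"
+  region = "ap-chengdu"
+  file_format_type = "orc"
+  # Hypothetical pattern; adjust to your own naming convention
+  file_filter_pattern = ".*\\.orc"
+}
+```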
+
+## Example
+
+```hocon
+
+ CosFile {
+ path = "/seatunnel/orc"
+ bucket = "cosn://seatunnel-test-1259587829"
+ secret_id = "xxxxxxxxxxxxxxxxxxx"
+ secret_key = "xxxxxxxxxxxxxxxxxxx"
+ region = "ap-chengdu"
+ file_format_type = "orc"
+ }
+
+```
+
+```hocon
+
+ CosFile {
+ path = "/seatunnel/json"
+ bucket = "cosn://seatunnel-test-1259587829"
+ secret_id = "xxxxxxxxxxxxxxxxxxx"
+ secret_key = "xxxxxxxxxxxxxxxxxxx"
+ region = "ap-chengdu"
+ file_format_type = "json"
+ schema {
+ fields {
+ id = int
+ name = string
+ }
+ }
+ }
+
+```
+
+## Changelog
+
+### next version
+
+- Add file cos source connector ([4979](https://github.com/apache/seatunnel/pull/4979))
+
diff --git a/docs/en/connector-v2/source/DB2.md b/docs/en/connector-v2/source/DB2.md
new file mode 100644
index 000000000000..7ea91b7165c7
--- /dev/null
+++ b/docs/en/connector-v2/source/DB2.md
@@ -0,0 +1,155 @@
+# DB2
+
+> JDBC DB2 Source Connector
+
+## Support Those Engines
+
+> Spark
+> Flink
+> SeaTunnel Zeta
+
+## Key Features
+
+- [x] [batch](../../concept/connector-v2-features.md)
+- [ ] [stream](../../concept/connector-v2-features.md)
+- [x] [exactly-once](../../concept/connector-v2-features.md)
+- [x] [column projection](../../concept/connector-v2-features.md)
+- [x] [parallelism](../../concept/connector-v2-features.md)
+- [x] [support user-defined split](../../concept/connector-v2-features.md)
+
+> supports query SQL and can achieve projection effect.
+
+## Description
+
+Read external data source data through JDBC.
+
+## Supported DataSource Info
+
+| Datasource | Supported versions | Driver | Url | Maven |
+|------------|----------------------------------------------------------|--------------------------------|-----------------------------------|-----------------------------------------------------------------------|
+| DB2 | Different dependency version has different driver class. | com.ibm.db2.jdbc.app.DB2Driver | jdbc:db2://127.0.0.1:50000/dbname | [Download](https://mvnrepository.com/artifact/com.ibm.db2.jcc/db2jcc) |
+
+## Database Dependency
+
+> Please download the support list corresponding to 'Maven' and copy it to the '$SEATUNNEL_HOME/plugins/jdbc/lib/' working directory
+> For example DB2 datasource: cp db2-connector-java-xxx.jar $SEATUNNEL_HOME/plugins/jdbc/lib/
+
+## Data Type Mapping
+
+| DB2 Data type | SeaTunnel Data type |
+|------------------------------------------------------------------------------------------------------|---------------------|
+| BOOLEAN | BOOLEAN |
+| SMALLINT | SHORT |
+| INT<br/>INTEGER | INTEGER |
+| BIGINT | LONG |
+| DECIMAL<br/>DEC<br/>NUMERIC<br/>NUM | DECIMAL(38,18) |
+| REAL | FLOAT |
+| FLOAT<br/>DOUBLE<br/>DOUBLE PRECISION<br/>DECFLOAT | DOUBLE |
+| CHAR<br/>VARCHAR<br/>LONG VARCHAR<br/>CLOB<br/>GRAPHIC<br/>VARGRAPHIC<br/>LONG VARGRAPHIC<br/>DBCLOB | STRING |
+| BLOB | BYTES |
+| DATE | DATE |
+| TIME | TIME |
+| TIMESTAMP | TIMESTAMP |
+| ROWID<br/>XML | Not supported yet |
+
+## Source Options
+
+| Name | Type | Required | Default | Description |
+|------------------------------|--------|----------|-----------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| url | String | Yes | - | The URL of the JDBC connection. Refer to a case: jdbc:db2://127.0.0.1:50000/dbname |
+| driver                       | String | Yes      | -               | The jdbc class name used to connect to the remote data source, if you use db2 the value is `com.ibm.db2.jdbc.app.DB2Driver`. |
+| user | String | No | - | Connection instance user name |
+| password | String | No | - | Connection instance password |
+| query | String | Yes | - | Query statement |
+| connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete |
+| partition_column             | String | No       | -               | The column name for parallelism's partition, only supports numeric type primary key, and only one column can be configured. |
+| partition_lower_bound | Long | No | - | The partition_column min value for scan, if not set SeaTunnel will query database get min value. |
+| partition_upper_bound | Long | No | - | The partition_column max value for scan, if not set SeaTunnel will query database get max value. |
+| partition_num | Int | No | job parallelism | The number of partition count, only support positive integer. default value is job parallelism |
+| fetch_size                   | Int    | No       | 0               | For queries that return a large number of objects, you can configure the row fetch size used in the query to improve performance by reducing the number of database hits required to satisfy the selection criteria. Zero means use jdbc default value. |
+| common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details |
+
+### Tips
+
+> If partition_column is not set, it will run in single concurrency, and if partition_column is set, it will be executed in parallel according to the concurrency of tasks.
+
+## Task Example
+
+### Simple:
+
+> This example queries 16 rows of data from the 'type_bin' table in your 'test' database in single parallelism and queries all of its fields. You can also specify which fields to query for final output to the console.
+
+```
+# Defining the runtime environment
+env {
+ # You can set flink configuration here
+ execution.parallelism = 2
+ job.mode = "BATCH"
+}
+source{
+ Jdbc {
+ url = "jdbc:db2://127.0.0.1:50000/dbname"
+ driver = "com.ibm.db2.jdbc.app.DB2Driver"
+ connection_check_timeout_sec = 100
+ user = "root"
+ password = "123456"
+ query = "select * from table_xxx"
+ }
+}
+
+transform {
+ # If you would like to get more information about how to configure seatunnel and see full list of transform plugins,
+ # please go to https://seatunnel.apache.org/docs/transform-v2/sql
+}
+
+sink {
+ Console {}
+}
+```
+
+### Parallel:
+
+> Read your query table in parallel with the shard field you configured and the shard data. You can do this if you want to read the whole table
+
+```
+source {
+ Jdbc {
+ url = "jdbc:db2://127.0.0.1:50000/dbname"
+ driver = "com.ibm.db2.jdbc.app.DB2Driver"
+ connection_check_timeout_sec = 100
+ user = "root"
+ password = "123456"
+ # Define query logic as required
+ query = "select * from type_bin"
+ # Parallel sharding reads fields
+ partition_column = "id"
+ # Number of fragments
+ partition_num = 10
+ }
+}
+```
+
+### Parallel Boundary:
+
+> It is more efficient to specify the data within the upper and lower bounds of the query, i.e. to read your data source according to the upper and lower boundaries you configured
+
+```
+source {
+ Jdbc {
+ url = "jdbc:db2://127.0.0.1:50000/dbname"
+ driver = "com.ibm.db2.jdbc.app.DB2Driver"
+ connection_check_timeout_sec = 100
+ user = "root"
+ password = "123456"
+ # Define query logic as required
+ query = "select * from type_bin"
+ partition_column = "id"
+ # Read start boundary
+ partition_lower_bound = 1
+ # Read end boundary
+ partition_upper_bound = 500
+ partition_num = 10
+ }
+}
+```
+
diff --git a/docs/en/connector-v2/source/FtpFile.md b/docs/en/connector-v2/source/FtpFile.md
index b550bde8baac..c692a7483a6d 100644
--- a/docs/en/connector-v2/source/FtpFile.md
+++ b/docs/en/connector-v2/source/FtpFile.md
@@ -48,6 +48,7 @@ If you use SeaTunnel Engine, It automatically integrated the hadoop jar when you
| schema | config | no | - |
| common-options | | no | - |
| sheet_name | string | no | - |
+| file_filter_pattern | string | no | - |
### host [string]
@@ -225,7 +226,7 @@ Source plugin common parameters, please refer to [Source Common Options](common-
### sheet_name [string]
-Reader the sheet of the workbook,Only used when file_format is excel.
+Read the sheet of the workbook, only used when file_format_type is excel.
## Example
diff --git a/docs/en/connector-v2/source/HdfsFile.md b/docs/en/connector-v2/source/HdfsFile.md
index d255f4fd3a7c..f479e40a2bc2 100644
--- a/docs/en/connector-v2/source/HdfsFile.md
+++ b/docs/en/connector-v2/source/HdfsFile.md
@@ -53,6 +53,7 @@ Read all the data in a split in a pollNext call. What splits are read will be sa
| schema | config | no | - |
| common-options | | no | - |
| sheet_name | string | no | - |
+| file_filter_pattern | string | no | - |
### path [string]
@@ -243,7 +244,11 @@ Source plugin common parameters, please refer to [Source Common Options](common-
### sheet_name [string]
-Reader the sheet of the workbook,Only used when file_format is excel.
+Read the sheet of the workbook, only used when file_format_type is excel.
+
+### file_filter_pattern [string]
+
+Filter pattern, which is used for filtering files.
## Example
diff --git a/docs/en/connector-v2/source/Hudi.md b/docs/en/connector-v2/source/Hudi.md
index cb3b154d58b6..ffe17f7de715 100644
--- a/docs/en/connector-v2/source/Hudi.md
+++ b/docs/en/connector-v2/source/Hudi.md
@@ -2,69 +2,67 @@
> Hudi source connector
-## Description
+## Support Those Engines
-Used to read data from Hudi. Currently, only supports hudi cow table and Snapshot Query with Batch Mode.
+> Spark
+> Flink
+> SeaTunnel Zeta
-In order to use this connector, You must ensure your spark/flink cluster already integrated hive. The tested hive version is 2.3.9.
-
-## Key features
+## Key Features
- [x] [batch](../../concept/connector-v2-features.md)
-
-Currently, only supports hudi cow table and Snapshot Query with Batch Mode
-
- [ ] [stream](../../concept/connector-v2-features.md)
- [x] [exactly-once](../../concept/connector-v2-features.md)
- [ ] [column projection](../../concept/connector-v2-features.md)
- [x] [parallelism](../../concept/connector-v2-features.md)
- [ ] [support user-defined split](../../concept/connector-v2-features.md)
-## Options
-
-| name | type | required | default value |
-|-------------------------|---------|------------------------------|---------------|
-| table.path | string | yes | - |
-| table.type | string | yes | - |
-| conf.files | string | yes | - |
-| use.kerberos | boolean | no | false |
-| kerberos.principal | string | yes when use.kerberos = true | - |
-| kerberos.principal.file | string | yes when use.kerberos = true | - |
-| common-options | config | no | - |
-
-### table.path [string]
-
-`table.path` The hdfs root path of hudi table,such as 'hdfs://nameserivce/data/hudi/hudi_table/'.
+## Description
-### table.type [string]
+Used to read data from Hudi. Currently, only supports hudi cow table and Snapshot Query with Batch Mode.
-`table.type` The type of hudi table. Now we only support 'cow', 'mor' is not support yet.
+In order to use this connector, You must ensure your spark/flink cluster already integrated hive. The tested hive version is 2.3.9.
-### conf.files [string]
+## Supported DataSource Info
-`conf.files` The environment conf file path list(local path), which used to init hdfs client to read hudi table file. The example is '/home/test/hdfs-site.xml;/home/test/core-site.xml;/home/test/yarn-site.xml'.
+:::tip
-### use.kerberos [boolean]
+* Currently, only supports Hudi cow table and Snapshot Query with Batch Mode
-`use.kerberos` Whether to enable Kerberos, default is false.
+:::
-### kerberos.principal [string]
+## Data Type Mapping
-`kerberos.principal` When use kerberos, we should set kerberos princal such as 'test_user@xxx'.
+| Hudi Data type | Seatunnel Data type |
+|----------------|---------------------|
+| ALL TYPE | STRING |
-### kerberos.principal.file [string]
+## Source Options
-`kerberos.principal.file` When use kerberos, we should set kerberos princal file such as '/home/test/test_user.keytab'.
+| Name | Type | Required | Default | Description |
+|-------------------------|--------|------------------------------|---------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| table.path | String | Yes | - | The hdfs root path of hudi table,such as 'hdfs://nameserivce/data/hudi/hudi_table/'. |
+| table.type | String | Yes | - | The type of hudi table. Now we only support 'cow', 'mor' is not support yet. |
+| conf.files | String | Yes | - | The environment conf file path list(local path), which used to init hdfs client to read hudi table file. The example is '/home/test/hdfs-site.xml;/home/test/core-site.xml;/home/test/yarn-site.xml'. |
+| use.kerberos | bool | No | false | Whether to enable Kerberos, default is false. |
+| kerberos.principal | String | yes when use.kerberos = true | - | When use kerberos, we should set kerberos principal such as 'test_user@xxx'. |
+| kerberos.principal.file | string | yes when use.kerberos = true | - | When use kerberos, we should set kerberos principal file such as '/home/test/test_user.keytab'. |
+| common-options | config | No | - | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details. |
-### common options
+## Task Example
-Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details.
+### Simple:
-## Examples
+> This example reads from a Hudi COW table and configures Kerberos for the environment, printing to the console.
```hocon
-source {
-
+# Defining the runtime environment
+env {
+ # You can set flink configuration here
+ execution.parallelism = 2
+ job.mode = "BATCH"
+}
+source{
Hudi {
table.path = "hdfs://nameserivce/data/hudi/hudi_table/"
table.type = "cow"
@@ -73,7 +71,15 @@ source {
kerberos.principal = "test_user@xxx"
kerberos.principal.file = "/home/test/test_user.keytab"
}
+}
+
+transform {
+ # If you would like to get more information about how to configure seatunnel and see full list of transform plugins,
+ # please go to https://seatunnel.apache.org/docs/transform-v2/sql/
+}
+sink {
+ Console {}
}
```
diff --git a/docs/en/connector-v2/source/LocalFile.md b/docs/en/connector-v2/source/LocalFile.md
index d33288b7a57a..80adfa6d9ad9 100644
--- a/docs/en/connector-v2/source/LocalFile.md
+++ b/docs/en/connector-v2/source/LocalFile.md
@@ -49,6 +49,7 @@ Read all the data in a split in a pollNext call. What splits are read will be sa
| schema | config | no | - |
| common-options | | no | - |
| sheet_name | string | no | - |
+| file_filter_pattern | string | no | - |
### path [string]
@@ -223,7 +224,11 @@ Source plugin common parameters, please refer to [Source Common Options](common-
### sheet_name [string]
-Reader the sheet of the workbook,Only used when file_format is excel.
+Read the sheet of the workbook, only used when file_format_type is excel.
+
+### file_filter_pattern [string]
+
+Filter pattern, which is used for filtering files.
## Example
diff --git a/docs/en/connector-v2/source/MongoDB-CDC.md b/docs/en/connector-v2/source/MongoDB-CDC.md
new file mode 100644
index 000000000000..d78f70110fc1
--- /dev/null
+++ b/docs/en/connector-v2/source/MongoDB-CDC.md
@@ -0,0 +1,311 @@
+# MongoDB CDC
+
+> MongoDB CDC source connector
+
+## Support Those Engines
+
+> SeaTunnel Zeta
+
+## Key Features
+
+- [ ] [batch](../../concept/connector-v2-features.md)
+- [x] [stream](../../concept/connector-v2-features.md)
+- [x] [exactly-once](../../concept/connector-v2-features.md)
+- [ ] [column projection](../../concept/connector-v2-features.md)
+- [x] [parallelism](../../concept/connector-v2-features.md)
+- [x] [support user-defined split](../../concept/connector-v2-features.md)
+
+## Description
+
+The MongoDB CDC connector allows for reading snapshot data and incremental data from MongoDB database.
+
+## Supported DataSource Info
+
+In order to use the Mongodb CDC connector, the following dependencies are required.
+They can be downloaded via install-plugin.sh or from the Maven central repository.
+
+| Datasource | Supported Versions | Dependency |
+|------------|--------------------|-------------------------------------------------------------------------------------------------------------------|
+| MongoDB | universal | [Download](https://mvnrepository.com/artifact/org.apache.seatunnel/seatunnel-connectors-v2/connector-cdc-mongodb) |
+
+## Availability Settings
+
+1. MongoDB version: MongoDB version >= 4.0.
+
+2. Cluster deployment: replica sets or sharded clusters.
+
+3. Storage Engine: WiredTiger Storage Engine.
+
+4. Permissions: changeStream and read
+
+```shell
+use admin;
+db.createRole(
+ {
+ role: "strole",
+ privileges: [{
+ resource: { db: "", collection: "" },
+ actions: [
+ "splitVector",
+ "listDatabases",
+ "listCollections",
+ "collStats",
+ "find",
+ "changeStream" ]
+ }],
+ roles: [
+ { role: 'read', db: 'config' }
+ ]
+ }
+);
+
+db.createUser(
+ {
+ user: 'stuser',
+ pwd: 'stpw',
+ roles: [
+ { role: 'strole', db: 'admin' }
+ ]
+ }
+);
+```
+
+## Data Type Mapping
+
+The following table lists the field data type mapping from MongoDB BSON type to Seatunnel data type.
+
+| MongoDB BSON type | Seatunnel Data type |
+|-------------------|---------------------|
+| ObjectId | STRING |
+| String | STRING |
+| Boolean | BOOLEAN |
+| Binary | BINARY |
+| Int32 | INTEGER |
+| Int64 | BIGINT |
+| Double | DOUBLE |
+| Decimal128 | DECIMAL |
+| Date | DATE |
+| Timestamp | TIMESTAMP |
+| Object | ROW |
+| Array | ARRAY |
+
+For specific types in MongoDB, we use Extended JSON format to map them to Seatunnel STRING type.
+
+| MongoDB BSON type | Seatunnel STRING |
+|-------------------|----------------------------------------------------------------------------------------------|
+| Symbol | {"_value": {"$symbol": "12"}} |
+| RegularExpression | {"_value": {"$regularExpression": {"pattern": "^9$", "options": "i"}}} |
+| JavaScript | {"_value": {"$code": "function() { return 10; }"}} |
+| DbPointer | {"_value": {"$dbPointer": {"$ref": "db.coll", "$id": {"$oid": "63932a00da01604af329e33c"}}}} |
+
+**Tips**
+
+> 1.When using the DECIMAL type in SeaTunnel, be aware that the maximum range cannot exceed 34 digits, which means you should use decimal(34, 18).
+
+## Source Options
+
+| Name | Type | Required | Default | Description |
+|------------------------------------|--------|----------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| hosts | String | Yes | - | The comma-separated list of hostname and port pairs of the MongoDB servers. eg. `localhost:27017,localhost:27018` |
+| username | String | No | - | Name of the database user to be used when connecting to MongoDB. |
+| password | String | No | - | Password to be used when connecting to MongoDB. |
+| database | List | Yes | - | Name of the database to watch for changes. If not set then all databases will be captured. The database also supports regular expressions to monitor multiple databases matching the regular expression. eg. `db1,db2`. |
+| collection | List | Yes | - | Name of the collection in the database to watch for changes. If not set then all collections will be captured. The collection also supports regular expressions to monitor multiple collections matching fully-qualified collection identifiers. eg. `db1.coll1,db2.coll2`. |
+| connection.options | String | No | - | The ampersand-separated connection options of MongoDB. eg. `replicaSet=test&connectTimeoutMS=300000`. |
+| batch.size | Long | No | 1024 | The cursor batch size. |
+| poll.max.batch.size                | Long   | No       | 1024    | Maximum number of change stream documents to include in a single batch when polling for new data.                                                                                                                                                            |
+| poll.await.time.ms | Long | No | 1000 | The amount of time to wait before checking for new results on the change stream. |
+| heartbeat.interval.ms | String | No | 0 | The length of time in milliseconds between sending heartbeat messages. Use 0 to disable. |
+| incremental.snapshot.chunk.size.mb | Long | No | 64 | The chunk size mb of incremental snapshot. |
+| common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details. |
+
+### Tips:
+
+> 1. If the collection changes at a slow pace, it is strongly recommended to set an appropriate value greater than 0 for the heartbeat.interval.ms parameter (see the sketch below). When we recover a SeaTunnel job from a checkpoint or savepoint, the heartbeat events can push the resumeToken forward and keep it from expiring.
+> 2. MongoDB has a limit of 16MB for a single document. Change documents include additional information, so even if the original document is not larger than 15MB, the change document may exceed the 16MB limit, causing the Change Stream operation to terminate.
+> 3. It is recommended to use immutable shard keys. In MongoDB, shard keys allow modification after transactions are enabled, but changing the shard key can cause frequent shard migrations, resulting in additional performance overhead. It can also make the Update Lookup feature ineffective, leading to inconsistent results in CDC (Change Data Capture) scenarios.
+
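+For a slowly changing collection, the heartbeat can be enabled directly in the source configuration (a minimal sketch; the interval value of 5000 ms is only illustrative):
+
+```hocon
+source {
+  MongoDB-CDC {
+    hosts = "mongo0:27017"
+    database = ["inventory"]
+    collection = ["inventory.products"]
+    username = stuser
+    password = stpw
+    heartbeat.interval.ms = 5000
+  }
+}
+```
+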
+## How to Create a MongoDB CDC Data Synchronization Job
+
+### CDC Data Print to Client
+
+The following example demonstrates how to create a data synchronization job that reads CDC data from MongoDB and prints it on the local client:
+
+```hocon
+env {
+ # You can set engine configuration here
+ execution.parallelism = 1
+ job.mode = "STREAMING"
+ execution.checkpoint.interval = 5000
+}
+
+source {
+ MongoDB-CDC {
+ hosts = "mongo0:27017"
+ database = ["inventory"]
+ collection = ["inventory.products"]
+ username = stuser
+ password = stpw
+ schema = {
+ fields {
+ "_id" : string,
+ "name" : string,
+ "description" : string,
+ "weight" : string
+ }
+ }
+ }
+}
+
+# Print the MongoDB data that was read to the console
+sink {
+ Console {
+ parallelism = 1
+ }
+}
+```
+
+## CDC Data Write to MySQL
+
+The following example demonstrates how to create a data synchronization job that reads CDC data from MongoDB and writes it to a MySQL database:
+
+```hocon
+env {
+ # You can set engine configuration here
+ execution.parallelism = 1
+ job.mode = "STREAMING"
+ execution.checkpoint.interval = 5000
+}
+
+source {
+ MongoDB-CDC {
+ hosts = "mongo0:27017"
+ database = ["inventory"]
+ collection = ["inventory.products"]
+ username = stuser
+ password = stpw
+ }
+}
+
+sink {
+ jdbc {
+ url = "jdbc:mysql://mysql_cdc_e2e:3306"
+ driver = "com.mysql.cj.jdbc.Driver"
+ user = "st_user"
+ password = "seatunnel"
+
+ generate_sink_sql = true
+ # You need to configure both database and table
+ database = mongodb_cdc
+ table = products
+ primary_keys = ["_id"]
+ }
+}
+```
+
+## Multi-table Synchronization
+
+The following example demonstrates how to create a data synchronization job that reads CDC data from multiple MongoDB databases and tables and prints it on the local client:
+
+```hocon
+env {
+ # You can set engine configuration here
+ execution.parallelism = 1
+ job.mode = "STREAMING"
+ execution.checkpoint.interval = 5000
+}
+
+source {
+ MongoDB-CDC {
+ hosts = "mongo0:27017"
+ database = ["inventory","crm"]
+ collection = ["inventory.products","crm.test"]
+ username = stuser
+ password = stpw
+ }
+}
+
+# Print the MongoDB data that was read to the console
+sink {
+ Console {
+ parallelism = 1
+ }
+}
+```
+
+### Tips:
+
+> 1. When synchronizing multiple databases and tables, the CDC source cannot specify a schema and can only output JSON data downstream.
+> This is because MongoDB does not provide metadata information for querying, so to support multiple tables all of them can only be read as one common structure.
+
+## Regular Expression Matching for Multiple Tables
+
+The following example demonstrates how to create a data synchronization job that uses regular expressions to read CDC data from multiple MongoDB databases and tables and prints it on the local client:
+
+| Matching example | Expressions | Description                                                                              |
+|------------------|-------------|------------------------------------------------------------------------------------------|
+| Prefix matching  | ^(test).*   | Match the database or table names with the prefix test, such as test1, test2, etc.        |
+| Suffix matching  | .*[p$]      | Match the database or table names with the suffix p, such as cdcp, edcp, etc.             |
+
+```hocon
+env {
+ # You can set engine configuration here
+ execution.parallelism = 1
+ job.mode = "STREAMING"
+ execution.checkpoint.interval = 5000
+}
+
+source {
+ MongoDB-CDC {
+ hosts = "mongo0:27017"
+ # This example uses the combined pattern (^(test).*|^(tpc).*|txc|.*[p$]|t{2}).(t[5-8]|tt), which matches e.g. txc.tt and test2.test5.
+ database = ["(^(test).*|^(tpc).*|txc|.*[p$]|t{2})"]
+ collection = ["(t[5-8]|tt)"]
+ username = stuser
+ password = stpw
+ }
+}
+
+# Print the MongoDB data that was read to the console
+sink {
+ Console {
+ parallelism = 1
+ }
+}
+```
+
+## Format of Real-time Streaming Data
+
+```shell
+{
+ _id : { }, // Identifier of the open change stream, can be assigned to the 'resumeAfter' parameter for subsequent resumption of this change stream
+ "operationType" : "", // The type of change operation that occurred, such as: insert, delete, update, etc.
+ "fullDocument" : { }, // The full document data involved in the change operation. This field does not exist in delete operations
+ "ns" : {
+ "db" : "", // The database where the change operation occurred
+ "coll" : "" // The collection where the change operation occurred
+ },
+ "to" : { // These fields are displayed only when the operation type is 'rename'
+ "db" : "", // The new database name after the change
+ "coll" : "" // The new collection name after the change
+ },
+ "source":{
+ "ts_ms":"", // The timestamp when the change operation occurred
+ "table":"" // The collection where the change operation occurred
+ "db":"", // The database where the change operation occurred
+ "snapshot":"false" // Identify the current stage of data synchronization
+ },
+ "documentKey" : { "_id" : }, // The _id field value of the document involved in the change operation
+ "updateDescription" : { // Description of the update operation
+ "updatedFields" : { }, // The fields and values that the update operation modified
+ "removedFields" : [ "", ... ] // The fields and values that the update operation removed
+ }
+ "clusterTime" : , // The timestamp of the Oplog log entry corresponding to the change operation
+ "txnNumber" : , // If the change operation is executed in a multi-document transaction, this field and value are displayed, representing the transaction number
+ "lsid" : { // Represents information related to the Session in which the transaction is located
+ "id" : ,
+ "uid" :
+ }
+}
+```
+
diff --git a/docs/en/connector-v2/source/OssFile.md b/docs/en/connector-v2/source/OssFile.md
index 532b4d03aa79..7c992581f5a6 100644
--- a/docs/en/connector-v2/source/OssFile.md
+++ b/docs/en/connector-v2/source/OssFile.md
@@ -56,6 +56,7 @@ Read all the data in a split in a pollNext call. What splits are read will be sa
| schema | config | no | - |
| common-options | | no | - |
| sheet_name | string | no | - |
+| file_filter_pattern | string | no | - |
### path [string]
@@ -246,7 +247,7 @@ Source plugin common parameters, please refer to [Source Common Options](common-
### sheet_name [string]
-Reader the sheet of the workbook,Only used when file_format is excel.
+Read the sheet of the workbook. Only used when file_format_type is excel.
## Example
@@ -282,6 +283,10 @@ Reader the sheet of the workbook,Only used when file_format is excel.
```
+### file_filter_pattern [string]
+
+Filter pattern, which is used to filter files.
+
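+For example, to keep only files whose names end with `.txt` (a hedged sketch; the pattern value is only illustrative):
+
+```hocon
+file_filter_pattern = ".*\\.txt$"
+```
+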
## Changelog
### 2.2.0-beta 2022-09-26
diff --git a/docs/en/connector-v2/source/OssJindoFile.md b/docs/en/connector-v2/source/OssJindoFile.md
index 3e3649e19b92..f77c4a4543a9 100644
--- a/docs/en/connector-v2/source/OssJindoFile.md
+++ b/docs/en/connector-v2/source/OssJindoFile.md
@@ -56,6 +56,7 @@ Read all the data in a split in a pollNext call. What splits are read will be sa
| schema | config | no | - |
| common-options | | no | - |
| sheet_name | string | no | - |
+| file_filter_pattern | string | no | - |
### path [string]
@@ -246,7 +247,11 @@ Source plugin common parameters, please refer to [Source Common Options](common-
### sheet_name [string]
-Reader the sheet of the workbook,Only used when file_format is excel.
+Read the sheet of the workbook. Only used when file_format_type is excel.
+
+### file_filter_pattern [string]
+
+Filter pattern, which is used to filter files.
## Example
diff --git a/docs/en/connector-v2/source/SftpFile.md b/docs/en/connector-v2/source/SftpFile.md
index 500ec2af5b57..184a587a9286 100644
--- a/docs/en/connector-v2/source/SftpFile.md
+++ b/docs/en/connector-v2/source/SftpFile.md
@@ -47,6 +47,7 @@ If you use SeaTunnel Engine, It automatically integrated the hadoop jar when you
| schema | config | no | - |
| common-options | | no | - |
| sheet_name | string | no | - |
+| file_filter_pattern | string | no | - |
### host [string]
@@ -224,7 +225,11 @@ Source plugin common parameters, please refer to [Source Common Options](common-
### sheet_name [string]
-Reader the sheet of the workbook,Only used when file_format is excel.
+Read the sheet of the workbook. Only used when file_format_type is excel.
+
+### file_filter_pattern [string]
+
+Filter pattern, which is used to filter files.
## Example
diff --git a/docs/en/connector-v2/source/Vertica.md b/docs/en/connector-v2/source/Vertica.md
new file mode 100644
index 000000000000..66f18e7a4ed4
--- /dev/null
+++ b/docs/en/connector-v2/source/Vertica.md
@@ -0,0 +1,157 @@
+# Vertica
+
+> JDBC Vertica Source Connector
+
+## Support Those Engines
+
+> Spark
+> Flink
+> SeaTunnel Zeta
+
+## Key Features
+
+- [x] [batch](../../concept/connector-v2-features.md)
+- [ ] [stream](../../concept/connector-v2-features.md)
+- [x] [exactly-once](../../concept/connector-v2-features.md)
+- [x] [column projection](../../concept/connector-v2-features.md)
+- [x] [parallelism](../../concept/connector-v2-features.md)
+- [x] [support user-defined split](../../concept/connector-v2-features.md)
+
+> Supports query SQL and can achieve a projection effect.
+
+## Description
+
+Read external data source data through JDBC.
+
+## Supported DataSource Info
+
+| Datasource | Supported versions | Driver | Url | Maven |
+|------------|----------------------------------------------------------|-------------------------|---------------------------------------|----------------------------------------------------------------------|
+| Vertica | Different dependency version has different driver class. | com.vertica.jdbc.Driver | jdbc:vertica://localhost:5433/vertica | [Download](https://www.vertica.com/download/vertica/client-drivers/) |
+
+## Database Dependency
+
+> Please download the driver listed under 'Maven' above and copy it to the '$SEATUNNEL_HOME/plugins/jdbc/lib/' working directory.
+> For example, for the Vertica datasource: cp vertica-jdbc-xxx.jar $SEATUNNEL_HOME/plugins/jdbc/lib/
+
+## Data Type Mapping
+
+| Vertica Data Type                                                                                                                  | SeaTunnel Data Type                                                                                                                       |
+|------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------|
+| BIT                                                                                                                                  | BOOLEAN                                                                                                                                       |
+| TINYINT<br/>TINYINT UNSIGNED<br/>SMALLINT<br/>SMALLINT UNSIGNED<br/>MEDIUMINT<br/>MEDIUMINT UNSIGNED<br/>INT<br/>INTEGER<br/>YEAR     | INT                                                                                                                                           |
+| INT UNSIGNED<br/>INTEGER UNSIGNED<br/>BIGINT                                                                                         | LONG                                                                                                                                          |
+| BIGINT UNSIGNED                                                                                                                      | DECIMAL(20,0)                                                                                                                                 |
+| DECIMAL(x,y) (where the designated column's specified column size is < 38)                                                           | DECIMAL(x,y)                                                                                                                                  |
+| DECIMAL(x,y) (where the designated column's specified column size is > 38)                                                           | DECIMAL(38,18)                                                                                                                                |
+| DECIMAL UNSIGNED                                                                                                                     | DECIMAL((the designated column's specified column size)+1, (the designated column's number of digits to the right of the decimal point))     |
+| FLOAT<br/>FLOAT UNSIGNED                                                                                                             | FLOAT                                                                                                                                         |
+| DOUBLE<br/>DOUBLE UNSIGNED                                                                                                           | DOUBLE                                                                                                                                        |
+| CHAR<br/>VARCHAR<br/>TINYTEXT<br/>MEDIUMTEXT<br/>TEXT<br/>LONGTEXT<br/>JSON                                                          | STRING                                                                                                                                        |
+| DATE                                                                                                                                 | DATE                                                                                                                                          |
+| TIME                                                                                                                                 | TIME                                                                                                                                          |
+| DATETIME<br/>TIMESTAMP                                                                                                               | TIMESTAMP                                                                                                                                     |
+| TINYBLOB<br/>MEDIUMBLOB<br/>BLOB<br/>LONGBLOB<br/>BINARY<br/>VARBINARY<br/>BIT(n)                                                    | BYTES                                                                                                                                         |
+| GEOMETRY<br/>UNKNOWN                                                                                                                 | Not supported yet                                                                                                                             |
+
+## Source Options
+
+| Name                         | Type   | Required | Default         | Description                                                                                                                                                                                                                                                  |
+|------------------------------|--------|----------|-----------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| url                          | String | Yes      | -               | The URL of the JDBC connection. Refer to a case: jdbc:vertica://localhost:5433/vertica                                                                                                                                                                       |
+| driver                       | String | Yes      | -               | The jdbc class name used to connect to the remote data source. If you use Vertica the value is `com.vertica.jdbc.Driver`.                                                                                                                                    |
+| user                         | String | No       | -               | Connection instance user name                                                                                                                                                                                                                                |
+| password                     | String | No       | -               | Connection instance password                                                                                                                                                                                                                                 |
+| query                        | String | Yes      | -               | Query statement                                                                                                                                                                                                                                              |
+| connection_check_timeout_sec | Int    | No       | 30              | The time in seconds to wait for the database operation used to validate the connection to complete                                                                                                                                                          |
+| partition_column             | String | No       | -               | The column name used for parallel partitioning. Only numeric primary key columns are supported, and only one column can be configured.                                                                                                                       |
+| partition_lower_bound        | Long   | No       | -               | The partition_column min value for the scan. If not set, SeaTunnel will query the database to get the min value.                                                                                                                                             |
+| partition_upper_bound        | Long   | No       | -               | The partition_column max value for the scan. If not set, SeaTunnel will query the database to get the max value.                                                                                                                                             |
+| partition_num                | Int    | No       | job parallelism | The number of partitions. Only positive integers are supported. The default value is the job parallelism.                                                                                                                                                    |
+| fetch_size                   | Int    | No       | 0               | For queries that return a large number of objects, you can configure the row fetch size used in the query to improve performance by reducing the number of database hits required to satisfy the selection criteria. Zero means use the JDBC default value. |
+| common-options               |        | No       | -               | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details                                                                                                                                                      |
+
+### Tips
+
+> If partition_column is not set, the query runs with a single concurrency; if partition_column is set, it is executed in parallel according to the task concurrency.
+
+## Task Example
+
+### Simple:
+
+> This example queries 16 rows from the type_bin table in your test database with a single parallelism and reads all of its fields. You can also specify which fields to query for final output to the console.
+
+```hocon
+# Defining the runtime environment
+env {
+ # You can set flink configuration here
+ execution.parallelism = 2
+ job.mode = "BATCH"
+}
+source{
+ Jdbc {
+ url = "jdbc:vertica://localhost:5433/vertica"
+ driver = "com.vertica.jdbc.Driver"
+ connection_check_timeout_sec = 100
+ user = "root"
+ password = "123456"
+ query = "select * from type_bin limit 16"
+ }
+}
+
+transform {
+ # If you would like to get more information about how to configure seatunnel and see full list of transform plugins,
+ # please go to https://seatunnel.apache.org/docs/transform-v2/sql
+}
+
+sink {
+ Console {}
+}
+```
+
+### Parallel:
+
+> Read your query table in parallel using the shard field you configured and the shard data. You can do this if you want to read the whole table.
+
+```hocon
+source {
+ Jdbc {
+ url = "jdbc:vertica://localhost:5433/vertica"
+ driver = "com.vertica.jdbc.Driver"
+ connection_check_timeout_sec = 100
+ user = "root"
+ password = "123456"
+ # Define query logic as required
+ query = "select * from type_bin"
+ # Parallel sharding reads fields
+ partition_column = "id"
+ # Number of fragments
+ partition_num = 10
+ }
+}
+```
+
+### Parallel Boundary:
+
+> It is more efficient to specify the upper and lower bounds of the data in the query; reading your data source within the boundaries you configured avoids scanning the whole table.
+
+```hocon
+source {
+ Jdbc {
+ url = "jdbc:vertica://localhost:5433/vertica"
+ driver = "com.vertica.jdbc.Driver"
+ connection_check_timeout_sec = 100
+ user = "root"
+ password = "123456"
+ # Define query logic as required
+ query = "select * from type_bin"
+ partition_column = "id"
+ # Read start boundary
+ partition_lower_bound = 1
+ # Read end boundary
+ partition_upper_bound = 500
+ partition_num = 10
+ }
+}
+```
+
diff --git a/docs/en/connector-v2/source/kafka.md b/docs/en/connector-v2/source/kafka.md
index 06f60af6d879..16b9c5420b3f 100644
--- a/docs/en/connector-v2/source/kafka.md
+++ b/docs/en/connector-v2/source/kafka.md
@@ -2,11 +2,13 @@
> Kafka source connector
-## Description
+## Support Those Engines
-Source connector for Apache Kafka.
+> Spark
+> Flink
+> SeaTunnel Zeta
-## Key features
+## Key Features
- [x] [batch](../../concept/connector-v2-features.md)
- [x] [stream](../../concept/connector-v2-features.md)
@@ -15,109 +17,54 @@ Source connector for Apache Kafka.
- [x] [parallelism](../../concept/connector-v2-features.md)
- [ ] [support user-defined split](../../concept/connector-v2-features.md)
-## Options
-
-| name | type | required | default value |
-|-------------------------------------|---------|----------|--------------------------|
-| topic | String | yes | - |
-| bootstrap.servers | String | yes | - |
-| pattern | Boolean | no | false |
-| consumer.group | String | no | SeaTunnel-Consumer-Group |
-| commit_on_checkpoint | Boolean | no | true |
-| kafka.config | Map | no | - |
-| common-options | config | no | - |
-| schema | | no | - |
-| format | String | no | json |
-| format_error_handle_way | String | no | fail |
-| field_delimiter | String | no | , |
-| start_mode | String | no | group_offsets |
-| start_mode.offsets | | no | |
-| start_mode.timestamp | Long | no | |
-| partition-discovery.interval-millis | long | no | -1 |
-
-### topic [string]
-
-`Kafka topic` name. If there are multiple `topics`, use `,` to split, for example: `"tpc1,tpc2"`.
-
-### bootstrap.servers [string]
-
-`Kafka` cluster address, separated by `","`.
-
-### pattern [boolean]
-
-If `pattern` is set to `true`,the regular expression for a pattern of topic names to read from. All topics in clients with names that match the specified regular expression will be subscribed by the consumer.
-
-### consumer.group [string]
-
-`Kafka consumer group id`, used to distinguish different consumer groups.
-
-### commit_on_checkpoint [boolean]
-
-If true the consumer's offset will be periodically committed in the background.
-
-## partition-discovery.interval-millis [long]
-
-The interval for dynamically discovering topics and partitions.
-
-### kafka.config [map]
-
-In addition to the above necessary parameters that must be specified by the `Kafka consumer` client, users can also specify multiple `consumer` client non-mandatory parameters, covering [all consumer parameters specified in the official Kafka document](https://kafka.apache.org/documentation.html#consumerconfigs).
-
-### common-options [config]
-
-Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details.
-
-### schema
-
-The structure of the data, including field names and field types.
-
-## format
-
-Data format. The default format is json. Optional text format. The default field separator is ", ".
-If you customize the delimiter, add the "field_delimiter" option.
-
-## format_error_handle_way
-
-The processing method of data format error. The default value is fail, and the optional value is (fail, skip).
-When fail is selected, data format error will block and an exception will be thrown.
-When skip is selected, data format error will skip this line data.
-
-## field_delimiter
-
-Customize the field delimiter for data format.
-
-## start_mode
-
-The initial consumption pattern of consumers,there are several types:
-[earliest],[group_offsets],[latest],[specific_offsets],[timestamp]
-
-## start_mode.timestamp
-
-The time required for consumption mode to be "timestamp".
-
-## start_mode.offsets
-
-The offset required for consumption mode to be specific_offsets.
-
-for example:
+## Description
-```hocon
-start_mode.offsets = {
- info-0 = 70
- info-1 = 10
- info-2 = 10
- }
-```
+Source connector for Apache Kafka.
-## Example
+## Supported DataSource Info
+
+In order to use the Kafka connector, the following dependencies are required.
+They can be downloaded via install-plugin.sh or from the Maven central repository.
+
+| Datasource | Supported Versions | Maven |
+|------------|--------------------|-------------------------------------------------------------------------------------------------------------|
+| Kafka | Universal | [Download](https://mvnrepository.com/artifact/org.apache.seatunnel/seatunnel-connectors-v2/connector-kafka) |
+
+## Source Options
+
+| Name | Type | Required | Default | Description |
+|-------------------------------------|-----------------------------------------------------------------------------|----------|--------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| topic | String | Yes | - | Topic name(s) to read data from when the table is used as source. It also supports topic list for source by separating topic by comma like 'topic-1,topic-2'. |
+| bootstrap.servers | String | Yes | - | Comma separated list of Kafka brokers. |
+| pattern                             | Boolean                                                                      | No       | false                    | If `pattern` is set to `true`, the `topic` option is treated as a regular expression for topic names, and the consumer subscribes to all topics whose names match that expression.                                                                                                                                                                                                                                                      |
+| consumer.group | String | No | SeaTunnel-Consumer-Group | `Kafka consumer group id`, used to distinguish different consumer groups. |
+| commit_on_checkpoint | Boolean | No | true | If true the consumer's offset will be periodically committed in the background. |
+| kafka.config | Map | No | - | In addition to the above necessary parameters that must be specified by the `Kafka consumer` client, users can also specify multiple `consumer` client non-mandatory parameters, covering [all consumer parameters specified in the official Kafka document](https://kafka.apache.org/documentation.html#consumerconfigs). |
+| schema | Config | No | - | The structure of the data, including field names and field types. |
+| format                              | String                                                                       | No       | json                     | Data format. The default format is json. Optional formats are text, canal-json and debezium-json. If you use the json or text format, the default field separator is ", "; if you customize the delimiter, add the "field_delimiter" option. If you use the canal format, please refer to [canal-json](../formats/canal-json.md) for details. If you use the debezium format, please refer to [debezium-json](../formats/debezium-json.md) for details. |
+| format_error_handle_way | String | No | fail | The processing method of data format error. The default value is fail, and the optional value is (fail, skip). When fail is selected, data format error will block and an exception will be thrown. When skip is selected, data format error will skip this line data. |
+| field_delimiter | String | No | , | Customize the field delimiter for data format. |
+| start_mode | StartMode[earliest],[group_offsets],[latest],[specific_offsets],[timestamp] | No | group_offsets | The initial consumption pattern of consumers. |
+| start_mode.offsets | Config | No | - | The offset required for consumption mode to be specific_offsets. |
+| start_mode.timestamp | Long | No | - | The time required for consumption mode to be "timestamp". |
+| partition-discovery.interval-millis | Long | No | -1 | The interval for dynamically discovering topics and partitions. |
+| common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details |
+
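+For example, when `start_mode` is set to `specific_offsets`, per-partition offsets can be supplied as follows (a minimal sketch; the topic name and offset values are only illustrative):
+
+```hocon
+source {
+  Kafka {
+    bootstrap.servers = "localhost:9092"
+    topic = "topic_1"
+    consumer.group = "seatunnel_group"
+    start_mode = "specific_offsets"
+    start_mode.offsets = {
+      topic_1-0 = 70
+      topic_1-1 = 10
+    }
+  }
+}
+```
+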
+## Task Example
### Simple
+> This example reads the data of Kafka topics topic_1, topic_2 and topic_3 and prints it to the client. If you have not yet installed and deployed SeaTunnel, you need to follow the instructions in [Install SeaTunnel](../../start-v2/locally/deployment.md) to install and deploy SeaTunnel. Then follow the instructions in [Quick Start With SeaTunnel Engine](../../start-v2/locally/quick-start-seatunnel-engine.md) to run this job.
+
```hocon
+# Defining the runtime environment
+env {
+ # You can set flink configuration here
+ execution.parallelism = 2
+ job.mode = "BATCH"
+}
source {
-
Kafka {
- result_table_name = "kafka_name"
schema = {
fields {
name = "string"
@@ -134,8 +81,10 @@ source {
auto.offset.reset = "earliest"
enable.auto.commit = "false"
}
- }
-
+ }
+}
+sink {
+ Console {}
}
```
@@ -143,14 +92,12 @@ source {
```hocon
source {
-
Kafka {
topic = ".*seatunnel*."
pattern = "true"
bootstrap.servers = "localhost:9092"
consumer.group = "seatunnel_group"
}
-
}
```
@@ -167,7 +114,7 @@ source {
kafka.config = {
security.protocol=SASL_SSL
sasl.mechanism=SCRAM-SHA-512
- sasl.jaas.config="org.apache.kafka.common.security.scram.ScramLoginModule required \nusername=${username}\npassword=${password};"
+ sasl.jaas.config="org.apache.kafka.common.security.scram.ScramLoginModule required username=\"username\" password=\"password\";"
#security.protocol=SASL_SSL
#sasl.mechanism=AWS_MSK_IAM
#sasl.jaas.config="software.amazon.msk.auth.iam.IAMLoginModule required;"
@@ -203,7 +150,7 @@ source {
kafka.config = {
#security.protocol=SASL_SSL
#sasl.mechanism=SCRAM-SHA-512
- #sasl.jaas.config="org.apache.kafka.common.security.scram.ScramLoginModule required \nusername=${username}\npassword=${password};"
+ #sasl.jaas.config="org.apache.kafka.common.security.scram.ScramLoginModule required username=\"username\" password=\"password\";"
security.protocol=SASL_SSL
sasl.mechanism=AWS_MSK_IAM
sasl.jaas.config="software.amazon.msk.auth.iam.IAMLoginModule required;"
@@ -213,17 +160,3 @@ source {
}
```
-## Changelog
-
-### 2.3.0-beta 2022-10-20
-
-- Add Kafka Source Connector
-
-### Next Version
-
-- [Improve] Support setting read starting offset or time at startup config ([3157](https://github.com/apache/seatunnel/pull/3157))
-- [Improve] Support for dynamic discover topic & partition in streaming mode ([3125](https://github.com/apache/seatunnel/pull/3125))
-- [Improve] Change Connector Custom Config Prefix To Map [3719](https://github.com/apache/seatunnel/pull/3719)
-- [Bug] Fixed the problem that parsing the offset format failed when the startup mode was offset([3810](https://github.com/apache/seatunnel/pull/3810))
-- [Feature] Kafka source supports data deserialization failure skipping([4364](https://github.com/apache/seatunnel/pull/4364))
-
diff --git a/plugin-mapping.properties b/plugin-mapping.properties
index de6593b4523c..551da2c7cecf 100644
--- a/plugin-mapping.properties
+++ b/plugin-mapping.properties
@@ -47,8 +47,10 @@ seatunnel.source.LocalFile = connector-file-local
seatunnel.sink.LocalFile = connector-file-local
seatunnel.source.OssFile = connector-file-oss
seatunnel.sink.OssFile = connector-file-oss
-seatunnel.source.OssJindoFile = connector-file-oss-jindo
-seatunnel.sink.OssJindoFile = connector-file-oss-jindo
+seatunnel.source.OssJindoFile = connector-file-jindo-oss
+seatunnel.sink.OssJindoFile = connector-file-jindo-oss
+seatunnel.source.CosFile = connector-file-cos
+seatunnel.sink.CosFile = connector-file-cos
seatunnel.source.Pulsar = connector-pulsar
seatunnel.source.Hudi = connector-hudi
seatunnel.sink.DingTalk = connector-dingtalk
@@ -99,6 +101,7 @@ seatunnel.sink.Doris = connector-doris
seatunnel.source.Maxcompute = connector-maxcompute
seatunnel.sink.Maxcompute = connector-maxcompute
seatunnel.source.MySQL-CDC = connector-cdc-mysql
+seatunnel.source.MongoDB-CDC = connector-cdc-mongodb
seatunnel.sink.S3Redshift = connector-s3-redshift
seatunnel.source.TDengine = connector-tdengine
seatunnel.sink.TDengine = connector-tdengine
diff --git a/pom.xml b/pom.xml
index 51b03a26d5bb..3d619644952b 100644
--- a/pom.xml
+++ b/pom.xml
@@ -55,7 +55,7 @@
- 2.3.2-SNAPSHOT
+ 2.3.3-SNAPSHOT
2.1.1
UTF-8
1.8
@@ -141,11 +141,6 @@
4.2.0
true
-
- 3.0.0
- 2.4.7
- 3.1.4
- 4.1.60.Final
@@ -452,39 +447,6 @@
provided
-
-
- org.apache.hadoop
- hadoop-aliyun
- ${hadoop-aliyun.version}
- provided
-
-
- net.minidev
- json-smart
-
-
-
-
-
- net.minidev
- json-smart
- ${json-smart.version}
-
-
-
- org.apache.hadoop
- hadoop-aws
- ${hadoop-aws.version}
- provided
-
-
-
- io.netty
- netty-buffer
- ${netty-buffer.version}
-
-
diff --git a/release-note.md b/release-note.md
index 68d14e609f31..b542b35a8148 100644
--- a/release-note.md
+++ b/release-note.md
@@ -3,9 +3,21 @@
## Bug fix
### Core
-
- [Core] [API] Fixed generic class loss for lists (#4421)
- [Core] [API] Fix parse nested row data type key changed upper (#4459)
+- [Starter][Flink] Support transform-v2 for flink #3396
+- [Flink] Support flink 1.14.x #3963
+- [Core][Translation][Spark] Fix SeaTunnelRowConvertor fail to convert when schema contains row type (#5170)
+
+### Transformer
+- [Spark] Support transform-v2 for spark (#3409)
+- [ALL] Add FieldMapper Transform #3781
+### Connectors
+- [Elasticsearch] Support https protocol & compatible with opensearch
+- [Hbase] Add hbase sink connector #4049
+### Formats
+- [Canal] Support reading canal format messages #3950
+- [Debezium] Support reading debezium format messages #3981
### Connector-V2
diff --git a/seatunnel-api/src/main/java/org/apache/seatunnel/api/configuration/Options.java b/seatunnel-api/src/main/java/org/apache/seatunnel/api/configuration/Options.java
index 432e931c235f..a4ce408d73b0 100644
--- a/seatunnel-api/src/main/java/org/apache/seatunnel/api/configuration/Options.java
+++ b/seatunnel-api/src/main/java/org/apache/seatunnel/api/configuration/Options.java
@@ -249,7 +249,7 @@ public static class SingleChoiceOptionBuilder {
* @param value The default value for the config option
* @return The config option with the default value.
*/
- public Option<T> defaultValue(T value) {
+ public SingleChoiceOption<T> defaultValue(T value) {
return new SingleChoiceOption(key, typeReference, optionValues, value);
}
@@ -258,7 +258,7 @@ public Option defaultValue(T value) {
*
* @return The config option without a default value.
*/
- public Option<T> noDefaultValue() {
+ public SingleChoiceOption<T> noDefaultValue() {
return new SingleChoiceOption(key, typeReference, optionValues, null);
}
}
diff --git a/seatunnel-api/src/main/java/org/apache/seatunnel/api/configuration/SingleChoiceOption.java b/seatunnel-api/src/main/java/org/apache/seatunnel/api/configuration/SingleChoiceOption.java
index fd3697f681f4..b3a6574e9ed7 100644
--- a/seatunnel-api/src/main/java/org/apache/seatunnel/api/configuration/SingleChoiceOption.java
+++ b/seatunnel-api/src/main/java/org/apache/seatunnel/api/configuration/SingleChoiceOption.java
@@ -23,7 +23,7 @@
import java.util.List;
-public class SingleChoiceOption extends Option {
+public class SingleChoiceOption extends Option {
@Getter private final List optionValues;
@@ -32,4 +32,10 @@ public SingleChoiceOption(
super(key, typeReference, defaultValue);
this.optionValues = optionValues;
}
+
+ @Override
+ public SingleChoiceOption withDescription(String description) {
+ this.description = description;
+ return this;
+ }
}
diff --git a/seatunnel-api/src/main/java/org/apache/seatunnel/api/sink/DataSaveMode.java b/seatunnel-api/src/main/java/org/apache/seatunnel/api/sink/DataSaveMode.java
index f269c9f2cb78..7ef849f6147f 100644
--- a/seatunnel-api/src/main/java/org/apache/seatunnel/api/sink/DataSaveMode.java
+++ b/seatunnel-api/src/main/java/org/apache/seatunnel/api/sink/DataSaveMode.java
@@ -31,6 +31,10 @@ public enum DataSaveMode {
// path and files in the path, create new files in the path.
KEEP_SCHEMA_AND_DATA,
+ // The connector provides custom processing methods, such as running user provided SQL or shell
+ // scripts, etc
+ CUSTOM_PROCESSING,
+
// Throw error when table is exists for MySQL. Throw error when path is exists.
ERROR_WHEN_EXISTS
}
diff --git a/seatunnel-api/src/main/java/org/apache/seatunnel/api/sink/SupportDataSaveMode.java b/seatunnel-api/src/main/java/org/apache/seatunnel/api/sink/SupportDataSaveMode.java
index 7d0c2838befb..46ea2e70e53b 100644
--- a/seatunnel-api/src/main/java/org/apache/seatunnel/api/sink/SupportDataSaveMode.java
+++ b/seatunnel-api/src/main/java/org/apache/seatunnel/api/sink/SupportDataSaveMode.java
@@ -17,55 +17,16 @@
package org.apache.seatunnel.api.sink;
-import org.apache.seatunnel.shade.com.typesafe.config.Config;
-
-import org.apache.seatunnel.api.common.SeaTunnelAPIErrorCode;
-import org.apache.seatunnel.common.exception.SeaTunnelRuntimeException;
-
-import java.util.List;
-import java.util.Locale;
-
/** The Sink Connectors which support data SaveMode should implement this interface */
public interface SupportDataSaveMode {
-
- /**
- * We hope every sink connector use the same option name to config SaveMode, So I add
- * checkOptions method to this interface. checkOptions method have a default implement to check
- * whether `save_mode` parameter is in config.
- *
- * @param config config of sink Connector
- */
- default void checkOptions(Config config) {
- if (config.hasPath(SinkCommonOptions.DATA_SAVE_MODE)) {
- String tableSaveMode = config.getString(SinkCommonOptions.DATA_SAVE_MODE);
- DataSaveMode dataSaveMode =
- DataSaveMode.valueOf(tableSaveMode.toUpperCase(Locale.ROOT));
- if (!supportedDataSaveModeValues().contains(dataSaveMode)) {
- throw new SeaTunnelRuntimeException(
- SeaTunnelAPIErrorCode.CONFIG_VALIDATION_FAILED,
- "This connector don't support save mode: " + dataSaveMode);
- }
- }
- }
-
+ String SAVE_MODE_KEY = "savemode";
/**
- * Get the {@link DataSaveMode} that the user configured
+ * Return the value of DataSaveMode configured by user in the job config file.
*
- * @return DataSaveMode
+ * @return
*/
- DataSaveMode getDataSaveMode();
+ DataSaveMode getUserConfigSaveMode();
- /**
- * Return the {@link DataSaveMode} list supported by this connector
- *
- * @return the list of supported data save modes
- */
- List<DataSaveMode> supportedDataSaveModeValues();
-
- /**
- * The implementation of specific logic according to different {@link DataSaveMode}
- *
- * @param saveMode data save mode
- */
- void handleSaveMode(DataSaveMode saveMode);
+ /** The implementation of specific logic according to different {@link DataSaveMode} */
+ void handleSaveMode(DataSaveMode userConfigSaveMode);
}
diff --git a/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/catalog/CatalogTable.java b/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/catalog/CatalogTable.java
index 3aa50335910d..1be6de028417 100644
--- a/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/catalog/CatalogTable.java
+++ b/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/catalog/CatalogTable.java
@@ -38,6 +38,8 @@ public final class CatalogTable implements Serializable {
private final String comment;
+ private final String catalogName;
+
public static CatalogTable of(
TableIdentifier tableId,
TableSchema tableSchema,
@@ -47,17 +49,38 @@ public static CatalogTable of(
return new CatalogTable(tableId, tableSchema, options, partitionKeys, comment);
}
+ public static CatalogTable of(
+ TableIdentifier tableId,
+ TableSchema tableSchema,
+ Map<String, String> options,
+ List<String> partitionKeys,
+ String comment,
+ String catalogName) {
+ return new CatalogTable(tableId, tableSchema, options, partitionKeys, comment, catalogName);
+ }
+
private CatalogTable(
TableIdentifier tableId,
TableSchema tableSchema,
Map<String, String> options,
List<String> partitionKeys,
String comment) {
+ this(tableId, tableSchema, options, partitionKeys, comment, "");
+ }
+
+ private CatalogTable(
+ TableIdentifier tableId,
+ TableSchema tableSchema,
+ Map<String, String> options,
+ List<String> partitionKeys,
+ String comment,
+ String catalogName) {
this.tableId = tableId;
this.tableSchema = tableSchema;
this.options = options;
this.partitionKeys = partitionKeys;
this.comment = comment;
+ this.catalogName = catalogName;
}
public TableIdentifier getTableId() {
@@ -80,6 +103,10 @@ public String getComment() {
return comment;
}
+ public String getCatalogName() {
+ return catalogName;
+ }
+
@Override
public String toString() {
return "CatalogTable{"
diff --git a/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/catalog/Column.java b/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/catalog/Column.java
index b528996a3aec..bec10b3d7581 100644
--- a/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/catalog/Column.java
+++ b/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/catalog/Column.java
@@ -23,6 +23,7 @@
import lombok.Data;
import java.io.Serializable;
+import java.util.Map;
/**
* Represent the column of {@link TableSchema}.
@@ -54,6 +55,24 @@ public abstract class Column implements Serializable {
protected final String comment;
+ /** Field type in the database * */
+ protected final String sourceType;
+
+ /** Unsigned bit * */
+ protected final boolean isUnsigned;
+
+ /** Whether to use the 0 bit * */
+ protected final boolean isZeroFill;
+
+ /** Bit length * */
+ protected final Long bitLen;
+
+ /** integer may be cross the border * */
+ protected final Long longColumnLength;
+
+ /** your options * */
+ protected final Map<String, Object> options;
+
protected Column(
String name,
SeaTunnelDataType<?> dataType,
@@ -61,12 +80,46 @@ protected Column(
boolean nullable,
Object defaultValue,
String comment) {
+ this(
+ name,
+ dataType,
+ columnLength,
+ nullable,
+ defaultValue,
+ comment,
+ null,
+ false,
+ false,
+ null,
+ 0L,
+ null);
+ }
+
+ protected Column(
+ String name,
+ SeaTunnelDataType<?> dataType,
+ Integer columnLength,
+ boolean nullable,
+ Object defaultValue,
+ String comment,
+ String sourceType,
+ boolean isUnsigned,
+ boolean isZeroFill,
+ Long bitLen,
+ Long longColumnLength,
+ Map<String, Object> options) {
this.name = name;
this.dataType = dataType;
this.columnLength = columnLength;
this.nullable = nullable;
this.defaultValue = defaultValue;
this.comment = comment;
+ this.sourceType = sourceType;
+ this.isUnsigned = isUnsigned;
+ this.isZeroFill = isZeroFill;
+ this.bitLen = bitLen;
+ this.longColumnLength = longColumnLength;
+ this.options = options;
}
/**
diff --git a/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/catalog/PhysicalColumn.java b/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/catalog/PhysicalColumn.java
index bc379e355466..164752d46863 100644
--- a/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/catalog/PhysicalColumn.java
+++ b/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/catalog/PhysicalColumn.java
@@ -23,6 +23,8 @@
import lombok.EqualsAndHashCode;
import lombok.ToString;
+import java.util.Map;
+
/** Representation of a physical column. */
@EqualsAndHashCode(callSuper = true)
@ToString(callSuper = true)
@@ -38,6 +40,34 @@ protected PhysicalColumn(
super(name, dataType, columnLength, nullable, defaultValue, comment);
}
+ protected PhysicalColumn(
+ String name,
+ SeaTunnelDataType<?> dataType,
+ Integer columnLength,
+ boolean nullable,
+ Object defaultValue,
+ String comment,
+ String sourceType,
+ boolean isUnsigned,
+ boolean isZeroFill,
+ Long bitLen,
+ Long longColumnLength,
+ Map<String, Object> options) {
+ super(
+ name,
+ dataType,
+ columnLength,
+ nullable,
+ defaultValue,
+ comment,
+ sourceType,
+ isUnsigned,
+ isZeroFill,
+ bitLen,
+ longColumnLength,
+ options);
+ }
+
public static PhysicalColumn of(
String name,
SeaTunnelDataType<?> dataType,
@@ -48,6 +78,34 @@ public static PhysicalColumn of(
return new PhysicalColumn(name, dataType, columnLength, nullable, defaultValue, comment);
}
+ public static PhysicalColumn of(
+ String name,
+ SeaTunnelDataType<?> dataType,
+ Integer columnLength,
+ boolean nullable,
+ Object defaultValue,
+ String comment,
+ String sourceType,
+ boolean isUnsigned,
+ boolean isZeroFill,
+ Long bitLen,
+ Map<String, Object> options,
+ Long longColumnLength) {
+ return new PhysicalColumn(
+ name,
+ dataType,
+ columnLength,
+ nullable,
+ defaultValue,
+ comment,
+ sourceType,
+ isUnsigned,
+ isZeroFill,
+ bitLen,
+ longColumnLength,
+ options);
+ }
+
@Override
public boolean isPhysical() {
return true;
@@ -55,11 +113,35 @@ public boolean isPhysical() {
@Override
public Column copy(SeaTunnelDataType<?> newType) {
- return PhysicalColumn.of(name, newType, columnLength, nullable, defaultValue, comment);
+ return PhysicalColumn.of(
+ name,
+ newType,
+ columnLength,
+ nullable,
+ defaultValue,
+ comment,
+ sourceType,
+ isUnsigned,
+ isZeroFill,
+ bitLen,
+ options,
+ longColumnLength);
}
@Override
public Column copy() {
- return PhysicalColumn.of(name, dataType, columnLength, nullable, defaultValue, comment);
+ return PhysicalColumn.of(
+ name,
+ dataType,
+ columnLength,
+ nullable,
+ defaultValue,
+ comment,
+ sourceType,
+ isUnsigned,
+ isZeroFill,
+ bitLen,
+ options,
+ longColumnLength);
}
}
diff --git a/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/factory/FactoryUtil.java b/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/factory/FactoryUtil.java
index 6ac939149c71..f30900269912 100644
--- a/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/factory/FactoryUtil.java
+++ b/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/factory/FactoryUtil.java
@@ -18,21 +18,15 @@
package org.apache.seatunnel.api.table.factory;
import org.apache.seatunnel.api.common.CommonOptions;
-import org.apache.seatunnel.api.configuration.Option;
-import org.apache.seatunnel.api.configuration.Options;
import org.apache.seatunnel.api.configuration.ReadonlyConfig;
import org.apache.seatunnel.api.configuration.util.ConfigValidator;
import org.apache.seatunnel.api.configuration.util.OptionRule;
-import org.apache.seatunnel.api.sink.DataSaveMode;
import org.apache.seatunnel.api.sink.SeaTunnelSink;
-import org.apache.seatunnel.api.sink.SinkCommonOptions;
-import org.apache.seatunnel.api.sink.SupportDataSaveMode;
import org.apache.seatunnel.api.source.SeaTunnelSource;
import org.apache.seatunnel.api.source.SourceSplit;
import org.apache.seatunnel.api.source.SupportParallelism;
import org.apache.seatunnel.api.table.catalog.Catalog;
import org.apache.seatunnel.api.table.catalog.CatalogTable;
-import org.apache.seatunnel.api.table.connector.TableSink;
import org.apache.seatunnel.api.table.connector.TableSource;
import org.apache.seatunnel.api.transform.SeaTunnelTransform;
@@ -289,28 +283,6 @@ public static OptionRule sinkFullOptionRule(@NonNull TableSinkFactory factory) {
if (sinkOptionRule == null) {
throw new FactoryException("sinkOptionRule can not be null");
}
-
- try {
- TableSink sink = factory.createSink(null);
- if (SupportDataSaveMode.class.isAssignableFrom(sink.getClass())) {
- SupportDataSaveMode supportDataSaveModeSink = (SupportDataSaveMode) sink;
- Option<DataSaveMode> saveMode =
- Options.key(SinkCommonOptions.DATA_SAVE_MODE)
- .singleChoice(
- DataSaveMode.class,
- supportDataSaveModeSink.supportedDataSaveModeValues())
- .noDefaultValue()
- .withDescription("data save mode");
- OptionRule sinkCommonOptionRule = OptionRule.builder().required(saveMode).build();
- sinkOptionRule
- .getOptionalOptions()
- .addAll(sinkCommonOptionRule.getOptionalOptions());
- }
- } catch (Exception e) {
- LOG.warn(
- "Add save mode option need sink connector support create sink by TableSinkFactory");
- }
-
return sinkOptionRule;
}
diff --git a/seatunnel-api/src/test/resources/conf/option-test.conf b/seatunnel-api/src/test/resources/conf/option-test.conf
index 4f20d493d4c0..9461e5298b98 100644
--- a/seatunnel-api/src/test/resources/conf/option-test.conf
+++ b/seatunnel-api/src/test/resources/conf/option-test.conf
@@ -101,7 +101,7 @@ sink {
partition_dir_expression = "${k0}=${v0}"
is_partition_field_write_in_file = true
file_name_expression = "${transactionId}_${now}"
- file_format = "text"
+ file_format_type = "text"
sink_columns = ["name","age"]
}
}
\ No newline at end of file
diff --git a/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/dialect/JdbcDataSourceDialect.java b/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/dialect/JdbcDataSourceDialect.java
index 2c93bf387a60..17947ad1a6bf 100644
--- a/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/dialect/JdbcDataSourceDialect.java
+++ b/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/dialect/JdbcDataSourceDialect.java
@@ -17,6 +17,8 @@
package org.apache.seatunnel.connectors.cdc.base.dialect;
+import org.apache.seatunnel.api.table.catalog.ConstraintKey;
+import org.apache.seatunnel.api.table.catalog.PrimaryKey;
import org.apache.seatunnel.common.utils.SeaTunnelException;
import org.apache.seatunnel.connectors.cdc.base.config.JdbcSourceConfig;
import org.apache.seatunnel.connectors.cdc.base.relational.connection.JdbcConnectionFactory;
@@ -25,11 +27,23 @@
import org.apache.seatunnel.connectors.cdc.base.source.reader.external.JdbcSourceFetchTaskContext;
import org.apache.seatunnel.connectors.cdc.base.source.split.SourceSplitBase;
+import org.apache.commons.collections4.CollectionUtils;
+import org.apache.commons.lang3.tuple.Pair;
+
import io.debezium.jdbc.JdbcConnection;
import io.debezium.relational.TableId;
import io.debezium.relational.history.TableChanges;
+import java.sql.DatabaseMetaData;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.HashMap;
import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.stream.Collectors;
public interface JdbcDataSourceDialect extends DataSourceDialect<JdbcSourceConfig> {
@@ -68,4 +82,90 @@ default JdbcConnection openJdbcConnection(JdbcSourceConfig sourceConfig) {
@Override
JdbcSourceFetchTaskContext createFetchTaskContext(
SourceSplitBase sourceSplitBase, JdbcSourceConfig taskSourceConfig);
+
+ default Optional<PrimaryKey> getPrimaryKey(JdbcConnection jdbcConnection, TableId tableId)
+ throws SQLException {
+
+ DatabaseMetaData metaData = jdbcConnection.connection().getMetaData();
+
+ // According to the Javadoc of java.sql.DatabaseMetaData#getPrimaryKeys,
+ // the returned primary key columns are ordered by COLUMN_NAME, not by KEY_SEQ.
+ // We need to sort them based on the KEY_SEQ value.
+ ResultSet rs =
+ metaData.getPrimaryKeys(tableId.catalog(), tableId.schema(), tableId.table());
+
+ // seq -> column name
+ List<Pair<Integer, String>> primaryKeyColumns = new ArrayList<>();
+ String pkName = null;
+ while (rs.next()) {
+ // all the PK_NAME should be the same
+ pkName = rs.getString("PK_NAME");
+ String columnName = rs.getString("COLUMN_NAME");
+ int keySeq = rs.getInt("KEY_SEQ");
+ // KEY_SEQ is 1-based index
+ primaryKeyColumns.add(Pair.of(keySeq, columnName));
+ }
+ // initialize size
+ List<String> pkFields =
+ primaryKeyColumns.stream()
+ .sorted(Comparator.comparingInt(Pair::getKey))
+ .map(Pair::getValue)
+ .collect(Collectors.toList());
+ if (CollectionUtils.isEmpty(pkFields)) {
+ return Optional.empty();
+ }
+ return Optional.of(PrimaryKey.of(pkName, pkFields));
+ }
+
+ default List<ConstraintKey> getUniqueKeys(JdbcConnection jdbcConnection, TableId tableId)
+ throws SQLException {
+ return getConstraintKeys(jdbcConnection, tableId).stream()
+ .filter(
+ constraintKey ->
+ constraintKey.getConstraintType()
+ == ConstraintKey.ConstraintType.UNIQUE_KEY)
+ .collect(Collectors.toList());
+ }
+
+ default List<ConstraintKey> getConstraintKeys(JdbcConnection jdbcConnection, TableId tableId)
+ throws SQLException {
+ DatabaseMetaData metaData = jdbcConnection.connection().getMetaData();
+
+ ResultSet resultSet =
+ metaData.getIndexInfo(
+ tableId.catalog(), tableId.schema(), tableId.table(), false, false);
+ // index name -> index
+ Map<String, ConstraintKey> constraintKeyMap = new HashMap<>();
+ while (resultSet.next()) {
+ String columnName = resultSet.getString("COLUMN_NAME");
+ if (columnName == null) {
+ continue;
+ }
+
+ String indexName = resultSet.getString("INDEX_NAME");
+ boolean noUnique = resultSet.getBoolean("NON_UNIQUE");
+
+ ConstraintKey constraintKey =
+ constraintKeyMap.computeIfAbsent(
+ indexName,
+ s -> {
+ ConstraintKey.ConstraintType constraintType =
+ ConstraintKey.ConstraintType.KEY;
+ if (!noUnique) {
+ constraintType = ConstraintKey.ConstraintType.UNIQUE_KEY;
+ }
+ return ConstraintKey.of(
+ constraintType, indexName, new ArrayList<>());
+ });
+
+ ConstraintKey.ColumnSortType sortType =
+ "A".equals(resultSet.getString("ASC_OR_DESC"))
+ ? ConstraintKey.ColumnSortType.ASC
+ : ConstraintKey.ColumnSortType.DESC;
+ ConstraintKey.ConstraintKeyColumn constraintKeyColumn =
+ new ConstraintKey.ConstraintKeyColumn(columnName, sortType);
+ constraintKey.getColumnNames().add(constraintKeyColumn);
+ }
+ return new ArrayList<>(constraintKeyMap.values());
+ }
}
diff --git a/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/source/enumerator/splitter/AbstractJdbcSourceChunkSplitter.java b/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/source/enumerator/splitter/AbstractJdbcSourceChunkSplitter.java
new file mode 100644
index 000000000000..e956b111709b
--- /dev/null
+++ b/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/source/enumerator/splitter/AbstractJdbcSourceChunkSplitter.java
@@ -0,0 +1,392 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.seatunnel.connectors.cdc.base.source.enumerator.splitter;
+
+import org.apache.seatunnel.api.table.catalog.ConstraintKey;
+import org.apache.seatunnel.api.table.catalog.PrimaryKey;
+import org.apache.seatunnel.api.table.type.SeaTunnelRowType;
+import org.apache.seatunnel.connectors.cdc.base.config.JdbcSourceConfig;
+import org.apache.seatunnel.connectors.cdc.base.dialect.JdbcDataSourceDialect;
+import org.apache.seatunnel.connectors.cdc.base.source.split.SnapshotSplit;
+import org.apache.seatunnel.connectors.cdc.base.utils.ObjectUtils;
+
+import io.debezium.jdbc.JdbcConnection;
+import io.debezium.relational.Column;
+import io.debezium.relational.Table;
+import io.debezium.relational.TableId;
+import lombok.extern.slf4j.Slf4j;
+
+import java.math.BigDecimal;
+import java.sql.SQLException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.List;
+import java.util.Objects;
+import java.util.Optional;
+
+import static java.math.BigDecimal.ROUND_CEILING;
+import static org.apache.seatunnel.connectors.cdc.base.utils.ObjectUtils.doubleCompare;
+
+@Slf4j
+public abstract class AbstractJdbcSourceChunkSplitter implements JdbcSourceChunkSplitter {
+
+ private final JdbcSourceConfig sourceConfig;
+ private final JdbcDataSourceDialect dialect;
+
+ public AbstractJdbcSourceChunkSplitter(
+ JdbcSourceConfig sourceConfig, JdbcDataSourceDialect dialect) {
+ this.sourceConfig = sourceConfig;
+ this.dialect = dialect;
+ }
+
+ @Override
+ public Collection<SnapshotSplit> generateSplits(TableId tableId) {
+ try (JdbcConnection jdbc = dialect.openJdbcConnection(sourceConfig)) {
+ log.info("Start splitting table {} into chunks...", tableId);
+ long start = System.currentTimeMillis();
+
+ Column splitColumn = getSplitColumn(jdbc, dialect, tableId);
+ final List<ChunkRange> chunks;
+ try {
+ chunks = splitTableIntoChunks(jdbc, tableId, splitColumn);
+ } catch (SQLException e) {
+ throw new RuntimeException("Failed to split chunks for table " + tableId, e);
+ }
+
+ // convert chunks into splits
+ List<SnapshotSplit> splits = new ArrayList<>();
+ SeaTunnelRowType splitType = getSplitType(splitColumn);
+ for (int i = 0; i < chunks.size(); i++) {
+ ChunkRange chunk = chunks.get(i);
+ SnapshotSplit split =
+ createSnapshotSplit(
+ jdbc,
+ tableId,
+ i,
+ splitType,
+ chunk.getChunkStart(),
+ chunk.getChunkEnd());
+ splits.add(split);
+ }
+
+ long end = System.currentTimeMillis();
+ log.info(
+ "Split table {} into {} chunks, time cost: {}ms.",
+ tableId,
+ splits.size(),
+ end - start);
+ return splits;
+ } catch (Exception e) {
+ throw new RuntimeException(
+ String.format("Generate Splits for table %s error", tableId), e);
+ }
+ }
+
+ private List<ChunkRange> splitTableIntoChunks(
+ JdbcConnection jdbc, TableId tableId, Column splitColumn) throws SQLException {
+ final String splitColumnName = splitColumn.name();
+ final Object[] minMax = queryMinMax(jdbc, tableId, splitColumnName);
+ final Object min = minMax[0];
+ final Object max = minMax[1];
+ if (min == null || max == null || min.equals(max)) {
+ // empty table, or only one row, return full table scan as a chunk
+ return Collections.singletonList(ChunkRange.all());
+ }
+
+ final int chunkSize = sourceConfig.getSplitSize();
+ final double distributionFactorUpper = sourceConfig.getDistributionFactorUpper();
+ final double distributionFactorLower = sourceConfig.getDistributionFactorLower();
+
+ if (isEvenlySplitColumn(splitColumn)) {
+ long approximateRowCnt = queryApproximateRowCnt(jdbc, tableId);
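+ // The distribution factor describes how evenly the split column values cover [min, max]
+ // relative to the approximate row count; it decides below whether evenly sized chunks can be used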
+ double distributionFactor =
+ calculateDistributionFactor(tableId, min, max, approximateRowCnt);
+
+ boolean dataIsEvenlyDistributed =
+ doubleCompare(distributionFactor, distributionFactorLower) >= 0
+ && doubleCompare(distributionFactor, distributionFactorUpper) <= 0;
+
+ if (dataIsEvenlyDistributed) {
+ // the minimum dynamic chunk size is at least 1
+ final int dynamicChunkSize = Math.max((int) (distributionFactor * chunkSize), 1);
+ return splitEvenlySizedChunks(
+ tableId, min, max, approximateRowCnt, chunkSize, dynamicChunkSize);
+ } else {
+ int shardCount = (int) (approximateRowCnt / chunkSize);
+ int inverseSamplingRate = sourceConfig.getInverseSamplingRate();
+ if (sourceConfig.getSampleShardingThreshold() < shardCount) {
+ // It is necessary to ensure that the number of data rows sampled by the
+ // sampling rate is greater than the number of shards.
+ // Otherwise, if the sampling rate is too low, it may result in an insufficient
+ // number of data rows for the shards, leading to an inadequate number of
+ // shards.
+ // Therefore, inverseSamplingRate should be less than chunkSize
+ if (inverseSamplingRate > chunkSize) {
+ log.warn(
+ "The inverseSamplingRate is {}, which is greater than chunkSize {}, so we set inverseSamplingRate to chunkSize",
+ inverseSamplingRate,
+ chunkSize);
+ inverseSamplingRate = chunkSize;
+ }
+ Object[] sample =
+ sampleDataFromColumn(
+ jdbc, tableId, splitColumnName, inverseSamplingRate);
+ return efficientShardingThroughSampling(
+ tableId, sample, approximateRowCnt, shardCount);
+ }
+ return splitUnevenlySizedChunks(
+ jdbc, tableId, splitColumnName, min, max, chunkSize);
+ }
+ } else {
+ return splitUnevenlySizedChunks(jdbc, tableId, splitColumnName, min, max, chunkSize);
+ }
+ }
+
+ /** Split table into unevenly sized chunks by continuously calculating next chunk max value. */
+ protected List<ChunkRange> splitUnevenlySizedChunks(
+ JdbcConnection jdbc,
+ TableId tableId,
+ String splitColumnName,
+ Object min,
+ Object max,
+ int chunkSize)
+ throws SQLException {
+ log.info(
+ "Use unevenly-sized chunks for table {}, the chunk size is {}", tableId, chunkSize);
+ final List<ChunkRange> splits = new ArrayList<>();
+ Object chunkStart = null;
+ Object chunkEnd = nextChunkEnd(jdbc, min, tableId, splitColumnName, max, chunkSize);
+ int count = 0;
+ while (chunkEnd != null && ObjectCompare(chunkEnd, max) <= 0) {
+ // we start from [null, min + chunk_size) and avoid [null, min)
+ splits.add(ChunkRange.of(chunkStart, chunkEnd));
+ // may sleep a while to avoid putting too much pressure on the source database
+ maySleep(count++, tableId);
+ chunkStart = chunkEnd;
+ chunkEnd = nextChunkEnd(jdbc, chunkEnd, tableId, splitColumnName, max, chunkSize);
+ }
+ // add the ending split
+ splits.add(ChunkRange.of(chunkStart, null));
+ return splits;
+ }
+
+ protected Object nextChunkEnd(
+ JdbcConnection jdbc,
+ Object previousChunkEnd,
+ TableId tableId,
+ String splitColumnName,
+ Object max,
+ int chunkSize)
+ throws SQLException {
+ // chunk end might be null when max values are removed
+ Object chunkEnd =
+ queryNextChunkMax(jdbc, tableId, splitColumnName, chunkSize, previousChunkEnd);
+ if (Objects.equals(previousChunkEnd, chunkEnd)) {
+ // we don't allow equal chunk start and end,
+ // should query the next one larger than chunkEnd
+ chunkEnd = queryMin(jdbc, tableId, splitColumnName, chunkEnd);
+ }
+ if (ObjectCompare(chunkEnd, max) >= 0) {
+ return null;
+ } else {
+ return chunkEnd;
+ }
+ }
+
+ protected List<ChunkRange> efficientShardingThroughSampling(
+ TableId tableId, Object[] sampleData, long approximateRowCnt, int shardCount) {
+ log.info(
+ "Use efficient sharding through sampling optimization for table {}, the approximate row count is {}, the shardCount is {}",
+ tableId,
+ approximateRowCnt,
+ shardCount);
+
+ final List<ChunkRange> splits = new ArrayList<>();
+
+ if (shardCount == 0) {
+ splits.add(ChunkRange.of(null, null));
+ return splits;
+ }
+
+ double approxSamplePerShard = (double) sampleData.length / shardCount;
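+ // e.g. 100 samples for 20 shards -> approxSamplePerShard = 5, so every 5th sample becomes
+ // a shard boundary; with fewer samples than shards, every sample becomes a boundary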
+
+ if (approxSamplePerShard <= 1) {
+
+ splits.add(ChunkRange.of(null, sampleData[0]));
+ for (int i = 0; i < sampleData.length - 1; i++) {
+ splits.add(ChunkRange.of(sampleData[i], sampleData[i + 1]));
+ }
+ splits.add(ChunkRange.of(sampleData[sampleData.length - 1], null));
+ } else {
+ // Calculate the shard boundaries
+ for (int i = 0; i < shardCount; i++) {
+ Object chunkStart = i == 0 ? null : sampleData[(int) (i * approxSamplePerShard)];
+ Object chunkEnd =
+ i < shardCount - 1
+ ? sampleData[(int) ((i + 1) * approxSamplePerShard)]
+ : null;
+ splits.add(ChunkRange.of(chunkStart, chunkEnd));
+ }
+ }
+ return splits;
+ }
+
+ /**
+ * Split table into evenly sized chunks based on the numeric min and max value of split column,
+ * and tumble chunks in step size.
+ */
+ protected List<ChunkRange> splitEvenlySizedChunks(
+ TableId tableId,
+ Object min,
+ Object max,
+ long approximateRowCnt,
+ int chunkSize,
+ int dynamicChunkSize) {
+ log.info(
+ "Use evenly-sized chunk optimization for table {}, the approximate row count is {}, the chunk size is {}, the dynamic chunk size is {}",
+ tableId,
+ approximateRowCnt,
+ chunkSize,
+ dynamicChunkSize);
+ if (approximateRowCnt <= chunkSize) {
+ // there is no more than one chunk, return full table as a chunk
+ return Collections.singletonList(ChunkRange.all());
+ }
+
+ final List<ChunkRange> splits = new ArrayList<>();
+ Object chunkStart = null;
+ Object chunkEnd = ObjectUtils.plus(min, dynamicChunkSize);
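+ // e.g. min = 0, max = 10_000, dynamicChunkSize = 4_000 produces the chunks
+ // [null, 4000), [4000, 8000) and a final open-ended chunk [8000, null)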
+ while (ObjectCompare(chunkEnd, max) <= 0) {
+ splits.add(ChunkRange.of(chunkStart, chunkEnd));
+ chunkStart = chunkEnd;
+ try {
+ chunkEnd = ObjectUtils.plus(chunkEnd, dynamicChunkSize);
+ } catch (ArithmeticException e) {
+ // Stop chunk split to avoid dead loop when number overflows.
+ break;
+ }
+ }
+ // add the ending split
+ splits.add(ChunkRange.of(chunkStart, null));
+ return splits;
+ }
+
+ // ------------------------------------------------------------------------------------------
+ /** Returns the distribution factor of the given table. */
+ @SuppressWarnings("MagicNumber")
+ protected double calculateDistributionFactor(
+ TableId tableId, Object min, Object max, long approximateRowCnt) {
+
+ if (!min.getClass().equals(max.getClass())) {
+ throw new IllegalStateException(
+ String.format(
+ "Unsupported operation type, the MIN value type %s is different with MAX value type %s.",
+ min.getClass().getSimpleName(), max.getClass().getSimpleName()));
+ }
+ if (approximateRowCnt == 0) {
+ return Double.MAX_VALUE;
+ }
+ BigDecimal difference = ObjectUtils.minus(max, min);
+ // factor = (max - min + 1) / rowCount
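+ // e.g. min = 1, max = 1_000_000, approximateRowCnt = 800_000
+ // => factor = 1_000_000 / 800_000 = 1.25 (gaps in the key range push the factor above 1)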
+ final BigDecimal subRowCnt = difference.add(BigDecimal.valueOf(1));
+ double distributionFactor =
+ subRowCnt.divide(new BigDecimal(approximateRowCnt), 4, ROUND_CEILING).doubleValue();
+ log.info(
+ "The distribution factor of table {} is {} according to the min split key {}, max split key {} and approximate row count {}",
+ tableId,
+ distributionFactor,
+ min,
+ max,
+ approximateRowCnt);
+ return distributionFactor;
+ }
+
+ protected SnapshotSplit createSnapshotSplit(
+ JdbcConnection jdbc,
+ TableId tableId,
+ int chunkId,
+ SeaTunnelRowType splitKeyType,
+ Object chunkStart,
+ Object chunkEnd) {
+ // currently, we only support single split column
+ Object[] splitStart = chunkStart == null ? null : new Object[] {chunkStart};
+ Object[] splitEnd = chunkEnd == null ? null : new Object[] {chunkEnd};
+ return new SnapshotSplit(
+ splitId(tableId, chunkId), tableId, splitKeyType, splitStart, splitEnd);
+ }
+
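+ /**
+ * Chooses the split column: prefer the first primary-key column whose type supports even
+ * splitting, then fall back to unique-key columns; fail if neither provides a usable column.
+ */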
+ protected Column getSplitColumn(
+ JdbcConnection jdbc, JdbcDataSourceDialect dialect, TableId tableId)
+ throws SQLException {
+ Optional<PrimaryKey> primaryKey = dialect.getPrimaryKey(jdbc, tableId);
+ if (primaryKey.isPresent()) {
+ List<String> pkColumns = primaryKey.get().getColumnNames();
+
+ Table table = dialect.queryTableSchema(jdbc, tableId).getTable();
+ for (String pkColumn : pkColumns) {
+ Column column = table.columnWithName(pkColumn);
+ if (isEvenlySplitColumn(column)) {
+ return column;
+ }
+ }
+ }
+
+ List<ConstraintKey> uniqueKeys = dialect.getUniqueKeys(jdbc, tableId);
+ if (!uniqueKeys.isEmpty()) {
+ Table table = dialect.queryTableSchema(jdbc, tableId).getTable();
+ for (ConstraintKey uniqueKey : uniqueKeys) {
+ List<ConstraintKey.ConstraintKeyColumn> uniqueKeyColumns =
+ uniqueKey.getColumnNames();
+ for (ConstraintKey.ConstraintKeyColumn uniqueKeyColumn : uniqueKeyColumns) {
+ Column column = table.columnWithName(uniqueKeyColumn.getColumnName());
+ if (isEvenlySplitColumn(column)) {
+ return column;
+ }
+ }
+ }
+ }
+
+ throw new UnsupportedOperationException(
+ String.format(
+ "Incremental snapshot for tables requires primary key/unique key,"
+ + " but table %s doesn't have primary key.",
+ tableId));
+ }
+
+ protected String splitId(TableId tableId, int chunkId) {
+ return tableId.toString() + ":" + chunkId;
+ }
+
+ protected int ObjectCompare(Object obj1, Object obj2) {
+ return ObjectUtils.compare(obj1, obj2);
+ }
+
+ @SuppressWarnings("MagicNumber")
+ private static void maySleep(int count, TableId tableId) {
+ // sleep 100ms after every 10 chunk queries to reduce pressure on the source database
+ if (count % 10 == 0) {
+ try {
+ Thread.sleep(100);
+ } catch (InterruptedException e) {
+ // nothing to do
+ }
+ log.info("JdbcSourceChunkSplitter has split {} chunks for table {}", count, tableId);
+ }
+ }
+}
diff --git a/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/source/enumerator/splitter/JdbcSourceChunkSplitter.java b/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/source/enumerator/splitter/JdbcSourceChunkSplitter.java
index 9e42d5526351..b271be0d7653 100644
--- a/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/source/enumerator/splitter/JdbcSourceChunkSplitter.java
+++ b/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/source/enumerator/splitter/JdbcSourceChunkSplitter.java
@@ -136,6 +136,7 @@ default boolean isEvenlySplitColumn(Column splitColumn) {
case INT:
case BIGINT:
case DECIMAL:
+ case STRING:
return true;
default:
return false;
diff --git a/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/source/reader/external/IncrementalSourceScanFetcher.java b/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/source/reader/external/IncrementalSourceScanFetcher.java
index 7a09ac6bc4ef..97c0c523e639 100644
--- a/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/source/reader/external/IncrementalSourceScanFetcher.java
+++ b/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/source/reader/external/IncrementalSourceScanFetcher.java
@@ -223,14 +223,11 @@ public void close() {
private boolean isChangeRecordInChunkRange(SourceRecord record) {
if (taskContext.isDataChangeRecord(record)) {
+ // the split start/end are already Object[] boundaries, pass them through directly
return taskContext.isRecordBetween(
record,
- null == currentSnapshotSplit.getSplitStart()
- ? null
- : new Object[] {currentSnapshotSplit.getSplitStart()},
- null == currentSnapshotSplit.getSplitEnd()
- ? null
- : new Object[] {currentSnapshotSplit.getSplitEnd()});
+ currentSnapshotSplit.getSplitStart(),
+ currentSnapshotSplit.getSplitEnd());
}
return false;
}
diff --git a/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/source/reader/external/IncrementalSourceStreamFetcher.java b/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/source/reader/external/IncrementalSourceStreamFetcher.java
index 5257064dc1fe..2b8e9f7725fd 100644
--- a/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/source/reader/external/IncrementalSourceStreamFetcher.java
+++ b/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/source/reader/external/IncrementalSourceStreamFetcher.java
@@ -19,6 +19,7 @@
import org.apache.seatunnel.common.utils.SeaTunnelException;
import org.apache.seatunnel.connectors.cdc.base.source.offset.Offset;
+import org.apache.seatunnel.connectors.cdc.base.source.split.CompletedSnapshotSplitInfo;
import org.apache.seatunnel.connectors.cdc.base.source.split.IncrementalSplit;
import org.apache.seatunnel.connectors.cdc.base.source.split.SourceRecords;
import org.apache.seatunnel.connectors.cdc.base.source.split.SourceSplitBase;
@@ -32,8 +33,12 @@
import lombok.extern.slf4j.Slf4j;
import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
+import java.util.Map;
+import java.util.Set;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.ThreadFactory;
@@ -49,6 +54,8 @@
public class IncrementalSourceStreamFetcher implements Fetcher<SourceRecords, SourceSplitBase> {
private final FetchTask.Context taskContext;
private final ExecutorService executorService;
+ // has entered pure binlog mode
+ private final Set<TableId> pureBinlogPhaseTables;
private volatile ChangeEventQueue<DataChangeEvent> queue;
private volatile Throwable readException;
@@ -58,6 +65,11 @@ public class IncrementalSourceStreamFetcher implements Fetcher<SourceRecords, SourceSplitBase>
+ // tableId -> the max high watermark of its snapshot splits
+ private Map<TableId, Offset> maxSplitHighWatermarkMap;
+ // finished split info
+ private Map<TableId, List<CompletedSnapshotSplitInfo>> finishedSplitsInfo;
+
private static final long READER_CLOSE_TIMEOUT_SECONDS = 30L;
public IncrementalSourceStreamFetcher(FetchTask.Context taskContext, int subTaskId) {
@@ -65,6 +77,7 @@ public IncrementalSourceStreamFetcher(FetchTask.Context taskContext, int subTask
ThreadFactory threadFactory =
new ThreadFactoryBuilder().setNameFormat("debezium-reader-" + subTaskId).build();
this.executorService = Executors.newSingleThreadExecutor(threadFactory);
+ this.pureBinlogPhaseTables = new HashSet<>();
}
@Override
@@ -157,14 +170,72 @@ private boolean shouldEmit(SourceRecord sourceRecord) {
tableId);
return position.isAfter(splitStartWatermark);
}
- // TODO only the table who captured snapshot splits need to filter( Used to support
- // Exactly-Once )
- return position.isAfter(splitStartWatermark);
+ // check whether the pure binlog mode has been entered
+ if (hasEnterPureBinlogPhase(tableId, position)) {
+ return true;
+ }
+ // not yet in pure binlog mode: check whether the record falls into one of the finished
+ // snapshot splits and is beyond that split's high watermark
+ if (finishedSplitsInfo.containsKey(tableId)) {
+ for (CompletedSnapshotSplitInfo splitInfo : finishedSplitsInfo.get(tableId)) {
+ if (taskContext.isRecordBetween(
+ sourceRecord,
+ splitInfo.getSplitStart(),
+ splitInfo.getSplitEnd())
+ && position.isAfter(splitInfo.getWatermark().getHighWatermark())) {
+ return true;
+ }
+ }
+ }
+ return false;
}
return true;
}
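+ /**
+ * A table enters the pure binlog phase once the stream position is at or after the maximum
+ * high watermark of all its snapshot splits; from then on its change records are emitted
+ * without per-split range checks.
+ */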
+ private boolean hasEnterPureBinlogPhase(TableId tableId, Offset position) {
+ // only tables whose snapshot splits were captured need to be filtered
+ if (pureBinlogPhaseTables.contains(tableId)) {
+ return true;
+ }
+ // tables that have finished snapshot reading
+ if (maxSplitHighWatermarkMap.containsKey(tableId)
+ && position.isAtOrAfter(maxSplitHighWatermarkMap.get(tableId))) {
+ pureBinlogPhaseTables.add(tableId);
+ return true;
+ }
+ return false;
+ }
+
private void configureFilter() {
splitStartWatermark = currentIncrementalSplit.getStartupOffset();
+ Map<TableId, List<CompletedSnapshotSplitInfo>> splitsInfoMap = new HashMap<>();
+ Map<TableId, Offset> tableIdBinlogPositionMap = new HashMap<>();
+ List<CompletedSnapshotSplitInfo> completedSnapshotSplitInfos =
+ currentIncrementalSplit.getCompletedSnapshotSplitInfos();
+
+ // latest-offset mode
+ if (completedSnapshotSplitInfos.isEmpty()) {
+ for (TableId tableId : currentIncrementalSplit.getTableIds()) {
+ tableIdBinlogPositionMap.put(tableId, currentIncrementalSplit.getStartupOffset());
+ }
+ }
+
+ // calculate the max high watermark of every table
+ for (CompletedSnapshotSplitInfo finishedSplitInfo : completedSnapshotSplitInfos) {
+ TableId tableId = finishedSplitInfo.getTableId();
+ List<CompletedSnapshotSplitInfo> list =
+ splitsInfoMap.getOrDefault(tableId, new ArrayList<>());
+ list.add(finishedSplitInfo);
+ splitsInfoMap.put(tableId, list);
+
+ Offset highWatermark = finishedSplitInfo.getWatermark().getHighWatermark();
+ Offset maxHighWatermark = tableIdBinlogPositionMap.get(tableId);
+ if (maxHighWatermark == null || highWatermark.isAfter(maxHighWatermark)) {
+ tableIdBinlogPositionMap.put(tableId, highWatermark);
+ }
+ }
+ this.finishedSplitsInfo = splitsInfoMap;
+ this.maxSplitHighWatermarkMap = tableIdBinlogPositionMap;
+ this.pureBinlogPhaseTables.clear();
}
}
diff --git a/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/utils/ObjectUtils.java b/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/utils/ObjectUtils.java
index 3c5b669a257a..0f703f02c1cb 100644
--- a/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/utils/ObjectUtils.java
+++ b/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/utils/ObjectUtils.java
@@ -63,6 +63,8 @@ public static BigDecimal minus(Object minuend, Object subtrahend) {
((BigInteger) minuend).subtract((BigInteger) subtrahend).toString());
} else if (minuend instanceof BigDecimal) {
return ((BigDecimal) minuend).subtract((BigDecimal) subtrahend);
+ } else if (minuend instanceof String) {
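+ // String split keys have no numeric difference; returning Long.MAX_VALUE pushes the
+ // distribution factor above the upper bound so string columns fall back to
+ // sampling-based or uneven chunking instead of evenly sized chunks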
+ return BigDecimal.valueOf(Long.MAX_VALUE);
} else {
throw new UnsupportedOperationException(
String.format(
diff --git a/seatunnel-connectors-v2/connector-cdc/connector-cdc-mongodb/pom.xml b/seatunnel-connectors-v2/connector-cdc/connector-cdc-mongodb/pom.xml
new file mode 100644
index 000000000000..e22560ed0383
--- /dev/null
+++ b/seatunnel-connectors-v2/connector-cdc/connector-cdc-mongodb/pom.xml
@@ -0,0 +1,87 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+    Licensed to the Apache Software Foundation (ASF) under one or more
+    contributor license agreements.  See the NOTICE file distributed with
+    this work for additional information regarding copyright ownership.
+    The ASF licenses this file to You under the Apache License, Version 2.0
+    (the "License"); you may not use this file except in compliance with
+    the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+    <parent>
+        <groupId>org.apache.seatunnel</groupId>
+        <artifactId>connector-cdc</artifactId>
+        <version>${revision}</version>
+    </parent>
+
+    <artifactId>connector-cdc-mongodb</artifactId>
+    <name>SeaTunnel : Connectors V2 : CDC : Mongodb</name>
+
+    <properties>
+        <mongo.driver.version>4.7.1</mongo.driver.version>
+        <mongo-kafka-connect.version>1.11.1</mongo-kafka-connect.version>
+        <avro.version>1.10.1</avro.version>
+        <junit.version>4.13.2</junit.version>
+    </properties>
+
+    <dependencies>
+        <dependency>
+            <groupId>org.apache.seatunnel</groupId>
+            <artifactId>connector-cdc-base</artifactId>
+            <version>${project.version}</version>
+            <scope>compile</scope>
+        </dependency>
+        <dependency>
+            <groupId>io.debezium</groupId>
+            <artifactId>debezium-connector-mongodb</artifactId>
+            <version>${debezium.version}</version>
+            <scope>compile</scope>
+        </dependency>
+        <dependency>
+            <groupId>org.mongodb.kafka</groupId>
+            <artifactId>mongo-kafka-connect</artifactId>
+            <version>${mongo-kafka-connect.version}</version>
+            <exclusions>
+                <exclusion>
+                    <groupId>org.mongodb</groupId>
+                    <artifactId>mongodb-driver-sync</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>org.apache.kafka</groupId>
+                    <artifactId>connect-api</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>org.apache.avro</groupId>
+                    <artifactId>avro</artifactId>
+                </exclusion>
+            </exclusions>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.avro</groupId>
+            <artifactId>avro</artifactId>
+            <version>${avro.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>org.mongodb</groupId>
+            <artifactId>mongodb-driver-sync</artifactId>
+            <version>${mongo.driver.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>junit</groupId>
+            <artifactId>junit</artifactId>
+            <version>${junit.version}</version>
+            <scope>test</scope>
+        </dependency>
+    </dependencies>
+</project>
diff --git a/seatunnel-connectors-v2/connector-cdc/connector-cdc-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/cdc/mongodb/MongodbIncrementalSource.java b/seatunnel-connectors-v2/connector-cdc/connector-cdc-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/cdc/mongodb/MongodbIncrementalSource.java
new file mode 100644
index 000000000000..41191cfa52ba
--- /dev/null
+++ b/seatunnel-connectors-v2/connector-cdc/connector-cdc-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/cdc/mongodb/MongodbIncrementalSource.java
@@ -0,0 +1,132 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.seatunnel.connectors.seatunnel.cdc.mongodb;
+
+import org.apache.seatunnel.api.configuration.Option;
+import org.apache.seatunnel.api.configuration.ReadonlyConfig;
+import org.apache.seatunnel.api.source.SeaTunnelSource;
+import org.apache.seatunnel.api.source.SupportParallelism;
+import org.apache.seatunnel.api.table.type.SeaTunnelDataType;
+import org.apache.seatunnel.api.table.type.SeaTunnelRow;
+import org.apache.seatunnel.connectors.cdc.base.config.SourceConfig;
+import org.apache.seatunnel.connectors.cdc.base.dialect.DataSourceDialect;
+import org.apache.seatunnel.connectors.cdc.base.option.StartupMode;
+import org.apache.seatunnel.connectors.cdc.base.option.StopMode;
+import org.apache.seatunnel.connectors.cdc.base.source.IncrementalSource;
+import org.apache.seatunnel.connectors.cdc.base.source.offset.OffsetFactory;
+import org.apache.seatunnel.connectors.cdc.debezium.DebeziumDeserializationSchema;
+import org.apache.seatunnel.connectors.cdc.debezium.row.DebeziumJsonDeserializeSchema;
+import org.apache.seatunnel.connectors.seatunnel.cdc.mongodb.config.MongodbSourceConfig;
+import org.apache.seatunnel.connectors.seatunnel.cdc.mongodb.config.MongodbSourceConfigProvider;
+import org.apache.seatunnel.connectors.seatunnel.cdc.mongodb.config.MongodbSourceOptions;
+import org.apache.seatunnel.connectors.seatunnel.cdc.mongodb.sender.MongoDBConnectorDeserializationSchema;
+import org.apache.seatunnel.connectors.seatunnel.cdc.mongodb.source.dialect.MongodbDialect;
+import org.apache.seatunnel.connectors.seatunnel.cdc.mongodb.source.offset.ChangeStreamOffsetFactory;
+
+import com.google.auto.service.AutoService;
+import lombok.NoArgsConstructor;
+
+import javax.annotation.Nonnull;
+
+import java.util.Optional;
+
+@NoArgsConstructor
+@AutoService(SeaTunnelSource.class)
+public class MongodbIncrementalSource<T> extends IncrementalSource<T, MongodbSourceConfig>
+ implements SupportParallelism {
+
+ static final String IDENTIFIER = "MongoDB-CDC";
+
+ public MongodbIncrementalSource(
+ ReadonlyConfig options, SeaTunnelDataType<SeaTunnelRow> dataType) {
+ super(options, dataType);
+ }
+
+ @Override
+ public Option<StartupMode> getStartupModeOption() {
+ return MongodbSourceOptions.STARTUP_MODE;
+ }
+
+ @Override
+ public Option<StopMode> getStopModeOption() {
+ return MongodbSourceOptions.STOP_MODE;
+ }
+
+ @Override
+ public String getPluginName() {
+ return IDENTIFIER;
+ }
+
+ @Override
+ public SourceConfig.Factory<MongodbSourceConfig> createSourceConfigFactory(
+ @Nonnull ReadonlyConfig config) {
+ MongodbSourceConfigProvider.Builder builder =
+ MongodbSourceConfigProvider.newBuilder()
+ .hosts(config.get(MongodbSourceOptions.HOSTS))
+ .validate();
+ Optional.ofNullable(config.get(MongodbSourceOptions.DATABASE))
+ .ifPresent(builder::databaseList);
+ Optional.ofNullable(config.get(MongodbSourceOptions.COLLECTION))
+ .ifPresent(builder::collectionList);
+ Optional.ofNullable(config.get(MongodbSourceOptions.USERNAME)).ifPresent(builder::username);
+ Optional.ofNullable(config.get(MongodbSourceOptions.PASSWORD)).ifPresent(builder::password);
+ Optional.ofNullable(config.get(MongodbSourceOptions.CONNECTION_OPTIONS))
+ .ifPresent(builder::connectionOptions);
+ Optional.ofNullable(config.get(MongodbSourceOptions.BATCH_SIZE))
+ .ifPresent(builder::batchSize);
+ Optional.ofNullable(config.get(MongodbSourceOptions.POLL_MAX_BATCH_SIZE))
+ .ifPresent(builder::pollMaxBatchSize);
+ Optional.ofNullable(config.get(MongodbSourceOptions.POLL_AWAIT_TIME_MILLIS))
+ .ifPresent(builder::pollAwaitTimeMillis);
+ Optional.ofNullable(config.get(MongodbSourceOptions.HEARTBEAT_INTERVAL_MILLIS))
+ .ifPresent(builder::heartbeatIntervalMillis);
+ Optional.ofNullable(config.get(MongodbSourceOptions.HEARTBEAT_INTERVAL_MILLIS))
+ .ifPresent(builder::splitMetaGroupSize);
+ Optional.ofNullable(config.get(MongodbSourceOptions.INCREMENTAL_SNAPSHOT_CHUNK_SIZE_MB))
+ .ifPresent(builder::splitSizeMB);
+ Optional.ofNullable(startupConfig).ifPresent(builder::startupOptions);
+ Optional.ofNullable(stopConfig).ifPresent(builder::stopOptions);
+ return builder;
+ }
+
+ @SuppressWarnings("unchecked")
+ @Override
+ public DebeziumDeserializationSchema<T> createDebeziumDeserializationSchema(
+ ReadonlyConfig config) {
+ SeaTunnelDataType<SeaTunnelRow> physicalRowType;
+ if (dataType == null) {
+ return (DebeziumDeserializationSchema<T>)
+ new DebeziumJsonDeserializeSchema(
+ config.get(MongodbSourceOptions.DEBEZIUM_PROPERTIES));
+ } else {
+ physicalRowType = dataType;
+ return (DebeziumDeserializationSchema<T>)
+ new MongoDBConnectorDeserializationSchema(physicalRowType, physicalRowType);
+ }
+ }
+
+ @Override
+ public DataSourceDialect<MongodbSourceConfig> createDataSourceDialect(ReadonlyConfig config) {
+ return new MongodbDialect();
+ }
+
+ @Override
+ public OffsetFactory createOffsetFactory(ReadonlyConfig config) {
+ return new ChangeStreamOffsetFactory();
+ }
+}
diff --git a/seatunnel-connectors-v2/connector-cdc/connector-cdc-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/cdc/mongodb/MongodbIncrementalSourceFactory.java b/seatunnel-connectors-v2/connector-cdc/connector-cdc-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/cdc/mongodb/MongodbIncrementalSourceFactory.java
new file mode 100644
index 000000000000..6215afb74ef0
--- /dev/null
+++ b/seatunnel-connectors-v2/connector-cdc/connector-cdc-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/cdc/mongodb/MongodbIncrementalSourceFactory.java
@@ -0,0 +1,104 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.seatunnel.connectors.seatunnel.cdc.mongodb;
+
+import org.apache.seatunnel.api.configuration.util.OptionRule;
+import org.apache.seatunnel.api.source.SeaTunnelSource;
+import org.apache.seatunnel.api.source.SourceSplit;
+import org.apache.seatunnel.api.table.catalog.CatalogTable;
+import org.apache.seatunnel.api.table.connector.TableSource;
+import org.apache.seatunnel.api.table.factory.Factory;
+import org.apache.seatunnel.api.table.factory.SupportMultipleTable;
+import org.apache.seatunnel.api.table.factory.TableFactoryContext;
+import org.apache.seatunnel.api.table.factory.TableSourceFactory;
+import org.apache.seatunnel.api.table.type.MultipleRowType;
+import org.apache.seatunnel.api.table.type.SeaTunnelDataType;
+import org.apache.seatunnel.api.table.type.SeaTunnelRow;
+import org.apache.seatunnel.api.table.type.SeaTunnelRowType;
+import org.apache.seatunnel.connectors.seatunnel.cdc.mongodb.config.MongodbSourceOptions;
+
+import com.google.auto.service.AutoService;
+
+import javax.annotation.Nonnull;
+
+import java.io.Serializable;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+
+@AutoService(Factory.class)
+public class MongodbIncrementalSourceFactory implements TableSourceFactory, SupportMultipleTable {
+ @Override
+ public String factoryIdentifier() {
+ return MongodbIncrementalSource.IDENTIFIER;
+ }
+
+ @Override
+ public OptionRule optionRule() {
+ return MongodbSourceOptions.getBaseRule()
+ .required(
+ MongodbSourceOptions.HOSTS,
+ MongodbSourceOptions.DATABASE,
+ MongodbSourceOptions.COLLECTION)
+ .optional(
+ MongodbSourceOptions.USERNAME,
+ MongodbSourceOptions.PASSWORD,
+ MongodbSourceOptions.CONNECTION_OPTIONS,
+ MongodbSourceOptions.BATCH_SIZE,
+ MongodbSourceOptions.POLL_MAX_BATCH_SIZE,
+ MongodbSourceOptions.POLL_AWAIT_TIME_MILLIS,
+ MongodbSourceOptions.HEARTBEAT_INTERVAL_MILLIS,
+ MongodbSourceOptions.INCREMENTAL_SNAPSHOT_CHUNK_SIZE_MB,
+ MongodbSourceOptions.STARTUP_MODE,
+ MongodbSourceOptions.STOP_MODE)
+ .build();
+ }
+
+ @Override
+ public Class<? extends SeaTunnelSource> getSourceClass() {
+ return MongodbIncrementalSource.class;
+ }
+
+ @SuppressWarnings("unchecked")
+ @Override
+ public <T, SplitT extends SourceSplit, StateT extends Serializable>
+ TableSource<T, SplitT, StateT> createSource(TableFactoryContext context) {
+ return () -> {
+ SeaTunnelDataType<SeaTunnelRow> dataType;
+ if (context.getCatalogTables().size() == 1) {
+ dataType =
+ context.getCatalogTables().get(0).getTableSchema().toPhysicalRowDataType();
+ } else {
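+ // multiple captured collections: build a MultipleRowType keyed by table path,
+ // one physical row type per collection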
+ Map<String, SeaTunnelRowType> rowTypeMap = new HashMap<>();
+ for (CatalogTable catalogTable : context.getCatalogTables()) {
+ rowTypeMap.put(
+ catalogTable.getTableId().toTablePath().toString(),
+ catalogTable.getTableSchema().toPhysicalRowDataType());
+ }
+ dataType = new MultipleRowType(rowTypeMap);
+ }
+ return (SeaTunnelSource<T, SplitT, StateT>)
+ new MongodbIncrementalSource<>(context.getOptions(), dataType);
+ };
+ }
+
+ @Override
+ public Result applyTables(@Nonnull TableFactoryContext context) {
+ return Result.of(context.getCatalogTables(), Collections.emptyList());
+ }
+}
diff --git a/seatunnel-connectors-v2/connector-cdc/connector-cdc-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/cdc/mongodb/config/MongodbSourceConfig.java b/seatunnel-connectors-v2/connector-cdc/connector-cdc-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/cdc/mongodb/config/MongodbSourceConfig.java
new file mode 100644
index 000000000000..049b37db3634
--- /dev/null
+++ b/seatunnel-connectors-v2/connector-cdc/connector-cdc-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/cdc/mongodb/config/MongodbSourceConfig.java
@@ -0,0 +1,122 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.seatunnel.connectors.seatunnel.cdc.mongodb.config;
+
+import org.apache.seatunnel.connectors.cdc.base.config.SourceConfig;
+import org.apache.seatunnel.connectors.cdc.base.config.StartupConfig;
+import org.apache.seatunnel.connectors.cdc.base.config.StopConfig;
+
+import lombok.EqualsAndHashCode;
+import lombok.Getter;
+
+import java.util.List;
+
+import static org.apache.seatunnel.connectors.seatunnel.cdc.mongodb.utils.MongodbUtils.buildConnectionString;
+import static org.apache.seatunnel.shade.com.google.common.base.Preconditions.checkNotNull;
+
+@Getter
+@EqualsAndHashCode
+public class MongodbSourceConfig implements SourceConfig {
+
+ private static final long serialVersionUID = 1L;
+
+ private final String hosts;
+
+ private final String username;
+
+ private final String password;
+
+ private final List<String> databaseList;
+
+ private final List<String> collectionList;
+
+ private final String connectionString;
+
+ private final int batchSize;
+
+ private final int pollAwaitTimeMillis;
+
+ private final int pollMaxBatchSize;
+
+ private final boolean updateLookup;
+
+ private final StartupConfig startupOptions;
+
+ private final StopConfig stopOptions;
+
+ private final int heartbeatIntervalMillis;
+
+ private final int splitMetaGroupSize;
+
+ private final int splitSizeMB;
+
+ MongodbSourceConfig(
+ String hosts,
+ String username,
+ String password,
+ List<String> databaseList,
+ List<String> collectionList,
+ String connectionOptions,
+ int batchSize,
+ int pollAwaitTimeMillis,
+ int pollMaxBatchSize,
+ boolean updateLookup,
+ StartupConfig startupOptions,
+ StopConfig stopOptions,
+ int heartbeatIntervalMillis,
+ int splitMetaGroupSize,
+ int splitSizeMB) {
+ this.hosts = checkNotNull(hosts);
+ this.username = username;
+ this.password = password;
+ this.databaseList = databaseList;
+ this.collectionList = collectionList;
+ this.connectionString =
+ buildConnectionString(username, password, hosts, connectionOptions)
+ .getConnectionString();
+ this.batchSize = batchSize;
+ this.pollAwaitTimeMillis = pollAwaitTimeMillis;
+ this.pollMaxBatchSize = pollMaxBatchSize;
+ this.updateLookup = updateLookup;
+ this.startupOptions = startupOptions;
+ this.stopOptions = stopOptions;
+ this.heartbeatIntervalMillis = heartbeatIntervalMillis;
+ this.splitMetaGroupSize = splitMetaGroupSize;
+ this.splitSizeMB = splitSizeMB;
+ }
+
+ @Override
+ public StartupConfig getStartupConfig() {
+ return startupOptions;
+ }
+
+ @Override
+ public StopConfig getStopConfig() {
+ return stopOptions;
+ }
+
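+ // Note: for the MongoDB connector the generic split size is interpreted as the snapshot
+ // chunk size in MB (see INCREMENTAL_SNAPSHOT_CHUNK_SIZE_MB), not as a row count.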
+ @Override
+ public int getSplitSize() {
+ return splitSizeMB;
+ }
+
+ @Override
+ public boolean isExactlyOnce() {
+ return true;
+ }
+}
diff --git a/seatunnel-connectors-v2/connector-cdc/connector-cdc-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/cdc/mongodb/config/MongodbSourceConfigProvider.java b/seatunnel-connectors-v2/connector-cdc/connector-cdc-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/cdc/mongodb/config/MongodbSourceConfigProvider.java
new file mode 100644
index 000000000000..ebe7af13e0c6
--- /dev/null
+++ b/seatunnel-connectors-v2/connector-cdc/connector-cdc-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/cdc/mongodb/config/MongodbSourceConfigProvider.java
@@ -0,0 +1,175 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.seatunnel.connectors.seatunnel.cdc.mongodb.config;
+
+import org.apache.seatunnel.connectors.cdc.base.config.SourceConfig;
+import org.apache.seatunnel.connectors.cdc.base.config.StartupConfig;
+import org.apache.seatunnel.connectors.cdc.base.config.StopConfig;
+import org.apache.seatunnel.connectors.cdc.base.option.StartupMode;
+import org.apache.seatunnel.connectors.cdc.base.option.StopMode;
+import org.apache.seatunnel.connectors.seatunnel.cdc.mongodb.exception.MongodbConnectorException;
+
+import java.util.List;
+import java.util.Objects;
+
+import static org.apache.seatunnel.common.exception.CommonErrorCode.ILLEGAL_ARGUMENT;
+import static org.apache.seatunnel.connectors.seatunnel.cdc.mongodb.config.MongodbSourceOptions.BATCH_SIZE;
+import static org.apache.seatunnel.connectors.seatunnel.cdc.mongodb.config.MongodbSourceOptions.HEARTBEAT_INTERVAL_MILLIS;
+import static org.apache.seatunnel.connectors.seatunnel.cdc.mongodb.config.MongodbSourceOptions.INCREMENTAL_SNAPSHOT_CHUNK_SIZE_MB;
+import static org.apache.seatunnel.connectors.seatunnel.cdc.mongodb.config.MongodbSourceOptions.POLL_AWAIT_TIME_MILLIS;
+import static org.apache.seatunnel.connectors.seatunnel.cdc.mongodb.config.MongodbSourceOptions.POLL_MAX_BATCH_SIZE;
+import static org.apache.seatunnel.shade.com.google.common.base.Preconditions.checkArgument;
+import static org.apache.seatunnel.shade.com.google.common.base.Preconditions.checkNotNull;
+
+public class MongodbSourceConfigProvider {
+
+ private MongodbSourceConfigProvider() {}
+
+ public static Builder newBuilder() {
+ return new Builder();
+ }
+
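+ // A minimal usage sketch (hypothetical values):
+ // MongodbSourceConfig config =
+ //         MongodbSourceConfigProvider.newBuilder()
+ //                 .hosts("localhost:27017")
+ //                 .databaseList(java.util.Collections.singletonList("inventory"))
+ //                 .collectionList(java.util.Collections.singletonList("inventory.products"))
+ //                 .validate()
+ //                 .create(0);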
+ public static class Builder implements SourceConfig.Factory<MongodbSourceConfig> {
+ private String hosts;
+ private String username;
+ private String password;
+ private List<String> databaseList;
+ private List<String> collectionList;
+ private String connectionOptions;
+ private int batchSize = BATCH_SIZE.defaultValue();
+ private int pollAwaitTimeMillis = POLL_AWAIT_TIME_MILLIS.defaultValue();
+ private int pollMaxBatchSize = POLL_MAX_BATCH_SIZE.defaultValue();
+ private StartupConfig startupOptions;
+ private StopConfig stopOptions;
+ private int heartbeatIntervalMillis = HEARTBEAT_INTERVAL_MILLIS.defaultValue();
+ private int splitMetaGroupSize = 2;
+ private int splitSizeMB = INCREMENTAL_SNAPSHOT_CHUNK_SIZE_MB.defaultValue();
+
+ public Builder hosts(String hosts) {
+ this.hosts = hosts;
+ return this;
+ }
+
+ public Builder connectionOptions(String connectionOptions) {
+ this.connectionOptions = connectionOptions;
+ return this;
+ }
+
+ public Builder username(String username) {
+ this.username = username;
+ return this;
+ }
+
+ public Builder password(String password) {
+ this.password = password;
+ return this;
+ }
+
+ public Builder databaseList(List<String> databases) {
+ this.databaseList = databases;
+ return this;
+ }
+
+ public Builder collectionList(List<String> collections) {
+ this.collectionList = collections;
+ return this;
+ }
+
+ public Builder batchSize(int batchSize) {
+ checkArgument(batchSize >= 0);
+ this.batchSize = batchSize;
+ return this;
+ }
+
+ public Builder pollAwaitTimeMillis(int pollAwaitTimeMillis) {
+ checkArgument(pollAwaitTimeMillis > 0);
+ this.pollAwaitTimeMillis = pollAwaitTimeMillis;
+ return this;
+ }
+
+ public Builder pollMaxBatchSize(int pollMaxBatchSize) {
+ checkArgument(pollMaxBatchSize > 0);
+ this.pollMaxBatchSize = pollMaxBatchSize;
+ return this;
+ }
+
+ public Builder startupOptions(StartupConfig startupOptions) {
+ this.startupOptions = Objects.requireNonNull(startupOptions);
+ if (startupOptions.getStartupMode() != StartupMode.INITIAL
+ && startupOptions.getStartupMode() != StartupMode.TIMESTAMP) {
+ throw new MongodbConnectorException(
+ ILLEGAL_ARGUMENT,
+ "Unsupported startup mode " + startupOptions.getStartupMode());
+ }
+ return this;
+ }
+
+ public Builder stopOptions(StopConfig stopOptions) {
+ this.stopOptions = Objects.requireNonNull(stopOptions);
+ if (stopOptions.getStopMode() != StopMode.NEVER) {
+ throw new MongodbConnectorException(
+ ILLEGAL_ARGUMENT,
+ String.format("The %s mode is not supported.", stopOptions.getStopMode()));
+ }
+ return this;
+ }
+
+ public Builder heartbeatIntervalMillis(int heartbeatIntervalMillis) {
+ checkArgument(heartbeatIntervalMillis >= 0);
+ this.heartbeatIntervalMillis = heartbeatIntervalMillis;
+ return this;
+ }
+
+ public Builder splitSizeMB(int splitSizeMB) {
+ checkArgument(splitSizeMB > 0);
+ this.splitSizeMB = splitSizeMB;
+ return this;
+ }
+
+ public Builder splitMetaGroupSize(int splitMetaGroupSize) {
+ this.splitMetaGroupSize = splitMetaGroupSize;
+ return this;
+ }
+
+ public Builder validate() {
+ checkNotNull(hosts, "hosts must be provided");
+ return this;
+ }
+
+ @Override
+ public MongodbSourceConfig create(int subtask) {
+ boolean updateLookup = true;
+ return new MongodbSourceConfig(
+ hosts,
+ username,
+ password,
+ databaseList,
+ collectionList,
+ connectionOptions,
+ batchSize,
+ pollAwaitTimeMillis,
+ pollMaxBatchSize,
+ updateLookup,
+ startupOptions,
+ stopOptions,
+ heartbeatIntervalMillis,
+ splitMetaGroupSize,
+ splitSizeMB);
+ }
+ }
+}
diff --git a/seatunnel-connectors-v2/connector-cdc/connector-cdc-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/cdc/mongodb/config/MongodbSourceOptions.java b/seatunnel-connectors-v2/connector-cdc/connector-cdc-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/cdc/mongodb/config/MongodbSourceOptions.java
new file mode 100644
index 000000000000..170bef34e961
--- /dev/null
+++ b/seatunnel-connectors-v2/connector-cdc/connector-cdc-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/cdc/mongodb/config/MongodbSourceOptions.java
@@ -0,0 +1,258 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.seatunnel.connectors.seatunnel.cdc.mongodb.config;
+
+import org.apache.seatunnel.api.configuration.Option;
+import org.apache.seatunnel.api.configuration.Options;
+import org.apache.seatunnel.api.configuration.SingleChoiceOption;
+import org.apache.seatunnel.connectors.cdc.base.option.SourceOptions;
+import org.apache.seatunnel.connectors.cdc.base.option.StartupMode;
+import org.apache.seatunnel.connectors.cdc.base.option.StopMode;
+
+import org.bson.BsonDouble;
+import org.bson.json.JsonMode;
+import org.bson.json.JsonWriterSettings;
+
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+public class MongodbSourceOptions extends SourceOptions {
+
+ public static final String ENCODE_VALUE_FIELD = "_value";
+
+ public static final String CLUSTER_TIME_FIELD = "clusterTime";
+
+ public static final String TS_MS_FIELD = "ts_ms";
+
+ public static final String SOURCE_FIELD = "source";
+
+ public static final String SNAPSHOT_FIELD = "snapshot";
+
+ public static final String FALSE_FALSE = "false";
+
+ public static final String OPERATION_TYPE_INSERT = "insert";
+
+ public static final String SNAPSHOT_TRUE = "true";
+
+ public static final String ID_FIELD = "_id";
+
+ public static final String DOCUMENT_KEY = "documentKey";
+
+ public static final String NS_FIELD = "ns";
+
+ public static final String OPERATION_TYPE = "operationType";
+
+ public static final String TIMESTAMP_FIELD = "timestamp";
+
+ public static final String RESUME_TOKEN_FIELD = "resumeToken";
+
+ public static final String FULL_DOCUMENT = "fullDocument";
+
+ public static final String DB_FIELD = "db";
+
+ public static final String COLL_FIELD = "coll";
+
+ public static final int FAILED_TO_PARSE_ERROR = 9;
+
+ public static final int UNAUTHORIZED_ERROR = 13;
+
+ public static final int ILLEGAL_OPERATION_ERROR = 20;
+
+ public static final int UNKNOWN_FIELD_ERROR = 40415;
+
+ public static final String DROPPED_FIELD = "dropped";
+
+ public static final String MAX_FIELD = "max";
+
+ public static final String MIN_FIELD = "min";
+
+ public static final String ADD_NS_FIELD_NAME = "_ns_";
+
+ public static final String UUID_FIELD = "uuid";
+
+ public static final String SHARD_FIELD = "shard";
+
+ public static final String DIALECT_NAME = "MongoDB";
+
+ public static final BsonDouble COMMAND_SUCCEED_FLAG = new BsonDouble(1.0d);
+
+ public static final JsonWriterSettings DEFAULT_JSON_WRITER_SETTINGS =
+ JsonWriterSettings.builder().outputMode(JsonMode.EXTENDED).build();
+
+ public static final String OUTPUT_SCHEMA =
+ "{"
+ + " \"name\": \"ChangeStream\","
+ + " \"type\": \"record\","
+ + " \"fields\": ["
+ + " { \"name\": \"_id\", \"type\": \"string\" },"
+ + " { \"name\": \"operationType\", \"type\": [\"string\", \"null\"] },"
+ + " { \"name\": \"fullDocument\", \"type\": [\"string\", \"null\"] },"
+ + " { \"name\": \"source\","
+ + " \"type\": [{\"name\": \"source\", \"type\": \"record\", \"fields\": ["
+ + " {\"name\": \"ts_ms\", \"type\": \"long\"},"
+ + " {\"name\": \"table\", \"type\": [\"string\", \"null\"]},"
+ + " {\"name\": \"db\", \"type\": [\"string\", \"null\"]},"
+ + " {\"name\": \"snapshot\", \"type\": [\"string\", \"null\"] } ]"
+ + " }, \"null\" ] },"
+ + " { \"name\": \"ts_ms\", \"type\": [\"long\", \"null\"]},"
+ + " { \"name\": \"ns\","
+ + " \"type\": [{\"name\": \"ns\", \"type\": \"record\", \"fields\": ["
+ + " {\"name\": \"db\", \"type\": \"string\"},"
+ + " {\"name\": \"coll\", \"type\": [\"string\", \"null\"] } ]"
+ + " }, \"null\" ] },"
+ + " { \"name\": \"to\","
+ + " \"type\": [{\"name\": \"to\", \"type\": \"record\", \"fields\": ["
+ + " {\"name\": \"db\", \"type\": \"string\"},"
+ + " {\"name\": \"coll\", \"type\": [\"string\", \"null\"] } ]"
+ + " }, \"null\" ] },"
+ + " { \"name\": \"documentKey\", \"type\": [\"string\", \"null\"] },"
+ + " { \"name\": \"updateDescription\","
+ + " \"type\": [{\"name\": \"updateDescription\", \"type\": \"record\", \"fields\": ["
+ + " {\"name\": \"updatedFields\", \"type\": [\"string\", \"null\"]},"
+ + " {\"name\": \"removedFields\","
+ + " \"type\": [{\"type\": \"array\", \"items\": \"string\"}, \"null\"]"
+ + " }] }, \"null\"] },"
+ + " { \"name\": \"clusterTime\", \"type\": [\"string\", \"null\"] },"
+ + " { \"name\": \"txnNumber\", \"type\": [\"long\", \"null\"]},"
+ + " { \"name\": \"lsid\", \"type\": [{\"name\": \"lsid\", \"type\": \"record\","
+ + " \"fields\": [ {\"name\": \"id\", \"type\": \"string\"},"
+ + " {\"name\": \"uid\", \"type\": \"string\"}] }, \"null\"] }"
+ + " ]"
+ + "}";
+
+ public static final Option<String> HOSTS =
+ Options.key("hosts")
+ .stringType()
+ .noDefaultValue()
+ .withDescription(
+ "The comma-separated list of hostname and port pairs of the MongoDB servers. "
+ + "eg. localhost:27017,localhost:27018");
+
+ public static final Option<String> USERNAME =
+ Options.key("username")
+ .stringType()
+ .noDefaultValue()
+ .withDescription(
+ "Name of the database user to be used when connecting to MongoDB. "
+ + "This is required only when MongoDB is configured to use authentication.");
+
+ public static final Option<String> PASSWORD =
+ Options.key("password")
+ .stringType()
+ .noDefaultValue()
+ .withDescription(
+ "Password to be used when connecting to MongoDB. "
+ + "This is required only when MongoDB is configured to use authentication.");
+
+ public static final Option<List<String>> DATABASE =
+ Options.key("database")
+ .listType()
+ .noDefaultValue()
+ .withDescription("Name of the database to watch for changes.");
+
+ public static final Option<List<String>> COLLECTION =
+ Options.key("collection")
+ .listType()
+ .noDefaultValue()
+ .withDescription(
+ "Name of the collection in the database to watch for changes.");
+
+ public static final Option<String> CONNECTION_OPTIONS =
+ Options.key("connection.options")
+ .stringType()
+ .noDefaultValue()
+ .withDescription(
+ "The ampersand-separated MongoDB connection options. "
+ + "eg. replicaSet=test&connectTimeoutMS=300000");
+
+ public static final Option<Integer> BATCH_SIZE =
+ Options.key("batch.size")
+ .intType()
+ .defaultValue(1024)
+ .withDescription("The cursor batch size. Defaults to 1024.");
+
+ public static final Option<Integer> POLL_MAX_BATCH_SIZE =
+ Options.key("poll.max.batch.size")
+ .intType()
+ .defaultValue(1024)
+ .withDescription(
+ "Maximum number of change stream documents "
+ + "to include in a single batch when polling for new data. "
+ + "This setting can be used to limit the amount of data buffered internally in the connector. "
+ + "Defaults to 1024.");
+
+ public static final Option<Integer> POLL_AWAIT_TIME_MILLIS =
+ Options.key("poll.await.time.ms")
+ .intType()
+ .defaultValue(1000)
+ .withDescription(
+ "The amount of time to wait before checking for new results on the change stream. "
+ + "Defaults to 1000.");
+
+ public static final Option<Integer> HEARTBEAT_INTERVAL_MILLIS =
+ Options.key("heartbeat.interval.ms")
+ .intType()
+ .defaultValue(0)
+ .withDescription(
+ "The length of time in milliseconds between sending heartbeat messages. "
+ + "Heartbeat messages contain the post batch resume token and are sent when no source records "
+ + "have been published in the specified interval. This improves the resumability of the connector "
+ + "for low volume namespaces. Use 0 to disable. Defaults to 0.");
+
+ public static final Option<Integer> INCREMENTAL_SNAPSHOT_CHUNK_SIZE_MB =
+ Options.key("incremental.snapshot.chunk.size.mb")
+ .intType()
+ .defaultValue(64)
+ .withDescription(
+ "The chunk size (in MB) of the incremental snapshot. Defaults to 64 MB.");
+
+ public static final Option