From 968cdf8d26bb107d41bb8e4dac72ce041a417a61 Mon Sep 17 00:00:00 2001 From: cooper-lzy <78672629+cooper-lzy@users.noreply.github.com> Date: Thu, 19 Oct 2023 10:39:43 +0800 Subject: [PATCH] Importer 4.1.0 update (#2286) --- docs-2.0/nebula-importer/use-importer.md | 46 +++++++++++++++++++++--- mkdocs.yml | 6 ++-- 2 files changed, 45 insertions(+), 7 deletions(-) diff --git a/docs-2.0/nebula-importer/use-importer.md b/docs-2.0/nebula-importer/use-importer.md index 08530cc282a..f1def2991f3 100644 --- a/docs-2.0/nebula-importer/use-importer.md +++ b/docs-2.0/nebula-importer/use-importer.md @@ -1,11 +1,13 @@ # NebulaGraph Importer -NebulaGraph Importer (Importer) is a standalone tool for importing data from CSV files into NebulaGraph. Importer can read and import CSV file data from multiple data sources. +NebulaGraph Importer (Importer) is a standalone tool for importing data from CSV files into NebulaGraph. Importer can read and batch import CSV file data from multiple data sources, and also supports batch update and delete operations. ## Features -- Support multiple data sources, including local, S3, OSS, HDFS, FTP, and SFTP. +- Support multiple data sources, including local, S3, OSS, HDFS, FTP, SFTP, and GCS. - Support importing data from CSV format files. A single file can contain multiple tags, multiple edge types or a mix of both. +- Support filtering the data from source. +- Support batch operation, including insert, update, delete. - Support connecting to multiple Graph services simultaneously for importing and dynamic load balancing. - Support reconnect or retry after failure. - Support displaying statistics in multiple dimensions, including import time, import percentage, etc. Support for printing statistics in Console or logs. @@ -226,7 +228,7 @@ log: level: INFO console: true files: - - logs/nebula-importer.log + - logs/nebula-importer.log ``` |Parameter|Default value|Required|Description| @@ -275,7 +277,33 @@ sources: # - hdfs: # address: "127.0.0.1:8020" # Required. The address of HDFS service. # user: "hdfs" # Optional. The user of HDFS service. -# path: "/events/20190918.export.csv" # Required. The path of file in the HDFS service. +# servicePrincipalName: # Optional. The name of the Kerberos service instance for the HDFS service when Kerberos authentication is enabled. +# krb5ConfigFile: # Optional. The path to the Kerberos configuration file for the HDFS service when Kerberos authentication is enabled. Defaults to `/etc/krb5.conf`. +# ccacheFile: # Optional. The path to the Kerberos ccache file for the HDFS service when Kerberos authentication is enabled. +# keyTabFile: # Optional. The path to the Kerberos keytab file for the HDFS service when Kerberos authentication is enabled. +# password: # Optional. The Kerberos password for the HDFS service when Kerberos authentication is enabled. +# dataTransferProtection: # Optional. The type of transport encryption when Kerberos authentication is enabled. Optional values are `authentication`, `integrity`, `privacy`. +# disablePAFXFAST: false # Optional. Whether to disable the use of PA_FX_FAST for clients. +# path: "/events/20190918.export.csv" # Required. The path to the file in the HDFS service. Wildcard filenames are also supported, e.g. `/events/*.export.csv`, make sure all matching files have the same schema. +# - gcs: # Google Cloud Storage +# bucket: chicago-crime-sample # Required. The name of the bucket in the GCS service. +# key: stats/000000000000.csv # Required. The path to the file in the GCS service. 
+# withoutAuthentication: false # Optional. Whether to anonymize access. Defaults to false, which means access with credentials. +# # When using credentials access, one of the credentialsFile and credentialsJSON parameters is sufficient. +# credentialsFile: "/path/to/your/credentials/file" # Optional. The path to the credentials file for the GCS service. +# credentialsJSON: '{ # Optional. The JSON content of the credentials for the GCS service. +# "type": "service_account", +# "project_id": "your-project-id", +# "private_key_id": "key-id", +# "private_key": "-----BEGIN PRIVATE KEY-----\nxxxxx\n-----END PRIVATE KEY-----\n", +# "client_email": "your-client@your-project-id.iam.gserviceaccount.com", +# "client_id": "client-id", +# "auth_uri": "https://accounts.google.com/o/oauth2/auth", +# "token_uri": "https://oauth2.googleapis.com/token", +# "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs", +# "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/your-client%40your-project-id.iam.gserviceaccount.com", +# "universe_domain": "googleapis.com" +# }' batch: 256 csv: delimiter: "|" @@ -283,6 +311,9 @@ sources: lazyQuotes: false tags: - name: Person +# mode: INSERT +# filter: +# expr: Record[1] == "XXX" id: type: "STRING" function: "hash" @@ -321,6 +352,9 @@ sources: batch: 256 edges: - name: KNOWS # person_knows_person +# mode: INSERT +# filter: +# expr: Record[1] == "XXX" src: id: type: "STRING" @@ -360,6 +394,8 @@ The configuration mainly includes the following parts: |`sources.csv.withHeader` |`false`| No | Whether to ignore the first record in the CSV file. | |`sources.csv.lazyQuotes` |`false`| No | Whether to allow lazy quotes. If `lazyQuotes` is true, a quote may appear in an unquoted field and a non-doubled quote may appear in a quoted field. | |`sources.tags.name` |-| Yes | The tag name. | +|`sources.tags.mode` |`INSERT`| No | Batch operation types, including insert, update and delete. Optional values are `INSERT`, `UPDATE` and `DELETE`. | +|`sources.tags.filter.expr` |-| No | Filter the data and only import if the filter conditions are met.
Supported comparison operators are `==`, `!=`, `<`, `>`, `<=`, and `>=`.
Supported logical operators are `not` (!), `and` (&&), and `or` (\|\|).
For example `(Record[0] == "Mahinda" or Record[0] == "Michael") and Record[3] == "male"`. | |`sources.tags.id.type` |`STRING`| No | The type of the VID. | |`sources.tags.id.function` |-| No | Functions to generate the VID. Currently, only function `hash` are supported. | |`sources.tags.id.index` |-| No | The column number corresponding to the VID in the data file. If `sources.tags.id.concatItems` is not configured, this parameter must be configured. | @@ -373,6 +409,8 @@ The configuration mainly includes the following parts: |`sources.tags.props.alternativeIndices` |-| No | Ignored when `nullable` is `false`. The property is fetched from records according to the indices in order until not equal to `nullValue`. | |`sources.tags.props.defaultValue` |-| No | Ignored when `nullable` is `false`. The property default value, when all the values obtained by `index` and `alternativeIndices` are `nullValue`. | |`sources.edges.name` |-| Yes | The edge type name. | +|`sources.edges.mode` |`INSERT`| No | Batch operation types, including insert, update and delete. Optional values are `INSERT`, `UPDATE` and `DELETE`. | +|`sources.edges.filter.expr` |-| No | Filter the data and only import if the filter conditions are met.
Supported comparison operators are `==`, `!=`, `<`, `>`, `<=`, and `>=`.
Supported logical operators are `not` (!), `and` (&&), and `or` (\|\|).
For example `(Record[0] == "Mahinda" or Record[0] == "Michael") and Record[3] == "male"`. | |`sources.edges.src.id.type` |`STRING`| No | The data type of the VID at the starting vertex on the edge. | |`sources.edges.src.id.index` |-| Yes | The column number in the data file corresponding to the VID at the starting vertex on the edge. | |`sources.edges.dst.id.type` |`STRING`| No | The data type of the VID at the destination vertex on the edge. | diff --git a/mkdocs.yml b/mkdocs.yml index 9c763ce3614..788ff9c1858 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -189,9 +189,9 @@ extra: branch: release-3.6 tag: v3.6.0 importer: - release: 4.0.0 - branch: release-4.0 - tag: v4.0.0 + release: 4.1.0 + branch: release-4.1 + tag: v4.1.0 algorithm: release: 3.0.0 branch: v3.0.0
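
As a reading aid for this patch, below is a minimal sketch of a tag mapping with the new `mode` and `filter` options enabled (uncommented). It is assembled only from the parameters documented above; the file path, column indices, and property name are illustrative placeholders, not values taken from the patch.

```yaml
sources:
  - path: ./person.csv            # illustrative local CSV path
    csv:
      delimiter: "|"
      withHeader: false
    tags:
      - name: Person
        mode: INSERT              # batch operation type: INSERT, UPDATE, or DELETE
        filter:
          # Only records matching the expression are imported.
          expr: (Record[0] == "Mahinda" or Record[0] == "Michael") and Record[3] == "male"
        id:
          type: "STRING"
          index: 0
        props:
          - name: "firstName"     # illustrative property mapping
            type: "STRING"
            index: 1
```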