Importer 4.1.0 update (#2286)
cooper-lzy authored Oct 19, 2023
1 parent 7edc85e commit 968cdf8
Showing 2 changed files with 45 additions and 7 deletions.
46 changes: 42 additions & 4 deletions docs-2.0/nebula-importer/use-importer.md
@@ -1,11 +1,13 @@
# NebulaGraph Importer

NebulaGraph Importer (Importer) is a standalone tool for importing data from CSV files into NebulaGraph. Importer can read and import CSV file data from multiple data sources.
NebulaGraph Importer (Importer) is a standalone tool for importing data from CSV files into NebulaGraph. Importer can read CSV data from multiple data sources and import it in batches, and it also supports batch update and delete operations.

## Features

- Support multiple data sources, including local, S3, OSS, HDFS, FTP, and SFTP.
- Support multiple data sources, including local, S3, OSS, HDFS, FTP, SFTP, and GCS.
- Support importing data from CSV format files. A single file can contain multiple tags, multiple edge types or a mix of both.
- Support filtering the data from source.
- Support batch operations, including insert, update, and delete.
- Support connecting to multiple Graph services simultaneously for importing and dynamic load balancing.
- Support reconnect or retry after failure.
- Support displaying statistics in multiple dimensions, such as import time and import percentage. Statistics can be printed to the console or to logs.
@@ -226,7 +228,7 @@ log:
level: INFO
console: true
files:
- logs/nebula-importer.log
- logs/nebula-importer.log
```

|Parameter|Default value|Required|Description|
@@ -275,14 +277,43 @@ sources:
# - hdfs:
# address: "127.0.0.1:8020" # Required. The address of HDFS service.
# user: "hdfs" # Optional. The user of HDFS service.
# path: "/events/20190918.export.csv" # Required. The path of file in the HDFS service.
# servicePrincipalName: <Kerberos Service Principal Name> # Optional. The name of the Kerberos service instance for the HDFS service when Kerberos authentication is enabled.
# krb5ConfigFile: <Kerberos config file> # Optional. The path to the Kerberos configuration file for the HDFS service when Kerberos authentication is enabled. Defaults to `/etc/krb5.conf`.
# ccacheFile: <Kerberos ccache file> # Optional. The path to the Kerberos ccache file for the HDFS service when Kerberos authentication is enabled.
# keyTabFile: <Kerberos keytab file> # Optional. The path to the Kerberos keytab file for the HDFS service when Kerberos authentication is enabled.
# password: <Kerberos password> # Optional. The Kerberos password for the HDFS service when Kerberos authentication is enabled.
# dataTransferProtection: <Kerberos Data Transfer Protection> # Optional. The type of transport encryption when Kerberos authentication is enabled. Valid values are `authentication`, `integrity`, and `privacy`.
# disablePAFXFAST: false # Optional. Whether to disable the use of PA_FX_FAST for clients.
# path: "/events/20190918.export.csv" # Required. The path to the file in the HDFS service. Wildcard filenames are also supported, e.g. `/events/*.export.csv`, make sure all matching files have the same schema.
# - gcs: # Google Cloud Storage
# bucket: chicago-crime-sample # Required. The name of the bucket in the GCS service.
# key: stats/000000000000.csv # Required. The path to the file in the GCS service.
# withoutAuthentication: false # Optional. Whether to access anonymously. Defaults to false, which means access with credentials.
# # When accessing with credentials, providing either credentialsFile or credentialsJSON is sufficient.
# credentialsFile: "/path/to/your/credentials/file" # Optional. The path to the credentials file for the GCS service.
# credentialsJSON: '{ # Optional. The JSON content of the credentials for the GCS service.
# "type": "service_account",
# "project_id": "your-project-id",
# "private_key_id": "key-id",
# "private_key": "-----BEGIN PRIVATE KEY-----\nxxxxx\n-----END PRIVATE KEY-----\n",
# "client_email": "your-client@your-project-id.iam.gserviceaccount.com",
# "client_id": "client-id",
# "auth_uri": "https://accounts.google.com/o/oauth2/auth",
# "token_uri": "https://oauth2.googleapis.com/token",
# "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
# "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/your-client%40your-project-id.iam.gserviceaccount.com",
# "universe_domain": "googleapis.com"
# }'
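# # A sketch of the same GCS source accessed anonymously, assuming the bucket is publicly readable; in this case neither credentialsFile nor credentialsJSON is needed.
# - gcs:
# bucket: chicago-crime-sample
# key: stats/000000000000.csv
# withoutAuthentication: true # Anonymous access.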
batch: 256
csv:
delimiter: "|"
withHeader: false
lazyQuotes: false
tags:
- name: Person
# mode: INSERT
# filter:
# expr: Record[1] == "XXX"
id:
type: "STRING"
function: "hash"
@@ -321,6 +352,9 @@ sources:
batch: 256
edges:
- name: KNOWS # person_knows_person
# mode: INSERT
# filter:
# expr: Record[1] == "XXX"
src:
id:
type: "STRING"
@@ -360,6 +394,8 @@ The configuration mainly includes the following parts:
|`sources.csv.withHeader` |`false`| No | Whether to ignore the first record in the CSV file. |
|`sources.csv.lazyQuotes` |`false`| No | Whether to allow lazy quotes. If `lazyQuotes` is true, a quote may appear in an unquoted field and a non-doubled quote may appear in a quoted field. |
|`sources.tags.name` |-| Yes | The tag name. |
|`sources.tags.mode` |`INSERT`| No | The batch operation type. Valid values are `INSERT`, `UPDATE`, and `DELETE`. |
|`sources.tags.filter.expr` |-| No | Filters the data so that only records meeting the filter conditions are imported. </br>Supported comparison operators are `==`, `!=`, `<`, `>`, `<=`, and `>=`. </br>Supported logical operators are `not` (!), `and` (&&), and `or` (\|\|). </br>For example, `(Record[0] == "Mahinda" or Record[0] == "Michael") and Record[3] == "male"`. </br>See the sketch after this parameter table for an example that combines `mode` and `filter`. |
|`sources.tags.id.type` |`STRING`| No | The type of the VID. |
|`sources.tags.id.function` |-| No | The function used to generate the VID. Currently, only the `hash` function is supported. |
|`sources.tags.id.index` |-| No | The column number corresponding to the VID in the data file. If `sources.tags.id.concatItems` is not configured, this parameter must be configured. |
@@ -373,6 +409,8 @@ The configuration mainly includes the following parts:
|`sources.tags.props.alternativeIndices` |-| No | Ignored when `nullable` is `false`. The property is fetched from records according to the indices in order until not equal to `nullValue`. |
|`sources.tags.props.defaultValue` |-| No | Ignored when `nullable` is `false`. The property default value, when all the values obtained by `index` and `alternativeIndices` are `nullValue`. |
|`sources.edges.name` |-| Yes | The edge type name. |
|`sources.edges.mode` |`INSERT`| No | The batch operation type. Valid values are `INSERT`, `UPDATE`, and `DELETE`. |
|`sources.edges.filter.expr` |-| No | Filters the data so that only records meeting the filter conditions are imported. </br>Supported comparison operators are `==`, `!=`, `<`, `>`, `<=`, and `>=`. </br>Supported logical operators are `not` (!), `and` (&&), and `or` (\|\|). </br>For example, `(Record[0] == "Mahinda" or Record[0] == "Michael") and Record[3] == "male"`. |
|`sources.edges.src.id.type` |`STRING`| No | The data type of the VID at the starting vertex on the edge. |
|`sources.edges.src.id.index` |-| Yes | The column number in the data file corresponding to the VID at the starting vertex on the edge. |
|`sources.edges.dst.id.type` |`STRING`| No | The data type of the VID at the destination vertex on the edge. |
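The `mode` and `filter` parameters described above can be combined in a single tag source. The following is a minimal sketch that assumes a local CSV file, a `Person` tag keyed by the first column, and one string property in the second column; the property name, column indices, and file path are assumptions, and the other sections of the configuration file (such as the `log` section shown earlier) are omitted.

```yaml
sources:
  - path: ./person.csv          # Assumed local CSV file.
    batch: 256
    csv:
      delimiter: ","
      withHeader: false
    tags:
      - name: Person
        mode: UPDATE            # Batch-update existing Person vertices instead of inserting new ones.
        filter:
          expr: Record[3] == "male"   # Import only records whose fourth column equals "male".
        id:
          type: "STRING"
          index: 0              # Assumed VID column.
        props:
          - name: "firstName"   # Assumed property name.
            type: "STRING"
            index: 1            # Assumed property column.
```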
6 changes: 3 additions & 3 deletions mkdocs.yml
@@ -189,9 +189,9 @@ extra:
branch: release-3.6
tag: v3.6.0
importer:
release: 4.0.0
branch: release-4.0
tag: v4.0.0
release: 4.1.0
branch: release-4.1
tag: v4.1.0
algorithm:
release: 3.0.0
branch: v3.0.0
