vesoft-inc · Nicole00 · Dec 23, 2021 · Dec 16, 2021 · Dec 16, 2021 · Dec 22, 2021
diff --git a/nebula-exchange/pom.xml b/nebula-exchange/pom.xml
@@ -37,6 +37,7 @@
         <commons-codec.version>1.14</commons-codec.version>
         <hadoop.version>2.6.1</hadoop.version>
         <hbase.version>1.2.0</hbase.version>
+        <kafka.version>2.0.0</kafka.version>
     </properties>
 
     <build>
@@ -132,7 +133,7 @@
                             <createDependencyReducedPom>false</createDependencyReducedPom>
                             <artifactSet>
                                 <excludes>
-                                    <exclude>org.apache.spark:*</exclude>
+                                    <!--<exclude>org.apache.spark:*</exclude>-->
                                     <exclude>org.apache.hadoop:*</exclude>
                                     <exclude>org.apache.hive:*</exclude>
                                     <exclude>log4j:log4j</exclude>
@@ -254,6 +255,17 @@
     </build>
 
     <dependencies>
+        <dependency>
+            <groupId>org.apache.spark</groupId>
+            <artifactId>spark-sql-kafka-0-10_2.11</artifactId>
+            <version>${spark.version}</version>
+            <exclusions>
+                <exclusion>
+                    <groupId>org.apache.spark</groupId>
+                    <artifactId>*</artifactId>
+                </exclusion>
+            </exclusions>
+        </dependency>
         <dependency>
             <groupId>io.streamnative.connectors</groupId>
             <artifactId>pulsar-spark-connector_2.11</artifactId>
@@ -263,6 +275,7 @@
             <groupId>org.apache.spark</groupId>
             <artifactId>spark-core_2.11</artifactId>
             <version>${spark.version}</version>
+            <scope>provided</scope>
             <exclusions>
                 <exclusion>
                     <artifactId>snappy-java</artifactId>
@@ -362,6 +375,7 @@
             <groupId>org.apache.spark</groupId>
             <artifactId>spark-sql_2.11</artifactId>
             <version>${spark.version}</version>
+            <scope>provided</scope>
             <exclusions>
                 <exclusion>
                     <artifactId>snappy-java</artifactId>
@@ -401,11 +415,13 @@
             <groupId>org.apache.spark</groupId>
             <artifactId>spark-catalyst_2.11</artifactId>
             <version>${spark.version}</version>
+            <scope>provided</scope>
         </dependency>
         <dependency>
             <groupId>org.apache.spark</groupId>
             <artifactId>spark-hive_2.11</artifactId>
             <version>${spark.version}</version>
+            <scope>provided</scope>
             <exclusions>
                 <exclusion>
                     <artifactId>commons-codec</artifactId>
@@ -455,12 +471,17 @@
                     <artifactId>commons-io</artifactId>
                     <groupId>commons-io</groupId>
                 </exclusion>
+                <exclusion>
+                    <artifactId>hive-metastore</artifactId>
+                    <groupId>org.spark-project.hive</groupId>
+                </exclusion>
             </exclusions>
         </dependency>
         <dependency>
             <groupId>org.apache.spark</groupId>
             <artifactId>spark-yarn_2.11</artifactId>
             <version>${spark.version}</version>
+            <scope>provided</scope>
             <exclusions>
                 <exclusion>
                     <artifactId>guava</artifactId>

diff --git a/nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/config/Configs.scala b/nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/config/Configs.scala
@@ -249,6 +249,7 @@ object Configs {
   private[this] val DEFAULT_LOCAL_PATH           = None
   private[this] val DEFAULT_REMOTE_PATH          = None
   private[this] val DEFAULT_STREAM_INTERVAL      = 30
+  private[this] val DEFAULT_KAFKA_STARTINGOFFSETS      = "latest"
   private[this] val DEFAULT_PARALLEL             = 1
 
   /**
@@ -659,10 +660,18 @@ object Configs {
         val intervalSeconds =
           if (config.hasPath("interval.seconds")) config.getInt("interval.seconds")
           else DEFAULT_STREAM_INTERVAL
+        val startingOffsets =
+          if (config.hasPath("startingOffsets")) config.getString("startingOffsets")
+          else DEFAULT_KAFKA_STARTINGOFFSETS
+        val maxOffsetsPerTrigger =
+          if (config.hasPath("maxOffsetsPerTrigger")) Some(config.getLong("maxOffsetsPerTrigger"))
+          else None
         KafkaSourceConfigEntry(SourceCategory.KAFKA,
                                intervalSeconds,
                                config.getString("service"),
-                               config.getString("topic"))
+                               config.getString("topic"),
+                               startingOffsets,
+                               maxOffsetsPerTrigger)
       case SourceCategory.PULSAR =>
         val options =
           config.getObject("options").unwrapped.asScala.map(x => x._1 -> x._2.toString).toMap

diff --git a/nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/config/SourceConfigs.scala b/nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/config/SourceConfigs.scala
@@ -162,16 +162,20 @@ case class MySQLSourceConfigEntry(override val category: SourceCategory.Value,
   *
   * @param server
   * @param topic
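+  * @param startingOffsets      value of the "startingOffsets" setting; Configs supplies "latest" when the key is unset
+  * @param maxOffsetsPerTrigger optional value of the "maxOffsetsPerTrigger" setting; None when the key is absent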
   */
 case class KafkaSourceConfigEntry(override val category: SourceCategory.Value,
                                   override val intervalSeconds: Int,
                                   server: String,
-                                  topic: String)
+                                  topic: String,
+                                  startingOffsets: String, // no default here; the caller must always pass a value
+                                  maxOffsetsPerTrigger: Option[Long]=None) // defaults to None, so callers that omit it still compile
     extends StreamingDataSourceConfigEntry {
   require(server.trim.nonEmpty && topic.trim.nonEmpty)
 
   override def toString: String = {
-    s"Kafka source server: ${server} topic:${topic}"
+    s"Kafka source server: ${server} topic:${topic} startingOffsets:${startingOffsets} maxOffsetsPerTrigger:${maxOffsetsPerTrigger}"
   }
 }
 

diff --git a/nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/processor/EdgeProcessor.scala b/nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/processor/EdgeProcessor.scala
@@ -274,49 +274,77 @@ class EdgeProcessor(data: DataFrame,
           }
         }
     } else {
+      val streamFlag = data.isStreaming
       val edgeFrame = data
+        .filter { row => // validate src/dst ids and rank; a bad row is reported via printChoice (log when streaming, assert otherwise) and filtered out
+          val sourceField = if (!edgeConfig.isGeo) {
+            val sourceIndex = row.schema.fieldIndex(edgeConfig.sourceField)
+            if (sourceIndex < 0 || row.isNullAt(sourceIndex)) {
+              printChoice(streamFlag, s"source vertexId must exist and cannot be null, your row data is $row")
+              None
+            } else Some(row.get(sourceIndex).toString)
+          } else {
+            val lat = row.getDouble(row.schema.fieldIndex(edgeConfig.latitude.get)) // NOTE(review): no null guard on latitude/longitude here — confirm they can never be null
+            val lng = row.getDouble(row.schema.fieldIndex(edgeConfig.longitude.get))
+            Some(indexCells(lat, lng).mkString(","))
+          }
+
+          val sourceFlag = sourceField.isDefined
+          val sourcePolicyFlag = if (sourceFlag && edgeConfig.sourcePolicy.isEmpty && !isVidStringType
+            && !NebulaUtils.isNumic(sourceField.get)) {
+            printChoice(streamFlag, s"space vidType is int, but your srcId ${sourceField.get} is not numeric.your row data is $row") // unwrap the Option so the message shows the id, not Some(id)
+            false
+          } else if (sourceFlag && edgeConfig.sourcePolicy.isDefined && isVidStringType) {
+            printChoice(streamFlag, s"only int vidType can use policy, but your vidType is FIXED_STRING.your row data is $row")
+            false
+          } else true
+
+          val targetIndex = row.schema.fieldIndex(edgeConfig.targetField)
+          val targetFlag = if (targetIndex < 0 || row.isNullAt(targetIndex)) {
+            printChoice(streamFlag, s"target vertexId must exist and cannot be null, your row data is $row")
+            false
+          } else {
+            val targetField = row.get(targetIndex).toString
+            if (edgeConfig.targetPolicy.isEmpty && !isVidStringType && !NebulaUtils.isNumic(targetField)) {
+              printChoice(streamFlag, s"space vidType is int, but your dstId $targetField is not numeric.your row data is $row")
+              false
+            } else if (edgeConfig.targetPolicy.isDefined && isVidStringType) {
+              printChoice(streamFlag, s"only int vidType can use policy, but your vidType is FIXED_STRING.your row data is $row")
+              false
+            } else true
+          }
+
+          val edgeRankFlag = if (edgeConfig.rankingField.isDefined) {
+            val index = row.schema.fieldIndex(edgeConfig.rankingField.get)
+            val rankIsNull = row.isNullAt(index) // a null rank would otherwise NPE on toString instead of being reported like other bad rows
+            if (rankIsNull || !NebulaUtils.isNumic(row.get(index).toString)) { // || short-circuits, so toString only runs on a non-null value
+              printChoice(streamFlag, s"Not support non-Numeric type for ranking field.your row data is $row")
+              false
+            } else true
+          } else true
+          sourceFlag && sourcePolicyFlag && targetFlag && edgeRankFlag // keep the row only when every check passed
+        }
         .map { row =>
           var sourceField = if (!edgeConfig.isGeo) {
             val sourceIndex = row.schema.fieldIndex(edgeConfig.sourceField)
-            assert(sourceIndex >= 0 && !row.isNullAt(sourceIndex),
-                   s"source vertexId must exist and cannot be null, your row data is $row")
             val value = row.get(sourceIndex).toString
             if (value.equals(DEFAULT_EMPTY_VALUE)) "" else value
           } else {
             val lat = row.getDouble(row.schema.fieldIndex(edgeConfig.latitude.get))
             val lng = row.getDouble(row.schema.fieldIndex(edgeConfig.longitude.get))
             indexCells(lat, lng).mkString(",")
           }
-
-          if (edgeConfig.sourcePolicy.isEmpty) {
-            // process string type vid
-            if (isVidStringType) {
-              sourceField = NebulaUtils.escapeUtil(sourceField).mkString("\"", "", "\"")
-            } else {
-              assert(NebulaUtils.isNumic(sourceField),
-                     s"space vidType is int, but your srcId $sourceField is not numeric.")
-            }
-          } else {
-            assert(!isVidStringType,
-                   "only int vidType can use policy, but your vidType is FIXED_STRING.")
+          // process string type vid
+          if (edgeConfig.sourcePolicy.isEmpty && isVidStringType) {
+            sourceField = NebulaUtils.escapeUtil(sourceField).mkString("\"", "", "\"")
           }
 
           val targetIndex = row.schema.fieldIndex(edgeConfig.targetField)
-          assert(targetIndex >= 0 && !row.isNullAt(targetIndex),
-                 s"target vertexId must exist and cannot be null, your row data is $row")
           var targetField = row.get(targetIndex).toString
           if (targetField.equals(DEFAULT_EMPTY_VALUE)) targetField = ""
-          if (edgeConfig.targetPolicy.isEmpty) {
-            // process string type vid
-            if (isVidStringType) {
-              targetField = NebulaUtils.escapeUtil(targetField).mkString("\"", "", "\"")
-            } else {
-              assert(NebulaUtils.isNumic(targetField),
-                     s"space vidType is int, but your dstId $targetField is not numeric.")
-            }
-          } else {
-            assert(!isVidStringType,
-                   "only int vidType can use policy, but your vidType is FIXED_STRING.")
+          // process string type vid
+          if (edgeConfig.targetPolicy.isEmpty && isVidStringType) {
+            targetField = NebulaUtils.escapeUtil(targetField).mkString("\"", "", "\"")
           }
 
           val values = for {
@@ -326,19 +354,20 @@ class EdgeProcessor(data: DataFrame,
           if (edgeConfig.rankingField.isDefined) {
             val index   = row.schema.fieldIndex(edgeConfig.rankingField.get)
             val ranking = row.get(index).toString
-            assert(NebulaUtils.isNumic(ranking), s"Not support non-Numeric type for ranking field")
-
             Edge(sourceField, targetField, Some(ranking.toLong), values)
           } else {
             Edge(sourceField, targetField, None, values)
           }
         }(Encoders.kryo[Edge])
 
       // streaming write
-      if (data.isStreaming) {
+      if (streamFlag) {
         val streamingDataSourceConfig =
           edgeConfig.dataSourceConfigEntry.asInstanceOf[StreamingDataSourceConfigEntry]
-        edgeFrame.writeStream
+        val wStream = edgeFrame.writeStream
+        if (edgeConfig.checkPointPath.isDefined) wStream.option("checkpointLocation", edgeConfig.checkPointPath.get)
+
+        wStream
           .foreachBatch((edges, batchId) => {
             LOG.info(s"${edgeConfig.name} edge start batch ${batchId}.")
             edges.foreachPartition(processEachPartition _)

diff --git a/nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/processor/Processor.scala b/nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/processor/Processor.scala
@@ -5,21 +5,10 @@
 
 package com.vesoft.nebula.exchange.processor
 
-import com.vesoft.nebula.{
-  Coordinate,
-  Date,
-  DateTime,
-  Geography,
-  LineString,
-  NullType,
-  Point,
-  Polygon,
-  PropertyType,
-  Time,
-  Value
-}
+import com.vesoft.nebula.{Coordinate, Date, DateTime, Geography, LineString, NullType, Point, Polygon, PropertyType, Time, Value}
 import com.vesoft.nebula.exchange.utils.NebulaUtils.DEFAULT_EMPTY_VALUE
 import com.vesoft.nebula.exchange.utils.{HDFSUtils, NebulaUtils}
+import org.apache.log4j.Logger
 import org.apache.spark.sql.Row
 import org.apache.spark.sql.types.{IntegerType, LongType, StringType}
 
@@ -33,6 +22,9 @@ import scala.collection.mutable.ListBuffer
   */
 trait Processor extends Serializable {
 
+  @transient
+  private[this] lazy val LOG = Logger.getLogger(this.getClass)
+
   /**
     * process dataframe to vertices or edges
     */
@@ -230,4 +222,9 @@ trait Processor extends Serializable {
       }
     }
   }
+
+  def printChoice(streamFlag: Boolean, context: String): Unit = { // report a problem described by `context`; how it is reported depends on streamFlag
+    if (streamFlag) LOG.info(context) // streamFlag == true: only log the message and return normally
+    else assert(assertion = false, context) // streamFlag == false: always-failing assert, raising AssertionError that carries `context`
+  }
 }
diff --git a/nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/processor/VerticesProcessor.scala b/nebula-exchange/src/main/scala/com/vesoft/nebula/exchange/processor/VerticesProcessor.scala
@@ -229,42 +229,50 @@ class VerticesProcessor(data: DataFrame,
           }
         }
     } else {
+      val streamFlag = data.isStreaming
       val vertices = data
+        .filter { row => // validate the vertex id of each row; a bad row is reported via printChoice (log when streaming, assert otherwise) and filtered out
+          val index = row.schema.fieldIndex(tagConfig.vertexField)
+          if (index < 0 || row.isNullAt(index)) { // NOTE(review): StructType.fieldIndex appears to throw for an unknown field rather than return a negative index — confirm whether `index < 0` can ever hold
+            printChoice(streamFlag, s"vertexId must exist and cannot be null, your row data is $row")
+            false
+          } else {
+            val vertexId = row.get(index).toString
+            // int-type vid without a policy: the id text must be numeric
+            if (tagConfig.vertexPolicy.isEmpty && !isVidStringType && !NebulaUtils.isNumic(vertexId)) {
+              printChoice(streamFlag, s"space vidType is int, but your vertex id $vertexId is not numeric.your row data is $row")
+              false
+            } else if (tagConfig.vertexPolicy.isDefined && isVidStringType) { // a policy combined with a string vid type is rejected
+              printChoice(streamFlag, s"only int vidType can use policy, but your vidType is FIXED_STRING.your row data is $row")
+              false
+            } else true
+          }
+        }
         .map { row =>
-          val vertexID = {
-            val index = row.schema.fieldIndex(tagConfig.vertexField)
-            assert(index >= 0 && !row.isNullAt(index),
-                   s"vertexId must exist and cannot be null, your row data is $row")
-            var value = row.get(index).toString
-            if (value.equals(DEFAULT_EMPTY_VALUE)) { value = "" }
-            if (tagConfig.vertexPolicy.isEmpty) {
-              // process string type vid
-              if (isVidStringType) {
-                NebulaUtils.escapeUtil(value).mkString("\"", "", "\"")
-              } else {
-                // process int type vid
-                assert(NebulaUtils.isNumic(value),
-                       s"space vidType is int, but your vertex id $value is not numeric.")
-                value
-              }
-            } else {
-              assert(!isVidStringType,
-                     "only int vidType can use policy, but your vidType is FIXED_STRING.")
-              value
-            }
+          val index = row.schema.fieldIndex(tagConfig.vertexField)
+          var vertexId = row.get(index).toString
+          if (vertexId.equals(DEFAULT_EMPTY_VALUE)) {
+            vertexId = ""
+          }
+
+          if (tagConfig.vertexPolicy.isEmpty && isVidStringType){
+            vertexId = NebulaUtils.escapeUtil(vertexId).mkString("\"", "", "\"")
           }
 
           val values = for {
             property <- fieldKeys if property.trim.length != 0
           } yield extraValueForClient(row, property, fieldTypeMap)
-          Vertex(vertexID, values)
+          Vertex(vertexId, values)
         }(Encoders.kryo[Vertex])
 
       // streaming write
-      if (data.isStreaming) {
+      if (streamFlag) {
         val streamingDataSourceConfig =
           tagConfig.dataSourceConfigEntry.asInstanceOf[StreamingDataSourceConfigEntry]
-        vertices.writeStream
+        val wStream = vertices.writeStream
+        if (tagConfig.checkPointPath.isDefined) wStream.option("checkpointLocation", tagConfig.checkPointPath.get)
+
+        wStream
           .foreachBatch((vertexSet, batchId) => {
             LOG.info(s"${tagConfig.name} tag start batch ${batchId}.")
             vertexSet.foreachPartition(processEachPartition _)