replace v1 to v2 (pingcap#2264)
shiyuhang0 authored Mar 18, 2022
1 parent e215eb7 commit 1433a28
Showing 15 changed files with 236 additions and 241 deletions.
4 changes: 0 additions & 4 deletions README.md
@@ -130,10 +130,6 @@ spark.sql("use tidb_catalog.${database}")
spark.sql("select count(*) from ${table}").show
```

> **Note:**
>
> If you use TiSpark 2.0+, for spark-submit on Pyspark, `tidbMapDatabase` is still required and `TiExtension` is not supported yet. PingCAP is working on this issue.
## Current Version

```
156 changes: 6 additions & 150 deletions core/src/main/scala/com/pingcap/tispark/TiDBRelation.scala
@@ -15,155 +15,11 @@

package com.pingcap.tispark

import com.pingcap.tikv.TiSession
import com.pingcap.tikv.exception.{TiBatchWriteException, TiClientInternalException}
import com.pingcap.tikv.key.Handle
import com.pingcap.tikv.meta.{TiDAGRequest, TiTableInfo, TiTimestamp}
import com.pingcap.tispark.utils.TiUtil
import com.pingcap.tispark.write.{TiDBOptions, TiDBWriter}
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
import org.apache.spark.sql.execution._
import org.apache.spark.sql.sources.{BaseRelation, InsertableRelation}
import org.apache.spark.sql.tispark.{TiHandleRDD, TiRowRDD}
import org.apache.spark.sql.types.{ArrayType, LongType, Metadata, ObjectType, StructType}
import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode, execution}
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.sources.BaseRelation
import org.apache.spark.sql.types.StructType

import scala.collection.mutable.ListBuffer

case class TiDBRelation(
session: TiSession,
tableRef: TiTableReference,
meta: MetaManager,
var ts: TiTimestamp = null,
options: Option[TiDBOptions] = None)(@transient val sqlContext: SQLContext)
extends BaseRelation
with InsertableRelation {
lazy val table: TiTableInfo = getTableOrThrow(tableRef.databaseName, tableRef.tableName)

override lazy val schema: StructType = TiUtil.getSchemaFromTable(table)
lazy val isTiFlashReplicaAvailable: Boolean = {
// Note:
// - INFORMATION_SCHEMA.TIFLASH_REPLICA is not present in TiKV or PD;
// it is calculated in TiDB and stored in memory.
// - In order to get this helpful information, we have to read it from
// either TiKV or PD and keep it in memory as well.
//
// select * from INFORMATION_SCHEMA.TIFLASH_REPLICA where table_id = $id
// TABLE_SCHEMA, TABLE_NAME, TABLE_ID, REPLICA_COUNT, LOCATION_LABELS, AVAILABLE, PROGRESS
table.getTiflashReplicaInfo != null && table.getTiflashReplicaInfo.isAvailable
}

def getTiFlashReplicaProgress: Double = {
import scala.collection.JavaConversions._
val progress = table.getPartitionInfo.getDefs
.map(partitonDef => session.getPDClient.getTiFlashReplicaProgress(partitonDef.getId))
.sum
progress / table.getPartitionInfo.getDefs.size()
}

override def sizeInBytes: Long = tableRef.sizeInBytes

def logicalPlanToRDD(dagRequest: TiDAGRequest, output: Seq[Attribute]): List[TiRowRDD] = {
import scala.collection.JavaConverters._
val ids = dagRequest.getPrunedPhysicalIds.asScala
var tiRDDs = new ListBuffer[TiRowRDD]
val tiConf = session.getConf
tiConf.setPartitionPerSplit(TiUtil.getPartitionPerSplit(sqlContext))
ids.foreach(id => {
tiRDDs += new TiRowRDD(
dagRequest.copyReqWithPhysicalId(id),
id,
TiUtil.getChunkBatchSize(sqlContext),
tiConf,
output,
tableRef,
session,
sqlContext.sparkSession)
})
tiRDDs.toList
}

def dagRequestToRegionTaskExec(dagRequest: TiDAGRequest, output: Seq[Attribute]): SparkPlan = {
import scala.collection.JavaConverters._
val ids = dagRequest.getPrunedPhysicalIds.asScala
var tiHandleRDDs = new ListBuffer[TiHandleRDD]()
lazy val attributeRef = Seq(
AttributeReference("RegionId", LongType, nullable = false, Metadata.empty)(),
AttributeReference(
"Handles",
ArrayType(ObjectType(classOf[Handle]), containsNull = false),
nullable = false,
Metadata.empty)())

val tiConf = session.getConf
tiConf.setPartitionPerSplit(TiUtil.getPartitionPerSplit(sqlContext))
ids.foreach(id => {
tiHandleRDDs +=
new TiHandleRDD(
dagRequest,
id,
attributeRef,
tiConf,
tableRef,
session,
sqlContext.sparkSession)
})

// TODO: we may optimize by partitioning the result by region.
// https://github.com/pingcap/tispark/issues/1200
val handlePlan = ColumnarCoprocessorRDD(attributeRef, tiHandleRDDs.toList, fetchHandle = true)
execution.ColumnarRegionTaskExec(
handlePlan,
output,
TiUtil.getChunkBatchSize(sqlContext),
dagRequest,
session.getConf,
session.getTimestamp,
session,
sqlContext.sparkSession)
}

override def equals(obj: Any): Boolean =
obj match {
case other: TiDBRelation =>
this.table.equals(other.table)
case _ =>
false
}

override def insert(data: DataFrame, overwrite: Boolean): Unit =
// The SQL interface is forbidden by default
// because TiSpark provides `replace` rather than `insert` semantics
if (session.getConf.isWriteAllowSparkSQL) {
val saveMode = if (overwrite) {
SaveMode.Overwrite
} else {
SaveMode.Append
}
TiDBWriter.write(data, sqlContext, saveMode, options.get)
} else {
throw new TiBatchWriteException(
"SparkSQL entry for tispark write is disabled. Set spark.tispark.write.allow_spark_sql to enable.")
}

override def toString: String = {
s"TiDBRelation($tableRef, $ts)"
}

private def getTableOrThrow(database: String, table: String): TiTableInfo =
meta.getTable(database, table).getOrElse {
val db = meta.getDatabaseFromCache(database)
if (db.isEmpty) {
throw new TiClientInternalException(
"Database not exist " + database + " valid databases are: " + meta.getDatabasesFromCache
.map(_.getName)
.mkString("[", ",", "]"))
} else {
throw new TiClientInternalException(
"Table not exist " + tableRef + " valid tables are: " + meta
.getTablesFromCache(db.get)
.map(_.getName)
.mkString("[", ",", "]"))
}
}
// Stub kept only so the v1 path still builds
case class TiDBRelation(sqlContext: SQLContext) extends BaseRelation {
override def schema: StructType = ???
}
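
Note: `???` here is Scala's `Predef.???`, which throws `scala.NotImplementedError`, so any leftover v1 code path that still resolves this relation fails fast at runtime instead of silently misbehaving. A minimal sketch of that behavior (the local session setup is hypothetical):

```
import org.apache.spark.sql.SparkSession

// Hypothetical check: the stub exists only to keep the v1 path compiling.
val spark = SparkSession.builder().master("local[*]").getOrCreate()
val relation = TiDBRelation(spark.sqlContext)
// relation.schema  // would throw scala.NotImplementedError ("an implementation is missing")
```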
34 changes: 28 additions & 6 deletions core/src/main/scala/com/pingcap/tispark/v2/TiDBTable.scala
@@ -15,22 +15,24 @@

package com.pingcap.tispark.v2

import com.pingcap.tikv.{TiConfiguration, TiSession}
import com.pingcap.tikv.exception.TiInternalException
import com.pingcap.tikv.TiSession
import com.pingcap.tikv.exception.TiBatchWriteException
import com.pingcap.tikv.key.Handle
import com.pingcap.tikv.meta.{TiDAGRequest, TiTableInfo, TiTimestamp}
import com.pingcap.tispark.utils.TiUtil
import com.pingcap.tispark.v2.TiDBTable.{getDagRequestToRegionTaskExec, getLogicalPlanToRDD}
import com.pingcap.tispark.v2.sink.TiDBWriterBuilder
import com.pingcap.tispark.write.TiDBOptions
import com.pingcap.tispark.TiTableReference
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
import org.apache.spark.sql.connector.catalog.{SupportsRead, TableCapability}
import org.apache.spark.sql.connector.catalog.{SupportsRead, SupportsWrite, TableCapability}
import org.apache.spark.sql.connector.read.ScanBuilder
import org.apache.spark.sql.connector.write.{LogicalWriteInfo, WriteBuilder}
import org.apache.spark.sql.execution.{ColumnarCoprocessorRDD, SparkPlan}
import org.apache.spark.sql.tispark.{TiHandleRDD, TiRowRDD}
import org.apache.spark.sql.types._
import org.apache.spark.sql.util.CaseInsensitiveStringMap
import org.apache.spark.sql.{SQLContext, SparkSession, TiExtensions, execution}
import org.apache.spark.sql.{SQLContext, execution}

import java.util
import java.util.Collections
@@ -43,7 +45,8 @@ case class TiDBTable(
table: TiTableInfo,
var ts: TiTimestamp = null,
options: Option[TiDBOptions] = None)(@transient val sqlContext: SQLContext)
extends SupportsRead {
extends SupportsRead
with SupportsWrite {

implicit class IdentifierHelper(identifier: TiTableReference) {
def quoted: String = {
@@ -91,6 +94,7 @@ case class TiDBTable(
override def capabilities(): util.Set[TableCapability] = {
val capabilities = new util.HashSet[TableCapability]
capabilities.add(TableCapability.BATCH_READ)
capabilities.add(TableCapability.V1_BATCH_WRITE)
capabilities
}

@@ -103,6 +107,25 @@
def logicalPlanToRDD(dagRequest: TiDAGRequest, output: Seq[Attribute]): List[TiRowRDD] = {
getLogicalPlanToRDD(dagRequest, output, session, sqlContext, tableRef)
}

override def newWriteBuilder(info: LogicalWriteInfo): WriteBuilder = {
var scalaMap = info.options().asScala.toMap
// TODO https://github.com/pingcap/tispark/issues/2269 we need to move the TiDB dependencies that currently block insert SQL.
// If this is not supported before release, insert SQL should throw an exception in catalyst.
if (scalaMap.isEmpty) {
throw new TiBatchWriteException("tidbOption is neccessary.")
}
// Support df.writeTo: add the database and table options for the write if they are absent
if (!scalaMap.contains("database")) {
scalaMap += ("database" -> databaseName)
}
if (!scalaMap.contains("table")) {
scalaMap += ("table" -> tableName)
}
// Get TiDBOptions
val tiDBOptions = new TiDBOptions(scalaMap)
TiDBWriterBuilder(info, tiDBOptions, sqlContext)
}
}

object TiDBTable {
@@ -175,5 +198,4 @@ object TiDBTable {
})
tiRDDs.toList
}

}
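
With `SupportsWrite` and `V1_BATCH_WRITE` declared, a `DataFrameWriterV2` append against the TiDB catalog resolves this table and lands in `newWriteBuilder` above. A hedged usage sketch, assuming the documented TiCatalog setup (`spark.sql.catalog.tidb_catalog`); the database and table names are placeholders:

```
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().getOrCreate()
val df = spark.range(10).selectExpr("id", "cast(id as string) as name")

// Spark sees V1_BATCH_WRITE, calls newWriteBuilder, and the returned builder
// routes the batch into the v1 writer under the hood.
df.writeTo("tidb_catalog.test.target_table").append()
```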
35 changes: 14 additions & 21 deletions core/src/main/scala/com/pingcap/tispark/v2/TiDBTableProvider.scala
@@ -20,7 +20,7 @@ import com.pingcap.tispark.TiDBRelation
import com.pingcap.tispark.write.{TiDBOptions, TiDBWriter}
import org.apache.spark.sql.catalyst.analysis.NoSuchTableException
import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode, SparkSession, TiExtensions}
import org.apache.spark.sql.connector.catalog.{Table, TableProvider}
import org.apache.spark.sql.connector.catalog.{Identifier, Table, TableProvider}
import org.apache.spark.sql.connector.expressions.Transform
import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, DataSourceRegister}
import org.apache.spark.sql.types.StructType
@@ -62,32 +62,25 @@ class TiDBTableProvider

override def shortName(): String = "tidb"

// TODO: replace the v1 path in the next PR
def extractIdentifier(options: CaseInsensitiveStringMap): Identifier = {
require(options.get("database") != null, "Option 'database' is required.")
require(options.get("table") != null, "Option 'table' is required.")
Identifier.of(Array(options.get("database")), options.get("table"))
}

def extractCatalog(options: CaseInsensitiveStringMap): String = {
"tidb_catalog"
}

// DF.write still goes through the v1 path for now,
// because the v2 path goes through catalyst, which may block things like datatype conversion.
override def createRelation(
sqlContext: SQLContext,
mode: SaveMode,
parameters: Map[String, String],
data: DataFrame): BaseRelation = {
val options = new TiDBOptions(parameters)
TiDBWriter.write(data, sqlContext, mode, options)
createRelation(sqlContext, parameters)
}

def createRelation(sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation = {

val options = new TiDBOptions(parameters)
val sparkSession = sqlContext.sparkSession

TiExtensions.getTiContext(sparkSession) match {
case Some(tiContext) =>
val ts = tiContext.tiSession.getTimestamp
TiDBRelation(
tiContext.tiSession,
options.getTiTableRef(tiContext.tiConf),
tiContext.meta,
ts,
Some(options))(sqlContext)
case None => throw new TiBatchWriteException("TiExtensions is disable!")
}
TiDBRelation(sqlContext)
}
}
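
For the classic `DataFrameWriter` entry registered under `shortName()` as `"tidb"`, a hedged sketch; the connection option keys mirror `TiDBOptions` conventions, but the exact required set is an assumption:

```
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().getOrCreate()
val df = spark.range(10).selectExpr("id", "cast(id as string) as name")

// v1-style write entry; all option values below are placeholders.
df.write
  .format("tidb")
  .option("database", "test")
  .option("table", "target_table")
  .option("tidb.addr", "127.0.0.1") // assumed TiDBOptions keys
  .option("tidb.port", "4000")
  .option("tidb.user", "root")
  .option("tidb.password", "")
  .mode("append")
  .save()
```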
34 changes: 34 additions & 0 deletions core/src/main/scala/com/pingcap/tispark/v2/sink/TiDBBatchWrite.scala
@@ -0,0 +1,34 @@
/*
* Copyright 2021 PingCAP, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.pingcap.tispark.v2.sink

import com.pingcap.tispark.write.TiDBOptions
import org.apache.spark.sql.TiContext
import org.apache.spark.sql.connector.write._

/**
* Use V1WriteBuilder before turning to v2
*/
case class TiDBBatchWrite(logicalInfo: LogicalWriteInfo, tiDBOptions: TiDBOptions)(
@transient val tiContext: TiContext)
extends BatchWrite {
override def createBatchWriterFactory(info: PhysicalWriteInfo): DataWriterFactory =
TiDBDataWriterFactory(logicalInfo.schema(), tiDBOptions, tiContext.tiConf)

override def commit(messages: Array[WriterCommitMessage]): Unit = ???

override def abort(messages: Array[WriterCommitMessage]): Unit = ???
}
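
`TiDBWriterBuilder`, returned by `TiDBTable.newWriteBuilder`, is not among the files shown in this diff. A minimal sketch of what such a V1 fallback builder looks like, assuming the `V1WriteBuilder` API of Spark 3.0/3.1; the delegation to `TiDBWriter` mirrors the deleted `TiDBRelation.insert`, but the exact body is an assumption:

```
package com.pingcap.tispark.v2.sink

import com.pingcap.tispark.write.{TiDBOptions, TiDBWriter}
import org.apache.spark.sql.connector.write.{LogicalWriteInfo, V1WriteBuilder}
import org.apache.spark.sql.sources.InsertableRelation
import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode}

// Hedged sketch: hand the v2 write back to the proven v1 batch writer.
case class TiDBWriterBuilder(
    info: LogicalWriteInfo,
    tiDBOptions: TiDBOptions,
    sqlContext: SQLContext)
    extends V1WriteBuilder {

  // Spark supplies a DataFrame plus an overwrite flag; map that onto the
  // v1 SaveMode semantics, just as the deleted relation's insert() did.
  override def buildForV1Write(): InsertableRelation =
    (data: DataFrame, overwrite: Boolean) => {
      val saveMode = if (overwrite) SaveMode.Overwrite else SaveMode.Append
      TiDBWriter.write(data, sqlContext, saveMode, tiDBOptions)
    }
}
```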
48 changes: 48 additions & 0 deletions core/src/main/scala/com/pingcap/tispark/v2/sink/TiDBDataWrite.scala
@@ -0,0 +1,48 @@
/*
* Copyright 2021 PingCAP, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.pingcap.tispark.v2.sink

import com.pingcap.tikv.TiConfiguration
import com.pingcap.tispark.write.TiDBOptions
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.connector.write.{DataWriter, WriterCommitMessage}
import org.apache.spark.sql.types.StructType

/**
* Use V1WriteBuilder before turning to v2
*/
case class TiDBDataWrite(
partitionId: Int,
taskId: Long,
schema: StructType,
tiDBOptions: TiDBOptions,
ticonf: TiConfiguration)
extends DataWriter[InternalRow] {

override def write(record: InternalRow): Unit = {
val row = Row.fromSeq(record.toSeq(schema))
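// Per-row v2 writes are left unimplemented in this commit; batches go through the v1 fallback.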
???
}

override def commit(): WriterCommitMessage = ???

override def abort(): Unit = {}

override def close(): Unit = {}
}

object WriteSucceeded extends WriterCommitMessage
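
`TiDBDataWriterFactory`, referenced from `TiDBBatchWrite.createBatchWriterFactory`, is also not shown in this diff. A hedged sketch, assuming it only carries the schema and configuration to each task's `TiDBDataWrite`:

```
package com.pingcap.tispark.v2.sink

import com.pingcap.tikv.TiConfiguration
import com.pingcap.tispark.write.TiDBOptions
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.connector.write.{DataWriter, DataWriterFactory}
import org.apache.spark.sql.types.StructType

// Hedged sketch: one DataWriter per (partition, task), mirroring the
// constructor of TiDBDataWrite above.
case class TiDBDataWriterFactory(
    schema: StructType,
    tiDBOptions: TiDBOptions,
    ticonf: TiConfiguration)
    extends DataWriterFactory {

  override def createWriter(partitionId: Int, taskId: Long): DataWriter[InternalRow] =
    TiDBDataWrite(partitionId, taskId, schema, tiDBOptions, ticonf)
}
```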
(Diff truncated: the remaining changed files are not shown.)