#115 Add Spark sink
jirifilip committed Jan 12, 2023
1 parent 5492abb commit 813308a
Showing 4 changed files with 541 additions and 4 deletions.
96 changes: 93 additions & 3 deletions README.md
@@ -644,6 +644,7 @@ Sinks define a way data needs to be sent to a target system. Built-in sinks include:
- Kafka sink.
- CSV in a local folder sink.
- Command Line sink.
- Spark sink.
- Dynamic Conformance Engine (Enceladus) sink.

You can define your own sink by implementing the `Sink` trait and providing the corresponding class name in the pipeline configuration, as sketched below.
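For reference, a minimal custom sink only needs the trait members that `SparkSink` (added in this commit) overrides: `config`, `connect`, `close`, and `send`. The following is a hypothetical sketch; `LoggingSink` and its behavior are illustrative, not part of this commit:

```scala
import com.typesafe.config.Config
import org.apache.spark.sql.{DataFrame, SparkSession}
import za.co.absa.pramen.api.{MetastoreReader, Sink}

import java.time.LocalDate

// Hypothetical sink that counts and logs records instead of writing them anywhere.
class LoggingSink(sinkConfig: Config) extends Sink {
  override val config: Config = sinkConfig

  // No connection state to manage in this example.
  override def connect(): Unit = {}
  override def close(): Unit = {}

  // Called once per table; returns the number of records sent.
  override def send(df: DataFrame,
                    tableName: String,
                    metastore: MetastoreReader,
                    infoDate: LocalDate,
                    options: Map[String, String])
                   (implicit spark: SparkSession): Long = {
    val count = df.count()
    println(s"Would send $count records of '$tableName' for $infoDate")
    count
  }
}
```

In addition, the class named by `factory.class` is resolved through a companion object extending `ExternalChannelFactory`, as `SparkSink` does further down in this commit.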
@@ -705,7 +706,7 @@ The corresponding pipeline operation could look like this:
]
tables = [
{
- metastore.table = metastore_table
+ input.metastore.table = metastore_table
output.topic.name = "my.topic"
# All following settings are OPTIONAL
@@ -784,7 +785,7 @@ The corresponding pipeline operation could look like this:
]
tables = [
{
- metastore.table = metastore_table
+ input.metastore.table = metastore_table
output.path = "/local/csv/path"
# Date range to read the source table for. By default the job information date is used.
# But you can define an arbitrary expression based on the information date.
@@ -895,6 +896,95 @@ The pipeline operation for this sink could look like this:
```
</details>

### Spark sink

This sink allows writing data using Spark, much like calling `df.write.format(...).save(...)` directly.
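Under the hood this is a plain Spark batch write. Below is a minimal sketch of what the sink effectively does for one table, using the option values from the example that follows; the input path is hypothetical, and the writer chain mirrors `writeData` in `SparkSink.scala` further down:

```scala
import org.apache.spark.sql.SparkSession

object SparkSinkSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().getOrCreate()

    // Hypothetical input; in a real pipeline the dataframe comes from the metastore.
    val df = spark.read.parquet("/hypothetical/metastore/metastore_table")

    // Equivalent of: format = "parquet", mode = "overwrite",
    // partition.by = [ pramen_info_date ], option { compression = "gzip" }
    df.write
      .partitionBy("pramen_info_date")
      .format("parquet")
      .mode("overwrite")
      .options(Map("compression" -> "gzip"))
      .save("/datalake/base/path")
  }
}
```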

Here is an example of a Spark sink definition:
<details>
<summary>Click to expand</summary>

```config
{
# Define a name to reference from the pipeline:
name = "spark_sink"
factory.class = "za.co.absa.pramen.core.sink.SparkSink"
# Output format. Can be: csv, parquet, json, delta, etc. (anything supported by Spark). Default: parquet
format = "parquet"
# Save mode. Can be overwrite, append, ignore, errorifexists. Default: errorifexists
mode = "overwrite"
## Only one of the following two options should be specified
# Optionally repartition the dataframe according to the specified number of partitions
number.of.partitions = 10
# Optionally repartition the dataframe according to the number of records per partition
records.per.partition = 1000000
# If true (default), the data will be saved even if it does not contain any records. If false, the saving will be skipped
save.empty = true
# If non-empty, the data will be partitioned by the specified columns at the output path. Default: []
partition.by = [ pramen_info_date ]
# These are additional options passed to the writer as 'df.write(...).options(...)'
option {
compression = "gzip"
}
}
```
</details>
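Note that `number.of.partitions` and `records.per.partition` are mutually exclusive: specifying both fails the job. When `records.per.partition` is set, the sink derives the partition count from the record count, never going below one partition. A sketch of that arithmetic, mirroring `applyRepartitioning` in `SparkSink.scala` below:

```scala
// e.g. 2,500,000 records at records.per.partition = 1,000,000 => ceil(2.5) = 3 partitions
def partitionCount(recordCount: Long, recordsPerPartition: Long): Int =
  Math.max(1, Math.ceil(recordCount.toDouble / recordsPerPartition)).toInt
```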

The corresponding pipeline operation could look like this:
<details>
<summary>Click to expand</summary>

```config
{
name = "Spark sink"
type = "sink"
sink = "spark_sink"
schedule.type = "daily"
# Optional dependencies
dependencies = [
{
tables = [ dependent_table ]
date.from = "@infoDate"
}
]
tables = [
{
input.metastore.table = metastore_table
output.path = "/datalake/base/path"
# Date range to read the source table for. By default the job information date is used.
# But you can define an arbitrary expression based on the information date.
# More: see the documentation section on date expressions and the list of allowed functions.
date {
from = "@infoDate"
to = "@infoDate"
}
transformations = [
{ col = "col1", expr = "lower(some_string_column)" }
],
filters = [
"some_numeric_column > 100"
]
columns = [ "col1", "col2", "some_numeric_column" ]
}
]
}
```

</details>
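Sink settings can also be overridden per table. The Scaladoc of `SparkSink` further down in this commit shows the following override block inside a table definition:

```config
# This overrides options of the sink
sink {
  mode = "append"

  # These are additional options passed to the writer as 'df.write(...).options(...)'
  option {
    compression = "snappy"
  }
}
```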



### Dynamic Conformance Engine (Enceladus) sink

This sink is used to send data to the landing area of the Enceladus Data Lake (also known as 'raw folder'). You can configure
@@ -974,7 +1064,7 @@ The pipeline operation for this sink could look like this:
tables = [
{
- metastore.table = metastore_table
+ input.metastore.table = metastore_table
output.path = "/datalake/base/path"
# Optional info version (default = 1)
2 changes: 1 addition & 1 deletion pramen/core/src/main/scala/za/co/absa/pramen/core/sink/LocalCsvSink.scala
@@ -135,7 +135,7 @@ class LocalCsvSink(sinkConfig: Config,
createCsvFromDf(df, count, tableName, infoDate, outputPath)
count
} else {
- log.info(s"Notting to send to $outputPath.")
+ log.info(s"Nothing to send to $outputPath.")
if (params.createEmptyCsv) {
createEmptyCsv(df.schema, tableName, infoDate, outputPath)
}
227 changes: 227 additions & 0 deletions pramen/core/src/main/scala/za/co/absa/pramen/core/sink/SparkSink.scala
@@ -0,0 +1,227 @@
/*
* Copyright 2022 ABSA Group Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package za.co.absa.pramen.core.sink

import com.typesafe.config.Config
import org.apache.hadoop.fs.Path
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.slf4j.LoggerFactory
import za.co.absa.pramen.api.{ExternalChannelFactory, MetastoreReader, Sink}
import za.co.absa.pramen.core.config.Keys.KEYS_TO_REDACT
import za.co.absa.pramen.core.utils.ConfigUtils

import java.time.LocalDate

/**
* This sink allows writing data using Spark, much like calling 'df.write.format(...).save(...)' directly.
*
* To use the sink, you need to define its parameters.
*
* Example sink definition:
* {{{
* {
* # Define a name to reference from the pipeline:
* name = "spark_sink"
* factory.class = "za.co.absa.pramen.core.sink.SparkSink"
*
* # Output format. Can be: csv, parquet, json, delta, etc. (anything supported by Spark). Default: parquet
* format = "parquet"
*
* # Save mode. Can be overwrite, append, ignore, errorifexists. Default: errorifexists
* mode = "overwrite"
*
* ## Only one of the following two options should be specified
* # Optionally repartition the dataframe according to the specified number of partitions
* number.of.partitions = 10
* # Optionally repartition the dataframe according to the number of records per partition
* records.per.partition = 1000000
*
* # If true (default), the data will be saved even if it does not contain any records. If false, the saving will be skipped
* save.empty = true
*
* # If non-empty, the data will be partitioned by the specified columns at the output path. Default: []
* partition.by = [ pramen_info_date ]
*
* # These are additional options passed to the writer as 'df.write(...).options(...)'
* option {
* compression = "gzip"
* }
* }
* }}}
*
* Here is an example of a sink definition in a pipeline. As with any other operation, you can specify
* dependencies, transformations, filters, and columns to select.
*
* {{{
* {
* name = "Spark sink"
* type = "sink"
* sink = "spark_sink"
*
* schedule.type = "daily"
*
* # Optional dependencies
* dependencies = [
* {
* tables = [ dependent_table ]
* date.from = "@infoDate"
* }
* ]
*
* tables = [
* {
* input.metastore.table = metastore_table
* output.path = "/datalake/base/path"
*
* # Date range to read the source table for. By default the job information date is used.
* # But you can define an arbitrary expression based on the information date.
* # More: see the documentation section on date expressions and the list of allowed functions.
* date {
* from = "@infoDate"
* to = "@infoDate"
* }
*
* transformations = [
* { col = "col1", expr = "lower(some_string_column)" }
* ],
* filters = [
* "some_numeric_column > 100"
* ]
* columns = [ "col1", "col2", "some_numeric_column" ]
*
* # This overrides options of the sink
* sink {
* mode = "append"
*
* # These are additional options passed to the writer as 'df.write(...).options(...)'
* option {
* compression = "snappy"
* }
* }
* }
* ]
* }
* }}}
*
*/
class SparkSink(format: String,
formatOptions: Map[String, String],
mode: String,
partitionBy: Seq[String],
numberOfPartitions: Option[Int],
recordsPerPartition: Option[Long],
saveEmpty: Boolean,
sinkConfig: Config) extends Sink {

import za.co.absa.pramen.core.sink.SparkSink._

private val log = LoggerFactory.getLogger(this.getClass)

override val config: Config = sinkConfig

override def connect(): Unit = {}

override def close(): Unit = {}

override def send(df: DataFrame,
tableName: String,
metastore: MetastoreReader,
infoDate: LocalDate,
options: Map[String, String])
(implicit spark: SparkSession): Long = {
val outputPath = getOutputPath(tableName, options)
val recordCount = df.count()

if (recordCount > 0 || saveEmpty) {
log.info(s"Saving $recordCount records to folder: ${outputPath.toUri.toString}")
log.info(s"Options passed for '$format':")
ConfigUtils.renderExtraOptions(formatOptions, KEYS_TO_REDACT)(log.info)

val dfToWrite = applyRepartitioning(df, recordCount, tableName)
writeData(dfToWrite, outputPath)
} else {
log.info(s"Nothing to save to folder: ${outputPath.toUri.toString}")
}

recordCount
}

private[core] def writeData(df: DataFrame, outputPath: Path): Unit = {
df
.write
.partitionBy(partitionBy: _*)
.format(format)
.mode(mode)
.options(formatOptions)
.save(outputPath.toUri.toString)
}

private[core] def applyRepartitioning(df: DataFrame, recordCount: Long, tableName: String): DataFrame = {
(numberOfPartitions, recordsPerPartition) match {
case (Some(_), Some(_)) =>
throw new IllegalArgumentException(
s"Both $NUMBER_OF_PARTITIONS_KEY and $RECORDS_PER_PARTITION_KEY are specified for Spark sink, " +
s"table: $tableName. Please specify only one of those options")
case (Some(nop), None) =>
log.info(s"Repartitioning to $nop partitions")
df.repartition(nop)
case (None, Some(rpp)) =>
val n = Math.max(1, Math.ceil(recordCount.toDouble / rpp)).toInt
log.info(s"Repartitioning to $n partitions")
df.repartition(n)
case (None, None) =>
df
}
}

private[core] def getOutputPath(tableName: String, options: Map[String, String]): Path = {
if (!options.contains(OUTPUT_PATH_KEY)) {
throw new IllegalArgumentException(s"$OUTPUT_PATH_KEY is not specified for Spark sink, table: $tableName")
}

new Path(options(OUTPUT_PATH_KEY))
}

}

object SparkSink extends ExternalChannelFactory[SparkSink] {
val OUTPUT_PATH_KEY = "path"

val FORMAT_KEY = "format"
val MODE_KEY = "mode"
val PARTITION_BY_KEY = "partition.by"
val NUMBER_OF_PARTITIONS_KEY = "number.of.partitions"
val RECORDS_PER_PARTITION_KEY = "records.per.partition"
val SAVE_EMPTY_KEY = "save.empty"

val DEFAULT_FORMAT = "parquet"
val DEFAULT_MODE = "errorifexists"
val DEFAULT_SAVE_EMPTY = true

override def apply(conf: Config, parentPath: String, spark: SparkSession): SparkSink = {
new SparkSink(
ConfigUtils.getOptionString(conf, FORMAT_KEY).getOrElse(DEFAULT_FORMAT),
ConfigUtils.getExtraOptions(conf, "option"),
ConfigUtils.getOptionString(conf, MODE_KEY).getOrElse(DEFAULT_MODE),
ConfigUtils.getOptListStrings(conf, PARTITION_BY_KEY),
ConfigUtils.getOptionInt(conf, NUMBER_OF_PARTITIONS_KEY),
ConfigUtils.getOptionLong(conf, RECORDS_PER_PARTITION_KEY),
ConfigUtils.getOptionBoolean(conf, SAVE_EMPTY_KEY).getOrElse(DEFAULT_SAVE_EMPTY),
conf
)
}
}