From faf31ee708df06be5689e0916800ee368dccbffa Mon Sep 17 00:00:00 2001 From: Luca Canali Date: Fri, 8 Mar 2024 19:30:35 +0100 Subject: [PATCH] Prepare SparkMeasure 0.24 --- README.md | 62 +++++++++++-------- build.sbt | 4 +- docs/Flight_recorder_mode_FileSink.md | 6 +- docs/Flight_recorder_mode_InfluxDBSink.md | 6 +- docs/Flight_recorder_mode_KafkaSink.md | 2 +- docs/Instrument_Python_code.md | 8 +-- docs/Instrument_Scala_code.md | 8 +-- docs/Notes_on_metrics_analysis.md | 4 +- docs/Prometheus.md | 4 +- docs/Python_shell_and_Jupyter.md | 2 +- docs/Scala_shell_and_notebooks.md | 46 ++++++++------ examples/testSparkMeasureScala/README.md | 2 +- examples/testSparkMeasureScala/build.sbt | 6 +- .../project/build.properties | 2 +- project/plugins.sbt | 6 +- python/setup.py | 8 +-- .../ch/cern/sparkmeasure/InfluxDBSink.scala | 5 +- 17 files changed, 102 insertions(+), 79 deletions(-) diff --git a/README.md b/README.md index a5593ea..c35e25b 100644 --- a/README.md +++ b/README.md @@ -53,13 +53,13 @@ Choose the sparkMeasure version for your environment: Examples: * Spark with Scala 2.12: - - **Scala:** `bin/spark-shell --packages ch.cern.sparkmeasure:spark-measure_2.12:0.23` - - **Python:** `bin/pyspark --packages ch.cern.sparkmeasure:spark-measure_2.12:0.23` + - **Scala:** `bin/spark-shell --packages ch.cern.sparkmeasure:spark-measure_2.12:0.24` + - **Python:** `bin/pyspark --packages ch.cern.sparkmeasure:spark-measure_2.12:0.24` - note: you also need `pip install sparkmeasure` to get the [Python wrapper API](https://pypi.org/project/sparkmeasure/) * Spark with Scala 2.13: - - Scala: `bin/spark-shell --packages ch.cern.sparkmeasure:spark-measure_2.13:0.23` - - Python: `bin/pyspark --packages ch.cern.sparkmeasure:spark-measure_2.13:0.23` + - Scala: `bin/spark-shell --packages ch.cern.sparkmeasure:spark-measure_2.13:0.24` + - Python: `bin/pyspark --packages ch.cern.sparkmeasure:spark-measure_2.13:0.24` - note: `pip install sparkmeasure` to get the Python wrapper API * Spark 2.4 and 2.3 with Scala 2.11: @@ -69,15 +69,15 @@ Examples: * Where to get sparkMeasure: * [sparkMeasure on Maven Central](https://mvnrepository.com/artifact/ch.cern.sparkmeasure/spark-measure) - * Jars in sparkMeasure's [release notes](https://github.com/LucaCanali/sparkMeasure/releases/tag/v0.23) + * Jars in sparkMeasure's [release notes](https://github.com/LucaCanali/sparkMeasure/releases/tag/v0.24) * Bleeding edge jars as artifacts in [GitHub actions](https://github.com/LucaCanali/sparkMeasure/actions) * Build jars from master using sbt: `sbt +package` * Some practical examples of how to set the configuration to use sparkMeasure with Spark - * `--packages ch.cern.sparkmeasure:spark-measure_2.12:0.23` - * `--jars /path/to/spark-measure_2.12-0.23.jar` - * `--jars https://github.com/LucaCanali/sparkMeasure/releases/download/v0.23/spark-measure_2.12-0.23.jar` - * `--conf spark.driver.extraClassPath=/path/to/spark-measure_2.12-0.23.jar` + * `--packages ch.cern.sparkmeasure:spark-measure_2.12:0.24` + * `--jars /path/to/spark-measure_2.12-0.24.jar` + * `--jars https://github.com/LucaCanali/sparkMeasure/releases/download/v0.24/spark-measure_2.12-0.24.jar` + * `--conf spark.driver.extraClassPath=/path/to/spark-measure_2.12-0.24.jar` --- ### Examples of interactive use of sparkMeasure @@ -94,7 +94,7 @@ Examples: - Stage-level metrics from the command line: ``` # Scala CLI - bin/spark-shell --packages ch.cern.sparkmeasure:spark-measure_2.12:0.23 + bin/spark-shell --packages ch.cern.sparkmeasure:spark-measure_2.12:0.24 val stageMetrics = 
ch.cern.sparkmeasure.StageMetrics(spark) stageMetrics.runAndMeasure(spark.sql("select count(*) from range(1000) cross join range(1000) cross join range(1000)").show()) @@ -102,7 +102,7 @@ Examples: ``` # Python CLI pip install sparkmeasure - bin/pyspark --packages ch.cern.sparkmeasure:spark-measure_2.12:0.23 + bin/pyspark --packages ch.cern.sparkmeasure:spark-measure_2.12:0.24 from sparkmeasure import StageMetrics stagemetrics = StageMetrics(spark) @@ -110,23 +110,31 @@ Examples: ``` The output should look like this: ``` ++----------+ +| count(1)| ++----------+ +|1000000000| ++----------+ + +Time taken: 3833 ms + Scheduling mode = FIFO Spark Context default degree of parallelism = 8 Aggregated Spark stage metrics: numStages => 3 numTasks => 17 -elapsedTime => 1291 (1 s) -stageDuration => 1058 (1 s) -executorRunTime => 2774 (3 s) -executorCpuTime => 2004 (2 s) -executorDeserializeTime => 2868 (3 s) -executorDeserializeCpuTime => 1051 (1 s) -resultSerializationTime => 5 (5 ms) -jvmGCTime => 88 (88 ms) +elapsedTime => 1112 (1 s) +stageDuration => 864 (0.9 s) +executorRunTime => 3358 (3 s) +executorCpuTime => 2168 (2 s) +executorDeserializeTime => 892 (0.9 s) +executorDeserializeCpuTime => 251 (0.3 s) +resultSerializationTime => 72 (72 ms) +jvmGCTime => 0 (0 ms) shuffleFetchWaitTime => 0 (0 ms) -shuffleWriteTime => 16 (16 ms) -resultSize => 16091 (15.0 KB) +shuffleWriteTime => 36 (36 ms) +resultSize => 16295 (15.9 KB) diskBytesSpilled => 0 (0 Bytes) memoryBytesSpilled => 0 (0 Bytes) peakExecutionMemory => 0 @@ -145,10 +153,12 @@ shuffleRemoteBytesReadToDisk => 0 (0 Bytes) shuffleBytesWritten => 472 (472 Bytes) shuffleRecordsWritten => 8 +Average number of active tasks => 3.0 + Stages and their duration: -Stage 0 duration => 593 (0.6 s) -Stage 1 duration => 416 (0.4 s) -Stage 3 duration => 49 (49 ms) +Stage 0 duration => 355 (0.4 s) +Stage 1 duration => 411 (0.4 s) +Stage 3 duration => 98 (98 ms) ``` - Stage metrics collection mode has an optional memory report command @@ -176,7 +186,7 @@ Stage 3 OnHeapExecutionMemory maxVal bytes => 0 (0 Bytes) - this is similar but slightly different from the example above as it collects metrics at the Task-level rather than Stage-level ``` # Scala CLI - bin/spark-shell --packages ch.cern.sparkmeasure:spark-measure_2.12:0.23 + bin/spark-shell --packages ch.cern.sparkmeasure:spark-measure_2.12:0.24 val taskMetrics = ch.cern.sparkmeasure.TaskMetrics(spark) taskMetrics.runAndMeasure(spark.sql("select count(*) from range(1000) cross join range(1000) cross join range(1000)").show()) @@ -184,7 +194,7 @@ Stage 3 OnHeapExecutionMemory maxVal bytes => 0 (0 Bytes) ``` # Python CLI pip install sparkmeasure - bin/pyspark --packages ch.cern.sparkmeasure:spark-measure_2.12:0.23 + bin/pyspark --packages ch.cern.sparkmeasure:spark-measure_2.12:0.24 from sparkmeasure import TaskMetrics taskmetrics = TaskMetrics(spark) diff --git a/build.sbt b/build.sbt index c9497f5..1275adf 100644 --- a/build.sbt +++ b/build.sbt @@ -1,6 +1,6 @@ name := "spark-measure" -version := "0.24-SNAPSHOT" +version := "0.24" scalaVersion := "2.12.18" crossScalaVersions := Seq("2.12.18", "2.13.8") @@ -9,7 +9,7 @@ licenses += ("Apache-2.0", url("http://www.apache.org/licenses/LICENSE-2.0")) // publishing to Sonatype Nexus repository and Maven publishMavenStyle := true -isSnapshot := true +isSnapshot := false libraryDependencies += "org.apache.spark" %% "spark-sql" % "3.5.1" libraryDependencies += "com.fasterxml.jackson.module" %% "jackson-module-scala" % "2.16.1" diff --git 
a/docs/Flight_recorder_mode_FileSink.md b/docs/Flight_recorder_mode_FileSink.md index a6907fa..cb6a138 100644 --- a/docs/Flight_recorder_mode_FileSink.md +++ b/docs/Flight_recorder_mode_FileSink.md @@ -80,7 +80,7 @@ This example collected metrics with Task granularity. (note: source the Hadoop environment before running this) ``` bin/spark-submit --master yarn --deploy-mode cluster \ ---packages ch.cern.sparkmeasure:spark-measure_2.12:0.23 \ +--packages ch.cern.sparkmeasure:spark-measure_2.12:0.24 \ --conf spark.extraListeners=ch.cern.sparkmeasure.FlightRecorderTaskMetrics \ --conf spark.sparkmeasure.outputFormat=json_to_hadoop \ --conf spark.sparkmeasure.outputFilename="hdfs://myclustername/user/luca/test/myoutput_$(date +%s).json" \ @@ -96,7 +96,7 @@ Example, use spark-3.3.0, Kubernetes, Scala 2.12 and write output to S3: bin/spark-submit --master k8s://https://XXX.XXX.XXX.XXX --deploy-mode client --conf spark.executor.instances=3 \ --conf spark.executor.cores=2 --executor-memory 6g --driver-memory 8g \ --conf spark.kubernetes.container.image=/spark:v3.0.0_20190529_hadoop32 \ ---packages org.apache.hadoop:hadoop-aws:3.3.2,ch.cern.sparkmeasure:spark-measure_2.12:0.23 \ +--packages org.apache.hadoop:hadoop-aws:3.3.2,ch.cern.sparkmeasure:spark-measure_2.12:0.24 \ --conf spark.hadoop.fs.s3a.secret.key="YYY..." \ --conf spark.hadoop.fs.s3a.access.key="ZZZ..." \ --conf spark.hadoop.fs.s3a.endpoint="https://s3.cern.ch" \ @@ -115,7 +115,7 @@ To post-process the saved metrics you will need to deserialize objects saved by This is an example of how to do that using the supplied helper object sparkmeasure.Utils ``` -bin/spark-shell --packages ch.cern.sparkmeasure:spark-measure_2.12:0.23 +bin/spark-shell --packages ch.cern.sparkmeasure:spark-measure_2.12:0.24 val myMetrics = ch.cern.sparkmeasure.IOUtils.readSerializedStageMetricsJSON("/tmp/stageMetrics_flightRecorder") // use ch.cern.sparkmeasure.IOUtils.readSerializedStageMetrics("/tmp/stageMetrics.serialized") for java serialization diff --git a/docs/Flight_recorder_mode_InfluxDBSink.md b/docs/Flight_recorder_mode_InfluxDBSink.md index ac91739..88b7554 100644 --- a/docs/Flight_recorder_mode_InfluxDBSink.md +++ b/docs/Flight_recorder_mode_InfluxDBSink.md @@ -2,7 +2,9 @@ Use sparkMeasure in flight recorder mode to instrument Spark applications without touching their code. Flight recorder mode attaches a Spark Listener that collects the metrics while the application runs. -This describes how to sink Spark metrics to an InfluxDB instance. +This describes how to sink Spark metrics to an InfluxDB instance. +Note this is for InfluxDB version 1.x, for version 2.x some changes are needed. +You can use this also with VictoriaMetrics, ingesting the InfluxDB line protocol. 
## InfluxDBSink and InfluxDBSinkExtended
@@ -85,7 +87,7 @@ bin/spark-shell \
--conf spark.sparkmeasure.influxdbURL="http://localhost:8086" \
--conf spark.extraListeners=ch.cern.sparkmeasure.InfluxDBSink,ch.cern.sparkmeasure.InfluxDBSinkExtended \
--conf spark.sparkmeasure.influxdbStagemetrics=true
- --packages ch.cern.sparkmeasure:spark-measure_2.12:0.23
+ --packages ch.cern.sparkmeasure:spark-measure_2.12:0.24
// run a Spark job, this will produce metrics
spark.sql("select count(*) from range(1000) cross join range(1000) cross join range(1000)").show
diff --git a/docs/Flight_recorder_mode_KafkaSink.md b/docs/Flight_recorder_mode_KafkaSink.md
index 7caa667..af81935 100644
--- a/docs/Flight_recorder_mode_KafkaSink.md
+++ b/docs/Flight_recorder_mode_KafkaSink.md
@@ -68,7 +68,7 @@ bin/spark-shell \
--conf spark.extraListeners=ch.cern.sparkmeasure.KafkaSink \
--conf spark.sparkmeasure.kafkaBroker=my_kafka_server:9092 \
--conf spark.sparkmeasure.kafkaTopic=metrics
---packages ch.cern.sparkmeasure:spark-measure_2.12:0.23
+--packages ch.cern.sparkmeasure:spark-measure_2.12:0.24
```
- Look at the metrics being written to Kafka:
diff --git a/docs/Instrument_Python_code.md b/docs/Instrument_Python_code.md
index 7159be8..9f4b356 100644
--- a/docs/Instrument_Python_code.md
+++ b/docs/Instrument_Python_code.md
@@ -54,10 +54,10 @@ The details are discussed in the [examples for Python shell and notebook](docs/P
- This is how to run sparkMeasure using a packaged version in Maven Central
```
- bin/spark-submit --packages ch.cern.sparkmeasure:spark-measure_2.12:0.23 your_python_code.py
+ bin/spark-submit --packages ch.cern.sparkmeasure:spark-measure_2.12:0.24 your_python_code.py
// alternative: just download and use the jar (it is only needed in the driver) as in:
- bin/spark-submit --conf spark.driver.extraClassPath=<path>/spark-measure_2.12-0.23.jar ...
+ bin/spark-submit --conf spark.driver.extraClassPath=<path>/spark-measure_2.12-0.24.jar ...
```
### Download and build sparkMeasure (optional)
@@ -73,8 +73,8 @@ The details are discussed in the [examples for Python shell and notebook](docs/P
pip install .
# Run as in one of these examples:
- bin/spark-submit --jars <path>/spark-measure_2.12-0.24-SNAPSHOT.jar ...
+ bin/spark-submit --jars <path>/spark-measure_2.12-0.25-SNAPSHOT.jar ...
# alternative, set classpath for the driver (sparkmeasure code runs only in the driver)
- bin/spark-submit --conf spark.driver.extraClassPath=<path>/spark-measure_2.12-0.24-SNAPSHOT.jar ...
+ bin/spark-submit --conf spark.driver.extraClassPath=<path>/spark-measure_2.12-0.25-SNAPSHOT.jar ...
```
diff --git a/docs/Instrument_Scala_code.md b/docs/Instrument_Scala_code.md
index 23b6ab0..666149a 100644
--- a/docs/Instrument_Scala_code.md
+++ b/docs/Instrument_Scala_code.md
@@ -71,10 +71,10 @@ See details at: [Prometheus Pushgateway](Prometheus.md)
- This is how to run sparkMeasure using a packaged version in Maven Central
```
- bin/spark-submit --packages ch.cern.sparkmeasure:spark-measure_2.12:0.23
+ bin/spark-submit --packages ch.cern.sparkmeasure:spark-measure_2.12:0.24
// or just download and use the jar (it is only needed in the driver) as in:
- bin/spark-submit --conf spark.driver.extraClassPath=<path>/spark-measure_2.12-0.23.jar ...
+ bin/spark-submit --conf spark.driver.extraClassPath=<path>/spark-measure_2.12-0.24.jar ...
```
- The alternative, see paragraph above, is to build a jar from master (See below).
@@ -88,8 +88,8 @@ See details at: [Prometheus Pushgateway](Prometheus.md)
ls -l target/scala-2.12/spark-measure*.jar # location of the compiled jar
# Run as in one of these examples:
- bin/spark-submit --jars <path>/spark-measure_2.12-0.23-SNAPSHOT.jar
+ bin/spark-submit --jars <path>/spark-measure_2.12-0.25-SNAPSHOT.jar
# alternative, set classpath for the driver (it is only needed in the driver)
- bin/spark-submit --conf spark.driver.extraClassPath=<path>/spark-measure_2.12-0.23-SNAPSHOT.jar ...
+ bin/spark-submit --conf spark.driver.extraClassPath=<path>/spark-measure_2.12-0.25-SNAPSHOT.jar ...
```
diff --git a/docs/Notes_on_metrics_analysis.md b/docs/Notes_on_metrics_analysis.md
index c8741cc..f08855b 100644
--- a/docs/Notes_on_metrics_analysis.md
+++ b/docs/Notes_on_metrics_analysis.md
@@ -1,8 +1,8 @@
-## Notes on analysis of Spark performance metrics collecteed with sparkMeasure
+## Notes on analysis of Spark performance metrics collected using sparkMeasure
One of the key features of sparkMeasure is that it makes data easily accessible for analysis.
This is achieved by exporting the collected data into Spark DataFrames where they can be queries with Spark APIs and/or SQL.
-In addition the metrics can be used for plotting and other visualizations, for example using Jupyter notebooks.
+In addition, the metrics can be used for plotting and other visualizations, for example using Jupyter notebooks.
Example of analysis of Task Metrics using a Jupyter notebook at: [SparkTaskMetricsAnalysisExample.ipynb](../examples/SparkTaskMetricsAnalysisExample.ipynb)
diff --git a/docs/Prometheus.md b/docs/Prometheus.md
index 21a88b1..465f170 100644
--- a/docs/Prometheus.md
+++ b/docs/Prometheus.md
@@ -11,7 +11,7 @@ The Prometheus Pushgateway can accept collected data and keep it for Prometheus
**Parameters:**
-* serverIPnPort: String with prometheus pushgateway hostIP:Port,
+* serverIPnPort: String with Prometheus Pushgateway hostIP:Port,
* metricsJob: job name,
* labelName: metrics label name,
* labelValue: metrics label value
@@ -35,7 +35,7 @@ https://prometheus.io/docs/instrumenting/exposition_formats/
1. Measure metrics at the Stage level (example in Scala):
```
-bin/spark-shell --packages ch.cern.sparkmeasure:spark-measure_2.12:0.19
+bin/spark-shell --packages ch.cern.sparkmeasure:spark-measure_2.12:0.24
val stageMetrics = ch.cern.sparkmeasure.StageMetrics(spark)
stageMetrics.begin()
diff --git a/docs/Python_shell_and_Jupyter.md b/docs/Python_shell_and_Jupyter.md
index b2ce176..d0dd90b 100644
--- a/docs/Python_shell_and_Jupyter.md
+++ b/docs/Python_shell_and_Jupyter.md
@@ -9,7 +9,7 @@ See also [README](../README.md) for an introduction to sparkMeasure and its arch
- Use PyPi to install the Python wrapper and take the jar from Maven central:
```
pip install sparkmeasure
- bin/pyspark --packages ch.cern.sparkmeasure:spark-measure_2.12:0.23
+ bin/pyspark --packages ch.cern.sparkmeasure:spark-measure_2.12:0.24
```
- If you prefer to build from the latest development version:
```
diff --git a/docs/Scala_shell_and_notebooks.md b/docs/Scala_shell_and_notebooks.md
index e25efca..1fbd7db 100644
--- a/docs/Scala_shell_and_notebooks.md
+++ b/docs/Scala_shell_and_notebooks.md
@@ -8,10 +8,10 @@ See also [README](../README.md) for an introduction to sparkMeasure and its arch
- The alternative, see paragraph above, is to build a jar from master.
``` - bin/spark-shell --packages ch.cern.sparkmeasure:spark-measure_2.12:0.23 + bin/spark-shell --packages ch.cern.sparkmeasure:spark-measure_2.12:0.24 // or just download and use the jar (it is only needed in the driver) as in: - bin/spark-shell --conf spark.driver.extraClassPath=/spark-measure_2.12-0.23.jar + bin/spark-shell --conf spark.driver.extraClassPath=/spark-measure_2.12-0.24.jar ``` ### Download and build sparkMeasure (optional) @@ -24,17 +24,17 @@ See also [README](../README.md) for an introduction to sparkMeasure and its arch ls -l target/scala-2.12/spark-measure*.jar # location of the compiled jar # Run as in one of these examples: - bin/spark-shell --jars /spark-measure_2.12-0.23-SNAPSHOT.jar + bin/spark-shell --jars /spark-measure_2.12-0.24-SNAPSHOT.jar # alternative, set classpath for the driver (the jar is only needed in the driver) - bin/spark-shell --conf spark.driver.extraClassPath=/spark-measure_2.11-0.23-SNAPSHOT.jar + bin/spark-shell --conf spark.driver.extraClassPath=/spark-measure_2.11-0.24-SNAPSHOT.jar ``` ### Example: collect and print stage metrics with sparkMeasure 1. Measure metrics at the Stage level, a basic example: ``` - bin/spark-shell --packages ch.cern.sparkmeasure:spark-measure_2.12:0.23 + bin/spark-shell --packages ch.cern.sparkmeasure:spark-measure_2.12:0.24 val stageMetrics = ch.cern.sparkmeasure.StageMetrics(spark) stageMetrics.runAndMeasure(spark.sql("select count(*) from range(1000) cross join range(1000) cross join range(1000)").show) @@ -45,23 +45,31 @@ See also [README](../README.md) for an introduction to sparkMeasure and its arch Example output: ``` ++----------+ +| count(1)| ++----------+ +|1000000000| ++----------+ + +Time taken: 3833 ms + Scheduling mode = FIFO Spark Context default degree of parallelism = 8 Aggregated Spark stage metrics: numStages => 3 numTasks => 17 -elapsedTime => 1218 (1 s) -stageDuration => 994 (1.0 s) -executorRunTime => 2625 (3 s) -executorCpuTime => 2224 (2 s) -executorDeserializeTime => 2945 (3 s) -executorDeserializeCpuTime => 1153 (1 s) -resultSerializationTime => 8 (8 ms) -jvmGCTime => 80 (80 ms) +elapsedTime => 1112 (1 s) +stageDuration => 864 (0.9 s) +executorRunTime => 3358 (3 s) +executorCpuTime => 2168 (2 s) +executorDeserializeTime => 892 (0.9 s) +executorDeserializeCpuTime => 251 (0.3 s) +resultSerializationTime => 72 (72 ms) +jvmGCTime => 0 (0 ms) shuffleFetchWaitTime => 0 (0 ms) -shuffleWriteTime => 30 (30 ms) -resultSize => 16220 (15.0 KB) +shuffleWriteTime => 36 (36 ms) +resultSize => 16295 (15.9 KB) diskBytesSpilled => 0 (0 Bytes) memoryBytesSpilled => 0 (0 Bytes) peakExecutionMemory => 0 @@ -80,10 +88,12 @@ shuffleRemoteBytesReadToDisk => 0 (0 Bytes) shuffleBytesWritten => 472 (472 Bytes) shuffleRecordsWritten => 8 +Average number of active tasks => 3.0 + Stages and their duration: -Stage 0 duration => 573 (0.6 s) -Stage 1 duration => 354 (0.4 s) -Stage 3 duration => 67 (67 ms) +Stage 0 duration => 355 (0.4 s) +Stage 1 duration => 411 (0.4 s) +Stage 3 duration => 98 (98 ms) ``` - New in sparkMeasure v01: memory metrics report: diff --git a/examples/testSparkMeasureScala/README.md b/examples/testSparkMeasureScala/README.md index 3452b67..e28323c 100644 --- a/examples/testSparkMeasureScala/README.md +++ b/examples/testSparkMeasureScala/README.md @@ -9,5 +9,5 @@ How to run the example: # build the example jar sbt package -bin/spark-submit --master local[*] --packages ch.cern.sparkmeasure:spark-measure_2.12:0.23 --class ch.cern.testSparkMeasure.testSparkMeasure /testsparkmeasurescala_2.12-0.1.jar 
+bin/spark-submit --master local[*] --packages ch.cern.sparkmeasure:spark-measure_2.12:0.24 --class ch.cern.testSparkMeasure.testSparkMeasure /testsparkmeasurescala_2.12-0.1.jar ``` diff --git a/examples/testSparkMeasureScala/build.sbt b/examples/testSparkMeasureScala/build.sbt index 2757e7f..da4b9a0 100644 --- a/examples/testSparkMeasureScala/build.sbt +++ b/examples/testSparkMeasureScala/build.sbt @@ -2,7 +2,7 @@ name := "testSparkMeasureScala" version := "0.1" -scalaVersion := "2.12.17" +scalaVersion := "2.12.18" -libraryDependencies += "org.apache.spark" %% "spark-sql" % "3.4.0" -libraryDependencies += "ch.cern.sparkmeasure" %% "spark-measure" % "0.23" +libraryDependencies += "org.apache.spark" %% "spark-sql" % "3.5.1" +libraryDependencies += "ch.cern.sparkmeasure" %% "spark-measure" % "0.24" diff --git a/examples/testSparkMeasureScala/project/build.properties b/examples/testSparkMeasureScala/project/build.properties index f344c14..e3c114d 100644 --- a/examples/testSparkMeasureScala/project/build.properties +++ b/examples/testSparkMeasureScala/project/build.properties @@ -1 +1 @@ -sbt.version = 1.8.2 +sbt.version = 1.9.3 diff --git a/project/plugins.sbt b/project/plugins.sbt index c501e9f..d4ccb6c 100644 --- a/project/plugins.sbt +++ b/project/plugins.sbt @@ -1,5 +1,5 @@ logLevel := Level.Warn -addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.6.1") -addSbtPlugin("org.xerial.sbt" % "sbt-sonatype" % "3.9.4") -addSbtPlugin("com.jsuereth" % "sbt-pgp" % "2.0.1") +addSbtPlugin("org.scoverage" % "sbt-scoverage" % "2.0.11") +addSbtPlugin("org.xerial.sbt" % "sbt-sonatype" % "3.9.21") +addSbtPlugin("com.github.sbt" % "sbt-pgp" % "2.2.1") diff --git a/python/setup.py b/python/setup.py index 6016a9c..2814fa0 100644 --- a/python/setup.py +++ b/python/setup.py @@ -6,12 +6,12 @@ long_description = """SparkMeasure is a tool for performance troubleshooting of Apache Spark workloads. It simplifies the collection and analysis of Spark performance metrics. The bulk of sparkMeasure is written in Scala. -This package contains the Python API for sparkMeasure and is intended to work Python and Spark. -Use from PySpark, or in Jupyter notebook environments, or in general as a tool to instrument Spark jobs in your Python code. +This package contains the Python API for sparkMeasure. +Use it from Python programs, PySpark, Jupyter notebooks, or in general as a tool to instrument and collect runtime metrics for your Spark jobs. 
**[Link to sparkMeasure GitHub page and documentation](https://github.com/lucacanali/sparkMeasure)**""" setup(name='sparkmeasure', - version='0.23.2', + version='0.24.0', description=description, long_description=long_description, long_description_content_type="text/markdown", @@ -22,7 +22,7 @@ include_package_data=True, packages=find_packages(), zip_safe=False, - python_requires='>=3.6', + python_requires='>=3.8', install_requires=[], classifiers=[ 'Programming Language :: Python :: 3', diff --git a/src/main/scala/ch/cern/sparkmeasure/InfluxDBSink.scala b/src/main/scala/ch/cern/sparkmeasure/InfluxDBSink.scala index aa65dc6..037a67f 100644 --- a/src/main/scala/ch/cern/sparkmeasure/InfluxDBSink.scala +++ b/src/main/scala/ch/cern/sparkmeasure/InfluxDBSink.scala @@ -12,17 +12,18 @@ import java.util.concurrent.TimeUnit import org.slf4j.LoggerFactory /** - * InfluxDBSink: write Spark metrics and application info in near real-time to InfluxDB + * InfluxDBSink: write Spark metrics and application info in near real-time to InfluxDB v 1.x * use this mode to monitor Spark execution workload * use for Grafana dashboard and analytics of job execution * How to use: attach the InfluxDBSInk to a Spark Context using the extra listener infrastructure. + * Note: this is for InfluxDB v1.x * Example: * --conf spark.extraListeners=ch.cern.sparkmeasure.InfluxDBSink * * Configuration for InfluxDBSink is handled with Spark conf parameters: * * spark.sparkmeasure.influxdbURL (default "http://localhost:8086") - * spark.sparkmeasure.influxdbUsername (default "", not this can be empty if InfluxDB is configured with no authentication) + * spark.sparkmeasure.influxdbUsername (default "", this can be empty if InfluxDB is configured with no authentication) * spark.sparkmeasure.influxdbPassword (default "") * spark.sparkmeasure.influxdbName (default "sparkmeasure") * spark.sparkmeasure.influxdbStagemetrics, (boolean, default is false)
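To make the listener configuration documented in the InfluxDBSink excerpt above concrete, here is a minimal Scala sketch of wiring the flight-recorder InfluxDBSink through SparkSession settings. It only uses the configuration keys listed in that scaladoc; the object name, the localhost URL, and the database name are illustrative assumptions for a local InfluxDB 1.x instance, and the sparkMeasure 0.24 jar is assumed to be on the driver classpath (for example via `--packages ch.cern.sparkmeasure:spark-measure_2.12:0.24`). Treat it as a usage sketch, not part of the patch.

```scala
import org.apache.spark.sql.SparkSession

// Sketch: attach the flight-recorder InfluxDBSink listener via Spark configuration.
// URL, database name, and app name below are placeholder values.
object InfluxDBSinkSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("sparkMeasure InfluxDBSink flight recorder")
      .config("spark.extraListeners", "ch.cern.sparkmeasure.InfluxDBSink")
      .config("spark.sparkmeasure.influxdbURL", "http://localhost:8086") // InfluxDB 1.x endpoint
      .config("spark.sparkmeasure.influxdbName", "sparkmeasure")         // target database
      .config("spark.sparkmeasure.influxdbStagemetrics", "true")         // also write stage-level metrics
      .getOrCreate()

    // Any job run on this session is now recorded by the listener in near real-time.
    spark.sql("select count(*) from range(1000) cross join range(1000)").show()

    spark.stop()
  }
}
```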
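For completeness, here is also a sketch of the programmatic stage-metrics pattern that the README and Instrument_Scala_code.md changes in this patch describe, using the `StageMetrics` API and `runAndMeasure` shown there. The object name and the test query are illustrative assumptions; the metrics report is printed to stdout as in the README example output above.

```scala
import org.apache.spark.sql.SparkSession
import ch.cern.sparkmeasure.StageMetrics

// Sketch: collect and print stage-level metrics around a test query,
// as in the README examples bumped to 0.24 in this patch.
object StageMetricsSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("sparkMeasure 0.24 stage-metrics sketch")
      .getOrCreate()

    val stageMetrics = StageMetrics(spark)
    stageMetrics.runAndMeasure {
      spark.sql("select count(*) from range(1000) cross join range(1000) cross join range(1000)").show()
    }

    spark.stop()
  }
}
```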