oap-project · xwu99 · Jul 27, 2021 · Jul 26, 2021 · Jul 27, 2021 · Jul 27, 2021
diff --git a/dev/set-examples-version.sh b/dev/set-examples-version.sh
@@ -0,0 +1,28 @@
+#!/usr/bin/env bash
+#
+# Sets the project version and the spark.version property of every example
+# project listed in exampleDirs by running the Maven versions plugin in each
+# directory under ../examples (relative to this script).
+
+set -euo pipefail
+
+SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )"
+
+SPARK_VERSION=3.1.1
+OAP_VERSION=1.2.0
+OAP_EXAMPLE_VERSION=$OAP_VERSION
+
+exampleDirs=(kmeans pca als naive-bayes linear-regression)
+
+cd "$SCRIPT_DIR/../examples"
+
+for dir in "${exampleDirs[@]}"
+do
+  # Run each project in a subshell: the working directory is restored
+  # automatically, and a failed cd can never cause mvn to run elsewhere.
+  (
+    cd "$dir"
+    mvn versions:set -DnewVersion="$OAP_EXAMPLE_VERSION"
+    mvn versions:set-property -Dproperty=spark.version -DnewVersion="$SPARK_VERSION"
+  )
+done
diff --git a/dev/set-pom-version.sh b/dev/set-pom-version.sh
@@ -0,0 +1,12 @@
+#!/usr/bin/env bash
+#
+# Sets the project version and the spark.version property of the Maven
+# project in the current working directory.
+
+set -euo pipefail
+
+SPARK_VERSION=3.1.1
+OAP_VERSION=1.2.0
+
+mvn versions:set -DnewVersion="$OAP_VERSION"
+mvn versions:set-property -Dproperty=spark.version -DnewVersion="$SPARK_VERSION"
diff --git a/examples/als-pyspark/run.sh b/examples/als-pyspark/run.sh
@@ -4,14 +4,15 @@ source ../../conf/env.sh
 
 # Data file is converted from oneDAL examples ($DAALROOT/examples/daal/data/batch/implicit_als_csr.csv)
 # The data file should be copied to $HDFS_ROOT before running examples
-DATA_FILE=data/onedal_als_csr_ratings.txt
+DATA_FILE=$HDFS_ROOT/data/onedal_als_csr_ratings.txt
 
 APP_PY=als-pyspark.py
 
 time $SPARK_HOME/bin/spark-submit --master $SPARK_MASTER -v \
     --num-executors $SPARK_NUM_EXECUTORS \
-    --driver-memory $SPARK_DRIVER_MEMORY \
     --executor-cores $SPARK_EXECUTOR_CORES \
+    --total-executor-cores $SPARK_TOTAL_CORES \
+    --driver-memory $SPARK_DRIVER_MEMORY \
     --executor-memory $SPARK_EXECUTOR_MEMORY \
     --conf "spark.serializer=org.apache.spark.serializer.KryoSerializer" \
     --conf "spark.default.parallelism=$SPARK_DEFAULT_PARALLELISM" \

diff --git a/examples/als/pom.xml b/examples/als/pom.xml
@@ -4,7 +4,7 @@
 
   <groupId>com.intel.oap</groupId>
   <artifactId>oap-mllib-examples</artifactId>
-  <version>${oap.version}-with-spark-${spark.version}</version>
+  <version>1.2.0</version>
   <packaging>jar</packaging>
 
   <name>ALSExample</name>
@@ -15,7 +15,7 @@
     <oap.version>1.2.0</oap.version>
     <scala.version>2.12.10</scala.version>
     <scala.binary.version>2.12</scala.binary.version>
-    <spark.version>3.0.0</spark.version>
+    <spark.version>3.1.1</spark.version>
   </properties>
 
   <dependencies>

diff --git a/examples/als/run.sh b/examples/als/run.sh
@@ -4,15 +4,16 @@ source ../../conf/env.sh
 
 # Data file is converted from oneDAL examples ($DAALROOT/examples/daal/data/batch/implicit_als_csr.csv)
 # The data file should be copied to $HDFS_ROOT before running examples
-DATA_FILE=data/onedal_als_csr_ratings.txt
+DATA_FILE=$HDFS_ROOT/data/onedal_als_csr_ratings.txt
 
-APP_JAR=target/oap-mllib-examples-$OAP_MLLIB_VERSION-with-spark-3.0.0.jar
+APP_JAR=target/oap-mllib-examples-$OAP_MLLIB_VERSION.jar
 APP_CLASS=org.apache.spark.examples.ml.ALSExample
 
 time $SPARK_HOME/bin/spark-submit --master $SPARK_MASTER -v \
     --num-executors $SPARK_NUM_EXECUTORS \
-    --driver-memory $SPARK_DRIVER_MEMORY \
     --executor-cores $SPARK_EXECUTOR_CORES \
+    --total-executor-cores $SPARK_TOTAL_CORES \
+    --driver-memory $SPARK_DRIVER_MEMORY \
     --executor-memory $SPARK_EXECUTOR_MEMORY \
     --conf "spark.serializer=org.apache.spark.serializer.KryoSerializer" \
     --conf "spark.default.parallelism=$SPARK_DEFAULT_PARALLELISM" \

diff --git a/examples/build-all.sh b/examples/build-all.sh
@@ -1,6 +1,8 @@
 #!/usr/bin/env bash
 
-for dir in kmeans pca als naive-bayes linear-regression
+exampleDirs=(kmeans pca als naive-bayes linear-regression)
+
+for dir in ${exampleDirs[*]}
 do
   cd $dir
   ./build.sh

diff --git a/examples/kmeans-pyspark/run.sh b/examples/kmeans-pyspark/run.sh
@@ -4,14 +4,15 @@ source ../../conf/env.sh
 
 # Data file is from Spark Examples (data/mllib/sample_kmeans_data.txt) and put in examples/data
 # The data file should be copied to $HDFS_ROOT before running examples
-DATA_FILE=data/sample_kmeans_data.txt
+DATA_FILE=$HDFS_ROOT/data/sample_kmeans_data.txt
 
 APP_PY=kmeans-pyspark.py
 
 time $SPARK_HOME/bin/spark-submit --master $SPARK_MASTER -v \
     --num-executors $SPARK_NUM_EXECUTORS \
-    --driver-memory $SPARK_DRIVER_MEMORY \
     --executor-cores $SPARK_EXECUTOR_CORES \
+    --total-executor-cores $SPARK_TOTAL_CORES \
+    --driver-memory $SPARK_DRIVER_MEMORY \
     --executor-memory $SPARK_EXECUTOR_MEMORY \
     --conf "spark.serializer=org.apache.spark.serializer.KryoSerializer" \
     --conf "spark.default.parallelism=$SPARK_DEFAULT_PARALLELISM" \

diff --git a/examples/kmeans/pom.xml b/examples/kmeans/pom.xml
@@ -4,7 +4,7 @@
 
   <groupId>com.intel.oap</groupId>
   <artifactId>oap-mllib-examples</artifactId>
-  <version>${oap.version}-with-spark-${spark.version}</version>
+  <version>1.2.0</version>
   <packaging>jar</packaging>
 
   <name>KMeansExample</name>
@@ -15,7 +15,7 @@
     <oap.version>1.2.0</oap.version>
     <scala.version>2.12.10</scala.version>
     <scala.binary.version>2.12</scala.binary.version>
-    <spark.version>3.0.0</spark.version>
+    <spark.version>3.1.1</spark.version>
   </properties>
 
   <dependencies>

diff --git a/examples/kmeans/run-gpu-standalone.sh b/examples/kmeans/run-gpu-standalone.sh
@@ -4,27 +4,34 @@ source ../../conf/env.sh
 
 # Data file is from Spark Examples (data/mllib/sample_kmeans_data.txt) and put in examples/data
 # The data file should be copied to $HDFS_ROOT before running examples
-DATA_FILE=data/sample_kmeans_data.txt
+DATA_FILE=$HDFS_ROOT/data/sample_kmeans_data.txt
 
 APP_JAR=target/oap-mllib-examples-$OAP_MLLIB_VERSION.jar
 APP_CLASS=org.apache.spark.examples.ml.KMeansExample
 
+RESOURCE_FILE=$PWD/IntelGpuResourceFile.json
+WORKER_GPU_AMOUNT=4
+EXECUTOR_GPU_AMOUNT=1
+TASK_GPU_AMOUNT=1
+USE_GPU=true
+
+# Should run in standalone mode
 time $SPARK_HOME/bin/spark-submit --master $SPARK_MASTER -v \
     --num-executors $SPARK_NUM_EXECUTORS \
-    --driver-memory $SPARK_DRIVER_MEMORY \
-    --total-executor-cores $SPARK_TOTAL_CORES \
     --executor-cores $SPARK_EXECUTOR_CORES \
+    --total-executor-cores $SPARK_TOTAL_CORES \
+    --driver-memory $SPARK_DRIVER_MEMORY \
     --executor-memory $SPARK_EXECUTOR_MEMORY \
     --conf "spark.serializer=org.apache.spark.serializer.KryoSerializer" \
     --conf "spark.default.parallelism=$SPARK_DEFAULT_PARALLELISM" \
     --conf "spark.sql.shuffle.partitions=$SPARK_DEFAULT_PARALLELISM" \
     --conf "spark.driver.extraClassPath=$SPARK_DRIVER_CLASSPATH" \
     --conf "spark.executor.extraClassPath=$SPARK_EXECUTOR_CLASSPATH" \
-    --conf "spark.worker.resource.gpu.amount=4" \
-    --conf "spark.worker.resourcesFile=$PWD/IntelGpuResourceFile.json" \
-    --conf "spark.oap.mllib.useGPU=true" \
-    --conf "spark.executor.resource.gpu.amount=1" \
-    --conf "spark.task.resource.gpu.amount=1" \
+    --conf "spark.oap.mllib.useGPU=$USE_GPU" \
+    --conf "spark.worker.resourcesFile=$RESOURCE_FILE" \
+    --conf "spark.worker.resource.gpu.amount=$WORKER_GPU_AMOUNT" \
+    --conf "spark.executor.resource.gpu.amount=$EXECUTOR_GPU_AMOUNT" \
+    --conf "spark.task.resource.gpu.amount=$TASK_GPU_AMOUNT" \
     --conf "spark.shuffle.reduceLocality.enabled=false" \
     --conf "spark.network.timeout=1200s" \
     --conf "spark.task.maxFailures=1" \

diff --git a/examples/kmeans/run-cpu.sh → examples/kmeans/run.sh b/examples/kmeans/run-cpu.sh → examples/kmeans/run.sh
@@ -4,16 +4,16 @@ source ../../conf/env.sh
 
 # Data file is from Spark Examples (data/mllib/sample_kmeans_data.txt) and put in examples/data
 # The data file should be copied to $HDFS_ROOT before running examples
-DATA_FILE=data/sample_kmeans_data.txt
+DATA_FILE=$HDFS_ROOT/data/sample_kmeans_data.txt
 
 APP_JAR=target/oap-mllib-examples-$OAP_MLLIB_VERSION.jar
 APP_CLASS=org.apache.spark.examples.ml.KMeansExample
 
 time $SPARK_HOME/bin/spark-submit --master $SPARK_MASTER -v \
     --num-executors $SPARK_NUM_EXECUTORS \
-    --driver-memory $SPARK_DRIVER_MEMORY \
-    --total-executor-cores $SPARK_TOTAL_CORES \
     --executor-cores $SPARK_EXECUTOR_CORES \
+    --total-executor-cores $SPARK_TOTAL_CORES \
+    --driver-memory $SPARK_DRIVER_MEMORY \
     --executor-memory $SPARK_EXECUTOR_MEMORY \
     --conf "spark.serializer=org.apache.spark.serializer.KryoSerializer" \
     --conf "spark.default.parallelism=$SPARK_DEFAULT_PARALLELISM" \

diff --git a/examples/linear-regression/pom.xml b/examples/linear-regression/pom.xml
@@ -4,7 +4,7 @@
 
   <groupId>com.intel.oap</groupId>
   <artifactId>oap-mllib-examples</artifactId>
-  <version>${oap.version}-with-spark-${spark.version}</version>  
+  <version>1.2.0</version>  
   <packaging>jar</packaging>
 
   <name>LinearRegressionExample</name>
@@ -15,7 +15,7 @@
     <oap.version>1.2.0</oap.version>
     <scala.version>2.12.10</scala.version>
     <scala.binary.version>2.12</scala.binary.version>
-    <spark.version>3.0.0</spark.version>
+    <spark.version>3.1.1</spark.version>
   </properties>
 
   <dependencies>

diff --git a/examples/linear-regression/run.sh b/examples/linear-regression/run.sh
@@ -2,18 +2,19 @@
 
 source ../../conf/env.sh
 
-APP_JAR=target/oap-mllib-examples-$OAP_MLLIB_VERSION-with-spark-3.0.0.jar
-APP_CLASS=org.apache.spark.examples.ml.LinearRegressionExample
-DATA_FILE=data/sample_linear_regression_data.txt
+# Data file is from Spark Examples (data/mllib/sample_linear_regression_data.txt) and put in examples/data
+# The data file should be copied to $HDFS_ROOT before running examples
+DATA_FILE=$HDFS_ROOT/data/sample_linear_regression_data.txt
 
-OAP_MLLIB_ENABLED=true
+APP_JAR=target/oap-mllib-examples-$OAP_MLLIB_VERSION.jar
+APP_CLASS=org.apache.spark.examples.ml.LinearRegressionExample
 
 time $SPARK_HOME/bin/spark-submit --master $SPARK_MASTER -v \
     --num-executors $SPARK_NUM_EXECUTORS \
-    --driver-memory $SPARK_DRIVER_MEMORY \
     --executor-cores $SPARK_EXECUTOR_CORES \
+    --total-executor-cores $SPARK_TOTAL_CORES \
+    --driver-memory $SPARK_DRIVER_MEMORY \
     --executor-memory $SPARK_EXECUTOR_MEMORY \
-    --conf "spark.oap.mllib.enabled=$OAP_MLLIB_ENABLED" \
     --conf "spark.serializer=org.apache.spark.serializer.KryoSerializer" \
     --conf "spark.default.parallelism=$SPARK_DEFAULT_PARALLELISM" \
     --conf "spark.sql.shuffle.partitions=$SPARK_DEFAULT_PARALLELISM" \

diff --git a/examples/naive-bayes/pom.xml b/examples/naive-bayes/pom.xml
@@ -4,7 +4,7 @@
 
   <groupId>com.intel.oap</groupId>
   <artifactId>oap-mllib-examples</artifactId>
-  <version>${oap.version}-with-spark-${spark.version}</version>  
+  <version>1.2.0</version>  
   <packaging>jar</packaging>
 
   <name>NaiveBayesExample</name>
@@ -15,7 +15,7 @@
     <oap.version>1.2.0</oap.version>
     <scala.version>2.12.10</scala.version>
     <scala.binary.version>2.12</scala.binary.version>
-    <spark.version>3.0.0</spark.version>
+    <spark.version>3.1.1</spark.version>
   </properties>
 
   <dependencies>

diff --git a/examples/naive-bayes/run.sh b/examples/naive-bayes/run.sh
@@ -2,16 +2,19 @@
 
 source ../../conf/env.sh
 
-APP_JAR=target/oap-mllib-examples-$OAP_MLLIB_VERSION-with-spark-3.0.0.jar
+# Data file is from Spark Examples (data/mllib/sample_libsvm_data.txt) and put in examples/data
+# The data file should be copied to $HDFS_ROOT before running examples
+DATA_FILE=$HDFS_ROOT/data/sample_libsvm_data.txt
+
+APP_JAR=target/oap-mllib-examples-$OAP_MLLIB_VERSION.jar
 APP_CLASS=org.apache.spark.examples.ml.NaiveBayesExample
-DATA_FILE=data/sample_libsvm_data.txt
 
 time $SPARK_HOME/bin/spark-submit --master $SPARK_MASTER -v \
     --num-executors $SPARK_NUM_EXECUTORS \
-    --driver-memory $SPARK_DRIVER_MEMORY \
     --executor-cores $SPARK_EXECUTOR_CORES \
+    --total-executor-cores $SPARK_TOTAL_CORES \
+    --driver-memory $SPARK_DRIVER_MEMORY \
     --executor-memory $SPARK_EXECUTOR_MEMORY \
-    --conf "spark.oap.mllib.enabled=true" \
     --conf "spark.serializer=org.apache.spark.serializer.KryoSerializer" \
     --conf "spark.default.parallelism=$SPARK_DEFAULT_PARALLELISM" \
     --conf "spark.sql.shuffle.partitions=$SPARK_DEFAULT_PARALLELISM" \
@@ -22,5 +25,5 @@ time $SPARK_HOME/bin/spark-submit --master $SPARK_MASTER -v \
     --conf "spark.task.maxFailures=1" \
     --jars $OAP_MLLIB_JAR \
     --class $APP_CLASS \
-    $APP_JAR $DATA_FILE $K \
+    $APP_JAR $DATA_FILE \
     2>&1 | tee NaiveBayes-$(date +%m%d_%H_%M_%S).log
diff --git a/examples/pca-pyspark/run-vanilla.sh b/examples/pca-pyspark/run-vanilla.sh
diff --git a/examples/pca-pyspark/run.sh b/examples/pca-pyspark/run.sh
@@ -4,15 +4,16 @@ source ../../conf/env.sh
 
 # CSV data is the same as in Spark example "ml/pca_example.py"
 # The data file should be copied to $HDFS_ROOT before running examples
-DATA_FILE=data/pca_data.csv
+DATA_FILE=$HDFS_ROOT/data/pca_data.csv
 
 APP_PY=pca-pyspark.py
 K=3
 
 time $SPARK_HOME/bin/spark-submit --master $SPARK_MASTER -v \
     --num-executors $SPARK_NUM_EXECUTORS \
-    --driver-memory $SPARK_DRIVER_MEMORY \
     --executor-cores $SPARK_EXECUTOR_CORES \
+    --total-executor-cores $SPARK_TOTAL_CORES \
+    --driver-memory $SPARK_DRIVER_MEMORY \
     --executor-memory $SPARK_EXECUTOR_MEMORY \
     --conf "spark.serializer=org.apache.spark.serializer.KryoSerializer" \
     --conf "spark.default.parallelism=$SPARK_DEFAULT_PARALLELISM" \

diff --git a/examples/pca/pom.xml b/examples/pca/pom.xml
@@ -4,7 +4,7 @@
 
   <groupId>com.intel.oap</groupId>
   <artifactId>oap-mllib-examples</artifactId>
-  <version>${oap.version}-with-spark-${spark.version}</version>  
+  <version>1.2.0</version>  
   <packaging>jar</packaging>
 
   <name>PCAExample</name>
@@ -15,7 +15,7 @@
     <oap.version>1.2.0</oap.version>
     <scala.version>2.12.10</scala.version>
     <scala.binary.version>2.12</scala.binary.version>
-    <spark.version>3.0.0</spark.version>
+    <spark.version>3.1.1</spark.version>
   </properties>
 
   <dependencies>

diff --git a/examples/pca/run.sh b/examples/pca/run.sh
@@ -2,13 +2,16 @@
 
 source ../../conf/env.sh
 
-APP_JAR=target/oap-mllib-examples-$OAP_MLLIB_VERSION-with-spark-3.0.0.jar
+APP_JAR=target/oap-mllib-examples-$OAP_MLLIB_VERSION.jar
 APP_CLASS=org.apache.spark.examples.ml.PCAExample
 
+# The dataset is created in the code, so there is no need to pass it in as a parameter
+
 time $SPARK_HOME/bin/spark-submit --master $SPARK_MASTER -v \
     --num-executors $SPARK_NUM_EXECUTORS \
-    --driver-memory $SPARK_DRIVER_MEMORY \
     --executor-cores $SPARK_EXECUTOR_CORES \
+    --total-executor-cores $SPARK_TOTAL_CORES \
+    --driver-memory $SPARK_DRIVER_MEMORY \
     --executor-memory $SPARK_EXECUTOR_MEMORY \
     --conf "spark.serializer=org.apache.spark.serializer.KryoSerializer" \
     --conf "spark.default.parallelism=$SPARK_DEFAULT_PARALLELISM" \