oap-project · xwu99 · Oct 21, 2021 · Aug 30, 2021 · Sep 2, 2021 · Sep 6, 2021
diff --git a/examples/correlation/build.sh b/examples/correlation/build.sh
@@ -0,0 +1,3 @@
+#!/usr/bin/env bash
+# Clean and package the example with Maven, using the pom.xml in the current directory.
+mvn clean package
diff --git a/examples/correlation/pom.xml b/examples/correlation/pom.xml
@@ -0,0 +1,94 @@
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+  xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+
+  <groupId>com.intel.oap</groupId>
+  <artifactId>oap-mllib-examples</artifactId>
+  <version>1.2.0</version>
+  <packaging>jar</packaging>
+
+  <name>CorrelationExample</name>
+  <url>https://github.com/oap-project/oap-mllib.git</url>
+  <!-- Versions referenced by the dependency and plugin sections. -->
+  <properties>
+    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    <oap.version>1.2.0</oap.version>
+    <scala.version>2.12.10</scala.version>
+    <scala.binary.version>2.12</scala.binary.version>
+    <spark.version>3.1.1</spark.version>
+  </properties>
+
+  <dependencies>
+    <!-- Scala artifacts take their versions from the properties above. -->
+    <dependency>
+      <groupId>org.scala-lang</groupId>
+      <artifactId>scala-library</artifactId>
+      <version>${scala.version}</version>
+    </dependency>
+
+    <dependency>
+      <groupId>com.github.scopt</groupId>
+      <artifactId>scopt_${scala.binary.version}</artifactId>
+      <version>3.7.0</version>
+    </dependency>
+    <!-- Spark is "provided": expected on the classpath at runtime, not bundled. -->
+    <dependency>
+      <groupId>org.apache.spark</groupId>
+      <artifactId>spark-sql_${scala.binary.version}</artifactId>
+      <version>${spark.version}</version>
+      <scope>provided</scope>
+    </dependency>
+
+    <dependency>
+      <groupId>org.apache.spark</groupId>
+      <artifactId>spark-mllib_${scala.binary.version}</artifactId>
+      <version>${spark.version}</version>
+      <scope>provided</scope>
+    </dependency>
+
+  </dependencies>
+  <!-- Compile Scala sources and build one jar-with-dependencies at package time. -->
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.scala-tools</groupId>
+        <artifactId>maven-scala-plugin</artifactId>
+        <version>2.15.2</version>
+        <executions>
+          <execution>
+            <goals>
+              <goal>compile</goal>
+              <goal>testCompile</goal>
+            </goals>
+          </execution>
+        </executions>
+        <configuration>
+          <scalaVersion>${scala.version}</scalaVersion>
+          <args>
+            <arg>-target:jvm-1.8</arg>
+          </args>
+        </configuration>
+      </plugin>
+      <plugin>
+        <artifactId>maven-assembly-plugin</artifactId>
+        <version>3.0.0</version>
+        <configuration>
+          <appendAssemblyId>false</appendAssemblyId>
+          <descriptorRefs>
+            <descriptorRef>jar-with-dependencies</descriptorRef>
+          </descriptorRefs>
+        </configuration>
+        <executions>
+          <execution>
+            <id>assembly</id>
+            <phase>package</phase>
+            <goals>
+              <goal>single</goal>
+            </goals>
+          </execution>
+        </executions>
+      </plugin>
+    </plugins>
+  </build>
+
+</project>
diff --git a/examples/correlation/run.sh b/examples/correlation/run.sh
@@ -0,0 +1,25 @@
+#!/usr/bin/env bash
+
+source ../../conf/env.sh
+
+APP_JAR=target/oap-mllib-examples-$OAP_MLLIB_VERSION.jar
+APP_CLASS=org.apache.spark.examples.ml.CorrelationExample
+
+time $SPARK_HOME/bin/spark-submit --master $SPARK_MASTER -v \
+    --num-executors $SPARK_NUM_EXECUTORS \
+    --executor-cores $SPARK_EXECUTOR_CORES \
+    --total-executor-cores $SPARK_TOTAL_CORES \
+    --driver-memory $SPARK_DRIVER_MEMORY \
+    --executor-memory $SPARK_EXECUTOR_MEMORY \
+    --conf "spark.serializer=org.apache.spark.serializer.KryoSerializer" \
+    --conf "spark.default.parallelism=$SPARK_DEFAULT_PARALLELISM" \
+    --conf "spark.sql.shuffle.partitions=$SPARK_DEFAULT_PARALLELISM" \
+    --conf "spark.driver.extraClassPath=$SPARK_DRIVER_CLASSPATH" \
+    --conf "spark.executor.extraClassPath=$SPARK_EXECUTOR_CLASSPATH" \
+    --conf "spark.shuffle.reduceLocality.enabled=false" \
+    --conf "spark.network.timeout=1200s" \
+    --conf "spark.task.maxFailures=1" \
+    --jars $OAP_MLLIB_JAR \
+    --class $APP_CLASS \
+    $APP_JAR $DATA_FILE \
+    2>&1 | tee Correlation-$(date +%m%d_%H_%M_%S).log
diff --git a/examples/correlation/src/main/scala/org/apache/spark/examples/ml/CorrelationExample.scala b/examples/correlation/src/main/scala/org/apache/spark/examples/ml/CorrelationExample.scala
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.ml.linalg.{Matrix, Vectors}
+import org.apache.spark.ml.stat.Correlation
+import org.apache.spark.sql.Row
+// $example off$
+import org.apache.spark.sql.SparkSession
+
+/**
+ * An example for computing Pearson and Spearman correlation matrices.
+ * Run with
+ * {{{
+ * bin/run-example ml.CorrelationExample
+ * }}}
+ */
+object CorrelationExample {
+
+  def main(args: Array[String]): Unit = {
+    val spark = SparkSession
+      .builder
+      .appName("CorrelationExample")
+      .getOrCreate()
+    import spark.implicits._ // needed for toDF on the local Seq below
+
+    // $example on$
+    val data = Seq(
+      Vectors.sparse(4, Seq((0, 1.0), (3, -2.0))),
+      Vectors.dense(4.0, 5.0, 0.0, 3.0),
+      Vectors.dense(6.0, 7.0, 0.0, 8.0),
+      Vectors.sparse(4, Seq((0, 9.0), (3, 1.0)))
+    )
+    // One "features" column; corr without a method name computes Pearson.
+    val df = data.map(Tuple1.apply).toDF("features")
+    Correlation.corr(df, "features").collect().foreach { coeff1 =>
+      println(s"Pearson correlation matrix:\n $coeff1.")
+    }
+
+    // Same data with the "spearman" method; the label must name that method.
+    Correlation.corr(df, "features", "spearman").collect().foreach { coeff2 =>
+      println(s"Spearman correlation matrix:\n $coeff2.")
+    }
+    // $example off$
+
+    spark.stop()
+  }
+}
+// scalastyle:on println
diff --git a/mllib-dal/src/main/java/org/apache/spark/ml/stat/CorrelationResult.java b/mllib-dal/src/main/java/org/apache/spark/ml/stat/CorrelationResult.java
@@ -0,0 +1,22 @@
+/*******************************************************************************
+ * Copyright 2020 Intel Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *******************************************************************************/
+
+package org.apache.spark.ml.stat;
+
+public class CorrelationResult { // Result holder whose fields are written by native code through JNI SetLongField.
+    public long correlationNumericTable; // native address of a NumericTablePtr for the correlation matrix
+    public long meanNumericTable; // native address of a NumericTablePtr for the mean table
+}
diff --git a/mllib-dal/src/main/native/CorrelationDALImpl.cpp b/mllib-dal/src/main/native/CorrelationDALImpl.cpp
@@ -0,0 +1,180 @@
+/*******************************************************************************
+ * Copyright 2020 Intel Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *******************************************************************************/
+
+#include <chrono>
+#include <iostream>
+
+#include "OneCCL.h"
+#include "org_apache_spark_ml_stat_CorrelationDALImpl.h"
+#include "service.h"
+
+
+using namespace std;
+using namespace daal;
+using namespace daal::algorithms;
+
+
+typedef double algorithmFPType; /* Algorithm floating-point type */
+
+/*
+ * Class:     org_apache_spark_ml_stat_CorrelationDALImpl
+ * Method:    cCorrelationTrainDAL
+ * Signature: (JIILorg/apache/spark/ml/stat/CorrelationResult;)J
+ */
+
+JNIEXPORT jlong JNICALL
+Java_org_apache_spark_ml_stat_CorrelationDALImpl_cCorrelationTrainDAL(
+    JNIEnv *env, jobject obj, jlong pNumTabData,
+    jint executor_num, jint executor_cores, jobject resultObj) {
+    // Distributed correlation on top of oneDAL covariance: each rank computes a
+    // local partial result, the serialized partial results are exchanged with
+    // allgatherv, and the root rank merges them and writes the native handles
+    // of the correlation and mean tables into resultObj. Always returns 0.
+    ccl::communicator &comm = getComm();
+    size_t rankId = comm.rank();
+    std::cout << " rankId : " << rankId << " ! "
+                  << std::endl;
+
+    // One serialized partial result is gathered from every rank, so the
+    // per-rank buffers and loops below are all sized by the communicator.
+    // NOTE(review): executor_num is presumably equal to comm.size() - confirm.
+    const size_t nBlocks = comm.size();
+
+    // pNumTabData is the address of a NumericTablePtr; copy that pointer.
+    NumericTablePtr pData = *((NumericTablePtr *)pNumTabData);
+
+    // Set number of threads for oneDAL to use for each rank
+    services::Environment::getInstance()->setNumberOfThreads(executor_cores);
+
+    int nThreadsNew =
+        services::Environment::getInstance()->getNumberOfThreads();
+    cout << "oneDAL (native): Number of CPU threads used: " << nThreadsNew
+         << endl;
+
+    auto t1 = std::chrono::high_resolution_clock::now();
+
+    const bool isRoot = (rankId == ccl_root);
+
+    covariance::Distributed<step1Local> localAlgorithm;
+
+    /* Set the input data set to the algorithm */
+    localAlgorithm.input.set(covariance::data, pData);
+
+    /* Compute covariance */
+    localAlgorithm.compute();
+
+    auto t2 = std::chrono::high_resolution_clock::now();
+    auto duration =
+        std::chrono::duration_cast<std::chrono::seconds>(t2 - t1).count();
+    std::cout << "Correlation (native): local step took " << duration << " secs"
+              << std::endl;
+
+    t1 = std::chrono::high_resolution_clock::now();
+
+    /* Serialize partial results required by step 2 */
+    InputDataArchive dataArch;
+    localAlgorithm.getPartialResult()->serialize(dataArch);
+    const uint64_t perNodeArchLength = (size_t)dataArch.getSizeOfArchive();
+
+    std::vector<uint64_t> aPerNodeArchLength(nBlocks);
+    std::vector<size_t> aReceiveCount(nBlocks, 1);
+    /* Share every rank's archive length with all ranks */
+    ccl::allgatherv(&perNodeArchLength, 1, aPerNodeArchLength.data(), aReceiveCount, comm).wait();
+
+    /* Calculate total archive length (size_t: the sum of 64-bit lengths may not fit in an int) */
+    size_t totalArchLength = 0;
+    for (size_t i = 0; i < nBlocks; ++i) {
+        totalArchLength += aPerNodeArchLength[i];
+    }
+
+    ByteBuffer serializedData;
+    serializedData.resize(totalArchLength);
+    ByteBuffer nodeResults(perNodeArchLength);
+    dataArch.copyArchiveToArray(&nodeResults[0], perNodeArchLength);
+
+    /* Gather the serialized partial results of all ranks; only the root reads them below */
+    ccl::allgatherv((int8_t *)&nodeResults[0], perNodeArchLength, (int8_t *)&serializedData[0], aPerNodeArchLength, comm).wait();
+    t2 = std::chrono::high_resolution_clock::now();
+
+    duration =
+        std::chrono::duration_cast<std::chrono::seconds>(t2 - t1).count();
+    std::cout << "Correlation (native): ccl_allgatherv took " << duration << " secs"
+              << std::endl;
+
+    if (isRoot) {
+        auto tMasterStart = std::chrono::high_resolution_clock::now();
+        /* Create an algorithm to compute covariance on the master node */
+        covariance::Distributed<step2Master> masterAlgorithm;
+
+        for (size_t i = 0, shift = 0; i < nBlocks; shift += aPerNodeArchLength[i], ++i) {
+            /* Deserialize partial results from step 1 */
+            OutputDataArchive partialArch(&serializedData[shift], aPerNodeArchLength[i]);
+            covariance::PartialResultPtr dataForStep2FromStep1(new covariance::PartialResult());
+            dataForStep2FromStep1->deserialize(partialArch);
+
+            /* Set local partial results as input for the master-node algorithm */
+            masterAlgorithm.input.add(covariance::partialResults,
+                                      dataForStep2FromStep1);
+        }
+
+        /* Set the parameter to choose the type of the output matrix */
+        masterAlgorithm.parameter.outputMatrixType = covariance::correlationMatrix;
+
+        /* Merge and finalizeCompute covariance decomposition on the master node */
+        masterAlgorithm.compute();
+        masterAlgorithm.finalizeCompute();
+
+        /* Retrieve the algorithm results */
+        covariance::ResultPtr result = masterAlgorithm.getResult();
+        auto tMasterEnd = std::chrono::high_resolution_clock::now();
+        auto masterDuration =
+            std::chrono::duration_cast<std::chrono::seconds>(tMasterEnd - tMasterStart).count();
+        std::cout << "Correlation (native): master step took " << masterDuration << " secs"
+                  << std::endl;
+
+        /* Print the results */
+        printNumericTable(result->get(covariance::correlation),
+                          "Correlation first 20 columns of "
+                          "correlation matrix:",
+                          1, 20);
+        printNumericTable(result->get(covariance::mean),
+                          "Correlation first 20 columns of "
+                          "mean matrix:",
+                          1, 20);
+
+        // Return all correlation & mean
+        jclass clazz = env->GetObjectClass(resultObj);
+        // Get Field references
+        jfieldID correlationNumericTableField =
+            env->GetFieldID(clazz, "correlationNumericTable", "J");
+        jfieldID meanNumericTableField =
+            env->GetFieldID(clazz, "meanNumericTable", "J");
+        if (correlationNumericTableField == nullptr || meanNumericTableField == nullptr) {
+            // GetFieldID failed, leaving a Java exception pending for the caller.
+            return 0;
+        }
+
+        // Allocated with new and not freed here; the addresses are handed to Java as jlong.
+        NumericTablePtr *correlation =
+            new NumericTablePtr(result->get(covariance::correlation));
+        NumericTablePtr *mean =
+            new NumericTablePtr(result->get(covariance::mean));
+        env->SetLongField(resultObj, correlationNumericTableField, (jlong)correlation);
+        env->SetLongField(resultObj, meanNumericTableField, (jlong)mean);
+    }
+
+    return 0;
+}