diff --git a/.github/workflows/oap-mllib-ci.yml b/.github/workflows/oap-mllib-ci.yml index 2c6973321..73700a96d 100644 --- a/.github/workflows/oap-mllib-ci.yml +++ b/.github/workflows/oap-mllib-ci.yml @@ -38,5 +38,4 @@ jobs: source /opt/intel/oneapi/dal/latest/env/vars.sh source /opt/intel/oneapi/tbb/latest/env/vars.sh source /tmp/oneCCL/build/_install/env/setvars.sh - # temp disable and will enable for new release of oneCCL - #./build.sh + ./test.sh diff --git a/.gitignore b/.gitignore index 1d621bdd4..21a5d40c2 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,6 @@ *.o *.log .vscode -*.iml target/ .idea/ .idea_modules/ diff --git a/README.md b/README.md index 477c74b1f..d217becee 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ You can find all the OAP MLlib documents on the [project web page](https://o ### Java/Scala Users Preferred -Use a pre-built OAP MLlib JAR to get started. You can firstly download OAP package from [OAP-JARs-Tarball](https://github.com/Intel-bigdata/OAP/releases/download/v1.0.0-spark-3.0.0/oap-1.0.0-bin-spark-3.0.0.tar.gz) and extract this Tarball to get `oap-mllib-x.x.x-with-spark-x.x.x.jar` under `oap-1.0.0-bin-spark-3.0.0/jars`. +Use a pre-built OAP MLlib JAR to get started. You can first download the OAP package from [OAP-JARs-Tarball](https://github.com/oap-mllib/releases/download/v1.1.0-spark-3.0.0/oap-1.1.0-bin-spark-3.0.0.tar.gz) and extract the tarball to get `oap-mllib-x.x.x-with-spark-x.x.x.jar` under `oap-1.1.0-bin-spark-3.0.0/jars`. Then you can refer to the following [Running](#running) section to try it out. @@ -65,14 +65,6 @@ To use the K-means example for a sanity check, you need to upload a data file to your $ ./run.sh ``` -### Benchmark with HiBench -Use [Hibench](https://github.com/Intel-bigdata/HiBench) to generate dataset with various profiles, and change related variables in `run-XXX.sh` script when applicable. Then run the following commands: ``` - $ cd oap-mllib/examples/kmeans-hibench - $ ./build.sh - $ ./run-hibench-oap-mllib.sh -``` - ### PySpark Support As PySpark-based applications call their Scala counterparts, they are supported out of the box. An example can be found in the [Examples](#examples) section. @@ -95,7 +87,7 @@ Intel® oneAPI Toolkits and its components can be downloaded and installed from [h More details about oneAPI can be found [here](https://software.intel.com/content/www/us/en/develop/tools/oneapi.html). -You can also refer to [this script and comments in it](https://github.com/Intel-bigdata/OAP/blob/branch-1.0-spark-3.x/oap-mllib/dev/install-build-deps-centos.sh) to install correct oneAPI version and manually setup the environments. +You can also refer to [this script and comments in it](https://github.com/oap-project/oap-mllib/blob/branch-1.1-spark-3.x/dev/install-build-deps-centos.sh) to install the correct oneAPI version and manually set up the environment. Scala and Java dependency descriptions are already included in the Maven POM file. 
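To illustrate the README's point that applications use the stock spark.ml API (with PySpark calling its Scala counterparts), here is a minimal Scala sketch, not part of this PR, mirroring the repository's K-means examples. The object name and the data path argument are placeholders; acceleration is assumed to come from putting the oap-mllib JAR on the driver and executor classpaths via `--jars` and the `extraClassPath` settings, as the run scripts in this diff do.

```
// Standard spark.ml K-Means job; no OAP MLlib-specific calls are needed.
import org.apache.spark.ml.clustering.KMeans
import org.apache.spark.sql.SparkSession

object KMeansQuickStart {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.appName("KMeansQuickStart").getOrCreate()

    // Load a libsvm-format file (e.g. Spark's sample_kmeans_data.txt on HDFS),
    // passed as the first command-line argument.
    val dataset = spark.read.format("libsvm").load(args(0))

    // Train with the plain spark.ml API.
    val model = new KMeans().setK(2).setSeed(1L).fit(dataset)

    println("Cluster Centers:")
    model.clusterCenters.foreach(println)

    spark.stop()
  }
}
```

Submitted with `spark-submit --jars oap-mllib-x.x.x.jar` plus the matching `spark.driver.extraClassPath`/`spark.executor.extraClassPath` settings, the same code is expected to run unchanged with or without the accelerated backend.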
@@ -138,7 +130,7 @@ CCL_ROOT | Path to oneCCL home directory We suggest you to source `setvars.sh` script into current shell to setup building environments as following: ``` - $ source /opt/intel/inteloneapi/setvars.sh + $ source /opt/intel/oneapi/setvars.sh $ source /your/oneCCL_source_code/build/_install/env/setvars.sh ``` @@ -160,8 +152,11 @@ Example | Description ----------------|--------------------------- kmeans | K-means example for Scala kmeans-pyspark | K-means example for PySpark -kmeans-hibench | Use HiBench-generated input dataset to benchmark K-means performance +pca | PCA example for Scala +pca-pyspark | PCA example for PySpark ## List of Accelerated Algorithms * K-Means (CPU, Experimental) +* PCA (CPU, Experimental) + diff --git a/dev/install-build-deps-centos.sh b/dev/install-build-deps-centos.sh index 8a347fdef..7b27736ae 100755 --- a/dev/install-build-deps-centos.sh +++ b/dev/install-build-deps-centos.sh @@ -23,7 +23,7 @@ cd /tmp rm -rf oneCCL git clone https://github.com/oneapi-src/oneCCL cd oneCCL -git checkout 2021.1 +git checkout beta08 mkdir -p build && cd build cmake .. make -j 2 install diff --git a/dev/install-build-deps-ubuntu.sh b/dev/install-build-deps-ubuntu.sh index d43e35b89..07019b834 100755 --- a/dev/install-build-deps-ubuntu.sh +++ b/dev/install-build-deps-ubuntu.sh @@ -17,7 +17,7 @@ echo "Building oneCCL ..." cd /tmp git clone https://github.com/oneapi-src/oneCCL cd oneCCL -git checkout 2021.1 +git checkout beta08 mkdir build && cd build cmake .. make -j 2 install diff --git a/dev/test-cluster/config-ssh.sh b/dev/test-cluster/config-ssh.sh deleted file mode 100755 index d093fa17a..000000000 --- a/dev/test-cluster/config-ssh.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/usr/bin/env bash - -ssh-keygen -q -N "" -t rsa -f ~/.ssh/id_rsa -cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys -echo " StrictHostKeyChecking no " | sudo tee -a /etc/ssh/ssh_config -sudo service ssh restart diff --git a/dev/test-cluster/core-site.xml b/dev/test-cluster/core-site.xml deleted file mode 100644 index 7016e477e..000000000 --- a/dev/test-cluster/core-site.xml +++ /dev/null @@ -1,24 +0,0 @@ - - - - - - - - - fs.default.name - hdfs://localhost:8020 - - diff --git a/dev/test-cluster/envs.sh b/dev/test-cluster/envs.sh deleted file mode 100644 index 71e8506e6..000000000 --- a/dev/test-cluster/envs.sh +++ /dev/null @@ -1,22 +0,0 @@ -# Set user Spark and Hadoop home directory -export HADOOP_HOME=~/opt/hadoop-2.7.7 -export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop -export SPARK_HOME=~/opt/spark-3.0.0-bin-hadoop2.7 - -export PYTHONPATH=$SPARK_HOME/python:$PYTHONPATH -export PYSPARK_PYTHON=python3 - -# Set user HDFS Root -export HDFS_ROOT=hdfs://localhost:8020 -export OAP_MLLIB_DATA_ROOT=OAPMLlib/Data -# Set user Intel MLlib Root directory -export OAP_MLLIB_ROOT=${GITHUB_WORKSPACE} - -# Target jar built -OAP_MLLIB_JAR_NAME=oap-mllib-1.1.0.jar -OAP_MLLIB_JAR=$OAP_MLLIB_ROOT/mllib-dal/target/$OAP_MLLIB_JAR_NAME - -# Use absolute path -SPARK_DRIVER_CLASSPATH=$OAP_MLLIB_JAR -# Use relative path -SPARK_EXECUTOR_CLASSPATH=./$OAP_MLLIB_JAR_NAME diff --git a/dev/test-cluster/hadoop-env.sh b/dev/test-cluster/hadoop-env.sh deleted file mode 100755 index bee6c1f69..000000000 --- a/dev/test-cluster/hadoop-env.sh +++ /dev/null @@ -1,99 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Set Hadoop-specific environment variables here. - -# The only required environment variable is JAVA_HOME. All others are -# optional. When running a distributed configuration it is best to -# set JAVA_HOME in this file, so that it is correctly defined on -# remote nodes. - -# The java implementation to use. -# export JAVA_HOME=${JAVA_HOME} -export JAVA_HOME=/usr/local/lib/jvm/openjdk8 - -# The jsvc implementation to use. Jsvc is required to run secure datanodes -# that bind to privileged ports to provide authentication of data transfer -# protocol. Jsvc is not required if SASL is configured for authentication of -# data transfer protocol using non-privileged ports. -#export JSVC_HOME=${JSVC_HOME} - -export HADOOP_CONF_DIR=${HADOOP_CONF_DIR:-"/etc/hadoop"} - -# Extra Java CLASSPATH elements. Automatically insert capacity-scheduler. -for f in $HADOOP_HOME/contrib/capacity-scheduler/*.jar; do - if [ "$HADOOP_CLASSPATH" ]; then - export HADOOP_CLASSPATH=$HADOOP_CLASSPATH:$f - else - export HADOOP_CLASSPATH=$f - fi -done - -# The maximum amount of heap to use, in MB. Default is 1000. -#export HADOOP_HEAPSIZE= -#export HADOOP_NAMENODE_INIT_HEAPSIZE="" - -# Extra Java runtime options. Empty by default. -export HADOOP_OPTS="$HADOOP_OPTS -Djava.net.preferIPv4Stack=true" - -# Command specific options appended to HADOOP_OPTS when specified -export HADOOP_NAMENODE_OPTS="-Dhadoop.security.logger=${HADOOP_SECURITY_LOGGER:-INFO,RFAS} -Dhdfs.audit.logger=${HDFS_AUDIT_LOGGER:-INFO,NullAppender} $HADOOP_NAMENODE_OPTS" -export HADOOP_DATANODE_OPTS="-Dhadoop.security.logger=ERROR,RFAS $HADOOP_DATANODE_OPTS" - -export HADOOP_SECONDARYNAMENODE_OPTS="-Dhadoop.security.logger=${HADOOP_SECURITY_LOGGER:-INFO,RFAS} -Dhdfs.audit.logger=${HDFS_AUDIT_LOGGER:-INFO,NullAppender} $HADOOP_SECONDARYNAMENODE_OPTS" - -export HADOOP_NFS3_OPTS="$HADOOP_NFS3_OPTS" -export HADOOP_PORTMAP_OPTS="-Xmx512m $HADOOP_PORTMAP_OPTS" - -# The following applies to multiple commands (fs, dfs, fsck, distcp etc) -export HADOOP_CLIENT_OPTS="-Xmx512m $HADOOP_CLIENT_OPTS" -#HADOOP_JAVA_PLATFORM_OPTS="-XX:-UsePerfData $HADOOP_JAVA_PLATFORM_OPTS" - -# On secure datanodes, user to run the datanode as after dropping privileges. -# This **MUST** be uncommented to enable secure HDFS if using privileged ports -# to provide authentication of data transfer protocol. This **MUST NOT** be -# defined if SASL is configured for authentication of data transfer protocol -# using non-privileged ports. -export HADOOP_SECURE_DN_USER=${HADOOP_SECURE_DN_USER} - -# Where log files are stored. $HADOOP_HOME/logs by default. -#export HADOOP_LOG_DIR=${HADOOP_LOG_DIR}/$USER - -# Where log files are stored in the secure data environment. -export HADOOP_SECURE_DN_LOG_DIR=${HADOOP_LOG_DIR}/${HADOOP_HDFS_USER} - -### -# HDFS Mover specific parameters -### -# Specify the JVM options to be used when starting the HDFS Mover. 
-# These options will be appended to the options specified as HADOOP_OPTS -# and therefore may override any similar flags set in HADOOP_OPTS -# -# export HADOOP_MOVER_OPTS="" - -### -# Advanced Users Only! -### - -# The directory where pid files are stored. /tmp by default. -# NOTE: this should be set to a directory that can only be written to by -# the user that will run the hadoop daemons. Otherwise there is the -# potential for a symlink attack. -export HADOOP_PID_DIR=${HADOOP_PID_DIR} -export HADOOP_SECURE_DN_PID_DIR=${HADOOP_PID_DIR} - -# A string representing this instance of hadoop. $USER by default. -export HADOOP_IDENT_STRING=$USER diff --git a/dev/test-cluster/hdfs-site.xml b/dev/test-cluster/hdfs-site.xml deleted file mode 100644 index 40fcbb5d6..000000000 --- a/dev/test-cluster/hdfs-site.xml +++ /dev/null @@ -1,32 +0,0 @@ - - - - - - - - - dfs.replication - 1 - - - dfs.namenode.name.dir - /tmp/run/hdfs/namenode - - - dfs.datanode.data.dir - /tmp/run/hdfs/datanode - - diff --git a/dev/test-cluster/setup-cluster.sh b/dev/test-cluster/setup-cluster.sh deleted file mode 100755 index eea058f80..000000000 --- a/dev/test-cluster/setup-cluster.sh +++ /dev/null @@ -1,42 +0,0 @@ -#!/usr/bin/env bash - -WORK_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" - -cd $WORK_DIR - -echo JAVA_HOME is $JAVA_HOME - -mkdir ~/opt -cd ~/opt -wget https://archive.apache.org/dist/spark/spark-3.0.0/spark-3.0.0-bin-hadoop2.7.tgz -tar -xzf spark-3.0.0-bin-hadoop2.7.tgz -wget https://archive.apache.org/dist/hadoop/core/hadoop-2.7.7/hadoop-2.7.7.tar.gz -tar -xzf hadoop-2.7.7.tar.gz - -cd $WORK_DIR - -cp ./core-site.xml ~/opt/hadoop-2.7.7/etc/hadoop/ -cp ./hdfs-site.xml ~/opt/hadoop-2.7.7/etc/hadoop/ -cp ./yarn-site.xml ~/opt/hadoop-2.7.7/etc/hadoop/ -cp ./hadoop-env.sh ~/opt/hadoop-2.7.7/etc/hadoop/ -cp ./spark-defaults.conf ~/opt/spark-3.0.0-bin-hadoop2.7/conf - -# create directories -mkdir -p /tmp/run/hdfs/namenode -mkdir -p /tmp/run/hdfs/datanode - -# hdfs format -~/opt/hadoop-2.7.7/bin/hdfs namenode -format - -export HADOOP_HOME=~/opt/hadoop-2.7.7 -export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop -export SPARK_HOME=~/opt/spark-3.0.0-bin-hadoop2.7 - -export PATH=$HADOOP_HOME/bin:$SPARK_HOME/bin:$PATH - -# start hdfs and yarn -$HADOOP_HOME/sbin/start-dfs.sh -$HADOOP_HOME/sbin/start-yarn.sh - -hadoop fs -ls / -yarn node -list diff --git a/dev/test-cluster/setup-python3-env.sh b/dev/test-cluster/setup-python3-env.sh deleted file mode 100755 index 29208dc5e..000000000 --- a/dev/test-cluster/setup-python3-env.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/usr/bin/env bash - -sudo apt-get update -sudo apt-get install python3-pip python3-setuptools python3-wheel - -pip3 install --user numpy - -echo python is in $(which python) -python --version - -echo python3 is in $(which python3) -python3 --version diff --git a/dev/test-cluster/spark-defaults.conf b/dev/test-cluster/spark-defaults.conf deleted file mode 100644 index 1c25bb2ec..000000000 --- a/dev/test-cluster/spark-defaults.conf +++ /dev/null @@ -1,34 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Default system properties included when running spark-submit. -# This is useful for setting default environmental settings. - -# Example: -# spark.master spark://master:7077 -# spark.eventLog.enabled true -# spark.eventLog.dir hdfs://namenode:8021/directory -# spark.serializer org.apache.spark.serializer.KryoSerializer -# spark.driver.memory 5g -# spark.executor.extraJavaOptions -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three" - -spark.master yarn -spark.serializer org.apache.spark.serializer.KryoSerializer -spark.driver.memory 3g -spark.executor.num 2 -spark.executor.cores 1 -spark.executor.memory 4g diff --git a/dev/test-cluster/workloads/kmeans-pyspark.py b/dev/test-cluster/workloads/kmeans-pyspark.py deleted file mode 100644 index cf93e6034..000000000 --- a/dev/test-cluster/workloads/kmeans-pyspark.py +++ /dev/null @@ -1,70 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -""" -An example demonstrating k-means clustering. -Run with: - bin/spark-submit examples/src/main/python/ml/kmeans_example.py - -This example requires NumPy (http://www.numpy.org/). -""" -from __future__ import print_function -import sys - -# $example on$ -from pyspark.ml.clustering import KMeans -from pyspark.ml.evaluation import ClusteringEvaluator -# $example off$ - -from pyspark.sql import SparkSession - -if __name__ == "__main__": - spark = SparkSession\ - .builder\ - .appName("KMeansExample")\ - .getOrCreate() - - if (len(sys.argv) != 2) : - println("Require data file path as input parameter") - sys.exit(1) - - # $example on$ - # Loads data. - dataset = spark.read.format("libsvm").load(sys.argv[1]) - - # Trains a k-means model. - kmeans = KMeans().setK(2).setSeed(1) - model = kmeans.fit(dataset) - - # Make predictions - predictions = model.transform(dataset) - - # Evaluate clustering by computing Silhouette score - evaluator = ClusteringEvaluator() - - silhouette = evaluator.evaluate(predictions) - print("Silhouette with squared euclidean distance = " + str(silhouette)) - - # Shows the result. 
- centers = model.clusterCenters() - print("Cluster Centers: ") - for center in centers: - print(center) - # $example off$ - - spark.stop() - diff --git a/dev/test-cluster/workloads/run-kmeans-pyspark.sh b/dev/test-cluster/workloads/run-kmeans-pyspark.sh deleted file mode 100755 index e07f3f7b6..000000000 --- a/dev/test-cluster/workloads/run-kmeans-pyspark.sh +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/env bash - -source ../envs.sh - -# Data file is from Spark Examples (data/mllib/sample_kmeans_data.txt), the data file should be copied to HDFS -$HADOOP_HOME/bin/hadoop fs -mkdir -p $OAP_MLLIB_DATA_ROOT -$HADOOP_HOME/bin/hadoop fs -copyFromLocal $SPARK_HOME/data/mllib/sample_kmeans_data.txt $OAP_MLLIB_DATA_ROOT - -# User should check the requested resources are acturally allocated by cluster manager or Intel MLlib will behave incorrectly -SPARK_MASTER=yarn -SPARK_DRIVER_MEMORY=1G -SPARK_NUM_EXECUTORS=2 -SPARK_EXECUTOR_CORES=1 -SPARK_EXECUTOR_MEMORY=1G - -SPARK_DEFAULT_PARALLELISM=$(expr $SPARK_NUM_EXECUTORS '*' $SPARK_EXECUTOR_CORES '*' 2) - -# ======================================================= # - -# Check env -if [[ -z $SPARK_HOME ]]; then - echo SPARK_HOME not defined! - exit 1 -fi - -if [[ -z $HADOOP_HOME ]]; then - echo HADOOP_HOME not defined! - exit 1 -fi - -APP_PY="$OAP_MLLIB_ROOT/dev/test-cluster/workloads/kmeans-pyspark.py" -DATA_FILE=$OAP_MLLIB_DATA_ROOT/sample_kmeans_data.txt - -$SPARK_HOME/bin/spark-submit --master $SPARK_MASTER -v \ - --num-executors $SPARK_NUM_EXECUTORS \ - --driver-memory $SPARK_DRIVER_MEMORY \ - --executor-cores $SPARK_EXECUTOR_CORES \ - --executor-memory $SPARK_EXECUTOR_MEMORY \ - --conf "spark.serializer=org.apache.spark.serializer.KryoSerializer" \ - --conf "spark.default.parallelism=$SPARK_DEFAULT_PARALLELISM" \ - --conf "spark.sql.shuffle.partitions=$SPARK_DEFAULT_PARALLELISM" \ - --conf "spark.driver.extraClassPath=$SPARK_DRIVER_CLASSPATH" \ - --conf "spark.executor.extraClassPath=$SPARK_EXECUTOR_CLASSPATH" \ - --conf "spark.shuffle.reduceLocality.enabled=false" \ - --conf "spark.network.timeout=1200s" \ - --conf "spark.task.maxFailures=1" \ - --jars $OAP_MLLIB_JAR \ - $APP_PY $DATA_FILE diff --git a/dev/test-cluster/yarn-site.xml b/dev/test-cluster/yarn-site.xml deleted file mode 100644 index ff74d23a7..000000000 --- a/dev/test-cluster/yarn-site.xml +++ /dev/null @@ -1,67 +0,0 @@ - - - - - - yarn.nodemanager.aux-services - mapreduce_shuffle - - - yarn.nodemanager.aux-services.mapreduce.shuffle.class - org.apache.hadoop.mapred.ShuffleHandler - - - yarn.resourcemanager.hostname - localhost - - - yarn.resourcemanager.address - localhost:8032 - - - - yarn.nodemanager.resource.memory-mb - 7168 - - - yarn.nodemanager.resource.cpu-vcores - 2 - - - yarn.nodemanager.vmem-check-enabled - false - - - yarn.nodemanager.vmem-pmem-ratio - 2 - - - yarn.scheduler.minimum-allocation-mb - 1024 - - - yarn.scheduler.maximum-allocation-mb - 7168 - - - yarn.scheduler.minimum-allocation-vcores - 1 - - - yarn.scheduler.maximum-allocation-vcores - 2 - - - diff --git a/examples/als-hibench/build.sh b/examples/als-hibench/build.sh deleted file mode 100755 index 8cbc692be..000000000 --- a/examples/als-hibench/build.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/usr/bin/env bash - -mvn clean package \ No newline at end of file diff --git a/examples/als-hibench/pom.xml b/examples/als-hibench/pom.xml deleted file mode 100644 index 68e02c256..000000000 --- a/examples/als-hibench/pom.xml +++ /dev/null @@ -1,100 +0,0 @@ - - 4.0.0 - - com.intel.oap - oap-mllib-examples - 
0.9.0-with-spark-3.0.0 - jar - - ALSHiBenchExample - https://github.com/Intel-bigdata/OAP - - - UTF-8 - 2.12.10 - 2.12 - 3.0.0 - - - - - - org.scala-lang - scala-library - 2.12.10 - - - - com.github.scopt - scopt_2.12 - 3.7.0 - - - - - - - - - - - org.apache.spark - spark-sql_2.12 - ${spark.version} - provided - - - - org.apache.spark - spark-mllib_2.12 - ${spark.version} - provided - - - - - - - - org.scala-tools - maven-scala-plugin - 2.15.2 - - - - compile - testCompile - - - - - ${scala.version} - - -target:jvm-1.8 - - - - - maven-assembly-plugin - 3.0.0 - - false - - jar-with-dependencies - - - - - assembly - package - - single - - - - - - - - diff --git a/examples/als-hibench/run-hibench-oap-mllib.sh b/examples/als-hibench/run-hibench-oap-mllib.sh deleted file mode 100755 index 050b80558..000000000 --- a/examples/als-hibench/run-hibench-oap-mllib.sh +++ /dev/null @@ -1,73 +0,0 @@ -#!/usr/bin/env bash - -export HDFS_ROOT=hdfs://sr591:8020 -export OAP_MLLIB_ROOT=/home/xiaochang/Works/OAP-xwu99-als/oap-mllib - -SPARK_MASTER=yarn -SPARK_DRIVER_MEMORY=16G -SPARK_NUM_EXECUTORS=6 -SPARK_EXECUTOR_CORES=28 -SPARK_EXECUTOR_MEMORY_OVERHEAD=25G -SPARK_EXECUTOR_MEMORY=100G - -SPARK_DEFAULT_PARALLELISM=$(expr $SPARK_NUM_EXECUTORS '*' $SPARK_EXECUTOR_CORES '*' 2) -#SPARK_DEFAULT_PARALLELISM=$(expr $SPARK_NUM_EXECUTORS '*' $SPARK_EXECUTOR_CORES) - -# ======================================================= # - -# for log suffix -SUFFIX=$( basename -s .sh "${BASH_SOURCE[0]}" ) - -# Check envs -if [[ -z $SPARK_HOME ]]; then - echo SPARK_HOME not defined! - exit 1 -fi - -if [[ -z $HADOOP_HOME ]]; then - echo HADOOP_HOME not defined! - exit 1 -fi - -export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop - -# Target jar built -OAP_MLLIB_JAR_NAME=oap-mllib-0.9.0-with-spark-3.0.0.jar -OAP_MLLIB_JAR=$OAP_MLLIB_ROOT/mllib-dal/target/$OAP_MLLIB_JAR_NAME - -# Use absolute path -SPARK_DRIVER_CLASSPATH=$OAP_MLLIB_JAR -# Use relative path -SPARK_EXECUTOR_CLASSPATH=./$OAP_MLLIB_JAR_NAME - -APP_JAR=target/oap-mllib-examples-0.9.0-with-spark-3.0.0.jar -APP_CLASS=com.intel.hibench.sparkbench.ml.ALSExample - -HDFS_INPUT=hdfs://sr591:8020/HiBench/ALS/Input -RANK=10 -NUM_ITERATIONS=1 -LAMBDA=0.1 -IMPLICIT=true - -/usr/bin/time -p $SPARK_HOME/bin/spark-submit --master $SPARK_MASTER -v \ - --num-executors $SPARK_NUM_EXECUTORS \ - --driver-memory $SPARK_DRIVER_MEMORY \ - --executor-cores $SPARK_EXECUTOR_CORES \ - --executor-memory $SPARK_EXECUTOR_MEMORY \ - --conf "spark.serializer=org.apache.spark.serializer.KryoSerializer" \ - --conf "spark.default.parallelism=$SPARK_DEFAULT_PARALLELISM" \ - --conf "spark.sql.shuffle.partitions=$SPARK_DEFAULT_PARALLELISM" \ - --conf "spark.driver.extraClassPath=$SPARK_DRIVER_CLASSPATH" \ - --conf "spark.executor.extraClassPath=$SPARK_EXECUTOR_CLASSPATH" \ - --conf "spark.shuffle.reduceLocality.enabled=false" \ - --conf "spark.executor.memoryOverhead=$SPARK_EXECUTOR_MEMORY_OVERHEAD" \ - --conf "spark.network.timeout=1200s" \ - --conf "spark.task.maxFailures=1" \ - --jars $OAP_MLLIB_JAR \ - --class $APP_CLASS \ - $APP_JAR \ - --rank $RANK --numIterations $NUM_ITERATIONS --implicitPrefs $IMPLICIT --lambda $LAMBDA \ - --numProductBlocks $SPARK_DEFAULT_PARALLELISM --numUserBlocks $SPARK_DEFAULT_PARALLELISM \ - $HDFS_INPUT \ - 2>&1 | tee ALS-$SUFFIX-$(date +%m%d_%H_%M_%S).log - diff --git a/examples/als-hibench/run-hibench-vanilla.sh b/examples/als-hibench/run-hibench-vanilla.sh deleted file mode 100755 index 6cb6b3ae7..000000000 --- a/examples/als-hibench/run-hibench-vanilla.sh +++ /dev/null @@ -1,61 
+0,0 @@ -#!/usr/bin/env bash - -export HDFS_ROOT=hdfs://sr591:8020 - -SPARK_MASTER=yarn -SPARK_DRIVER_MEMORY=16G -SPARK_NUM_EXECUTORS=6 -SPARK_EXECUTOR_CORES=28 -SPARK_EXECUTOR_MEMORY_OVERHEAD=25G -SPARK_EXECUTOR_MEMORY=100G - -SPARK_DEFAULT_PARALLELISM=$(expr $SPARK_NUM_EXECUTORS '*' $SPARK_EXECUTOR_CORES '*' 2) - -# ======================================================= # - -# for log suffix -SUFFIX=$( basename -s .sh "${BASH_SOURCE[0]}" ) - -# Check envs -if [[ -z $SPARK_HOME ]]; then - echo SPARK_HOME not defined! - exit 1 -fi - -if [[ -z $HADOOP_HOME ]]; then - echo HADOOP_HOME not defined! - exit 1 -fi - -export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop - -APP_JAR=target/oap-mllib-examples-0.9.0-with-spark-3.0.0.jar -APP_CLASS=com.intel.hibench.sparkbench.ml.ALSExample - -HDFS_INPUT=hdfs://sr591:8020/HiBench/ALS/Input -RANK=10 -NUM_ITERATIONS=1 -LAMBDA=0.1 -IMPLICIT=true - -/usr/bin/time -p $SPARK_HOME/bin/spark-submit --master $SPARK_MASTER -v \ - --num-executors $SPARK_NUM_EXECUTORS \ - --driver-memory $SPARK_DRIVER_MEMORY \ - --executor-cores $SPARK_EXECUTOR_CORES \ - --executor-memory $SPARK_EXECUTOR_MEMORY \ - --conf "spark.serializer=org.apache.spark.serializer.KryoSerializer" \ - --conf "spark.default.parallelism=$SPARK_DEFAULT_PARALLELISM" \ - --conf "spark.sql.shuffle.partitions=$SPARK_DEFAULT_PARALLELISM" \ - --conf "spark.driver.extraClassPath=$SPARK_DRIVER_CLASSPATH" \ - --conf "spark.executor.extraClassPath=$SPARK_EXECUTOR_CLASSPATH" \ - --conf "spark.shuffle.reduceLocality.enabled=false" \ - --conf "spark.executor.memoryOverhead=$SPARK_EXECUTOR_MEMORY_OVERHEAD" \ - --conf "spark.network.timeout=1200s" \ - --conf "spark.task.maxFailures=1" \ - --class $APP_CLASS \ - $APP_JAR \ - --rank $RANK --numIterations $NUM_ITERATIONS --implicitPrefs $IMPLICIT --lambda $LAMBDA \ - --numProductBlocks $SPARK_DEFAULT_PARALLELISM --numUserBlocks $SPARK_DEFAULT_PARALLELISM \ - $HDFS_INPUT \ - 2>&1 | tee ALS-$SUFFIX-$(date +%m%d_%H_%M_%S).log - diff --git a/examples/als-hibench/src/main/scala/com/intel/hibench/sparkbench/ml/ALSExample.scala b/examples/als-hibench/src/main/scala/com/intel/hibench/sparkbench/ml/ALSExample.scala deleted file mode 100644 index 5a29bcc80..000000000 --- a/examples/als-hibench/src/main/scala/com/intel/hibench/sparkbench/ml/ALSExample.scala +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package com.intel.hibench.sparkbench.ml - -import org.apache.spark.ml.evaluation.RegressionEvaluator -import org.apache.spark.ml.recommendation.ALS -import org.apache.spark.ml.recommendation.ALS.Rating -import org.apache.spark.sql.SparkSession -import scopt.OptionParser - -object ALSExample { - - case class Params( - dataPath: String = null, - numIterations: Int = 10, - lambda: Double = 0.1, - rank: Int = 10, - numUserBlocks: Int = 10, - numItemBlocks: Int = 10, - implicitPrefs: Boolean = false) - - def main(args: Array[String]) { - val defaultParams = Params() - - val parser = new OptionParser[Params]("ALS") { - head("ALS: an example app for ALS on User-Item data.") - opt[Int]("rank") - .text(s"rank, default: ${defaultParams.rank}") - .action((x, c) => c.copy(rank = x)) - opt[Int]("numIterations") - .text(s"number of iterations, default: ${defaultParams.numIterations}") - .action((x, c) => c.copy(numIterations = x)) - opt[Double]("lambda") - .text(s"regularization parameter, default: ${defaultParams.lambda}") - .action((x, c) => c.copy(lambda = x)) - opt[Int]("numUserBlocks") - .text(s"number of user blocks, default: ${defaultParams.numUserBlocks}") - .action((x, c) => c.copy(numUserBlocks = x)) - opt[Int]("numProductBlocks") - .text(s"number of product blocks, default: ${defaultParams.numItemBlocks}") - .action((x, c) => c.copy(numItemBlocks = x)) - opt[Boolean]("implicitPrefs") - .text("implicit preference, default: ${defaultParams.implicitPrefs}") - .action((x, c) => c.copy(implicitPrefs = x)) - arg[String]("") - .required() - .text("Input paths to a User-Product dataset of ratings") - .action((x, c) => c.copy(dataPath = x)) - } - parser.parse(args, defaultParams) match { - case Some(params) => run(params) - case _ => sys.exit(1) - } - } - - def run(params: Params): Unit = { - val spark = SparkSession - .builder - .appName(s"ALS with $params") - .getOrCreate() - val sc = spark.sparkContext - - import spark.implicits._ - - val ratings = sc.objectFile[Rating[Int]](params.dataPath).toDF() - - val Array(training, test) = ratings.randomSplit(Array(0.8, 0.2), 1L) - - // Build the recommendation model using ALS on the training data - val als = new ALS() - .setRank(params.rank) - .setMaxIter(params.numIterations) - .setRegParam(params.lambda) - .setImplicitPrefs(params.implicitPrefs) - .setNumUserBlocks(params.numUserBlocks) - .setNumItemBlocks(params.numItemBlocks) - .setUserCol("user") - .setItemCol("item") - .setRatingCol("rating") - val model = als.fit(training) - - // Evaluate the model by computing the RMSE on the test data - // Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics - model.setColdStartStrategy("drop") - val predictions = model.transform(test) - - val evaluator = new RegressionEvaluator() - .setMetricName("rmse") - .setLabelCol("rating") - .setPredictionCol("prediction") - val rmse = evaluator.evaluate(predictions) - println(s"Root-mean-square error = $rmse") - - spark.stop() - } -} diff --git a/examples/als-pyspark/als-pyspark.py b/examples/als-pyspark/als-pyspark.py deleted file mode 100644 index 8847ca2b9..000000000 --- a/examples/als-pyspark/als-pyspark.py +++ /dev/null @@ -1,67 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. 
-# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from __future__ import print_function - -import sys -if sys.version >= '3': - long = int - -from pyspark.sql import SparkSession - -# $example on$ -from pyspark.ml.evaluation import RegressionEvaluator -from pyspark.ml.recommendation import ALS -from pyspark.sql import Row -# $example off$ - -if __name__ == "__main__": - spark = SparkSession\ - .builder\ - .appName("ALSExample")\ - .getOrCreate() - - if (len(sys.argv) != 2) : - print("Require data file path as input parameter") - sys.exit(1) - - # $example on$ - lines = spark.read.text(sys.argv[1]).rdd - parts = lines.map(lambda row: row.value.split("::")) - ratingsRDD = parts.map(lambda p: Row(userId=int(p[0]), movieId=int(p[1]), - rating=float(p[2]))) - ratings = spark.createDataFrame(ratingsRDD) - # (training, test) = ratings.randomSplit([0.8, 0.2]) - - # Build the recommendation model using ALS on the training data - # Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics - als = ALS(rank=10, maxIter=5, regParam=0.01, implicitPrefs=True, alpha=40.0, - userCol="userId", itemCol="movieId", ratingCol="rating", - coldStartStrategy="drop") - print("\nALS training with implicitPrefs={}, rank={}, maxIter={}, regParam={}, alpha={}, seed={}\n".format( - als.getImplicitPrefs(), als.getRank(), als.getMaxIter(), als.getRegParam(), als.getAlpha(), als.getSeed() - )) - model = als.fit(ratings) - - # Evaluate the model by computing the RMSE on the test data - # predictions = model.transform(test) - # evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", - # predictionCol="prediction") - # rmse = evaluator.evaluate(predictions) - # print("Root-mean-square error = " + str(rmse)) - - spark.stop() diff --git a/examples/als-pyspark/run.sh b/examples/als-pyspark/run.sh deleted file mode 100755 index b3ba1b6d2..000000000 --- a/examples/als-pyspark/run.sh +++ /dev/null @@ -1,75 +0,0 @@ -#!/usr/bin/env bash - -# == User to customize the following environments ======= # - -# Set user Spark and Hadoop home directory -#export SPARK_HOME=/path/to/your/spark/home -#export HADOOP_HOME=/path/to/your/hadoop/home -# Set user HDFS Root -export HDFS_ROOT=hdfs://sr549:8020 -# Set user Intel MLlib Root directory -export OAP_MLLIB_ROOT=/home/xiaochang/Works/OAP-xwu99-als/oap-mllib -# Set IP and Port for oneCCL KVS, you can select any one of the worker nodes and set CCL_KVS_IP_PORT to its IP and Port -# IP can be got with `hostname -I`, if multiple IPs are returned, the first IP should be used. Port can be any available port. -# For example, if one of the worker IP is 192.168.0.1 and an available port is 51234. 
-# CCL_KVS_IP_PORT can be set in the format of 192.168.0.1_51234 -# Incorrectly setting this value will result in hanging when oneCCL initialize -export CCL_KVS_IP_PORT=10.0.2.149_51234 - -# Data file is from Spark Examples (data/mllib/sample_kmeans_data.txt), the data file should be copied to HDFS -DATA_FILE=data/onedal_als_csr_ratings.txt - -# == User to customize Spark executor cores and memory == # - -# User should check the requested resources are acturally allocated by cluster manager or Intel MLlib will behave incorrectly -SPARK_MASTER=yarn -SPARK_DRIVER_MEMORY=1G -SPARK_NUM_EXECUTORS=2 -SPARK_EXECUTOR_CORES=1 -SPARK_EXECUTOR_MEMORY=1G - -SPARK_DEFAULT_PARALLELISM=$(expr $SPARK_NUM_EXECUTORS '*' $SPARK_EXECUTOR_CORES '*' 2) - -# ======================================================= # - -# Check env -if [[ -z $SPARK_HOME ]]; then - echo SPARK_HOME not defined! - exit 1 -fi - -if [[ -z $HADOOP_HOME ]]; then - echo HADOOP_HOME not defined! - exit 1 -fi - -export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop - -# Target jar built -OAP_MLLIB_JAR_NAME=oap-mllib-0.9.0-with-spark-3.0.0.jar -OAP_MLLIB_JAR=$OAP_MLLIB_ROOT/mllib-dal/target/$OAP_MLLIB_JAR_NAME - -# Use absolute path -SPARK_DRIVER_CLASSPATH=$OAP_MLLIB_JAR -# Use relative path -SPARK_EXECUTOR_CLASSPATH=./$OAP_MLLIB_JAR_NAME - -APP_PY=als-pyspark.py - -/usr/bin/time -p $SPARK_HOME/bin/spark-submit --master $SPARK_MASTER -v \ - --num-executors $SPARK_NUM_EXECUTORS \ - --driver-memory $SPARK_DRIVER_MEMORY \ - --executor-cores $SPARK_EXECUTOR_CORES \ - --executor-memory $SPARK_EXECUTOR_MEMORY \ - --conf "spark.serializer=org.apache.spark.serializer.KryoSerializer" \ - --conf "spark.default.parallelism=$SPARK_DEFAULT_PARALLELISM" \ - --conf "spark.sql.shuffle.partitions=$SPARK_DEFAULT_PARALLELISM" \ - --conf "spark.driver.extraClassPath=$SPARK_DRIVER_CLASSPATH" \ - --conf "spark.executor.extraClassPath=$SPARK_EXECUTOR_CLASSPATH" \ - --conf "spark.executorEnv.CCL_KVS_IP_PORT=$CCL_KVS_IP_PORT" \ - --conf "spark.shuffle.reduceLocality.enabled=false" \ - --conf "spark.network.timeout=1200s" \ - --conf "spark.task.maxFailures=1" \ - --jars $OAP_MLLIB_JAR \ - $APP_PY $DATA_FILE \ - 2>&1 | tee ALS-$(date +%m%d_%H_%M_%S).log diff --git a/examples/kmeans-hibench/build.sh b/examples/kmeans-hibench/build.sh deleted file mode 100755 index da373645b..000000000 --- a/examples/kmeans-hibench/build.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/usr/bin/env bash - -mvn clean package diff --git a/examples/kmeans-hibench/pom.xml b/examples/kmeans-hibench/pom.xml deleted file mode 100644 index 3f5b56e29..000000000 --- a/examples/kmeans-hibench/pom.xml +++ /dev/null @@ -1,99 +0,0 @@ - - 4.0.0 - - com.intel.oap - oap-mllib-examples - 1.1.0-with-spark-3.0.0 - jar - - KMeansHiBenchExample - https://github.com/oap-project/oap-mllib.git - - - UTF-8 - 2.12.10 - 2.12 - 3.0.0 - - - - - - org.scala-lang - scala-library - 2.12.10 - - - - com.github.scopt - scopt_2.12 - 3.7.0 - - - - org.apache.mahout - mahout-hdfs - 14.1 - - - - org.apache.spark - spark-sql_2.12 - ${spark.version} - provided - - - - org.apache.spark - spark-mllib_2.12 - ${spark.version} - provided - - - - - - - - org.scala-tools - maven-scala-plugin - 2.15.2 - - - - compile - testCompile - - - - - ${scala.version} - - -target:jvm-1.8 - - - - - maven-assembly-plugin - 3.0.0 - - false - - jar-with-dependencies - - - - - assembly - package - - single - - - - - - - - diff --git a/examples/kmeans-hibench/run-hibench-oap-mllib.sh b/examples/kmeans-hibench/run-hibench-oap-mllib.sh deleted file mode 100755 index 
caa42584f..000000000 --- a/examples/kmeans-hibench/run-hibench-oap-mllib.sh +++ /dev/null @@ -1,86 +0,0 @@ -#!/usr/bin/env bash - -# == User to customize the following environments ======= # - -# Set user Spark and Hadoop home directory -export SPARK_HOME=/path/to/your/spark/home -export HADOOP_HOME=/path/to/your/hadoop/home -# Set user HDFS Root -export HDFS_ROOT=hdfs://your_hostname:8020 -# Set user Intel MLlib Root directory -export OAP_MLLIB_ROOT=/path/to/your/OAP/oap-mllib -# Set IP and Port for oneCCL KVS, you can select any one of the worker nodes and set CCL_KVS_IP_PORT to its IP and Port -# IP can be got with `hostname -I`, if multiple IPs are returned, the first IP should be used. Port can be any available port. -# For example, if one of the worker IP is 192.168.0.1 and an available port is 51234. -# CCL_KVS_IP_PORT can be set in the format of 192.168.0.1_51234 -# Incorrectly setting this value will result in hanging when oneCCL initialize -export CCL_KVS_IP_PORT=192.168.0.1_51234 - -# == User to customize Spark executor cores and memory == # - -# User should check the requested resources are acturally allocated by cluster manager or Intel MLlib will behave incorrectly -SPARK_MASTER=yarn -SPARK_DRIVER_MEMORY=8G -SPARK_NUM_EXECUTORS=6 -SPARK_EXECUTOR_CORES=15 -SPARK_EXECUTOR_MEMORY_OVERHEAD=25G -SPARK_EXECUTOR_MEMORY=50G - -SPARK_DEFAULT_PARALLELISM=$(expr $SPARK_NUM_EXECUTORS '*' $SPARK_EXECUTOR_CORES '*' 2) - -# ======================================================= # - -# for log suffix -SUFFIX=$( basename -s .sh "${BASH_SOURCE[0]}" ) - -# Check envs -if [[ -z $SPARK_HOME ]]; then - echo SPARK_HOME not defined! - exit 1 -fi - -if [[ -z $HADOOP_HOME ]]; then - echo HADOOP_HOME not defined! - exit 1 -fi - -export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop - -# Target jar built -OAP_MLLIB_JAR_NAME=oap-mllib-0.9.0-with-spark-3.0.0.jar -OAP_MLLIB_JAR=$OAP_MLLIB_ROOT/mllib-dal/target/$OAP_MLLIB_JAR_NAME - -# Use absolute path -SPARK_DRIVER_CLASSPATH=$OAP_MLLIB_JAR -# Use relative path -SPARK_EXECUTOR_CLASSPATH=./$OAP_MLLIB_JAR_NAME - -APP_JAR=target/oap-mllib-examples-0.9.0-with-spark-3.0.0.jar -APP_CLASS=com.intel.hibench.sparkbench.ml.DenseKMeansDS - -K=200 -INIT_MODE=Random -MAX_ITERATION=20 -INPUT_HDFS=$HDFS_ROOT/HiBench/Kmeans/Input/samples - -/usr/bin/time -p $SPARK_HOME/bin/spark-submit --master $SPARK_MASTER -v \ - --num-executors $SPARK_NUM_EXECUTORS \ - --driver-memory $SPARK_DRIVER_MEMORY \ - --executor-cores $SPARK_EXECUTOR_CORES \ - --executor-memory $SPARK_EXECUTOR_MEMORY \ - --conf "spark.serializer=org.apache.spark.serializer.KryoSerializer" \ - --conf "spark.default.parallelism=$SPARK_DEFAULT_PARALLELISM" \ - --conf "spark.sql.shuffle.partitions=$SPARK_DEFAULT_PARALLELISM" \ - --conf "spark.driver.extraClassPath=$SPARK_DRIVER_CLASSPATH" \ - --conf "spark.executor.extraClassPath=$SPARK_EXECUTOR_CLASSPATH" \ - --conf "spark.executorEnv.CCL_KVS_IP_PORT=$CCL_KVS_IP_PORT" \ - --conf "spark.shuffle.reduceLocality.enabled=false" \ - --conf "spark.executor.memoryOverhead=$SPARK_EXECUTOR_MEMORY_OVERHEAD" \ - --conf "spark.memory.fraction=0.8" \ - --conf "spark.network.timeout=1200s" \ - --conf "spark.task.maxFailures=1" \ - --jars $OAP_MLLIB_JAR \ - --class $APP_CLASS \ - $APP_JAR \ - -k $K --initMode $INIT_MODE --numIterations $MAX_ITERATION $INPUT_HDFS \ - 2>&1 | tee KMeansHiBench-$SUFFIX-$(date +%m%d_%H_%M_%S).log diff --git a/examples/kmeans-hibench/run-hibench-vanilla.sh b/examples/kmeans-hibench/run-hibench-vanilla.sh deleted file mode 100755 index 475c25aff..000000000 
--- a/examples/kmeans-hibench/run-hibench-vanilla.sh +++ /dev/null @@ -1,58 +0,0 @@ -#!/usr/bin/env bash - -# == User to customize the following environments ======= # - -# Set user Spark and Hadoop home directory -export SPARK_HOME=/path/to/your/spark/home -export HADOOP_HOME=/path/to/your/hadoop/home -# Set user HDFS Root -export HDFS_ROOT=hdfs://your_hostname:8020 - -# == User to customize Spark executor cores and memory == # - -SPARK_MASTER=yarn -SPARK_DRIVER_MEMORY=8G -SPARK_NUM_EXECUTORS=6 -SPARK_EXECUTOR_CORES=15 -SPARK_EXECUTOR_MEMORY=75G - -SPARK_DEFAULT_PARALLELISM=$(expr $SPARK_NUM_EXECUTORS '*' $SPARK_EXECUTOR_CORES '*' 2) - -# ======================================================= # - -# for log suffix -SUFFIX=$( basename -s .sh "${BASH_SOURCE[0]}" ) - -# Check envs -if [[ -z $SPARK_HOME ]]; then - echo SPARK_HOME not defined! - exit 1 -fi - -if [[ -z $HADOOP_HOME ]]; then - echo HADOOP_HOME not defined! - exit 1 -fi - -export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop - -APP_JAR=target/oap-mllib-examples-0.9.0-with-spark-3.0.0.jar -APP_CLASS=com.intel.hibench.sparkbench.ml.DenseKMeansDS - -K=200 -INIT_MODE=Random -MAX_ITERATION=20 -INPUT_HDFS=$HDFS_ROOT/HiBench/Kmeans/Input/samples - -/usr/bin/time -p $SPARK_HOME/bin/spark-submit --master $SPARK_MASTER -v \ - --num-executors $SPARK_NUM_EXECUTORS \ - --driver-memory $SPARK_DRIVER_MEMORY \ - --executor-cores $SPARK_EXECUTOR_CORES \ - --executor-memory $SPARK_EXECUTOR_MEMORY \ - --conf "spark.serializer=org.apache.spark.serializer.KryoSerializer" \ - --conf "spark.default.parallelism=$SPARK_DEFAULT_PARALLELISM" \ - --conf "spark.sql.shuffle.partitions=$SPARK_DEFAULT_PARALLELISM" \ - --class $APP_CLASS \ - $APP_JAR \ - -k $K --initMode $INIT_MODE --numIterations $MAX_ITERATION $INPUT_HDFS \ - 2>&1 | tee KMeansHiBench-$SUFFIX-$(date +%m%d_%H_%M_%S).log diff --git a/examples/kmeans-hibench/src/main/scala/com/intel/hibench/sparkbench/ml/DenseKMeansDS.scala b/examples/kmeans-hibench/src/main/scala/com/intel/hibench/sparkbench/ml/DenseKMeansDS.scala deleted file mode 100644 index 3a949bb1c..000000000 --- a/examples/kmeans-hibench/src/main/scala/com/intel/hibench/sparkbench/ml/DenseKMeansDS.scala +++ /dev/null @@ -1,107 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package com.intel.hibench.sparkbench.ml - -import breeze.linalg.DenseVector -import org.apache.hadoop.io.LongWritable -import org.apache.mahout.math.VectorWritable -import org.apache.spark.ml.clustering.KMeans -import org.apache.spark.ml.evaluation.ClusteringEvaluator -import org.apache.spark.ml.linalg.{Vector, Vectors} -import org.apache.spark.{SparkConf, SparkContext} -import org.apache.spark.sql._ -import scopt.OptionParser -import org.apache.spark.sql.SparkSession - -object DenseKMeansDS { - - object InitializationMode extends Enumeration { - type InitializationMode = Value - val Random, Parallel = Value - } - - import com.intel.hibench.sparkbench.ml.DenseKMeansDS.InitializationMode._ - - case class Params(input: String = null, - k: Int = -1, - numIterations: Int = 10, - initializationMode: InitializationMode = Random) - - def main(args: Array[String]) { - val defaultParams = Params() - - val parser = new OptionParser[Params]("DenseKMeans") { - head("DenseKMeans: an example k-means app for dense data.") - opt[Int]('k', "k") - .required() - .text(s"number of clusters, required") - .action((x, c) => c.copy(k = x)) - opt[Int]("numIterations") - .text(s"number of iterations, default; ${defaultParams.numIterations}") - .action((x, c) => c.copy(numIterations = x)) - opt[String]("initMode") - .text(s"initialization mode (${InitializationMode.values.mkString(",")}), " + - s"default: ${defaultParams.initializationMode}") - .action((x, c) => c.copy(initializationMode = InitializationMode.withName(x))) - arg[String]("") - .text("input paths to examples") - .required() - .action((x, c) => c.copy(input = x)) - } - - parser.parse(args, defaultParams).map { params => - run(params) - }.getOrElse { - sys.exit(1) - } - } - - def run(params: Params) { - val spark = SparkSession - .builder - .appName(s"DenseKMeansDS with $params") - .getOrCreate() - import spark.implicits._ - - val sc = spark.sparkContext - - val data = sc.sequenceFile[LongWritable, VectorWritable](params.input) - - // Should use Tuple1 to warp around for calling toDF - val dataset = data.map { case (k, v) => - var vector: Array[Double] = new Array[Double](v.get().size) - for (i <- 0 until v.get().size) vector(i) = v.get().get(i) - Tuple1(Vectors.dense(vector)) - }.toDF("features") - - val initMode = params.initializationMode match { - case Random => "random" - case Parallel => "k-means||" - } - - val model = new KMeans() - .setInitMode(initMode) - .setK(params.k) - .setMaxIter(params.numIterations) - .setSeed(1L) - .fit(dataset) - - spark.stop() - } -} - diff --git a/mllib-dal/pom.xml b/mllib-dal/pom.xml index 4e51f9157..01e002830 100644 --- a/mllib-dal/pom.xml +++ b/mllib-dal/pom.xml @@ -218,12 +218,10 @@ ${env.CCL_ROOT}/lib - - - libmpi.so.12.0.0 + libpmi.so.1 + libresizable_pmi.so.1 libfabric.so.1 - libccl.so - + libccl_atl_ofi.so.1 @@ -273,13 +271,9 @@ ${project.build.testOutputDirectory}/lib/libtbbmalloc.so.2 - ${project.build.testOutputDirectory}/lib/libmpi.so.12.0.0 - ${project.build.testOutputDirectory}/lib/libmpi.so.12 + ${project.build.testOutputDirectory}/lib/libccl_atl_ofi.so.1 + ${project.build.testOutputDirectory}/lib/libccl_atl_ofi.so - - - - diff --git a/mllib-dal/src/assembly/assembly.xml b/mllib-dal/src/assembly/assembly.xml index 498b90e02..137f19b81 100644 --- a/mllib-dal/src/assembly/assembly.xml +++ b/mllib-dal/src/assembly/assembly.xml @@ -58,21 +58,26 @@ - ${env.CCL_ROOT}/lib/libfabric.so.1 + ${env.CCL_ROOT}/lib/libpmi.so.1 lib - ${env.CCL_ROOT}/lib/libmpi.so.12.0.0 + 
${env.CCL_ROOT}/lib/libresizable_pmi.so.1 lib - libmpi.so.12 - ${env.CCL_ROOT}/lib/libccl.so + ${env.CCL_ROOT}/lib//libfabric.so.1 lib ${env.CCL_ROOT}/lib/prov/libsockets-fi.so lib + + + ${env.CCL_ROOT}/lib/libccl_atl_ofi.so.1 + lib + libccl_atl_ofi.so + \ No newline at end of file diff --git a/mllib-dal/src/main/java/org/apache/spark/ml/recommendation/ALSPartitionInfo.java b/mllib-dal/src/main/java/org/apache/spark/ml/recommendation/ALSPartitionInfo.java deleted file mode 100644 index 5f2561772..000000000 --- a/mllib-dal/src/main/java/org/apache/spark/ml/recommendation/ALSPartitionInfo.java +++ /dev/null @@ -1,6 +0,0 @@ -package org.apache.spark.ml.recommendation; - -public class ALSPartitionInfo { - public int ratingsNum; - public int csrRowNum; -} diff --git a/mllib-dal/src/main/java/org/apache/spark/ml/recommendation/ALSResult.java b/mllib-dal/src/main/java/org/apache/spark/ml/recommendation/ALSResult.java deleted file mode 100644 index 67173a0c5..000000000 --- a/mllib-dal/src/main/java/org/apache/spark/ml/recommendation/ALSResult.java +++ /dev/null @@ -1,9 +0,0 @@ -package org.apache.spark.ml.recommendation; - -public class ALSResult { - public long rankId = -1; - public long cUsersFactorsNumTab; - public long cItemsFactorsNumTab; - public long cUserOffset; - public long cItemOffset; -} diff --git a/mllib-dal/src/main/java/org/apache/spark/ml/util/LibLoader.java b/mllib-dal/src/main/java/org/apache/spark/ml/util/LibLoader.java index d8ea09a23..5b51451ae 100644 --- a/mllib-dal/src/main/java/org/apache/spark/ml/util/LibLoader.java +++ b/mllib-dal/src/main/java/org/apache/spark/ml/util/LibLoader.java @@ -21,8 +21,7 @@ import java.io.*; import java.util.UUID; import java.util.logging.Level; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; +import java.util.logging.Logger; import com.intel.daal.utils.LibUtils; @@ -31,7 +30,8 @@ public final class LibLoader { // Make sure loading libraries from different temp directory for each process private final static String subDir = "MLlibDAL_" + UUID.randomUUID(); - private static final Logger log = LoggerFactory.getLogger("LibLoader"); + private static final Logger logger = Logger.getLogger(LibLoader.class.getName()); + private static final Level logLevel = Level.INFO; /** * Get temp dir for exacting lib files @@ -54,23 +54,24 @@ public static synchronized void loadLibraries() throws IOException { /** * Load oneCCL libs in dependency order */ - private static synchronized void loadLibCCL() throws IOException { + public static synchronized void loadLibCCL() throws IOException { + loadFromJar(subDir, "libpmi.so.1"); + loadFromJar(subDir, "libresizable_pmi.so.1"); loadFromJar(subDir, "libfabric.so.1"); - loadFromJar(subDir, "libmpi.so.12"); - loadFromJar(subDir, "libccl.so"); loadFromJar(subDir, "libsockets-fi.so"); + loadFromJar(subDir, "libccl_atl_ofi.so"); } /** * Load MLlibDAL lib, it depends TBB libs that are loaded by oneDAL, * so this function should be called after oneDAL loadLibrary */ - private static synchronized void loadLibMLlibDAL() throws IOException { + public static synchronized void loadLibMLlibDAL() throws IOException { // oneDAL Java API doesn't load correct libtbb version for oneAPI Beta 10 // Rename in pom.xml and assembly.xml to workaround. 
// See https://github.com/oneapi-src/oneDAL/issues/1254 --> LibUtils.loadLibrary(); - + loadFromJar(subDir, "libMLlibDAL.so"); } @@ -81,12 +82,12 @@ private static synchronized void loadLibMLlibDAL() throws IOException { * @param name library name */ private static void loadFromJar(String path, String name) throws IOException { - log.debug("Loading " + name + " ..."); + logger.log(logLevel, "Loading " + name + " ..."); File fileOut = createTempFile(path, name); // File exists already if (fileOut == null) { - log.debug("DONE: Loading library as resource."); + logger.log(logLevel, "DONE: Loading library as resource."); return; } @@ -96,7 +97,7 @@ private static void loadFromJar(String path, String name) throws IOException { } try (OutputStream streamOut = new FileOutputStream(fileOut)) { - log.debug("Writing resource to temp file."); + logger.log(logLevel, "Writing resource to temp file."); byte[] buffer = new byte[32768]; while (true) { @@ -115,7 +116,7 @@ private static void loadFromJar(String path, String name) throws IOException { } System.load(fileOut.toString()); - log.debug("DONE: Loading library as resource."); + logger.log(logLevel, "DONE: Loading library as resource."); } /** diff --git a/mllib-dal/src/main/native/ALSDALImpl.cpp b/mllib-dal/src/main/native/ALSDALImpl.cpp deleted file mode 100644 index 53212dc1d..000000000 --- a/mllib-dal/src/main/native/ALSDALImpl.cpp +++ /dev/null @@ -1,668 +0,0 @@ -#include -#include -#include -#include - -#include "OneCCL.h" -#include "ALSShuffle.h" -#include "org_apache_spark_ml_recommendation_ALSDALImpl.h" -#include "service.h" - -using namespace std; -using namespace daal; -using namespace daal::algorithms; -using namespace daal::algorithms::implicit_als; - -const int ccl_root = 0; - -typedef float algorithmFPType; /* Algorithm floating-point type */ - -NumericTablePtr userOffset; -NumericTablePtr itemOffset; - -// KeyValueDataCollectionPtr userOffsetsOnMaster; -// KeyValueDataCollectionPtr itemOffsetsOnMaster; - -CSRNumericTablePtr dataTable; -CSRNumericTablePtr transposedDataTable; - -KeyValueDataCollectionPtr userStep3LocalInput; -KeyValueDataCollectionPtr itemStep3LocalInput; - -training::DistributedPartialResultStep4Ptr itemsPartialResultLocal; -training::DistributedPartialResultStep4Ptr usersPartialResultLocal; -std::vector itemsPartialResultsMaster; -std::vector usersPartialResultsMaster; - -template -void gather(size_t rankId, ccl::communicator &comm, size_t nBlocks, const ByteBuffer& nodeResults, T* result) { - vector perNodeArchLengthMaster(nBlocks); - size_t perNodeArchLength = nodeResults.size(); - ByteBuffer serializedData; - - vector recv_counts(nBlocks); - for (size_t i = 0; i < nBlocks; i++) recv_counts[i] = sizeof(size_t); - - // MPI_Gather(&perNodeArchLength, sizeof(int), MPI_CHAR, perNodeArchLengthMaster, - // sizeof(int), MPI_CHAR, ccl_root, MPI_COMM_WORLD); - ccl::allgatherv(&perNodeArchLength, sizeof(size_t), perNodeArchLengthMaster.data(), recv_counts, - ccl::datatype::uint8, comm).wait(); - - // should resize for all ranks for ccl_allgatherv - size_t memoryBuf = 0; - for (size_t i = 0; i < nBlocks; i++) { - memoryBuf += perNodeArchLengthMaster[i]; - } - serializedData.resize(memoryBuf); - - std::vector displs(nBlocks); - if (rankId == ccl_root) { - size_t shift = 0; - for (size_t i = 0; i < nBlocks; i++) { - displs[i] = shift; - shift += perNodeArchLengthMaster[i]; - } - } - - /* Transfer partial results to step 2 on the root node */ - // MPI_Gatherv(&nodeResults[0], perNodeArchLength, MPI_CHAR, &serializedData[0], - 
// perNodeArchLengthMaster, displs, MPI_CHAR, ccl_root, - // MPI_COMM_WORLD); - ccl::allgatherv(&nodeResults[0], perNodeArchLength, &serializedData[0], - perNodeArchLengthMaster, ccl::datatype::uint8, comm).wait(); - - if (rankId == ccl_root) { - for (size_t i = 0; i < nBlocks; i++) { - /* Deserialize partial results from step 1 */ - result[i] = result[i]->cast(deserializeDAALObject(&serializedData[0] + displs[i], - perNodeArchLengthMaster[i])); - } - } -} - -// void gatherUsers(const ByteBuffer & nodeResults, int nBlocks) -// { -// size_t perNodeArchLengthMaster[nBlocks]; -// size_t perNodeArchLength = nodeResults.size(); -// ByteBuffer serializedData; -// size_t recv_counts[nBlocks]; -// for (int i = 0; i < nBlocks; i++) { -// recv_counts[i] = sizeof(size_t); -// } - -// ccl_request_t request; -// // MPI_Allgather(&perNodeArchLength, sizeof(int), MPI_CHAR, -// perNodeArchLengthMaster, sizeof(int), MPI_CHAR, MPI_COMM_WORLD); -// ccl_allgatherv(&perNodeArchLength, sizeof(size_t), perNodeArchLengthMaster, -// recv_counts, ccl_dtype_char, NULL, NULL, NULL, &request); ccl_wait(request); - -// size_t memoryBuf = 0; -// for (int i = 0; i < nBlocks; i++) -// { -// memoryBuf += perNodeArchLengthMaster[i]; -// } -// serializedData.resize(memoryBuf); - -// size_t shift = 0; -// std::vector displs(nBlocks); -// for (int i = 0; i < nBlocks; i++) -// { -// displs[i] = shift; -// shift += perNodeArchLengthMaster[i]; -// } - -// /* Transfer partial results to step 2 on the root node */ -// // MPI_Allgatherv(&nodeResults[0], perNodeArchLength, MPI_CHAR, &serializedData[0], -// perNodeArchLengthMaster, displs, MPI_CHAR, MPI_COMM_WORLD); -// ccl_allgatherv(&nodeResults[0], perNodeArchLength, &serializedData[0], -// perNodeArchLengthMaster, ccl_dtype_char, NULL, NULL, NULL, &request); -// ccl_wait(request); - -// usersPartialResultsMaster.resize(nBlocks); -// for (int i = 0; i < nBlocks; i++) -// { -// /* Deserialize partial results from step 4 */ -// usersPartialResultsMaster[i] = -// training::DistributedPartialResultStep4::cast(deserializeDAALObject(&serializedData[0] -// + displs[i], perNodeArchLengthMaster[i])); -// } -// } - -// void gatherItems(const ByteBuffer & nodeResults, size_t nBlocks) -// { -// size_t perNodeArchLengthMaster[nBlocks]; -// size_t perNodeArchLength = nodeResults.size(); -// ByteBuffer serializedData; -// size_t recv_counts[nBlocks]; -// for (size_t i = 0; i < nBlocks; i++) { -// recv_counts[i] = sizeof(size_t); -// } - -// ccl_request_t request; -// // MPI_Allgather(&perNodeArchLength, sizeof(int), MPI_CHAR, -// perNodeArchLengthMaster, sizeof(int), MPI_CHAR, MPI_COMM_WORLD); -// ccl_allgatherv(&perNodeArchLength, sizeof(size_t), perNodeArchLengthMaster, -// recv_counts, ccl_dtype_char, NULL, NULL, NULL, &request); ccl_wait(request); - -// size_t memoryBuf = 0; -// for (size_t i = 0; i < nBlocks; i++) -// { -// memoryBuf += perNodeArchLengthMaster[i]; -// } -// serializedData.resize(memoryBuf); - -// size_t shift = 0; -// std::vector displs(nBlocks); -// for (size_t i = 0; i < nBlocks; i++) -// { -// displs[i] = shift; -// shift += perNodeArchLengthMaster[i]; -// } - -// /* Transfer partial results to step 2 on the root node */ -// // MPI_Allgatherv(&nodeResults[0], perNodeArchLength, MPI_CHAR, &serializedData[0], -// perNodeArchLengthMaster, displs, MPI_CHAR, MPI_COMM_WORLD); -// ccl_allgatherv(&nodeResults[0], perNodeArchLength, &serializedData[0], -// perNodeArchLengthMaster, ccl_dtype_char, NULL, NULL, NULL, &request); -// ccl_wait(request); - -// 
itemsPartialResultsMaster.resize(nBlocks); -// for (size_t i = 0; i < nBlocks; i++) -// { -// /* Deserialize partial results from step 4 */ -// itemsPartialResultsMaster[i] = -// training::DistributedPartialResultStep4::cast(deserializeDAALObject(&serializedData[0] -// + displs[i], perNodeArchLengthMaster[i])); -// } -// } - -template -void all2all(ccl::communicator &comm, ByteBuffer* nodeResults, size_t nBlocks, KeyValueDataCollectionPtr result) { - size_t memoryBuf = 0; - size_t shift = 0; - vector perNodeArchLengths(nBlocks); - vector perNodeArchLengthsRecv(nBlocks); - std::vector sdispls(nBlocks); - ByteBuffer serializedSendData; - ByteBuffer serializedRecvData; - - for (size_t i = 0; i < nBlocks; i++) { - perNodeArchLengths[i] = nodeResults[i].size(); - memoryBuf += perNodeArchLengths[i]; - sdispls[i] = shift; - shift += perNodeArchLengths[i]; - } - serializedSendData.resize(memoryBuf); - - /* memcpy to avoid double compute */ - memoryBuf = 0; - for (size_t i = 0; i < nBlocks; i++) { - for (size_t j = 0; j < perNodeArchLengths[i]; j++) - serializedSendData[memoryBuf + j] = nodeResults[i][j]; - memoryBuf += perNodeArchLengths[i]; - } - - // MPI_Alltoall(perNodeArchLengths, sizeof(int), MPI_CHAR, perNodeArchLengthsRecv, - // sizeof(int), MPI_CHAR, MPI_COMM_WORLD); - ccl::alltoall(perNodeArchLengths.data(), perNodeArchLengthsRecv.data(), sizeof(size_t), ccl::datatype::uint8, comm).wait(); - - memoryBuf = 0; - shift = 0; - std::vector rdispls(nBlocks); - for (size_t i = 0; i < nBlocks; i++) { - memoryBuf += perNodeArchLengthsRecv[i]; - rdispls[i] = shift; - shift += perNodeArchLengthsRecv[i]; - } - - serializedRecvData.resize(memoryBuf); - - /* Transfer partial results to step 2 on the root node */ - // MPI_Alltoallv(&serializedSendData[0], perNodeArchLengths, sdispls, MPI_CHAR, - // &serializedRecvData[0], perNodeArchLengthsRecv, rdispls, MPI_CHAR, - // MPI_COMM_WORLD); - ccl::alltoallv(&serializedSendData[0], perNodeArchLengths, &serializedRecvData[0], - perNodeArchLengthsRecv, ccl::datatype::uint8, comm).wait(); - - for (size_t i = 0; i < nBlocks; i++) { - (*result)[i] = T::cast(deserializeDAALObject(&serializedRecvData[rdispls[i]], - perNodeArchLengthsRecv[i])); - } -} - -KeyValueDataCollectionPtr initializeStep1Local(size_t rankId, size_t partitionId, - size_t nBlocks, size_t nUsers, - size_t nFactors) { - int usersPartition[1] = {(int)nBlocks}; - - /* Create an algorithm object to initialize the implicit ALS model with the default - * method */ - training::init::Distributed - initAlgorithm; - initAlgorithm.parameter.fullNUsers = nUsers; - initAlgorithm.parameter.nFactors = nFactors; - initAlgorithm.parameter.seed += rankId; - initAlgorithm.parameter.partition.reset( - new HomogenNumericTable((int*)usersPartition, 1, 1)); - /* Pass a training data set and dependent values to the algorithm */ - initAlgorithm.input.set(training::init::data, dataTable); - - /* Initialize the implicit ALS model */ - initAlgorithm.compute(); - - training::init::PartialResultPtr partialResult = initAlgorithm.getPartialResult(); - itemStep3LocalInput = partialResult->get(training::init::outputOfInitForComputeStep3); - userOffset = partialResult->get(training::init::offsets, (size_t)rankId); - // if (rankId == ccl_root) - // { - // userOffsetsOnMaster = partialResult->get(training::init::offsets); - // } - PartialModelPtr partialModelLocal = partialResult->get(training::init::partialModel); - - itemsPartialResultLocal.reset(new training::DistributedPartialResultStep4()); - 
itemsPartialResultLocal->set(training::outputOfStep4ForStep1, partialModelLocal); - - return partialResult->get(training::init::outputOfStep1ForStep2); -} - -void initializeStep2Local(size_t rankId, size_t partitionId, - const KeyValueDataCollectionPtr& initStep2LocalInput) { - /* Create an algorithm object to perform the second step of the implicit ALS - * initialization algorithm */ - training::init::Distributed - initAlgorithm; - - initAlgorithm.input.set(training::init::inputOfStep2FromStep1, initStep2LocalInput); - - /* Compute partial results of the second step on local nodes */ - initAlgorithm.compute(); - - training::init::DistributedPartialResultStep2Ptr partialResult = - initAlgorithm.getPartialResult(); - transposedDataTable = - CSRNumericTable::cast(partialResult->get(training::init::transposedData)); - userStep3LocalInput = partialResult->get(training::init::outputOfInitForComputeStep3); - itemOffset = partialResult->get(training::init::offsets, (size_t)rankId); - // if (rankId == ccl_root) - // { - // itemOffsetsOnMaster = partialResult->get(training::init::offsets); - // } -} - -void initializeModel(size_t rankId, ccl::communicator &comm, size_t partitionId, size_t nBlocks, size_t nUsers, - size_t nFactors) { - std::cout << "ALS (native): initializeModel " << std::endl; - - auto t1 = std::chrono::high_resolution_clock::now(); - - KeyValueDataCollectionPtr initStep1LocalResult = - initializeStep1Local(rankId, partitionId, nBlocks, nUsers, nFactors); - - /* MPI_Alltoallv to populate initStep2LocalInput */ - ByteBuffer nodeCPs[nBlocks]; - for (size_t i = 0; i < nBlocks; i++) { - serializeDAALObject((*initStep1LocalResult)[i].get(), nodeCPs[i]); - } - KeyValueDataCollectionPtr initStep2LocalInput(new KeyValueDataCollection()); - all2all(comm, nodeCPs, nBlocks, initStep2LocalInput); - - initializeStep2Local(rankId, partitionId, initStep2LocalInput); - - auto t2 = std::chrono::high_resolution_clock::now(); - auto duration = std::chrono::duration_cast(t2 - t1).count(); - std::cout << "ALS (native): initializeModel took " << duration << " secs" << std::endl; -} - -training::DistributedPartialResultStep1Ptr computeStep1Local( - const training::DistributedPartialResultStep4Ptr& partialResultLocal, - size_t nFactors) { - /* Create algorithm objects to compute implicit ALS algorithm in the distributed - * processing mode on the local node using the default method */ - training::Distributed algorithm; - algorithm.parameter.nFactors = nFactors; - - /* Set input objects for the algorithm */ - algorithm.input.set(training::partialModel, - partialResultLocal->get(training::outputOfStep4ForStep1)); - - /* Compute partial estimates on local nodes */ - algorithm.compute(); - - /* Get the computed partial estimates */ - return algorithm.getPartialResult(); -} - -NumericTablePtr computeStep2Master( - const training::DistributedPartialResultStep1Ptr* step1LocalResultsOnMaster, - size_t nFactors, size_t nBlocks) { - /* Create algorithm objects to compute implicit ALS algorithm in the distributed - * processing mode on the master node using the default method */ - training::Distributed algorithm; - algorithm.parameter.nFactors = nFactors; - - /* Set input objects for the algorithm */ - for (size_t i = 0; i < nBlocks; i++) { - algorithm.input.add(training::inputOfStep2FromStep1, step1LocalResultsOnMaster[i]); - } - - /* Compute a partial estimate on the master node from the partial estimates on local - * nodes */ - algorithm.compute(); - - return 
algorithm.getPartialResult()->get(training::outputOfStep2ForStep4); -} - -KeyValueDataCollectionPtr computeStep3Local( - const NumericTablePtr& offset, - const training::DistributedPartialResultStep4Ptr& partialResultLocal, - const KeyValueDataCollectionPtr& step3LocalInput, size_t nFactors) { - training::Distributed algorithm; - algorithm.parameter.nFactors = nFactors; - - algorithm.input.set(training::partialModel, - partialResultLocal->get(training::outputOfStep4ForStep3)); - algorithm.input.set(training::inputOfStep3FromInit, step3LocalInput); - algorithm.input.set(training::offset, offset); - - algorithm.compute(); - - return algorithm.getPartialResult()->get(training::outputOfStep3ForStep4); -} - -training::DistributedPartialResultStep4Ptr computeStep4Local( - const CSRNumericTablePtr& dataTable, const NumericTablePtr& step2MasterResult, - const KeyValueDataCollectionPtr& step4LocalInput, size_t nFactors) { - training::Distributed algorithm; - algorithm.parameter.nFactors = nFactors; - - algorithm.input.set(training::partialModels, step4LocalInput); - algorithm.input.set(training::partialData, dataTable); - algorithm.input.set(training::inputOfStep4FromStep2, step2MasterResult); - - algorithm.compute(); - - return algorithm.getPartialResult(); -} - -void trainModel(size_t rankId, ccl::communicator &comm, size_t partitionId, size_t nBlocks, size_t nFactors, - size_t maxIterations) { - std::cout << "ALS (native): trainModel" << std::endl; - - auto tStart = std::chrono::high_resolution_clock::now(); - - training::DistributedPartialResultStep1Ptr step1LocalResultsOnMaster[nBlocks]; - training::DistributedPartialResultStep1Ptr step1LocalResult; - NumericTablePtr step2MasterResult; - KeyValueDataCollectionPtr step3LocalResult; - KeyValueDataCollectionPtr step4LocalInput(new KeyValueDataCollection()); - - ByteBuffer nodeCPs[nBlocks]; - ByteBuffer nodeResults; - ByteBuffer crossProductBuf; - int crossProductLen; - - for (size_t iteration = 0; iteration < maxIterations; iteration++) { - auto t1 = std::chrono::high_resolution_clock::now(); - - // - // Update partial users factors - // - step1LocalResult = computeStep1Local(itemsPartialResultLocal, nFactors); - - serializeDAALObject(step1LocalResult.get(), nodeResults); - - /* Gathering step1LocalResult on the master */ - gather(rankId, comm, nBlocks, nodeResults, step1LocalResultsOnMaster); - - if (rankId == ccl_root) { - step2MasterResult = - computeStep2Master(step1LocalResultsOnMaster, nFactors, nBlocks); - serializeDAALObject(step2MasterResult.get(), crossProductBuf); - crossProductLen = crossProductBuf.size(); - } - - // MPI_Bcast(&crossProductLen, sizeof(int), MPI_CHAR, ccl_root, MPI_COMM_WORLD); - ccl::broadcast(&crossProductLen, sizeof(int), ccl::datatype::uint8, ccl_root, comm).wait(); - - if (rankId != ccl_root) { - crossProductBuf.resize(crossProductLen); - } - // MPI_Bcast(&crossProductBuf[0], crossProductLen, MPI_CHAR, ccl_root, - // MPI_COMM_WORLD); - ccl::broadcast(&crossProductBuf[0], crossProductLen, ccl::datatype::uint8, ccl_root, comm).wait(); - - step2MasterResult = - NumericTable::cast(deserializeDAALObject(&crossProductBuf[0], crossProductLen)); - - step3LocalResult = computeStep3Local(itemOffset, itemsPartialResultLocal, - itemStep3LocalInput, nFactors); - - /* MPI_Alltoallv to populate step4LocalInput */ - for (size_t i = 0; i < nBlocks; i++) { - serializeDAALObject((*step3LocalResult)[i].get(), nodeCPs[i]); - } - all2all(comm, nodeCPs, nBlocks, step4LocalInput); - - usersPartialResultLocal = 
computeStep4Local(transposedDataTable, step2MasterResult, - step4LocalInput, nFactors); - - // - // Update partial items factors - // - step1LocalResult = computeStep1Local(usersPartialResultLocal, nFactors); - - serializeDAALObject(step1LocalResult.get(), nodeResults); - - /* Gathering step1LocalResult on the master */ - gather(rankId, comm, nBlocks, nodeResults, step1LocalResultsOnMaster); - - if (rankId == ccl_root) { - step2MasterResult = - computeStep2Master(step1LocalResultsOnMaster, nFactors, nBlocks); - serializeDAALObject(step2MasterResult.get(), crossProductBuf); - crossProductLen = crossProductBuf.size(); - } - - // MPI_Bcast(&crossProductLen, sizeof(int), MPI_CHAR, ccl_root, MPI_COMM_WORLD); - ccl::broadcast(&crossProductLen, sizeof(int), ccl::datatype::uint8, ccl_root, comm).wait(); - - if (rankId != ccl_root) { - crossProductBuf.resize(crossProductLen); - } - - // MPI_Bcast(&crossProductBuf[0], crossProductLen, MPI_CHAR, ccl_root, - // MPI_COMM_WORLD); - ccl::broadcast(&crossProductBuf[0], crossProductLen, ccl::datatype::uint8, ccl_root, comm).wait(); - - step2MasterResult = - NumericTable::cast(deserializeDAALObject(&crossProductBuf[0], crossProductLen)); - - step3LocalResult = computeStep3Local(userOffset, usersPartialResultLocal, - userStep3LocalInput, nFactors); - - /* MPI_Alltoallv to populate step4LocalInput */ - for (size_t i = 0; i < nBlocks; i++) { - serializeDAALObject((*step3LocalResult)[i].get(), nodeCPs[i]); - } - all2all(comm, nodeCPs, nBlocks, step4LocalInput); - - itemsPartialResultLocal = - computeStep4Local(dataTable, step2MasterResult, step4LocalInput, nFactors); - - auto t2 = std::chrono::high_resolution_clock::now(); - auto duration = std::chrono::duration_cast(t2 - t1).count(); - std::cout << "ALS (native): iteration " << iteration << " took " << duration - << " secs" << std::endl; - } - - auto tEnd = std::chrono::high_resolution_clock::now(); - auto durationTotal = - std::chrono::duration_cast(tEnd - tStart).count(); - std::cout << "ALS (native): trainModel took " << durationTotal << " secs" << std::endl; - - /*Gather all itemsPartialResultLocal to itemsPartialResultsMaster on the master and - * distributing the result over other ranks*/ - // serializeDAALObject(itemsPartialResultLocal.get(), nodeResults); - // gatherItems(nodeResults, nBlocks); - - // serializeDAALObject(usersPartialResultLocal.get(), nodeResults); - // gatherUsers(nodeResults, nBlocks); -} - -static size_t getOffsetFromOffsetTable(NumericTablePtr offsetTable) { - size_t ret; - BlockDescriptor block; - offsetTable->getBlockOfRows(0, 1, readOnly, block); - ret = (size_t)((block.getBlockPtr())[0]); - offsetTable->releaseBlockOfRows(block); - - return ret; -} - -/* - * Class: org_apache_spark_ml_recommendation_ALSDALImpl - * Method: cShuffleData - * Signature: - * (Ljava/nio/ByteBuffer;IILorg/apache/spark/ml/recommendation/ALSPartitionInfo;)Ljava/nio/ByteBuffer; - */ -JNIEXPORT jobject JNICALL Java_org_apache_spark_ml_recommendation_ALSDALImpl_cShuffleData( - JNIEnv* env, jobject obj, jobject dataBuffer, jint nTotalKeys, jint nBlocks, - jobject infoObj) { - // cout << "cShuffleData: rank " << rankId << endl; - cout << "RATING_SIZE: " << RATING_SIZE << endl; - - ccl::communicator &comm = getComm(); - - jbyte* ratingsBuf = (jbyte*)env->GetDirectBufferAddress(dataBuffer); - - jlong ratingsNum = env->GetDirectBufferCapacity(dataBuffer) / RATING_SIZE; - - std::vector ratingPartitions(nBlocks); - - for (int i = 0; i < ratingsNum; i++) { - Rating* rating = (Rating*)(ratingsBuf + RATING_SIZE * 
i); - int partition = getPartiton(rating->user, nTotalKeys, nBlocks); - ratingPartitions[partition].push_back(*rating); - } - - // for (int i = 0; i < nBlocks; i++) { - // cout << "Partition " << i << endl; - // for (auto r : ratingPartitions[i]) { - // cout << r.user << " " << r.item << " " << r.rating << endl; - // } - // } - - size_t newRatingsNum = 0; - size_t newCsrRowNum = 0; - Rating* ratings = shuffle_all2all(comm, ratingPartitions, nBlocks, newRatingsNum, newCsrRowNum); - - // Get the class of the input object - jclass clazz = env->GetObjectClass(infoObj); - // Get Field references - jfieldID ratingsNumField = env->GetFieldID(clazz, "ratingsNum", "I"); - jfieldID csrRowNumField = env->GetFieldID(clazz, "csrRowNum", "I"); - - env->SetIntField(infoObj, ratingsNumField, newRatingsNum); - env->SetIntField(infoObj, csrRowNumField, newCsrRowNum); - - return env->NewDirectByteBuffer(ratings, newRatingsNum*RATING_SIZE); -} - -/* - * Class: org_apache_spark_ml_recommendation_ALSDALImpl - * Method: cDALImplictALS - * Signature: (JJIIDDIIILorg/apache/spark/ml/recommendation/ALSResult;)J - */ - -JNIEXPORT jlong JNICALL Java_org_apache_spark_ml_recommendation_ALSDALImpl_cDALImplictALS( - JNIEnv* env, jobject obj, jlong numTableAddr, jlong nUsers, jint nFactors, - jint maxIter, jdouble regParam, jdouble alpha, jint executor_num, jint executor_cores, - jint partitionId, jobject resultObj) { - - ccl::communicator &comm = getComm(); - size_t rankId = comm.rank(); - - dataTable = *((CSRNumericTablePtr*)numTableAddr); - // dataTable.reset(createFloatSparseTable("/home/xiaochang/github/oneDAL-upstream/samples/daal/cpp/mpi/data/distributed/implicit_als_csr_1.csv")); - - // printNumericTable(dataTable, "cDALImplictALS", 10); - cout << "ALS (native): Input info: " << endl; - cout << "- NumberOfRows: " << dataTable->getNumberOfRows() << endl; - cout << "- NumberOfColumns: " << dataTable->getNumberOfColumns() << endl; - cout << "- NumberOfRatings: " << dataTable->getDataSize() << endl; - cout << "- fullNUsers: " << nUsers << endl; - cout << "- nFactors: " << nFactors << endl; - - // Set number of threads for oneDAL to use for each rank - services::Environment::getInstance()->setNumberOfThreads(executor_cores); - int nThreadsNew = services::Environment::getInstance()->getNumberOfThreads(); - cout << "oneDAL (native): Number of threads used: " << nThreadsNew << endl; - - int nBlocks = executor_num; - initializeModel(rankId, comm, partitionId, nBlocks, nUsers, nFactors); - trainModel(rankId, comm, partitionId, executor_num, nFactors, maxIter); - - auto pUser = - usersPartialResultLocal->get(training::outputOfStep4ForStep1)->getFactors(); - // auto pUserIndices = - // usersPartialResultLocal->get(training::outputOfStep4ForStep1)->getIndices(); - auto pItem = - itemsPartialResultLocal->get(training::outputOfStep4ForStep1)->getFactors(); - // auto pItemIndices = - // itemsPartialResultsMaster[i]->get(training::outputOfStep4ForStep1)->getIndices(); - - std::cout << "\n=== Results for Rank " << rankId << "===\n" << std::endl; - // std::cout << "Partition ID: " << partitionId << std::endl; - printNumericTable(pUser, "User Factors (first 10 rows):", 10); - printNumericTable(pItem, "Item Factors (first 10 rows):", 10); - std::cout << "User Offset: " << getOffsetFromOffsetTable(userOffset) << std::endl; - std::cout << "Item Offset: " << getOffsetFromOffsetTable(itemOffset) << std::endl; - std::cout << std::endl; - - // printNumericTable(userOffset, "userOffset"); - // printNumericTable(itemOffset, "itemOffset"); - - 
// if (rankId == ccl_root) { - // for (int i = 0; i < nBlocks; i++) { - // printNumericTable(NumericTable::cast((*userOffsetsOnMaster)[i]), - // "userOffsetsOnMaster"); - // } - - // for (int i = 0; i < nBlocks; i++) { - // printNumericTable(NumericTable::cast((*itemOffsetsOnMaster)[i]), - // "itemOffsetsOnMaster"); - // } - // } - - // printf("native pUser %ld, pItem %ld", (jlong)&pUser, (jlong)&pItem); - - // Get the class of the input object - jclass clazz = env->GetObjectClass(resultObj); - - // Fill in rankId - jfieldID cRankIdField = env->GetFieldID(clazz, "rankId", "J"); - env->SetLongField(resultObj, cRankIdField, (jlong)rankId); - - // Fill in cUsersFactorsNumTab & cItemsFactorsNumTab - // Get Field references - jfieldID cUsersFactorsNumTabField = env->GetFieldID(clazz, "cUsersFactorsNumTab", "J"); - jfieldID cItemsFactorsNumTabField = env->GetFieldID(clazz, "cItemsFactorsNumTab", "J"); - // Set factors as result, should use heap memory - NumericTablePtr* retUser = new NumericTablePtr(pUser); - NumericTablePtr* retItem = new NumericTablePtr(pItem); - env->SetLongField(resultObj, cUsersFactorsNumTabField, (jlong)retUser); - env->SetLongField(resultObj, cItemsFactorsNumTabField, (jlong)retItem); - - // Fill in cUserOffset & cItemOffset - jfieldID cUserOffsetField = env->GetFieldID(clazz, "cUserOffset", "J"); - assert(cUserOffsetField != NULL); - env->SetLongField(resultObj, cUserOffsetField, - (jlong)getOffsetFromOffsetTable(userOffset)); - - jfieldID cItemOffsetField = env->GetFieldID(clazz, "cItemOffset", "J"); - assert(cItemOffsetField != NULL); - env->SetLongField(resultObj, cItemOffsetField, - (jlong)getOffsetFromOffsetTable(itemOffset)); - - return 0; -} diff --git a/mllib-dal/src/main/native/ALSShuffle.cpp b/mllib-dal/src/main/native/ALSShuffle.cpp deleted file mode 100644 index 73440d253..000000000 --- a/mllib-dal/src/main/native/ALSShuffle.cpp +++ /dev/null @@ -1,102 +0,0 @@ -#include -#include -#include -#include -#include -#include - -#include "ALSShuffle.h" - -using namespace std; - -std::vector recvData; - -jlong getPartiton(jlong key, jlong totalKeys, long nBlocks) { - - jlong itemsInBlock = totalKeys / nBlocks; - - return min(key / itemsInBlock, nBlocks - 1); -} - -// Compares two Rating according to userId. 
-bool compareRatingByUser(Rating r1, Rating r2) -{ - if (r1.user < r2.user) - return true; - if (r1.user == r2.user && r1.item < r2.item) - return true; - return false; -} - -bool compareRatingUserEquality(Rating &r1, Rating &r2) { - return r1.user == r2.user; -} - -int distinct_count(std::vector &data) { - long curUser = -1; - long count = 0; - for (auto i : data) { - if (i.user > curUser) { - curUser = i.user; - count += 1; - } - } - return count; -} - -Rating * shuffle_all2all(ccl::communicator &comm, std::vector &partitions, size_t nBlocks, size_t &newRatingsNum, size_t &newCsrRowNum) { - size_t sendBufSize = 0; - size_t recvBufSize = 0; - vector perNodeSendLens(nBlocks); - vector perNodeRecvLens(nBlocks); - - ByteBuffer sendData; - - // Calculate send buffer size - for (size_t i = 0; i < nBlocks; i++) { - perNodeSendLens[i] = partitions[i].size() * RATING_SIZE; - // cout << "rank " << rankId << " Send partition " << i << " size " << perNodeSendLens[i] << endl; - sendBufSize += perNodeSendLens[i]; - } - cout << "sendData size " << sendBufSize << endl; - sendData.resize(sendBufSize); - - // Fill in send buffer - size_t offset = 0; - for (size_t i = 0; i < nBlocks; i++) - { - memcpy(sendData.data()+offset, partitions[i].data(), perNodeSendLens[i]); - offset += perNodeSendLens[i]; - } - - // Send lens first - ccl::alltoall(perNodeSendLens.data(), perNodeRecvLens.data(), sizeof(size_t), ccl::datatype::uint8, comm).wait(); - - // Calculate recv buffer size - for (size_t i = 0; i < nBlocks; i++) { - // cout << "rank " << rankId << " Recv partition " << i << " size " << perNodeRecvLens[i] << endl; - recvBufSize += perNodeRecvLens[i]; - } - - int ratingsNum = recvBufSize / RATING_SIZE; - recvData.resize(ratingsNum); - - // Send data - ccl::alltoallv(sendData.data(), perNodeSendLens, recvData.data(), perNodeRecvLens, ccl::datatype::uint8, comm).wait(); - - sort(recvData.begin(), recvData.end(), compareRatingByUser); - - // for (auto r : recvData) { - // cout << r.user << " " << r.item << " " << r.rating << endl; - // } - - newRatingsNum = recvData.size(); - // RatingPartition::iterator iter = std::unique(recvData.begin(), recvData.end(), compareRatingUserEquality); - // newCsrRowNum = std::distance(recvData.begin(), iter); - newCsrRowNum = distinct_count(recvData); - - cout << "newRatingsNum: " << newRatingsNum << " newCsrRowNum: " << newCsrRowNum << endl; - - return recvData.data(); -} - diff --git a/mllib-dal/src/main/native/ALSShuffle.h b/mllib-dal/src/main/native/ALSShuffle.h deleted file mode 100644 index dbe864978..000000000 --- a/mllib-dal/src/main/native/ALSShuffle.h +++ /dev/null @@ -1,17 +0,0 @@ -#pragma once - -#include - -struct Rating { - jlong user; - jlong item; - jfloat rating; -} __attribute__((packed)); - -const int RATING_SIZE = sizeof(Rating); - -typedef std::vector ByteBuffer; -typedef std::vector RatingPartition; - -jlong getPartiton(jlong key, jlong totalKeys, long nBlocks); -Rating * shuffle_all2all(ccl::communicator &comm, std::vector &partitions, size_t nBlocks, size_t &ratingsNum, size_t &csrRowNum); diff --git a/mllib-dal/src/main/native/KMeansDALImpl.cpp b/mllib-dal/src/main/native/KMeansDALImpl.cpp index d9c7a2f29..8ee123754 100644 --- a/mllib-dal/src/main/native/KMeansDALImpl.cpp +++ b/mllib-dal/src/main/native/KMeansDALImpl.cpp @@ -14,14 +14,13 @@ * limitations under the License. 
 *******************************************************************************/
 
-#include
+#include
 #include
-#include
-#include
 #include "service.h"
 #include "org_apache_spark_ml_clustering_KMeansDALImpl.h"
-#include "OneCCL.h"
+#include
+#include
 
 using namespace std;
 using namespace daal;
@@ -31,8 +30,7 @@
 const int ccl_root = 0;
 
 typedef double algorithmFPType; /* Algorithm floating-point type */
 
-static NumericTablePtr kmeans_compute(int rankId, ccl::communicator &comm,
-                                      const NumericTablePtr & pData, const NumericTablePtr & initialCentroids,
+static NumericTablePtr kmeans_compute(int rankId, const NumericTablePtr & pData, const NumericTablePtr & initialCentroids,
                                       size_t nClusters, size_t nBlocks, algorithmFPType &ret_cost) {
     const bool isRoot = (rankId == ccl_root);
@@ -45,13 +43,17 @@ static NumericTablePtr kmeans_compute(int rankId, ccl::communicator &comm,
         CentroidsArchLength = inputArch.getSizeOfArchive();
     }
 
+    ccl_request_t request;
+
     /* Get partial results from the root node */
-    ccl::broadcast(&CentroidsArchLength, sizeof(size_t), ccl::datatype::uint8, ccl_root, comm).wait();
+    ccl_bcast(&CentroidsArchLength, sizeof(size_t), ccl_dtype_char, ccl_root, NULL, NULL, NULL, &request);
+    ccl_wait(request);
 
     ByteBuffer nodeCentroids(CentroidsArchLength);
     if (isRoot) inputArch.copyArchiveToArray(&nodeCentroids[0], CentroidsArchLength);
 
-    ccl::broadcast(&nodeCentroids[0], CentroidsArchLength, ccl::datatype::uint8, ccl_root, comm).wait();
+    ccl_bcast(&nodeCentroids[0], CentroidsArchLength, ccl_dtype_char, ccl_root, NULL, NULL, NULL, &request);
+    ccl_wait(request);
 
     /* Deserialize centroids data */
     OutputDataArchive outArch(nodeCentroids.size() ? &nodeCentroids[0] : NULL, CentroidsArchLength);
@@ -77,7 +79,7 @@ static NumericTablePtr kmeans_compute(int rankId, ccl::communicator &comm,
     ByteBuffer serializedData;
 
     /* Serialized data is of equal size on each node if each node called compute() equal number of times */
-    vector<size_t> recvCounts(nBlocks);
+    size_t* recvCounts = new size_t[nBlocks];
     for (size_t i = 0; i < nBlocks; i++)
     {
         recvCounts[i] = perNodeArchLength;
@@ -88,7 +90,10 @@ static NumericTablePtr kmeans_compute(int rankId, ccl::communicator &comm,
     dataArch.copyArchiveToArray(&nodeResults[0], perNodeArchLength);
 
     /* Transfer partial results to step 2 on the root node */
-    ccl::allgatherv(&nodeResults[0], perNodeArchLength, &serializedData[0], recvCounts, ccl::datatype::uint8, comm).wait();
+    ccl_allgatherv(&nodeResults[0], perNodeArchLength, &serializedData[0], recvCounts, ccl_dtype_char, NULL, NULL, NULL, &request);
+    ccl_wait(request);
+
+    delete [] recvCounts;
 
     if (isRoot)
     {
@@ -163,8 +168,8 @@ JNIEXPORT jlong JNICALL Java_org_apache_spark_ml_clustering_KMeansDALImpl_cKMean
   jint executor_num, jint executor_cores,
   jobject resultObj) {
 
-  ccl::communicator &comm = getComm();
-  size_t rankId = comm.rank();
+  size_t rankId;
+  ccl_get_comm_rank(NULL, &rankId);
 
   NumericTablePtr pData = *((NumericTablePtr *)pNumTabData);
   NumericTablePtr centroids = *((NumericTablePtr *)pNumTabCenters);
@@ -184,14 +189,16 @@ JNIEXPORT jlong JNICALL Java_org_apache_spark_ml_clustering_KMeansDALImpl_cKMean
   for (it = 0; it < iteration_num && !converged; it++) {
     auto t1 = std::chrono::high_resolution_clock::now();
 
-    newCentroids = kmeans_compute(rankId, comm, pData, centroids, cluster_num, executor_num, totalCost);
+    newCentroids = kmeans_compute(rankId, pData, centroids, cluster_num, executor_num, totalCost);
 
     if (rankId == ccl_root) {
      converged = areAllCentersConverged(centroids, newCentroids, tolerance);
    }
 
     // Sync converged status
-    ccl::broadcast(&converged, 1, ccl::datatype::uint8, ccl_root, comm).wait();
+    ccl_request_t request;
+    ccl_bcast(&converged, 1, ccl_dtype_char, ccl_root, NULL, NULL, NULL, &request);
+    ccl_wait(request);
 
     centroids = newCentroids;
@@ -221,4 +228,4 @@ JNIEXPORT jlong JNICALL Java_org_apache_spark_ml_clustering_KMeansDALImpl_cKMean
     return (jlong)ret;
   } else
     return (jlong)0;
-}
\ No newline at end of file
+}
diff --git a/mllib-dal/src/main/native/Makefile b/mllib-dal/src/main/native/Makefile
index 23222e646..dfb5076ff 100644
--- a/mllib-dal/src/main/native/Makefile
+++ b/mllib-dal/src/main/native/Makefile
@@ -31,17 +31,17 @@ INCS := -I $(JAVA_HOME)/include \
 
 # Use static link if possible, TBB is only available as dynamic libs
-LIBS := -L${CCL_ROOT}/lib -lccl \
+LIBS := -L${CCL_ROOT}/lib -l:libccl.a \
  -L$(DAALROOT)/lib/intel64 -l:libdaal_core.a -l:libdaal_thread.a \
- -L$(TBBROOT)/lib -ltbb -ltbbmalloc
+ -L$(TBBROOT)/lib/intel64/gcc4.8 -ltbb -ltbbmalloc
 
 # TODO: Add signal chaining support, should fix linking, package so and loading
 # -L$(JAVA_HOME)/jre/lib/amd64 -ljsig
 
 CPP_SRCS += \
-./OneCCL.cpp ./OneDAL.cpp ./KMeansDALImpl.cpp ./PCADALImpl.cpp ./ALSDALImpl.cpp ./ALSShuffle.cpp ./service.cpp ./error_handling.cpp
+./OneCCL.cpp ./OneDAL.cpp ./KMeansDALImpl.cpp ./PCADALImpl.cpp ./service.cpp ./error_handling.cpp
 
 OBJS += \
-./OneCCL.o ./OneDAL.o ./KMeansDALImpl.o ./PCADALImpl.o ./ALSDALImpl.o ./ALSShuffle.o ./service.o ./error_handling.o
+./OneCCL.o ./OneDAL.o ./KMeansDALImpl.o ./PCADALImpl.o ./service.o ./error_handling.o
 
 # Output Binary
 OUTPUT = ../../../target/libMLlibDAL.so
diff --git a/mllib-dal/src/main/native/OneCCL.cpp b/mllib-dal/src/main/native/OneCCL.cpp
index c733c7b33..b23be9737 100644
--- a/mllib-dal/src/main/native/OneCCL.cpp
+++ b/mllib-dal/src/main/native/OneCCL.cpp
@@ -1,63 +1,26 @@
 #include
-#include
-
-#include
-#include
-#include
-#include
-
-#include
-#include
-#include
-
-#include
-
+#include
 #include "org_apache_spark_ml_util_OneCCL__.h"
 
-// todo: fill initial comm_size and rank_id
-size_t comm_size;
-size_t rank_id;
-
-std::vector<ccl::communicator> g_comms;
-
-ccl::communicator &getComm() {
-    return g_comms[0];
-}
-
 JNIEXPORT jint JNICALL Java_org_apache_spark_ml_util_OneCCL_00024_c_1init
-  (JNIEnv *env, jobject obj, jint size, jint rank, jstring ip_port, jobject param) {
+  (JNIEnv *env, jobject obj, jobject param) {
 
-  std::cerr << "OneCCL (native): init" << std::endl;
-
-  auto t1 = std::chrono::high_resolution_clock::now();
-
-  ccl::init();
-
-  const char *str = env->GetStringUTFChars(ip_port, 0);
-  ccl::string ccl_ip_port(str);
+  std::cout << "oneCCL (native): init" << std::endl;
 
-  auto kvs_attr = ccl::create_kvs_attr();
-  kvs_attr.set(ccl_ip_port);
-
-  ccl::shared_ptr_class<ccl::kvs> kvs;
-  kvs = ccl::create_main_kvs(kvs_attr);
-
-  g_comms.push_back(ccl::create_communicator(size, rank, kvs));
-
-  auto t2 = std::chrono::high_resolution_clock::now();
-  auto duration = std::chrono::duration_cast<std::chrono::seconds>( t2 - t1 ).count();
-  std::cerr << "OneCCL (native): init took " << duration << " secs" << std::endl;
-
-  rank_id = getComm().rank();
-  comm_size = getComm().size();
+  ccl_init();
 
   jclass cls = env->GetObjectClass(param);
   jfieldID fid_comm_size = env->GetFieldID(cls, "commSize", "J");
-  jfieldID fid_rank_id = env->GetFieldID(cls, "rankId", "J");
+  jfieldID fid_rank_id = env->GetFieldID(cls, "rankId", "J");
+
+  size_t comm_size;
+  size_t rank_id;
+
+  ccl_get_comm_size(NULL, &comm_size);
+  ccl_get_comm_rank(NULL, &rank_id);
 
   env->SetLongField(param, fid_comm_size, comm_size);
env->SetLongField(param, fid_rank_id, rank_id); - env->ReleaseStringUTFChars(ip_port, str); return 1; } @@ -70,10 +33,9 @@ JNIEXPORT jint JNICALL Java_org_apache_spark_ml_util_OneCCL_00024_c_1init JNIEXPORT void JNICALL Java_org_apache_spark_ml_util_OneCCL_00024_c_1cleanup (JNIEnv *env, jobject obj) { - g_comms.pop_back(); - - std::cerr << "OneCCL (native): cleanup" << std::endl; + std::cout << "oneCCL (native): cleanup" << std::endl; + ccl_finalize(); } /* @@ -82,9 +44,12 @@ JNIEXPORT void JNICALL Java_org_apache_spark_ml_util_OneCCL_00024_c_1cleanup * Signature: ()Z */ JNIEXPORT jboolean JNICALL Java_org_apache_spark_ml_util_OneCCL_00024_isRoot - (JNIEnv *env, jobject obj) { + (JNIEnv *env, jobject obj) { - return getComm().rank() == 0; + size_t rank_id; + ccl_get_comm_rank(NULL, &rank_id); + + return (rank_id == 0); } /* @@ -94,7 +59,12 @@ JNIEXPORT jboolean JNICALL Java_org_apache_spark_ml_util_OneCCL_00024_isRoot */ JNIEXPORT jint JNICALL Java_org_apache_spark_ml_util_OneCCL_00024_rankID (JNIEnv *env, jobject obj) { - return getComm().rank(); + + size_t rank_id; + ccl_get_comm_rank(NULL, &rank_id); + + return rank_id; + } /* @@ -115,115 +85,3 @@ JNIEXPORT jint JNICALL Java_org_apache_spark_ml_util_OneCCL_00024_setEnv return err; } - -static const int CCL_IP_LEN = 128; -std::list local_host_ips; - -static int fill_local_host_ip() { - struct ifaddrs *ifaddr, *ifa; - int family = AF_UNSPEC; - char local_ip[CCL_IP_LEN]; - if (getifaddrs(&ifaddr) < 0) { - // LOG_ERROR("fill_local_host_ip: can not get host IP"); - return -1; - } - - const char iface_name[] = "lo"; - local_host_ips.clear(); - - for (ifa = ifaddr; ifa != NULL; ifa = ifa->ifa_next) { - if (ifa->ifa_addr == NULL) - continue; - if (strstr(ifa->ifa_name, iface_name) == NULL) { - family = ifa->ifa_addr->sa_family; - if (family == AF_INET) { - memset(local_ip, 0, CCL_IP_LEN); - int res = getnameinfo( - ifa->ifa_addr, - (family == AF_INET) ? 
sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6), - local_ip, - CCL_IP_LEN, - NULL, - 0, - NI_NUMERICHOST); - if (res != 0) { - std::string s("fill_local_host_ip: getnameinfo error > "); - s.append(gai_strerror(res)); - // LOG_ERROR(s.c_str()); - return -1; - } - local_host_ips.push_back(local_ip); - } - } - } - if (local_host_ips.empty()) { - // LOG_ERROR("fill_local_host_ip: can't find interface to get host IP"); - return -1; - } - // memset(local_host_ip, 0, CCL_IP_LEN); - // strncpy(local_host_ip, local_host_ips.front().c_str(), CCL_IP_LEN); - - // for (auto &ip : local_host_ips) - // cout << ip << endl; - - freeifaddrs(ifaddr); - return 0; -} - -static bool is_valid_ip(char ip[]) { - if (fill_local_host_ip() == -1) { - std::cerr << "fill_local_host_ip error" << std::endl; - }; - for (std::list::iterator it = local_host_ips.begin(); it != local_host_ips.end(); ++it) { - if (*it == ip) { - return true; - } - } - - return false; -} - -/* - * Class: org_apache_spark_ml_util_OneCCL__ - * Method: getAvailPort - * Signature: (Ljava/lang/String;)I - */ -JNIEXPORT jint JNICALL Java_org_apache_spark_ml_util_OneCCL_00024_c_1getAvailPort - (JNIEnv *env, jobject obj, jstring localIP) { - - // start from beginning of dynamic port - const int port_start_base = 3000; - - char* local_host_ip = (char *) env->GetStringUTFChars(localIP, NULL); - - // check if the input ip is one of host's ips - if (!is_valid_ip(local_host_ip)) - return -1; - - struct sockaddr_in main_server_address; - int server_listen_sock; - in_port_t port = port_start_base; - - if ((server_listen_sock = socket(AF_INET, SOCK_STREAM, 0)) < 0) { - perror("OneCCL (native) getAvailPort error!"); - return -1; - } - - main_server_address.sin_family = AF_INET; - main_server_address.sin_addr.s_addr = inet_addr(local_host_ip); - main_server_address.sin_port = htons(port); - - // search for available port - while (bind(server_listen_sock, - (const struct sockaddr *)&main_server_address, - sizeof(main_server_address)) < 0) { - port++; - main_server_address.sin_port = htons(port); - } - - close(server_listen_sock); - - env->ReleaseStringUTFChars(localIP, local_host_ip); - - return port; -} diff --git a/mllib-dal/src/main/native/OneCCL.h b/mllib-dal/src/main/native/OneCCL.h deleted file mode 100644 index b579c4697..000000000 --- a/mllib-dal/src/main/native/OneCCL.h +++ /dev/null @@ -1,5 +0,0 @@ -#pragma once - -#include - -ccl::communicator &getComm(); diff --git a/mllib-dal/src/main/native/OneDAL.cpp b/mllib-dal/src/main/native/OneDAL.cpp index 792225c3e..2210ffd1c 100644 --- a/mllib-dal/src/main/native/OneDAL.cpp +++ b/mllib-dal/src/main/native/OneDAL.cpp @@ -19,8 +19,6 @@ #include #include "org_apache_spark_ml_util_OneDAL__.h" -#include "service.h" - using namespace daal; using namespace daal::data_management; @@ -125,51 +123,3 @@ JNIEXPORT jboolean JNICALL Java_org_apache_spark_ml_util_OneDAL_00024_cCheckPlat // Only guarantee compatibility and performance on Intel platforms, use oneDAL lib function return daal_check_is_intel_cpu(); } - -/* - * Class: org_apache_spark_ml_util_OneDAL__ - * Method: cNewCSRNumericTable - * Signature: ([F[J[JJJ)J - */ -JNIEXPORT jlong JNICALL Java_org_apache_spark_ml_util_OneDAL_00024_cNewCSRNumericTable - (JNIEnv *env, jobject, jfloatArray data, jlongArray colIndices, jlongArray rowOffsets, jlong nFeatures, jlong nVectors) { - - long numData = env->GetArrayLength(data); - // long numColIndices = numData; - // long numRowOffsets = env->GetArrayLength(rowOffsets); - - size_t * resultRowOffsets = NULL; - 
size_t * resultColIndices = NULL; - float * resultData = NULL; - CSRNumericTable * numericTable = new CSRNumericTable(resultData, resultColIndices, resultRowOffsets, nFeatures, nVectors); - numericTable->allocateDataMemory(numData); - numericTable->getArrays(&resultData, &resultColIndices, &resultRowOffsets); - - size_t * pRowOffsets = (size_t *)env->GetLongArrayElements(rowOffsets, 0); - size_t * pColIndices = (size_t *)env->GetLongArrayElements(colIndices, 0); - float * pData = env->GetFloatArrayElements(data, 0); - - // std::memcpy(resultRowOffsets, pRowOffsets, numRowOffsets*sizeof(jlong)); - // std::memcpy(resultColIndices, pColIndices, numColIndices*sizeof(jlong)); - // std::memcpy(resultData, pData, numData*sizeof(float)); - - for (size_t i = 0; i < (size_t)numData; ++i) - { - resultData[i] = pData[i]; - resultColIndices[i] = pColIndices[i]; - } - for (size_t i = 0; i < (size_t)nVectors + 1; ++i) - { - resultRowOffsets[i] = pRowOffsets[i]; - } - - env->ReleaseLongArrayElements(rowOffsets, (jlong *)pRowOffsets, 0); - env->ReleaseLongArrayElements(colIndices, (jlong *)pColIndices, 0); - env->ReleaseFloatArrayElements(data, pData, 0); - - CSRNumericTablePtr *ret = new CSRNumericTablePtr(numericTable); - - //printNumericTable(*ret, "cNewCSRNumericTable", 10); - - return (jlong)ret; -} diff --git a/mllib-dal/src/main/native/PCADALImpl.cpp b/mllib-dal/src/main/native/PCADALImpl.cpp index 33e2bc95d..3b06fc0dc 100644 --- a/mllib-dal/src/main/native/PCADALImpl.cpp +++ b/mllib-dal/src/main/native/PCADALImpl.cpp @@ -1,3 +1,4 @@ +#include #include #include "service.h" @@ -6,7 +7,6 @@ #include #include "org_apache_spark_ml_feature_PCADALImpl.h" -#include "OneCCL.h" using namespace std; using namespace daal; @@ -24,9 +24,8 @@ typedef double algorithmFPType; /* Algorithm floating-point type */ JNIEXPORT jlong JNICALL Java_org_apache_spark_ml_feature_PCADALImpl_cPCATrainDAL( JNIEnv *env, jobject obj, jlong pNumTabData, jint k, jint executor_num, jint executor_cores, jobject resultObj) { - - ccl::communicator &comm = getComm(); - size_t rankId = comm.rank(); + size_t rankId; + ccl_get_comm_rank(NULL, &rankId); const size_t nBlocks = executor_num; const int comm_size = executor_num; @@ -41,8 +40,6 @@ JNIEXPORT jlong JNICALL Java_org_apache_spark_ml_feature_PCADALImpl_cPCATrainDAL int nThreadsNew = services::Environment::getInstance()->getNumberOfThreads(); cout << "oneDAL (native): Number of threads used: " << nThreadsNew << endl; - auto t1 = std::chrono::high_resolution_clock::now(); - pca::Distributed localAlgorithm; /* Set the input data set to the algorithm */ @@ -51,12 +48,6 @@ JNIEXPORT jlong JNICALL Java_org_apache_spark_ml_feature_PCADALImpl_cPCATrainDAL /* Compute PCA decomposition */ localAlgorithm.compute(); - auto t2 = std::chrono::high_resolution_clock::now(); - auto duration = std::chrono::duration_cast( t2 - t1 ).count(); - std::cout << "PCA (native): local step took " << duration << " secs" << std::endl; - - t1 = std::chrono::high_resolution_clock::now(); - /* Serialize partial results required by step 2 */ services::SharedPtr serializedData; InputDataArchive dataArch; @@ -68,31 +59,31 @@ JNIEXPORT jlong JNICALL Java_org_apache_spark_ml_feature_PCADALImpl_cPCATrainDAL byte* nodeResults = new byte[perNodeArchLength]; dataArch.copyArchiveToArray(nodeResults, perNodeArchLength); - t2 = std::chrono::high_resolution_clock::now(); - - duration = std::chrono::duration_cast( t2 - t1 ).count(); - std::cout << "PCA (native): serializing partial results took " << duration << " secs" << 
std::endl; + ccl_request_t request; - vector recv_counts(comm_size * perNodeArchLength); + size_t* recv_counts = new size_t[comm_size * perNodeArchLength]; for (int i = 0; i < comm_size; i++) recv_counts[i] = perNodeArchLength; cout << "PCA (native): ccl_allgatherv receiving " << perNodeArchLength * nBlocks << " bytes" << endl; - t1 = std::chrono::high_resolution_clock::now(); + auto t1 = std::chrono::high_resolution_clock::now(); /* Transfer partial results to step 2 on the root node */ // MPI_Gather(nodeResults, perNodeArchLength, MPI_CHAR, serializedData.get(), // perNodeArchLength, MPI_CHAR, ccl_root, MPI_COMM_WORLD); - ccl::allgatherv(nodeResults, perNodeArchLength, serializedData.get(), recv_counts, - ccl::datatype::uint8, comm).wait(); + ccl_allgatherv(nodeResults, perNodeArchLength, serializedData.get(), recv_counts, + ccl_dtype_char, NULL, NULL, NULL, &request); + ccl_wait(request); - t2 = std::chrono::high_resolution_clock::now(); + auto t2 = std::chrono::high_resolution_clock::now(); - duration = std::chrono::duration_cast( t2 - t1 ).count(); + auto duration = std::chrono::duration_cast( t2 - t1 ).count(); std::cout << "PCA (native): ccl_allgatherv took " << duration << " secs" << std::endl; + delete[] nodeResults; + if (rankId == ccl_root) { - auto t1 = std::chrono::high_resolution_clock::now(); + auto t1 = std::chrono::high_resolution_clock::now(); /* Create an algorithm for principal component analysis using the svdDense method * on the master node */ diff --git a/mllib-dal/src/main/native/build-jni.sh b/mllib-dal/src/main/native/build-jni.sh index dacd8382b..bee614dcd 100755 --- a/mllib-dal/src/main/native/build-jni.sh +++ b/mllib-dal/src/main/native/build-jni.sh @@ -18,5 +18,4 @@ javah -d $WORK_DIR/javah -classpath "$WORK_DIR/../../../target/classes:$DAAL_JAR org.apache.spark.ml.util.OneCCL$ \ org.apache.spark.ml.util.OneDAL$ \ org.apache.spark.ml.clustering.KMeansDALImpl \ - org.apache.spark.ml.feature.PCADALImpl \ - org.apache.spark.ml.recommendation.ALSDALImpl + org.apache.spark.ml.feature.PCADALImpl diff --git a/mllib-dal/src/main/native/javah/org_apache_spark_ml_recommendation_ALSDALImpl.h b/mllib-dal/src/main/native/javah/org_apache_spark_ml_recommendation_ALSDALImpl.h deleted file mode 100644 index 73024d05b..000000000 --- a/mllib-dal/src/main/native/javah/org_apache_spark_ml_recommendation_ALSDALImpl.h +++ /dev/null @@ -1,29 +0,0 @@ -/* DO NOT EDIT THIS FILE - it is machine generated */ -#include -/* Header for class org_apache_spark_ml_recommendation_ALSDALImpl */ - -#ifndef _Included_org_apache_spark_ml_recommendation_ALSDALImpl -#define _Included_org_apache_spark_ml_recommendation_ALSDALImpl -#ifdef __cplusplus -extern "C" { -#endif -/* - * Class: org_apache_spark_ml_recommendation_ALSDALImpl - * Method: cDALImplictALS - * Signature: (JJIIDDIIILorg/apache/spark/ml/recommendation/ALSResult;)J - */ -JNIEXPORT jlong JNICALL Java_org_apache_spark_ml_recommendation_ALSDALImpl_cDALImplictALS - (JNIEnv *, jobject, jlong, jlong, jint, jint, jdouble, jdouble, jint, jint, jint, jobject); - -/* - * Class: org_apache_spark_ml_recommendation_ALSDALImpl - * Method: cShuffleData - * Signature: (Ljava/nio/ByteBuffer;IILorg/apache/spark/ml/recommendation/ALSPartitionInfo;)Ljava/nio/ByteBuffer; - */ -JNIEXPORT jobject JNICALL Java_org_apache_spark_ml_recommendation_ALSDALImpl_cShuffleData - (JNIEnv *, jobject, jobject, jint, jint, jobject); - -#ifdef __cplusplus -} -#endif -#endif diff --git a/mllib-dal/src/main/native/javah/org_apache_spark_ml_util_OneCCL__.h 
b/mllib-dal/src/main/native/javah/org_apache_spark_ml_util_OneCCL__.h
index 580c34bf9..60825ae3f 100644
--- a/mllib-dal/src/main/native/javah/org_apache_spark_ml_util_OneCCL__.h
+++ b/mllib-dal/src/main/native/javah/org_apache_spark_ml_util_OneCCL__.h
@@ -10,10 +10,10 @@ extern "C" {
 /*
  * Class: org_apache_spark_ml_util_OneCCL__
  * Method: c_init
- * Signature: (IILjava/lang/String;Lorg/apache/spark/ml/util/CCLParam;)I
+ * Signature: (Lorg/apache/spark/ml/util/CCLParam;)I
  */
 JNIEXPORT jint JNICALL Java_org_apache_spark_ml_util_OneCCL_00024_c_1init
-  (JNIEnv *, jobject, jint, jint, jstring, jobject);
+  (JNIEnv *, jobject, jobject);
 
 /*
  * Class: org_apache_spark_ml_util_OneCCL__
@@ -47,14 +47,6 @@ JNIEXPORT jint JNICALL Java_org_apache_spark_ml_util_OneCCL_00024_rankID
 JNIEXPORT jint JNICALL Java_org_apache_spark_ml_util_OneCCL_00024_setEnv
   (JNIEnv *, jobject, jstring, jstring, jboolean);
 
-/*
- * Class: org_apache_spark_ml_util_OneCCL__
- * Method: c_getAvailPort
- * Signature: (Ljava/lang/String;)I
- */
-JNIEXPORT jint JNICALL Java_org_apache_spark_ml_util_OneCCL_00024_c_1getAvailPort
-  (JNIEnv *, jobject, jstring);
-
 #ifdef __cplusplus
 }
 #endif
diff --git a/mllib-dal/src/main/native/javah/org_apache_spark_ml_util_OneDAL__.h b/mllib-dal/src/main/native/javah/org_apache_spark_ml_util_OneDAL__.h
index d88462d66..5f67d9428 100644
--- a/mllib-dal/src/main/native/javah/org_apache_spark_ml_util_OneDAL__.h
+++ b/mllib-dal/src/main/native/javah/org_apache_spark_ml_util_OneDAL__.h
@@ -47,14 +47,6 @@ JNIEXPORT void JNICALL Java_org_apache_spark_ml_util_OneDAL_00024_cFreeDataMemor
 JNIEXPORT jboolean JNICALL Java_org_apache_spark_ml_util_OneDAL_00024_cCheckPlatformCompatibility
   (JNIEnv *, jobject);
 
-/*
- * Class: org_apache_spark_ml_util_OneDAL__
- * Method: cNewCSRNumericTable
- * Signature: ([F[J[JJJ)J
- */
-JNIEXPORT jlong JNICALL Java_org_apache_spark_ml_util_OneDAL_00024_cNewCSRNumericTable
-  (JNIEnv *, jobject, jfloatArray, jlongArray, jlongArray, jlong, jlong);
-
 #ifdef __cplusplus
 }
 #endif
diff --git a/mllib-dal/src/main/native/service.cpp b/mllib-dal/src/main/native/service.cpp
index 623767406..9316b3b62 100644
--- a/mllib-dal/src/main/native/service.cpp
+++ b/mllib-dal/src/main/native/service.cpp
@@ -125,10 +125,6 @@ CSRNumericTable * createSparseTable(const std::string & datasetFileName)
     return numericTable;
 }
 
-CSRNumericTable * createFloatSparseTable(const std::string & datasetFileName) {
-    return createSparseTable(datasetFileName);
-}
-
 void printAprioriItemsets(NumericTablePtr largeItemsetsTable, NumericTablePtr largeItemsetsSupportTable, size_t nItemsetToPrint = 20)
 {
     size_t largeItemsetCount = largeItemsetsSupportTable->getNumberOfRows();
diff --git a/mllib-dal/src/main/native/service.h b/mllib-dal/src/main/native/service.h
index b6a2cc5c5..37b702aea 100644
--- a/mllib-dal/src/main/native/service.h
+++ b/mllib-dal/src/main/native/service.h
@@ -43,8 +43,5 @@ typedef std::vector ByteBuffer;
 void printNumericTable(const NumericTablePtr & dataTable, const char * message = "",
     size_t nPrintedRows = 0, size_t nPrintedCols = 0, size_t interval = 10);
 
-size_t serializeDAALObject(SerializationIface * pData, ByteBuffer & buffer);
-SerializationIfacePtr deserializeDAALObject(daal::byte * buff, size_t length);
-CSRNumericTable * createFloatSparseTable(const std::string & datasetFileName);
 
 #endif
diff --git a/mllib-dal/src/main/scala/org/apache/spark/ml/clustering/KMeansDALImpl.scala b/mllib-dal/src/main/scala/org/apache/spark/ml/clustering/KMeansDALImpl.scala
index e9e7ec36d..5f29fe441 100644
--- a/mllib-dal/src/main/scala/org/apache/spark/ml/clustering/KMeansDALImpl.scala
+++ b/mllib-dal/src/main/scala/org/apache/spark/ml/clustering/KMeansDALImpl.scala
@@ -40,6 +40,8 @@ class KMeansDALImpl (
 
     instr.foreach(_.logInfo(s"Processing partitions with $executorNum executors"))
 
+    val executorIPAddress = Utils.sparkFirstExecutorIP(data.sparkContext)
+
     // repartition to executorNum if not enough partitions
     val dataForConversion = if (data.getNumPartitions < executorNum) {
       data.repartition(executorNum).setName("Repartitioned for conversion").cache()
@@ -47,13 +49,6 @@ class KMeansDALImpl (
       data
     }
 
-    val executorIPAddress = Utils.sparkFirstExecutorIP(dataForConversion.sparkContext)
-    val kvsIP = dataForConversion.sparkContext.conf.get("spark.oap.mllib.oneccl.kvs.ip", executorIPAddress)
-    val kvsPortDetected = Utils.checkExecutorAvailPort(dataForConversion, kvsIP)
-    val kvsPort = dataForConversion.sparkContext.conf.getInt("spark.oap.mllib.oneccl.kvs.port", kvsPortDetected)
-
-    val kvsIPPort = kvsIP+"_"+kvsPort
-
     val partitionDims = Utils.getPartitionDims(dataForConversion)
 
     // filter the empty partitions
@@ -69,14 +64,14 @@ class KMeansDALImpl (
       val it = entry._3
       val numCols = partitionDims(index)._2
 
-      logDebug(s"KMeansDALImpl: Partition index: $index, numCols: $numCols, numRows: $numRows")
+      println(s"KMeansDALImpl: Partition index: $index, numCols: $numCols, numRows: $numRows")
 
       // Build DALMatrix, this will load libJavaAPI, libtbb, libtbbmalloc
      val context = new DaalContext()
      val matrix = new DALMatrix(context, classOf[java.lang.Double],
        numCols.toLong, numRows.toLong, NumericTable.AllocationFlag.DoAllocate)
 
-      logDebug("KMeansDALImpl: Loading native libraries" )
+      println("KMeansDALImpl: Loading native libraries" )
       // oneDAL libs should be loaded by now, extract libMLlibDAL.so to temp file and load
       LibLoader.loadLibraries()
@@ -116,9 +111,10 @@ class KMeansDALImpl (
     }.cache()
 
-    val results = coalescedTables.mapPartitionsWithIndex { (rank, table) =>
+    val results = coalescedTables.mapPartitions { table =>
       val tableArr = table.next()
-      OneCCL.init(executorNum, rank, kvsIPPort)
+
+      OneCCL.init(executorNum, executorIPAddress, OneCCL.KVS_PORT)
 
       val initCentroids = OneDAL.makeNumericTable(centers)
       val result = new KMeansResult()
diff --git a/mllib-dal/src/main/scala/org/apache/spark/ml/feature/PCADALImpl.scala b/mllib-dal/src/main/scala/org/apache/spark/ml/feature/PCADALImpl.scala
index e1bba3d37..1760aa171 100644
--- a/mllib-dal/src/main/scala/org/apache/spark/ml/feature/PCADALImpl.scala
+++ b/mllib-dal/src/main/scala/org/apache/spark/ml/feature/PCADALImpl.scala
@@ -18,20 +18,19 @@ package org.apache.spark.ml.feature
 
 import java.util.Arrays
+
 import com.intel.daal.data_management.data.{HomogenNumericTable, NumericTable}
-import org.apache.spark.internal.Logging
 import org.apache.spark.ml.linalg._
 import org.apache.spark.ml.util.{OneCCL, OneDAL, Utils}
 import org.apache.spark.mllib.feature.{PCAModel => MLlibPCAModel}
 import org.apache.spark.mllib.linalg.{DenseMatrix => OldDenseMatrix, Vectors => OldVectors}
 import org.apache.spark.rdd.RDD
-import org.apache.spark.mllib.feature.{StandardScaler => MLlibStandardScaler}
+import org.apache.spark.mllib.feature.{ StandardScaler => MLlibStandardScaler }
 
 class PCADALImpl (
     val k: Int,
     val executorNum: Int,
-    val executorCores: Int)
-  extends Serializable with Logging {
+    val executorCores: Int) extends Serializable {
 
   // Normalize data before apply fitWithDAL
   private def normalizeData(input: RDD[Vector]) : RDD[Vector] = {
@@ -41,23 +40,17 @@ class PCADALImpl (
     res.map(_.asML)
   }
 
-  def fitWithDAL(data: RDD[Vector]) : MLlibPCAModel = {
+  def fitWithDAL(input: RDD[Vector]) : MLlibPCAModel = {
 
-    val normalizedData = normalizeData(data)
+    val normalizedData = normalizeData(input)
 
     val coalescedTables = OneDAL.rddVectorToNumericTables(normalizedData, executorNum)
 
-    val executorIPAddress = Utils.sparkFirstExecutorIP(coalescedTables.sparkContext)
-    val kvsIP = coalescedTables.sparkContext.conf.get("spark.oap.mllib.oneccl.kvs.ip", executorIPAddress)
-
-    val kvsPortDetected = Utils.checkExecutorAvailPort(coalescedTables, kvsIP)
-    val kvsPort = coalescedTables.sparkContext.conf.getInt("spark.oap.mllib.oneccl.kvs.port", kvsPortDetected)
-
-    val kvsIPPort = kvsIP+"_"+kvsPort
+    val executorIPAddress = Utils.sparkFirstExecutorIP(input.sparkContext)
 
-    val results = coalescedTables.mapPartitionsWithIndex { (rank, table) =>
+    val results = coalescedTables.mapPartitions { table =>
       val tableArr = table.next()
-      OneCCL.init(executorNum, rank, kvsIPPort)
+      OneCCL.init(executorNum, executorIPAddress, OneCCL.KVS_PORT)
 
       val result = new PCAResult()
       cPCATrainDAL(
diff --git a/mllib-dal/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala b/mllib-dal/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala
deleted file mode 100644
index 9196873fb..000000000
--- a/mllib-dal/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala
+++ /dev/null
@@ -1,1885 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */ - -package org.apache.spark.ml.recommendation - -import java.{util => ju} -import java.io.IOException -import java.util.Locale - -import scala.collection.mutable -import scala.reflect.ClassTag -import scala.util.{Sorting, Try} -import scala.util.hashing.byteswap64 - -import com.github.fommil.netlib.BLAS.{getInstance => blas} -import org.apache.hadoop.fs.Path -import org.json4s.DefaultFormats -import org.json4s.JsonDSL._ - -import org.apache.spark.{Dependency, Partitioner, ShuffleDependency, SparkContext, SparkException} -import org.apache.spark.annotation.Since -import org.apache.spark.internal.Logging -import org.apache.spark.ml.{Estimator, Model} -import org.apache.spark.ml.linalg.BLAS -import org.apache.spark.ml.param._ -import org.apache.spark.ml.param.shared._ -import org.apache.spark.ml.util.{Utils => DALImplUtils, _} -import org.apache.spark.ml.util.Instrumentation.instrumented -import org.apache.spark.mllib.linalg.CholeskyDecomposition -import org.apache.spark.mllib.optimization.NNLS -import org.apache.spark.rdd.{DeterministicLevel, RDD} -import org.apache.spark.sql.{DataFrame, Dataset} -import org.apache.spark.sql.functions._ -import org.apache.spark.sql.types._ -import org.apache.spark.storage.StorageLevel -import org.apache.spark.util.{BoundedPriorityQueue, Utils} -import org.apache.spark.util.collection.{OpenHashMap, OpenHashSet, SortDataFormat, Sorter} -import org.apache.spark.util.random.XORShiftRandom - -/** - * Common params for ALS and ALSModel. - */ -private[recommendation] trait ALSModelParams extends Params with HasPredictionCol - with HasBlockSize { - /** - * Param for the column name for user ids. Ids must be integers. Other - * numeric types are supported for this column, but will be cast to integers as long as they - * fall within the integer value range. - * Default: "user" - * @group param - */ - val userCol = new Param[String](this, "userCol", "column name for user ids. Ids must be within " + - "the integer value range.") - - /** @group getParam */ - def getUserCol: String = $(userCol) - - /** - * Param for the column name for item ids. Ids must be integers. Other - * numeric types are supported for this column, but will be cast to integers as long as they - * fall within the integer value range. - * Default: "item" - * @group param - */ - val itemCol = new Param[String](this, "itemCol", "column name for item ids. Ids must be within " + - "the integer value range.") - - /** @group getParam */ - def getItemCol: String = $(itemCol) - - /** - * Attempts to safely cast a user/item id to an Int. Throws an exception if the value is - * out of integer range or contains a fractional part. - */ - protected[recommendation] val checkedCast = udf { (n: Any) => - n match { - case v: Int => v // Avoid unnecessary casting - case v: Number => - val intV = v.intValue - // Checks if number within Int range and has no fractional part. - if (v.doubleValue == intV) { - intV - } else { - throw new IllegalArgumentException(s"ALS only supports values in Integer range " + - s"and without fractional part for columns ${$(userCol)} and ${$(itemCol)}. " + - s"Value $n was either out of Integer range or contained a fractional part that " + - s"could not be converted.") - } - case _ => throw new IllegalArgumentException(s"ALS only supports values in Integer range " + - s"for columns ${$(userCol)} and ${$(itemCol)}. Value $n was not numeric.") - } - } - - /** - * Param for strategy for dealing with unknown or new users/items at prediction time. 
- * This may be useful in cross-validation or production scenarios, for handling user/item ids - * the model has not seen in the training data. - * Supported values: - * - "nan": predicted value for unknown ids will be NaN. - * - "drop": rows in the input DataFrame containing unknown ids will be dropped from - * the output DataFrame containing predictions. - * Default: "nan". - * @group expertParam - */ - val coldStartStrategy = new Param[String](this, "coldStartStrategy", - "strategy for dealing with unknown or new users/items at prediction time. This may be " + - "useful in cross-validation or production scenarios, for handling user/item ids the model " + - "has not seen in the training data. Supported values: " + - s"${ALSModel.supportedColdStartStrategies.mkString(",")}.", - (s: String) => - ALSModel.supportedColdStartStrategies.contains(s.toLowerCase(Locale.ROOT))) - - /** @group expertGetParam */ - def getColdStartStrategy: String = $(coldStartStrategy).toLowerCase(Locale.ROOT) - - setDefault(blockSize -> 4096) -} - -/** - * Common params for ALS. - */ -private[recommendation] trait ALSParams extends ALSModelParams with HasMaxIter with HasRegParam - with HasCheckpointInterval with HasSeed { - - /** - * Param for rank of the matrix factorization (positive). - * Default: 10 - * @group param - */ - val rank = new IntParam(this, "rank", "rank of the factorization", ParamValidators.gtEq(1)) - - /** @group getParam */ - def getRank: Int = $(rank) - - /** - * Param for number of user blocks (positive). - * Default: 10 - * @group param - */ - val numUserBlocks = new IntParam(this, "numUserBlocks", "number of user blocks", - ParamValidators.gtEq(1)) - - /** @group getParam */ - def getNumUserBlocks: Int = $(numUserBlocks) - - /** - * Param for number of item blocks (positive). - * Default: 10 - * @group param - */ - val numItemBlocks = new IntParam(this, "numItemBlocks", "number of item blocks", - ParamValidators.gtEq(1)) - - /** @group getParam */ - def getNumItemBlocks: Int = $(numItemBlocks) - - /** - * Param to decide whether to use implicit preference. - * Default: false - * @group param - */ - val implicitPrefs = new BooleanParam(this, "implicitPrefs", "whether to use implicit preference") - - /** @group getParam */ - def getImplicitPrefs: Boolean = $(implicitPrefs) - - /** - * Param for the alpha parameter in the implicit preference formulation (nonnegative). - * Default: 1.0 - * @group param - */ - val alpha = new DoubleParam(this, "alpha", "alpha for implicit preference", - ParamValidators.gtEq(0)) - - /** @group getParam */ - def getAlpha: Double = $(alpha) - - /** - * Param for the column name for ratings. - * Default: "rating" - * @group param - */ - val ratingCol = new Param[String](this, "ratingCol", "column name for ratings") - - /** @group getParam */ - def getRatingCol: String = $(ratingCol) - - /** - * Param for whether to apply nonnegativity constraints. - * Default: false - * @group param - */ - val nonnegative = new BooleanParam( - this, "nonnegative", "whether to use nonnegative constraint for least squares") - - /** @group getParam */ - def getNonnegative: Boolean = $(nonnegative) - - /** - * Param for StorageLevel for intermediate datasets. Pass in a string representation of - * `StorageLevel`. Cannot be "NONE". - * Default: "MEMORY_AND_DISK". - * - * @group expertParam - */ - val intermediateStorageLevel = new Param[String](this, "intermediateStorageLevel", - "StorageLevel for intermediate datasets. 
Cannot be 'NONE'.", - (s: String) => Try(StorageLevel.fromString(s)).isSuccess && s != "NONE") - - /** @group expertGetParam */ - def getIntermediateStorageLevel: String = $(intermediateStorageLevel) - - /** - * Param for StorageLevel for ALS model factors. Pass in a string representation of - * `StorageLevel`. - * Default: "MEMORY_AND_DISK". - * - * @group expertParam - */ - val finalStorageLevel = new Param[String](this, "finalStorageLevel", - "StorageLevel for ALS model factors.", - (s: String) => Try(StorageLevel.fromString(s)).isSuccess) - - /** @group expertGetParam */ - def getFinalStorageLevel: String = $(finalStorageLevel) - - setDefault(rank -> 10, maxIter -> 10, regParam -> 0.1, numUserBlocks -> 10, numItemBlocks -> 10, - implicitPrefs -> false, alpha -> 1.0, userCol -> "user", itemCol -> "item", - ratingCol -> "rating", nonnegative -> false, checkpointInterval -> 10, - intermediateStorageLevel -> "MEMORY_AND_DISK", finalStorageLevel -> "MEMORY_AND_DISK", - coldStartStrategy -> "nan") - - /** - * Validates and transforms the input schema. - * - * @param schema input schema - * @return output schema - */ - protected def validateAndTransformSchema(schema: StructType): StructType = { - // user and item will be cast to Int - SchemaUtils.checkNumericType(schema, $(userCol)) - SchemaUtils.checkNumericType(schema, $(itemCol)) - // rating will be cast to Float - SchemaUtils.checkNumericType(schema, $(ratingCol)) - SchemaUtils.appendColumn(schema, $(predictionCol), FloatType) - } -} - -/** - * Model fitted by ALS. - * - * @param rank rank of the matrix factorization model - * @param userFactors a DataFrame that stores user factors in two columns: `id` and `features` - * @param itemFactors a DataFrame that stores item factors in two columns: `id` and `features` - */ -@Since("1.3.0") -class ALSModel private[ml] ( - @Since("1.4.0") override val uid: String, - @Since("1.4.0") val rank: Int, - @transient val userFactors: DataFrame, - @transient val itemFactors: DataFrame) - extends Model[ALSModel] with ALSModelParams with MLWritable { - - /** @group setParam */ - @Since("1.4.0") - def setUserCol(value: String): this.type = set(userCol, value) - - /** @group setParam */ - @Since("1.4.0") - def setItemCol(value: String): this.type = set(itemCol, value) - - /** @group setParam */ - @Since("1.3.0") - def setPredictionCol(value: String): this.type = set(predictionCol, value) - - /** @group expertSetParam */ - @Since("2.2.0") - def setColdStartStrategy(value: String): this.type = set(coldStartStrategy, value) - - /** - * Set block size for stacking input data in matrices. - * Default is 4096. - * - * @group expertSetParam - */ - @Since("3.0.0") - def setBlockSize(value: Int): this.type = set(blockSize, value) - - private val predict = udf { (featuresA: Seq[Float], featuresB: Seq[Float]) => - if (featuresA != null && featuresB != null) { - var dotProduct = 0.0f - var i = 0 - while (i < rank) { - dotProduct += featuresA(i) * featuresB(i) - i += 1 - } - dotProduct - } else { - Float.NaN - } - } - - @Since("2.0.0") - override def transform(dataset: Dataset[_]): DataFrame = { - transformSchema(dataset.schema) - // create a new column named map(predictionCol) by running the predict UDF. 
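-    // The two joins below are left joins keyed on the (checked, Int-cast) user and item ids, so
-    // rows whose ids have no factor vector keep null features and the predict UDF yields NaN;
-    // the coldStartStrategy match afterwards decides whether such rows are kept or dropped.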
- val predictions = dataset - .join(userFactors, - checkedCast(dataset($(userCol))) === userFactors("id"), "left") - .join(itemFactors, - checkedCast(dataset($(itemCol))) === itemFactors("id"), "left") - .select(dataset("*"), - predict(userFactors("features"), itemFactors("features")).as($(predictionCol))) - getColdStartStrategy match { - case ALSModel.Drop => - predictions.na.drop("all", Seq($(predictionCol))) - case ALSModel.NaN => - predictions - } - } - - @Since("1.3.0") - override def transformSchema(schema: StructType): StructType = { - // user and item will be cast to Int - SchemaUtils.checkNumericType(schema, $(userCol)) - SchemaUtils.checkNumericType(schema, $(itemCol)) - SchemaUtils.appendColumn(schema, $(predictionCol), FloatType) - } - - @Since("1.5.0") - override def copy(extra: ParamMap): ALSModel = { - val copied = new ALSModel(uid, rank, userFactors, itemFactors) - copyValues(copied, extra).setParent(parent) - } - - @Since("1.6.0") - override def write: MLWriter = new ALSModel.ALSModelWriter(this) - - @Since("3.0.0") - override def toString: String = { - s"ALSModel: uid=$uid, rank=$rank" - } - - /** - * Returns top `numItems` items recommended for each user, for all users. - * @param numItems max number of recommendations for each user - * @return a DataFrame of (userCol: Int, recommendations), where recommendations are - * stored as an array of (itemCol: Int, rating: Float) Rows. - */ - @Since("2.2.0") - def recommendForAllUsers(numItems: Int): DataFrame = { - recommendForAll(userFactors, itemFactors, $(userCol), $(itemCol), numItems, $(blockSize)) - } - - /** - * Returns top `numItems` items recommended for each user id in the input data set. Note that if - * there are duplicate ids in the input dataset, only one set of recommendations per unique id - * will be returned. - * @param dataset a Dataset containing a column of user ids. The column name must match `userCol`. - * @param numItems max number of recommendations for each user. - * @return a DataFrame of (userCol: Int, recommendations), where recommendations are - * stored as an array of (itemCol: Int, rating: Float) Rows. - */ - @Since("2.3.0") - def recommendForUserSubset(dataset: Dataset[_], numItems: Int): DataFrame = { - val srcFactorSubset = getSourceFactorSubset(dataset, userFactors, $(userCol)) - recommendForAll(srcFactorSubset, itemFactors, $(userCol), $(itemCol), numItems, $(blockSize)) - } - - /** - * Returns top `numUsers` users recommended for each item, for all items. - * @param numUsers max number of recommendations for each item - * @return a DataFrame of (itemCol: Int, recommendations), where recommendations are - * stored as an array of (userCol: Int, rating: Float) Rows. - */ - @Since("2.2.0") - def recommendForAllItems(numUsers: Int): DataFrame = { - recommendForAll(itemFactors, userFactors, $(itemCol), $(userCol), numUsers, $(blockSize)) - } - - /** - * Returns top `numUsers` users recommended for each item id in the input data set. Note that if - * there are duplicate ids in the input dataset, only one set of recommendations per unique id - * will be returned. - * @param dataset a Dataset containing a column of item ids. The column name must match `itemCol`. - * @param numUsers max number of recommendations for each item. - * @return a DataFrame of (itemCol: Int, recommendations), where recommendations are - * stored as an array of (userCol: Int, rating: Float) Rows. 
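- * @note A usage sketch (assuming `model` is a fitted `ALSModel` and `items` is a DataFrame whose
- *       item id column matches `itemCol`): `model.recommendForItemSubset(items, 10)` returns the
- *       top 10 users per distinct item id found in `items`.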
- */ - @Since("2.3.0") - def recommendForItemSubset(dataset: Dataset[_], numUsers: Int): DataFrame = { - val srcFactorSubset = getSourceFactorSubset(dataset, itemFactors, $(itemCol)) - recommendForAll(srcFactorSubset, userFactors, $(itemCol), $(userCol), numUsers, $(blockSize)) - } - - /** - * Returns a subset of a factor DataFrame limited to only those unique ids contained - * in the input dataset. - * @param dataset input Dataset containing id column to user to filter factors. - * @param factors factor DataFrame to filter. - * @param column column name containing the ids in the input dataset. - * @return DataFrame containing factors only for those ids present in both the input dataset and - * the factor DataFrame. - */ - private def getSourceFactorSubset( - dataset: Dataset[_], - factors: DataFrame, - column: String): DataFrame = { - factors - .join(dataset.select(column), factors("id") === dataset(column), joinType = "left_semi") - .select(factors("id"), factors("features")) - } - - /** - * Makes recommendations for all users (or items). - * - * Note: the previous approach used for computing top-k recommendations - * used a cross-join followed by predicting a score for each row of the joined dataset. - * However, this results in exploding the size of intermediate data. While Spark SQL makes it - * relatively efficient, the approach implemented here is significantly more efficient. - * - * This approach groups factors into blocks and computes the top-k elements per block, - * using dot product and an efficient [[BoundedPriorityQueue]] (instead of gemm). - * It then computes the global top-k by aggregating the per block top-k elements with - * a [[TopByKeyAggregator]]. This significantly reduces the size of intermediate and shuffle data. - * This is the DataFrame equivalent to the approach used in - * [[org.apache.spark.mllib.recommendation.MatrixFactorizationModel]]. - * - * @param srcFactors src factors for which to generate recommendations - * @param dstFactors dst factors used to make recommendations - * @param srcOutputColumn name of the column for the source ID in the output DataFrame - * @param dstOutputColumn name of the column for the destination ID in the output DataFrame - * @param num max number of recommendations for each record - * @return a DataFrame of (srcOutputColumn: Int, recommendations), where recommendations are - * stored as an array of (dstOutputColumn: Int, rating: Float) Rows. 
- */ - private def recommendForAll( - srcFactors: DataFrame, - dstFactors: DataFrame, - srcOutputColumn: String, - dstOutputColumn: String, - num: Int, - blockSize: Int): DataFrame = { - import srcFactors.sparkSession.implicits._ - - val srcFactorsBlocked = blockify(srcFactors.as[(Int, Array[Float])], blockSize) - val dstFactorsBlocked = blockify(dstFactors.as[(Int, Array[Float])], blockSize) - val ratings = srcFactorsBlocked.crossJoin(dstFactorsBlocked) - .as[(Seq[(Int, Array[Float])], Seq[(Int, Array[Float])])] - .flatMap { case (srcIter, dstIter) => - val m = srcIter.size - val n = math.min(dstIter.size, num) - val output = new Array[(Int, Int, Float)](m * n) - var i = 0 - val pq = new BoundedPriorityQueue[(Int, Float)](num)(Ordering.by(_._2)) - srcIter.foreach { case (srcId, srcFactor) => - dstIter.foreach { case (dstId, dstFactor) => - // We use F2jBLAS which is faster than a call to native BLAS for vector dot product - val score = BLAS.f2jBLAS.sdot(rank, srcFactor, 1, dstFactor, 1) - pq += dstId -> score - } - pq.foreach { case (dstId, score) => - output(i) = (srcId, dstId, score) - i += 1 - } - pq.clear() - } - output.toSeq - } - // We'll force the IDs to be Int. Unfortunately this converts IDs to Int in the output. - val topKAggregator = new TopByKeyAggregator[Int, Int, Float](num, Ordering.by(_._2)) - val recs = ratings.as[(Int, Int, Float)].groupByKey(_._1).agg(topKAggregator.toColumn) - .toDF("id", "recommendations") - - val arrayType = ArrayType( - new StructType() - .add(dstOutputColumn, IntegerType) - .add("rating", FloatType) - ) - recs.select($"id".as(srcOutputColumn), $"recommendations".cast(arrayType)) - } - - /** - * Blockifies factors to improve the efficiency of cross join - */ - private def blockify( - factors: Dataset[(Int, Array[Float])], - blockSize: Int): Dataset[Seq[(Int, Array[Float])]] = { - import factors.sparkSession.implicits._ - factors.mapPartitions(_.grouped(blockSize)) - } - -} - -@Since("1.6.0") -object ALSModel extends MLReadable[ALSModel] { - - private val NaN = "nan" - private val Drop = "drop" - private[recommendation] final val supportedColdStartStrategies = Array(NaN, Drop) - - @Since("1.6.0") - override def read: MLReader[ALSModel] = new ALSModelReader - - @Since("1.6.0") - override def load(path: String): ALSModel = super.load(path) - - private[ALSModel] class ALSModelWriter(instance: ALSModel) extends MLWriter { - - override protected def saveImpl(path: String): Unit = { - val extraMetadata = "rank" -> instance.rank - DefaultParamsWriter.saveMetadata(instance, path, sc, Some(extraMetadata)) - val userPath = new Path(path, "userFactors").toString - instance.userFactors.write.format("parquet").save(userPath) - val itemPath = new Path(path, "itemFactors").toString - instance.itemFactors.write.format("parquet").save(itemPath) - } - } - - private class ALSModelReader extends MLReader[ALSModel] { - - /** Checked against metadata when loading model */ - private val className = classOf[ALSModel].getName - - override def load(path: String): ALSModel = { - val metadata = DefaultParamsReader.loadMetadata(path, sc, className) - implicit val format = DefaultFormats - val rank = (metadata.metadata \ "rank").extract[Int] - val userPath = new Path(path, "userFactors").toString - val userFactors = sparkSession.read.format("parquet").load(userPath) - val itemPath = new Path(path, "itemFactors").toString - val itemFactors = sparkSession.read.format("parquet").load(itemPath) - - val model = new ALSModel(metadata.uid, rank, userFactors, itemFactors) - - 
metadata.getAndSetParams(model) - model - } - } -} - -/** - * Alternating Least Squares (ALS) matrix factorization. - * - * ALS attempts to estimate the ratings matrix `R` as the product of two lower-rank matrices, - * `X` and `Y`, i.e. `X * Yt = R`. Typically these approximations are called 'factor' matrices. - * The general approach is iterative. During each iteration, one of the factor matrices is held - * constant, while the other is solved for using least squares. The newly-solved factor matrix is - * then held constant while solving for the other factor matrix. - * - * This is a blocked implementation of the ALS factorization algorithm that groups the two sets - * of factors (referred to as "users" and "products") into blocks and reduces communication by only - * sending one copy of each user vector to each product block on each iteration, and only for the - * product blocks that need that user's feature vector. This is achieved by pre-computing some - * information about the ratings matrix to determine the "out-links" of each user (which blocks of - * products it will contribute to) and "in-link" information for each product (which of the feature - * vectors it receives from each user block it will depend on). This allows us to send only an - * array of feature vectors between each user block and product block, and have the product block - * find the users' ratings and update the products based on these messages. - * - * For implicit preference data, the algorithm used is based on - * "Collaborative Filtering for Implicit Feedback Datasets", available at - * https://doi.org/10.1109/ICDM.2008.22, adapted for the blocked approach used here. - * - * Essentially instead of finding the low-rank approximations to the rating matrix `R`, - * this finds the approximations for a preference matrix `P` where the elements of `P` are 1 if - * r is greater than 0 and 0 if r is less than or equal to 0. The ratings then act as 'confidence' - * values related to strength of indicated user - * preferences rather than explicit ratings given to items. - * - * Note: the input rating dataset to the ALS implementation should be deterministic. - * Nondeterministic data can cause failure during fitting ALS model. - * For example, an order-sensitive operation like sampling after a repartition makes dataset - * output nondeterministic, like `dataset.repartition(2).sample(false, 0.5, 1618)`. - * Checkpointing sampled dataset or adding a sort before sampling can help make the dataset - * deterministic. 
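- *
- * For example, one way (a sketch only; a `ratings` DataFrame with `user` and `item` columns is
- * assumed) to follow that advice:
- * {{{
- *   // Sorting before sampling keeps the sampled rows stable if the lineage is recomputed.
- *   val deterministicRatings = ratings
- *     .repartition(2)
- *     .sortWithinPartitions("user", "item")
- *     .sample(false, 0.5, 1618)
- * }}}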
- */ -@Since("1.3.0") -class ALS(@Since("1.4.0") override val uid: String) extends Estimator[ALSModel] with ALSParams - with DefaultParamsWritable { - - import org.apache.spark.ml.recommendation.ALS.Rating - - @Since("1.4.0") - def this() = this(Identifiable.randomUID("als")) - - /** @group setParam */ - @Since("1.3.0") - def setRank(value: Int): this.type = set(rank, value) - - /** @group setParam */ - @Since("1.3.0") - def setNumUserBlocks(value: Int): this.type = set(numUserBlocks, value) - - /** @group setParam */ - @Since("1.3.0") - def setNumItemBlocks(value: Int): this.type = set(numItemBlocks, value) - - /** @group setParam */ - @Since("1.3.0") - def setImplicitPrefs(value: Boolean): this.type = set(implicitPrefs, value) - - /** @group setParam */ - @Since("1.3.0") - def setAlpha(value: Double): this.type = set(alpha, value) - - /** @group setParam */ - @Since("1.3.0") - def setUserCol(value: String): this.type = set(userCol, value) - - /** @group setParam */ - @Since("1.3.0") - def setItemCol(value: String): this.type = set(itemCol, value) - - /** @group setParam */ - @Since("1.3.0") - def setRatingCol(value: String): this.type = set(ratingCol, value) - - /** @group setParam */ - @Since("1.3.0") - def setPredictionCol(value: String): this.type = set(predictionCol, value) - - /** @group setParam */ - @Since("1.3.0") - def setMaxIter(value: Int): this.type = set(maxIter, value) - - /** @group setParam */ - @Since("1.3.0") - def setRegParam(value: Double): this.type = set(regParam, value) - - /** @group setParam */ - @Since("1.3.0") - def setNonnegative(value: Boolean): this.type = set(nonnegative, value) - - /** @group setParam */ - @Since("1.4.0") - def setCheckpointInterval(value: Int): this.type = set(checkpointInterval, value) - - /** @group setParam */ - @Since("1.3.0") - def setSeed(value: Long): this.type = set(seed, value) - - /** @group expertSetParam */ - @Since("2.0.0") - def setIntermediateStorageLevel(value: String): this.type = set(intermediateStorageLevel, value) - - /** @group expertSetParam */ - @Since("2.0.0") - def setFinalStorageLevel(value: String): this.type = set(finalStorageLevel, value) - - /** @group expertSetParam */ - @Since("2.2.0") - def setColdStartStrategy(value: String): this.type = set(coldStartStrategy, value) - - /** - * Set block size for stacking input data in matrices. - * Default is 4096. - * - * @group expertSetParam - */ - @Since("3.0.0") - def setBlockSize(value: Int): this.type = set(blockSize, value) - - /** - * Sets both numUserBlocks and numItemBlocks to the specific value. 
- * - * @group setParam - */ - @Since("1.3.0") - def setNumBlocks(value: Int): this.type = { - setNumUserBlocks(value) - setNumItemBlocks(value) - this - } - - @Since("2.0.0") - override def fit(dataset: Dataset[_]): ALSModel = instrumented { instr => - transformSchema(dataset.schema) - import dataset.sparkSession.implicits._ - - val r = if ($(ratingCol) != "") col($(ratingCol)).cast(FloatType) else lit(1.0f) - val ratings = dataset - .select(checkedCast(col($(userCol))), checkedCast(col($(itemCol))), r) - .rdd - .map { row => - Rating(row.getInt(0), row.getInt(1), row.getFloat(2)) - } - - instr.logPipelineStage(this) - instr.logDataset(dataset) - instr.logParams(this, rank, numUserBlocks, numItemBlocks, implicitPrefs, alpha, userCol, - itemCol, ratingCol, predictionCol, maxIter, regParam, nonnegative, checkpointInterval, - seed, intermediateStorageLevel, finalStorageLevel, blockSize) - - val (userFactors, itemFactors) = ALS.train(ratings, rank = $(rank), - numUserBlocks = $(numUserBlocks), numItemBlocks = $(numItemBlocks), - maxIter = $(maxIter), regParam = $(regParam), implicitPrefs = $(implicitPrefs), - alpha = $(alpha), nonnegative = $(nonnegative), - intermediateRDDStorageLevel = StorageLevel.fromString($(intermediateStorageLevel)), - finalRDDStorageLevel = StorageLevel.fromString($(finalStorageLevel)), - checkpointInterval = $(checkpointInterval), seed = $(seed)) - val userDF = userFactors.toDF("id", "features") - val itemDF = itemFactors.toDF("id", "features") - val model = new ALSModel(uid, $(rank), userDF, itemDF).setBlockSize($(blockSize)) - .setParent(this) - copyValues(model) - } - - @Since("1.3.0") - override def transformSchema(schema: StructType): StructType = { - validateAndTransformSchema(schema) - } - - @Since("1.5.0") - override def copy(extra: ParamMap): ALS = defaultCopy(extra) -} - - -/** - * An implementation of ALS that supports generic ID types, specialized for Int and Long. This is - * exposed as a developer API for users who do need other ID types. But it is not recommended - * because it increases the shuffle size and memory requirement during training. For simplicity, - * users and items must have the same type. The number of distinct users/items should be smaller - * than 2 billion. - */ -object ALS extends DefaultParamsReadable[ALS] with Logging { - - /** - * Rating class for better code readability. - */ - case class Rating[@specialized(Int, Long) ID](user: ID, item: ID, rating: Float) - - @Since("1.6.0") - override def load(path: String): ALS = super.load(path) - - /** Trait for least squares solvers applied to the normal equation. */ - private[recommendation] trait LeastSquaresNESolver extends Serializable { - /** Solves a least squares problem with regularization (possibly with other constraints). */ - def solve(ne: NormalEquation, lambda: Double): Array[Float] - } - - /** Cholesky solver for least square problems. */ - private[recommendation] class CholeskySolver extends LeastSquaresNESolver { - - /** - * Solves a least squares problem with L2 regularization: - * - * min norm(A x - b)^2^ + lambda * norm(x)^2^ - * - * @param ne a [[NormalEquation]] instance that contains AtA, Atb, and n (number of instances) - * @param lambda regularization constant - * @return the solution x - */ - override def solve(ne: NormalEquation, lambda: Double): Array[Float] = { - val k = ne.k - // Add scaled lambda to the diagonals of AtA. 
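-      // ne.ata holds the upper triangle of AtA in packed column-major order, so the diagonal
-      // entry (j, j) sits at offset j * (j + 3) / 2 = 0, 2, 5, 9, ... The loop below visits
-      // exactly those offsets by starting at i = 0 and growing the stride j from 2 upward,
-      // adding lambda to each diagonal element.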
- var i = 0 - var j = 2 - while (i < ne.triK) { - ne.ata(i) += lambda - i += j - j += 1 - } - CholeskyDecomposition.solve(ne.ata, ne.atb) - val x = new Array[Float](k) - i = 0 - while (i < k) { - x(i) = ne.atb(i).toFloat - i += 1 - } - ne.reset() - x - } - } - - /** NNLS solver. */ - private[recommendation] class NNLSSolver extends LeastSquaresNESolver { - private var rank: Int = -1 - private var workspace: NNLS.Workspace = _ - private var ata: Array[Double] = _ - private var initialized: Boolean = false - - private def initialize(rank: Int): Unit = { - if (!initialized) { - this.rank = rank - workspace = NNLS.createWorkspace(rank) - ata = new Array[Double](rank * rank) - initialized = true - } else { - require(this.rank == rank) - } - } - - /** - * Solves a nonnegative least squares problem with L2 regularization: - * - * min_x_ norm(A x - b)^2^ + lambda * n * norm(x)^2^ - * subject to x >= 0 - */ - override def solve(ne: NormalEquation, lambda: Double): Array[Float] = { - val rank = ne.k - initialize(rank) - fillAtA(ne.ata, lambda) - val x = NNLS.solve(ata, ne.atb, workspace) - ne.reset() - x.map(x => x.toFloat) - } - - /** - * Given a triangular matrix in the order of fillXtX above, compute the full symmetric square - * matrix that it represents, storing it into destMatrix. - */ - private def fillAtA(triAtA: Array[Double], lambda: Double): Unit = { - var i = 0 - var pos = 0 - var a = 0.0 - while (i < rank) { - var j = 0 - while (j <= i) { - a = triAtA(pos) - ata(i * rank + j) = a - ata(j * rank + i) = a - pos += 1 - j += 1 - } - ata(i * rank + i) += lambda - i += 1 - } - } - } - - /** - * Representing a normal equation to solve the following weighted least squares problem: - * - * minimize \sum,,i,, c,,i,, (a,,i,,^T^ x - d,,i,,)^2^ + lambda * x^T^ x. - * - * Its normal equation is given by - * - * \sum,,i,, c,,i,, (a,,i,, a,,i,,^T^ x - d,,i,, a,,i,,) + lambda * x = 0. - * - * Distributing and letting b,,i,, = c,,i,, * d,,i,, - * - * \sum,,i,, c,,i,, a,,i,, a,,i,,^T^ x - b,,i,, a,,i,, + lambda * x = 0. - */ - private[recommendation] class NormalEquation(val k: Int) extends Serializable { - - /** Number of entries in the upper triangular part of a k-by-k matrix. */ - val triK = k * (k + 1) / 2 - /** A^T^ * A */ - val ata = new Array[Double](triK) - /** A^T^ * b */ - val atb = new Array[Double](k) - - private val da = new Array[Double](k) - private val upper = "U" - - private def copyToDouble(a: Array[Float]): Unit = { - var i = 0 - while (i < k) { - da(i) = a(i) - i += 1 - } - } - - /** Adds an observation. */ - def add(a: Array[Float], b: Double, c: Double = 1.0): NormalEquation = { - require(c >= 0.0) - require(a.length == k) - copyToDouble(a) - blas.dspr(upper, k, c, da, 1, ata) - if (b != 0.0) { - blas.daxpy(k, b, da, 1, atb, 1) - } - this - } - - /** Merges another normal equation object. */ - def merge(other: NormalEquation): NormalEquation = { - require(other.k == k) - blas.daxpy(ata.length, 1.0, other.ata, 1, ata, 1) - blas.daxpy(atb.length, 1.0, other.atb, 1, atb, 1) - this - } - - /** Resets everything to zero, which should be called after each solve. 
*/ - def reset(): Unit = { - ju.Arrays.fill(ata, 0.0) - ju.Arrays.fill(atb, 0.0) - } - } - - def train[ID: ClassTag]( // scalastyle:ignore - ratings: RDD[Rating[ID]], - rank: Int = 10, - numUserBlocks: Int = 10, - numItemBlocks: Int = 10, - maxIter: Int = 10, - regParam: Double = 0.1, - implicitPrefs: Boolean = false, - alpha: Double = 1.0, - nonnegative: Boolean = false, - intermediateRDDStorageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK, - finalRDDStorageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK, - checkpointInterval: Int = 10, - seed: Long = 0L)( - implicit ord: Ordering[ID]): (RDD[(ID, Array[Float])], RDD[(ID, Array[Float])]) = { - - val isPlatformSupported = DALImplUtils.checkClusterPlatformCompatibility(ratings.sparkContext) - - val (userIdAndFactors, itemIdAndFactors) = - if (implicitPrefs && isPlatformSupported) { - new ALSDALImpl(ratings, rank, maxIter, regParam, alpha, seed).run() - } else { - trainMLlib(ratings, rank, numUserBlocks, numItemBlocks, maxIter, regParam, implicitPrefs, - alpha, nonnegative, intermediateRDDStorageLevel, finalRDDStorageLevel, - checkpointInterval, seed) - } - - (userIdAndFactors, itemIdAndFactors) - } - - private def trainDAL[ID: ClassTag]( - ratings: RDD[Rating[ID]], - rank: Int, - maxIter: Int, - regParam: Double, - alpha: Double, - seed: Long): (RDD[(ID, Array[Float])], RDD[(ID, Array[Float])]) = { - null - } - - /** - * Implementation of the ALS algorithm. - * - * This implementation of the ALS factorization algorithm partitions the two sets of factors among - * Spark workers so as to reduce network communication by only sending one copy of each factor - * vector to each Spark worker on each iteration, and only if needed. This is achieved by - * precomputing some information about the ratings matrix to determine which users require which - * item factors and vice versa. See the Scaladoc for `InBlock` for a detailed explanation of how - * the precomputation is done. - * - * In addition, since each iteration of calculating the factor matrices depends on the known - * ratings, which are spread across Spark partitions, a naive implementation would incur - * significant network communication overhead between Spark workers, as the ratings RDD would be - * repeatedly shuffled during each iteration. This implementation reduces that overhead by - * performing the shuffling operation up front, precomputing each partition's ratings dependencies - * and duplicating those values to the appropriate workers before starting iterations to solve for - * the factor matrices. See the Scaladoc for `OutBlock` for a detailed explanation of how the - * precomputation is done. - * - * Note that the term "rating block" is a bit of a misnomer, as the ratings are not partitioned by - * contiguous blocks from the ratings matrix but by a hash function on the rating's location in - * the matrix. If it helps you to visualize the partitions, it is easier to think of the term - * "block" as referring to a subset of an RDD containing the ratings rather than a contiguous - * submatrix of the ratings matrix. 
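- *
- * Concretely, each rating `(u, i, r)` is keyed by the block pair
- * `(userPart.getPartition(u), itemPart.getPartition(i))` before being grouped into rating blocks.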
- */ - private def trainMLlib[ID: ClassTag]( // scalastyle:ignore - ratings: RDD[Rating[ID]], - rank: Int = 10, - numUserBlocks: Int = 10, - numItemBlocks: Int = 10, - maxIter: Int = 10, - regParam: Double = 0.1, - implicitPrefs: Boolean = false, - alpha: Double = 1.0, - nonnegative: Boolean = false, - intermediateRDDStorageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK, - finalRDDStorageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK, - checkpointInterval: Int = 10, - seed: Long = 0L)( - implicit ord: Ordering[ID]): (RDD[(ID, Array[Float])], RDD[(ID, Array[Float])]) = { - - require(!ratings.isEmpty(), s"No ratings available from $ratings") - require(intermediateRDDStorageLevel != StorageLevel.NONE, - "ALS is not designed to run without persisting intermediate RDDs.") - - val sc = ratings.sparkContext - - // Precompute the rating dependencies of each partition - val userPart = new ALSPartitioner(numUserBlocks) - val itemPart = new ALSPartitioner(numItemBlocks) - val blockRatings = partitionRatings(ratings, userPart, itemPart) - .persist(intermediateRDDStorageLevel) - val (userInBlocks, userOutBlocks) = - makeBlocks("user", blockRatings, userPart, itemPart, intermediateRDDStorageLevel) - userOutBlocks.count() // materialize blockRatings and user blocks - val swappedBlockRatings = blockRatings.map { - case ((userBlockId, itemBlockId), RatingBlock(userIds, itemIds, localRatings)) => - ((itemBlockId, userBlockId), RatingBlock(itemIds, userIds, localRatings)) - } - val (itemInBlocks, itemOutBlocks) = - makeBlocks("item", swappedBlockRatings, itemPart, userPart, intermediateRDDStorageLevel) - itemOutBlocks.count() // materialize item blocks - - // Encoders for storing each user/item's partition ID and index within its partition using a - // single integer; used as an optimization - val userLocalIndexEncoder = new LocalIndexEncoder(userPart.numPartitions) - val itemLocalIndexEncoder = new LocalIndexEncoder(itemPart.numPartitions) - - // These are the user and item factor matrices that, once trained, are multiplied together to - // estimate the rating matrix. The two matrices are stored in RDDs, partitioned by column such - // that each factor column resides on the same Spark worker as its corresponding user or item. - val seedGen = new XORShiftRandom(seed) - var userFactors = initialize(userInBlocks, rank, seedGen.nextLong()) - var itemFactors = initialize(itemInBlocks, rank, seedGen.nextLong()) - - val solver = if (nonnegative) new NNLSSolver else new CholeskySolver - - var previousCheckpointFile: Option[String] = None - val shouldCheckpoint: Int => Boolean = (iter) => - sc.checkpointDir.isDefined && checkpointInterval != -1 && (iter % checkpointInterval == 0) - val deletePreviousCheckpointFile: () => Unit = () => - previousCheckpointFile.foreach { file => - try { - val checkpointFile = new Path(file) - checkpointFile.getFileSystem(sc.hadoopConfiguration).delete(checkpointFile, true) - } catch { - case e: IOException => - logWarning(s"Cannot delete checkpoint file $file:", e) - } - } - - if (implicitPrefs) { - for (iter <- 1 to maxIter) { - userFactors.setName(s"userFactors-$iter").persist(intermediateRDDStorageLevel) - val previousItemFactors = itemFactors - itemFactors = computeFactors(userFactors, userOutBlocks, itemInBlocks, rank, regParam, - userLocalIndexEncoder, implicitPrefs, alpha, solver) - previousItemFactors.unpersist() - itemFactors.setName(s"itemFactors-$iter").persist(intermediateRDDStorageLevel) - // TODO: Generalize PeriodicGraphCheckpointer and use it here. 
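-        // The shuffle dependencies are captured before checkpointing so that, once the checkpoint
-        // has materialized, the now-redundant shuffle files from the pre-checkpoint lineage can be
-        // cleaned up via cleanShuffleDependencies further down in this loop iteration.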
- val deps = itemFactors.dependencies - if (shouldCheckpoint(iter)) { - itemFactors.checkpoint() // itemFactors gets materialized in computeFactors - } - val previousUserFactors = userFactors - userFactors = computeFactors(itemFactors, itemOutBlocks, userInBlocks, rank, regParam, - itemLocalIndexEncoder, implicitPrefs, alpha, solver) - if (shouldCheckpoint(iter)) { - ALS.cleanShuffleDependencies(sc, deps) - deletePreviousCheckpointFile() - previousCheckpointFile = itemFactors.getCheckpointFile - } - previousUserFactors.unpersist() - } - } else { - var previousCachedItemFactors: Option[RDD[(Int, FactorBlock)]] = None - for (iter <- 0 until maxIter) { - itemFactors = computeFactors(userFactors, userOutBlocks, itemInBlocks, rank, regParam, - userLocalIndexEncoder, solver = solver) - if (shouldCheckpoint(iter)) { - itemFactors.setName(s"itemFactors-$iter").persist(intermediateRDDStorageLevel) - val deps = itemFactors.dependencies - itemFactors.checkpoint() - itemFactors.count() // checkpoint item factors and cut lineage - ALS.cleanShuffleDependencies(sc, deps) - deletePreviousCheckpointFile() - - previousCachedItemFactors.foreach(_.unpersist()) - previousCheckpointFile = itemFactors.getCheckpointFile - previousCachedItemFactors = Option(itemFactors) - } - userFactors = computeFactors(itemFactors, itemOutBlocks, userInBlocks, rank, regParam, - itemLocalIndexEncoder, solver = solver) - } - } - val userIdAndFactors = userInBlocks - .mapValues(_.srcIds) - .join(userFactors) - .mapPartitions({ items => - items.flatMap { case (_, (ids, factors)) => - ids.view.zip(factors) - } - // Preserve the partitioning because IDs are consistent with the partitioners in userInBlocks - // and userFactors. - }, preservesPartitioning = true) - .setName("userFactors") - .persist(finalRDDStorageLevel) - val itemIdAndFactors = itemInBlocks - .mapValues(_.srcIds) - .join(itemFactors) - .mapPartitions({ items => - items.flatMap { case (_, (ids, factors)) => - ids.view.zip(factors) - } - }, preservesPartitioning = true) - .setName("itemFactors") - .persist(finalRDDStorageLevel) - if (finalRDDStorageLevel != StorageLevel.NONE) { - userIdAndFactors.count() - userInBlocks.unpersist() - userOutBlocks.unpersist() - itemOutBlocks.unpersist() - blockRatings.unpersist() - itemIdAndFactors.count() - itemFactors.unpersist() - itemInBlocks.unpersist() - } - (userIdAndFactors, itemIdAndFactors) - } - - /** - * Factor block that stores factors (Array[Float]) in an Array. - */ - private type FactorBlock = Array[Array[Float]] - - /** - * A mapping of the columns of the items factor matrix that are needed when calculating each row - * of the users factor matrix, and vice versa. - * - * Specifically, when calculating a user factor vector, since only those columns of the items - * factor matrix that correspond to the items that that user has rated are needed, we can avoid - * having to repeatedly copy the entire items factor matrix to each worker later in the algorithm - * by precomputing these dependencies for all users, storing them in an RDD of `OutBlock`s. The - * items' dependencies on the columns of the users factor matrix is computed similarly. - * - * =Example= - * - * Using the example provided in the `InBlock` Scaladoc, `userOutBlocks` would look like the - * following: - * - * {{{ - * userOutBlocks.collect() == Seq( - * 0 -> Array(Array(0, 1), Array(0, 1)), - * 1 -> Array(Array(0), Array(0)) - * ) - * }}} - * - * Each value in this map-like sequence is of type `Array[Array[Int]]`. 
The values in the - * inner array are the ranks of the sorted user IDs in that partition; so in the example above, - * `Array(0, 1)` in partition 0 refers to user IDs 0 and 6, since when all unique user IDs in - * partition 0 are sorted, 0 is the first ID and 6 is the second. The position of each inner - * array in its enclosing outer array denotes the partition number to which item IDs map; in the - * example, the first `Array(0, 1)` is in position 0 of its outer array, denoting item IDs that - * map to partition 0. - * - * In summary, the data structure encodes the following information: - * - * * There are ratings with user IDs 0 and 6 (encoded in `Array(0, 1)`, where 0 and 1 are the - * indices of the user IDs 0 and 6 on partition 0) whose item IDs map to partitions 0 and 1 - * (represented by the fact that `Array(0, 1)` appears in both the 0th and 1st positions). - * - * * There are ratings with user ID 3 (encoded in `Array(0)`, where 0 is the index of the user - * ID 3 on partition 1) whose item IDs map to partitions 0 and 1 (represented by the fact that - * `Array(0)` appears in both the 0th and 1st positions). - */ - private type OutBlock = Array[Array[Int]] - - /** - * In-link block for computing user and item factor matrices. - * - * The ALS algorithm partitions the columns of the users factor matrix evenly among Spark workers. - * Since each column of the factor matrix is calculated using the known ratings of the correspond- - * ing user, and since the ratings don't change across iterations, the ALS algorithm preshuffles - * the ratings to the appropriate partitions, storing them in `InBlock` objects. - * - * The ratings shuffled by item ID are computed similarly and also stored in `InBlock` objects. - * Note that this means every rating is stored twice, once as shuffled by user ID and once by item - * ID. This is a necessary tradeoff, since in general a rating will not be on the same worker - * when partitioned by user as by item. - * - * =Example= - * - * Say we have a small collection of eight items to offer the seven users in our application. We - * have some known ratings given by the users, as seen in the matrix below: - * - * {{{ - * Items - * 0 1 2 3 4 5 6 7 - * +---+---+---+---+---+---+---+---+ - * 0 | |0.1| | |0.4| | |0.7| - * +---+---+---+---+---+---+---+---+ - * 1 | | | | | | | | | - * +---+---+---+---+---+---+---+---+ - * U 2 | | | | | | | | | - * s +---+---+---+---+---+---+---+---+ - * e 3 | |3.1| | |3.4| | |3.7| - * r +---+---+---+---+---+---+---+---+ - * s 4 | | | | | | | | | - * +---+---+---+---+---+---+---+---+ - * 5 | | | | | | | | | - * +---+---+---+---+---+---+---+---+ - * 6 | |6.1| | |6.4| | |6.7| - * +---+---+---+---+---+---+---+---+ - * }}} - * - * The ratings are represented as an RDD, passed to the `partitionRatings` method as the `ratings` - * parameter: - * - * {{{ - * ratings.collect() == Seq( - * Rating(0, 1, 0.1f), - * Rating(0, 4, 0.4f), - * Rating(0, 7, 0.7f), - * Rating(3, 1, 3.1f), - * Rating(3, 4, 3.4f), - * Rating(3, 7, 3.7f), - * Rating(6, 1, 6.1f), - * Rating(6, 4, 6.4f), - * Rating(6, 7, 6.7f) - * ) - * }}} - * - * Say that we are using two partitions to calculate each factor matrix: - * - * {{{ - * val userPart = new ALSPartitioner(2) - * val itemPart = new ALSPartitioner(2) - * val blockRatings = partitionRatings(ratings, userPart, itemPart) - * }}} - * - * Ratings are mapped to partitions using the user/item IDs modulo the number of partitions. 
With - * two partitions, ratings with even-valued user IDs are shuffled to partition 0 while those with - * odd-valued user IDs are shuffled to partition 1: - * - * {{{ - * userInBlocks.collect() == Seq( - * 0 -> Seq( - * // Internally, the class stores the ratings in a more optimized format than - * // a sequence of `Rating`s, but for clarity we show it as such here. - * Rating(0, 1, 0.1f), - * Rating(0, 4, 0.4f), - * Rating(0, 7, 0.7f), - * Rating(6, 1, 6.1f), - * Rating(6, 4, 6.4f), - * Rating(6, 7, 6.7f) - * ), - * 1 -> Seq( - * Rating(3, 1, 3.1f), - * Rating(3, 4, 3.4f), - * Rating(3, 7, 3.7f) - * ) - * ) - * }}} - * - * Similarly, ratings with even-valued item IDs are shuffled to partition 0 while those with - * odd-valued item IDs are shuffled to partition 1: - * - * {{{ - * itemInBlocks.collect() == Seq( - * 0 -> Seq( - * Rating(0, 4, 0.4f), - * Rating(3, 4, 3.4f), - * Rating(6, 4, 6.4f) - * ), - * 1 -> Seq( - * Rating(0, 1, 0.1f), - * Rating(0, 7, 0.7f), - * Rating(3, 1, 3.1f), - * Rating(3, 7, 3.7f), - * Rating(6, 1, 6.1f), - * Rating(6, 7, 6.7f) - * ) - * ) - * }}} - * - * @param srcIds src ids (ordered) - * @param dstPtrs dst pointers. Elements in range [dstPtrs(i), dstPtrs(i+1)) of dst indices and - * ratings are associated with srcIds(i). - * @param dstEncodedIndices encoded dst indices - * @param ratings ratings - * @see [[LocalIndexEncoder]] - */ - private[recommendation] case class InBlock[@specialized(Int, Long) ID: ClassTag]( - srcIds: Array[ID], - dstPtrs: Array[Int], - dstEncodedIndices: Array[Int], - ratings: Array[Float]) { - /** Size of the block. */ - def size: Int = ratings.length - require(dstEncodedIndices.length == size) - require(dstPtrs.length == srcIds.length + 1) - } - - /** - * Initializes factors randomly given the in-link blocks. - * - * @param inBlocks in-link blocks - * @param rank rank - * @return initialized factor blocks - */ - private def initialize[ID]( - inBlocks: RDD[(Int, InBlock[ID])], - rank: Int, - seed: Long): RDD[(Int, FactorBlock)] = { - // Choose a unit vector uniformly at random from the unit sphere, but from the - // "first quadrant" where all elements are nonnegative. This can be done by choosing - // elements distributed as Normal(0,1) and taking the absolute value, and then normalizing. - // This appears to create factorizations that have a slightly better reconstruction - // (<1%) compared picking elements uniformly at random in [0,1]. - inBlocks.mapPartitions({ iter => - iter.map { - case (srcBlockId, inBlock) => - val random = new XORShiftRandom(byteswap64(seed ^ srcBlockId)) - val factors = Array.fill(inBlock.srcIds.length) { - val factor = Array.fill(rank)(random.nextGaussian().toFloat) - val nrm = blas.snrm2(rank, factor, 1) - blas.sscal(rank, 1.0f / nrm, factor, 1) - factor - } - (srcBlockId, factors) - } - }, preservesPartitioning = true) - } - - /** - * A rating block that contains src IDs, dst IDs, and ratings, stored in primitive arrays. - */ - private[recommendation] case class RatingBlock[@specialized(Int, Long) ID: ClassTag]( - srcIds: Array[ID], - dstIds: Array[ID], - ratings: Array[Float]) { - /** Size of the block. */ - def size: Int = srcIds.length - require(dstIds.length == srcIds.length) - require(ratings.length == srcIds.length) - } - - /** - * Builder for [[RatingBlock]]. `mutable.ArrayBuilder` is used to avoid boxing/unboxing. 
- */ - private[recommendation] class RatingBlockBuilder[@specialized(Int, Long) ID: ClassTag] - extends Serializable { - - private val srcIds = mutable.ArrayBuilder.make[ID] - private val dstIds = mutable.ArrayBuilder.make[ID] - private val ratings = mutable.ArrayBuilder.make[Float] - var size = 0 - - /** Adds a rating. */ - def add(r: Rating[ID]): this.type = { - size += 1 - srcIds += r.user - dstIds += r.item - ratings += r.rating - this - } - - /** Merges another [[RatingBlockBuilder]]. */ - def merge(other: RatingBlock[ID]): this.type = { - size += other.srcIds.length - srcIds ++= other.srcIds - dstIds ++= other.dstIds - ratings ++= other.ratings - this - } - - /** Builds a [[RatingBlock]]. */ - def build(): RatingBlock[ID] = { - RatingBlock[ID](srcIds.result(), dstIds.result(), ratings.result()) - } - } - - /** - * Groups an RDD of [[Rating]]s by the user partition and item partition to which each `Rating` - * maps according to the given partitioners. The returned pair RDD holds the ratings, encoded in - * a memory-efficient format but otherwise unchanged, keyed by the (user partition ID, item - * partition ID) pair. - * - * Performance note: This is an expensive operation that performs an RDD shuffle. - * - * Implementation note: This implementation produces the same result as the following but - * generates fewer intermediate objects: - * - * {{{ - * ratings.map { r => - * ((srcPart.getPartition(r.user), dstPart.getPartition(r.item)), r) - * }.aggregateByKey(new RatingBlockBuilder)( - * seqOp = (b, r) => b.add(r), - * combOp = (b0, b1) => b0.merge(b1.build())) - * .mapValues(_.build()) - * }}} - * - * @param ratings raw ratings - * @param srcPart partitioner for src IDs - * @param dstPart partitioner for dst IDs - * @return an RDD of rating blocks in the form of ((srcBlockId, dstBlockId), ratingBlock) - */ - private def partitionRatings[ID: ClassTag]( - ratings: RDD[Rating[ID]], - srcPart: Partitioner, - dstPart: Partitioner): RDD[((Int, Int), RatingBlock[ID])] = { - val numPartitions = srcPart.numPartitions * dstPart.numPartitions - ratings.mapPartitions { iter => - val builders = Array.fill(numPartitions)(new RatingBlockBuilder[ID]) - iter.flatMap { r => - val srcBlockId = srcPart.getPartition(r.user) - val dstBlockId = dstPart.getPartition(r.item) - val idx = srcBlockId + srcPart.numPartitions * dstBlockId - val builder = builders(idx) - builder.add(r) - if (builder.size >= 2048) { // 2048 * (3 * 4) = 24k - builders(idx) = new RatingBlockBuilder - Iterator.single(((srcBlockId, dstBlockId), builder.build())) - } else { - Iterator.empty - } - } ++ { - builders.view.zipWithIndex.filter(_._1.size > 0).map { case (block, idx) => - val srcBlockId = idx % srcPart.numPartitions - val dstBlockId = idx / srcPart.numPartitions - ((srcBlockId, dstBlockId), block.build()) - } - } - }.groupByKey().mapValues { blocks => - val builder = new RatingBlockBuilder[ID] - blocks.foreach(builder.merge) - builder.build() - }.setName("ratingBlocks") - } - - /** - * Builder for uncompressed in-blocks of (srcId, dstEncodedIndex, rating) tuples. - * - * @param encoder encoder for dst indices - */ - private[recommendation] class UncompressedInBlockBuilder[@specialized(Int, Long) ID: ClassTag]( - encoder: LocalIndexEncoder)( - implicit ord: Ordering[ID]) { - - private val srcIds = mutable.ArrayBuilder.make[ID] - private val dstEncodedIndices = mutable.ArrayBuilder.make[Int] - private val ratings = mutable.ArrayBuilder.make[Float] - - /** - * Adds a dst block of (srcId, dstLocalIndex, rating) tuples. 
- * - * @param dstBlockId dst block ID - * @param srcIds original src IDs - * @param dstLocalIndices dst local indices - * @param ratings ratings - */ - def add( - dstBlockId: Int, - srcIds: Array[ID], - dstLocalIndices: Array[Int], - ratings: Array[Float]): this.type = { - val sz = srcIds.length - require(dstLocalIndices.length == sz) - require(ratings.length == sz) - this.srcIds ++= srcIds - this.ratings ++= ratings - var j = 0 - while (j < sz) { - this.dstEncodedIndices += encoder.encode(dstBlockId, dstLocalIndices(j)) - j += 1 - } - this - } - - /** Builds a [[UncompressedInBlock]]. */ - def build(): UncompressedInBlock[ID] = { - new UncompressedInBlock(srcIds.result(), dstEncodedIndices.result(), ratings.result()) - } - } - - /** - * A block of (srcId, dstEncodedIndex, rating) tuples stored in primitive arrays. - */ - private[recommendation] class UncompressedInBlock[@specialized(Int, Long) ID: ClassTag]( - val srcIds: Array[ID], - val dstEncodedIndices: Array[Int], - val ratings: Array[Float])( - implicit ord: Ordering[ID]) { - - /** Size the of block. */ - def length: Int = srcIds.length - - /** - * Compresses the block into an `InBlock`. The algorithm is the same as converting a sparse - * matrix from coordinate list (COO) format into compressed sparse column (CSC) format. - * Sorting is done using Spark's built-in Timsort to avoid generating too many objects. - */ - def compress(): InBlock[ID] = { - val sz = length - assert(sz > 0, "Empty in-link block should not exist.") - sort() - val uniqueSrcIdsBuilder = mutable.ArrayBuilder.make[ID] - val dstCountsBuilder = mutable.ArrayBuilder.make[Int] - var preSrcId = srcIds(0) - uniqueSrcIdsBuilder += preSrcId - var curCount = 1 - var i = 1 - while (i < sz) { - val srcId = srcIds(i) - if (srcId != preSrcId) { - uniqueSrcIdsBuilder += srcId - dstCountsBuilder += curCount - preSrcId = srcId - curCount = 0 - } - curCount += 1 - i += 1 - } - dstCountsBuilder += curCount - val uniqueSrcIds = uniqueSrcIdsBuilder.result() - val numUniqueSrdIds = uniqueSrcIds.length - val dstCounts = dstCountsBuilder.result() - val dstPtrs = new Array[Int](numUniqueSrdIds + 1) - var sum = 0 - i = 0 - while (i < numUniqueSrdIds) { - sum += dstCounts(i) - i += 1 - dstPtrs(i) = sum - } - InBlock(uniqueSrcIds, dstPtrs, dstEncodedIndices, ratings) - } - - private def sort(): Unit = { - val sz = length - // Since there might be interleaved log messages, we insert a unique id for easy pairing. - val sortId = Utils.random.nextInt() - logDebug(s"Start sorting an uncompressed in-block of size $sz. (sortId = $sortId)") - val start = System.nanoTime() - val sorter = new Sorter(new UncompressedInBlockSort[ID]) - sorter.sort(this, 0, length, Ordering[KeyWrapper[ID]]) - val duration = (System.nanoTime() - start) / 1e9 - logDebug(s"Sorting took $duration seconds. (sortId = $sortId)") - } - } - - /** - * A wrapper that holds a primitive key. - * - * @see [[UncompressedInBlockSort]] - */ - private class KeyWrapper[@specialized(Int, Long) ID: ClassTag]( - implicit ord: Ordering[ID]) extends Ordered[KeyWrapper[ID]] { - - var key: ID = _ - - override def compare(that: KeyWrapper[ID]): Int = { - ord.compare(key, that.key) - } - - def setKey(key: ID): this.type = { - this.key = key - this - } - } - - /** - * [[SortDataFormat]] of [[UncompressedInBlock]] used by [[Sorter]]. 
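- * The sorter moves the three parallel primitive arrays (`srcIds`, `dstEncodedIndices`, `ratings`)
- * in lockstep, keyed by `srcId`, which is what lets `UncompressedInBlock.compress()` later
- * run-length encode the sorted src ids into `dstPtrs`.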
- */ - private class UncompressedInBlockSort[@specialized(Int, Long) ID: ClassTag]( - implicit ord: Ordering[ID]) - extends SortDataFormat[KeyWrapper[ID], UncompressedInBlock[ID]] { - - override def newKey(): KeyWrapper[ID] = new KeyWrapper() - - override def getKey( - data: UncompressedInBlock[ID], - pos: Int, - reuse: KeyWrapper[ID]): KeyWrapper[ID] = { - if (reuse == null) { - new KeyWrapper().setKey(data.srcIds(pos)) - } else { - reuse.setKey(data.srcIds(pos)) - } - } - - override def getKey( - data: UncompressedInBlock[ID], - pos: Int): KeyWrapper[ID] = { - getKey(data, pos, null) - } - - private def swapElements[@specialized(Int, Float) T]( - data: Array[T], - pos0: Int, - pos1: Int): Unit = { - val tmp = data(pos0) - data(pos0) = data(pos1) - data(pos1) = tmp - } - - override def swap(data: UncompressedInBlock[ID], pos0: Int, pos1: Int): Unit = { - swapElements(data.srcIds, pos0, pos1) - swapElements(data.dstEncodedIndices, pos0, pos1) - swapElements(data.ratings, pos0, pos1) - } - - override def copyRange( - src: UncompressedInBlock[ID], - srcPos: Int, - dst: UncompressedInBlock[ID], - dstPos: Int, - length: Int): Unit = { - System.arraycopy(src.srcIds, srcPos, dst.srcIds, dstPos, length) - System.arraycopy(src.dstEncodedIndices, srcPos, dst.dstEncodedIndices, dstPos, length) - System.arraycopy(src.ratings, srcPos, dst.ratings, dstPos, length) - } - - override def allocate(length: Int): UncompressedInBlock[ID] = { - new UncompressedInBlock( - new Array[ID](length), new Array[Int](length), new Array[Float](length)) - } - - override def copyElement( - src: UncompressedInBlock[ID], - srcPos: Int, - dst: UncompressedInBlock[ID], - dstPos: Int): Unit = { - dst.srcIds(dstPos) = src.srcIds(srcPos) - dst.dstEncodedIndices(dstPos) = src.dstEncodedIndices(srcPos) - dst.ratings(dstPos) = src.ratings(srcPos) - } - } - - /** - * Creates in-blocks and out-blocks from rating blocks. 
- * - * @param prefix prefix for in/out-block names - * @param ratingBlocks rating blocks - * @param srcPart partitioner for src IDs - * @param dstPart partitioner for dst IDs - * @return (in-blocks, out-blocks) - */ - private def makeBlocks[ID: ClassTag]( - prefix: String, - ratingBlocks: RDD[((Int, Int), RatingBlock[ID])], - srcPart: Partitioner, - dstPart: Partitioner, - storageLevel: StorageLevel)( - implicit srcOrd: Ordering[ID]): (RDD[(Int, InBlock[ID])], RDD[(Int, OutBlock)]) = { - val inBlocks = ratingBlocks.map { - case ((srcBlockId, dstBlockId), RatingBlock(srcIds, dstIds, ratings)) => - // The implementation is a faster version of - // val dstIdToLocalIndex = dstIds.toSet.toSeq.sorted.zipWithIndex.toMap - val start = System.nanoTime() - val dstIdSet = new OpenHashSet[ID](1 << 20) - dstIds.foreach(dstIdSet.add) - val sortedDstIds = new Array[ID](dstIdSet.size) - var i = 0 - var pos = dstIdSet.nextPos(0) - while (pos != -1) { - sortedDstIds(i) = dstIdSet.getValue(pos) - pos = dstIdSet.nextPos(pos + 1) - i += 1 - } - assert(i == dstIdSet.size) - Sorting.quickSort(sortedDstIds) - val dstIdToLocalIndex = new OpenHashMap[ID, Int](sortedDstIds.length) - i = 0 - while (i < sortedDstIds.length) { - dstIdToLocalIndex.update(sortedDstIds(i), i) - i += 1 - } - logDebug( - "Converting to local indices took " + (System.nanoTime() - start) / 1e9 + " seconds.") - val dstLocalIndices = dstIds.map(dstIdToLocalIndex.apply) - (srcBlockId, (dstBlockId, srcIds, dstLocalIndices, ratings)) - }.groupByKey(new ALSPartitioner(srcPart.numPartitions)) - .mapValues { iter => - val builder = - new UncompressedInBlockBuilder[ID](new LocalIndexEncoder(dstPart.numPartitions)) - iter.foreach { case (dstBlockId, srcIds, dstLocalIndices, ratings) => - builder.add(dstBlockId, srcIds, dstLocalIndices, ratings) - } - builder.build().compress() - }.setName(prefix + "InBlocks") - .persist(storageLevel) - val outBlocks = inBlocks.mapValues { case InBlock(srcIds, dstPtrs, dstEncodedIndices, _) => - val encoder = new LocalIndexEncoder(dstPart.numPartitions) - val activeIds = Array.fill(dstPart.numPartitions)(mutable.ArrayBuilder.make[Int]) - var i = 0 - val seen = new Array[Boolean](dstPart.numPartitions) - while (i < srcIds.length) { - var j = dstPtrs(i) - ju.Arrays.fill(seen, false) - while (j < dstPtrs(i + 1)) { - val dstBlockId = encoder.blockId(dstEncodedIndices(j)) - if (!seen(dstBlockId)) { - activeIds(dstBlockId) += i // add the local index in this out-block - seen(dstBlockId) = true - } - j += 1 - } - i += 1 - } - activeIds.map { x => - x.result() - } - }.setName(prefix + "OutBlocks") - .persist(storageLevel) - (inBlocks, outBlocks) - } - - /** - * Compute dst factors by constructing and solving least square problems. 
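- *
- * For each dst in-block, the needed src factor vectors are shipped according to the src
- * out-blocks, matched to their ratings through the in-block's encoded indices, and one
- * regularized normal equation per dst id is assembled and solved.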
- * - * @param srcFactorBlocks src factors - * @param srcOutBlocks src out-blocks - * @param dstInBlocks dst in-blocks - * @param rank rank - * @param regParam regularization constant - * @param srcEncoder encoder for src local indices - * @param implicitPrefs whether to use implicit preference - * @param alpha the alpha constant in the implicit preference formulation - * @param solver solver for least squares problems - * @return dst factors - */ - private def computeFactors[ID]( - srcFactorBlocks: RDD[(Int, FactorBlock)], - srcOutBlocks: RDD[(Int, OutBlock)], - dstInBlocks: RDD[(Int, InBlock[ID])], - rank: Int, - regParam: Double, - srcEncoder: LocalIndexEncoder, - implicitPrefs: Boolean = false, - alpha: Double = 1.0, - solver: LeastSquaresNESolver): RDD[(Int, FactorBlock)] = { - val numSrcBlocks = srcFactorBlocks.partitions.length - val YtY = if (implicitPrefs) Some(computeYtY(srcFactorBlocks, rank)) else None - val srcOut = srcOutBlocks.join(srcFactorBlocks).flatMap { - case (srcBlockId, (srcOutBlock, srcFactors)) => - srcOutBlock.view.zipWithIndex.map { case (activeIndices, dstBlockId) => - (dstBlockId, (srcBlockId, activeIndices.map(idx => srcFactors(idx)))) - } - } - val merged = srcOut.groupByKey(new ALSPartitioner(dstInBlocks.partitions.length)) - - // SPARK-28927: Nondeterministic RDDs causes inconsistent in/out blocks in case of rerun. - // It can cause runtime error when matching in/out user/item blocks. - val isBlockRDDNondeterministic = - dstInBlocks.outputDeterministicLevel == DeterministicLevel.INDETERMINATE || - srcOutBlocks.outputDeterministicLevel == DeterministicLevel.INDETERMINATE - - dstInBlocks.join(merged).mapValues { - case (InBlock(dstIds, srcPtrs, srcEncodedIndices, ratings), srcFactors) => - val sortedSrcFactors = new Array[FactorBlock](numSrcBlocks) - srcFactors.foreach { case (srcBlockId, factors) => - sortedSrcFactors(srcBlockId) = factors - } - val dstFactors = new Array[Array[Float]](dstIds.length) - var j = 0 - val ls = new NormalEquation(rank) - while (j < dstIds.length) { - ls.reset() - if (implicitPrefs) { - ls.merge(YtY.get) - } - var i = srcPtrs(j) - var numExplicits = 0 - while (i < srcPtrs(j + 1)) { - val encoded = srcEncodedIndices(i) - val blockId = srcEncoder.blockId(encoded) - val localIndex = srcEncoder.localIndex(encoded) - var srcFactor: Array[Float] = null - try { - srcFactor = sortedSrcFactors(blockId)(localIndex) - } catch { - case a: ArrayIndexOutOfBoundsException if isBlockRDDNondeterministic => - val errMsg = "A failure detected when matching In/Out blocks of users/items. " + - "Because at least one In/Out block RDD is found to be nondeterministic now, " + - "the issue is probably caused by nondeterministic input data. You can try to " + - "checkpoint training data to make it deterministic. If you do `repartition` + " + - "`sample` or `randomSplit`, you can also try to sort it before `sample` or " + - "`randomSplit` to make it deterministic." - throw new SparkException(errMsg, a) - } - val rating = ratings(i) - if (implicitPrefs) { - // Extension to the original paper to handle rating < 0. confidence is a function - // of |rating| instead so that it is never negative. c1 is confidence - 1. - val c1 = alpha * math.abs(rating) - // For rating <= 0, the corresponding preference is 0. So the second argument of add - // is only there for rating > 0. 
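-              // Implicit-feedback view: confidence c = 1 + c1 with c1 = alpha * |r|, and
-              // preference p = 1 for r > 0, else 0. The pre-merged YtY already contributes the
-              // baseline weight of 1 for every factor, so add() only needs the extra weight c1 on
-              // a * a^T and b = c * p (1 + c1 when r > 0, otherwise 0) on the right-hand side.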
- if (rating > 0.0) { - numExplicits += 1 - } - ls.add(srcFactor, if (rating > 0.0) 1.0 + c1 else 0.0, c1) - } else { - ls.add(srcFactor, rating) - numExplicits += 1 - } - i += 1 - } - // Weight lambda by the number of explicit ratings based on the ALS-WR paper. - dstFactors(j) = solver.solve(ls, numExplicits * regParam) - j += 1 - } - dstFactors - } - } - - /** - * Computes the Gramian matrix of user or item factors, which is only used in implicit preference. - * Caching of the input factors is handled in [[ALS#train]]. - */ - private def computeYtY(factorBlocks: RDD[(Int, FactorBlock)], rank: Int): NormalEquation = { - factorBlocks.values.aggregate(new NormalEquation(rank))( - seqOp = (ne, factors) => { - factors.foreach(ne.add(_, 0.0)) - ne - }, - combOp = (ne1, ne2) => ne1.merge(ne2)) - } - - /** - * Encoder for storing (blockId, localIndex) into a single integer. - * - * We use the leading bits (including the sign bit) to store the block id and the rest to store - * the local index. This is based on the assumption that users/items are approximately evenly - * partitioned. With this assumption, we should be able to encode two billion distinct values. - * - * @param numBlocks number of blocks - */ - private[recommendation] class LocalIndexEncoder(numBlocks: Int) extends Serializable { - - require(numBlocks > 0, s"numBlocks must be positive but found $numBlocks.") - - private[this] final val numLocalIndexBits = - math.min(java.lang.Integer.numberOfLeadingZeros(numBlocks - 1), 31) - private[this] final val localIndexMask = (1 << numLocalIndexBits) - 1 - - /** Encodes a (blockId, localIndex) into a single integer. */ - def encode(blockId: Int, localIndex: Int): Int = { - require(blockId < numBlocks) - require((localIndex & ~localIndexMask) == 0) - (blockId << numLocalIndexBits) | localIndex - } - - /** Gets the block id from an encoded index. */ - @inline - def blockId(encoded: Int): Int = { - encoded >>> numLocalIndexBits - } - - /** Gets the local index from an encoded index. */ - @inline - def localIndex(encoded: Int): Int = { - encoded & localIndexMask - } - } - - /** - * Partitioner used by ALS. We require that getPartition is a projection. That is, for any key k, - * we have getPartition(getPartition(k)) = getPartition(k). Since the default HashPartitioner - * satisfies this requirement, we simply use a type alias here. - */ - private[recommendation] type ALSPartitioner = org.apache.spark.HashPartitioner - - /** - * Private function to clean up all of the shuffles files from the dependencies and their parents. - */ - private[spark] def cleanShuffleDependencies[T]( - sc: SparkContext, - deps: Seq[Dependency[_]], - blocking: Boolean = false): Unit = { - // If there is no reference tracking we skip clean up. - sc.cleaner.foreach { cleaner => - /** - * Clean the shuffles & all of its parents. 
- */ - def cleanEagerly(dep: Dependency[_]): Unit = { - if (dep.isInstanceOf[ShuffleDependency[_, _, _]]) { - val shuffleId = dep.asInstanceOf[ShuffleDependency[_, _, _]].shuffleId - cleaner.doCleanupShuffle(shuffleId, blocking) - } - val rdd = dep.rdd - val rddDeps = rdd.dependencies - if (rdd.getStorageLevel == StorageLevel.NONE && rddDeps != null) { - rddDeps.foreach(cleanEagerly) - } - } - deps.foreach(cleanEagerly) - } - } -} diff --git a/mllib-dal/src/main/scala/org/apache/spark/ml/recommendation/ALSDALImpl.scala b/mllib-dal/src/main/scala/org/apache/spark/ml/recommendation/ALSDALImpl.scala deleted file mode 100644 index bcb95ca1f..000000000 --- a/mllib-dal/src/main/scala/org/apache/spark/ml/recommendation/ALSDALImpl.scala +++ /dev/null @@ -1,387 +0,0 @@ -package org.apache.spark.ml.recommendation - -import com.intel.daal.data_management.data.CSRNumericTable.Indexing -import org.apache.spark.rdd.{ExecutorInProcessCoalescePartitioner, RDD} - -import scala.reflect.ClassTag -import com.intel.daal.data_management.data.{CSRNumericTable, HomogenNumericTable, RowMergedNumericTable, Matrix => DALMatrix} -import com.intel.daal.services.DaalContext -import org.apache.spark.Partitioner -import org.apache.spark.internal.Logging -import org.apache.spark.ml.recommendation.ALS.Rating -import org.apache.spark.ml.util._ - -import java.nio.{ByteBuffer, ByteOrder} -import scala.collection.mutable.ArrayBuffer -//import java.nio.DoubleBuffer -import java.nio.FloatBuffer - -class ALSDataPartitioner(blocks: Int, itemsInBlock: Long) - extends Partitioner { - def numPartitions: Int = blocks - def getPartition(key: Any): Int = { - val k = key.asInstanceOf[Long] - // itemsInBlock = numItems / partitions - // remaining records will belog to the last partition - // 21 => 5, 5, 5, 6 - // 46 => 11, 11, 11, 13 - math.min((k / itemsInBlock).toInt, blocks-1) - } -} - -class ALSDALImpl[@specialized(Int, Long) ID: ClassTag]( - data: RDD[Rating[ID]], - nFactors: Int, - maxIter: Int, - regParam: Double, - alpha: Double, - seed: Long, -) extends Serializable with Logging { - - // Rating struct size is size of Long+Long+Float - val RATING_SIZE = 8 + 8 + 4 - - // Return Map partitionId -> (ratingsNum, csrRowNum, rowOffset) - private def getRatingsPartitionInfo(data: RDD[Rating[ID]]): Map[Int, (Int, Int, Int)] = { - val collectd = data.mapPartitionsWithIndex { case (index: Int, it: Iterator[Rating[ID]]) => - var ratingsNum = 0 - var s = Set[ID]() - it.foreach { v => - s += v.user - ratingsNum += 1 - } - Iterator((index, (ratingsNum, s.count(_ => true)))) - }.collect - - var ret = Map[Int, (Int, Int, Int)]() - var rowOffset = 0 - collectd.foreach { v => - val partitionId = v._1 - val ratingsNum = v._2._1 - val csrRowNum = v._2._2 - ret += ( partitionId -> (ratingsNum, csrRowNum, rowOffset)) - rowOffset = rowOffset + csrRowNum - } - - ret - } - - private def ratingsToCSRNumericTables(ratings: RDD[Rating[ID]], - nVectors: Long, nFeatures: Long, nBlocks: Long): RDD[CSRNumericTable] = { - -// val rowSortedRatings = ratings.sortBy(_.user.toString.toLong) - -// val itemsInBlock = (nFeatures + nBlocks - 1) / nBlocks - val itemsInBlock = nFeatures / nBlocks -// val rowSortedGrouped = rowSortedRatings.groupBy(value => value.user.toString.toLong / itemsInBlock).flatMap(_._2) - val rowSortedGrouped = ratings - // Transpose the dataset - .map { p => - Rating(p.item, p.user, p.rating) - } - .groupBy(value => value.user.toString.toLong) - .partitionBy(new ALSDataPartitioner(nBlocks.toInt, itemsInBlock)) - .flatMap(_._2).mapPartitions { p 
=> - p.toArray.sortBy(_.user.toString.toLong).toIterator - } - - println("rowSortedGrouped partition number: ", rowSortedGrouped.getNumPartitions) - - // rowSortedGrouped.mapPartitionsWithIndex { case (partitionId, partition) => -// println("partitionId", partitionId) -// partition.foreach { p => -// println(p.user, p.item, p.rating) } -// Iterator(partitionId) -// }.collect() - - val ratingsPartitionInfo = getRatingsPartitionInfo(rowSortedGrouped) - println("ratingsPartitionInfo:", ratingsPartitionInfo) - - rowSortedGrouped.mapPartitionsWithIndex { case (partitionId, partition) => - val ratingsNum = ratingsPartitionInfo(partitionId)._1 - val csrRowNum = ratingsPartitionInfo(partitionId)._2 - val values = Array.fill(ratingsNum) { 0.0f } - val columnIndices = Array.fill(ratingsNum) { 0L } - val rowOffsets = ArrayBuffer[Long](1L) - - - var index = 0 - var curRow = 0L - // Each partition converted to one CSRNumericTable - partition.foreach { p => - // Modify row index for each partition (start from 0) - val row = p.user.toString.toLong - ratingsPartitionInfo(partitionId)._3 - val column = p.item.toString.toLong - val rating = p.rating - - values(index) = rating - // one-based index - columnIndices(index) = column + 1 - - if (row > curRow) { - curRow = row - // one-based index - rowOffsets += index + 1 - } - - index = index + 1 - } - // one-based row index - rowOffsets += index+1 - - println("PartitionId:", partitionId) - println("csrRowNum", csrRowNum) -// println("rowOffsets", rowOffsets.mkString(",")) -// println("columnIndices", columnIndices.mkString(",")) -// println("values", values.mkString(",")) - - val contextLocal = new DaalContext() - - println("ALSDALImpl: Loading native libraries ..." ) - LibLoader.loadLibraries() - - val cTable = OneDAL.cNewCSRNumericTable(values, columnIndices, rowOffsets.toArray, nVectors, csrRowNum) - val table = new CSRNumericTable(contextLocal, cTable) -// table.pack() - - println("Input dimensions:", table.getNumberOfRows, table.getNumberOfColumns) - - // There is a bug https://github.com/oneapi-src/oneDAL/pull/1288, - // printNumericTable can't print correct result for CSRNumericTable, use C++ printNumericTable - // Service.printNumericTable("Input: ", table) - - Iterator(table) - }.cache() - } - -// def factorsToRDD(cUsersFactorsNumTab: Long, cItemsFactorsNumTab: Long) -// :(RDD[(ID, Array[Float])], RDD[(ID, Array[Float])]) = { -// val usersFactorsNumTab = OneDAL.makeNumericTable(cUsersFactorsNumTab) -// val itemsFactorsNumTab = OneDAL.makeNumericTable(cItemsFactorsNumTab) -// -// Service.printNumericTable("usersFactorsNumTab", usersFactorsNumTab) -// Service.printNumericTable("itemsFactorsNumTab", itemsFactorsNumTab) -// -// null -// } - - def ratingsToByteBuffer(ratings: Array[Rating[ID]]): ByteBuffer = { -// println("ratings len", ratings.length) - - val buffer= ByteBuffer.allocateDirect(ratings.length*(8+8+4)) - // Use little endian - buffer.order(ByteOrder.LITTLE_ENDIAN) - ratings.foreach { rating => - buffer.putLong(rating.user.toString.toLong) - buffer.putLong(rating.item.toString.toLong) - buffer.putFloat(rating.rating) - } - buffer - } - - def run(): (RDD[(ID, Array[Float])], RDD[(ID, Array[Float])]) = { - val executorNum = Utils.sparkExecutorNum(data.sparkContext) - val executorCores = Utils.sparkExecutorCores() - - val nFeatures = data.max()(new Ordering[Rating[ID]]() { - override def compare(x: Rating[ID], y: Rating[ID]): Int = - Ordering[Long].compare(x.item.toString.toLong, y.item.toString.toLong) - }).item.toString.toLong + 1 - - val 
nVectors = data.max()(new Ordering[Rating[ID]]() { - override def compare(x: Rating[ID], y: Rating[ID]): Int = - Ordering[Long].compare(x.user.toString.toLong, y.user.toString.toLong) - }).user.toString.toLong + 1 - -// val largestItems = data.sortBy(_.item.toString.toLong, ascending = false).take(1) -// val nFeatures = largestItems(0).item.toString.toLong + 1 - -// val largestUsers = data.sortBy(_.user.toString.toLong, ascending = false).take(1) -// val nVectors = largestUsers(0).user.toString.toLong + 1 - - val nBlocks = executorNum - -// val nRatings = data.count() - - logInfo(s"ALSDAL fit using $executorNum Executors for $nVectors vectors and $nFeatures features") - - val numericTables = data.repartition(executorNum).setName("Repartitioned for conversion").cache() - - val executorIPAddress = Utils.sparkFirstExecutorIP(numericTables.sparkContext) - val kvsIP = numericTables.sparkContext.conf.get("spark.oap.mllib.oneccl.kvs.ip", executorIPAddress) - - val kvsPortDetected = Utils.checkExecutorAvailPort(numericTables, kvsIP) - val kvsPort = numericTables.sparkContext.conf.getInt("spark.oap.mllib.oneccl.kvs.port", kvsPortDetected) - - val kvsIPPort = kvsIP+"_"+kvsPort - - val results = numericTables - // Transpose the dataset - .map { p => - Rating(p.item, p.user, p.rating) } - .mapPartitionsWithIndex { (rank, iter) => - val context = new DaalContext() - println("ALSDALImpl: Loading libMLlibDAL.so" ) - LibLoader.loadLibraries() - - OneCCL.init(executorNum, rank, kvsIPPort) - val rankId = OneCCL.rankID() - - println("rankId", rankId, "nUsers", nVectors, "nItems", nFeatures) - - val buffer = ratingsToByteBuffer(iter.toArray) - val bufferInfo = new ALSPartitionInfo - val shuffledBuffer = cShuffleData(buffer, nFeatures.toInt, nBlocks, bufferInfo) - - val table = bufferToCSRNumericTable(shuffledBuffer, bufferInfo, nVectors.toInt, nFeatures.toInt, nBlocks, rankId) - - val result = new ALSResult() - cDALImplictALS( - table.getCNumericTable, nUsers = nVectors, - nFactors, maxIter, regParam, alpha, - executorNum, - executorCores, - rankId, - result - ) - Iterator(result) - }.cache() - -// results.foreach { p => -//// val usersFactorsNumTab = OneDAL.makeNumericTable(p.cUsersFactorsNumTab) -//// println("foreach", p.cUsersFactorsNumTab, p.cItemsFactorsNumTab) -// println("result", p.rankId, p.cUserOffset, p.cItemOffset); -// } - -// val usersFactorsRDD = results.mapPartitionsWithIndex { (index: Int, partiton: Iterator[ALSResult]) => -// partiton.foreach { p => -// val usersFactorsNumTab = OneDAL.makeNumericTable(p.cUsersFactorsNumTab) -// Service.printNumericTable("usersFactorsNumTab", usersFactorsNumTab) -// } -// Iterator() -// }.collect() - - val usersFactorsRDD = results.mapPartitionsWithIndex { (index: Int, partiton: Iterator[ALSResult]) => - val ret = partiton.flatMap { p => - val userOffset = p.cUserOffset.toInt - val usersFactorsNumTab = OneDAL.makeNumericTable(p.cUsersFactorsNumTab) - val nRows = usersFactorsNumTab.getNumberOfRows.toInt - val nCols = usersFactorsNumTab.getNumberOfColumns.toInt - var buffer = FloatBuffer.allocate(nCols * nRows) - // should use returned buffer - buffer = usersFactorsNumTab.getBlockOfRows(0, nRows, buffer) - (0 until nRows).map { index => - val array = Array.fill(nCols){0.0f} - buffer.get(array, 0, nCols) - ((index+userOffset).asInstanceOf[ID], array) - }.toIterator - } - ret - }.setName("userFactors").cache() - - val itemsFactorsRDD = results.mapPartitionsWithIndex { (index: Int, partiton: Iterator[ALSResult]) => - val ret = partiton.flatMap { p => - val 
itemOffset = p.cItemOffset.toInt - val itemsFactorsNumTab = OneDAL.makeNumericTable(p.cItemsFactorsNumTab) - val nRows = itemsFactorsNumTab.getNumberOfRows.toInt - val nCols = itemsFactorsNumTab.getNumberOfColumns.toInt - var buffer = FloatBuffer.allocate(nCols * nRows) - // should use returned buffer - buffer = itemsFactorsNumTab.getBlockOfRows(0, nRows, buffer) - (0 until nRows).map { index => - val array = Array.fill(nCols){0.0f} - buffer.get(array, 0, nCols) - ((index+itemOffset).asInstanceOf[ID], array) - }.toIterator - } - ret - }.setName("itemFactors").cache() - - usersFactorsRDD.count() - itemsFactorsRDD.count() - -// usersFactorsRDD.foreach { case (id, array) => -// println("usersFactorsRDD", id, array.mkString(", ")) -// } -// -// itemsFactorsRDD.foreach { case (id, array) => -// println("itemsFactorsRDD", id, array.mkString(", ")) -// } - - (usersFactorsRDD, itemsFactorsRDD) - } - - private def getPartitionOffset(partitionId: Int, nRatings: Int, nBlocks: Int): Int = { - require(partitionId >=0 && partitionId < nBlocks) - val itemsInBlock = nRatings / nBlocks - return partitionId * itemsInBlock - } - - private def bufferToCSRNumericTable(buffer: ByteBuffer, info: ALSPartitionInfo, - nVectors: Int, nFeatures: Int, nBlocks: Int, rankId: Int): CSRNumericTable = { - // Use little endian - buffer.order(ByteOrder.LITTLE_ENDIAN) - - val ratingsNum = info.ratingsNum - val csrRowNum = info.csrRowNum - val values = Array.fill(ratingsNum) { 0.0f } - val columnIndices = Array.fill(ratingsNum) { 0L } - val rowOffsets = ArrayBuffer[Long](1L) - - var index = 0 - var curRow = 0L - // Each partition converted to one CSRNumericTable - for (i <- 0 until ratingsNum) { - // Modify row index for each partition (start from 0) - val row = buffer.getLong(i*RATING_SIZE) - getPartitionOffset(rankId, nFeatures, nBlocks) - val column = buffer.getLong(i*RATING_SIZE+8) - val rating = buffer.getFloat(i*RATING_SIZE+16) - - values(index) = rating - // one-based index - columnIndices(index) = column + 1 - - if (row > curRow) { - curRow = row - // one-based index - rowOffsets += index + 1 - } - - index = index + 1 - } - // one-based row index - rowOffsets += index+1 - -// println("rankId:", rankId) -// println("csrRowNum", csrRowNum) - -// println(rowOffsets.mkString(" ")) -// println(columnIndices.mkString(" ")) -// println(values.mkString(" ")) - - val contextLocal = new DaalContext() - val cTable = OneDAL.cNewCSRNumericTable(values, columnIndices, rowOffsets.toArray, nVectors, csrRowNum) - val table = new CSRNumericTable(contextLocal, cTable) - - println("Input dimensions:", table.getNumberOfRows, table.getNumberOfColumns) -// Service.printNumericTable("Input NumericTable", table) - - table - } - - // Single entry to call Implict ALS DAL backend - @native private def cDALImplictALS(data: Long, - nUsers: Long, - nFactors: Int, - maxIter: Int, - regParam: Double, - alpha: Double, - executor_num: Int, - executor_cores: Int, - rankId: Int, - result: ALSResult): Long - @native private def cShuffleData(data: ByteBuffer, - nTotalKeys: Int, - nBlocks: Int, - info: ALSPartitionInfo): ByteBuffer -} diff --git a/mllib-dal/src/main/scala/org/apache/spark/ml/util/OneCCL.scala b/mllib-dal/src/main/scala/org/apache/spark/ml/util/OneCCL.scala index 7581a1003..4c38a9bdc 100644 --- a/mllib-dal/src/main/scala/org/apache/spark/ml/util/OneCCL.scala +++ b/mllib-dal/src/main/scala/org/apache/spark/ml/util/OneCCL.scala @@ -17,32 +17,60 @@ package org.apache.spark.ml.util -import org.apache.spark.internal.Logging +import 
org.apache.spark.SparkConf -object OneCCL extends Logging { +object OneCCL { var cclParam = new CCLParam() + var kvsIPPort = sys.env.getOrElse("CCL_KVS_IP_PORT", "") + var worldSize = sys.env.getOrElse("CCL_WORLD_SIZE", "1").toInt + + var KVS_PORT = 51234 + + private def checkEnv() { + val altTransport = sys.env.getOrElse("CCL_ATL_TRANSPORT", "") + val pmType = sys.env.getOrElse("CCL_PM_TYPE", "") + val ipExchange = sys.env.getOrElse("CCL_KVS_IP_EXCHANGE", "") + + assert(altTransport == "ofi") + assert(pmType == "resizable") + assert(ipExchange == "env") + assert(kvsIPPort != "") + + } + // Run on Executor - def setExecutorEnv(): Unit = { + def setExecutorEnv(executor_num: Int, ip: String, port: Int): Unit = { + // Work around ccl by passings in a spark.executorEnv.CCL_KVS_IP_PORT. + val ccl_kvs_ip_port = sys.env.getOrElse("CCL_KVS_IP_PORT", s"${ip}_${port}") + + println(s"oneCCL: Initializing with CCL_KVS_IP_PORT: $ccl_kvs_ip_port") + + setEnv("CCL_PM_TYPE", "resizable") setEnv("CCL_ATL_TRANSPORT","ofi") + setEnv("CCL_ATL_TRANSPORT_PATH", LibLoader.getTempSubDir()) + setEnv("CCL_KVS_IP_EXCHANGE","env") + setEnv("CCL_KVS_IP_PORT", ccl_kvs_ip_port) + setEnv("CCL_WORLD_SIZE", s"${executor_num}") // Uncomment this if you whant to debug oneCCL // setEnv("CCL_LOG_LEVEL", "2") } - def init(executor_num: Int, rank: Int, ip_port: String) = { - - setExecutorEnv() + def init(executor_num: Int, ip: String, port: Int) = { - logInfo(s"Initializing with IP_PORT: ${ip_port}") + setExecutorEnv(executor_num, ip, port) // cclParam is output from native code - c_init(executor_num, rank, ip_port, cclParam) + c_init(cclParam) // executor number should equal to oneCCL world size assert(executor_num == cclParam.commSize, "executor number should equal to oneCCL world size") - logInfo(s"Initialized with executorNum: $executor_num, commSize, ${cclParam.commSize}, rankId: ${cclParam.rankId}") + println(s"oneCCL: Initialized with executorNum: $executor_num, commSize, ${cclParam.commSize}, rankId: ${cclParam.rankId}") + + KVS_PORT = KVS_PORT + 1 + } // Run on Executor @@ -50,16 +78,11 @@ object OneCCL extends Logging { c_cleanup() } - def getAvailPort(localIP: String): Int = synchronized { - c_getAvailPort(localIP) - } - - @native private def c_init(size: Int, rank: Int, ip_port: String, param: CCLParam) : Int + @native private def c_init(param: CCLParam) : Int @native private def c_cleanup() : Unit @native def isRoot() : Boolean @native def rankID() : Int @native def setEnv(key: String, value: String, overwrite: Boolean = true): Int - @native def c_getAvailPort(localIP: String): Int -} \ No newline at end of file +} diff --git a/mllib-dal/src/main/scala/org/apache/spark/ml/util/OneDAL.scala b/mllib-dal/src/main/scala/org/apache/spark/ml/util/OneDAL.scala index 9b6c0f6c7..2f6c83775 100644 --- a/mllib-dal/src/main/scala/org/apache/spark/ml/util/OneDAL.scala +++ b/mllib-dal/src/main/scala/org/apache/spark/ml/util/OneDAL.scala @@ -149,7 +149,4 @@ object OneDAL { @native def cFreeDataMemory(numTableAddr: Long) @native def cCheckPlatformCompatibility() : Boolean - - @native def cNewCSRNumericTable(data: Array[Float], colIndices: Array[Long], rowOffsets: Array[Long], nFeatures: Long, - nVectors: Long) : Long } diff --git a/mllib-dal/src/main/scala/org/apache/spark/ml/util/Utils.scala b/mllib-dal/src/main/scala/org/apache/spark/ml/util/Utils.scala index aa8eb8979..40a1c6823 100644 --- a/mllib-dal/src/main/scala/org/apache/spark/ml/util/Utils.scala +++ b/mllib-dal/src/main/scala/org/apache/spark/ml/util/Utils.scala @@ -71,22 
+71,8 @@ object Utils { ip } - def checkExecutorAvailPort(data: RDD[_], localIP: String) : Int = { - val sc = data.sparkContext - val result = data.mapPartitions { p => - LibLoader.loadLibraries() - val port = OneCCL.getAvailPort(localIP) - if (port != -1) - Iterator(port) - else - Iterator() - }.collect() - - return result(0) - } - def checkClusterPlatformCompatibility(sc: SparkContext) : Boolean = { - LibLoader.loadLibraries() + LibLoader.loadLibMLlibDAL() // check driver platform compatibility if (!OneDAL.cCheckPlatformCompatibility()) @@ -96,7 +82,7 @@ object Utils { val executor_num = Utils.sparkExecutorNum(sc) val data = sc.parallelize(1 to executor_num, executor_num) val result = data.map { p => - LibLoader.loadLibraries() + LibLoader.loadLibMLlibDAL() OneDAL.cCheckPlatformCompatibility() }.collect() diff --git a/mllib-dal/src/test/scala/org/apache/spark/ml/recommendation/IntelALSSuite.scala b/mllib-dal/src/test/scala/org/apache/spark/ml/recommendation/IntelALSSuite.scala deleted file mode 100644 index 339644888..000000000 --- a/mllib-dal/src/test/scala/org/apache/spark/ml/recommendation/IntelALSSuite.scala +++ /dev/null @@ -1,1239 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.ml.recommendation - -import java.io.File -import java.util.Random - -import scala.collection.JavaConverters._ -import scala.collection.mutable -import scala.collection.mutable.{ArrayBuffer, WrappedArray} - -import com.github.fommil.netlib.BLAS.{getInstance => blas} -import org.apache.commons.io.FileUtils -import org.apache.commons.io.filefilter.TrueFileFilter -import org.scalatest.BeforeAndAfterEach - -import org.apache.spark._ -import org.apache.spark.internal.Logging -import org.apache.spark.ml.linalg.Vectors -import org.apache.spark.ml.recommendation.ALS._ -import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest, MLTestingUtils} -import org.apache.spark.ml.util.TestingUtils._ -import org.apache.spark.mllib.util.MLlibTestSparkContext -import org.apache.spark.rdd.RDD -import org.apache.spark.scheduler.{SparkListener, SparkListenerStageCompleted} -import org.apache.spark.sql.{DataFrame, Encoder, Row, SparkSession} -import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder -import org.apache.spark.sql.functions.{col, lit} -import org.apache.spark.sql.streaming.StreamingQueryException -import org.apache.spark.sql.types._ -import org.apache.spark.storage.StorageLevel -import org.apache.spark.util.Utils - -class IntelALSSuite extends MLTest with DefaultReadWriteTest with Logging { - - override def beforeAll(): Unit = { - super.beforeAll() - sc.setCheckpointDir(tempDir.getAbsolutePath) - } - - override def afterAll(): Unit = { - super.afterAll() - } - - test("LocalIndexEncoder") { - val random = new Random - for (numBlocks <- Seq(1, 2, 5, 10, 20, 50, 100)) { - val encoder = new LocalIndexEncoder(numBlocks) - val maxLocalIndex = Int.MaxValue / numBlocks - val tests = Seq.fill(5)((random.nextInt(numBlocks), random.nextInt(maxLocalIndex))) ++ - Seq((0, 0), (numBlocks - 1, maxLocalIndex)) - tests.foreach { case (blockId, localIndex) => - val err = s"Failed with numBlocks=$numBlocks, blockId=$blockId, and localIndex=$localIndex." 
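
As a side note on the encoder exercised by this test: the packing it performs can be reproduced in a few lines of plain Scala, since the real LocalIndexEncoder is package-private. The numBlocks, blockId, and localIndex values below are arbitrary examples.

```
// Block id lives in the high bits, local index in the low bits.
object IndexPackingSketch {
  def main(args: Array[String]): Unit = {
    val numBlocks = 10
    val numLocalIndexBits = math.min(Integer.numberOfLeadingZeros(numBlocks - 1), 31)
    val localIndexMask = (1 << numLocalIndexBits) - 1

    val blockId = 3
    val localIndex = 12345
    val encoded = (blockId << numLocalIndexBits) | localIndex

    // Decoding recovers both parts: unsigned shift for the block id,
    // mask for the local index.
    println(s"encoded=$encoded, blockId=${encoded >>> numLocalIndexBits}, " +
      s"localIndex=${encoded & localIndexMask}")
  }
}
```
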
- val encoded = encoder.encode(blockId, localIndex) - assert(encoder.blockId(encoded) === blockId, err) - assert(encoder.localIndex(encoded) === localIndex, err) - } - } - } - - test("normal equation construction") { - val k = 2 - val ne0 = new NormalEquation(k) - .add(Array(1.0f, 2.0f), 3.0) - .add(Array(4.0f, 5.0f), 12.0, 2.0) // weighted - assert(ne0.k === k) - assert(ne0.triK === k * (k + 1) / 2) - // NumPy code that computes the expected values: - // A = np.matrix("1 2; 4 5") - // b = np.matrix("3; 6") - // C = np.matrix(np.diag([1, 2])) - // ata = A.transpose() * C * A - // atb = A.transpose() * C * b - assert(Vectors.dense(ne0.ata) ~== Vectors.dense(33.0, 42.0, 54.0) relTol 1e-8) - assert(Vectors.dense(ne0.atb) ~== Vectors.dense(51.0, 66.0) relTol 1e-8) - - val ne1 = new NormalEquation(2) - .add(Array(7.0f, 8.0f), 9.0) - ne0.merge(ne1) - // NumPy code that computes the expected values: - // A = np.matrix("1 2; 4 5; 7 8") - // b = np.matrix("3; 6; 9") - // C = np.matrix(np.diag([1, 2, 1])) - // ata = A.transpose() * C * A - // atb = A.transpose() * C * b - assert(Vectors.dense(ne0.ata) ~== Vectors.dense(82.0, 98.0, 118.0) relTol 1e-8) - assert(Vectors.dense(ne0.atb) ~== Vectors.dense(114.0, 138.0) relTol 1e-8) - - intercept[IllegalArgumentException] { - ne0.add(Array(1.0f), 2.0) - } - intercept[IllegalArgumentException] { - ne0.add(Array(1.0f, 2.0f, 3.0f), 4.0) - } - intercept[IllegalArgumentException] { - ne0.add(Array(1.0f, 2.0f), 0.0, -1.0) - } - intercept[IllegalArgumentException] { - val ne2 = new NormalEquation(3) - ne0.merge(ne2) - } - - ne0.reset() - assert(ne0.ata.forall(_ == 0.0)) - assert(ne0.atb.forall(_ == 0.0)) - } - - test("CholeskySolver") { - val k = 2 - val ne0 = new NormalEquation(k) - .add(Array(1.0f, 2.0f), 4.0) - .add(Array(1.0f, 3.0f), 9.0) - .add(Array(1.0f, 4.0f), 16.0) - val ne1 = new NormalEquation(k) - .merge(ne0) - - val chol = new CholeskySolver - val x0 = chol.solve(ne0, 0.0).map(_.toDouble) - // NumPy code that computes the expected solution: - // A = np.matrix("1 2; 1 3; 1 4") - // b = b = np.matrix("3; 6") - // x0 = np.linalg.lstsq(A, b)[0] - assert(Vectors.dense(x0) ~== Vectors.dense(-8.333333, 6.0) relTol 1e-6) - - assert(ne0.ata.forall(_ == 0.0)) - assert(ne0.atb.forall(_ == 0.0)) - - val x1 = chol.solve(ne1, 1.5).map(_.toDouble) - // NumPy code that computes the expected solution, where lambda is scaled by n: - // x0 = np.linalg.solve(A.transpose() * A + 1.5 * np.eye(2), A.transpose() * b) - assert(Vectors.dense(x1) ~== Vectors.dense(-0.1155556, 3.28) relTol 1e-6) - } - - test("RatingBlockBuilder") { - val emptyBuilder = new RatingBlockBuilder[Int]() - assert(emptyBuilder.size === 0) - val emptyBlock = emptyBuilder.build() - assert(emptyBlock.srcIds.isEmpty) - assert(emptyBlock.dstIds.isEmpty) - assert(emptyBlock.ratings.isEmpty) - - val builder0 = new RatingBlockBuilder() - .add(Rating(0, 1, 2.0f)) - .add(Rating(3, 4, 5.0f)) - assert(builder0.size === 2) - val builder1 = new RatingBlockBuilder() - .add(Rating(6, 7, 8.0f)) - .merge(builder0.build()) - assert(builder1.size === 3) - val block = builder1.build() - val ratings = Seq.tabulate(block.size) { i => - (block.srcIds(i), block.dstIds(i), block.ratings(i)) - }.toSet - assert(ratings === Set((0, 1, 2.0f), (3, 4, 5.0f), (6, 7, 8.0f))) - } - - test("UncompressedInBlock") { - val encoder = new LocalIndexEncoder(10) - val uncompressed = new UncompressedInBlockBuilder[Int](encoder) - .add(0, Array(1, 0, 2), Array(0, 1, 4), Array(1.0f, 2.0f, 3.0f)) - .add(1, Array(3, 0), Array(2, 5), Array(4.0f, 
5.0f)) - .build() - assert(uncompressed.length === 5) - val records = Seq.tabulate(uncompressed.length) { i => - val dstEncodedIndex = uncompressed.dstEncodedIndices(i) - val dstBlockId = encoder.blockId(dstEncodedIndex) - val dstLocalIndex = encoder.localIndex(dstEncodedIndex) - (uncompressed.srcIds(i), dstBlockId, dstLocalIndex, uncompressed.ratings(i)) - }.toSet - val expected = - Set((1, 0, 0, 1.0f), (0, 0, 1, 2.0f), (2, 0, 4, 3.0f), (3, 1, 2, 4.0f), (0, 1, 5, 5.0f)) - assert(records === expected) - - val compressed = uncompressed.compress() - assert(compressed.size === 5) - assert(compressed.srcIds.toSeq === Seq(0, 1, 2, 3)) - assert(compressed.dstPtrs.toSeq === Seq(0, 2, 3, 4, 5)) - var decompressed = ArrayBuffer.empty[(Int, Int, Int, Float)] - var i = 0 - while (i < compressed.srcIds.length) { - var j = compressed.dstPtrs(i) - while (j < compressed.dstPtrs(i + 1)) { - val dstEncodedIndex = compressed.dstEncodedIndices(j) - val dstBlockId = encoder.blockId(dstEncodedIndex) - val dstLocalIndex = encoder.localIndex(dstEncodedIndex) - decompressed += ((compressed.srcIds(i), dstBlockId, dstLocalIndex, compressed.ratings(j))) - j += 1 - } - i += 1 - } - assert(decompressed.toSet === expected) - } - - test("CheckedCast") { - val checkedCast = new ALS().checkedCast - val df = spark.range(1) - - withClue("Valid Integer Ids") { - df.select(checkedCast(lit(123))).collect() - } - - withClue("Valid Long Ids") { - df.select(checkedCast(lit(1231L))).collect() - } - - withClue("Valid Decimal Ids") { - df.select(checkedCast(lit(123).cast(DecimalType(15, 2)))).collect() - } - - withClue("Valid Double Ids") { - df.select(checkedCast(lit(123.0))).collect() - } - - val msg = "either out of Integer range or contained a fractional part" - withClue("Invalid Long: out of range") { - val e: SparkException = intercept[SparkException] { - df.select(checkedCast(lit(1231000000000L))).collect() - } - assert(e.getMessage.contains(msg)) - } - - withClue("Invalid Decimal: out of range") { - val e: SparkException = intercept[SparkException] { - df.select(checkedCast(lit(1231000000000.0).cast(DecimalType(15, 2)))).collect() - } - assert(e.getMessage.contains(msg)) - } - - withClue("Invalid Decimal: fractional part") { - val e: SparkException = intercept[SparkException] { - df.select(checkedCast(lit(123.1).cast(DecimalType(15, 2)))).collect() - } - assert(e.getMessage.contains(msg)) - } - - withClue("Invalid Double: out of range") { - val e: SparkException = intercept[SparkException] { - df.select(checkedCast(lit(1231000000000.0))).collect() - } - assert(e.getMessage.contains(msg)) - } - - withClue("Invalid Double: fractional part") { - val e: SparkException = intercept[SparkException] { - df.select(checkedCast(lit(123.1))).collect() - } - assert(e.getMessage.contains(msg)) - } - - withClue("Invalid Type") { - val e: SparkException = intercept[SparkException] { - df.select(checkedCast(lit("123.1"))).collect() - } - assert(e.getMessage.contains("was not numeric")) - } - } - - /** - * Generates an explicit feedback dataset for testing ALS. 
- * @param numUsers number of users - * @param numItems number of items - * @param rank rank - * @param noiseStd the standard deviation of additive Gaussian noise on training data - * @param seed random seed - * @return (training, test) - */ - def genExplicitTestData( - numUsers: Int, - numItems: Int, - rank: Int, - noiseStd: Double = 0.0, - seed: Long = 11L): (RDD[Rating[Int]], RDD[Rating[Int]]) = { - val trainingFraction = 0.6 - val testFraction = 0.3 - val totalFraction = trainingFraction + testFraction - val random = new Random(seed) - val userFactors = genFactors(numUsers, rank, random) - val itemFactors = genFactors(numItems, rank, random) - val training = ArrayBuffer.empty[Rating[Int]] - val test = ArrayBuffer.empty[Rating[Int]] - for ((userId, userFactor) <- userFactors; (itemId, itemFactor) <- itemFactors) { - val x = random.nextDouble() - if (x < totalFraction) { - val rating = blas.sdot(rank, userFactor, 1, itemFactor, 1) - if (x < trainingFraction) { - val noise = noiseStd * random.nextGaussian() - training += Rating(userId, itemId, rating + noise.toFloat) - } else { - test += Rating(userId, itemId, rating) - } - } - } - logInfo(s"Generated an explicit feedback dataset with ${training.size} ratings for training " + - s"and ${test.size} for test.") - (sc.parallelize(training, 2), sc.parallelize(test, 2)) - } - - /** - * Generates an implicit feedback dataset for testing ALS. - * @param numUsers number of users - * @param numItems number of items - * @param rank rank - * @param noiseStd the standard deviation of additive Gaussian noise on training data - * @param seed random seed - * @return (training, test) - */ - def genImplicitTestData( - numUsers: Int, - numItems: Int, - rank: Int, - noiseStd: Double = 0.0, - seed: Long = 11L): (RDD[Rating[Int]], RDD[Rating[Int]]) = { - ALSSuite.genImplicitTestData(sc, numUsers, numItems, rank, noiseStd, seed) - } - - /** - * Generates random user/item factors, with i.i.d. values drawn from U(a, b). - * @param size number of users/items - * @param rank number of features - * @param random random number generator - * @param a min value of the support (default: -1) - * @param b max value of the support (default: 1) - * @return a sequence of (ID, factors) pairs - */ - private def genFactors( - size: Int, - rank: Int, - random: Random, - a: Float = -1.0f, - b: Float = 1.0f): Seq[(Int, Array[Float])] = { - IntelALSSuite.genFactors(size, rank, random, a, b) - } - - /** - * Train ALS using the given training set and parameters - * @param training training dataset - * @param rank rank of the matrix factorization - * @param maxIter max number of iterations - * @param regParam regularization constant - * @param implicitPrefs whether to use implicit preference - * @param numUserBlocks number of user blocks - * @param numItemBlocks number of item blocks - * @return a trained ALSModel - */ - def trainALS( - training: RDD[Rating[Int]], - rank: Int, - maxIter: Int, - regParam: Double, - implicitPrefs: Boolean = false, - numUserBlocks: Int = 2, - numItemBlocks: Int = 3): ALSModel = { - val spark = this.spark - import spark.implicits._ - val als = new ALS() - .setRank(rank) - .setRegParam(regParam) - .setImplicitPrefs(implicitPrefs) - .setNumUserBlocks(numUserBlocks) - .setNumItemBlocks(numItemBlocks) - .setSeed(0) - als.fit(training.toDF()) - } - - /** - * Test ALS using the given training/test splits and parameters. 
- * @param training training dataset - * @param test test dataset - * @param rank rank of the matrix factorization - * @param maxIter max number of iterations - * @param regParam regularization constant - * @param implicitPrefs whether to use implicit preference - * @param numUserBlocks number of user blocks - * @param numItemBlocks number of item blocks - * @param targetRMSE target test RMSE - */ - def testALS( - training: RDD[Rating[Int]], - test: RDD[Rating[Int]], - rank: Int, - maxIter: Int, - regParam: Double, - implicitPrefs: Boolean = false, - numUserBlocks: Int = 2, - numItemBlocks: Int = 3, - targetRMSE: Double = 0.05): Unit = { - val spark = this.spark - import spark.implicits._ - val als = new ALS() - .setRank(rank) - .setRegParam(regParam) - .setImplicitPrefs(implicitPrefs) - .setNumUserBlocks(numUserBlocks) - .setNumItemBlocks(numItemBlocks) - .setSeed(0) - val alpha = als.getAlpha - val model = als.fit(training.toDF()) - testTransformerByGlobalCheckFunc[Rating[Int]](test.toDF(), model, "rating", "prediction") { - case rows: Seq[Row] => - val predictions = rows.map(row => (row.getFloat(0).toDouble, row.getFloat(1).toDouble)) - - val rmse = - if (implicitPrefs) { - // TODO: Use a better (rank-based?) evaluation metric for implicit feedback. - // We limit the ratings and the predictions to interval [0, 1] and compute the - // weighted RMSE with the confidence scores as weights. - val (totalWeight, weightedSumSq) = predictions.map { case (rating, prediction) => - val confidence = 1.0 + alpha * math.abs(rating) - val rating01 = math.max(math.min(rating, 1.0), 0.0) - val prediction01 = math.max(math.min(prediction, 1.0), 0.0) - val err = prediction01 - rating01 - (confidence, confidence * err * err) - }.reduce[(Double, Double)] { case ((c0, e0), (c1, e1)) => - (c0 + c1, e0 + e1) - } - math.sqrt(weightedSumSq / totalWeight) - } else { - val errorSquares = predictions.map { case (rating, prediction) => - val err = rating - prediction - err * err - } - val mse = errorSquares.sum / errorSquares.length - math.sqrt(mse) - } - logInfo(s"Test RMSE is $rmse.") - assert(rmse < targetRMSE) - } - - MLTestingUtils.checkCopyAndUids(als, model) - } - - test("exact rank-1 matrix") { - val (training, test) = genExplicitTestData(numUsers = 20, numItems = 40, rank = 1) - testALS(training, test, maxIter = 1, rank = 1, regParam = 1e-5, targetRMSE = 0.001) - testALS(training, test, maxIter = 1, rank = 2, regParam = 1e-5, targetRMSE = 0.001) - } - - test("approximate rank-1 matrix") { - val (training, test) = - genExplicitTestData(numUsers = 20, numItems = 40, rank = 1, noiseStd = 0.01) - testALS(training, test, maxIter = 2, rank = 1, regParam = 0.01, targetRMSE = 0.02) - testALS(training, test, maxIter = 2, rank = 2, regParam = 0.01, targetRMSE = 0.02) - } - - test("approximate rank-2 matrix") { - val (training, test) = - genExplicitTestData(numUsers = 20, numItems = 40, rank = 2, noiseStd = 0.01) - testALS(training, test, maxIter = 4, rank = 2, regParam = 0.01, targetRMSE = 0.03) - testALS(training, test, maxIter = 4, rank = 3, regParam = 0.01, targetRMSE = 0.03) - } - - test("different block settings") { - val (training, test) = - genExplicitTestData(numUsers = 20, numItems = 40, rank = 2, noiseStd = 0.01) - for ((numUserBlocks, numItemBlocks) <- Seq((1, 1), (1, 2), (2, 1), (2, 2))) { - testALS(training, test, maxIter = 4, rank = 3, regParam = 0.01, targetRMSE = 0.03, - numUserBlocks = numUserBlocks, numItemBlocks = numItemBlocks) - } - } - - test("more blocks than ratings") { - val (training, test) = 
- genExplicitTestData(numUsers = 4, numItems = 4, rank = 1) - testALS(training, test, maxIter = 2, rank = 1, regParam = 1e-4, targetRMSE = 0.002, - numItemBlocks = 5, numUserBlocks = 5) - } - - test("implicit feedback") { - val (training, test) = - genImplicitTestData(numUsers = 20, numItems = 40, rank = 2, noiseStd = 0.01) - testALS(training, test, maxIter = 4, rank = 2, regParam = 0.01, implicitPrefs = true, - targetRMSE = 0.3) - } - - test("implicit feedback regression") { - val trainingWithNeg = sc.parallelize(Seq(Rating(0, 0, 1), Rating(1, 1, 1), Rating(0, 1, -3))) - val trainingWithZero = sc.parallelize(Seq(Rating(0, 0, 1), Rating(1, 1, 1), Rating(0, 1, 0))) - val modelWithNeg = - trainALS(trainingWithNeg, rank = 1, maxIter = 5, regParam = 0.01, implicitPrefs = true) - val modelWithZero = - trainALS(trainingWithZero, rank = 1, maxIter = 5, regParam = 0.01, implicitPrefs = true) - val userFactorsNeg = modelWithNeg.userFactors - val itemFactorsNeg = modelWithNeg.itemFactors - val userFactorsZero = modelWithZero.userFactors - val itemFactorsZero = modelWithZero.itemFactors - assert(userFactorsNeg.intersect(userFactorsZero).count() == 0) - assert(itemFactorsNeg.intersect(itemFactorsZero).count() == 0) - } - test("using generic ID types") { - val (ratings, _) = genImplicitTestData(numUsers = 20, numItems = 40, rank = 2, noiseStd = 0.01) - - val longRatings = ratings.map(r => Rating(r.user.toLong, r.item.toLong, r.rating)) - val (longUserFactors, _) = ALS.train(longRatings, rank = 2, maxIter = 4, seed = 0) - assert(longUserFactors.first()._1.getClass === classOf[Long]) - - val strRatings = ratings.map(r => Rating(r.user.toString, r.item.toString, r.rating)) - val (strUserFactors, _) = ALS.train(strRatings, rank = 2, maxIter = 4, seed = 0) - assert(strUserFactors.first()._1.getClass === classOf[String]) - } - - test("nonnegative constraint") { - val (ratings, _) = genImplicitTestData(numUsers = 20, numItems = 40, rank = 2, noiseStd = 0.01) - val (userFactors, itemFactors) = - ALS.train(ratings, rank = 2, maxIter = 4, nonnegative = true, seed = 0) - def isNonnegative(factors: RDD[(Int, Array[Float])]): Boolean = { - factors.values.map { _.forall(_ >= 0.0) }.reduce(_ && _) - } - assert(isNonnegative(userFactors)) - assert(isNonnegative(itemFactors)) - // TODO: Validate the solution. 
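
For reference, the confidence-weighted RMSE that testALS applies to the implicit-feedback runs above can be written as a standalone helper; the (rating, prediction) pairs in main are made up for illustration.

```
// Ratings and predictions are clipped to [0, 1]; squared errors are weighted
// by confidence = 1 + alpha * |rating|.
object WeightedRmseSketch {
  def weightedRmse(pairs: Seq[(Double, Double)], alpha: Double): Double = {
    val (totalWeight, weightedSumSq) = pairs.map { case (rating, prediction) =>
      val confidence = 1.0 + alpha * math.abs(rating)
      val r01 = math.max(math.min(rating, 1.0), 0.0)
      val p01 = math.max(math.min(prediction, 1.0), 0.0)
      val err = p01 - r01
      (confidence, confidence * err * err)
    }.reduce[(Double, Double)] { case ((c0, e0), (c1, e1)) => (c0 + c1, e0 + e1) }
    math.sqrt(weightedSumSq / totalWeight)
  }

  def main(args: Array[String]): Unit = {
    val pairs = Seq((1.0, 0.9), (0.0, 0.2), (2.0, 1.1))
    println(s"weighted RMSE = ${weightedRmse(pairs, alpha = 1.0)}")
  }
}
```
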
- } - - test("als partitioner is a projection") { - for (p <- Seq(1, 10, 100, 1000)) { - val part = new ALSPartitioner(p) - var k = 0 - while (k < p) { - assert(k === part.getPartition(k)) - assert(k === part.getPartition(k.toLong)) - k += 1 - } - } - } - - test("partitioner in returned factors") { - val (ratings, _) = genImplicitTestData(numUsers = 20, numItems = 40, rank = 2, noiseStd = 0.01) - val (userFactors, itemFactors) = ALS.train( - ratings, rank = 2, maxIter = 4, numUserBlocks = 3, numItemBlocks = 4, seed = 0) - for ((tpe, factors) <- Seq(("User", userFactors), ("Item", itemFactors))) { - assert(userFactors.partitioner.isDefined, s"$tpe factors should have partitioner.") - val part = userFactors.partitioner.get - userFactors.mapPartitionsWithIndex { (idx, items) => - items.foreach { case (id, _) => - if (part.getPartition(id) != idx) { - throw new SparkException(s"$tpe with ID $id should not be in partition $idx.") - } - } - Iterator.empty - }.count() - } - } - - test("als with large number of iterations") { - val (ratings, _) = genExplicitTestData(numUsers = 4, numItems = 4, rank = 1) - ALS.train(ratings, rank = 1, maxIter = 50, numUserBlocks = 2, numItemBlocks = 2, seed = 0) - ALS.train(ratings, rank = 1, maxIter = 50, numUserBlocks = 2, numItemBlocks = 2, - implicitPrefs = true, seed = 0) - } - - test("read/write") { - val spark = this.spark - import ALSSuite._ - import spark.implicits._ - val (ratings, _) = genExplicitTestData(numUsers = 4, numItems = 4, rank = 1) - - def getFactors(df: DataFrame): Set[(Int, Array[Float])] = { - df.select("id", "features").collect().map { case r => - (r.getInt(0), r.getAs[Array[Float]](1)) - }.toSet - } - - def checkModelData(model: ALSModel, model2: ALSModel): Unit = { - assert(model.rank === model2.rank) - assert(getFactors(model.userFactors) === getFactors(model2.userFactors)) - assert(getFactors(model.itemFactors) === getFactors(model2.itemFactors)) - } - - val als = new ALS() - testEstimatorAndModelReadWrite(als, ratings.toDF(), allEstimatorParamSettings, - allModelParamSettings, checkModelData) - } - - private def checkNumericTypesALS( - estimator: ALS, - spark: SparkSession, - column: String, - baseType: NumericType) - (check: (ALSModel, ALSModel) => Unit) - (check2: (ALSModel, ALSModel, DataFrame, Encoder[_]) => Unit): Unit = { - val dfs = genRatingsDFWithNumericCols(spark, column) - val maybeDf = dfs.find { case (numericTypeWithEncoder, _) => - numericTypeWithEncoder.numericType == baseType - } - assert(maybeDf.isDefined) - val df = maybeDf.get._2 - - val expected = estimator.fit(df) - val actuals = dfs.map(t => (t, estimator.fit(t._2))) - actuals.foreach { case (_, actual) => check(expected, actual) } - actuals.foreach { case (t, actual) => check2(expected, actual, t._2, t._1.encoder) } - - val baseDF = dfs.find(_._1.numericType == baseType).get._2 - val others = baseDF.columns.toSeq.diff(Seq(column)).map(col) - val cols = Seq(col(column).cast(StringType)) ++ others - val strDF = baseDF.select(cols: _*) - val thrown = intercept[IllegalArgumentException] { - estimator.fit(strDF) - } - assert(thrown.getMessage.contains( - s"$column must be of type numeric but was actually of type string")) - } - - private class NumericTypeWithEncoder[A](val numericType: NumericType) - (implicit val encoder: Encoder[(A, Int, Double)]) - - private def genRatingsDFWithNumericCols( - spark: SparkSession, - column: String) = { - - import testImplicits._ - - val df = spark.createDataFrame(Seq( - (0, 10, 1.0), - (1, 20, 2.0), - (2, 30, 3.0), - (3, 40, 4.0), 
- (4, 50, 5.0) - )).toDF("user", "item", "rating") - - val others = df.columns.toSeq.diff(Seq(column)).map(col) - val types = - Seq(new NumericTypeWithEncoder[Short](ShortType), - new NumericTypeWithEncoder[Long](LongType), - new NumericTypeWithEncoder[Int](IntegerType), - new NumericTypeWithEncoder[Float](FloatType), - new NumericTypeWithEncoder[Byte](ByteType), - new NumericTypeWithEncoder[Double](DoubleType), - new NumericTypeWithEncoder[Decimal](DecimalType(10, 0))(ExpressionEncoder()) - ) - types.map { t => - val cols = Seq(col(column).cast(t.numericType)) ++ others - t -> df.select(cols: _*) - } - } - - test("input type validation") { - val spark = this.spark - import spark.implicits._ - - // check that ALS can handle all numeric types for rating column - // and user/item columns (when the user/item ids are within Int range) - val als = new ALS().setMaxIter(1).setRank(1) - Seq(("user", IntegerType), ("item", IntegerType), ("rating", FloatType)).foreach { - case (colName, sqlType) => - checkNumericTypesALS(als, spark, colName, sqlType) { - (ex, act) => - ex.userFactors.first().getSeq[Float](1) === act.userFactors.first().getSeq[Float](1) - } { (ex, act, df, enc) => - // With AQE on/off, the order of result may be different. Here sortby the result. - val expected = ex.transform(df).selectExpr("prediction") - .sort("prediction").first().getFloat(0) - testTransformerByGlobalCheckFunc(df, act, "prediction") { - case rows: Seq[Row] => - expected ~== rows.sortBy(_.getFloat(0)).head.getFloat(0) absTol 1e-6 - }(enc) - } - } - // check user/item ids falling outside of Int range - val big = Int.MaxValue.toLong + 1 - val small = Int.MinValue.toDouble - 1 - val df = Seq( - (0, 0L, 0d, 1, 1L, 1d, 3.0), - (0, big, small, 0, big, small, 2.0), - (1, 1L, 1d, 0, 0L, 0d, 5.0) - ).toDF("user", "user_big", "user_small", "item", "item_big", "item_small", "rating") - val msg = "either out of Integer range or contained a fractional part" - withClue("fit should fail when ids exceed integer range. ") { - assert(intercept[SparkException] { - als.fit(df.select(df("user_big").as("user"), df("item"), df("rating"))) - }.getCause.getMessage.contains(msg)) - assert(intercept[SparkException] { - als.fit(df.select(df("user_small").as("user"), df("item"), df("rating"))) - }.getCause.getMessage.contains(msg)) - assert(intercept[SparkException] { - als.fit(df.select(df("item_big").as("item"), df("user"), df("rating"))) - }.getCause.getMessage.contains(msg)) - assert(intercept[SparkException] { - als.fit(df.select(df("item_small").as("item"), df("user"), df("rating"))) - }.getCause.getMessage.contains(msg)) - } - withClue("transform should fail when ids exceed integer range. 
") { - val model = als.fit(df) - def testTransformIdExceedsIntRange[A : Encoder](dataFrame: DataFrame): Unit = { - val e1 = intercept[SparkException] { - model.transform(dataFrame).collect() - } - TestUtils.assertExceptionMsg(e1, msg) - val e2 = intercept[StreamingQueryException] { - testTransformer[A](dataFrame, model, "prediction") { _ => } - } - TestUtils.assertExceptionMsg(e2, msg) - } - testTransformIdExceedsIntRange[(Long, Int)](df.select(df("user_big").as("user"), - df("item"))) - testTransformIdExceedsIntRange[(Double, Int)](df.select(df("user_small").as("user"), - df("item"))) - testTransformIdExceedsIntRange[(Long, Int)](df.select(df("item_big").as("item"), - df("user"))) - testTransformIdExceedsIntRange[(Double, Int)](df.select(df("item_small").as("item"), - df("user"))) - } - } - - test("SPARK-18268: ALS with empty RDD should fail with better message") { - val ratings = sc.parallelize(Array.empty[Rating[Int]]) - intercept[IllegalArgumentException] { - ALS.train(ratings) - } - } - - test("ALS cold start user/item prediction strategy") { - val spark = this.spark - import org.apache.spark.sql.functions._ - import spark.implicits._ - - val (ratings, _) = genExplicitTestData(numUsers = 4, numItems = 4, rank = 1) - val data = ratings.toDF - val knownUser = data.select(max("user")).as[Int].first() - val unknownUser = knownUser + 10 - val knownItem = data.select(max("item")).as[Int].first() - val unknownItem = knownItem + 20 - val test = Seq( - (unknownUser, unknownItem, true), - (knownUser, unknownItem, true), - (unknownUser, knownItem, true), - (knownUser, knownItem, false) - ).toDF("user", "item", "expectedIsNaN") - - val als = new ALS().setMaxIter(1).setRank(1) - // default is 'nan' - val defaultModel = als.fit(data) - testTransformer[(Int, Int, Boolean)](test, defaultModel, "expectedIsNaN", "prediction") { - case Row(expectedIsNaN: Boolean, prediction: Float) => - assert(prediction.isNaN === expectedIsNaN) - } - - // check 'drop' strategy should filter out rows with unknown users/items - val defaultPrediction = defaultModel.transform(test).select("prediction") - .as[Float].filter(!_.isNaN).first() - testTransformerByGlobalCheckFunc[(Int, Int, Boolean)](test, - defaultModel.setColdStartStrategy("drop"), "prediction") { - case rows: Seq[Row] => - val dropPredictions = rows.map(_.getFloat(0)) - assert(dropPredictions.length == 1) - assert(!dropPredictions.head.isNaN) - assert(dropPredictions.head ~== defaultPrediction relTol 1e-14) - } - } - - test("case insensitive cold start param value") { - val spark = this.spark - import spark.implicits._ - val (ratings, _) = genExplicitTestData(numUsers = 2, numItems = 2, rank = 1) - val data = ratings.toDF - val model = new ALS().fit(data) - Seq("nan", "NaN", "Nan", "drop", "DROP", "Drop").foreach { s => - testTransformer[Rating[Int]](data, model.setColdStartStrategy(s), "prediction") { _ => } - } - } - - private def getALSModel = { - val spark = this.spark - import spark.implicits._ - - val userFactors = Seq( - (0, Array(6.0f, 4.0f)), - (1, Array(3.0f, 4.0f)), - (2, Array(3.0f, 6.0f)) - ).toDF("id", "features") - val itemFactors = Seq( - (3, Array(5.0f, 6.0f)), - (4, Array(6.0f, 2.0f)), - (5, Array(3.0f, 6.0f)), - (6, Array(4.0f, 1.0f)) - ).toDF("id", "features") - val als = new ALS().setRank(2) - new ALSModel(als.uid, als.getRank, userFactors, itemFactors) - .setUserCol("user") - .setItemCol("item") - } - - test("recommendForAllUsers with k <, = and > num_items") { - val model = getALSModel - val numUsers = model.userFactors.count - val 
numItems = model.itemFactors.count - val expected = Map( - 0 -> Seq((3, 54f), (4, 44f), (5, 42f), (6, 28f)), - 1 -> Seq((3, 39f), (5, 33f), (4, 26f), (6, 16f)), - 2 -> Seq((3, 51f), (5, 45f), (4, 30f), (6, 18f)) - ) - - Seq(2, 4, 6).foreach { k => - val n = math.min(k, numItems).toInt - val expectedUpToN = expected.mapValues(_.slice(0, n)) - val topItems = model.recommendForAllUsers(k) - assert(topItems.count() == numUsers) - assert(topItems.columns.contains("user")) - checkRecommendations(topItems, expectedUpToN, "item") - } - } - - test("recommendForAllItems with k <, = and > num_users") { - val model = getALSModel - val numUsers = model.userFactors.count - val numItems = model.itemFactors.count - val expected = Map( - 3 -> Seq((0, 54f), (2, 51f), (1, 39f)), - 4 -> Seq((0, 44f), (2, 30f), (1, 26f)), - 5 -> Seq((2, 45f), (0, 42f), (1, 33f)), - 6 -> Seq((0, 28f), (2, 18f), (1, 16f)) - ) - - Seq(2, 3, 4).foreach { k => - val n = math.min(k, numUsers).toInt - val expectedUpToN = expected.mapValues(_.slice(0, n)) - val topUsers = getALSModel.recommendForAllItems(k) - assert(topUsers.count() == numItems) - assert(topUsers.columns.contains("item")) - checkRecommendations(topUsers, expectedUpToN, "user") - } - } - - test("recommendForUserSubset with k <, = and > num_items") { - val spark = this.spark - import spark.implicits._ - val model = getALSModel - val numItems = model.itemFactors.count - val expected = Map( - 0 -> Seq((3, 54f), (4, 44f), (5, 42f), (6, 28f)), - 2 -> Seq((3, 51f), (5, 45f), (4, 30f), (6, 18f)) - ) - val userSubset = expected.keys.toSeq.toDF("user") - val numUsersSubset = userSubset.count - - Seq(2, 4, 6).foreach { k => - val n = math.min(k, numItems).toInt - val expectedUpToN = expected.mapValues(_.slice(0, n)) - val topItems = model.recommendForUserSubset(userSubset, k) - assert(topItems.count() == numUsersSubset) - assert(topItems.columns.contains("user")) - checkRecommendations(topItems, expectedUpToN, "item") - } - } - - test("recommendForItemSubset with k <, = and > num_users") { - val spark = this.spark - import spark.implicits._ - val model = getALSModel - val numUsers = model.userFactors.count - val expected = Map( - 3 -> Seq((0, 54f), (2, 51f), (1, 39f)), - 6 -> Seq((0, 28f), (2, 18f), (1, 16f)) - ) - val itemSubset = expected.keys.toSeq.toDF("item") - val numItemsSubset = itemSubset.count - - Seq(2, 3, 4).foreach { k => - val n = math.min(k, numUsers).toInt - val expectedUpToN = expected.mapValues(_.slice(0, n)) - val topUsers = model.recommendForItemSubset(itemSubset, k) - assert(topUsers.count() == numItemsSubset) - assert(topUsers.columns.contains("item")) - checkRecommendations(topUsers, expectedUpToN, "user") - } - } - - test("subset recommendations eliminate duplicate ids, returns same results as unique ids") { - val spark = this.spark - import spark.implicits._ - val model = getALSModel - val k = 2 - - val users = Seq(0, 1).toDF("user") - val dupUsers = Seq(0, 1, 0, 1).toDF("user") - val singleUserRecs = model.recommendForUserSubset(users, k) - val dupUserRecs = model.recommendForUserSubset(dupUsers, k) - .as[(Int, Seq[(Int, Float)])].collect().toMap - assert(singleUserRecs.count == dupUserRecs.size) - checkRecommendations(singleUserRecs, dupUserRecs, "item") - - val items = Seq(3, 4, 5).toDF("item") - val dupItems = Seq(3, 4, 5, 4, 5).toDF("item") - val singleItemRecs = model.recommendForItemSubset(items, k) - val dupItemRecs = model.recommendForItemSubset(dupItems, k) - .as[(Int, Seq[(Int, Float)])].collect().toMap - assert(singleItemRecs.count == 
dupItemRecs.size) - checkRecommendations(singleItemRecs, dupItemRecs, "user") - } - - test("subset recommendations on full input dataset equivalent to recommendForAll") { - val spark = this.spark - import spark.implicits._ - val model = getALSModel - val k = 2 - - val userSubset = model.userFactors.withColumnRenamed("id", "user").drop("features") - val userSubsetRecs = model.recommendForUserSubset(userSubset, k) - val allUserRecs = model.recommendForAllUsers(k).as[(Int, Seq[(Int, Float)])].collect().toMap - checkRecommendations(userSubsetRecs, allUserRecs, "item") - - val itemSubset = model.itemFactors.withColumnRenamed("id", "item").drop("features") - val itemSubsetRecs = model.recommendForItemSubset(itemSubset, k) - val allItemRecs = model.recommendForAllItems(k).as[(Int, Seq[(Int, Float)])].collect().toMap - checkRecommendations(itemSubsetRecs, allItemRecs, "user") - } - - test("ALS should not introduce unnecessary shuffle") { - def getShuffledDependencies(rdd: RDD[_]): Seq[ShuffleDependency[_, _, _]] = { - rdd.dependencies.flatMap { - case s: ShuffleDependency[_, _, _] => - Seq(s) ++ getShuffledDependencies(s.rdd) - case o => - Seq.empty ++ getShuffledDependencies(o.rdd) - } - } - - val spark = this.spark - import spark.implicits._ - val (ratings, _) = genExplicitTestData(numUsers = 2, numItems = 2, rank = 1) - val data = ratings.toDF - val model = new ALS() - .setMaxIter(2) - .setImplicitPrefs(true) - .setCheckpointInterval(-1) - .fit(data) - - val userFactors = model.userFactors - val itemFactors = model.itemFactors - val shuffledUserFactors = getShuffledDependencies(userFactors.rdd).filter { dep => - dep.rdd.name != null && dep.rdd.name.contains("userFactors") - } - val shuffledItemFactors = getShuffledDependencies(itemFactors.rdd).filter { dep => - dep.rdd.name != null && dep.rdd.name.contains("itemFactors") - } - assert(shuffledUserFactors.size == 0) - assert(shuffledItemFactors.size == 0) - } - - private def checkRecommendations( - topK: DataFrame, - expected: Map[Int, Seq[(Int, Float)]], - dstColName: String): Unit = { - val spark = this.spark - import spark.implicits._ - - assert(topK.columns.contains("recommendations")) - topK.as[(Int, Seq[(Int, Float)])].collect().foreach { case (id: Int, recs: Seq[(Int, Float)]) => - assert(recs === expected(id)) - } - topK.collect().foreach { row => - val recs = row.getAs[WrappedArray[Row]]("recommendations") - assert(recs(0).fieldIndex(dstColName) == 0) - assert(recs(0).fieldIndex("rating") == 1) - } - } -} - -class ALSCleanerSuite extends SparkFunSuite with BeforeAndAfterEach { - override def beforeEach(): Unit = { - super.beforeEach() - // Once `Utils.getOrCreateLocalRootDirs` is called, it is cached in `Utils.localRootDirs`. - // Unless this is manually cleared before and after a test, it returns the same directory - // set before even if 'spark.local.dir' is configured afterwards. 
- Utils.clearLocalRootDirs() - } - - override def afterEach(): Unit = { - Utils.clearLocalRootDirs() - super.afterEach() - } - - test("ALS shuffle cleanup standalone") { - val conf = new SparkConf() - val localDir = Utils.createTempDir() - val checkpointDir = Utils.createTempDir() - def getAllFiles: Set[File] = - FileUtils.listFiles(localDir, TrueFileFilter.INSTANCE, TrueFileFilter.INSTANCE).asScala.toSet - try { - conf.set("spark.local.dir", localDir.getAbsolutePath) - val sc = new SparkContext("local[2]", "test", conf) - try { - sc.setCheckpointDir(checkpointDir.getAbsolutePath) - // Test checkpoint and clean parents - val input = sc.parallelize(1 to 1000) - val keyed = input.map(x => (x % 20, 1)) - val shuffled = keyed.reduceByKey(_ + _) - val keysOnly = shuffled.keys - val deps = keysOnly.dependencies - keysOnly.count() - ALS.cleanShuffleDependencies(sc, deps, true) - val resultingFiles = getAllFiles - assert(resultingFiles === Set()) - // Ensure running count again works fine even if we kill the shuffle files. - keysOnly.count() - } finally { - sc.stop() - } - } finally { - Utils.deleteRecursively(localDir) - Utils.deleteRecursively(checkpointDir) - } - } - - test("ALS shuffle cleanup in algorithm") { - val conf = new SparkConf() - val localDir = Utils.createTempDir() - val checkpointDir = Utils.createTempDir() - def getAllFiles: Set[File] = - FileUtils.listFiles(localDir, TrueFileFilter.INSTANCE, TrueFileFilter.INSTANCE).asScala.toSet - try { - conf.set("spark.local.dir", localDir.getAbsolutePath) - val sc = new SparkContext("local[2]", "ALSCleanerSuite", conf) - try { - sc.setCheckpointDir(checkpointDir.getAbsolutePath) - // Generate test data - val (training, _) = ALSSuite.genImplicitTestData(sc, 20, 5, 1, 0.2, 0) - // Implicitly test the cleaning of parents during ALS training - val spark = SparkSession.builder - .sparkContext(sc) - .getOrCreate() - import spark.implicits._ - val als = new ALS() - .setRank(1) - .setRegParam(1e-5) - .setSeed(0) - .setCheckpointInterval(1) - .setMaxIter(7) - val model = als.fit(training.toDF()) - val resultingFiles = getAllFiles - // We expect the last shuffles files, block ratings, user factors, and item factors to be - // around but no more. 
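
Both this cleanup test and the shuffle checks above hinge on walking an RDD's dependency graph for ShuffleDependency ancestors. A minimal self-contained version of that walk, run against a throwaway local SparkContext with an arbitrary sample job, might look like this.

```
import org.apache.spark.{Dependency, ShuffleDependency, SparkConf, SparkContext}
import org.apache.spark.rdd.RDD

object ShuffleDepsSketch {
  // Recursively collect the shuffle ids of all ShuffleDependency ancestors.
  def shuffleIds(rdd: RDD[_]): Seq[Int] = {
    def walk(deps: Seq[Dependency[_]]): Seq[Int] = deps.flatMap {
      case s: ShuffleDependency[_, _, _] => s.shuffleId +: walk(s.rdd.dependencies)
      case other                         => walk(other.rdd.dependencies)
    }
    walk(rdd.dependencies)
  }

  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[2]").setAppName("deps-sketch"))
    try {
      val counts = sc.parallelize(1 to 1000).map(x => (x % 20, 1)).reduceByKey(_ + _)
      counts.count()
      println(s"shuffle ids reachable from counts.keys: ${shuffleIds(counts.keys)}")
    } finally {
      sc.stop()
    }
  }
}
```
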
- val pattern = "shuffle_(\\d+)_.+\\.data".r - val rddIds = resultingFiles.flatMap { f => - pattern.findAllIn(f.getName()).matchData.map { _.group(1) } } - assert(rddIds.size === 4) - } finally { - sc.stop() - } - } finally { - Utils.deleteRecursively(localDir) - Utils.deleteRecursively(checkpointDir) - } - } -} - -class ALSStorageSuite - extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest with Logging { - - test("invalid storage params") { - intercept[IllegalArgumentException] { - new ALS().setIntermediateStorageLevel("foo") - } - intercept[IllegalArgumentException] { - new ALS().setIntermediateStorageLevel("NONE") - } - intercept[IllegalArgumentException] { - new ALS().setFinalStorageLevel("foo") - } - } - - test("default and non-default storage params set correct RDD StorageLevels") { - val spark = this.spark - import spark.implicits._ - val data = Seq( - (0, 0, 1.0), - (0, 1, 2.0), - (1, 2, 3.0), - (1, 0, 2.0) - ).toDF("user", "item", "rating") - val als = new ALS().setMaxIter(1).setRank(1) - // add listener to check intermediate RDD default storage levels - val defaultListener = new IntermediateRDDStorageListener - sc.addSparkListener(defaultListener) - val model = als.fit(data) - // check final factor RDD default storage levels - val defaultFactorRDDs = sc.getPersistentRDDs.collect { - case (id, rdd) if rdd.name == "userFactors" || rdd.name == "itemFactors" => - rdd.name -> ((id, rdd.getStorageLevel)) - }.toMap - defaultFactorRDDs.foreach { case (_, (id, level)) => - assert(level == StorageLevel.MEMORY_AND_DISK) - } - defaultListener.storageLevels.foreach(level => assert(level == StorageLevel.MEMORY_AND_DISK)) - - // add listener to check intermediate RDD non-default storage levels - val nonDefaultListener = new IntermediateRDDStorageListener - sc.addSparkListener(nonDefaultListener) - val nonDefaultModel = als - .setFinalStorageLevel("MEMORY_ONLY") - .setIntermediateStorageLevel("DISK_ONLY") - .fit(data) - // check final factor RDD non-default storage levels - val levels = sc.getPersistentRDDs.collect { - case (id, rdd) if rdd.name == "userFactors" && rdd.id != defaultFactorRDDs("userFactors")._1 - || rdd.name == "itemFactors" && rdd.id != defaultFactorRDDs("itemFactors")._1 => - rdd.getStorageLevel - } - levels.foreach(level => assert(level == StorageLevel.MEMORY_ONLY)) - nonDefaultListener.storageLevels.foreach(level => assert(level == StorageLevel.DISK_ONLY)) - } -} - -private class IntermediateRDDStorageListener extends SparkListener { - - val storageLevels: mutable.ArrayBuffer[StorageLevel] = mutable.ArrayBuffer() - - override def onStageCompleted(stageCompleted: SparkListenerStageCompleted): Unit = { - val stageLevels = stageCompleted.stageInfo.rddInfos.collect { - case info if info.name.contains("Blocks") || info.name.contains("Factors-") => - info.storageLevel - } - storageLevels ++= stageLevels - } - -} - -object IntelALSSuite extends Logging { - - /** - * Mapping from all Params to valid settings which differ from the defaults. - * This is useful for tests which need to exercise all Params, such as save/load. - * This excludes input columns to simplify some tests. - */ - val allModelParamSettings: Map[String, Any] = Map( - "predictionCol" -> "myPredictionCol" - ) - - /** - * Mapping from all Params to valid settings which differ from the defaults. - * This is useful for tests which need to exercise all Params, such as save/load. - * This excludes input columns to simplify some tests. 
-   */
-  val allEstimatorParamSettings: Map[String, Any] = allModelParamSettings ++ Map(
-    "maxIter" -> 1,
-    "rank" -> 1,
-    "regParam" -> 0.01,
-    "numUserBlocks" -> 2,
-    "numItemBlocks" -> 2,
-    "implicitPrefs" -> true,
-    "alpha" -> 0.9,
-    "nonnegative" -> true,
-    "checkpointInterval" -> 20,
-    "intermediateStorageLevel" -> "MEMORY_ONLY",
-    "finalStorageLevel" -> "MEMORY_AND_DISK_SER"
-  )
-
-  // Helper functions to generate test data we share between ALS test suites
-
-  /**
-   * Generates random user/item factors, with i.i.d. values drawn from U(a, b).
-   * @param size number of users/items
-   * @param rank number of features
-   * @param random random number generator
-   * @param a min value of the support (default: -1)
-   * @param b max value of the support (default: 1)
-   * @return a sequence of (ID, factors) pairs
-   */
-  private def genFactors(
-      size: Int,
-      rank: Int,
-      random: Random,
-      a: Float = -1.0f,
-      b: Float = 1.0f): Seq[(Int, Array[Float])] = {
-    require(size > 0 && size < Int.MaxValue / 3)
-    require(b > a)
-    val ids = mutable.Set.empty[Int]
-    while (ids.size < size) {
-      ids += random.nextInt()
-    }
-    val width = b - a
-    ids.toSeq.sorted.map(id => (id, Array.fill(rank)(a + random.nextFloat() * width)))
-  }
-
-  /**
-   * Generates an implicit feedback dataset for testing ALS.
-   *
-   * @param sc SparkContext
-   * @param numUsers number of users
-   * @param numItems number of items
-   * @param rank rank
-   * @param noiseStd the standard deviation of additive Gaussian noise on training data
-   * @param seed random seed
-   * @return (training, test)
-   */
-  def genImplicitTestData(
-      sc: SparkContext,
-      numUsers: Int,
-      numItems: Int,
-      rank: Int,
-      noiseStd: Double = 0.0,
-      seed: Long = 11L): (RDD[Rating[Int]], RDD[Rating[Int]]) = {
-    // The assumption of the implicit feedback model is that unobserved ratings are more likely to
-    // be negatives.
-    val positiveFraction = 0.8
-    val negativeFraction = 1.0 - positiveFraction
-    val trainingFraction = 0.6
-    val testFraction = 0.3
-    val totalFraction = trainingFraction + testFraction
-    val random = new Random(seed)
-    val userFactors = genFactors(numUsers, rank, random)
-    val itemFactors = genFactors(numItems, rank, random)
-    val training = ArrayBuffer.empty[Rating[Int]]
-    val test = ArrayBuffer.empty[Rating[Int]]
-    for ((userId, userFactor) <- userFactors; (itemId, itemFactor) <- itemFactors) {
-      val rating = blas.sdot(rank, userFactor, 1, itemFactor, 1)
-      val threshold = if (rating > 0) positiveFraction else negativeFraction
-      val observed = random.nextDouble() < threshold
-      if (observed) {
-        val x = random.nextDouble()
-        if (x < totalFraction) {
-          if (x < trainingFraction) {
-            val noise = noiseStd * random.nextGaussian()
-            training += Rating(userId, itemId, rating + noise.toFloat)
-          } else {
-            test += Rating(userId, itemId, rating)
-          }
-        }
-      }
-    }
-    logInfo(s"Generated an implicit feedback dataset with ${training.size} ratings for training " +
-      s"and ${test.size} for test.")
-    (sc.parallelize(training, 2), sc.parallelize(test, 2))
-  }
-}
diff --git a/mllib-dal/test-cluster.sh b/mllib-dal/test-cluster.sh
deleted file mode 100755
index 4f5a6132a..000000000
--- a/mllib-dal/test-cluster.sh
+++ /dev/null
@@ -1,5 +0,0 @@
-#!/usr/bin/env bash
-
-cd ../dev/test-cluster/workloads
-
-./run-kmeans-pyspark.sh
diff --git a/mllib-dal/test.sh b/mllib-dal/test.sh
index 0157c22a4..f7e73ca1f 100755
--- a/mllib-dal/test.sh
+++ b/mllib-dal/test.sh
@@ -35,9 +35,8 @@ export LD_PRELOAD=$JAVA_HOME/jre/lib/amd64/libjsig.so
 # -Dtest=none to turn off the Java tests
 
 # Test all
-# mvn -Dtest=none -Dmaven.test.skip=false test
+mvn -Dtest=none -Dmaven.test.skip=false test
 
 # Individual test
-mvn -Dtest=none -DwildcardSuites=org.apache.spark.ml.clustering.IntelKMeansSuite test
-mvn -Dtest=none -DwildcardSuites=org.apache.spark.ml.feature.IntelPCASuite test
-# mvn -Dtest=none -DwildcardSuites=org.apache.spark.ml.recommendation.IntelALSSuite test
+# mvn -Dtest=none -DwildcardSuites=org.apache.spark.ml.clustering.IntelKMeansSuite test
+# mvn -Dtest=none -DwildcardSuites=org.apache.spark.ml.feature.IntelPCASuite test