diff --git a/.github/workflows/oap-mllib-ci.yml b/.github/workflows/oap-mllib-ci.yml index 2c6973321..73700a96d 100644 --- a/.github/workflows/oap-mllib-ci.yml +++ b/.github/workflows/oap-mllib-ci.yml @@ -38,5 +38,4 @@ jobs: source /opt/intel/oneapi/dal/latest/env/vars.sh source /opt/intel/oneapi/tbb/latest/env/vars.sh source /tmp/oneCCL/build/_install/env/setvars.sh - # temp disable and will enable for new release of oneCCL - #./build.sh + ./test.sh diff --git a/.gitignore b/.gitignore index 1d621bdd4..21a5d40c2 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,6 @@ *.o *.log .vscode -*.iml target/ .idea/ .idea_modules/ diff --git a/README.md b/README.md index 477c74b1f..d217becee 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ You can find all the OAP MLlib documents on the [project web page](https://o ### Java/Scala Users Preferred -Use a pre-built OAP MLlib JAR to get started. You can firstly download OAP package from [OAP-JARs-Tarball](https://github.com/Intel-bigdata/OAP/releases/download/v1.0.0-spark-3.0.0/oap-1.0.0-bin-spark-3.0.0.tar.gz) and extract this Tarball to get `oap-mllib-x.x.x-with-spark-x.x.x.jar` under `oap-1.0.0-bin-spark-3.0.0/jars`. +Use a pre-built OAP MLlib JAR to get started. You can first download the OAP package from [OAP-JARs-Tarball](https://github.com/oap-mllib/releases/download/v1.1.0-spark-3.0.0/oap-1.1.0-bin-spark-3.0.0.tar.gz) and extract the tarball to get `oap-mllib-x.x.x-with-spark-x.x.x.jar` under `oap-1.1.0-bin-spark-3.0.0/jars`. Then you can refer to the following [Running](#running) section to try it out. @@ -65,14 +65,6 @@ To use the K-means example for a sanity check, you need to upload a data file to your $ ./run.sh ``` -### Benchmark with HiBench -Use [Hibench](https://github.com/Intel-bigdata/HiBench) to generate dataset with various profiles, and change related variables in `run-XXX.sh` script when applicable. Then run the following commands: ``` - $ cd oap-mllib/examples/kmeans-hibench - $ ./build.sh - $ ./run-hibench-oap-mllib.sh -``` - ### PySpark Support As PySpark-based applications call their Scala counterparts, they are supported out of the box. An example can be found in the [Examples](#examples) section. @@ -95,7 +87,7 @@ Intel® oneAPI Toolkits and its components can be downloaded and installed from [h More details about oneAPI can be found [here](https://software.intel.com/content/www/us/en/develop/tools/oneapi.html). -You can also refer to [this script and comments in it](https://github.com/Intel-bigdata/OAP/blob/branch-1.0-spark-3.x/oap-mllib/dev/install-build-deps-centos.sh) to install correct oneAPI version and manually setup the environments. +You can also refer to [this script and comments in it](https://github.com/oap-project/oap-mllib/blob/branch-1.1-spark-3.x/dev/install-build-deps-centos.sh) to install the correct oneAPI version and manually set up the environment. Scala and Java dependency descriptions are already included in the Maven POM file. 
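To illustrate the README's point that applications use the stock spark.ml API (with PySpark calling its Scala counterparts), here is a minimal Scala sketch, not part of this PR, mirroring the repository's K-means examples. The object name and the data path argument are placeholders; acceleration is assumed to come from putting the oap-mllib JAR on the driver and executor classpaths via `--jars` and the `extraClassPath` settings, as the run scripts in this diff do.

```
// Standard spark.ml K-Means job; no OAP MLlib-specific calls are needed.
import org.apache.spark.ml.clustering.KMeans
import org.apache.spark.sql.SparkSession

object KMeansQuickStart {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.appName("KMeansQuickStart").getOrCreate()

    // Load a libsvm-format file (e.g. Spark's sample_kmeans_data.txt on HDFS),
    // passed as the first command-line argument.
    val dataset = spark.read.format("libsvm").load(args(0))

    // Train with the plain spark.ml API.
    val model = new KMeans().setK(2).setSeed(1L).fit(dataset)

    println("Cluster Centers:")
    model.clusterCenters.foreach(println)

    spark.stop()
  }
}
```

Submitted with `spark-submit --jars oap-mllib-x.x.x.jar` plus the matching `spark.driver.extraClassPath`/`spark.executor.extraClassPath` settings, the same code is expected to run unchanged with or without the accelerated backend.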
@@ -138,7 +130,7 @@ CCL_ROOT | Path to oneCCL home directory We suggest you to source `setvars.sh` script into current shell to setup building environments as following: ``` - $ source /opt/intel/inteloneapi/setvars.sh + $ source /opt/intel/oneapi/setvars.sh $ source /your/oneCCL_source_code/build/_install/env/setvars.sh ``` @@ -160,8 +152,11 @@ Example | Description ----------------|--------------------------- kmeans | K-means example for Scala kmeans-pyspark | K-means example for PySpark -kmeans-hibench | Use HiBench-generated input dataset to benchmark K-means performance +pca | PCA example for Scala +pca-pyspark | PCA example for PySpark ## List of Accelerated Algorithms * K-Means (CPU, Experimental) +* PCA (CPU, Experimental) + diff --git a/dev/install-build-deps-centos.sh b/dev/install-build-deps-centos.sh index 8a347fdef..7b27736ae 100755 --- a/dev/install-build-deps-centos.sh +++ b/dev/install-build-deps-centos.sh @@ -23,7 +23,7 @@ cd /tmp rm -rf oneCCL git clone https://github.com/oneapi-src/oneCCL cd oneCCL -git checkout 2021.1 +git checkout beta08 mkdir -p build && cd build cmake .. make -j 2 install diff --git a/dev/install-build-deps-ubuntu.sh b/dev/install-build-deps-ubuntu.sh index d43e35b89..07019b834 100755 --- a/dev/install-build-deps-ubuntu.sh +++ b/dev/install-build-deps-ubuntu.sh @@ -17,7 +17,7 @@ echo "Building oneCCL ..." cd /tmp git clone https://github.com/oneapi-src/oneCCL cd oneCCL -git checkout 2021.1 +git checkout beta08 mkdir build && cd build cmake .. make -j 2 install diff --git a/dev/test-cluster/config-ssh.sh b/dev/test-cluster/config-ssh.sh deleted file mode 100755 index d093fa17a..000000000 --- a/dev/test-cluster/config-ssh.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/usr/bin/env bash - -ssh-keygen -q -N "" -t rsa -f ~/.ssh/id_rsa -cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys -echo " StrictHostKeyChecking no " | sudo tee -a /etc/ssh/ssh_config -sudo service ssh restart diff --git a/dev/test-cluster/core-site.xml b/dev/test-cluster/core-site.xml deleted file mode 100644 index 7016e477e..000000000 --- a/dev/test-cluster/core-site.xml +++ /dev/null @@ -1,24 +0,0 @@ - - - - - - - - - fs.default.name - hdfs://localhost:8020 - - diff --git a/dev/test-cluster/envs.sh b/dev/test-cluster/envs.sh deleted file mode 100644 index 71e8506e6..000000000 --- a/dev/test-cluster/envs.sh +++ /dev/null @@ -1,22 +0,0 @@ -# Set user Spark and Hadoop home directory -export HADOOP_HOME=~/opt/hadoop-2.7.7 -export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop -export SPARK_HOME=~/opt/spark-3.0.0-bin-hadoop2.7 - -export PYTHONPATH=$SPARK_HOME/python:$PYTHONPATH -export PYSPARK_PYTHON=python3 - -# Set user HDFS Root -export HDFS_ROOT=hdfs://localhost:8020 -export OAP_MLLIB_DATA_ROOT=OAPMLlib/Data -# Set user Intel MLlib Root directory -export OAP_MLLIB_ROOT=${GITHUB_WORKSPACE} - -# Target jar built -OAP_MLLIB_JAR_NAME=oap-mllib-1.1.0.jar -OAP_MLLIB_JAR=$OAP_MLLIB_ROOT/mllib-dal/target/$OAP_MLLIB_JAR_NAME - -# Use absolute path -SPARK_DRIVER_CLASSPATH=$OAP_MLLIB_JAR -# Use relative path -SPARK_EXECUTOR_CLASSPATH=./$OAP_MLLIB_JAR_NAME diff --git a/dev/test-cluster/hadoop-env.sh b/dev/test-cluster/hadoop-env.sh deleted file mode 100755 index bee6c1f69..000000000 --- a/dev/test-cluster/hadoop-env.sh +++ /dev/null @@ -1,99 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Set Hadoop-specific environment variables here. - -# The only required environment variable is JAVA_HOME. All others are -# optional. When running a distributed configuration it is best to -# set JAVA_HOME in this file, so that it is correctly defined on -# remote nodes. - -# The java implementation to use. -# export JAVA_HOME=${JAVA_HOME} -export JAVA_HOME=/usr/local/lib/jvm/openjdk8 - -# The jsvc implementation to use. Jsvc is required to run secure datanodes -# that bind to privileged ports to provide authentication of data transfer -# protocol. Jsvc is not required if SASL is configured for authentication of -# data transfer protocol using non-privileged ports. -#export JSVC_HOME=${JSVC_HOME} - -export HADOOP_CONF_DIR=${HADOOP_CONF_DIR:-"/etc/hadoop"} - -# Extra Java CLASSPATH elements. Automatically insert capacity-scheduler. -for f in $HADOOP_HOME/contrib/capacity-scheduler/*.jar; do - if [ "$HADOOP_CLASSPATH" ]; then - export HADOOP_CLASSPATH=$HADOOP_CLASSPATH:$f - else - export HADOOP_CLASSPATH=$f - fi -done - -# The maximum amount of heap to use, in MB. Default is 1000. -#export HADOOP_HEAPSIZE= -#export HADOOP_NAMENODE_INIT_HEAPSIZE="" - -# Extra Java runtime options. Empty by default. -export HADOOP_OPTS="$HADOOP_OPTS -Djava.net.preferIPv4Stack=true" - -# Command specific options appended to HADOOP_OPTS when specified -export HADOOP_NAMENODE_OPTS="-Dhadoop.security.logger=${HADOOP_SECURITY_LOGGER:-INFO,RFAS} -Dhdfs.audit.logger=${HDFS_AUDIT_LOGGER:-INFO,NullAppender} $HADOOP_NAMENODE_OPTS" -export HADOOP_DATANODE_OPTS="-Dhadoop.security.logger=ERROR,RFAS $HADOOP_DATANODE_OPTS" - -export HADOOP_SECONDARYNAMENODE_OPTS="-Dhadoop.security.logger=${HADOOP_SECURITY_LOGGER:-INFO,RFAS} -Dhdfs.audit.logger=${HDFS_AUDIT_LOGGER:-INFO,NullAppender} $HADOOP_SECONDARYNAMENODE_OPTS" - -export HADOOP_NFS3_OPTS="$HADOOP_NFS3_OPTS" -export HADOOP_PORTMAP_OPTS="-Xmx512m $HADOOP_PORTMAP_OPTS" - -# The following applies to multiple commands (fs, dfs, fsck, distcp etc) -export HADOOP_CLIENT_OPTS="-Xmx512m $HADOOP_CLIENT_OPTS" -#HADOOP_JAVA_PLATFORM_OPTS="-XX:-UsePerfData $HADOOP_JAVA_PLATFORM_OPTS" - -# On secure datanodes, user to run the datanode as after dropping privileges. -# This **MUST** be uncommented to enable secure HDFS if using privileged ports -# to provide authentication of data transfer protocol. This **MUST NOT** be -# defined if SASL is configured for authentication of data transfer protocol -# using non-privileged ports. -export HADOOP_SECURE_DN_USER=${HADOOP_SECURE_DN_USER} - -# Where log files are stored. $HADOOP_HOME/logs by default. -#export HADOOP_LOG_DIR=${HADOOP_LOG_DIR}/$USER - -# Where log files are stored in the secure data environment. -export HADOOP_SECURE_DN_LOG_DIR=${HADOOP_LOG_DIR}/${HADOOP_HDFS_USER} - -### -# HDFS Mover specific parameters -### -# Specify the JVM options to be used when starting the HDFS Mover. 
-# These options will be appended to the options specified as HADOOP_OPTS -# and therefore may override any similar flags set in HADOOP_OPTS -# -# export HADOOP_MOVER_OPTS="" - -### -# Advanced Users Only! -### - -# The directory where pid files are stored. /tmp by default. -# NOTE: this should be set to a directory that can only be written to by -# the user that will run the hadoop daemons. Otherwise there is the -# potential for a symlink attack. -export HADOOP_PID_DIR=${HADOOP_PID_DIR} -export HADOOP_SECURE_DN_PID_DIR=${HADOOP_PID_DIR} - -# A string representing this instance of hadoop. $USER by default. -export HADOOP_IDENT_STRING=$USER diff --git a/dev/test-cluster/hdfs-site.xml b/dev/test-cluster/hdfs-site.xml deleted file mode 100644 index 40fcbb5d6..000000000 --- a/dev/test-cluster/hdfs-site.xml +++ /dev/null @@ -1,32 +0,0 @@ - - - - - - - - - dfs.replication - 1 - - - dfs.namenode.name.dir - /tmp/run/hdfs/namenode - - - dfs.datanode.data.dir - /tmp/run/hdfs/datanode - - diff --git a/dev/test-cluster/setup-cluster.sh b/dev/test-cluster/setup-cluster.sh deleted file mode 100755 index eea058f80..000000000 --- a/dev/test-cluster/setup-cluster.sh +++ /dev/null @@ -1,42 +0,0 @@ -#!/usr/bin/env bash - -WORK_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" - -cd $WORK_DIR - -echo JAVA_HOME is $JAVA_HOME - -mkdir ~/opt -cd ~/opt -wget https://archive.apache.org/dist/spark/spark-3.0.0/spark-3.0.0-bin-hadoop2.7.tgz -tar -xzf spark-3.0.0-bin-hadoop2.7.tgz -wget https://archive.apache.org/dist/hadoop/core/hadoop-2.7.7/hadoop-2.7.7.tar.gz -tar -xzf hadoop-2.7.7.tar.gz - -cd $WORK_DIR - -cp ./core-site.xml ~/opt/hadoop-2.7.7/etc/hadoop/ -cp ./hdfs-site.xml ~/opt/hadoop-2.7.7/etc/hadoop/ -cp ./yarn-site.xml ~/opt/hadoop-2.7.7/etc/hadoop/ -cp ./hadoop-env.sh ~/opt/hadoop-2.7.7/etc/hadoop/ -cp ./spark-defaults.conf ~/opt/spark-3.0.0-bin-hadoop2.7/conf - -# create directories -mkdir -p /tmp/run/hdfs/namenode -mkdir -p /tmp/run/hdfs/datanode - -# hdfs format -~/opt/hadoop-2.7.7/bin/hdfs namenode -format - -export HADOOP_HOME=~/opt/hadoop-2.7.7 -export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop -export SPARK_HOME=~/opt/spark-3.0.0-bin-hadoop2.7 - -export PATH=$HADOOP_HOME/bin:$SPARK_HOME/bin:$PATH - -# start hdfs and yarn -$HADOOP_HOME/sbin/start-dfs.sh -$HADOOP_HOME/sbin/start-yarn.sh - -hadoop fs -ls / -yarn node -list diff --git a/dev/test-cluster/setup-python3-env.sh b/dev/test-cluster/setup-python3-env.sh deleted file mode 100755 index 29208dc5e..000000000 --- a/dev/test-cluster/setup-python3-env.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/usr/bin/env bash - -sudo apt-get update -sudo apt-get install python3-pip python3-setuptools python3-wheel - -pip3 install --user numpy - -echo python is in $(which python) -python --version - -echo python3 is in $(which python3) -python3 --version diff --git a/dev/test-cluster/spark-defaults.conf b/dev/test-cluster/spark-defaults.conf deleted file mode 100644 index 1c25bb2ec..000000000 --- a/dev/test-cluster/spark-defaults.conf +++ /dev/null @@ -1,34 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Default system properties included when running spark-submit. -# This is useful for setting default environmental settings. - -# Example: -# spark.master spark://master:7077 -# spark.eventLog.enabled true -# spark.eventLog.dir hdfs://namenode:8021/directory -# spark.serializer org.apache.spark.serializer.KryoSerializer -# spark.driver.memory 5g -# spark.executor.extraJavaOptions -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three" - -spark.master yarn -spark.serializer org.apache.spark.serializer.KryoSerializer -spark.driver.memory 3g -spark.executor.num 2 -spark.executor.cores 1 -spark.executor.memory 4g diff --git a/dev/test-cluster/workloads/kmeans-pyspark.py b/dev/test-cluster/workloads/kmeans-pyspark.py deleted file mode 100644 index cf93e6034..000000000 --- a/dev/test-cluster/workloads/kmeans-pyspark.py +++ /dev/null @@ -1,70 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -""" -An example demonstrating k-means clustering. -Run with: - bin/spark-submit examples/src/main/python/ml/kmeans_example.py - -This example requires NumPy (http://www.numpy.org/). -""" -from __future__ import print_function -import sys - -# $example on$ -from pyspark.ml.clustering import KMeans -from pyspark.ml.evaluation import ClusteringEvaluator -# $example off$ - -from pyspark.sql import SparkSession - -if __name__ == "__main__": - spark = SparkSession\ - .builder\ - .appName("KMeansExample")\ - .getOrCreate() - - if (len(sys.argv) != 2) : - println("Require data file path as input parameter") - sys.exit(1) - - # $example on$ - # Loads data. - dataset = spark.read.format("libsvm").load(sys.argv[1]) - - # Trains a k-means model. - kmeans = KMeans().setK(2).setSeed(1) - model = kmeans.fit(dataset) - - # Make predictions - predictions = model.transform(dataset) - - # Evaluate clustering by computing Silhouette score - evaluator = ClusteringEvaluator() - - silhouette = evaluator.evaluate(predictions) - print("Silhouette with squared euclidean distance = " + str(silhouette)) - - # Shows the result. 
- centers = model.clusterCenters() - print("Cluster Centers: ") - for center in centers: - print(center) - # $example off$ - - spark.stop() - diff --git a/dev/test-cluster/workloads/run-kmeans-pyspark.sh b/dev/test-cluster/workloads/run-kmeans-pyspark.sh deleted file mode 100755 index e07f3f7b6..000000000 --- a/dev/test-cluster/workloads/run-kmeans-pyspark.sh +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/env bash - -source ../envs.sh - -# Data file is from Spark Examples (data/mllib/sample_kmeans_data.txt), the data file should be copied to HDFS -$HADOOP_HOME/bin/hadoop fs -mkdir -p $OAP_MLLIB_DATA_ROOT -$HADOOP_HOME/bin/hadoop fs -copyFromLocal $SPARK_HOME/data/mllib/sample_kmeans_data.txt $OAP_MLLIB_DATA_ROOT - -# User should check the requested resources are acturally allocated by cluster manager or Intel MLlib will behave incorrectly -SPARK_MASTER=yarn -SPARK_DRIVER_MEMORY=1G -SPARK_NUM_EXECUTORS=2 -SPARK_EXECUTOR_CORES=1 -SPARK_EXECUTOR_MEMORY=1G - -SPARK_DEFAULT_PARALLELISM=$(expr $SPARK_NUM_EXECUTORS '*' $SPARK_EXECUTOR_CORES '*' 2) - -# ======================================================= # - -# Check env -if [[ -z $SPARK_HOME ]]; then - echo SPARK_HOME not defined! - exit 1 -fi - -if [[ -z $HADOOP_HOME ]]; then - echo HADOOP_HOME not defined! - exit 1 -fi - -APP_PY="$OAP_MLLIB_ROOT/dev/test-cluster/workloads/kmeans-pyspark.py" -DATA_FILE=$OAP_MLLIB_DATA_ROOT/sample_kmeans_data.txt - -$SPARK_HOME/bin/spark-submit --master $SPARK_MASTER -v \ - --num-executors $SPARK_NUM_EXECUTORS \ - --driver-memory $SPARK_DRIVER_MEMORY \ - --executor-cores $SPARK_EXECUTOR_CORES \ - --executor-memory $SPARK_EXECUTOR_MEMORY \ - --conf "spark.serializer=org.apache.spark.serializer.KryoSerializer" \ - --conf "spark.default.parallelism=$SPARK_DEFAULT_PARALLELISM" \ - --conf "spark.sql.shuffle.partitions=$SPARK_DEFAULT_PARALLELISM" \ - --conf "spark.driver.extraClassPath=$SPARK_DRIVER_CLASSPATH" \ - --conf "spark.executor.extraClassPath=$SPARK_EXECUTOR_CLASSPATH" \ - --conf "spark.shuffle.reduceLocality.enabled=false" \ - --conf "spark.network.timeout=1200s" \ - --conf "spark.task.maxFailures=1" \ - --jars $OAP_MLLIB_JAR \ - $APP_PY $DATA_FILE diff --git a/dev/test-cluster/yarn-site.xml b/dev/test-cluster/yarn-site.xml deleted file mode 100644 index ff74d23a7..000000000 --- a/dev/test-cluster/yarn-site.xml +++ /dev/null @@ -1,67 +0,0 @@ - - - - - - yarn.nodemanager.aux-services - mapreduce_shuffle - - - yarn.nodemanager.aux-services.mapreduce.shuffle.class - org.apache.hadoop.mapred.ShuffleHandler - - - yarn.resourcemanager.hostname - localhost - - - yarn.resourcemanager.address - localhost:8032 - - - - yarn.nodemanager.resource.memory-mb - 7168 - - - yarn.nodemanager.resource.cpu-vcores - 2 - - - yarn.nodemanager.vmem-check-enabled - false - - - yarn.nodemanager.vmem-pmem-ratio - 2 - - - yarn.scheduler.minimum-allocation-mb - 1024 - - - yarn.scheduler.maximum-allocation-mb - 7168 - - - yarn.scheduler.minimum-allocation-vcores - 1 - - - yarn.scheduler.maximum-allocation-vcores - 2 - - - diff --git a/examples/als-hibench/build.sh b/examples/als-hibench/build.sh deleted file mode 100755 index 8cbc692be..000000000 --- a/examples/als-hibench/build.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/usr/bin/env bash - -mvn clean package \ No newline at end of file diff --git a/examples/als-hibench/pom.xml b/examples/als-hibench/pom.xml deleted file mode 100644 index 68e02c256..000000000 --- a/examples/als-hibench/pom.xml +++ /dev/null @@ -1,100 +0,0 @@ - - 4.0.0 - - com.intel.oap - oap-mllib-examples - 
0.9.0-with-spark-3.0.0 - jar - - ALSHiBenchExample - https://github.com/Intel-bigdata/OAP - - - UTF-8 - 2.12.10 - 2.12 - 3.0.0 - - - - - - org.scala-lang - scala-library - 2.12.10 - - - - com.github.scopt - scopt_2.12 - 3.7.0 - - - - - - - - - - - org.apache.spark - spark-sql_2.12 - ${spark.version} - provided - - - - org.apache.spark - spark-mllib_2.12 - ${spark.version} - provided - - - - - - - - org.scala-tools - maven-scala-plugin - 2.15.2 - - - - compile - testCompile - - - - - ${scala.version} - - -target:jvm-1.8 - - - - - maven-assembly-plugin - 3.0.0 - - false - - jar-with-dependencies - - - - - assembly - package - - single - - - - - - - - diff --git a/examples/als-hibench/run-hibench-oap-mllib.sh b/examples/als-hibench/run-hibench-oap-mllib.sh deleted file mode 100755 index 050b80558..000000000 --- a/examples/als-hibench/run-hibench-oap-mllib.sh +++ /dev/null @@ -1,73 +0,0 @@ -#!/usr/bin/env bash - -export HDFS_ROOT=hdfs://sr591:8020 -export OAP_MLLIB_ROOT=/home/xiaochang/Works/OAP-xwu99-als/oap-mllib - -SPARK_MASTER=yarn -SPARK_DRIVER_MEMORY=16G -SPARK_NUM_EXECUTORS=6 -SPARK_EXECUTOR_CORES=28 -SPARK_EXECUTOR_MEMORY_OVERHEAD=25G -SPARK_EXECUTOR_MEMORY=100G - -SPARK_DEFAULT_PARALLELISM=$(expr $SPARK_NUM_EXECUTORS '*' $SPARK_EXECUTOR_CORES '*' 2) -#SPARK_DEFAULT_PARALLELISM=$(expr $SPARK_NUM_EXECUTORS '*' $SPARK_EXECUTOR_CORES) - -# ======================================================= # - -# for log suffix -SUFFIX=$( basename -s .sh "${BASH_SOURCE[0]}" ) - -# Check envs -if [[ -z $SPARK_HOME ]]; then - echo SPARK_HOME not defined! - exit 1 -fi - -if [[ -z $HADOOP_HOME ]]; then - echo HADOOP_HOME not defined! - exit 1 -fi - -export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop - -# Target jar built -OAP_MLLIB_JAR_NAME=oap-mllib-0.9.0-with-spark-3.0.0.jar -OAP_MLLIB_JAR=$OAP_MLLIB_ROOT/mllib-dal/target/$OAP_MLLIB_JAR_NAME - -# Use absolute path -SPARK_DRIVER_CLASSPATH=$OAP_MLLIB_JAR -# Use relative path -SPARK_EXECUTOR_CLASSPATH=./$OAP_MLLIB_JAR_NAME - -APP_JAR=target/oap-mllib-examples-0.9.0-with-spark-3.0.0.jar -APP_CLASS=com.intel.hibench.sparkbench.ml.ALSExample - -HDFS_INPUT=hdfs://sr591:8020/HiBench/ALS/Input -RANK=10 -NUM_ITERATIONS=1 -LAMBDA=0.1 -IMPLICIT=true - -/usr/bin/time -p $SPARK_HOME/bin/spark-submit --master $SPARK_MASTER -v \ - --num-executors $SPARK_NUM_EXECUTORS \ - --driver-memory $SPARK_DRIVER_MEMORY \ - --executor-cores $SPARK_EXECUTOR_CORES \ - --executor-memory $SPARK_EXECUTOR_MEMORY \ - --conf "spark.serializer=org.apache.spark.serializer.KryoSerializer" \ - --conf "spark.default.parallelism=$SPARK_DEFAULT_PARALLELISM" \ - --conf "spark.sql.shuffle.partitions=$SPARK_DEFAULT_PARALLELISM" \ - --conf "spark.driver.extraClassPath=$SPARK_DRIVER_CLASSPATH" \ - --conf "spark.executor.extraClassPath=$SPARK_EXECUTOR_CLASSPATH" \ - --conf "spark.shuffle.reduceLocality.enabled=false" \ - --conf "spark.executor.memoryOverhead=$SPARK_EXECUTOR_MEMORY_OVERHEAD" \ - --conf "spark.network.timeout=1200s" \ - --conf "spark.task.maxFailures=1" \ - --jars $OAP_MLLIB_JAR \ - --class $APP_CLASS \ - $APP_JAR \ - --rank $RANK --numIterations $NUM_ITERATIONS --implicitPrefs $IMPLICIT --lambda $LAMBDA \ - --numProductBlocks $SPARK_DEFAULT_PARALLELISM --numUserBlocks $SPARK_DEFAULT_PARALLELISM \ - $HDFS_INPUT \ - 2>&1 | tee ALS-$SUFFIX-$(date +%m%d_%H_%M_%S).log - diff --git a/examples/als-hibench/run-hibench-vanilla.sh b/examples/als-hibench/run-hibench-vanilla.sh deleted file mode 100755 index 6cb6b3ae7..000000000 --- a/examples/als-hibench/run-hibench-vanilla.sh +++ /dev/null @@ -1,61 
+0,0 @@ -#!/usr/bin/env bash - -export HDFS_ROOT=hdfs://sr591:8020 - -SPARK_MASTER=yarn -SPARK_DRIVER_MEMORY=16G -SPARK_NUM_EXECUTORS=6 -SPARK_EXECUTOR_CORES=28 -SPARK_EXECUTOR_MEMORY_OVERHEAD=25G -SPARK_EXECUTOR_MEMORY=100G - -SPARK_DEFAULT_PARALLELISM=$(expr $SPARK_NUM_EXECUTORS '*' $SPARK_EXECUTOR_CORES '*' 2) - -# ======================================================= # - -# for log suffix -SUFFIX=$( basename -s .sh "${BASH_SOURCE[0]}" ) - -# Check envs -if [[ -z $SPARK_HOME ]]; then - echo SPARK_HOME not defined! - exit 1 -fi - -if [[ -z $HADOOP_HOME ]]; then - echo HADOOP_HOME not defined! - exit 1 -fi - -export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop - -APP_JAR=target/oap-mllib-examples-0.9.0-with-spark-3.0.0.jar -APP_CLASS=com.intel.hibench.sparkbench.ml.ALSExample - -HDFS_INPUT=hdfs://sr591:8020/HiBench/ALS/Input -RANK=10 -NUM_ITERATIONS=1 -LAMBDA=0.1 -IMPLICIT=true - -/usr/bin/time -p $SPARK_HOME/bin/spark-submit --master $SPARK_MASTER -v \ - --num-executors $SPARK_NUM_EXECUTORS \ - --driver-memory $SPARK_DRIVER_MEMORY \ - --executor-cores $SPARK_EXECUTOR_CORES \ - --executor-memory $SPARK_EXECUTOR_MEMORY \ - --conf "spark.serializer=org.apache.spark.serializer.KryoSerializer" \ - --conf "spark.default.parallelism=$SPARK_DEFAULT_PARALLELISM" \ - --conf "spark.sql.shuffle.partitions=$SPARK_DEFAULT_PARALLELISM" \ - --conf "spark.driver.extraClassPath=$SPARK_DRIVER_CLASSPATH" \ - --conf "spark.executor.extraClassPath=$SPARK_EXECUTOR_CLASSPATH" \ - --conf "spark.shuffle.reduceLocality.enabled=false" \ - --conf "spark.executor.memoryOverhead=$SPARK_EXECUTOR_MEMORY_OVERHEAD" \ - --conf "spark.network.timeout=1200s" \ - --conf "spark.task.maxFailures=1" \ - --class $APP_CLASS \ - $APP_JAR \ - --rank $RANK --numIterations $NUM_ITERATIONS --implicitPrefs $IMPLICIT --lambda $LAMBDA \ - --numProductBlocks $SPARK_DEFAULT_PARALLELISM --numUserBlocks $SPARK_DEFAULT_PARALLELISM \ - $HDFS_INPUT \ - 2>&1 | tee ALS-$SUFFIX-$(date +%m%d_%H_%M_%S).log - diff --git a/examples/als-hibench/src/main/scala/com/intel/hibench/sparkbench/ml/ALSExample.scala b/examples/als-hibench/src/main/scala/com/intel/hibench/sparkbench/ml/ALSExample.scala deleted file mode 100644 index 5a29bcc80..000000000 --- a/examples/als-hibench/src/main/scala/com/intel/hibench/sparkbench/ml/ALSExample.scala +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package com.intel.hibench.sparkbench.ml - -import org.apache.spark.ml.evaluation.RegressionEvaluator -import org.apache.spark.ml.recommendation.ALS -import org.apache.spark.ml.recommendation.ALS.Rating -import org.apache.spark.sql.SparkSession -import scopt.OptionParser - -object ALSExample { - - case class Params( - dataPath: String = null, - numIterations: Int = 10, - lambda: Double = 0.1, - rank: Int = 10, - numUserBlocks: Int = 10, - numItemBlocks: Int = 10, - implicitPrefs: Boolean = false) - - def main(args: Array[String]) { - val defaultParams = Params() - - val parser = new OptionParser[Params]("ALS") { - head("ALS: an example app for ALS on User-Item data.") - opt[Int]("rank") - .text(s"rank, default: ${defaultParams.rank}") - .action((x, c) => c.copy(rank = x)) - opt[Int]("numIterations") - .text(s"number of iterations, default: ${defaultParams.numIterations}") - .action((x, c) => c.copy(numIterations = x)) - opt[Double]("lambda") - .text(s"regularization parameter, default: ${defaultParams.lambda}") - .action((x, c) => c.copy(lambda = x)) - opt[Int]("numUserBlocks") - .text(s"number of user blocks, default: ${defaultParams.numUserBlocks}") - .action((x, c) => c.copy(numUserBlocks = x)) - opt[Int]("numProductBlocks") - .text(s"number of product blocks, default: ${defaultParams.numItemBlocks}") - .action((x, c) => c.copy(numItemBlocks = x)) - opt[Boolean]("implicitPrefs") - .text("implicit preference, default: ${defaultParams.implicitPrefs}") - .action((x, c) => c.copy(implicitPrefs = x)) - arg[String]("") - .required() - .text("Input paths to a User-Product dataset of ratings") - .action((x, c) => c.copy(dataPath = x)) - } - parser.parse(args, defaultParams) match { - case Some(params) => run(params) - case _ => sys.exit(1) - } - } - - def run(params: Params): Unit = { - val spark = SparkSession - .builder - .appName(s"ALS with $params") - .getOrCreate() - val sc = spark.sparkContext - - import spark.implicits._ - - val ratings = sc.objectFile[Rating[Int]](params.dataPath).toDF() - - val Array(training, test) = ratings.randomSplit(Array(0.8, 0.2), 1L) - - // Build the recommendation model using ALS on the training data - val als = new ALS() - .setRank(params.rank) - .setMaxIter(params.numIterations) - .setRegParam(params.lambda) - .setImplicitPrefs(params.implicitPrefs) - .setNumUserBlocks(params.numUserBlocks) - .setNumItemBlocks(params.numItemBlocks) - .setUserCol("user") - .setItemCol("item") - .setRatingCol("rating") - val model = als.fit(training) - - // Evaluate the model by computing the RMSE on the test data - // Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics - model.setColdStartStrategy("drop") - val predictions = model.transform(test) - - val evaluator = new RegressionEvaluator() - .setMetricName("rmse") - .setLabelCol("rating") - .setPredictionCol("prediction") - val rmse = evaluator.evaluate(predictions) - println(s"Root-mean-square error = $rmse") - - spark.stop() - } -} diff --git a/examples/als-pyspark/als-pyspark.py b/examples/als-pyspark/als-pyspark.py deleted file mode 100644 index 8847ca2b9..000000000 --- a/examples/als-pyspark/als-pyspark.py +++ /dev/null @@ -1,67 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. 
-# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from __future__ import print_function - -import sys -if sys.version >= '3': - long = int - -from pyspark.sql import SparkSession - -# $example on$ -from pyspark.ml.evaluation import RegressionEvaluator -from pyspark.ml.recommendation import ALS -from pyspark.sql import Row -# $example off$ - -if __name__ == "__main__": - spark = SparkSession\ - .builder\ - .appName("ALSExample")\ - .getOrCreate() - - if (len(sys.argv) != 2) : - print("Require data file path as input parameter") - sys.exit(1) - - # $example on$ - lines = spark.read.text(sys.argv[1]).rdd - parts = lines.map(lambda row: row.value.split("::")) - ratingsRDD = parts.map(lambda p: Row(userId=int(p[0]), movieId=int(p[1]), - rating=float(p[2]))) - ratings = spark.createDataFrame(ratingsRDD) - # (training, test) = ratings.randomSplit([0.8, 0.2]) - - # Build the recommendation model using ALS on the training data - # Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics - als = ALS(rank=10, maxIter=5, regParam=0.01, implicitPrefs=True, alpha=40.0, - userCol="userId", itemCol="movieId", ratingCol="rating", - coldStartStrategy="drop") - print("\nALS training with implicitPrefs={}, rank={}, maxIter={}, regParam={}, alpha={}, seed={}\n".format( - als.getImplicitPrefs(), als.getRank(), als.getMaxIter(), als.getRegParam(), als.getAlpha(), als.getSeed() - )) - model = als.fit(ratings) - - # Evaluate the model by computing the RMSE on the test data - # predictions = model.transform(test) - # evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", - # predictionCol="prediction") - # rmse = evaluator.evaluate(predictions) - # print("Root-mean-square error = " + str(rmse)) - - spark.stop() diff --git a/examples/als-pyspark/run.sh b/examples/als-pyspark/run.sh deleted file mode 100755 index b3ba1b6d2..000000000 --- a/examples/als-pyspark/run.sh +++ /dev/null @@ -1,75 +0,0 @@ -#!/usr/bin/env bash - -# == User to customize the following environments ======= # - -# Set user Spark and Hadoop home directory -#export SPARK_HOME=/path/to/your/spark/home -#export HADOOP_HOME=/path/to/your/hadoop/home -# Set user HDFS Root -export HDFS_ROOT=hdfs://sr549:8020 -# Set user Intel MLlib Root directory -export OAP_MLLIB_ROOT=/home/xiaochang/Works/OAP-xwu99-als/oap-mllib -# Set IP and Port for oneCCL KVS, you can select any one of the worker nodes and set CCL_KVS_IP_PORT to its IP and Port -# IP can be got with `hostname -I`, if multiple IPs are returned, the first IP should be used. Port can be any available port. -# For example, if one of the worker IP is 192.168.0.1 and an available port is 51234. 
-# CCL_KVS_IP_PORT can be set in the format of 192.168.0.1_51234 -# Incorrectly setting this value will result in hanging when oneCCL initialize -export CCL_KVS_IP_PORT=10.0.2.149_51234 - -# Data file is from Spark Examples (data/mllib/sample_kmeans_data.txt), the data file should be copied to HDFS -DATA_FILE=data/onedal_als_csr_ratings.txt - -# == User to customize Spark executor cores and memory == # - -# User should check the requested resources are acturally allocated by cluster manager or Intel MLlib will behave incorrectly -SPARK_MASTER=yarn -SPARK_DRIVER_MEMORY=1G -SPARK_NUM_EXECUTORS=2 -SPARK_EXECUTOR_CORES=1 -SPARK_EXECUTOR_MEMORY=1G - -SPARK_DEFAULT_PARALLELISM=$(expr $SPARK_NUM_EXECUTORS '*' $SPARK_EXECUTOR_CORES '*' 2) - -# ======================================================= # - -# Check env -if [[ -z $SPARK_HOME ]]; then - echo SPARK_HOME not defined! - exit 1 -fi - -if [[ -z $HADOOP_HOME ]]; then - echo HADOOP_HOME not defined! - exit 1 -fi - -export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop - -# Target jar built -OAP_MLLIB_JAR_NAME=oap-mllib-0.9.0-with-spark-3.0.0.jar -OAP_MLLIB_JAR=$OAP_MLLIB_ROOT/mllib-dal/target/$OAP_MLLIB_JAR_NAME - -# Use absolute path -SPARK_DRIVER_CLASSPATH=$OAP_MLLIB_JAR -# Use relative path -SPARK_EXECUTOR_CLASSPATH=./$OAP_MLLIB_JAR_NAME - -APP_PY=als-pyspark.py - -/usr/bin/time -p $SPARK_HOME/bin/spark-submit --master $SPARK_MASTER -v \ - --num-executors $SPARK_NUM_EXECUTORS \ - --driver-memory $SPARK_DRIVER_MEMORY \ - --executor-cores $SPARK_EXECUTOR_CORES \ - --executor-memory $SPARK_EXECUTOR_MEMORY \ - --conf "spark.serializer=org.apache.spark.serializer.KryoSerializer" \ - --conf "spark.default.parallelism=$SPARK_DEFAULT_PARALLELISM" \ - --conf "spark.sql.shuffle.partitions=$SPARK_DEFAULT_PARALLELISM" \ - --conf "spark.driver.extraClassPath=$SPARK_DRIVER_CLASSPATH" \ - --conf "spark.executor.extraClassPath=$SPARK_EXECUTOR_CLASSPATH" \ - --conf "spark.executorEnv.CCL_KVS_IP_PORT=$CCL_KVS_IP_PORT" \ - --conf "spark.shuffle.reduceLocality.enabled=false" \ - --conf "spark.network.timeout=1200s" \ - --conf "spark.task.maxFailures=1" \ - --jars $OAP_MLLIB_JAR \ - $APP_PY $DATA_FILE \ - 2>&1 | tee ALS-$(date +%m%d_%H_%M_%S).log diff --git a/examples/kmeans-hibench/build.sh b/examples/kmeans-hibench/build.sh deleted file mode 100755 index da373645b..000000000 --- a/examples/kmeans-hibench/build.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/usr/bin/env bash - -mvn clean package diff --git a/examples/kmeans-hibench/pom.xml b/examples/kmeans-hibench/pom.xml deleted file mode 100644 index 3f5b56e29..000000000 --- a/examples/kmeans-hibench/pom.xml +++ /dev/null @@ -1,99 +0,0 @@ - - 4.0.0 - - com.intel.oap - oap-mllib-examples - 1.1.0-with-spark-3.0.0 - jar - - KMeansHiBenchExample - https://github.com/oap-project/oap-mllib.git - - - UTF-8 - 2.12.10 - 2.12 - 3.0.0 - - - - - - org.scala-lang - scala-library - 2.12.10 - - - - com.github.scopt - scopt_2.12 - 3.7.0 - - - - org.apache.mahout - mahout-hdfs - 14.1 - - - - org.apache.spark - spark-sql_2.12 - ${spark.version} - provided - - - - org.apache.spark - spark-mllib_2.12 - ${spark.version} - provided - - - - - - - - org.scala-tools - maven-scala-plugin - 2.15.2 - - - - compile - testCompile - - - - - ${scala.version} - - -target:jvm-1.8 - - - - - maven-assembly-plugin - 3.0.0 - - false - - jar-with-dependencies - - - - - assembly - package - - single - - - - - - - - diff --git a/examples/kmeans-hibench/run-hibench-oap-mllib.sh b/examples/kmeans-hibench/run-hibench-oap-mllib.sh deleted file mode 100755 index 
caa42584f..000000000 --- a/examples/kmeans-hibench/run-hibench-oap-mllib.sh +++ /dev/null @@ -1,86 +0,0 @@ -#!/usr/bin/env bash - -# == User to customize the following environments ======= # - -# Set user Spark and Hadoop home directory -export SPARK_HOME=/path/to/your/spark/home -export HADOOP_HOME=/path/to/your/hadoop/home -# Set user HDFS Root -export HDFS_ROOT=hdfs://your_hostname:8020 -# Set user Intel MLlib Root directory -export OAP_MLLIB_ROOT=/path/to/your/OAP/oap-mllib -# Set IP and Port for oneCCL KVS, you can select any one of the worker nodes and set CCL_KVS_IP_PORT to its IP and Port -# IP can be got with `hostname -I`, if multiple IPs are returned, the first IP should be used. Port can be any available port. -# For example, if one of the worker IP is 192.168.0.1 and an available port is 51234. -# CCL_KVS_IP_PORT can be set in the format of 192.168.0.1_51234 -# Incorrectly setting this value will result in hanging when oneCCL initialize -export CCL_KVS_IP_PORT=192.168.0.1_51234 - -# == User to customize Spark executor cores and memory == # - -# User should check the requested resources are acturally allocated by cluster manager or Intel MLlib will behave incorrectly -SPARK_MASTER=yarn -SPARK_DRIVER_MEMORY=8G -SPARK_NUM_EXECUTORS=6 -SPARK_EXECUTOR_CORES=15 -SPARK_EXECUTOR_MEMORY_OVERHEAD=25G -SPARK_EXECUTOR_MEMORY=50G - -SPARK_DEFAULT_PARALLELISM=$(expr $SPARK_NUM_EXECUTORS '*' $SPARK_EXECUTOR_CORES '*' 2) - -# ======================================================= # - -# for log suffix -SUFFIX=$( basename -s .sh "${BASH_SOURCE[0]}" ) - -# Check envs -if [[ -z $SPARK_HOME ]]; then - echo SPARK_HOME not defined! - exit 1 -fi - -if [[ -z $HADOOP_HOME ]]; then - echo HADOOP_HOME not defined! - exit 1 -fi - -export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop - -# Target jar built -OAP_MLLIB_JAR_NAME=oap-mllib-0.9.0-with-spark-3.0.0.jar -OAP_MLLIB_JAR=$OAP_MLLIB_ROOT/mllib-dal/target/$OAP_MLLIB_JAR_NAME - -# Use absolute path -SPARK_DRIVER_CLASSPATH=$OAP_MLLIB_JAR -# Use relative path -SPARK_EXECUTOR_CLASSPATH=./$OAP_MLLIB_JAR_NAME - -APP_JAR=target/oap-mllib-examples-0.9.0-with-spark-3.0.0.jar -APP_CLASS=com.intel.hibench.sparkbench.ml.DenseKMeansDS - -K=200 -INIT_MODE=Random -MAX_ITERATION=20 -INPUT_HDFS=$HDFS_ROOT/HiBench/Kmeans/Input/samples - -/usr/bin/time -p $SPARK_HOME/bin/spark-submit --master $SPARK_MASTER -v \ - --num-executors $SPARK_NUM_EXECUTORS \ - --driver-memory $SPARK_DRIVER_MEMORY \ - --executor-cores $SPARK_EXECUTOR_CORES \ - --executor-memory $SPARK_EXECUTOR_MEMORY \ - --conf "spark.serializer=org.apache.spark.serializer.KryoSerializer" \ - --conf "spark.default.parallelism=$SPARK_DEFAULT_PARALLELISM" \ - --conf "spark.sql.shuffle.partitions=$SPARK_DEFAULT_PARALLELISM" \ - --conf "spark.driver.extraClassPath=$SPARK_DRIVER_CLASSPATH" \ - --conf "spark.executor.extraClassPath=$SPARK_EXECUTOR_CLASSPATH" \ - --conf "spark.executorEnv.CCL_KVS_IP_PORT=$CCL_KVS_IP_PORT" \ - --conf "spark.shuffle.reduceLocality.enabled=false" \ - --conf "spark.executor.memoryOverhead=$SPARK_EXECUTOR_MEMORY_OVERHEAD" \ - --conf "spark.memory.fraction=0.8" \ - --conf "spark.network.timeout=1200s" \ - --conf "spark.task.maxFailures=1" \ - --jars $OAP_MLLIB_JAR \ - --class $APP_CLASS \ - $APP_JAR \ - -k $K --initMode $INIT_MODE --numIterations $MAX_ITERATION $INPUT_HDFS \ - 2>&1 | tee KMeansHiBench-$SUFFIX-$(date +%m%d_%H_%M_%S).log diff --git a/examples/kmeans-hibench/run-hibench-vanilla.sh b/examples/kmeans-hibench/run-hibench-vanilla.sh deleted file mode 100755 index 475c25aff..000000000 
--- a/examples/kmeans-hibench/run-hibench-vanilla.sh +++ /dev/null @@ -1,58 +0,0 @@ -#!/usr/bin/env bash - -# == User to customize the following environments ======= # - -# Set user Spark and Hadoop home directory -export SPARK_HOME=/path/to/your/spark/home -export HADOOP_HOME=/path/to/your/hadoop/home -# Set user HDFS Root -export HDFS_ROOT=hdfs://your_hostname:8020 - -# == User to customize Spark executor cores and memory == # - -SPARK_MASTER=yarn -SPARK_DRIVER_MEMORY=8G -SPARK_NUM_EXECUTORS=6 -SPARK_EXECUTOR_CORES=15 -SPARK_EXECUTOR_MEMORY=75G - -SPARK_DEFAULT_PARALLELISM=$(expr $SPARK_NUM_EXECUTORS '*' $SPARK_EXECUTOR_CORES '*' 2) - -# ======================================================= # - -# for log suffix -SUFFIX=$( basename -s .sh "${BASH_SOURCE[0]}" ) - -# Check envs -if [[ -z $SPARK_HOME ]]; then - echo SPARK_HOME not defined! - exit 1 -fi - -if [[ -z $HADOOP_HOME ]]; then - echo HADOOP_HOME not defined! - exit 1 -fi - -export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop - -APP_JAR=target/oap-mllib-examples-0.9.0-with-spark-3.0.0.jar -APP_CLASS=com.intel.hibench.sparkbench.ml.DenseKMeansDS - -K=200 -INIT_MODE=Random -MAX_ITERATION=20 -INPUT_HDFS=$HDFS_ROOT/HiBench/Kmeans/Input/samples - -/usr/bin/time -p $SPARK_HOME/bin/spark-submit --master $SPARK_MASTER -v \ - --num-executors $SPARK_NUM_EXECUTORS \ - --driver-memory $SPARK_DRIVER_MEMORY \ - --executor-cores $SPARK_EXECUTOR_CORES \ - --executor-memory $SPARK_EXECUTOR_MEMORY \ - --conf "spark.serializer=org.apache.spark.serializer.KryoSerializer" \ - --conf "spark.default.parallelism=$SPARK_DEFAULT_PARALLELISM" \ - --conf "spark.sql.shuffle.partitions=$SPARK_DEFAULT_PARALLELISM" \ - --class $APP_CLASS \ - $APP_JAR \ - -k $K --initMode $INIT_MODE --numIterations $MAX_ITERATION $INPUT_HDFS \ - 2>&1 | tee KMeansHiBench-$SUFFIX-$(date +%m%d_%H_%M_%S).log diff --git a/examples/kmeans-hibench/src/main/scala/com/intel/hibench/sparkbench/ml/DenseKMeansDS.scala b/examples/kmeans-hibench/src/main/scala/com/intel/hibench/sparkbench/ml/DenseKMeansDS.scala deleted file mode 100644 index 3a949bb1c..000000000 --- a/examples/kmeans-hibench/src/main/scala/com/intel/hibench/sparkbench/ml/DenseKMeansDS.scala +++ /dev/null @@ -1,107 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package com.intel.hibench.sparkbench.ml - -import breeze.linalg.DenseVector -import org.apache.hadoop.io.LongWritable -import org.apache.mahout.math.VectorWritable -import org.apache.spark.ml.clustering.KMeans -import org.apache.spark.ml.evaluation.ClusteringEvaluator -import org.apache.spark.ml.linalg.{Vector, Vectors} -import org.apache.spark.{SparkConf, SparkContext} -import org.apache.spark.sql._ -import scopt.OptionParser -import org.apache.spark.sql.SparkSession - -object DenseKMeansDS { - - object InitializationMode extends Enumeration { - type InitializationMode = Value - val Random, Parallel = Value - } - - import com.intel.hibench.sparkbench.ml.DenseKMeansDS.InitializationMode._ - - case class Params(input: String = null, - k: Int = -1, - numIterations: Int = 10, - initializationMode: InitializationMode = Random) - - def main(args: Array[String]) { - val defaultParams = Params() - - val parser = new OptionParser[Params]("DenseKMeans") { - head("DenseKMeans: an example k-means app for dense data.") - opt[Int]('k', "k") - .required() - .text(s"number of clusters, required") - .action((x, c) => c.copy(k = x)) - opt[Int]("numIterations") - .text(s"number of iterations, default; ${defaultParams.numIterations}") - .action((x, c) => c.copy(numIterations = x)) - opt[String]("initMode") - .text(s"initialization mode (${InitializationMode.values.mkString(",")}), " + - s"default: ${defaultParams.initializationMode}") - .action((x, c) => c.copy(initializationMode = InitializationMode.withName(x))) - arg[String]("") - .text("input paths to examples") - .required() - .action((x, c) => c.copy(input = x)) - } - - parser.parse(args, defaultParams).map { params => - run(params) - }.getOrElse { - sys.exit(1) - } - } - - def run(params: Params) { - val spark = SparkSession - .builder - .appName(s"DenseKMeansDS with $params") - .getOrCreate() - import spark.implicits._ - - val sc = spark.sparkContext - - val data = sc.sequenceFile[LongWritable, VectorWritable](params.input) - - // Should use Tuple1 to warp around for calling toDF - val dataset = data.map { case (k, v) => - var vector: Array[Double] = new Array[Double](v.get().size) - for (i <- 0 until v.get().size) vector(i) = v.get().get(i) - Tuple1(Vectors.dense(vector)) - }.toDF("features") - - val initMode = params.initializationMode match { - case Random => "random" - case Parallel => "k-means||" - } - - val model = new KMeans() - .setInitMode(initMode) - .setK(params.k) - .setMaxIter(params.numIterations) - .setSeed(1L) - .fit(dataset) - - spark.stop() - } -} - diff --git a/mllib-dal/pom.xml b/mllib-dal/pom.xml index 4e51f9157..01e002830 100644 --- a/mllib-dal/pom.xml +++ b/mllib-dal/pom.xml @@ -218,12 +218,10 @@ ${env.CCL_ROOT}/lib - - - libmpi.so.12.0.0 + libpmi.so.1 + libresizable_pmi.so.1 libfabric.so.1 - libccl.so - + libccl_atl_ofi.so.1 @@ -273,13 +271,9 @@ ${project.build.testOutputDirectory}/lib/libtbbmalloc.so.2 - ${project.build.testOutputDirectory}/lib/libmpi.so.12.0.0 - ${project.build.testOutputDirectory}/lib/libmpi.so.12 + ${project.build.testOutputDirectory}/lib/libccl_atl_ofi.so.1 + ${project.build.testOutputDirectory}/lib/libccl_atl_ofi.so - - - - diff --git a/mllib-dal/src/assembly/assembly.xml b/mllib-dal/src/assembly/assembly.xml index 498b90e02..137f19b81 100644 --- a/mllib-dal/src/assembly/assembly.xml +++ b/mllib-dal/src/assembly/assembly.xml @@ -58,21 +58,26 @@ - ${env.CCL_ROOT}/lib/libfabric.so.1 + ${env.CCL_ROOT}/lib/libpmi.so.1 lib - ${env.CCL_ROOT}/lib/libmpi.so.12.0.0 + 
${env.CCL_ROOT}/lib/libresizable_pmi.so.1 lib - libmpi.so.12 - ${env.CCL_ROOT}/lib/libccl.so + ${env.CCL_ROOT}/lib//libfabric.so.1 lib ${env.CCL_ROOT}/lib/prov/libsockets-fi.so lib + + + ${env.CCL_ROOT}/lib/libccl_atl_ofi.so.1 + lib + libccl_atl_ofi.so + \ No newline at end of file diff --git a/mllib-dal/src/main/java/org/apache/spark/ml/recommendation/ALSPartitionInfo.java b/mllib-dal/src/main/java/org/apache/spark/ml/recommendation/ALSPartitionInfo.java deleted file mode 100644 index 5f2561772..000000000 --- a/mllib-dal/src/main/java/org/apache/spark/ml/recommendation/ALSPartitionInfo.java +++ /dev/null @@ -1,6 +0,0 @@ -package org.apache.spark.ml.recommendation; - -public class ALSPartitionInfo { - public int ratingsNum; - public int csrRowNum; -} diff --git a/mllib-dal/src/main/java/org/apache/spark/ml/recommendation/ALSResult.java b/mllib-dal/src/main/java/org/apache/spark/ml/recommendation/ALSResult.java deleted file mode 100644 index 67173a0c5..000000000 --- a/mllib-dal/src/main/java/org/apache/spark/ml/recommendation/ALSResult.java +++ /dev/null @@ -1,9 +0,0 @@ -package org.apache.spark.ml.recommendation; - -public class ALSResult { - public long rankId = -1; - public long cUsersFactorsNumTab; - public long cItemsFactorsNumTab; - public long cUserOffset; - public long cItemOffset; -} diff --git a/mllib-dal/src/main/java/org/apache/spark/ml/util/LibLoader.java b/mllib-dal/src/main/java/org/apache/spark/ml/util/LibLoader.java index d8ea09a23..5b51451ae 100644 --- a/mllib-dal/src/main/java/org/apache/spark/ml/util/LibLoader.java +++ b/mllib-dal/src/main/java/org/apache/spark/ml/util/LibLoader.java @@ -21,8 +21,7 @@ import java.io.*; import java.util.UUID; import java.util.logging.Level; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; +import java.util.logging.Logger; import com.intel.daal.utils.LibUtils; @@ -31,7 +30,8 @@ public final class LibLoader { // Make sure loading libraries from different temp directory for each process private final static String subDir = "MLlibDAL_" + UUID.randomUUID(); - private static final Logger log = LoggerFactory.getLogger("LibLoader"); + private static final Logger logger = Logger.getLogger(LibLoader.class.getName()); + private static final Level logLevel = Level.INFO; /** * Get temp dir for exacting lib files @@ -54,23 +54,24 @@ public static synchronized void loadLibraries() throws IOException { /** * Load oneCCL libs in dependency order */ - private static synchronized void loadLibCCL() throws IOException { + public static synchronized void loadLibCCL() throws IOException { + loadFromJar(subDir, "libpmi.so.1"); + loadFromJar(subDir, "libresizable_pmi.so.1"); loadFromJar(subDir, "libfabric.so.1"); - loadFromJar(subDir, "libmpi.so.12"); - loadFromJar(subDir, "libccl.so"); loadFromJar(subDir, "libsockets-fi.so"); + loadFromJar(subDir, "libccl_atl_ofi.so"); } /** * Load MLlibDAL lib, it depends TBB libs that are loaded by oneDAL, * so this function should be called after oneDAL loadLibrary */ - private static synchronized void loadLibMLlibDAL() throws IOException { + public static synchronized void loadLibMLlibDAL() throws IOException { // oneDAL Java API doesn't load correct libtbb version for oneAPI Beta 10 // Rename in pom.xml and assembly.xml to workaround. 
// See https://github.com/oneapi-src/oneDAL/issues/1254 --> LibUtils.loadLibrary(); - + loadFromJar(subDir, "libMLlibDAL.so"); } @@ -81,12 +82,12 @@ private static synchronized void loadLibMLlibDAL() throws IOException { * @param name library name */ private static void loadFromJar(String path, String name) throws IOException { - log.debug("Loading " + name + " ..."); + logger.log(logLevel, "Loading " + name + " ..."); File fileOut = createTempFile(path, name); // File exists already if (fileOut == null) { - log.debug("DONE: Loading library as resource."); + logger.log(logLevel, "DONE: Loading library as resource."); return; } @@ -96,7 +97,7 @@ private static void loadFromJar(String path, String name) throws IOException { } try (OutputStream streamOut = new FileOutputStream(fileOut)) { - log.debug("Writing resource to temp file."); + logger.log(logLevel, "Writing resource to temp file."); byte[] buffer = new byte[32768]; while (true) { @@ -115,7 +116,7 @@ private static void loadFromJar(String path, String name) throws IOException { } System.load(fileOut.toString()); - log.debug("DONE: Loading library as resource."); + logger.log(logLevel, "DONE: Loading library as resource."); } /** diff --git a/mllib-dal/src/main/native/ALSDALImpl.cpp b/mllib-dal/src/main/native/ALSDALImpl.cpp deleted file mode 100644 index 53212dc1d..000000000 --- a/mllib-dal/src/main/native/ALSDALImpl.cpp +++ /dev/null @@ -1,668 +0,0 @@ -#include -#include -#include -#include - -#include "OneCCL.h" -#include "ALSShuffle.h" -#include "org_apache_spark_ml_recommendation_ALSDALImpl.h" -#include "service.h" - -using namespace std; -using namespace daal; -using namespace daal::algorithms; -using namespace daal::algorithms::implicit_als; - -const int ccl_root = 0; - -typedef float algorithmFPType; /* Algorithm floating-point type */ - -NumericTablePtr userOffset; -NumericTablePtr itemOffset; - -// KeyValueDataCollectionPtr userOffsetsOnMaster; -// KeyValueDataCollectionPtr itemOffsetsOnMaster; - -CSRNumericTablePtr dataTable; -CSRNumericTablePtr transposedDataTable; - -KeyValueDataCollectionPtr userStep3LocalInput; -KeyValueDataCollectionPtr itemStep3LocalInput; - -training::DistributedPartialResultStep4Ptr itemsPartialResultLocal; -training::DistributedPartialResultStep4Ptr usersPartialResultLocal; -std::vector itemsPartialResultsMaster; -std::vector usersPartialResultsMaster; - -template -void gather(size_t rankId, ccl::communicator &comm, size_t nBlocks, const ByteBuffer& nodeResults, T* result) { - vector perNodeArchLengthMaster(nBlocks); - size_t perNodeArchLength = nodeResults.size(); - ByteBuffer serializedData; - - vector recv_counts(nBlocks); - for (size_t i = 0; i < nBlocks; i++) recv_counts[i] = sizeof(size_t); - - // MPI_Gather(&perNodeArchLength, sizeof(int), MPI_CHAR, perNodeArchLengthMaster, - // sizeof(int), MPI_CHAR, ccl_root, MPI_COMM_WORLD); - ccl::allgatherv(&perNodeArchLength, sizeof(size_t), perNodeArchLengthMaster.data(), recv_counts, - ccl::datatype::uint8, comm).wait(); - - // should resize for all ranks for ccl_allgatherv - size_t memoryBuf = 0; - for (size_t i = 0; i < nBlocks; i++) { - memoryBuf += perNodeArchLengthMaster[i]; - } - serializedData.resize(memoryBuf); - - std::vector displs(nBlocks); - if (rankId == ccl_root) { - size_t shift = 0; - for (size_t i = 0; i < nBlocks; i++) { - displs[i] = shift; - shift += perNodeArchLengthMaster[i]; - } - } - - /* Transfer partial results to step 2 on the root node */ - // MPI_Gatherv(&nodeResults[0], perNodeArchLength, MPI_CHAR, &serializedData[0], - 
// perNodeArchLengthMaster, displs, MPI_CHAR, ccl_root, - // MPI_COMM_WORLD); - ccl::allgatherv(&nodeResults[0], perNodeArchLength, &serializedData[0], - perNodeArchLengthMaster, ccl::datatype::uint8, comm).wait(); - - if (rankId == ccl_root) { - for (size_t i = 0; i < nBlocks; i++) { - /* Deserialize partial results from step 1 */ - result[i] = result[i]->cast(deserializeDAALObject(&serializedData[0] + displs[i], - perNodeArchLengthMaster[i])); - } - } -} - -// void gatherUsers(const ByteBuffer & nodeResults, int nBlocks) -// { -// size_t perNodeArchLengthMaster[nBlocks]; -// size_t perNodeArchLength = nodeResults.size(); -// ByteBuffer serializedData; -// size_t recv_counts[nBlocks]; -// for (int i = 0; i < nBlocks; i++) { -// recv_counts[i] = sizeof(size_t); -// } - -// ccl_request_t request; -// // MPI_Allgather(&perNodeArchLength, sizeof(int), MPI_CHAR, -// perNodeArchLengthMaster, sizeof(int), MPI_CHAR, MPI_COMM_WORLD); -// ccl_allgatherv(&perNodeArchLength, sizeof(size_t), perNodeArchLengthMaster, -// recv_counts, ccl_dtype_char, NULL, NULL, NULL, &request); ccl_wait(request); - -// size_t memoryBuf = 0; -// for (int i = 0; i < nBlocks; i++) -// { -// memoryBuf += perNodeArchLengthMaster[i]; -// } -// serializedData.resize(memoryBuf); - -// size_t shift = 0; -// std::vector displs(nBlocks); -// for (int i = 0; i < nBlocks; i++) -// { -// displs[i] = shift; -// shift += perNodeArchLengthMaster[i]; -// } - -// /* Transfer partial results to step 2 on the root node */ -// // MPI_Allgatherv(&nodeResults[0], perNodeArchLength, MPI_CHAR, &serializedData[0], -// perNodeArchLengthMaster, displs, MPI_CHAR, MPI_COMM_WORLD); -// ccl_allgatherv(&nodeResults[0], perNodeArchLength, &serializedData[0], -// perNodeArchLengthMaster, ccl_dtype_char, NULL, NULL, NULL, &request); -// ccl_wait(request); - -// usersPartialResultsMaster.resize(nBlocks); -// for (int i = 0; i < nBlocks; i++) -// { -// /* Deserialize partial results from step 4 */ -// usersPartialResultsMaster[i] = -// training::DistributedPartialResultStep4::cast(deserializeDAALObject(&serializedData[0] -// + displs[i], perNodeArchLengthMaster[i])); -// } -// } - -// void gatherItems(const ByteBuffer & nodeResults, size_t nBlocks) -// { -// size_t perNodeArchLengthMaster[nBlocks]; -// size_t perNodeArchLength = nodeResults.size(); -// ByteBuffer serializedData; -// size_t recv_counts[nBlocks]; -// for (size_t i = 0; i < nBlocks; i++) { -// recv_counts[i] = sizeof(size_t); -// } - -// ccl_request_t request; -// // MPI_Allgather(&perNodeArchLength, sizeof(int), MPI_CHAR, -// perNodeArchLengthMaster, sizeof(int), MPI_CHAR, MPI_COMM_WORLD); -// ccl_allgatherv(&perNodeArchLength, sizeof(size_t), perNodeArchLengthMaster, -// recv_counts, ccl_dtype_char, NULL, NULL, NULL, &request); ccl_wait(request); - -// size_t memoryBuf = 0; -// for (size_t i = 0; i < nBlocks; i++) -// { -// memoryBuf += perNodeArchLengthMaster[i]; -// } -// serializedData.resize(memoryBuf); - -// size_t shift = 0; -// std::vector displs(nBlocks); -// for (size_t i = 0; i < nBlocks; i++) -// { -// displs[i] = shift; -// shift += perNodeArchLengthMaster[i]; -// } - -// /* Transfer partial results to step 2 on the root node */ -// // MPI_Allgatherv(&nodeResults[0], perNodeArchLength, MPI_CHAR, &serializedData[0], -// perNodeArchLengthMaster, displs, MPI_CHAR, MPI_COMM_WORLD); -// ccl_allgatherv(&nodeResults[0], perNodeArchLength, &serializedData[0], -// perNodeArchLengthMaster, ccl_dtype_char, NULL, NULL, NULL, &request); -// ccl_wait(request); - -// 
itemsPartialResultsMaster.resize(nBlocks); -// for (size_t i = 0; i < nBlocks; i++) -// { -// /* Deserialize partial results from step 4 */ -// itemsPartialResultsMaster[i] = -// training::DistributedPartialResultStep4::cast(deserializeDAALObject(&serializedData[0] -// + displs[i], perNodeArchLengthMaster[i])); -// } -// } - -template -void all2all(ccl::communicator &comm, ByteBuffer* nodeResults, size_t nBlocks, KeyValueDataCollectionPtr result) { - size_t memoryBuf = 0; - size_t shift = 0; - vector perNodeArchLengths(nBlocks); - vector perNodeArchLengthsRecv(nBlocks); - std::vector sdispls(nBlocks); - ByteBuffer serializedSendData; - ByteBuffer serializedRecvData; - - for (size_t i = 0; i < nBlocks; i++) { - perNodeArchLengths[i] = nodeResults[i].size(); - memoryBuf += perNodeArchLengths[i]; - sdispls[i] = shift; - shift += perNodeArchLengths[i]; - } - serializedSendData.resize(memoryBuf); - - /* memcpy to avoid double compute */ - memoryBuf = 0; - for (size_t i = 0; i < nBlocks; i++) { - for (size_t j = 0; j < perNodeArchLengths[i]; j++) - serializedSendData[memoryBuf + j] = nodeResults[i][j]; - memoryBuf += perNodeArchLengths[i]; - } - - // MPI_Alltoall(perNodeArchLengths, sizeof(int), MPI_CHAR, perNodeArchLengthsRecv, - // sizeof(int), MPI_CHAR, MPI_COMM_WORLD); - ccl::alltoall(perNodeArchLengths.data(), perNodeArchLengthsRecv.data(), sizeof(size_t), ccl::datatype::uint8, comm).wait(); - - memoryBuf = 0; - shift = 0; - std::vector rdispls(nBlocks); - for (size_t i = 0; i < nBlocks; i++) { - memoryBuf += perNodeArchLengthsRecv[i]; - rdispls[i] = shift; - shift += perNodeArchLengthsRecv[i]; - } - - serializedRecvData.resize(memoryBuf); - - /* Transfer partial results to step 2 on the root node */ - // MPI_Alltoallv(&serializedSendData[0], perNodeArchLengths, sdispls, MPI_CHAR, - // &serializedRecvData[0], perNodeArchLengthsRecv, rdispls, MPI_CHAR, - // MPI_COMM_WORLD); - ccl::alltoallv(&serializedSendData[0], perNodeArchLengths, &serializedRecvData[0], - perNodeArchLengthsRecv, ccl::datatype::uint8, comm).wait(); - - for (size_t i = 0; i < nBlocks; i++) { - (*result)[i] = T::cast(deserializeDAALObject(&serializedRecvData[rdispls[i]], - perNodeArchLengthsRecv[i])); - } -} - -KeyValueDataCollectionPtr initializeStep1Local(size_t rankId, size_t partitionId, - size_t nBlocks, size_t nUsers, - size_t nFactors) { - int usersPartition[1] = {(int)nBlocks}; - - /* Create an algorithm object to initialize the implicit ALS model with the default - * method */ - training::init::Distributed - initAlgorithm; - initAlgorithm.parameter.fullNUsers = nUsers; - initAlgorithm.parameter.nFactors = nFactors; - initAlgorithm.parameter.seed += rankId; - initAlgorithm.parameter.partition.reset( - new HomogenNumericTable((int*)usersPartition, 1, 1)); - /* Pass a training data set and dependent values to the algorithm */ - initAlgorithm.input.set(training::init::data, dataTable); - - /* Initialize the implicit ALS model */ - initAlgorithm.compute(); - - training::init::PartialResultPtr partialResult = initAlgorithm.getPartialResult(); - itemStep3LocalInput = partialResult->get(training::init::outputOfInitForComputeStep3); - userOffset = partialResult->get(training::init::offsets, (size_t)rankId); - // if (rankId == ccl_root) - // { - // userOffsetsOnMaster = partialResult->get(training::init::offsets); - // } - PartialModelPtr partialModelLocal = partialResult->get(training::init::partialModel); - - itemsPartialResultLocal.reset(new training::DistributedPartialResultStep4()); - 
itemsPartialResultLocal->set(training::outputOfStep4ForStep1, partialModelLocal); - - return partialResult->get(training::init::outputOfStep1ForStep2); -} - -void initializeStep2Local(size_t rankId, size_t partitionId, - const KeyValueDataCollectionPtr& initStep2LocalInput) { - /* Create an algorithm object to perform the second step of the implicit ALS - * initialization algorithm */ - training::init::Distributed - initAlgorithm; - - initAlgorithm.input.set(training::init::inputOfStep2FromStep1, initStep2LocalInput); - - /* Compute partial results of the second step on local nodes */ - initAlgorithm.compute(); - - training::init::DistributedPartialResultStep2Ptr partialResult = - initAlgorithm.getPartialResult(); - transposedDataTable = - CSRNumericTable::cast(partialResult->get(training::init::transposedData)); - userStep3LocalInput = partialResult->get(training::init::outputOfInitForComputeStep3); - itemOffset = partialResult->get(training::init::offsets, (size_t)rankId); - // if (rankId == ccl_root) - // { - // itemOffsetsOnMaster = partialResult->get(training::init::offsets); - // } -} - -void initializeModel(size_t rankId, ccl::communicator &comm, size_t partitionId, size_t nBlocks, size_t nUsers, - size_t nFactors) { - std::cout << "ALS (native): initializeModel " << std::endl; - - auto t1 = std::chrono::high_resolution_clock::now(); - - KeyValueDataCollectionPtr initStep1LocalResult = - initializeStep1Local(rankId, partitionId, nBlocks, nUsers, nFactors); - - /* MPI_Alltoallv to populate initStep2LocalInput */ - ByteBuffer nodeCPs[nBlocks]; - for (size_t i = 0; i < nBlocks; i++) { - serializeDAALObject((*initStep1LocalResult)[i].get(), nodeCPs[i]); - } - KeyValueDataCollectionPtr initStep2LocalInput(new KeyValueDataCollection()); - all2all(comm, nodeCPs, nBlocks, initStep2LocalInput); - - initializeStep2Local(rankId, partitionId, initStep2LocalInput); - - auto t2 = std::chrono::high_resolution_clock::now(); - auto duration = std::chrono::duration_cast(t2 - t1).count(); - std::cout << "ALS (native): initializeModel took " << duration << " secs" << std::endl; -} - -training::DistributedPartialResultStep1Ptr computeStep1Local( - const training::DistributedPartialResultStep4Ptr& partialResultLocal, - size_t nFactors) { - /* Create algorithm objects to compute implicit ALS algorithm in the distributed - * processing mode on the local node using the default method */ - training::Distributed algorithm; - algorithm.parameter.nFactors = nFactors; - - /* Set input objects for the algorithm */ - algorithm.input.set(training::partialModel, - partialResultLocal->get(training::outputOfStep4ForStep1)); - - /* Compute partial estimates on local nodes */ - algorithm.compute(); - - /* Get the computed partial estimates */ - return algorithm.getPartialResult(); -} - -NumericTablePtr computeStep2Master( - const training::DistributedPartialResultStep1Ptr* step1LocalResultsOnMaster, - size_t nFactors, size_t nBlocks) { - /* Create algorithm objects to compute implicit ALS algorithm in the distributed - * processing mode on the master node using the default method */ - training::Distributed algorithm; - algorithm.parameter.nFactors = nFactors; - - /* Set input objects for the algorithm */ - for (size_t i = 0; i < nBlocks; i++) { - algorithm.input.add(training::inputOfStep2FromStep1, step1LocalResultsOnMaster[i]); - } - - /* Compute a partial estimate on the master node from the partial estimates on local - * nodes */ - algorithm.compute(); - - return 
algorithm.getPartialResult()->get(training::outputOfStep2ForStep4); -} - -KeyValueDataCollectionPtr computeStep3Local( - const NumericTablePtr& offset, - const training::DistributedPartialResultStep4Ptr& partialResultLocal, - const KeyValueDataCollectionPtr& step3LocalInput, size_t nFactors) { - training::Distributed algorithm; - algorithm.parameter.nFactors = nFactors; - - algorithm.input.set(training::partialModel, - partialResultLocal->get(training::outputOfStep4ForStep3)); - algorithm.input.set(training::inputOfStep3FromInit, step3LocalInput); - algorithm.input.set(training::offset, offset); - - algorithm.compute(); - - return algorithm.getPartialResult()->get(training::outputOfStep3ForStep4); -} - -training::DistributedPartialResultStep4Ptr computeStep4Local( - const CSRNumericTablePtr& dataTable, const NumericTablePtr& step2MasterResult, - const KeyValueDataCollectionPtr& step4LocalInput, size_t nFactors) { - training::Distributed algorithm; - algorithm.parameter.nFactors = nFactors; - - algorithm.input.set(training::partialModels, step4LocalInput); - algorithm.input.set(training::partialData, dataTable); - algorithm.input.set(training::inputOfStep4FromStep2, step2MasterResult); - - algorithm.compute(); - - return algorithm.getPartialResult(); -} - -void trainModel(size_t rankId, ccl::communicator &comm, size_t partitionId, size_t nBlocks, size_t nFactors, - size_t maxIterations) { - std::cout << "ALS (native): trainModel" << std::endl; - - auto tStart = std::chrono::high_resolution_clock::now(); - - training::DistributedPartialResultStep1Ptr step1LocalResultsOnMaster[nBlocks]; - training::DistributedPartialResultStep1Ptr step1LocalResult; - NumericTablePtr step2MasterResult; - KeyValueDataCollectionPtr step3LocalResult; - KeyValueDataCollectionPtr step4LocalInput(new KeyValueDataCollection()); - - ByteBuffer nodeCPs[nBlocks]; - ByteBuffer nodeResults; - ByteBuffer crossProductBuf; - int crossProductLen; - - for (size_t iteration = 0; iteration < maxIterations; iteration++) { - auto t1 = std::chrono::high_resolution_clock::now(); - - // - // Update partial users factors - // - step1LocalResult = computeStep1Local(itemsPartialResultLocal, nFactors); - - serializeDAALObject(step1LocalResult.get(), nodeResults); - - /* Gathering step1LocalResult on the master */ - gather(rankId, comm, nBlocks, nodeResults, step1LocalResultsOnMaster); - - if (rankId == ccl_root) { - step2MasterResult = - computeStep2Master(step1LocalResultsOnMaster, nFactors, nBlocks); - serializeDAALObject(step2MasterResult.get(), crossProductBuf); - crossProductLen = crossProductBuf.size(); - } - - // MPI_Bcast(&crossProductLen, sizeof(int), MPI_CHAR, ccl_root, MPI_COMM_WORLD); - ccl::broadcast(&crossProductLen, sizeof(int), ccl::datatype::uint8, ccl_root, comm).wait(); - - if (rankId != ccl_root) { - crossProductBuf.resize(crossProductLen); - } - // MPI_Bcast(&crossProductBuf[0], crossProductLen, MPI_CHAR, ccl_root, - // MPI_COMM_WORLD); - ccl::broadcast(&crossProductBuf[0], crossProductLen, ccl::datatype::uint8, ccl_root, comm).wait(); - - step2MasterResult = - NumericTable::cast(deserializeDAALObject(&crossProductBuf[0], crossProductLen)); - - step3LocalResult = computeStep3Local(itemOffset, itemsPartialResultLocal, - itemStep3LocalInput, nFactors); - - /* MPI_Alltoallv to populate step4LocalInput */ - for (size_t i = 0; i < nBlocks; i++) { - serializeDAALObject((*step3LocalResult)[i].get(), nodeCPs[i]); - } - all2all(comm, nodeCPs, nBlocks, step4LocalInput); - - usersPartialResultLocal = 
computeStep4Local(transposedDataTable, step2MasterResult, - step4LocalInput, nFactors); - - // - // Update partial items factors - // - step1LocalResult = computeStep1Local(usersPartialResultLocal, nFactors); - - serializeDAALObject(step1LocalResult.get(), nodeResults); - - /* Gathering step1LocalResult on the master */ - gather(rankId, comm, nBlocks, nodeResults, step1LocalResultsOnMaster); - - if (rankId == ccl_root) { - step2MasterResult = - computeStep2Master(step1LocalResultsOnMaster, nFactors, nBlocks); - serializeDAALObject(step2MasterResult.get(), crossProductBuf); - crossProductLen = crossProductBuf.size(); - } - - // MPI_Bcast(&crossProductLen, sizeof(int), MPI_CHAR, ccl_root, MPI_COMM_WORLD); - ccl::broadcast(&crossProductLen, sizeof(int), ccl::datatype::uint8, ccl_root, comm).wait(); - - if (rankId != ccl_root) { - crossProductBuf.resize(crossProductLen); - } - - // MPI_Bcast(&crossProductBuf[0], crossProductLen, MPI_CHAR, ccl_root, - // MPI_COMM_WORLD); - ccl::broadcast(&crossProductBuf[0], crossProductLen, ccl::datatype::uint8, ccl_root, comm).wait(); - - step2MasterResult = - NumericTable::cast(deserializeDAALObject(&crossProductBuf[0], crossProductLen)); - - step3LocalResult = computeStep3Local(userOffset, usersPartialResultLocal, - userStep3LocalInput, nFactors); - - /* MPI_Alltoallv to populate step4LocalInput */ - for (size_t i = 0; i < nBlocks; i++) { - serializeDAALObject((*step3LocalResult)[i].get(), nodeCPs[i]); - } - all2all(comm, nodeCPs, nBlocks, step4LocalInput); - - itemsPartialResultLocal = - computeStep4Local(dataTable, step2MasterResult, step4LocalInput, nFactors); - - auto t2 = std::chrono::high_resolution_clock::now(); - auto duration = std::chrono::duration_cast(t2 - t1).count(); - std::cout << "ALS (native): iteration " << iteration << " took " << duration - << " secs" << std::endl; - } - - auto tEnd = std::chrono::high_resolution_clock::now(); - auto durationTotal = - std::chrono::duration_cast(tEnd - tStart).count(); - std::cout << "ALS (native): trainModel took " << durationTotal << " secs" << std::endl; - - /*Gather all itemsPartialResultLocal to itemsPartialResultsMaster on the master and - * distributing the result over other ranks*/ - // serializeDAALObject(itemsPartialResultLocal.get(), nodeResults); - // gatherItems(nodeResults, nBlocks); - - // serializeDAALObject(usersPartialResultLocal.get(), nodeResults); - // gatherUsers(nodeResults, nBlocks); -} - -static size_t getOffsetFromOffsetTable(NumericTablePtr offsetTable) { - size_t ret; - BlockDescriptor block; - offsetTable->getBlockOfRows(0, 1, readOnly, block); - ret = (size_t)((block.getBlockPtr())[0]); - offsetTable->releaseBlockOfRows(block); - - return ret; -} - -/* - * Class: org_apache_spark_ml_recommendation_ALSDALImpl - * Method: cShuffleData - * Signature: - * (Ljava/nio/ByteBuffer;IILorg/apache/spark/ml/recommendation/ALSPartitionInfo;)Ljava/nio/ByteBuffer; - */ -JNIEXPORT jobject JNICALL Java_org_apache_spark_ml_recommendation_ALSDALImpl_cShuffleData( - JNIEnv* env, jobject obj, jobject dataBuffer, jint nTotalKeys, jint nBlocks, - jobject infoObj) { - // cout << "cShuffleData: rank " << rankId << endl; - cout << "RATING_SIZE: " << RATING_SIZE << endl; - - ccl::communicator &comm = getComm(); - - jbyte* ratingsBuf = (jbyte*)env->GetDirectBufferAddress(dataBuffer); - - jlong ratingsNum = env->GetDirectBufferCapacity(dataBuffer) / RATING_SIZE; - - std::vector ratingPartitions(nBlocks); - - for (int i = 0; i < ratingsNum; i++) { - Rating* rating = (Rating*)(ratingsBuf + RATING_SIZE * 
i); - int partition = getPartiton(rating->user, nTotalKeys, nBlocks); - ratingPartitions[partition].push_back(*rating); - } - - // for (int i = 0; i < nBlocks; i++) { - // cout << "Partition " << i << endl; - // for (auto r : ratingPartitions[i]) { - // cout << r.user << " " << r.item << " " << r.rating << endl; - // } - // } - - size_t newRatingsNum = 0; - size_t newCsrRowNum = 0; - Rating* ratings = shuffle_all2all(comm, ratingPartitions, nBlocks, newRatingsNum, newCsrRowNum); - - // Get the class of the input object - jclass clazz = env->GetObjectClass(infoObj); - // Get Field references - jfieldID ratingsNumField = env->GetFieldID(clazz, "ratingsNum", "I"); - jfieldID csrRowNumField = env->GetFieldID(clazz, "csrRowNum", "I"); - - env->SetIntField(infoObj, ratingsNumField, newRatingsNum); - env->SetIntField(infoObj, csrRowNumField, newCsrRowNum); - - return env->NewDirectByteBuffer(ratings, newRatingsNum*RATING_SIZE); -} - -/* - * Class: org_apache_spark_ml_recommendation_ALSDALImpl - * Method: cDALImplictALS - * Signature: (JJIIDDIIILorg/apache/spark/ml/recommendation/ALSResult;)J - */ - -JNIEXPORT jlong JNICALL Java_org_apache_spark_ml_recommendation_ALSDALImpl_cDALImplictALS( - JNIEnv* env, jobject obj, jlong numTableAddr, jlong nUsers, jint nFactors, - jint maxIter, jdouble regParam, jdouble alpha, jint executor_num, jint executor_cores, - jint partitionId, jobject resultObj) { - - ccl::communicator &comm = getComm(); - size_t rankId = comm.rank(); - - dataTable = *((CSRNumericTablePtr*)numTableAddr); - // dataTable.reset(createFloatSparseTable("/home/xiaochang/github/oneDAL-upstream/samples/daal/cpp/mpi/data/distributed/implicit_als_csr_1.csv")); - - // printNumericTable(dataTable, "cDALImplictALS", 10); - cout << "ALS (native): Input info: " << endl; - cout << "- NumberOfRows: " << dataTable->getNumberOfRows() << endl; - cout << "- NumberOfColumns: " << dataTable->getNumberOfColumns() << endl; - cout << "- NumberOfRatings: " << dataTable->getDataSize() << endl; - cout << "- fullNUsers: " << nUsers << endl; - cout << "- nFactors: " << nFactors << endl; - - // Set number of threads for oneDAL to use for each rank - services::Environment::getInstance()->setNumberOfThreads(executor_cores); - int nThreadsNew = services::Environment::getInstance()->getNumberOfThreads(); - cout << "oneDAL (native): Number of threads used: " << nThreadsNew << endl; - - int nBlocks = executor_num; - initializeModel(rankId, comm, partitionId, nBlocks, nUsers, nFactors); - trainModel(rankId, comm, partitionId, executor_num, nFactors, maxIter); - - auto pUser = - usersPartialResultLocal->get(training::outputOfStep4ForStep1)->getFactors(); - // auto pUserIndices = - // usersPartialResultLocal->get(training::outputOfStep4ForStep1)->getIndices(); - auto pItem = - itemsPartialResultLocal->get(training::outputOfStep4ForStep1)->getFactors(); - // auto pItemIndices = - // itemsPartialResultsMaster[i]->get(training::outputOfStep4ForStep1)->getIndices(); - - std::cout << "\n=== Results for Rank " << rankId << "===\n" << std::endl; - // std::cout << "Partition ID: " << partitionId << std::endl; - printNumericTable(pUser, "User Factors (first 10 rows):", 10); - printNumericTable(pItem, "Item Factors (first 10 rows):", 10); - std::cout << "User Offset: " << getOffsetFromOffsetTable(userOffset) << std::endl; - std::cout << "Item Offset: " << getOffsetFromOffsetTable(itemOffset) << std::endl; - std::cout << std::endl; - - // printNumericTable(userOffset, "userOffset"); - // printNumericTable(itemOffset, "itemOffset"); - - 
// if (rankId == ccl_root) { - // for (int i = 0; i < nBlocks; i++) { - // printNumericTable(NumericTable::cast((*userOffsetsOnMaster)[i]), - // "userOffsetsOnMaster"); - // } - - // for (int i = 0; i < nBlocks; i++) { - // printNumericTable(NumericTable::cast((*itemOffsetsOnMaster)[i]), - // "itemOffsetsOnMaster"); - // } - // } - - // printf("native pUser %ld, pItem %ld", (jlong)&pUser, (jlong)&pItem); - - // Get the class of the input object - jclass clazz = env->GetObjectClass(resultObj); - - // Fill in rankId - jfieldID cRankIdField = env->GetFieldID(clazz, "rankId", "J"); - env->SetLongField(resultObj, cRankIdField, (jlong)rankId); - - // Fill in cUsersFactorsNumTab & cItemsFactorsNumTab - // Get Field references - jfieldID cUsersFactorsNumTabField = env->GetFieldID(clazz, "cUsersFactorsNumTab", "J"); - jfieldID cItemsFactorsNumTabField = env->GetFieldID(clazz, "cItemsFactorsNumTab", "J"); - // Set factors as result, should use heap memory - NumericTablePtr* retUser = new NumericTablePtr(pUser); - NumericTablePtr* retItem = new NumericTablePtr(pItem); - env->SetLongField(resultObj, cUsersFactorsNumTabField, (jlong)retUser); - env->SetLongField(resultObj, cItemsFactorsNumTabField, (jlong)retItem); - - // Fill in cUserOffset & cItemOffset - jfieldID cUserOffsetField = env->GetFieldID(clazz, "cUserOffset", "J"); - assert(cUserOffsetField != NULL); - env->SetLongField(resultObj, cUserOffsetField, - (jlong)getOffsetFromOffsetTable(userOffset)); - - jfieldID cItemOffsetField = env->GetFieldID(clazz, "cItemOffset", "J"); - assert(cItemOffsetField != NULL); - env->SetLongField(resultObj, cItemOffsetField, - (jlong)getOffsetFromOffsetTable(itemOffset)); - - return 0; -} diff --git a/mllib-dal/src/main/native/ALSShuffle.cpp b/mllib-dal/src/main/native/ALSShuffle.cpp deleted file mode 100644 index 73440d253..000000000 --- a/mllib-dal/src/main/native/ALSShuffle.cpp +++ /dev/null @@ -1,102 +0,0 @@ -#include -#include -#include -#include -#include -#include - -#include "ALSShuffle.h" - -using namespace std; - -std::vector recvData; - -jlong getPartiton(jlong key, jlong totalKeys, long nBlocks) { - - jlong itemsInBlock = totalKeys / nBlocks; - - return min(key / itemsInBlock, nBlocks - 1); -} - -// Compares two Rating according to userId. 
-bool compareRatingByUser(Rating r1, Rating r2) -{ - if (r1.user < r2.user) - return true; - if (r1.user == r2.user && r1.item < r2.item) - return true; - return false; -} - -bool compareRatingUserEquality(Rating &r1, Rating &r2) { - return r1.user == r2.user; -} - -int distinct_count(std::vector &data) { - long curUser = -1; - long count = 0; - for (auto i : data) { - if (i.user > curUser) { - curUser = i.user; - count += 1; - } - } - return count; -} - -Rating * shuffle_all2all(ccl::communicator &comm, std::vector &partitions, size_t nBlocks, size_t &newRatingsNum, size_t &newCsrRowNum) { - size_t sendBufSize = 0; - size_t recvBufSize = 0; - vector perNodeSendLens(nBlocks); - vector perNodeRecvLens(nBlocks); - - ByteBuffer sendData; - - // Calculate send buffer size - for (size_t i = 0; i < nBlocks; i++) { - perNodeSendLens[i] = partitions[i].size() * RATING_SIZE; - // cout << "rank " << rankId << " Send partition " << i << " size " << perNodeSendLens[i] << endl; - sendBufSize += perNodeSendLens[i]; - } - cout << "sendData size " << sendBufSize << endl; - sendData.resize(sendBufSize); - - // Fill in send buffer - size_t offset = 0; - for (size_t i = 0; i < nBlocks; i++) - { - memcpy(sendData.data()+offset, partitions[i].data(), perNodeSendLens[i]); - offset += perNodeSendLens[i]; - } - - // Send lens first - ccl::alltoall(perNodeSendLens.data(), perNodeRecvLens.data(), sizeof(size_t), ccl::datatype::uint8, comm).wait(); - - // Calculate recv buffer size - for (size_t i = 0; i < nBlocks; i++) { - // cout << "rank " << rankId << " Recv partition " << i << " size " << perNodeRecvLens[i] << endl; - recvBufSize += perNodeRecvLens[i]; - } - - int ratingsNum = recvBufSize / RATING_SIZE; - recvData.resize(ratingsNum); - - // Send data - ccl::alltoallv(sendData.data(), perNodeSendLens, recvData.data(), perNodeRecvLens, ccl::datatype::uint8, comm).wait(); - - sort(recvData.begin(), recvData.end(), compareRatingByUser); - - // for (auto r : recvData) { - // cout << r.user << " " << r.item << " " << r.rating << endl; - // } - - newRatingsNum = recvData.size(); - // RatingPartition::iterator iter = std::unique(recvData.begin(), recvData.end(), compareRatingUserEquality); - // newCsrRowNum = std::distance(recvData.begin(), iter); - newCsrRowNum = distinct_count(recvData); - - cout << "newRatingsNum: " << newRatingsNum << " newCsrRowNum: " << newCsrRowNum << endl; - - return recvData.data(); -} - diff --git a/mllib-dal/src/main/native/ALSShuffle.h b/mllib-dal/src/main/native/ALSShuffle.h deleted file mode 100644 index dbe864978..000000000 --- a/mllib-dal/src/main/native/ALSShuffle.h +++ /dev/null @@ -1,17 +0,0 @@ -#pragma once - -#include - -struct Rating { - jlong user; - jlong item; - jfloat rating; -} __attribute__((packed)); - -const int RATING_SIZE = sizeof(Rating); - -typedef std::vector ByteBuffer; -typedef std::vector RatingPartition; - -jlong getPartiton(jlong key, jlong totalKeys, long nBlocks); -Rating * shuffle_all2all(ccl::communicator &comm, std::vector &partitions, size_t nBlocks, size_t &ratingsNum, size_t &csrRowNum); diff --git a/mllib-dal/src/main/native/KMeansDALImpl.cpp b/mllib-dal/src/main/native/KMeansDALImpl.cpp index d9c7a2f29..8ee123754 100644 --- a/mllib-dal/src/main/native/KMeansDALImpl.cpp +++ b/mllib-dal/src/main/native/KMeansDALImpl.cpp @@ -14,14 +14,13 @@ * limitations under the License. 
 *******************************************************************************/
 
-#include
+#include
 #include
-#include
-#include
 #include "service.h"
 #include "org_apache_spark_ml_clustering_KMeansDALImpl.h"
-#include "OneCCL.h"
+#include
+#include
 
 using namespace std;
 using namespace daal;
@@ -31,8 +30,7 @@
 const int ccl_root = 0;
 
 typedef double algorithmFPType; /* Algorithm floating-point type */
 
-static NumericTablePtr kmeans_compute(int rankId, ccl::communicator &comm,
-                                      const NumericTablePtr & pData, const NumericTablePtr & initialCentroids,
+static NumericTablePtr kmeans_compute(int rankId, const NumericTablePtr & pData, const NumericTablePtr & initialCentroids,
                                       size_t nClusters, size_t nBlocks, algorithmFPType &ret_cost) {
     const bool isRoot = (rankId == ccl_root);
@@ -45,13 +43,17 @@ static NumericTablePtr kmeans_compute(int rankId, ccl::communicator &comm,
         CentroidsArchLength = inputArch.getSizeOfArchive();
     }
 
+    ccl_request_t request;
+
     /* Get partial results from the root node */
-    ccl::broadcast(&CentroidsArchLength, sizeof(size_t), ccl::datatype::uint8, ccl_root, comm).wait();
+    ccl_bcast(&CentroidsArchLength, sizeof(size_t), ccl_dtype_char, ccl_root, NULL, NULL, NULL, &request);
+    ccl_wait(request);
 
     ByteBuffer nodeCentroids(CentroidsArchLength);
     if (isRoot) inputArch.copyArchiveToArray(&nodeCentroids[0], CentroidsArchLength);
 
-    ccl::broadcast(&nodeCentroids[0], CentroidsArchLength, ccl::datatype::uint8, ccl_root, comm).wait();
+    ccl_bcast(&nodeCentroids[0], CentroidsArchLength, ccl_dtype_char, ccl_root, NULL, NULL, NULL, &request);
+    ccl_wait(request);
 
     /* Deserialize centroids data */
     OutputDataArchive outArch(nodeCentroids.size() ? &nodeCentroids[0] : NULL, CentroidsArchLength);
@@ -77,7 +79,7 @@ static NumericTablePtr kmeans_compute(int rankId, ccl::communicator &comm,
     ByteBuffer serializedData;
 
     /* Serialized data is of equal size on each node if each node called compute() equal number of times */
-    vector<size_t> recvCounts(nBlocks);
+    size_t* recvCounts = new size_t[nBlocks];
     for (size_t i = 0; i < nBlocks; i++)
     {
         recvCounts[i] = perNodeArchLength;
@@ -88,7 +90,10 @@ static NumericTablePtr kmeans_compute(int rankId, ccl::communicator &comm,
     dataArch.copyArchiveToArray(&nodeResults[0], perNodeArchLength);
 
     /* Transfer partial results to step 2 on the root node */
-    ccl::allgatherv(&nodeResults[0], perNodeArchLength, &serializedData[0], recvCounts, ccl::datatype::uint8, comm).wait();
+    ccl_allgatherv(&nodeResults[0], perNodeArchLength, &serializedData[0], recvCounts, ccl_dtype_char, NULL, NULL, NULL, &request);
+    ccl_wait(request);
+
+    delete [] recvCounts;
 
     if (isRoot)
     {
@@ -163,8 +168,8 @@ JNIEXPORT jlong JNICALL Java_org_apache_spark_ml_clustering_KMeansDALImpl_cKMean
   jint executor_num, jint executor_cores,
   jobject resultObj) {
 
-  ccl::communicator &comm = getComm();
-  size_t rankId = comm.rank();
+  size_t rankId;
+  ccl_get_comm_rank(NULL, &rankId);
 
   NumericTablePtr pData = *((NumericTablePtr *)pNumTabData);
   NumericTablePtr centroids = *((NumericTablePtr *)pNumTabCenters);
@@ -184,14 +189,16 @@ JNIEXPORT jlong JNICALL Java_org_apache_spark_ml_clustering_KMeansDALImpl_cKMean
   for (it = 0; it < iteration_num && !converged; it++) {
     auto t1 = std::chrono::high_resolution_clock::now();
 
-    newCentroids = kmeans_compute(rankId, comm, pData, centroids, cluster_num, executor_num, totalCost);
+    newCentroids = kmeans_compute(rankId, pData, centroids, cluster_num, executor_num, totalCost);
 
     if (rankId == ccl_root) {
      converged = areAllCentersConverged(centroids, newCentroids, tolerance);
    }
 
     // Sync converged status
-    ccl::broadcast(&converged, 1, ccl::datatype::uint8, ccl_root, comm).wait();
+    ccl_request_t request;
+    ccl_bcast(&converged, 1, ccl_dtype_char, ccl_root, NULL, NULL, NULL, &request);
+    ccl_wait(request);
 
     centroids = newCentroids;
@@ -221,4 +228,4 @@ JNIEXPORT jlong JNICALL Java_org_apache_spark_ml_clustering_KMeansDALImpl_cKMean
     return (jlong)ret;
   } else
     return (jlong)0;
-}
\ No newline at end of file
+}
diff --git a/mllib-dal/src/main/native/Makefile b/mllib-dal/src/main/native/Makefile
index 23222e646..dfb5076ff 100644
--- a/mllib-dal/src/main/native/Makefile
+++ b/mllib-dal/src/main/native/Makefile
@@ -31,17 +31,17 @@ INCS := -I $(JAVA_HOME)/include \
 
 # Use static link if possible, TBB is only available as dynamic libs
-LIBS := -L${CCL_ROOT}/lib -lccl \
+LIBS := -L${CCL_ROOT}/lib -l:libccl.a \
  -L$(DAALROOT)/lib/intel64 -l:libdaal_core.a -l:libdaal_thread.a \
- -L$(TBBROOT)/lib -ltbb -ltbbmalloc
+ -L$(TBBROOT)/lib/intel64/gcc4.8 -ltbb -ltbbmalloc
 
 # TODO: Add signal chaining support, should fix linking, package so and loading
 # -L$(JAVA_HOME)/jre/lib/amd64 -ljsig
 
 CPP_SRCS += \
-./OneCCL.cpp ./OneDAL.cpp ./KMeansDALImpl.cpp ./PCADALImpl.cpp ./ALSDALImpl.cpp ./ALSShuffle.cpp ./service.cpp ./error_handling.cpp
+./OneCCL.cpp ./OneDAL.cpp ./KMeansDALImpl.cpp ./PCADALImpl.cpp ./service.cpp ./error_handling.cpp
 
 OBJS += \
-./OneCCL.o ./OneDAL.o ./KMeansDALImpl.o ./PCADALImpl.o ./ALSDALImpl.o ./ALSShuffle.o ./service.o ./error_handling.o
+./OneCCL.o ./OneDAL.o ./KMeansDALImpl.o ./PCADALImpl.o ./service.o ./error_handling.o
 
 # Output Binary
 OUTPUT = ../../../target/libMLlibDAL.so
diff --git a/mllib-dal/src/main/native/OneCCL.cpp b/mllib-dal/src/main/native/OneCCL.cpp
index c733c7b33..b23be9737 100644
--- a/mllib-dal/src/main/native/OneCCL.cpp
+++ b/mllib-dal/src/main/native/OneCCL.cpp
@@ -1,63 +1,26 @@
 #include
-#include
-
-#include
-#include
-#include
-#include
-
-#include
-#include
-#include
-
-#include
-
+#include
 #include "org_apache_spark_ml_util_OneCCL__.h"
 
-// todo: fill initial comm_size and rank_id
-size_t comm_size;
-size_t rank_id;
-
-std::vector<ccl::communicator> g_comms;
-
-ccl::communicator &getComm() {
-    return g_comms[0];
-}
-
 JNIEXPORT jint JNICALL Java_org_apache_spark_ml_util_OneCCL_00024_c_1init
-  (JNIEnv *env, jobject obj, jint size, jint rank, jstring ip_port, jobject param) {
+  (JNIEnv *env, jobject obj, jobject param) {
 
-  std::cerr << "OneCCL (native): init" << std::endl;
-
-  auto t1 = std::chrono::high_resolution_clock::now();
-
-  ccl::init();
-
-  const char *str = env->GetStringUTFChars(ip_port, 0);
-  ccl::string ccl_ip_port(str);
+  std::cout << "oneCCL (native): init" << std::endl;
 
-  auto kvs_attr = ccl::create_kvs_attr();
-  kvs_attr.set(ccl_ip_port);
-
-  ccl::shared_ptr_class<ccl::kvs> kvs;
-  kvs = ccl::create_main_kvs(kvs_attr);
-
-  g_comms.push_back(ccl::create_communicator(size, rank, kvs));
-
-  auto t2 = std::chrono::high_resolution_clock::now();
-  auto duration = std::chrono::duration_cast<std::chrono::seconds>( t2 - t1 ).count();
-  std::cerr << "OneCCL (native): init took " << duration << " secs" << std::endl;
-
-  rank_id = getComm().rank();
-  comm_size = getComm().size();
+  ccl_init();
 
   jclass cls = env->GetObjectClass(param);
   jfieldID fid_comm_size = env->GetFieldID(cls, "commSize", "J");
-  jfieldID fid_rank_id = env->GetFieldID(cls, "rankId", "J");
+  jfieldID fid_rank_id = env->GetFieldID(cls, "rankId", "J");
+
+  size_t comm_size;
+  size_t rank_id;
+
+  ccl_get_comm_size(NULL, &comm_size);
+  ccl_get_comm_rank(NULL, &rank_id);
 
   env->SetLongField(param, fid_comm_size, comm_size);
env->SetLongField(param, fid_rank_id, rank_id); - env->ReleaseStringUTFChars(ip_port, str); return 1; } @@ -70,10 +33,9 @@ JNIEXPORT jint JNICALL Java_org_apache_spark_ml_util_OneCCL_00024_c_1init JNIEXPORT void JNICALL Java_org_apache_spark_ml_util_OneCCL_00024_c_1cleanup (JNIEnv *env, jobject obj) { - g_comms.pop_back(); - - std::cerr << "OneCCL (native): cleanup" << std::endl; + std::cout << "oneCCL (native): cleanup" << std::endl; + ccl_finalize(); } /* @@ -82,9 +44,12 @@ JNIEXPORT void JNICALL Java_org_apache_spark_ml_util_OneCCL_00024_c_1cleanup * Signature: ()Z */ JNIEXPORT jboolean JNICALL Java_org_apache_spark_ml_util_OneCCL_00024_isRoot - (JNIEnv *env, jobject obj) { + (JNIEnv *env, jobject obj) { - return getComm().rank() == 0; + size_t rank_id; + ccl_get_comm_rank(NULL, &rank_id); + + return (rank_id == 0); } /* @@ -94,7 +59,12 @@ JNIEXPORT jboolean JNICALL Java_org_apache_spark_ml_util_OneCCL_00024_isRoot */ JNIEXPORT jint JNICALL Java_org_apache_spark_ml_util_OneCCL_00024_rankID (JNIEnv *env, jobject obj) { - return getComm().rank(); + + size_t rank_id; + ccl_get_comm_rank(NULL, &rank_id); + + return rank_id; + } /* @@ -115,115 +85,3 @@ JNIEXPORT jint JNICALL Java_org_apache_spark_ml_util_OneCCL_00024_setEnv return err; } - -static const int CCL_IP_LEN = 128; -std::list local_host_ips; - -static int fill_local_host_ip() { - struct ifaddrs *ifaddr, *ifa; - int family = AF_UNSPEC; - char local_ip[CCL_IP_LEN]; - if (getifaddrs(&ifaddr) < 0) { - // LOG_ERROR("fill_local_host_ip: can not get host IP"); - return -1; - } - - const char iface_name[] = "lo"; - local_host_ips.clear(); - - for (ifa = ifaddr; ifa != NULL; ifa = ifa->ifa_next) { - if (ifa->ifa_addr == NULL) - continue; - if (strstr(ifa->ifa_name, iface_name) == NULL) { - family = ifa->ifa_addr->sa_family; - if (family == AF_INET) { - memset(local_ip, 0, CCL_IP_LEN); - int res = getnameinfo( - ifa->ifa_addr, - (family == AF_INET) ? 
sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6), - local_ip, - CCL_IP_LEN, - NULL, - 0, - NI_NUMERICHOST); - if (res != 0) { - std::string s("fill_local_host_ip: getnameinfo error > "); - s.append(gai_strerror(res)); - // LOG_ERROR(s.c_str()); - return -1; - } - local_host_ips.push_back(local_ip); - } - } - } - if (local_host_ips.empty()) { - // LOG_ERROR("fill_local_host_ip: can't find interface to get host IP"); - return -1; - } - // memset(local_host_ip, 0, CCL_IP_LEN); - // strncpy(local_host_ip, local_host_ips.front().c_str(), CCL_IP_LEN); - - // for (auto &ip : local_host_ips) - // cout << ip << endl; - - freeifaddrs(ifaddr); - return 0; -} - -static bool is_valid_ip(char ip[]) { - if (fill_local_host_ip() == -1) { - std::cerr << "fill_local_host_ip error" << std::endl; - }; - for (std::list::iterator it = local_host_ips.begin(); it != local_host_ips.end(); ++it) { - if (*it == ip) { - return true; - } - } - - return false; -} - -/* - * Class: org_apache_spark_ml_util_OneCCL__ - * Method: getAvailPort - * Signature: (Ljava/lang/String;)I - */ -JNIEXPORT jint JNICALL Java_org_apache_spark_ml_util_OneCCL_00024_c_1getAvailPort - (JNIEnv *env, jobject obj, jstring localIP) { - - // start from beginning of dynamic port - const int port_start_base = 3000; - - char* local_host_ip = (char *) env->GetStringUTFChars(localIP, NULL); - - // check if the input ip is one of host's ips - if (!is_valid_ip(local_host_ip)) - return -1; - - struct sockaddr_in main_server_address; - int server_listen_sock; - in_port_t port = port_start_base; - - if ((server_listen_sock = socket(AF_INET, SOCK_STREAM, 0)) < 0) { - perror("OneCCL (native) getAvailPort error!"); - return -1; - } - - main_server_address.sin_family = AF_INET; - main_server_address.sin_addr.s_addr = inet_addr(local_host_ip); - main_server_address.sin_port = htons(port); - - // search for available port - while (bind(server_listen_sock, - (const struct sockaddr *)&main_server_address, - sizeof(main_server_address)) < 0) { - port++; - main_server_address.sin_port = htons(port); - } - - close(server_listen_sock); - - env->ReleaseStringUTFChars(localIP, local_host_ip); - - return port; -} diff --git a/mllib-dal/src/main/native/OneCCL.h b/mllib-dal/src/main/native/OneCCL.h deleted file mode 100644 index b579c4697..000000000 --- a/mllib-dal/src/main/native/OneCCL.h +++ /dev/null @@ -1,5 +0,0 @@ -#pragma once - -#include - -ccl::communicator &getComm(); diff --git a/mllib-dal/src/main/native/OneDAL.cpp b/mllib-dal/src/main/native/OneDAL.cpp index 792225c3e..2210ffd1c 100644 --- a/mllib-dal/src/main/native/OneDAL.cpp +++ b/mllib-dal/src/main/native/OneDAL.cpp @@ -19,8 +19,6 @@ #include #include "org_apache_spark_ml_util_OneDAL__.h" -#include "service.h" - using namespace daal; using namespace daal::data_management; @@ -125,51 +123,3 @@ JNIEXPORT jboolean JNICALL Java_org_apache_spark_ml_util_OneDAL_00024_cCheckPlat // Only guarantee compatibility and performance on Intel platforms, use oneDAL lib function return daal_check_is_intel_cpu(); } - -/* - * Class: org_apache_spark_ml_util_OneDAL__ - * Method: cNewCSRNumericTable - * Signature: ([F[J[JJJ)J - */ -JNIEXPORT jlong JNICALL Java_org_apache_spark_ml_util_OneDAL_00024_cNewCSRNumericTable - (JNIEnv *env, jobject, jfloatArray data, jlongArray colIndices, jlongArray rowOffsets, jlong nFeatures, jlong nVectors) { - - long numData = env->GetArrayLength(data); - // long numColIndices = numData; - // long numRowOffsets = env->GetArrayLength(rowOffsets); - - size_t * resultRowOffsets = NULL; - 
size_t * resultColIndices = NULL; - float * resultData = NULL; - CSRNumericTable * numericTable = new CSRNumericTable(resultData, resultColIndices, resultRowOffsets, nFeatures, nVectors); - numericTable->allocateDataMemory(numData); - numericTable->getArrays(&resultData, &resultColIndices, &resultRowOffsets); - - size_t * pRowOffsets = (size_t *)env->GetLongArrayElements(rowOffsets, 0); - size_t * pColIndices = (size_t *)env->GetLongArrayElements(colIndices, 0); - float * pData = env->GetFloatArrayElements(data, 0); - - // std::memcpy(resultRowOffsets, pRowOffsets, numRowOffsets*sizeof(jlong)); - // std::memcpy(resultColIndices, pColIndices, numColIndices*sizeof(jlong)); - // std::memcpy(resultData, pData, numData*sizeof(float)); - - for (size_t i = 0; i < (size_t)numData; ++i) - { - resultData[i] = pData[i]; - resultColIndices[i] = pColIndices[i]; - } - for (size_t i = 0; i < (size_t)nVectors + 1; ++i) - { - resultRowOffsets[i] = pRowOffsets[i]; - } - - env->ReleaseLongArrayElements(rowOffsets, (jlong *)pRowOffsets, 0); - env->ReleaseLongArrayElements(colIndices, (jlong *)pColIndices, 0); - env->ReleaseFloatArrayElements(data, pData, 0); - - CSRNumericTablePtr *ret = new CSRNumericTablePtr(numericTable); - - //printNumericTable(*ret, "cNewCSRNumericTable", 10); - - return (jlong)ret; -} diff --git a/mllib-dal/src/main/native/PCADALImpl.cpp b/mllib-dal/src/main/native/PCADALImpl.cpp index 33e2bc95d..3b06fc0dc 100644 --- a/mllib-dal/src/main/native/PCADALImpl.cpp +++ b/mllib-dal/src/main/native/PCADALImpl.cpp @@ -1,3 +1,4 @@ +#include #include #include "service.h" @@ -6,7 +7,6 @@ #include #include "org_apache_spark_ml_feature_PCADALImpl.h" -#include "OneCCL.h" using namespace std; using namespace daal; @@ -24,9 +24,8 @@ typedef double algorithmFPType; /* Algorithm floating-point type */ JNIEXPORT jlong JNICALL Java_org_apache_spark_ml_feature_PCADALImpl_cPCATrainDAL( JNIEnv *env, jobject obj, jlong pNumTabData, jint k, jint executor_num, jint executor_cores, jobject resultObj) { - - ccl::communicator &comm = getComm(); - size_t rankId = comm.rank(); + size_t rankId; + ccl_get_comm_rank(NULL, &rankId); const size_t nBlocks = executor_num; const int comm_size = executor_num; @@ -41,8 +40,6 @@ JNIEXPORT jlong JNICALL Java_org_apache_spark_ml_feature_PCADALImpl_cPCATrainDAL int nThreadsNew = services::Environment::getInstance()->getNumberOfThreads(); cout << "oneDAL (native): Number of threads used: " << nThreadsNew << endl; - auto t1 = std::chrono::high_resolution_clock::now(); - pca::Distributed localAlgorithm; /* Set the input data set to the algorithm */ @@ -51,12 +48,6 @@ JNIEXPORT jlong JNICALL Java_org_apache_spark_ml_feature_PCADALImpl_cPCATrainDAL /* Compute PCA decomposition */ localAlgorithm.compute(); - auto t2 = std::chrono::high_resolution_clock::now(); - auto duration = std::chrono::duration_cast( t2 - t1 ).count(); - std::cout << "PCA (native): local step took " << duration << " secs" << std::endl; - - t1 = std::chrono::high_resolution_clock::now(); - /* Serialize partial results required by step 2 */ services::SharedPtr serializedData; InputDataArchive dataArch; @@ -68,31 +59,31 @@ JNIEXPORT jlong JNICALL Java_org_apache_spark_ml_feature_PCADALImpl_cPCATrainDAL byte* nodeResults = new byte[perNodeArchLength]; dataArch.copyArchiveToArray(nodeResults, perNodeArchLength); - t2 = std::chrono::high_resolution_clock::now(); - - duration = std::chrono::duration_cast( t2 - t1 ).count(); - std::cout << "PCA (native): serializing partial results took " << duration << " secs" << 
std::endl; + ccl_request_t request; - vector recv_counts(comm_size * perNodeArchLength); + size_t* recv_counts = new size_t[comm_size * perNodeArchLength]; for (int i = 0; i < comm_size; i++) recv_counts[i] = perNodeArchLength; cout << "PCA (native): ccl_allgatherv receiving " << perNodeArchLength * nBlocks << " bytes" << endl; - t1 = std::chrono::high_resolution_clock::now(); + auto t1 = std::chrono::high_resolution_clock::now(); /* Transfer partial results to step 2 on the root node */ // MPI_Gather(nodeResults, perNodeArchLength, MPI_CHAR, serializedData.get(), // perNodeArchLength, MPI_CHAR, ccl_root, MPI_COMM_WORLD); - ccl::allgatherv(nodeResults, perNodeArchLength, serializedData.get(), recv_counts, - ccl::datatype::uint8, comm).wait(); + ccl_allgatherv(nodeResults, perNodeArchLength, serializedData.get(), recv_counts, + ccl_dtype_char, NULL, NULL, NULL, &request); + ccl_wait(request); - t2 = std::chrono::high_resolution_clock::now(); + auto t2 = std::chrono::high_resolution_clock::now(); - duration = std::chrono::duration_cast( t2 - t1 ).count(); + auto duration = std::chrono::duration_cast( t2 - t1 ).count(); std::cout << "PCA (native): ccl_allgatherv took " << duration << " secs" << std::endl; + delete[] nodeResults; + if (rankId == ccl_root) { - auto t1 = std::chrono::high_resolution_clock::now(); + auto t1 = std::chrono::high_resolution_clock::now(); /* Create an algorithm for principal component analysis using the svdDense method * on the master node */ diff --git a/mllib-dal/src/main/native/build-jni.sh b/mllib-dal/src/main/native/build-jni.sh index dacd8382b..bee614dcd 100755 --- a/mllib-dal/src/main/native/build-jni.sh +++ b/mllib-dal/src/main/native/build-jni.sh @@ -18,5 +18,4 @@ javah -d $WORK_DIR/javah -classpath "$WORK_DIR/../../../target/classes:$DAAL_JAR org.apache.spark.ml.util.OneCCL$ \ org.apache.spark.ml.util.OneDAL$ \ org.apache.spark.ml.clustering.KMeansDALImpl \ - org.apache.spark.ml.feature.PCADALImpl \ - org.apache.spark.ml.recommendation.ALSDALImpl + org.apache.spark.ml.feature.PCADALImpl diff --git a/mllib-dal/src/main/native/javah/org_apache_spark_ml_recommendation_ALSDALImpl.h b/mllib-dal/src/main/native/javah/org_apache_spark_ml_recommendation_ALSDALImpl.h deleted file mode 100644 index 73024d05b..000000000 --- a/mllib-dal/src/main/native/javah/org_apache_spark_ml_recommendation_ALSDALImpl.h +++ /dev/null @@ -1,29 +0,0 @@ -/* DO NOT EDIT THIS FILE - it is machine generated */ -#include -/* Header for class org_apache_spark_ml_recommendation_ALSDALImpl */ - -#ifndef _Included_org_apache_spark_ml_recommendation_ALSDALImpl -#define _Included_org_apache_spark_ml_recommendation_ALSDALImpl -#ifdef __cplusplus -extern "C" { -#endif -/* - * Class: org_apache_spark_ml_recommendation_ALSDALImpl - * Method: cDALImplictALS - * Signature: (JJIIDDIIILorg/apache/spark/ml/recommendation/ALSResult;)J - */ -JNIEXPORT jlong JNICALL Java_org_apache_spark_ml_recommendation_ALSDALImpl_cDALImplictALS - (JNIEnv *, jobject, jlong, jlong, jint, jint, jdouble, jdouble, jint, jint, jint, jobject); - -/* - * Class: org_apache_spark_ml_recommendation_ALSDALImpl - * Method: cShuffleData - * Signature: (Ljava/nio/ByteBuffer;IILorg/apache/spark/ml/recommendation/ALSPartitionInfo;)Ljava/nio/ByteBuffer; - */ -JNIEXPORT jobject JNICALL Java_org_apache_spark_ml_recommendation_ALSDALImpl_cShuffleData - (JNIEnv *, jobject, jobject, jint, jint, jobject); - -#ifdef __cplusplus -} -#endif -#endif diff --git a/mllib-dal/src/main/native/javah/org_apache_spark_ml_util_OneCCL__.h 
b/mllib-dal/src/main/native/javah/org_apache_spark_ml_util_OneCCL__.h
index 580c34bf9..60825ae3f 100644
--- a/mllib-dal/src/main/native/javah/org_apache_spark_ml_util_OneCCL__.h
+++ b/mllib-dal/src/main/native/javah/org_apache_spark_ml_util_OneCCL__.h
@@ -10,10 +10,10 @@ extern "C" {
 /*
  * Class: org_apache_spark_ml_util_OneCCL__
  * Method: c_init
- * Signature: (IILjava/lang/String;Lorg/apache/spark/ml/util/CCLParam;)I
+ * Signature: (Lorg/apache/spark/ml/util/CCLParam;)I
  */
 JNIEXPORT jint JNICALL Java_org_apache_spark_ml_util_OneCCL_00024_c_1init
-  (JNIEnv *, jobject, jint, jint, jstring, jobject);
+  (JNIEnv *, jobject, jobject);
 
 /*
  * Class: org_apache_spark_ml_util_OneCCL__
@@ -47,14 +47,6 @@ JNIEXPORT jint JNICALL Java_org_apache_spark_ml_util_OneCCL_00024_rankID
 JNIEXPORT jint JNICALL Java_org_apache_spark_ml_util_OneCCL_00024_setEnv
   (JNIEnv *, jobject, jstring, jstring, jboolean);
 
-/*
- * Class: org_apache_spark_ml_util_OneCCL__
- * Method: c_getAvailPort
- * Signature: (Ljava/lang/String;)I
- */
-JNIEXPORT jint JNICALL Java_org_apache_spark_ml_util_OneCCL_00024_c_1getAvailPort
-  (JNIEnv *, jobject, jstring);
-
 #ifdef __cplusplus
 }
 #endif
diff --git a/mllib-dal/src/main/native/javah/org_apache_spark_ml_util_OneDAL__.h b/mllib-dal/src/main/native/javah/org_apache_spark_ml_util_OneDAL__.h
index d88462d66..5f67d9428 100644
--- a/mllib-dal/src/main/native/javah/org_apache_spark_ml_util_OneDAL__.h
+++ b/mllib-dal/src/main/native/javah/org_apache_spark_ml_util_OneDAL__.h
@@ -47,14 +47,6 @@ JNIEXPORT void JNICALL Java_org_apache_spark_ml_util_OneDAL_00024_cFreeDataMemor
 JNIEXPORT jboolean JNICALL Java_org_apache_spark_ml_util_OneDAL_00024_cCheckPlatformCompatibility
   (JNIEnv *, jobject);
 
-/*
- * Class: org_apache_spark_ml_util_OneDAL__
- * Method: cNewCSRNumericTable
- * Signature: ([F[J[JJJ)J
- */
-JNIEXPORT jlong JNICALL Java_org_apache_spark_ml_util_OneDAL_00024_cNewCSRNumericTable
-  (JNIEnv *, jobject, jfloatArray, jlongArray, jlongArray, jlong, jlong);
-
 #ifdef __cplusplus
 }
 #endif
diff --git a/mllib-dal/src/main/native/service.cpp b/mllib-dal/src/main/native/service.cpp
index 623767406..9316b3b62 100644
--- a/mllib-dal/src/main/native/service.cpp
+++ b/mllib-dal/src/main/native/service.cpp
@@ -125,10 +125,6 @@ CSRNumericTable * createSparseTable(const std::string & datasetFileName)
     return numericTable;
 }
 
-CSRNumericTable * createFloatSparseTable(const std::string & datasetFileName) {
-    return createSparseTable(datasetFileName);
-}
-
 void printAprioriItemsets(NumericTablePtr largeItemsetsTable, NumericTablePtr largeItemsetsSupportTable, size_t nItemsetToPrint = 20)
 {
     size_t largeItemsetCount = largeItemsetsSupportTable->getNumberOfRows();
diff --git a/mllib-dal/src/main/native/service.h b/mllib-dal/src/main/native/service.h
index b6a2cc5c5..37b702aea 100644
--- a/mllib-dal/src/main/native/service.h
+++ b/mllib-dal/src/main/native/service.h
@@ -43,8 +43,5 @@ typedef std::vector ByteBuffer;
 void printNumericTable(const NumericTablePtr & dataTable, const char * message = "",
     size_t nPrintedRows = 0, size_t nPrintedCols = 0, size_t interval = 10);
 
-size_t serializeDAALObject(SerializationIface * pData, ByteBuffer & buffer);
-SerializationIfacePtr deserializeDAALObject(daal::byte * buff, size_t length);
-CSRNumericTable * createFloatSparseTable(const std::string & datasetFileName);
 
 #endif
diff --git a/mllib-dal/src/main/scala/org/apache/spark/ml/clustering/KMeansDALImpl.scala b/mllib-dal/src/main/scala/org/apache/spark/ml/clustering/KMeansDALImpl.scala
index e9e7ec36d..5f29fe441 100644
--- a/mllib-dal/src/main/scala/org/apache/spark/ml/clustering/KMeansDALImpl.scala
+++ b/mllib-dal/src/main/scala/org/apache/spark/ml/clustering/KMeansDALImpl.scala
@@ -40,6 +40,8 @@ class KMeansDALImpl (
 
     instr.foreach(_.logInfo(s"Processing partitions with $executorNum executors"))
 
+    val executorIPAddress = Utils.sparkFirstExecutorIP(data.sparkContext)
+
     // repartition to executorNum if not enough partitions
     val dataForConversion = if (data.getNumPartitions < executorNum) {
       data.repartition(executorNum).setName("Repartitioned for conversion").cache()
@@ -47,13 +49,6 @@ class KMeansDALImpl (
       data
     }
 
-    val executorIPAddress = Utils.sparkFirstExecutorIP(dataForConversion.sparkContext)
-    val kvsIP = dataForConversion.sparkContext.conf.get("spark.oap.mllib.oneccl.kvs.ip", executorIPAddress)
-    val kvsPortDetected = Utils.checkExecutorAvailPort(dataForConversion, kvsIP)
-    val kvsPort = dataForConversion.sparkContext.conf.getInt("spark.oap.mllib.oneccl.kvs.port", kvsPortDetected)
-
-    val kvsIPPort = kvsIP+"_"+kvsPort
-
     val partitionDims = Utils.getPartitionDims(dataForConversion)
 
     // filter the empty partitions
@@ -69,14 +64,14 @@ class KMeansDALImpl (
       val it = entry._3
       val numCols = partitionDims(index)._2
 
-      logDebug(s"KMeansDALImpl: Partition index: $index, numCols: $numCols, numRows: $numRows")
+      println(s"KMeansDALImpl: Partition index: $index, numCols: $numCols, numRows: $numRows")
 
       // Build DALMatrix, this will load libJavaAPI, libtbb, libtbbmalloc
      val context = new DaalContext()
      val matrix = new DALMatrix(context, classOf[java.lang.Double],
        numCols.toLong, numRows.toLong, NumericTable.AllocationFlag.DoAllocate)
 
-      logDebug("KMeansDALImpl: Loading native libraries" )
+      println("KMeansDALImpl: Loading native libraries" )
       // oneDAL libs should be loaded by now, extract libMLlibDAL.so to temp file and load
       LibLoader.loadLibraries()
@@ -116,9 +111,10 @@ class KMeansDALImpl (
     }.cache()
 
-    val results = coalescedTables.mapPartitionsWithIndex { (rank, table) =>
+    val results = coalescedTables.mapPartitions { table =>
       val tableArr = table.next()
-      OneCCL.init(executorNum, rank, kvsIPPort)
+
+      OneCCL.init(executorNum, executorIPAddress, OneCCL.KVS_PORT)
 
       val initCentroids = OneDAL.makeNumericTable(centers)
       val result = new KMeansResult()
diff --git a/mllib-dal/src/main/scala/org/apache/spark/ml/feature/PCADALImpl.scala b/mllib-dal/src/main/scala/org/apache/spark/ml/feature/PCADALImpl.scala
index e1bba3d37..1760aa171 100644
--- a/mllib-dal/src/main/scala/org/apache/spark/ml/feature/PCADALImpl.scala
+++ b/mllib-dal/src/main/scala/org/apache/spark/ml/feature/PCADALImpl.scala
@@ -18,20 +18,19 @@ package org.apache.spark.ml.feature
 
 import java.util.Arrays
+
 import com.intel.daal.data_management.data.{HomogenNumericTable, NumericTable}
-import org.apache.spark.internal.Logging
 import org.apache.spark.ml.linalg._
 import org.apache.spark.ml.util.{OneCCL, OneDAL, Utils}
 import org.apache.spark.mllib.feature.{PCAModel => MLlibPCAModel}
 import org.apache.spark.mllib.linalg.{DenseMatrix => OldDenseMatrix, Vectors => OldVectors}
 import org.apache.spark.rdd.RDD
-import org.apache.spark.mllib.feature.{StandardScaler => MLlibStandardScaler}
+import org.apache.spark.mllib.feature.{ StandardScaler => MLlibStandardScaler }
 
 class PCADALImpl (
     val k: Int,
     val executorNum: Int,
-    val executorCores: Int)
-  extends Serializable with Logging {
+    val executorCores: Int) extends Serializable {
 
   // Normalize data before apply fitWithDAL
   private def normalizeData(input: RDD[Vector]) : RDD[Vector] = {
@@ -41,23 +40,17 @@ class PCADALImpl (
     res.map(_.asML)
   }
 
-  def fitWithDAL(data: RDD[Vector]) : MLlibPCAModel = {
+  def fitWithDAL(input: RDD[Vector]) : MLlibPCAModel = {
 
-    val normalizedData = normalizeData(data)
+    val normalizedData = normalizeData(input)
 
     val coalescedTables = OneDAL.rddVectorToNumericTables(normalizedData, executorNum)
 
-    val executorIPAddress = Utils.sparkFirstExecutorIP(coalescedTables.sparkContext)
-    val kvsIP = coalescedTables.sparkContext.conf.get("spark.oap.mllib.oneccl.kvs.ip", executorIPAddress)
-
-    val kvsPortDetected = Utils.checkExecutorAvailPort(coalescedTables, kvsIP)
-    val kvsPort = coalescedTables.sparkContext.conf.getInt("spark.oap.mllib.oneccl.kvs.port", kvsPortDetected)
-
-    val kvsIPPort = kvsIP+"_"+kvsPort
+    val executorIPAddress = Utils.sparkFirstExecutorIP(input.sparkContext)
 
-    val results = coalescedTables.mapPartitionsWithIndex { (rank, table) =>
+    val results = coalescedTables.mapPartitions { table =>
       val tableArr = table.next()
-      OneCCL.init(executorNum, rank, kvsIPPort)
+      OneCCL.init(executorNum, executorIPAddress, OneCCL.KVS_PORT)
 
       val result = new PCAResult()
       cPCATrainDAL(
diff --git a/mllib-dal/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala b/mllib-dal/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala
deleted file mode 100644
index 9196873fb..000000000
--- a/mllib-dal/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala
+++ /dev/null
@@ -1,1885 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */ - -package org.apache.spark.ml.recommendation - -import java.{util => ju} -import java.io.IOException -import java.util.Locale - -import scala.collection.mutable -import scala.reflect.ClassTag -import scala.util.{Sorting, Try} -import scala.util.hashing.byteswap64 - -import com.github.fommil.netlib.BLAS.{getInstance => blas} -import org.apache.hadoop.fs.Path -import org.json4s.DefaultFormats -import org.json4s.JsonDSL._ - -import org.apache.spark.{Dependency, Partitioner, ShuffleDependency, SparkContext, SparkException} -import org.apache.spark.annotation.Since -import org.apache.spark.internal.Logging -import org.apache.spark.ml.{Estimator, Model} -import org.apache.spark.ml.linalg.BLAS -import org.apache.spark.ml.param._ -import org.apache.spark.ml.param.shared._ -import org.apache.spark.ml.util.{Utils => DALImplUtils, _} -import org.apache.spark.ml.util.Instrumentation.instrumented -import org.apache.spark.mllib.linalg.CholeskyDecomposition -import org.apache.spark.mllib.optimization.NNLS -import org.apache.spark.rdd.{DeterministicLevel, RDD} -import org.apache.spark.sql.{DataFrame, Dataset} -import org.apache.spark.sql.functions._ -import org.apache.spark.sql.types._ -import org.apache.spark.storage.StorageLevel -import org.apache.spark.util.{BoundedPriorityQueue, Utils} -import org.apache.spark.util.collection.{OpenHashMap, OpenHashSet, SortDataFormat, Sorter} -import org.apache.spark.util.random.XORShiftRandom - -/** - * Common params for ALS and ALSModel. - */ -private[recommendation] trait ALSModelParams extends Params with HasPredictionCol - with HasBlockSize { - /** - * Param for the column name for user ids. Ids must be integers. Other - * numeric types are supported for this column, but will be cast to integers as long as they - * fall within the integer value range. - * Default: "user" - * @group param - */ - val userCol = new Param[String](this, "userCol", "column name for user ids. Ids must be within " + - "the integer value range.") - - /** @group getParam */ - def getUserCol: String = $(userCol) - - /** - * Param for the column name for item ids. Ids must be integers. Other - * numeric types are supported for this column, but will be cast to integers as long as they - * fall within the integer value range. - * Default: "item" - * @group param - */ - val itemCol = new Param[String](this, "itemCol", "column name for item ids. Ids must be within " + - "the integer value range.") - - /** @group getParam */ - def getItemCol: String = $(itemCol) - - /** - * Attempts to safely cast a user/item id to an Int. Throws an exception if the value is - * out of integer range or contains a fractional part. - */ - protected[recommendation] val checkedCast = udf { (n: Any) => - n match { - case v: Int => v // Avoid unnecessary casting - case v: Number => - val intV = v.intValue - // Checks if number within Int range and has no fractional part. - if (v.doubleValue == intV) { - intV - } else { - throw new IllegalArgumentException(s"ALS only supports values in Integer range " + - s"and without fractional part for columns ${$(userCol)} and ${$(itemCol)}. " + - s"Value $n was either out of Integer range or contained a fractional part that " + - s"could not be converted.") - } - case _ => throw new IllegalArgumentException(s"ALS only supports values in Integer range " + - s"for columns ${$(userCol)} and ${$(itemCol)}. Value $n was not numeric.") - } - } - - /** - * Param for strategy for dealing with unknown or new users/items at prediction time. 
- * This may be useful in cross-validation or production scenarios, for handling user/item ids - * the model has not seen in the training data. - * Supported values: - * - "nan": predicted value for unknown ids will be NaN. - * - "drop": rows in the input DataFrame containing unknown ids will be dropped from - * the output DataFrame containing predictions. - * Default: "nan". - * @group expertParam - */ - val coldStartStrategy = new Param[String](this, "coldStartStrategy", - "strategy for dealing with unknown or new users/items at prediction time. This may be " + - "useful in cross-validation or production scenarios, for handling user/item ids the model " + - "has not seen in the training data. Supported values: " + - s"${ALSModel.supportedColdStartStrategies.mkString(",")}.", - (s: String) => - ALSModel.supportedColdStartStrategies.contains(s.toLowerCase(Locale.ROOT))) - - /** @group expertGetParam */ - def getColdStartStrategy: String = $(coldStartStrategy).toLowerCase(Locale.ROOT) - - setDefault(blockSize -> 4096) -} - -/** - * Common params for ALS. - */ -private[recommendation] trait ALSParams extends ALSModelParams with HasMaxIter with HasRegParam - with HasCheckpointInterval with HasSeed { - - /** - * Param for rank of the matrix factorization (positive). - * Default: 10 - * @group param - */ - val rank = new IntParam(this, "rank", "rank of the factorization", ParamValidators.gtEq(1)) - - /** @group getParam */ - def getRank: Int = $(rank) - - /** - * Param for number of user blocks (positive). - * Default: 10 - * @group param - */ - val numUserBlocks = new IntParam(this, "numUserBlocks", "number of user blocks", - ParamValidators.gtEq(1)) - - /** @group getParam */ - def getNumUserBlocks: Int = $(numUserBlocks) - - /** - * Param for number of item blocks (positive). - * Default: 10 - * @group param - */ - val numItemBlocks = new IntParam(this, "numItemBlocks", "number of item blocks", - ParamValidators.gtEq(1)) - - /** @group getParam */ - def getNumItemBlocks: Int = $(numItemBlocks) - - /** - * Param to decide whether to use implicit preference. - * Default: false - * @group param - */ - val implicitPrefs = new BooleanParam(this, "implicitPrefs", "whether to use implicit preference") - - /** @group getParam */ - def getImplicitPrefs: Boolean = $(implicitPrefs) - - /** - * Param for the alpha parameter in the implicit preference formulation (nonnegative). - * Default: 1.0 - * @group param - */ - val alpha = new DoubleParam(this, "alpha", "alpha for implicit preference", - ParamValidators.gtEq(0)) - - /** @group getParam */ - def getAlpha: Double = $(alpha) - - /** - * Param for the column name for ratings. - * Default: "rating" - * @group param - */ - val ratingCol = new Param[String](this, "ratingCol", "column name for ratings") - - /** @group getParam */ - def getRatingCol: String = $(ratingCol) - - /** - * Param for whether to apply nonnegativity constraints. - * Default: false - * @group param - */ - val nonnegative = new BooleanParam( - this, "nonnegative", "whether to use nonnegative constraint for least squares") - - /** @group getParam */ - def getNonnegative: Boolean = $(nonnegative) - - /** - * Param for StorageLevel for intermediate datasets. Pass in a string representation of - * `StorageLevel`. Cannot be "NONE". - * Default: "MEMORY_AND_DISK". - * - * @group expertParam - */ - val intermediateStorageLevel = new Param[String](this, "intermediateStorageLevel", - "StorageLevel for intermediate datasets. 
Cannot be 'NONE'.", - (s: String) => Try(StorageLevel.fromString(s)).isSuccess && s != "NONE") - - /** @group expertGetParam */ - def getIntermediateStorageLevel: String = $(intermediateStorageLevel) - - /** - * Param for StorageLevel for ALS model factors. Pass in a string representation of - * `StorageLevel`. - * Default: "MEMORY_AND_DISK". - * - * @group expertParam - */ - val finalStorageLevel = new Param[String](this, "finalStorageLevel", - "StorageLevel for ALS model factors.", - (s: String) => Try(StorageLevel.fromString(s)).isSuccess) - - /** @group expertGetParam */ - def getFinalStorageLevel: String = $(finalStorageLevel) - - setDefault(rank -> 10, maxIter -> 10, regParam -> 0.1, numUserBlocks -> 10, numItemBlocks -> 10, - implicitPrefs -> false, alpha -> 1.0, userCol -> "user", itemCol -> "item", - ratingCol -> "rating", nonnegative -> false, checkpointInterval -> 10, - intermediateStorageLevel -> "MEMORY_AND_DISK", finalStorageLevel -> "MEMORY_AND_DISK", - coldStartStrategy -> "nan") - - /** - * Validates and transforms the input schema. - * - * @param schema input schema - * @return output schema - */ - protected def validateAndTransformSchema(schema: StructType): StructType = { - // user and item will be cast to Int - SchemaUtils.checkNumericType(schema, $(userCol)) - SchemaUtils.checkNumericType(schema, $(itemCol)) - // rating will be cast to Float - SchemaUtils.checkNumericType(schema, $(ratingCol)) - SchemaUtils.appendColumn(schema, $(predictionCol), FloatType) - } -} - -/** - * Model fitted by ALS. - * - * @param rank rank of the matrix factorization model - * @param userFactors a DataFrame that stores user factors in two columns: `id` and `features` - * @param itemFactors a DataFrame that stores item factors in two columns: `id` and `features` - */ -@Since("1.3.0") -class ALSModel private[ml] ( - @Since("1.4.0") override val uid: String, - @Since("1.4.0") val rank: Int, - @transient val userFactors: DataFrame, - @transient val itemFactors: DataFrame) - extends Model[ALSModel] with ALSModelParams with MLWritable { - - /** @group setParam */ - @Since("1.4.0") - def setUserCol(value: String): this.type = set(userCol, value) - - /** @group setParam */ - @Since("1.4.0") - def setItemCol(value: String): this.type = set(itemCol, value) - - /** @group setParam */ - @Since("1.3.0") - def setPredictionCol(value: String): this.type = set(predictionCol, value) - - /** @group expertSetParam */ - @Since("2.2.0") - def setColdStartStrategy(value: String): this.type = set(coldStartStrategy, value) - - /** - * Set block size for stacking input data in matrices. - * Default is 4096. - * - * @group expertSetParam - */ - @Since("3.0.0") - def setBlockSize(value: Int): this.type = set(blockSize, value) - - private val predict = udf { (featuresA: Seq[Float], featuresB: Seq[Float]) => - if (featuresA != null && featuresB != null) { - var dotProduct = 0.0f - var i = 0 - while (i < rank) { - dotProduct += featuresA(i) * featuresB(i) - i += 1 - } - dotProduct - } else { - Float.NaN - } - } - - @Since("2.0.0") - override def transform(dataset: Dataset[_]): DataFrame = { - transformSchema(dataset.schema) - // create a new column named map(predictionCol) by running the predict UDF. 
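-    // The two joins below are left joins keyed on the (checked, Int-cast) user and item ids, so
-    // rows whose ids have no factor vector keep null features and the predict UDF yields NaN;
-    // the coldStartStrategy match afterwards decides whether such rows are kept or dropped.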
- val predictions = dataset - .join(userFactors, - checkedCast(dataset($(userCol))) === userFactors("id"), "left") - .join(itemFactors, - checkedCast(dataset($(itemCol))) === itemFactors("id"), "left") - .select(dataset("*"), - predict(userFactors("features"), itemFactors("features")).as($(predictionCol))) - getColdStartStrategy match { - case ALSModel.Drop => - predictions.na.drop("all", Seq($(predictionCol))) - case ALSModel.NaN => - predictions - } - } - - @Since("1.3.0") - override def transformSchema(schema: StructType): StructType = { - // user and item will be cast to Int - SchemaUtils.checkNumericType(schema, $(userCol)) - SchemaUtils.checkNumericType(schema, $(itemCol)) - SchemaUtils.appendColumn(schema, $(predictionCol), FloatType) - } - - @Since("1.5.0") - override def copy(extra: ParamMap): ALSModel = { - val copied = new ALSModel(uid, rank, userFactors, itemFactors) - copyValues(copied, extra).setParent(parent) - } - - @Since("1.6.0") - override def write: MLWriter = new ALSModel.ALSModelWriter(this) - - @Since("3.0.0") - override def toString: String = { - s"ALSModel: uid=$uid, rank=$rank" - } - - /** - * Returns top `numItems` items recommended for each user, for all users. - * @param numItems max number of recommendations for each user - * @return a DataFrame of (userCol: Int, recommendations), where recommendations are - * stored as an array of (itemCol: Int, rating: Float) Rows. - */ - @Since("2.2.0") - def recommendForAllUsers(numItems: Int): DataFrame = { - recommendForAll(userFactors, itemFactors, $(userCol), $(itemCol), numItems, $(blockSize)) - } - - /** - * Returns top `numItems` items recommended for each user id in the input data set. Note that if - * there are duplicate ids in the input dataset, only one set of recommendations per unique id - * will be returned. - * @param dataset a Dataset containing a column of user ids. The column name must match `userCol`. - * @param numItems max number of recommendations for each user. - * @return a DataFrame of (userCol: Int, recommendations), where recommendations are - * stored as an array of (itemCol: Int, rating: Float) Rows. - */ - @Since("2.3.0") - def recommendForUserSubset(dataset: Dataset[_], numItems: Int): DataFrame = { - val srcFactorSubset = getSourceFactorSubset(dataset, userFactors, $(userCol)) - recommendForAll(srcFactorSubset, itemFactors, $(userCol), $(itemCol), numItems, $(blockSize)) - } - - /** - * Returns top `numUsers` users recommended for each item, for all items. - * @param numUsers max number of recommendations for each item - * @return a DataFrame of (itemCol: Int, recommendations), where recommendations are - * stored as an array of (userCol: Int, rating: Float) Rows. - */ - @Since("2.2.0") - def recommendForAllItems(numUsers: Int): DataFrame = { - recommendForAll(itemFactors, userFactors, $(itemCol), $(userCol), numUsers, $(blockSize)) - } - - /** - * Returns top `numUsers` users recommended for each item id in the input data set. Note that if - * there are duplicate ids in the input dataset, only one set of recommendations per unique id - * will be returned. - * @param dataset a Dataset containing a column of item ids. The column name must match `itemCol`. - * @param numUsers max number of recommendations for each item. - * @return a DataFrame of (itemCol: Int, recommendations), where recommendations are - * stored as an array of (userCol: Int, rating: Float) Rows. 
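- * @note A usage sketch (assuming `model` is a fitted `ALSModel` and `items` is a DataFrame whose
- *       item id column matches `itemCol`): `model.recommendForItemSubset(items, 10)` returns the
- *       top 10 users per distinct item id found in `items`.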
- */ - @Since("2.3.0") - def recommendForItemSubset(dataset: Dataset[_], numUsers: Int): DataFrame = { - val srcFactorSubset = getSourceFactorSubset(dataset, itemFactors, $(itemCol)) - recommendForAll(srcFactorSubset, userFactors, $(itemCol), $(userCol), numUsers, $(blockSize)) - } - - /** - * Returns a subset of a factor DataFrame limited to only those unique ids contained - * in the input dataset. - * @param dataset input Dataset containing id column to user to filter factors. - * @param factors factor DataFrame to filter. - * @param column column name containing the ids in the input dataset. - * @return DataFrame containing factors only for those ids present in both the input dataset and - * the factor DataFrame. - */ - private def getSourceFactorSubset( - dataset: Dataset[_], - factors: DataFrame, - column: String): DataFrame = { - factors - .join(dataset.select(column), factors("id") === dataset(column), joinType = "left_semi") - .select(factors("id"), factors("features")) - } - - /** - * Makes recommendations for all users (or items). - * - * Note: the previous approach used for computing top-k recommendations - * used a cross-join followed by predicting a score for each row of the joined dataset. - * However, this results in exploding the size of intermediate data. While Spark SQL makes it - * relatively efficient, the approach implemented here is significantly more efficient. - * - * This approach groups factors into blocks and computes the top-k elements per block, - * using dot product and an efficient [[BoundedPriorityQueue]] (instead of gemm). - * It then computes the global top-k by aggregating the per block top-k elements with - * a [[TopByKeyAggregator]]. This significantly reduces the size of intermediate and shuffle data. - * This is the DataFrame equivalent to the approach used in - * [[org.apache.spark.mllib.recommendation.MatrixFactorizationModel]]. - * - * @param srcFactors src factors for which to generate recommendations - * @param dstFactors dst factors used to make recommendations - * @param srcOutputColumn name of the column for the source ID in the output DataFrame - * @param dstOutputColumn name of the column for the destination ID in the output DataFrame - * @param num max number of recommendations for each record - * @return a DataFrame of (srcOutputColumn: Int, recommendations), where recommendations are - * stored as an array of (dstOutputColumn: Int, rating: Float) Rows. 
- */ - private def recommendForAll( - srcFactors: DataFrame, - dstFactors: DataFrame, - srcOutputColumn: String, - dstOutputColumn: String, - num: Int, - blockSize: Int): DataFrame = { - import srcFactors.sparkSession.implicits._ - - val srcFactorsBlocked = blockify(srcFactors.as[(Int, Array[Float])], blockSize) - val dstFactorsBlocked = blockify(dstFactors.as[(Int, Array[Float])], blockSize) - val ratings = srcFactorsBlocked.crossJoin(dstFactorsBlocked) - .as[(Seq[(Int, Array[Float])], Seq[(Int, Array[Float])])] - .flatMap { case (srcIter, dstIter) => - val m = srcIter.size - val n = math.min(dstIter.size, num) - val output = new Array[(Int, Int, Float)](m * n) - var i = 0 - val pq = new BoundedPriorityQueue[(Int, Float)](num)(Ordering.by(_._2)) - srcIter.foreach { case (srcId, srcFactor) => - dstIter.foreach { case (dstId, dstFactor) => - // We use F2jBLAS which is faster than a call to native BLAS for vector dot product - val score = BLAS.f2jBLAS.sdot(rank, srcFactor, 1, dstFactor, 1) - pq += dstId -> score - } - pq.foreach { case (dstId, score) => - output(i) = (srcId, dstId, score) - i += 1 - } - pq.clear() - } - output.toSeq - } - // We'll force the IDs to be Int. Unfortunately this converts IDs to Int in the output. - val topKAggregator = new TopByKeyAggregator[Int, Int, Float](num, Ordering.by(_._2)) - val recs = ratings.as[(Int, Int, Float)].groupByKey(_._1).agg(topKAggregator.toColumn) - .toDF("id", "recommendations") - - val arrayType = ArrayType( - new StructType() - .add(dstOutputColumn, IntegerType) - .add("rating", FloatType) - ) - recs.select($"id".as(srcOutputColumn), $"recommendations".cast(arrayType)) - } - - /** - * Blockifies factors to improve the efficiency of cross join - */ - private def blockify( - factors: Dataset[(Int, Array[Float])], - blockSize: Int): Dataset[Seq[(Int, Array[Float])]] = { - import factors.sparkSession.implicits._ - factors.mapPartitions(_.grouped(blockSize)) - } - -} - -@Since("1.6.0") -object ALSModel extends MLReadable[ALSModel] { - - private val NaN = "nan" - private val Drop = "drop" - private[recommendation] final val supportedColdStartStrategies = Array(NaN, Drop) - - @Since("1.6.0") - override def read: MLReader[ALSModel] = new ALSModelReader - - @Since("1.6.0") - override def load(path: String): ALSModel = super.load(path) - - private[ALSModel] class ALSModelWriter(instance: ALSModel) extends MLWriter { - - override protected def saveImpl(path: String): Unit = { - val extraMetadata = "rank" -> instance.rank - DefaultParamsWriter.saveMetadata(instance, path, sc, Some(extraMetadata)) - val userPath = new Path(path, "userFactors").toString - instance.userFactors.write.format("parquet").save(userPath) - val itemPath = new Path(path, "itemFactors").toString - instance.itemFactors.write.format("parquet").save(itemPath) - } - } - - private class ALSModelReader extends MLReader[ALSModel] { - - /** Checked against metadata when loading model */ - private val className = classOf[ALSModel].getName - - override def load(path: String): ALSModel = { - val metadata = DefaultParamsReader.loadMetadata(path, sc, className) - implicit val format = DefaultFormats - val rank = (metadata.metadata \ "rank").extract[Int] - val userPath = new Path(path, "userFactors").toString - val userFactors = sparkSession.read.format("parquet").load(userPath) - val itemPath = new Path(path, "itemFactors").toString - val itemFactors = sparkSession.read.format("parquet").load(itemPath) - - val model = new ALSModel(metadata.uid, rank, userFactors, itemFactors) - - 
metadata.getAndSetParams(model) - model - } - } -} - -/** - * Alternating Least Squares (ALS) matrix factorization. - * - * ALS attempts to estimate the ratings matrix `R` as the product of two lower-rank matrices, - * `X` and `Y`, i.e. `X * Yt = R`. Typically these approximations are called 'factor' matrices. - * The general approach is iterative. During each iteration, one of the factor matrices is held - * constant, while the other is solved for using least squares. The newly-solved factor matrix is - * then held constant while solving for the other factor matrix. - * - * This is a blocked implementation of the ALS factorization algorithm that groups the two sets - * of factors (referred to as "users" and "products") into blocks and reduces communication by only - * sending one copy of each user vector to each product block on each iteration, and only for the - * product blocks that need that user's feature vector. This is achieved by pre-computing some - * information about the ratings matrix to determine the "out-links" of each user (which blocks of - * products it will contribute to) and "in-link" information for each product (which of the feature - * vectors it receives from each user block it will depend on). This allows us to send only an - * array of feature vectors between each user block and product block, and have the product block - * find the users' ratings and update the products based on these messages. - * - * For implicit preference data, the algorithm used is based on - * "Collaborative Filtering for Implicit Feedback Datasets", available at - * https://doi.org/10.1109/ICDM.2008.22, adapted for the blocked approach used here. - * - * Essentially instead of finding the low-rank approximations to the rating matrix `R`, - * this finds the approximations for a preference matrix `P` where the elements of `P` are 1 if - * r is greater than 0 and 0 if r is less than or equal to 0. The ratings then act as 'confidence' - * values related to strength of indicated user - * preferences rather than explicit ratings given to items. - * - * Note: the input rating dataset to the ALS implementation should be deterministic. - * Nondeterministic data can cause failure during fitting ALS model. - * For example, an order-sensitive operation like sampling after a repartition makes dataset - * output nondeterministic, like `dataset.repartition(2).sample(false, 0.5, 1618)`. - * Checkpointing sampled dataset or adding a sort before sampling can help make the dataset - * deterministic. 
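- *
- * For example, one way (a sketch only; a `ratings` DataFrame with `user` and `item` columns is
- * assumed) to follow that advice:
- * {{{
- *   // Sorting before sampling keeps the sampled rows stable if the lineage is recomputed.
- *   val deterministicRatings = ratings
- *     .repartition(2)
- *     .sortWithinPartitions("user", "item")
- *     .sample(false, 0.5, 1618)
- * }}}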
- */ -@Since("1.3.0") -class ALS(@Since("1.4.0") override val uid: String) extends Estimator[ALSModel] with ALSParams - with DefaultParamsWritable { - - import org.apache.spark.ml.recommendation.ALS.Rating - - @Since("1.4.0") - def this() = this(Identifiable.randomUID("als")) - - /** @group setParam */ - @Since("1.3.0") - def setRank(value: Int): this.type = set(rank, value) - - /** @group setParam */ - @Since("1.3.0") - def setNumUserBlocks(value: Int): this.type = set(numUserBlocks, value) - - /** @group setParam */ - @Since("1.3.0") - def setNumItemBlocks(value: Int): this.type = set(numItemBlocks, value) - - /** @group setParam */ - @Since("1.3.0") - def setImplicitPrefs(value: Boolean): this.type = set(implicitPrefs, value) - - /** @group setParam */ - @Since("1.3.0") - def setAlpha(value: Double): this.type = set(alpha, value) - - /** @group setParam */ - @Since("1.3.0") - def setUserCol(value: String): this.type = set(userCol, value) - - /** @group setParam */ - @Since("1.3.0") - def setItemCol(value: String): this.type = set(itemCol, value) - - /** @group setParam */ - @Since("1.3.0") - def setRatingCol(value: String): this.type = set(ratingCol, value) - - /** @group setParam */ - @Since("1.3.0") - def setPredictionCol(value: String): this.type = set(predictionCol, value) - - /** @group setParam */ - @Since("1.3.0") - def setMaxIter(value: Int): this.type = set(maxIter, value) - - /** @group setParam */ - @Since("1.3.0") - def setRegParam(value: Double): this.type = set(regParam, value) - - /** @group setParam */ - @Since("1.3.0") - def setNonnegative(value: Boolean): this.type = set(nonnegative, value) - - /** @group setParam */ - @Since("1.4.0") - def setCheckpointInterval(value: Int): this.type = set(checkpointInterval, value) - - /** @group setParam */ - @Since("1.3.0") - def setSeed(value: Long): this.type = set(seed, value) - - /** @group expertSetParam */ - @Since("2.0.0") - def setIntermediateStorageLevel(value: String): this.type = set(intermediateStorageLevel, value) - - /** @group expertSetParam */ - @Since("2.0.0") - def setFinalStorageLevel(value: String): this.type = set(finalStorageLevel, value) - - /** @group expertSetParam */ - @Since("2.2.0") - def setColdStartStrategy(value: String): this.type = set(coldStartStrategy, value) - - /** - * Set block size for stacking input data in matrices. - * Default is 4096. - * - * @group expertSetParam - */ - @Since("3.0.0") - def setBlockSize(value: Int): this.type = set(blockSize, value) - - /** - * Sets both numUserBlocks and numItemBlocks to the specific value. 
- * - * @group setParam - */ - @Since("1.3.0") - def setNumBlocks(value: Int): this.type = { - setNumUserBlocks(value) - setNumItemBlocks(value) - this - } - - @Since("2.0.0") - override def fit(dataset: Dataset[_]): ALSModel = instrumented { instr => - transformSchema(dataset.schema) - import dataset.sparkSession.implicits._ - - val r = if ($(ratingCol) != "") col($(ratingCol)).cast(FloatType) else lit(1.0f) - val ratings = dataset - .select(checkedCast(col($(userCol))), checkedCast(col($(itemCol))), r) - .rdd - .map { row => - Rating(row.getInt(0), row.getInt(1), row.getFloat(2)) - } - - instr.logPipelineStage(this) - instr.logDataset(dataset) - instr.logParams(this, rank, numUserBlocks, numItemBlocks, implicitPrefs, alpha, userCol, - itemCol, ratingCol, predictionCol, maxIter, regParam, nonnegative, checkpointInterval, - seed, intermediateStorageLevel, finalStorageLevel, blockSize) - - val (userFactors, itemFactors) = ALS.train(ratings, rank = $(rank), - numUserBlocks = $(numUserBlocks), numItemBlocks = $(numItemBlocks), - maxIter = $(maxIter), regParam = $(regParam), implicitPrefs = $(implicitPrefs), - alpha = $(alpha), nonnegative = $(nonnegative), - intermediateRDDStorageLevel = StorageLevel.fromString($(intermediateStorageLevel)), - finalRDDStorageLevel = StorageLevel.fromString($(finalStorageLevel)), - checkpointInterval = $(checkpointInterval), seed = $(seed)) - val userDF = userFactors.toDF("id", "features") - val itemDF = itemFactors.toDF("id", "features") - val model = new ALSModel(uid, $(rank), userDF, itemDF).setBlockSize($(blockSize)) - .setParent(this) - copyValues(model) - } - - @Since("1.3.0") - override def transformSchema(schema: StructType): StructType = { - validateAndTransformSchema(schema) - } - - @Since("1.5.0") - override def copy(extra: ParamMap): ALS = defaultCopy(extra) -} - - -/** - * An implementation of ALS that supports generic ID types, specialized for Int and Long. This is - * exposed as a developer API for users who do need other ID types. But it is not recommended - * because it increases the shuffle size and memory requirement during training. For simplicity, - * users and items must have the same type. The number of distinct users/items should be smaller - * than 2 billion. - */ -object ALS extends DefaultParamsReadable[ALS] with Logging { - - /** - * Rating class for better code readability. - */ - case class Rating[@specialized(Int, Long) ID](user: ID, item: ID, rating: Float) - - @Since("1.6.0") - override def load(path: String): ALS = super.load(path) - - /** Trait for least squares solvers applied to the normal equation. */ - private[recommendation] trait LeastSquaresNESolver extends Serializable { - /** Solves a least squares problem with regularization (possibly with other constraints). */ - def solve(ne: NormalEquation, lambda: Double): Array[Float] - } - - /** Cholesky solver for least square problems. */ - private[recommendation] class CholeskySolver extends LeastSquaresNESolver { - - /** - * Solves a least squares problem with L2 regularization: - * - * min norm(A x - b)^2^ + lambda * norm(x)^2^ - * - * @param ne a [[NormalEquation]] instance that contains AtA, Atb, and n (number of instances) - * @param lambda regularization constant - * @return the solution x - */ - override def solve(ne: NormalEquation, lambda: Double): Array[Float] = { - val k = ne.k - // Add scaled lambda to the diagonals of AtA. 
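-      // ne.ata holds the upper triangle of AtA in packed column-major order, so the diagonal
-      // entry (j, j) sits at offset j * (j + 3) / 2 = 0, 2, 5, 9, ... The loop below visits
-      // exactly those offsets by starting at i = 0 and growing the stride j from 2 upward,
-      // adding lambda to each diagonal element.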
- var i = 0 - var j = 2 - while (i < ne.triK) { - ne.ata(i) += lambda - i += j - j += 1 - } - CholeskyDecomposition.solve(ne.ata, ne.atb) - val x = new Array[Float](k) - i = 0 - while (i < k) { - x(i) = ne.atb(i).toFloat - i += 1 - } - ne.reset() - x - } - } - - /** NNLS solver. */ - private[recommendation] class NNLSSolver extends LeastSquaresNESolver { - private var rank: Int = -1 - private var workspace: NNLS.Workspace = _ - private var ata: Array[Double] = _ - private var initialized: Boolean = false - - private def initialize(rank: Int): Unit = { - if (!initialized) { - this.rank = rank - workspace = NNLS.createWorkspace(rank) - ata = new Array[Double](rank * rank) - initialized = true - } else { - require(this.rank == rank) - } - } - - /** - * Solves a nonnegative least squares problem with L2 regularization: - * - * min_x_ norm(A x - b)^2^ + lambda * n * norm(x)^2^ - * subject to x >= 0 - */ - override def solve(ne: NormalEquation, lambda: Double): Array[Float] = { - val rank = ne.k - initialize(rank) - fillAtA(ne.ata, lambda) - val x = NNLS.solve(ata, ne.atb, workspace) - ne.reset() - x.map(x => x.toFloat) - } - - /** - * Given a triangular matrix in the order of fillXtX above, compute the full symmetric square - * matrix that it represents, storing it into destMatrix. - */ - private def fillAtA(triAtA: Array[Double], lambda: Double): Unit = { - var i = 0 - var pos = 0 - var a = 0.0 - while (i < rank) { - var j = 0 - while (j <= i) { - a = triAtA(pos) - ata(i * rank + j) = a - ata(j * rank + i) = a - pos += 1 - j += 1 - } - ata(i * rank + i) += lambda - i += 1 - } - } - } - - /** - * Representing a normal equation to solve the following weighted least squares problem: - * - * minimize \sum,,i,, c,,i,, (a,,i,,^T^ x - d,,i,,)^2^ + lambda * x^T^ x. - * - * Its normal equation is given by - * - * \sum,,i,, c,,i,, (a,,i,, a,,i,,^T^ x - d,,i,, a,,i,,) + lambda * x = 0. - * - * Distributing and letting b,,i,, = c,,i,, * d,,i,, - * - * \sum,,i,, c,,i,, a,,i,, a,,i,,^T^ x - b,,i,, a,,i,, + lambda * x = 0. - */ - private[recommendation] class NormalEquation(val k: Int) extends Serializable { - - /** Number of entries in the upper triangular part of a k-by-k matrix. */ - val triK = k * (k + 1) / 2 - /** A^T^ * A */ - val ata = new Array[Double](triK) - /** A^T^ * b */ - val atb = new Array[Double](k) - - private val da = new Array[Double](k) - private val upper = "U" - - private def copyToDouble(a: Array[Float]): Unit = { - var i = 0 - while (i < k) { - da(i) = a(i) - i += 1 - } - } - - /** Adds an observation. */ - def add(a: Array[Float], b: Double, c: Double = 1.0): NormalEquation = { - require(c >= 0.0) - require(a.length == k) - copyToDouble(a) - blas.dspr(upper, k, c, da, 1, ata) - if (b != 0.0) { - blas.daxpy(k, b, da, 1, atb, 1) - } - this - } - - /** Merges another normal equation object. */ - def merge(other: NormalEquation): NormalEquation = { - require(other.k == k) - blas.daxpy(ata.length, 1.0, other.ata, 1, ata, 1) - blas.daxpy(atb.length, 1.0, other.atb, 1, atb, 1) - this - } - - /** Resets everything to zero, which should be called after each solve. 
*/ - def reset(): Unit = { - ju.Arrays.fill(ata, 0.0) - ju.Arrays.fill(atb, 0.0) - } - } - - def train[ID: ClassTag]( // scalastyle:ignore - ratings: RDD[Rating[ID]], - rank: Int = 10, - numUserBlocks: Int = 10, - numItemBlocks: Int = 10, - maxIter: Int = 10, - regParam: Double = 0.1, - implicitPrefs: Boolean = false, - alpha: Double = 1.0, - nonnegative: Boolean = false, - intermediateRDDStorageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK, - finalRDDStorageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK, - checkpointInterval: Int = 10, - seed: Long = 0L)( - implicit ord: Ordering[ID]): (RDD[(ID, Array[Float])], RDD[(ID, Array[Float])]) = { - - val isPlatformSupported = DALImplUtils.checkClusterPlatformCompatibility(ratings.sparkContext) - - val (userIdAndFactors, itemIdAndFactors) = - if (implicitPrefs && isPlatformSupported) { - new ALSDALImpl(ratings, rank, maxIter, regParam, alpha, seed).run() - } else { - trainMLlib(ratings, rank, numUserBlocks, numItemBlocks, maxIter, regParam, implicitPrefs, - alpha, nonnegative, intermediateRDDStorageLevel, finalRDDStorageLevel, - checkpointInterval, seed) - } - - (userIdAndFactors, itemIdAndFactors) - } - - private def trainDAL[ID: ClassTag]( - ratings: RDD[Rating[ID]], - rank: Int, - maxIter: Int, - regParam: Double, - alpha: Double, - seed: Long): (RDD[(ID, Array[Float])], RDD[(ID, Array[Float])]) = { - null - } - - /** - * Implementation of the ALS algorithm. - * - * This implementation of the ALS factorization algorithm partitions the two sets of factors among - * Spark workers so as to reduce network communication by only sending one copy of each factor - * vector to each Spark worker on each iteration, and only if needed. This is achieved by - * precomputing some information about the ratings matrix to determine which users require which - * item factors and vice versa. See the Scaladoc for `InBlock` for a detailed explanation of how - * the precomputation is done. - * - * In addition, since each iteration of calculating the factor matrices depends on the known - * ratings, which are spread across Spark partitions, a naive implementation would incur - * significant network communication overhead between Spark workers, as the ratings RDD would be - * repeatedly shuffled during each iteration. This implementation reduces that overhead by - * performing the shuffling operation up front, precomputing each partition's ratings dependencies - * and duplicating those values to the appropriate workers before starting iterations to solve for - * the factor matrices. See the Scaladoc for `OutBlock` for a detailed explanation of how the - * precomputation is done. - * - * Note that the term "rating block" is a bit of a misnomer, as the ratings are not partitioned by - * contiguous blocks from the ratings matrix but by a hash function on the rating's location in - * the matrix. If it helps you to visualize the partitions, it is easier to think of the term - * "block" as referring to a subset of an RDD containing the ratings rather than a contiguous - * submatrix of the ratings matrix. 
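- *
- * Concretely, each rating `(u, i, r)` is keyed by the block pair
- * `(userPart.getPartition(u), itemPart.getPartition(i))` before being grouped into rating blocks.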
- */ - private def trainMLlib[ID: ClassTag]( // scalastyle:ignore - ratings: RDD[Rating[ID]], - rank: Int = 10, - numUserBlocks: Int = 10, - numItemBlocks: Int = 10, - maxIter: Int = 10, - regParam: Double = 0.1, - implicitPrefs: Boolean = false, - alpha: Double = 1.0, - nonnegative: Boolean = false, - intermediateRDDStorageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK, - finalRDDStorageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK, - checkpointInterval: Int = 10, - seed: Long = 0L)( - implicit ord: Ordering[ID]): (RDD[(ID, Array[Float])], RDD[(ID, Array[Float])]) = { - - require(!ratings.isEmpty(), s"No ratings available from $ratings") - require(intermediateRDDStorageLevel != StorageLevel.NONE, - "ALS is not designed to run without persisting intermediate RDDs.") - - val sc = ratings.sparkContext - - // Precompute the rating dependencies of each partition - val userPart = new ALSPartitioner(numUserBlocks) - val itemPart = new ALSPartitioner(numItemBlocks) - val blockRatings = partitionRatings(ratings, userPart, itemPart) - .persist(intermediateRDDStorageLevel) - val (userInBlocks, userOutBlocks) = - makeBlocks("user", blockRatings, userPart, itemPart, intermediateRDDStorageLevel) - userOutBlocks.count() // materialize blockRatings and user blocks - val swappedBlockRatings = blockRatings.map { - case ((userBlockId, itemBlockId), RatingBlock(userIds, itemIds, localRatings)) => - ((itemBlockId, userBlockId), RatingBlock(itemIds, userIds, localRatings)) - } - val (itemInBlocks, itemOutBlocks) = - makeBlocks("item", swappedBlockRatings, itemPart, userPart, intermediateRDDStorageLevel) - itemOutBlocks.count() // materialize item blocks - - // Encoders for storing each user/item's partition ID and index within its partition using a - // single integer; used as an optimization - val userLocalIndexEncoder = new LocalIndexEncoder(userPart.numPartitions) - val itemLocalIndexEncoder = new LocalIndexEncoder(itemPart.numPartitions) - - // These are the user and item factor matrices that, once trained, are multiplied together to - // estimate the rating matrix. The two matrices are stored in RDDs, partitioned by column such - // that each factor column resides on the same Spark worker as its corresponding user or item. - val seedGen = new XORShiftRandom(seed) - var userFactors = initialize(userInBlocks, rank, seedGen.nextLong()) - var itemFactors = initialize(itemInBlocks, rank, seedGen.nextLong()) - - val solver = if (nonnegative) new NNLSSolver else new CholeskySolver - - var previousCheckpointFile: Option[String] = None - val shouldCheckpoint: Int => Boolean = (iter) => - sc.checkpointDir.isDefined && checkpointInterval != -1 && (iter % checkpointInterval == 0) - val deletePreviousCheckpointFile: () => Unit = () => - previousCheckpointFile.foreach { file => - try { - val checkpointFile = new Path(file) - checkpointFile.getFileSystem(sc.hadoopConfiguration).delete(checkpointFile, true) - } catch { - case e: IOException => - logWarning(s"Cannot delete checkpoint file $file:", e) - } - } - - if (implicitPrefs) { - for (iter <- 1 to maxIter) { - userFactors.setName(s"userFactors-$iter").persist(intermediateRDDStorageLevel) - val previousItemFactors = itemFactors - itemFactors = computeFactors(userFactors, userOutBlocks, itemInBlocks, rank, regParam, - userLocalIndexEncoder, implicitPrefs, alpha, solver) - previousItemFactors.unpersist() - itemFactors.setName(s"itemFactors-$iter").persist(intermediateRDDStorageLevel) - // TODO: Generalize PeriodicGraphCheckpointer and use it here. 
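-        // The shuffle dependencies are captured before checkpointing so that, once the checkpoint
-        // has materialized, the now-redundant shuffle files from the pre-checkpoint lineage can be
-        // cleaned up via cleanShuffleDependencies further down in this loop iteration.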
- val deps = itemFactors.dependencies - if (shouldCheckpoint(iter)) { - itemFactors.checkpoint() // itemFactors gets materialized in computeFactors - } - val previousUserFactors = userFactors - userFactors = computeFactors(itemFactors, itemOutBlocks, userInBlocks, rank, regParam, - itemLocalIndexEncoder, implicitPrefs, alpha, solver) - if (shouldCheckpoint(iter)) { - ALS.cleanShuffleDependencies(sc, deps) - deletePreviousCheckpointFile() - previousCheckpointFile = itemFactors.getCheckpointFile - } - previousUserFactors.unpersist() - } - } else { - var previousCachedItemFactors: Option[RDD[(Int, FactorBlock)]] = None - for (iter <- 0 until maxIter) { - itemFactors = computeFactors(userFactors, userOutBlocks, itemInBlocks, rank, regParam, - userLocalIndexEncoder, solver = solver) - if (shouldCheckpoint(iter)) { - itemFactors.setName(s"itemFactors-$iter").persist(intermediateRDDStorageLevel) - val deps = itemFactors.dependencies - itemFactors.checkpoint() - itemFactors.count() // checkpoint item factors and cut lineage - ALS.cleanShuffleDependencies(sc, deps) - deletePreviousCheckpointFile() - - previousCachedItemFactors.foreach(_.unpersist()) - previousCheckpointFile = itemFactors.getCheckpointFile - previousCachedItemFactors = Option(itemFactors) - } - userFactors = computeFactors(itemFactors, itemOutBlocks, userInBlocks, rank, regParam, - itemLocalIndexEncoder, solver = solver) - } - } - val userIdAndFactors = userInBlocks - .mapValues(_.srcIds) - .join(userFactors) - .mapPartitions({ items => - items.flatMap { case (_, (ids, factors)) => - ids.view.zip(factors) - } - // Preserve the partitioning because IDs are consistent with the partitioners in userInBlocks - // and userFactors. - }, preservesPartitioning = true) - .setName("userFactors") - .persist(finalRDDStorageLevel) - val itemIdAndFactors = itemInBlocks - .mapValues(_.srcIds) - .join(itemFactors) - .mapPartitions({ items => - items.flatMap { case (_, (ids, factors)) => - ids.view.zip(factors) - } - }, preservesPartitioning = true) - .setName("itemFactors") - .persist(finalRDDStorageLevel) - if (finalRDDStorageLevel != StorageLevel.NONE) { - userIdAndFactors.count() - userInBlocks.unpersist() - userOutBlocks.unpersist() - itemOutBlocks.unpersist() - blockRatings.unpersist() - itemIdAndFactors.count() - itemFactors.unpersist() - itemInBlocks.unpersist() - } - (userIdAndFactors, itemIdAndFactors) - } - - /** - * Factor block that stores factors (Array[Float]) in an Array. - */ - private type FactorBlock = Array[Array[Float]] - - /** - * A mapping of the columns of the items factor matrix that are needed when calculating each row - * of the users factor matrix, and vice versa. - * - * Specifically, when calculating a user factor vector, since only those columns of the items - * factor matrix that correspond to the items that that user has rated are needed, we can avoid - * having to repeatedly copy the entire items factor matrix to each worker later in the algorithm - * by precomputing these dependencies for all users, storing them in an RDD of `OutBlock`s. The - * items' dependencies on the columns of the users factor matrix is computed similarly. - * - * =Example= - * - * Using the example provided in the `InBlock` Scaladoc, `userOutBlocks` would look like the - * following: - * - * {{{ - * userOutBlocks.collect() == Seq( - * 0 -> Array(Array(0, 1), Array(0, 1)), - * 1 -> Array(Array(0), Array(0)) - * ) - * }}} - * - * Each value in this map-like sequence is of type `Array[Array[Int]]`. 
The values in the - * inner array are the ranks of the sorted user IDs in that partition; so in the example above, - * `Array(0, 1)` in partition 0 refers to user IDs 0 and 6, since when all unique user IDs in - * partition 0 are sorted, 0 is the first ID and 6 is the second. The position of each inner - * array in its enclosing outer array denotes the partition number to which item IDs map; in the - * example, the first `Array(0, 1)` is in position 0 of its outer array, denoting item IDs that - * map to partition 0. - * - * In summary, the data structure encodes the following information: - * - * * There are ratings with user IDs 0 and 6 (encoded in `Array(0, 1)`, where 0 and 1 are the - * indices of the user IDs 0 and 6 on partition 0) whose item IDs map to partitions 0 and 1 - * (represented by the fact that `Array(0, 1)` appears in both the 0th and 1st positions). - * - * * There are ratings with user ID 3 (encoded in `Array(0)`, where 0 is the index of the user - * ID 3 on partition 1) whose item IDs map to partitions 0 and 1 (represented by the fact that - * `Array(0)` appears in both the 0th and 1st positions). - */ - private type OutBlock = Array[Array[Int]] - - /** - * In-link block for computing user and item factor matrices. - * - * The ALS algorithm partitions the columns of the users factor matrix evenly among Spark workers. - * Since each column of the factor matrix is calculated using the known ratings of the correspond- - * ing user, and since the ratings don't change across iterations, the ALS algorithm preshuffles - * the ratings to the appropriate partitions, storing them in `InBlock` objects. - * - * The ratings shuffled by item ID are computed similarly and also stored in `InBlock` objects. - * Note that this means every rating is stored twice, once as shuffled by user ID and once by item - * ID. This is a necessary tradeoff, since in general a rating will not be on the same worker - * when partitioned by user as by item. - * - * =Example= - * - * Say we have a small collection of eight items to offer the seven users in our application. We - * have some known ratings given by the users, as seen in the matrix below: - * - * {{{ - * Items - * 0 1 2 3 4 5 6 7 - * +---+---+---+---+---+---+---+---+ - * 0 | |0.1| | |0.4| | |0.7| - * +---+---+---+---+---+---+---+---+ - * 1 | | | | | | | | | - * +---+---+---+---+---+---+---+---+ - * U 2 | | | | | | | | | - * s +---+---+---+---+---+---+---+---+ - * e 3 | |3.1| | |3.4| | |3.7| - * r +---+---+---+---+---+---+---+---+ - * s 4 | | | | | | | | | - * +---+---+---+---+---+---+---+---+ - * 5 | | | | | | | | | - * +---+---+---+---+---+---+---+---+ - * 6 | |6.1| | |6.4| | |6.7| - * +---+---+---+---+---+---+---+---+ - * }}} - * - * The ratings are represented as an RDD, passed to the `partitionRatings` method as the `ratings` - * parameter: - * - * {{{ - * ratings.collect() == Seq( - * Rating(0, 1, 0.1f), - * Rating(0, 4, 0.4f), - * Rating(0, 7, 0.7f), - * Rating(3, 1, 3.1f), - * Rating(3, 4, 3.4f), - * Rating(3, 7, 3.7f), - * Rating(6, 1, 6.1f), - * Rating(6, 4, 6.4f), - * Rating(6, 7, 6.7f) - * ) - * }}} - * - * Say that we are using two partitions to calculate each factor matrix: - * - * {{{ - * val userPart = new ALSPartitioner(2) - * val itemPart = new ALSPartitioner(2) - * val blockRatings = partitionRatings(ratings, userPart, itemPart) - * }}} - * - * Ratings are mapped to partitions using the user/item IDs modulo the number of partitions. 
With - * two partitions, ratings with even-valued user IDs are shuffled to partition 0 while those with - * odd-valued user IDs are shuffled to partition 1: - * - * {{{ - * userInBlocks.collect() == Seq( - * 0 -> Seq( - * // Internally, the class stores the ratings in a more optimized format than - * // a sequence of `Rating`s, but for clarity we show it as such here. - * Rating(0, 1, 0.1f), - * Rating(0, 4, 0.4f), - * Rating(0, 7, 0.7f), - * Rating(6, 1, 6.1f), - * Rating(6, 4, 6.4f), - * Rating(6, 7, 6.7f) - * ), - * 1 -> Seq( - * Rating(3, 1, 3.1f), - * Rating(3, 4, 3.4f), - * Rating(3, 7, 3.7f) - * ) - * ) - * }}} - * - * Similarly, ratings with even-valued item IDs are shuffled to partition 0 while those with - * odd-valued item IDs are shuffled to partition 1: - * - * {{{ - * itemInBlocks.collect() == Seq( - * 0 -> Seq( - * Rating(0, 4, 0.4f), - * Rating(3, 4, 3.4f), - * Rating(6, 4, 6.4f) - * ), - * 1 -> Seq( - * Rating(0, 1, 0.1f), - * Rating(0, 7, 0.7f), - * Rating(3, 1, 3.1f), - * Rating(3, 7, 3.7f), - * Rating(6, 1, 6.1f), - * Rating(6, 7, 6.7f) - * ) - * ) - * }}} - * - * @param srcIds src ids (ordered) - * @param dstPtrs dst pointers. Elements in range [dstPtrs(i), dstPtrs(i+1)) of dst indices and - * ratings are associated with srcIds(i). - * @param dstEncodedIndices encoded dst indices - * @param ratings ratings - * @see [[LocalIndexEncoder]] - */ - private[recommendation] case class InBlock[@specialized(Int, Long) ID: ClassTag]( - srcIds: Array[ID], - dstPtrs: Array[Int], - dstEncodedIndices: Array[Int], - ratings: Array[Float]) { - /** Size of the block. */ - def size: Int = ratings.length - require(dstEncodedIndices.length == size) - require(dstPtrs.length == srcIds.length + 1) - } - - /** - * Initializes factors randomly given the in-link blocks. - * - * @param inBlocks in-link blocks - * @param rank rank - * @return initialized factor blocks - */ - private def initialize[ID]( - inBlocks: RDD[(Int, InBlock[ID])], - rank: Int, - seed: Long): RDD[(Int, FactorBlock)] = { - // Choose a unit vector uniformly at random from the unit sphere, but from the - // "first quadrant" where all elements are nonnegative. This can be done by choosing - // elements distributed as Normal(0,1) and taking the absolute value, and then normalizing. - // This appears to create factorizations that have a slightly better reconstruction - // (<1%) compared picking elements uniformly at random in [0,1]. - inBlocks.mapPartitions({ iter => - iter.map { - case (srcBlockId, inBlock) => - val random = new XORShiftRandom(byteswap64(seed ^ srcBlockId)) - val factors = Array.fill(inBlock.srcIds.length) { - val factor = Array.fill(rank)(random.nextGaussian().toFloat) - val nrm = blas.snrm2(rank, factor, 1) - blas.sscal(rank, 1.0f / nrm, factor, 1) - factor - } - (srcBlockId, factors) - } - }, preservesPartitioning = true) - } - - /** - * A rating block that contains src IDs, dst IDs, and ratings, stored in primitive arrays. - */ - private[recommendation] case class RatingBlock[@specialized(Int, Long) ID: ClassTag]( - srcIds: Array[ID], - dstIds: Array[ID], - ratings: Array[Float]) { - /** Size of the block. */ - def size: Int = srcIds.length - require(dstIds.length == srcIds.length) - require(ratings.length == srcIds.length) - } - - /** - * Builder for [[RatingBlock]]. `mutable.ArrayBuilder` is used to avoid boxing/unboxing. 
- */ - private[recommendation] class RatingBlockBuilder[@specialized(Int, Long) ID: ClassTag] - extends Serializable { - - private val srcIds = mutable.ArrayBuilder.make[ID] - private val dstIds = mutable.ArrayBuilder.make[ID] - private val ratings = mutable.ArrayBuilder.make[Float] - var size = 0 - - /** Adds a rating. */ - def add(r: Rating[ID]): this.type = { - size += 1 - srcIds += r.user - dstIds += r.item - ratings += r.rating - this - } - - /** Merges another [[RatingBlockBuilder]]. */ - def merge(other: RatingBlock[ID]): this.type = { - size += other.srcIds.length - srcIds ++= other.srcIds - dstIds ++= other.dstIds - ratings ++= other.ratings - this - } - - /** Builds a [[RatingBlock]]. */ - def build(): RatingBlock[ID] = { - RatingBlock[ID](srcIds.result(), dstIds.result(), ratings.result()) - } - } - - /** - * Groups an RDD of [[Rating]]s by the user partition and item partition to which each `Rating` - * maps according to the given partitioners. The returned pair RDD holds the ratings, encoded in - * a memory-efficient format but otherwise unchanged, keyed by the (user partition ID, item - * partition ID) pair. - * - * Performance note: This is an expensive operation that performs an RDD shuffle. - * - * Implementation note: This implementation produces the same result as the following but - * generates fewer intermediate objects: - * - * {{{ - * ratings.map { r => - * ((srcPart.getPartition(r.user), dstPart.getPartition(r.item)), r) - * }.aggregateByKey(new RatingBlockBuilder)( - * seqOp = (b, r) => b.add(r), - * combOp = (b0, b1) => b0.merge(b1.build())) - * .mapValues(_.build()) - * }}} - * - * @param ratings raw ratings - * @param srcPart partitioner for src IDs - * @param dstPart partitioner for dst IDs - * @return an RDD of rating blocks in the form of ((srcBlockId, dstBlockId), ratingBlock) - */ - private def partitionRatings[ID: ClassTag]( - ratings: RDD[Rating[ID]], - srcPart: Partitioner, - dstPart: Partitioner): RDD[((Int, Int), RatingBlock[ID])] = { - val numPartitions = srcPart.numPartitions * dstPart.numPartitions - ratings.mapPartitions { iter => - val builders = Array.fill(numPartitions)(new RatingBlockBuilder[ID]) - iter.flatMap { r => - val srcBlockId = srcPart.getPartition(r.user) - val dstBlockId = dstPart.getPartition(r.item) - val idx = srcBlockId + srcPart.numPartitions * dstBlockId - val builder = builders(idx) - builder.add(r) - if (builder.size >= 2048) { // 2048 * (3 * 4) = 24k - builders(idx) = new RatingBlockBuilder - Iterator.single(((srcBlockId, dstBlockId), builder.build())) - } else { - Iterator.empty - } - } ++ { - builders.view.zipWithIndex.filter(_._1.size > 0).map { case (block, idx) => - val srcBlockId = idx % srcPart.numPartitions - val dstBlockId = idx / srcPart.numPartitions - ((srcBlockId, dstBlockId), block.build()) - } - } - }.groupByKey().mapValues { blocks => - val builder = new RatingBlockBuilder[ID] - blocks.foreach(builder.merge) - builder.build() - }.setName("ratingBlocks") - } - - /** - * Builder for uncompressed in-blocks of (srcId, dstEncodedIndex, rating) tuples. - * - * @param encoder encoder for dst indices - */ - private[recommendation] class UncompressedInBlockBuilder[@specialized(Int, Long) ID: ClassTag]( - encoder: LocalIndexEncoder)( - implicit ord: Ordering[ID]) { - - private val srcIds = mutable.ArrayBuilder.make[ID] - private val dstEncodedIndices = mutable.ArrayBuilder.make[Int] - private val ratings = mutable.ArrayBuilder.make[Float] - - /** - * Adds a dst block of (srcId, dstLocalIndex, rating) tuples. 
- * - * @param dstBlockId dst block ID - * @param srcIds original src IDs - * @param dstLocalIndices dst local indices - * @param ratings ratings - */ - def add( - dstBlockId: Int, - srcIds: Array[ID], - dstLocalIndices: Array[Int], - ratings: Array[Float]): this.type = { - val sz = srcIds.length - require(dstLocalIndices.length == sz) - require(ratings.length == sz) - this.srcIds ++= srcIds - this.ratings ++= ratings - var j = 0 - while (j < sz) { - this.dstEncodedIndices += encoder.encode(dstBlockId, dstLocalIndices(j)) - j += 1 - } - this - } - - /** Builds a [[UncompressedInBlock]]. */ - def build(): UncompressedInBlock[ID] = { - new UncompressedInBlock(srcIds.result(), dstEncodedIndices.result(), ratings.result()) - } - } - - /** - * A block of (srcId, dstEncodedIndex, rating) tuples stored in primitive arrays. - */ - private[recommendation] class UncompressedInBlock[@specialized(Int, Long) ID: ClassTag]( - val srcIds: Array[ID], - val dstEncodedIndices: Array[Int], - val ratings: Array[Float])( - implicit ord: Ordering[ID]) { - - /** Size the of block. */ - def length: Int = srcIds.length - - /** - * Compresses the block into an `InBlock`. The algorithm is the same as converting a sparse - * matrix from coordinate list (COO) format into compressed sparse column (CSC) format. - * Sorting is done using Spark's built-in Timsort to avoid generating too many objects. - */ - def compress(): InBlock[ID] = { - val sz = length - assert(sz > 0, "Empty in-link block should not exist.") - sort() - val uniqueSrcIdsBuilder = mutable.ArrayBuilder.make[ID] - val dstCountsBuilder = mutable.ArrayBuilder.make[Int] - var preSrcId = srcIds(0) - uniqueSrcIdsBuilder += preSrcId - var curCount = 1 - var i = 1 - while (i < sz) { - val srcId = srcIds(i) - if (srcId != preSrcId) { - uniqueSrcIdsBuilder += srcId - dstCountsBuilder += curCount - preSrcId = srcId - curCount = 0 - } - curCount += 1 - i += 1 - } - dstCountsBuilder += curCount - val uniqueSrcIds = uniqueSrcIdsBuilder.result() - val numUniqueSrdIds = uniqueSrcIds.length - val dstCounts = dstCountsBuilder.result() - val dstPtrs = new Array[Int](numUniqueSrdIds + 1) - var sum = 0 - i = 0 - while (i < numUniqueSrdIds) { - sum += dstCounts(i) - i += 1 - dstPtrs(i) = sum - } - InBlock(uniqueSrcIds, dstPtrs, dstEncodedIndices, ratings) - } - - private def sort(): Unit = { - val sz = length - // Since there might be interleaved log messages, we insert a unique id for easy pairing. - val sortId = Utils.random.nextInt() - logDebug(s"Start sorting an uncompressed in-block of size $sz. (sortId = $sortId)") - val start = System.nanoTime() - val sorter = new Sorter(new UncompressedInBlockSort[ID]) - sorter.sort(this, 0, length, Ordering[KeyWrapper[ID]]) - val duration = (System.nanoTime() - start) / 1e9 - logDebug(s"Sorting took $duration seconds. (sortId = $sortId)") - } - } - - /** - * A wrapper that holds a primitive key. - * - * @see [[UncompressedInBlockSort]] - */ - private class KeyWrapper[@specialized(Int, Long) ID: ClassTag]( - implicit ord: Ordering[ID]) extends Ordered[KeyWrapper[ID]] { - - var key: ID = _ - - override def compare(that: KeyWrapper[ID]): Int = { - ord.compare(key, that.key) - } - - def setKey(key: ID): this.type = { - this.key = key - this - } - } - - /** - * [[SortDataFormat]] of [[UncompressedInBlock]] used by [[Sorter]]. 
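- * The sorter moves the three parallel primitive arrays (`srcIds`, `dstEncodedIndices`, `ratings`)
- * in lockstep, keyed by `srcId`, which is what lets `UncompressedInBlock.compress()` later
- * run-length encode the sorted src ids into `dstPtrs`.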
- */ - private class UncompressedInBlockSort[@specialized(Int, Long) ID: ClassTag]( - implicit ord: Ordering[ID]) - extends SortDataFormat[KeyWrapper[ID], UncompressedInBlock[ID]] { - - override def newKey(): KeyWrapper[ID] = new KeyWrapper() - - override def getKey( - data: UncompressedInBlock[ID], - pos: Int, - reuse: KeyWrapper[ID]): KeyWrapper[ID] = { - if (reuse == null) { - new KeyWrapper().setKey(data.srcIds(pos)) - } else { - reuse.setKey(data.srcIds(pos)) - } - } - - override def getKey( - data: UncompressedInBlock[ID], - pos: Int): KeyWrapper[ID] = { - getKey(data, pos, null) - } - - private def swapElements[@specialized(Int, Float) T]( - data: Array[T], - pos0: Int, - pos1: Int): Unit = { - val tmp = data(pos0) - data(pos0) = data(pos1) - data(pos1) = tmp - } - - override def swap(data: UncompressedInBlock[ID], pos0: Int, pos1: Int): Unit = { - swapElements(data.srcIds, pos0, pos1) - swapElements(data.dstEncodedIndices, pos0, pos1) - swapElements(data.ratings, pos0, pos1) - } - - override def copyRange( - src: UncompressedInBlock[ID], - srcPos: Int, - dst: UncompressedInBlock[ID], - dstPos: Int, - length: Int): Unit = { - System.arraycopy(src.srcIds, srcPos, dst.srcIds, dstPos, length) - System.arraycopy(src.dstEncodedIndices, srcPos, dst.dstEncodedIndices, dstPos, length) - System.arraycopy(src.ratings, srcPos, dst.ratings, dstPos, length) - } - - override def allocate(length: Int): UncompressedInBlock[ID] = { - new UncompressedInBlock( - new Array[ID](length), new Array[Int](length), new Array[Float](length)) - } - - override def copyElement( - src: UncompressedInBlock[ID], - srcPos: Int, - dst: UncompressedInBlock[ID], - dstPos: Int): Unit = { - dst.srcIds(dstPos) = src.srcIds(srcPos) - dst.dstEncodedIndices(dstPos) = src.dstEncodedIndices(srcPos) - dst.ratings(dstPos) = src.ratings(srcPos) - } - } - - /** - * Creates in-blocks and out-blocks from rating blocks. 
- * - * @param prefix prefix for in/out-block names - * @param ratingBlocks rating blocks - * @param srcPart partitioner for src IDs - * @param dstPart partitioner for dst IDs - * @return (in-blocks, out-blocks) - */ - private def makeBlocks[ID: ClassTag]( - prefix: String, - ratingBlocks: RDD[((Int, Int), RatingBlock[ID])], - srcPart: Partitioner, - dstPart: Partitioner, - storageLevel: StorageLevel)( - implicit srcOrd: Ordering[ID]): (RDD[(Int, InBlock[ID])], RDD[(Int, OutBlock)]) = { - val inBlocks = ratingBlocks.map { - case ((srcBlockId, dstBlockId), RatingBlock(srcIds, dstIds, ratings)) => - // The implementation is a faster version of - // val dstIdToLocalIndex = dstIds.toSet.toSeq.sorted.zipWithIndex.toMap - val start = System.nanoTime() - val dstIdSet = new OpenHashSet[ID](1 << 20) - dstIds.foreach(dstIdSet.add) - val sortedDstIds = new Array[ID](dstIdSet.size) - var i = 0 - var pos = dstIdSet.nextPos(0) - while (pos != -1) { - sortedDstIds(i) = dstIdSet.getValue(pos) - pos = dstIdSet.nextPos(pos + 1) - i += 1 - } - assert(i == dstIdSet.size) - Sorting.quickSort(sortedDstIds) - val dstIdToLocalIndex = new OpenHashMap[ID, Int](sortedDstIds.length) - i = 0 - while (i < sortedDstIds.length) { - dstIdToLocalIndex.update(sortedDstIds(i), i) - i += 1 - } - logDebug( - "Converting to local indices took " + (System.nanoTime() - start) / 1e9 + " seconds.") - val dstLocalIndices = dstIds.map(dstIdToLocalIndex.apply) - (srcBlockId, (dstBlockId, srcIds, dstLocalIndices, ratings)) - }.groupByKey(new ALSPartitioner(srcPart.numPartitions)) - .mapValues { iter => - val builder = - new UncompressedInBlockBuilder[ID](new LocalIndexEncoder(dstPart.numPartitions)) - iter.foreach { case (dstBlockId, srcIds, dstLocalIndices, ratings) => - builder.add(dstBlockId, srcIds, dstLocalIndices, ratings) - } - builder.build().compress() - }.setName(prefix + "InBlocks") - .persist(storageLevel) - val outBlocks = inBlocks.mapValues { case InBlock(srcIds, dstPtrs, dstEncodedIndices, _) => - val encoder = new LocalIndexEncoder(dstPart.numPartitions) - val activeIds = Array.fill(dstPart.numPartitions)(mutable.ArrayBuilder.make[Int]) - var i = 0 - val seen = new Array[Boolean](dstPart.numPartitions) - while (i < srcIds.length) { - var j = dstPtrs(i) - ju.Arrays.fill(seen, false) - while (j < dstPtrs(i + 1)) { - val dstBlockId = encoder.blockId(dstEncodedIndices(j)) - if (!seen(dstBlockId)) { - activeIds(dstBlockId) += i // add the local index in this out-block - seen(dstBlockId) = true - } - j += 1 - } - i += 1 - } - activeIds.map { x => - x.result() - } - }.setName(prefix + "OutBlocks") - .persist(storageLevel) - (inBlocks, outBlocks) - } - - /** - * Compute dst factors by constructing and solving least square problems. 
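- *
- * For each dst in-block, the needed src factor vectors are shipped according to the src
- * out-blocks, matched to their ratings through the in-block's encoded indices, and one
- * regularized normal equation per dst id is assembled and solved.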
- * - * @param srcFactorBlocks src factors - * @param srcOutBlocks src out-blocks - * @param dstInBlocks dst in-blocks - * @param rank rank - * @param regParam regularization constant - * @param srcEncoder encoder for src local indices - * @param implicitPrefs whether to use implicit preference - * @param alpha the alpha constant in the implicit preference formulation - * @param solver solver for least squares problems - * @return dst factors - */ - private def computeFactors[ID]( - srcFactorBlocks: RDD[(Int, FactorBlock)], - srcOutBlocks: RDD[(Int, OutBlock)], - dstInBlocks: RDD[(Int, InBlock[ID])], - rank: Int, - regParam: Double, - srcEncoder: LocalIndexEncoder, - implicitPrefs: Boolean = false, - alpha: Double = 1.0, - solver: LeastSquaresNESolver): RDD[(Int, FactorBlock)] = { - val numSrcBlocks = srcFactorBlocks.partitions.length - val YtY = if (implicitPrefs) Some(computeYtY(srcFactorBlocks, rank)) else None - val srcOut = srcOutBlocks.join(srcFactorBlocks).flatMap { - case (srcBlockId, (srcOutBlock, srcFactors)) => - srcOutBlock.view.zipWithIndex.map { case (activeIndices, dstBlockId) => - (dstBlockId, (srcBlockId, activeIndices.map(idx => srcFactors(idx)))) - } - } - val merged = srcOut.groupByKey(new ALSPartitioner(dstInBlocks.partitions.length)) - - // SPARK-28927: Nondeterministic RDDs causes inconsistent in/out blocks in case of rerun. - // It can cause runtime error when matching in/out user/item blocks. - val isBlockRDDNondeterministic = - dstInBlocks.outputDeterministicLevel == DeterministicLevel.INDETERMINATE || - srcOutBlocks.outputDeterministicLevel == DeterministicLevel.INDETERMINATE - - dstInBlocks.join(merged).mapValues { - case (InBlock(dstIds, srcPtrs, srcEncodedIndices, ratings), srcFactors) => - val sortedSrcFactors = new Array[FactorBlock](numSrcBlocks) - srcFactors.foreach { case (srcBlockId, factors) => - sortedSrcFactors(srcBlockId) = factors - } - val dstFactors = new Array[Array[Float]](dstIds.length) - var j = 0 - val ls = new NormalEquation(rank) - while (j < dstIds.length) { - ls.reset() - if (implicitPrefs) { - ls.merge(YtY.get) - } - var i = srcPtrs(j) - var numExplicits = 0 - while (i < srcPtrs(j + 1)) { - val encoded = srcEncodedIndices(i) - val blockId = srcEncoder.blockId(encoded) - val localIndex = srcEncoder.localIndex(encoded) - var srcFactor: Array[Float] = null - try { - srcFactor = sortedSrcFactors(blockId)(localIndex) - } catch { - case a: ArrayIndexOutOfBoundsException if isBlockRDDNondeterministic => - val errMsg = "A failure detected when matching In/Out blocks of users/items. " + - "Because at least one In/Out block RDD is found to be nondeterministic now, " + - "the issue is probably caused by nondeterministic input data. You can try to " + - "checkpoint training data to make it deterministic. If you do `repartition` + " + - "`sample` or `randomSplit`, you can also try to sort it before `sample` or " + - "`randomSplit` to make it deterministic." - throw new SparkException(errMsg, a) - } - val rating = ratings(i) - if (implicitPrefs) { - // Extension to the original paper to handle rating < 0. confidence is a function - // of |rating| instead so that it is never negative. c1 is confidence - 1. - val c1 = alpha * math.abs(rating) - // For rating <= 0, the corresponding preference is 0. So the second argument of add - // is only there for rating > 0. 
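-              // Implicit-feedback view: confidence c = 1 + c1 with c1 = alpha * |r|, and
-              // preference p = 1 for r > 0, else 0. The pre-merged YtY already contributes the
-              // baseline weight of 1 for every factor, so add() only needs the extra weight c1 on
-              // a * a^T and b = c * p (1 + c1 when r > 0, otherwise 0) on the right-hand side.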
- if (rating > 0.0) { - numExplicits += 1 - } - ls.add(srcFactor, if (rating > 0.0) 1.0 + c1 else 0.0, c1) - } else { - ls.add(srcFactor, rating) - numExplicits += 1 - } - i += 1 - } - // Weight lambda by the number of explicit ratings based on the ALS-WR paper. - dstFactors(j) = solver.solve(ls, numExplicits * regParam) - j += 1 - } - dstFactors - } - } - - /** - * Computes the Gramian matrix of user or item factors, which is only used in implicit preference. - * Caching of the input factors is handled in [[ALS#train]]. - */ - private def computeYtY(factorBlocks: RDD[(Int, FactorBlock)], rank: Int): NormalEquation = { - factorBlocks.values.aggregate(new NormalEquation(rank))( - seqOp = (ne, factors) => { - factors.foreach(ne.add(_, 0.0)) - ne - }, - combOp = (ne1, ne2) => ne1.merge(ne2)) - } - - /** - * Encoder for storing (blockId, localIndex) into a single integer. - * - * We use the leading bits (including the sign bit) to store the block id and the rest to store - * the local index. This is based on the assumption that users/items are approximately evenly - * partitioned. With this assumption, we should be able to encode two billion distinct values. - * - * @param numBlocks number of blocks - */ - private[recommendation] class LocalIndexEncoder(numBlocks: Int) extends Serializable { - - require(numBlocks > 0, s"numBlocks must be positive but found $numBlocks.") - - private[this] final val numLocalIndexBits = - math.min(java.lang.Integer.numberOfLeadingZeros(numBlocks - 1), 31) - private[this] final val localIndexMask = (1 << numLocalIndexBits) - 1 - - /** Encodes a (blockId, localIndex) into a single integer. */ - def encode(blockId: Int, localIndex: Int): Int = { - require(blockId < numBlocks) - require((localIndex & ~localIndexMask) == 0) - (blockId << numLocalIndexBits) | localIndex - } - - /** Gets the block id from an encoded index. */ - @inline - def blockId(encoded: Int): Int = { - encoded >>> numLocalIndexBits - } - - /** Gets the local index from an encoded index. */ - @inline - def localIndex(encoded: Int): Int = { - encoded & localIndexMask - } - } - - /** - * Partitioner used by ALS. We require that getPartition is a projection. That is, for any key k, - * we have getPartition(getPartition(k)) = getPartition(k). Since the default HashPartitioner - * satisfies this requirement, we simply use a type alias here. - */ - private[recommendation] type ALSPartitioner = org.apache.spark.HashPartitioner - - /** - * Private function to clean up all of the shuffles files from the dependencies and their parents. - */ - private[spark] def cleanShuffleDependencies[T]( - sc: SparkContext, - deps: Seq[Dependency[_]], - blocking: Boolean = false): Unit = { - // If there is no reference tracking we skip clean up. - sc.cleaner.foreach { cleaner => - /** - * Clean the shuffles & all of its parents. 
- */ - def cleanEagerly(dep: Dependency[_]): Unit = { - if (dep.isInstanceOf[ShuffleDependency[_, _, _]]) { - val shuffleId = dep.asInstanceOf[ShuffleDependency[_, _, _]].shuffleId - cleaner.doCleanupShuffle(shuffleId, blocking) - } - val rdd = dep.rdd - val rddDeps = rdd.dependencies - if (rdd.getStorageLevel == StorageLevel.NONE && rddDeps != null) { - rddDeps.foreach(cleanEagerly) - } - } - deps.foreach(cleanEagerly) - } - } -} diff --git a/mllib-dal/src/main/scala/org/apache/spark/ml/recommendation/ALSDALImpl.scala b/mllib-dal/src/main/scala/org/apache/spark/ml/recommendation/ALSDALImpl.scala deleted file mode 100644 index bcb95ca1f..000000000 --- a/mllib-dal/src/main/scala/org/apache/spark/ml/recommendation/ALSDALImpl.scala +++ /dev/null @@ -1,387 +0,0 @@ -package org.apache.spark.ml.recommendation - -import com.intel.daal.data_management.data.CSRNumericTable.Indexing -import org.apache.spark.rdd.{ExecutorInProcessCoalescePartitioner, RDD} - -import scala.reflect.ClassTag -import com.intel.daal.data_management.data.{CSRNumericTable, HomogenNumericTable, RowMergedNumericTable, Matrix => DALMatrix} -import com.intel.daal.services.DaalContext -import org.apache.spark.Partitioner -import org.apache.spark.internal.Logging -import org.apache.spark.ml.recommendation.ALS.Rating -import org.apache.spark.ml.util._ - -import java.nio.{ByteBuffer, ByteOrder} -import scala.collection.mutable.ArrayBuffer -//import java.nio.DoubleBuffer -import java.nio.FloatBuffer - -class ALSDataPartitioner(blocks: Int, itemsInBlock: Long) - extends Partitioner { - def numPartitions: Int = blocks - def getPartition(key: Any): Int = { - val k = key.asInstanceOf[Long] - // itemsInBlock = numItems / partitions - // remaining records will belog to the last partition - // 21 => 5, 5, 5, 6 - // 46 => 11, 11, 11, 13 - math.min((k / itemsInBlock).toInt, blocks-1) - } -} - -class ALSDALImpl[@specialized(Int, Long) ID: ClassTag]( - data: RDD[Rating[ID]], - nFactors: Int, - maxIter: Int, - regParam: Double, - alpha: Double, - seed: Long, -) extends Serializable with Logging { - - // Rating struct size is size of Long+Long+Float - val RATING_SIZE = 8 + 8 + 4 - - // Return Map partitionId -> (ratingsNum, csrRowNum, rowOffset) - private def getRatingsPartitionInfo(data: RDD[Rating[ID]]): Map[Int, (Int, Int, Int)] = { - val collectd = data.mapPartitionsWithIndex { case (index: Int, it: Iterator[Rating[ID]]) => - var ratingsNum = 0 - var s = Set[ID]() - it.foreach { v => - s += v.user - ratingsNum += 1 - } - Iterator((index, (ratingsNum, s.count(_ => true)))) - }.collect - - var ret = Map[Int, (Int, Int, Int)]() - var rowOffset = 0 - collectd.foreach { v => - val partitionId = v._1 - val ratingsNum = v._2._1 - val csrRowNum = v._2._2 - ret += ( partitionId -> (ratingsNum, csrRowNum, rowOffset)) - rowOffset = rowOffset + csrRowNum - } - - ret - } - - private def ratingsToCSRNumericTables(ratings: RDD[Rating[ID]], - nVectors: Long, nFeatures: Long, nBlocks: Long): RDD[CSRNumericTable] = { - -// val rowSortedRatings = ratings.sortBy(_.user.toString.toLong) - -// val itemsInBlock = (nFeatures + nBlocks - 1) / nBlocks - val itemsInBlock = nFeatures / nBlocks -// val rowSortedGrouped = rowSortedRatings.groupBy(value => value.user.toString.toLong / itemsInBlock).flatMap(_._2) - val rowSortedGrouped = ratings - // Transpose the dataset - .map { p => - Rating(p.item, p.user, p.rating) - } - .groupBy(value => value.user.toString.toLong) - .partitionBy(new ALSDataPartitioner(nBlocks.toInt, itemsInBlock)) - .flatMap(_._2).mapPartitions { p 
=> - p.toArray.sortBy(_.user.toString.toLong).toIterator - } - - println("rowSortedGrouped partition number: ", rowSortedGrouped.getNumPartitions) - - // rowSortedGrouped.mapPartitionsWithIndex { case (partitionId, partition) => -// println("partitionId", partitionId) -// partition.foreach { p => -// println(p.user, p.item, p.rating) } -// Iterator(partitionId) -// }.collect() - - val ratingsPartitionInfo = getRatingsPartitionInfo(rowSortedGrouped) - println("ratingsPartitionInfo:", ratingsPartitionInfo) - - rowSortedGrouped.mapPartitionsWithIndex { case (partitionId, partition) => - val ratingsNum = ratingsPartitionInfo(partitionId)._1 - val csrRowNum = ratingsPartitionInfo(partitionId)._2 - val values = Array.fill(ratingsNum) { 0.0f } - val columnIndices = Array.fill(ratingsNum) { 0L } - val rowOffsets = ArrayBuffer[Long](1L) - - - var index = 0 - var curRow = 0L - // Each partition converted to one CSRNumericTable - partition.foreach { p => - // Modify row index for each partition (start from 0) - val row = p.user.toString.toLong - ratingsPartitionInfo(partitionId)._3 - val column = p.item.toString.toLong - val rating = p.rating - - values(index) = rating - // one-based index - columnIndices(index) = column + 1 - - if (row > curRow) { - curRow = row - // one-based index - rowOffsets += index + 1 - } - - index = index + 1 - } - // one-based row index - rowOffsets += index+1 - - println("PartitionId:", partitionId) - println("csrRowNum", csrRowNum) -// println("rowOffsets", rowOffsets.mkString(",")) -// println("columnIndices", columnIndices.mkString(",")) -// println("values", values.mkString(",")) - - val contextLocal = new DaalContext() - - println("ALSDALImpl: Loading native libraries ..." ) - LibLoader.loadLibraries() - - val cTable = OneDAL.cNewCSRNumericTable(values, columnIndices, rowOffsets.toArray, nVectors, csrRowNum) - val table = new CSRNumericTable(contextLocal, cTable) -// table.pack() - - println("Input dimensions:", table.getNumberOfRows, table.getNumberOfColumns) - - // There is a bug https://github.com/oneapi-src/oneDAL/pull/1288, - // printNumericTable can't print correct result for CSRNumericTable, use C++ printNumericTable - // Service.printNumericTable("Input: ", table) - - Iterator(table) - }.cache() - } - -// def factorsToRDD(cUsersFactorsNumTab: Long, cItemsFactorsNumTab: Long) -// :(RDD[(ID, Array[Float])], RDD[(ID, Array[Float])]) = { -// val usersFactorsNumTab = OneDAL.makeNumericTable(cUsersFactorsNumTab) -// val itemsFactorsNumTab = OneDAL.makeNumericTable(cItemsFactorsNumTab) -// -// Service.printNumericTable("usersFactorsNumTab", usersFactorsNumTab) -// Service.printNumericTable("itemsFactorsNumTab", itemsFactorsNumTab) -// -// null -// } - - def ratingsToByteBuffer(ratings: Array[Rating[ID]]): ByteBuffer = { -// println("ratings len", ratings.length) - - val buffer= ByteBuffer.allocateDirect(ratings.length*(8+8+4)) - // Use little endian - buffer.order(ByteOrder.LITTLE_ENDIAN) - ratings.foreach { rating => - buffer.putLong(rating.user.toString.toLong) - buffer.putLong(rating.item.toString.toLong) - buffer.putFloat(rating.rating) - } - buffer - } - - def run(): (RDD[(ID, Array[Float])], RDD[(ID, Array[Float])]) = { - val executorNum = Utils.sparkExecutorNum(data.sparkContext) - val executorCores = Utils.sparkExecutorCores() - - val nFeatures = data.max()(new Ordering[Rating[ID]]() { - override def compare(x: Rating[ID], y: Rating[ID]): Int = - Ordering[Long].compare(x.item.toString.toLong, y.item.toString.toLong) - }).item.toString.toLong + 1 - - val 
nVectors = data.max()(new Ordering[Rating[ID]]() { - override def compare(x: Rating[ID], y: Rating[ID]): Int = - Ordering[Long].compare(x.user.toString.toLong, y.user.toString.toLong) - }).user.toString.toLong + 1 - -// val largestItems = data.sortBy(_.item.toString.toLong, ascending = false).take(1) -// val nFeatures = largestItems(0).item.toString.toLong + 1 - -// val largestUsers = data.sortBy(_.user.toString.toLong, ascending = false).take(1) -// val nVectors = largestUsers(0).user.toString.toLong + 1 - - val nBlocks = executorNum - -// val nRatings = data.count() - - logInfo(s"ALSDAL fit using $executorNum Executors for $nVectors vectors and $nFeatures features") - - val numericTables = data.repartition(executorNum).setName("Repartitioned for conversion").cache() - - val executorIPAddress = Utils.sparkFirstExecutorIP(numericTables.sparkContext) - val kvsIP = numericTables.sparkContext.conf.get("spark.oap.mllib.oneccl.kvs.ip", executorIPAddress) - - val kvsPortDetected = Utils.checkExecutorAvailPort(numericTables, kvsIP) - val kvsPort = numericTables.sparkContext.conf.getInt("spark.oap.mllib.oneccl.kvs.port", kvsPortDetected) - - val kvsIPPort = kvsIP+"_"+kvsPort - - val results = numericTables - // Transpose the dataset - .map { p => - Rating(p.item, p.user, p.rating) } - .mapPartitionsWithIndex { (rank, iter) => - val context = new DaalContext() - println("ALSDALImpl: Loading libMLlibDAL.so" ) - LibLoader.loadLibraries() - - OneCCL.init(executorNum, rank, kvsIPPort) - val rankId = OneCCL.rankID() - - println("rankId", rankId, "nUsers", nVectors, "nItems", nFeatures) - - val buffer = ratingsToByteBuffer(iter.toArray) - val bufferInfo = new ALSPartitionInfo - val shuffledBuffer = cShuffleData(buffer, nFeatures.toInt, nBlocks, bufferInfo) - - val table = bufferToCSRNumericTable(shuffledBuffer, bufferInfo, nVectors.toInt, nFeatures.toInt, nBlocks, rankId) - - val result = new ALSResult() - cDALImplictALS( - table.getCNumericTable, nUsers = nVectors, - nFactors, maxIter, regParam, alpha, - executorNum, - executorCores, - rankId, - result - ) - Iterator(result) - }.cache() - -// results.foreach { p => -//// val usersFactorsNumTab = OneDAL.makeNumericTable(p.cUsersFactorsNumTab) -//// println("foreach", p.cUsersFactorsNumTab, p.cItemsFactorsNumTab) -// println("result", p.rankId, p.cUserOffset, p.cItemOffset); -// } - -// val usersFactorsRDD = results.mapPartitionsWithIndex { (index: Int, partiton: Iterator[ALSResult]) => -// partiton.foreach { p => -// val usersFactorsNumTab = OneDAL.makeNumericTable(p.cUsersFactorsNumTab) -// Service.printNumericTable("usersFactorsNumTab", usersFactorsNumTab) -// } -// Iterator() -// }.collect() - - val usersFactorsRDD = results.mapPartitionsWithIndex { (index: Int, partiton: Iterator[ALSResult]) => - val ret = partiton.flatMap { p => - val userOffset = p.cUserOffset.toInt - val usersFactorsNumTab = OneDAL.makeNumericTable(p.cUsersFactorsNumTab) - val nRows = usersFactorsNumTab.getNumberOfRows.toInt - val nCols = usersFactorsNumTab.getNumberOfColumns.toInt - var buffer = FloatBuffer.allocate(nCols * nRows) - // should use returned buffer - buffer = usersFactorsNumTab.getBlockOfRows(0, nRows, buffer) - (0 until nRows).map { index => - val array = Array.fill(nCols){0.0f} - buffer.get(array, 0, nCols) - ((index+userOffset).asInstanceOf[ID], array) - }.toIterator - } - ret - }.setName("userFactors").cache() - - val itemsFactorsRDD = results.mapPartitionsWithIndex { (index: Int, partiton: Iterator[ALSResult]) => - val ret = partiton.flatMap { p => - val 
itemOffset = p.cItemOffset.toInt - val itemsFactorsNumTab = OneDAL.makeNumericTable(p.cItemsFactorsNumTab) - val nRows = itemsFactorsNumTab.getNumberOfRows.toInt - val nCols = itemsFactorsNumTab.getNumberOfColumns.toInt - var buffer = FloatBuffer.allocate(nCols * nRows) - // should use returned buffer - buffer = itemsFactorsNumTab.getBlockOfRows(0, nRows, buffer) - (0 until nRows).map { index => - val array = Array.fill(nCols){0.0f} - buffer.get(array, 0, nCols) - ((index+itemOffset).asInstanceOf[ID], array) - }.toIterator - } - ret - }.setName("itemFactors").cache() - - usersFactorsRDD.count() - itemsFactorsRDD.count() - -// usersFactorsRDD.foreach { case (id, array) => -// println("usersFactorsRDD", id, array.mkString(", ")) -// } -// -// itemsFactorsRDD.foreach { case (id, array) => -// println("itemsFactorsRDD", id, array.mkString(", ")) -// } - - (usersFactorsRDD, itemsFactorsRDD) - } - - private def getPartitionOffset(partitionId: Int, nRatings: Int, nBlocks: Int): Int = { - require(partitionId >=0 && partitionId < nBlocks) - val itemsInBlock = nRatings / nBlocks - return partitionId * itemsInBlock - } - - private def bufferToCSRNumericTable(buffer: ByteBuffer, info: ALSPartitionInfo, - nVectors: Int, nFeatures: Int, nBlocks: Int, rankId: Int): CSRNumericTable = { - // Use little endian - buffer.order(ByteOrder.LITTLE_ENDIAN) - - val ratingsNum = info.ratingsNum - val csrRowNum = info.csrRowNum - val values = Array.fill(ratingsNum) { 0.0f } - val columnIndices = Array.fill(ratingsNum) { 0L } - val rowOffsets = ArrayBuffer[Long](1L) - - var index = 0 - var curRow = 0L - // Each partition converted to one CSRNumericTable - for (i <- 0 until ratingsNum) { - // Modify row index for each partition (start from 0) - val row = buffer.getLong(i*RATING_SIZE) - getPartitionOffset(rankId, nFeatures, nBlocks) - val column = buffer.getLong(i*RATING_SIZE+8) - val rating = buffer.getFloat(i*RATING_SIZE+16) - - values(index) = rating - // one-based index - columnIndices(index) = column + 1 - - if (row > curRow) { - curRow = row - // one-based index - rowOffsets += index + 1 - } - - index = index + 1 - } - // one-based row index - rowOffsets += index+1 - -// println("rankId:", rankId) -// println("csrRowNum", csrRowNum) - -// println(rowOffsets.mkString(" ")) -// println(columnIndices.mkString(" ")) -// println(values.mkString(" ")) - - val contextLocal = new DaalContext() - val cTable = OneDAL.cNewCSRNumericTable(values, columnIndices, rowOffsets.toArray, nVectors, csrRowNum) - val table = new CSRNumericTable(contextLocal, cTable) - - println("Input dimensions:", table.getNumberOfRows, table.getNumberOfColumns) -// Service.printNumericTable("Input NumericTable", table) - - table - } - - // Single entry to call Implict ALS DAL backend - @native private def cDALImplictALS(data: Long, - nUsers: Long, - nFactors: Int, - maxIter: Int, - regParam: Double, - alpha: Double, - executor_num: Int, - executor_cores: Int, - rankId: Int, - result: ALSResult): Long - @native private def cShuffleData(data: ByteBuffer, - nTotalKeys: Int, - nBlocks: Int, - info: ALSPartitionInfo): ByteBuffer -} diff --git a/mllib-dal/src/main/scala/org/apache/spark/ml/util/OneCCL.scala b/mllib-dal/src/main/scala/org/apache/spark/ml/util/OneCCL.scala index 7581a1003..4c38a9bdc 100644 --- a/mllib-dal/src/main/scala/org/apache/spark/ml/util/OneCCL.scala +++ b/mllib-dal/src/main/scala/org/apache/spark/ml/util/OneCCL.scala @@ -17,32 +17,60 @@ package org.apache.spark.ml.util -import org.apache.spark.internal.Logging +import 
org.apache.spark.SparkConf -object OneCCL extends Logging { +object OneCCL { var cclParam = new CCLParam() + var kvsIPPort = sys.env.getOrElse("CCL_KVS_IP_PORT", "") + var worldSize = sys.env.getOrElse("CCL_WORLD_SIZE", "1").toInt + + var KVS_PORT = 51234 + + private def checkEnv() { + val altTransport = sys.env.getOrElse("CCL_ATL_TRANSPORT", "") + val pmType = sys.env.getOrElse("CCL_PM_TYPE", "") + val ipExchange = sys.env.getOrElse("CCL_KVS_IP_EXCHANGE", "") + + assert(altTransport == "ofi") + assert(pmType == "resizable") + assert(ipExchange == "env") + assert(kvsIPPort != "") + + } + // Run on Executor - def setExecutorEnv(): Unit = { + def setExecutorEnv(executor_num: Int, ip: String, port: Int): Unit = { + // Work around ccl by passings in a spark.executorEnv.CCL_KVS_IP_PORT. + val ccl_kvs_ip_port = sys.env.getOrElse("CCL_KVS_IP_PORT", s"${ip}_${port}") + + println(s"oneCCL: Initializing with CCL_KVS_IP_PORT: $ccl_kvs_ip_port") + + setEnv("CCL_PM_TYPE", "resizable") setEnv("CCL_ATL_TRANSPORT","ofi") + setEnv("CCL_ATL_TRANSPORT_PATH", LibLoader.getTempSubDir()) + setEnv("CCL_KVS_IP_EXCHANGE","env") + setEnv("CCL_KVS_IP_PORT", ccl_kvs_ip_port) + setEnv("CCL_WORLD_SIZE", s"${executor_num}") // Uncomment this if you whant to debug oneCCL // setEnv("CCL_LOG_LEVEL", "2") } - def init(executor_num: Int, rank: Int, ip_port: String) = { - - setExecutorEnv() + def init(executor_num: Int, ip: String, port: Int) = { - logInfo(s"Initializing with IP_PORT: ${ip_port}") + setExecutorEnv(executor_num, ip, port) // cclParam is output from native code - c_init(executor_num, rank, ip_port, cclParam) + c_init(cclParam) // executor number should equal to oneCCL world size assert(executor_num == cclParam.commSize, "executor number should equal to oneCCL world size") - logInfo(s"Initialized with executorNum: $executor_num, commSize, ${cclParam.commSize}, rankId: ${cclParam.rankId}") + println(s"oneCCL: Initialized with executorNum: $executor_num, commSize, ${cclParam.commSize}, rankId: ${cclParam.rankId}") + + KVS_PORT = KVS_PORT + 1 + } // Run on Executor @@ -50,16 +78,11 @@ object OneCCL extends Logging { c_cleanup() } - def getAvailPort(localIP: String): Int = synchronized { - c_getAvailPort(localIP) - } - - @native private def c_init(size: Int, rank: Int, ip_port: String, param: CCLParam) : Int + @native private def c_init(param: CCLParam) : Int @native private def c_cleanup() : Unit @native def isRoot() : Boolean @native def rankID() : Int @native def setEnv(key: String, value: String, overwrite: Boolean = true): Int - @native def c_getAvailPort(localIP: String): Int -} \ No newline at end of file +} diff --git a/mllib-dal/src/main/scala/org/apache/spark/ml/util/OneDAL.scala b/mllib-dal/src/main/scala/org/apache/spark/ml/util/OneDAL.scala index 9b6c0f6c7..2f6c83775 100644 --- a/mllib-dal/src/main/scala/org/apache/spark/ml/util/OneDAL.scala +++ b/mllib-dal/src/main/scala/org/apache/spark/ml/util/OneDAL.scala @@ -149,7 +149,4 @@ object OneDAL { @native def cFreeDataMemory(numTableAddr: Long) @native def cCheckPlatformCompatibility() : Boolean - - @native def cNewCSRNumericTable(data: Array[Float], colIndices: Array[Long], rowOffsets: Array[Long], nFeatures: Long, - nVectors: Long) : Long } diff --git a/mllib-dal/src/main/scala/org/apache/spark/ml/util/Utils.scala b/mllib-dal/src/main/scala/org/apache/spark/ml/util/Utils.scala index aa8eb8979..40a1c6823 100644 --- a/mllib-dal/src/main/scala/org/apache/spark/ml/util/Utils.scala +++ b/mllib-dal/src/main/scala/org/apache/spark/ml/util/Utils.scala @@ -71,22 
+71,8 @@ object Utils { ip } - def checkExecutorAvailPort(data: RDD[_], localIP: String) : Int = { - val sc = data.sparkContext - val result = data.mapPartitions { p => - LibLoader.loadLibraries() - val port = OneCCL.getAvailPort(localIP) - if (port != -1) - Iterator(port) - else - Iterator() - }.collect() - - return result(0) - } - def checkClusterPlatformCompatibility(sc: SparkContext) : Boolean = { - LibLoader.loadLibraries() + LibLoader.loadLibMLlibDAL() // check driver platform compatibility if (!OneDAL.cCheckPlatformCompatibility()) @@ -96,7 +82,7 @@ object Utils { val executor_num = Utils.sparkExecutorNum(sc) val data = sc.parallelize(1 to executor_num, executor_num) val result = data.map { p => - LibLoader.loadLibraries() + LibLoader.loadLibMLlibDAL() OneDAL.cCheckPlatformCompatibility() }.collect() diff --git a/mllib-dal/src/test/scala/org/apache/spark/ml/recommendation/IntelALSSuite.scala b/mllib-dal/src/test/scala/org/apache/spark/ml/recommendation/IntelALSSuite.scala deleted file mode 100644 index 339644888..000000000 --- a/mllib-dal/src/test/scala/org/apache/spark/ml/recommendation/IntelALSSuite.scala +++ /dev/null @@ -1,1239 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.ml.recommendation - -import java.io.File -import java.util.Random - -import scala.collection.JavaConverters._ -import scala.collection.mutable -import scala.collection.mutable.{ArrayBuffer, WrappedArray} - -import com.github.fommil.netlib.BLAS.{getInstance => blas} -import org.apache.commons.io.FileUtils -import org.apache.commons.io.filefilter.TrueFileFilter -import org.scalatest.BeforeAndAfterEach - -import org.apache.spark._ -import org.apache.spark.internal.Logging -import org.apache.spark.ml.linalg.Vectors -import org.apache.spark.ml.recommendation.ALS._ -import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest, MLTestingUtils} -import org.apache.spark.ml.util.TestingUtils._ -import org.apache.spark.mllib.util.MLlibTestSparkContext -import org.apache.spark.rdd.RDD -import org.apache.spark.scheduler.{SparkListener, SparkListenerStageCompleted} -import org.apache.spark.sql.{DataFrame, Encoder, Row, SparkSession} -import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder -import org.apache.spark.sql.functions.{col, lit} -import org.apache.spark.sql.streaming.StreamingQueryException -import org.apache.spark.sql.types._ -import org.apache.spark.storage.StorageLevel -import org.apache.spark.util.Utils - -class IntelALSSuite extends MLTest with DefaultReadWriteTest with Logging { - - override def beforeAll(): Unit = { - super.beforeAll() - sc.setCheckpointDir(tempDir.getAbsolutePath) - } - - override def afterAll(): Unit = { - super.afterAll() - } - - test("LocalIndexEncoder") { - val random = new Random - for (numBlocks <- Seq(1, 2, 5, 10, 20, 50, 100)) { - val encoder = new LocalIndexEncoder(numBlocks) - val maxLocalIndex = Int.MaxValue / numBlocks - val tests = Seq.fill(5)((random.nextInt(numBlocks), random.nextInt(maxLocalIndex))) ++ - Seq((0, 0), (numBlocks - 1, maxLocalIndex)) - tests.foreach { case (blockId, localIndex) => - val err = s"Failed with numBlocks=$numBlocks, blockId=$blockId, and localIndex=$localIndex." 
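
As a side note on the encoder exercised by this test: the packing it performs can be reproduced in a few lines of plain Scala, since the real LocalIndexEncoder is package-private. The numBlocks, blockId, and localIndex values below are arbitrary examples.

```
// Block id lives in the high bits, local index in the low bits.
object IndexPackingSketch {
  def main(args: Array[String]): Unit = {
    val numBlocks = 10
    val numLocalIndexBits = math.min(Integer.numberOfLeadingZeros(numBlocks - 1), 31)
    val localIndexMask = (1 << numLocalIndexBits) - 1

    val blockId = 3
    val localIndex = 12345
    val encoded = (blockId << numLocalIndexBits) | localIndex

    // Decoding recovers both parts: unsigned shift for the block id,
    // mask for the local index.
    println(s"encoded=$encoded, blockId=${encoded >>> numLocalIndexBits}, " +
      s"localIndex=${encoded & localIndexMask}")
  }
}
```
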
- val encoded = encoder.encode(blockId, localIndex) - assert(encoder.blockId(encoded) === blockId, err) - assert(encoder.localIndex(encoded) === localIndex, err) - } - } - } - - test("normal equation construction") { - val k = 2 - val ne0 = new NormalEquation(k) - .add(Array(1.0f, 2.0f), 3.0) - .add(Array(4.0f, 5.0f), 12.0, 2.0) // weighted - assert(ne0.k === k) - assert(ne0.triK === k * (k + 1) / 2) - // NumPy code that computes the expected values: - // A = np.matrix("1 2; 4 5") - // b = np.matrix("3; 6") - // C = np.matrix(np.diag([1, 2])) - // ata = A.transpose() * C * A - // atb = A.transpose() * C * b - assert(Vectors.dense(ne0.ata) ~== Vectors.dense(33.0, 42.0, 54.0) relTol 1e-8) - assert(Vectors.dense(ne0.atb) ~== Vectors.dense(51.0, 66.0) relTol 1e-8) - - val ne1 = new NormalEquation(2) - .add(Array(7.0f, 8.0f), 9.0) - ne0.merge(ne1) - // NumPy code that computes the expected values: - // A = np.matrix("1 2; 4 5; 7 8") - // b = np.matrix("3; 6; 9") - // C = np.matrix(np.diag([1, 2, 1])) - // ata = A.transpose() * C * A - // atb = A.transpose() * C * b - assert(Vectors.dense(ne0.ata) ~== Vectors.dense(82.0, 98.0, 118.0) relTol 1e-8) - assert(Vectors.dense(ne0.atb) ~== Vectors.dense(114.0, 138.0) relTol 1e-8) - - intercept[IllegalArgumentException] { - ne0.add(Array(1.0f), 2.0) - } - intercept[IllegalArgumentException] { - ne0.add(Array(1.0f, 2.0f, 3.0f), 4.0) - } - intercept[IllegalArgumentException] { - ne0.add(Array(1.0f, 2.0f), 0.0, -1.0) - } - intercept[IllegalArgumentException] { - val ne2 = new NormalEquation(3) - ne0.merge(ne2) - } - - ne0.reset() - assert(ne0.ata.forall(_ == 0.0)) - assert(ne0.atb.forall(_ == 0.0)) - } - - test("CholeskySolver") { - val k = 2 - val ne0 = new NormalEquation(k) - .add(Array(1.0f, 2.0f), 4.0) - .add(Array(1.0f, 3.0f), 9.0) - .add(Array(1.0f, 4.0f), 16.0) - val ne1 = new NormalEquation(k) - .merge(ne0) - - val chol = new CholeskySolver - val x0 = chol.solve(ne0, 0.0).map(_.toDouble) - // NumPy code that computes the expected solution: - // A = np.matrix("1 2; 1 3; 1 4") - // b = b = np.matrix("3; 6") - // x0 = np.linalg.lstsq(A, b)[0] - assert(Vectors.dense(x0) ~== Vectors.dense(-8.333333, 6.0) relTol 1e-6) - - assert(ne0.ata.forall(_ == 0.0)) - assert(ne0.atb.forall(_ == 0.0)) - - val x1 = chol.solve(ne1, 1.5).map(_.toDouble) - // NumPy code that computes the expected solution, where lambda is scaled by n: - // x0 = np.linalg.solve(A.transpose() * A + 1.5 * np.eye(2), A.transpose() * b) - assert(Vectors.dense(x1) ~== Vectors.dense(-0.1155556, 3.28) relTol 1e-6) - } - - test("RatingBlockBuilder") { - val emptyBuilder = new RatingBlockBuilder[Int]() - assert(emptyBuilder.size === 0) - val emptyBlock = emptyBuilder.build() - assert(emptyBlock.srcIds.isEmpty) - assert(emptyBlock.dstIds.isEmpty) - assert(emptyBlock.ratings.isEmpty) - - val builder0 = new RatingBlockBuilder() - .add(Rating(0, 1, 2.0f)) - .add(Rating(3, 4, 5.0f)) - assert(builder0.size === 2) - val builder1 = new RatingBlockBuilder() - .add(Rating(6, 7, 8.0f)) - .merge(builder0.build()) - assert(builder1.size === 3) - val block = builder1.build() - val ratings = Seq.tabulate(block.size) { i => - (block.srcIds(i), block.dstIds(i), block.ratings(i)) - }.toSet - assert(ratings === Set((0, 1, 2.0f), (3, 4, 5.0f), (6, 7, 8.0f))) - } - - test("UncompressedInBlock") { - val encoder = new LocalIndexEncoder(10) - val uncompressed = new UncompressedInBlockBuilder[Int](encoder) - .add(0, Array(1, 0, 2), Array(0, 1, 4), Array(1.0f, 2.0f, 3.0f)) - .add(1, Array(3, 0), Array(2, 5), Array(4.0f, 
5.0f)) - .build() - assert(uncompressed.length === 5) - val records = Seq.tabulate(uncompressed.length) { i => - val dstEncodedIndex = uncompressed.dstEncodedIndices(i) - val dstBlockId = encoder.blockId(dstEncodedIndex) - val dstLocalIndex = encoder.localIndex(dstEncodedIndex) - (uncompressed.srcIds(i), dstBlockId, dstLocalIndex, uncompressed.ratings(i)) - }.toSet - val expected = - Set((1, 0, 0, 1.0f), (0, 0, 1, 2.0f), (2, 0, 4, 3.0f), (3, 1, 2, 4.0f), (0, 1, 5, 5.0f)) - assert(records === expected) - - val compressed = uncompressed.compress() - assert(compressed.size === 5) - assert(compressed.srcIds.toSeq === Seq(0, 1, 2, 3)) - assert(compressed.dstPtrs.toSeq === Seq(0, 2, 3, 4, 5)) - var decompressed = ArrayBuffer.empty[(Int, Int, Int, Float)] - var i = 0 - while (i < compressed.srcIds.length) { - var j = compressed.dstPtrs(i) - while (j < compressed.dstPtrs(i + 1)) { - val dstEncodedIndex = compressed.dstEncodedIndices(j) - val dstBlockId = encoder.blockId(dstEncodedIndex) - val dstLocalIndex = encoder.localIndex(dstEncodedIndex) - decompressed += ((compressed.srcIds(i), dstBlockId, dstLocalIndex, compressed.ratings(j))) - j += 1 - } - i += 1 - } - assert(decompressed.toSet === expected) - } - - test("CheckedCast") { - val checkedCast = new ALS().checkedCast - val df = spark.range(1) - - withClue("Valid Integer Ids") { - df.select(checkedCast(lit(123))).collect() - } - - withClue("Valid Long Ids") { - df.select(checkedCast(lit(1231L))).collect() - } - - withClue("Valid Decimal Ids") { - df.select(checkedCast(lit(123).cast(DecimalType(15, 2)))).collect() - } - - withClue("Valid Double Ids") { - df.select(checkedCast(lit(123.0))).collect() - } - - val msg = "either out of Integer range or contained a fractional part" - withClue("Invalid Long: out of range") { - val e: SparkException = intercept[SparkException] { - df.select(checkedCast(lit(1231000000000L))).collect() - } - assert(e.getMessage.contains(msg)) - } - - withClue("Invalid Decimal: out of range") { - val e: SparkException = intercept[SparkException] { - df.select(checkedCast(lit(1231000000000.0).cast(DecimalType(15, 2)))).collect() - } - assert(e.getMessage.contains(msg)) - } - - withClue("Invalid Decimal: fractional part") { - val e: SparkException = intercept[SparkException] { - df.select(checkedCast(lit(123.1).cast(DecimalType(15, 2)))).collect() - } - assert(e.getMessage.contains(msg)) - } - - withClue("Invalid Double: out of range") { - val e: SparkException = intercept[SparkException] { - df.select(checkedCast(lit(1231000000000.0))).collect() - } - assert(e.getMessage.contains(msg)) - } - - withClue("Invalid Double: fractional part") { - val e: SparkException = intercept[SparkException] { - df.select(checkedCast(lit(123.1))).collect() - } - assert(e.getMessage.contains(msg)) - } - - withClue("Invalid Type") { - val e: SparkException = intercept[SparkException] { - df.select(checkedCast(lit("123.1"))).collect() - } - assert(e.getMessage.contains("was not numeric")) - } - } - - /** - * Generates an explicit feedback dataset for testing ALS. 
- * @param numUsers number of users - * @param numItems number of items - * @param rank rank - * @param noiseStd the standard deviation of additive Gaussian noise on training data - * @param seed random seed - * @return (training, test) - */ - def genExplicitTestData( - numUsers: Int, - numItems: Int, - rank: Int, - noiseStd: Double = 0.0, - seed: Long = 11L): (RDD[Rating[Int]], RDD[Rating[Int]]) = { - val trainingFraction = 0.6 - val testFraction = 0.3 - val totalFraction = trainingFraction + testFraction - val random = new Random(seed) - val userFactors = genFactors(numUsers, rank, random) - val itemFactors = genFactors(numItems, rank, random) - val training = ArrayBuffer.empty[Rating[Int]] - val test = ArrayBuffer.empty[Rating[Int]] - for ((userId, userFactor) <- userFactors; (itemId, itemFactor) <- itemFactors) { - val x = random.nextDouble() - if (x < totalFraction) { - val rating = blas.sdot(rank, userFactor, 1, itemFactor, 1) - if (x < trainingFraction) { - val noise = noiseStd * random.nextGaussian() - training += Rating(userId, itemId, rating + noise.toFloat) - } else { - test += Rating(userId, itemId, rating) - } - } - } - logInfo(s"Generated an explicit feedback dataset with ${training.size} ratings for training " + - s"and ${test.size} for test.") - (sc.parallelize(training, 2), sc.parallelize(test, 2)) - } - - /** - * Generates an implicit feedback dataset for testing ALS. - * @param numUsers number of users - * @param numItems number of items - * @param rank rank - * @param noiseStd the standard deviation of additive Gaussian noise on training data - * @param seed random seed - * @return (training, test) - */ - def genImplicitTestData( - numUsers: Int, - numItems: Int, - rank: Int, - noiseStd: Double = 0.0, - seed: Long = 11L): (RDD[Rating[Int]], RDD[Rating[Int]]) = { - ALSSuite.genImplicitTestData(sc, numUsers, numItems, rank, noiseStd, seed) - } - - /** - * Generates random user/item factors, with i.i.d. values drawn from U(a, b). - * @param size number of users/items - * @param rank number of features - * @param random random number generator - * @param a min value of the support (default: -1) - * @param b max value of the support (default: 1) - * @return a sequence of (ID, factors) pairs - */ - private def genFactors( - size: Int, - rank: Int, - random: Random, - a: Float = -1.0f, - b: Float = 1.0f): Seq[(Int, Array[Float])] = { - IntelALSSuite.genFactors(size, rank, random, a, b) - } - - /** - * Train ALS using the given training set and parameters - * @param training training dataset - * @param rank rank of the matrix factorization - * @param maxIter max number of iterations - * @param regParam regularization constant - * @param implicitPrefs whether to use implicit preference - * @param numUserBlocks number of user blocks - * @param numItemBlocks number of item blocks - * @return a trained ALSModel - */ - def trainALS( - training: RDD[Rating[Int]], - rank: Int, - maxIter: Int, - regParam: Double, - implicitPrefs: Boolean = false, - numUserBlocks: Int = 2, - numItemBlocks: Int = 3): ALSModel = { - val spark = this.spark - import spark.implicits._ - val als = new ALS() - .setRank(rank) - .setRegParam(regParam) - .setImplicitPrefs(implicitPrefs) - .setNumUserBlocks(numUserBlocks) - .setNumItemBlocks(numItemBlocks) - .setSeed(0) - als.fit(training.toDF()) - } - - /** - * Test ALS using the given training/test splits and parameters. 
- * @param training training dataset - * @param test test dataset - * @param rank rank of the matrix factorization - * @param maxIter max number of iterations - * @param regParam regularization constant - * @param implicitPrefs whether to use implicit preference - * @param numUserBlocks number of user blocks - * @param numItemBlocks number of item blocks - * @param targetRMSE target test RMSE - */ - def testALS( - training: RDD[Rating[Int]], - test: RDD[Rating[Int]], - rank: Int, - maxIter: Int, - regParam: Double, - implicitPrefs: Boolean = false, - numUserBlocks: Int = 2, - numItemBlocks: Int = 3, - targetRMSE: Double = 0.05): Unit = { - val spark = this.spark - import spark.implicits._ - val als = new ALS() - .setRank(rank) - .setRegParam(regParam) - .setImplicitPrefs(implicitPrefs) - .setNumUserBlocks(numUserBlocks) - .setNumItemBlocks(numItemBlocks) - .setSeed(0) - val alpha = als.getAlpha - val model = als.fit(training.toDF()) - testTransformerByGlobalCheckFunc[Rating[Int]](test.toDF(), model, "rating", "prediction") { - case rows: Seq[Row] => - val predictions = rows.map(row => (row.getFloat(0).toDouble, row.getFloat(1).toDouble)) - - val rmse = - if (implicitPrefs) { - // TODO: Use a better (rank-based?) evaluation metric for implicit feedback. - // We limit the ratings and the predictions to interval [0, 1] and compute the - // weighted RMSE with the confidence scores as weights. - val (totalWeight, weightedSumSq) = predictions.map { case (rating, prediction) => - val confidence = 1.0 + alpha * math.abs(rating) - val rating01 = math.max(math.min(rating, 1.0), 0.0) - val prediction01 = math.max(math.min(prediction, 1.0), 0.0) - val err = prediction01 - rating01 - (confidence, confidence * err * err) - }.reduce[(Double, Double)] { case ((c0, e0), (c1, e1)) => - (c0 + c1, e0 + e1) - } - math.sqrt(weightedSumSq / totalWeight) - } else { - val errorSquares = predictions.map { case (rating, prediction) => - val err = rating - prediction - err * err - } - val mse = errorSquares.sum / errorSquares.length - math.sqrt(mse) - } - logInfo(s"Test RMSE is $rmse.") - assert(rmse < targetRMSE) - } - - MLTestingUtils.checkCopyAndUids(als, model) - } - - test("exact rank-1 matrix") { - val (training, test) = genExplicitTestData(numUsers = 20, numItems = 40, rank = 1) - testALS(training, test, maxIter = 1, rank = 1, regParam = 1e-5, targetRMSE = 0.001) - testALS(training, test, maxIter = 1, rank = 2, regParam = 1e-5, targetRMSE = 0.001) - } - - test("approximate rank-1 matrix") { - val (training, test) = - genExplicitTestData(numUsers = 20, numItems = 40, rank = 1, noiseStd = 0.01) - testALS(training, test, maxIter = 2, rank = 1, regParam = 0.01, targetRMSE = 0.02) - testALS(training, test, maxIter = 2, rank = 2, regParam = 0.01, targetRMSE = 0.02) - } - - test("approximate rank-2 matrix") { - val (training, test) = - genExplicitTestData(numUsers = 20, numItems = 40, rank = 2, noiseStd = 0.01) - testALS(training, test, maxIter = 4, rank = 2, regParam = 0.01, targetRMSE = 0.03) - testALS(training, test, maxIter = 4, rank = 3, regParam = 0.01, targetRMSE = 0.03) - } - - test("different block settings") { - val (training, test) = - genExplicitTestData(numUsers = 20, numItems = 40, rank = 2, noiseStd = 0.01) - for ((numUserBlocks, numItemBlocks) <- Seq((1, 1), (1, 2), (2, 1), (2, 2))) { - testALS(training, test, maxIter = 4, rank = 3, regParam = 0.01, targetRMSE = 0.03, - numUserBlocks = numUserBlocks, numItemBlocks = numItemBlocks) - } - } - - test("more blocks than ratings") { - val (training, test) = 
- genExplicitTestData(numUsers = 4, numItems = 4, rank = 1) - testALS(training, test, maxIter = 2, rank = 1, regParam = 1e-4, targetRMSE = 0.002, - numItemBlocks = 5, numUserBlocks = 5) - } - - test("implicit feedback") { - val (training, test) = - genImplicitTestData(numUsers = 20, numItems = 40, rank = 2, noiseStd = 0.01) - testALS(training, test, maxIter = 4, rank = 2, regParam = 0.01, implicitPrefs = true, - targetRMSE = 0.3) - } - - test("implicit feedback regression") { - val trainingWithNeg = sc.parallelize(Seq(Rating(0, 0, 1), Rating(1, 1, 1), Rating(0, 1, -3))) - val trainingWithZero = sc.parallelize(Seq(Rating(0, 0, 1), Rating(1, 1, 1), Rating(0, 1, 0))) - val modelWithNeg = - trainALS(trainingWithNeg, rank = 1, maxIter = 5, regParam = 0.01, implicitPrefs = true) - val modelWithZero = - trainALS(trainingWithZero, rank = 1, maxIter = 5, regParam = 0.01, implicitPrefs = true) - val userFactorsNeg = modelWithNeg.userFactors - val itemFactorsNeg = modelWithNeg.itemFactors - val userFactorsZero = modelWithZero.userFactors - val itemFactorsZero = modelWithZero.itemFactors - assert(userFactorsNeg.intersect(userFactorsZero).count() == 0) - assert(itemFactorsNeg.intersect(itemFactorsZero).count() == 0) - } - test("using generic ID types") { - val (ratings, _) = genImplicitTestData(numUsers = 20, numItems = 40, rank = 2, noiseStd = 0.01) - - val longRatings = ratings.map(r => Rating(r.user.toLong, r.item.toLong, r.rating)) - val (longUserFactors, _) = ALS.train(longRatings, rank = 2, maxIter = 4, seed = 0) - assert(longUserFactors.first()._1.getClass === classOf[Long]) - - val strRatings = ratings.map(r => Rating(r.user.toString, r.item.toString, r.rating)) - val (strUserFactors, _) = ALS.train(strRatings, rank = 2, maxIter = 4, seed = 0) - assert(strUserFactors.first()._1.getClass === classOf[String]) - } - - test("nonnegative constraint") { - val (ratings, _) = genImplicitTestData(numUsers = 20, numItems = 40, rank = 2, noiseStd = 0.01) - val (userFactors, itemFactors) = - ALS.train(ratings, rank = 2, maxIter = 4, nonnegative = true, seed = 0) - def isNonnegative(factors: RDD[(Int, Array[Float])]): Boolean = { - factors.values.map { _.forall(_ >= 0.0) }.reduce(_ && _) - } - assert(isNonnegative(userFactors)) - assert(isNonnegative(itemFactors)) - // TODO: Validate the solution. 
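
For reference, the confidence-weighted RMSE that testALS applies to the implicit-feedback runs above can be written as a standalone helper; the (rating, prediction) pairs in main are made up for illustration.

```
// Ratings and predictions are clipped to [0, 1]; squared errors are weighted
// by confidence = 1 + alpha * |rating|.
object WeightedRmseSketch {
  def weightedRmse(pairs: Seq[(Double, Double)], alpha: Double): Double = {
    val (totalWeight, weightedSumSq) = pairs.map { case (rating, prediction) =>
      val confidence = 1.0 + alpha * math.abs(rating)
      val r01 = math.max(math.min(rating, 1.0), 0.0)
      val p01 = math.max(math.min(prediction, 1.0), 0.0)
      val err = p01 - r01
      (confidence, confidence * err * err)
    }.reduce[(Double, Double)] { case ((c0, e0), (c1, e1)) => (c0 + c1, e0 + e1) }
    math.sqrt(weightedSumSq / totalWeight)
  }

  def main(args: Array[String]): Unit = {
    val pairs = Seq((1.0, 0.9), (0.0, 0.2), (2.0, 1.1))
    println(s"weighted RMSE = ${weightedRmse(pairs, alpha = 1.0)}")
  }
}
```
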
- } - - test("als partitioner is a projection") { - for (p <- Seq(1, 10, 100, 1000)) { - val part = new ALSPartitioner(p) - var k = 0 - while (k < p) { - assert(k === part.getPartition(k)) - assert(k === part.getPartition(k.toLong)) - k += 1 - } - } - } - - test("partitioner in returned factors") { - val (ratings, _) = genImplicitTestData(numUsers = 20, numItems = 40, rank = 2, noiseStd = 0.01) - val (userFactors, itemFactors) = ALS.train( - ratings, rank = 2, maxIter = 4, numUserBlocks = 3, numItemBlocks = 4, seed = 0) - for ((tpe, factors) <- Seq(("User", userFactors), ("Item", itemFactors))) { - assert(userFactors.partitioner.isDefined, s"$tpe factors should have partitioner.") - val part = userFactors.partitioner.get - userFactors.mapPartitionsWithIndex { (idx, items) => - items.foreach { case (id, _) => - if (part.getPartition(id) != idx) { - throw new SparkException(s"$tpe with ID $id should not be in partition $idx.") - } - } - Iterator.empty - }.count() - } - } - - test("als with large number of iterations") { - val (ratings, _) = genExplicitTestData(numUsers = 4, numItems = 4, rank = 1) - ALS.train(ratings, rank = 1, maxIter = 50, numUserBlocks = 2, numItemBlocks = 2, seed = 0) - ALS.train(ratings, rank = 1, maxIter = 50, numUserBlocks = 2, numItemBlocks = 2, - implicitPrefs = true, seed = 0) - } - - test("read/write") { - val spark = this.spark - import ALSSuite._ - import spark.implicits._ - val (ratings, _) = genExplicitTestData(numUsers = 4, numItems = 4, rank = 1) - - def getFactors(df: DataFrame): Set[(Int, Array[Float])] = { - df.select("id", "features").collect().map { case r => - (r.getInt(0), r.getAs[Array[Float]](1)) - }.toSet - } - - def checkModelData(model: ALSModel, model2: ALSModel): Unit = { - assert(model.rank === model2.rank) - assert(getFactors(model.userFactors) === getFactors(model2.userFactors)) - assert(getFactors(model.itemFactors) === getFactors(model2.itemFactors)) - } - - val als = new ALS() - testEstimatorAndModelReadWrite(als, ratings.toDF(), allEstimatorParamSettings, - allModelParamSettings, checkModelData) - } - - private def checkNumericTypesALS( - estimator: ALS, - spark: SparkSession, - column: String, - baseType: NumericType) - (check: (ALSModel, ALSModel) => Unit) - (check2: (ALSModel, ALSModel, DataFrame, Encoder[_]) => Unit): Unit = { - val dfs = genRatingsDFWithNumericCols(spark, column) - val maybeDf = dfs.find { case (numericTypeWithEncoder, _) => - numericTypeWithEncoder.numericType == baseType - } - assert(maybeDf.isDefined) - val df = maybeDf.get._2 - - val expected = estimator.fit(df) - val actuals = dfs.map(t => (t, estimator.fit(t._2))) - actuals.foreach { case (_, actual) => check(expected, actual) } - actuals.foreach { case (t, actual) => check2(expected, actual, t._2, t._1.encoder) } - - val baseDF = dfs.find(_._1.numericType == baseType).get._2 - val others = baseDF.columns.toSeq.diff(Seq(column)).map(col) - val cols = Seq(col(column).cast(StringType)) ++ others - val strDF = baseDF.select(cols: _*) - val thrown = intercept[IllegalArgumentException] { - estimator.fit(strDF) - } - assert(thrown.getMessage.contains( - s"$column must be of type numeric but was actually of type string")) - } - - private class NumericTypeWithEncoder[A](val numericType: NumericType) - (implicit val encoder: Encoder[(A, Int, Double)]) - - private def genRatingsDFWithNumericCols( - spark: SparkSession, - column: String) = { - - import testImplicits._ - - val df = spark.createDataFrame(Seq( - (0, 10, 1.0), - (1, 20, 2.0), - (2, 30, 3.0), - (3, 40, 4.0), 
- (4, 50, 5.0) - )).toDF("user", "item", "rating") - - val others = df.columns.toSeq.diff(Seq(column)).map(col) - val types = - Seq(new NumericTypeWithEncoder[Short](ShortType), - new NumericTypeWithEncoder[Long](LongType), - new NumericTypeWithEncoder[Int](IntegerType), - new NumericTypeWithEncoder[Float](FloatType), - new NumericTypeWithEncoder[Byte](ByteType), - new NumericTypeWithEncoder[Double](DoubleType), - new NumericTypeWithEncoder[Decimal](DecimalType(10, 0))(ExpressionEncoder()) - ) - types.map { t => - val cols = Seq(col(column).cast(t.numericType)) ++ others - t -> df.select(cols: _*) - } - } - - test("input type validation") { - val spark = this.spark - import spark.implicits._ - - // check that ALS can handle all numeric types for rating column - // and user/item columns (when the user/item ids are within Int range) - val als = new ALS().setMaxIter(1).setRank(1) - Seq(("user", IntegerType), ("item", IntegerType), ("rating", FloatType)).foreach { - case (colName, sqlType) => - checkNumericTypesALS(als, spark, colName, sqlType) { - (ex, act) => - ex.userFactors.first().getSeq[Float](1) === act.userFactors.first().getSeq[Float](1) - } { (ex, act, df, enc) => - // With AQE on/off, the order of result may be different. Here sortby the result. - val expected = ex.transform(df).selectExpr("prediction") - .sort("prediction").first().getFloat(0) - testTransformerByGlobalCheckFunc(df, act, "prediction") { - case rows: Seq[Row] => - expected ~== rows.sortBy(_.getFloat(0)).head.getFloat(0) absTol 1e-6 - }(enc) - } - } - // check user/item ids falling outside of Int range - val big = Int.MaxValue.toLong + 1 - val small = Int.MinValue.toDouble - 1 - val df = Seq( - (0, 0L, 0d, 1, 1L, 1d, 3.0), - (0, big, small, 0, big, small, 2.0), - (1, 1L, 1d, 0, 0L, 0d, 5.0) - ).toDF("user", "user_big", "user_small", "item", "item_big", "item_small", "rating") - val msg = "either out of Integer range or contained a fractional part" - withClue("fit should fail when ids exceed integer range. ") { - assert(intercept[SparkException] { - als.fit(df.select(df("user_big").as("user"), df("item"), df("rating"))) - }.getCause.getMessage.contains(msg)) - assert(intercept[SparkException] { - als.fit(df.select(df("user_small").as("user"), df("item"), df("rating"))) - }.getCause.getMessage.contains(msg)) - assert(intercept[SparkException] { - als.fit(df.select(df("item_big").as("item"), df("user"), df("rating"))) - }.getCause.getMessage.contains(msg)) - assert(intercept[SparkException] { - als.fit(df.select(df("item_small").as("item"), df("user"), df("rating"))) - }.getCause.getMessage.contains(msg)) - } - withClue("transform should fail when ids exceed integer range. 
") { - val model = als.fit(df) - def testTransformIdExceedsIntRange[A : Encoder](dataFrame: DataFrame): Unit = { - val e1 = intercept[SparkException] { - model.transform(dataFrame).collect() - } - TestUtils.assertExceptionMsg(e1, msg) - val e2 = intercept[StreamingQueryException] { - testTransformer[A](dataFrame, model, "prediction") { _ => } - } - TestUtils.assertExceptionMsg(e2, msg) - } - testTransformIdExceedsIntRange[(Long, Int)](df.select(df("user_big").as("user"), - df("item"))) - testTransformIdExceedsIntRange[(Double, Int)](df.select(df("user_small").as("user"), - df("item"))) - testTransformIdExceedsIntRange[(Long, Int)](df.select(df("item_big").as("item"), - df("user"))) - testTransformIdExceedsIntRange[(Double, Int)](df.select(df("item_small").as("item"), - df("user"))) - } - } - - test("SPARK-18268: ALS with empty RDD should fail with better message") { - val ratings = sc.parallelize(Array.empty[Rating[Int]]) - intercept[IllegalArgumentException] { - ALS.train(ratings) - } - } - - test("ALS cold start user/item prediction strategy") { - val spark = this.spark - import org.apache.spark.sql.functions._ - import spark.implicits._ - - val (ratings, _) = genExplicitTestData(numUsers = 4, numItems = 4, rank = 1) - val data = ratings.toDF - val knownUser = data.select(max("user")).as[Int].first() - val unknownUser = knownUser + 10 - val knownItem = data.select(max("item")).as[Int].first() - val unknownItem = knownItem + 20 - val test = Seq( - (unknownUser, unknownItem, true), - (knownUser, unknownItem, true), - (unknownUser, knownItem, true), - (knownUser, knownItem, false) - ).toDF("user", "item", "expectedIsNaN") - - val als = new ALS().setMaxIter(1).setRank(1) - // default is 'nan' - val defaultModel = als.fit(data) - testTransformer[(Int, Int, Boolean)](test, defaultModel, "expectedIsNaN", "prediction") { - case Row(expectedIsNaN: Boolean, prediction: Float) => - assert(prediction.isNaN === expectedIsNaN) - } - - // check 'drop' strategy should filter out rows with unknown users/items - val defaultPrediction = defaultModel.transform(test).select("prediction") - .as[Float].filter(!_.isNaN).first() - testTransformerByGlobalCheckFunc[(Int, Int, Boolean)](test, - defaultModel.setColdStartStrategy("drop"), "prediction") { - case rows: Seq[Row] => - val dropPredictions = rows.map(_.getFloat(0)) - assert(dropPredictions.length == 1) - assert(!dropPredictions.head.isNaN) - assert(dropPredictions.head ~== defaultPrediction relTol 1e-14) - } - } - - test("case insensitive cold start param value") { - val spark = this.spark - import spark.implicits._ - val (ratings, _) = genExplicitTestData(numUsers = 2, numItems = 2, rank = 1) - val data = ratings.toDF - val model = new ALS().fit(data) - Seq("nan", "NaN", "Nan", "drop", "DROP", "Drop").foreach { s => - testTransformer[Rating[Int]](data, model.setColdStartStrategy(s), "prediction") { _ => } - } - } - - private def getALSModel = { - val spark = this.spark - import spark.implicits._ - - val userFactors = Seq( - (0, Array(6.0f, 4.0f)), - (1, Array(3.0f, 4.0f)), - (2, Array(3.0f, 6.0f)) - ).toDF("id", "features") - val itemFactors = Seq( - (3, Array(5.0f, 6.0f)), - (4, Array(6.0f, 2.0f)), - (5, Array(3.0f, 6.0f)), - (6, Array(4.0f, 1.0f)) - ).toDF("id", "features") - val als = new ALS().setRank(2) - new ALSModel(als.uid, als.getRank, userFactors, itemFactors) - .setUserCol("user") - .setItemCol("item") - } - - test("recommendForAllUsers with k <, = and > num_items") { - val model = getALSModel - val numUsers = model.userFactors.count - val 
numItems = model.itemFactors.count - val expected = Map( - 0 -> Seq((3, 54f), (4, 44f), (5, 42f), (6, 28f)), - 1 -> Seq((3, 39f), (5, 33f), (4, 26f), (6, 16f)), - 2 -> Seq((3, 51f), (5, 45f), (4, 30f), (6, 18f)) - ) - - Seq(2, 4, 6).foreach { k => - val n = math.min(k, numItems).toInt - val expectedUpToN = expected.mapValues(_.slice(0, n)) - val topItems = model.recommendForAllUsers(k) - assert(topItems.count() == numUsers) - assert(topItems.columns.contains("user")) - checkRecommendations(topItems, expectedUpToN, "item") - } - } - - test("recommendForAllItems with k <, = and > num_users") { - val model = getALSModel - val numUsers = model.userFactors.count - val numItems = model.itemFactors.count - val expected = Map( - 3 -> Seq((0, 54f), (2, 51f), (1, 39f)), - 4 -> Seq((0, 44f), (2, 30f), (1, 26f)), - 5 -> Seq((2, 45f), (0, 42f), (1, 33f)), - 6 -> Seq((0, 28f), (2, 18f), (1, 16f)) - ) - - Seq(2, 3, 4).foreach { k => - val n = math.min(k, numUsers).toInt - val expectedUpToN = expected.mapValues(_.slice(0, n)) - val topUsers = getALSModel.recommendForAllItems(k) - assert(topUsers.count() == numItems) - assert(topUsers.columns.contains("item")) - checkRecommendations(topUsers, expectedUpToN, "user") - } - } - - test("recommendForUserSubset with k <, = and > num_items") { - val spark = this.spark - import spark.implicits._ - val model = getALSModel - val numItems = model.itemFactors.count - val expected = Map( - 0 -> Seq((3, 54f), (4, 44f), (5, 42f), (6, 28f)), - 2 -> Seq((3, 51f), (5, 45f), (4, 30f), (6, 18f)) - ) - val userSubset = expected.keys.toSeq.toDF("user") - val numUsersSubset = userSubset.count - - Seq(2, 4, 6).foreach { k => - val n = math.min(k, numItems).toInt - val expectedUpToN = expected.mapValues(_.slice(0, n)) - val topItems = model.recommendForUserSubset(userSubset, k) - assert(topItems.count() == numUsersSubset) - assert(topItems.columns.contains("user")) - checkRecommendations(topItems, expectedUpToN, "item") - } - } - - test("recommendForItemSubset with k <, = and > num_users") { - val spark = this.spark - import spark.implicits._ - val model = getALSModel - val numUsers = model.userFactors.count - val expected = Map( - 3 -> Seq((0, 54f), (2, 51f), (1, 39f)), - 6 -> Seq((0, 28f), (2, 18f), (1, 16f)) - ) - val itemSubset = expected.keys.toSeq.toDF("item") - val numItemsSubset = itemSubset.count - - Seq(2, 3, 4).foreach { k => - val n = math.min(k, numUsers).toInt - val expectedUpToN = expected.mapValues(_.slice(0, n)) - val topUsers = model.recommendForItemSubset(itemSubset, k) - assert(topUsers.count() == numItemsSubset) - assert(topUsers.columns.contains("item")) - checkRecommendations(topUsers, expectedUpToN, "user") - } - } - - test("subset recommendations eliminate duplicate ids, returns same results as unique ids") { - val spark = this.spark - import spark.implicits._ - val model = getALSModel - val k = 2 - - val users = Seq(0, 1).toDF("user") - val dupUsers = Seq(0, 1, 0, 1).toDF("user") - val singleUserRecs = model.recommendForUserSubset(users, k) - val dupUserRecs = model.recommendForUserSubset(dupUsers, k) - .as[(Int, Seq[(Int, Float)])].collect().toMap - assert(singleUserRecs.count == dupUserRecs.size) - checkRecommendations(singleUserRecs, dupUserRecs, "item") - - val items = Seq(3, 4, 5).toDF("item") - val dupItems = Seq(3, 4, 5, 4, 5).toDF("item") - val singleItemRecs = model.recommendForItemSubset(items, k) - val dupItemRecs = model.recommendForItemSubset(dupItems, k) - .as[(Int, Seq[(Int, Float)])].collect().toMap - assert(singleItemRecs.count == 
dupItemRecs.size) - checkRecommendations(singleItemRecs, dupItemRecs, "user") - } - - test("subset recommendations on full input dataset equivalent to recommendForAll") { - val spark = this.spark - import spark.implicits._ - val model = getALSModel - val k = 2 - - val userSubset = model.userFactors.withColumnRenamed("id", "user").drop("features") - val userSubsetRecs = model.recommendForUserSubset(userSubset, k) - val allUserRecs = model.recommendForAllUsers(k).as[(Int, Seq[(Int, Float)])].collect().toMap - checkRecommendations(userSubsetRecs, allUserRecs, "item") - - val itemSubset = model.itemFactors.withColumnRenamed("id", "item").drop("features") - val itemSubsetRecs = model.recommendForItemSubset(itemSubset, k) - val allItemRecs = model.recommendForAllItems(k).as[(Int, Seq[(Int, Float)])].collect().toMap - checkRecommendations(itemSubsetRecs, allItemRecs, "user") - } - - test("ALS should not introduce unnecessary shuffle") { - def getShuffledDependencies(rdd: RDD[_]): Seq[ShuffleDependency[_, _, _]] = { - rdd.dependencies.flatMap { - case s: ShuffleDependency[_, _, _] => - Seq(s) ++ getShuffledDependencies(s.rdd) - case o => - Seq.empty ++ getShuffledDependencies(o.rdd) - } - } - - val spark = this.spark - import spark.implicits._ - val (ratings, _) = genExplicitTestData(numUsers = 2, numItems = 2, rank = 1) - val data = ratings.toDF - val model = new ALS() - .setMaxIter(2) - .setImplicitPrefs(true) - .setCheckpointInterval(-1) - .fit(data) - - val userFactors = model.userFactors - val itemFactors = model.itemFactors - val shuffledUserFactors = getShuffledDependencies(userFactors.rdd).filter { dep => - dep.rdd.name != null && dep.rdd.name.contains("userFactors") - } - val shuffledItemFactors = getShuffledDependencies(itemFactors.rdd).filter { dep => - dep.rdd.name != null && dep.rdd.name.contains("itemFactors") - } - assert(shuffledUserFactors.size == 0) - assert(shuffledItemFactors.size == 0) - } - - private def checkRecommendations( - topK: DataFrame, - expected: Map[Int, Seq[(Int, Float)]], - dstColName: String): Unit = { - val spark = this.spark - import spark.implicits._ - - assert(topK.columns.contains("recommendations")) - topK.as[(Int, Seq[(Int, Float)])].collect().foreach { case (id: Int, recs: Seq[(Int, Float)]) => - assert(recs === expected(id)) - } - topK.collect().foreach { row => - val recs = row.getAs[WrappedArray[Row]]("recommendations") - assert(recs(0).fieldIndex(dstColName) == 0) - assert(recs(0).fieldIndex("rating") == 1) - } - } -} - -class ALSCleanerSuite extends SparkFunSuite with BeforeAndAfterEach { - override def beforeEach(): Unit = { - super.beforeEach() - // Once `Utils.getOrCreateLocalRootDirs` is called, it is cached in `Utils.localRootDirs`. - // Unless this is manually cleared before and after a test, it returns the same directory - // set before even if 'spark.local.dir' is configured afterwards. 
- Utils.clearLocalRootDirs() - } - - override def afterEach(): Unit = { - Utils.clearLocalRootDirs() - super.afterEach() - } - - test("ALS shuffle cleanup standalone") { - val conf = new SparkConf() - val localDir = Utils.createTempDir() - val checkpointDir = Utils.createTempDir() - def getAllFiles: Set[File] = - FileUtils.listFiles(localDir, TrueFileFilter.INSTANCE, TrueFileFilter.INSTANCE).asScala.toSet - try { - conf.set("spark.local.dir", localDir.getAbsolutePath) - val sc = new SparkContext("local[2]", "test", conf) - try { - sc.setCheckpointDir(checkpointDir.getAbsolutePath) - // Test checkpoint and clean parents - val input = sc.parallelize(1 to 1000) - val keyed = input.map(x => (x % 20, 1)) - val shuffled = keyed.reduceByKey(_ + _) - val keysOnly = shuffled.keys - val deps = keysOnly.dependencies - keysOnly.count() - ALS.cleanShuffleDependencies(sc, deps, true) - val resultingFiles = getAllFiles - assert(resultingFiles === Set()) - // Ensure running count again works fine even if we kill the shuffle files. - keysOnly.count() - } finally { - sc.stop() - } - } finally { - Utils.deleteRecursively(localDir) - Utils.deleteRecursively(checkpointDir) - } - } - - test("ALS shuffle cleanup in algorithm") { - val conf = new SparkConf() - val localDir = Utils.createTempDir() - val checkpointDir = Utils.createTempDir() - def getAllFiles: Set[File] = - FileUtils.listFiles(localDir, TrueFileFilter.INSTANCE, TrueFileFilter.INSTANCE).asScala.toSet - try { - conf.set("spark.local.dir", localDir.getAbsolutePath) - val sc = new SparkContext("local[2]", "ALSCleanerSuite", conf) - try { - sc.setCheckpointDir(checkpointDir.getAbsolutePath) - // Generate test data - val (training, _) = ALSSuite.genImplicitTestData(sc, 20, 5, 1, 0.2, 0) - // Implicitly test the cleaning of parents during ALS training - val spark = SparkSession.builder - .sparkContext(sc) - .getOrCreate() - import spark.implicits._ - val als = new ALS() - .setRank(1) - .setRegParam(1e-5) - .setSeed(0) - .setCheckpointInterval(1) - .setMaxIter(7) - val model = als.fit(training.toDF()) - val resultingFiles = getAllFiles - // We expect the last shuffles files, block ratings, user factors, and item factors to be - // around but no more. 
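
Both this cleanup test and the shuffle checks above hinge on walking an RDD's dependency graph for ShuffleDependency ancestors. A minimal self-contained version of that walk, run against a throwaway local SparkContext with an arbitrary sample job, might look like this.

```
import org.apache.spark.{Dependency, ShuffleDependency, SparkConf, SparkContext}
import org.apache.spark.rdd.RDD

object ShuffleDepsSketch {
  // Recursively collect the shuffle ids of all ShuffleDependency ancestors.
  def shuffleIds(rdd: RDD[_]): Seq[Int] = {
    def walk(deps: Seq[Dependency[_]]): Seq[Int] = deps.flatMap {
      case s: ShuffleDependency[_, _, _] => s.shuffleId +: walk(s.rdd.dependencies)
      case other                         => walk(other.rdd.dependencies)
    }
    walk(rdd.dependencies)
  }

  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[2]").setAppName("deps-sketch"))
    try {
      val counts = sc.parallelize(1 to 1000).map(x => (x % 20, 1)).reduceByKey(_ + _)
      counts.count()
      println(s"shuffle ids reachable from counts.keys: ${shuffleIds(counts.keys)}")
    } finally {
      sc.stop()
    }
  }
}
```
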
- val pattern = "shuffle_(\\d+)_.+\\.data".r - val rddIds = resultingFiles.flatMap { f => - pattern.findAllIn(f.getName()).matchData.map { _.group(1) } } - assert(rddIds.size === 4) - } finally { - sc.stop() - } - } finally { - Utils.deleteRecursively(localDir) - Utils.deleteRecursively(checkpointDir) - } - } -} - -class ALSStorageSuite - extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest with Logging { - - test("invalid storage params") { - intercept[IllegalArgumentException] { - new ALS().setIntermediateStorageLevel("foo") - } - intercept[IllegalArgumentException] { - new ALS().setIntermediateStorageLevel("NONE") - } - intercept[IllegalArgumentException] { - new ALS().setFinalStorageLevel("foo") - } - } - - test("default and non-default storage params set correct RDD StorageLevels") { - val spark = this.spark - import spark.implicits._ - val data = Seq( - (0, 0, 1.0), - (0, 1, 2.0), - (1, 2, 3.0), - (1, 0, 2.0) - ).toDF("user", "item", "rating") - val als = new ALS().setMaxIter(1).setRank(1) - // add listener to check intermediate RDD default storage levels - val defaultListener = new IntermediateRDDStorageListener - sc.addSparkListener(defaultListener) - val model = als.fit(data) - // check final factor RDD default storage levels - val defaultFactorRDDs = sc.getPersistentRDDs.collect { - case (id, rdd) if rdd.name == "userFactors" || rdd.name == "itemFactors" => - rdd.name -> ((id, rdd.getStorageLevel)) - }.toMap - defaultFactorRDDs.foreach { case (_, (id, level)) => - assert(level == StorageLevel.MEMORY_AND_DISK) - } - defaultListener.storageLevels.foreach(level => assert(level == StorageLevel.MEMORY_AND_DISK)) - - // add listener to check intermediate RDD non-default storage levels - val nonDefaultListener = new IntermediateRDDStorageListener - sc.addSparkListener(nonDefaultListener) - val nonDefaultModel = als - .setFinalStorageLevel("MEMORY_ONLY") - .setIntermediateStorageLevel("DISK_ONLY") - .fit(data) - // check final factor RDD non-default storage levels - val levels = sc.getPersistentRDDs.collect { - case (id, rdd) if rdd.name == "userFactors" && rdd.id != defaultFactorRDDs("userFactors")._1 - || rdd.name == "itemFactors" && rdd.id != defaultFactorRDDs("itemFactors")._1 => - rdd.getStorageLevel - } - levels.foreach(level => assert(level == StorageLevel.MEMORY_ONLY)) - nonDefaultListener.storageLevels.foreach(level => assert(level == StorageLevel.DISK_ONLY)) - } -} - -private class IntermediateRDDStorageListener extends SparkListener { - - val storageLevels: mutable.ArrayBuffer[StorageLevel] = mutable.ArrayBuffer() - - override def onStageCompleted(stageCompleted: SparkListenerStageCompleted): Unit = { - val stageLevels = stageCompleted.stageInfo.rddInfos.collect { - case info if info.name.contains("Blocks") || info.name.contains("Factors-") => - info.storageLevel - } - storageLevels ++= stageLevels - } - -} - -object IntelALSSuite extends Logging { - - /** - * Mapping from all Params to valid settings which differ from the defaults. - * This is useful for tests which need to exercise all Params, such as save/load. - * This excludes input columns to simplify some tests. - */ - val allModelParamSettings: Map[String, Any] = Map( - "predictionCol" -> "myPredictionCol" - ) - - /** - * Mapping from all Params to valid settings which differ from the defaults. - * This is useful for tests which need to exercise all Params, such as save/load. - * This excludes input columns to simplify some tests. 
-   */
-  val allEstimatorParamSettings: Map[String, Any] = allModelParamSettings ++ Map(
-    "maxIter" -> 1,
-    "rank" -> 1,
-    "regParam" -> 0.01,
-    "numUserBlocks" -> 2,
-    "numItemBlocks" -> 2,
-    "implicitPrefs" -> true,
-    "alpha" -> 0.9,
-    "nonnegative" -> true,
-    "checkpointInterval" -> 20,
-    "intermediateStorageLevel" -> "MEMORY_ONLY",
-    "finalStorageLevel" -> "MEMORY_AND_DISK_SER"
-  )
-
-  // Helper functions to generate test data we share between ALS test suites
-
-  /**
-   * Generates random user/item factors, with i.i.d. values drawn from U(a, b).
-   * @param size number of users/items
-   * @param rank number of features
-   * @param random random number generator
-   * @param a min value of the support (default: -1)
-   * @param b max value of the support (default: 1)
-   * @return a sequence of (ID, factors) pairs
-   */
-  private def genFactors(
-      size: Int,
-      rank: Int,
-      random: Random,
-      a: Float = -1.0f,
-      b: Float = 1.0f): Seq[(Int, Array[Float])] = {
-    require(size > 0 && size < Int.MaxValue / 3)
-    require(b > a)
-    val ids = mutable.Set.empty[Int]
-    while (ids.size < size) {
-      ids += random.nextInt()
-    }
-    val width = b - a
-    ids.toSeq.sorted.map(id => (id, Array.fill(rank)(a + random.nextFloat() * width)))
-  }
-
-  /**
-   * Generates an implicit feedback dataset for testing ALS.
-   *
-   * @param sc SparkContext
-   * @param numUsers number of users
-   * @param numItems number of items
-   * @param rank rank
-   * @param noiseStd the standard deviation of additive Gaussian noise on training data
-   * @param seed random seed
-   * @return (training, test)
-   */
-  def genImplicitTestData(
-      sc: SparkContext,
-      numUsers: Int,
-      numItems: Int,
-      rank: Int,
-      noiseStd: Double = 0.0,
-      seed: Long = 11L): (RDD[Rating[Int]], RDD[Rating[Int]]) = {
-    // The assumption of the implicit feedback model is that unobserved ratings are more likely to
-    // be negatives.
-    val positiveFraction = 0.8
-    val negativeFraction = 1.0 - positiveFraction
-    val trainingFraction = 0.6
-    val testFraction = 0.3
-    val totalFraction = trainingFraction + testFraction
-    val random = new Random(seed)
-    val userFactors = genFactors(numUsers, rank, random)
-    val itemFactors = genFactors(numItems, rank, random)
-    val training = ArrayBuffer.empty[Rating[Int]]
-    val test = ArrayBuffer.empty[Rating[Int]]
-    for ((userId, userFactor) <- userFactors; (itemId, itemFactor) <- itemFactors) {
-      val rating = blas.sdot(rank, userFactor, 1, itemFactor, 1)
-      val threshold = if (rating > 0) positiveFraction else negativeFraction
-      val observed = random.nextDouble() < threshold
-      if (observed) {
-        val x = random.nextDouble()
-        if (x < totalFraction) {
-          if (x < trainingFraction) {
-            val noise = noiseStd * random.nextGaussian()
-            training += Rating(userId, itemId, rating + noise.toFloat)
-          } else {
-            test += Rating(userId, itemId, rating)
-          }
-        }
-      }
-    }
-    logInfo(s"Generated an implicit feedback dataset with ${training.size} ratings for training " +
-      s"and ${test.size} for test.")
-    (sc.parallelize(training, 2), sc.parallelize(test, 2))
-  }
-}
diff --git a/mllib-dal/test-cluster.sh b/mllib-dal/test-cluster.sh
deleted file mode 100755
index 4f5a6132a..000000000
--- a/mllib-dal/test-cluster.sh
+++ /dev/null
@@ -1,5 +0,0 @@
-#!/usr/bin/env bash
-
-cd ../dev/test-cluster/workloads
-
-./run-kmeans-pyspark.sh
diff --git a/mllib-dal/test.sh b/mllib-dal/test.sh
index 0157c22a4..f7e73ca1f 100755
--- a/mllib-dal/test.sh
+++ b/mllib-dal/test.sh
@@ -35,9 +35,8 @@ export LD_PRELOAD=$JAVA_HOME/jre/lib/amd64/libjsig.so
 # -Dtest=none to turn off the Java tests
 
 # Test all
-# mvn -Dtest=none -Dmaven.test.skip=false test
+mvn -Dtest=none -Dmaven.test.skip=false test
 
 # Individual test
-mvn -Dtest=none -DwildcardSuites=org.apache.spark.ml.clustering.IntelKMeansSuite test
-mvn -Dtest=none -DwildcardSuites=org.apache.spark.ml.feature.IntelPCASuite test
-# mvn -Dtest=none -DwildcardSuites=org.apache.spark.ml.recommendation.IntelALSSuite test
+# mvn -Dtest=none -DwildcardSuites=org.apache.spark.ml.clustering.IntelKMeansSuite test
+# mvn -Dtest=none -DwildcardSuites=org.apache.spark.ml.feature.IntelPCASuite test