Skip to content
This repository has been archived by the owner on Sep 18, 2023. It is now read-only.

[NSE-186]backport to 1.1 branch batch3 #252

Merged
merged 8 commits into from
Apr 19, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/unittests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ jobs:
cd native-sql-engine/cpp/
mkdir -p build
cd build
cmake .. -DTESTS=1
cmake .. -DBUILD_ARROW=0 -DTESTS=1
make
cd src
ctest -R
Expand Down
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ We implemented common operators based on Apache Arrow Compute and Gandiva. The S

We implemented columnar shuffle to improve the shuffle performance. With the columnar layout we could do very efficient data compression for different data format.

Please check the operator support details [here](./docs/operators.md)

## Build the Plugin

### Building by Conda
Expand Down
34 changes: 33 additions & 1 deletion arrow-data-source/common/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,41 @@
<groupId>com.intel.oap</groupId>
<version>1.1.0</version>
</parent>

<modelVersion>4.0.0</modelVersion>

<artifactId>spark-arrow-datasource-common</artifactId>

<dependencies>
<dependency>
<groupId>org.apache.arrow</groupId>
<artifactId>arrow-memory-netty</artifactId>
<version>${arrow.version}</version>
</dependency>
<dependency>
<groupId>org.apache.arrow</groupId>
<artifactId>arrow-dataset</artifactId>
<version>${arrow.version}</version>
<exclusions>
<exclusion>
<groupId>io.netty</groupId>
<artifactId>netty-common</artifactId>
</exclusion>
<exclusion>
<groupId>io.netty</groupId>
<artifactId>netty-buffer</artifactId>
</exclusion>
<exclusion>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-core</artifactId>
</exclusion>
<exclusion>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-annotations</artifactId>
</exclusion>
</exclusions>
<scope>compile</scope>
</dependency>
</dependencies>
<build>
<sourceDirectory>${project.basedir}/src/main/scala</sourceDirectory>
<testSourceDirectory>${project.basedir}/src/test/scala</testSourceDirectory>
Expand Down
77 changes: 46 additions & 31 deletions arrow-data-source/pom.xml
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion>

<parent>
<groupId>com.intel.oap</groupId>
<artifactId>native-sql-engine-parent</artifactId>
<version>1.1.0</version>
</parent>

<modelVersion>4.0.0</modelVersion>
<groupId>com.intel.oap</groupId>
<artifactId>spark-arrow-datasource</artifactId>
<name>OAP Project Spark Arrow Datasource</name>
Expand All @@ -20,6 +24,11 @@
<arrow.version>3.0.0</arrow.version>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
<script.dir>${arrow.script.dir}</script.dir>
<datasource.cpp_tests>${cpp_tests}</datasource.cpp_tests>
<datasource.build_arrow>${build_arrow}</datasource.build_arrow>
<datasource.static_arrow>${static_arrow}</datasource.static_arrow>
<datasource.arrow_root>${arrow_root}</datasource.arrow_root>
</properties>

<repositories>
Expand All @@ -39,35 +48,6 @@
</pluginRepositories>

<dependencies>
<dependency>
<groupId>org.apache.arrow</groupId>
<artifactId>arrow-memory-netty</artifactId>
<version>${arrow.version}</version>
</dependency>
<dependency>
<groupId>org.apache.arrow</groupId>
<artifactId>arrow-dataset</artifactId>
<version>${arrow.version}</version>
<exclusions>
<exclusion>
<groupId>io.netty</groupId>
<artifactId>netty-common</artifactId>
</exclusion>
<exclusion>
<groupId>io.netty</groupId>
<artifactId>netty-buffer</artifactId>
</exclusion>
<exclusion>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-core</artifactId>
</exclusion>
<exclusion>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-annotations</artifactId>
</exclusion>
</exclusions>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
Expand Down Expand Up @@ -123,6 +103,41 @@

<build>
<plugins>
<plugin>
<artifactId>exec-maven-plugin</artifactId>
<groupId>org.codehaus.mojo</groupId>
<version>1.6.0</version>
<inherited>false</inherited>
<executions>
<execution>
<id>Build arrow</id>
<phase>generate-resources</phase>
<goals>
<goal>exec</goal>
</goals>
<configuration>
<executable>bash</executable>
<arguments>
<argument>${script.dir}/build_arrow.sh</argument>
<argument>--tests=${datasource.cpp_tests}</argument>
<argument>--build_arrow=${datasource.build_arrow}</argument>
<argument>--static_arrow=${datasource.static_arrow}</argument>
<argument>--arrow_root=${datasource.arrow_root}</argument>
</arguments>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<artifactId>maven-clean-plugin</artifactId>
<configuration>
<filesets>
<fileset>
<directory>${script.dir}/build</directory>
</fileset>
</filesets>
</configuration>
</plugin>
<plugin>
<groupId>org.scalatest</groupId>
<artifactId>scalatest-maven-plugin</artifactId>
Expand Down
106 changes: 106 additions & 0 deletions arrow-data-source/script/build_arrow.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
#!/bin/bash
# Build the Arrow native libraries (C++ + JNI Java bindings) from the
# oap-project fork, or point the build at an existing installation.
#
# Flags (all of the form -x=VALUE / --long=VALUE):
#   -t / --tests         TESTS        pass-through test toggle (default OFF)
#   -a / --build_arrow   BUILD_ARROW  "ON" = clone & build Arrow from source
#   -s / --static_arrow  STATIC_ARROW currently only echoed (default OFF)
#   -ar / --arrow_root   ARROW_ROOT   prebuilt Arrow prefix used when
#                                     BUILD_ARROW != ON (default /usr/local)

set -eu

NPROC=$(nproc)

TESTS=OFF
BUILD_ARROW=OFF
STATIC_ARROW=OFF
ARROW_ROOT=/usr/local
# fix: initialize explicitly — with `set -u`, `+=` on an unset array aborts
# the script on bash < 4.4.
OTHER_ARGUMENTS=()

for arg in "$@"
do
case $arg in
    -t=*|--tests=*)
    # fix: plain scalar assignment; VAR=(...) created a one-element array
    # that only worked by accident when later expanded unquoted.
    TESTS="${arg#*=}"
    ;;
    -a=*|--build_arrow=*)
    BUILD_ARROW="${arg#*=}"
    ;;
    -s=*|--static_arrow=*)
    STATIC_ARROW="${arg#*=}"
    ;;
    -ar=*|--arrow_root=*)
    ARROW_ROOT="${arg#*=}"
    ;;
    *)
    # fix: was "$1" — the loop iterates a saved copy of "$@", and the old
    # interleaved `shift`s desynchronized $1 from the current argument, so
    # the wrong value (or none) was recorded. The `shift`s were also no-ops
    # for the iteration itself and have been dropped.
    OTHER_ARGUMENTS+=("$arg")
    ;;
esac
done

echo "CMAKE Arguments:"
echo "TESTS=${TESTS}"
echo "BUILD_ARROW=${BUILD_ARROW}"
echo "STATIC_ARROW=${STATIC_ARROW}"
echo "ARROW_ROOT=${ARROW_ROOT}"

# Resolve the directory this script lives in, so relative paths work no
# matter where the script is invoked from.
CURRENT_DIR=$(cd "$(dirname "$BASH_SOURCE")"; pwd)
echo $CURRENT_DIR

cd ${CURRENT_DIR}
# Always start from a clean build tree.
if [ -d build ]; then
    rm -r build
fi

# fix: quote the variable — an empty value would otherwise be a syntax
# error inside `[ ... ]`.
if [ "$BUILD_ARROW" == "ON" ]; then
  echo "Building Arrow from Source ..."
  mkdir build
  cd build
  ARROW_PREFIX="${CURRENT_DIR}/build" # Use build directory as ARROW_PREFIX
  ARROW_SOURCE_DIR="${ARROW_PREFIX}/arrow_ep"
  ARROW_INSTALL_DIR="${ARROW_PREFIX}/arrow_install"

  echo "ARROW_PREFIX=${ARROW_PREFIX}"
  echo "ARROW_SOURCE_DIR=${ARROW_SOURCE_DIR}"
  echo "ARROW_INSTALL_DIR=${ARROW_INSTALL_DIR}"
  mkdir -p $ARROW_SOURCE_DIR
  mkdir -p $ARROW_INSTALL_DIR
  git clone https://github.com/oap-project/arrow.git --branch arrow-3.0.0-oap $ARROW_SOURCE_DIR
  pushd $ARROW_SOURCE_DIR

  # Configure an in-tree build of the Arrow C++ library with the feature
  # set the native SQL engine depends on (Gandiva, Dataset, Parquet, JNI).
  cmake ./cpp \
        -DARROW_BUILD_STATIC=OFF -DARROW_BUILD_SHARED=ON -DARROW_COMPUTE=ON \
        -DARROW_S3=ON \
        -DARROW_GANDIVA_JAVA=ON \
        -DARROW_GANDIVA=ON \
        -DARROW_PARQUET=ON \
        -DARROW_HDFS=ON \
        -DARROW_BOOST_USE_SHARED=OFF \
        -DARROW_JNI=ON \
        -DARROW_DATASET=ON \
        -DARROW_WITH_PROTOBUF=ON \
        -DARROW_WITH_SNAPPY=ON \
        -DARROW_WITH_LZ4=ON \
        -DARROW_WITH_ZSTD=OFF \
        -DARROW_WITH_BROTLI=OFF \
        -DARROW_WITH_ZLIB=OFF \
        -DARROW_WITH_FASTPFOR=ON \
        -DARROW_FILESYSTEM=ON \
        -DARROW_JSON=ON \
        -DARROW_CSV=ON \
        -DARROW_FLIGHT=OFF \
        -DARROW_JEMALLOC=ON \
        -DARROW_SIMD_LEVEL=AVX2 \
        -DARROW_RUNTIME_SIMD_LEVEL=MAX \
        -DARROW_DEPENDENCY_SOURCE=BUNDLED \
        -DCMAKE_INSTALL_PREFIX=${ARROW_INSTALL_DIR} \
        -DCMAKE_INSTALL_LIBDIR=lib

  make -j$NPROC
  make install

  # Build the Arrow Java/JNI bindings against the freshly installed C++ libs.
  cd java
  mvn clean install -P arrow-jni -am -Darrow.cpp.build.dir=${ARROW_INSTALL_DIR}/lib -DskipTests -Dcheckstyle.skip
  popd
  echo "Finish to build Arrow from Source !!!"
else
  echo "Use ARROW_ROOT as Arrow Library Path"
  echo "ARROW_ROOT=${ARROW_ROOT}"
fi
File renamed without changes.
30 changes: 30 additions & 0 deletions docs/operators.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
| No. | Executor | Description | Notes | BOOLEAN | BYTE | SHORT | INT | LONG | FLOAT | DOUBLE | STRING | DECIMAL | DATE | TIMESTAMP | NULL | BINARY | CALENDAR | ARRAY | MAP | STRUCT | UDT |
| --- | --------------------------- | ----------- | ----------------- | ------- | ---- | ----- | --- | ---- | ----- | ------ | ------ | ------- | ---- | --------- | ---- | ------ | -------- | ----- | --- | ------ | --- |
| 1 | CoalesceExec | | | y | y | y | y | y | y | y | y | y | y | | | | | | | | |
| 2 | CollectLimitExec | | using row version | | | | | | | | | | | | | | | | | | |
| 3 | ExpandExec | | | y | y | y | y | y | y | y | y | y | y | | | | | | | | |
| 4 | FileSourceScanExec | | | y | y | y | y | y | y | y | y | y | y | | | | | | | | |
| 5 | FilterExec | | | y | y | y | y | y | y | y | y | y | y | | | | | | | | |
| 6 | GenerateExec | | using row version | | | | | | | | | | | | | | | | | | |
| 7 | GlobalLimitExec | | using row version | | | | | | | | | | | | | | | | | | |
| 8 | LocalLimitExec | | | | | | | | | | | | | | | | | | | | |
| 9 | ProjectExec | | | y | y | y | y | y | y | y | y | y | y | | | | | | | | |
| 10 | RangeExec | | using row version | | | | | | | | | | | | | | | | | | |
| 11 | SortExec | | | y | y | y | y | y | y | y | y | y | y | | | | | | | | |
| 12 | TakeOrderedAndProjectExec | | using row version | | | | | | | | | | | | | | | | | | |
| 13 | UnionExec | | | y | y | y | y | y | y | y | y | y | y | | | | | | | | |
| 14 | CustomShuffleReaderExec | | | y | y | y | y | y | y | y | y | y | y | | | | | | | | |
| 15 | HashAggregateExec | | | y | y | y | y | y | y | y | y | y | y | | | | | | | | |
| 16 | SortAggregateExec | | | y | y | y | y | y | y | y | y | y | y | | | | | | | | |
| 17 | DataWritingCommandExec | | using row version | | | | | | | | | | | | | | | | | | |
| 18 | BatchScanExec | | | y | y | y | y | y | y | y | y | y | y | | | | | | | | |
| 19 | BroadcastExchangeExec | | | y | y | y | y | y | y | y | y | y | y | | | | | | | | |
| 20 | ShuffleExchangeExec | | | y | y | y | y | y | y | y | y | y | y | | | | | | | | |
| 21 | BroadcastHashJoinExec | | | y | y | y | y | y | y | y | y | y | y | | | | | | | | |
| 22 | BroadcastNestedLoopJoinExec | | using row version | | | | | | | | | | | | | | | | | | |
| 23 | CartesianProductExec | | using row version | | | | | | | | | | | | | | | | | | |
| 24 | ShuffledHashJoinExec | | | y | y | y | y | y | y | y | y | y | y | | | | | | | | |
| 25 | SortMergeJoinExec | | | y | y | y | y | y | y | y | y | y | y | | | | | | | | |
| 26 | ArrowEvalPythonExec | | using row version | | | | | | | | | | | | | | | | | | |
| 27 | WindowInPandasExec | | using row version | | | | | | | | | | | | | | | | | | |
| 28 | WindowExec | | | y | y | y | y | y | y | y | y | y | y | | | | | | | | |
36 changes: 24 additions & 12 deletions native-sql-engine/core/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,12 @@
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>

<parent>
<groupId>com.intel.oap</groupId>
<artifactId>native-sql-engine-parent</artifactId>
<version>1.1.0</version>
<relativePath>../../pom.xml</relativePath>
</parent>

<groupId>com.intel.oap</groupId>
<artifactId>spark-columnar-core</artifactId>
Expand All @@ -36,11 +41,12 @@
<hive.parquet.group>com.twitter</hive.parquet.group>
<parquet.deps.scope>provided</parquet.deps.scope>
<jars.target.dir>${project.build.directory}/scala-${scala.binary.version}/jars</jars.target.dir>
<cpp_tests>OFF</cpp_tests>
<build_arrow>ON</build_arrow>
<static_arrow>OFF</static_arrow>
<build_protobuf>ON</build_protobuf>
<arrow_root>/usr/local</arrow_root>
<nativesql.cpp_tests>${cpp_tests}</nativesql.cpp_tests>
<nativesql.build_arrow>OFF</nativesql.build_arrow>
<nativesql.static_arrow>${static_arrow}</nativesql.static_arrow>
<nativesql.arrow.bfs.install.dir>${project.basedir}/../../arrow-data-source/script/build/arrow_install</nativesql.arrow.bfs.install.dir>
<nativesql.arrow_root>${arrow_root}</nativesql.arrow_root>
<nativesql.build_protobuf>${build_protobuf}</nativesql.build_protobuf>
</properties>
<dependencies>
<!-- Prevent our dummy JAR from being included in Spark distributions or uploaded to YARN -->
Expand Down Expand Up @@ -304,16 +310,22 @@
<executable>bash</executable>
<arguments>
<argument>${cpp.dir}/compile.sh</argument>
<argument>${cpp_tests}</argument>
<argument>${build_arrow}</argument>
<argument>${static_arrow}</argument>
<argument>${build_protobuf}</argument>
<argument>${arrow_root}</argument>
<argument>${nativesql.cpp_tests}</argument>
<argument>${nativesql.build_arrow}</argument>
<argument>${nativesql.static_arrow}</argument>
<argument>${nativesql.build_protobuf}</argument>
<argument>${nativesql.arrow_root}</argument>
<argument>${nativesql.arrow.bfs.install.dir}</argument>
</arguments>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-resources-plugin</artifactId>
<version>3.0.1</version>
</plugin>
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
Expand Down Expand Up @@ -376,7 +388,7 @@
<version>1.0.0</version>
<configuration>
<verbose>false</verbose>
<failOnViolation>true</failOnViolation>
<failOnViolation>false</failOnViolation>
<includeTestSourceDirectory>true</includeTestSourceDirectory>
<failOnWarning>false</failOnWarning>
<sourceDirectory>${project.basedir}/src/main/scala</sourceDirectory>
Expand Down
Loading