Commit

Merge branch 'develop'
kupferk committed Dec 13, 2021
2 parents 27de909 + 3d77b6d commit c2d2748
Showing 405 changed files with 52,860 additions and 5,794 deletions.
22 changes: 20 additions & 2 deletions .gitlab-ci.yml
@@ -110,9 +110,9 @@ build-hadoop3.2-spark3.0:

build-hadoop2.7-spark3.1:
  stage: build
  script: 'mvn ${MAVEN_CLI_OPTS} clean package -Phadoop-2.7 -Pspark-3.0 -Ddockerfile.skip'
  script: 'mvn ${MAVEN_CLI_OPTS} clean package -Phadoop-2.7 -Pspark-3.1 -Ddockerfile.skip'
  artifacts:
    name: "flowman-dist-hadoop3.1-spark3.0"
    name: "flowman-dist-hadoop3.1-spark3.1"
    paths:
      - flowman-dist/target/flowman-dist-*-bin.tar.gz
    expire_in: 5 days
@@ -126,6 +126,24 @@ build-hadoop3.2-spark3.1:
      - flowman-dist/target/flowman-dist-*-bin.tar.gz
    expire_in: 5 days

build-hadoop2.7-spark3.2:
  stage: build
  script: 'mvn ${MAVEN_CLI_OPTS} clean package -Phadoop-2.7 -Pspark-3.2 -Ddockerfile.skip'
  artifacts:
    name: "flowman-dist-hadoop3.1-spark3.2"
    paths:
      - flowman-dist/target/flowman-dist-*-bin.tar.gz
    expire_in: 5 days

build-hadoop3.3-spark3.2:
  stage: build
  script: 'mvn ${MAVEN_CLI_OPTS} clean package -Phadoop-3.3 -Pspark-3.2 -Ddockerfile.skip'
  artifacts:
    name: "flowman-dist-hadoop3.3-spark3.2"
    paths:
      - flowman-dist/target/flowman-dist-*-bin.tar.gz
    expire_in: 5 days

build-cdh6.3:
  stage: build
  image: dimajix/maven-npm:jdk-1.8
11 changes: 11 additions & 0 deletions .readthedocs.yaml
@@ -0,0 +1,11 @@
version: 2

# Build from the docs/ directory with Sphinx
sphinx:
  configuration: docs/conf.py

# Explicitly set the version of Python and its requirements
python:
  version: 3.7
  install:
    - requirements: docs/requirements.txt
21 changes: 21 additions & 0 deletions BUILDING.md
@@ -93,12 +93,14 @@ using the correct version. The following profiles are available:
* spark-2.4
* spark-3.0
* spark-3.1
* spark-3.2
* hadoop-2.6
* hadoop-2.7
* hadoop-2.8
* hadoop-2.9
* hadoop-3.1
* hadoop-3.2
* hadoop-3.3
* CDH-6.3
* CDP-7.1

@@ -154,12 +156,31 @@ mvn clean install -Pspark-3.0 -Phadoop-3.1
mvn clean install -Pspark-3.0 -Phadoop-3.2
```

### Spark 3.1 and Hadoop 2.7

```shell
mvn clean install -Pspark-3.1 -Phadoop-2.7
```

### Spark 3.1 and Hadoop 3.2

```shell
mvn clean install -Pspark-3.1 -Phadoop-3.2
```

### Spark 3.2 and Hadoop 2.7

```shell
mvn clean install -Pspark-3.2 -Phadoop-2.7
```

### Spark 3.2 and Hadoop 3.3

```shell
mvn clean install -Pspark-3.2 -Phadoop-3.3
```


## Building for Cloudera

The Maven project also contains preconfigured profiles for Cloudera CDH 6.3 and for Cloudera CDP 7.1.
15 changes: 15 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,18 @@
# Version 0.19.0 - 2021-12-13

* Add build profile for Hadoop 3.3
* Add build profile for Spark 3.2
* Allow SQL expressions as dimensions in `aggregate` mapping
* Update Hive views when the resulting schema would change
* Add new `mapping cache` command to FlowShell
* Support embedded connection definitions
* Much improved Flowman History Server
* Fix wrong metric names with TemplateTarget
* Implement more `template` types for `connection`, `schema`, `dataset`, `assertion` and `measure`
* Implement new `measure` target for creating custom metrics for measuring data quality
* Add new config option `flowman.execution.mapping.parallelism`


# Version 0.18.0 - 2021-10-13

* Improve automatic schema migration for Hive and JDBC relations
10 changes: 5 additions & 5 deletions QUICKSTART.md
@@ -16,16 +16,16 @@ Fortunately, Spark is rather simple to install locally on your machine:

### Download & Install Spark

As of this writing, the latest release of Flowman is 0.14.2 and is available prebuilt for Spark 3.0.1 on the Spark
As of this writing, the latest release of Flowman is 0.18.0 and is available prebuilt for Spark 3.1.2 on the Spark
homepage. So we download the appropriate Spark distribution from the Apache archive and unpack it.

```shell
# Create a nice playground which doesn't mess up your system
mkdir playground
cd playground

# Download and unpack Spark & Hadoop
curl -L https://archive.apache.org/dist/spark/spark-3.0.1/spark-3.0.1-bin-hadoop3.2.tgz | tar xvzf -

# Create a nice link
ln -snf spark-3.0.1-bin-hadoop3.2 spark
curl -L https://archive.apache.org/dist/spark/spark-3.1.2/spark-3.1.2-bin-hadoop3.2.tgz | tar xvzf -

# Create a nice link
ln -snf spark-3.1.2-bin-hadoop3.2 spark
```

The Spark package already contains Hadoop, so with this single download you already have both installed and integrated with each other.
@@ -37,8 +37,8 @@ You find prebuilt Flowman packages on the corresponding release page on GitHub.

```shell
# Download and unpack Flowman
curl -L https://github.com/dimajix/flowman/releases/download/0.14.2/flowman-dist-0.14.2-oss-spark3.0-hadoop3.2-bin.tar.gz | tar xvzf -

# Create a nice link
ln -snf flowman-0.14.2 flowman
curl -L https://github.com/dimajix/flowman/releases/download/0.18.0/flowman-dist-0.18.0-oss-spark3.1-hadoop3.2-bin.tar.gz | tar xvzf -

# Create a nice link
ln -snf flowman-0.18.0 flowman
```

### Flowman Configuration
33 changes: 30 additions & 3 deletions build-release.sh
@@ -1,23 +1,50 @@
#!/usr/bin/env bash

rm -rf release
mkdir release
FLOWMAN_VERSION=$(mvn -q -N help:evaluate -Dexpression=project.version -DforceStdout)
echo "Building Flowman release version ${FLOWMAN_VERSION}"

mkdir -p release


build_profile() {
    profiles=""
    for p in $@
    do
        profiles="$profiles -P$p"
    done
    mvn clean install $profiles -DskipTests -Ddockerfile.skip

    # Set new version
    HADOOP_DIST=$(mvn $profiles -q -N help:evaluate -Dexpression=hadoop.dist -DforceStdout)
    SPARK_API_VERSION=$(mvn $profiles -q -N help:evaluate -Dexpression=spark-api.version -DforceStdout)
    HADOOP_API_VERSION=$(mvn $profiles -q -N help:evaluate -Dexpression=hadoop-api.version -DforceStdout)

    echo "Building for dist $HADOOP_DIST with Spark $SPARK_API_VERSION and Hadoop $HADOOP_API_VERSION"
    mvn -q versions:set -DnewVersion=${FLOWMAN_VERSION}-${HADOOP_DIST}-spark${SPARK_API_VERSION}-hadoop${HADOOP_API_VERSION}

    #mvn clean deploy $profiles -DskipTests -Dflowman.dist.suffix=""
    mvn clean install $profiles -DskipTests -Dflowman.dist.suffix=""
    cp flowman-dist/target/flowman-dist-*.tar.gz release

    # Revert to original version
    mvn -q versions:revert
}


export JAVA_HOME=/usr/lib/jvm/java-1.8.0
build_profile hadoop-2.6 spark-2.4
build_profile hadoop-2.7 spark-2.4

export JAVA_HOME=
build_profile hadoop-2.7 spark-3.0
build_profile hadoop-3.2 spark-3.0
build_profile hadoop-2.7 spark-3.1
build_profile hadoop-3.2 spark-3.1
build_profile hadoop-2.7 spark-3.2
build_profile hadoop-3.3 spark-3.2

export JAVA_HOME=/usr/lib/jvm/java-1.8.0
build_profile CDH-6.3
build_profile CDP-7.1

# Finally build default version
mvn clean install -DskipTests
2 changes: 1 addition & 1 deletion docker/Dockerfile
@@ -1,4 +1,4 @@
FROM openjdk:8
FROM openjdk:11
MAINTAINER k.kupferschmidt@dimajix.de

ARG DIST_FILE
3 changes: 3 additions & 0 deletions docker/conf/flowman-env.sh
@@ -24,3 +24,6 @@ if [[ "$AWS_ACCESS_KEY_ID" != "" ]]; then
--conf spark.hadoop.fs.s3a.secret.key=${AWS_SECRET_ACCESS_KEY}
$SPARK_OPTS"
fi

# Allocate somewhat more memory for driver
SPARK_DRIVER_MEMORY="4G"
13 changes: 6 additions & 7 deletions docker/docker-compose.yml
@@ -2,15 +2,14 @@ version: "3"

services:
  flowman:
    image: dimajix/flowman:${project.version}-${hadoop.dist}-spark${spark-api.version}-hadoop${hadoop-api.version}
    image: dimajix/flowman:${project.version}${flowman.dist.suffix}
    command: bash
    environment:
      - http_proxy=${http_proxy}
      - https_proxy=${https_proxy}
      - SPARK_MASTER=local[10]
      - HDFS_DEFAULT_FS=
      - AWS_ACCESS_KEY_ID=
      - AWS_SECRET_ACCESS_KEY=
      - S3_ENDPOINT=s3.eu-central-1.amazonaws.com
      - S3_PROXY_HOST=
      - S3_PROXY_PORT=
      #- AWS_ACCESS_KEY_ID=
      #- AWS_SECRET_ACCESS_KEY=
      #- S3_ENDPOINT=s3.eu-central-1.amazonaws.com
      #- S3_PROXY_HOST=
      #- S3_PROXY_PORT=-1
18 changes: 6 additions & 12 deletions docker/pom.xml
@@ -10,27 +10,21 @@
<parent>
<groupId>com.dimajix.flowman</groupId>
<artifactId>flowman-root</artifactId>
<version>0.18.0</version>
<version>0.19.0</version>
<relativePath>../pom.xml</relativePath>
</parent>

<properties>
<dist.tag>${project.version}-${hadoop.dist}-spark${spark-api.version}-hadoop${hadoop-api.version}</dist.tag>
<docker.base-image.repository>dimajix/spark</docker.base-image.repository>
<docker.base-image.version>${spark.version}</docker.base-image.version>
</properties>

<profiles>
<profile>
<id>CDH-6.3</id>
<properties>
<docker.base-image.version>2.4.5</docker.base-image.version>
<dockerfile.skip>true</dockerfile.skip>
</properties>
</profile>
<profile>
<id>CDP-7.1</id>
<properties>
<docker.base-image.version>2.4.5</docker.base-image.version>
<dockerfile.skip>true</dockerfile.skip>
</properties>
</profile>
</profiles>
@@ -53,7 +47,7 @@
<resource>
<directory>../flowman-dist/target</directory>
<includes>
<include>flowman-dist-${dist.tag}-bin.tar.gz</include>
<include>flowman-dist-${flowman.dist.label}-bin.tar.gz</include>
</includes>
<filtering>false</filtering>
</resource>
@@ -95,12 +89,12 @@
<repository>dimajix/flowman</repository>
<contextDirectory>target/build</contextDirectory>
<useMavenSettingsForAuth>true</useMavenSettingsForAuth>
<tag>${dist.tag}</tag>
<tag>${flowman.dist.label}</tag>
<pullNewerImage>false</pullNewerImage>
<buildArgs>
<BUILD_SPARK_VERSION>${spark.version}</BUILD_SPARK_VERSION>
<BUILD_HADOOP_VERSION>${hadoop-api.version}</BUILD_HADOOP_VERSION>
<DIST_FILE>flowman-dist-${dist.tag}-bin.tar.gz</DIST_FILE>
<DIST_FILE>flowman-dist-${flowman.dist.label}-bin.tar.gz</DIST_FILE>
<http_proxy>${env.http_proxy}</http_proxy>
<https_proxy>${env.https_proxy}</https_proxy>
</buildArgs>
65 changes: 63 additions & 2 deletions docs/cli/flowserver.md
@@ -1,3 +1,64 @@
# Flowman Server
# Flowman History Server

The Flowman server is used to provide both a REST interface and a small UI.
The Flowman History Server provides both a REST interface and a small UI which gives you an overview of jobs and
targets executed in the past. In order to use the History Server, you need to configure a *history database* in
the [Flowman namespace](../spec/namespace.md).

![Flowman History Server](../images/history-server.png)

## Configuration
The history database needs to be configured in the `default-namespace.yml` file and also (if used) in the
`history-server.yml` file. The first configuration will be used by [flowexec](flowexec.md) and [flowshell](flowshell.md)
while the second file will be picked up by the Flowman History Server.

```yaml
# The following definition provides a "run history" stored in a database. If nothing else is specified, the database
# is stored locally as a Derby database. If you do not want to use the history, you can simply remove the whole
# 'history' block from this file.
history:
  kind: jdbc
  connection: flowman_state
  retries: 3
  timeout: 1000

# The following connection is used by the history database above.
connections:
  flowman_state:
    driver: $System.getenv('FLOWMAN_HISTORY_DRIVER', 'org.apache.derby.jdbc.EmbeddedDriver')
    url: $System.getenv('FLOWMAN_HISTORY_URL', $String.concat('jdbc:derby:', $System.getenv('FLOWMAN_HOME'), '/flowman-history;create=true'))
    username: $System.getenv('FLOWMAN_HISTORY_USER', '')
    password: $System.getenv('FLOWMAN_HISTORY_PASSWORD', '')

# This section enables plugins. You may want to remove plugins which are of no use for you.
plugins:
  - flowman-mariadb
  - flowman-mysql
  - flowman-mssqlserver
```
Note that this configuration falls back to a Derby database when no other database is provided via the system
environment variables `FLOWMAN_HISTORY_*`. This setup is not recommended except for experimentation, as a Derby
database can only be accessed by a single client at a time (which prevents running the History Server and
flowexec/flowshell at the same time).

A more useful setup might look as follows:
```yaml
history:
  kind: jdbc
  retries: 3
  timeout: 1000
  connection:
    driver: "com.mysql.cj.jdbc.Driver"
    url: "jdbc:mysql://mysql-server.in.my.network/flowman_history"
    username: "flowman"
    password: "secret"

plugins:
  - flowman-mysql
```
Note that you also need to load the corresponding database plugin to provide the driver.


## Running

Once the History Server is configured, it can be started via `flowserver`. This will start a web server which listens
on port 8080, so you should be able to access the UI via [http://localhost:8080](http://localhost:8080).
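For illustration only (not part of the diff above), starting the server from an unpacked distribution might look like the following sketch; the install path is an assumption, and the `flowserver` launcher is assumed to live in the distribution's `bin` directory:

```shell
# Hypothetical install location - adjust to wherever the tarball was unpacked
export FLOWMAN_HOME=/opt/flowman

# Start the Flowman History Server; it picks up the history-server.yml
# configuration described above and serves the UI on port 8080
$FLOWMAN_HOME/bin/flowserver
```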
4 changes: 2 additions & 2 deletions docs/conf.py
@@ -61,9 +61,9 @@
# built documents.
#
# The short X.Y version.
version = '0.18'
version = '0.19'
# The full version, including alpha/beta/rc tags.
release = '0.18.0'
release = '0.19.0'

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
12 changes: 10 additions & 2 deletions docs/config.md
@@ -39,11 +39,19 @@ Configure the executor to use. The default `SimpleExecutor` will process all tar
sequentially. The alternative implementation `com.dimajix.flowman.execution.ParallelExecutor` will run multiple
targets in parallel (if they are not depending on each other)

- `flowman.execution.mapping.parallelism` *(type: int)* *(default: 1)*
The number of mappings to be processed in parallel. Increasing this number may help in scenarios where many
relations are read from and their initial setup is slow (for example due to slow directory listings). With the
default value of 1, the parallelism is completely disabled and a non-threaded code path is used instead.

- `flowman.execution.executor.parallelism` *(type: int)* *(default: 4)*
The number of targets to be executed in parallel, when the `ParallelExecutor` is used.

- `flowman.execution.scheduler.class` *(type: class)* *(default: `com.dimajix.flowman.execution.SimpleScheduler`)*
Configure the scheduler to use. The default `SimpleScheduler` will sort all targets according to their dependency.
- `flowman.execution.scheduler.class` *(type: class)* *(default: `com.dimajix.flowman.execution.DependencyScheduler`)*
Configure the scheduler to use, which essentially decides which target to build next.
- The default `DependencyScheduler` will sort all targets according to their dependency.
- The simpler `ManualScheduler` will simply respect the order of targets as specified in a job. This may not work
well with parallel execution if targets have dependencies.

- `flowman.spark.eagerCache` *(type: boolean)* *(default: `false`)*
Turns on automatic eager caching of Spark jobs that reference a single cached DataFrame multiple times. This is to
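As an aside (not part of the diff above), these execution settings are ordinary Flowman configuration properties. The sketch below assumes they can be listed as key=value entries in the `config` section of a namespace or project file; the chosen values are made-up examples:

```yaml
# Hypothetical excerpt from a namespace file such as default-namespace.yml
config:
  - flowman.execution.executor.class=com.dimajix.flowman.execution.ParallelExecutor
  - flowman.execution.executor.parallelism=8
  - flowman.execution.mapping.parallelism=4
  - flowman.execution.scheduler.class=com.dimajix.flowman.execution.DependencyScheduler
```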
