Add experimental-jobs-as-code template (#2177)
## Changes

Add an experimental-jobs-as-code template that allows defining jobs in Python
instead of YAML, via the `databricks-bundles` PyPI package.
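
For illustration, a job defined in Python looks roughly like the sketch below. This is a minimal sketch, assuming the `Job.from_dict` helper from `databricks-bundles`; the job name and notebook path are made up, and the generated project ships a fuller `resources/my_jobs_as_code_job.py`:

```python
# Minimal sketch of a job defined in Python instead of YAML.
# `hello_job` and the notebook path are illustrative, not part of this commit.
from databricks.bundles.jobs import Job

hello_job = Job.from_dict(
    {
        "name": "hello_job",
        "tasks": [
            {
                "task_key": "notebook_task",
                "notebook_task": {"notebook_path": "src/notebook.ipynb"},
            },
        ],
    }
)
```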

## Tests

Tested manually and with acceptance tests.
kanterov authored Jan 20, 2025
1 parent 7034793 commit 31c10c1
Showing 36 changed files with 1,182 additions and 0 deletions.
23 changes: 23 additions & 0 deletions acceptance/acceptance_test.go
@@ -8,6 +8,7 @@ import (
"os"
"os/exec"
"path/filepath"
"regexp"
"runtime"
"slices"
"sort"
@@ -393,14 +394,36 @@ func CopyDir(src, dst string, inputs, outputs map[string]bool) error {
}

func ListDir(t *testing.T, src string) ([]string, error) {
	// Exclude folders and files covered by .gitignore from the comparison.
	ignored := []string{
		"\\.ruff_cache",
		"\\.venv",
		".*\\.egg-info",
		"__pycache__",
		// depends on uv version
		"uv.lock",
	}

	var files []string
	err := filepath.Walk(src, func(path string, info os.FileInfo, err error) error {
		if err != nil {
			return err
		}

		if info.IsDir() {
			for _, ignoredFolder := range ignored {
				if matched, _ := regexp.MatchString(ignoredFolder, info.Name()); matched {
					return filepath.SkipDir
				}
			}

			return nil
		} else {
			for _, ignoredFolder := range ignored {
				if matched, _ := regexp.MatchString(ignoredFolder, info.Name()); matched {
					return nil
				}
			}
		}

		relPath, err := filepath.Rel(src, path)
5 changes: 5 additions & 0 deletions acceptance/bundle/templates/experimental-jobs-as-code/input.json
@@ -0,0 +1,5 @@
{
  "project_name": "my_jobs_as_code",
  "include_notebook": "yes",
  "include_python": "yes"
}
85 changes: 85 additions & 0 deletions acceptance/bundle/templates/experimental-jobs-as-code/output.txt
@@ -0,0 +1,85 @@

>>> $CLI bundle init experimental-jobs-as-code --config-file ./input.json --output-dir output

Welcome to (EXPERIMENTAL) "Jobs as code" template for Databricks Asset Bundles!
Workspace to use (auto-detected, edit in 'my_jobs_as_code/databricks.yml'): $DATABRICKS_URL

✨ Your new project has been created in the 'my_jobs_as_code' directory!

Please refer to the README.md file for "getting started" instructions.
See also the documentation at https://docs.databricks.com/dev-tools/bundles/index.html.

>>> $CLI bundle validate -t dev --output json
{
  "jobs": {
    "my_jobs_as_code_job": {
      "deployment": {
        "kind": "BUNDLE",
        "metadata_file_path": "/Workspace/Users/$USERNAME/.bundle/my_jobs_as_code/dev/state/metadata.json"
      },
      "edit_mode": "UI_LOCKED",
      "email_notifications": {
        "on_failure": [
          "$USERNAME"
        ]
      },
      "format": "MULTI_TASK",
      "job_clusters": [
        {
          "job_cluster_key": "job_cluster",
          "new_cluster": {
            "autoscale": {
              "max_workers": 4,
              "min_workers": 1
            },
            "node_type_id": "i3.xlarge",
            "spark_version": "15.4.x-scala2.12"
          }
        }
      ],
      "max_concurrent_runs": 4,
      "name": "[dev $USERNAME] my_jobs_as_code_job",
      "permissions": [],
      "queue": {
        "enabled": true
      },
      "tags": {
        "dev": "$USERNAME"
      },
      "tasks": [
        {
          "job_cluster_key": "job_cluster",
          "notebook_task": {
            "notebook_path": "/Workspace/Users/$USERNAME/.bundle/my_jobs_as_code/dev/files/src/notebook"
          },
          "task_key": "notebook_task"
        },
        {
          "depends_on": [
            {
              "task_key": "notebook_task"
            }
          ],
          "job_cluster_key": "job_cluster",
          "libraries": [
            {
              "whl": "dist/*.whl"
            }
          ],
          "python_wheel_task": {
            "entry_point": "main",
            "package_name": "my_jobs_as_code"
          },
          "task_key": "main_task"
        }
      ],
      "trigger": {
        "pause_status": "PAUSED",
        "periodic": {
          "interval": 1,
          "unit": "DAYS"
        }
      }
    }
  }
}
8 changes: 8 additions & 0 deletions acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/.gitignore
@@ -0,0 +1,8 @@
.databricks/
build/
dist/
__pycache__/
*.egg-info
.venv/
scratch/**
!scratch/README.md
58 changes: 58 additions & 0 deletions acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/README.md
@@ -0,0 +1,58 @@
# my_jobs_as_code

The 'my_jobs_as_code' project was generated using the "Jobs as code" template.

## Prerequisites

1. Install Databricks CLI 0.238 or later.
   See [Install or update the Databricks CLI](https://docs.databricks.com/en/dev-tools/cli/install.html).

2. Install uv. See [Installing uv](https://docs.astral.sh/uv/getting-started/installation/).
   We use uv to create a virtual environment and install the required dependencies.

3. Authenticate to your Databricks workspace if you have not done so already:

   ```
   $ databricks configure
   ```

4. Optionally, install developer tools such as the Databricks extension for Visual Studio Code from
   https://docs.databricks.com/dev-tools/vscode-ext.html. Or read the "getting started" documentation for
   **Databricks Connect** for instructions on running the included Python code from a different IDE.

5. For documentation on the Databricks Asset Bundles format used
   for this project, and for CI/CD configuration, see
   https://docs.databricks.com/dev-tools/bundles/index.html.

## Deploy and run jobs

1. Create a new virtual environment and install the required dependencies:

   ```
   $ uv sync
   ```

2. To deploy the bundle to the development target:

   ```
   $ databricks bundle deploy --target dev
   ```

   *(Note that "dev" is the default target, so the `--target` parameter is optional here.)*

   This deploys everything that's defined for this project.
   For example, the default template would deploy a job called
   `[dev yourname] my_jobs_as_code_job` to your workspace.
   You can find that job by opening your workspace and clicking on **Workflows**.

3. Similarly, to deploy a production copy, type:

   ```
   $ databricks bundle deploy --target prod
   ```

   Note that the default job from the template has a schedule that runs every day
   (defined in resources/my_jobs_as_code_job.py). The schedule
   is paused when deploying in development mode (see [Databricks Asset Bundle deployment modes](
   https://docs.databricks.com/dev-tools/bundles/deployment-modes.html)).

4. To run a job:

   ```
   $ databricks bundle run
   ```
48 changes: 48 additions & 0 deletions acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/databricks.yml
@@ -0,0 +1,48 @@
# This is a Databricks asset bundle definition for my_jobs_as_code.
# See https://docs.databricks.com/dev-tools/bundles/index.html for documentation.
bundle:
  name: my_jobs_as_code
  uuid: <UUID>

experimental:
  python:
    # Activate the virtual environment before loading resources defined in Python.
    # If disabled, it defaults to using the Python interpreter available in the current shell.
    venv_path: .venv
    # Functions called to load resources defined in Python. See resources/__init__.py
    resources:
      - "resources:load_resources"

artifacts:
  default:
    type: whl
    path: .
    # We use a timestamp as the local version identifier (https://peps.python.org/pep-0440/#local-version-identifiers)
    # to ensure that changes to the wheel package are picked up when it is used on all-purpose clusters.
    build: LOCAL_VERSION=$(date +%Y%m%d.%H%M%S) uv build

include:
  - resources/*.yml

targets:
  dev:
    # The default target uses 'mode: development' to create a development copy.
    # - Deployed resources get prefixed with '[dev my_user_name]'
    # - Any job schedules and triggers are paused by default.
    # See also https://docs.databricks.com/dev-tools/bundles/deployment-modes.html.
    mode: development
    default: true
    workspace:
      host: $DATABRICKS_URL

  prod:
    mode: production
    workspace:
      host: $DATABRICKS_URL
      # We explicitly specify /Workspace/Users/$USERNAME to make sure we only have a single copy.
      root_path: /Workspace/Users/$USERNAME/.bundle/${bundle.name}/${bundle.target}
    permissions:
      - user_name: $USERNAME
        level: CAN_MANAGE
    run_as:
      user_name: $USERNAME
22 changes: 22 additions & 0 deletions acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/fixtures/.gitkeep
@@ -0,0 +1,22 @@
# Fixtures

This folder is reserved for fixtures, such as CSV files.

Below is an example of how to load fixtures as a data frame:

```
import pandas as pd
import os

def get_absolute_path(*relative_parts):
    if 'dbutils' in globals():
        base_dir = os.path.dirname(dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get())  # type: ignore
        path = os.path.normpath(os.path.join(base_dir, *relative_parts))
        return path if path.startswith("/Workspace") else "/Workspace" + path
    else:
        return os.path.join(*relative_parts)

csv_file = get_absolute_path("..", "fixtures", "mycsv.csv")
df = pd.read_csv(csv_file)
display(df)
```
49 changes: 49 additions & 0 deletions acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/pyproject.toml
@@ -0,0 +1,49 @@
[build-system]
requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"

[project]
name = "my_jobs_as_code"
requires-python = ">=3.10"
description = "wheel file based on my_jobs_as_code"

# Dependencies in case the output wheel file is used as a library dependency.
# For defining dependencies, when this package is used in Databricks, see:
# https://docs.databricks.com/dev-tools/bundles/library-dependencies.html
#
# Example:
#   dependencies = [
#       "requests==x.y.z",
#   ]
dependencies = [
]

# see setup.py
dynamic = ["version"]

[project.entry-points.packages]
main = "my_jobs_as_code.main:main"

[tool.setuptools]
py-modules = ["resources", "my_jobs_as_code"]

[tool.uv]
## Dependencies for local development
dev-dependencies = [
    "databricks-bundles==0.7.0",

    ## Add code completion support for DLT
    # "databricks-dlt",

    ## databricks-connect can be used to run parts of this project locally.
    ## See https://docs.databricks.com/dev-tools/databricks-connect.html.
    ##
    ## Uncomment the line below to install a version of db-connect that corresponds to
    ## the Databricks Runtime version used for this project.
    # "databricks-connect>=15.4,<15.5",
]

override-dependencies = [
    # pyspark package conflicts with 'databricks-connect'
    "pyspark; sys_platform == 'never'",
]
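
Note that `dynamic = ["version"]` above defers the package version to a setup.py that is referenced but not shown in this diff. As a hedged sketch, such a file could consume the `LOCAL_VERSION` timestamp exported by the bundle's `build` command like this (the `0.0.1` base version is an assumption):

```python
# setup.py — sketch only. Appends the LOCAL_VERSION timestamp set by the
# bundle's `build` command as a PEP 440 local version identifier
# (e.g. 0.0.1+20250120.143000) so that all-purpose clusters pick up each
# rebuilt wheel instead of caching an older one.
import os

from setuptools import setup

version = "0.0.1"  # assumed base version; the real template may differ
local_version = os.getenv("LOCAL_VERSION")

setup(version=f"{version}+{local_version}" if local_version else version)
```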
16 changes: 16 additions & 0 deletions acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/resources/__init__.py
@@ -0,0 +1,16 @@
from databricks.bundles.core import (
    Bundle,
    Resources,
    load_resources_from_current_package_module,
)


def load_resources(bundle: Bundle) -> Resources:
    """
    The 'load_resources' function is referenced in databricks.yml and is responsible
    for loading bundle resources defined in Python code. It is called by the
    Databricks CLI during bundle deployment; after deployment, it is not used.
    """

    # The default implementation loads all Python files in the 'resources' directory.
    return load_resources_from_current_package_module()
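
For reference, a resource module that `load_resources_from_current_package_module` would pick up might look like the sketch below. The shape mirrors the job in the `bundle validate` output above, but the template's actual `resources/my_jobs_as_code_job.py` is not part of this excerpt, so treat the details (in particular the notebook path) as assumptions:

```python
# resources/my_jobs_as_code_job.py — sketch reconstructed from the validate
# output above; any module-level Job in this package is discovered by
# load_resources_from_current_package_module().
from databricks.bundles.jobs import Job

my_jobs_as_code_job = Job.from_dict(
    {
        "name": "my_jobs_as_code_job",
        "tasks": [
            {
                "task_key": "notebook_task",
                "notebook_task": {"notebook_path": "src/notebook.ipynb"},
            },
            {
                "task_key": "main_task",
                "depends_on": [{"task_key": "notebook_task"}],
                "python_wheel_task": {
                    "entry_point": "main",
                    "package_name": "my_jobs_as_code",
                },
                "libraries": [{"whl": "dist/*.whl"}],
            },
        ],
    }
)
```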