From ba5b3f05bb699c9be59b39661cb8186ce7175587 Mon Sep 17 00:00:00 2001
From: Mike Alfare <13974384+mikealfare@users.noreply.github.com>
Date: Fri, 21 Jun 2024 16:53:38 -0400
Subject: [PATCH 1/5] skip broken tests, link to the issue for resolution (#1056)

---
 tests/functional/adapter/test_python_model.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tests/functional/adapter/test_python_model.py b/tests/functional/adapter/test_python_model.py
index 1195cbd3e..957361cb7 100644
--- a/tests/functional/adapter/test_python_model.py
+++ b/tests/functional/adapter/test_python_model.py
@@ -15,7 +15,9 @@ class TestPythonModelSpark(BasePythonModelTests):

 @pytest.mark.skip_profile("apache_spark", "spark_session", "databricks_sql_endpoint")
 class TestPySpark(BasePySparkTests):
-    pass
+    @pytest.mark.skip("https://github.com/dbt-labs/dbt-spark/issues/1054")
+    def test_different_dataframes(self, project):
+        return super().test_different_dataframes(project)


 @pytest.mark.skip_profile("apache_spark", "spark_session", "databricks_sql_endpoint")
@@ -69,6 +71,7 @@ class TestChangingSchemaSpark:
     def models(self):
         return {"simple_python_model.py": models__simple_python_model}

+    @pytest.mark.skip("https://github.com/dbt-labs/dbt-spark/issues/1054")
     def test_changing_schema_with_log_validation(self, project, logs_dir):
         run_dbt(["run"])
         write_file(

From cd6efba4f006f7bc3de761a02717ff9261b736a1 Mon Sep 17 00:00:00 2001
From: Matthew McKnight <91097623+McKnight-42@users.noreply.github.com>
Date: Fri, 21 Jun 2024 16:50:48 -0500
Subject: [PATCH 2/5] update user docs-issue workflow (#1051)

* update user docs-issue workflow

* pre-commit fix

* update workflow based onf feedback

* whitespace

* update to match bigquery

* pin numpy to below 2.0 new release

* remove numpy pin for its own pr

---
 .github/workflows/docs-issues.yml | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/.github/workflows/docs-issues.yml b/.github/workflows/docs-issues.yml
index 00a098df8..f49cf517c 100644
--- a/.github/workflows/docs-issues.yml
+++ b/.github/workflows/docs-issues.yml
@@ -1,19 +1,18 @@
 # **what?**
-# Open an issue in docs.getdbt.com when a PR is labeled `user docs`
+# Open an issue in docs.getdbt.com when an issue is labeled `user docs` and closed as completed

 # **why?**
 # To reduce barriers for keeping docs up to date

 # **when?**
-# When a PR is labeled `user docs` and is merged. Runs on pull_request_target to run off the workflow already merged,
-# not the workflow that existed on the PR branch. This allows old PRs to get comments.
+# When an issue is labeled `user docs` and is closed as completed. Can be labeled before or after the issue is closed.

-name: Open issues in docs.getdbt.com repo when a PR is labeled
-run-name: "Open an issue in docs.getdbt.com for PR #${{ github.event.pull_request.number }}"
+name: Open issues in docs.getdbt.com repo when an issue is labeled
+run-name: "Open an issue in docs.getdbt.com for issue #${{ github.event.issue.number }}"

 on:
-  pull_request_target:
+  issues:
     types: [labeled, closed]

 defaults:
   run:
@@ -21,23 +20,22 @@ defaults:
     shell: bash

 permissions:
-  issues: write # opens new issues
-  pull-requests: write # comments on PRs
-
+  issues: write # comments on issues

 jobs:
   open_issues:
-    # we only want to run this when the PR has been merged or the label in the labeled event is `user docs`. Otherwise it runs the
+    # we only want to run this when the issue is closed as completed and the label `user docs` has been assigned.
+    # If this logic does not exist in this workflow, it runs the
     # risk of duplication of issues being created due to merge and label both triggering this workflow to run and neither having
     # generated the comment before the other runs. This lives here instead of the shared workflow because this is where we
     # decide if it should run or not.
     if: |
-      (github.event.pull_request.merged == true) &&
-      ((github.event.action == 'closed' && contains( github.event.pull_request.labels.*.name, 'user docs')) ||
+      (github.event.issue.state == 'closed' && github.event.issue.state_reason == 'completed') && (
+      (github.event.action == 'closed' && contains(github.event.issue.labels.*.name, 'user docs')) ||
       (github.event.action == 'labeled' && github.event.label.name == 'user docs'))
     uses: dbt-labs/actions/.github/workflows/open-issue-in-repo.yml@main
     with:
       issue_repository: "dbt-labs/docs.getdbt.com"
-      issue_title: "Docs Changes Needed from ${{ github.event.repository.name }} PR #${{ github.event.pull_request.number }}"
+      issue_title: "Docs Changes Needed from ${{ github.event.repository.name }} Issue #${{ github.event.issue.number }}"
       issue_body: "At a minimum, update body to include a link to the page on docs.getdbt.com requiring updates and what part(s) of the page you would like to see updated."
     secrets: inherit

From 50634b9c6038016f888ea84be542d9a84e52a141 Mon Sep 17 00:00:00 2001
From: Colin Rogers <111200756+colin-rogers-dbt@users.noreply.github.com>
Date: Mon, 24 Jun 2024 09:09:25 -0700
Subject: [PATCH 3/5] update spark internal-release workflow (#1052)

Co-authored-by: Mike Alfare <13974384+mikealfare@users.noreply.github.com>
---
 .github/workflows/release-internal.yml | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/release-internal.yml b/.github/workflows/release-internal.yml
index eb892415c..d4e7a3c93 100644
--- a/.github/workflows/release-internal.yml
+++ b/.github/workflows/release-internal.yml
@@ -10,15 +10,12 @@
 #
 # Manual trigger.

-name: "Release internal patch"
+name: "Release to Cloud"
+run-name: "Release to Cloud off of ${{ inputs.ref }}"

 on:
   workflow_dispatch:
     inputs:
-      version_number:
-        description: "The release version number (i.e. 1.0.0b1)"
-        type: string
-        required: true
       ref:
         description: "The ref (sha or branch name) to use"
         type: string
@@ -29,6 +26,11 @@ on:
         type: string
         default: "python -c \"import dbt.adapters.spark\""
         required: true
+      skip_tests:
+        description: "Should the tests be skipped? (default to false)"
+        type: boolean
+        required: true
+        default: false

 defaults:
   run:
@@ -129,15 +131,14 @@ jobs:
         run: python dagger/run_dbt_spark_tests.py --profile ${{ matrix.test }}

   invoke-reusable-workflow:
-    name: "Build and Release Internally"
+    name: "Create cloud release"
     needs: [run-integration-tests]
-
     uses: "dbt-labs/dbt-release/.github/workflows/internal-archive-release.yml@main"

     with:
-      version_number: "${{ inputs.version_number }}"
       package_test_command: "${{ inputs.package_test_command }}"
       dbms_name: "spark"
       ref: "${{ inputs.ref }}"
+      skip_tests: "${{ inputs.skip_tests }}"

     secrets: "inherit"

From 824ca0f2249d145234f21d7e4066e033a273e2e2 Mon Sep 17 00:00:00 2001
From: Mike Alfare <13974384+mikealfare@users.noreply.github.com>
Date: Wed, 26 Jun 2024 12:06:32 -0400
Subject: [PATCH 4/5] Update the spark version to the current version (#1055)

* update the spark version to the current version

* update pin for pydantic to resolve https://github.com/explosion/spaCy/issues/12659

* exclude koalas dataframes from test
---
 tests/functional/adapter/test_python_model.py | 35 ++++++++++++++++---
 1 file changed, 30 insertions(+), 5 deletions(-)

diff --git a/tests/functional/adapter/test_python_model.py b/tests/functional/adapter/test_python_model.py
index 957361cb7..cd798d1da 100644
--- a/tests/functional/adapter/test_python_model.py
+++ b/tests/functional/adapter/test_python_model.py
@@ -15,9 +15,22 @@ class TestPythonModelSpark(BasePythonModelTests):

 @pytest.mark.skip_profile("apache_spark", "spark_session", "databricks_sql_endpoint")
 class TestPySpark(BasePySparkTests):
-    @pytest.mark.skip("https://github.com/dbt-labs/dbt-spark/issues/1054")
     def test_different_dataframes(self, project):
-        return super().test_different_dataframes(project)
+        """
+        Test that python models are supported using dataframes from:
+        - pandas
+        - pyspark
+        - pyspark.pandas (formerly databricks.koalas)
+
+        Note:
+            The CI environment is on Apache Spark >3.1, which includes koalas as pyspark.pandas.
+            The only Databricks runtime that supports Apache Spark <=3.1 is 9.1 LTS, which is EOL 2024-09-23.
+            For more information, see:
+            - https://github.com/databricks/koalas
+            - https://docs.databricks.com/en/release-notes/runtime/index.html
+        """
+        results = run_dbt(["run", "--exclude", "koalas_df"])
+        assert len(results) == 3


 @pytest.mark.skip_profile("apache_spark", "spark_session", "databricks_sql_endpoint")
@@ -37,7 +50,7 @@ def model(dbt, spark):
     materialized='table',
     submission_method='job_cluster',
     job_cluster_config={
-        "spark_version": "7.3.x-scala2.12",
+        "spark_version": "12.2.x-scala2.12",
         "node_type_id": "i3.xlarge",
         "num_workers": 0,
         "spark_conf": {
@@ -48,7 +61,7 @@ def model(dbt, spark):
             "ResourceClass": "SingleNode"
         }
     },
-    packages=['spacy', 'torch', 'pydantic<1.10.3']
+    packages=['spacy', 'torch', 'pydantic>=1.10.8']
 )
 data = [[1,2]] * 10
 return spark.createDataFrame(data, schema=['test', 'test2'])
@@ -67,11 +80,23 @@ def model(dbt, spark):

 @pytest.mark.skip_profile("apache_spark", "spark_session", "databricks_sql_endpoint")
 class TestChangingSchemaSpark:
+    """
+    Confirm that we can set up a spot instance and parse required packages into the Databricks job.
+
+    Notes:
+        - This test generates a spot instance on demand using the settings from `job_cluster_config`
+          in `models__simple_python_model` above. It takes several minutes to run due to creating the cluster.
+          The job can be monitored via "Data Engineering > Job Runs" or "Workflows > Job Runs"
+          in the Databricks UI (instead of via the normal cluster).
+        - The `spark_version` argument will need to periodically be updated. It will eventually become
+          unsupported and start experiencing issues.
+        - See https://github.com/explosion/spaCy/issues/12659 for why we're pinning pydantic
+    """
+
     @pytest.fixture(scope="class")
     def models(self):
         return {"simple_python_model.py": models__simple_python_model}

-    @pytest.mark.skip("https://github.com/dbt-labs/dbt-spark/issues/1054")
     def test_changing_schema_with_log_validation(self, project, logs_dir):
         run_dbt(["run"])
         write_file(

From 034cb6118e808c1c9ad81d3553a136ac94b77781 Mon Sep 17 00:00:00 2001
From: Mila Page <67295367+VersusFacit@users.noreply.github.com>
Date: Mon, 8 Jul 2024 12:11:09 -0400
Subject: [PATCH 5/5] Base 207/add test (#1057)

* Add test for upstream change.

* Skip session since it's not liking the test.

* Import pytest to fix skip error.

* Dial in tests to reflect error messages from spark.

---------

Co-authored-by: Mila Page
Co-authored-by: Mike Alfare
---
 .../adapter/dbt_show/test_dbt_show.py | 22 +++++++++++++++++++
 1 file changed, 22 insertions(+)
 create mode 100644 tests/functional/adapter/dbt_show/test_dbt_show.py

diff --git a/tests/functional/adapter/dbt_show/test_dbt_show.py b/tests/functional/adapter/dbt_show/test_dbt_show.py
new file mode 100644
index 000000000..bc56fd908
--- /dev/null
+++ b/tests/functional/adapter/dbt_show/test_dbt_show.py
@@ -0,0 +1,22 @@
+import pytest
+
+from dbt.tests.adapter.dbt_show.test_dbt_show import (
+    BaseShowSqlHeader,
+    BaseShowLimit,
+    BaseShowDoesNotHandleDoubleLimit,
+)
+
+
+class TestSparkShowLimit(BaseShowLimit):
+    pass
+
+
+class TestSparkShowSqlHeader(BaseShowSqlHeader):
+    pass
+
+
+@pytest.mark.skip_profile("apache_spark", "spark_session", "databricks_http_cluster")
+class TestSparkShowDoesNotHandleDoubleLimit(BaseShowDoesNotHandleDoubleLimit):
+    """The syntax message is quite variable across clusters, but this hits two at once."""
+
+    DATABASE_ERROR_MESSAGE = "limit"
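
A note on the final patch: the override pins DATABASE_ERROR_MESSAGE to the bare fragment "limit" so that one expected string matches however a given cluster words its double-LIMIT syntax error. The sketch below is illustrative only and is not part of the patch; it assumes the shared base test merely asserts that this fragment appears in the database error text, and the sample messages are hypothetical rather than captured from Spark.

# Illustrative sketch only; not part of the patch above. It assumes the shared
# base test checks that DATABASE_ERROR_MESSAGE appears as a substring of the
# error raised for a doubled LIMIT, which is why the broad fragment "limit"
# can cover differently worded Spark errors.
DATABASE_ERROR_MESSAGE = "limit"

# Hypothetical examples of how different cluster types might word the error;
# the exact messages vary and are not quoted from Spark.
sample_errors = [
    "[PARSE_SYNTAX_ERROR] Syntax error at or near 'limit'",
    "mismatched input 'limit' expecting <EOF>",
]


def test_fragment_matches_all_variants():
    # A case-insensitive substring check matches every sample wording at once.
    for message in sample_errors:
        assert DATABASE_ERROR_MESSAGE in message.lower()


if __name__ == "__main__":
    test_fragment_matches_all_variants()
    print("every sample error message contains 'limit'")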