diff --git a/.github/workflows/sdk-jobs-spark-automation-run_interactive_session_notebook.yml b/.github/workflows/sdk-jobs-spark-automation-run_interactive_session_notebook.yml new file mode 100644 index 0000000000..8a2aa179ee --- /dev/null +++ b/.github/workflows/sdk-jobs-spark-automation-run_interactive_session_notebook.yml @@ -0,0 +1,80 @@ +# This code is autogenerated. +# Code is generated by running custom script: python3 readme.py +# Any manual changes to this file may cause incorrect behavior. +# Any manual changes will be overwritten if the code is regenerated. + +name: sdk-jobs-spark-automation-run_interactive_session_notebook +# This file is created by sdk/python/readme.py. +# Please do not edit directly. +on: + workflow_dispatch: + schedule: + - cron: "30 11/12 * * *" + pull_request: + branches: + - main + paths: + - sdk/python/jobs/spark/automation/** + - .github/workflows/sdk-jobs-spark-automation-run_interactive_session_notebook.yml + - sdk/python/dev-requirements.txt + - infra/bootstrapping/** + - sdk/python/setup.sh +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true +jobs: + build: + runs-on: ubuntu-latest + steps: + - name: check out repo + uses: actions/checkout@v2 + - name: setup python + uses: actions/setup-python@v2 + with: + python-version: "3.8" + - name: pip install notebook reqs + run: pip install -r sdk/python/dev-requirements.txt + - name: azure login + uses: azure/login@v1 + with: + creds: ${{secrets.AZUREML_CREDENTIALS}} + - name: bootstrap resources + run: | + echo '${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}'; + bash bootstrap.sh + working-directory: infra/bootstrapping + continue-on-error: false + - name: setup SDK + run: | + source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; + source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh"; + bash setup.sh + working-directory: sdk/python + continue-on-error: true + - name: setup-cli + run: | + source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; + source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh"; + bash setup.sh + working-directory: cli + continue-on-error: true + - name: setup spark resources + run: | + bash -x jobs/spark/setup_spark.sh jobs/spark/ jobs/spark/automation/run_interactive_session_notebook.ipynb + working-directory: sdk/python + continue-on-error: true + - name: run jobs/spark/automation/run_interactive_session_notebook.ipynb + run: | + source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; + source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh"; + bash "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh" generate_workspace_config "../../.azureml/config.json"; + bash "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh" replace_template_values "run_interactive_session_notebook.ipynb"; + [ -f "../../.azureml/config" ] && cat "../../.azureml/config"; + papermill -k python run_interactive_session_notebook.ipynb run_interactive_session_notebook.output.ipynb + working-directory: sdk/python/jobs/spark/automation + - name: upload notebook's working folder as an artifact + if: ${{ always() }} + uses: actions/upload-artifact@v2 + with: + name: run_interactive_session_notebook + path: sdk/python/jobs/spark/automation diff --git a/.github/workflows/sdk-jobs-spark-submit_spark_standalone_jobs_managed_vnet.yml 
b/.github/workflows/sdk-jobs-spark-submit_spark_standalone_jobs_managed_vnet.yml new file mode 100644 index 0000000000..89d7aa0dc6 --- /dev/null +++ b/.github/workflows/sdk-jobs-spark-submit_spark_standalone_jobs_managed_vnet.yml @@ -0,0 +1,80 @@ +# This code is autogenerated. +# Code is generated by running custom script: python3 readme.py +# Any manual changes to this file may cause incorrect behavior. +# Any manual changes will be overwritten if the code is regenerated. + +name: sdk-jobs-spark-submit_spark_standalone_jobs_managed_vnet +# This file is created by sdk/python/readme.py. +# Please do not edit directly. +on: + workflow_dispatch: + schedule: + - cron: "14 1/12 * * *" + pull_request: + branches: + - main + paths: + - sdk/python/jobs/spark/** + - .github/workflows/sdk-jobs-spark-submit_spark_standalone_jobs_managed_vnet.yml + - sdk/python/dev-requirements.txt + - infra/bootstrapping/** + - sdk/python/setup.sh +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true +jobs: + build: + runs-on: ubuntu-latest + steps: + - name: check out repo + uses: actions/checkout@v2 + - name: setup python + uses: actions/setup-python@v2 + with: + python-version: "3.8" + - name: pip install notebook reqs + run: pip install -r sdk/python/dev-requirements.txt + - name: azure login + uses: azure/login@v1 + with: + creds: ${{secrets.AZUREML_CREDENTIALS}} + - name: bootstrap resources + run: | + echo '${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}'; + bash bootstrap.sh + working-directory: infra/bootstrapping + continue-on-error: false + - name: setup SDK + run: | + source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; + source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh"; + bash setup.sh + working-directory: sdk/python + continue-on-error: true + - name: setup-cli + run: | + source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; + source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh"; + bash setup.sh + working-directory: cli + continue-on-error: true + - name: setup spark resources + run: | + bash -x jobs/spark/setup_spark.sh jobs/spark/ jobs/spark/submit_spark_standalone_jobs_managed_vnet.ipynb + working-directory: sdk/python + continue-on-error: true + - name: run jobs/spark/submit_spark_standalone_jobs_managed_vnet.ipynb + run: | + source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; + source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh"; + bash "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh" generate_workspace_config "../../.azureml/config.json"; + bash "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh" replace_template_values "submit_spark_standalone_jobs_managed_vnet.ipynb"; + [ -f "../../.azureml/config" ] && cat "../../.azureml/config"; + papermill -k python submit_spark_standalone_jobs_managed_vnet.ipynb submit_spark_standalone_jobs_managed_vnet.output.ipynb + working-directory: sdk/python/jobs/spark + - name: upload notebook's working folder as an artifact + if: ${{ always() }} + uses: actions/upload-artifact@v2 + with: + name: submit_spark_standalone_jobs_managed_vnet + path: sdk/python/jobs/spark diff --git a/sdk/python/data-wrangling/interactive_data_wrangling.ipynb b/sdk/python/data-wrangling/interactive_data_wrangling.ipynb index f5939089b2..14e34f4b68 100644 --- a/sdk/python/data-wrangling/interactive_data_wrangling.ipynb +++ 
b/sdk/python/data-wrangling/interactive_data_wrangling.ipynb @@ -49,11 +49,15 @@ "source": [ "from pyspark.sql import SparkSession\n", "\n", + "key_vault_name = \"\"\n", + "access_key_secret_name = \"\"\n", + "storage_account_name = \"\"\n", + "\n", "sc = SparkSession.builder.getOrCreate()\n", "token_library = sc._jvm.com.microsoft.azure.synapse.tokenlibrary.TokenLibrary\n", - "access_key = token_library.getSecret(\"\", \"\")\n", + "access_key = token_library.getSecret(key_vault_name, access_key_secret_name)\n", "sc._jsc.hadoopConfiguration().set(\n", - " \"fs.azure.account.key..blob.core.windows.net\", access_key\n", + " f\"fs.azure.account.key.{storage_account_name}.blob.core.windows.net\", access_key\n", ")" ] }, @@ -84,8 +88,11 @@ "import pyspark.pandas as pd\n", "from pyspark.ml.feature import Imputer\n", "\n", + "blob_container_name = \"\"\n", + "storage_account_name = \"\"\n", + "\n", "df = pd.read_csv(\n", - " \"wasbs://@.blob.core.windows.net/data/titanic.csv\",\n", + " f\"wasbs://{blob_container_name}@{storage_account_name}.blob.core.windows.net/data/titanic.csv\",\n", " index_col=\"PassengerId\",\n", ")\n", "imputer = Imputer(inputCols=[\"Age\"], outputCol=\"Age\").setStrategy(\n", @@ -96,7 +103,7 @@ ") # Fill Cabin column with value \"None\" if missing\n", "df.dropna(inplace=True) # Drop the rows which still have any missing value\n", "df.to_csv(\n", - " \"wasbs://@.blob.core.windows.net/data/wrangled\",\n", + " f\"wasbs://{blob_container_name}@{storage_account_name}.blob.core.windows.net/data/wrangled\",\n", " index_col=\"PassengerId\",\n", ")" ] @@ -141,11 +148,16 @@ "source": [ "from pyspark.sql import SparkSession\n", "\n", + "key_vault_name = \"\"\n", + "sas_token_secret_name = \"\"\n", + "blob_container_name = \"\"\n", + "storage_account_name = \"\"\n", + "\n", "sc = SparkSession.builder.getOrCreate()\n", "token_library = sc._jvm.com.microsoft.azure.synapse.tokenlibrary.TokenLibrary\n", - "sas_token = token_library.getSecret(\"\", \"\")\n", + "sas_token = token_library.getSecret(key_vault_name, sas_token_secret_name)\n", "sc._jsc.hadoopConfiguration().set(\n", - " \"fs.azure.sas...blob.core.windows.net\",\n", + " f\"fs.azure.sas.{blob_container_name}.{storage_account_name}.blob.core.windows.net\",\n", " sas_token,\n", ")" ] @@ -177,8 +189,11 @@ "import pyspark.pandas as pd\n", "from pyspark.ml.feature import Imputer\n", "\n", + "blob_container_name = \"\"\n", + "storage_account_name = \"\"\n", + "\n", "df = pd.read_csv(\n", - " \"wasbs://@.blob.core.windows.net/data/titanic.csv\",\n", + " f\"wasbs://{blob_container_name}@{storage_account_name}.blob.core.windows.net/data/titanic.csv\",\n", " index_col=\"PassengerId\",\n", ")\n", "imputer = Imputer(inputCols=[\"Age\"], outputCol=\"Age\").setStrategy(\n", @@ -189,7 +204,7 @@ ") # Fill Cabin column with value \"None\" if missing\n", "df.dropna(inplace=True) # Drop the rows which still have any missing value\n", "df.to_csv(\n", - " \"wasbs://@.blob.core.windows.net/data/wrangled\",\n", + " f\"wasbs://{blob_container_name}@{storage_account_name}.blob.core.windows.net/data/wrangled\",\n", " index_col=\"PassengerId\",\n", ")" ] @@ -236,8 +251,11 @@ "import pyspark.pandas as pd\n", "from pyspark.ml.feature import Imputer\n", "\n", + "file_system_name = \"\"\n", + "gen2_storage_account_name = \"\"\n", + "\n", "df = pd.read_csv(\n", - " \"abfss://@.dfs.core.windows.net/data/titanic.csv\",\n", + " f\"abfss://{file_system_name}@{gen2_storage_account_name}.dfs.core.windows.net/data/titanic.csv\",\n", " index_col=\"PassengerId\",\n", 
")\n", "imputer = Imputer(inputCols=[\"Age\"], outputCol=\"Age\").setStrategy(\n", @@ -248,7 +266,7 @@ ") # Fill Cabin column with value \"None\" if missing\n", "df.dropna(inplace=True) # Drop the rows which still have any missing value\n", "df.to_csv(\n", - " \"abfss://@.dfs.core.windows.net/data/wrangled\",\n", + " f\"abfss://{file_system_name}@{gen2_storage_account_name}.dfs.core.windows.net/data/wrangled\",\n", " index_col=\"PassengerId\",\n", ")" ] @@ -272,7 +290,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "- To enable read and write access, assign **Contributor** and **Storage Blob Data Contributor** roles to the user identity.\n", + "- To enable read and write access, assign **Contributor** and **Storage Blob Data Contributor** roles to the Service Principal.\n", "- Set configuration properties as follows:\n", " - Client ID property: `fs.azure.account.oauth2.client.id..dfs.core.windows.net`\n", " - Client secret property: `fs.azure.account.oauth2.client.secret..dfs.core.windows.net`\n", @@ -298,32 +316,39 @@ "source": [ "from pyspark.sql import SparkSession\n", "\n", + "key_vault_name = \"\"\n", + "client_id_secret_name = \"\"\n", + "tenant_id_secret_name = \"\"\n", + "client_secret_name = \"\"\n", + "gen2_storage_account_name = \"\"\n", + "\n", "sc = SparkSession.builder.getOrCreate()\n", "token_library = sc._jvm.com.microsoft.azure.synapse.tokenlibrary.TokenLibrary\n", "\n", "# Set up service principal tenant ID, client ID and secret from Azure Key Vault\n", - "client_id = token_library.getSecret(\"\", \"\")\n", - "tenant_id = token_library.getSecret(\"\", \"\")\n", - "client_secret = token_library.getSecret(\"\", \"\")\n", + "client_id = token_library.getSecret(key_vault_name, client_id_secret_name)\n", + "tenant_id = token_library.getSecret(key_vault_name, tenant_id_secret_name)\n", + "client_secret = token_library.getSecret(key_vault_name, client_secret_name)\n", "\n", "# Set up service principal which has access of the data\n", "sc._jsc.hadoopConfiguration().set(\n", - " \"fs.azure.account.auth.type..dfs.core.windows.net\", \"OAuth\"\n", + " f\"fs.azure.account.auth.type.{gen2_storage_account_name}.dfs.core.windows.net\",\n", + " \"OAuth\",\n", ")\n", "sc._jsc.hadoopConfiguration().set(\n", - " \"fs.azure.account.oauth.provider.type..dfs.core.windows.net\",\n", + " f\"fs.azure.account.oauth.provider.type.{gen2_storage_account_name}.dfs.core.windows.net\",\n", " \"org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider\",\n", ")\n", "sc._jsc.hadoopConfiguration().set(\n", - " \"fs.azure.account.oauth2.client.id..dfs.core.windows.net\",\n", + " f\"fs.azure.account.oauth2.client.id.{gen2_storage_account_name}.dfs.core.windows.net\",\n", " client_id,\n", ")\n", "sc._jsc.hadoopConfiguration().set(\n", - " \"fs.azure.account.oauth2.client.secret..dfs.core.windows.net\",\n", + " f\"fs.azure.account.oauth2.client.secret.{gen2_storage_account_name}.dfs.core.windows.net\",\n", " client_secret,\n", ")\n", "sc._jsc.hadoopConfiguration().set(\n", - " \"fs.azure.account.oauth2.client.endpoint..dfs.core.windows.net\",\n", + " f\"fs.azure.account.oauth2.client.endpoint.{gen2_storage_account_name}.dfs.core.windows.net\",\n", " \"https://login.microsoftonline.com/\" + tenant_id + \"/oauth2/token\",\n", ")" ] @@ -355,8 +380,11 @@ "import pyspark.pandas as pd\n", "from pyspark.ml.feature import Imputer\n", "\n", + "file_system_name = \"\"\n", + "gen2_storage_account_name = \"\"\n", + "\n", "df = pd.read_csv(\n", - " \"abfss://@.dfs.core.windows.net/data/titanic.csv\",\n", + 
" f\"abfss://{file_system_name}@{gen2_storage_account_name}.dfs.core.windows.net/data/titanic.csv\",\n", " index_col=\"PassengerId\",\n", ")\n", "imputer = Imputer(inputCols=[\"Age\"], outputCol=\"Age\").setStrategy(\n", @@ -367,7 +395,7 @@ ") # Fill Cabin column with value \"None\" if missing\n", "df.dropna(inplace=True) # Drop the rows which still have any missing value\n", "df.to_csv(\n", - " \"abfss://@.dfs.core.windows.net/data/wrangled\",\n", + " f\"abfss://{file_system_name}@{gen2_storage_account_name}.dfs.core.windows.net/data/wrangled\",\n", " index_col=\"PassengerId\",\n", ")" ] diff --git a/sdk/python/data-wrangling/interactive_data_wrangling.py b/sdk/python/data-wrangling/interactive_data_wrangling.py new file mode 100644 index 0000000000..6d30261ef8 --- /dev/null +++ b/sdk/python/data-wrangling/interactive_data_wrangling.py @@ -0,0 +1,152 @@ +## Interactive Data Wrangling using Apache Spark in Azure Machine Learning. Before executing these sample codes in an Azure Machine Learning Notebook, select **Serverless Spark Compute** under **Azure Machine Learning Serverless Spark** or select an attached Synapse Spark pool under **Synapse Spark pools** from the **Compute** selection menu. It is highly recommened to follow the documentation page: [Interactive data wrangling with Apache Spark in Azure Machine Learning](https://learn.microsoft.com/azure/machine-learning/interactive-data-wrangling-with-apache-spark-azure-ml) for more details related to the code samples provided in this notebook. + +### Access and wrangle Azure Blob storage data using Access Key + +#### First, Set the access key as configuration property `fs.azure.account.key..blob.core.windows.net`. + +from pyspark.sql import SparkSession + +key_vault_name = "" +access_key_secret_name = "" +storage_account_name = "" + +sc = SparkSession.builder.getOrCreate() +token_library = sc._jvm.com.microsoft.azure.synapse.tokenlibrary.TokenLibrary +access_key = token_library.getSecret(key_vault_name, access_key_secret_name) +sc._jsc.hadoopConfiguration().set( + f"fs.azure.account.key.{storage_account_name}.blob.core.windows.net", access_key +) + +#### Access data using `wasbs://` URI and perform data wrangling. +import pyspark.pandas as pd +from pyspark.ml.feature import Imputer + +blob_container_name = "" +storage_account_name = "" + +df = pd.read_csv( + f"wasbs://{blob_container_name}@{storage_account_name}.blob.core.windows.net/data/titanic.csv", + index_col="PassengerId", +) +imputer = Imputer(inputCols=["Age"], outputCol="Age").setStrategy( + "mean" +) # Replace missing values in Age column with the mean value +df.fillna( + value={"Cabin": "None"}, inplace=True +) # Fill Cabin column with value "None" if missing +df.dropna(inplace=True) # Drop the rows which still have any missing value +df.to_csv( + f"wasbs://{blob_container_name}@{storage_account_name}.blob.core.windows.net/data/wrangled", + index_col="PassengerId", +) + +### Access and wrangle Azure Blob storage data using SAS token + +#### First, set the SAS token as configuration property `fs.azure.sas...blob.core.windows.net`. 
+from pyspark.sql import SparkSession + +key_vault_name = "" +sas_token_secret_name = "" +blob_container_name = "" +storage_account_name = "" + +sc = SparkSession.builder.getOrCreate() +token_library = sc._jvm.com.microsoft.azure.synapse.tokenlibrary.TokenLibrary +sas_token = token_library.getSecret(key_vault_name, sas_token_secret_name) +sc._jsc.hadoopConfiguration().set( + f"fs.azure.sas.{blob_container_name}.{storage_account_name}.blob.core.windows.net", + sas_token, +) + +#### Access data using `wasbs://` URI and perform data wrangling. +import pyspark.pandas as pd +from pyspark.ml.feature import Imputer + +blob_container_name = "" +storage_account_name = "" + +df = pd.read_csv( + f"wasbs://{blob_container_name}@{storage_account_name}.blob.core.windows.net/data/titanic.csv", + index_col="PassengerId", +) +imputer = Imputer(inputCols=["Age"], outputCol="Age").setStrategy( + "mean" +) # Replace missing values in Age column with the mean value +df.fillna( + value={"Cabin": "None"}, inplace=True +) # Fill Cabin column with value "None" if missing +df.dropna(inplace=True) # Drop the rows which still have any missing value +df.to_csv( + f"wasbs://{blob_container_name}@{storage_account_name}.blob.core.windows.net/data/wrangled", + index_col="PassengerId", +) + +### Access and wrangle ADLS Gen 2 data using User Identity passthrough + +#### - To enable read and write access, assign **Contributor** and **Storage Blob Data Contributor** roles to the user identity. +#### - Access data using `abfss://` URI and perform data wrangling. +import pyspark.pandas as pd +from pyspark.ml.feature import Imputer + +file_system_name = "" +gen2_storage_account_name = "" + +df = pd.read_csv( + f"abfss://{file_system_name}@{gen2_storage_account_name}.dfs.core.windows.net/data/titanic.csv", + index_col="PassengerId", +) +imputer = Imputer(inputCols=["Age"], outputCol="Age").setStrategy( + "mean" +) # Replace missing values in Age column with the mean value +df.fillna( + value={"Cabin": "None"}, inplace=True +) # Fill Cabin column with value "None" if missing +df.dropna(inplace=True) # Drop the rows which still have any missing value +df.to_csv( + f"abfss://{file_system_name}@{gen2_storage_account_name}.dfs.core.windows.net/data/wrangled", + index_col="PassengerId", +) + + +### Access and wrangle data using credentialed AzureML Blob Datastore +#### - Access data using `azureml://` URI and perform data wrangling. +import pyspark.pandas as pd +from pyspark.ml.feature import Imputer + +df = pd.read_csv( + "azureml://datastores/workspaceblobstore/paths/data/titanic.csv", + index_col="PassengerId", +) +imputer = Imputer(inputCols=["Age"], outputCol="Age").setStrategy( + "mean" +) # Replace missing values in Age column with the mean value +df.fillna( + value={"Cabin": "None"}, inplace=True +) # Fill Cabin column with value "None" if missing +df.dropna(inplace=True) # Drop the rows which still have any missing value +df.to_csv( + "azureml://datastores/workspaceblobstore/paths/data/wrangled", + index_col="PassengerId", +) + +### Access and wrangle data using credentialless AzureML Blob Datastore +#### - To enable read and write access, assign **Contributor** and **Storage Blob Data Contributor** roles to the user identity on the Azure Blob storage account that the datastore points to. +#### - Access data using `azureml://` URI and perform data wrangling. 
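+# `credlessblobdatastore` below is the credential-less datastore registered by setup_spark.sh in
+# this PR (see automation/create_credential_less_data_store.yml and its `az ml datastore create` call).
+# Because the datastore stores no account key or SAS token, data access goes through the identity
+# running the job, which is why the role assignments listed above are required.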
+import pyspark.pandas as pd +from pyspark.ml.feature import Imputer + +df = pd.read_csv( + "azureml://datastores/credlessblobdatastore/paths/data/titanic.csv", + index_col="PassengerId", +) +imputer = Imputer(inputCols=["Age"], outputCol="Age").setStrategy( + "mean" +) # Replace missing values in Age column with the mean value +df.fillna( + value={"Cabin": "None"}, inplace=True +) # Fill Cabin column with value "None" if missing +df.dropna(inplace=True) # Drop the rows which still have any missing value +df.to_csv( + "azureml://datastores/credlessblobdatastore/paths/data/wrangled", + index_col="PassengerId", +) diff --git a/sdk/python/jobs/spark/automation/create_credential_less_data_store.yml b/sdk/python/jobs/spark/automation/create_credential_less_data_store.yml new file mode 100644 index 0000000000..4c56659dcb --- /dev/null +++ b/sdk/python/jobs/spark/automation/create_credential_less_data_store.yml @@ -0,0 +1,6 @@ +$schema: https://azuremlschemas.azureedge.net/latest/azureBlob.schema.json +name: +type: azure_blob +description: Credential-less datastore pointing to a blob container. +account_name: +container_name: \ No newline at end of file diff --git a/sdk/python/jobs/spark/automation/run_interactive_session_notebook.ipynb b/sdk/python/jobs/spark/automation/run_interactive_session_notebook.ipynb new file mode 100644 index 0000000000..1d87178f33 --- /dev/null +++ b/sdk/python/jobs/spark/automation/run_interactive_session_notebook.ipynb @@ -0,0 +1,79 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Use a serverless Spark compute" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This notebook submits the interactive data wrangling samples as a standalone Spark job on serverless Spark compute. To use an attached Synapse Spark pool instead, see the documentation page [Attach and manage a Synapse Spark pool in Azure Machine Learning (preview)](https://learn.microsoft.com/azure/machine-learning/how-to-manage-synapse-spark-pool).\n", + "\n", + "**Note** - To ensure successful execution of the Spark job, the identity used for the Spark job must be assigned the **Contributor** and **Storage Blob Data Contributor** roles on the Azure storage account used for data input and output."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azure.ai.ml import MLClient, spark, Input, Output\n", + "from azure.identity import DefaultAzureCredential\n", + "\n", + "subscription_id = \"\"\n", + "resource_group = \"\"\n", + "workspace = \"\"\n", + "ml_client = MLClient(\n", + " DefaultAzureCredential(), subscription_id, resource_group, workspace\n", + ")\n", + "\n", + "spark_job = spark(\n", + " display_name=\"interactive_data_wrangling\",\n", + " code=\"../../../data-wrangling\",\n", + " entry={\"file\": \"interactive_data_wrangling.py\"},\n", + " driver_cores=1,\n", + " driver_memory=\"2g\",\n", + " executor_cores=2,\n", + " executor_memory=\"2g\",\n", + " executor_instances=2,\n", + " resources={\n", + " \"instance_type\": \"Standard_E8S_V3\",\n", + " \"runtime_version\": \"3.2.0\",\n", + " },\n", + ")\n", + "\n", + "returned_spark_job = ml_client.jobs.create_or_update(spark_job)\n", + "\n", + "print(returned_spark_job.id)\n", + "# Wait until the job completes\n", + "ml_client.jobs.stream(returned_spark_job.name)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.10 - SDK V2", + "language": "python", + "name": "python310-sdkv2" + }, + "language_info": { + "name": "python", + "version": "3.7.10" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "6aeff17a1aa7735c2f7cb3a6d691fe1b4d4c3b8d2d650f644ad0f24e1b8e3f3f" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/sdk/python/jobs/spark/user-assigned-identity.yml b/sdk/python/jobs/spark/automation/user-assigned-identity.yml similarity index 100% rename from sdk/python/jobs/spark/user-assigned-identity.yml rename to sdk/python/jobs/spark/automation/user-assigned-identity.yml diff --git a/sdk/python/jobs/spark/setup_spark.sh b/sdk/python/jobs/spark/setup_spark.sh index f03786ceac..4c9c98ddca 100644 --- a/sdk/python/jobs/spark/setup_spark.sh +++ b/sdk/python/jobs/spark/setup_spark.sh @@ -16,10 +16,10 @@ SQL_ADMIN_LOGIN_USER="automation" SQL_ADMIN_LOGIN_PASSWORD="auto123!" 
SPARK_POOL_NAME="automationpool" SPARK_POOL_ADMIN_ROLE_ID="6e4bf58a-b8e1-4cc3-bbf9-d73143322b78" -USER_IDENTITY_YML="jobs/spark/user-assigned-identity.yml" -#OUTBOUND_RULE_NAME="automationtestrule" -#KEY_VAULT_NAME=$(az ml workspace show --query key_vault -o tsv | cut -d'/' -f9-) -#ACCESS_KEY_SECRET_NAME="automationsecret" +USER_IDENTITY_YML=$1"automation/user-assigned-identity.yml" +CREAT_CREDENTIAL_LESS_DS_YML=$1"automation/create_credential_less_data_store.yml" +AZURE_REGION_NAME=${LOCATION} +OUTBOUND_RULE_NAME="automationtestrule" # if [[ "$2" == *"resources/compute"* ]] @@ -44,25 +44,123 @@ az identity create --name $AML_USER_MANAGED_ID --resource-group $RESOURCE_GROUP AML_USER_MANAGED_ID_OID=$(az identity show --resource-group $RESOURCE_GROUP -n $AML_USER_MANAGED_ID --query principalId -o tsv) # -# -az storage account create --name $GEN2_STORAGE_NAME --resource-group $RESOURCE_GROUP --location $LOCATION --sku Standard_LRS --kind StorageV2 --enable-hierarchical-namespace true -az storage fs create -n $GEN2_FILE_SYSTEM --account-name $GEN2_STORAGE_NAME -az synapse workspace create --name $SYNAPSE_WORKSPACE_NAME --resource-group $RESOURCE_GROUP --storage-account $GEN2_STORAGE_NAME --file-system $GEN2_FILE_SYSTEM --sql-admin-login-user $SQL_ADMIN_LOGIN_USER --sql-admin-login-password $SQL_ADMIN_LOGIN_PASSWORD --location $LOCATION -az role assignment create --role "Storage Blob Data Owner" --assignee $AML_USER_MANAGED_ID_OID --scope /subscriptions/$SUBSCRIPTION_ID/resourceGroups/$RESOURCE_GROUP/providers/Microsoft.Storage/storageAccounts/$GEN2_STORAGE_NAME/blobServices/default/containers/$GEN2_FILE_SYSTEM -az synapse spark pool create --name $SPARK_POOL_NAME --workspace-name $SYNAPSE_WORKSPACE_NAME --resource-group $RESOURCE_GROUP --spark-version 3.2 --node-count 3 --node-size Medium --min-node-count 3 --max-node-count 10 --enable-auto-scale true -az synapse workspace firewall-rule create --name allowAll --workspace-name $SYNAPSE_WORKSPACE_NAME --resource-group $RESOURCE_GROUP --start-ip-address 0.0.0.0 --end-ip-address 255.255.255.255 -# +# +if [[ "$2" == *"managed_vnet"* ]] +then + AML_WORKSPACE_NAME=${AML_WORKSPACE_NAME}-vnet + AZURE_STORAGE_ACCOUNT="blobstoragevnet" + BLOB_CONTAINER_NAME="blobstoragevnetcontainer" + GEN2_STORAGE_ACCOUNT_NAME="gen2storagevnet" + ADLS_CONTAINER_NAME="gen2containervnet" + az storage account create -n $AZURE_STORAGE_ACCOUNT -g $RESOURCE_GROUP -l $LOCATION --sku Standard_LRS + az storage container create -n $BLOB_CONTAINER_NAME --account-name $AZURE_STORAGE_ACCOUNT + + az storage account create --name $GEN2_STORAGE_ACCOUNT_NAME --resource-group $RESOURCE_GROUP --location $LOCATION --sku Standard_LRS --kind StorageV2 --enable-hierarchical-namespace true + az storage container create -n $ADLS_CONTAINER_NAME --account-name $GEN2_STORAGE_ACCOUNT_NAME + + ACCOUNT_KEY=$(az storage account keys list --account-name $AZURE_STORAGE_ACCOUNT --query "[0].value" -o tsv) + ACCESS_KEY_SECRET_NAME="autotestaccountkey" + KEY_VAULT=$(az ml workspace show -g $RESOURCE_GROUP -n $AML_WORKSPACE_NAME --query key_vault -o tsv) + KEY_VAULT_NAME=$(basename "$KEY_VAULT") + az keyvault secret set --name $ACCESS_KEY_SECRET_NAME --vault-name $KEY_VAULT_NAME --value $ACCOUNT_KEY + + # + sed -i "s//$SUBSCRIPTION_ID/g; + s//$RESOURCE_GROUP/g; + s//$AML_WORKSPACE_NAME/g; + s//$AZURE_REGION_NAME/g; + s//$AZURE_STORAGE_ACCOUNT/g; + s//$OUTBOUND_RULE_NAME/g; + s//$KEY_VAULT_NAME/g; + s//$ACCESS_KEY_SECRET_NAME/g; + s//$BLOB_CONTAINER_NAME/g; + s//$GEN2_STORAGE_ACCOUNT_NAME/g; + 
s//$ADLS_CONTAINER_NAME/g;" $2 +# +# +elif [[ "$2" == *"run_interactive_session_notebook"* ]] +then + #NOTEBOOK_TO_CONVERT="../../data-wrangling/interactive_data_wrangling.ipynb" + #ipython nbconvert $NOTEBOOK_TO_CONVERT --to script + + ACCOUNT_KEY=$(az storage account keys list --account-name $AZURE_STORAGE_ACCOUNT --query "[0].value" -o tsv) + ACCESS_KEY_SECRET_NAME="autotestaccountkey" + + KEY_VAULT_NAME="autotestsparkkv" + az keyvault create -n $KEY_VAULT_NAME -g $RESOURCE_GROUP + + NOTEBOOK_PY="./data-wrangling/interactive_data_wrangling.py" + az keyvault secret set --name $ACCESS_KEY_SECRET_NAME --vault-name $KEY_VAULT_NAME --value $ACCOUNT_KEY + + END_TIME=`date -u -d "60 minutes" '+%Y-%m-%dT%H:%MZ'` + SAS_TOKEN=`az storage container generate-sas -n $AZUREML_DEFAULT_CONTAINER --account-name $AZURE_STORAGE_ACCOUNT --https-only --permissions dlrw --expiry $END_TIME -o tsv` + SAS_TOKEN_SECRET_NAME="autotestsastoken" + az keyvault secret set --name $SAS_TOKEN_SECRET_NAME --vault-name $KEY_VAULT_NAME --value $SAS_TOKEN -sed -i "s//$SUBSCRIPTION_ID/g; + GEN2_STORAGE_ACCOUNT_NAME=${RESOURCE_GROUP}gen2 + FILE_SYSTEM_NAME=${RESOURCE_GROUP}file + az storage account create --name $GEN2_STORAGE_ACCOUNT_NAME --resource-group $RESOURCE_GROUP --location $LOCATION --sku Standard_LRS --kind StorageV2 --enable-hierarchical-namespace true + az storage fs create -n $FILE_SYSTEM_NAME --account-name $GEN2_STORAGE_ACCOUNT_NAME + az role assignment create --role "Storage Blob Data Contributor" --assignee $AML_USER_MANAGED_ID_OID --scope /subscriptions/$SUBSCRIPTION_ID/resourceGroups/$RESOURCE_GROUP/providers/Microsoft.Storage/storageAccounts/$GEN2_STORAGE_ACCOUNT_NAME/blobServices/default/containers/$FILE_SYSTEM_NAME + + TITANIC_DATA_FILE="titanic.csv" + az storage fs file upload --file-system $FILE_SYSTEM_NAME --source ./data-wrangling/data/$TITANIC_DATA_FILE --path data/$TITANIC_DATA_FILE --account-name $GEN2_STORAGE_ACCOUNT_NAME + + # SERVICE_PRINCIPAL_NAME="${RESOURCE_GROUP}sp" + # az ad sp create-for-rbac --name $SERVICE_PRINCIPAL_NAME + # LIST_SP_DETAILS=$(az ad sp list --display-name $SERVICE_PRINCIPAL_NAME) + # SP_APPID=$(echo $LIST_SP_DETAILS | jq -r '[0].appId') + # SP_OBJECTID=$(echo $LIST_SP_DETAILS | jq -r '[0].id') + # SP_TENANTID=$(echo $LIST_SP_DETAILS | jq -r '[0].appOwnerOrganizationId') + # SPA_SP_SECRET=$(az ad sp credential reset --id $SP_OBJECTID --query "password") + + # CLIENT_ID_SECRET_NAME="autotestspsecretclient" + # TENANT_ID_SECRET_NAME="autotestspsecrettenant" + # CLIENT_SECRET_NAME="autotestspsecret" + # az keyvault secret set --name $CLIENT_ID_SECRET_NAME --vault-name $KEY_VAULT_NAME --value $SP_APPID + # az keyvault secret set --name $TENANT_ID_SECRET_NAME --vault-name $KEY_VAULT_NAME --value $SP_TENANTID + # az keyvault secret set --name $CLIENT_SECRET_NAME --vault-name $KEY_VAULT_NAME --value $SPA_SP_SECRET + # az role assignment create --role "Storage Blob Data Contributor" --assignee $SP_APPID --scope /subscriptions/$SUBSCRIPTION_ID/resourceGroups/$RESOURCE_GROUP/providers/Microsoft.Storage/storageAccounts/$GEN2_STORAGE_ACCOUNT_NAME/blobServices/default/containers/$FILE_SYSTEM_NAME + # az role assignment create --role "Contributor" --assignee $SP_APPID --scope /subscriptions/$SUBSCRIPTION_ID/resourceGroups/$RESOURCE_GROUP/providers/Microsoft.Storage/storageAccounts/$GEN2_STORAGE_ACCOUNT_NAME/blobServices/default/containers/$FILE_SYSTEM_NAME + + CREDENTIAL_LESS_DATA_STORE_NAME="credlessblobdatastore" + sed -i "s//$AZURE_STORAGE_ACCOUNT/g; + 
s//$AZUREML_DEFAULT_CONTAINER/g + s//$CREDENTIAL_LESS_DATA_STORE_NAME/g;" $CREAT_CREDENTIAL_LESS_DS_YML + az ml datastore create --file $CREAT_CREDENTIAL_LESS_DS_YML --resource-group $RESOURCE_GROUP --workspace-name $AML_WORKSPACE_NAME + # USER="azuremlsdk" + sed -i "s//$KEY_VAULT_NAME/g; + s//$ACCESS_KEY_SECRET_NAME/g; + s//$AZURE_STORAGE_ACCOUNT/g; + s//$AZUREML_DEFAULT_CONTAINER/g + s//$SAS_TOKEN_SECRET_NAME/g; + s//$GEN2_STORAGE_ACCOUNT_NAME/g + s//$FILE_SYSTEM_NAME/g;" $NOTEBOOK_PY + + sed -i "s//$SUBSCRIPTION_ID/g; s//$RESOURCE_GROUP/g; - s//$AML_USER_MANAGED_ID/g;" $USER_IDENTITY_YML + s//$AML_WORKSPACE_NAME/g;" $2 +# +else + # + az storage account create --name $GEN2_STORAGE_NAME --resource-group $RESOURCE_GROUP --location $LOCATION --sku Standard_LRS --kind StorageV2 --enable-hierarchical-namespace true + az storage fs create -n $GEN2_FILE_SYSTEM --account-name $GEN2_STORAGE_NAME + az synapse workspace create --name $SYNAPSE_WORKSPACE_NAME --resource-group $RESOURCE_GROUP --storage-account $GEN2_STORAGE_NAME --file-system $GEN2_FILE_SYSTEM --sql-admin-login-user $SQL_ADMIN_LOGIN_USER --sql-admin-login-password $SQL_ADMIN_LOGIN_PASSWORD --location $LOCATION + az role assignment create --role "Storage Blob Data Owner" --assignee $AML_USER_MANAGED_ID_OID --scope /subscriptions/$SUBSCRIPTION_ID/resourceGroups/$RESOURCE_GROUP/providers/Microsoft.Storage/storageAccounts/$GEN2_STORAGE_NAME/blobServices/default/containers/$GEN2_FILE_SYSTEM + az synapse spark pool create --name $SPARK_POOL_NAME --workspace-name $SYNAPSE_WORKSPACE_NAME --resource-group $RESOURCE_GROUP --spark-version 3.2 --node-count 3 --node-size Medium --min-node-count 3 --max-node-count 10 --enable-auto-scale true + az synapse workspace firewall-rule create --name allowAll --workspace-name $SYNAPSE_WORKSPACE_NAME --resource-group $RESOURCE_GROUP --start-ip-address 0.0.0.0 --end-ip-address 255.255.255.255 + # + + sed -i "s//$SUBSCRIPTION_ID/g; + s//$RESOURCE_GROUP/g; + s//$AML_USER_MANAGED_ID/g;" $USER_IDENTITY_YML -# -az ml workspace update --subscription $SUBSCRIPTION_ID --resource-group $RESOURCE_GROUP --name $AML_WORKSPACE_NAME --file $USER_IDENTITY_YML -# + # + az ml workspace update --subscription $SUBSCRIPTION_ID --resource-group $RESOURCE_GROUP --name $AML_WORKSPACE_NAME --file $USER_IDENTITY_YML + # -# -sed -i "s//$SUBSCRIPTION_ID/g; + # + sed -i "s//$SUBSCRIPTION_ID/g; s//$RESOURCE_GROUP/g; s//$AML_WORKSPACE_NAME/g; s//$ATTACHED_SPARK_POOL_NAME/g; @@ -72,27 +170,26 @@ sed -i "s//$SUBSCRIPTION_ID/g; s//$ATTACHED_SPARK_POOL_NAME_UAI/g; s//$AML_USER_MANAGED_ID/g;" $ATTACH_SPARK_PY -python $ATTACH_SPARK_PY -# + python $ATTACH_SPARK_PY + # -COMPUTE_MANAGED_IDENTITY=$(az ml compute show --name $ATTACHED_SPARK_POOL_NAME --resource-group $RESOURCE_GROUP --workspace-name $AML_WORKSPACE_NAME --query identity.principal_id --out tsv) + COMPUTE_MANAGED_IDENTITY=$(az ml compute show --name $ATTACHED_SPARK_POOL_NAME --resource-group $RESOURCE_GROUP --workspace-name $AML_WORKSPACE_NAME --query identity.principal_id --out tsv) -if [[ ! -z "$COMPUTE_MANAGED_IDENTITY" ]] -then - az synapse role assignment create --workspace-name $SYNAPSE_WORKSPACE_NAME --role $SPARK_POOL_ADMIN_ROLE_ID --assignee $COMPUTE_MANAGED_IDENTITY -fi + if [[ ! 
-z "$COMPUTE_MANAGED_IDENTITY" ]] + then + az synapse role assignment create --workspace-name $SYNAPSE_WORKSPACE_NAME --role $SPARK_POOL_ADMIN_ROLE_ID --assignee $COMPUTE_MANAGED_IDENTITY + fi -COMPUTE_MANAGED_IDENTITY=$(az ml compute show --name $ATTACHED_SPARK_POOL_NAME_UAI --resource-group $RESOURCE_GROUP --workspace-name $AML_WORKSPACE_NAME --query identity.principal_id --out tsv) + COMPUTE_MANAGED_IDENTITY=$(az ml compute show --name $ATTACHED_SPARK_POOL_NAME_UAI --resource-group $RESOURCE_GROUP --workspace-name $AML_WORKSPACE_NAME --query identity.principal_id --out tsv) -if [[ ! -z "$COMPUTE_MANAGED_IDENTITY" ]] -then - az synapse role assignment create --workspace-name $SYNAPSE_WORKSPACE_NAME --role $SPARK_POOL_ADMIN_ROLE_ID --assignee $COMPUTE_MANAGED_IDENTITY -fi + if [[ ! -z "$COMPUTE_MANAGED_IDENTITY" ]] + then + az synapse role assignment create --workspace-name $SYNAPSE_WORKSPACE_NAME --role $SPARK_POOL_ADMIN_ROLE_ID --assignee $COMPUTE_MANAGED_IDENTITY + fi -az synapse role assignment create --workspace-name $SYNAPSE_WORKSPACE_NAME --role $SPARK_POOL_ADMIN_ROLE_ID --assignee $AML_USER_MANAGED_ID + az synapse role assignment create --workspace-name $SYNAPSE_WORKSPACE_NAME --role $SPARK_POOL_ADMIN_ROLE_ID --assignee $AML_USER_MANAGED_ID_OID -# -sed -i "s//$SUBSCRIPTION_ID/g; + sed -i "s//$SUBSCRIPTION_ID/g; s//$RESOURCE_GROUP/g; s//$AML_WORKSPACE_NAME/g; s//$SYNAPSE_WORKSPACE_NAME/g; @@ -101,4 +198,4 @@ sed -i "s//$SUBSCRIPTION_ID/g; s//$AML_USER_MANAGED_ID/g; s//$ATTACHED_SPARK_POOL_NAME_UAI/g; s//$AML_USER_MANAGED_ID/g;" $2 -# +fi diff --git a/sdk/python/jobs/spark/submit_spark_pipeline_jobs.ipynb b/sdk/python/jobs/spark/submit_spark_pipeline_jobs.ipynb index 0c3594274a..8f7b672f80 100644 --- a/sdk/python/jobs/spark/submit_spark_pipeline_jobs.ipynb +++ b/sdk/python/jobs/spark/submit_spark_pipeline_jobs.ipynb @@ -55,6 +55,7 @@ "subscription_id = \"\"\n", "resource_group = \"\"\n", "workspace = \"\"\n", + "attached_spark_pool_name = \"\"\n", "ml_client = MLClient(\n", " DefaultAzureCredential(), subscription_id, resource_group, workspace\n", ")\n", @@ -91,7 +92,7 @@ " )\n", " spark_step.outputs.wrangled_data.mode = InputOutputModes.DIRECT\n", " spark_step.identity = ManagedIdentityConfiguration()\n", - " spark_step.compute = \"\"\n", + " spark_step.compute = attached_spark_pool_name\n", "\n", "\n", "pipeline = spark_pipeline(\n", @@ -132,6 +133,7 @@ "subscription_id = \"\"\n", "resource_group = \"\"\n", "workspace = \"\"\n", + "attached_spark_pool_name_uai = \"\"\n", "ml_client = MLClient(\n", " DefaultAzureCredential(), subscription_id, resource_group, workspace\n", ")\n", @@ -168,7 +170,7 @@ " )\n", " spark_step.outputs.wrangled_data.mode = InputOutputModes.DIRECT\n", " spark_step.identity = UserIdentityConfiguration()\n", - " spark_step.compute = \"\"\n", + " spark_step.compute = attached_spark_pool_name_uai\n", "\n", "\n", "pipeline = spark_pipeline(\n", @@ -209,6 +211,7 @@ "subscription_id = \"\"\n", "resource_group = \"\"\n", "workspace = \"\"\n", + "attached_spark_pool_name = \"\"\n", "ml_client = MLClient(\n", " DefaultAzureCredential(), subscription_id, resource_group, workspace\n", ")\n", @@ -242,7 +245,7 @@ " path=\"azureml://datastores/workspaceblobstore/paths/data/wrangled/\",\n", " )\n", " spark_step.outputs.wrangled_data.mode = InputOutputModes.DIRECT\n", - " spark_step.compute = \"\"\n", + " spark_step.compute = attached_spark_pool_name\n", "\n", "\n", "pipeline = spark_pipeline(\n", diff --git a/sdk/python/jobs/spark/submit_spark_standalone_jobs.ipynb 
b/sdk/python/jobs/spark/submit_spark_standalone_jobs.ipynb index b202b67b7a..522a421ad7 100644 --- a/sdk/python/jobs/spark/submit_spark_standalone_jobs.ipynb +++ b/sdk/python/jobs/spark/submit_spark_standalone_jobs.ipynb @@ -54,6 +54,7 @@ "subscription_id = \"\"\n", "resource_group = \"\"\n", "workspace = \"\"\n", + "attached_spark_pool_name = \"\"\n", "ml_client = MLClient(\n", " DefaultAzureCredential(), subscription_id, resource_group, workspace\n", ")\n", @@ -67,7 +68,7 @@ " executor_cores=2,\n", " executor_memory=\"2g\",\n", " executor_instances=2,\n", - " compute=\"\",\n", + " compute=attached_spark_pool_name,\n", " inputs={\n", " \"titanic_data\": Input(\n", " type=\"uri_file\",\n", @@ -113,6 +114,7 @@ "subscription_id = \"\"\n", "resource_group = \"\"\n", "workspace = \"\"\n", + "attached_spark_pool_name = \"\"\n", "ml_client = MLClient(\n", " DefaultAzureCredential(), subscription_id, resource_group, workspace\n", ")\n", @@ -126,7 +128,7 @@ " executor_cores=2,\n", " executor_memory=\"2g\",\n", " executor_instances=2,\n", - " compute=\"\",\n", + " compute=attached_spark_pool_name,\n", " inputs={\n", " \"titanic_data\": Input(\n", " type=\"uri_file\",\n", @@ -172,6 +174,7 @@ "subscription_id = \"\"\n", "resource_group = \"\"\n", "workspace = \"\"\n", + "attached_spark_pool_name = \"\"\n", "ml_client = MLClient(\n", " DefaultAzureCredential(), subscription_id, resource_group, workspace\n", ")\n", @@ -185,7 +188,7 @@ " executor_cores=2,\n", " executor_memory=\"2g\",\n", " executor_instances=2,\n", - " compute=\"\",\n", + " compute=attached_spark_pool_name,\n", " inputs={\n", " \"titanic_data\": Input(\n", " type=\"uri_file\",\n", diff --git a/sdk/python/jobs/spark/submit_spark_standalone_jobs_managed_vnet.ipynb b/sdk/python/jobs/spark/submit_spark_standalone_jobs_managed_vnet.ipynb index 088a07d41c..efdd4ecea9 100644 --- a/sdk/python/jobs/spark/submit_spark_standalone_jobs_managed_vnet.ipynb +++ b/sdk/python/jobs/spark/submit_spark_standalone_jobs_managed_vnet.ipynb @@ -183,7 +183,7 @@ "# Provisioning managed VNet with Spark support\n", "include_spark = True\n", "provision_network_result = ml_client.workspaces.begin_provision_network(\n", - " ws_name, include_spark\n", + " workspace_name=ws_name, include_spark=include_spark\n", ").result()" ] }, @@ -592,7 +592,7 @@ "\n", "# This will add a new outbound rule to existing rules\n", "rule_name = \"\" # This name should be unique\n", - "adls_storage_account = \"\"\n", + "adls_storage_account = \"\"\n", "service_resource_id = f\"/subscriptions/{subscription_id}/resourceGroups/{resource_group}/providers/Microsoft.Storage/storageAccounts/{adls_storage_account}\"\n", "subresource_target = \"dfs\"\n", "spark_enabled = True\n", @@ -747,7 +747,7 @@ "# Enter the Azure Data Lake Storage (ADLS) Gen2 account name and container name.\n", "# The file `titanic.csv` should be placed inside folder `data`\n", "# created in the Azure Data Lake Storage (ADLS) Gen2 container.\n", - "adls_storage_account = \"\"\n", + "adls_storage_account = \"\"\n", "container_name = \"\"\n", "\n", "spark_job = spark(\n", @@ -862,7 +862,9 @@ }, "outputs": [], "source": [ - "ml_client.workspaces.begin_delete(name=ws_name, delete_dependent_resources=True)" + "ml_client.workspaces.begin_delete(\n", + " name=ws_name, permanently_delete=True, delete_dependent_resources=True\n", + ")" ] } ], diff --git a/sdk/python/readme.py b/sdk/python/readme.py index c4042abbbc..9dacd9cf1b 100644 --- a/sdk/python/readme.py +++ b/sdk/python/readme.py @@ -19,7 +19,6 @@ 
"train-hyperparameter-tune-deploy-with-keras", "train-hyperparameter-tune-deploy-with-tensorflow", "interactive_data_wrangling", - "submit_spark_standalone_jobs_managed_vnet", # mlflow SDK samples notebooks "mlflow_sdk_online_endpoints_progresive", "mlflow_sdk_online_endpoints",