feat: provide an example for federated learning
ferrarimarco committed Jan 30, 2025
1 parent 4a79619 commit 7fe70e3
Showing 38 changed files with 1,166 additions and 12 deletions.
19 changes: 19 additions & 0 deletions platforms/gke/base/core/gke_enterprise/policycontroller/feature.tf
@@ -12,6 +12,17 @@
# See the License for the specific language governing permissions and
# limitations under the License.

locals {
policy_controller_kubernetes_namespace = "gatekeeper-system"
policy_controller_kubernetes_service_account = "gatekeeper-admin"
gatekeeper_wi_member = "${local.wi_principal_prefix}/ns/${local.policy_controller_kubernetes_namespace}/sa/${local.policy_controller_kubernetes_service_account}"
wi_principal_prefix = "principal://iam.googleapis.com/projects/${data.google_project.cluster.number}/locations/global/workloadIdentityPools/${data.google_project.cluster.project_id}.svc.id.goog/subject"

gatekeeper_iam_roles = [
"roles/monitoring.metricWriter"
]
}

resource "google_gke_hub_feature" "policycontroller" {
provider = google-beta

@@ -38,3 +49,11 @@ resource "google_gke_hub_feature_membership" "cluster_policycontroller" {
}
}
}

resource "google_project_iam_member" "gatekeeper" {
for_each = toset(local.gatekeeper_iam_roles)

member = local.gatekeeper_wi_member
project = google_project_service.anthospolicycontroller_googleapis_com.project
role = each.value
}
7 changes: 6 additions & 1 deletion platforms/gke/base/use-cases/federated-learning/README.md
@@ -226,6 +226,11 @@ To deploy the reference architecture, you do the following:
After deploying the reference architecture, the GKE cluster is ready to host
your federated learning workloads.

To help you familiarize yourself with the reference architecture, you can
deploy one of the provided examples:

- [Train an image classifier using NVIDIA FLARE](/platforms/gke/base/use-cases/federated-learning/examples/nvflare-tff/README.md).

## Destroy the reference architecture

To destroy an instance of the reference architecture, you do the following:
@@ -333,7 +338,7 @@ Pods in the

If this happens:

1. Wait for the cluster to complete the initialiazation
1. Wait for the cluster to complete the initialization
1. Delete the Deployment that is impacted by this issue. Config Sync will deploy
it again with the correct container image identifiers.

19 changes: 16 additions & 3 deletions platforms/gke/base/use-cases/federated-learning/common.sh
@@ -32,6 +32,8 @@ FEDERATED_LEARNING_USE_CASE_DIR="${ACP_PLATFORM_BASE_DIR}/use-cases/federated-le
FEDERATED_LEARNING_USE_CASE_TERRAFORM_DIR="${FEDERATED_LEARNING_USE_CASE_DIR}/terraform"
# shellcheck disable=SC2034 # Variable is used in other scripts
FEDERATED_LEARNING_SHARED_CONFIG_DIR="${FEDERATED_LEARNING_USE_CASE_TERRAFORM_DIR}/_shared_config"
# shellcheck disable=SC2034 # Variable is used in other scripts
FEDERATED_LEARNING_CONFIG_AUTO_VARS_FILE="${FEDERATED_LEARNING_SHARED_CONFIG_DIR}/uc_federated_learning.auto.tfvars"

# shellcheck disable=SC2034 # Variable is used in other scripts
# Terraservices that are necessary for the core platform
@@ -48,6 +50,7 @@ federated_learning_terraservices=(
"workload_identity"
"container_node_pool"
"config_management"
"cloud_storage"
)

# shellcheck disable=SC2034 # Variable is used in other scripts
@@ -128,6 +131,7 @@ destroy_terraservice() {
get_terraform_output() {
terraservice="${1}"
output_name="${2}"
output_type="${3}"

if [[ ! -d "${terraservice}" ]]; then
echo "${terraservice} directory doesn't exist or is not readable"
@@ -140,10 +144,18 @@ get_terraform_output() {
return 1
fi

local -a output_command=(terraform -chdir="${terraservice}" output)
if [[ "${output_type}" == "json" ]]; then
output_command+=(-json)
elif [[ "${output_type}" == "raw" ]]; then
output_command+=(-raw)
fi
output_command+=("${output_name}")

if ! output="$(
terraform -chdir="${terraservice}" output -raw "${output_name}"
"${output_command[@]}"
)"; then
echo "Error while getting ${output_name} output: ${output}"
echo "Error while getting ${output_name} output: ${output}. Output command: ${output_command[*]}"
return 1
fi
echo "${output}"
@@ -165,7 +177,8 @@ remove_terraform_configuration_variable_from_file() {
local destination_file_path="${2}"
local configuration_variable_name

configuration_variable_name="$(echo "${configuration_variable}" | awk ' { print $1 }'))"
configuration_variable_name="$(echo "${configuration_variable}" | awk ' { print $1 }')"
echo "Removing ${configuration_variable_name} from ${destination_file_path}"
sed -i "/${configuration_variable_name}/d" "${destination_file_path}"
terraform fmt "${destination_file_path}"
}
4 changes: 2 additions & 2 deletions platforms/gke/base/use-cases/federated-learning/deploy.sh
@@ -32,7 +32,7 @@ for configuration_variable in "${TERRAFORM_CORE_INITIALIZE_CONFIGURATION[@]}"; d
done

echo "Initializing the core platform"
# Don't provision any core platform terraservice becuase we just need
# Don't provision any core platform terraservice because we just need
# to initialize the terraform environment and remote backend
# shellcheck disable=SC1091,SC2154
CORE_TERRASERVICES_APPLY="${core_platform_init_terraservices[*]}" \
Expand All @@ -44,7 +44,7 @@ for terraservice in "${federated_learning_core_platform_terraservices[@]}"; do
provision_terraservice "${terraservice}"
done

if ! cluster_database_encryption_key_id="$(get_terraform_output "${FEDERATED_LEARNING_USE_CASE_TERRAFORM_DIR}/key_management_service" "cluster_database_encryption_key_id")"; then
if ! cluster_database_encryption_key_id="$(get_terraform_output "${FEDERATED_LEARNING_USE_CASE_TERRAFORM_DIR}/key_management_service" "cluster_database_encryption_key_id" "raw")"; then
exit 1
fi
edit_terraform_configuration_variable_value_in_file "cluster_database_encryption_key_name_placeholder" "${cluster_database_encryption_key_id}" "${ACP_PLATFORM_SHARED_CONFIG_CLUSTER_AUTO_VARS_FILE}"
platforms/gke/base/use-cases/federated-learning/examples/nvflare-tff/README.md
@@ -0,0 +1,250 @@
# Train an image classifier using NVIDIA FLARE

This example uses NVIDIA FLARE to train an image classifier using federated
averaging, with TensorFlow as the deep learning framework.

[NVIDIA FLARE](https://nvflare.readthedocs.io/en/main/index.html) is a
domain-agnostic, open-source, extensible SDK that allows researchers and data
scientists to adapt existing ML/DL workflows to a federated paradigm. It enables
platform developers to build a secure, privacy-preserving offering for a
distributed multi-party collaboration.

For more information about NVIDIA FLARE, see
[NVIDIA FLARE overview](https://nvflare.readthedocs.io/en/main/flare_overview.html#high-level-system-architecture).

This example builds on top of the infrastructure that the
[Federated learning reference architecture](/platforms/gke/base/use-cases/federated-learning/README.md)
provides, and follows the best practices that the reference architecture
establishes.

## Architecture

The following diagram shows one server and two clients that are connected to the
server:

![NVIDIA FLARE example architecture](/platforms/gke/base/use-cases/federated-learning/assets/nvflare.svg "NVIDIA FLARE example architecture")

As shown in the preceding diagram, the reference architecture helps you to
create and configure the following components:

- A persistent volume to store the NVIDIA FLARE workspace.
- Two pods that run NVIDIA FLARE clients that connect to the NVIDIA FLARE
server, in the `nvidia-client1` and `nvidia-client2` namespaces respectively.
- One pod that runs the NVIDIA FLARE server that aggregates the computation
results, in the `nvflare-infra` namespace.
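
After you deploy the example, you can map these components to Kubernetes
objects. The following is a minimal sketch, assuming the namespace names listed
above and that you have `kubectl` access to the cluster:

```bash
# List the NVIDIA FLARE server pod and the workspace persistent volume claims.
kubectl get pods --namespace nvflare-infra
kubectl get persistentvolumeclaims --namespace nvflare-infra

# List the NVIDIA FLARE client pods.
kubectl get pods --namespace nvidia-client1
kubectl get pods --namespace nvidia-client2
```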

## Deploy the reference architecture

Before deploying the NVIDIA FLARE example described in this document, you
deploy the
[Federated learning reference architecture](/platforms/gke/base/use-cases/federated-learning/README.md)
first. Then, you can deploy the NVIDIA FLARE example.

### Provision and configure Google Cloud infrastructure

For this example, you provision new Google Cloud resources in addition to the
ones that the Federated learning reference architecture provisions.

All the generated models are stored in a Cloud Storage bucket that each pod
mounts.

1. Open [Cloud Shell](https://cloud.google.com/shell).

1. Run the script to configure the reference architecture and provision Google
Cloud resources that this example needs:

```sh
"${ACP_PLATFORM_BASE_DIR}/use-cases/federated-learning/examples/nvflare-tff/deploy.sh"
```
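
After the script completes, you can optionally confirm that the Cloud Storage
bucket for the NVIDIA FLARE workspace exists. The following is a sketch that
assumes the bucket name contains `nvflare`; read the exact name from the
Terraform outputs of the `cloud_storage` terraservice:

```bash
# List the buckets in the current project and filter for the workspace bucket.
# The nvflare filter is an assumption; adjust it to match your bucket name.
gcloud storage buckets list --format="value(name)" | grep -i nvflare
```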

### Build and push the example container image

In this section, you run a script that:

- Builds, locally on your host, a container image with TensorFlow and NVIDIA
FLARE installed. Building the container image requires about 8 GB of
persistent storage space and can take up to 20 minutes. For a production
deployment, consider using Cloud Build instead.
- Pushes the container image to the Artifact Registry repository that the
Federated learning reference architecture provides.

To run the script, do the following:

1. Open [Cloud Shell](https://cloud.google.com/shell).

1. Run the script to build the container image:

```sh
"${ACP_PLATFORM_BASE_DIR}/use-cases/federated-learning/examples/nvflare-tff/build-container-image.sh"
```
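
After the build finishes, you can optionally verify that the image is available
in Artifact Registry. The following is a sketch; `REPOSITORY_PATH` is a
placeholder for your repository, for example
`LOCATION-docker.pkg.dev/PROJECT_ID/REPOSITORY`:

```bash
# List the container images in the Artifact Registry repository that the
# reference architecture created. Replace REPOSITORY_PATH with your repository.
gcloud artifacts docker images list REPOSITORY_PATH
```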

### Create NVIDIA FLARE deployment descriptors

In this section, you create descriptors to deploy NVIDIA FLARE in the reference
architecture:

1. Run an NVIDIA FLARE container based on the container image you built in the
preceding section:

```bash
docker run --rm -v "${HOME}/nvflare-workspace:/opt/NVFlare/workspace" -it "${NVFLARE_EXAMPLE_CONTAINER_IMAGE_LOCALIZED_ID_WITH_TAG}" bash
```

1. Run the NVIDIA FLARE provisioning tool to create an NVIDIA FLARE project
configuration file (`project.yml`) that you can customize to your needs:

```bash
nvflare provision
```

When prompted, pick the non-highly-available deployment option because
highly available NVIDIA FLARE deployments are not yet supported on
Kubernetes.

1. Run the provisioning tool again to generate deployment descriptors:

```bash
nvflare provision
```

The provisioning tool generates deployment descriptors:

- `server1` is the server that aggregates all the results from the
computation
- `site-1` and `site-2` are the clients that connect to the server
- `admin@nvidia.com` is the administration client that you use to start and
list jobs

1. Copy the NVIDIA FLARE TensorFlow demo deployment descriptors into the
workspace:

```bash
cp -R NVFlare-${NVFLARE_RELEASE_TAG}/examples/hello-world/hello-tf2 workspace/example_project/prod_00/admin@nvidia.com/transfer
```

1. Exit the NVIDIA FLARE container:

```bash
exit
```

1. Copy the workspace folder to Cloud Storage:

```bash
gcloud storage cp -r "${HOME}/nvflare-workspace/workspace" "gs://${NVFLARE_WORKSPACE_BUCKET_NAME}"
```
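
Optionally, you can confirm that the workspace was uploaded. The following is a
sketch that lists the uploaded objects:

```bash
# Recursively list the uploaded workspace contents in the Cloud Storage bucket.
gcloud storage ls --recursive "gs://${NVFLARE_WORKSPACE_BUCKET_NAME}/workspace"
```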

### Verify that NVIDIA FLARE pods are running

In this section, you verify that you deployed NVIDIA FLARE in the reference
architecture:

1. Open the
[GKE Workloads Dashboard](https://cloud.google.com/kubernetes-engine/docs/concepts/dashboards#workloads)
and verify that NVIDIA FLARE pods are running in the `fl-1` namespace. The
output to look for is similar to the following:

```bash
NAME READY STATUS RESTARTS AGE
nvflare-client1-57d5b45d84-bmv58 1/1 Running 0 16h
nvflare-client2-895b65d8f-p4fs9 1/1 Running 0 16h
nvflare-server1-66c44ddb47-dhtqz 1/1 Running 0 16h
```
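
Alternatively, if you have `kubectl` access to the cluster, you can run an
equivalent check from the command line. The following is a sketch, assuming
that the pods run in the `fl-1` namespace shown above:

```bash
# List the NVIDIA FLARE server and client pods from the command line.
kubectl get pods --namespace fl-1
```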

### Submit the training job

Everything is now ready to submit and run the job:

1. Start an NVIDIA FLARE container based on the container image you previously
built:

```bash
docker run --rm -v "${HOME}/nvflare-workspace:/opt/NVFlare/workspace" -it "${NVFLARE_EXAMPLE_CONTAINER_IMAGE_LOCALIZED_ID_WITH_TAG}" bash
```

1. Connect to NVIDIA FLARE:

```bash
cd workspace/example_project/prod_00/admin@nvidia.com/startup
./fl_admin.sh
```

When prompted, the username is `admin@nvidia.com`.

At this point, you should be connected to the NVIDIA FLARE server. The
output is similar to the following:

```bash
User Name: admin@nvidia.com
Trying to obtain server address
Obtained server address: server1:8003
Trying to login, please wait ...
Logged into server at server1:8003 with SSID: ebc6125d-0a56-4688-9b08-355fe9e4d61a
Type ? to list commands; type "? cmdName" to show usage of a command.
>
```

When connected, you can list the jobs submitted to the cluster by using the
`list_jobs` command.

1. Submit a training job:

```bash
submit_job hello-tf2
```

The output is similar to the following:

```bash
Submitted job: c8973f05-8787-41c5-8568-ecc15c7683b2
Done [262650 usecs] 2024-05-23 09:47:04.543903
```

1. Verify that the training job is running:

```bash
list_jobs
```

The output is similar to the following:

```bash
-----------------------------------------------------------------------------------------------------------------------------
| JOB ID | NAME | STATUS | SUBMIT TIME | RUN DURATION |
-----------------------------------------------------------------------------------------------------------------------------
| c8973f05-8787-41c5-8568-ecc15c7683b2 | hello-tf2 | RUNNING | 2024-05-23T09:47:04.488652+00:00 | 0:00:11.978134 |
-----------------------------------------------------------------------------------------------------------------------------
Done [136046 usecs] 2024-05-23 09:47:17.630953
```

1. Verify that the job completed successfully:

```bash
list_jobs
```

The output is similar to the following:

```bash
-----------------------------------------------------------------------------------------------------------------------------
| JOB ID | NAME | STATUS | SUBMIT TIME | RUN DURATION |
-----------------------------------------------------------------------------------------------------------------------------
| c8973f05-8787-41c5-8568-ecc15c7683b2 | hello-tf2 | FINISHED:COMPLETED | 2024-05-23T09:47:04.488652+00:00 | 0:01:44.335456 |
-----------------------------------------------------------------------------------------------------------------------------
Done [56885 usecs] 2024-05-23 09:49:15.420097
```
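
While a job is running, you can also inspect the state of the server and the
clients from the same admin console. The following is a sketch that uses
NVIDIA FLARE admin console commands:

```bash
# From the admin console prompt (>), show the server status and the clients
# that are registered with it.
check_status server
check_status client
```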

## Destroy the example environment and the reference architecture

To destroy an instance of this example and the reference architecture, you do
the following:

1. Open [Cloud Shell](https://cloud.google.com/shell).

1. Run the script to destroy the example environment and the reference
   architecture:

```sh
"${ACP_PLATFORM_BASE_DIR}/use-cases/federated-learning/examples/nvflare-tff/teardown.sh"
```
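
After the script completes, you can optionally confirm that the GKE cluster was
removed. The following is a sketch; `PROJECT_ID` is a placeholder for the
project that hosted the reference architecture:

```bash
# List the remaining GKE clusters in the project; the reference architecture
# cluster should no longer appear.
gcloud container clusters list --project PROJECT_ID
```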