Skip to content

Commit

Permalink
Merge branch 'add_some_jobs_dracorno' into 'main'
Browse files Browse the repository at this point in the history
[READY]Add jobs 'hps_plugin_benchmark','147gb_model_benchmark'

See merge request dl/hugectr/hugectr!1435
  • Loading branch information
minseokl committed Jan 12, 2024
2 parents 64d89f5 + 9524386 commit 483f52b
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 2 deletions.
28 changes: 26 additions & 2 deletions ci/dracorno/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ py_single_node:
variables:
GPFSFOLDER: $DRACO_LOGDIR/py_single_node
CONT: $TRAIN_IMAGE_VERSIONED
MOUNTS: ${DRACO_DATASET}:${DATASET_MOUNT},${DRACO_WDL_PARQUET_DATASET}:${NEW_CRITEO_MOUNT}
MOUNTS: ${DRACO_DATASET}:${DATASET_MOUNT},${DRACO_DATASET_NEW_CRITEO}:${NEW_CRITEO_MOUNT}
TEST_CMD: ./ci/integration_test/py_interface/py_single_node.sub

hugectr2onnx:
Expand All @@ -175,7 +175,7 @@ ebc_multi_node:
GPFSFOLDER: $DRACO_LOGDIR/ebc_multi_node
CONT: $TRAIN_IMAGE_MULTINODE_VERSIONED
MOUNTS: ${DRACO_DATASET_NEW_CRITEO}:${DATASET_MOUNT}
WALLTIME: "00:45:00"
WALLTIME: "01:00:00"
DGXNNODES: 2
TEST_CMD: ./ci/integration_test/ebc/ebc.sub

Expand Down Expand Up @@ -214,6 +214,30 @@ s3_backend_test:
DGXNNODES: 1
TEST_CMD: ./ci/integration_test/s3/s3_backend_test.sub

# HPS plugin benchmark job, built on the shared .dracorno_test_job template.
hps_plugin_benchmark:
  extends: .dracorno_test_job
  # Depends on the plugin build job of the pipeline identified by
  # $PARENT_PIPELINE_ID.
  needs:
    - pipeline: $PARENT_PIPELINE_ID
      job: build_tf_hps_trt_plugin
  variables:
    GPFSFOLDER: $DRACO_LOGDIR/hps_plugin_benchmark
    CONT: $TF_TRT_IMAGE_VERSIONED
    # Comma-separated <source>:<target> pairs: the CI model repository is
    # mapped to /model_repo and the perf data to /perf_data.
    # NOTE(review): presumably consumed by .dracorno_test_job as container
    # mounts - confirm against the template.
    MOUNTS: ${DATA_PREFIX}/fs1/projects/gpu_compute/datasets/hugectr-ci/hpc-hugectr/hps_tf_benchmark/hps_plugin_ci_model_repo:/model_repo,${DATA_PREFIX}/fs1/projects/gpu_compute/datasets/hugectr-ci/hpc-hugectr/hps_tf_benchmark/perf_data:/perf_data
    # Kept quoted: an unquoted digits-and-colons value can be parsed as a
    # sexagesimal number by YAML 1.1 loaders.
    WALLTIME: "00:45:00"
    TEST_CMD: ./ci/benchmark/hps_plugin_benchmark/run.sub

# 147 GB model benchmark job, built on the shared .dracorno_test_job template.
147gb_model_benchmark:
  extends: .dracorno_test_job
  # Depends on the plugin build job of the pipeline identified by
  # $PARENT_PIPELINE_ID.
  needs:
    - pipeline: $PARENT_PIPELINE_ID
      job: build_tf_hps_trt_plugin
  variables:
    GPFSFOLDER: $DRACO_LOGDIR/147gb_model_benchmark
    CONT: $TF_TRT_IMAGE_VERSIONED
    # Single <source>:<target> pair mapping the 147 GB CI model repository
    # to /model_repo.
    # NOTE(review): presumably consumed by .dracorno_test_job as a container
    # mount - confirm against the template.
    MOUNTS: ${DATA_PREFIX}/fs1/projects/gpu_compute/datasets/hugectr-ci/hpc-hugectr/hps_tf_benchmark/147gb_ci_model_repo:/model_repo
    # Kept quoted: an unquoted digits-and-colons value can be parsed as a
    # sexagesimal number by YAML 1.1 loaders.
    WALLTIME: "00:45:00"
    TEST_CMD: ./ci/benchmark/147gb_model_benchmark/run.sub

# SOK (sparse_operation_kit) unit tests
sparse_operation_kit_ut-TF1:
extends:
Expand Down
8 changes: 8 additions & 0 deletions ci/template.yml
Original file line number Diff line number Diff line change
Expand Up @@ -384,6 +384,14 @@ stages:
GIT_CLONE_PATH: /lustre/fsw/devtech/hpc-hugectr/hugectr-ci/$CI_CONCURRENT_ID/$CI_PROJECT_NAME
stage: post_test

# Hidden template (leading dot) for Dracorno jobs that run in the post_test
# stage. Combines .dracorno_test_job with .hugectr:rules:test_in_child and
# sets a 30-minute WALLTIME.
.dracorno_post_test_job:
  extends:
    - .dracorno_test_job
    - .hugectr:rules:test_in_child
  variables:
    # Kept quoted: an unquoted digits-and-colons value can be parsed as a
    # sexagesimal number by YAML 1.1 loaders.
    WALLTIME: "00:30:00"
  stage: post_test

.cluster_post_test_job_daily:
extends:
- .cluster_test_job
Expand Down

0 comments on commit 483f52b

Please sign in to comment.