From d6ec5155138fc3cfd0a4b8bb6114348d126bc09a Mon Sep 17 00:00:00 2001 From: conggguan Date: Tue, 6 Aug 2024 17:11:11 +0800 Subject: [PATCH 1/4] [Feature] Add a workflow parameter that model uploader can specific a customize prefix. Signed-off-by: conggguan --- .github/workflows/model_uploader.yml | 11 ++++++++++- CHANGELOG.md | 1 + 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/.github/workflows/model_uploader.yml b/.github/workflows/model_uploader.yml index 1c7362b2..cdfc3c28 100644 --- a/.github/workflows/model_uploader.yml +++ b/.github/workflows/model_uploader.yml @@ -24,6 +24,10 @@ on: - "BOTH" - "TORCH_SCRIPT" - "ONNX" + upload_prefix: + description: "Specifies the model prefix for uploading. For example, transforming the default path from '.../sentence-transformers/msmarco-distilbert-base-tas-b' to '.../{prefix}/msmarco-distilbert-base-tas-b'." + required: false + type: string model_type: description: "Model type for auto-tracing (SentenceTransformer/Sparse)" required: true @@ -74,7 +78,12 @@ jobs: run: | model_id=${{ github.event.inputs.model_id }} echo "model_folder=ml-models/${{github.event.inputs.model_source}}/${model_id}" >> $GITHUB_OUTPUT - echo "model_prefix_folder=ml-models/${{github.event.inputs.model_source}}/${model_id%%/*}/" >> $GITHUB_OUTPUT + if [[ -n "${{ github.event.inputs.upload_prefix }}" ]]; then + model_prefix="ml-models/${{ github.event.inputs.model_source }}/${{ github.event.inputs.upload_prefix }}" + else + model_prefix="ml-models/${{ github.event.inputs.model_source }}/${model_id%%/*}" + fi + echo "model_prefix_folder=$model_prefix" >> $GITHUB_OUTPUT - name: Initiate workflow_info id: init_workflow_info run: | diff --git a/CHANGELOG.md b/CHANGELOG.md index f8168762..49a52003 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,7 @@ Inspired from [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) - Add workflows and scripts for sparse encoding model tracing and uploading process by @conggguan in 
([#394](https://github.com/opensearch-project/opensearch-py-ml/pull/394)) ### Changed +- Add a parameter to customize the upload folder prefix ([#398](https://github.com/opensearch-project/opensearch-py-ml/pull/398)) - Modify ml-models.JenkinsFile so that it takes model format into account and can be triggered with generic webhook by @thanawan-atc in ([#211](https://github.com/opensearch-project/opensearch-py-ml/pull/211)) - Update demo_tracing_model_torchscript_onnx.ipynb to use make_model_config_json by @thanawan-atc in ([#220](https://github.com/opensearch-project/opensearch-py-ml/pull/220)) - Bump torch from 1.13.1 to 2.0.1 and add onnx dependency by @thanawan-atc ([#237](https://github.com/opensearch-project/opensearch-py-ml/pull/237)) From ec192b509c62e48ad39e41801585670f9a2eb1fb Mon Sep 17 00:00:00 2001 From: conggguan Date: Fri, 9 Aug 2024 11:28:24 +0800 Subject: [PATCH 2/4] [Fix] To change the model zip file name from hugging face org id to a custom prefix when upload_prefix provided. 
Signed-off-by: conggguan --- .ci/run-repository.sh | 3 ++- .github/workflows/model_uploader.yml | 3 ++- .../ml_models/sparse_encoding_model.py | 4 ++-- utils/model_uploader/autotracing_utils.py | 8 +++++++- utils/model_uploader/model_autotracing.py | 8 ++++++++ .../model_uploader/sparse_model_autotracing.py | 17 ++++++++++++++--- 6 files changed, 35 insertions(+), 8 deletions(-) diff --git a/.ci/run-repository.sh b/.ci/run-repository.sh index f94e3e43..edde8a08 100755 --- a/.ci/run-repository.sh +++ b/.ci/run-repository.sh @@ -72,6 +72,7 @@ elif [[ "$TASK_TYPE" == "SentenceTransformerTrace" || "$TASK_TYPE" == "SparseTra echo -e "\033[34;1mINFO:\033[0m TRACING_FORMAT: ${TRACING_FORMAT}\033[0m" echo -e "\033[34;1mINFO:\033[0m EMBEDDING_DIMENSION: ${EMBEDDING_DIMENSION:-N/A}\033[0m" echo -e "\033[34;1mINFO:\033[0m POOLING_MODE: ${POOLING_MODE:-N/A}\033[0m" + echo -e "\033[34;1mINFO:\033[0m UPLOAD_PREFIX: ${UPLOAD_PREFIX:-N/A}\033[0m" echo -e "\033[34;1mINFO:\033[0m MODEL_DESCRIPTION: ${MODEL_DESCRIPTION:-N/A}\033[0m" if [[ "$TASK_TYPE" == "SentenceTransformerTrace" ]]; then @@ -95,7 +96,7 @@ elif [[ "$TASK_TYPE" == "SentenceTransformerTrace" || "$TASK_TYPE" == "SparseTra --env "TEST_TYPE=server" \ --name opensearch-py-ml-trace-runner \ opensearch-project/opensearch-py-ml \ - nox -s "${NOX_TRACE_TYPE}-${PYTHON_VERSION}" -- ${MODEL_ID} ${MODEL_VERSION} ${TRACING_FORMAT} ${EXTRA_ARGS} -md ${MODEL_DESCRIPTION:+"$MODEL_DESCRIPTION"} + nox -s "${NOX_TRACE_TYPE}-${PYTHON_VERSION}" -- ${MODEL_ID} ${MODEL_VERSION} ${TRACING_FORMAT} ${UPLOAD_PREFIX} ${EXTRA_ARGS} -md ${MODEL_DESCRIPTION:+"$MODEL_DESCRIPTION"} # To upload a model, we need the model artifact, description, license files into local path # trace_output should include description and license file. 
diff --git a/.github/workflows/model_uploader.yml b/.github/workflows/model_uploader.yml index dae40f31..567fe061 100644 --- a/.github/workflows/model_uploader.yml +++ b/.github/workflows/model_uploader.yml @@ -206,7 +206,8 @@ jobs: echo "MODEL_VERSION=${{ github.event.inputs.model_version }}" >> $GITHUB_ENV echo "TRACING_FORMAT=${{ github.event.inputs.tracing_format }}" >> $GITHUB_ENV echo "EMBEDDING_DIMENSION=${{ github.event.inputs.embedding_dimension }}" >> $GITHUB_ENV - echo "POOLING_MODE=${{ github.event.inputs.pooling_mode }}" >> $GITHUB_ENV + echo "POOLING_MODE=${{ github.event.inputs.pooling_mode }}" >> $GITHUB_ENV + echo "UPLOAD_PREFIX=${{ github.event.inputs.upload_prefix }}" >> $GITHUB_ENV echo "MODEL_DESCRIPTION=${{ github.event.inputs.model_description }}" >> $GITHUB_ENV - name: Autotracing ${{ matrix.cluster }} secured=${{ matrix.secured }} version=${{matrix.entry.opensearch_version}} run: "./.ci/run-tests ${{ matrix.cluster }} ${{ matrix.secured }} ${{ matrix.entry.opensearch_version }} ${{github.event.inputs.model_type}}Trace" diff --git a/opensearch_py_ml/ml_models/sparse_encoding_model.py b/opensearch_py_ml/ml_models/sparse_encoding_model.py index 74786275..6bb9e88e 100644 --- a/opensearch_py_ml/ml_models/sparse_encoding_model.py +++ b/opensearch_py_ml/ml_models/sparse_encoding_model.py @@ -81,8 +81,8 @@ def save_as_pt( add_apache_license: bool = True, ) -> str: """ - Download sentence transformer model directly from huggingface, convert model to torch script format, - zip the model file and its tokenizer.json file to prepare to upload to the Open Search cluster + Download sparse encoding model directly from huggingface, convert model to torch script format, + zip the model file and its tokenizer.json file to prepare to upload to the OpenSearch cluster :param sentences: Required, for example sentences = ['today is sunny'] diff --git a/utils/model_uploader/autotracing_utils.py b/utils/model_uploader/autotracing_utils.py index affd36eb..4165f1b6 
100644 --- a/utils/model_uploader/autotracing_utils.py +++ b/utils/model_uploader/autotracing_utils.py @@ -33,6 +33,7 @@ OUTPUT_DIR = "trace_output/" LICENSE_VAR_FILE = "apache_verified.txt" DESCRIPTION_VAR_FILE = "description.txt" +SPARSE_MODEL_TYPE = "neural-sparse" RTOL_TEST = 1e-03 ATOL_TEST = 1e-05 @@ -235,6 +236,7 @@ def prepare_files_for_uploading( model_format: str, src_model_path: str, src_model_config_path: str, + upload_prefix: str = None, ) -> tuple[str, str]: """ Prepare files for uploading by storing them in UPLOAD_FOLDER_PATH @@ -253,7 +255,11 @@ def prepare_files_for_uploading( (path to model config json file) in the UPLOAD_FOLDER_PATH :rtype: Tuple[str, str] """ - model_type, model_name = model_id.split("/") + model_type, model_name = ( + model_id.split("/") + if upload_prefix is None + else (upload_prefix, model_id.split("/")[-1]) + ) model_format = model_format.lower() folder_to_delete = ( TORCHSCRIPT_FOLDER_PATH if model_format == "torch_script" else ONNX_FOLDER_PATH diff --git a/utils/model_uploader/model_autotracing.py b/utils/model_uploader/model_autotracing.py index 7f9f837f..cb6d5251 100644 --- a/utils/model_uploader/model_autotracing.py +++ b/utils/model_uploader/model_autotracing.py @@ -281,6 +281,7 @@ def main( embedding_dimension: Optional[int] = None, pooling_mode: Optional[str] = None, model_description: Optional[str] = None, + upload_prefix: Optional[str] = None ) -> None: """ Perform model auto-tracing and prepare files for uploading to OpenSearch model hub @@ -363,6 +364,7 @@ def main( TORCH_SCRIPT_FORMAT, torchscript_model_path, torchscript_model_config_path, + upload_prefix ) config_path_for_checking_description = torchscript_dst_model_config_path @@ -425,6 +427,11 @@ def main( choices=["BOTH", "TORCH_SCRIPT", "ONNX"], help="Model format for auto-tracing", ) + parser.add_argument( + "upload_prefix", + type=str, + help="Model customize path prefix for upload", + ) parser.add_argument( "-ed", "--embedding_dimension", @@ -462,4 
+469,5 @@ def main( args.embedding_dimension, args.pooling_mode, args.model_description, + args.upload_prefix ) diff --git a/utils/model_uploader/sparse_model_autotracing.py b/utils/model_uploader/sparse_model_autotracing.py index b03435d8..f7bd0cfc 100644 --- a/utils/model_uploader/sparse_model_autotracing.py +++ b/utils/model_uploader/sparse_model_autotracing.py @@ -27,6 +27,7 @@ ONNX_FORMAT, RTOL_TEST, SPARSE_ALGORITHM, + SPARSE_MODEL_TYPE, TEMP_MODEL_PATH, TORCH_SCRIPT_FORMAT, TORCHSCRIPT_FOLDER_PATH, @@ -186,6 +187,7 @@ def main( model_version: str, tracing_format: str, model_description: Optional[str] = None, + upload_prefix: Optional[str] = None ) -> None: """ Perform model auto-tracing and prepare files for uploading to OpenSearch model hub @@ -235,7 +237,10 @@ def main( torchscript_model_path, torchscript_model_config_path, ) = trace_sparse_encoding_model( - model_id, model_version, TORCH_SCRIPT_FORMAT, model_description=None + model_id, + model_version, + TORCH_SCRIPT_FORMAT, + model_description=model_description, ) torchscript_encoding_datas = register_and_deploy_sparse_encoding_model( @@ -262,6 +267,7 @@ def main( TORCH_SCRIPT_FORMAT, torchscript_model_path, torchscript_model_config_path, + upload_prefix ) config_path_for_checking_description = torchscript_dst_model_config_path @@ -273,7 +279,7 @@ def main( onnx_model_path, onnx_model_config_path, ) = trace_sparse_encoding_model( - model_id, model_version, ONNX_FORMAT, model_description=None + model_id, model_version, ONNX_FORMAT, model_description=model_description ) onnx_embedding_datas = register_and_deploy_sparse_encoding_model( @@ -325,6 +331,11 @@ def main( choices=["BOTH", "TORCH_SCRIPT", "ONNX"], help="Model format for auto-tracing", ) + parser.add_argument( + "upload_prefix", + type=str, + help="Model customize path prefix for upload", + ) parser.add_argument( "-md", "--model_description", @@ -336,4 +347,4 @@ def main( ) args = parser.parse_args() - main(args.model_id, args.model_version, 
args.tracing_format, args.model_description) + main(args.model_id, args.model_version, args.tracing_format ,args.model_description,args.upload_prefix) From 7c948202fbfe07e29135eebb766969c792fc4a57 Mon Sep 17 00:00:00 2001 From: conggguan Date: Fri, 9 Aug 2024 13:00:14 +0800 Subject: [PATCH 3/4] [Fix] Revert the redundant history. Signed-off-by: conggguan --- utils/model_uploader/autotracing_utils.py | 1 - utils/model_uploader/model_autotracing.py | 6 ++-- .../sparse_model_autotracing.py | 13 +++++--- .../upload_history/MODEL_UPLOAD_HISTORY.md | 3 -- .../upload_history/supported_models.json | 30 ------------------- 5 files changed, 12 insertions(+), 41 deletions(-) diff --git a/utils/model_uploader/autotracing_utils.py b/utils/model_uploader/autotracing_utils.py index 4165f1b6..86ea2a41 100644 --- a/utils/model_uploader/autotracing_utils.py +++ b/utils/model_uploader/autotracing_utils.py @@ -33,7 +33,6 @@ OUTPUT_DIR = "trace_output/" LICENSE_VAR_FILE = "apache_verified.txt" DESCRIPTION_VAR_FILE = "description.txt" -SPARSE_MODEL_TYPE = "neural-sparse" RTOL_TEST = 1e-03 ATOL_TEST = 1e-05 diff --git a/utils/model_uploader/model_autotracing.py b/utils/model_uploader/model_autotracing.py index cb6d5251..a054b654 100644 --- a/utils/model_uploader/model_autotracing.py +++ b/utils/model_uploader/model_autotracing.py @@ -281,7 +281,7 @@ def main( embedding_dimension: Optional[int] = None, pooling_mode: Optional[str] = None, model_description: Optional[str] = None, - upload_prefix: Optional[str] = None + upload_prefix: Optional[str] = None, ) -> None: """ Perform model auto-tracing and prepare files for uploading to OpenSearch model hub @@ -364,7 +364,7 @@ def main( TORCH_SCRIPT_FORMAT, torchscript_model_path, torchscript_model_config_path, - upload_prefix + upload_prefix, ) config_path_for_checking_description = torchscript_dst_model_config_path @@ -469,5 +469,5 @@ def main( args.embedding_dimension, args.pooling_mode, args.model_description, - args.upload_prefix + 
args.upload_prefix, ) diff --git a/utils/model_uploader/sparse_model_autotracing.py b/utils/model_uploader/sparse_model_autotracing.py index f7bd0cfc..516e6994 100644 --- a/utils/model_uploader/sparse_model_autotracing.py +++ b/utils/model_uploader/sparse_model_autotracing.py @@ -27,7 +27,6 @@ ONNX_FORMAT, RTOL_TEST, SPARSE_ALGORITHM, - SPARSE_MODEL_TYPE, TEMP_MODEL_PATH, TORCH_SCRIPT_FORMAT, TORCHSCRIPT_FOLDER_PATH, @@ -187,7 +186,7 @@ def main( model_version: str, tracing_format: str, model_description: Optional[str] = None, - upload_prefix: Optional[str] = None + upload_prefix: Optional[str] = None, ) -> None: """ Perform model auto-tracing and prepare files for uploading to OpenSearch model hub @@ -267,7 +266,7 @@ def main( TORCH_SCRIPT_FORMAT, torchscript_model_path, torchscript_model_config_path, - upload_prefix + upload_prefix, ) config_path_for_checking_description = torchscript_dst_model_config_path @@ -347,4 +346,10 @@ def main( ) args = parser.parse_args() - main(args.model_id, args.model_version, args.tracing_format ,args.model_description,args.upload_prefix) + main( + args.model_id, + args.model_version, + args.tracing_format, + args.model_description, + args.upload_prefix, + ) diff --git a/utils/model_uploader/upload_history/MODEL_UPLOAD_HISTORY.md b/utils/model_uploader/upload_history/MODEL_UPLOAD_HISTORY.md index 90bab043..a6105fed 100644 --- a/utils/model_uploader/upload_history/MODEL_UPLOAD_HISTORY.md +++ b/utils/model_uploader/upload_history/MODEL_UPLOAD_HISTORY.md @@ -21,6 +21,3 @@ The following table shows sentence transformer model upload history. 
|2023-09-13 18:03:32|@dhrubo-os|`sentence-transformers/distiluse-base-multilingual-cased-v1`|1.0.1|TORCH_SCRIPT|N/A|N/A|6178024517| |2023-10-18 18:06:15|@dhrubo-os|`sentence-transformers/paraphrase-mpnet-base-v2`|1.0.0|ONNX|N/A|N/A|6568285400| |2023-10-18 18:06:15|@dhrubo-os|`sentence-transformers/paraphrase-mpnet-base-v2`|1.0.0|TORCH_SCRIPT|N/A|N/A|6568285400| -|2024-08-07 18:01:26|@dhrubo-os|`opensearch-project/opensearch-neural-sparse-encoding-doc-v2-distill`|1.0.0|TORCH_SCRIPT|N/A|N/A|10293890748| -|2024-08-07 18:23:41|@dhrubo-os|`opensearch-project/opensearch-neural-sparse-encoding-doc-v2-mini`|1.0.0|TORCH_SCRIPT|N/A|N/A|10294048787| -|2024-08-08 09:40:44|@dhrubo-os|`opensearch-project/opensearch-neural-sparse-encoding-v2-distill`|1.0.0|TORCH_SCRIPT|N/A|N/A|10295327692| diff --git a/utils/model_uploader/upload_history/supported_models.json b/utils/model_uploader/upload_history/supported_models.json index a473ec9a..ce09ec4c 100644 --- a/utils/model_uploader/upload_history/supported_models.json +++ b/utils/model_uploader/upload_history/supported_models.json @@ -48,35 +48,5 @@ "Embedding Dimension": "N/A", "Pooling Mode": "N/A", "Workflow Run ID": "6568285400" - }, - { - "Model Uploader": "@dhrubo-os", - "Upload Time": "2024-08-07 18:01:26", - "Model ID": "opensearch-project/opensearch-neural-sparse-encoding-doc-v2-distill", - "Model Version": "1.0.0", - "Model Format": "TORCH_SCRIPT", - "Embedding Dimension": "N/A", - "Pooling Mode": "N/A", - "Workflow Run ID": "10293890748" - }, - { - "Model Uploader": "@dhrubo-os", - "Upload Time": "2024-08-07 18:23:41", - "Model ID": "opensearch-project/opensearch-neural-sparse-encoding-doc-v2-mini", - "Model Version": "1.0.0", - "Model Format": "TORCH_SCRIPT", - "Embedding Dimension": "N/A", - "Pooling Mode": "N/A", - "Workflow Run ID": "10294048787" - }, - { - "Model Uploader": "@dhrubo-os", - "Upload Time": "2024-08-08 09:40:44", - "Model ID": "opensearch-project/opensearch-neural-sparse-encoding-v2-distill", - "Model 
Version": "1.0.0", - "Model Format": "TORCH_SCRIPT", - "Embedding Dimension": "N/A", - "Pooling Mode": "N/A", - "Workflow Run ID": "10295327692" } ] \ No newline at end of file From fdabdf4b69b0e53ec0adef1efe093b20b822ba2a Mon Sep 17 00:00:00 2001 From: conggguan Date: Fri, 9 Aug 2024 13:06:43 +0800 Subject: [PATCH 4/4] [Add] add a changelog item. Signed-off-by: conggguan --- .ci/run-repository.sh | 2 +- CHANGELOG.md | 1 + utils/model_uploader/model_autotracing.py | 5 ++++- utils/model_uploader/sparse_model_autotracing.py | 5 ++++- 4 files changed, 10 insertions(+), 3 deletions(-) diff --git a/.ci/run-repository.sh b/.ci/run-repository.sh index edde8a08..3e51f90c 100755 --- a/.ci/run-repository.sh +++ b/.ci/run-repository.sh @@ -96,7 +96,7 @@ elif [[ "$TASK_TYPE" == "SentenceTransformerTrace" || "$TASK_TYPE" == "SparseTra --env "TEST_TYPE=server" \ --name opensearch-py-ml-trace-runner \ opensearch-project/opensearch-py-ml \ - nox -s "${NOX_TRACE_TYPE}-${PYTHON_VERSION}" -- ${MODEL_ID} ${MODEL_VERSION} ${TRACING_FORMAT} ${UPLOAD_PREFIX} ${EXTRA_ARGS} -md ${MODEL_DESCRIPTION:+"$MODEL_DESCRIPTION"} + nox -s "${NOX_TRACE_TYPE}-${PYTHON_VERSION}" -- ${MODEL_ID} ${MODEL_VERSION} ${TRACING_FORMAT} ${EXTRA_ARGS} -up ${UPLOAD_PREFIX} -md ${MODEL_DESCRIPTION:+"$MODEL_DESCRIPTION"} # To upload a model, we need the model artifact, description, license files into local path # trace_output should include description and license file. 
diff --git a/CHANGELOG.md b/CHANGELOG.md index 36020385..021e502f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -46,6 +46,7 @@ Inspired from [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) - updating listing file with three v2 sparse model - by @dhrubo-os ([#412](https://github.com/opensearch-project/opensearch-py-ml/pull/412)) ### Fixed +- Fix the wrong final zip file name in model_uploader workflow, now it will also be named by the upload_prefix.([#413](https://github.com/opensearch-project/opensearch-py-ml/pull/413/files)) - Fix the wrong input parameter for model_uploader's base_download_path in jekins trigger.([#402](https://github.com/opensearch-project/opensearch-py-ml/pull/402)) - Enable make_model_config_json to add model description to model config file by @thanawan-atc in ([#203](https://github.com/opensearch-project/opensearch-py-ml/pull/203)) - Correct demo_ml_commons_integration.ipynb by @thanawan-atc in ([#208](https://github.com/opensearch-project/opensearch-py-ml/pull/208)) diff --git a/utils/model_uploader/model_autotracing.py b/utils/model_uploader/model_autotracing.py index a054b654..22aed243 100644 --- a/utils/model_uploader/model_autotracing.py +++ b/utils/model_uploader/model_autotracing.py @@ -428,8 +428,11 @@ def main( help="Model format for auto-tracing", ) parser.add_argument( - "upload_prefix", + "-up", + "--upload_prefix", type=str, + nargs="?", + default=None, help="Model customize path prefix for upload", ) parser.add_argument( diff --git a/utils/model_uploader/sparse_model_autotracing.py b/utils/model_uploader/sparse_model_autotracing.py index 516e6994..2ec88017 100644 --- a/utils/model_uploader/sparse_model_autotracing.py +++ b/utils/model_uploader/sparse_model_autotracing.py @@ -331,8 +331,11 @@ def main( help="Model format for auto-tracing", ) parser.add_argument( - "upload_prefix", + "-up", + "--upload_prefix", type=str, + nargs="?", + default=None, help="Model customize path prefix for upload", ) parser.add_argument(