From e70d699ec68c3c756f3fad07935b3312c8523f11 Mon Sep 17 00:00:00 2001 From: zhichao-aws Date: Tue, 23 Apr 2024 17:26:54 +0800 Subject: [PATCH 01/23] add fields and check Signed-off-by: zhichao-aws --- .github/workflows/model_uploader.yml | 53 +++++++++++++++++++++++++--- 1 file changed, 48 insertions(+), 5 deletions(-) diff --git a/.github/workflows/model_uploader.yml b/.github/workflows/model_uploader.yml index 68d75c90f..9c97d83da 100644 --- a/.github/workflows/model_uploader.yml +++ b/.github/workflows/model_uploader.yml @@ -8,6 +8,14 @@ on: required: true type: string default: "huggingface" + model_license: + description: "Model license (e.g. Apache-2.0)" + required: true + type: choice + default: "Apache-2.0" + options: + - "Apache-2.0" + - "MIT" model_id: description: "Model ID for auto-tracing and uploading (e.g. sentence-transformers/msmarco-distilbert-base-tas-b)" required: true @@ -49,7 +57,18 @@ on: options: - "NO" - "YES" - + mit_copyright_statement: + description: "(Optional) Copyright statement for MIT licensed models. Should be picked from origin MIT license file. E.g. Copyright (c) {year} {author}" + required: false + type: string + mit_attribution_website: + description: "(Optional) The project website for MIT licensed models. The MIT license file is supposed to be found in this website" + required: false + type: string + mit_model_version: + description: "(Optional) The model version for MIT licensed models. Different from the model_version field above, this version should be quoted from the origin project website." + required: false + type: string jobs: # Step 2: Initiate workflow variable @@ -73,6 +92,9 @@ jobs: embedding_dimension=${{ github.event.inputs.embedding_dimension }} pooling_mode=${{ github.event.inputs.pooling_mode }} model_description="${{ github.event.inputs.model_description }}" + mit_copyright_statement="${{ github.event.inputs.mit_copyright_statement }}" + mit_attribution_website="${{ github.event.inputs.mit_attribution_website }}" + mit_model_version="${{ github.event.inputs.mit_model_version }}" workflow_info=" ============= Workflow Details ============== @@ -83,11 +105,15 @@ jobs: ========= Workflow Input Information ========= - Model ID: ${{ github.event.inputs.model_id }} + - Model License ${{ github.event.inputs.model_license }} - Model Version: ${{ github.event.inputs.model_version }} - Tracing Format: ${{ github.event.inputs.tracing_format }} - Embedding Dimension: ${embedding_dimension:-N/A} - Pooling Mode: ${pooling_mode:-N/A} - Model Description: ${model_description:-N/A} + - MIT Copyright Statement: ${mit_copyright_statement:-N/A} + - MIT Attribution Website: ${mit_attribution_website:-N/A} + - MIT Model Version: ${mit_model_version:-N/A} ======== Workflow Output Information ========= - Embedding Verification: Passed" @@ -96,16 +122,29 @@ jobs: echo "${workflow_info@E}" >> $GITHUB_OUTPUT echo "EOF" >> $GITHUB_OUTPUT echo "${workflow_info@E}" + - name: Check MIT license material + id: check_mit_license_material + run: | + if [[ "${{ github.event.inputs.model_license }}" == "MIT" ]] + then + echo "Uploading MIT licensed model" + if [[ "${{ github.event.inputs.mit_copyright_statement }}" == "" && "${{ github.event.inputs.mit_attribution_website }}" == "" && "${{ github.event.inputs.mit_model_version }}" == "" ]] + then + echo "missing materials for MIT model" + exit 1 + fi - name: Initiate license_line id: init_license_line run: | - echo "verified=:white_check_mark: — It is verified that this model is licensed under Apache 2.0" >> $GITHUB_OUTPUT - echo "unverified=- [ ] :warning: The license cannot be verified. Please confirm by yourself that the model is licensed under Apache 2.0 :warning:" >> $GITHUB_OUTPUT + echo "verified_apache=:white_check_mark: — It is verified that this model is licensed under Apache 2.0" >> $GITHUB_OUTPUT + echo "verified_mit=:white_check_mark: — It is verified that this model is licensed under MIT, and we have enough materials to generate the attribution file" >> $GITHUB_OUTPUT + echo "unverified=- [ ] :warning: The license cannot be verified. Please confirm by yourself that the model is licensed under Apache 2.0 or MIT with enough materials :warning:" >> $GITHUB_OUTPUT outputs: model_folder: ${{ steps.init_folders.outputs.model_folder }} sentence_transformer_folder: ${{ steps.init_folders.outputs.sentence_transformer_folder }} workflow_info: ${{ steps.init_workflow_info.outputs.workflow_info }} - verified_license_line: ${{ steps.init_license_line.outputs.verified }} + verified_apache_license_line: ${{ steps.init_license_line.outputs.verified_apache }} + verified_mit_license_line: ${{ steps.init_license_line.outputs.verified_mit }} unverified_license_line: ${{ steps.init_license_line.outputs.unverified }} # Step 3: Check if the model already exists in the model hub @@ -175,11 +214,15 @@ jobs: - name: Export Arguments run: | echo "MODEL_ID=${{ github.event.inputs.model_id }}" >> $GITHUB_ENV + echo "MODEL_LICENSE=${{ github.event.inputs.model_license }}" >> $GITHUB_ENV echo "MODEL_VERSION=${{ github.event.inputs.model_version }}" >> $GITHUB_ENV echo "TRACING_FORMAT=${{ github.event.inputs.tracing_format }}" >> $GITHUB_ENV echo "EMBEDDING_DIMENSION=${{ github.event.inputs.embedding_dimension }}" >> $GITHUB_ENV echo "POOLING_MODE=${{ github.event.inputs.pooling_mode }}" >> $GITHUB_ENV - echo "MODEL_DESCRIPTION=${{ github.event.inputs.model_description }}" >> $GITHUB_ENV + echo "MODEL_DESCRIPTION=${{ github.event.inputs.model_description }}" >> $GITHUB_ENV + echo "MIT_COPYRIGHT_STATEMENT=${{ github.event.inputs.mit_copyright_statement }}" >> $GITHUB_ENV + echo "MIT_ATTRIBUTION_WEBSITE=${{ github.event.inputs.mit_attribution_website }}" >> $GITHUB_ENV + echo "MIT_MODEL_VERSION=${{ github.event.inputs.mit_model_version }}" >> $GITHUB_ENV - name: Autotracing ${{ matrix.cluster }} secured=${{ matrix.secured }} version=${{matrix.entry.opensearch_version}} run: "./.ci/run-tests ${{ matrix.cluster }} ${{ matrix.secured }} ${{ matrix.entry.opensearch_version }} trace" - name: Limit Model Size to 2GB From dcea54e1b24effb1acd5085e2125fb47735404d2 Mon Sep 17 00:00:00 2001 From: zhichao-aws Date: Wed, 24 Apr 2024 09:39:06 +0000 Subject: [PATCH 02/23] add code for MIT licensed models Signed-off-by: zhichao-aws --- .ci/run-repository.sh | 7 +- .github/workflows/model_uploader.yml | 4 +- .../ml_models/sentencetransformermodel.py | 34 ++++++- utils/model_uploader/model_autotracing.py | 88 ++++++++++++++++--- .../model_uploader/third_party_statements.py | 60 +++++++++++++ 5 files changed, 176 insertions(+), 17 deletions(-) create mode 100644 utils/model_uploader/third_party_statements.py diff --git a/.ci/run-repository.sh b/.ci/run-repository.sh index bd97c17b6..08591c560 100755 --- a/.ci/run-repository.sh +++ b/.ci/run-repository.sh @@ -68,11 +68,15 @@ elif [[ "$TASK_TYPE" == "doc" ]]; then elif [[ "$TASK_TYPE" == "trace" ]]; then # Set up OpenSearch cluster & Run model autotracing (Invoked by model_uploader.yml workflow) echo -e "\033[34;1mINFO:\033[0m MODEL_ID: ${MODEL_ID}\033[0m" + echo -e "\033[34;1mINFO:\033[0m MODEL_LICENSE: ${MODEL_LICENSE}\033[0m" echo -e "\033[34;1mINFO:\033[0m MODEL_VERSION: ${MODEL_VERSION}\033[0m" echo -e "\033[34;1mINFO:\033[0m TRACING_FORMAT: ${TRACING_FORMAT}\033[0m" echo -e "\033[34;1mINFO:\033[0m EMBEDDING_DIMENSION: ${EMBEDDING_DIMENSION:-N/A}\033[0m" echo -e "\033[34;1mINFO:\033[0m POOLING_MODE: ${POOLING_MODE:-N/A}\033[0m" echo -e "\033[34;1mINFO:\033[0m MODEL_DESCRIPTION: ${MODEL_DESCRIPTION:-N/A}\033[0m" + echo -e "\033[34;1mINFO:\033[0m MIT_COPYRIGHT_STATEMENT: ${MIT_COPYRIGHT_STATEMENT:-N/A}\033[0m" + echo -e "\033[34;1mINFO:\033[0m MIT_ATTRIBUTION_WEBSITE: ${MIT_ATTRIBUTION_WEBSITE:-N/A}\033[0m" + echo -e "\033[34;1mINFO:\033[0m MIT_MODEL_VERSION: ${MIT_MODEL_VERSION:-N/A}\033[0m" docker run \ --network=${network_name} \ @@ -84,7 +88,8 @@ elif [[ "$TASK_TYPE" == "trace" ]]; then --env "TEST_TYPE=server" \ --name opensearch-py-ml-trace-runner \ opensearch-project/opensearch-py-ml \ - nox -s "trace-${PYTHON_VERSION}" -- ${MODEL_ID} ${MODEL_VERSION} ${TRACING_FORMAT} -ed ${EMBEDDING_DIMENSION} -pm ${POOLING_MODE} -md ${MODEL_DESCRIPTION:+"$MODEL_DESCRIPTION"} + nox -s "trace-${PYTHON_VERSION}" -- ${MODEL_ID} ${MODEL_LICENSE} ${MODEL_VERSION} ${TRACING_FORMAT} -ed ${EMBEDDING_DIMENSION} -pm ${POOLING_MODE} \ + -md ${MODEL_DESCRIPTION:+"$MODEL_DESCRIPTION"} -mcs ${MIT_COPYRIGHT_STATEMENT} -maw ${MIT_ATTRIBUTION_WEBSITE} -mmv ${MIT_MODEL_VERSION} docker cp opensearch-py-ml-trace-runner:/code/opensearch-py-ml/upload/ ./upload/ docker cp opensearch-py-ml-trace-runner:/code/opensearch-py-ml/trace_output/ ./trace_output/ diff --git a/.github/workflows/model_uploader.yml b/.github/workflows/model_uploader.yml index 9c97d83da..1c4a4746f 100644 --- a/.github/workflows/model_uploader.yml +++ b/.github/workflows/model_uploader.yml @@ -238,8 +238,8 @@ jobs: - name: License Verification id: license_verification run: | - apache_verified=$(> $GITHUB_OUTPUT echo "license_info=Automatically Verified" >> $GITHUB_OUTPUT diff --git a/opensearch_py_ml/ml_models/sentencetransformermodel.py b/opensearch_py_ml/ml_models/sentencetransformermodel.py index 10f1174b5..9f4208de1 100644 --- a/opensearch_py_ml/ml_models/sentencetransformermodel.py +++ b/opensearch_py_ml/ml_models/sentencetransformermodel.py @@ -15,7 +15,7 @@ import subprocess import time from pathlib import Path -from typing import List +from typing import List, Optional from zipfile import ZipFile import matplotlib.pyplot as plt @@ -656,6 +656,20 @@ def _add_apache_license_to_model_zip_file(self, model_zip_file_path: str): with ZipFile(str(model_zip_file_path), "a") as zipObj: zipObj.writestr("LICENSE", r.content) + + def _add_third_party_statements_text_to_model_zip_file(self, third_party_statements_text: str, model_zip_file_path: str): + """ + Add Statements text for non Apache-2.0 licensed third party model. Add it to the model zip file at model_zip_file_path + + :param third_party_statements_text: Statements text for non Apache-2.0 licensed third party model. Should be put in the final artifact. + :type third_party_statements_text: string + :param model_zip_file_path: Path to the model zip file + :type model_zip_file_path: string + :return: no return value expected + :rtype: None + """ + with ZipFile(str(model_zip_file_path), "a") as zipObj: + zipObj.writestr("THIRD-PARTY", third_party_statements_text) def zip_model( self, @@ -771,6 +785,7 @@ def save_as_pt( model_output_path: str = None, zip_file_name: str = None, add_apache_license: bool = False, + third_party_statements_text: Optional[str] = None ) -> str: """ Download sentence transformer model directly from huggingface, convert model to torch script format, @@ -802,10 +817,15 @@ def save_as_pt( :param add_apache_license: Optional, whether to add a Apache-2.0 license file to model zip file :type add_apache_license: string + :param third_party_statements_text: Statements text for non Apache-2.0 licensed third party model. Should be put in the final artifact. + :type third_party_statements_text: string :return: model zip file path. The file path where the zip file is being saved :rtype: string """ - + + if add_apache_license == True and not third_party_statements_text is None: + assert False, "When the model is from third party under non Apache-2.0 license, we can not add Apache-2.0 license for it." + model = SentenceTransformer(model_id) if model_name is None: @@ -870,6 +890,8 @@ def save_as_pt( ) if add_apache_license: self._add_apache_license_to_model_zip_file(zip_file_path) + if not third_party_statements_text is None: + self._add_third_party_statements_text_to_model_zip_file(third_party_statements_text, zip_file_path) self.torch_script_zip_file_path = zip_file_path print("zip file is saved to ", zip_file_path, "\n") @@ -883,6 +905,7 @@ def save_as_onnx( model_output_path: str = None, zip_file_name: str = None, add_apache_license: bool = False, + third_party_statements_text: Optional[str] = None ) -> str: """ Download sentence transformer model directly from huggingface, convert model to onnx format, @@ -911,9 +934,14 @@ def save_as_onnx( :param add_apache_license: Optional, whether to add a Apache-2.0 license file to model zip file :type add_apache_license: string + :param third_party_statements_text: Statements text for non Apache-2.0 licensed third party model. Should be put in the final artifact. + :type third_party_statements_text: string :return: model zip file path. The file path where the zip file is being saved :rtype: string """ + + if add_apache_license == True and not third_party_statements_text is None: + assert False, "When the model is from third party under non Apache-2.0 license, we can not add Apache-2.0 license for it." model = SentenceTransformer(model_id) @@ -968,6 +996,8 @@ def save_as_onnx( ) if add_apache_license: self._add_apache_license_to_model_zip_file(zip_file_path) + if not third_party_statements_text is None: + self._add_third_party_statements_text_to_model_zip_file(third_party_statements_text, zip_file_path) self.onnx_zip_file_path = zip_file_path print("zip file is saved to ", zip_file_path, "\n") diff --git a/utils/model_uploader/model_autotracing.py b/utils/model_uploader/model_autotracing.py index 3794087a8..6781cbe32 100644 --- a/utils/model_uploader/model_autotracing.py +++ b/utils/model_uploader/model_autotracing.py @@ -32,6 +32,7 @@ from opensearch_py_ml.ml_commons import MLCommonClient from opensearch_py_ml.ml_models.sentencetransformermodel import SentenceTransformerModel from tests import OPENSEARCH_TEST_CLIENT +from third_party_statements import generate_thirdpart_statements_for_MIT BOTH_FORMAT = "BOTH" TORCH_SCRIPT_FORMAT = "TORCH_SCRIPT" @@ -43,7 +44,7 @@ UPLOAD_FOLDER_PATH = "upload/" MODEL_CONFIG_FILE_NAME = "ml-commons_model_config.json" OUTPUT_DIR = "trace_output/" -LICENSE_VAR_FILE = "apache_verified.txt" +LICENSE_VAR_FILE = "license_verified.txt" DESCRIPTION_VAR_FILE = "description.txt" TEST_SENTENCES = [ "First test sentence", @@ -54,14 +55,14 @@ ATOL_TEST = 1e-05 -def verify_license_in_md_file() -> bool: +def verify_license_in_md_file(model_license: str = "Apache-2.0") -> bool: """ - Verify that the model is licensed under Apache 2.0 + Verify that the model is licensed under target model_license (Apache-2.0 or MIT) by looking at metadata in README.md file of the model TODO: Support other open source licenses in future - :return: Whether the model is licensed under Apache 2.0 + :return: Whether the model is under target model_license :rtype: Bool """ try: @@ -75,21 +76,22 @@ def verify_license_in_md_file() -> bool: if start == -1 or end == -1: return False metadata_info = readme_data[start + 3 : end] - if "apache-2.0" in metadata_info.lower(): - print("\nFound apache-2.0 license at " + TEMP_MODEL_PATH + "/README.md") + if model_license.lower() in metadata_info.lower(): + print(f"\nFound {model_license} license at " + TEMP_MODEL_PATH + "/README.md") return True else: - print("\nDid not find apache-2.0 license at " + TEMP_MODEL_PATH + "/README.md") + print(f"\nDid not find {model_license} license at " + TEMP_MODEL_PATH + "/README.md") return False - def trace_sentence_transformer_model( model_id: str, + model_license: str, model_version: str, model_format: str, embedding_dimension: Optional[int] = None, pooling_mode: Optional[str] = None, model_description: Optional[str] = None, + third_party_statements_text: Optional[str] = None ) -> Tuple[str, str]: """ Trace the pretrained sentence transformer model, create a model config file, @@ -97,6 +99,8 @@ def trace_sentence_transformer_model( :param model_id: Model ID of the pretrained model :type model_id: string + :param model_license: Model license ("Apache-2.0" or "MIT") + :type model_license: string :param model_version: Version of the pretrained model for registration :type model_version: string :param model_format: Model format ("TORCH_SCRIPT" or "ONNX") @@ -107,6 +111,8 @@ def trace_sentence_transformer_model( :type pooling_mode: string :param model_description: Model description input :type model_description: string + :param third_party_statements_text: Statements text for non Apache-2.0 licensed third party model. Should be put in the final artifact. + :type third_party_statements_text: string :return: Tuple of model_path (path to model zip file) and model_config_path (path to model config json file) :rtype: Tuple[str, str] """ @@ -135,11 +141,14 @@ def trace_sentence_transformer_model( model_path = pre_trained_model.save_as_pt( model_id=model_id, sentences=TEST_SENTENCES, - add_apache_license=True, + add_apache_license=model_license=="Apache-2.0", + third_party_statements_text=third_party_statements_text ) else: model_path = pre_trained_model.save_as_onnx( - model_id=model_id, add_apache_license=True + model_id=model_id, + add_apache_license=model_license=="Apache-2.0", + third_party_statements_text=third_party_statements_text ) except Exception as e: assert False, f"Raised Exception during saving model as {model_format}: {e}" @@ -407,17 +416,21 @@ def store_description_variable(config_path_for_checking_description: str) -> Non def main( model_id: str, + model_license: str, model_version: str, tracing_format: str, embedding_dimension: Optional[int] = None, pooling_mode: Optional[str] = None, model_description: Optional[str] = None, + third_party_statements_text: Optional[str] = None ) -> None: """ Perform model auto-tracing and prepare files for uploading to OpenSearch model hub :param model_id: Model ID of the pretrained model :type model_id: string + :param model_license: Model license ("Apache-2.0" or "MIT") + :type model_license: string :param model_version: Version of the pretrained model for registration :type model_version: string :param tracing_format: Tracing format ("TORCH_SCRIPT", "ONNX", or "BOTH") @@ -428,12 +441,15 @@ def main( :type pooling_mode: string :param model_description: Model description input :type model_description: string + :param third_party_statements_text: Statements text for non Apache-2.0 licensed third party model. Should be put in the final artifact. + :type third_party_statements_text: string :return: No return value expected :rtype: None """ print("\n=== Begin running model_autotracing.py ===") print("Model ID: ", model_id) + print("Model License: ", model_license) print("Model Version: ", model_version) print("Tracing Format: ", tracing_format) print( @@ -445,6 +461,10 @@ def main( "Model Description: ", model_description if model_description is not None else "N/A", ) + print( + "Third Party Statements Text: ", + third_party_statements_text if third_party_statements_text is not None else "N/A" + ) print("==========================================") ml_client = MLCommonClient(OPENSEARCH_TEST_CLIENT) @@ -455,7 +475,7 @@ def main( ) pre_trained_model.save(path=TEMP_MODEL_PATH) - license_verified = verify_license_in_md_file() + license_verified = verify_license_in_md_file(model_license=model_license) try: shutil.rmtree(TEMP_MODEL_PATH) except Exception as e: @@ -468,11 +488,13 @@ def main( torchscript_model_config_path, ) = trace_sentence_transformer_model( model_id, + model_license, model_version, TORCH_SCRIPT_FORMAT, embedding_dimension, pooling_mode, model_description, + third_party_statements_text ) torchscript_embedding_data = register_and_deploy_sentence_transformer_model( @@ -509,11 +531,13 @@ def main( onnx_model_config_path, ) = trace_sentence_transformer_model( model_id, + model_license, model_version, ONNX_FORMAT, embedding_dimension, pooling_mode, model_description, + third_party_statements_text ) onnx_embedding_data = register_and_deploy_sentence_transformer_model( @@ -557,6 +581,9 @@ def main( type=str, help="Model ID for auto-tracing and uploading (e.g. sentence-transformers/msmarco-distilbert-base-tas-b)", ) + parser.add_argument( + "model_license", type=str, help="Model license (e.g. Apache-2.0)" + ) parser.add_argument( "model_version", type=str, help="Model version number (e.g. 1.0.1)" ) @@ -593,13 +620,50 @@ def main( const=None, help="Model description if you want to overwrite the default description", ) + parser.add_argument( + "-mcs", + "--mit_copyright_statement", + type=str, + nargs="?", + default=None, + const=None, + help="Copyright statement for MIT licensed models. Should be picked from origin MIT license file. E.g. Copyright (c) year author", + ) + parser.add_argument( + "-maw", + "--mit_attribution_website", + type=str, + nargs="?", + default=None, + const=None, + help="The project website for MIT licensed models. The MIT license file is supposed to be found in this website", + ) + parser.add_argument( + "-mmv", + "--mit_model_version", + type=str, + nargs="?", + default=None, + const=None, + help="The model version for MIT licensed models. Different from the model_version field above, this version should be quoted from the origin project website.", + ) + args = parser.parse_args() - + + if args.model_license == "MIT": + # in the model_uploader.yml we check that all materials are provided + third_party_statements_text = generate_thirdpart_statements_for_MIT(model_id=args.model_id, copyright_statement=args.mit_copyright_statement, + attribution_website=args.mit_attribution_website,model_version=args.model_version) + else: + third_party_statements_text = None + main( args.model_id, + args.model_license, args.model_version, args.tracing_format, args.embedding_dimension, args.pooling_mode, args.model_description, + third_party_statements_text ) diff --git a/utils/model_uploader/third_party_statements.py b/utils/model_uploader/third_party_statements.py new file mode 100644 index 000000000..6c24640b1 --- /dev/null +++ b/utils/model_uploader/third_party_statements.py @@ -0,0 +1,60 @@ +# SPDX-License-Identifier: Apache-2.0 +# The OpenSearch Contributors require contributions made to +# this file be licensed under the Apache-2.0 license or a +# compatible open source license. +# Any modifications Copyright OpenSearch Contributors. See +# GitHub history for details. +from string import Template + +MIT_TEMPLATE = Template(""" +** $model_id; version $model_version $attribution_website +$copyright_statement + +MIT License + +$copyright_statement + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +""") + +def generate_thirdpart_statements_for_MIT( + model_id: str, + copyright_statement: str, + attribution_website: str, + model_version:str +) -> str: + """ + Generate statements text for MIT-licensed third party model. The result should be put in the final artifact. + + :param model_id: Model ID of the pretrained model + :type model_id: string + :param copyright_statement: MIT models copyright statement + :type copyright_statement: string + :param attribution_website: The project website for MIT licensed models + :type attribution_website: string + :param model_version: The model version for MIT licensed models + :type model_version: string + :return: Statements text for MIT-licensed third party model. + :rtype: str + """ + + + result = MIT_TEMPLATE.substitute(model_id=model_id, copyright_statement=copyright_statement, + attribution_website=attribution_website, model_version=model_version) + return result.strip() \ No newline at end of file From 607336cedea6e9ad73ea056f4eb2f69dacad2272 Mon Sep 17 00:00:00 2001 From: zhichao-aws Date: Wed, 24 Apr 2024 09:53:54 +0000 Subject: [PATCH 03/23] license line; remove manual approve Signed-off-by: zhichao-aws --- .github/workflows/model_uploader.yml | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/.github/workflows/model_uploader.yml b/.github/workflows/model_uploader.yml index 1c4a4746f..d80d0d868 100644 --- a/.github/workflows/model_uploader.yml +++ b/.github/workflows/model_uploader.yml @@ -241,7 +241,12 @@ jobs: license_verified=$(> $GITHUB_OUTPUT + if [[ "${{ github.event.inputs.model_license }}" == "Apache-2.0" ]] + then + echo "license_line=${{ needs.init-workflow-var.outputs.verified_apache_license_line }}" >> $GITHUB_OUTPUT + else + echo "license_line=${{ needs.init-workflow-var.outputs.verified_mit_license_line }}" >> $GITHUB_OUTPUT + fi echo "license_info=Automatically Verified" >> $GITHUB_OUTPUT else echo "license_line=${{ needs.init-workflow-var.outputs.unverified_license_line }}" >> $GITHUB_OUTPUT @@ -312,14 +317,14 @@ jobs: echo "${issue_body@E}" >> $GITHUB_OUTPUT echo "EOF" >> $GITHUB_OUTPUT echo "${issue_body@E}" - - uses: trstringer/manual-approval@v1 - with: - secret: ${{ github.TOKEN }} - approvers: ${{ steps.get_approvers.outputs.approvers }} - minimum-approvals: 2 - issue-title: "Upload Model to OpenSearch Model Hub (${{ github.event.inputs.model_id }})" - issue-body: ${{ steps.create_issue_body.outputs.issue_body }} - exclude-workflow-initiator-as-approver: false + # - uses: trstringer/manual-approval@v1 + # with: + # secret: ${{ github.TOKEN }} + # approvers: ${{ steps.get_approvers.outputs.approvers }} + # minimum-approvals: 2 + # issue-title: "Upload Model to OpenSearch Model Hub (${{ github.event.inputs.model_id }})" + # issue-body: ${{ steps.create_issue_body.outputs.issue_body }} + # exclude-workflow-initiator-as-approver: false # Step 6: Download the artifacts & Upload it to the S3 bucket model-uploading: From 7adfbf354b5d4bc7504dbf9327fc236952d739eb Mon Sep 17 00:00:00 2001 From: zhichao-aws Date: Wed, 24 Apr 2024 10:02:59 +0000 Subject: [PATCH 04/23] rename to third_party_statements_text Signed-off-by: zhichao-aws --- .../ml_models/sentencetransformermodel.py | 32 +++++++++---------- utils/model_uploader/model_autotracing.py | 28 ++++++++-------- 2 files changed, 30 insertions(+), 30 deletions(-) diff --git a/opensearch_py_ml/ml_models/sentencetransformermodel.py b/opensearch_py_ml/ml_models/sentencetransformermodel.py index 9f4208de1..3934190ae 100644 --- a/opensearch_py_ml/ml_models/sentencetransformermodel.py +++ b/opensearch_py_ml/ml_models/sentencetransformermodel.py @@ -657,19 +657,19 @@ def _add_apache_license_to_model_zip_file(self, model_zip_file_path: str): with ZipFile(str(model_zip_file_path), "a") as zipObj: zipObj.writestr("LICENSE", r.content) - def _add_third_party_statements_text_to_model_zip_file(self, third_party_statements_text: str, model_zip_file_path: str): + def _add_third_party_copyrights_statements_to_model_zip_file(self, third_party_copyrights_statements: str, model_zip_file_path: str): """ Add Statements text for non Apache-2.0 licensed third party model. Add it to the model zip file at model_zip_file_path - :param third_party_statements_text: Statements text for non Apache-2.0 licensed third party model. Should be put in the final artifact. - :type third_party_statements_text: string + :param third_party_copyrights_statements: Statements text for non Apache-2.0 licensed third party model. Should be put in the final artifact. + :type third_party_copyrights_statements: string :param model_zip_file_path: Path to the model zip file :type model_zip_file_path: string :return: no return value expected :rtype: None """ with ZipFile(str(model_zip_file_path), "a") as zipObj: - zipObj.writestr("THIRD-PARTY", third_party_statements_text) + zipObj.writestr("THIRD-PARTY", third_party_copyrights_statements) def zip_model( self, @@ -785,7 +785,7 @@ def save_as_pt( model_output_path: str = None, zip_file_name: str = None, add_apache_license: bool = False, - third_party_statements_text: Optional[str] = None + third_party_copyrights_statements: Optional[str] = None ) -> str: """ Download sentence transformer model directly from huggingface, convert model to torch script format, @@ -817,13 +817,13 @@ def save_as_pt( :param add_apache_license: Optional, whether to add a Apache-2.0 license file to model zip file :type add_apache_license: string - :param third_party_statements_text: Statements text for non Apache-2.0 licensed third party model. Should be put in the final artifact. - :type third_party_statements_text: string + :param third_party_copyrights_statements: Statements text for non Apache-2.0 licensed third party model. Should be put in the final artifact. + :type third_party_copyrights_statements: string :return: model zip file path. The file path where the zip file is being saved :rtype: string """ - if add_apache_license == True and not third_party_statements_text is None: + if add_apache_license == True and not third_party_copyrights_statements is None: assert False, "When the model is from third party under non Apache-2.0 license, we can not add Apache-2.0 license for it." model = SentenceTransformer(model_id) @@ -890,8 +890,8 @@ def save_as_pt( ) if add_apache_license: self._add_apache_license_to_model_zip_file(zip_file_path) - if not third_party_statements_text is None: - self._add_third_party_statements_text_to_model_zip_file(third_party_statements_text, zip_file_path) + if not third_party_copyrights_statements is None: + self._add_third_party_copyrights_statements_to_model_zip_file(third_party_copyrights_statements, zip_file_path) self.torch_script_zip_file_path = zip_file_path print("zip file is saved to ", zip_file_path, "\n") @@ -905,7 +905,7 @@ def save_as_onnx( model_output_path: str = None, zip_file_name: str = None, add_apache_license: bool = False, - third_party_statements_text: Optional[str] = None + third_party_copyrights_statements: Optional[str] = None ) -> str: """ Download sentence transformer model directly from huggingface, convert model to onnx format, @@ -934,13 +934,13 @@ def save_as_onnx( :param add_apache_license: Optional, whether to add a Apache-2.0 license file to model zip file :type add_apache_license: string - :param third_party_statements_text: Statements text for non Apache-2.0 licensed third party model. Should be put in the final artifact. - :type third_party_statements_text: string + :param third_party_copyrights_statements: Statements text for non Apache-2.0 licensed third party model. Should be put in the final artifact. + :type third_party_copyrights_statements: string :return: model zip file path. The file path where the zip file is being saved :rtype: string """ - if add_apache_license == True and not third_party_statements_text is None: + if add_apache_license == True and not third_party_copyrights_statements is None: assert False, "When the model is from third party under non Apache-2.0 license, we can not add Apache-2.0 license for it." model = SentenceTransformer(model_id) @@ -996,8 +996,8 @@ def save_as_onnx( ) if add_apache_license: self._add_apache_license_to_model_zip_file(zip_file_path) - if not third_party_statements_text is None: - self._add_third_party_statements_text_to_model_zip_file(third_party_statements_text, zip_file_path) + if not third_party_copyrights_statements is None: + self._add_third_party_copyrights_statements_to_model_zip_file(third_party_copyrights_statements, zip_file_path) self.onnx_zip_file_path = zip_file_path print("zip file is saved to ", zip_file_path, "\n") diff --git a/utils/model_uploader/model_autotracing.py b/utils/model_uploader/model_autotracing.py index 6781cbe32..517308792 100644 --- a/utils/model_uploader/model_autotracing.py +++ b/utils/model_uploader/model_autotracing.py @@ -91,7 +91,7 @@ def trace_sentence_transformer_model( embedding_dimension: Optional[int] = None, pooling_mode: Optional[str] = None, model_description: Optional[str] = None, - third_party_statements_text: Optional[str] = None + third_party_copyrights_statements: Optional[str] = None ) -> Tuple[str, str]: """ Trace the pretrained sentence transformer model, create a model config file, @@ -111,8 +111,8 @@ def trace_sentence_transformer_model( :type pooling_mode: string :param model_description: Model description input :type model_description: string - :param third_party_statements_text: Statements text for non Apache-2.0 licensed third party model. Should be put in the final artifact. - :type third_party_statements_text: string + :param third_party_copyrights_statements: Statements text for non Apache-2.0 licensed third party model. Should be put in the final artifact. + :type third_party_copyrights_statements: string :return: Tuple of model_path (path to model zip file) and model_config_path (path to model config json file) :rtype: Tuple[str, str] """ @@ -142,13 +142,13 @@ def trace_sentence_transformer_model( model_id=model_id, sentences=TEST_SENTENCES, add_apache_license=model_license=="Apache-2.0", - third_party_statements_text=third_party_statements_text + third_party_copyrights_statements=third_party_copyrights_statements ) else: model_path = pre_trained_model.save_as_onnx( model_id=model_id, add_apache_license=model_license=="Apache-2.0", - third_party_statements_text=third_party_statements_text + third_party_copyrights_statements=third_party_copyrights_statements ) except Exception as e: assert False, f"Raised Exception during saving model as {model_format}: {e}" @@ -422,7 +422,7 @@ def main( embedding_dimension: Optional[int] = None, pooling_mode: Optional[str] = None, model_description: Optional[str] = None, - third_party_statements_text: Optional[str] = None + third_party_copyrights_statements: Optional[str] = None ) -> None: """ Perform model auto-tracing and prepare files for uploading to OpenSearch model hub @@ -441,8 +441,8 @@ def main( :type pooling_mode: string :param model_description: Model description input :type model_description: string - :param third_party_statements_text: Statements text for non Apache-2.0 licensed third party model. Should be put in the final artifact. - :type third_party_statements_text: string + :param third_party_copyrights_statements: Statements text for non Apache-2.0 licensed third party model. Should be put in the final artifact. + :type third_party_copyrights_statements: string :return: No return value expected :rtype: None """ @@ -463,7 +463,7 @@ def main( ) print( "Third Party Statements Text: ", - third_party_statements_text if third_party_statements_text is not None else "N/A" + third_party_copyrights_statements if third_party_copyrights_statements is not None else "N/A" ) print("==========================================") @@ -494,7 +494,7 @@ def main( embedding_dimension, pooling_mode, model_description, - third_party_statements_text + third_party_copyrights_statements ) torchscript_embedding_data = register_and_deploy_sentence_transformer_model( @@ -537,7 +537,7 @@ def main( embedding_dimension, pooling_mode, model_description, - third_party_statements_text + third_party_copyrights_statements ) onnx_embedding_data = register_and_deploy_sentence_transformer_model( @@ -652,10 +652,10 @@ def main( if args.model_license == "MIT": # in the model_uploader.yml we check that all materials are provided - third_party_statements_text = generate_thirdpart_statements_for_MIT(model_id=args.model_id, copyright_statement=args.mit_copyright_statement, + third_party_copyrights_statements = generate_thirdpart_statements_for_MIT(model_id=args.model_id, copyright_statement=args.mit_copyright_statement, attribution_website=args.mit_attribution_website,model_version=args.model_version) else: - third_party_statements_text = None + third_party_copyrights_statements = None main( args.model_id, @@ -665,5 +665,5 @@ def main( args.embedding_dimension, args.pooling_mode, args.model_description, - third_party_statements_text + third_party_copyrights_statements ) From 7d183788a49cf028eb24cbbfd667e160166e39e2 Mon Sep 17 00:00:00 2001 From: zhichao-aws Date: Wed, 24 Apr 2024 10:13:01 +0000 Subject: [PATCH 05/23] change to copyright statements Signed-off-by: zhichao-aws --- .github/workflows/model_uploader.yml | 34 +++++++---------------- utils/model_uploader/model_autotracing.py | 33 +++------------------- 2 files changed, 14 insertions(+), 53 deletions(-) diff --git a/.github/workflows/model_uploader.yml b/.github/workflows/model_uploader.yml index d80d0d868..d0fe97dc2 100644 --- a/.github/workflows/model_uploader.yml +++ b/.github/workflows/model_uploader.yml @@ -57,16 +57,8 @@ on: options: - "NO" - "YES" - mit_copyright_statement: - description: "(Optional) Copyright statement for MIT licensed models. Should be picked from origin MIT license file. E.g. Copyright (c) {year} {author}" - required: false - type: string - mit_attribution_website: - description: "(Optional) The project website for MIT licensed models. The MIT license file is supposed to be found in this website" - required: false - type: string - mit_model_version: - description: "(Optional) The model version for MIT licensed models. Different from the model_version field above, this version should be quoted from the origin project website." + third_party_copyrights_statements: + description: "(Optional) Copyright statements for MIT licensed models. Should be put in the final artifact." required: false type: string @@ -92,9 +84,7 @@ jobs: embedding_dimension=${{ github.event.inputs.embedding_dimension }} pooling_mode=${{ github.event.inputs.pooling_mode }} model_description="${{ github.event.inputs.model_description }}" - mit_copyright_statement="${{ github.event.inputs.mit_copyright_statement }}" - mit_attribution_website="${{ github.event.inputs.mit_attribution_website }}" - mit_model_version="${{ github.event.inputs.mit_model_version }}" + third_party_copyrights_statements="${{ github.event.inputs.third_party_copyrights_statements }}" workflow_info=" ============= Workflow Details ============== @@ -111,9 +101,7 @@ jobs: - Embedding Dimension: ${embedding_dimension:-N/A} - Pooling Mode: ${pooling_mode:-N/A} - Model Description: ${model_description:-N/A} - - MIT Copyright Statement: ${mit_copyright_statement:-N/A} - - MIT Attribution Website: ${mit_attribution_website:-N/A} - - MIT Model Version: ${mit_model_version:-N/A} + - Third Party Copyrights Statements: ${third_party_copyrights_statements:-N/A} ======== Workflow Output Information ========= - Embedding Verification: Passed" @@ -122,22 +110,22 @@ jobs: echo "${workflow_info@E}" >> $GITHUB_OUTPUT echo "EOF" >> $GITHUB_OUTPUT echo "${workflow_info@E}" - - name: Check MIT license material - id: check_mit_license_material + - name: Check MIT license copyright statements + id: check_mit_license_copyright_statements run: | if [[ "${{ github.event.inputs.model_license }}" == "MIT" ]] then echo "Uploading MIT licensed model" - if [[ "${{ github.event.inputs.mit_copyright_statement }}" == "" && "${{ github.event.inputs.mit_attribution_website }}" == "" && "${{ github.event.inputs.mit_model_version }}" == "" ]] + if [[ "${{ github.event.inputs.third_party_copyrights_statements }}" == "" ]] then - echo "missing materials for MIT model" + echo "missing copyright statements for MIT model" exit 1 fi - name: Initiate license_line id: init_license_line run: | echo "verified_apache=:white_check_mark: — It is verified that this model is licensed under Apache 2.0" >> $GITHUB_OUTPUT - echo "verified_mit=:white_check_mark: — It is verified that this model is licensed under MIT, and we have enough materials to generate the attribution file" >> $GITHUB_OUTPUT + echo "verified_mit=:white_check_mark: — It is verified that this model is licensed under MIT, and we have provided copyright statements" >> $GITHUB_OUTPUT echo "unverified=- [ ] :warning: The license cannot be verified. Please confirm by yourself that the model is licensed under Apache 2.0 or MIT with enough materials :warning:" >> $GITHUB_OUTPUT outputs: model_folder: ${{ steps.init_folders.outputs.model_folder }} @@ -220,9 +208,7 @@ jobs: echo "EMBEDDING_DIMENSION=${{ github.event.inputs.embedding_dimension }}" >> $GITHUB_ENV echo "POOLING_MODE=${{ github.event.inputs.pooling_mode }}" >> $GITHUB_ENV echo "MODEL_DESCRIPTION=${{ github.event.inputs.model_description }}" >> $GITHUB_ENV - echo "MIT_COPYRIGHT_STATEMENT=${{ github.event.inputs.mit_copyright_statement }}" >> $GITHUB_ENV - echo "MIT_ATTRIBUTION_WEBSITE=${{ github.event.inputs.mit_attribution_website }}" >> $GITHUB_ENV - echo "MIT_MODEL_VERSION=${{ github.event.inputs.mit_model_version }}" >> $GITHUB_ENV + echo "THIRD_PARTY_COPYRIGHTS_STATEMENTS=${{ github.event.inputs.third_party_copyrights_statements }}" >> $GITHUB_ENV - name: Autotracing ${{ matrix.cluster }} secured=${{ matrix.secured }} version=${{matrix.entry.opensearch_version}} run: "./.ci/run-tests ${{ matrix.cluster }} ${{ matrix.secured }} ${{ matrix.entry.opensearch_version }} trace" - name: Limit Model Size to 2GB diff --git a/utils/model_uploader/model_autotracing.py b/utils/model_uploader/model_autotracing.py index 517308792..4fa9545df 100644 --- a/utils/model_uploader/model_autotracing.py +++ b/utils/model_uploader/model_autotracing.py @@ -621,41 +621,16 @@ def main( help="Model description if you want to overwrite the default description", ) parser.add_argument( - "-mcs", - "--mit_copyright_statement", + "-tpcs", + "--third_party_copyrights_statements", type=str, nargs="?", default=None, const=None, - help="Copyright statement for MIT licensed models. Should be picked from origin MIT license file. E.g. Copyright (c) year author", - ) - parser.add_argument( - "-maw", - "--mit_attribution_website", - type=str, - nargs="?", - default=None, - const=None, - help="The project website for MIT licensed models. The MIT license file is supposed to be found in this website", - ) - parser.add_argument( - "-mmv", - "--mit_model_version", - type=str, - nargs="?", - default=None, - const=None, - help="The model version for MIT licensed models. Different from the model_version field above, this version should be quoted from the origin project website.", + help="Copyright statement for MIT licensed models.", ) args = parser.parse_args() - - if args.model_license == "MIT": - # in the model_uploader.yml we check that all materials are provided - third_party_copyrights_statements = generate_thirdpart_statements_for_MIT(model_id=args.model_id, copyright_statement=args.mit_copyright_statement, - attribution_website=args.mit_attribution_website,model_version=args.model_version) - else: - third_party_copyrights_statements = None main( args.model_id, @@ -665,5 +640,5 @@ def main( args.embedding_dimension, args.pooling_mode, args.model_description, - third_party_copyrights_statements + args.third_party_copyrights_statements ) From a5c6627ef74dedd6035dff2b0d6e7e6a2b3cd2cd Mon Sep 17 00:00:00 2001 From: zhichao-aws Date: Wed, 24 Apr 2024 10:15:41 +0000 Subject: [PATCH 06/23] change to copyright statements Signed-off-by: zhichao-aws --- .ci/run-repository.sh | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/.ci/run-repository.sh b/.ci/run-repository.sh index 08591c560..67e156029 100755 --- a/.ci/run-repository.sh +++ b/.ci/run-repository.sh @@ -74,9 +74,7 @@ elif [[ "$TASK_TYPE" == "trace" ]]; then echo -e "\033[34;1mINFO:\033[0m EMBEDDING_DIMENSION: ${EMBEDDING_DIMENSION:-N/A}\033[0m" echo -e "\033[34;1mINFO:\033[0m POOLING_MODE: ${POOLING_MODE:-N/A}\033[0m" echo -e "\033[34;1mINFO:\033[0m MODEL_DESCRIPTION: ${MODEL_DESCRIPTION:-N/A}\033[0m" - echo -e "\033[34;1mINFO:\033[0m MIT_COPYRIGHT_STATEMENT: ${MIT_COPYRIGHT_STATEMENT:-N/A}\033[0m" - echo -e "\033[34;1mINFO:\033[0m MIT_ATTRIBUTION_WEBSITE: ${MIT_ATTRIBUTION_WEBSITE:-N/A}\033[0m" - echo -e "\033[34;1mINFO:\033[0m MIT_MODEL_VERSION: ${MIT_MODEL_VERSION:-N/A}\033[0m" + echo -e "\033[34;1mINFO:\033[0m THIRD_PARTY_COPYRIGHTS_STATEMENTS: ${THIRD_PARTY_COPYRIGHTS_STATEMENTS:-N/A}\033[0m" docker run \ --network=${network_name} \ @@ -89,7 +87,7 @@ elif [[ "$TASK_TYPE" == "trace" ]]; then --name opensearch-py-ml-trace-runner \ opensearch-project/opensearch-py-ml \ nox -s "trace-${PYTHON_VERSION}" -- ${MODEL_ID} ${MODEL_LICENSE} ${MODEL_VERSION} ${TRACING_FORMAT} -ed ${EMBEDDING_DIMENSION} -pm ${POOLING_MODE} \ - -md ${MODEL_DESCRIPTION:+"$MODEL_DESCRIPTION"} -mcs ${MIT_COPYRIGHT_STATEMENT} -maw ${MIT_ATTRIBUTION_WEBSITE} -mmv ${MIT_MODEL_VERSION} + -md ${MODEL_DESCRIPTION:+"$MODEL_DESCRIPTION"} -tpcs ${THIRD_PARTY_COPYRIGHTS_STATEMENTS} docker cp opensearch-py-ml-trace-runner:/code/opensearch-py-ml/upload/ ./upload/ docker cp opensearch-py-ml-trace-runner:/code/opensearch-py-ml/trace_output/ ./trace_output/ From 46b99d74173a2a496a7478af6bab656327f0ca32 Mon Sep 17 00:00:00 2001 From: zhichao-aws Date: Wed, 24 Apr 2024 10:24:01 +0000 Subject: [PATCH 07/23] add fi Signed-off-by: zhichao-aws --- .github/workflows/model_uploader.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/model_uploader.yml b/.github/workflows/model_uploader.yml index d0fe97dc2..fc9addad6 100644 --- a/.github/workflows/model_uploader.yml +++ b/.github/workflows/model_uploader.yml @@ -120,6 +120,7 @@ jobs: then echo "missing copyright statements for MIT model" exit 1 + fi fi - name: Initiate license_line id: init_license_line From efbc53389f7662a47f470443f9303e43db9fc3ab Mon Sep 17 00:00:00 2001 From: zhichao-aws Date: Thu, 25 Apr 2024 03:06:25 +0000 Subject: [PATCH 08/23] add quote to string var Signed-off-by: zhichao-aws --- .ci/run-repository.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci/run-repository.sh b/.ci/run-repository.sh index 67e156029..8754f9dd8 100755 --- a/.ci/run-repository.sh +++ b/.ci/run-repository.sh @@ -87,7 +87,7 @@ elif [[ "$TASK_TYPE" == "trace" ]]; then --name opensearch-py-ml-trace-runner \ opensearch-project/opensearch-py-ml \ nox -s "trace-${PYTHON_VERSION}" -- ${MODEL_ID} ${MODEL_LICENSE} ${MODEL_VERSION} ${TRACING_FORMAT} -ed ${EMBEDDING_DIMENSION} -pm ${POOLING_MODE} \ - -md ${MODEL_DESCRIPTION:+"$MODEL_DESCRIPTION"} -tpcs ${THIRD_PARTY_COPYRIGHTS_STATEMENTS} + -md ${MODEL_DESCRIPTION:+"$MODEL_DESCRIPTION"} -tpcs "${THIRD_PARTY_COPYRIGHTS_STATEMENTS}" docker cp opensearch-py-ml-trace-runner:/code/opensearch-py-ml/upload/ ./upload/ docker cp opensearch-py-ml-trace-runner:/code/opensearch-py-ml/trace_output/ ./trace_output/ From 3e9871dc2a093ea56a21c52053d2784f432d713f Mon Sep 17 00:00:00 2001 From: zhichao-aws Date: Thu, 25 Apr 2024 06:26:52 +0000 Subject: [PATCH 09/23] fix quote Signed-off-by: zhichao-aws --- .github/workflows/model_uploader.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/model_uploader.yml b/.github/workflows/model_uploader.yml index fc9addad6..7b2f6f6fa 100644 --- a/.github/workflows/model_uploader.yml +++ b/.github/workflows/model_uploader.yml @@ -290,7 +290,7 @@ jobs: - name: Create Issue Body id: create_issue_body run: | - issue_body="Please approve or deny opensearch-py-ml model uploading: + issue_body='Please approve or deny opensearch-py-ml model uploading: ${{ needs.model-auto-tracing.outputs.license_line }} @@ -298,7 +298,7 @@ jobs: ${{ needs.model-auto-tracing.outputs.model_description_info }} ===== Dry Run of Model Uploading ===== - ${{ needs.model-auto-tracing.outputs.dryrun_output }}" + ${{ needs.model-auto-tracing.outputs.dryrun_output }}' echo "issue_body<> $GITHUB_OUTPUT echo "${issue_body@E}" >> $GITHUB_OUTPUT From 20d4fc851d00bd06ad31affb3848e8d52b0bfc3e Mon Sep 17 00:00:00 2001 From: zhichao-aws Date: Thu, 25 Apr 2024 06:39:14 +0000 Subject: [PATCH 10/23] fix quote Signed-off-by: zhichao-aws --- .github/workflows/model_uploader.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/model_uploader.yml b/.github/workflows/model_uploader.yml index 7b2f6f6fa..21437e628 100644 --- a/.github/workflows/model_uploader.yml +++ b/.github/workflows/model_uploader.yml @@ -375,14 +375,14 @@ jobs: - name: Create PR Body id: create_pr_body run: | - pr_body=" + pr_body=' - [ ] This PR made commit to only these three files: MODEL_UPLOAD_HISTORY.md, supported_models.json, and CHANGELOG.md. - [ ] CHANGELOG.md has been updated by the workflow or by you if the workflow fails to do so. - [ ] Merge conflicts have been resolved. ${{ needs.init-workflow-var.outputs.workflow_info }} ${{ needs.model-auto-tracing.outputs.license_info }} - ${{ needs.model-auto-tracing.outputs.model_description_info }}" + ${{ needs.model-auto-tracing.outputs.model_description_info }}' echo "pr_body<> $GITHUB_OUTPUT echo "${pr_body@E}" >> $GITHUB_OUTPUT From 337ffa984652ebb9f5b2e0e2c34ba10ee63a208c Mon Sep 17 00:00:00 2001 From: zhichao-aws Date: Thu, 25 Apr 2024 07:17:40 +0000 Subject: [PATCH 11/23] log failure state Signed-off-by: zhichao-aws --- opensearch_py_ml/ml_commons/ml_commons_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opensearch_py_ml/ml_commons/ml_commons_client.py b/opensearch_py_ml/ml_commons/ml_commons_client.py index 2f387ad11..cc6676ebe 100644 --- a/opensearch_py_ml/ml_commons/ml_commons_client.py +++ b/opensearch_py_ml/ml_commons/ml_commons_client.py @@ -382,7 +382,7 @@ def deploy_model(self, model_id: str, wait_until_deployed: bool = True) -> objec elif model_state == "PARTIALLY_DEPLOYED": print("Model deployed only partially") else: - raise Exception("Model deployment failed") + raise Exception("Model deployment failed, model_state:",model_state) return self._get_task_info(task_id) From a0073019c35095c3ed14eab5c0ecd81d70d448f4 Mon Sep 17 00:00:00 2001 From: zhichao-aws Date: Thu, 25 Apr 2024 08:15:19 +0000 Subject: [PATCH 12/23] use url Signed-off-by: zhichao-aws --- .ci/run-repository.sh | 4 +- .github/workflows/model_uploader.yml | 23 ++++--- opensearch_py_ml/field_mappings.py | 6 +- .../ml_commons/ml_commons_client.py | 2 +- opensearch_py_ml/ml_commons/model_uploader.py | 6 +- .../metrics_correlation/event_detection.py | 6 +- .../ml_models/sentencetransformermodel.py | 38 +++++++---- utils/model_uploader/model_autotracing.py | 55 +++++++++------- .../model_uploader/third_party_statements.py | 66 ++++++------------- 9 files changed, 103 insertions(+), 103 deletions(-) diff --git a/.ci/run-repository.sh b/.ci/run-repository.sh index 8754f9dd8..b3b6974eb 100755 --- a/.ci/run-repository.sh +++ b/.ci/run-repository.sh @@ -74,7 +74,7 @@ elif [[ "$TASK_TYPE" == "trace" ]]; then echo -e "\033[34;1mINFO:\033[0m EMBEDDING_DIMENSION: ${EMBEDDING_DIMENSION:-N/A}\033[0m" echo -e "\033[34;1mINFO:\033[0m POOLING_MODE: ${POOLING_MODE:-N/A}\033[0m" echo -e "\033[34;1mINFO:\033[0m MODEL_DESCRIPTION: ${MODEL_DESCRIPTION:-N/A}\033[0m" - echo -e "\033[34;1mINFO:\033[0m THIRD_PARTY_COPYRIGHTS_STATEMENTS: ${THIRD_PARTY_COPYRIGHTS_STATEMENTS:-N/A}\033[0m" + echo -e "\033[34;1mINFO:\033[0m MIT_LICENSE_URL: ${MIT_LICENSE_URL:-N/A}\033[0m" docker run \ --network=${network_name} \ @@ -87,7 +87,7 @@ elif [[ "$TASK_TYPE" == "trace" ]]; then --name opensearch-py-ml-trace-runner \ opensearch-project/opensearch-py-ml \ nox -s "trace-${PYTHON_VERSION}" -- ${MODEL_ID} ${MODEL_LICENSE} ${MODEL_VERSION} ${TRACING_FORMAT} -ed ${EMBEDDING_DIMENSION} -pm ${POOLING_MODE} \ - -md ${MODEL_DESCRIPTION:+"$MODEL_DESCRIPTION"} -tpcs "${THIRD_PARTY_COPYRIGHTS_STATEMENTS}" + -md ${MODEL_DESCRIPTION:+"$MODEL_DESCRIPTION"} -mlu ${MIT_LICENSE_URL} docker cp opensearch-py-ml-trace-runner:/code/opensearch-py-ml/upload/ ./upload/ docker cp opensearch-py-ml-trace-runner:/code/opensearch-py-ml/trace_output/ ./trace_output/ diff --git a/.github/workflows/model_uploader.yml b/.github/workflows/model_uploader.yml index 21437e628..39d6353f0 100644 --- a/.github/workflows/model_uploader.yml +++ b/.github/workflows/model_uploader.yml @@ -57,8 +57,8 @@ on: options: - "NO" - "YES" - third_party_copyrights_statements: - description: "(Optional) Copyright statements for MIT licensed models. Should be put in the final artifact." + MIT_license_url: + description: "(Optional) MIT license of the huggingface MIT model." required: false type: string @@ -84,7 +84,7 @@ jobs: embedding_dimension=${{ github.event.inputs.embedding_dimension }} pooling_mode=${{ github.event.inputs.pooling_mode }} model_description="${{ github.event.inputs.model_description }}" - third_party_copyrights_statements="${{ github.event.inputs.third_party_copyrights_statements }}" + MIT_license_url="${{ github.event.inputs.MIT_license_url }}" workflow_info=" ============= Workflow Details ============== @@ -101,7 +101,7 @@ jobs: - Embedding Dimension: ${embedding_dimension:-N/A} - Pooling Mode: ${pooling_mode:-N/A} - Model Description: ${model_description:-N/A} - - Third Party Copyrights Statements: ${third_party_copyrights_statements:-N/A} + - MIT License Url: ${MIT_license_url:-N/A} ======== Workflow Output Information ========= - Embedding Verification: Passed" @@ -110,15 +110,20 @@ jobs: echo "${workflow_info@E}" >> $GITHUB_OUTPUT echo "EOF" >> $GITHUB_OUTPUT echo "${workflow_info@E}" - - name: Check MIT license copyright statements - id: check_mit_license_copyright_statements + - name: Check MIT license url + id: check_mit_license_url run: | if [[ "${{ github.event.inputs.model_license }}" == "MIT" ]] then echo "Uploading MIT licensed model" - if [[ "${{ github.event.inputs.third_party_copyrights_statements }}" == "" ]] + if [[ "${{ github.event.inputs.MIT_license_url }}" == "" ]] then - echo "missing copyright statements for MIT model" + echo "missing MIT license url" + exit 1 + fi + if [[ "${{ github.event.inputs.model_source }}" == "" ]] + then + echo "only support MIT models from huggingface now" exit 1 fi fi @@ -209,7 +214,7 @@ jobs: echo "EMBEDDING_DIMENSION=${{ github.event.inputs.embedding_dimension }}" >> $GITHUB_ENV echo "POOLING_MODE=${{ github.event.inputs.pooling_mode }}" >> $GITHUB_ENV echo "MODEL_DESCRIPTION=${{ github.event.inputs.model_description }}" >> $GITHUB_ENV - echo "THIRD_PARTY_COPYRIGHTS_STATEMENTS=${{ github.event.inputs.third_party_copyrights_statements }}" >> $GITHUB_ENV + echo "MIT_LICENSE_URL=${{ github.event.inputs.MIT_license_url }}" >> $GITHUB_ENV - name: Autotracing ${{ matrix.cluster }} secured=${{ matrix.secured }} version=${{matrix.entry.opensearch_version}} run: "./.ci/run-tests ${{ matrix.cluster }} ${{ matrix.secured }} ${{ matrix.entry.opensearch_version }} trace" - name: Limit Model Size to 2GB diff --git a/opensearch_py_ml/field_mappings.py b/opensearch_py_ml/field_mappings.py index 75bbd1833..b3c83914a 100644 --- a/opensearch_py_ml/field_mappings.py +++ b/opensearch_py_ml/field_mappings.py @@ -445,9 +445,9 @@ def find_aggregatable(row, df): try: series = df.loc[df.os_field_name == os_field_name_keyword] if not series.empty and series.is_aggregatable.squeeze(): - row_as_dict["aggregatable_os_field_name"] = ( - os_field_name_keyword - ) + row_as_dict[ + "aggregatable_os_field_name" + ] = os_field_name_keyword else: row_as_dict["aggregatable_os_field_name"] = None except KeyError: diff --git a/opensearch_py_ml/ml_commons/ml_commons_client.py b/opensearch_py_ml/ml_commons/ml_commons_client.py index cc6676ebe..8b50a95cc 100644 --- a/opensearch_py_ml/ml_commons/ml_commons_client.py +++ b/opensearch_py_ml/ml_commons/ml_commons_client.py @@ -382,7 +382,7 @@ def deploy_model(self, model_id: str, wait_until_deployed: bool = True) -> objec elif model_state == "PARTIALLY_DEPLOYED": print("Model deployed only partially") else: - raise Exception("Model deployment failed, model_state:",model_state) + raise Exception("Model deployment failed, model_state:", model_state) return self._get_task_info(task_id) diff --git a/opensearch_py_ml/ml_commons/model_uploader.py b/opensearch_py_ml/ml_commons/model_uploader.py index 850f6a80a..af7af2f62 100644 --- a/opensearch_py_ml/ml_commons/model_uploader.py +++ b/opensearch_py_ml/ml_commons/model_uploader.py @@ -95,9 +95,9 @@ def _register_model( model_meta_json[TOTAL_CHUNKS_FIELD] = total_num_chunks if MODEL_CONTENT_SIZE_IN_BYTES_FIELD not in model_meta_json: - model_meta_json[MODEL_CONTENT_SIZE_IN_BYTES_FIELD] = ( - model_content_size_in_bytes - ) + model_meta_json[ + MODEL_CONTENT_SIZE_IN_BYTES_FIELD + ] = model_content_size_in_bytes if MODEL_CONTENT_HASH_VALUE not in model_meta_json: # Generate the sha1 hash for the model zip file hash_val_model_file = _generate_model_content_hash_value(model_path) diff --git a/opensearch_py_ml/ml_models/metrics_correlation/event_detection.py b/opensearch_py_ml/ml_models/metrics_correlation/event_detection.py index f6e5187cd..f9395d1a4 100644 --- a/opensearch_py_ml/ml_models/metrics_correlation/event_detection.py +++ b/opensearch_py_ml/ml_models/metrics_correlation/event_detection.py @@ -94,9 +94,9 @@ def merge_events( send = ends[ix] sevents = candidates[ix, :] - merged: List[Dict[str, torch.Tensor]] = ( - [] - ) # merge in linear pass over time dimension + merged: List[ + Dict[str, torch.Tensor] + ] = [] # merge in linear pass over time dimension currstart = torch.tensor([-1]) currend = torch.tensor([-1]) currevent = torch.ones(T) * -1.0 diff --git a/opensearch_py_ml/ml_models/sentencetransformermodel.py b/opensearch_py_ml/ml_models/sentencetransformermodel.py index 3934190ae..63e3e6e56 100644 --- a/opensearch_py_ml/ml_models/sentencetransformermodel.py +++ b/opensearch_py_ml/ml_models/sentencetransformermodel.py @@ -656,8 +656,10 @@ def _add_apache_license_to_model_zip_file(self, model_zip_file_path: str): with ZipFile(str(model_zip_file_path), "a") as zipObj: zipObj.writestr("LICENSE", r.content) - - def _add_third_party_copyrights_statements_to_model_zip_file(self, third_party_copyrights_statements: str, model_zip_file_path: str): + + def _add_third_party_copyrights_statements_to_model_zip_file( + self, third_party_copyrights_statements: str, model_zip_file_path: str + ): """ Add Statements text for non Apache-2.0 licensed third party model. Add it to the model zip file at model_zip_file_path @@ -785,7 +787,7 @@ def save_as_pt( model_output_path: str = None, zip_file_name: str = None, add_apache_license: bool = False, - third_party_copyrights_statements: Optional[str] = None + third_party_copyrights_statements: Optional[str] = None, ) -> str: """ Download sentence transformer model directly from huggingface, convert model to torch script format, @@ -822,10 +824,12 @@ def save_as_pt( :return: model zip file path. The file path where the zip file is being saved :rtype: string """ - + if add_apache_license == True and not third_party_copyrights_statements is None: - assert False, "When the model is from third party under non Apache-2.0 license, we can not add Apache-2.0 license for it." - + assert ( + False + ), "When the model is from third party under non Apache-2.0 license, we can not add Apache-2.0 license for it." + model = SentenceTransformer(model_id) if model_name is None: @@ -891,7 +895,9 @@ def save_as_pt( if add_apache_license: self._add_apache_license_to_model_zip_file(zip_file_path) if not third_party_copyrights_statements is None: - self._add_third_party_copyrights_statements_to_model_zip_file(third_party_copyrights_statements, zip_file_path) + self._add_third_party_copyrights_statements_to_model_zip_file( + third_party_copyrights_statements, zip_file_path + ) self.torch_script_zip_file_path = zip_file_path print("zip file is saved to ", zip_file_path, "\n") @@ -905,7 +911,7 @@ def save_as_onnx( model_output_path: str = None, zip_file_name: str = None, add_apache_license: bool = False, - third_party_copyrights_statements: Optional[str] = None + third_party_copyrights_statements: Optional[str] = None, ) -> str: """ Download sentence transformer model directly from huggingface, convert model to onnx format, @@ -939,9 +945,11 @@ def save_as_onnx( :return: model zip file path. The file path where the zip file is being saved :rtype: string """ - + if add_apache_license == True and not third_party_copyrights_statements is None: - assert False, "When the model is from third party under non Apache-2.0 license, we can not add Apache-2.0 license for it." + assert ( + False + ), "When the model is from third party under non Apache-2.0 license, we can not add Apache-2.0 license for it." model = SentenceTransformer(model_id) @@ -997,7 +1005,9 @@ def save_as_onnx( if add_apache_license: self._add_apache_license_to_model_zip_file(zip_file_path) if not third_party_copyrights_statements is None: - self._add_third_party_copyrights_statements_to_model_zip_file(third_party_copyrights_statements, zip_file_path) + self._add_third_party_copyrights_statements_to_model_zip_file( + third_party_copyrights_statements, zip_file_path + ) self.onnx_zip_file_path = zip_file_path print("zip file is saved to ", zip_file_path, "\n") @@ -1334,9 +1344,9 @@ def make_model_config_json( model_config_content["model_content_size_in_bytes"] = os.stat( model_zip_file_path ).st_size - model_config_content["model_content_hash_value"] = ( - _generate_model_content_hash_value(model_zip_file_path) - ) + model_config_content[ + "model_content_hash_value" + ] = _generate_model_content_hash_value(model_zip_file_path) if verbose: print("generating ml-commons_model_config.json file...\n") diff --git a/utils/model_uploader/model_autotracing.py b/utils/model_uploader/model_autotracing.py index 4fa9545df..d7c967ad3 100644 --- a/utils/model_uploader/model_autotracing.py +++ b/utils/model_uploader/model_autotracing.py @@ -32,7 +32,9 @@ from opensearch_py_ml.ml_commons import MLCommonClient from opensearch_py_ml.ml_models.sentencetransformermodel import SentenceTransformerModel from tests import OPENSEARCH_TEST_CLIENT -from third_party_statements import generate_thirdpart_statements_for_MIT +from third_party_statements import ( + generate_thirdpart_statements_for_huggingface_MIT_models, +) BOTH_FORMAT = "BOTH" TORCH_SCRIPT_FORMAT = "TORCH_SCRIPT" @@ -80,9 +82,14 @@ def verify_license_in_md_file(model_license: str = "Apache-2.0") -> bool: print(f"\nFound {model_license} license at " + TEMP_MODEL_PATH + "/README.md") return True else: - print(f"\nDid not find {model_license} license at " + TEMP_MODEL_PATH + "/README.md") + print( + f"\nDid not find {model_license} license at " + + TEMP_MODEL_PATH + + "/README.md" + ) return False + def trace_sentence_transformer_model( model_id: str, model_license: str, @@ -91,7 +98,7 @@ def trace_sentence_transformer_model( embedding_dimension: Optional[int] = None, pooling_mode: Optional[str] = None, model_description: Optional[str] = None, - third_party_copyrights_statements: Optional[str] = None + third_party_copyrights_statements: Optional[str] = None, ) -> Tuple[str, str]: """ Trace the pretrained sentence transformer model, create a model config file, @@ -141,14 +148,14 @@ def trace_sentence_transformer_model( model_path = pre_trained_model.save_as_pt( model_id=model_id, sentences=TEST_SENTENCES, - add_apache_license=model_license=="Apache-2.0", - third_party_copyrights_statements=third_party_copyrights_statements + add_apache_license=model_license == "Apache-2.0", + third_party_copyrights_statements=third_party_copyrights_statements, ) else: model_path = pre_trained_model.save_as_onnx( model_id=model_id, - add_apache_license=model_license=="Apache-2.0", - third_party_copyrights_statements=third_party_copyrights_statements + add_apache_license=model_license == "Apache-2.0", + third_party_copyrights_statements=third_party_copyrights_statements, ) except Exception as e: assert False, f"Raised Exception during saving model as {model_format}: {e}" @@ -422,7 +429,7 @@ def main( embedding_dimension: Optional[int] = None, pooling_mode: Optional[str] = None, model_description: Optional[str] = None, - third_party_copyrights_statements: Optional[str] = None + third_party_copyrights_statements: Optional[str] = None, ) -> None: """ Perform model auto-tracing and prepare files for uploading to OpenSearch model hub @@ -462,8 +469,10 @@ def main( model_description if model_description is not None else "N/A", ) print( - "Third Party Statements Text: ", - third_party_copyrights_statements if third_party_copyrights_statements is not None else "N/A" + "Third Party Statements Text: ", + third_party_copyrights_statements + if third_party_copyrights_statements is not None + else "N/A", ) print("==========================================") @@ -494,7 +503,7 @@ def main( embedding_dimension, pooling_mode, model_description, - third_party_copyrights_statements + third_party_copyrights_statements, ) torchscript_embedding_data = register_and_deploy_sentence_transformer_model( @@ -526,10 +535,7 @@ def main( if tracing_format in [ONNX_FORMAT, BOTH_FORMAT]: print("--- Begin tracing a model in ONNX ---") - ( - onnx_model_path, - onnx_model_config_path, - ) = trace_sentence_transformer_model( + (onnx_model_path, onnx_model_config_path,) = trace_sentence_transformer_model( model_id, model_license, model_version, @@ -537,7 +543,7 @@ def main( embedding_dimension, pooling_mode, model_description, - third_party_copyrights_statements + third_party_copyrights_statements, ) onnx_embedding_data = register_and_deploy_sentence_transformer_model( @@ -621,17 +627,22 @@ def main( help="Model description if you want to overwrite the default description", ) parser.add_argument( - "-tpcs", - "--third_party_copyrights_statements", + "-mlu", + "--mit_license_url", type=str, nargs="?", default=None, const=None, - help="Copyright statement for MIT licensed models.", + help="MIT license url", ) - + args = parser.parse_args() - + + third_party_copyrights_statements = ( + generate_thirdpart_statements_for_huggingface_MIT_models( + args.model_id, args.mit_license_url + ) + ) main( args.model_id, args.model_license, @@ -640,5 +651,5 @@ def main( args.embedding_dimension, args.pooling_mode, args.model_description, - args.third_party_copyrights_statements + third_party_copyrights_statements, ) diff --git a/utils/model_uploader/third_party_statements.py b/utils/model_uploader/third_party_statements.py index 6c24640b1..924b3c571 100644 --- a/utils/model_uploader/third_party_statements.py +++ b/utils/model_uploader/third_party_statements.py @@ -4,57 +4,31 @@ # compatible open source license. # Any modifications Copyright OpenSearch Contributors. See # GitHub history for details. -from string import Template +import re +import requests -MIT_TEMPLATE = Template(""" -** $model_id; version $model_version $attribution_website -$copyright_statement - -MIT License -$copyright_statement - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -""") - -def generate_thirdpart_statements_for_MIT( - model_id: str, - copyright_statement: str, - attribution_website: str, - model_version:str +def generate_thirdpart_statements_for_huggingface_MIT_models( + model_id: str, mit_license_url: str ) -> str: """ - Generate statements text for MIT-licensed third party model. The result should be put in the final artifact. - + Generate statements text for huggingface MIT-licensed third party model. The result should be put in the final artifact. + :param model_id: Model ID of the pretrained model :type model_id: string - :param copyright_statement: MIT models copyright statement - :type copyright_statement: string - :param attribution_website: The project website for MIT licensed models - :type attribution_website: string - :param model_version: The model version for MIT licensed models - :type model_version: string - :return: Statements text for MIT-licensed third party model. + :param mit_license_url: the url of the model's MIT license + :type mit_license_url: string :rtype: str """ - - - result = MIT_TEMPLATE.substitute(model_id=model_id, copyright_statement=copyright_statement, - attribution_website=attribution_website, model_version=model_version) - return result.strip() \ No newline at end of file + + r = requests.get(mit_license_url) + assert r.status_code == 200, "Failed to add license file to the model zip file" + license_text = r.content.decode("utf-8") + + # find the copyright statements from origin MIT license. It looks like: Copyright (c) {year} {authorname} + copyright_statements = re.findall("Copyright.*\n", license_text)[0].strip() + huggingface_url = "https://huggingface.co/" + model_id + + full_statements = f"** {model_id}; version -- {huggingface_url}\n{copyright_statements}\n\n{license_text}" + + return full_statements From e997594f63f084822cb4742598538fcdd0186b20 Mon Sep 17 00:00:00 2001 From: zhichao-aws Date: Thu, 25 Apr 2024 08:31:47 +0000 Subject: [PATCH 13/23] lint Signed-off-by: zhichao-aws --- .github/workflows/model_uploader.yml | 2 +- opensearch_py_ml/field_mappings.py | 6 +++--- opensearch_py_ml/ml_commons/model_uploader.py | 6 +++--- .../metrics_correlation/event_detection.py | 6 +++--- .../ml_models/sentencetransformermodel.py | 14 ++++++------- utils/model_uploader/model_autotracing.py | 20 ++++++++++++------- .../model_uploader/third_party_statements.py | 1 + 7 files changed, 31 insertions(+), 24 deletions(-) diff --git a/.github/workflows/model_uploader.yml b/.github/workflows/model_uploader.yml index 39d6353f0..013840836 100644 --- a/.github/workflows/model_uploader.yml +++ b/.github/workflows/model_uploader.yml @@ -58,7 +58,7 @@ on: - "NO" - "YES" MIT_license_url: - description: "(Optional) MIT license of the huggingface MIT model." + description: "(Optional) MIT license url of the huggingface MIT model." required: false type: string diff --git a/opensearch_py_ml/field_mappings.py b/opensearch_py_ml/field_mappings.py index b3c83914a..75bbd1833 100644 --- a/opensearch_py_ml/field_mappings.py +++ b/opensearch_py_ml/field_mappings.py @@ -445,9 +445,9 @@ def find_aggregatable(row, df): try: series = df.loc[df.os_field_name == os_field_name_keyword] if not series.empty and series.is_aggregatable.squeeze(): - row_as_dict[ - "aggregatable_os_field_name" - ] = os_field_name_keyword + row_as_dict["aggregatable_os_field_name"] = ( + os_field_name_keyword + ) else: row_as_dict["aggregatable_os_field_name"] = None except KeyError: diff --git a/opensearch_py_ml/ml_commons/model_uploader.py b/opensearch_py_ml/ml_commons/model_uploader.py index af7af2f62..850f6a80a 100644 --- a/opensearch_py_ml/ml_commons/model_uploader.py +++ b/opensearch_py_ml/ml_commons/model_uploader.py @@ -95,9 +95,9 @@ def _register_model( model_meta_json[TOTAL_CHUNKS_FIELD] = total_num_chunks if MODEL_CONTENT_SIZE_IN_BYTES_FIELD not in model_meta_json: - model_meta_json[ - MODEL_CONTENT_SIZE_IN_BYTES_FIELD - ] = model_content_size_in_bytes + model_meta_json[MODEL_CONTENT_SIZE_IN_BYTES_FIELD] = ( + model_content_size_in_bytes + ) if MODEL_CONTENT_HASH_VALUE not in model_meta_json: # Generate the sha1 hash for the model zip file hash_val_model_file = _generate_model_content_hash_value(model_path) diff --git a/opensearch_py_ml/ml_models/metrics_correlation/event_detection.py b/opensearch_py_ml/ml_models/metrics_correlation/event_detection.py index f9395d1a4..f6e5187cd 100644 --- a/opensearch_py_ml/ml_models/metrics_correlation/event_detection.py +++ b/opensearch_py_ml/ml_models/metrics_correlation/event_detection.py @@ -94,9 +94,9 @@ def merge_events( send = ends[ix] sevents = candidates[ix, :] - merged: List[ - Dict[str, torch.Tensor] - ] = [] # merge in linear pass over time dimension + merged: List[Dict[str, torch.Tensor]] = ( + [] + ) # merge in linear pass over time dimension currstart = torch.tensor([-1]) currend = torch.tensor([-1]) currevent = torch.ones(T) * -1.0 diff --git a/opensearch_py_ml/ml_models/sentencetransformermodel.py b/opensearch_py_ml/ml_models/sentencetransformermodel.py index 63e3e6e56..c37b30479 100644 --- a/opensearch_py_ml/ml_models/sentencetransformermodel.py +++ b/opensearch_py_ml/ml_models/sentencetransformermodel.py @@ -825,7 +825,7 @@ def save_as_pt( :rtype: string """ - if add_apache_license == True and not third_party_copyrights_statements is None: + if add_apache_license == True and third_party_copyrights_statements is not None: assert ( False ), "When the model is from third party under non Apache-2.0 license, we can not add Apache-2.0 license for it." @@ -894,7 +894,7 @@ def save_as_pt( ) if add_apache_license: self._add_apache_license_to_model_zip_file(zip_file_path) - if not third_party_copyrights_statements is None: + if third_party_copyrights_statements is not None: self._add_third_party_copyrights_statements_to_model_zip_file( third_party_copyrights_statements, zip_file_path ) @@ -946,7 +946,7 @@ def save_as_onnx( :rtype: string """ - if add_apache_license == True and not third_party_copyrights_statements is None: + if add_apache_license == True and third_party_copyrights_statements is not None: assert ( False ), "When the model is from third party under non Apache-2.0 license, we can not add Apache-2.0 license for it." @@ -1004,7 +1004,7 @@ def save_as_onnx( ) if add_apache_license: self._add_apache_license_to_model_zip_file(zip_file_path) - if not third_party_copyrights_statements is None: + if third_party_copyrights_statements is not None: self._add_third_party_copyrights_statements_to_model_zip_file( third_party_copyrights_statements, zip_file_path ) @@ -1344,9 +1344,9 @@ def make_model_config_json( model_config_content["model_content_size_in_bytes"] = os.stat( model_zip_file_path ).st_size - model_config_content[ - "model_content_hash_value" - ] = _generate_model_content_hash_value(model_zip_file_path) + model_config_content["model_content_hash_value"] = ( + _generate_model_content_hash_value(model_zip_file_path) + ) if verbose: print("generating ml-commons_model_config.json file...\n") diff --git a/utils/model_uploader/model_autotracing.py b/utils/model_uploader/model_autotracing.py index d7c967ad3..0ba1355bb 100644 --- a/utils/model_uploader/model_autotracing.py +++ b/utils/model_uploader/model_autotracing.py @@ -29,13 +29,14 @@ ROOT_DIR = os.path.join(THIS_DIR, "../..") sys.path.append(ROOT_DIR) -from opensearch_py_ml.ml_commons import MLCommonClient -from opensearch_py_ml.ml_models.sentencetransformermodel import SentenceTransformerModel -from tests import OPENSEARCH_TEST_CLIENT from third_party_statements import ( generate_thirdpart_statements_for_huggingface_MIT_models, ) +from opensearch_py_ml.ml_commons import MLCommonClient +from opensearch_py_ml.ml_models.sentencetransformermodel import SentenceTransformerModel +from tests import OPENSEARCH_TEST_CLIENT + BOTH_FORMAT = "BOTH" TORCH_SCRIPT_FORMAT = "TORCH_SCRIPT" ONNX_FORMAT = "ONNX" @@ -470,9 +471,11 @@ def main( ) print( "Third Party Statements Text: ", - third_party_copyrights_statements - if third_party_copyrights_statements is not None - else "N/A", + ( + third_party_copyrights_statements + if third_party_copyrights_statements is not None + else "N/A" + ), ) print("==========================================") @@ -535,7 +538,10 @@ def main( if tracing_format in [ONNX_FORMAT, BOTH_FORMAT]: print("--- Begin tracing a model in ONNX ---") - (onnx_model_path, onnx_model_config_path,) = trace_sentence_transformer_model( + ( + onnx_model_path, + onnx_model_config_path, + ) = trace_sentence_transformer_model( model_id, model_license, model_version, diff --git a/utils/model_uploader/third_party_statements.py b/utils/model_uploader/third_party_statements.py index 924b3c571..0603b5a47 100644 --- a/utils/model_uploader/third_party_statements.py +++ b/utils/model_uploader/third_party_statements.py @@ -5,6 +5,7 @@ # Any modifications Copyright OpenSearch Contributors. See # GitHub history for details. import re + import requests From 08540b270d4c5e4b9142f473d11b613d5e0584f8 Mon Sep 17 00:00:00 2001 From: zhichao-aws Date: Thu, 25 Apr 2024 09:16:28 +0000 Subject: [PATCH 14/23] improve error log Signed-off-by: zhichao-aws --- .github/workflows/model_uploader.yml | 8 ++++---- opensearch_py_ml/ml_commons/ml_commons_client.py | 5 +++-- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/.github/workflows/model_uploader.yml b/.github/workflows/model_uploader.yml index 013840836..b0a06fe4b 100644 --- a/.github/workflows/model_uploader.yml +++ b/.github/workflows/model_uploader.yml @@ -295,7 +295,7 @@ jobs: - name: Create Issue Body id: create_issue_body run: | - issue_body='Please approve or deny opensearch-py-ml model uploading: + issue_body="Please approve or deny opensearch-py-ml model uploading: ${{ needs.model-auto-tracing.outputs.license_line }} @@ -303,7 +303,7 @@ jobs: ${{ needs.model-auto-tracing.outputs.model_description_info }} ===== Dry Run of Model Uploading ===== - ${{ needs.model-auto-tracing.outputs.dryrun_output }}' + ${{ needs.model-auto-tracing.outputs.dryrun_output }}" echo "issue_body<> $GITHUB_OUTPUT echo "${issue_body@E}" >> $GITHUB_OUTPUT @@ -380,14 +380,14 @@ jobs: - name: Create PR Body id: create_pr_body run: | - pr_body=' + pr_body=" - [ ] This PR made commit to only these three files: MODEL_UPLOAD_HISTORY.md, supported_models.json, and CHANGELOG.md. - [ ] CHANGELOG.md has been updated by the workflow or by you if the workflow fails to do so. - [ ] Merge conflicts have been resolved. ${{ needs.init-workflow-var.outputs.workflow_info }} ${{ needs.model-auto-tracing.outputs.license_info }} - ${{ needs.model-auto-tracing.outputs.model_description_info }}' + ${{ needs.model-auto-tracing.outputs.model_description_info }}" echo "pr_body<> $GITHUB_OUTPUT echo "${pr_body@E}" >> $GITHUB_OUTPUT diff --git a/opensearch_py_ml/ml_commons/ml_commons_client.py b/opensearch_py_ml/ml_commons/ml_commons_client.py index 8b50a95cc..8dec28829 100644 --- a/opensearch_py_ml/ml_commons/ml_commons_client.py +++ b/opensearch_py_ml/ml_commons/ml_commons_client.py @@ -371,7 +371,7 @@ def deploy_model(self, model_id: str, wait_until_deployed: bool = True) -> objec for i in range(TIMEOUT): ml_model_status = self.get_model_info(model_id) model_state = ml_model_status.get("model_state") - if model_state in ["DEPLOYED", "PARTIALLY_DEPLOYED"]: + if model_state in ["DEPLOYED", "PARTIALLY_DEPLOYED", "DEPLOY_FAILED"]: break time.sleep(1) @@ -382,7 +382,8 @@ def deploy_model(self, model_id: str, wait_until_deployed: bool = True) -> objec elif model_state == "PARTIALLY_DEPLOYED": print("Model deployed only partially") else: - raise Exception("Model deployment failed, model_state:", model_state) + raise Exception("Model deployment failed, model_state:", model_state, + "task info:",self.get_task_info(task_id)) return self._get_task_info(task_id) From 98794e04c7144b3bc4558145a752d1cb3d41690a Mon Sep 17 00:00:00 2001 From: zhichao-aws Date: Thu, 25 Apr 2024 09:51:45 +0000 Subject: [PATCH 15/23] lint Signed-off-by: zhichao-aws --- opensearch_py_ml/ml_commons/ml_commons_client.py | 5 ++--- utils/model_uploader/model_autotracing.py | 10 +++++----- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/opensearch_py_ml/ml_commons/ml_commons_client.py b/opensearch_py_ml/ml_commons/ml_commons_client.py index 8dec28829..2f387ad11 100644 --- a/opensearch_py_ml/ml_commons/ml_commons_client.py +++ b/opensearch_py_ml/ml_commons/ml_commons_client.py @@ -371,7 +371,7 @@ def deploy_model(self, model_id: str, wait_until_deployed: bool = True) -> objec for i in range(TIMEOUT): ml_model_status = self.get_model_info(model_id) model_state = ml_model_status.get("model_state") - if model_state in ["DEPLOYED", "PARTIALLY_DEPLOYED", "DEPLOY_FAILED"]: + if model_state in ["DEPLOYED", "PARTIALLY_DEPLOYED"]: break time.sleep(1) @@ -382,8 +382,7 @@ def deploy_model(self, model_id: str, wait_until_deployed: bool = True) -> objec elif model_state == "PARTIALLY_DEPLOYED": print("Model deployed only partially") else: - raise Exception("Model deployment failed, model_state:", model_state, - "task info:",self.get_task_info(task_id)) + raise Exception("Model deployment failed") return self._get_task_info(task_id) diff --git a/utils/model_uploader/model_autotracing.py b/utils/model_uploader/model_autotracing.py index 0ba1355bb..6d755dd3f 100644 --- a/utils/model_uploader/model_autotracing.py +++ b/utils/model_uploader/model_autotracing.py @@ -643,12 +643,12 @@ def main( ) args = parser.parse_args() - - third_party_copyrights_statements = ( - generate_thirdpart_statements_for_huggingface_MIT_models( - args.model_id, args.mit_license_url + if args.model_license == "MIT": + third_party_copyrights_statements = ( + generate_thirdpart_statements_for_huggingface_MIT_models( + args.model_id, args.mit_license_url + ) ) - ) main( args.model_id, args.model_license, From 70f0713fcf8a248bb4210a6105ca76d41234f61b Mon Sep 17 00:00:00 2001 From: zhichao-aws Date: Fri, 26 Apr 2024 02:40:02 +0000 Subject: [PATCH 16/23] fix Signed-off-by: zhichao-aws --- utils/model_uploader/model_autotracing.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/utils/model_uploader/model_autotracing.py b/utils/model_uploader/model_autotracing.py index 6d755dd3f..3d2b8767b 100644 --- a/utils/model_uploader/model_autotracing.py +++ b/utils/model_uploader/model_autotracing.py @@ -649,6 +649,9 @@ def main( args.model_id, args.mit_license_url ) ) + else: + third_party_copyrights_statements = None + main( args.model_id, args.model_license, From fe0471a020a90b7b94b91c206ebcc49a2c81b32d Mon Sep 17 00:00:00 2001 From: zhichao-aws Date: Fri, 26 Apr 2024 04:56:55 +0000 Subject: [PATCH 17/23] add test Signed-off-by: zhichao-aws --- .../test_sentencetransformermodel_pytest.py | 79 +++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/tests/ml_models/test_sentencetransformermodel_pytest.py b/tests/ml_models/test_sentencetransformermodel_pytest.py index 17f86c75b..ba3cad9cd 100644 --- a/tests/ml_models/test_sentencetransformermodel_pytest.py +++ b/tests/ml_models/test_sentencetransformermodel_pytest.py @@ -221,6 +221,26 @@ def test_missing_files(): assert "Cannot find config.json" in str(exc_info.value) +def test_save_model_but_license_conflicts(): + with pytest.raises(AssertionError) as exc_info: + test_model.save_as_pt( + sentences=["today is sunny"], add_apache_license=True, third_party_copyrights_statements="test statements" + ) + assert ( + "When the model is from third party under non Apache-2.0 license, we can not add Apache-2.0 license for it." + in str(exc_info.value) + ) + + with pytest.raises(AssertionError) as exc_info: + test_model.save_as_onnx( + add_apache_license=True, third_party_copyrights_statements="test statements" + ) + assert ( + "When the model is from third party under non Apache-2.0 license, we can not add Apache-2.0 license for it." + in str(exc_info.value) + ) + + def test_save_as_pt(): try: test_model.save_as_pt(sentences=["today is sunny"]) @@ -658,5 +678,64 @@ def test_zip_model_with_license(): clean_test_folder(TEST_FOLDER) +def test_save_as_pt_with_third_party_copyrights_statements(): + model_id = "sentence-transformers/all-MiniLM-L6-v2" + model_format = "TORCH_SCRIPT" + torch_script_zip_file_path = os.path.join(TEST_FOLDER, "all-MiniLM-L6-v2.zip") + torch_script_expected_filenames = { + "all-MiniLM-L6-v2.pt", + "tokenizer.json", + "THIRD-PARTY", + } + + clean_test_folder(TEST_FOLDER) + test_model18 = SentenceTransformerModel( + folder_path=TEST_FOLDER, + model_id=model_id, + ) + third_party_copyrights_statements = "test statements" + + test_model18.save_as_pt( + model_id=model_id, + sentences=["today is sunny"], + third_party_copyrights_statements=third_party_copyrights_statements, + ) + + compare_model_zip_file( + torch_script_zip_file_path, torch_script_expected_filenames, model_format + ) + + clean_test_folder(TEST_FOLDER) + + +def test_save_as_onnx_with_third_party_copyrights_statements(): + model_id = "sentence-transformers/all-MiniLM-L6-v2" + model_format = "TORCH_SCRIPT" + torch_script_zip_file_path = os.path.join(TEST_FOLDER, "all-MiniLM-L6-v2.zip") + torch_script_expected_filenames = { + "all-MiniLM-L6-v2.pt", + "tokenizer.json", + "THIRD-PARTY", + } + + clean_test_folder(TEST_FOLDER) + test_model19 = SentenceTransformerModel( + folder_path=TEST_FOLDER, + model_id=model_id, + ) + third_party_copyrights_statements = "test statements" + + test_model19.save_as_onnx( + model_id=model_id, + third_party_copyrights_statements=third_party_copyrights_statements, + ) + + compare_model_zip_file( + torch_script_zip_file_path, torch_script_expected_filenames, model_format + ) + + clean_test_folder(TEST_FOLDER) + + clean_test_folder(TEST_FOLDER) clean_test_folder(TESTDATA_UNZIP_FOLDER) From db4900226cd7ad687f9d69c1105f1a0adef3acb2 Mon Sep 17 00:00:00 2001 From: zhichao-aws Date: Fri, 26 Apr 2024 05:00:37 +0000 Subject: [PATCH 18/23] lint Signed-off-by: zhichao-aws --- tests/ml_models/test_sentencetransformermodel_pytest.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/ml_models/test_sentencetransformermodel_pytest.py b/tests/ml_models/test_sentencetransformermodel_pytest.py index ba3cad9cd..beee4127e 100644 --- a/tests/ml_models/test_sentencetransformermodel_pytest.py +++ b/tests/ml_models/test_sentencetransformermodel_pytest.py @@ -224,7 +224,9 @@ def test_missing_files(): def test_save_model_but_license_conflicts(): with pytest.raises(AssertionError) as exc_info: test_model.save_as_pt( - sentences=["today is sunny"], add_apache_license=True, third_party_copyrights_statements="test statements" + sentences=["today is sunny"], + add_apache_license=True, + third_party_copyrights_statements="test statements", ) assert ( "When the model is from third party under non Apache-2.0 license, we can not add Apache-2.0 license for it." From 4200890470ec3a5722124f2ad0c7f8fedd5f9a83 Mon Sep 17 00:00:00 2001 From: zhichao-aws Date: Fri, 26 Apr 2024 05:03:20 +0000 Subject: [PATCH 19/23] changelog Signed-off-by: zhichao-aws --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index adac4dbf7..e3980ffa8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,7 @@ Inspired from [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) - Add support for model profiles by @rawwar in ([#358](https://github.com/opensearch-project/opensearch-py-ml/pull/358)) - Support for security default admin credential changes in 2.12.0 in ([#365](https://github.com/opensearch-project/opensearch-py-ml/pull/365)) - adding cross encoder models in the pre-trained traced list ([#378](https://github.com/opensearch-project/opensearch-py-ml/pull/378)) +- adding support for upload huggingface MIT licensed models ([](https://github.com/opensearch-project/opensearch-py-ml/pull/)) ### Changed From 34a18d6a7266fde36b5e35cc4326da7f4ea4ab7b Mon Sep 17 00:00:00 2001 From: zhichao-aws Date: Fri, 26 Apr 2024 08:55:48 +0000 Subject: [PATCH 20/23] finalize Signed-off-by: zhichao-aws --- .github/workflows/model_uploader.yml | 16 ++++++++-------- CHANGELOG.md | 2 +- .../test_sentencetransformermodel_pytest.py | 2 +- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/.github/workflows/model_uploader.yml b/.github/workflows/model_uploader.yml index b0a06fe4b..ff2724eb0 100644 --- a/.github/workflows/model_uploader.yml +++ b/.github/workflows/model_uploader.yml @@ -309,14 +309,14 @@ jobs: echo "${issue_body@E}" >> $GITHUB_OUTPUT echo "EOF" >> $GITHUB_OUTPUT echo "${issue_body@E}" - # - uses: trstringer/manual-approval@v1 - # with: - # secret: ${{ github.TOKEN }} - # approvers: ${{ steps.get_approvers.outputs.approvers }} - # minimum-approvals: 2 - # issue-title: "Upload Model to OpenSearch Model Hub (${{ github.event.inputs.model_id }})" - # issue-body: ${{ steps.create_issue_body.outputs.issue_body }} - # exclude-workflow-initiator-as-approver: false + - uses: trstringer/manual-approval@v1 + with: + secret: ${{ github.TOKEN }} + approvers: ${{ steps.get_approvers.outputs.approvers }} + minimum-approvals: 2 + issue-title: "Upload Model to OpenSearch Model Hub (${{ github.event.inputs.model_id }})" + issue-body: ${{ steps.create_issue_body.outputs.issue_body }} + exclude-workflow-initiator-as-approver: false # Step 6: Download the artifacts & Upload it to the S3 bucket model-uploading: diff --git a/CHANGELOG.md b/CHANGELOG.md index e3980ffa8..7589a646b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,7 +14,7 @@ Inspired from [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) - Add support for model profiles by @rawwar in ([#358](https://github.com/opensearch-project/opensearch-py-ml/pull/358)) - Support for security default admin credential changes in 2.12.0 in ([#365](https://github.com/opensearch-project/opensearch-py-ml/pull/365)) - adding cross encoder models in the pre-trained traced list ([#378](https://github.com/opensearch-project/opensearch-py-ml/pull/378)) -- adding support for upload huggingface MIT licensed models ([](https://github.com/opensearch-project/opensearch-py-ml/pull/)) +- adding support for upload huggingface MIT licensed models ([388](https://github.com/opensearch-project/opensearch-py-ml/pull/388)) ### Changed diff --git a/tests/ml_models/test_sentencetransformermodel_pytest.py b/tests/ml_models/test_sentencetransformermodel_pytest.py index beee4127e..1528739d1 100644 --- a/tests/ml_models/test_sentencetransformermodel_pytest.py +++ b/tests/ml_models/test_sentencetransformermodel_pytest.py @@ -715,7 +715,7 @@ def test_save_as_onnx_with_third_party_copyrights_statements(): model_format = "TORCH_SCRIPT" torch_script_zip_file_path = os.path.join(TEST_FOLDER, "all-MiniLM-L6-v2.zip") torch_script_expected_filenames = { - "all-MiniLM-L6-v2.pt", + "all-MiniLM-L6-v2.onnx", "tokenizer.json", "THIRD-PARTY", } From 5f6e71442e33648a6dbc06cec29e7734f7655316 Mon Sep 17 00:00:00 2001 From: zhichao-aws Date: Fri, 26 Apr 2024 10:22:24 +0000 Subject: [PATCH 21/23] lint Signed-off-by: zhichao-aws --- opensearch_py_ml/ml_models/sentencetransformermodel.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/opensearch_py_ml/ml_models/sentencetransformermodel.py b/opensearch_py_ml/ml_models/sentencetransformermodel.py index c37b30479..a93ceaead 100644 --- a/opensearch_py_ml/ml_models/sentencetransformermodel.py +++ b/opensearch_py_ml/ml_models/sentencetransformermodel.py @@ -829,7 +829,6 @@ def save_as_pt( assert ( False ), "When the model is from third party under non Apache-2.0 license, we can not add Apache-2.0 license for it." - model = SentenceTransformer(model_id) if model_name is None: @@ -950,7 +949,6 @@ def save_as_onnx( assert ( False ), "When the model is from third party under non Apache-2.0 license, we can not add Apache-2.0 license for it." - model = SentenceTransformer(model_id) if model_name is None: From 5da6ae9302d249e8d60f42fb37fa01f3254ff5b4 Mon Sep 17 00:00:00 2001 From: zhichao-aws Date: Mon, 13 May 2024 03:07:59 +0000 Subject: [PATCH 22/23] change third_party to static var Signed-off-by: zhichao-aws --- opensearch_py_ml/ml_models/sentencetransformermodel.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/opensearch_py_ml/ml_models/sentencetransformermodel.py b/opensearch_py_ml/ml_models/sentencetransformermodel.py index a93ceaead..eb5375bd6 100644 --- a/opensearch_py_ml/ml_models/sentencetransformermodel.py +++ b/opensearch_py_ml/ml_models/sentencetransformermodel.py @@ -38,6 +38,7 @@ ) LICENSE_URL = "https://github.com/opensearch-project/opensearch-py-ml/raw/main/LICENSE" +THIRD_PARTY_FILE_NAME = "THIRD-PARTY" class SentenceTransformerModel: @@ -671,7 +672,7 @@ def _add_third_party_copyrights_statements_to_model_zip_file( :rtype: None """ with ZipFile(str(model_zip_file_path), "a") as zipObj: - zipObj.writestr("THIRD-PARTY", third_party_copyrights_statements) + zipObj.writestr(THIRD_PARTY_FILE_NAME, third_party_copyrights_statements) def zip_model( self, From 4f6af15d1870c75467fed1147a299d62e6f7299b Mon Sep 17 00:00:00 2001 From: zhichao-aws Date: Tue, 14 May 2024 07:02:20 +0000 Subject: [PATCH 23/23] change mit_license to addtional license Signed-off-by: zhichao-aws --- .ci/run-repository.sh | 4 ++-- .github/workflows/model_uploader.yml | 16 ++++++++-------- utils/model_uploader/model_autotracing.py | 8 ++++---- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/.ci/run-repository.sh b/.ci/run-repository.sh index b3b6974eb..30e721e50 100755 --- a/.ci/run-repository.sh +++ b/.ci/run-repository.sh @@ -74,7 +74,7 @@ elif [[ "$TASK_TYPE" == "trace" ]]; then echo -e "\033[34;1mINFO:\033[0m EMBEDDING_DIMENSION: ${EMBEDDING_DIMENSION:-N/A}\033[0m" echo -e "\033[34;1mINFO:\033[0m POOLING_MODE: ${POOLING_MODE:-N/A}\033[0m" echo -e "\033[34;1mINFO:\033[0m MODEL_DESCRIPTION: ${MODEL_DESCRIPTION:-N/A}\033[0m" - echo -e "\033[34;1mINFO:\033[0m MIT_LICENSE_URL: ${MIT_LICENSE_URL:-N/A}\033[0m" + echo -e "\033[34;1mINFO:\033[0m ADDITIONAL_LICENSE_URL: ${ADDITIONAL_LICENSE_URL:-N/A}\033[0m" docker run \ --network=${network_name} \ @@ -87,7 +87,7 @@ elif [[ "$TASK_TYPE" == "trace" ]]; then --name opensearch-py-ml-trace-runner \ opensearch-project/opensearch-py-ml \ nox -s "trace-${PYTHON_VERSION}" -- ${MODEL_ID} ${MODEL_LICENSE} ${MODEL_VERSION} ${TRACING_FORMAT} -ed ${EMBEDDING_DIMENSION} -pm ${POOLING_MODE} \ - -md ${MODEL_DESCRIPTION:+"$MODEL_DESCRIPTION"} -mlu ${MIT_LICENSE_URL} + -md ${MODEL_DESCRIPTION:+"$MODEL_DESCRIPTION"} -alu ${ADDITIONAL_LICENSE_URL} docker cp opensearch-py-ml-trace-runner:/code/opensearch-py-ml/upload/ ./upload/ docker cp opensearch-py-ml-trace-runner:/code/opensearch-py-ml/trace_output/ ./trace_output/ diff --git a/.github/workflows/model_uploader.yml b/.github/workflows/model_uploader.yml index ff2724eb0..4995b6e52 100644 --- a/.github/workflows/model_uploader.yml +++ b/.github/workflows/model_uploader.yml @@ -57,8 +57,8 @@ on: options: - "NO" - "YES" - MIT_license_url: - description: "(Optional) MIT license url of the huggingface MIT model." + additional_license_url: + description: "(Optional) Additional license url of the huggingface MIT model." required: false type: string @@ -84,7 +84,7 @@ jobs: embedding_dimension=${{ github.event.inputs.embedding_dimension }} pooling_mode=${{ github.event.inputs.pooling_mode }} model_description="${{ github.event.inputs.model_description }}" - MIT_license_url="${{ github.event.inputs.MIT_license_url }}" + additional_license_url="${{ github.event.inputs.additional_license_url }}" workflow_info=" ============= Workflow Details ============== @@ -101,7 +101,7 @@ jobs: - Embedding Dimension: ${embedding_dimension:-N/A} - Pooling Mode: ${pooling_mode:-N/A} - Model Description: ${model_description:-N/A} - - MIT License Url: ${MIT_license_url:-N/A} + - Additional License Url: ${additional_license_url:-N/A} ======== Workflow Output Information ========= - Embedding Verification: Passed" @@ -110,13 +110,13 @@ jobs: echo "${workflow_info@E}" >> $GITHUB_OUTPUT echo "EOF" >> $GITHUB_OUTPUT echo "${workflow_info@E}" - - name: Check MIT license url - id: check_mit_license_url + - name: Check Additional license url + id: check_additional_license_url run: | if [[ "${{ github.event.inputs.model_license }}" == "MIT" ]] then echo "Uploading MIT licensed model" - if [[ "${{ github.event.inputs.MIT_license_url }}" == "" ]] + if [[ "${{ github.event.inputs.additional_license_url }}" == "" ]] then echo "missing MIT license url" exit 1 @@ -214,7 +214,7 @@ jobs: echo "EMBEDDING_DIMENSION=${{ github.event.inputs.embedding_dimension }}" >> $GITHUB_ENV echo "POOLING_MODE=${{ github.event.inputs.pooling_mode }}" >> $GITHUB_ENV echo "MODEL_DESCRIPTION=${{ github.event.inputs.model_description }}" >> $GITHUB_ENV - echo "MIT_LICENSE_URL=${{ github.event.inputs.MIT_license_url }}" >> $GITHUB_ENV + echo "ADDITIONAL_LICENSE_URL=${{ github.event.inputs.additional_license_url }}" >> $GITHUB_ENV - name: Autotracing ${{ matrix.cluster }} secured=${{ matrix.secured }} version=${{matrix.entry.opensearch_version}} run: "./.ci/run-tests ${{ matrix.cluster }} ${{ matrix.secured }} ${{ matrix.entry.opensearch_version }} trace" - name: Limit Model Size to 2GB diff --git a/utils/model_uploader/model_autotracing.py b/utils/model_uploader/model_autotracing.py index 3d2b8767b..a482ad681 100644 --- a/utils/model_uploader/model_autotracing.py +++ b/utils/model_uploader/model_autotracing.py @@ -633,20 +633,20 @@ def main( help="Model description if you want to overwrite the default description", ) parser.add_argument( - "-mlu", - "--mit_license_url", + "-alu", + "--additional_license_url", type=str, nargs="?", default=None, const=None, - help="MIT license url", + help="Additional license url", ) args = parser.parse_args() if args.model_license == "MIT": third_party_copyrights_statements = ( generate_thirdpart_statements_for_huggingface_MIT_models( - args.model_id, args.mit_license_url + args.model_id, args.additional_license_url ) ) else: