Commit
Merge branch '1.x' into backport/backport-199-to-1.x
Signed-off-by: Thanawan Atchariyachanvanit <latchari@amazon.com>
thanawan-atc authored Aug 10, 2023
2 parents 5a00346 + f048ed0 commit 4831af3
Showing 8 changed files with 263 additions and 23 deletions.
5 changes: 4 additions & 1 deletion CHANGELOG.md
@@ -4,7 +4,6 @@ Inspired from [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
## [1.1.0]

### Added

- Adding documentation for model group id @dhrubo-os ([#176](https://github.com/opensearch-project/opensearch-py-ml/pull/176))
- listing pre-trained release models @dhrubo-os ([#85](https://github.com/opensearch-project/opensearch-py-ml/pull/85))
- Upload pretrained models @AlibiZhenis ([#111](https://github.com/opensearch-project/opensearch-py-ml/pull/111/files))
@@ -19,6 +18,8 @@ Inspired from [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
- adding jupyter notebook based documentation for metrics correlation algorithm by @AlibiZhenis ([#186](https://github.com/opensearch-project/opensearch-py-ml/pull/186))

### Changed
- adding documentation for model group id @dhrubo-os ([#176](https://github.com/opensearch-project/opensearch-py-ml/pull/176))
- adding jupyter notebook based documentation for metrics correlation algorithm by @AlibiZhenis ([#186](https://github.com/opensearch-project/opensearch-py-ml/pull/186))
- Update jenkins file to use updated docker image ([#189](https://github.com/opensearch-project/opensearch-py-ml/pull/189))
- Updated documentation @dhrubo-os ([#98](https://github.com/opensearch-project/opensearch-py-ml/pull/98))
- Updating ML Commons API documentation @AlibiZhenis ([#156](https://github.com/opensearch-project/opensearch-py-ml/pull/156))
@@ -27,6 +28,8 @@ Inspired from [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
- Fix ModelUploader bug & Update model tracing demo notebook by @thanawan-atc in ([#185](https://github.com/opensearch-project/opensearch-py-ml/pull/185))
- Fix make_model_config_json function by @thanawan-atc in ([#188](https://github.com/opensearch-project/opensearch-py-ml/pull/188))
- Make make_model_config_json function more concise by @thanawan-atc in ([#191](https://github.com/opensearch-project/opensearch-py-ml/pull/191))
- Enabled auto-truncation for any pretrained models by @Yerzhaisang in ([#192](https://github.com/opensearch-project/opensearch-py-ml/pull/192))
- Generalize make_model_config_json function by @thanawan-atc in ([#200](https://github.com/opensearch-project/opensearch-py-ml/pull/200))

## [1.0.0]

3 changes: 2 additions & 1 deletion DEVELOPER_GUIDE.md
@@ -95,7 +95,8 @@ You should paste these settings in the `Dev Tools` window and run them:
"persistent" : {
"plugins.ml_commons.only_run_on_ml_node" : false,
"plugins.ml_commons.native_memory_threshold" : 100,
"plugins.ml_commons.max_model_on_node": 20
"plugins.ml_commons.max_model_on_node": 20,
"plugins.ml_commons.enable_inhouse_python_model": true
}
}
```
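The same settings can be applied programmatically instead of through `Dev Tools`; a minimal sketch that builds the request body (the `opensearch-py` connection details in the comment are assumptions about a local dev cluster, not part of this change):

```python
import json

# Cluster settings from the snippet above, including the
# plugins.ml_commons.enable_inhouse_python_model flag added here.
settings_body = {
    "persistent": {
        "plugins.ml_commons.only_run_on_ml_node": False,
        "plugins.ml_commons.native_memory_threshold": 100,
        "plugins.ml_commons.max_model_on_node": 20,
        "plugins.ml_commons.enable_inhouse_python_model": True,
    }
}

# Against a running cluster this body could be sent with opensearch-py:
#   from opensearchpy import OpenSearch
#   client = OpenSearch(hosts=[{"host": "localhost", "port": 9200}])
#   client.cluster.put_settings(body=settings_body)
print(json.dumps(settings_body, indent=2))
```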
9 changes: 9 additions & 0 deletions docs/source/examples/data/smd_data.csv

Large diffs are not rendered by default.

12 changes: 10 additions & 2 deletions docs/source/examples/index.rst
@@ -19,12 +19,20 @@ Demo notebooks for Model Training and Tracing
:maxdepth: 1

demo_transformer_model_train_save_upload_to_openSearch
demo_tracing_model_torchscript_onnx.ipynb
demo_tracing_model_torchscript_onnx

Demo notebooks for ML Commons plugin integration
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. toctree::
:maxdepth: 1

demo_ml_commons_integration
demo_ml_commons_integration

Demo Notebooks for In-house python based models
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. toctree::
:maxdepth: 1

demo_metrics_correlation
61 changes: 61 additions & 0 deletions opensearch_py_ml/ml_commons/ml_commons_client.py
@@ -6,6 +6,7 @@
# GitHub history for details.


import json
import time
from typing import Any, List, Union

@@ -392,6 +393,66 @@ def _get_task_info(self, task_id: str):
url=API_URL,
)

def search_task(self, input_json) -> object:
"""
This method searches tasks in the OpenSearch cluster (using the ML Commons API)
:param input_json: JSON body for the search request
:type input_json: string or dict
:return: a JSON object with detailed information about the matching tasks
:rtype: object
"""

API_URL = f"{ML_BASE_URI}/tasks/_search"

if isinstance(input_json, str):
try:
json_obj = json.loads(input_json)
if not isinstance(json_obj, dict):
return "Invalid JSON object passed as argument."
API_BODY = json.dumps(json_obj)
except json.JSONDecodeError:
return "Invalid JSON string passed as argument."
elif isinstance(input_json, dict):
API_BODY = json.dumps(input_json)
else:
return "Invalid JSON object passed as argument."

return self._client.transport.perform_request(
method="GET",
url=API_URL,
body=API_BODY,
)

def search_model(self, input_json) -> object:
"""
This method searches models in the OpenSearch cluster (using the ML Commons API)
:param input_json: JSON body for the search request
:type input_json: string or dict
:return: a JSON object with detailed information about the matching models
:rtype: object
"""

API_URL = f"{ML_BASE_URI}/models/_search"

if isinstance(input_json, str):
try:
json_obj = json.loads(input_json)
if not isinstance(json_obj, dict):
return "Invalid JSON object passed as argument."
API_BODY = json.dumps(json_obj)
except json.JSONDecodeError:
return "Invalid JSON string passed as argument."
elif isinstance(input_json, dict):
API_BODY = json.dumps(input_json)
else:
return "Invalid JSON object passed as argument."

return self._client.transport.perform_request(
method="POST",
url=API_URL,
body=API_BODY,
)
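Both new search helpers normalize their `input_json` argument the same way; that shared validation, extracted as a standalone sketch (the function name is ours, not part of the client):

```python
import json

def normalize_search_body(input_json):
    """Mirror of the input handling in search_task/search_model:
    accept a JSON string or a dict, return an error string otherwise."""
    if isinstance(input_json, str):
        try:
            json_obj = json.loads(input_json)
            if not isinstance(json_obj, dict):
                return "Invalid JSON object passed as argument."
            return json.dumps(json_obj)
        except json.JSONDecodeError:
            return "Invalid JSON string passed as argument."
    elif isinstance(input_json, dict):
        return json.dumps(input_json)
    return "Invalid JSON object passed as argument."

print(normalize_search_body('{"query": {"match_all": {}}, "size": 1}'))
```

Note that a string like `"15"` is valid JSON but not a JSON object, which is why the dict check happens after parsing.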

def get_model_info(self, model_id: str) -> object:
"""
This method returns information about a model registered in the OpenSearch cluster (using the ML Commons API)
67 changes: 48 additions & 19 deletions opensearch_py_ml/ml_models/sentencetransformermodel.py
@@ -701,6 +701,37 @@ def zip_model(
)
print("zip file is saved to " + zip_file_path + "\n")

def _fill_null_truncation_field(
self,
save_json_folder_path: str,
max_length: int,
) -> None:
"""
Description:
Fill truncation field in tokenizer.json when it is null
:param save_json_folder_path:
path to save model json file, e.g., "home/save_pre_trained_model_json/"
:type save_json_folder_path: string
:param max_length:
maximum sequence length for model
:type max_length: int
:return: no return value expected
:rtype: None
"""
tokenizer_file_path = os.path.join(save_json_folder_path, "tokenizer.json")
with open(tokenizer_file_path) as user_file:
parsed_json = json.load(user_file)
if "truncation" not in parsed_json or parsed_json["truncation"] is None:
parsed_json["truncation"] = {
"direction": "Right",
"max_length": max_length,
"strategy": "LongestFirst",
"stride": 0,
}
with open(tokenizer_file_path, "w") as file:
json.dump(parsed_json, file, indent=2)
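The helper above only needs a folder containing a `tokenizer.json`; a self-contained sketch of the same logic exercised against a throwaway file (the module-level function is our stand-in for the private method):

```python
import json
import os
import tempfile

def fill_null_truncation_field(save_json_folder_path, max_length):
    # Same idea as _fill_null_truncation_field: patch tokenizer.json
    # when its "truncation" field is absent or null.
    tokenizer_file_path = os.path.join(save_json_folder_path, "tokenizer.json")
    with open(tokenizer_file_path) as f:
        parsed_json = json.load(f)
    if parsed_json.get("truncation") is None:
        parsed_json["truncation"] = {
            "direction": "Right",
            "max_length": max_length,
            "strategy": "LongestFirst",
            "stride": 0,
        }
        with open(tokenizer_file_path, "w") as f:
            json.dump(parsed_json, f, indent=2)

# Exercise it on a minimal fake tokenizer.json with a null truncation field.
with tempfile.TemporaryDirectory() as folder:
    with open(os.path.join(folder, "tokenizer.json"), "w") as f:
        json.dump({"version": "1.0", "truncation": None}, f)
    fill_null_truncation_field(folder, 512)
    with open(os.path.join(folder, "tokenizer.json")) as f:
        patched = json.load(f)
print(patched["truncation"])
```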

def save_as_pt(
self,
sentences: [str],
@@ -760,6 +791,9 @@ def save_as_pt(

# save tokenizer.json in save_json_folder_name
model.save(save_json_folder_path)
self._fill_null_truncation_field(
save_json_folder_path, model.tokenizer.model_max_length
)

# convert to pt format will need to be in cpu,
# set the device to cpu, convert its input_ids and attention_mask in cpu and save as .pt format
@@ -851,6 +885,9 @@ def save_as_onnx(

# save tokenizer.json in output_path
model.save(save_json_folder_path)
self._fill_null_truncation_field(
save_json_folder_path, model.tokenizer.model_max_length
)

convert(
framework="pt",
Expand Down Expand Up @@ -1031,28 +1068,20 @@ def make_model_config_json(
or normalize_result is None
):
try:
if (
model_type is None
and len(model._modules) >= 1
and isinstance(model._modules["0"], Transformer)
):
model_type = model._modules["0"].auto_model.__class__.__name__
model_type = model_type.lower().rstrip("model")
if embedding_dimension is None:
embedding_dimension = model.get_sentence_embedding_dimension()
if (
pooling_mode is None
and len(model._modules) >= 2
and isinstance(model._modules["1"], Pooling)
):
pooling_mode = model._modules["1"].get_pooling_mode_str().upper()
if normalize_result is None:
if len(model._modules) >= 3 and isinstance(
model._modules["2"], Normalize
):

for str_idx, module in model._modules.items():
if model_type is None and isinstance(module, Transformer):
model_type = module.auto_model.__class__.__name__
model_type = model_type.lower().rstrip("model")
elif pooling_mode is None and isinstance(module, Pooling):
pooling_mode = module.get_pooling_mode_str().upper()
elif normalize_result is None and isinstance(module, Normalize):
normalize_result = True
else:
normalize_result = False
# TODO: Support 'Dense' module
if normalize_result is None:
normalize_result = False
except Exception as e:
raise Exception(
f"Raised exception while getting model data from pre-trained hugging-face model object: {e}"
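The refactor above replaces fixed-index `_modules["0"]`/`_modules["1"]` lookups with one pass over all modules. A self-contained sketch of the same control flow using dummy stand-ins (the classes below are mock-ups, not the sentence-transformers API); note that `str.rstrip("model")` strips a *character set*, not a suffix, so it only works because common class-name stems do not end in those letters:

```python
# Dummy stand-ins for sentence_transformers' Transformer / Pooling /
# Normalize modules, just to drive the inference loop.
class Transformer:
    def __init__(self, auto_cls_name):
        # auto_model's class name is what the real loop inspects.
        self.auto_model = type(auto_cls_name, (), {})()

class Pooling:
    def get_pooling_mode_str(self):
        return "cls"

class Normalize:
    pass

modules = {"0": Transformer("DistilBertModel"), "1": Pooling(), "2": Normalize()}

model_type = pooling_mode = normalize_result = None
for str_idx, module in modules.items():
    if model_type is None and isinstance(module, Transformer):
        model_type = module.auto_model.__class__.__name__
        # rstrip removes trailing characters from the set {m,o,d,e,l};
        # for names like "distilbertmodel" this drops the "model" suffix.
        model_type = model_type.lower().rstrip("model")
    elif pooling_mode is None and isinstance(module, Pooling):
        pooling_mode = module.get_pooling_mode_str().upper()
    elif normalize_result is None and isinstance(module, Normalize):
        normalize_result = True
if normalize_result is None:
    normalize_result = False

print(model_type, pooling_mode, normalize_result)
```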
97 changes: 97 additions & 0 deletions tests/ml_commons/test_ml_commons_client.py
@@ -100,6 +100,102 @@ def test_execute():
), "Raised Exception during execute API testing with JSON string"


def test_search():
# Search task cases
raised = False
try:
search_task_obj = ml_client.search_task(
input_json='{"query": {"match_all": {}},"size": 1}'
)
assert search_task_obj["hits"]["hits"] != []
except: # noqa: E722
raised = True
assert raised == False, "Raised Exception in searching task"

raised = False
try:
search_task_obj = ml_client.search_task(
input_json={"query": {"match_all": {}}, "size": 1}
)
assert search_task_obj["hits"]["hits"] != []
except: # noqa: E722
raised = True
assert raised == False, "Raised Exception in searching task"

raised = False
try:
search_task_obj = ml_client.search_task(input_json=15)
assert search_task_obj == "Invalid JSON object passed as argument."
except: # noqa: E722
raised = True
assert raised == False, "Raised Exception in searching task"

raised = False
try:
search_task_obj = ml_client.search_task(input_json="15")
assert search_task_obj == "Invalid JSON object passed as argument."
except: # noqa: E722
raised = True
assert raised == False, "Raised Exception in searching task"

raised = False
try:
search_task_obj = ml_client.search_task(
input_json='{"query": {"match_all": {}},size: 1}'
)
assert search_task_obj == "Invalid JSON string passed as argument."
except: # noqa: E722
raised = True
assert raised == False, "Raised Exception in searching task"

# Search model cases
raised = False
try:
search_model_obj = ml_client.search_model(
input_json='{"query": {"match_all": {}},"size": 1}'
)
assert search_model_obj["hits"]["hits"] != []
except: # noqa: E722
raised = True
assert raised == False, "Raised Exception in searching model"

raised = False
try:
search_model_obj = ml_client.search_model(
input_json={"query": {"match_all": {}}, "size": 1}
)
assert search_model_obj["hits"]["hits"] != []
except: # noqa: E722
raised = True
assert raised == False, "Raised Exception in searching model"

raised = False
try:
search_model_obj = ml_client.search_model(input_json=15)
assert search_model_obj == "Invalid JSON object passed as argument."
except: # noqa: E722
raised = True
assert raised == False, "Raised Exception in searching model"

raised = False
try:
search_model_obj = ml_client.search_model(input_json="15")
assert search_model_obj == "Invalid JSON object passed as argument."
except: # noqa: E722
raised = True
assert raised == False, "Raised Exception in searching model"

raised = False
try:
search_model_obj = ml_client.search_model(
input_json='{"query": {"match_all": {}},size: 1}'
)
assert search_model_obj == "Invalid JSON string passed as argument."
except: # noqa: E722
raised = True
assert raised == False, "Raised Exception in searching model"


def test_DEPRECATED_integration_pretrained_model_upload_unload_delete():
raised = False
try:
@@ -379,6 +475,7 @@ def test_integration_model_train_register_full_cycle():
print("Model Task Status:", ml_task_status)
raised = True
assert raised == False, "Raised Exception in pulling task info"

# This test is flaky: sometimes it passes and sometimes it fails with a 500 error
# due to the memory circuit breaker.
# Todo: We need to revisit this test.
32 changes: 32 additions & 0 deletions tests/ml_models/test_sentencetransformermodel_pytest.py
@@ -372,5 +372,37 @@ def test_overwrite_fields_in_model_config():
clean_test_folder(TEST_FOLDER)


def test_truncation_parameter():
model_id = "sentence-transformers/msmarco-distilbert-base-tas-b"
MAX_LENGTH_TASB = 512

clean_test_folder(TEST_FOLDER)
test_model10 = SentenceTransformerModel(
folder_path=TEST_FOLDER,
model_id=model_id,
)

test_model10.save_as_pt(model_id=model_id, sentences=["today is sunny"])

tokenizer_json_file_path = os.path.join(TEST_FOLDER, "tokenizer.json")
try:
with open(tokenizer_json_file_path, "r") as json_file:
tokenizer_json = json.load(json_file)
except Exception as exc:
assert (
False
), f"Creating tokenizer.json file for tracing raised an exception {exc}"

assert tokenizer_json[
"truncation"
], "truncation parameter in tokenizer.json is null"

assert (
tokenizer_json["truncation"]["max_length"] == MAX_LENGTH_TASB
), "max_length is not properly set"

clean_test_folder(TEST_FOLDER)


clean_test_folder(TEST_FOLDER)
clean_test_folder(TESTDATA_UNZIP_FOLDER)
