From c8e6e9e7e2911d274e2febc58670a7e3910ab18a Mon Sep 17 00:00:00 2001 From: bharathappali Date: Wed, 9 Oct 2024 12:53:10 +0530 Subject: [PATCH 1/4] Add tests for validating recommendation json for accelerator values Signed-off-by: bharathappali --- tests/scripts/helpers/utils.py | 59 +++++++ .../rest_apis/test_list_recommendations.py | 149 +++++++++++++++++- 2 files changed, 207 insertions(+), 1 deletion(-) diff --git a/tests/scripts/helpers/utils.py b/tests/scripts/helpers/utils.py index a70b90fc9..4eb2d0d23 100644 --- a/tests/scripts/helpers/utils.py +++ b/tests/scripts/helpers/utils.py @@ -254,6 +254,8 @@ "memoryLimit_sum", "memoryLimit_avg", "memoryUsage_sum", "memoryUsage_max", "memoryUsage_avg", "memoryUsage_min", "memoryRSS_sum", "memoryRSS_max", "memoryRSS_avg", "memoryRSS_min"] +MIG_PATTERN = r"nvidia\.com/mig-[1-4|7]g\.(5|10|20|40|80)gb" + def generate_test_data(csvfile, test_data, api_name): if os.path.isfile(csvfile): @@ -1526,3 +1528,60 @@ def validate_local_monitoring_recommendation_data_present(recommendations_json): for i in range(list_reco_containers_length): assert recommendations_json[0]['kubernetes_objects'][0]['containers'][i]['recommendations']['data'], "Recommendations data is expected, but not present." assert recommendations_json[0]['kubernetes_objects'][0]['containers'][i]['recommendations']['notifications'][NOTIFICATION_CODE_FOR_RECOMMENDATIONS_AVAILABLE]['message'] == RECOMMENDATIONS_AVAILABLE, "Recommendations notification is expected, but not present." + + +def validate_limits_map_for_accelerator(limits: dict): + for resource, resource_obj in limits.items(): + # Check if the key contains "nvidia" and matches the MIG pattern + if "nvidia" in resource: + # Assert that the key matches the expected MIG pattern + assert re.match(MIG_PATTERN, resource), f"Resource '{resource}' does not match the expected MIG pattern." 
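+            # For reference, matching resource names look like "nvidia.com/mig-1g.5gb" or
+            # "nvidia.com/mig-3g.20gb" - the extended-resource names the NVIDIA device plugin
+            # exposes for MIG slices.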
+ + # Assert that the amount is 1.0 and format is "cores" + assert resource_obj.get("amount") == 1.0, f"Resource '{resource}' has an invalid amount: {resource_obj.get('amount')}" + assert resource_obj.get("format") == "cores", f"Resource '{resource}' has an invalid format: {resource_obj.get('format')}" + + + +def validate_accelerator_recommendations_for_container(recommendations_json): + if 'experiment_type' in recommendations_json[0]: + assert recommendations_json[0]['experiment_type'] == CONTAINER_EXPERIMENT_TYPE, "Test is only applicable for container experiment type" + + assert recommendations_json[0]['kubernetes_objects'], "Kubernetes objects expected" + + # Test needs to be changed if we support multiple kubernetes objects + kubernetes_obj = recommendations_json[0]['kubernetes_objects'][0] + assert kubernetes_obj["containers"], "Containers array expected" + + containers = kubernetes_obj["containers"] + assert len(containers) > 0, "Expecting atleast one container" + + for container in containers: + assert container['recommendations'], "Recommendations object expected" + recommendations = container['recommendations'] + + assert recommendations["data"], "Data object expected" + data = recommendations["data"] + + assert len(data) > 0, "Data object cannot be empty" + + for timestamp, interval_recommendation_obj in data.items(): + assert interval_recommendation_obj["recommendation_terms"], "Term based recommendations expected" + terms = interval_recommendation_obj["recommendation_terms"] + + assert len(terms) > 0, "Atleast one term is expected" + + for term_name, term_obj in terms.items(): + term_notifications = term_obj["notifications"] + + if NOTIFICATION_CODE_FOR_COST_RECOMMENDATIONS_AVAILABLE in term_notifications: + cost_limits_map = term_obj["recommendation_engines"]["cost"]["config"]["limits"] + validate_limits_map_for_accelerator(cost_limits_map) + + if NOTIFICATION_CODE_FOR_PERFORMANCE_RECOMMENDATIONS_AVAILABLE in term_notifications: + perf_limits_map = term_obj["recommendation_engines"]["performance"]["config"]["limits"] + validate_limits_map_for_accelerator(perf_limits_map) + + + + diff --git a/tests/scripts/local_monitoring_tests/rest_apis/test_list_recommendations.py b/tests/scripts/local_monitoring_tests/rest_apis/test_list_recommendations.py index 44ea021e4..74eb0ef27 100644 --- a/tests/scripts/local_monitoring_tests/rest_apis/test_list_recommendations.py +++ b/tests/scripts/local_monitoring_tests/rest_apis/test_list_recommendations.py @@ -171,4 +171,151 @@ def test_list_recommendations_namespace_single_result(test_name, expected_status # Delete experiment response = delete_experiment(input_json_file) print("delete exp = ", response.status_code) - assert response.status_code == SUCCESS_STATUS_CODE \ No newline at end of file + assert response.status_code == SUCCESS_STATUS_CODE + +@pytest.mark.sanity +@pytest.mark.parametrize( + "test_name, expected_status_code, version, experiment_name, cluster_name, performance_profile, mode, target_cluster, datasource, experiment_type, kubernetes_obj_type, name, namespace, namespace_name, container_image_name, container_name, measurement_duration, threshold", + [ + ("list_accelerator_recommendations", SUCCESS_STATUS_CODE, "v2.0", "human_eval_exp", "cluster-1", "resource-optimization-local-monitoring", "monitor", "local", "prometheus-1", "container", "statefulset", "human-eval-benchmark", "unpartitioned", None, None, "human-eval-benchmark", "15min", "0.1"), + ] + ) +def test_accelerator_recommendation_if_exists( + test_name, + 
expected_status_code, + version, + experiment_name, + cluster_name, + performance_profile, + mode, + target_cluster, + datasource, + experiment_type, + kubernetes_obj_type, + name, + namespace, + namespace_name, + container_image_name, + container_name, + measurement_duration, + threshold, + cluster_type): + """ + Test Description: This test validates listRecommendations by passing a valid + container experiment name which has gpu usage + """ + # Generate a temporary JSON filename + tmp_json_file = "/tmp/create_exp_" + test_name + ".json" + print("tmp_json_file = ", tmp_json_file) + + # Load the Jinja2 template + environment = Environment(loader=FileSystemLoader("../json_files/")) + template = environment.get_template("create_exp_template.json") + + # Render the JSON content from the template + content = template.render( + version=version, + experiment_name=experiment_name, + cluster_name=cluster_name, + performance_profile=performance_profile, + mode=mode, + target_cluster=target_cluster, + datasource=datasource, + experiment_type=experiment_type, + kubernetes_obj_type=kubernetes_obj_type, + name=name, + namespace=namespace, + namespace_name=namespace_name, + container_image_name=container_image_name, + container_name=container_name, + measurement_duration=measurement_duration, + threshold=threshold + ) + + # Convert rendered content to a dictionary + json_content = json.loads(content) + + if json_content[0]["kubernetes_objects"][0]["type"] == "None": + json_content[0]["kubernetes_objects"][0].pop("type") + if json_content[0]["kubernetes_objects"][0]["namespaces"]["namespace_name"] == "None": + json_content[0]["kubernetes_objects"][0].pop("namespaces") + if json_content[0]["kubernetes_objects"][0]["containers"][0]["container_name"] == "None": + json_content[0]["kubernetes_objects"][0].pop("containers") + + # Write the final JSON to the temp file + with open(tmp_json_file, mode="w", encoding="utf-8") as message: + json.dump(json_content, message, indent=4) + + input_json_file = tmp_json_file + + form_kruize_url(cluster_type) + response = delete_experiment(input_json_file) + print("delete exp = ", response.status_code) + + #Install default metric profile + if cluster_type == "minikube": + metric_profile_json_file = metric_profile_dir / 'resource_optimization_local_monitoring_norecordingrules.json' + + if cluster_type == "openshift": + metric_profile_json_file = metric_profile_dir / 'resource_optimization_local_monitoring.json' + + response = delete_metric_profile(metric_profile_json_file) + print("delete metric profile = ", response.status_code) + + # Create metric profile using the specified json + response = create_metric_profile(metric_profile_json_file) + + data = response.json() + print(data['message']) + + assert response.status_code == SUCCESS_STATUS_CODE + assert data['status'] == SUCCESS_STATUS + + json_file = open(metric_profile_json_file, "r") + input_json = json.loads(json_file.read()) + metric_profile_name = input_json['metadata']['name'] + assert data['message'] == CREATE_METRIC_PROFILE_SUCCESS_MSG % metric_profile_name + + response = list_metric_profiles(name=metric_profile_name, logging=False) + metric_profile_json = response.json() + + assert response.status_code == SUCCESS_200_STATUS_CODE + + # Validate the json against the json schema + errorMsg = validate_list_metric_profiles_json(metric_profile_json, list_metric_profiles_schema) + assert errorMsg == "" + + # Create namespace experiment using the specified json + response = create_experiment(input_json_file) + + data = 
response.json()
+    print(data['message'])
+
+    assert response.status_code == SUCCESS_STATUS_CODE
+    assert data['status'] == SUCCESS_STATUS
+    assert data['message'] == CREATE_EXP_SUCCESS_MSG
+
+    # generate recommendations
+    json_file = open(input_json_file, "r")
+    input_json = json.loads(json_file.read())
+    exp_name = input_json[0]['experiment_name']
+
+    response = generate_recommendations(exp_name)
+    assert response.status_code == SUCCESS_STATUS_CODE
+
+    # Invoke list recommendations for the specified experiment
+    response = list_recommendations(exp_name)
+    assert response.status_code == SUCCESS_200_STATUS_CODE
+    list_reco_json = response.json()
+
+    # Validate the json against the json schema
+    errorMsg = validate_list_reco_json(list_reco_json, list_reco_namespace_json_local_monitoring_schema)
+    assert errorMsg == ""
+
+    # Validate accelerator info
+    validate_accelerator_recommendations_for_container(list_reco_json)
+
+    # Delete experiment
+    response = delete_experiment(input_json_file)
+    print("delete exp = ", response.status_code)
+    assert response.status_code == SUCCESS_STATUS_CODE

From 43e2b78dcd61835426b885031557d3c148e16c45 Mon Sep 17 00:00:00 2001
From: bharathappali
Date: Tue, 15 Oct 2024 10:51:33 +0530
Subject: [PATCH 2/4] Modify Schema variable

Signed-off-by: bharathappali
---
 .../rest_apis/test_list_recommendations.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/scripts/local_monitoring_tests/rest_apis/test_list_recommendations.py b/tests/scripts/local_monitoring_tests/rest_apis/test_list_recommendations.py
index 74eb0ef27..bbb9510a6 100644
--- a/tests/scripts/local_monitoring_tests/rest_apis/test_list_recommendations.py
+++ b/tests/scripts/local_monitoring_tests/rest_apis/test_list_recommendations.py
@@ -309,7 +309,7 @@ def test_accelerator_recommendation_if_exists(
     list_reco_json = response.json()
 
     # Validate the json against the json schema
-    errorMsg = validate_list_reco_json(list_reco_json, list_reco_namespace_json_local_monitoring_schema)
+    errorMsg = validate_list_reco_json(list_reco_json, list_reco_json_local_monitoring_schema)
     assert errorMsg == ""
 
     # Validate accelerator info

From 5cb0c0c9357cc6e359e4de57d9752262822d697d Mon Sep 17 00:00:00 2001
From: bharathappali
Date: Tue, 15 Oct 2024 13:05:14 +0530
Subject: [PATCH 3/4] Add docs for the accelerator test

Signed-off-by: bharathappali
---
 .../Local_monitoring_tests.md | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/tests/scripts/local_monitoring_tests/Local_monitoring_tests.md b/tests/scripts/local_monitoring_tests/Local_monitoring_tests.md
index 49aab43f0..a0fdd54e1 100644
--- a/tests/scripts/local_monitoring_tests/Local_monitoring_tests.md
+++ b/tests/scripts/local_monitoring_tests/Local_monitoring_tests.md
@@ -164,3 +164,19 @@ Local monitoring tests can also be run without using the test_autotune.sh. To do this
 
 Note: You can check the report.html for the results as it provides better readability
 
+
+### Accelerator Test:
+
+Kruize 0.1 supports Accelerator Recommendations, which recommend a right-sized MIG configuration for GPU workloads.
+
+The test `test_list_recommendations.py::test_accelerator_recommendation_if_exists` verifies that the accelerator recommendations are returned in the expected format.
+
+#### Prerequisites to run the test:
+
+In addition to the prerequisites mentioned above, make sure that a workload named `human-eval-benchmark` is running in the namespace `unpartitioned` and has accelerator (GPU) usage data.
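+
+For reference, the workload and namespace come from the test's `@pytest.mark.parametrize` data. The sketch below repeats the tuple the test currently ships with (the commented positions are the ones to adjust); it is shown here for illustration only and is not an additional change to the test:
+
+```python
+# Parametrize entry used by test_accelerator_recommendation_if_exists (illustration only):
+("list_accelerator_recommendations", SUCCESS_STATUS_CODE, "v2.0", "human_eval_exp", "cluster-1",
+ "resource-optimization-local-monitoring", "monitor", "local", "prometheus-1", "container",
+ "statefulset", "human-eval-benchmark", "unpartitioned",  # kubernetes_obj_type, name (workload), namespace
+ None, None, "human-eval-benchmark",                      # namespace_name, container_image_name, container_name
+ "15min", "0.1")
+```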
+
+Otherwise, change the workload name and namespace in the test's parametrize values (see the sketch above) to match your workload.
+
+
+Note: The test will fail if it is run as-is and no matching workload is found. The result can be ignored for non-GPU workloads.
+

From 2c55e0ab934e16120df12a14080c057d13e91ffd Mon Sep 17 00:00:00 2001
From: bharathappali
Date: Tue, 15 Oct 2024 17:01:49 +0530
Subject: [PATCH 4/4] add link for running the benchmark

Signed-off-by: bharathappali
---
 tests/scripts/local_monitoring_tests/Local_monitoring_tests.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/scripts/local_monitoring_tests/Local_monitoring_tests.md b/tests/scripts/local_monitoring_tests/Local_monitoring_tests.md
index a0fdd54e1..3444b66d3 100644
--- a/tests/scripts/local_monitoring_tests/Local_monitoring_tests.md
+++ b/tests/scripts/local_monitoring_tests/Local_monitoring_tests.md
@@ -175,6 +175,8 @@ The test `test_list_recommendations.py::test_accelerator_recommendation_if_exist
 
 In addition to the prerequisites mentioned above, make sure that a workload named `human-eval-benchmark` is running in the namespace `unpartitioned` and has accelerator (GPU) usage data.
 
+See [How to run the human eval benchmark?](https://github.com/kruize/benchmarks/tree/master/human-eval-benchmark) for instructions on running the benchmark.
+
 Otherwise, change the workload name and namespace in the test's parametrize values (see the sketch above) to match your workload.
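+
+For context, a minimal sketch of a recommendation `limits` map that the helper `validate_limits_map_for_accelerator()` (added in PATCH 1/4) accepts is shown below. Only keys containing `nvidia` are validated; the CPU and memory entries here are illustrative values, not taken from an actual recommendation:
+
+```python
+limits = {
+    "cpu": {"amount": 2.0, "format": "cores"},       # ignored by the accelerator check (illustrative)
+    "memory": {"amount": 4096.0, "format": "MiB"},   # ignored by the accelerator check (illustrative)
+    # must match MIG_PATTERN, with amount 1.0 and format "cores"
+    "nvidia.com/mig-1g.5gb": {"amount": 1.0, "format": "cores"},
+}
+```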