From d6ef50080f3b2b923ba0fb89eb0a8daebf8f68d4 Mon Sep 17 00:00:00 2001
From: Jason Dai
Date: Tue, 14 May 2024 15:41:41 -0700
Subject: [PATCH] fix: add validation for evaluation dataset fields, update
 logging info for eval api request count

PiperOrigin-RevId: 633729236
---
 tests/unit/vertexai/test_evaluation.py     | 13 +++++++++++++
 vertexai/preview/evaluation/_eval_tasks.py |  2 +-
 vertexai/preview/evaluation/_evaluation.py | 22 +++++++++++++++++++---
 3 files changed, 33 insertions(+), 4 deletions(-)

diff --git a/tests/unit/vertexai/test_evaluation.py b/tests/unit/vertexai/test_evaluation.py
index fefe234621..828ef2a2cf 100644
--- a/tests/unit/vertexai/test_evaluation.py
+++ b/tests/unit/vertexai/test_evaluation.py
@@ -571,6 +571,19 @@ def test_evaluate_pairwise_metrics_with_multiple_baseline_models(self):
         ):
             test_eval_task.evaluate(model=mock_candidate_model)
 
+    def test_evaluate_invalid_model_and_dataset_input(self):
+        test_eval_task = evaluation.EvalTask(
+            dataset=_TEST_EVAL_DATASET,
+            metrics=_TEST_METRICS,
+        )
+        with pytest.raises(
+            ValueError,
+            match=("The `model` parameter is specified, but the evaluation `dataset`"),
+        ):
+            test_eval_task.evaluate(
+                model=generative_models.GenerativeModel(model_name="invalid_model_name")
+            )
+
 
 @pytest.mark.usefixtures("google_auth_mock")
 class TestEvaluationUtils:
diff --git a/vertexai/preview/evaluation/_eval_tasks.py b/vertexai/preview/evaluation/_eval_tasks.py
index 651ec127fc..9e24cc670a 100644
--- a/vertexai/preview/evaluation/_eval_tasks.py
+++ b/vertexai/preview/evaluation/_eval_tasks.py
@@ -79,7 +79,7 @@ class EvalTask:
     documentation page [Evaluation methods and metrics](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval).
 
     Usage:
-        1. To perform bring your own prediction evaluation, provide the model
+        1. To perform bring-your-own-prediction(BYOP) evaluation, provide the model
         responses in the response column in the dataset. The response column name
         is "response" by default, or specify `response_column_name` parameter to
         customize.
diff --git a/vertexai/preview/evaluation/_evaluation.py b/vertexai/preview/evaluation/_evaluation.py
index d396f9460f..fd348b7358 100644
--- a/vertexai/preview/evaluation/_evaluation.py
+++ b/vertexai/preview/evaluation/_evaluation.py
@@ -534,8 +534,7 @@ async def _compute_metrics(
                 metric_name = metric
             tasks_by_metric[metric_name].append(task)
-    api_request_count = (len(api_metrics) + len(custom_metrics)) * len(
-        evaluation_run_config.dataset)
+    api_request_count = len(api_metrics) * len(evaluation_run_config.dataset)
     _LOGGER.info(
         f"Computing metrics with a total of {api_request_count} Vertex online"
         " evaluation service requests."
     )
@@ -629,7 +628,8 @@ def evaluate(
     Raises:
         ValueError: If the metrics list is empty, or the prompt template is not
           provided for PairwiseMetric, or multiple baseline models are specified for
-          PairwiseMetric instances.
+          PairwiseMetric instances, or both model and dataset model response column
+          are present.
     """
 
     if not metrics:
@@ -655,6 +655,22 @@ def evaluate(
             constants.Dataset.REFERENCE_COLUMN
         )
 
+    if (
+        model
+        and evaluation_run_config.column_map.get(
+            constants.Dataset.MODEL_RESPONSE_COLUMN
+        )
+        in dataset.columns
+    ):
+        raise ValueError(
+            "The `model` parameter is specified, but the evaluation `dataset`"
+            f" contains model response column `{response_column_name}` to perform"
+            " bring-your-own-prediction(BYOP) evaluation. If you would like to"
+            " perform rapid evaluation using the dataset with the existing model"
+            f" response column `{response_column_name}`, please remove the"
+            " `model` input parameter."
+        )
+
     baseline_model = None
     pairwise_metric_exists = any(
         isinstance(metric, metrics_base.PairwiseMetric)
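
Illustration (not part of the patch): a minimal sketch of how the new validation surfaces to SDK callers, assuming the public `vertexai.preview.evaluation` API. The project ID, location, model name, metric name, and dataset contents below are placeholder assumptions, not values taken from this change.

```python
# Minimal sketch, not part of the patch. Assumes vertexai is installed and a
# Google Cloud project is available; project, location, model name, metric
# name, and dataset contents are hypothetical placeholders.
import pandas as pd

import vertexai
from vertexai.preview import generative_models
from vertexai.preview.evaluation import EvalTask

vertexai.init(project="my-project", location="us-central1")

# Dataset that already carries a "response" column, i.e. a
# bring-your-own-prediction (BYOP) evaluation dataset.
eval_dataset = pd.DataFrame(
    {
        "content": ["What is 2 + 2?"],
        "reference": ["4"],
        "response": ["4"],  # pre-computed model response
    }
)

eval_task = EvalTask(dataset=eval_dataset, metrics=["exact_match"])

# BYOP path: score the existing "response" column directly.
byop_result = eval_task.evaluate()

# After this patch, also passing `model` is rejected up front, because the
# SDK cannot tell whether to score the existing responses or regenerate them.
try:
    eval_task.evaluate(
        model=generative_models.GenerativeModel("gemini-1.0-pro")
    )
except ValueError as err:
    print(err)  # asks the caller to drop the `model` argument for BYOP data
```

The logging change follows the same reasoning: only `api_metrics` result in online evaluation service requests, so custom metrics no longer inflate the reported request count.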