From 8302300bd7fb391cfcea172c2ddbe777bf881ea7 Mon Sep 17 00:00:00 2001
From: rittik9
Date: Mon, 9 Sep 2024 17:16:59 +0530
Subject: [PATCH 1/8] Fix: Handle zero division error in binary IoU (Jaccard
 index) calculation

---
 .../functional/classification/jaccard.py      |  2 +-
 .../unittests/classification/test_jaccard.py  | 21 +++++++++++++++++++
 2 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/src/torchmetrics/functional/classification/jaccard.py b/src/torchmetrics/functional/classification/jaccard.py
index 1d240df68af..dfddd68255f 100644
--- a/src/torchmetrics/functional/classification/jaccard.py
+++ b/src/torchmetrics/functional/classification/jaccard.py
@@ -67,7 +67,7 @@ def _jaccard_index_reduce(
         raise ValueError(f"The `average` has to be one of {allowed_average}, got {average}.")
     confmat = confmat.float()
     if average == "binary":
-        return confmat[1, 1] / (confmat[0, 1] + confmat[1, 0] + confmat[1, 1])
+        return _safe_divide(confmat[1, 1], (confmat[0, 1] + confmat[1, 0] + confmat[1, 1]), zero_division=zero_division)
 
     ignore_index_cond = ignore_index is not None and 0 <= ignore_index < confmat.shape[0]
     multilabel = confmat.ndim == 3
diff --git a/tests/unittests/classification/test_jaccard.py b/tests/unittests/classification/test_jaccard.py
index 6901868eac9..e7afdb557a6 100644
--- a/tests/unittests/classification/test_jaccard.py
+++ b/tests/unittests/classification/test_jaccard.py
@@ -26,6 +26,7 @@
     MultilabelJaccardIndex,
 )
 from torchmetrics.functional.classification.jaccard import (
+    _jaccard_index_reduce,
     binary_jaccard_index,
     multiclass_jaccard_index,
     multilabel_jaccard_index,
@@ -403,6 +404,26 @@ def test_corner_case():
     assert torch.allclose(res, out)
 
 
+def test_jaccard_index_zero_division():
+    """Issue: https://github.com/Lightning-AI/torchmetrics/issues/2658."""
+    # Test case where all pixels are background (zeros)
+    confmat = torch.tensor([[4, 0], [0, 0]])
+
+    # Test with zero_division=0.0
+    result = _jaccard_index_reduce(confmat, average="binary", zero_division=0.0)
+    assert result == 0.0, f"Expected 0.0, but got {result}"
+
+    # Test with zero_division=1.0
+    result = _jaccard_index_reduce(confmat, average="binary", zero_division=1.0)
+    assert result == 1.0, f"Expected 1.0, but got {result}"
+
+    # Test case with some foreground pixels
+    confmat = torch.tensor([[2, 1], [1, 1]])
+    result = _jaccard_index_reduce(confmat, average="binary", zero_division=0.0)
+    expected = 1 / 3
+    assert torch.isclose(result, torch.tensor(expected)), f"Expected {expected}, but got {result}"
+
+
 @pytest.mark.parametrize(
     ("metric", "kwargs"),
     [

From 9098d0a02dd7481c016531cc7842ba4a0285617d Mon Sep 17 00:00:00 2001
From: Jirka Borovec <6035284+Borda@users.noreply.github.com>
Date: Mon, 9 Sep 2024 19:38:53 +0200
Subject: [PATCH 2/8] chlog

---
 CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2b0b0022476..0e6d5f40643 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -39,7 +39,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Fixed
 
--
+- Fixed handling zero division error in binary IoU (Jaccard index) calculation ([#2726](https://github.com/Lightning-AI/torchmetrics/pull/2726))
 
 
 ## [1.4.1] - 2024-08-02

From 21948b2725ab9decc7ee7687fe0e40b7c86f8b2c Mon Sep 17 00:00:00 2001
From: rittik9
Date: Fri, 8 Nov 2024 02:04:37 +0530
Subject: [PATCH 3/8] fix: rouge_score with accumulate='best' gives mixed
 results #2148

---
 src/torchmetrics/functional/text/rouge.py | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/src/torchmetrics/functional/text/rouge.py b/src/torchmetrics/functional/text/rouge.py
index 58c9a05fecf..5543a1e5d1f 100644
--- a/src/torchmetrics/functional/text/rouge.py
+++ b/src/torchmetrics/functional/text/rouge.py
@@ -361,12 +361,9 @@ def _rouge_score_update(
             list_results.append(result_inner.copy())
 
         if accumulate == "best":
-            key_curr = rouge_keys_values[0]
-            all_fmeasure = torch.tensor([v[key_curr]["fmeasure"] for v in list_results])
-            highest_idx = int(torch.argmax(all_fmeasure).item())
-
-            for rouge_key in rouge_keys_values:
-                results[rouge_key].append(list_results[highest_idx][rouge_key])  # todo
+            for k in rouge_keys_values:
+                index = torch.argmax(torch.tensor([s[k]["fmeasure"] for s in list_results]))
+                results[k].append(list_results[index][k])
 
         elif accumulate == "avg":
             new_result_avg: Dict[Union[int, str], Dict[str, Tensor]] = {

From bb208f426ce4a47cb7014b7dbdd13de0e05511aa Mon Sep 17 00:00:00 2001
From: rittik9
Date: Fri, 8 Nov 2024 13:43:35 +0530
Subject: [PATCH 4/8] fix: test_rouge.py

---
 tests/unittests/text/test_rouge.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/tests/unittests/text/test_rouge.py b/tests/unittests/text/test_rouge.py
index a40885587e8..4b30d2ad4d9 100644
--- a/tests/unittests/text/test_rouge.py
+++ b/tests/unittests/text/test_rouge.py
@@ -74,10 +74,12 @@ def _reference_rouge_score(
     aggregator_avg = BootstrapAggregator()
 
     if accumulate == "best":
-        key_curr = next(iter(list_results[0].keys()))
-        all_fmeasure = torch.tensor([v[key_curr].fmeasure for v in list_results])
-        highest_idx = torch.argmax(all_fmeasure).item()
-        aggregator.add_scores(list_results[highest_idx])
+        scores = {}
+        for rouge_key in list_results[0]:
+            all_fmeasure = torch.tensor([v[rouge_key].fmeasure for v in list_results])
+            highest_idx = torch.argmax(all_fmeasure).item()
+            scores[rouge_key] = list_results[highest_idx][rouge_key]
+        aggregator.add_scores(scores)
     elif accumulate == "avg":
         for _score in list_results:
             aggregator_avg.add_scores(_score)

From e09d751e9af3097824bd0fb9f33f3518a3cca7d7 Mon Sep 17 00:00:00 2001
From: rittik9
Date: Fri, 8 Nov 2024 16:46:40 +0530
Subject: [PATCH 5/8] test: test_rouge.py

---
 tests/unittests/text/test_rouge.py | 55 ++++++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)

diff --git a/tests/unittests/text/test_rouge.py b/tests/unittests/text/test_rouge.py
index 4b30d2ad4d9..f7de944943e 100644
--- a/tests/unittests/text/test_rouge.py
+++ b/tests/unittests/text/test_rouge.py
@@ -271,3 +271,58 @@ def test_rouge_lsum_score(pl_rouge_metric_key, use_stemmer):
         use_stemmer=use_stemmer,
     )
     assert torch.isclose(metrics_score[rouge_level + "_" + metric], original_score)
+
+
+pytest.mark.parametrize(
+    "preds, references, expected_scores",
+    [
+        (
+            "a b c",
+            ["a b c", "c b a"],
+            {
+                "rouge1_fmeasure": 1.0,
+                "rouge1_precision": 1.0,
+                "rouge1_recall": 1.0,
+                "rouge2_fmeasure": 1.0,
+                "rouge2_precision": 1.0,
+                "rouge2_recall": 1.0,
+                "rougeL_fmeasure": 1.0,
+                "rougeL_precision": 1.0,
+                "rougeL_recall": 1.0,
+                "rougeLsum_fmeasure": 1.0,
+                "rougeLsum_precision": 1.0,
+                "rougeLsum_recall": 1.0,
+            },
+        ),
+        (
+            "a b c",
+            ["c b a", "a b c"],
+            {
+                "rouge1_fmeasure": 1.0,
+                "rouge1_precision": 1.0,
+                "rouge1_recall": 1.0,
+                "rouge2_fmeasure": 1.0,
+                "rouge2_precision": 1.0,
+                "rouge2_recall": 1.0,
+                "rougeL_fmeasure": 1.0,
+                "rougeL_precision": 1.0,
+                "rougeL_recall": 1.0,
+                "rougeLsum_fmeasure": 1.0,
+                "rougeLsum_precision": 1.0,
+                "rougeLsum_recall": 1.0,
+            },
+        ),
+    ],
+)
+
+
+def test_rouge_score_accumulate_best(preds, references, expected_scores):
+    """Issue: https://github.com/Lightning-AI/torchmetrics/issues/2148."""
+    # Calculate ROUGE scores
+    result = rouge_score(preds, references, accumulate="best")
+
+    # Assert each expected score
+    for key in expected_scores:
+        assert torch.isclose(
+            result[key], torch.tensor(expected_scores[key])
+        ), f"Expected {expected_scores[key]} for {key}, but got {result[key]}"

From da834d0e4cb50f1407e9f7ab72bca06ba6c80cd8 Mon Sep 17 00:00:00 2001
From: rittik9
Date: Fri, 8 Nov 2024 17:06:15 +0530
Subject: [PATCH 6/8] fix: test_rouge.py

---
 CHANGELOG.md                       | 9 ---------
 tests/unittests/text/test_rouge.py | 2 +-
 2 files changed, 1 insertion(+), 10 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5d3c8bc8392..65624b8d87b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -48,15 +48,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - Removed `num_outputs` in `R2Score` ([#2800](https://github.com/Lightning-AI/torchmetrics/pull/2800))
 
-### Fixed
-
-
--
-
-
-
----
-
 ## [1.5.2] - 2024-11-07
 
 ### Changed
diff --git a/tests/unittests/text/test_rouge.py b/tests/unittests/text/test_rouge.py
index f7de944943e..7862bf6a277 100644
--- a/tests/unittests/text/test_rouge.py
+++ b/tests/unittests/text/test_rouge.py
@@ -274,7 +274,7 @@ def test_rouge_lsum_score(pl_rouge_metric_key, use_stemmer):
 
 
 pytest.mark.parametrize(
-    "preds, references, expected_scores",
+    ("preds", "references", "expected_scores"),
     [
         (
             "a b c",

From beeb00195a40f07480d3328e631fdb6b4344e3bc Mon Sep 17 00:00:00 2001
From: rittik9
Date: Fri, 8 Nov 2024 17:26:23 +0530
Subject: [PATCH 7/8] minor fix: test_rouge.py

---
 tests/unittests/text/test_rouge.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/tests/unittests/text/test_rouge.py b/tests/unittests/text/test_rouge.py
index 7862bf6a277..2457e19a627 100644
--- a/tests/unittests/text/test_rouge.py
+++ b/tests/unittests/text/test_rouge.py
@@ -273,7 +273,7 @@ def test_rouge_lsum_score(pl_rouge_metric_key, use_stemmer):
     assert torch.isclose(metrics_score[rouge_level + "_" + metric], original_score)
 
 
-pytest.mark.parametrize(
+@pytest.mark.parametrize(
     ("preds", "references", "expected_scores"),
     [
         (
@@ -314,8 +314,6 @@ def test_rouge_lsum_score(pl_rouge_metric_key, use_stemmer):
         ),
     ],
 )
-
-
 def test_rouge_score_accumulate_best(preds, references, expected_scores):
     """Issue: https://github.com/Lightning-AI/torchmetrics/issues/2148."""
     # Calculate ROUGE scores

From 6eaa882d3d0d2e4409a594eada11992b48d31dee Mon Sep 17 00:00:00 2001
From: Jirka Borovec <6035284+Borda@users.noreply.github.com>
Date: Fri, 8 Nov 2024 16:11:46 +0000
Subject: [PATCH 8/8] Update CHANGELOG.md

---
 CHANGELOG.md | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 65624b8d87b..b8f73495da4 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -47,6 +47,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - Removed `num_outputs` in `R2Score` ([#2800](https://github.com/Lightning-AI/torchmetrics/pull/2800))
 
+### Fixed
+
+- Fixed mixed results of `rouge_score` with `accumulate='best'` ([#2830](https://github.com/Lightning-AI/torchmetrics/pull/2830))
+
 
 ## [1.5.2] - 2024-11-07
 
 ### Changed
 
 ...
 
 - Patched `np.Inf` for `numpy` 2.0+ ([#2826](https://github.com/Lightning-AI/torchmetrics/pull/2826))
-
-
 ## [1.5.1] - 2024-10-22
 
 ### Fixed
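
Editor's note on PATCH 1/8: a minimal standalone sketch of the zero-division behavior the patch routes through torchmetrics' internal `_safe_divide`. The `safe_divide` helper below is a stand-in written for illustration; the library's private helper and its exact signature are not reproduced here, only the semantics the new tests assert.

    import torch
    from torch import Tensor


    def safe_divide(num: Tensor, denom: Tensor, zero_division: float = 0.0) -> Tensor:
        """Divide num by denom, substituting `zero_division` wherever denom is zero."""
        zero_mask = denom == 0
        # Replace zero denominators with 1 so the division itself never produces NaN/Inf.
        safe_denom = torch.where(zero_mask, torch.ones_like(denom), denom)
        return torch.where(zero_mask, torch.full_like(num, zero_division), num / safe_denom)


    # Confusion matrix for an all-background sample: TP = FP = FN = 0, so the binary
    # IoU, TP / (TP + FP + FN), is 0/0 and previously evaluated to NaN.
    confmat = torch.tensor([[4.0, 0.0], [0.0, 0.0]])
    denom = confmat[0, 1] + confmat[1, 0] + confmat[1, 1]
    print(safe_divide(confmat[1, 1], denom, zero_division=0.0))  # tensor(0.)
    print(safe_divide(confmat[1, 1], denom, zero_division=1.0))  # tensor(1.)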
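
Editor's note on PATCH 3/8: a self-contained sketch of the per-key "best" selection the fix switches to. The old code picked one reference by the fmeasure of the first rouge key and reused that reference's scores for every key; the patched loop runs an argmax per key. The dictionaries and numbers below are hypothetical, chosen so the two behaviors differ.

    import torch

    # Scores of one prediction against two references (illustrative numbers only).
    list_results = [
        {"rouge1": {"fmeasure": 0.9}, "rougeL": {"fmeasure": 0.2}},
        {"rouge1": {"fmeasure": 0.5}, "rougeL": {"fmeasure": 0.8}},
    ]

    results = {k: [] for k in ("rouge1", "rougeL")}
    for k in results:
        # Per-key argmax, as in the patched loop: each ROUGE variant may prefer
        # a different reference.
        index = int(torch.argmax(torch.tensor([s[k]["fmeasure"] for s in list_results])))
        results[k].append(list_results[index][k])

    # The pre-fix behavior would have taken rougeL from the reference that maximizes
    # rouge1, reporting 0.2 here instead of 0.8.
    print(results["rouge1"][0], results["rougeL"][0])  # {'fmeasure': 0.9} {'fmeasure': 0.8}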