From 8302300bd7fb391cfcea172c2ddbe777bf881ea7 Mon Sep 17 00:00:00 2001
From: rittik9
Date: Mon, 9 Sep 2024 17:16:59 +0530
Subject: [PATCH 1/8] Fix: Handle zero division error in binary IoU (Jaccard
 index) calculation

---
 .../functional/classification/jaccard.py      |  2 +-
 .../unittests/classification/test_jaccard.py  | 21 +++++++++++++++++++
 2 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/src/torchmetrics/functional/classification/jaccard.py b/src/torchmetrics/functional/classification/jaccard.py
index 1d240df68af..dfddd68255f 100644
--- a/src/torchmetrics/functional/classification/jaccard.py
+++ b/src/torchmetrics/functional/classification/jaccard.py
@@ -67,7 +67,7 @@ def _jaccard_index_reduce(
         raise ValueError(f"The `average` has to be one of {allowed_average}, got {average}.")
     confmat = confmat.float()
     if average == "binary":
-        return confmat[1, 1] / (confmat[0, 1] + confmat[1, 0] + confmat[1, 1])
+        return _safe_divide(confmat[1, 1], (confmat[0, 1] + confmat[1, 0] + confmat[1, 1]), zero_division=zero_division)
 
     ignore_index_cond = ignore_index is not None and 0 <= ignore_index < confmat.shape[0]
     multilabel = confmat.ndim == 3
diff --git a/tests/unittests/classification/test_jaccard.py b/tests/unittests/classification/test_jaccard.py
index 6901868eac9..e7afdb557a6 100644
--- a/tests/unittests/classification/test_jaccard.py
+++ b/tests/unittests/classification/test_jaccard.py
@@ -26,6 +26,7 @@
     MultilabelJaccardIndex,
 )
 from torchmetrics.functional.classification.jaccard import (
+    _jaccard_index_reduce,
     binary_jaccard_index,
     multiclass_jaccard_index,
     multilabel_jaccard_index,
@@ -403,6 +404,26 @@ def test_corner_case():
     assert torch.allclose(res, out)
 
 
+def test_jaccard_index_zero_division():
+    """Issue: https://github.com/Lightning-AI/torchmetrics/issues/2658."""
+    # Test case where all pixels are background (zeros)
+    confmat = torch.tensor([[4, 0], [0, 0]])
+
+    # Test with zero_division=0.0
+    result = _jaccard_index_reduce(confmat, average="binary", zero_division=0.0)
+    assert result == 0.0, f"Expected 0.0, but got {result}"
+
+    # Test with zero_division=1.0
+    result = _jaccard_index_reduce(confmat, average="binary", zero_division=1.0)
+    assert result == 1.0, f"Expected 1.0, but got {result}"
+
+    # Test case with some foreground pixels
+    confmat = torch.tensor([[2, 1], [1, 1]])
+    result = _jaccard_index_reduce(confmat, average="binary", zero_division=0.0)
+    expected = 1 / 3
+    assert torch.isclose(result, torch.tensor(expected)), f"Expected {expected}, but got {result}"
+
+
 @pytest.mark.parametrize(
     ("metric", "kwargs"),
     [

From 9098d0a02dd7481c016531cc7842ba4a0285617d Mon Sep 17 00:00:00 2001
From: Jirka Borovec <6035284+Borda@users.noreply.github.com>
Date: Mon, 9 Sep 2024 19:38:53 +0200
Subject: [PATCH 2/8] chlog

---
 CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2b0b0022476..0e6d5f40643 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -39,7 +39,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Fixed
 
--
+- Fixed handling zero division error in binary IoU (Jaccard index) calculation ([#2726](https://github.com/Lightning-AI/torchmetrics/pull/2726))
 
 
 ## [1.4.1] - 2024-08-02

From 21948b2725ab9decc7ee7687fe0e40b7c86f8b2c Mon Sep 17 00:00:00 2001
From: rittik9
Date: Fri, 8 Nov 2024 02:04:37 +0530
Subject: [PATCH 3/8] fix: rouge_score with accumulate='best' gives mixed
 results #2148

---
 src/torchmetrics/functional/text/rouge.py | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/src/torchmetrics/functional/text/rouge.py b/src/torchmetrics/functional/text/rouge.py
index 58c9a05fecf..5543a1e5d1f 100644
--- a/src/torchmetrics/functional/text/rouge.py
+++ b/src/torchmetrics/functional/text/rouge.py
@@ -361,12 +361,9 @@ def _rouge_score_update(
             list_results.append(result_inner.copy())
 
         if accumulate == "best":
-            key_curr = rouge_keys_values[0]
-            all_fmeasure = torch.tensor([v[key_curr]["fmeasure"] for v in list_results])
-            highest_idx = int(torch.argmax(all_fmeasure).item())
-
-            for rouge_key in rouge_keys_values:
-                results[rouge_key].append(list_results[highest_idx][rouge_key])  # todo
+            for k in rouge_keys_values:
+                index = torch.argmax(torch.tensor([s[k]["fmeasure"] for s in list_results]))
+                results[k].append(list_results[index][k])
 
         elif accumulate == "avg":
             new_result_avg: Dict[Union[int, str], Dict[str, Tensor]] = {

From bb208f426ce4a47cb7014b7dbdd13de0e05511aa Mon Sep 17 00:00:00 2001
From: rittik9
Date: Fri, 8 Nov 2024 13:43:35 +0530
Subject: [PATCH 4/8] fix: test_rouge.py

---
 tests/unittests/text/test_rouge.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/tests/unittests/text/test_rouge.py b/tests/unittests/text/test_rouge.py
index a40885587e8..4b30d2ad4d9 100644
--- a/tests/unittests/text/test_rouge.py
+++ b/tests/unittests/text/test_rouge.py
@@ -74,10 +74,12 @@ def _reference_rouge_score(
     aggregator_avg = BootstrapAggregator()
 
     if accumulate == "best":
-        key_curr = next(iter(list_results[0].keys()))
-        all_fmeasure = torch.tensor([v[key_curr].fmeasure for v in list_results])
-        highest_idx = torch.argmax(all_fmeasure).item()
-        aggregator.add_scores(list_results[highest_idx])
+        scores = {}
+        for rouge_key in list_results[0]:
+            all_fmeasure = torch.tensor([v[rouge_key].fmeasure for v in list_results])
+            highest_idx = torch.argmax(all_fmeasure).item()
+            scores[rouge_key] = list_results[highest_idx][rouge_key]
+        aggregator.add_scores(scores)
     elif accumulate == "avg":
         for _score in list_results:
             aggregator_avg.add_scores(_score)

From e09d751e9af3097824bd0fb9f33f3518a3cca7d7 Mon Sep 17 00:00:00 2001
From: rittik9
Date: Fri, 8 Nov 2024 16:46:40 +0530
Subject: [PATCH 5/8] test: test_rouge.py

---
 tests/unittests/text/test_rouge.py | 55 ++++++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)

diff --git a/tests/unittests/text/test_rouge.py b/tests/unittests/text/test_rouge.py
index 4b30d2ad4d9..f7de944943e 100644
--- a/tests/unittests/text/test_rouge.py
+++ b/tests/unittests/text/test_rouge.py
@@ -271,3 +271,58 @@ def test_rouge_lsum_score(pl_rouge_metric_key, use_stemmer):
         use_stemmer=use_stemmer,
     )
     assert torch.isclose(metrics_score[rouge_level + "_" + metric], original_score)
+
+
+pytest.mark.parametrize(
+    "preds, references, expected_scores",
+    [
+        (
+            "a b c",
+            ["a b c", "c b a"],
+            {
+                "rouge1_fmeasure": 1.0,
+                "rouge1_precision": 1.0,
+                "rouge1_recall": 1.0,
+                "rouge2_fmeasure": 1.0,
+                "rouge2_precision": 1.0,
+                "rouge2_recall": 1.0,
+                "rougeL_fmeasure": 1.0,
+                "rougeL_precision": 1.0,
+                "rougeL_recall": 1.0,
+                "rougeLsum_fmeasure": 1.0,
+                "rougeLsum_precision": 1.0,
+                "rougeLsum_recall": 1.0,
+            },
+        ),
+        (
+            "a b c",
+            ["c b a", "a b c"],
+            {
+                "rouge1_fmeasure": 1.0,
+                "rouge1_precision": 1.0,
+                "rouge1_recall": 1.0,
+                "rouge2_fmeasure": 1.0,
+                "rouge2_precision": 1.0,
+                "rouge2_recall": 1.0,
+                "rougeL_fmeasure": 1.0,
+                "rougeL_precision": 1.0,
+                "rougeL_recall": 1.0,
+                "rougeLsum_fmeasure": 1.0,
+                "rougeLsum_precision": 1.0,
+                "rougeLsum_recall": 1.0,
+            },
+        ),
+    ],
+)
+
+
+def test_rouge_score_accumulate_best(preds, references, expected_scores):
+    """Issue: https://github.com/Lightning-AI/torchmetrics/issues/2148."""
+    # Calculate ROUGE scores
+    result = rouge_score(preds, references, accumulate="best")
+
+    # Assert each expected score
+    for key in expected_scores:
+        assert torch.isclose(
+            result[key], torch.tensor(expected_scores[key])
+        ), f"Expected {expected_scores[key]} for {key}, but got {result[key]}"

From da834d0e4cb50f1407e9f7ab72bca06ba6c80cd8 Mon Sep 17 00:00:00 2001
From: rittik9
Date: Fri, 8 Nov 2024 17:06:15 +0530
Subject: [PATCH 6/8] fix: test_rouge.py

---
 CHANGELOG.md                       | 9 ---------
 tests/unittests/text/test_rouge.py | 2 +-
 2 files changed, 1 insertion(+), 10 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5d3c8bc8392..65624b8d87b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -48,15 +48,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - Removed `num_outputs` in `R2Score` ([#2800](https://github.com/Lightning-AI/torchmetrics/pull/2800))
 
-### Fixed
-
-
--
-
-
-
----
-
 ## [1.5.2] - 2024-11-07
 
 ### Changed
diff --git a/tests/unittests/text/test_rouge.py b/tests/unittests/text/test_rouge.py
index f7de944943e..7862bf6a277 100644
--- a/tests/unittests/text/test_rouge.py
+++ b/tests/unittests/text/test_rouge.py
@@ -274,7 +274,7 @@ def test_rouge_lsum_score(pl_rouge_metric_key, use_stemmer):
 
 
 pytest.mark.parametrize(
-    "preds, references, expected_scores",
+    ("preds", "references", "expected_scores"),
     [
         (
             "a b c",

From beeb00195a40f07480d3328e631fdb6b4344e3bc Mon Sep 17 00:00:00 2001
From: rittik9
Date: Fri, 8 Nov 2024 17:26:23 +0530
Subject: [PATCH 7/8] minor fix: test_rouge.py

---
 tests/unittests/text/test_rouge.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/tests/unittests/text/test_rouge.py b/tests/unittests/text/test_rouge.py
index 7862bf6a277..2457e19a627 100644
--- a/tests/unittests/text/test_rouge.py
+++ b/tests/unittests/text/test_rouge.py
@@ -273,7 +273,7 @@ def test_rouge_lsum_score(pl_rouge_metric_key, use_stemmer):
     assert torch.isclose(metrics_score[rouge_level + "_" + metric], original_score)
 
 
-pytest.mark.parametrize(
+@pytest.mark.parametrize(
     ("preds", "references", "expected_scores"),
     [
         (
@@ -314,8 +314,6 @@ def test_rouge_lsum_score(pl_rouge_metric_key, use_stemmer):
         ),
     ],
 )
-
-
 def test_rouge_score_accumulate_best(preds, references, expected_scores):
     """Issue: https://github.com/Lightning-AI/torchmetrics/issues/2148."""
     # Calculate ROUGE scores

From 6eaa882d3d0d2e4409a594eada11992b48d31dee Mon Sep 17 00:00:00 2001
From: Jirka Borovec <6035284+Borda@users.noreply.github.com>
Date: Fri, 8 Nov 2024 16:11:46 +0000
Subject: [PATCH 8/8] Update CHANGELOG.md

---
 CHANGELOG.md | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 65624b8d87b..b8f73495da4 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -47,6 +47,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - Removed `num_outputs` in `R2Score` ([#2800](https://github.com/Lightning-AI/torchmetrics/pull/2800))
 
+### Fixed
+
+- Fixed mixed results of `rouge_score` with `accumulate='best'` ([#2830](https://github.com/Lightning-AI/torchmetrics/pull/2830))
+
 
 ## [1.5.2] - 2024-11-07
 
 ### Changed
 
 ...
 
 - Patched `np.Inf` for `numpy` 2.0+ ([#2826](https://github.com/Lightning-AI/torchmetrics/pull/2826))
-
-
 ## [1.5.1] - 2024-10-22
 
 ### Fixed
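
Editor's note on PATCH 1/8: a minimal standalone sketch of the zero-division behavior the patch routes through torchmetrics' internal `_safe_divide`. The `safe_divide` helper below is a stand-in written for illustration; the library's private helper and its exact signature are not reproduced here, only the semantics the new tests assert.

    import torch
    from torch import Tensor


    def safe_divide(num: Tensor, denom: Tensor, zero_division: float = 0.0) -> Tensor:
        """Divide num by denom, substituting `zero_division` wherever denom is zero."""
        zero_mask = denom == 0
        # Replace zero denominators with 1 so the division itself never produces NaN/Inf.
        safe_denom = torch.where(zero_mask, torch.ones_like(denom), denom)
        return torch.where(zero_mask, torch.full_like(num, zero_division), num / safe_denom)


    # Confusion matrix for an all-background sample: TP = FP = FN = 0, so the binary
    # IoU, TP / (TP + FP + FN), is 0/0 and previously evaluated to NaN.
    confmat = torch.tensor([[4.0, 0.0], [0.0, 0.0]])
    denom = confmat[0, 1] + confmat[1, 0] + confmat[1, 1]
    print(safe_divide(confmat[1, 1], denom, zero_division=0.0))  # tensor(0.)
    print(safe_divide(confmat[1, 1], denom, zero_division=1.0))  # tensor(1.)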
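
Editor's note on PATCH 3/8: a self-contained sketch of the per-key "best" selection the fix switches to. The old code picked one reference by the fmeasure of the first rouge key and reused that reference's scores for every key; the patched loop runs an argmax per key. The dictionaries and numbers below are hypothetical, chosen so the two behaviors differ.

    import torch

    # Scores of one prediction against two references (illustrative numbers only).
    list_results = [
        {"rouge1": {"fmeasure": 0.9}, "rougeL": {"fmeasure": 0.2}},
        {"rouge1": {"fmeasure": 0.5}, "rougeL": {"fmeasure": 0.8}},
    ]

    results = {k: [] for k in ("rouge1", "rougeL")}
    for k in results:
        # Per-key argmax, as in the patched loop: each ROUGE variant may prefer
        # a different reference.
        index = int(torch.argmax(torch.tensor([s[k]["fmeasure"] for s in list_results])))
        results[k].append(list_results[index][k])

    # The pre-fix behavior would have taken rougeL from the reference that maximizes
    # rouge1, reporting 0.2 here instead of 0.8.
    print(results["rouge1"][0], results["rougeL"][0])  # {'fmeasure': 0.9} {'fmeasure': 0.8}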