From e12ea7f037f2c03d35839b924501dfefb1123d2f Mon Sep 17 00:00:00 2001 From: clvincen Date: Fri, 24 Nov 2023 00:59:03 +0100 Subject: [PATCH 1/6] add citation for srgw-kl --- README.md | 4 +++- ot/gromov/_semirelaxed.py | 20 ++++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 57b845edb..84b3cf0ee 100644 --- a/README.md +++ b/README.md @@ -340,4 +340,6 @@ distances between Gaussian distributions](https://hal.science/hal-03197398v2/fil [60] Feydy, J., Roussillon, P., Trouvé, A., & Gori, P. (2019). [Fast and scalable optimal transport for brain tractograms](https://arxiv.org/pdf/2107.02010.pdf). In Medical Image Computing and Computer Assisted Intervention–MICCAI 2019: 22nd International Conference, Shenzhen, China, October 13–17, 2019, Proceedings, Part III 22 (pp. 636-644). Springer International Publishing. -[61] Charlier, B., Feydy, J., Glaunes, J. A., Collin, F. D., & Durif, G. (2021). [Kernel operations on the gpu, with autodiff, without memory overflows](https://www.jmlr.org/papers/volume22/20-275/20-275.pdf). The Journal of Machine Learning Research, 22(1), 3457-3462. \ No newline at end of file +[61] Charlier, B., Feydy, J., Glaunes, J. A., Collin, F. D., & Durif, G. (2021). [Kernel operations on the gpu, with autodiff, without memory overflows](https://www.jmlr.org/papers/volume22/20-275/20-275.pdf). The Journal of Machine Learning Research, 22(1), 3457-3462. + +[62] H. Van Assel, C. Vincent-Cuaz, T. Vayer, R. Flamary, N. Courty (2023). [Interpolating between Clustering and Dimensionality Reduction with Gromov-Wasserstein](https://arxiv.org/pdf/2310.03398.pdf). NeurIPS 2023 Workshop Optimal Transport and Machine Learning. diff --git a/ot/gromov/_semirelaxed.py b/ot/gromov/_semirelaxed.py index cbfe64ea8..d064dc669 100644 --- a/ot/gromov/_semirelaxed.py +++ b/ot/gromov/_semirelaxed.py @@ -90,6 +90,10 @@ def semirelaxed_gromov_wasserstein(C1, C2, p=None, loss_fun='square_loss', symme .. [48] Cédric Vincent-Cuaz, Rémi Flamary, Marco Corneli, Titouan Vayer, Nicolas Courty. "Semi-relaxed Gromov-Wasserstein divergence and applications on graphs" International Conference on Learning Representations (ICLR), 2022. + .. [62] H. Van Assel, C. Vincent-Cuaz, T. Vayer, R. Flamary, N. Courty. + "Interpolating between Clustering and Dimensionality Reduction with + Gromov-Wasserstein". NeurIPS 2023 Workshop OTML. + """ arr = [C1, C2] if p is not None: @@ -220,6 +224,10 @@ def semirelaxed_gromov_wasserstein2(C1, C2, p=None, loss_fun='square_loss', symm .. [48] Cédric Vincent-Cuaz, Rémi Flamary, Marco Corneli, Titouan Vayer, Nicolas Courty. "Semi-relaxed Gromov-Wasserstein divergence and applications on graphs" International Conference on Learning Representations (ICLR), 2022. + + .. [62] H. Van Assel, C. Vincent-Cuaz, T. Vayer, R. Flamary, N. Courty. + "Interpolating between Clustering and Dimensionality Reduction with + Gromov-Wasserstein". NeurIPS 2023 Workshop OTML. """ # partial get_backend as the full one will be handled in gromov_wasserstein nx = get_backend(C1, C2) @@ -331,6 +339,10 @@ def semirelaxed_fused_gromov_wasserstein( .. [48] Cédric Vincent-Cuaz, Rémi Flamary, Marco Corneli, Titouan Vayer, Nicolas Courty. "Semi-relaxed Gromov-Wasserstein divergence and applications on graphs" International Conference on Learning Representations (ICLR), 2022. + + .. [62] H. Van Assel, C. Vincent-Cuaz, T. Vayer, R. Flamary, N. Courty. + "Interpolating between Clustering and Dimensionality Reduction with + Gromov-Wasserstein". 
NeurIPS 2023 Workshop OTML. """ arr = [M, C1, C2] if p is not None: @@ -470,6 +482,10 @@ def semirelaxed_fused_gromov_wasserstein2(M, C1, C2, p=None, loss_fun='square_lo .. [48] Cédric Vincent-Cuaz, Rémi Flamary, Marco Corneli, Titouan Vayer, Nicolas Courty. "Semi-relaxed Gromov-Wasserstein divergence and applications on graphs" International Conference on Learning Representations (ICLR), 2022. + + .. [62] H. Van Assel, C. Vincent-Cuaz, T. Vayer, R. Flamary, N. Courty. + "Interpolating between Clustering and Dimensionality Reduction with + Gromov-Wasserstein". NeurIPS 2023 Workshop OTML. """ # partial get_backend as the full one will be handled in gromov_wasserstein nx = get_backend(C1, C2) @@ -561,6 +577,10 @@ def solve_semirelaxed_gromov_linesearch(G, deltaG, cost_G, C1, C2, ones_p, .. [48] Cédric Vincent-Cuaz, Rémi Flamary, Marco Corneli, Titouan Vayer, Nicolas Courty. "Semi-relaxed Gromov-Wasserstein divergence and applications on graphs" International Conference on Learning Representations (ICLR), 2021. + + .. [62] H. Van Assel, C. Vincent-Cuaz, T. Vayer, R. Flamary, N. Courty. + "Interpolating between Clustering and Dimensionality Reduction with + Gromov-Wasserstein". NeurIPS 2023 Workshop OTML. """ if nx is None: G, deltaG, C1, C2, M = list_to_array(G, deltaG, C1, C2, M) From c80499f4c0b553cf8580651e7fcc3c1fd92d8304 Mon Sep 17 00:00:00 2001 From: clvincen Date: Fri, 24 Nov 2023 12:31:01 +0100 Subject: [PATCH 2/6] init commit - BAPG for GW and FGW --- README.md | 4 + ot/gromov/_bregman.py | 575 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 579 insertions(+) diff --git a/README.md b/README.md index 84b3cf0ee..939cc6158 100644 --- a/README.md +++ b/README.md @@ -343,3 +343,7 @@ distances between Gaussian distributions](https://hal.science/hal-03197398v2/fil [61] Charlier, B., Feydy, J., Glaunes, J. A., Collin, F. D., & Durif, G. (2021). [Kernel operations on the gpu, with autodiff, without memory overflows](https://www.jmlr.org/papers/volume22/20-275/20-275.pdf). The Journal of Machine Learning Research, 22(1), 3457-3462. [62] H. Van Assel, C. Vincent-Cuaz, T. Vayer, R. Flamary, N. Courty (2023). [Interpolating between Clustering and Dimensionality Reduction with Gromov-Wasserstein](https://arxiv.org/pdf/2310.03398.pdf). NeurIPS 2023 Workshop Optimal Transport and Machine Learning. + +[63] Li, J., Tang, J., Kong, L., Liu, H., Li, J., So, A. M. C., & Blanchet, J. (2022). [A Convergent Single-Loop Algorithm for Relaxation of Gromov-Wasserstein in Graph Data](https://openreview.net/pdf?id=0jxPyVWmiiF). In The Eleventh International Conference on Learning Representations. + +[64] Ma, X., Chu, X., Wang, Y., Lin, Y., Zhao, J., Ma, L., & Zhu, W. (2023). [Fused Gromov-Wasserstein Graph Mixup for Graph-level Classifications](https://openreview.net/pdf?id=uqkUguNu40). In Thirty-seventh Conference on Neural Information Processing Systems. diff --git a/ot/gromov/_bregman.py b/ot/gromov/_bregman.py index 3539428d5..5dcdf0165 100644 --- a/ot/gromov/_bregman.py +++ b/ot/gromov/_bregman.py @@ -343,6 +343,281 @@ def entropic_gromov_wasserstein2( return logv['gw_dist'] +def entropic_BAPG_gromov_wasserstein( + C1, C2, p=None, q=None, loss_fun='square_loss', epsilon=0.1, + symmetric=None, G0=None, max_iter=1000, tol=1e-9, marginal_loss=False, + verbose=False, log=False): + r""" + Returns the Gromov-Wasserstein transport between :math:`(\mathbf{C_1}, \mathbf{p})` and :math:`(\mathbf{C_2}, \mathbf{q})` + estimated using Bregman Alternated Projected Gradient method. 
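+
+    Each iteration performs a multiplicative mirror-descent (KL) step followed
+    by a rescaling onto one marginal. A minimal NumPy sketch of one iteration,
+    where ``df(T)`` denotes the gradient of the quadratic objective (this is a
+    sketch of the update rule, not the backend-agnostic implementation below):
+
+    .. code-block:: python
+
+        T = T * np.exp(-df(T) / epsilon)   # Bregman (KL) gradient step
+        T = (p / T.sum(1))[:, None] * T    # rescale rows to match p
+        T = T * np.exp(-df(T) / epsilon)
+        T = (q / T.sum(0))[None, :] * T    # rescale columns to match q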
+ + The function solves the following Gromov-Wasserstein + optimization problem [63]: + + .. math:: + \mathbf{T}^* \in \mathop{\arg\min}_\mathbf{T} \quad \sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l} + + s.t. \ \mathbf{T} \mathbf{1} &= \mathbf{p} + + \mathbf{T}^T \mathbf{1} &= \mathbf{q} + + \mathbf{T} &\geq 0 + Where : + + - :math:`\mathbf{C_1}`: Metric cost matrix in the source space + - :math:`\mathbf{C_2}`: Metric cost matrix in the target space + - :math:`\mathbf{p}`: distribution in the source space + - :math:`\mathbf{q}`: distribution in the target space + - `L`: loss function to account for the misfit between the similarity matrices + + .. note:: By algorithmic design the optimal coupling :math:`\mathbf{T}` + returned by this function does not necessarily satisfy the marginal + constraints :math:`\mathbf{T}\mathbf{1}=\mathbf{p}` and + :math:`\mathbf{T}^T\mathbf{1}=\mathbf{q}`. So the returned + Gromov-Wasserstein loss does not necessarily satisfy distance + properties and may be negative. + + Parameters + ---------- + C1 : array-like, shape (ns, ns) + Metric cost matrix in the source space + C2 : array-like, shape (nt, nt) + Metric cost matrix in the target space + p : array-like, shape (ns,), optional + Distribution in the source space. + If let to its default value None, uniform distribution is taken. + q : array-like, shape (nt,), optional + Distribution in the target space. + If let to its default value None, uniform distribution is taken. + loss_fun : string, optional (default='square_loss') + Loss function used for the solver either 'square_loss' or 'kl_loss' + epsilon : float, optional + Regularization term >0 + symmetric : bool, optional + Either C1 and C2 are to be assumed symmetric or not. + If let to its default None value, a symmetry test will be conducted. + Else if set to True (resp. False), C1 and C2 will be assumed symmetric (resp. asymmetric). + G0: array-like, shape (ns,nt), optional + If None the initial transport plan of the solver is pq^T. + Otherwise G0 will be used as initial transport of the solver. G0 is not + required to satisfy marginal constraints but we strongly recommend it + to correctly estimate the GW distance. + max_iter : int, optional + Max number of iterations + tol : float, optional + Stop threshold on error (>0) + marginal_loss: bool, optional. Default is False. + Include constant terms or not in the matching objective function. + verbose : bool, optional + Print information along iterations + log : bool, optional + Record log if True. + Returns + ------- + T : array-like, shape (`ns`, `nt`) + Optimal coupling between the two spaces + + References + ---------- + .. [63] Li, J., Tang, J., Kong, L., Liu, H., Li, J., So, A. M. C., & Blanchet, J. + "A Convergent Single-Loop Algorithm for Relaxation of Gromov-Wasserstein + in Graph Data". International Conference on Learning Representations (ICLR), 2022. + + """ + if loss_fun not in ('square_loss', 'kl_loss'): + raise ValueError(f"Unknown `loss_fun='{loss_fun}'`. 
Use one of: {'square_loss', 'kl_loss'}.") + + C1, C2 = list_to_array(C1, C2) + arr = [C1, C2] + if p is not None: + arr.append(list_to_array(p)) + else: + p = unif(C1.shape[0], type_as=C1) + if q is not None: + arr.append(list_to_array(q)) + else: + q = unif(C2.shape[0], type_as=C2) + + if G0 is not None: + arr.append(G0) + + nx = get_backend(*arr) + + if G0 is None: + G0 = nx.outer(p, q) + + T = G0 + constC, hC1, hC2 = init_matrix(C1, C2, p, q, loss_fun, nx) + + if symmetric is None: + symmetric = nx.allclose(C1, C1.T, atol=1e-10) and nx.allclose(C2, C2.T, atol=1e-10) + if not symmetric: + constCt, hC1t, hC2t = init_matrix(C1.T, C2.T, p, q, loss_fun, nx) + + if marginal_loss: + if symmetric: + def df(T): + return gwggrad(constC, hC1, hC2, T, nx) + else: + def df(T): + return 0.5 * (gwggrad(constC, hC1, hC2, T, nx) + gwggrad(constCt, hC1t, hC2t, T, nx)) + + else: + if symmetric: + def df(T): + A = - nx.dot(nx.dot(hC1, T), hC2.T) + return 2 * A + else: + def df(T): + A = - nx.dot(nx.dot(hC1, T), hC2t) + At = - nx.dot(nx.dot(hC1t, T), hC2) + return A + At + + cpt = 0 + err = 1e15 + + if log: + log = {'err': []} + + while (err > tol and cpt < max_iter): + + Tprev = T + + # rows update + T = T * nx.exp(- df(T) / epsilon) + row_scaling = p / nx.sum(T, 1) + T = nx.reshape(row_scaling, (-1, 1)) * T + + # columns update + T = T * nx.exp(- df(T) / epsilon) + column_scaling = q / nx.sum(T, 0) + T = nx.reshape(column_scaling, (1, -1)) * T + + if cpt % 10 == 0: + # we can speed up the process by checking for the error only all + # the 10th iterations + err = nx.norm(T - Tprev) + + if log: + log['err'].append(err) + + if verbose: + if cpt % 200 == 0: + print('{:5s}|{:12s}'.format( + 'It.', 'Err') + '\n' + '-' * 19) + print('{:5d}|{:8e}|'.format(cpt, err)) + + cpt += 1 + + if abs(nx.sum(T) - 1) > 1e-5: + warnings.warn("Solver failed to produce a transport plan. You might " + "want to increase the regularization parameter `epsilon`.") + if log: + log['gw_dist'] = gwloss(constC, hC1, hC2, T, nx) + + if not marginal_loss: + log['loss'] = log['gw_dist'] - nx.sum(constC * T) + + return T, log + else: + return T + + +def entropic_BAPG_gromov_wasserstein2( + C1, C2, p=None, q=None, loss_fun='square_loss', epsilon=0.1, symmetric=None, G0=None, max_iter=1000, + tol=1e-9, marginal_loss=False, verbose=False, log=False): + r""" + Returns the Gromov-Wasserstein loss :math:`\mathbf{GW}` between :math:`(\mathbf{C_1}, \mathbf{p})` and :math:`(\mathbf{C_2}, \mathbf{q})` + estimated using Bregman Alternated Projected Gradient method. + + The function solves the following Gromov-Wasserstein + optimization problem [63]: + + .. math:: + \mathbf{GW} = \mathop{\min}_\mathbf{T} \quad \sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l} + + s.t. \ \mathbf{T} \mathbf{1} &= \mathbf{p} + + \mathbf{T}^T \mathbf{1} &= \mathbf{q} + + \mathbf{T} &\geq 0 + Where : + + - :math:`\mathbf{C_1}`: Metric cost matrix in the source space + - :math:`\mathbf{C_2}`: Metric cost matrix in the target space + - :math:`\mathbf{p}`: distribution in the source space + - :math:`\mathbf{q}`: distribution in the target space + - `L`: loss function to account for the misfit between the similarity matrices + + .. note:: By algorithmic design the optimal coupling :math:`\mathbf{T}` + returned by this function does not necessarily satisfy the marginal + constraints :math:`\mathbf{T}\mathbf{1}=\mathbf{p}` and + :math:`\mathbf{T}^T\mathbf{1}=\mathbf{q}`. 
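+        Indeed, each iteration alternately rescales the rows and then the
+        columns of :math:`\mathbf{T}`, so in general only the marginal
+        enforced last is met at termination.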
So the returned + Gromov-Wasserstein loss does not necessarily satisfy distance + properties and may be negative. + + + Parameters + ---------- + C1 : array-like, shape (ns, ns) + Metric cost matrix in the source space + C2 : array-like, shape (nt, nt) + Metric cost matrix in the target space + p : array-like, shape (ns,), optional + Distribution in the source space. + If let to its default value None, uniform distribution is taken. + q : array-like, shape (nt,), optional + Distribution in the target space. + If let to its default value None, uniform distribution is taken. + loss_fun : string, optional (default='square_loss') + Loss function used for the solver either 'square_loss' or 'kl_loss' + epsilon : float, optional + Regularization term >0 + symmetric : bool, optional + Either C1 and C2 are to be assumed symmetric or not. + If let to its default None value, a symmetry test will be conducted. + Else if set to True (resp. False), C1 and C2 will be assumed symmetric (resp. asymmetric). + G0: array-like, shape (ns,nt), optional + If None the initial transport plan of the solver is pq^T. + Otherwise G0 will be used as initial transport of the solver. G0 is not + required to satisfy marginal constraints but we strongly recommand it + to correcly estimate the GW distance. + max_iter : int, optional + Max number of iterations + tol : float, optional + Stop threshold on error (>0) + marginal_loss: bool, optional. Default is False. + Include constant terms or not in the matching objective function. + verbose : bool, optional + Print information along iterations + log : bool, optional + Record log if True. + + Returns + ------- + gw_dist : float + Gromov-Wasserstein distance + + References + ---------- + .. [63] Li, J., Tang, J., Kong, L., Liu, H., Li, J., So, A. M. C., & Blanchet, J. + "A Convergent Single-Loop Algorithm for Relaxation of Gromov-Wasserstein + in Graph Data". International Conference on Learning Representations (ICLR), 2023. + + """ + + T, logv = entropic_BAPG_gromov_wasserstein( + C1, C2, p, q, loss_fun, epsilon, symmetric, G0, max_iter, + tol, marginal_loss, verbose, log=True) + + logv['T'] = T + + if log: + return logv['gw_dist'], logv + else: + return logv['gw_dist'] + + def entropic_gromov_barycenters( N, Cs, ps=None, p=None, lambdas=None, loss_fun='square_loss', epsilon=0.1, symmetric=True, max_iter=1000, tol=1e-9, @@ -877,6 +1152,306 @@ def entropic_fused_gromov_wasserstein2( return logv['fgw_dist'] +def entropic_BAPG_fused_gromov_wasserstein( + M, C1, C2, p=None, q=None, loss_fun='square_loss', epsilon=0.1, + symmetric=None, alpha=0.5, G0=None, max_iter=1000, tol=1e-9, + marginal_loss=False, verbose=False, log=False): + r""" + Returns the Fused Gromov-Wasserstein transport between :math:`(\mathbf{C_1}, \mathbf{Y_1}, \mathbf{p})` and :math:`(\mathbf{C_2}, \mathbf{Y_2}, \mathbf{q})` + with pairwise distance matrix :math:`\mathbf{M}` between node feature matrices :math:`\mathbf{Y_1}` and :math:`\mathbf{Y_2}`, + estimated using Bregman Alternated Projected Gradient method. + + The function solves the following Fused Gromov-Wasserstein + optimization problem [63, 64]: + + .. math:: + \mathbf{T}^* \in\mathop{\arg\min}_\mathbf{T} \quad (1 - \alpha) \langle \mathbf{T}, \mathbf{M} \rangle_F + + \alpha \sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l} + + s.t. 
\ \mathbf{T} \mathbf{1} &= \mathbf{p} + + \mathbf{T}^T \mathbf{1} &= \mathbf{q} + + \mathbf{T} &\geq 0 + Where : + + - :math:`\mathbf{M}`: metric cost matrix between features across domains + - :math:`\mathbf{C_1}`: Metric cost matrix in the source space + - :math:`\mathbf{C_2}`: Metric cost matrix in the target space + - :math:`\mathbf{p}`: distribution in the source space + - :math:`\mathbf{q}`: distribution in the target space + - `L`: loss function to account for the misfit between the similarity and feature matrices + - :math:`\alpha`: trade-off parameter + + .. note:: By algorithmic design the optimal coupling :math:`\mathbf{T}` + returned by this function does not necessarily satisfy the marginal + constraints :math:`\mathbf{T}\mathbf{1}=\mathbf{p}` and + :math:`\mathbf{T}^T\mathbf{1}=\mathbf{q}`. So the returned Fused + Gromov-Wasserstein loss does not necessarily satisfy distance + properties and may be negative. + + Parameters + ---------- + M : array-like, shape (ns, nt) + Metric cost matrix between features across domains + C1 : array-like, shape (ns, ns) + Metric cost matrix in the source space + C2 : array-like, shape (nt, nt) + Metric cost matrix in the target space + p : array-like, shape (ns,), optional + Distribution in the source space. + If let to its default value None, uniform distribution is taken. + q : array-like, shape (nt,), optional + Distribution in the target space. + If let to its default value None, uniform distribution is taken. + loss_fun : string, optional (default='square_loss') + Loss function used for the solver either 'square_loss' or 'kl_loss' + epsilon : float, optional + Regularization term >0 + symmetric : bool, optional + Either C1 and C2 are to be assumed symmetric or not. + If let to its default None value, a symmetry test will be conducted. + Else if set to True (resp. False), C1 and C2 will be assumed symmetric (resp. asymmetric). + alpha : float, optional + Trade-off parameter (0 < alpha < 1) + G0: array-like, shape (ns,nt), optional + If None the initial transport plan of the solver is pq^T. + Otherwise G0 will be used as initial transport of the solver. G0 is not + required to satisfy marginal constraints but we strongly recommend it + to correctly estimate the GW distance. + max_iter : int, optional + Max number of iterations + tol : float, optional + Stop threshold on error (>0) + marginal_loss: bool, optional. Default is False. + Include constant terms or not in the matching objective function. + verbose : bool, optional + Print information along iterations + log : bool, optional + Record log if True. + Returns + ------- + T : array-like, shape (`ns`, `nt`) + Optimal coupling between the two joint spaces + + References + ---------- + .. [63] Li, J., Tang, J., Kong, L., Liu, H., Li, J., So, A. M. C., & Blanchet, J. + "A Convergent Single-Loop Algorithm for Relaxation of Gromov-Wasserstein + in Graph Data". International Conference on Learning Representations (ICLR), 2023. + + .. [64] Ma, X., Chu, X., Wang, Y., Lin, Y., Zhao, J., Ma, L., & Zhu, W. + "Fused Gromov-Wasserstein Graph Mixup for Graph-level Classifications". + In Thirty-seventh Conference on Neural Information Processing Systems. + """ + if loss_fun not in ('square_loss', 'kl_loss'): + raise ValueError(f"Unknown `loss_fun='{loss_fun}'`. 
Use one of: {'square_loss', 'kl_loss'}.") + + M, C1, C2 = list_to_array(M, C1, C2) + arr = [M, C1, C2] + if p is not None: + arr.append(list_to_array(p)) + else: + p = unif(C1.shape[0], type_as=C1) + if q is not None: + arr.append(list_to_array(q)) + else: + q = unif(C2.shape[0], type_as=C2) + + if G0 is not None: + arr.append(G0) + + nx = get_backend(*arr) + + if G0 is None: + G0 = nx.outer(p, q) + + T = G0 + constC, hC1, hC2 = init_matrix(C1, C2, p, q, loss_fun, nx) + if symmetric is None: + symmetric = nx.allclose(C1, C1.T, atol=1e-10) and nx.allclose(C2, C2.T, atol=1e-10) + if not symmetric: + constCt, hC1t, hC2t = init_matrix(C1.T, C2.T, p, q, loss_fun, nx) + + # Define gradients + if marginal_loss: + if symmetric: + def df(T): + return alpha * gwggrad(constC, hC1, hC2, T, nx) + (1 - alpha) * M + else: + def df(T): + return (alpha * 0.5) * (gwggrad(constC, hC1, hC2, T, nx) + gwggrad(constCt, hC1t, hC2t, T, nx)) + (1 - alpha) * M + + else: + if symmetric: + def df(T): + A = - nx.dot(nx.dot(hC1, T), hC2.T) + return 2 * alpha * A + (1 - alpha) * M + else: + def df(T): + A = - nx.dot(nx.dot(hC1, T), hC2t) + At = - nx.dot(nx.dot(hC1t, T), hC2) + return alpha * (A + At) + (1 - alpha) * M + cpt = 0 + err = 1e15 + + if log: + log = {'err': []} + + while (err > tol and cpt < max_iter): + + Tprev = T + + # rows update + T = T * nx.exp(- df(T) / epsilon) + row_scaling = p / nx.sum(T, 1) + T = nx.reshape(row_scaling, (-1, 1)) * T + + # columns update + T = T * nx.exp(- df(T) / epsilon) + column_scaling = q / nx.sum(T, 0) + T = nx.reshape(column_scaling, (1, -1)) * T + + if cpt % 10 == 0: + # we can speed up the process by checking for the error only all + # the 10th iterations + err = nx.norm(T - Tprev) + + if log: + log['err'].append(err) + + if verbose: + if cpt % 200 == 0: + print('{:5s}|{:12s}'.format( + 'It.', 'Err') + '\n' + '-' * 19) + print('{:5d}|{:8e}|'.format(cpt, err)) + + cpt += 1 + + if abs(nx.sum(T) - 1) > 1e-5: + warnings.warn("Solver failed to produce a transport plan. You might " + "want to increase the regularization parameter `epsilon`.") + if log: + log['fgw_dist'] = (1 - alpha) * nx.sum(M * T) + alpha * gwloss(constC, hC1, hC2, T, nx) + + if not marginal_loss: + log['loss'] = log['fgw_dist'] - alpha * nx.sum(constC * T) + + return T, log + else: + return T + + +def entropic_BAPG_fused_gromov_wasserstein2( + M, C1, C2, p=None, q=None, loss_fun='square_loss', epsilon=0.1, + symmetric=None, alpha=0.5, G0=None, max_iter=1000, tol=1e-9, + marginal_loss=False, verbose=False, log=False): + r""" + Returns the Fused Gromov-Wasserstein loss between :math:`(\mathbf{C_1}, \mathbf{Y_1}, \mathbf{p})` and :math:`(\mathbf{C_2}, \mathbf{Y_2}, \mathbf{q})` + with pairwise distance matrix :math:`\mathbf{M}` between node feature matrices :math:`\mathbf{Y_1}` and :math:`\mathbf{Y_2}`, + estimated using Bregman Alternated Projected Gradient method. + + The function solves the following Fused Gromov-Wasserstein + optimization problem [63, 64]: + + .. math:: + \mathbf{FGW} = \mathop{\min}_\mathbf{T} \quad (1 - \alpha) \langle \mathbf{T}, \mathbf{M} \rangle_F + + \alpha \sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l} + + s.t. 
\ \mathbf{T} \mathbf{1} &= \mathbf{p} + + \mathbf{T}^T \mathbf{1} &= \mathbf{q} + + \mathbf{T} &\geq 0 + Where : + + - :math:`\mathbf{M}`: metric cost matrix between features across domains + - :math:`\mathbf{C_1}`: Metric cost matrix in the source space + - :math:`\mathbf{C_2}`: Metric cost matrix in the target space + - :math:`\mathbf{p}`: distribution in the source space + - :math:`\mathbf{q}`: distribution in the target space + - `L`: loss function to account for the misfit between the similarity and feature matrices + - :math:`\alpha`: trade-off parameter + + .. note:: By algorithmic design the optimal coupling :math:`\mathbf{T}` + returned by this function does not necessarily satisfy the marginal + constraints :math:`\mathbf{T}\mathbf{1}=\mathbf{p}` and + :math:`\mathbf{T}^T\mathbf{1}=\mathbf{q}`. So the returned Fused + Gromov-Wasserstein loss does not necessarily satisfy distance + properties and may be negative. + + Parameters + ---------- + M : array-like, shape (ns, nt) + Metric cost matrix between features across domains + C1 : array-like, shape (ns, ns) + Metric cost matrix in the source space + C2 : array-like, shape (nt, nt) + Metric cost matrix in the target space + p : array-like, shape (ns,), optional + Distribution in the source space. + If let to its default value None, uniform distribution is taken. + q : array-like, shape (nt,), optional + Distribution in the target space. + If let to its default value None, uniform distribution is taken. + loss_fun : string, optional (default='square_loss') + Loss function used for the solver either 'square_loss' or 'kl_loss' + epsilon : float, optional + Regularization term >0 + symmetric : bool, optional + Either C1 and C2 are to be assumed symmetric or not. + If let to its default None value, a symmetry test will be conducted. + Else if set to True (resp. False), C1 and C2 will be assumed symmetric (resp. asymmetric). + alpha : float, optional + Trade-off parameter (0 < alpha < 1) + G0: array-like, shape (ns,nt), optional + If None the initial transport plan of the solver is pq^T. + Otherwise G0 will be used as initial transport of the solver. G0 is not + required to satisfy marginal constraints but we strongly recommend it + to correctly estimate the GW distance. + max_iter : int, optional + Max number of iterations + tol : float, optional + Stop threshold on error (>0) + marginal_loss: bool, optional. Default is False. + Include constant terms or not in the matching objective function. + verbose : bool, optional + Print information along iterations + log : bool, optional + Record log if True. + Returns + ------- + T : array-like, shape (`ns`, `nt`) + Optimal coupling between the two joint spaces + + References + ---------- + .. [63] Li, J., Tang, J., Kong, L., Liu, H., Li, J., So, A. M. C., & Blanchet, J. + "A Convergent Single-Loop Algorithm for Relaxation of Gromov-Wasserstein + in Graph Data". International Conference on Learning Representations (ICLR), 2023. + + .. [64] Ma, X., Chu, X., Wang, Y., Lin, Y., Zhao, J., Ma, L., & Zhu, W. + "Fused Gromov-Wasserstein Graph Mixup for Graph-level Classifications". + In Thirty-seventh Conference on Neural Information Processing Systems. 
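+
+    Examples
+    --------
+    A minimal usage sketch on random symmetric structure matrices and a
+    random feature cost matrix; the marginals default to uniform when `p`
+    and `q` are left to None:
+
+    >>> import numpy as np
+    >>> import ot
+    >>> rng = np.random.RandomState(0)
+    >>> C1 = rng.rand(5, 5); C1 = (C1 + C1.T) / 2
+    >>> C2 = rng.rand(4, 4); C2 = (C2 + C2.T) / 2
+    >>> M = rng.rand(5, 4)
+    >>> fgw_dist = ot.gromov.entropic_BAPG_fused_gromov_wasserstein2(
+    ...     M, C1, C2, alpha=0.5, epsilon=1.)  # doctest: +SKIP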
+ """ + nx = get_backend(M, C1, C2) + + T, logv = entropic_BAPG_fused_gromov_wasserstein( + M, C1, C2, p, q, loss_fun, epsilon, symmetric, alpha, G0, max_iter, + tol, marginal_loss, verbose, log=True) + + logv['T'] = T + + lin_term = nx.sum(T * M) + logv['quad_loss'] = (logv['fgw_dist'] - (1 - alpha) * lin_term) + logv['lin_loss'] = lin_term * (1 - alpha) + + if log: + return logv['fgw_dist'], logv + else: + return logv['fgw_dist'] + + def entropic_fused_gromov_barycenters( N, Ys, Cs, ps=None, p=None, lambdas=None, loss_fun='square_loss', epsilon=0.1, symmetric=True, alpha=0.5, max_iter=1000, tol=1e-9, From 6381c6369d0340b5c5a37a34d58c4407c19c7217 Mon Sep 17 00:00:00 2001 From: clvincen Date: Fri, 24 Nov 2023 18:19:00 +0100 Subject: [PATCH 3/6] add tests --- RELEASES.md | 1 + ot/gromov/__init__.py | 8 +- ot/gromov/_bregman.py | 10 +- test/test_gromov.py | 220 ++++++++++++++++++++++++++++++++++++++---- 4 files changed, 216 insertions(+), 23 deletions(-) diff --git a/RELEASES.md b/RELEASES.md index 349c56214..11429ec26 100644 --- a/RELEASES.md +++ b/RELEASES.md @@ -20,6 +20,7 @@ + Wrapper for `geomloss`` solver on empirical samples (PR #571) + Add `stop_criterion` feature to (un)regularized (f)gw barycenter solvers (PR #578) + Add `fixed_structure` and `fixed_features` to entropic fgw barycenter solver (PR #578) ++ Add new entropic BAPG solvers for GW and FGW (PR #581) #### Closed issues - Fix line search evaluating cost outside of the interpolation range (Issue #502, PR #504) diff --git a/ot/gromov/__init__.py b/ot/gromov/__init__.py index e39d906cf..63223e961 100644 --- a/ot/gromov/__init__.py +++ b/ot/gromov/__init__.py @@ -20,9 +20,13 @@ from ._bregman import (entropic_gromov_wasserstein, entropic_gromov_wasserstein2, + entropic_BAPG_gromov_wasserstein, + entropic_BAPG_gromov_wasserstein2, entropic_gromov_barycenters, entropic_fused_gromov_wasserstein, entropic_fused_gromov_wasserstein2, + entropic_BAPG_fused_gromov_wasserstein, + entropic_BAPG_fused_gromov_wasserstein2, entropic_fused_gromov_barycenters) from ._estimators import (GW_distance_estimation, pointwise_gromov_wasserstein, @@ -49,8 +53,10 @@ 'gromov_wasserstein', 'gromov_wasserstein2', 'fused_gromov_wasserstein', 'fused_gromov_wasserstein2', 'solve_gromov_linesearch', 'gromov_barycenters', 'fgw_barycenters', 'entropic_gromov_wasserstein', 'entropic_gromov_wasserstein2', + 'entropic_BAPG_gromov_wasserstein', 'entropic_BAPG_gromov_wasserstein2', 'entropic_gromov_barycenters', 'entropic_fused_gromov_wasserstein', - 'entropic_fused_gromov_wasserstein2', 'entropic_fused_gromov_barycenters', + 'entropic_fused_gromov_wasserstein2', 'entropic_BAPG_fused_gromov_wasserstein', + 'entropic_BAPG_fused_gromov_wasserstein2', 'entropic_fused_gromov_barycenters', 'GW_distance_estimation', 'pointwise_gromov_wasserstein', 'sampled_gromov_wasserstein', 'semirelaxed_gromov_wasserstein', 'semirelaxed_gromov_wasserstein2', 'semirelaxed_fused_gromov_wasserstein', 'semirelaxed_fused_gromov_wasserstein2', diff --git a/ot/gromov/_bregman.py b/ot/gromov/_bregman.py index 5dcdf0165..bb3ba5627 100644 --- a/ot/gromov/_bregman.py +++ b/ot/gromov/_bregman.py @@ -509,9 +509,10 @@ def df(T): cpt += 1 - if abs(nx.sum(T) - 1) > 1e-5: + if nx.any(nx.isnan(T)): warnings.warn("Solver failed to produce a transport plan. 
You might " - "want to increase the regularization parameter `epsilon`.") + "want to increase the regularization parameter `epsilon`.", + UserWarning) if log: log['gw_dist'] = gwloss(constC, hC1, hC2, T, nx) @@ -1328,9 +1329,10 @@ def df(T): cpt += 1 - if abs(nx.sum(T) - 1) > 1e-5: + if nx.any(nx.isnan(T)): warnings.warn("Solver failed to produce a transport plan. You might " - "want to increase the regularization parameter `epsilon`.") + "want to increase the regularization parameter `epsilon`.", + UserWarning) if log: log['fgw_dist'] = (1 - alpha) * nx.sum(M * T) + alpha * gwloss(constC, hC1, hC2, T, nx) diff --git a/test/test_gromov.py b/test/test_gromov.py index 3158f9dc9..c156154ed 100644 --- a/test/test_gromov.py +++ b/test/test_gromov.py @@ -570,20 +570,108 @@ def test_entropic_gromov_dtype_device(nx): C1b, C2b, pb, qb = nx.from_numpy(C1, C2, p, q, type_as=tp) - for solver in ['PGD', 'PPA']: - Gb = ot.gromov.entropic_gromov_wasserstein( - C1b, C2b, pb, qb, 'square_loss', epsilon=1e-1, max_iter=5, - solver=solver, verbose=True - ) - gw_valb = ot.gromov.entropic_gromov_wasserstein2( - C1b, C2b, pb, qb, 'square_loss', epsilon=1e-1, max_iter=5, - solver=solver, verbose=True - ) + for solver in ['PGD', 'PPA', 'BAPG']: + if solver == 'BAPG': + Gb = ot.gromov.entropic_BAPG_gromov_wasserstein( + C1b, C2b, pb, qb, max_iter=2, verbose=True) + gw_valb = ot.gromov.entropic_BAPG_gromov_wasserstein2( + C1b, C2b, pb, qb, max_iter=2, verbose=True) + else: + Gb = ot.gromov.entropic_gromov_wasserstein( + C1b, C2b, pb, qb, max_iter=2, solver=solver, verbose=True) + gw_valb = ot.gromov.entropic_gromov_wasserstein2( + C1b, C2b, pb, qb, max_iter=2, solver=solver, verbose=True) nx.assert_same_dtype_device(C1b, Gb) nx.assert_same_dtype_device(C1b, gw_valb) +def test_entropic_BAPG_gromov(nx): + n_samples = 10 # nb samples + + mu_s = np.array([0, 0]) + cov_s = np.array([[1, 0], [0, 1]]) + + xs = ot.datasets.make_2D_samples_gauss(n_samples, mu_s, cov_s, random_state=42) + + xt = xs[::-1].copy() + + p = ot.unif(n_samples) + q = ot.unif(n_samples) + G0 = p[:, None] * q[None, :] + C1 = ot.dist(xs, xs) + C2 = ot.dist(xt, xt) + + C1 /= C1.max() + C2 /= C2.max() + + C1b, C2b, pb, qb, G0b = nx.from_numpy(C1, C2, p, q, G0) + + # complete test with marginal loss = True + marginal_loss = True + with pytest.raises(ValueError): + loss_fun = 'weird_loss_fun' + G, log = ot.gromov.entropic_BAPG_gromov_wasserstein( + C1, C2, None, q, loss_fun, symmetric=None, G0=G0, + epsilon=1e-1, max_iter=10, marginal_loss=marginal_loss, + verbose=True, log=True) + + G, log = ot.gromov.entropic_BAPG_gromov_wasserstein( + C1, C2, None, q, 'square_loss', symmetric=None, G0=G0, + epsilon=1e-1, max_iter=10, marginal_loss=marginal_loss, + verbose=True, log=True) + Gb = nx.to_numpy(ot.gromov.entropic_BAPG_gromov_wasserstein( + C1b, C2b, pb, None, 'square_loss', symmetric=True, G0=None, + epsilon=1e-1, max_iter=10, marginal_loss=marginal_loss, verbose=True, + log=False + )) + + # check constraints + np.testing.assert_allclose(G, Gb, atol=1e-06) + np.testing.assert_allclose( + p, Gb.sum(1), atol=1e-02) # cf convergence gromov + np.testing.assert_allclose( + q, Gb.sum(0), atol=1e-02) # cf convergence gromov + + with pytest.warns(UserWarning): + + gw = ot.gromov.entropic_BAPG_gromov_wasserstein2( + C1, C2, p, q, 'kl_loss', symmetric=False, G0=None, + max_iter=10, epsilon=1e-2, marginal_loss=marginal_loss, log=False) + + gw, log = ot.gromov.entropic_BAPG_gromov_wasserstein2( + C1, C2, p, q, 'kl_loss', symmetric=False, G0=None, + max_iter=10, 
epsilon=1., marginal_loss=marginal_loss, log=True) + gwb, logb = ot.gromov.entropic_BAPG_gromov_wasserstein2( + C1b, C2b, pb, qb, 'kl_loss', symmetric=None, G0=G0b, + max_iter=10, epsilon=1., marginal_loss=marginal_loss, log=True) + gwb = nx.to_numpy(gwb) + + G = log['T'] + Gb = nx.to_numpy(logb['T']) + + np.testing.assert_allclose(gw, gwb, atol=1e-06) + np.testing.assert_allclose(gw, 0, atol=1e-1, rtol=1e-1) + + # check constraints + np.testing.assert_allclose(G, Gb, atol=1e-06) + np.testing.assert_allclose( + p, Gb.sum(1), atol=1e-02) # cf convergence gromov + np.testing.assert_allclose( + q, Gb.sum(0), atol=1e-02) # cf convergence gromov + + marginal_loss = False + G, log = ot.gromov.entropic_BAPG_gromov_wasserstein( + C1, C2, None, q, 'square_loss', symmetric=None, G0=G0, + epsilon=1e-1, max_iter=10, marginal_loss=marginal_loss, + verbose=True, log=True) + Gb = nx.to_numpy(ot.gromov.entropic_BAPG_gromov_wasserstein( + C1b, C2b, pb, None, 'square_loss', symmetric=False, G0=None, + epsilon=1e-1, max_iter=10, marginal_loss=marginal_loss, verbose=True, + log=False + )) + + @pytest.skip_backend("tf", reason="test very slow with tf backend") def test_entropic_fgw(nx): n_samples = 5 # nb samples @@ -722,6 +810,99 @@ def test_entropic_proximal_fgw(nx): q, Gb.sum(0), atol=1e-04) # cf convergence gromov +def test_entropic_BAPG_fgw(nx): + n_samples = 5 # nb samples + + mu_s = np.array([0, 0]) + cov_s = np.array([[1, 0], [0, 1]]) + + xs = ot.datasets.make_2D_samples_gauss(n_samples, mu_s, cov_s, random_state=42) + + xt = xs[::-1].copy() + + rng = np.random.RandomState(42) + ys = rng.randn(xs.shape[0], 2) + yt = ys[::-1].copy() + + p = ot.unif(n_samples) + q = ot.unif(n_samples) + G0 = p[:, None] * q[None, :] + + C1 = ot.dist(xs, xs) + C2 = ot.dist(xt, xt) + + C1 /= C1.max() + C2 /= C2.max() + + M = ot.dist(ys, yt) + + Mb, C1b, C2b, pb, qb, G0b = nx.from_numpy(M, C1, C2, p, q, G0) + + with pytest.raises(ValueError): + loss_fun = 'weird_loss_fun' + G, log = ot.gromov.entropic_BAPG_fused_gromov_wasserstein( + M, C1, C2, p, q, loss_fun=loss_fun, max_iter=1, log=True) + + # complete test with marginal loss = True + marginal_loss = True + + G, log = ot.gromov.entropic_BAPG_fused_gromov_wasserstein( + M, C1, C2, p, q, 'square_loss', symmetric=None, G0=G0, + epsilon=1e-1, max_iter=10, marginal_loss=marginal_loss, log=True) + Gb = nx.to_numpy(ot.gromov.entropic_BAPG_fused_gromov_wasserstein( + Mb, C1b, C2b, pb, qb, 'square_loss', symmetric=True, G0=None, + epsilon=1e-1, max_iter=10, marginal_loss=marginal_loss, verbose=True)) + + # check constraints + np.testing.assert_allclose(G, Gb, atol=1e-06) + np.testing.assert_allclose( + p, Gb.sum(1), atol=1e-02) # cf convergence gromov + np.testing.assert_allclose( + q, Gb.sum(0), atol=1e-02) # cf convergence gromov + + with pytest.warns(UserWarning): + + fgw = ot.gromov.entropic_BAPG_fused_gromov_wasserstein2( + M, C1, C2, p, q, 'kl_loss', symmetric=False, G0=None, + max_iter=10, epsilon=1e-3, marginal_loss=marginal_loss, log=False) + + fgw, log = ot.gromov.entropic_BAPG_fused_gromov_wasserstein2( + M, C1, C2, p, None, 'kl_loss', symmetric=True, G0=None, + max_iter=5, epsilon=1, marginal_loss=marginal_loss, log=True) + fgwb, logb = ot.gromov.entropic_BAPG_fused_gromov_wasserstein2( + Mb, C1b, C2b, None, qb, 'kl_loss', symmetric=None, G0=G0b, + max_iter=5, epsilon=1, marginal_loss=marginal_loss, log=True) + fgwb = nx.to_numpy(fgwb) + + G = log['T'] + Gb = nx.to_numpy(logb['T']) + + np.testing.assert_allclose(fgw, fgwb, atol=1e-06) + np.testing.assert_allclose(fgw, 
0, atol=1e-1, rtol=1e-1) + + # check constraints + np.testing.assert_allclose(G, Gb, atol=1e-06) + np.testing.assert_allclose( + p, Gb.sum(1), atol=1e-02) # cf convergence gromov + np.testing.assert_allclose( + q, Gb.sum(0), atol=1e-02) # cf convergence gromov + + # Tests with marginal_loss = False + marginal_loss = False + G, log = ot.gromov.entropic_BAPG_fused_gromov_wasserstein( + M, C1, C2, p, q, 'square_loss', symmetric=False, G0=G0, + epsilon=1e-1, max_iter=10, marginal_loss=marginal_loss, log=True) + Gb = nx.to_numpy(ot.gromov.entropic_BAPG_fused_gromov_wasserstein( + Mb, C1b, C2b, pb, qb, 'square_loss', symmetric=None, G0=None, + epsilon=1e-1, max_iter=10, marginal_loss=marginal_loss, verbose=True)) + # check constraints + np.testing.assert_allclose(G, Gb, atol=1e-06) + np.testing.assert_allclose( + p, Gb.sum(1), atol=1e-02) # cf convergence gromov + np.testing.assert_allclose( + q, Gb.sum(0), atol=1e-02) # cf convergence gromov + + def test_asymmetric_entropic_fgw(nx): n_samples = 5 # nb samples rng = np.random.RandomState(0) @@ -797,15 +978,18 @@ def test_entropic_fgw_dtype_device(nx): Mb, C1b, C2b, pb, qb = nx.from_numpy(M, C1, C2, p, q, type_as=tp) - for solver in ['PGD', 'PPA']: - Gb = ot.gromov.entropic_fused_gromov_wasserstein( - Mb, C1b, C2b, pb, qb, 'square_loss', epsilon=0.1, max_iter=5, - solver=solver, verbose=True - ) - fgw_valb = ot.gromov.entropic_fused_gromov_wasserstein2( - Mb, C1b, C2b, pb, qb, 'square_loss', epsilon=0.1, max_iter=5, - solver=solver, verbose=True - ) + for solver in ['PGD', 'PPA', 'BAPG']: + if solver == 'BAPG': + Gb = ot.gromov.entropic_BAPG_fused_gromov_wasserstein( + Mb, C1b, C2b, pb, qb, max_iter=2) + fgw_valb = ot.gromov.entropic_BAPG_fused_gromov_wasserstein2( + Mb, C1b, C2b, pb, qb, max_iter=2) + + else: + Gb = ot.gromov.entropic_fused_gromov_wasserstein( + Mb, C1b, C2b, pb, qb, max_iter=2, solver=solver) + fgw_valb = ot.gromov.entropic_fused_gromov_wasserstein2( + Mb, C1b, C2b, pb, qb, max_iter=2, solver=solver) nx.assert_same_dtype_device(C1b, Gb) nx.assert_same_dtype_device(C1b, fgw_valb) From a0344ba9f3d79e1d1b3ea7e94381ef061cf8bb46 Mon Sep 17 00:00:00 2001 From: clvincen Date: Sun, 26 Nov 2023 14:00:03 +0100 Subject: [PATCH 4/6] update example with fgw solvers comparison --- examples/gromov/plot_fgw_solvers.py | 133 +++++++++++++++++++--------- 1 file changed, 89 insertions(+), 44 deletions(-) diff --git a/examples/gromov/plot_fgw_solvers.py b/examples/gromov/plot_fgw_solvers.py index 5f8a885c9..4a66b5858 100644 --- a/examples/gromov/plot_fgw_solvers.py +++ b/examples/gromov/plot_fgw_solvers.py @@ -5,8 +5,9 @@ ============================== This example illustrates the computation of FGW for attributed graphs -using 3 different solvers to estimate the distance based on Conditional -Gradient [24] or Sinkhorn projections [12, 51]. +using 4 different solvers to estimate the distance based on Conditional +Gradient [24], Sinkhorn projections [12, 51] and alternated Bregman +projections [63, 64]. We generate two graphs following Stochastic Block Models further endowed with node features and compute their FGW matchings. @@ -23,6 +24,16 @@ [51] Xu, H., Luo, D., Zha, H., & Duke, L. C. (2019). "Gromov-wasserstein learning for graph matching and node embedding". In International Conference on Machine Learning (ICML), 2019. + +[63] Li, J., Tang, J., Kong, L., Liu, H., Li, J., So, A. M. C., & Blanchet, J. +"A Convergent Single-Loop Algorithm for Relaxation of Gromov-Wasserstein in +Graph Data". 
International Conference on Learning Representations (ICLR), 2023. + +[64] Ma, X., Chu, X., Wang, Y., Lin, Y., Zhao, J., Ma, L., & Zhu, W. +"Fused Gromov-Wasserstein Graph Mixup for Graph-level Classifications". +In Thirty-seventh Conference on Neural Information Processing Systems +(NeurIPS), 2023. + """ # Author: Cédric Vincent-Cuaz @@ -33,9 +44,12 @@ import numpy as np import matplotlib.pylab as pl -from ot.gromov import fused_gromov_wasserstein, entropic_fused_gromov_wasserstein +from ot.gromov import (fused_gromov_wasserstein, + entropic_fused_gromov_wasserstein, + entropic_BAPG_fused_gromov_wasserstein) import networkx from networkx.generators.community import stochastic_block_model as sbm +from time import time ############################################################################# # @@ -85,34 +99,59 @@ # Conditional Gradient algorithm -fgw0, log0 = fused_gromov_wasserstein( - M, C2, C3, h2, h3, 'square_loss', alpha=alpha, verbose=True, log=True) +print('Conditional Gradient \n') +start_cg = time() +T_cg, log_cg = fused_gromov_wasserstein( + M, C2, C3, h2, h3, 'square_loss', alpha=alpha, tol_rel=1e-9, + verbose=True, log=True) +end_cg = time() +time_cg = 1000 * (end_cg - start_cg) # Proximal Point algorithm with Kullback-Leibler as proximal operator -fgw, log = entropic_fused_gromov_wasserstein( +print('Proximal Point Algorithm \n') +start_ppa = time() +T_ppa, log_ppa = entropic_fused_gromov_wasserstein( M, C2, C3, h2, h3, 'square_loss', alpha=alpha, epsilon=1., solver='PPA', - log=True, verbose=True, warmstart=False, numItermax=10) + tol=1e-9, log=True, verbose=True, warmstart=False, numItermax=10) +end_ppa = time() +time_ppa = 1000 * (end_ppa - start_ppa) # Projected Gradient algorithm with entropic regularization -fgwe, loge = entropic_fused_gromov_wasserstein( +print('Projected Gradient Descent \n') +start_pgd = time() +T_pgd, log_pgd = entropic_fused_gromov_wasserstein( M, C2, C3, h2, h3, 'square_loss', alpha=alpha, epsilon=0.01, solver='PGD', - log=True, verbose=True, warmstart=False, numItermax=10) - -print('Fused Gromov-Wasserstein distance estimated with Conditional Gradient solver: ' + str(log0['fgw_dist'])) -print('Fused Gromov-Wasserstein distance estimated with Proximal Point solver: ' + str(log['fgw_dist'])) -print('Entropic Fused Gromov-Wasserstein distance estimated with Projected Gradient solver: ' + str(loge['fgw_dist'])) + tol=1e-9, log=True, verbose=True, warmstart=False, numItermax=10) +end_pgd = time() +time_pgd = 1000 * (end_pgd - start_pgd) + +# Alternated Bregman Projected Gradient algorithm with Kullback-Leibler as proximal operator +print('Bregman Alternated Projected Gradient \n') +start_bapg = time() +T_bapg, log_bapg = entropic_BAPG_fused_gromov_wasserstein( + M, C2, C3, h2, h3, 'square_loss', alpha=alpha, epsilon=1., + tol=1e-9, marginal_loss=True, verbose=True, log=True) +end_bapg = time() +time_bapg = 1000 * (end_bapg - start_bapg) + +print('Fused Gromov-Wasserstein distance estimated with Conditional Gradient solver: ' + str(log_cg['fgw_dist'])) +print('Fused Gromov-Wasserstein distance estimated with Proximal Point solver: ' + str(log_ppa['fgw_dist'])) +print('Entropic Fused Gromov-Wasserstein distance estimated with Projected Gradient solver: ' + str(log_pgd['fgw_dist'])) +print('Fused Gromov-Wasserstein distance estimated with Projected Gradient solver: ' + str(log_bapg['fgw_dist'])) # compute OT sparsity level -fgw0_sparsity = 100 * (fgw0 == 0.).astype(np.float64).sum() / (N2 * N3) -fgw_sparsity = 100 * (fgw == 0.).astype(np.float64).sum() 
/ (N2 * N3) -fgwe_sparsity = 100 * (fgwe == 0.).astype(np.float64).sum() / (N2 * N3) +T_cg_sparsity = 100 * (T_cg == 0.).astype(np.float64).sum() / (N2 * N3) +T_ppa_sparsity = 100 * (T_ppa == 0.).astype(np.float64).sum() / (N2 * N3) +T_pgd_sparsity = 100 * (T_pgd == 0.).astype(np.float64).sum() / (N2 * N3) +T_bapg_sparsity = 100 * (T_bapg == 0.).astype(np.float64).sum() / (N2 * N3) -# Methods using Sinkhorn projections tend to produce feasibility errors on the +# Methods using Sinkhorn/Bregman projections tend to produce feasibility errors on the # marginal constraints -err0 = np.linalg.norm(fgw0.sum(1) - h2) + np.linalg.norm(fgw0.sum(0) - h3) -err = np.linalg.norm(fgw.sum(1) - h2) + np.linalg.norm(fgw.sum(0) - h3) -erre = np.linalg.norm(fgwe.sum(1) - h2) + np.linalg.norm(fgwe.sum(0) - h3) +err_cg = np.linalg.norm(T_cg.sum(1) - h2) + np.linalg.norm(T_cg.sum(0) - h3) +err_ppa = np.linalg.norm(T_ppa.sum(1) - h2) + np.linalg.norm(T_ppa.sum(0) - h3) +err_pgd = np.linalg.norm(T_pgd.sum(1) - h2) + np.linalg.norm(T_pgd.sum(0) - h3) +err_bapg = np.linalg.norm(T_bapg.sum(1) - h2) + np.linalg.norm(T_bapg.sum(0) - h3) ############################################################################# # @@ -242,46 +281,52 @@ def draw_transp_colored_GW(G1, C1, G2, C2, part_G1, p1, p2, T, seed_G2 = 0 seed_G3 = 4 -pl.figure(2, figsize=(12, 3.5)) +pl.figure(2, figsize=(15, 3.5)) pl.clf() -pl.subplot(131) +pl.subplot(141) pl.axis('off') -pl.axis -pl.title('(CG algo) FGW=%s \n \n OT sparsity = %s \n feasibility error = %s' % ( - np.round(log0['fgw_dist'], 3), str(np.round(fgw0_sparsity, 2)) + ' %', - np.round(err0, 4)), fontsize=fontsize) -p0, q0 = fgw0.sum(1), fgw0.sum(0) # check marginals +pl.title('(CG) FGW=%s\n \n OT sparsity = %s \n marg. error = %s \n runtime = %s' % ( + np.round(log_cg['fgw_dist'], 3), str(np.round(T_cg_sparsity, 2)) + ' %', + np.round(err_cg, 4), str(np.round(time_cg, 2)) + ' ms'), fontsize=fontsize) pos1, pos2 = draw_transp_colored_GW( - weightedG2, C2, weightedG3, C3, part_G2, p1=p0, p2=q0, T=fgw0, - shiftx=1.5, node_size=node_size, seed_G1=seed_G2, seed_G2=seed_G3) + weightedG2, C2, weightedG3, C3, part_G2, p1=T_cg.sum(1), p2=T_cg.sum(0), + T=T_cg, shiftx=1.5, node_size=node_size, seed_G1=seed_G2, seed_G2=seed_G3) -pl.subplot(132) +pl.subplot(142) pl.axis('off') -p, q = fgw.sum(1), fgw.sum(0) # check marginals - -pl.title('(PP algo) FGW=%s\n \n OT sparsity = %s \n feasibility error = %s' % ( - np.round(log['fgw_dist'], 3), str(np.round(fgw_sparsity, 2)) + ' %', - np.round(err, 4)), fontsize=fontsize) +pl.title('(PPA) FGW=%s\n \n OT sparsity = %s \n marg. error = %s \n runtime = %s' % ( + np.round(log_ppa['fgw_dist'], 3), str(np.round(T_ppa_sparsity, 2)) + ' %', + np.round(err_ppa, 4), str(np.round(time_ppa, 2)) + ' ms'), fontsize=fontsize) pos1, pos2 = draw_transp_colored_GW( - weightedG2, C2, weightedG3, C3, part_G2, p1=p, p2=q, T=fgw, - pos1=pos1, pos2=pos2, shiftx=0., node_size=node_size, seed_G1=0, seed_G2=0) + weightedG2, C2, weightedG3, C3, part_G2, p1=T_ppa.sum(1), p2=T_ppa.sum(0), + T=T_ppa, pos1=pos1, pos2=pos2, shiftx=0., node_size=node_size, seed_G1=0, seed_G2=0) -pl.subplot(133) +pl.subplot(143) pl.axis('off') -pe, qe = fgwe.sum(1), fgwe.sum(0) # check marginals +pl.title('(PGD) Entropic FGW=%s\n \n OT sparsity = %s \n marg. 
error = %s \n runtime = %s' % ( + np.round(log_pgd['fgw_dist'], 3), str(np.round(T_pgd_sparsity, 2)) + ' %', + np.round(err_pgd, 4), str(np.round(time_pgd, 2)) + ' ms'), fontsize=fontsize) + +pos1, pos2 = draw_transp_colored_GW( + weightedG2, C2, weightedG3, C3, part_G2, p1=T_pgd.sum(1), p2=T_pgd.sum(0), + T=T_pgd, pos1=pos1, pos2=pos2, shiftx=0., node_size=node_size, seed_G1=0, seed_G2=0) + + +pl.subplot(144) +pl.axis('off') -pl.title('Entropic FGW=%s\n \n OT sparsity = %s \n feasibility error = %s' % ( - np.round(loge['fgw_dist'], 3), str(np.round(fgwe_sparsity, 2)) + ' %', - np.round(erre, 4)), fontsize=fontsize) +pl.title('(BAPG) FGW=%s\n \n OT sparsity = %s \n marg. error = %s \n runtime = %s' % ( + np.round(log_bapg['fgw_dist'], 3), str(np.round(T_bapg_sparsity, 2)) + ' %', + np.round(err_bapg, 4), str(np.round(time_bapg, 2)) + ' ms'), fontsize=fontsize) pos1, pos2 = draw_transp_colored_GW( - weightedG2, C2, weightedG3, C3, part_G2, p1=pe, p2=qe, T=fgwe, - pos1=pos1, pos2=pos2, shiftx=0., node_size=node_size, seed_G1=0, seed_G2=0) + weightedG2, C2, weightedG3, C3, part_G2, p1=T_bapg.sum(1), p2=T_bapg.sum(0), + T=T_bapg, pos1=pos1, pos2=pos2, shiftx=0., node_size=node_size, seed_G1=0, seed_G2=0) pl.tight_layout() From 345f0943d3f7f7a1bf38e272d1cb60fbc569c5d4 Mon Sep 17 00:00:00 2001 From: clvincen Date: Mon, 27 Nov 2023 15:22:40 +0100 Subject: [PATCH 5/6] change BAPG names + improve doc --- examples/gromov/plot_fgw_solvers.py | 4 +- ot/gromov/__init__.py | 14 ++--- ot/gromov/_bregman.py | 98 +++++++++++++++++++++++------ test/test_gromov.py | 44 ++++++------- 4 files changed, 109 insertions(+), 51 deletions(-) diff --git a/examples/gromov/plot_fgw_solvers.py b/examples/gromov/plot_fgw_solvers.py index 4a66b5858..75c12cca0 100644 --- a/examples/gromov/plot_fgw_solvers.py +++ b/examples/gromov/plot_fgw_solvers.py @@ -46,7 +46,7 @@ import matplotlib.pylab as pl from ot.gromov import (fused_gromov_wasserstein, entropic_fused_gromov_wasserstein, - entropic_BAPG_fused_gromov_wasserstein) + BAPG_fused_gromov_wasserstein) import networkx from networkx.generators.community import stochastic_block_model as sbm from time import time @@ -128,7 +128,7 @@ # Alternated Bregman Projected Gradient algorithm with Kullback-Leibler as proximal operator print('Bregman Alternated Projected Gradient \n') start_bapg = time() -T_bapg, log_bapg = entropic_BAPG_fused_gromov_wasserstein( +T_bapg, log_bapg = BAPG_fused_gromov_wasserstein( M, C2, C3, h2, h3, 'square_loss', alpha=alpha, epsilon=1., tol=1e-9, marginal_loss=True, verbose=True, log=True) end_bapg = time() diff --git a/ot/gromov/__init__.py b/ot/gromov/__init__.py index 63223e961..4d77fc57a 100644 --- a/ot/gromov/__init__.py +++ b/ot/gromov/__init__.py @@ -20,13 +20,13 @@ from ._bregman import (entropic_gromov_wasserstein, entropic_gromov_wasserstein2, - entropic_BAPG_gromov_wasserstein, - entropic_BAPG_gromov_wasserstein2, + BAPG_gromov_wasserstein, + BAPG_gromov_wasserstein2, entropic_gromov_barycenters, entropic_fused_gromov_wasserstein, entropic_fused_gromov_wasserstein2, - entropic_BAPG_fused_gromov_wasserstein, - entropic_BAPG_fused_gromov_wasserstein2, + BAPG_fused_gromov_wasserstein, + BAPG_fused_gromov_wasserstein2, entropic_fused_gromov_barycenters) from ._estimators import (GW_distance_estimation, pointwise_gromov_wasserstein, @@ -53,10 +53,10 @@ 'gromov_wasserstein', 'gromov_wasserstein2', 'fused_gromov_wasserstein', 'fused_gromov_wasserstein2', 'solve_gromov_linesearch', 'gromov_barycenters', 'fgw_barycenters', 
'entropic_gromov_wasserstein', 'entropic_gromov_wasserstein2', - 'entropic_BAPG_gromov_wasserstein', 'entropic_BAPG_gromov_wasserstein2', + 'BAPG_gromov_wasserstein', 'BAPG_gromov_wasserstein2', 'entropic_gromov_barycenters', 'entropic_fused_gromov_wasserstein', - 'entropic_fused_gromov_wasserstein2', 'entropic_BAPG_fused_gromov_wasserstein', - 'entropic_BAPG_fused_gromov_wasserstein2', 'entropic_fused_gromov_barycenters', + 'entropic_fused_gromov_wasserstein2', 'BAPG_fused_gromov_wasserstein', + 'BAPG_fused_gromov_wasserstein2', 'entropic_fused_gromov_barycenters', 'GW_distance_estimation', 'pointwise_gromov_wasserstein', 'sampled_gromov_wasserstein', 'semirelaxed_gromov_wasserstein', 'semirelaxed_gromov_wasserstein2', 'semirelaxed_fused_gromov_wasserstein', 'semirelaxed_fused_gromov_wasserstein2', diff --git a/ot/gromov/_bregman.py b/ot/gromov/_bregman.py index bb3ba5627..df4ba0ae3 100644 --- a/ot/gromov/_bregman.py +++ b/ot/gromov/_bregman.py @@ -343,7 +343,7 @@ def entropic_gromov_wasserstein2( return logv['gw_dist'] -def entropic_BAPG_gromov_wasserstein( +def BAPG_gromov_wasserstein( C1, C2, p=None, q=None, loss_fun='square_loss', epsilon=0.1, symmetric=None, G0=None, max_iter=1000, tol=1e-9, marginal_loss=False, verbose=False, log=False): @@ -351,12 +351,25 @@ def entropic_BAPG_gromov_wasserstein( Returns the Gromov-Wasserstein transport between :math:`(\mathbf{C_1}, \mathbf{p})` and :math:`(\mathbf{C_2}, \mathbf{q})` estimated using Bregman Alternated Projected Gradient method. - The function solves the following Gromov-Wasserstein - optimization problem [63]: + If `marginal_loss=True`, the function solves the following Gromov-Wasserstein + optimization problem : .. math:: \mathbf{T}^* \in \mathop{\arg\min}_\mathbf{T} \quad \sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l} + s.t. \ \mathbf{T} \mathbf{1} &= \mathbf{p} + + \mathbf{T}^T \mathbf{1} &= \mathbf{q} + + \mathbf{T} &\geq 0 + + Else, the function solves an equivalent problem [63], where constant terms only + depending on the marginals :math:`\mathbf{p}`: and :math:`\mathbf{q}`: are + discarded while assuming that L decomposes as in Proposition 1 in [12]: + + .. math:: + \mathbf{T}^* \in\mathop{\arg\min}_\mathbf{T} \quad - \langle h_1(\mathbf{C}_1) \mathbf{T} h_2(\mathbf{C_2})^\top , \mathbf{T} \rangle_F + s.t. \ \mathbf{T} \mathbf{1} &= \mathbf{p} \mathbf{T}^T \mathbf{1} &= \mathbf{q} @@ -369,6 +382,7 @@ def entropic_BAPG_gromov_wasserstein( - :math:`\mathbf{p}`: distribution in the source space - :math:`\mathbf{q}`: distribution in the target space - `L`: loss function to account for the misfit between the similarity matrices + satisfying :math:`L(a, b) = f_1(a) + f_2(b) - h_1(a) h_2(b)` .. note:: By algorithmic design the optimal coupling :math:`\mathbf{T}` returned by this function does not necessarily satisfy the marginal @@ -407,7 +421,7 @@ def entropic_BAPG_gromov_wasserstein( tol : float, optional Stop threshold on error (>0) marginal_loss: bool, optional. Default is False. - Include constant terms or not in the matching objective function. + Include constant marginal terms or not in the objective function. 
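+        If False, the solver uses the equivalent formulation above in which
+        these constant terms are discarded.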
verbose : bool, optional Print information along iterations log : bool, optional @@ -524,19 +538,33 @@ def df(T): return T -def entropic_BAPG_gromov_wasserstein2( +def BAPG_gromov_wasserstein2( C1, C2, p=None, q=None, loss_fun='square_loss', epsilon=0.1, symmetric=None, G0=None, max_iter=1000, tol=1e-9, marginal_loss=False, verbose=False, log=False): r""" Returns the Gromov-Wasserstein loss :math:`\mathbf{GW}` between :math:`(\mathbf{C_1}, \mathbf{p})` and :math:`(\mathbf{C_2}, \mathbf{q})` estimated using Bregman Alternated Projected Gradient method. - The function solves the following Gromov-Wasserstein - optimization problem [63]: + If `marginal_loss=True`, the function solves the following Gromov-Wasserstein + optimization problem : + .. math:: \mathbf{GW} = \mathop{\min}_\mathbf{T} \quad \sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l} + s.t. \ \mathbf{T} \mathbf{1} &= \mathbf{p} + + \mathbf{T}^T \mathbf{1} &= \mathbf{q} + + \mathbf{T} &\geq 0 + + Else, the function solves an equivalent problem [63, 64], where constant terms only + depending on the marginals :math:`\mathbf{p}`: and :math:`\mathbf{q}`: are + discarded while assuming that L decomposes as in Proposition 1 in [12]: + + .. math:: + \mathop{\min}_\mathbf{T} \quad - \langle h_1(\mathbf{C}_1) \mathbf{T} h_2(\mathbf{C_2})^\top , \mathbf{T} \rangle_F + s.t. \ \mathbf{T} \mathbf{1} &= \mathbf{p} \mathbf{T}^T \mathbf{1} &= \mathbf{q} @@ -549,6 +577,7 @@ def entropic_BAPG_gromov_wasserstein2( - :math:`\mathbf{p}`: distribution in the source space - :math:`\mathbf{q}`: distribution in the target space - `L`: loss function to account for the misfit between the similarity matrices + satisfying :math:`L(a, b) = f_1(a) + f_2(b) - h_1(a) h_2(b)` .. note:: By algorithmic design the optimal coupling :math:`\mathbf{T}` returned by this function does not necessarily satisfy the marginal @@ -588,7 +617,7 @@ def entropic_BAPG_gromov_wasserstein2( tol : float, optional Stop threshold on error (>0) marginal_loss: bool, optional. Default is False. - Include constant terms or not in the matching objective function. + Include constant marginal terms or not in the objective function. verbose : bool, optional Print information along iterations log : bool, optional @@ -607,7 +636,7 @@ def entropic_BAPG_gromov_wasserstein2( """ - T, logv = entropic_BAPG_gromov_wasserstein( + T, logv = BAPG_gromov_wasserstein( C1, C2, p, q, loss_fun, epsilon, symmetric, G0, max_iter, tol, marginal_loss, verbose, log=True) @@ -1153,7 +1182,7 @@ def entropic_fused_gromov_wasserstein2( return logv['fgw_dist'] -def entropic_BAPG_fused_gromov_wasserstein( +def BAPG_fused_gromov_wasserstein( M, C1, C2, p=None, q=None, loss_fun='square_loss', epsilon=0.1, symmetric=None, alpha=0.5, G0=None, max_iter=1000, tol=1e-9, marginal_loss=False, verbose=False, log=False): @@ -1162,8 +1191,8 @@ def entropic_BAPG_fused_gromov_wasserstein( with pairwise distance matrix :math:`\mathbf{M}` between node feature matrices :math:`\mathbf{Y_1}` and :math:`\mathbf{Y_2}`, estimated using Bregman Alternated Projected Gradient method. - The function solves the following Fused Gromov-Wasserstein - optimization problem [63, 64]: + If `marginal_loss=True`, the function solves the following Fused Gromov-Wasserstein + optimization problem : .. 
@@ -1153,7 +1182,7 @@ def entropic_fused_gromov_wasserstein2(
     return logv['fgw_dist']


-def entropic_BAPG_fused_gromov_wasserstein(
+def BAPG_fused_gromov_wasserstein(
         M, C1, C2, p=None, q=None, loss_fun='square_loss', epsilon=0.1,
         symmetric=None, alpha=0.5, G0=None, max_iter=1000, tol=1e-9,
         marginal_loss=False, verbose=False, log=False):
@@ -1162,8 +1191,8 @@ def entropic_BAPG_fused_gromov_wasserstein(
     with pairwise distance matrix :math:`\mathbf{M}` between node feature matrices
     :math:`\mathbf{Y_1}` and :math:`\mathbf{Y_2}`,
     estimated using Bregman Alternated Projected Gradient method.

-    The function solves the following Fused Gromov-Wasserstein
-    optimization problem [63, 64]:
+    If `marginal_loss=True`, the function solves the following Fused Gromov-Wasserstein
+    optimization problem:

     .. math::
         \mathbf{T}^* \in \mathop{\arg\min}_\mathbf{T} \quad (1 - \alpha) \langle \mathbf{T}, \mathbf{M} \rangle_F +
@@ -1174,14 +1203,29 @@ def entropic_BAPG_fused_gromov_wasserstein(

         \mathbf{T}^T \mathbf{1} &= \mathbf{q}

         \mathbf{T} &\geq 0
+
+    Otherwise, the function solves an equivalent problem [63, 64] in which the constant
+    terms that depend only on the marginals :math:`\mathbf{p}` and :math:`\mathbf{q}`
+    are discarded, assuming that `L` decomposes as in Proposition 1 of [12]:
+
+    .. math::
+        \mathbf{T}^* \in \mathop{\arg\min}_\mathbf{T} \quad (1 - \alpha) \langle \mathbf{T}, \mathbf{M} \rangle_F -
+        \alpha \langle h_1(\mathbf{C_1}) \mathbf{T} h_2(\mathbf{C_2})^\top, \mathbf{T} \rangle_F
+
+        s.t. \ \mathbf{T} \mathbf{1} &= \mathbf{p}
+
+             \mathbf{T}^T \mathbf{1} &= \mathbf{q}
+
+             \mathbf{T} &\geq 0
+
     Where :

-    - :math:`\mathbf{M}`: metric cost matrix between features across domains
+    - :math:`\mathbf{M}`: pairwise relation matrix between features across domains
     - :math:`\mathbf{C_1}`: Metric cost matrix in the source space
     - :math:`\mathbf{C_2}`: Metric cost matrix in the target space
     - :math:`\mathbf{p}`: distribution in the source space
     - :math:`\mathbf{q}`: distribution in the target space
     - `L`: loss function to account for the misfit between the similarity and feature matrices
+      satisfying :math:`L(a, b) = f_1(a) + f_2(b) - h_1(a) h_2(b)`
     - :math:`\alpha`: trade-off parameter

     .. note:: By algorithmic design the optimal coupling :math:`\mathbf{T}`
         returned by this function does not necessarily satisfy the marginal
@@ -1194,7 +1238,7 @@ def entropic_BAPG_fused_gromov_wasserstein(
     Parameters
     ----------
     M : array-like, shape (ns, nt)
-        Metric cost matrix between features across domains
+        Pairwise relation matrix between features across domains
     C1 : array-like, shape (ns, ns)
         Metric cost matrix in the source space
     C2 : array-like, shape (nt, nt)
         Metric cost matrix in the target space
@@ -1225,7 +1269,7 @@ def entropic_BAPG_fused_gromov_wasserstein(
     tol : float, optional
         Stop threshold on error (>0)
     marginal_loss: bool, optional. Default is False.
-        Include constant terms or not in the matching objective function.
+        Include constant marginal terms or not in the objective function.
     verbose : bool, optional
         Print information along iterations
     log : bool, optional
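Mirroring the GW sketch earlier, a minimal fused call might look as follows. Again a sketch under the same assumptions, with the cross-graph matrix `M` built from toy node features and `alpha` chosen arbitrarily:

```python
import numpy as np
import ot

rng = np.random.RandomState(42)
xs, xt = rng.randn(10, 2), rng.randn(8, 2)   # node embeddings (structure)
ys, yt = rng.randn(10, 3), rng.randn(8, 3)   # node features
C1, C2 = ot.dist(xs, xs), ot.dist(xt, xt)    # intra-graph structure matrices
M = ot.dist(ys, yt)                          # feature relation matrix across graphs

# alpha trades off the structure term (alpha) against the feature term (1 - alpha);
# p and q are left as None, which in POT conventions means uniform marginals.
T, log = ot.gromov.BAPG_fused_gromov_wasserstein(
    M, C1, C2, loss_fun='square_loss', epsilon=1e-1, alpha=0.5,
    max_iter=100, marginal_loss=False, log=True)
```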
@@ -1344,7 +1388,7 @@ def df(T):
     return T


-def entropic_BAPG_fused_gromov_wasserstein2(
+def BAPG_fused_gromov_wasserstein2(
         M, C1, C2, p=None, q=None, loss_fun='square_loss', epsilon=0.1,
         symmetric=None, alpha=0.5, G0=None, max_iter=1000, tol=1e-9,
         marginal_loss=False, verbose=False, log=False):
@@ -1353,8 +1397,8 @@ def entropic_BAPG_fused_gromov_wasserstein2(
     with pairwise distance matrix :math:`\mathbf{M}` between node feature matrices
     :math:`\mathbf{Y_1}` and :math:`\mathbf{Y_2}`,
     estimated using Bregman Alternated Projected Gradient method.

-    The function solves the following Fused Gromov-Wasserstein
-    optimization problem [63, 64]:
+    If `marginal_loss=True`, the function solves the following Fused Gromov-Wasserstein
+    optimization problem:

     .. math::
         \mathbf{FGW} = \mathop{\min}_\mathbf{T} \quad (1 - \alpha) \langle \mathbf{T}, \mathbf{M} \rangle_F +
@@ -1365,6 +1409,19 @@ def entropic_BAPG_fused_gromov_wasserstein2(

         \mathbf{T}^T \mathbf{1} &= \mathbf{q}

         \mathbf{T} &\geq 0
+
+    Otherwise, the function solves an equivalent problem [63, 64] in which the constant
+    terms that depend only on the marginals :math:`\mathbf{p}` and :math:`\mathbf{q}`
+    are discarded, assuming that `L` decomposes as in Proposition 1 of [12]:
+
+    .. math::
+        \mathop{\min}_\mathbf{T} \quad (1 - \alpha) \langle \mathbf{T}, \mathbf{M} \rangle_F -
+        \alpha \langle h_1(\mathbf{C_1}) \mathbf{T} h_2(\mathbf{C_2})^\top, \mathbf{T} \rangle_F
+
+        s.t. \ \mathbf{T} \mathbf{1} &= \mathbf{p}
+
+             \mathbf{T}^T \mathbf{1} &= \mathbf{q}
+
+             \mathbf{T} &\geq 0

     Where :

     - :math:`\mathbf{M}`: metric cost matrix between features across domains
@@ -1373,6 +1430,7 @@ def entropic_BAPG_fused_gromov_wasserstein2(
     - :math:`\mathbf{C_1}`: Metric cost matrix in the source space
     - :math:`\mathbf{C_2}`: Metric cost matrix in the target space
     - :math:`\mathbf{p}`: distribution in the source space
     - :math:`\mathbf{q}`: distribution in the target space
     - `L`: loss function to account for the misfit between the similarity and feature matrices
+      satisfying :math:`L(a, b) = f_1(a) + f_2(b) - h_1(a) h_2(b)`
     - :math:`\alpha`: trade-off parameter

     .. note:: By algorithmic design the optimal coupling :math:`\mathbf{T}`
         returned by this function does not necessarily satisfy the marginal
@@ -1416,7 +1474,7 @@ def entropic_BAPG_fused_gromov_wasserstein2(
     tol : float, optional
         Stop threshold on error (>0)
     marginal_loss: bool, optional. Default is False.
-        Include constant terms or not in the matching objective function.
+        Include constant marginal terms or not in the objective function.
     verbose : bool, optional
         Print information along iterations
     log : bool, optional
@@ -1438,7 +1496,7 @@ def entropic_BAPG_fused_gromov_wasserstein2(
     """
     nx = get_backend(M, C1, C2)

-    T, logv = entropic_BAPG_fused_gromov_wasserstein(
+    T, logv = BAPG_fused_gromov_wasserstein(
         M, C1, C2, p, q, loss_fun, epsilon, symmetric, alpha, G0,
         max_iter, tol, marginal_loss, verbose, log=True)
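The note repeated in these docstrings, that the returned coupling need not satisfy the marginal constraints exactly, is worth verifying in practice. A small sanity check, reusing `T`, `p` and `q` from the sketches above:

```python
import numpy as np

# By algorithmic design T may violate the marginal constraints slightly
# (see the docstring notes above); quantify the residual violation.
row_err = np.abs(T.sum(axis=1) - p).max()
col_err = np.abs(T.sum(axis=0) - q).max()
print(f"max marginal violation: rows {row_err:.2e}, cols {col_err:.2e}")
```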
diff --git a/test/test_gromov.py b/test/test_gromov.py
index c156154ed..83d65306b 100644
--- a/test/test_gromov.py
+++ b/test/test_gromov.py
@@ -572,9 +572,9 @@ def test_entropic_gromov_dtype_device(nx):

         for solver in ['PGD', 'PPA', 'BAPG']:
             if solver == 'BAPG':
-                Gb = ot.gromov.entropic_BAPG_gromov_wasserstein(
+                Gb = ot.gromov.BAPG_gromov_wasserstein(
                     C1b, C2b, pb, qb, max_iter=2, verbose=True)
-                gw_valb = ot.gromov.entropic_BAPG_gromov_wasserstein2(
+                gw_valb = ot.gromov.BAPG_gromov_wasserstein2(
                     C1b, C2b, pb, qb, max_iter=2, verbose=True)
             else:
                 Gb = ot.gromov.entropic_gromov_wasserstein(
@@ -586,7 +586,7 @@ def test_entropic_gromov_dtype_device(nx):
         nx.assert_same_dtype_device(C1b, gw_valb)


-def test_entropic_BAPG_gromov(nx):
+def test_BAPG_gromov(nx):
     n_samples = 10  # nb samples

     mu_s = np.array([0, 0])
@@ -611,16 +611,16 @@
     marginal_loss = True
     with pytest.raises(ValueError):
         loss_fun = 'weird_loss_fun'
-        G, log = ot.gromov.entropic_BAPG_gromov_wasserstein(
+        G, log = ot.gromov.BAPG_gromov_wasserstein(
            C1, C2, None, q, loss_fun, symmetric=None, G0=G0,
            epsilon=1e-1, max_iter=10, marginal_loss=marginal_loss,
            verbose=True, log=True)

-    G, log = ot.gromov.entropic_BAPG_gromov_wasserstein(
+    G, log = ot.gromov.BAPG_gromov_wasserstein(
        C1, C2, None, q, 'square_loss', symmetric=None, G0=G0,
        epsilon=1e-1, max_iter=10, marginal_loss=marginal_loss,
        verbose=True, log=True)
-    Gb = nx.to_numpy(ot.gromov.entropic_BAPG_gromov_wasserstein(
+    Gb = nx.to_numpy(ot.gromov.BAPG_gromov_wasserstein(
         C1b, C2b, pb, None, 'square_loss', symmetric=True, G0=None,
         epsilon=1e-1, max_iter=10, marginal_loss=marginal_loss, verbose=True,
         log=False
     ))
@@ -635,14 +635,14 @@

     with pytest.warns(UserWarning):
-        gw = ot.gromov.entropic_BAPG_gromov_wasserstein2(
+        gw = ot.gromov.BAPG_gromov_wasserstein2(
             C1, C2, p, q, 'kl_loss', symmetric=False, G0=None,
             max_iter=10, epsilon=1e-2, marginal_loss=marginal_loss, log=False)

-    gw, log = ot.gromov.entropic_BAPG_gromov_wasserstein2(
+    gw, log = ot.gromov.BAPG_gromov_wasserstein2(
         C1, C2, p, q, 'kl_loss', symmetric=False, G0=None,
         max_iter=10, epsilon=1., marginal_loss=marginal_loss, log=True)
-    gwb, logb = ot.gromov.entropic_BAPG_gromov_wasserstein2(
+    gwb, logb = ot.gromov.BAPG_gromov_wasserstein2(
         C1b, C2b, pb, qb, 'kl_loss', symmetric=None, G0=G0b,
         max_iter=10, epsilon=1., marginal_loss=marginal_loss, log=True)
     gwb = nx.to_numpy(gwb)
@@ -661,11 +661,11 @@
         q, Gb.sum(0), atol=1e-02)  # cf convergence gromov

     marginal_loss = False
-    G, log = ot.gromov.entropic_BAPG_gromov_wasserstein(
+    G, log = ot.gromov.BAPG_gromov_wasserstein(
        C1, C2, None, q, 'square_loss', symmetric=None, G0=G0,
        epsilon=1e-1, max_iter=10, marginal_loss=marginal_loss,
        verbose=True, log=True)
-    Gb = nx.to_numpy(ot.gromov.entropic_BAPG_gromov_wasserstein(
+    Gb = nx.to_numpy(ot.gromov.BAPG_gromov_wasserstein(
         C1b, C2b, pb, None, 'square_loss', symmetric=False, G0=None,
         epsilon=1e-1, max_iter=10, marginal_loss=marginal_loss, verbose=True,
         log=False
@@ -810,7 +810,7 @@ def test_entropic_proximal_fgw(nx):
         q, Gb.sum(0), atol=1e-04)  # cf convergence gromov


-def test_entropic_BAPG_fgw(nx):
+def test_BAPG_fgw(nx):
     n_samples = 5  # nb samples

     mu_s = np.array([0, 0])
@@ -840,16 +840,16 @@
     with pytest.raises(ValueError):
         loss_fun = 'weird_loss_fun'
-        G, log = ot.gromov.entropic_BAPG_fused_gromov_wasserstein(
+        G, log = ot.gromov.BAPG_fused_gromov_wasserstein(
             M, C1, C2, p, q, loss_fun=loss_fun, max_iter=1, log=True)

     # complete test with marginal loss = True
     marginal_loss = True
-    G, log = ot.gromov.entropic_BAPG_fused_gromov_wasserstein(
+    G, log = ot.gromov.BAPG_fused_gromov_wasserstein(
         M, C1, C2, p, q, 'square_loss', symmetric=None, G0=G0,
         epsilon=1e-1, max_iter=10, marginal_loss=marginal_loss, log=True)

-    Gb = nx.to_numpy(ot.gromov.entropic_BAPG_fused_gromov_wasserstein(
+    Gb = nx.to_numpy(ot.gromov.BAPG_fused_gromov_wasserstein(
         Mb, C1b, C2b, pb, qb, 'square_loss', symmetric=True, G0=None,
         epsilon=1e-1, max_iter=10, marginal_loss=marginal_loss,
         verbose=True))
@@ -862,14 +862,14 @@

     with pytest.warns(UserWarning):
-        fgw = ot.gromov.entropic_BAPG_fused_gromov_wasserstein2(
+        fgw = ot.gromov.BAPG_fused_gromov_wasserstein2(
             M, C1, C2, p, q, 'kl_loss', symmetric=False, G0=None,
             max_iter=10, epsilon=1e-3, marginal_loss=marginal_loss, log=False)

-    fgw, log = ot.gromov.entropic_BAPG_fused_gromov_wasserstein2(
+    fgw, log = ot.gromov.BAPG_fused_gromov_wasserstein2(
         M, C1, C2, p, None, 'kl_loss', symmetric=True, G0=None,
         max_iter=5, epsilon=1, marginal_loss=marginal_loss, log=True)
-    fgwb, logb = ot.gromov.entropic_BAPG_fused_gromov_wasserstein2(
+    fgwb, logb = ot.gromov.BAPG_fused_gromov_wasserstein2(
         Mb, C1b, C2b, None, qb, 'kl_loss', symmetric=None, G0=G0b,
         max_iter=5, epsilon=1, marginal_loss=marginal_loss, log=True)
     fgwb = nx.to_numpy(fgwb)
@@ -889,10 +889,10 @@

     # Tests with marginal_loss = False
     marginal_loss = False
-    G, log = ot.gromov.entropic_BAPG_fused_gromov_wasserstein(
+    G, log = ot.gromov.BAPG_fused_gromov_wasserstein(
         M, C1, C2, p, q, 'square_loss', symmetric=False, G0=G0,
         epsilon=1e-1, max_iter=10, marginal_loss=marginal_loss, log=True)
-    Gb = nx.to_numpy(ot.gromov.entropic_BAPG_fused_gromov_wasserstein(
+    Gb = nx.to_numpy(ot.gromov.BAPG_fused_gromov_wasserstein(
         Mb, C1b, C2b, pb, qb, 'square_loss', symmetric=None, G0=None,
         epsilon=1e-1, max_iter=10, marginal_loss=marginal_loss, verbose=True))
     # check constraints
@@ -980,9 +980,9 @@ def test_entropic_fgw_dtype_device(nx):

         for solver in ['PGD', 'PPA', 'BAPG']:
             if solver == 'BAPG':
-                Gb = ot.gromov.entropic_BAPG_fused_gromov_wasserstein(
+                Gb = ot.gromov.BAPG_fused_gromov_wasserstein(
                     Mb, C1b, C2b, pb, qb, max_iter=2)
-                fgw_valb = ot.gromov.entropic_BAPG_fused_gromov_wasserstein2(
+                fgw_valb = ot.gromov.BAPG_fused_gromov_wasserstein2(
                     Mb, C1b, C2b, pb, qb, max_iter=2)
             else:

From 655a95e87c5e60cc76d489966b15f34602a2c28f Mon Sep 17 00:00:00 2001
From: clvincen
Date: Thu, 30 Nov 2023 13:35:07 +0100
Subject: [PATCH 6/6] merge

---
 RELEASES.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/RELEASES.md b/RELEASES.md
index a6c517065..9919076f6 100644
--- a/RELEASES.md
+++ b/RELEASES.md
@@ -20,7 +20,7 @@
 + Wrapper for `geomloss` solver on empirical samples (PR #571)
 + Add `stop_criterion` feature to (un)regularized (f)gw barycenter solvers (PR #578)
 + Add `fixed_structure` and `fixed_features` to entropic fgw barycenter solver (PR #578)
-+ Add new entropic BAPG solvers for GW and FGW (PR #581)
++ Add new BAPG solvers with KL projections for GW and FGW (PR #581)
 + Add Bures-Wasserstein barycenter in `ot.gaussian` (PR #582)