diff --git a/pygmtools/multi_graph_solvers.py b/pygmtools/multi_graph_solvers.py
index 63935b2c..7cc60d53 100644
--- a/pygmtools/multi_graph_solvers.py
+++ b/pygmtools/multi_graph_solvers.py
@@ -655,6 +655,7 @@ def gamgm(A, W,
     .. dropdown:: Numpy Example
 
         ::
+
             >>> import numpy as np
             >>> import pygmtools as pygm
             >>> import itertools
diff --git a/pygmtools/neural_solvers.py b/pygmtools/neural_solvers.py
index 9afe287c..b815d743 100644
--- a/pygmtools/neural_solvers.py
+++ b/pygmtools/neural_solvers.py
@@ -67,6 +67,53 @@ def pca_gm(feat1, feat2, A1, A2, n1=None, n2=None,
     .. note::
         This function also supports non-batched input, by ignoring all batch dimensions in the input tensors.
 
+    .. dropdown:: Numpy Example
+
+        ::
+
+            >>> import numpy as np
+            >>> import pygmtools as pygm
+            >>> pygm.BACKEND = 'numpy'
+            >>> np.random.seed(1)
+
+            # Generate a batch of isomorphic graphs
+            >>> batch_size = 10
+            >>> X_gt = np.zeros((batch_size, 4, 4))
+            >>> X_gt[:, np.arange(0, 4, dtype='i4'), np.random.permutation(4)] = 1
+            >>> A1 = 1. * (np.random.rand(batch_size, 4, 4) > 0.5)
+            >>> for i in np.arange(4): # discard self-loop edges
+            ...    for j in np.arange(batch_size):
+            ...        A1[j][i][i] = 0
+            >>> A2 = np.matmul(np.matmul(X_gt.swapaxes(1, 2), A1), X_gt)
+            >>> feat1 = np.random.rand(batch_size, 4, 1024) - 0.5
+            >>> feat2 = np.matmul(X_gt.swapaxes(1, 2), feat1)
+            >>> n1 = n2 = np.array([4] * batch_size)
+
+            # Match by PCA-GM (load pretrained model)
+            >>> X, net = pygm.pca_gm(feat1, feat2, A1, A2, n1, n2, return_network=True)
+            Downloading to ~/.cache/pygmtools/pca_gm_voc_numpy.npy...
+            >>> (pygm.hungarian(X) * X_gt).sum() / X_gt.sum() # accuracy
+            1.0
+
+            # Pass the net object to avoid rebuilding the model again
+            >>> X = pygm.pca_gm(feat1, feat2, A1, A2, n1, n2, network=net)
+            >>> (pygm.hungarian(X) * X_gt).sum() / X_gt.sum() # accuracy
+            1.0
+
+            # You may also load other pretrained weights
+            >>> X, net = pygm.pca_gm(feat1, feat2, A1, A2, n1, n2, return_network=True, pretrain='willow')
+            Downloading to ~/.cache/pygmtools/pca_gm_willow_numpy.npy...
+            >>> (pygm.hungarian(X) * X_gt).sum() / X_gt.sum() # accuracy
+            1.0
+
+            # You may configure your own model and integrate the model into a deep learning pipeline. For example:
+            >>> net = pygm.utils.get_network(pygm.pca_gm, in_channel=1024, hidden_channel=2048, out_channel=512, num_layers=3, pretrain=False)
+            # feat1/feat2 may be outputs by other neural networks
+            >>> X = pygm.pca_gm(feat1, feat2, A1, A2, n1, n2, network=net)
+            >>> (pygm.hungarian(X) * X_gt).sum() / X_gt.sum() # accuracy
+            1.0
+
+
     .. dropdown:: PyTorch Example
 
         ::
@@ -274,6 +321,53 @@ def ipca_gm(feat1, feat2, A1, A2, n1=None, n2=None,
     .. note::
         This function also supports non-batched input, by ignoring all batch dimensions in the input tensors.
 
+    .. dropdown:: Numpy Example
+
+        ::
+
+            >>> import numpy as np
+            >>> import pygmtools as pygm
+            >>> pygm.BACKEND = 'numpy'
+            >>> np.random.seed(1)
+
+            # Generate a batch of isomorphic graphs
+            >>> batch_size = 10
+            >>> X_gt = np.zeros((batch_size, 4, 4))
+            >>> X_gt[:, np.arange(0, 4, dtype='i4'), np.random.permutation(4)] = 1
+            >>> A1 = 1. * (np.random.rand(batch_size, 4, 4) > 0.5)
+            >>> for i in np.arange(4): # discard self-loop edges
+            ...    for j in np.arange(batch_size):
+            ...        A1[j][i][i] = 0
+            >>> A2 = np.matmul(np.matmul(X_gt.swapaxes(1, 2), A1), X_gt)
+            >>> feat1 = np.random.rand(batch_size, 4, 1024) - 0.5
+            >>> feat2 = np.matmul(X_gt.swapaxes(1, 2), feat1)
+            >>> n1 = n2 = np.array([4] * batch_size)
+
+            # Match by IPCA-GM (load pretrained model)
+            >>> X, net = pygm.ipca_gm(feat1, feat2, A1, A2, n1, n2, return_network=True)
+            Downloading to ~/.cache/pygmtools/ipca_gm_voc_numpy.npy...
+            >>> (pygm.hungarian(X) * X_gt).sum() / X_gt.sum() # accuracy
+            1.0
+
+            # Pass the net object to avoid rebuilding the model again
+            >>> X = pygm.ipca_gm(feat1, feat2, A1, A2, n1, n2, network=net)
+            >>> (pygm.hungarian(X) * X_gt).sum() / X_gt.sum() # accuracy
+            1.0
+
+            # You may also load other pretrained weights
+            >>> X, net = pygm.ipca_gm(feat1, feat2, A1, A2, n1, n2, return_network=True, pretrain='willow')
+            Downloading to ~/.cache/pygmtools/ipca_gm_willow_numpy.npy...
+            >>> (pygm.hungarian(X) * X_gt).sum() / X_gt.sum() # accuracy
+            1.0
+
+            # You may configure your own model and integrate the model into a deep learning pipeline. For example:
+            >>> net = pygm.utils.get_network(pygm.ipca_gm, in_channel=1024, hidden_channel=2048, out_channel=512, num_layers=3, cross_iter=10, pretrain=False)
+            # feat1/feat2 may be outputs by other neural networks
+            >>> X = pygm.ipca_gm(feat1, feat2, A1, A2, n1, n2, network=net)
+            >>> (pygm.hungarian(X) * X_gt).sum() / X_gt.sum() # accuracy
+            1.0
+
+
     .. dropdown:: PyTorch Example
 
         ::
@@ -489,6 +583,55 @@ def cie(feat_node1, feat_node2, A1, A2, feat_edge1, feat_edge2, n1=None, n2=None
     .. note::
         This function also supports non-batched input, by ignoring all batch dimensions in the input tensors.
 
+    .. dropdown:: Numpy Example
+
+        ::
+
+            >>> import numpy as np
+            >>> import pygmtools as pygm
+            >>> pygm.BACKEND = 'numpy'
+            >>> np.random.seed(1)
+
+            # Generate a batch of isomorphic graphs
+            >>> batch_size = 10
+            >>> X_gt = np.zeros((batch_size, 4, 4))
+            >>> X_gt[:, np.arange(0, 4, dtype='i4'), np.random.permutation(4)] = 1
+            >>> A1 = 1. * (np.random.rand(batch_size, 4, 4) > 0.5)
+            >>> for i in np.arange(4): # discard self-loop edges
+            ...    for j in np.arange(batch_size):
+            ...        A1[j][i][i] = 0
+            >>> e_feat1 = np.expand_dims(np.random.rand(batch_size, 4, 4) * A1,axis=-1) # shape: (10, 4, 4, 1)
+            >>> A2 = np.matmul(np.matmul(X_gt.swapaxes(1, 2), A1), X_gt)
+            >>> e_feat2 = np.expand_dims(np.matmul(np.matmul(X_gt.swapaxes(1, 2),np.squeeze(e_feat1,axis=-1)), X_gt),axis=-1)
+            >>> feat1 = np.random.rand(batch_size, 4, 1024) - 0.5
+            >>> feat2 = np.matmul(X_gt.swapaxes(1, 2), feat1)
+            >>> n1 = n2 = np.array([4] * batch_size)
+
+            # Match by CIE (load pretrained model)
+            >>> X, net = pygm.cie(feat1, feat2, A1, A2, e_feat1, e_feat2, n1, n2, return_network=True)
+            Downloading to ~/.cache/pygmtools/cie_voc_numpy.npy...
+            >>> (pygm.hungarian(X) * X_gt).sum() / X_gt.sum() # accuracy
+            1.0
+
+            # Pass the net object to avoid rebuilding the model again
+            >>> X = pygm.cie(feat1, feat2, A1, A2, e_feat1, e_feat2, n1, n2, network=net)
+            >>> (pygm.hungarian(X) * X_gt).sum() / X_gt.sum() # accuracy
+            1.0
+
+            # You may also load other pretrained weights
+            >>> X, net = pygm.cie(feat1, feat2, A1, A2, e_feat1, e_feat2, n1, n2, return_network=True, pretrain='willow')
+            Downloading to ~/.cache/pygmtools/cie_willow_numpy.npy...
+            >>> (pygm.hungarian(X) * X_gt).sum() / X_gt.sum() # accuracy
+            1.0
+
+            # You may configure your own model and integrate the model into a deep learning pipeline. For example:
+            >>> net = pygm.utils.get_network(pygm.cie, in_node_channel=1024, in_edge_channel=1, hidden_channel=2048, out_channel=512, num_layers=3, pretrain=False)
+            # feat1/feat2/e_feat1/e_feat2 may be outputs by other neural networks
+            >>> X = pygm.cie(feat1, feat2, A1, A2, e_feat1, e_feat2, n1, n2, network=net)
+            >>> (pygm.hungarian(X) * X_gt).sum() / X_gt.sum() # accuracy
+            1.0
+            
+
     .. dropdown:: PyTorch Example
 
         ::
@@ -710,6 +853,55 @@ def ngm(K, n1=None, n2=None, n1max=None, n2max=None, x0=None,
     .. note::
         This function also supports non-batched input, by ignoring all batch dimensions in the input tensors.
 
+    .. dropdown:: Numpy Example
+
+        ::
+
+            >>> import numpy as np
+            >>> import pygmtools as pygm
+            >>> pygm.BACKEND = 'numpy'
+            >>> np.random.seed(1)
+
+            # Generate a batch of isomorphic graphs
+            >>> batch_size = 10
+            >>> X_gt = np.zeros((batch_size, 4, 4))
+            >>> X_gt[:, np.arange(0, 4, dtype='i4'), np.random.permutation(4)] = 1
+            >>> A1 = np.random.rand(batch_size, 4, 4)
+            >>> A2 = np.matmul(np.matmul(X_gt.swapaxes(1, 2), A1), X_gt)
+            >>> n1 = n2 = np.array([4] * batch_size)
+
+            # Build affinity matrix
+            >>> conn1, edge1, ne1 = pygm.utils.dense_to_sparse(A1)
+            >>> conn2, edge2, ne2 = pygm.utils.dense_to_sparse(A2)
+            >>> import functools
+            >>> gaussian_aff = functools.partial(pygm.utils.gaussian_aff_fn, sigma=1.) # set affinity function
+            >>> K = pygm.utils.build_aff_mat(None, edge1, conn1, None, edge2, conn2, n1, None, n2, None, edge_aff_fn=gaussian_aff)
+
+            # Solve by NGM
+            >>> X, net = pygm.ngm(K, n1, n2, return_network=True)
+            Downloading to ~/.cache/pygmtools/ngm_voc_numpy.npy...
+            >>> (pygm.hungarian(X) * X_gt).sum() / X_gt.sum() # accuracy
+            1.0
+
+            # Pass the net object to avoid rebuilding the model again
+            >>> X = pygm.ngm(K, n1, n2, network=net)
+            >>> (pygm.hungarian(X) * X_gt).sum() / X_gt.sum() # accuracy
+            1.0
+
+            # You may also load other pretrained weights
+            >>> X, net = pygm.ngm(K, n1, n2, return_network=True, pretrain='willow')
+            Downloading to ~/.cache/pygmtools/ngm_willow_numpy.npy...
+            >>> (pygm.hungarian(X) * X_gt).sum() / X_gt.sum() # accuracy
+            1.0
+
+            # You may configure your own model and integrate the model into a deep learning pipeline. For example:
+            >>> net = pygm.utils.get_network(pygm.ngm, gnn_channels=(32, 64, 128, 64, 32), sk_emb=8, pretrain=False)
+            # K may be outputs by other neural networks (constructed K from node/edge features by pygm.utils.build_aff_mat)
+            >>> X = pygm.ngm(K, n1, n2, network=net)
+            >>> (pygm.hungarian(X) * X_gt).sum() / X_gt.sum() # accuracy
+            1.0
+
+
     .. dropdown:: PyTorch Example
 
         ::
diff --git a/pygmtools/numpy_backend.py b/pygmtools/numpy_backend.py
index e48d0a19..298d61cb 100644
--- a/pygmtools/numpy_backend.py
+++ b/pygmtools/numpy_backend.py
@@ -796,7 +796,379 @@ def gamgm_real(
 #          Neural Network Solvers          #
 ############################################
 
+from pygmtools.numpy_modules import *
 
+def add_module(self, name: str, module) -> None:  # NOTE(review): module-level function that takes ``self``; looks like a leftover from a class port -- confirm it is still needed
+        self._modules[name] = module  # store ``module`` under ``name`` in the ``_modules`` mapping of ``self``
+
+class PCA_GM_Net():
+    """
+    Numpy implementation of PCA-GM and IPCA-GM network (``forward`` takes the IPCA-GM path when ``cross_iter_num > 0``)
+    """
+    def __init__(self, in_channel, hidden_channel, out_channel, num_layers, cross_iter_num=-1):
+        self.gnn_layer = num_layers  # number of GNN layers
+        self.dict = {}  # sub-modules keyed by 'gnn_layer_{i}', 'affinity_{i}' and 'cross_graph_{i}'
+        for i in range(self.gnn_layer):
+            if i == 0:
+                gnn_layer = Siamese_Gconv(in_channel, hidden_channel)
+            elif 0 < i < self.gnn_layer - 1:
+                gnn_layer = Siamese_Gconv(hidden_channel, hidden_channel)
+            else:
+                gnn_layer = Siamese_Gconv(hidden_channel, out_channel)
+                self.dict['affinity_{}'.format(i)] =  WeightedInnerProdAffinity(out_channel)  # affinity used on the final embeddings
+            self.dict['gnn_layer_{}'.format(i)] = gnn_layer
+            if i == self.gnn_layer - 2:  # only the second last layer will have cross-graph module
+                self.dict['cross_graph_{}'.format(i)] = Linear(hidden_channel * 2, hidden_channel)
+                if cross_iter_num <= 0:  # the intermediate affinity is only read by the vanilla PCA-GM branch of forward
+                    self.dict['affinity_{}'.format(i)] = WeightedInnerProdAffinity(hidden_channel)
+
+    def forward(self, feat1, feat2, A1, A2, n1, n2, cross_iter_num, sk_max_iter, sk_tau):
+        _sinkhorn_func = functools.partial(sinkhorn,
+                                           dummy_row=False, max_iter=sk_max_iter, tau=sk_tau, batched_operation=False)
+        emb1, emb2 = feat1, feat2
+        if cross_iter_num <= 0:
+            # Vanilla PCA-GM
+            for i in range(self.gnn_layer):
+                gnn_layer = self.dict['gnn_layer_{}'.format(i)]
+                emb1, emb2 = gnn_layer.forward([A1, emb1], [A2, emb2])
+                if i == self.gnn_layer - 2:
+                    affinity = self.dict['affinity_{}'.format(i)]
+                    s = affinity.forward(emb1, emb2)
+                    s = _sinkhorn_func(s, n1, n2)
+
+                    cross_graph = self.dict['cross_graph_{}'.format(i)]
+                    new_emb1 = cross_graph.forward(np.concatenate((emb1, np.matmul(s, emb2)), axis=-1))  # own embedding + s-weighted embedding of the other graph
+                    new_emb2 = cross_graph.forward(np.concatenate((emb2, np.matmul(s.swapaxes(1, 2), emb1)), axis=-1))
+                    emb1 = new_emb1  # both updates are computed from the old embeddings before either is overwritten
+                    emb2 = new_emb2
+
+            affinity = self.dict['affinity_{}'.format(self.gnn_layer - 1)]
+            s = affinity.forward(emb1, emb2)
+            s = _sinkhorn_func(s, n1, n2)
+
+        else:
+            # IPCA-GM
+            for i in range(self.gnn_layer - 1):
+                gnn_layer = self.dict['gnn_layer_{}'.format(i)]
+                emb1, emb2 = gnn_layer.forward([A1, emb1], [A2, emb2])
+
+            emb1_0, emb2_0 = emb1, emb2  # embeddings before the cross-graph step; reused unchanged in every iteration
+            s = np.zeros((emb1.shape[0], emb1.shape[1], emb2.shape[1]))  # all-zero matching, so the first iteration adds no cross-graph term
+
+            for x in range(cross_iter_num):
+                # cross-graph convolution in second last layer
+                i = self.gnn_layer - 2
+                cross_graph = self.dict['cross_graph_{}'.format(i)]
+                emb1 = cross_graph.forward(np.concatenate((emb1_0, np.matmul(s, emb2_0)), axis=-1))
+                emb2 = cross_graph.forward(np.concatenate((emb2_0, np.matmul(s.swapaxes(1, 2), emb1_0)), axis=-1))
+
+                # last layer
+                i = self.gnn_layer - 1
+                gnn_layer = self.dict['gnn_layer_{}'.format(i)]
+                emb1, emb2 = gnn_layer.forward([A1, emb1], [A2, emb2])
+                affinity = self.dict['affinity_{}'.format(i)]
+                s = affinity.forward(emb1, emb2)
+                s = _sinkhorn_func(s, n1, n2)  # this s feeds the next iteration's cross-graph step
+
+        return s
+
+
+pca_gm_pretrain_path = {  # pretrain tag -> (download URL, MD5 string), unpacked as ``url, md5`` in pca_gm
+    'voc':('https://drive.google.com/u/0/uc?export=download&confirm=Z-AR&id=1En_9f5Zi5rSsS-JTIce7B1BV6ijGEAPd',
+           'd85f97498157d723793b8fc1501841ce'),
+    'willow':('https://drive.google.com/u/0/uc?export=download&confirm=Z-AR&id=1LAnK6ASYu0CO1fEe6WpvMbt5vskuvwLo',
+              'c32f7c8a7a6978619b8fdbb6ad5b505f'),
+    'voc-all':('https://drive.google.com/u/0/uc?export=download&confirm=Z-AR&id=1c_aw4wxEBuY7JFC4Rt8rlcise777n189',
+               '0e2725b3ac51f87f0303bbcfaae5df80')
+}
+
+def pca_gm(feat1, feat2, A1, A2, n1, n2,
+           in_channel, hidden_channel, out_channel, num_layers, sk_max_iter, sk_tau,
+           network, pretrain):
+    """
+    Numpy implementation of PCA-GM.
+
+    Builds a ``PCA_GM_Net`` when ``network`` is None (loading the weights tagged ``pretrain`` if it is truthy)
+    and runs the forward pass unless ``feat1`` is None; ``n1``/``n2`` default to the node dimension of the features.
+
+    :return: ``(result, network)``; ``result`` is None when ``feat1`` is None
+    :raises ValueError: if ``pretrain`` is truthy but not a key of ``pca_gm_pretrain_path``
+    """
+    if network is None:
+        network = PCA_GM_Net(in_channel, hidden_channel, out_channel, num_layers)
+        if pretrain:
+            if pretrain not in pca_gm_pretrain_path:
+                raise ValueError(f'Unknown pretrain tag. Available tags: {pca_gm_pretrain_path.keys()}')
+            url, md5 = pca_gm_pretrain_path[pretrain]
+            filename = pygmtools.utils.download(f'pca_gm_{pretrain}_numpy.npy', url, md5)
+            # NOTE(review): allow_pickle=True unpickles the file; only safe for trusted weight files.
+            weights = np.load(filename, allow_pickle=True).item()  # unwrap the stored dict once
+            for i in range(network.gnn_layer):
+                gconv = network.dict['gnn_layer_{}'.format(i)].gconv
+                gconv.a_fc.weight = weights['gnn_layer_{}.gconv.a_fc.weight'.format(i)]
+                gconv.a_fc.bias = weights['gnn_layer_{}.gconv.a_fc.bias'.format(i)]
+                gconv.u_fc.weight = weights['gnn_layer_{}.gconv.u_fc.weight'.format(i)]
+                gconv.u_fc.bias = weights['gnn_layer_{}.gconv.u_fc.bias'.format(i)]
+                if i == network.gnn_layer - 2:  # the second last layer also owns an affinity and a cross-graph module
+                    network.dict['affinity_{}'.format(i)].A = weights['affinity_{}.A'.format(i)]
+                    cross_graph = network.dict['cross_graph_{}'.format(i)]
+                    cross_graph.weight = weights['cross_graph_{}.weight'.format(i)]
+                    cross_graph.bias = weights['cross_graph_{}.bias'.format(i)]
+            last = network.gnn_layer - 1
+            network.dict['affinity_{}'.format(last)].A = weights['affinity_{}.A'.format(last)]
+    if feat1 is None:  # no input features: only the (possibly pretrained) network is returned
+        return None, network
+    batch_size = feat1.shape[0]
+    if n1 is None:
+        n1 = np.array([feat1.shape[1]] * batch_size)
+    if n2 is None:
+        n2 = np.array([feat2.shape[1]] * batch_size)
+    result = network.forward(feat1, feat2, A1, A2, n1, n2, -1, sk_max_iter, sk_tau)  # -1 selects the vanilla PCA-GM path
+    return result, network
+
+ipca_gm_pretrain_path = {  # pretrain tag -> (download URL, MD5 string), unpacked as ``url, md5`` in ipca_gm
+    'voc':('https://drive.google.com/u/0/uc?export=download&confirm=Z-AR&id=13g9iBjXZ804bKo6p8wMQe8yNUZBwVGJj',
+           '4479a25558780a4b4c9891b4386659cd'),
+    'willow':('https://drive.google.com/u/0/uc?export=download&confirm=Z-AR&id=1vq0FqjPhiSR80cu9jk0qMljkC4gSFvQA',
+              'ada1df350d45cc877f08e12919993345')
+}
+
+def ipca_gm(feat1, feat2, A1, A2, n1, n2,
+            in_channel, hidden_channel, out_channel, num_layers, cross_iter, sk_max_iter, sk_tau,
+            network, pretrain):
+    """
+    Numpy implementation of IPCA-GM.
+
+    Builds a ``PCA_GM_Net`` when ``network`` is None (loading the weights tagged ``pretrain`` if it is truthy)
+    and runs the forward pass unless ``feat1`` is None; ``n1``/``n2`` default to the node dimension of the features.
+
+    :param cross_iter: number of cross-graph iterations, passed to both ``PCA_GM_Net`` and its ``forward``
+    :param network: network to reuse; when given, nothing is built or downloaded
+    :param pretrain: key of ``ipca_gm_pretrain_path``, or a falsy value to skip loading weights
+    :return: ``(result, network)``; ``result`` is None when ``feat1`` is None
+    :raises ValueError: if ``pretrain`` is truthy but not a key of ``ipca_gm_pretrain_path``
+    """
+    if network is None:
+        network = PCA_GM_Net(in_channel, hidden_channel, out_channel, num_layers, cross_iter)
+        if pretrain:
+            if pretrain not in ipca_gm_pretrain_path:
+                raise ValueError(f'Unknown pretrain tag. Available tags: {ipca_gm_pretrain_path.keys()}')
+            url, md5 = ipca_gm_pretrain_path[pretrain]
+            filename = pygmtools.utils.download(f'ipca_gm_{pretrain}_numpy.npy', url, md5)
+            # NOTE(review): allow_pickle=True unpickles the file; only safe for trusted weight files.
+            weights = np.load(filename, allow_pickle=True).item()  # unwrap the stored dict once
+
+            last = network.gnn_layer - 1
+            # The last GNN layer, the cross-graph module and the final affinity are restored only when
+            # cross_iter > 0 (they used to be re-assigned once per iteration; a single assignment is equivalent).
+            layers_to_load = range(last + 1) if cross_iter > 0 else range(last)
+            for i in layers_to_load:
+                gconv = network.dict['gnn_layer_{}'.format(i)].gconv
+                gconv.a_fc.weight = weights['gnn_layer_{}.gconv.a_fc.weight'.format(i)]
+                gconv.a_fc.bias = weights['gnn_layer_{}.gconv.a_fc.bias'.format(i)]
+                gconv.u_fc.weight = weights['gnn_layer_{}.gconv.u_fc.weight'.format(i)]
+                gconv.u_fc.bias = weights['gnn_layer_{}.gconv.u_fc.bias'.format(i)]
+            # cross-graph module of the second last layer and affinity of the last layer
+            if cross_iter > 0:
+                cross_graph = network.dict['cross_graph_{}'.format(last - 1)]
+                cross_graph.weight = weights['cross_graph_{}.weight'.format(last - 1)]
+                cross_graph.bias = weights['cross_graph_{}.bias'.format(last - 1)]
+                network.dict['affinity_{}'.format(last)].A = weights['affinity_{}.A'.format(last)]
+
+    if feat1 is None:  # no input features: only the (possibly pretrained) network is returned
+        return None, network
+    batch_size = feat1.shape[0]
+    if n1 is None:
+        n1 = np.array([feat1.shape[1]] * batch_size)
+    if n2 is None:
+        n2 = np.array([feat2.shape[1]] * batch_size)
+    result = network.forward(feat1, feat2, A1, A2, n1, n2, cross_iter, sk_max_iter, sk_tau)
+    return result, network
+
+
+class CIE_Net():
+    """
+    Numpy implementation of CIE graph matching network (each layer updates node and edge embeddings together)
+    """
+    def __init__(self, in_node_channel, in_edge_channel, hidden_channel, out_channel, num_layers):
+        self.gnn_layer = num_layers  # number of GNN layers
+        self.dict = {}  # sub-modules keyed by 'gnn_layer_{i}', 'affinity_{i}' and 'cross_graph_{i}'
+        for i in range(self.gnn_layer):
+            if i == 0:
+                gnn_layer = Siamese_ChannelIndependentConv(in_node_channel, hidden_channel, in_edge_channel)
+            elif 0 < i < self.gnn_layer - 1:
+                gnn_layer = Siamese_ChannelIndependentConv(hidden_channel, hidden_channel, hidden_channel)
+            else:
+                gnn_layer = Siamese_ChannelIndependentConv(hidden_channel, out_channel, hidden_channel)
+                self.dict['affinity_{}'.format(i)] = WeightedInnerProdAffinity(out_channel)  # affinity used on the final embeddings
+            self.dict['gnn_layer_{}'.format(i)] = gnn_layer
+            if i == self.gnn_layer - 2:  # only the second last layer will have cross-graph module
+                self.dict['cross_graph_{}'.format(i)] = Linear(hidden_channel * 2, hidden_channel)
+                self.dict['affinity_{}'.format(i)] = WeightedInnerProdAffinity(hidden_channel)  # intermediate affinity feeding the cross-graph step
+
+    def forward(self, feat_node1, feat_node2, A1, A2, feat_edge1, feat_edge2, n1, n2, sk_max_iter, sk_tau):
+        _sinkhorn_func = functools.partial(sinkhorn,
+                                           dummy_row=False, max_iter=sk_max_iter, tau=sk_tau, batched_operation=False)
+        emb1, emb2 = feat_node1, feat_node2
+        emb_edge1, emb_edge2 = feat_edge1, feat_edge2
+        
+        for i in range(self.gnn_layer):
+            gnn_layer = self.dict['gnn_layer_{}'.format(i)]
+            # during forward process, the network structure will not change
+            emb1, emb2, emb_edge1, emb_edge2 = gnn_layer.forward([A1, emb1, emb_edge1], [A2, emb2, emb_edge2])
+            
+            if i == self.gnn_layer - 2:
+                affinity = self.dict['affinity_{}'.format(i)]
+                s = affinity.forward(emb1, emb2)
+                s = _sinkhorn_func(s, n1, n2)
+
+                cross_graph = self.dict['cross_graph_{}'.format(i)]
+                new_emb1 = cross_graph.forward(np.concatenate((emb1, np.matmul(s, emb2)), axis=-1))  # own embedding + s-weighted embedding of the other graph
+                new_emb2 = cross_graph.forward(np.concatenate((emb2, np.matmul(s.swapaxes(1, 2), emb1)), axis=-1))
+                emb1 = new_emb1  # both updates are computed from the old embeddings before either is overwritten
+                emb2 = new_emb2
+        
+        affinity = self.dict['affinity_{}'.format(self.gnn_layer - 1)]
+        s = affinity.forward(emb1, emb2)
+        s = _sinkhorn_func(s, n1, n2)
+        return s
+
+cie_pretrain_path = {  # pretrain tag -> (download URL, MD5 string), unpacked as ``url, md5`` in cie
+    'voc':('https://drive.google.com/u/0/uc?export=download&confirm=Z-AR&id=1rP9sJY1fh493LLMWw-7RaeFAMHlbSs2D',
+           '9cbd55fa77d124b95052378643715bae'),
+    'willow':('https://drive.google.com/u/0/uc?export=download&confirm=Z-AR&id=1cMiXrSQjXZ9lDxeB6194z1-luyslVTR8',
+              'bd36e1bf314503c1f1482794e1648b18')
+}
+
+def cie(feat_node1, feat_node2, A1, A2, feat_edge1, feat_edge2, n1, n2,
+        in_node_channel, in_edge_channel, hidden_channel, out_channel, num_layers, sk_max_iter, sk_tau,
+        network, pretrain):
+    """
+    Numpy implementation of CIE.
+
+    Builds a ``CIE_Net`` when ``network`` is None (loading the weights tagged ``pretrain`` if it is truthy)
+    and runs the forward pass unless ``feat_node1`` is None. ``n1``/``n2`` default to the node dimension
+    of ``feat_node1``/``feat_node2`` respectively.
+
+    :return: ``(result, network)``; ``result`` is None when ``feat_node1`` is None
+    :raises ValueError: if ``pretrain`` is truthy but not a key of ``cie_pretrain_path``
+    """
+    if network is None:
+        network = CIE_Net(in_node_channel, in_edge_channel, hidden_channel, out_channel, num_layers)
+        if pretrain:
+            if pretrain not in cie_pretrain_path:
+                raise ValueError(f'Unknown pretrain tag. Available tags: {cie_pretrain_path.keys()}')
+            url, md5 = cie_pretrain_path[pretrain]
+            filename = pygmtools.utils.download(f'cie_{pretrain}_numpy.npy', url, md5)
+            # NOTE(review): allow_pickle=True unpickles the file; only safe for trusted weight files.
+            weights = np.load(filename, allow_pickle=True).item()  # unwrap the stored dict once
+            for i in range(network.gnn_layer):
+                gconv = network.dict['gnn_layer_{}'.format(i)].gconv
+                gconv.node_fc.weight = weights['gnn_layer_{}.gconv.node_fc.weight'.format(i)]
+                gconv.node_fc.bias = weights['gnn_layer_{}.gconv.node_fc.bias'.format(i)]
+                gconv.node_sfc.weight = weights['gnn_layer_{}.gconv.node_sfc.weight'.format(i)]
+                gconv.node_sfc.bias = weights['gnn_layer_{}.gconv.node_sfc.bias'.format(i)]
+                gconv.edge_fc.weight = weights['gnn_layer_{}.gconv.edge_fc.weight'.format(i)]
+                gconv.edge_fc.bias = weights['gnn_layer_{}.gconv.edge_fc.bias'.format(i)]
+                if i == network.gnn_layer - 2:  # the second last layer also owns an affinity and a cross-graph module
+                    network.dict['affinity_{}'.format(i)].A = weights['affinity_{}.A'.format(i)]
+                    cross_graph = network.dict['cross_graph_{}'.format(i)]
+                    cross_graph.weight = weights['cross_graph_{}.weight'.format(i)]
+                    cross_graph.bias = weights['cross_graph_{}.bias'.format(i)]
+            last = network.gnn_layer - 1
+            network.dict['affinity_{}'.format(last)].A = weights['affinity_{}.A'.format(last)]
+    if feat_node1 is None:  # no input features: only the (possibly pretrained) network is returned
+        return None, network
+    batch_size = feat_node1.shape[0]
+    if n1 is None:
+        n1 = np.array([feat_node1.shape[1]] * batch_size)
+    if n2 is None:
+        n2 = np.array([feat_node2.shape[1]] * batch_size)  # size of the second graph, not the first
+    result = network.forward(feat_node1, feat_node2, A1, A2, feat_edge1, feat_edge2, n1, n2, sk_max_iter, sk_tau)
+    return result, network
+
+
+class NGM_Net():
+    """
+    Numpy implementation of NGM network (a stack of ``NGMConvLayer`` followed by a ``Linear`` classifier and Sinkhorn)
+    """
+    def __init__(self, gnn_channels, sk_emb):
+        self.gnn_layer = len(gnn_channels)  # one NGMConvLayer per entry of gnn_channels
+        self.dict = {}  # layers keyed by 'gnn_layer_{i}'
+        for i in range(self.gnn_layer):
+            if i == 0:
+                gnn_layer = NGMConvLayer(1, 1,
+                                         gnn_channels[i] + sk_emb, gnn_channels[i],
+                                         sk_channel=sk_emb, edge_emb=False)
+            else:
+                gnn_layer = NGMConvLayer(gnn_channels[i - 1] + sk_emb, gnn_channels[i - 1],
+                                         gnn_channels[i] + sk_emb, gnn_channels[i],
+                                         sk_channel=sk_emb, edge_emb=False)
+            self.dict['gnn_layer_{}'.format(i)] = gnn_layer
+        self.classifier = Linear(gnn_channels[-1] + sk_emb, 1)  # maps the last layer's embedding to one score
+
+    def forward(self, K, n1, n2, n1max, n2max, v0, sk_max_iter, sk_tau):
+        _sinkhorn_func = functools.partial(sinkhorn,
+                                           dummy_row=False, max_iter=sk_max_iter, tau=sk_tau, batched_operation=False)
+        emb = v0
+        A = (K != 0)  # boolean mask of the non-zero entries of K
+        emb_K = np.expand_dims(K,axis=-1)  # K with a trailing axis of size 1
+
+        # NGM qap solver
+        for i in range(self.gnn_layer):
+            gnn_layer = self.dict['gnn_layer_{}'.format(i)]
+            emb_K, emb = gnn_layer.forward(A, emb_K, emb, n1, n2, sk_func=_sinkhorn_func)
+        v = self.classifier.forward(emb)
+        
+        s = v.reshape(v.shape[0], n2max, -1).swapaxes(1, 2)  # reshape to (batch, n2max, -1), then swap the last two axes
+        
+        return _sinkhorn_func(s, n1, n2, dummy_row=True)  # the keyword overrides the dummy_row=False bound in the partial
+
+ngm_pretrain_path = {  # pretrain tag -> (download URL, MD5 string), unpacked as ``url, md5`` in ngm
+    'voc':('https://drive.google.com/u/0/uc?export=download&confirm=Z-AR&id=1LY93fLCjH5vDcWsjZxGPmXmrYMF8HZIR',
+           '19cd48afab71b3277d2062624934702c'),
+    'willow':('https://drive.google.com/u/0/uc?export=download&confirm=Z-AR&id=1iD8FHqahRsVV_H6o3ByB6nwBHU8sEgnt',
+              '31968e30c399845f34d80733d0118b8b')
+}
+
+def ngm(K, n1, n2, n1max, n2max, x0, gnn_channels, sk_emb, sk_max_iter, sk_tau, network, return_network, pretrain):
+    """
+    Numpy implementation of NGM.
+
+    Builds an ``NGM_Net`` when ``network`` is None (loading the weights tagged ``pretrain`` if it is truthy)
+    and runs the forward pass unless ``K`` is None. ``return_network`` is not read here: both values are always returned.
+
+    :return: ``(result, network)``; ``result`` is None when ``K`` is None
+    :raises ValueError: if ``pretrain`` is truthy but not a key of ``ngm_pretrain_path``
+    """
+    if network is None:
+        network = NGM_Net(gnn_channels, sk_emb)
+        if pretrain:
+            if pretrain not in ngm_pretrain_path:
+                raise ValueError(f'Unknown pretrain tag. Available tags: {ngm_pretrain_path.keys()}')
+            url, md5 = ngm_pretrain_path[pretrain]
+            filename = pygmtools.utils.download(f'ngm_{pretrain}_numpy.npy', url, md5)
+            # NOTE(review): allow_pickle=True unpickles the file; only safe for trusted weight files.
+            weights = np.load(filename, allow_pickle=True).item()  # unwrap the stored dict once
+            for i in range(network.gnn_layer):
+                prefix = 'gnn_layer_{}'.format(i)
+                gnn_layer = network.dict[prefix]
+                gnn_layer.classifier.weight = weights[prefix + '.classifier.weight']
+                gnn_layer.classifier.bias = weights[prefix + '.classifier.bias']
+                for func_name in ('n_func', 'n_self_func'):
+                    func = getattr(gnn_layer, func_name)
+                    for j in (0, 2):  # the checkpoint stores parameters for entries 0 and 2 of each function
+                        func.getitem(j).weight = weights['{}.{}.{}.weight'.format(prefix, func_name, j)]
+                        func.getitem(j).bias = weights['{}.{}.{}.bias'.format(prefix, func_name, j)]
+            network.classifier.weight = weights['classifier.weight']
+            network.classifier.bias = weights['classifier.bias']
+    if K is None:  # no affinity matrix: only the (possibly pretrained) network is returned
+        return None, network
+    _, n1, n2, n1max, n2max, _, v0 = _check_and_init_gm(K, n1, n2, n1max, n2max, x0)
+    # rescale the initial vector so that its mean is 1
+    v0 = v0 / np.mean(v0)
+    result = network.forward(K, n1, n2, n1max, n2max, v0, sk_max_iter, sk_tau)
+    return result, network
 #############################################
 #              Utils Functions              #
 #############################################
@@ -913,40 +1285,6 @@ def generate_isomorphic_graphs(node_num, graph_num, node_feat_dim=0):
     else:
         return np.stack(As,axis=0), X_gt
 
-
-"""
-def permutation_loss(pred_dsmat:np.ndarray, gt_perm: np.ndarray, n1: np.ndarray, n2:np.ndarray) -> np.ndarray:
-
-    #Numpy implementation of permutation_loss
-
-    batch_num = pred_dsmat.shape[0]
-
-    pred_dsmat = pred_dsmat.to(dtype='f')
-
-    if not np.all((pred_dsmat >= 0) * (pred_dsmat <= 1)):
-        raise ValueError("pred_dsmat contains invalid numerical entries.")
-    if not np.all((gt_perm >= 0) * (gt_perm <= 1)):
-        raise ValueError("gt_perm contains invalid numerical entries.")
-
-    if n1 is None:
-        n1 = np.array([pred_dsmat.shape[1] for _ in range(batch_num)])
-    if n2 is None:
-        n2 = np.array([pred_dsmat.shape[2] for _ in range(batch_num)])
-
-    loss = np.array(0.)
-    n_sum = np.zeros_like(loss)
-    for b in range(batch_num):
-        batch_slice = [b, slice(n1[b]), slice(n2[b])]
-        loss += array.nn.functional.binary_cross_entropy(
-            pred_dsmat[batch_slice],
-            gt_perm[batch_slice],
-            reduction='sum')
-        n_sum += n1[b].to(n_sum.dtype).to(pred_dsmat.device)
-
-    return loss / n_sum
-"""
-
-
 def _aff_mat_from_node_edge_aff(node_aff: np.ndarray, edge_aff: np.ndarray, connectivity1: np.ndarray, connectivity2: np.ndarray,
                                 n1, n2, ne1, ne2):
     """
diff --git a/pygmtools/numpy_modules.py b/pygmtools/numpy_modules.py
new file mode 100644
index 00000000..33a81fef
--- /dev/null
+++ b/pygmtools/numpy_modules.py
@@ -0,0 +1,381 @@
+import numpy as np
+import math
+
+############################################
+#            Affinity Modules              #
+############################################
+
+class WeightedInnerProdAffinity():
+    """Weighted inner-product affinity ``X A Y^T`` with a ``d x d`` weight matrix ``A``."""
+    def __init__(self, d):
+        self.d = d
+        # A is the identity perturbed by uniform noise in [-1/sqrt(d), 1/sqrt(d)]
+        bound = 1. / math.sqrt(self.d)
+        noise = np.random.uniform(-bound,bound,[self.d,self.d])
+        self.A = noise + np.eye(self.d)
+
+    def forward(self, X, Y):
+        # X and Y need at least three axes, with axis 2 of size d
+        assert X.shape[2] == Y.shape[2] == self.d
+        return np.matmul(np.matmul(X, self.A), Y.swapaxes(1, 2))
+
+############################################
+#         Graph Convolution Modules        #
+############################################
+def relu(X):
+    # Element-wise max(X, 0) returned as a new array; the caller's array is left untouched
+    return np.maximum(X, 0)
+
+def kaiming_uniform_(array: np.ndarray, a: float = 0, mode: str = 'fan_in', nonlinearity: str = 'leaky_relu'):
+    """Return a new array shaped like ``array``, sampled from U(-bound, bound) (Kaiming uniform).
+
+    ``bound = sqrt(2 / (a^2 + 1)) * sqrt(3 / fan)``; ``mode`` selects ``array.shape[1]`` ('fan_in') or
+    ``array.shape[0]`` ('fan_out') as the fan. ``nonlinearity`` is accepted but not used.
+    Raises ValueError for any other ``mode``. The input array itself is not modified.
+    """
+    fans = {'fan_in': array.shape[1], 'fan_out': array.shape[0]}
+    if mode not in fans:
+        raise ValueError("Unknown mode {}. Possible options: 'fan_in' or 'fan_out'".format(mode))
+    bound = math.sqrt(2/(a*a+1)) * math.sqrt(3/fans[mode])
+    return np.random.uniform(-bound, bound, array.shape)
+
+def uniform_(array,a,b):
+    # Draw a fresh U(a, b) sample with the shape of ``array``; ``array`` itself is not modified
+    return np.random.uniform(a, b, array.shape)
+
+def normalize_abs(array,axis):
+    """Scale ``array`` so that absolute values sum to 1 along ``axis``; all-zero slices stay 0."""
+    # keepdims makes the divisor broadcast for any valid axis (repeat/reshape broke for the first axis)
+    k = np.abs(array).sum(axis, keepdims=True)
+    with np.errstate(divide='ignore', invalid='ignore'):  # 0/0 is expected and mapped to 0 below
+        return np.nan_to_num(array/k)
+
+def expand_as(array,target_arary):
+    """
+    Expand ``array`` to the shape of ``target_arary`` by repeating its singleton axes.
+
+    Leading axes of size 1 are prepended until the ranks agree; then each axis of size 1 is repeated.
+
+    :param array: input array; it must not have more axes than ``target_arary``
+    :param target_arary: array providing the target shape
+    :return: the expanded array (a copy whenever an axis was repeated)
+    :raises ValueError: if ``array`` has more axes than the target, or a non-singleton axis differs
+    """
+    ori_array_shape = array.shape
+    target_shape = target_arary.shape
+    shape_info = "\ninput array's shape:" + str(ori_array_shape)
+    shape_info += "\ntarget array's shape:" + str(target_shape)
+    if len(ori_array_shape) > len(target_shape):
+        raise ValueError("The size of the input array exceeds the target array!" + shape_info)
+    # prepend singleton axes until both arrays have the same rank
+    for _ in range(len(target_shape) - len(ori_array_shape)):
+        array = np.expand_dims(array,axis=0)
+    for k in reversed(range(len(target_shape))):
+        m, n = array.shape[k], target_shape[k]
+        if m == 1:
+            # m == 1 here, so the former float count n/m is just the integer n (repeats are ints)
+            array = array.repeat(n,axis=k)
+        elif m != n:
+            message = "\nThe expanded size of the array (" + str(n) + ") must match the existing size (" + str(m)
+            message += ") at non-singleton dimension " + str(k)
+            raise ValueError(message + shape_info)
+    return array
+
+class Linear():
+    """Numpy's Linear: ``y = x W^T + b``; ``bias`` is ``None`` when constructed with ``bias=False``."""
+    __constants__ = ['in_features', 'out_features']
+    in_features: int
+    out_features: int
+    weight: np.ndarray
+
+    def __init__(self, in_features: int, out_features: int, bias: bool = True) -> None:
+        self.in_features = in_features
+        self.out_features = out_features
+        self.weight = np.empty((out_features, in_features), dtype='f')
+        # always define the attribute, otherwise every ``self.bias`` access fails without a bias
+        self.bias = np.empty(out_features, dtype='f') if bias else None
+        self.reset_parameters()
+
+    def reset_parameters(self) -> None:
+        self.weight = kaiming_uniform_(self.weight, a=math.sqrt(5))
+        if self.bias is not None:
+            fan_in = self.weight.shape[1]
+            bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0  # bias ~ U(-1/sqrt(fan_in), 1/sqrt(fan_in))
+            self.bias = uniform_(self.bias, -bound, bound)
+
+    def forward(self, input: np.ndarray) -> np.ndarray:
+        output = np.matmul(input,self.weight.swapaxes(-1,-2))  # x W^T over the last axis
+        return output if self.bias is None else output + self.bias
+
+    def extra_repr(self) -> str:
+        return 'in_features={}, out_features={}, bias={}'.format(
+            self.in_features, self.out_features, self.bias is not None)
+
+class Sequential():
+    """Ordered container of modules keyed 0..n-1; ``forward`` feeds each module's output to the next."""
+    def __init__(self, *args):
+        # keys are consecutive integers in insertion order
+        self._modules = dict(enumerate(args))
+
+    def getitem(self, idx):
+        return self._modules[idx]
+
+    def setitem(self, idx, module):
+        if (idx >= len(self._modules)):
+            raise ValueError("Maximum value exceeded!")
+        self._modules[idx] = module
+
+    def delitem(self, idx):
+        # shift the tail one slot to the left, then drop the now-duplicated last entry
+        for i in range(idx, len(self._modules) - 1):
+            self._modules[i] = self._modules[i + 1]
+        del self._modules[len(self._modules) - 1]
+
+    def len(self):
+        return len(self._modules)
+
+    def append(self, module):
+        # keys are contiguous, so the next key is the current length; this also works when empty
+        self._modules[len(self._modules)] = module
+
+    def forward(self, inputs):
+        for module in self._modules.values():
+            inputs = module.forward(inputs)
+        return inputs
+
+class ReLU():
+    """Element-wise ``max(x, 0)``; writes into the input array only when ``inplace=True``."""
+    def __init__(self, inplace: bool = False):
+        self.inplace = inplace
+
+    def forward(self, input: np.ndarray) -> np.ndarray:
+        # honour the flag: allocate a new array unless in-place operation was requested
+        return np.maximum(input, 0, out=input if self.inplace else None)
+
+    def extra_repr(self) -> str:
+        return 'inplace=True' if self.inplace else ''
+
+
+class Gconv():
+    """Graph convolution layer computing ``A relu(a_fc(x)) + relu(u_fc(x))`` with two linear maps."""
+    def __init__(self, in_features: int, out_features: int):
+        super(Gconv, self).__init__()
+        self.num_inputs = in_features
+        self.num_outputs = out_features
+        self.a_fc = Linear(self.num_inputs, self.num_outputs)  # features aggregated through A
+        self.u_fc = Linear(self.num_inputs, self.num_outputs)  # features kept per node
+
+    def forward(self, A: np.ndarray, x: np.ndarray, norm: bool=True) -> np.ndarray:
+        r"""
+        Forward computation of graph convolution network.
+
+        :param A: :math:`(b\times n\times n)` {0,1} adjacency matrix. :math:`b`: batch size, :math:`n`: number of nodes
+        :param x: :math:`(b\times n\times d)` input node embedding. :math:`d`: feature dimension
+        :param norm: normalize connectivity matrix or not
+        :return: :math:`(b\times n\times d^\prime)` new node embedding
+        """
+        if norm is True:
+            A = normalize_abs(A,axis=-2)
+        aggregated = np.matmul(A, relu(self.a_fc.forward(x)))
+        own = relu(self.u_fc.forward(x))
+        return aggregated + own # has size (bs, N, num_outputs)
+
+class ChannelIndependentConv():
+    r"""
+    Channel Independent Embedding Convolution.
+    Proposed by `"Yu et al. Learning deep graph matching with channel-independent embedding and Hungarian attention.
+    ICLR 2020." <https://openreview.net/forum?id=rJgBd2NYPH>`_
+
+    :param in_features: the dimension of input node features
+    :param out_features: the dimension of output node features
+    :param in_edges: the dimension of input edge features
+    :param out_edges: (optional) the dimension of output edge features. It needs to be the same as ``out_features``
+    """
+
+    def __init__(self, in_features: int, out_features: int, in_edges: int, out_edges: int=None):
+        # without an explicit value the edge channels mirror the node channels
+        if out_edges is None:
+            out_edges = out_features
+        self.in_features = in_features
+        self.out_features = out_features
+        self.out_edges = out_edges
+        # node_fc feeds the neighbour aggregation, node_sfc the self connection, edge_fc the edge update
+        self.node_fc = Linear(in_features, out_features)
+        self.node_sfc = Linear(in_features, out_features)
+        self.edge_fc = Linear(in_edges, self.out_edges)
+
+    def forward(self, A: np.ndarray, emb_node: np.ndarray, emb_edge: np.ndarray, mode: int=1):
+        r"""
+        Forward computation of channel-independent convolution.
+
+        Edge embeddings pass through ``edge_fc`` and are masked by ``A``; they then weight, channel by
+        channel, the aggregation of the ``node_fc`` features. The ``node_sfc`` features are added as a
+        self connection. ReLU is applied to every term.
+
+        :param A: :math:`(b\times n\times n)` {0,1} adjacency matrix. :math:`b`: batch size, :math:`n`: number of nodes
+        :param emb_node: :math:`(b\times n\times d_n)` input node embedding. :math:`d_n`: node feature dimension
+        :param emb_edge: :math:`(b\times n\times n\times d_e)` input edge embedding. :math:`d_e`: edge feature dimension
+        :param mode: 1 or 2, refer to the paper for details. Both modes currently return the same
+         values
+        :return: :math:`(b\times n\times d^\prime)` new node embedding,
+         :math:`(b\times n\times n\times d^\prime)` new edge embedding
+        :raises ValueError: if ``mode`` is neither 1 nor 2
+        """
+        if mode not in (1, 2):
+            raise ValueError('Unknown mode {}. Possible options: 1 or 2'.format(mode))
+
+        # The former mode-2 branch called the Linear layers directly (they define no
+        # ``__call__``) and passed ``keepdim`` to ``np.sum`` (numpy spells it ``keepdims``),
+        # so it raised TypeError before producing anything. Its only extra step was a
+        # distance kernel ``d_x`` that never entered the result, hence both modes share
+        # the single code path below.
+        node_x = self.node_fc.forward(emb_node)
+        node_sx = self.node_sfc.forward(emb_node)
+        edge_x = self.edge_fc.forward(emb_edge)
+
+        # mask the edge embeddings with the adjacency: (b, n, n, 1) -> (b, n, n, d')
+        A = np.expand_dims(A,axis=-1)
+        A = expand_as(A,edge_x) * edge_x
+
+        # move channels in front, (b, d', n, n) @ (b, d', n, 1), so that each channel
+        # aggregates the node features with its own edge weights
+        node_x = np.matmul(A.swapaxes(2, 3).swapaxes(1, 2),
+                              np.expand_dims(node_x,axis=2).swapaxes(2, 3).swapaxes(1, 2))
+        node_x = np.squeeze(node_x,axis=-1).swapaxes(1, 2)  # back to (b, n, d')
+
+        # ReLU on the aggregated, self and edge terms
+        node_x = relu(node_x) + relu(node_sx)
+        edge_x = relu(edge_x)
+        return node_x, edge_x
+
+class Siamese_Gconv():
+    r"""
+    Siamese Gconv neural network for processing arbitrary number of graphs.
+
+    :param in_features: the dimension of input node features
+    :param num_features: the dimension of output node features
+    """
+    def __init__(self, in_features, num_features):
+        self.gconv = Gconv(in_features, num_features)
+
+    def forward(self, g1, *args):
+        """
+        Run the shared ``Gconv`` on every graph.
+
+        :param g1: arguments of ``Gconv.forward`` for the first graph, packed in a tuple
+        :param args: further graphs in the same format
+        :return: embedding(s) of size (bs, N, num_features): one array for a single graph, else a list in call order
+        """
+        outputs = [self.gconv.forward(*graph) for graph in (g1,) + args]
+        return outputs if args else outputs[0]
+
+class Siamese_ChannelIndependentConv():
+    r"""
+    Siamese Channel Independent Conv neural network for processing arbitrary number of graphs.
+
+    :param in_features: the dimension of input node features
+    :param num_features: the dimension of output node features
+    :param in_edges: the dimension of input edge features
+    :param out_edges: (optional) the dimension of output edge features. It needs to be the same as ``num_features``
+    """
+    def __init__(self, in_features, num_features, in_edges, out_edges=None):
+        self.in_feature = in_features
+        self.gconv = ChannelIndependentConv(in_features, num_features, in_edges, out_edges)
+
+    def forward(self, g1, *args):
+        r"""
+        Forward computation of Siamese Channel Independent Conv.
+
+        :param g1: The first graph, which is a tuple of (:math:`(b\times n\times n)` {0,1} adjacency matrix,
+         :math:`(b\times n\times d_n)` input node embedding, :math:`(b\times n\times n\times d_e)` input edge embedding,
+         mode (``1`` or ``2``))
+        :param args: Other graphs, each given in the same tuple format as ``g1``
+        :return: A list of tensors composed of new node embeddings :math:`(b\times n\times d^\prime)`, appended with new
+         edge embeddings :math:`(b\times n\times n\times d^\prime)`
+        """
+        node_embs, edge_embs = [], []
+        for graph in (g1,) + args:
+            node_emb, edge_emb = self.gconv.forward(*graph)
+            node_embs.append(node_emb)
+            edge_embs.append(edge_emb)
+        # node embeddings of all graphs come first, followed by all edge embeddings
+        return node_embs + edge_embs
+
+class NGMConvLayer():
+    """Graph convolution layer with node MLPs, an optional edge MLP and an optional classifier head."""
+    def __init__(self, in_node_features, in_edge_features, out_node_features, out_edge_features,
+                 sk_channel=0, edge_emb=False):
+        self.in_nfeat = in_node_features
+        self.in_efeat = in_edge_features
+        self.out_efeat = out_edge_features
+        self.sk_channel = sk_channel
+        assert out_node_features == out_edge_features + self.sk_channel
+        if self.sk_channel > 0:
+            self.out_nfeat = out_node_features - self.sk_channel
+            self.classifier = Linear(self.out_nfeat, self.sk_channel)
+        else:
+            self.out_nfeat = out_node_features
+            self.classifier = None
+
+        if edge_emb:
+            self.e_func = Sequential(
+                Linear(self.in_efeat + self.in_nfeat, self.out_efeat),
+                ReLU(),
+                Linear(self.out_efeat, self.out_efeat),
+                ReLU()
+            )
+        else:
+            self.e_func = None
+
+        self.n_func = Sequential(
+            Linear(self.in_nfeat, self.out_nfeat),
+            ReLU(),
+            Linear(self.out_nfeat, self.out_nfeat),
+            ReLU(),
+        )
+
+        self.n_self_func = Sequential(
+            Linear(self.in_nfeat, self.out_nfeat),
+            ReLU(),
+            Linear(self.out_nfeat, self.out_nfeat),
+            ReLU()
+        )
+
+    def forward(self, A, W, x, n1=None, n2=None, norm=True, sk_func=None):
+        """
+        Update the node features (and the edge features when the layer has an edge MLP).
+
+        :param A: adjacency matrix in 0/1 (b x n x n)
+        :param W: edge feature tensor (b x n x n x feat_dim)
+        :param x: node feature tensor (b x n x feat_dim)
+        :param n1, n2: size arrays handed to ``sk_func``; ``n1.max() * n2.max()`` must equal n (classifier only)
+        :param norm: if ``True``, normalize ``A`` with ``normalize_abs`` along axis 2
+        :param sk_func: callable used as ``sk_func(scores, n1, n2, dummy_row=True)`` (classifier only)
+        :return: tuple ``(W_new, x_new)`` of edge features and updated node features
+        """
+        W_new = W
+        if self.e_func is not None:
+            W1 = np.expand_dims(A,axis=-1) * np.expand_dims(x,axis=1)
+            W2 = np.concatenate((W, W1), axis=-1)
+            W_new = self.e_func.forward(W2)  # Sequential defines forward() only, it is not callable
+
+        if norm is True:
+            A = normalize_abs(A,axis=2)
+        x1 = self.n_func.forward(x)
+        tmp1 = (np.expand_dims(A,axis=-1) * W_new).transpose((0, 3, 1, 2))
+        tmp2 = np.expand_dims(x1,axis=2).transpose((0, 3, 1, 2))
+        x2 = np.squeeze(np.matmul(tmp1,tmp2),axis=-1).swapaxes(1, 2)
+        x2 += self.n_self_func.forward(x)
+
+        x_new = x2
+        if self.classifier is not None:
+            assert n1.max() * n2.max() == x.shape[1]
+            assert sk_func is not None
+            x3 = self.classifier.forward(x2)
+            n1_rep = n1.repeat(self.sk_channel, axis=0)
+            n2_rep = n2.repeat(self.sk_channel, axis=0)
+            x4 = x3.transpose((0,2,1)).reshape(x.shape[0] * self.sk_channel, n2.max(), n1.max()).swapaxes(1, 2)
+            x5 = np.ascontiguousarray(sk_func(x4, n1_rep, n2_rep, dummy_row=True).swapaxes(2, 1))
+            x6 = x5.reshape(x.shape[0], self.sk_channel, n1.max() * n2.max()).transpose((0, 2, 1))
+            x_new = np.concatenate((x2, x6), axis=-1)
+        return W_new, x_new
\ No newline at end of file
diff --git a/pygmtools/utils.py b/pygmtools/utils.py
index d76f9c59..c6c937d6 100644
--- a/pygmtools/utils.py
+++ b/pygmtools/utils.py
@@ -13,7 +13,7 @@
 import shutil
 from tqdm.auto import tqdm
 import inspect
-
+import wget
 import pygmtools
 
 NOT_IMPLEMENTED_MSG = \
@@ -1080,12 +1080,10 @@ def _mm(input1, input2, backend=None):
         )
     return fn(*args)
 
-
 def download(filename, url, md5=None, retries=5):
     r"""
     Check if content exits. If not, download the content to ``<user cache path>/pygmtools/<filename>``. ``<user cache path>``
     depends on your system. For example, on Debian, it should be ``$HOME/.cache``.
-
     :param filename: the destination file name
     :param url: the url
     :param md5: (optional) the md5sum to verify the content. It should match the result of ``md5sum file`` on Linux.
@@ -1100,27 +1098,36 @@ def download(filename, url, md5=None, retries=5):
         os.makedirs(dirs)
     filename = os.path.join(dirs, filename)
     if not os.path.exists(filename):
-        print(f'Downloading to {filename}...')
-        down_res = requests.get(url, stream=True)
-        file_size = int(down_res.headers.get('Content-Length', 0))
-        with tqdm.wrapattr(down_res.raw, "read", total=file_size) as content:
-            with open(filename, 'wb') as file:
-                shutil.copyfileobj(content, file)
-
+        print(f'\nDownloading to {filename}...')
+        if retries % 2 == 1:
+            try:
+                down_res = requests.get(url, stream=True)
+                file_size = int(down_res.headers.get('Content-Length', 0))
+                with tqdm.wrapattr(down_res.raw, "read", total=file_size) as content:
+                    with open(filename, 'wb') as file:
+                        shutil.copyfileobj(content, file)
+            except requests.exceptions.ConnectionError as err:
+                print('Warning: Network error. Retrying...\n', err)
+                return download(filename, url, md5, retries - 1)
+        else:
+            wget.download(url,out=filename)
     if md5 is not None:
-        hash_md5 = hashlib.md5()
-        chunk = 8192
-        with open(filename, 'rb') as file_to_check:
-            while True:
-                buffer = file_to_check.read(chunk)
-                if not buffer:
-                    break
-                hash_md5.update(buffer)
-            md5_returned = hash_md5.hexdigest()
+        md5_returned = _get_md5(filename)
         if md5 != md5_returned:
             print('Warning: MD5 check failed for the downloaded content. Retrying...')
             os.remove(filename)
             time.sleep(1)
             return download(filename, url, md5, retries - 1)
-
     return filename
+
+def _get_md5(filename):
+    """
+    Compute the MD5 checksum of a file, reading it in 8192-byte chunks.
+    :param filename: path of the file to hash
+    :return: the hexadecimal MD5 digest as a string
+    """
+    digest = hashlib.md5()
+    with open(filename, 'rb') as stream:
+        for piece in iter(lambda: stream.read(8192), b''):  # read() returns b'' at end of file
+            digest.update(piece)
+    return digest.hexdigest()
diff --git a/setup.py b/setup.py
index af1fa5df..ff6a210d 100644
--- a/setup.py
+++ b/setup.py
@@ -26,7 +26,7 @@ def get_property(prop, project):
 VERSION = get_property('__version__', NAME)
 
 REQUIRED = [
-     'requests>=2.25.1', 'scipy>=1.4.1', 'Pillow>=7.2.0', 'numpy>=1.18.5', 'easydict>=1.7', 'appdirs>=1.4.4', 'tqdm>=4.64.1'
+     'requests>=2.25.1', 'scipy>=1.4.1', 'Pillow>=7.2.0', 'numpy>=1.18.5', 'easydict>=1.7', 'appdirs>=1.4.4', 'tqdm>=4.64.1','wget>=3.2'
 ]
 
 EXTRAS = {}
diff --git a/tests/requirements.txt b/tests/requirements.txt
index a921300d..435b7496 100644
--- a/tests/requirements.txt
+++ b/tests/requirements.txt
@@ -10,3 +10,4 @@ tqdm
 jittor
 appdirs>=1.4.4
 tensorflow
+wget
diff --git a/tests/test_neural_solvers.py b/tests/test_neural_solvers.py
index dbb6fe41..74d83f1f 100644
--- a/tests/test_neural_solvers.py
+++ b/tests/test_neural_solvers.py
@@ -101,18 +101,18 @@ def _test_neural_solver_on_isomorphic_graphs(graph_num_nodes, node_feat_dim, sol
 def test_pca_gm():
     _test_neural_solver_on_isomorphic_graphs(list(range(10, 30, 2)), 1024, pygm.pca_gm, 'individual-graphs', {
         'pretrain': ['voc', 'willow', 'voc-all'],
-    }, ['pytorch', 'jittor'])
+    }, ['pytorch', 'numpy','jittor'])
 
 def test_ipca_gm():
     _test_neural_solver_on_isomorphic_graphs(list(range(10, 30, 2)), 1024, pygm.ipca_gm, 'individual-graphs', {
         'pretrain': ['voc', 'willow'],
-    }, ['pytorch', 'jittor'])
+    }, ['pytorch', 'numpy', 'jittor'])
 
 def test_cie():
     args = (
         list(range(10, 30, 2)), 1024, pygm.cie, 'individual-graphs-edge', {
             'pretrain': ['voc', 'willow'],
-        }, ['pytorch', 'jittor']
+        }, ['pytorch', 'numpy', 'jittor']
     )
     max_retries = 5
     for i in range(max_retries - 1):
@@ -132,7 +132,7 @@ def test_ngm():
         'edge_aff_fn': [functools.partial(pygm.utils.gaussian_aff_fn, sigma=1.), pygm.utils.inner_prod_aff_fn],
         'node_aff_fn': [functools.partial(pygm.utils.gaussian_aff_fn, sigma=.1), pygm.utils.inner_prod_aff_fn],
         'pretrain': ['voc', 'willow'],
-    }, ['pytorch', 'jittor'])
+    }, ['pytorch', 'numpy', 'jittor'])
 
 
 if __name__ == '__main__':