From 91090ad78db50731a4c5a7d27d19e2fe6354e9f6 Mon Sep 17 00:00:00 2001 From: Tomasz Sobczyk Date: Thu, 15 Jun 2023 10:52:26 +0200 Subject: [PATCH 1/2] Optimizer for feature transformer weights permutation. Integration with serialize.py --- ftperm.py | 467 +++++++++++++++++++++++++++++++++++++++++++++++++++ serialize.py | 20 +++ 2 files changed, 487 insertions(+) create mode 100644 ftperm.py diff --git a/ftperm.py b/ftperm.py new file mode 100644 index 00000000..5c3bf72c --- /dev/null +++ b/ftperm.py @@ -0,0 +1,467 @@ +''' + +NOTE: This script uses CUDA and may requires large amounts of VRAM. Decrease --count if encountering problems. + +Example use: + +1. Generate the activation matrix for some sample dataset. + +python ftperm.py gather --data=data\fishpack32.binpack --net=networks\nn-5af11540bbfe.nnue --count=1000000 --features=HalfKAv2_hm --out ftact1m.npy + +python ftperm.py gather --data=noob_master_leaf_static_d12_85M_0.binpack --net=nn-5af11540bbfe.nnue --count=10000 --features=HalfKAv2_hm --out ftact1m.npy + +2. Find a permutation + +python ftperm.py find_perm --data=ftact1m.npy --out=ftact.perm + +3. Test the permutation against the baseline + +python ftperm.py eval_perm --data=ftact1m.npy --perm=ftact.perm + +4. Apply permutation and save +python serialize.py nn-5af11540bbfe.nnue permuted.nnue --features=HalfKAv2_hm --ft_perm=ftact.perm + +---------------------------------------------------------------- + +OR do the whole process in one step + +python serialize.py networks\nn-5af11540bbfe.nnue permuted.nnue --features=HalfKAv2_hm --ft_optimize --ft_optimize_data=data\fishpack32.binpack --ft_optimize_count=1000000 + +python serialize.py nn-5af11540bbfe.nnue permuted.nnue --features=HalfKAv2_hm --ft_optimize --ft_optimize_data=noob_master_leaf_static_d12_85M_0.binpack --ft_optimize_count=10000 + +''' + +import time +import argparse +import features +import serialize +import nnue_dataset +import chess +import model as M +import torch +import copy +import numpy as np +from model import NNUE +import cupy as cp +from math import ceil + +''' + +Algorithm by Daniel Monroe. Github @Ergodice. 
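+
+Rough sketch of the idea (the code below is the authoritative reference):
+
+The quantized feature transformer outputs are read in blocks of 4 adjacent
+neurons during inference, and the objective is the fraction of such blocks
+whose activations are all zero over a sample of positions (see eval_act_mat).
+Permuting the FT output neurons (the same permutation for both perspectives,
+with the matching column permutation applied to the next layer's weights)
+leaves the network output unchanged but can cluster zeros into the same
+blocks. The search alternates two greedy stages, independent pairwise swaps
+(make_swaps_2) and independent 3-cycles (make_swaps_3), committing in each
+iteration the block-disjoint moves with the largest estimated gain, until
+neither stage finds an improvement.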
+ +''' + +def apply_swap(perm, i, j): + perm[i], perm[j] = perm[j], perm[i] + +def apply_cycle(perm, idx): + values = [perm[i] for i in idx] + new_values = values[1:] + [values[0]] + for i, j in zip(idx, new_values): + perm[i] = j + +def get_swapped_zero_count(actmat, use_cupy=True): + shape = actmat.shape + actmat = actmat.reshape((actmat.shape[0], actmat.shape[1]//4, 4)) + if use_cupy: + actmat = cp.asarray(actmat, dtype=cp.int8) + num_zeros = cp.sum(actmat, axis=2, keepdims=True) + num_zeros = cp.tile(num_zeros, (1, 1, 4)) + + num_zeros = cp.reshape(num_zeros, shape) + actmat = cp.reshape(actmat, shape) + + rest_zero_indicator = num_zeros - actmat == 3 + rest_zero_indicator = cp.reshape(rest_zero_indicator, shape).astype(cp.int8) + + + else: + num_zeros = np.sum(actmat, axis=2, keepdims=True) + num_zeros = np.tile(num_zeros, (1, 1, 4)) + + num_zeros = np.reshape(num_zeros, shape).astype(int) + actmat = np.reshape(actmat, shape).astype(int) + + rest_zero_indicator = num_zeros - actmat == 3 + rest_zero_indicator = np.reshape(rest_zero_indicator, shape).astype(int) + + + if use_cupy: + swapped_zero_count = cp.einsum('bi,bj->ij', actmat, rest_zero_indicator, dtype=int) + + else: + swapped_zero_count = np.einsum('bi,bj->ij', actmat, rest_zero_indicator) + + + return swapped_zero_count + +def get_score_change(actmat, use_cupy=True): + n_neurons = actmat.shape[1] + n_samples = actmat.shape[0] + # actmat is a boolean matrix of shape (N, L1) with "True" meaning 0 + swapped_zero_count = 0 + + + # process in batches since the arrays are too large + # TODO: Find a good batch size. Try lowest as possible as VRAM is an issue on low end devices. + BATCH_SIZE = 10000 + idx = 0 + while idx < n_samples: + actmat_batch = actmat[idx:min(idx+BATCH_SIZE, n_samples)] + swapped_zero_count += get_swapped_zero_count(actmat_batch, use_cupy=use_cupy) + idx += BATCH_SIZE + + + # 768 x 768 + if use_cupy: + swapped_zero_increase = swapped_zero_count - cp.reshape(cp.diag(swapped_zero_count), (1, n_neurons)) + swapped_zero_increase = cp.asnumpy(swapped_zero_increase) + + else: + swapped_zero_increase = swapped_zero_count - np.reshape(np.diag(swapped_zero_count), (1, n_neurons)) + + score_change = swapped_zero_increase + + # kill off swaps between neurons in the same block + blocks = np.arange(n_neurons).reshape((n_neurons, 1)) // 4 + same_block_killer = 1 - (blocks == blocks.T).astype(int) + score_change = score_change * same_block_killer + return score_change + + +def make_swaps_2(actmat, use_cupy=True): + # For each pair of nodes, we want to calculate the difference between the number of 4-zero runs when swapping them + start_time = time.time() + print("Starting make_swaps_2") + n_neurons = actmat.shape[1] + n_samples = actmat.shape[0] + n_blocks = n_neurons // 4 + + score_change = get_score_change(actmat, use_cupy=use_cupy) + score_change = score_change + score_change.T + + + def make_indices_to_kill(i): + block = i // 4 + return list(range(block * 4, block * 4 + 4)) + swaps = [] + total_score_change = 0 + while True: + swap = np.argmax(score_change) + i, j = swap // n_neurons, swap % n_neurons + indices_to_kill = make_indices_to_kill(i) + make_indices_to_kill(j) + improvement = score_change[i, j] + if improvement == 0: + break + #print(f"Swapping {i} and {j} for improvement {improvement}") + total_score_change += improvement + swaps.append((i, j)) + for index in indices_to_kill: + score_change[:, index] = -9999 + score_change[index, :] = -9999 + total_improvement = total_score_change / n_samples / (n_neurons//4) 
*100 + print(f"Time elapsed: {time.time() - start_time:0.3f}") + print(f"Improvement this iteration: {total_improvement:0.3f}") + + return swaps, total_improvement + +def make_swaps_3(actmat, use_cupy=True): + # for each triplet of nodes, we want to calculate the change in score when moving them in a cycle + + score_changes = get_score_change(actmat, use_cupy=use_cupy) + n_neurons = score_changes.shape[0] + n_samples = actmat.shape[0] + n_blocks = n_neurons // 4 + orig_shape = (n_neurons,) * 3 + compressed_shape = (n_blocks, 4) * 3 + cycles = [] + total_score_change = 0 + + print("Starting make_swaps_3") + start_time = time.time() + + # For each neuron i, j, k we sum score_change[i, j] + score_change[j, k] + score_change[k, i] + score_changes_3 = score_changes[:, :, None] + score_changes[None, :, :] + (score_changes.T)[:, None, :] + + # improvement = score_changes_3[4, 8, 12] / n_samples / (n_neurons//4) *100 + # print(improvement) + # cycles.append((12,8,4)) + # return cycles, improvement + + + # We don't want to have to go through an enormous array so compress it to represent blocks rather than neurons + # Cupy doesn't support a list of axes + # TODO: Maybe there is some cheeky way to use cupy here? This part takes by far the longest. + # TODO: Uses quite a bit of RAM, see if it can be improved. + max_values = cp.amax(cp.reshape(score_changes_3, compressed_shape), axis=5, keepdims=False) + max_values = cp.amax(max_values, axis=3, keepdims=False) + max_values = cp.amax(max_values, axis=1, keepdims=False) + + for block in range(n_blocks): + max_values[block, block, :] = 0 + max_values[block, :, block] = 0 + max_values[:, block, block] = 0 + + while True: + out_argmax = max_values.argmax() + val = max_values.flatten()[out_argmax] + if val <= 0: + break # Finish! 
+ total_score_change += val + b1, b2, b3 = np.unravel_index(out_argmax, (n_blocks, n_blocks, n_blocks)) + i, j, k = b1 * 4, b2 * 4, b3 * 4 + # Now we need to find the best swap for this triplet of blocks (we already know there is a gain available) + in_argmax = score_changes_3[i:i+4, j:j+4, k:k+4].argmax() + i1, j1, k1 = np.unravel_index(in_argmax, (4, 4, 4)) + i, j, k = i + i1, j + j1, k + k1 + cycles.append((k, j, i)) + + # Now silence these blocks since the scores are no longer accurate + # We only need to affect the smaller array since gains of zeros and under are ignored + for b in (b1, b2, b3): + max_values[b, :, :] = 0 + max_values[:, b, :] = 0 + max_values[:, :, b] = 0 + + total_improvement = total_score_change / n_samples / (n_neurons//4) *100 + print(f"Time elapsed: {time.time() - start_time:0.3f}") + print(f"Improvement this iteration: {total_improvement:0.3f}") + return cycles, total_improvement + + +def find_perm_impl(actmat): + actmat = np.reshape(actmat, (actmat.shape[0] * 2, actmat.shape[1]//2)) + actmat = cp.asarray(actmat, dtype=cp.int8) + actmat_orig = actmat.copy() + total_score_change = 0 + perm = np.arange(M.L1 // 2) + stage1 = True + stop_after_stage1 = False + fails_in_a_row = 0 + for i in range(50): + swap_fn = make_swaps_2 if stage1 else make_swaps_3 + print("Iteration", i+1) + actmat = actmat_orig[:, perm] + swaps, score_change = swap_fn(actmat) + for cycle in swaps: + apply_cycle(perm, cycle) + + total_score_change += score_change + print("Total improvement:", total_score_change) + print() + if score_change == 0: + fails_in_a_row += 1 + if fails_in_a_row == 2 or stop_after_stage1: + print("No more improvement possible.") + break + else: + stage1=not stage1 + print(f"Switching to stage {1 if stage1 else 2}") + + else: + fails_in_a_row = 0 + + return perm + +# ------------------------------------------------------------- + +def read_model(nnue_path, feature_set): + with open(nnue_path, 'rb') as f: + reader = serialize.NNUEReader(f, feature_set) + return reader.model + + +def make_fen_batch_provider(data_path, batch_size): + return nnue_dataset.FenBatchProvider(data_path, True, 1, batch_size, False, 10) + +def filter_fens(fens): + # We don't want fens where a king is in check, as these cannot be evaluated by the engine. + filtered_fens = [] + for fen in fens: + board = chess.Board(fen=fen) + if not board.is_check(): + filtered_fens.append(fen) + return filtered_fens + +def quantize_ft(model): + model.input.weight.data = model.input.weight.data.mul(model.quantized_one).round() + model.input.bias.data = model.input.bias.data.mul(model.quantized_one).round() + +def forward_ft(model, us, them, white_indices, white_values, black_indices, black_values, psqt_indices, layer_stack_indices): + wp, bp = model.input(white_indices, white_values, black_indices, black_values) + w, wpsqt = torch.split(wp, M.L1, dim=1) + b, bpsqt = torch.split(bp, M.L1, dim=1) + l0_ = (us * torch.cat([w, b], dim=1)) + (them * torch.cat([b, w], dim=1)) + l0_ = torch.clamp(l0_, 0.0, 127.0) + + l0_s = torch.split(l0_, M.L1 // 2, dim=1) + l0_s1 = [l0_s[0] * l0_s[1], l0_s[2] * l0_s[3]] + # We multiply by 127/128 because in the quantized network 1.0 is represented by 127 + # and it's more efficient to divide by 128 instead. 
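+    # In other words: both factors are in [0, 127] with 127 meaning 1.0, so the exact
+    # product would be a*b/127; dividing by 128 instead scales the result by 127/128,
+    # a small quantization error that is accepted for speed.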
+ l0_ = torch.cat(l0_s1, dim=1) * (1/128) + + return l0_.round() + +def eval_ft(model, batch): + with torch.no_grad(): + us, them, white_indices, white_values, black_indices, black_values, outcome, score, psqt_indices, layer_stack_indices = batch.contents.get_tensors('cuda') + res = forward_ft(model, us, them, white_indices, white_values, black_indices, black_values, psqt_indices, layer_stack_indices) + return res + +def ft_permute_impl(model, permutation): + permutation = list(permutation) + + l1_size = model.layer_stacks.l1.in_features + if l1_size != len(permutation)*2: + raise Exception(f'Invalid permutation size. Expected {l1_size}. Got {len(permutation)*2}.') + + # Both sides of the FT must use the same permutation. + permutation.extend([x + l1_size // 2 for x in permutation]) + + # Add identity permutation for PSQT weights + ft_permutation = permutation + list(range(l1_size, model.input.num_outputs)) + + # Apply the permutation in place. + model.input.weight.data = model.input.weight.data[:, ft_permutation] + model.input.bias.data = model.input.bias.data[ft_permutation] + model.layer_stacks.l1.weight.data = model.layer_stacks.l1.weight.data[:, permutation] + +def ft_permute(model, ft_perm_path): + with open(ft_perm_path, 'rb') as f: + permutation = np.load(f) + + ft_permute_impl(model, permutation) + +def gather_impl(model, dataset, count): + ZERO_POINT = 0.0 # Vary this to check hypothetical forced larger truncation to zero + BATCH_SIZE = 1000 + + old_device = model.device + + quantized_model = copy.deepcopy(model) + quantize_ft(quantized_model) + quantized_model.cuda() + + fen_batch_provider = make_fen_batch_provider(dataset, BATCH_SIZE) + + actmats = [] + + done = 0 + print('Processed {} positions.'.format(done)) + while done < count: + fens = filter_fens(next(fen_batch_provider)) + + b = nnue_dataset.make_sparse_batch_from_fens(quantized_model.feature_set, fens, [0] * len(fens), [1] * len(fens), [0] * len(fens)) + actmat = eval_ft(quantized_model, b).cpu() + actmat = (actmat <= ZERO_POINT) + actmats.append(actmat.numpy()) + nnue_dataset.destroy_sparse_batch(b) + + done += len(fens) + print('Processed {} positions.'.format(done)) + + return np.concatenate(actmats, axis=0) + +def command_gather(args): + feature_set = features.get_feature_set_from_name(args.features) + if args.checkpoint: + model = NNUE.load_from_checkpoint(args.checkpoint, feature_set=feature_set) + else: + model = read_model(args.net, feature_set) + + model.eval() + + actmat = gather_impl(model, args.data, args.count) + + with open(args.out, 'wb') as file: + np.save(file, actmat) + +def eval_act_mat(actmat): + actmat = actmat.reshape((actmat.shape[0], actmat.shape[1]//4, 4)) + r = np.all(actmat, axis=2) + return np.count_nonzero(r) / r.shape[0] / r.shape[1] + + +def eval_perm_impl(actmat, perm=None): + actmat = np.reshape(actmat, (actmat.shape[0] * 2, actmat.shape[1]//2)) + + actmat_eval = eval_act_mat(actmat) + print(f'Combined zeros in base matrix: {actmat_eval*100:0.6f}') + + if perm is not None: + perm_act_mat = actmat[:, perm] + perm_act_mat_eval = eval_act_mat(perm_act_mat) + print(f'Combined zeros in perm matrix: {perm_act_mat_eval*100:0.6f}') + + +def command_eval_perm(args): + with open(args.data, 'rb') as file: + actmat = np.load(file) + + if args.perm is not None: + with open(args.perm, 'rb') as file: + perm = np.load(file) + else: + perm = None + + eval_perm_impl(actmat, perm) + +def command_find_perm(args): + with open(args.data, 'rb') as file: + actmat = np.load(file) + + perm = 
find_perm_impl(actmat) + + # perm = np.random.permutation([i for i in range(M.L1)]) + with open(args.out, 'wb') as file: + np.save(file, perm) + + +def ft_optimize(model, dataset_path, count, actmat_save_path=None, perm_save_path=None): + print('Gathering activation data...') + actmat = gather_impl(model, dataset_path, count) + if actmat_save_path is not None: + with open(actmat_save_path, 'wb') as file: + np.save(file, actmat) + + print('Finding permutation...') + perm = find_perm_impl(actmat) + if actmat_save_path is not None: + with open(perm_save_path, 'wb') as file: + np.save(file, perm) + + print('Evaluating permutation...') + eval_perm_impl(actmat, perm) + + print('Applying permutation...') + ft_permute_impl(model, perm) + + +def main(): + parser = argparse.ArgumentParser(description="") + subparsers = parser.add_subparsers() + + parser_gather = subparsers.add_parser('gather', help='a help') + parser_gather.add_argument("--net", type=str, help="path to a .nnue net") + parser_gather.add_argument("--data", type=str, help="path to a .bin or .binpack dataset") + parser_gather.add_argument("--checkpoint", type=str, help="Optional checkpoint (used instead of nnue for local eval)") + parser_gather.add_argument("--count", type=int, default=1000, help="number of datapoints to process") + parser_gather.add_argument("--out", type=str, help="Filename under which to save the resulting ft matrix") + features.add_argparse_args(parser_gather) + parser_gather.set_defaults(func=command_gather) + + parser_gather = subparsers.add_parser('find_perm', help='a help') + parser_gather.add_argument("--data", type=str, help="path to the previously gathered ft activation data") + parser_gather.add_argument("--out", type=str, help="path to where to save the permutation") + parser_gather.set_defaults(func=command_find_perm) + + parser_gather = subparsers.add_parser('eval_perm', help='a help') + parser_gather.add_argument("--data", type=str, help="path to the previously gathered ft activation data") + parser_gather.add_argument("--perm", type=str, help="path to the previously generated perm file") + parser_gather.set_defaults(func=command_eval_perm) + + args = parser.parse_args() + args.func(args) + +if __name__ == '__main__': + main() diff --git a/serialize.py b/serialize.py index dd91a9ac..e4fa0cbd 100644 --- a/serialize.py +++ b/serialize.py @@ -298,6 +298,10 @@ def main(): parser.add_argument("target", help="Target file (can be .pt or .nnue)") parser.add_argument("--description", default=None, type=str, dest='description', help="The description string to include in the network. Only works when serializing into a .nnue file.") parser.add_argument("--ft_compression", default='leb128', type=str, dest='ft_compression', help="Compression method to use for FT weights and biases. Either 'none' or 'leb128'. Only allowed if saving to .nnue.") + parser.add_argument("--ft_perm", default=None, type=str, dest='ft_perm', help="Path to a file that defines the permutation to use on the feature transformer.") + parser.add_argument("--ft_optimize", action='store_true', dest='ft_optimize', help="Whether to perform full feature transformer optimization (ftperm.py) on the resulting network. 
This process is very time consuming.") + parser.add_argument("--ft_optimize_data", default=None, type=str, dest='ft_optimize_data', help="Path to the dataset to use for FT optimization.") + parser.add_argument("--ft_optimize_count", default=10000, type=int, dest='ft_optimize_count', help="Number of positions to use for FT optimization.") features.add_argparse_args(parser) args = parser.parse_args() @@ -324,6 +328,22 @@ def main(): if args.ft_compression not in ['none', 'leb128']: raise Exception('Invalid compression method.') + if args.ft_optimize and args.ft_perm is not None: + raise Exception('Options --ft_perm and --ft_optimize are mutually exclusive.') + + if args.ft_perm is not None: + import ftperm + ftperm.ft_permute(nnue, args.ft_perm) + + if args.ft_optimize: + import ftperm + if args.ft_optimize_data is None: + raise Exception('Invalid dataset path for FT optimization. (--ft_optimize_data)') + if args.ft_optimize_count is None or args.ft_optimize_count < 1: + raise Exception('Invalid number of positions to optimize FT with. (--ft_optimize_count)') + + ftperm.ft_optimize(nnue, args.ft_optimize_data, args.ft_optimize_count) + if args.target.endswith('.ckpt'): raise Exception('Cannot convert into .ckpt') elif args.target.endswith('.pt'): From b341816cc073c1c900693f2f9a163e25964dc6f0 Mon Sep 17 00:00:00 2001 From: Tomasz Sobczyk Date: Fri, 30 Jun 2023 13:11:44 +0200 Subject: [PATCH 2/2] Tidy up the algorithm code. Allow selection between numpy and cupy. --- ftperm.py | 309 +++++++++++++++++++++++++++++++++++------------------- 1 file changed, 201 insertions(+), 108 deletions(-) diff --git a/ftperm.py b/ftperm.py index 5c3bf72c..c1539913 100644 --- a/ftperm.py +++ b/ftperm.py @@ -51,170 +51,248 @@ ''' +ZERO_BLOCK_SIZE = 4 +VERBOSE = False +USE_CUPY = False + +def batched(arr, batch_size): + ''' + Utility generator that yields chunks of array `arr` of size `batch_size` + Expects arr to be a numpy-like array + ''' + n_samples = arr.shape[0] + idx = 0 + while idx < n_samples: + yield arr[idx:min(idx+batch_size, n_samples)] + idx += batch_size + + def apply_swap(perm, i, j): + ''' + Swap `i`-th and `j`-th elements in the array `perm`. + ''' perm[i], perm[j] = perm[j], perm[i] -def apply_cycle(perm, idx): - values = [perm[i] for i in idx] - new_values = values[1:] + [values[0]] - for i, j in zip(idx, new_values): + +def apply_rotate_right(perm, indices): + ''' + Rotates right the values in `perm` at selected indices `indices`. + The rotation is performed as-if the selected indices were layed out in the order + specified in the `indices` list. 
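+
+    Example with illustrative values: perm = [10, 11, 12, 13] and indices = [0, 2, 3]
+    gives perm == [13, 11, 10, 12], i.e. the value at the last listed index moves to
+    the first listed index and each remaining listed value moves to the next index
+    in the list.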
+ ''' + values = [perm[i] for i in indices] + new_values = [values[-1]] + values[:-1] + for i, j in zip(indices, new_values): perm[i] = j -def get_swapped_zero_count(actmat, use_cupy=True): - shape = actmat.shape - actmat = actmat.reshape((actmat.shape[0], actmat.shape[1]//4, 4)) + +def get_swapped_zero_positive_count(actmat_flat, use_cupy=True): if use_cupy: - actmat = cp.asarray(actmat, dtype=cp.int8) - num_zeros = cp.sum(actmat, axis=2, keepdims=True) - num_zeros = cp.tile(num_zeros, (1, 1, 4)) - - num_zeros = cp.reshape(num_zeros, shape) - actmat = cp.reshape(actmat, shape) + actmat_flat = cp.asarray(actmat_flat, dtype=cp.int8) - rest_zero_indicator = num_zeros - actmat == 3 - rest_zero_indicator = cp.reshape(rest_zero_indicator, shape).astype(cp.int8) + shape = actmat_flat.shape + # Group into blocks that are processed at once during inference + # actmat is a boolean matrix of shape (N, L1 // 2) with "True" meaning 0 + actmat_chunked = actmat_flat.reshape((actmat_flat.shape[0], actmat_flat.shape[1]//ZERO_BLOCK_SIZE, ZERO_BLOCK_SIZE)) + if use_cupy: + # Calculate number of zeros in each block + num_zeros = cp.sum(actmat_chunked, axis=2, keepdims=True) + # Broadcast back to the same shape as actmat_chunked so it's easier to work with + num_zeros = cp.tile(num_zeros, (1, 1, ZERO_BLOCK_SIZE)) + + # Marks an element if all other elements in a block are zero. + # + # Example: + # b i k b i k b i k + # slice [0, 13, :] [0, 14, :] [0, 15, :] + # num_zeros = [... [... [3, 3, 3, 3], [1, 1, 1, 1], [4, 4, 4, 4] ...] ...] + # actmat_chunked = [... [... [1, 1, 0, 1], [0, 0, 1, 0], [1, 1, 1, 1] ...] ...] + # rest_zero_indicator = [... [... [0, 0, 1, 0], [0, 0, 0, 0], [1, 1, 1, 1] ...] ...] + # + rest_zero_indicator = (num_zeros - actmat_chunked == ZERO_BLOCK_SIZE - 1).reshape(shape).astype(cp.int8) + + # Sum all possible pairs of elements in a single sample of actmat_flat and rest_zero_indicator. + # Aggregate sum over the whole batch. + # This tells us how much "good" a swap of i-th and j-th slices would do. It doesn't consider + # how much "bad" it would do though, that will be accounted for later, for performance reasons. + swapped_zero_count = cp.einsum('bi,bj->ij', actmat_flat, rest_zero_indicator, dtype=int) else: - num_zeros = np.sum(actmat, axis=2, keepdims=True) - num_zeros = np.tile(num_zeros, (1, 1, 4)) + # Same operation but with numpy + num_zeros = np.sum(actmat_chunked, axis=2, keepdims=True) + num_zeros = np.tile(num_zeros, (1, 1, ZERO_BLOCK_SIZE)) - num_zeros = np.reshape(num_zeros, shape).astype(int) - actmat = np.reshape(actmat, shape).astype(int) + rest_zero_indicator = (num_zeros - actmat_chunked == ZERO_BLOCK_SIZE - 1).reshape(shape).astype(int) - rest_zero_indicator = num_zeros - actmat == 3 - rest_zero_indicator = np.reshape(rest_zero_indicator, shape).astype(int) - - - if use_cupy: - swapped_zero_count = cp.einsum('bi,bj->ij', actmat, rest_zero_indicator, dtype=int) - - else: - swapped_zero_count = np.einsum('bi,bj->ij', actmat, rest_zero_indicator) - + swapped_zero_count = np.einsum('bi,bj->ij', actmat_flat, rest_zero_indicator) return swapped_zero_count -def get_score_change(actmat, use_cupy=True): + +def get_swapped_zero_increase(actmat, use_cupy=True): n_neurons = actmat.shape[1] - n_samples = actmat.shape[0] - # actmat is a boolean matrix of shape (N, L1) with "True" meaning 0 swapped_zero_count = 0 - - # process in batches since the arrays are too large + # Process in batches since the arrays are too large # TODO: Find a good batch size. 
Try lowest as possible as VRAM is an issue on low end devices. BATCH_SIZE = 10000 - idx = 0 - while idx < n_samples: - actmat_batch = actmat[idx:min(idx+BATCH_SIZE, n_samples)] - swapped_zero_count += get_swapped_zero_count(actmat_batch, use_cupy=use_cupy) - idx += BATCH_SIZE - + for actmat_batch in batched(actmat, BATCH_SIZE): + swapped_zero_count += get_swapped_zero_positive_count(actmat_batch, use_cupy=use_cupy) - # 768 x 768 + # (L1/2) x (L1/2) if use_cupy: + # Subtract from each i-th slice the positive value of the current i-th placement. + # This is the place where we account for how much "bad" it would do. + # It is done here because we process earlier in batches, but this operation is distributive, + # so it needs to only be done once at the end. swapped_zero_increase = swapped_zero_count - cp.reshape(cp.diag(swapped_zero_count), (1, n_neurons)) swapped_zero_increase = cp.asnumpy(swapped_zero_increase) else: swapped_zero_increase = swapped_zero_count - np.reshape(np.diag(swapped_zero_count), (1, n_neurons)) - score_change = swapped_zero_increase + return swapped_zero_increase + + +def get_score_change(actmat, use_cupy=True): + # actmat is a boolean matrix of shape (N, L1) with "True" meaning 0 + + n_neurons = actmat.shape[1] + + score_change = get_swapped_zero_increase(actmat, use_cupy) - # kill off swaps between neurons in the same block - blocks = np.arange(n_neurons).reshape((n_neurons, 1)) // 4 + # Kill off swaps between neurons in the same block + blocks = np.arange(n_neurons).reshape((n_neurons, 1)) // ZERO_BLOCK_SIZE same_block_killer = 1 - (blocks == blocks.T).astype(int) score_change = score_change * same_block_killer return score_change def make_swaps_2(actmat, use_cupy=True): + ''' + Returns a series of independent 2-swap operations that collectively improve the objective function. + ''' + # For each pair of nodes, we want to calculate the difference between the number of 4-zero runs when swapping them start_time = time.time() print("Starting make_swaps_2") + n_neurons = actmat.shape[1] n_samples = actmat.shape[0] - n_blocks = n_neurons // 4 + n_blocks = n_neurons // ZERO_BLOCK_SIZE + # Compute the score change of swapping i-th and j-th neurons score_change = get_score_change(actmat, use_cupy=use_cupy) + # Sum score_change[i, j] + score_change[j, i] to get the cumulative impact of the swap. score_change = score_change + score_change.T + def all_indices_in_same_block(i): + ''' Returns a list of indices of all neurons in the same block as the i-th neuron. ''' + # Floor to the start of the block. + base = i // ZERO_BLOCK_SIZE * ZERO_BLOCK_SIZE + return list(range(base, base + ZERO_BLOCK_SIZE)) - def make_indices_to_kill(i): - block = i // 4 - return list(range(block * 4, block * 4 + 4)) swaps = [] total_score_change = 0 while True: swap = np.argmax(score_change) + # argmax returns a flat index, so we need to recompute the position. i, j = swap // n_neurons, swap % n_neurons - indices_to_kill = make_indices_to_kill(i) + make_indices_to_kill(j) + improvement = score_change[i, j] if improvement == 0: break - #print(f"Swapping {i} and {j} for improvement {improvement}") + + if VERBOSE: + print(f"Swapping {i} and {j} for improvement {improvement}") + + # The swap is an improvement, add it to the list. 
total_score_change += improvement swaps.append((i, j)) + + indices_to_kill = all_indices_in_same_block(i) + all_indices_in_same_block(j) for index in indices_to_kill: - score_change[:, index] = -9999 - score_change[index, :] = -9999 - total_improvement = total_score_change / n_samples / (n_neurons//4) *100 + # Zero out the improvement for the swaps to and from blocks which had neurons swapped. + # This ensures they won't be picked later, and therefore all swaps will be independent. + score_change[:, index] = 0 + score_change[index, :] = 0 + + total_improvement = total_score_change / n_samples / (n_neurons//ZERO_BLOCK_SIZE) * 100 + print(f"Time elapsed: {time.time() - start_time:0.3f}") print(f"Improvement this iteration: {total_improvement:0.3f}") return swaps, total_improvement + def make_swaps_3(actmat, use_cupy=True): - # for each triplet of nodes, we want to calculate the change in score when moving them in a cycle - - score_changes = get_score_change(actmat, use_cupy=use_cupy) - n_neurons = score_changes.shape[0] - n_samples = actmat.shape[0] - n_blocks = n_neurons // 4 - orig_shape = (n_neurons,) * 3 - compressed_shape = (n_blocks, 4) * 3 - cycles = [] - total_score_change = 0 + ''' + Returns a series of independent left-rotates operations that collectively improve the objective function. + ''' + # For each triplet of nodes, we want to calculate the change in score when moving them in a cycle print("Starting make_swaps_3") start_time = time.time() + n_neurons = actmat.shape[1] + n_samples = actmat.shape[0] + n_blocks = n_neurons // ZERO_BLOCK_SIZE + + score_changes = get_score_change(actmat, use_cupy=use_cupy) + # For each neuron i, j, k we sum score_change[i, j] + score_change[j, k] + score_change[k, i] - score_changes_3 = score_changes[:, :, None] + score_changes[None, :, :] + (score_changes.T)[:, None, :] + # This is the cumulative impact of the right-rotation. + score_changes = score_changes[:, :, None] + score_changes[None, :, :] + (score_changes.T)[:, None, :] - # improvement = score_changes_3[4, 8, 12] / n_samples / (n_neurons//4) *100 - # print(improvement) - # cycles.append((12,8,4)) - # return cycles, improvement - + orig_shape = (n_neurons,) * 3 + compressed_shape = (n_blocks, ZERO_BLOCK_SIZE) * 3 + cycles = [] + total_score_change = 0 - # We don't want to have to go through an enormous array so compress it to represent blocks rather than neurons - # Cupy doesn't support a list of axes - # TODO: Maybe there is some cheeky way to use cupy here? This part takes by far the longest. - # TODO: Uses quite a bit of RAM, see if it can be improved. - max_values = cp.amax(cp.reshape(score_changes_3, compressed_shape), axis=5, keepdims=False) - max_values = cp.amax(max_values, axis=3, keepdims=False) - max_values = cp.amax(max_values, axis=1, keepdims=False) - + if use_cupy: + # We don't want to have to go through an enormous array so compress it to represent blocks rather than neurons + # Cupy doesn't support a list of axes so we go one by one. + max_values = cp.amax(cp.reshape(score_changes, compressed_shape), axis=5, keepdims=False) + max_values = cp.amax(max_values, axis=3, keepdims=False) + max_values = cp.amax(max_values, axis=1, keepdims=False) + else: + max_values = np.amax(np.reshape(score_changes, compressed_shape), axis=(5, 3, 1), keepdims=False) + + # Kill rotates that would only affect less than 3 different blocks. + # We must do this, because the rest of the algorithm relies on it for correctness. 
+ # It would also be pointless as such cases degenerate to the ones handled by make_swaps_2. for block in range(n_blocks): max_values[block, block, :] = 0 max_values[block, :, block] = 0 max_values[:, block, block] = 0 while True: - out_argmax = max_values.argmax() - val = max_values.flatten()[out_argmax] - if val <= 0: - break # Finish! - total_score_change += val - b1, b2, b3 = np.unravel_index(out_argmax, (n_blocks, n_blocks, n_blocks)) - i, j, k = b1 * 4, b2 * 4, b3 * 4 - # Now we need to find the best swap for this triplet of blocks (we already know there is a gain available) - in_argmax = score_changes_3[i:i+4, j:j+4, k:k+4].argmax() - i1, j1, k1 = np.unravel_index(in_argmax, (4, 4, 4)) + best_blocks = max_values.argmax() + improvement_blocks = max_values.flatten()[best_blocks] + if improvement_blocks == 0: + break + + total_score_change += improvement_blocks + + # We first find the blocks that have neurons that can be rotated with a gain. + b1, b2, b3 = np.unravel_index(best_blocks, (n_blocks, n_blocks, n_blocks)) + i, j, k = b1 * ZERO_BLOCK_SIZE, b2 * ZERO_BLOCK_SIZE, b3 * ZERO_BLOCK_SIZE + + # Now we need to find the best set of neurons for this rotation in the found blocks + # (we already know there is a gain available) + local_score_changes = score_changes[i:i+ZERO_BLOCK_SIZE, j:j+ZERO_BLOCK_SIZE, k:k+ZERO_BLOCK_SIZE] + best_neurons = local_score_changes.argmax() + improvement_neurons = local_score_changes.flatten()[best_neurons] + assert improvement_blocks == improvement_neurons + i1, j1, k1 = np.unravel_index(best_neurons, (ZERO_BLOCK_SIZE, ZERO_BLOCK_SIZE, ZERO_BLOCK_SIZE)) i, j, k = i + i1, j + j1, k + k1 - cycles.append((k, j, i)) + + if VERBOSE: + print(f"Right-rotating {i}, {j}, {k} for improvement {improvement_neurons}") + + # Add the right-rotate indices. We add them in reverse order as we previously computed for a right-rotate. + cycles.append((i, j, k)) # Now silence these blocks since the scores are no longer accurate # We only need to affect the smaller array since gains of zeros and under are ignored @@ -223,7 +301,7 @@ def make_swaps_3(actmat, use_cupy=True): max_values[:, b, :] = 0 max_values[:, :, b] = 0 - total_improvement = total_score_change / n_samples / (n_neurons//4) *100 + total_improvement = total_score_change / n_samples / (n_neurons//4) * 100 print(f"Time elapsed: {time.time() - start_time:0.3f}") print(f"Improvement this iteration: {total_improvement:0.3f}") return cycles, total_improvement @@ -231,35 +309,50 @@ def make_swaps_3(actmat, use_cupy=True): def find_perm_impl(actmat): actmat = np.reshape(actmat, (actmat.shape[0] * 2, actmat.shape[1]//2)) - actmat = cp.asarray(actmat, dtype=cp.int8) + if USE_CUPY: + actmat = cp.asarray(actmat, dtype=cp.int8) actmat_orig = actmat.copy() + total_score_change = 0 perm = np.arange(M.L1 // 2) - stage1 = True - stop_after_stage1 = False - fails_in_a_row = 0 + + stages = [make_swaps_2, make_swaps_3] + # The optimization routines are deterministic, so no need to retry. + stages_max_fails = [0, 0] + stage_id = 0 + stop_after_stage = None + num_fails = 0 + for i in range(50): - swap_fn = make_swaps_2 if stage1 else make_swaps_3 print("Iteration", i+1) + + # Choose the current stage optimization function + swap_fn = stages[stage_id] + + # Apply the current permutation to get the current best neuron order. 
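+        # (perm is always applied to the untouched original matrix, so it accumulates
+        # the composition of every swap/rotate accepted so far.)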
actmat = actmat_orig[:, perm] - swaps, score_change = swap_fn(actmat) + + # Calculate a set of independent right rotates (so swaps for 2 element case) + # that when applied improve the objective function + swaps, score_change = swap_fn(actmat, USE_CUPY) for cycle in swaps: - apply_cycle(perm, cycle) + # Update the current best permutation with the newly found adjustments. + apply_rotate_right(perm, cycle) total_score_change += score_change - print("Total improvement:", total_score_change) - print() + print(f'Total improvement: {total_score_change}\n') + if score_change == 0: - fails_in_a_row += 1 - if fails_in_a_row == 2 or stop_after_stage1: - print("No more improvement possible.") - break - else: - stage1=not stage1 - print(f"Switching to stage {1 if stage1 else 2}") - - else: - fails_in_a_row = 0 + num_fails += 1 + if num_fails > stages_max_fails[stage_id]: + num_fails = 0 + stage_id += 1 + + if stage_id >= len(stages) or (stop_after_stage is not None and stage_id > stop_after_stage): + print('No more improvement possible.') + break + + print(f'Switching to stage {stage_id}') return perm