# NOTE(review): this section relies on `time`, `np` (numpy), `cp` (cupy) and the
# model module `M` being imported at the top of ftperm.py. Those imports are not
# part of the excerpt this code was reconstructed from -- confirm they are present.

# Number of consecutive neurons grouped into one block; blocks are the unit
# whose "all zero" state is being optimized.
ZERO_BLOCK_SIZE = 4
# When True, print every selected swap / rotation.
VERBOSE = False
# When True, run the heavy array math through cupy instead of numpy.
USE_CUPY = False


def batched(arr, batch_size):
    '''
    Utility generator that yields consecutive chunks of `arr` taken along its
    first axis. Every chunk has `batch_size` rows except possibly the last one.

    Expects `arr` to be a numpy-like array (needs `.shape` and slicing).
    Raises ValueError for a non-positive `batch_size`, which would otherwise
    never advance.
    '''
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    n_samples = arr.shape[0]
    for start in range(0, n_samples, batch_size):
        # Slicing clamps at the end of the array, so no explicit min() is needed.
        yield arr[start:start + batch_size]


def apply_swap(perm, i, j):
    '''
    Swap `i`-th and `j`-th elements in the array `perm` (in place).
    '''
    perm[i], perm[j] = perm[j], perm[i]


def apply_rotate_right(perm, indices):
    '''
    Rotates right (in place) the values in `perm` at selected indices `indices`.
    The rotation is performed as if the selected indices were laid out in the
    order specified in `indices`: the value at `indices[0]` moves to
    `indices[1]`, ..., and the value at `indices[-1]` wraps to `indices[0]`.
    For two indices this is a plain swap; for no indices it is a no-op.
    '''
    values = [perm[i] for i in indices]
    # values[-1:] (rather than [values[-1]]) keeps the empty case well defined.
    rotated = values[-1:] + values[:-1]
    for index, value in zip(indices, rotated):
        perm[index] = value


def get_swapped_zero_positive_count(actmat_flat, use_cupy=True):
    '''
    For one batch, count for every pair (i, j) the samples in which neuron i is
    zero while all *other* neurons in the block of j are zero.

    `actmat_flat` is a matrix of shape (N, n_neurons) with a truthy value
    meaning "this activation is 0". n_neurons must be divisible by
    ZERO_BLOCK_SIZE. Returns an (n_neurons, n_neurons) matrix (cupy array when
    `use_cupy` is set, numpy array otherwise).
    '''
    if use_cupy:
        actmat_flat = cp.asarray(actmat_flat, dtype=cp.int8)

    shape = actmat_flat.shape
    # Group into blocks that are processed at once during inference.
    actmat_chunked = actmat_flat.reshape(
        (shape[0], shape[1] // ZERO_BLOCK_SIZE, ZERO_BLOCK_SIZE))

    if use_cupy:
        # Calculate number of zeros in each block.
        num_zeros = cp.sum(actmat_chunked, axis=2, keepdims=True)
        # Broadcast back to the same shape as actmat_chunked so it's easier to work with.
        num_zeros = cp.tile(num_zeros, (1, 1, ZERO_BLOCK_SIZE))

        # Marks an element if all other elements in its block are zero.
        #
        # Example (three consecutive blocks of one sample):
        #   num_zeros           = [3, 3, 3, 3], [1, 1, 1, 1], [4, 4, 4, 4]
        #   actmat_chunked      = [1, 1, 0, 1], [0, 0, 1, 0], [1, 1, 1, 1]
        #   rest_zero_indicator = [0, 0, 1, 0], [0, 0, 0, 0], [1, 1, 1, 1]
        rest_zero_indicator = (
            num_zeros - actmat_chunked == ZERO_BLOCK_SIZE - 1
        ).reshape(shape).astype(cp.int8)

        # Sum over all pairs of (element of actmat_flat, element of rest_zero_indicator)
        # within a sample, aggregated over the whole batch.
        # This tells us how much "good" placing neuron i at position j would do. It doesn't
        # consider how much "bad" it would do; that is accounted for later, for performance reasons.
        swapped_zero_count = cp.einsum(
            'bi,bj->ij', actmat_flat, rest_zero_indicator, dtype=int)
    else:
        # Same operation but with numpy.
        num_zeros = np.sum(actmat_chunked, axis=2, keepdims=True)
        num_zeros = np.tile(num_zeros, (1, 1, ZERO_BLOCK_SIZE))

        rest_zero_indicator = (
            num_zeros - actmat_chunked == ZERO_BLOCK_SIZE - 1
        ).reshape(shape).astype(int)

        swapped_zero_count = np.einsum('bi,bj->ij', actmat_flat, rest_zero_indicator)

    return swapped_zero_count


def get_swapped_zero_increase(actmat, use_cupy=True):
    '''
    Returns an (n_neurons, n_neurons) numpy matrix whose entry [i, j] is the
    change in the number of all-zero blocks caused by placing neuron i at the
    position of neuron j (gain of the new placement minus the value of the
    current placement at j).
    '''
    n_neurons = actmat.shape[1]
    swapped_zero_count = 0

    # Process in batches since the arrays are too large.
    # TODO: Find a good batch size. Try lowest as possible as VRAM is an issue on low end devices.
    BATCH_SIZE = 10000
    for actmat_batch in batched(actmat, BATCH_SIZE):
        swapped_zero_count += get_swapped_zero_positive_count(actmat_batch, use_cupy=use_cupy)

    # Subtract from each column j the positive value of the current placement at j
    # (the diagonal). This is the place where we account for how much "bad" a move
    # would do. It is done here, once, because the subtraction distributes over the
    # per-batch sums accumulated above.
    if use_cupy:
        swapped_zero_increase = swapped_zero_count - cp.reshape(cp.diag(swapped_zero_count), (1, n_neurons))
        swapped_zero_increase = cp.asnumpy(swapped_zero_increase)
    else:
        swapped_zero_increase = swapped_zero_count - np.reshape(np.diag(swapped_zero_count), (1, n_neurons))

    return swapped_zero_increase


def get_score_change(actmat, use_cupy=True):
    '''
    Returns the (n_neurons, n_neurons) matrix of score changes for moving
    neuron i to the position of neuron j, with moves inside a single block
    forced to 0.

    `actmat` is a matrix of shape (N, n_neurons) with a truthy value meaning 0.
    '''
    n_neurons = actmat.shape[1]

    score_change = get_swapped_zero_increase(actmat, use_cupy)

    # Kill off swaps between neurons in the same block.
    blocks = np.arange(n_neurons).reshape((n_neurons, 1)) // ZERO_BLOCK_SIZE
    same_block_killer = 1 - (blocks == blocks.T).astype(int)
    return score_change * same_block_killer


def _indices_in_same_block(i):
    ''' Returns a list of indices of all neurons in the same block as the i-th neuron. '''
    # Floor to the start of the block.
    base = i // ZERO_BLOCK_SIZE * ZERO_BLOCK_SIZE
    return list(range(base, base + ZERO_BLOCK_SIZE))


def make_swaps_2(actmat, use_cupy=True):
    '''
    Returns a series of independent 2-swap operations that collectively improve
    the objective function.

    Returns a tuple `(swaps, total_improvement)` where `swaps` is a list of
    `(i, j)` index pairs and `total_improvement` is the summed gain expressed
    as a percentage of (samples * blocks).
    '''
    # For each pair of nodes, we want to calculate the difference between the
    # number of all-zero blocks when swapping them.
    start_time = time.time()
    print("Starting make_swaps_2")

    n_neurons = actmat.shape[1]
    n_samples = actmat.shape[0]

    # Compute the score change of moving the i-th neuron to the j-th position.
    score_change = get_score_change(actmat, use_cupy=use_cupy)
    # Sum score_change[i, j] + score_change[j, i] to get the cumulative impact of the swap.
    score_change = score_change + score_change.T

    swaps = []
    total_score_change = 0
    while True:
        # argmax returns a flat index, so we need to recompute the position.
        i, j = np.unravel_index(np.argmax(score_change), score_change.shape)

        improvement = score_change[i, j]
        # Same-block entries are always 0, so the maximum is never negative;
        # `<=` just guards against ever accepting a non-improving swap.
        if improvement <= 0:
            break

        if VERBOSE:
            print(f"Swapping {i} and {j} for improvement {improvement}")

        # The swap is an improvement, add it to the list.
        total_score_change += improvement
        swaps.append((i, j))

        for index in _indices_in_same_block(i) + _indices_in_same_block(j):
            # Zero out the improvement for the swaps to and from blocks which had neurons swapped.
            # This ensures they won't be picked later, and therefore all swaps will be independent.
            score_change[:, index] = 0
            score_change[index, :] = 0

    total_improvement = total_score_change / n_samples / (n_neurons // ZERO_BLOCK_SIZE) * 100

    print(f"Time elapsed: {time.time() - start_time:0.3f}")
    print(f"Improvement this iteration: {total_improvement:0.3f}")
    return swaps, total_improvement


def make_swaps_3(actmat, use_cupy=True):
    '''
    Returns a series of independent right-rotate operations (3-cycles) that
    collectively improve the objective function.

    Returns a tuple `(cycles, total_improvement)` where every cycle `(i, j, k)`
    is meant to be applied with `apply_rotate_right` (neuron at i moves to j,
    j to k, k to i) and `total_improvement` is the summed gain expressed as a
    percentage of (samples * blocks).
    '''
    # For each triplet of nodes, we want to calculate the change in score when moving them in a cycle.
    print("Starting make_swaps_3")
    start_time = time.time()

    n_neurons = actmat.shape[1]
    n_samples = actmat.shape[0]
    n_blocks = n_neurons // ZERO_BLOCK_SIZE

    score_changes = get_score_change(actmat, use_cupy=use_cupy)

    # For each neuron i, j, k we sum score_change[i, j] + score_change[j, k] + score_change[k, i].
    # This is the cumulative impact of the right-rotation.
    score_changes = score_changes[:, :, None] + score_changes[None, :, :] + (score_changes.T)[:, None, :]

    compressed_shape = (n_blocks, ZERO_BLOCK_SIZE) * 3
    cycles = []
    total_score_change = 0

    # We don't want to have to go through an enormous array so compress it to
    # represent blocks rather than neurons (best gain per triplet of blocks).
    if use_cupy:
        # Cupy doesn't support a list of axes so we go one by one.
        max_values = cp.amax(cp.reshape(score_changes, compressed_shape), axis=5, keepdims=False)
        max_values = cp.amax(max_values, axis=3, keepdims=False)
        max_values = cp.amax(max_values, axis=1, keepdims=False)
    else:
        # numpy requires a tuple (not a list) when reducing over several axes.
        max_values = np.amax(np.reshape(score_changes, compressed_shape), axis=(1, 3, 5), keepdims=False)

    # Kill rotates that would only affect two different blocks.
    # Checking them is pointless, they should have been done by 2-swap procedure,
    # as one swap will always be redundant.
    for block in range(n_blocks):
        max_values[block, block, :] = 0
        max_values[block, :, block] = 0
        max_values[:, block, block] = 0

    while True:
        best_blocks = max_values.argmax()
        improvement_blocks = max_values.flatten()[best_blocks]
        # Killed entries are 0, so the maximum is never negative; `<=` just
        # guards against ever accepting a non-improving rotation.
        if improvement_blocks <= 0:
            break

        total_score_change += improvement_blocks

        # We first find the blocks that have neurons that can be rotated with a gain.
        b1, b2, b3 = np.unravel_index(best_blocks, (n_blocks, n_blocks, n_blocks))
        i, j, k = b1 * ZERO_BLOCK_SIZE, b2 * ZERO_BLOCK_SIZE, b3 * ZERO_BLOCK_SIZE

        # Now we need to find the best set of neurons for this rotation in the found blocks
        # (we already know there is a gain available).
        local_score_changes = score_changes[i:i+ZERO_BLOCK_SIZE, j:j+ZERO_BLOCK_SIZE, k:k+ZERO_BLOCK_SIZE]
        best_neurons = local_score_changes.argmax()
        improvement_neurons = local_score_changes.flatten()[best_neurons]
        assert improvement_blocks == improvement_neurons
        i1, j1, k1 = np.unravel_index(best_neurons, (ZERO_BLOCK_SIZE, ZERO_BLOCK_SIZE, ZERO_BLOCK_SIZE))
        i, j, k = i + i1, j + j1, k + k1

        if VERBOSE:
            print(f"Right-rotating {i}, {j}, {k} for improvement {improvement_neurons}")

        # Add the right-rotate indices in the order the score was computed for:
        # i -> j, j -> k, k -> i, which is exactly what apply_rotate_right does.
        cycles.append((i, j, k))

        # Now silence these blocks since the scores are no longer accurate.
        # We only need to affect the smaller array since gains of zeros and under are ignored.
        # NOTE(review): the `for` header and the first assignment below sit in a gap
        # between two excerpted regions and were reconstructed -- confirm against the file.
        for b in (b1, b2, b3):
            max_values[b, :, :] = 0
            max_values[:, b, :] = 0
            max_values[:, :, b] = 0

    total_improvement = total_score_change / n_samples / n_blocks * 100
    print(f"Time elapsed: {time.time() - start_time:0.3f}")
    print(f"Improvement this iteration: {total_improvement:0.3f}")
    return cycles, total_improvement


def find_perm_impl(actmat):
    '''
    Iteratively builds a neuron permutation by running the optimization stages
    (2-swaps first, then 3-rotations) until a stage stops improving or the
    iteration limit of 50 is hit.

    `actmat` is reshaped from (N, W) to (2N, W/2) before processing.
    Returns the permutation as a numpy array of length `M.L1 // 2`.
    '''
    actmat = np.reshape(actmat, (actmat.shape[0] * 2, actmat.shape[1]//2))
    if USE_CUPY:
        actmat = cp.asarray(actmat, dtype=cp.int8)
    actmat_orig = actmat.copy()

    total_score_change = 0
    perm = np.arange(M.L1 // 2)

    stages = [make_swaps_2, make_swaps_3]
    # The optimization routines are deterministic, so no need to retry.
    stages_max_fails = [0, 0]
    stage_id = 0
    stop_after_stage = None
    num_fails = 0

    for i in range(50):
        print("Iteration", i+1)

        # Choose the current stage optimization function.
        swap_fn = stages[stage_id]

        # Apply the current permutation to get the current best neuron order.
        actmat = actmat_orig[:, perm]

        # Calculate a set of independent right rotates (so swaps for 2 element case)
        # that when applied improve the objective function.
        swaps, score_change = swap_fn(actmat, USE_CUPY)
        for cycle in swaps:
            # Update the current best permutation with the newly found adjustments.
            apply_rotate_right(perm, cycle)

        total_score_change += score_change
        print(f'Total improvement: {total_score_change}\n')

        if score_change == 0:
            num_fails += 1
            if num_fails > stages_max_fails[stage_id]:
                # This stage is exhausted; move on to the next one (or finish).
                num_fails = 0
                stage_id += 1

                if stage_id >= len(stages) or (stop_after_stage is not None and stage_id > stop_after_stage):
                    print('No more improvement possible.')
                    break

                print(f'Switching to stage {stage_id}')

    return perm