diff --git a/Cargo.toml b/Cargo.toml index 085137f5f6..fdd4178a3d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -46,7 +46,6 @@ include = [ "crypto/curve25519/curve25519_64_adx.c", "crypto/curve25519/curve25519_tables.h", "crypto/curve25519/internal.h", - "crypto/fipsmodule/aes/aes_nohw.c", "crypto/fipsmodule/aes/asm/aesni-x86.pl", "crypto/fipsmodule/aes/asm/aesni-x86_64.pl", "crypto/fipsmodule/aes/asm/aesv8-armx.pl", diff --git a/build.rs b/build.rs index 01ed161d14..863444c664 100644 --- a/build.rs +++ b/build.rs @@ -54,7 +54,6 @@ const WASM32: &str = "wasm32"; #[rustfmt::skip] const RING_SRCS: &[(&[&str], &str)] = &[ (&[], "crypto/curve25519/curve25519.c"), - (&[], "crypto/fipsmodule/aes/aes_nohw.c"), (&[], "crypto/fipsmodule/bn/montgomery.c"), (&[], "crypto/fipsmodule/bn/montgomery_inv.c"), (&[], "crypto/fipsmodule/ec/ecp_nistz.c"), @@ -884,9 +883,6 @@ fn prefix_all_symbols(pp: char, prefix_prefix: &str, prefix: &str) -> String { "aes_hw_ctr32_encrypt_blocks", "aes_hw_encrypt", "aes_hw_set_encrypt_key", - "aes_nohw_ctr32_encrypt_blocks", - "aes_nohw_encrypt", - "aes_nohw_set_encrypt_key", "aesni_gcm_decrypt", "aesni_gcm_encrypt", "bn_from_montgomery_in_place", diff --git a/crypto/fipsmodule/aes/aes_nohw.c b/crypto/fipsmodule/aes/aes_nohw.c deleted file mode 100644 index 731178516d..0000000000 --- a/crypto/fipsmodule/aes/aes_nohw.c +++ /dev/null @@ -1,959 +0,0 @@ -/* Copyright (c) 2019, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - -#include - -#include "../../internal.h" - -#if defined(OPENSSL_SSE2) -#include -#endif - - -// This file contains a constant-time implementation of AES, bitsliced with -// 32-bit, 64-bit, or 128-bit words, operating on two-, four-, and eight-block -// batches, respectively. The 128-bit implementation requires SSE2 intrinsics. -// -// This implementation is based on the algorithms described in the following -// references: -// - https://bearssl.org/constanttime.html#aes -// - https://eprint.iacr.org/2009/129.pdf -// - https://eprint.iacr.org/2009/191.pdf - - -// Word operations. -// -// An aes_word_t is the word used for this AES implementation. Throughout this -// file, bits and bytes are ordered little-endian, though "left" and "right" -// shifts match the operations themselves, which makes them reversed in a -// little-endian, left-to-right reading. -// -// Eight |aes_word_t|s contain |AES_NOHW_BATCH_SIZE| blocks. The bits in an -// |aes_word_t| are divided into 16 consecutive groups of |AES_NOHW_BATCH_SIZE| -// bits each, each corresponding to a byte in an AES block in column-major -// order (AES's byte order). We refer to these as "logical bytes". Note, in the -// 32-bit and 64-bit implementations, they are smaller than a byte. (The -// contents of a logical byte will be described later.) 
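// [Editor's note] A hypothetical sketch, not part of this patch, to make the
// "logical byte" layout described above concrete for the 64-bit variant, where
// AES_NOHW_BATCH_SIZE is 4: a word is 16 logical bytes of 4 bits each, and
// logical byte |j| occupies bits j*4 .. j*4 + 4 of the word.
fn logical_byte(word: u64, j: u32) -> u64 {
    const BATCH_SIZE: u32 = 4; // the 64-bit variant
    (word >> (j * BATCH_SIZE)) & ((1u64 << BATCH_SIZE) - 1)
}

fn main() {
    let w = 0x0123_4567_89ab_cdefu64;
    assert_eq!(logical_byte(w, 0), 0xf); // least-significant logical byte
    assert_eq!(logical_byte(w, 15), 0x0); // most-significant logical byte
}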
-// -// MSVC does not support C bit operators on |__m128i|, so the wrapper functions -// |aes_nohw_and|, etc., should be used instead. Note |aes_nohw_shift_left| and -// |aes_nohw_shift_right| measure the shift in logical bytes. That is, the shift -// value ranges from 0 to 15 independent of |aes_word_t| and -// |AES_NOHW_BATCH_SIZE|. -// -// This ordering is different from https://eprint.iacr.org/2009/129.pdf, which -// uses row-major order. Matching the AES order was easier to reason about, and -// we do not have PSHUFB available to arbitrarily permute bytes. - -#if defined(OPENSSL_SSE2) -typedef __m128i aes_word_t; -// AES_NOHW_WORD_SIZE is sizeof(aes_word_t). alignas(sizeof(T)) does not work in -// MSVC, so we define a constant. -#define AES_NOHW_WORD_SIZE 16 -#define AES_NOHW_BATCH_SIZE 8 -#define AES_NOHW_ROW0_MASK \ - _mm_set_epi32(0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff) -#define AES_NOHW_ROW1_MASK \ - _mm_set_epi32(0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00) -#define AES_NOHW_ROW2_MASK \ - _mm_set_epi32(0x00ff0000, 0x00ff0000, 0x00ff0000, 0x00ff0000) -#define AES_NOHW_ROW3_MASK \ - _mm_set_epi32(0xff000000, 0xff000000, 0xff000000, 0xff000000) - -static inline aes_word_t aes_nohw_and(aes_word_t a, aes_word_t b) { - return _mm_and_si128(a, b); -} - -static inline aes_word_t aes_nohw_or(aes_word_t a, aes_word_t b) { - return _mm_or_si128(a, b); -} - -static inline aes_word_t aes_nohw_xor(aes_word_t a, aes_word_t b) { - return _mm_xor_si128(a, b); -} - -static inline aes_word_t aes_nohw_not(aes_word_t a) { - return _mm_xor_si128( - a, _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff)); -} - -// These are macros because parameters to |_mm_slli_si128| and |_mm_srli_si128| -// must be constants. -#define aes_nohw_shift_left(/* aes_word_t */ a, /* const */ i) \ - _mm_slli_si128((a), (i)) -#define aes_nohw_shift_right(/* aes_word_t */ a, /* const */ i) \ - _mm_srli_si128((a), (i)) -#else // !OPENSSL_SSE2 -#if defined(OPENSSL_64_BIT) -typedef uint64_t aes_word_t; -#define AES_NOHW_WORD_SIZE 8 -#define AES_NOHW_BATCH_SIZE 4 -#define AES_NOHW_ROW0_MASK UINT64_C(0x000f000f000f000f) -#define AES_NOHW_ROW1_MASK UINT64_C(0x00f000f000f000f0) -#define AES_NOHW_ROW2_MASK UINT64_C(0x0f000f000f000f00) -#define AES_NOHW_ROW3_MASK UINT64_C(0xf000f000f000f000) -#else // !OPENSSL_64_BIT -typedef uint32_t aes_word_t; -#define AES_NOHW_WORD_SIZE 4 -#define AES_NOHW_BATCH_SIZE 2 -#define AES_NOHW_ROW0_MASK 0x03030303 -#define AES_NOHW_ROW1_MASK 0x0c0c0c0c -#define AES_NOHW_ROW2_MASK 0x30303030 -#define AES_NOHW_ROW3_MASK 0xc0c0c0c0 -#endif // OPENSSL_64_BIT - -static inline aes_word_t aes_nohw_and(aes_word_t a, aes_word_t b) { - return a & b; -} - -static inline aes_word_t aes_nohw_or(aes_word_t a, aes_word_t b) { - return a | b; -} - -static inline aes_word_t aes_nohw_xor(aes_word_t a, aes_word_t b) { - return a ^ b; -} - -static inline aes_word_t aes_nohw_not(aes_word_t a) { return ~a; } - -static inline aes_word_t aes_nohw_shift_left(aes_word_t a, aes_word_t i) { - return a << (i * AES_NOHW_BATCH_SIZE); -} - -static inline aes_word_t aes_nohw_shift_right(aes_word_t a, aes_word_t i) { - return a >> (i * AES_NOHW_BATCH_SIZE); -} -#endif // OPENSSL_SSE2 - -OPENSSL_STATIC_ASSERT(AES_NOHW_BATCH_SIZE * 128 == 8 * 8 * sizeof(aes_word_t), - "batch size does not match word size"); -OPENSSL_STATIC_ASSERT(AES_NOHW_WORD_SIZE == sizeof(aes_word_t), - "AES_NOHW_WORD_SIZE is incorrect"); - - -// Block representations. -// -// This implementation uses three representations for AES blocks. 
First, the -// public API represents blocks as uint8_t[16] in the usual way. Second, most -// AES steps are evaluated in bitsliced form, stored in an |AES_NOHW_BATCH|. -// This stores |AES_NOHW_BATCH_SIZE| blocks in bitsliced order. For 64-bit words -// containing bitsliced blocks a, b, c, d, this would be as follows (vertical -// bars divide logical bytes): -// -// batch.w[0] = a0 b0 c0 d0 | a8 b8 c8 d8 | a16 b16 c16 d16 ... -// batch.w[1] = a1 b1 c1 d1 | a9 b9 c9 d9 | a17 b17 c17 d17 ... -// batch.w[2] = a2 b2 c2 d2 | a10 b10 c10 d10 | a18 b18 c18 d18 ... -// batch.w[3] = a3 b3 c3 d3 | a11 b11 c11 d11 | a19 b19 c19 d19 ... -// ... -// -// Finally, an individual block may be stored as an intermediate form in an -// aes_word_t[AES_NOHW_BLOCK_WORDS]. In this form, we permute the bits in each -// block, so that block[0]'s ith logical byte contains least-significant -// |AES_NOHW_BATCH_SIZE| bits of byte i, block[1] contains the next group of -// |AES_NOHW_BATCH_SIZE| bits, and so on. We refer to this transformation as -// "compacting" the block. Note this is no-op with 128-bit words because then -// |AES_NOHW_BLOCK_WORDS| is one and |AES_NOHW_BATCH_SIZE| is eight. For 64-bit -// words, one block would be stored in two words: -// -// block[0] = a0 a1 a2 a3 | a8 a9 a10 a11 | a16 a17 a18 a19 ... -// block[1] = a4 a5 a6 a7 | a12 a13 a14 a15 | a20 a21 a22 a23 ... -// -// Observe that the distances between corresponding bits in bitsliced and -// compact bit orders match. If we line up corresponding words of each block, -// the bitsliced and compact representations may be converted by tranposing bits -// in corresponding logical bytes. Continuing the 64-bit example: -// -// block_a[0] = a0 a1 a2 a3 | a8 a9 a10 a11 | a16 a17 a18 a19 ... -// block_b[0] = b0 b1 b2 b3 | b8 b9 b10 b11 | b16 b17 b18 b19 ... -// block_c[0] = c0 c1 c2 c3 | c8 c9 c10 c11 | c16 c17 c18 c19 ... -// block_d[0] = d0 d1 d2 d3 | d8 d9 d10 d11 | d16 d17 d18 d19 ... -// -// batch.w[0] = a0 b0 c0 d0 | a8 b8 c8 d8 | a16 b16 c16 d16 ... -// batch.w[1] = a1 b1 c1 d1 | a9 b9 c9 d9 | a17 b17 c17 d17 ... -// batch.w[2] = a2 b2 c2 d2 | a10 b10 c10 d10 | a18 b18 c18 d18 ... -// batch.w[3] = a3 b3 c3 d3 | a11 b11 c11 d11 | a19 b19 c19 d19 ... -// -// Note also that bitwise operations and (logical) byte permutations on an -// |aes_word_t| work equally for the bitsliced and compact words. -// -// We use the compact form in the |AES_KEY| representation to save work -// inflating round keys into |AES_NOHW_BATCH|. The compact form also exists -// temporarily while moving blocks in or out of an |AES_NOHW_BATCH|, immediately -// before or after |aes_nohw_transpose|. - -#define AES_NOHW_BLOCK_WORDS (16 / sizeof(aes_word_t)) - -// An AES_NOHW_BATCH stores |AES_NOHW_BATCH_SIZE| blocks. Unless otherwise -// specified, it is in bitsliced form. -typedef struct { - aes_word_t w[8]; -} AES_NOHW_BATCH; - -// An AES_NOHW_SCHEDULE is an expanded bitsliced AES key schedule. It is -// suitable for encryption or decryption. It is as large as |AES_NOHW_BATCH| -// |AES_KEY|s so it should not be used as a long-term key representation. -typedef struct { - // keys is an array of batches, one for each round key. Each batch stores - // |AES_NOHW_BATCH_SIZE| copies of the round key in bitsliced form. - AES_NOHW_BATCH keys[AES_MAXNR + 1]; -} AES_NOHW_SCHEDULE; - -// aes_nohw_batch_set sets the |i|th block of |batch| to |in|. |batch| is in -// compact form. 
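// [Editor's note] A hypothetical sketch, not part of this patch, of what the
// conversion between compact and bitsliced order amounts to for a single
// AES_NOHW_BATCH_SIZE x AES_NOHW_BATCH_SIZE square of bits (64-bit variant,
// so 4 x 4), written out naively. The real aes_nohw_transpose below performs
// the same bit transposition on every square of every word at once, using
// constant-time swaps.
fn transpose4(square: [u8; 4]) -> [u8; 4] {
    // square[r] holds one 4-bit row; bit (r, c) moves to (c, r).
    let mut out = [0u8; 4];
    for r in 0..4 {
        for c in 0..4 {
            let bit = (square[r] >> c) & 1;
            out[c] |= bit << r;
        }
    }
    out
}

fn main() {
    // A diagonal is its own transpose; a single row becomes a single column.
    assert_eq!(transpose4([0b0001, 0b0010, 0b0100, 0b1000]),
               [0b0001, 0b0010, 0b0100, 0b1000]);
    assert_eq!(transpose4([0b1111, 0, 0, 0]), [1, 1, 1, 1]);
}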
-static inline void aes_nohw_batch_set(AES_NOHW_BATCH *batch, - const aes_word_t in[AES_NOHW_BLOCK_WORDS], - size_t i) { - // Note the words are interleaved. The order comes from |aes_nohw_transpose|. - // If |i| is zero and this is the 64-bit implementation, in[0] contains bits - // 0-3 and in[1] contains bits 4-7. We place in[0] at w[0] and in[1] at - // w[4] so that bits 0 and 4 are in the correct position. (In general, bits - // along diagonals of |AES_NOHW_BATCH_SIZE| by |AES_NOHW_BATCH_SIZE| squares - // will be correctly placed.) - dev_assert_secret(i < AES_NOHW_BATCH_SIZE); -#if defined(OPENSSL_SSE2) - batch->w[i] = in[0]; -#elif defined(OPENSSL_64_BIT) - batch->w[i] = in[0]; - batch->w[i + 4] = in[1]; -#else - batch->w[i] = in[0]; - batch->w[i + 2] = in[1]; - batch->w[i + 4] = in[2]; - batch->w[i + 6] = in[3]; -#endif -} - -// aes_nohw_batch_get writes the |i|th block of |batch| to |out|. |batch| is in -// compact form. -static inline void aes_nohw_batch_get(const AES_NOHW_BATCH *batch, - aes_word_t out[AES_NOHW_BLOCK_WORDS], - size_t i) { - dev_assert_secret(i < AES_NOHW_BATCH_SIZE); -#if defined(OPENSSL_SSE2) - out[0] = batch->w[i]; -#elif defined(OPENSSL_64_BIT) - out[0] = batch->w[i]; - out[1] = batch->w[i + 4]; -#else - out[0] = batch->w[i]; - out[1] = batch->w[i + 2]; - out[2] = batch->w[i + 4]; - out[3] = batch->w[i + 6]; -#endif -} - -#if !defined(OPENSSL_SSE2) -// aes_nohw_delta_swap returns |a| with bits |a & mask| and -// |a & (mask << shift)| swapped. |mask| and |mask << shift| may not overlap. -static inline aes_word_t aes_nohw_delta_swap(aes_word_t a, aes_word_t mask, - aes_word_t shift) { - // See - // https://reflectionsonsecurity.wordpress.com/2014/05/11/efficient-bit-permutation-using-delta-swaps/ - aes_word_t b = (a ^ (a >> shift)) & mask; - return a ^ b ^ (b << shift); -} - -// In the 32-bit and 64-bit implementations, a block spans multiple words. -// |aes_nohw_compact_block| must permute bits across different words. First we -// implement |aes_nohw_compact_word| which performs a smaller version of the -// transformation which stays within a single word. -// -// These transformations are generalizations of the output of -// http://programming.sirrida.de/calcperm.php on smaller inputs. -#if defined(OPENSSL_64_BIT) -static inline uint64_t aes_nohw_compact_word(uint64_t a) { -#if defined(RING_BIG_ENDIAN) - a = CRYPTO_bswap8(a); -#endif - // Numbering the 64/2 = 16 4-bit chunks, least to most significant, we swap - // quartets of those chunks: - // 0 1 2 3 | 4 5 6 7 | 8 9 10 11 | 12 13 14 15 => - // 0 2 1 3 | 4 6 5 7 | 8 10 9 11 | 12 14 13 15 - a = aes_nohw_delta_swap(a, UINT64_C(0x00f000f000f000f0), 4); - // Swap quartets of 8-bit chunks (still numbering by 4-bit chunks): - // 0 2 1 3 | 4 6 5 7 | 8 10 9 11 | 12 14 13 15 => - // 0 2 4 6 | 1 3 5 7 | 8 10 12 14 | 9 11 13 15 - a = aes_nohw_delta_swap(a, UINT64_C(0x0000ff000000ff00), 8); - // Swap quartets of 16-bit chunks (still numbering by 4-bit chunks): - // 0 2 4 6 | 1 3 5 7 | 8 10 12 14 | 9 11 13 15 => - // 0 2 4 6 | 8 10 12 14 | 1 3 5 7 | 9 11 13 15 - a = aes_nohw_delta_swap(a, UINT64_C(0x00000000ffff0000), 16); - return a; -} - -static inline uint64_t aes_nohw_uncompact_word(uint64_t a) { - // Reverse the steps of |aes_nohw_uncompact_word|. 
- a = aes_nohw_delta_swap(a, UINT64_C(0x00000000ffff0000), 16); - a = aes_nohw_delta_swap(a, UINT64_C(0x0000ff000000ff00), 8); - a = aes_nohw_delta_swap(a, UINT64_C(0x00f000f000f000f0), 4); -#if defined(RING_BIG_ENDIAN) - a = CRYPTO_bswap8(a); -#endif - return a; -} -#else // !OPENSSL_64_BIT -static inline uint32_t aes_nohw_compact_word(uint32_t a) { -#if defined(RING_BIG_ENDIAN) - a = CRYPTO_bswap4(a); -#endif - // Numbering the 32/2 = 16 pairs of bits, least to most significant, we swap: - // 0 1 2 3 | 4 5 6 7 | 8 9 10 11 | 12 13 14 15 => - // 0 4 2 6 | 1 5 3 7 | 8 12 10 14 | 9 13 11 15 - // Note: 0x00cc = 0b0000_0000_1100_1100 - // 0x00cc << 6 = 0b0011_0011_0000_0000 - a = aes_nohw_delta_swap(a, 0x00cc00cc, 6); - // Now we swap groups of four bits (still numbering by pairs): - // 0 4 2 6 | 1 5 3 7 | 8 12 10 14 | 9 13 11 15 => - // 0 4 8 12 | 1 5 9 13 | 2 6 10 14 | 3 7 11 15 - // Note: 0x0000_f0f0 << 12 = 0x0f0f_0000 - a = aes_nohw_delta_swap(a, 0x0000f0f0, 12); - return a; -} - -static inline uint32_t aes_nohw_uncompact_word(uint32_t a) { - // Reverse the steps of |aes_nohw_uncompact_word|. - a = aes_nohw_delta_swap(a, 0x0000f0f0, 12); - a = aes_nohw_delta_swap(a, 0x00cc00cc, 6); -#if defined(RING_BIG_ENDIAN) - a = CRYPTO_bswap4(a); -#endif - return a; -} - -static inline uint32_t aes_nohw_word_from_bytes(uint8_t a0, uint8_t a1, - uint8_t a2, uint8_t a3) { - return (uint32_t)a0 | ((uint32_t)a1 << 8) | ((uint32_t)a2 << 16) | - ((uint32_t)a3 << 24); -} - -static inline uint8_t lo(uint32_t a) { - return (uint8_t)a; -} - -#endif // OPENSSL_64_BIT -#endif // !OPENSSL_SSE2 - -static inline void aes_nohw_compact_block(aes_word_t out[AES_NOHW_BLOCK_WORDS], - const uint8_t in[16]) { - OPENSSL_memcpy(out, in, 16); -#if defined(OPENSSL_SSE2) - // No conversions needed. -#elif defined(OPENSSL_64_BIT) - uint64_t a0 = aes_nohw_compact_word(out[0]); - uint64_t a1 = aes_nohw_compact_word(out[1]); - out[0] = (a0 & UINT64_C(0x00000000ffffffff)) | (a1 << 32); - out[1] = (a1 & UINT64_C(0xffffffff00000000)) | (a0 >> 32); -#else - uint32_t a0 = aes_nohw_compact_word(out[0]); - uint32_t a1 = aes_nohw_compact_word(out[1]); - uint32_t a2 = aes_nohw_compact_word(out[2]); - uint32_t a3 = aes_nohw_compact_word(out[3]); - // Note clang, when building for ARM Thumb2, will sometimes miscompile - // expressions such as (a0 & 0x0000ff00) << 8, particularly when building - // without optimizations. This bug was introduced in - // https://reviews.llvm.org/rL340261 and fixed in - // https://reviews.llvm.org/rL351310. The following is written to avoid this. - out[0] = aes_nohw_word_from_bytes(lo(a0), lo(a1), lo(a2), lo(a3)); - out[1] = aes_nohw_word_from_bytes(lo(a0 >> 8), lo(a1 >> 8), lo(a2 >> 8), lo(a3 >> 8)); - out[2] = aes_nohw_word_from_bytes(lo(a0 >> 16), lo(a1 >> 16), lo(a2 >> 16), lo(a3 >> 16)); - out[3] = aes_nohw_word_from_bytes(lo(a0 >> 24), lo(a1 >> 24), lo(a2 >> 24), lo(a3 >> 24)); -#endif -} - -static inline void aes_nohw_uncompact_block( - uint8_t out[16], const aes_word_t in[AES_NOHW_BLOCK_WORDS]) { -#if defined(OPENSSL_SSE2) - OPENSSL_memcpy(out, in, 16); // No conversions needed. 
-#elif defined(OPENSSL_64_BIT) - uint64_t a0 = in[0]; - uint64_t a1 = in[1]; - uint64_t b0 = - aes_nohw_uncompact_word((a0 & UINT64_C(0x00000000ffffffff)) | (a1 << 32)); - uint64_t b1 = - aes_nohw_uncompact_word((a1 & UINT64_C(0xffffffff00000000)) | (a0 >> 32)); - OPENSSL_memcpy(out, &b0, 8); - OPENSSL_memcpy(out + 8, &b1, 8); -#else - uint32_t a0 = in[0]; - uint32_t a1 = in[1]; - uint32_t a2 = in[2]; - uint32_t a3 = in[3]; - // Note clang, when building for ARM Thumb2, will sometimes miscompile - // expressions such as (a0 & 0x0000ff00) << 8, particularly when building - // without optimizations. This bug was introduced in - // https://reviews.llvm.org/rL340261 and fixed in - // https://reviews.llvm.org/rL351310. The following is written to avoid this. - uint32_t b0 = aes_nohw_word_from_bytes(lo(a0), lo(a1), lo(a2), lo(a3)); - uint32_t b1 = aes_nohw_word_from_bytes(lo(a0 >> 8), lo(a1 >> 8), lo(a2 >> 8), lo(a3 >> 8)); - uint32_t b2 = - aes_nohw_word_from_bytes(lo(a0 >> 16), lo(a1 >> 16), lo(a2 >> 16), lo(a3 >> 16)); - uint32_t b3 = - aes_nohw_word_from_bytes(lo(a0 >> 24), lo(a1 >> 24), lo(a2 >> 24), lo(a3 >> 24)); - b0 = aes_nohw_uncompact_word(b0); - b1 = aes_nohw_uncompact_word(b1); - b2 = aes_nohw_uncompact_word(b2); - b3 = aes_nohw_uncompact_word(b3); - OPENSSL_memcpy(out, &b0, 4); - OPENSSL_memcpy(out + 4, &b1, 4); - OPENSSL_memcpy(out + 8, &b2, 4); - OPENSSL_memcpy(out + 12, &b3, 4); -#endif -} - -// aes_nohw_swap_bits is a variation on a delta swap. It swaps the bits in -// |*a & (mask << shift)| with the bits in |*b & mask|. |mask| and -// |mask << shift| must not overlap. |mask| is specified as a |uint32_t|, but it -// is repeated to the full width of |aes_word_t|. -#if defined(OPENSSL_SSE2) -// This must be a macro because |_mm_srli_epi32| and |_mm_slli_epi32| require -// constant shift values. -#define aes_nohw_swap_bits(/*__m128i* */ a, /*__m128i* */ b, \ - /* uint32_t */ mask, /* const */ shift) \ - do { \ - __m128i swap = \ - _mm_and_si128(_mm_xor_si128(_mm_srli_epi32(*(a), (shift)), *(b)), \ - _mm_set_epi32((mask), (mask), (mask), (mask))); \ - *(a) = _mm_xor_si128(*(a), _mm_slli_epi32(swap, (shift))); \ - *(b) = _mm_xor_si128(*(b), swap); \ - \ - } while (0) -#else -static inline void aes_nohw_swap_bits(aes_word_t *a, aes_word_t *b, - uint32_t mask, aes_word_t shift) { -#if defined(OPENSSL_64_BIT) - aes_word_t mask_w = (((uint64_t)mask) << 32) | mask; -#else - aes_word_t mask_w = mask; -#endif - // This is a variation on a delta swap. - aes_word_t swap = ((*a >> shift) ^ *b) & mask_w; - *a ^= swap << shift; - *b ^= swap; -} -#endif // OPENSSL_SSE2 - -// aes_nohw_transpose converts |batch| to and from bitsliced form. It divides -// the 8 × word_size bits into AES_NOHW_BATCH_SIZE × AES_NOHW_BATCH_SIZE squares -// and transposes each square. -static void aes_nohw_transpose(AES_NOHW_BATCH *batch) { - // Swap bits with index 0 and 1 mod 2 (0x55 = 0b01010101). - aes_nohw_swap_bits(&batch->w[0], &batch->w[1], 0x55555555, 1); - aes_nohw_swap_bits(&batch->w[2], &batch->w[3], 0x55555555, 1); - aes_nohw_swap_bits(&batch->w[4], &batch->w[5], 0x55555555, 1); - aes_nohw_swap_bits(&batch->w[6], &batch->w[7], 0x55555555, 1); - -#if AES_NOHW_BATCH_SIZE >= 4 - // Swap bits with index 0-1 and 2-3 mod 4 (0x33 = 0b00110011). 
- aes_nohw_swap_bits(&batch->w[0], &batch->w[2], 0x33333333, 2); - aes_nohw_swap_bits(&batch->w[1], &batch->w[3], 0x33333333, 2); - aes_nohw_swap_bits(&batch->w[4], &batch->w[6], 0x33333333, 2); - aes_nohw_swap_bits(&batch->w[5], &batch->w[7], 0x33333333, 2); -#endif - -#if AES_NOHW_BATCH_SIZE >= 8 - // Swap bits with index 0-3 and 4-7 mod 8 (0x0f = 0b00001111). - aes_nohw_swap_bits(&batch->w[0], &batch->w[4], 0x0f0f0f0f, 4); - aes_nohw_swap_bits(&batch->w[1], &batch->w[5], 0x0f0f0f0f, 4); - aes_nohw_swap_bits(&batch->w[2], &batch->w[6], 0x0f0f0f0f, 4); - aes_nohw_swap_bits(&batch->w[3], &batch->w[7], 0x0f0f0f0f, 4); -#endif -} - -// aes_nohw_to_batch initializes |out| with the |num_blocks| blocks from |in|. -// |num_blocks| must be at most |AES_NOHW_BATCH|. -static void aes_nohw_to_batch(AES_NOHW_BATCH *out, const uint8_t *in, - size_t num_blocks) { - // Don't leave unused blocks uninitialized. - OPENSSL_memset(out, 0, sizeof(AES_NOHW_BATCH)); - debug_assert_nonsecret(num_blocks <= AES_NOHW_BATCH_SIZE); - for (size_t i = 0; i < num_blocks; i++) { - aes_word_t block[AES_NOHW_BLOCK_WORDS]; - aes_nohw_compact_block(block, in + 16 * i); - aes_nohw_batch_set(out, block, i); - } - - aes_nohw_transpose(out); -} - -// aes_nohw_to_batch writes the first |num_blocks| blocks in |batch| to |out|. -// |num_blocks| must be at most |AES_NOHW_BATCH|. -static void aes_nohw_from_batch(uint8_t *out, size_t num_blocks, - const AES_NOHW_BATCH *batch) { - AES_NOHW_BATCH copy = *batch; - aes_nohw_transpose(©); - - debug_assert_nonsecret(num_blocks <= AES_NOHW_BATCH_SIZE); - for (size_t i = 0; i < num_blocks; i++) { - aes_word_t block[AES_NOHW_BLOCK_WORDS]; - aes_nohw_batch_get(©, block, i); - aes_nohw_uncompact_block(out + 16 * i, block); - } -} - - -// AES round steps. - -static void aes_nohw_add_round_key(AES_NOHW_BATCH *batch, - const AES_NOHW_BATCH *key) { - for (size_t i = 0; i < 8; i++) { - batch->w[i] = aes_nohw_xor(batch->w[i], key->w[i]); - } -} - -static void aes_nohw_sub_bytes(AES_NOHW_BATCH *batch) { - // See https://eprint.iacr.org/2009/191.pdf, Appendix C. - aes_word_t x0 = batch->w[7]; - aes_word_t x1 = batch->w[6]; - aes_word_t x2 = batch->w[5]; - aes_word_t x3 = batch->w[4]; - aes_word_t x4 = batch->w[3]; - aes_word_t x5 = batch->w[2]; - aes_word_t x6 = batch->w[1]; - aes_word_t x7 = batch->w[0]; - - // Figure 2, the top linear transformation. - aes_word_t y14 = aes_nohw_xor(x3, x5); - aes_word_t y13 = aes_nohw_xor(x0, x6); - aes_word_t y9 = aes_nohw_xor(x0, x3); - aes_word_t y8 = aes_nohw_xor(x0, x5); - aes_word_t t0 = aes_nohw_xor(x1, x2); - aes_word_t y1 = aes_nohw_xor(t0, x7); - aes_word_t y4 = aes_nohw_xor(y1, x3); - aes_word_t y12 = aes_nohw_xor(y13, y14); - aes_word_t y2 = aes_nohw_xor(y1, x0); - aes_word_t y5 = aes_nohw_xor(y1, x6); - aes_word_t y3 = aes_nohw_xor(y5, y8); - aes_word_t t1 = aes_nohw_xor(x4, y12); - aes_word_t y15 = aes_nohw_xor(t1, x5); - aes_word_t y20 = aes_nohw_xor(t1, x1); - aes_word_t y6 = aes_nohw_xor(y15, x7); - aes_word_t y10 = aes_nohw_xor(y15, t0); - aes_word_t y11 = aes_nohw_xor(y20, y9); - aes_word_t y7 = aes_nohw_xor(x7, y11); - aes_word_t y17 = aes_nohw_xor(y10, y11); - aes_word_t y19 = aes_nohw_xor(y10, y8); - aes_word_t y16 = aes_nohw_xor(t0, y11); - aes_word_t y21 = aes_nohw_xor(y13, y16); - aes_word_t y18 = aes_nohw_xor(x0, y16); - - // Figure 3, the middle non-linear section. 
- aes_word_t t2 = aes_nohw_and(y12, y15); - aes_word_t t3 = aes_nohw_and(y3, y6); - aes_word_t t4 = aes_nohw_xor(t3, t2); - aes_word_t t5 = aes_nohw_and(y4, x7); - aes_word_t t6 = aes_nohw_xor(t5, t2); - aes_word_t t7 = aes_nohw_and(y13, y16); - aes_word_t t8 = aes_nohw_and(y5, y1); - aes_word_t t9 = aes_nohw_xor(t8, t7); - aes_word_t t10 = aes_nohw_and(y2, y7); - aes_word_t t11 = aes_nohw_xor(t10, t7); - aes_word_t t12 = aes_nohw_and(y9, y11); - aes_word_t t13 = aes_nohw_and(y14, y17); - aes_word_t t14 = aes_nohw_xor(t13, t12); - aes_word_t t15 = aes_nohw_and(y8, y10); - aes_word_t t16 = aes_nohw_xor(t15, t12); - aes_word_t t17 = aes_nohw_xor(t4, t14); - aes_word_t t18 = aes_nohw_xor(t6, t16); - aes_word_t t19 = aes_nohw_xor(t9, t14); - aes_word_t t20 = aes_nohw_xor(t11, t16); - aes_word_t t21 = aes_nohw_xor(t17, y20); - aes_word_t t22 = aes_nohw_xor(t18, y19); - aes_word_t t23 = aes_nohw_xor(t19, y21); - aes_word_t t24 = aes_nohw_xor(t20, y18); - aes_word_t t25 = aes_nohw_xor(t21, t22); - aes_word_t t26 = aes_nohw_and(t21, t23); - aes_word_t t27 = aes_nohw_xor(t24, t26); - aes_word_t t28 = aes_nohw_and(t25, t27); - aes_word_t t29 = aes_nohw_xor(t28, t22); - aes_word_t t30 = aes_nohw_xor(t23, t24); - aes_word_t t31 = aes_nohw_xor(t22, t26); - aes_word_t t32 = aes_nohw_and(t31, t30); - aes_word_t t33 = aes_nohw_xor(t32, t24); - aes_word_t t34 = aes_nohw_xor(t23, t33); - aes_word_t t35 = aes_nohw_xor(t27, t33); - aes_word_t t36 = aes_nohw_and(t24, t35); - aes_word_t t37 = aes_nohw_xor(t36, t34); - aes_word_t t38 = aes_nohw_xor(t27, t36); - aes_word_t t39 = aes_nohw_and(t29, t38); - aes_word_t t40 = aes_nohw_xor(t25, t39); - aes_word_t t41 = aes_nohw_xor(t40, t37); - aes_word_t t42 = aes_nohw_xor(t29, t33); - aes_word_t t43 = aes_nohw_xor(t29, t40); - aes_word_t t44 = aes_nohw_xor(t33, t37); - aes_word_t t45 = aes_nohw_xor(t42, t41); - aes_word_t z0 = aes_nohw_and(t44, y15); - aes_word_t z1 = aes_nohw_and(t37, y6); - aes_word_t z2 = aes_nohw_and(t33, x7); - aes_word_t z3 = aes_nohw_and(t43, y16); - aes_word_t z4 = aes_nohw_and(t40, y1); - aes_word_t z5 = aes_nohw_and(t29, y7); - aes_word_t z6 = aes_nohw_and(t42, y11); - aes_word_t z7 = aes_nohw_and(t45, y17); - aes_word_t z8 = aes_nohw_and(t41, y10); - aes_word_t z9 = aes_nohw_and(t44, y12); - aes_word_t z10 = aes_nohw_and(t37, y3); - aes_word_t z11 = aes_nohw_and(t33, y4); - aes_word_t z12 = aes_nohw_and(t43, y13); - aes_word_t z13 = aes_nohw_and(t40, y5); - aes_word_t z14 = aes_nohw_and(t29, y2); - aes_word_t z15 = aes_nohw_and(t42, y9); - aes_word_t z16 = aes_nohw_and(t45, y14); - aes_word_t z17 = aes_nohw_and(t41, y8); - - // Figure 4, bottom linear transformation. 
- aes_word_t t46 = aes_nohw_xor(z15, z16); - aes_word_t t47 = aes_nohw_xor(z10, z11); - aes_word_t t48 = aes_nohw_xor(z5, z13); - aes_word_t t49 = aes_nohw_xor(z9, z10); - aes_word_t t50 = aes_nohw_xor(z2, z12); - aes_word_t t51 = aes_nohw_xor(z2, z5); - aes_word_t t52 = aes_nohw_xor(z7, z8); - aes_word_t t53 = aes_nohw_xor(z0, z3); - aes_word_t t54 = aes_nohw_xor(z6, z7); - aes_word_t t55 = aes_nohw_xor(z16, z17); - aes_word_t t56 = aes_nohw_xor(z12, t48); - aes_word_t t57 = aes_nohw_xor(t50, t53); - aes_word_t t58 = aes_nohw_xor(z4, t46); - aes_word_t t59 = aes_nohw_xor(z3, t54); - aes_word_t t60 = aes_nohw_xor(t46, t57); - aes_word_t t61 = aes_nohw_xor(z14, t57); - aes_word_t t62 = aes_nohw_xor(t52, t58); - aes_word_t t63 = aes_nohw_xor(t49, t58); - aes_word_t t64 = aes_nohw_xor(z4, t59); - aes_word_t t65 = aes_nohw_xor(t61, t62); - aes_word_t t66 = aes_nohw_xor(z1, t63); - aes_word_t s0 = aes_nohw_xor(t59, t63); - aes_word_t s6 = aes_nohw_xor(t56, aes_nohw_not(t62)); - aes_word_t s7 = aes_nohw_xor(t48, aes_nohw_not(t60)); - aes_word_t t67 = aes_nohw_xor(t64, t65); - aes_word_t s3 = aes_nohw_xor(t53, t66); - aes_word_t s4 = aes_nohw_xor(t51, t66); - aes_word_t s5 = aes_nohw_xor(t47, t65); - aes_word_t s1 = aes_nohw_xor(t64, aes_nohw_not(s3)); - aes_word_t s2 = aes_nohw_xor(t55, aes_nohw_not(t67)); - - batch->w[0] = s7; - batch->w[1] = s6; - batch->w[2] = s5; - batch->w[3] = s4; - batch->w[4] = s3; - batch->w[5] = s2; - batch->w[6] = s1; - batch->w[7] = s0; -} - -// aes_nohw_rotate_cols_right returns |v| with the columns in each row rotated -// to the right by |n|. This is a macro because |aes_nohw_shift_*| require -// constant shift counts in the SSE2 implementation. -#define aes_nohw_rotate_cols_right(/* aes_word_t */ v, /* const */ n) \ - (aes_nohw_or(aes_nohw_shift_right((v), (n)*4), \ - aes_nohw_shift_left((v), 16 - (n)*4))) - -static void aes_nohw_shift_rows(AES_NOHW_BATCH *batch) { - for (size_t i = 0; i < 8; i++) { - aes_word_t row0 = aes_nohw_and(batch->w[i], AES_NOHW_ROW0_MASK); - aes_word_t row1 = aes_nohw_and(batch->w[i], AES_NOHW_ROW1_MASK); - aes_word_t row2 = aes_nohw_and(batch->w[i], AES_NOHW_ROW2_MASK); - aes_word_t row3 = aes_nohw_and(batch->w[i], AES_NOHW_ROW3_MASK); - row1 = aes_nohw_rotate_cols_right(row1, 1); - row2 = aes_nohw_rotate_cols_right(row2, 2); - row3 = aes_nohw_rotate_cols_right(row3, 3); - batch->w[i] = aes_nohw_or(aes_nohw_or(row0, row1), aes_nohw_or(row2, row3)); - } -} - -// aes_nohw_rotate_rows_down returns |v| with the rows in each column rotated -// down by one. -static inline aes_word_t aes_nohw_rotate_rows_down(aes_word_t v) { -#if defined(OPENSSL_SSE2) - return _mm_or_si128(_mm_srli_epi32(v, 8), _mm_slli_epi32(v, 24)); -#elif defined(OPENSSL_64_BIT) - return ((v >> 4) & UINT64_C(0x0fff0fff0fff0fff)) | - ((v << 12) & UINT64_C(0xf000f000f000f000)); -#else - return ((v >> 2) & 0x3f3f3f3f) | ((v << 6) & 0xc0c0c0c0); -#endif -} - -// aes_nohw_rotate_rows_twice returns |v| with the rows in each column rotated -// by two. -static inline aes_word_t aes_nohw_rotate_rows_twice(aes_word_t v) { -#if defined(OPENSSL_SSE2) - return _mm_or_si128(_mm_srli_epi32(v, 16), _mm_slli_epi32(v, 16)); -#elif defined(OPENSSL_64_BIT) - return ((v >> 8) & UINT64_C(0x00ff00ff00ff00ff)) | - ((v << 8) & UINT64_C(0xff00ff00ff00ff00)); -#else - return ((v >> 4) & 0x0f0f0f0f) | ((v << 4) & 0xf0f0f0f0); -#endif -} - -static void aes_nohw_mix_columns(AES_NOHW_BATCH *batch) { - // See https://eprint.iacr.org/2009/129.pdf, section 4.4 and appendix A. 
- aes_word_t a0 = batch->w[0]; - aes_word_t a1 = batch->w[1]; - aes_word_t a2 = batch->w[2]; - aes_word_t a3 = batch->w[3]; - aes_word_t a4 = batch->w[4]; - aes_word_t a5 = batch->w[5]; - aes_word_t a6 = batch->w[6]; - aes_word_t a7 = batch->w[7]; - - aes_word_t r0 = aes_nohw_rotate_rows_down(a0); - aes_word_t a0_r0 = aes_nohw_xor(a0, r0); - aes_word_t r1 = aes_nohw_rotate_rows_down(a1); - aes_word_t a1_r1 = aes_nohw_xor(a1, r1); - aes_word_t r2 = aes_nohw_rotate_rows_down(a2); - aes_word_t a2_r2 = aes_nohw_xor(a2, r2); - aes_word_t r3 = aes_nohw_rotate_rows_down(a3); - aes_word_t a3_r3 = aes_nohw_xor(a3, r3); - aes_word_t r4 = aes_nohw_rotate_rows_down(a4); - aes_word_t a4_r4 = aes_nohw_xor(a4, r4); - aes_word_t r5 = aes_nohw_rotate_rows_down(a5); - aes_word_t a5_r5 = aes_nohw_xor(a5, r5); - aes_word_t r6 = aes_nohw_rotate_rows_down(a6); - aes_word_t a6_r6 = aes_nohw_xor(a6, r6); - aes_word_t r7 = aes_nohw_rotate_rows_down(a7); - aes_word_t a7_r7 = aes_nohw_xor(a7, r7); - - batch->w[0] = - aes_nohw_xor(aes_nohw_xor(a7_r7, r0), aes_nohw_rotate_rows_twice(a0_r0)); - batch->w[1] = - aes_nohw_xor(aes_nohw_xor(a0_r0, a7_r7), - aes_nohw_xor(r1, aes_nohw_rotate_rows_twice(a1_r1))); - batch->w[2] = - aes_nohw_xor(aes_nohw_xor(a1_r1, r2), aes_nohw_rotate_rows_twice(a2_r2)); - batch->w[3] = - aes_nohw_xor(aes_nohw_xor(a2_r2, a7_r7), - aes_nohw_xor(r3, aes_nohw_rotate_rows_twice(a3_r3))); - batch->w[4] = - aes_nohw_xor(aes_nohw_xor(a3_r3, a7_r7), - aes_nohw_xor(r4, aes_nohw_rotate_rows_twice(a4_r4))); - batch->w[5] = - aes_nohw_xor(aes_nohw_xor(a4_r4, r5), aes_nohw_rotate_rows_twice(a5_r5)); - batch->w[6] = - aes_nohw_xor(aes_nohw_xor(a5_r5, r6), aes_nohw_rotate_rows_twice(a6_r6)); - batch->w[7] = - aes_nohw_xor(aes_nohw_xor(a6_r6, r7), aes_nohw_rotate_rows_twice(a7_r7)); -} - -static void aes_nohw_encrypt_batch(const AES_NOHW_SCHEDULE *key, - size_t num_rounds, AES_NOHW_BATCH *batch) { - aes_nohw_add_round_key(batch, &key->keys[0]); - for (size_t i = 1; i < num_rounds; i++) { - aes_nohw_sub_bytes(batch); - aes_nohw_shift_rows(batch); - aes_nohw_mix_columns(batch); - aes_nohw_add_round_key(batch, &key->keys[i]); - } - aes_nohw_sub_bytes(batch); - aes_nohw_shift_rows(batch); - aes_nohw_add_round_key(batch, &key->keys[num_rounds]); -} - -// Key schedule. - -static void aes_nohw_expand_round_keys(AES_NOHW_SCHEDULE *out, - const AES_KEY *key) { - for (size_t i = 0; i <= key->rounds; i++) { - // Copy the round key into each block in the batch. - for (size_t j = 0; j < AES_NOHW_BATCH_SIZE; j++) { - aes_word_t tmp[AES_NOHW_BLOCK_WORDS]; - OPENSSL_memcpy(tmp, key->rd_key + 4 * i, 16); - aes_nohw_batch_set(&out->keys[i], tmp, j); - } - aes_nohw_transpose(&out->keys[i]); - } -} - -static const uint8_t aes_nohw_rcon[10] = {0x01, 0x02, 0x04, 0x08, 0x10, - 0x20, 0x40, 0x80, 0x1b, 0x36}; - -// aes_nohw_rcon_slice returns the |i|th group of |AES_NOHW_BATCH_SIZE| bits in -// |rcon|, stored in a |aes_word_t|. 
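// [Editor's note] A hypothetical check, not part of this patch: the
// aes_nohw_rcon table above is the standard AES round-constant sequence,
// produced by repeated doubling in GF(2^8) modulo x^8 + x^4 + x^3 + x + 1.
fn xtime(b: u8) -> u8 {
    let reduce = if b & 0x80 != 0 { 0x1b } else { 0x00 };
    (b << 1) ^ reduce
}

fn main() {
    let mut rcon = [0u8; 10];
    rcon[0] = 0x01;
    for i in 1..10 {
        rcon[i] = xtime(rcon[i - 1]);
    }
    assert_eq!(
        rcon,
        [0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36]
    );
}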
-static inline aes_word_t aes_nohw_rcon_slice(uint8_t rcon, size_t i) { - rcon = (rcon >> (i * AES_NOHW_BATCH_SIZE)) & ((1 << AES_NOHW_BATCH_SIZE) - 1); -#if defined(OPENSSL_SSE2) - return _mm_set_epi32(0, 0, 0, rcon); -#else - return ((aes_word_t)rcon); -#endif -} - -static void aes_nohw_sub_block(aes_word_t out[AES_NOHW_BLOCK_WORDS], - const aes_word_t in[AES_NOHW_BLOCK_WORDS]) { - AES_NOHW_BATCH batch; - OPENSSL_memset(&batch, 0, sizeof(batch)); - aes_nohw_batch_set(&batch, in, 0); - aes_nohw_transpose(&batch); - aes_nohw_sub_bytes(&batch); - aes_nohw_transpose(&batch); - aes_nohw_batch_get(&batch, out, 0); -} - -static void aes_nohw_setup_key_128(AES_KEY *key, const uint8_t in[16]) { - key->rounds = 10; - - aes_word_t block[AES_NOHW_BLOCK_WORDS]; - aes_nohw_compact_block(block, in); - OPENSSL_memcpy(key->rd_key, block, 16); - - for (size_t i = 1; i <= 10; i++) { - aes_word_t sub[AES_NOHW_BLOCK_WORDS]; - aes_nohw_sub_block(sub, block); - uint8_t rcon = aes_nohw_rcon[i - 1]; - for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) { - // Incorporate |rcon| and the transformed word into the first word. - block[j] = aes_nohw_xor(block[j], aes_nohw_rcon_slice(rcon, j)); - block[j] = aes_nohw_xor( - block[j], - aes_nohw_shift_right(aes_nohw_rotate_rows_down(sub[j]), 12)); - // Propagate to the remaining words. Note this is reordered from the usual - // formulation to avoid needing masks. - aes_word_t v = block[j]; - block[j] = aes_nohw_xor(block[j], aes_nohw_shift_left(v, 4)); - block[j] = aes_nohw_xor(block[j], aes_nohw_shift_left(v, 8)); - block[j] = aes_nohw_xor(block[j], aes_nohw_shift_left(v, 12)); - } - OPENSSL_memcpy(key->rd_key + 4 * i, block, 16); - } -} - -static void aes_nohw_setup_key_256(AES_KEY *key, const uint8_t in[32]) { - key->rounds = 14; - - // Each key schedule iteration produces two round keys. - aes_word_t block1[AES_NOHW_BLOCK_WORDS], block2[AES_NOHW_BLOCK_WORDS]; - aes_nohw_compact_block(block1, in); - OPENSSL_memcpy(key->rd_key, block1, 16); - - aes_nohw_compact_block(block2, in + 16); - OPENSSL_memcpy(key->rd_key + 4, block2, 16); - - for (size_t i = 2; i <= 14; i += 2) { - aes_word_t sub[AES_NOHW_BLOCK_WORDS]; - aes_nohw_sub_block(sub, block2); - uint8_t rcon = aes_nohw_rcon[i / 2 - 1]; - for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) { - // Incorporate |rcon| and the transformed word into the first word. - block1[j] = aes_nohw_xor(block1[j], aes_nohw_rcon_slice(rcon, j)); - block1[j] = aes_nohw_xor( - block1[j], - aes_nohw_shift_right(aes_nohw_rotate_rows_down(sub[j]), 12)); - // Propagate to the remaining words. - aes_word_t v = block1[j]; - block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 4)); - block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 8)); - block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 12)); - } - OPENSSL_memcpy(key->rd_key + 4 * i, block1, 16); - - if (i == 14) { - break; - } - - aes_nohw_sub_block(sub, block1); - for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) { - // Incorporate the transformed word into the first word. - block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_right(sub[j], 12)); - // Propagate to the remaining words. - aes_word_t v = block2[j]; - block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 4)); - block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 8)); - block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 12)); - } - OPENSSL_memcpy(key->rd_key + 4 * (i + 1), block2, 16); - } -} - - -// External API. 
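// [Editor's note] A hypothetical check, not part of this patch: the key-setup
// routines above emit the classic round-key layout of 4 32-bit words per round
// key and rounds + 1 round keys, which is also where the original
// rd_key[4 * (AES_MAXNR + 1)] sizing in include/ring-core/aes.h comes from.
fn rd_key_words(rounds: usize) -> usize {
    4 * (rounds + 1)
}

fn main() {
    assert_eq!(rd_key_words(10), 44); // filled in by aes_nohw_setup_key_128
    assert_eq!(rd_key_words(14), 60); // filled in by aes_nohw_setup_key_256
}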
- -int aes_nohw_set_encrypt_key(const uint8_t *key, unsigned bits, - AES_KEY *aeskey) { - switch (bits) { - case 128: - aes_nohw_setup_key_128(aeskey, key); - return 0; - case 256: - aes_nohw_setup_key_256(aeskey, key); - return 0; - } - return 1; -} - -void aes_nohw_encrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key) { - AES_NOHW_SCHEDULE sched; - aes_nohw_expand_round_keys(&sched, key); - AES_NOHW_BATCH batch; - aes_nohw_to_batch(&batch, in, /*num_blocks=*/1); - aes_nohw_encrypt_batch(&sched, key->rounds, &batch); - aes_nohw_from_batch(out, /*num_blocks=*/1, &batch); -} - -static inline void aes_nohw_xor_block(uint8_t out[16], const uint8_t a[16], - const uint8_t b[16]) { - for (size_t i = 0; i < 16; i += sizeof(aes_word_t)) { - aes_word_t x, y; - OPENSSL_memcpy(&x, a + i, sizeof(aes_word_t)); - OPENSSL_memcpy(&y, b + i, sizeof(aes_word_t)); - x = aes_nohw_xor(x, y); - OPENSSL_memcpy(out + i, &x, sizeof(aes_word_t)); - } -} - -void aes_nohw_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, - size_t blocks, const AES_KEY *key, - const uint8_t ivec[16]) { - if (blocks == 0) { - return; - } - - AES_NOHW_SCHEDULE sched; - aes_nohw_expand_round_keys(&sched, key); - - // Make |AES_NOHW_BATCH_SIZE| copies of |ivec|. - alignas(AES_NOHW_WORD_SIZE) uint8_t ivs[AES_NOHW_BATCH_SIZE * 16]; - alignas(AES_NOHW_WORD_SIZE) uint8_t enc_ivs[AES_NOHW_BATCH_SIZE * 16]; - for (size_t i = 0; i < AES_NOHW_BATCH_SIZE; i++) { - OPENSSL_memcpy(ivs + 16 * i, ivec, 16); - } - - uint32_t ctr = CRYPTO_load_u32_be(ivs + 12); - for (;;) { - // Update counters. - for (size_t i = 0; i < AES_NOHW_BATCH_SIZE; i++) { - CRYPTO_store_u32_be(ivs + 16 * i + 12, ctr + (uint32_t)i); - } - - size_t todo = blocks >= AES_NOHW_BATCH_SIZE ? AES_NOHW_BATCH_SIZE : blocks; - AES_NOHW_BATCH batch; - aes_nohw_to_batch(&batch, ivs, todo); - aes_nohw_encrypt_batch(&sched, key->rounds, &batch); - aes_nohw_from_batch(enc_ivs, todo, &batch); - - for (size_t i = 0; i < todo; i++) { - aes_nohw_xor_block(out + 16 * i, in + 16 * i, enc_ivs + 16 * i); - } - - blocks -= todo; - if (blocks == 0) { - break; - } - - in += 16 * AES_NOHW_BATCH_SIZE; - out += 16 * AES_NOHW_BATCH_SIZE; - ctr += AES_NOHW_BATCH_SIZE; - } -} diff --git a/include/ring-core/aes.h b/include/ring-core/aes.h index 5b5130dad7..94827f6500 100644 --- a/include/ring-core/aes.h +++ b/include/ring-core/aes.h @@ -60,7 +60,7 @@ // aes_key_st should be an opaque type, but EVP requires that the size be // known. 
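// [Editor's note] A hypothetical sketch, not part of this patch, of the
// counter-mode shape that the removed aes_nohw_ctr32_encrypt_blocks
// implemented: encrypt successive counter blocks (32-bit big-endian counter in
// the last four bytes of the IV) and XOR the keystream into the input.
// `encrypt_block` stands in for any single-block AES encryption.
fn ctr32_xor(
    encrypt_block: impl Fn(&[u8; 16]) -> [u8; 16],
    ivec: &[u8; 16],
    input: &[u8],
    out: &mut [u8],
) {
    assert_eq!(input.len() % 16, 0);
    assert_eq!(input.len(), out.len());
    let mut iv = *ivec;
    let mut ctr = u32::from_be_bytes([iv[12], iv[13], iv[14], iv[15]]);
    for (block_in, block_out) in input.chunks_exact(16).zip(out.chunks_exact_mut(16)) {
        iv[12..].copy_from_slice(&ctr.to_be_bytes());
        let keystream = encrypt_block(&iv);
        for (o, (i, k)) in block_out.iter_mut().zip(block_in.iter().zip(keystream.iter())) {
            *o = i ^ k;
        }
        ctr = ctr.wrapping_add(1);
    }
}

fn main() {
    // With an identity "cipher" the keystream is just the counter block itself.
    let mut out = [0u8; 32];
    ctr32_xor(|b| *b, &[0u8; 16], &[0u8; 32], &mut out);
    assert_eq!(&out[28..32], &[0, 0, 0, 1]);
}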
struct aes_key_st { - uint32_t rd_key[4 * (AES_MAXNR + 1)]; + uint32_t rd_key[240]; unsigned rounds; }; typedef struct aes_key_st AES_KEY; diff --git a/src/aead/aes.rs b/src/aead/aes.rs index 4076df1b25..9834b71577 100644 --- a/src/aead/aes.rs +++ b/src/aead/aes.rs @@ -19,6 +19,7 @@ use crate::{ endian::BigEndian, error, polyfill::{self, ArrayFlatten as _, ArraySplitMap as _}, + rust_crypto::aes, }; use core::ops::RangeFrom; @@ -120,6 +121,25 @@ fn ctr32_encrypt_blocks_( ctr.increment_by_less_safe(blocks_u32); } +fn u32_array_from_u64_array(from: &[u64], to: &mut [u32]) { + for (i, byte) in from.iter().enumerate() { + let idx = 2 * i; + if idx + 1 >= to.len() { + break; + } + let lhsu32 = (byte >> 32) as u32; + let rhsu32 = (byte & 0xffffffff) as u32; + to[idx] = lhsu32; + to[idx + 1] = rhsu32; + } +} +fn u64_array_from_u32_array(from: &[u32], to: &mut [u64]) { + for (i, _) in from.iter().enumerate().step_by(2) { + if i / 2 >= to.len() { break; } + to[i / 2] = (from[i] as u64) << 32 | from[i + 1] as u64; + } +} + impl Key { #[inline] pub fn new( @@ -136,7 +156,7 @@ impl Key { } let mut key = AES_KEY { - rd_key: [0u32; 4 * (MAX_ROUNDS + 1)], + rd_key: [0u32; 240], rounds: 0, }; @@ -162,7 +182,19 @@ impl Key { } Implementation::NOHW => { - set_encrypt_key!(aes_nohw_set_encrypt_key, bytes, key_bits, &mut key)? + match key_bits.as_bits() { + 128 => { + let sched = aes::fixslice::aes128_key_schedule(bytes.try_into()?); + u32_array_from_u64_array(&sched, &mut key.rd_key); + key.rounds = 10; + } + 256 => { + let sched = aes::fixslice::aes256_key_schedule(bytes.try_into()?); + u32_array_from_u64_array(&sched, &mut key.rd_key); + key.rounds = 14; + } + _ => unreachable!(), + }; } }; @@ -188,7 +220,21 @@ impl Key { ))] Implementation::VPAES_BSAES => encrypt_block!(vpaes_encrypt, a, self), - Implementation::NOHW => encrypt_block!(aes_nohw_encrypt, a, self), + Implementation::NOHW => match self.inner.rounds { + 10 => { + let mut enc_key: [u64; 88] = [0; 88]; + u64_array_from_u32_array(&self.inner.rd_key, &mut enc_key); + let blocks: [Block; 4] = [a, [0; 16], [0; 16], [0; 16]]; + aes::fixslice::aes128_encrypt(&enc_key, &blocks)[0] + } + 14 => { + let mut enc_key: [u64; 120] = [0; 120]; + u64_array_from_u32_array(&self.inner.rd_key, &mut enc_key); + let blocks: [Block; 4] = [a, [0; 16], [0; 16], [0; 16]]; + aes::fixslice::aes256_encrypt(&enc_key, &blocks)[0] + } + _ => unimplemented!() + }, } } @@ -267,7 +313,76 @@ impl Key { } Implementation::NOHW => { - ctr32_encrypt_blocks!(aes_nohw_ctr32_encrypt_blocks, in_out, src, &self.inner, ctr) + let in_out_len = in_out[src.clone()].len(); + assert_eq!(in_out_len % BLOCK_LEN, 0); + + let blocks = in_out_len / BLOCK_LEN; + #[allow(clippy::cast_possible_truncation)] + let blocks_u32 = blocks as u32; + assert_eq!(blocks, polyfill::usize_from_u32(blocks_u32)); + const MAX_BATCH_SIZE: usize = 4; + fn encrypt( + encryptor_fn: F, + mut blocks: usize, + key: &Key, + ctr: &mut Counter, + in_out: &mut [u8], + src: RangeFrom, + ) where + F: Fn(&[u64; N], &[Block; MAX_BATCH_SIZE]) -> [[u8; BLOCK_LEN]; MAX_BATCH_SIZE], + { + let mut offset = src.clone(); + let mut out_range = RangeFrom { start: 0 }; + loop { + let todo = if blocks > MAX_BATCH_SIZE { + MAX_BATCH_SIZE + } else { + blocks + }; + let mut enc_key: [u64; N] = [0; N]; + u64_array_from_u32_array(&key.inner.rd_key, &mut enc_key); + let mut b: [Block; MAX_BATCH_SIZE] = [[0; BLOCK_LEN]; MAX_BATCH_SIZE]; + for i in 0..todo { + b[i] = ctr.increment().into_block_less_safe() + } + let enc_ivs = encryptor_fn(&enc_key, 
&b); + for i in 0..todo { + for j in 0..BLOCK_LEN { + in_out[out_range.clone()][j] = + in_out[offset.clone()][j] ^ enc_ivs[i][j]; + } + offset.start = offset.start.checked_add(BLOCK_LEN).unwrap(); + out_range.start = out_range.start.checked_add(BLOCK_LEN).unwrap(); + } + blocks -= todo; + if blocks <= 0 { + break; + } + } + } + match self.inner.rounds { + 10 => { + encrypt( + aes::fixslice::aes128_encrypt, + blocks, + self, + ctr, + in_out, + src, + ); + } + 14 => { + encrypt( + aes::fixslice::aes256_encrypt, + blocks, + self, + ctr, + in_out, + src, + ); + } + _ => unreachable!(), + }; } } } @@ -294,7 +409,7 @@ impl Key { #[repr(C)] #[derive(Clone)] pub(super) struct AES_KEY { - pub rd_key: [u32; 4 * (MAX_ROUNDS + 1)], + pub rd_key: [u32; 240], pub rounds: c::uint, } diff --git a/src/lib.rs b/src/lib.rs index 64a68e4e67..f5bcd4a772 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -122,6 +122,8 @@ pub mod rand; #[cfg(feature = "alloc")] pub mod rsa; +pub mod rust_crypto; + pub mod signature; #[cfg(test)] diff --git a/src/rust_crypto.rs b/src/rust_crypto.rs new file mode 100644 index 0000000000..5657404aa8 --- /dev/null +++ b/src/rust_crypto.rs @@ -0,0 +1 @@ +pub mod aes; diff --git a/src/rust_crypto/aes/mod.rs b/src/rust_crypto/aes/mod.rs new file mode 100644 index 0000000000..db1bce79be --- /dev/null +++ b/src/rust_crypto/aes/mod.rs @@ -0,0 +1,3 @@ +#[cfg_attr(not(target_pointer_width = "64"), path = "soft/fixslice32.rs")] +#[cfg_attr(target_pointer_width = "64", path = "soft/fixslice64.rs")] +pub(crate) mod fixslice; diff --git a/src/rust_crypto/aes/soft/block.rs b/src/rust_crypto/aes/soft/block.rs new file mode 100644 index 0000000000..7e088d9a89 --- /dev/null +++ b/src/rust_crypto/aes/soft/block.rs @@ -0,0 +1 @@ +pub type Block = [u8; 16]; diff --git a/src/rust_crypto/aes/soft/fixslice32.rs b/src/rust_crypto/aes/soft/fixslice32.rs new file mode 100644 index 0000000000..80453a7c25 --- /dev/null +++ b/src/rust_crypto/aes/soft/fixslice32.rs @@ -0,0 +1,1371 @@ +//! Fixsliced implementations of AES-128, AES-192 and AES-256 (32-bit) +//! adapted from the C implementation +//! +//! All implementations are fully bitsliced and do not rely on any +//! Look-Up Table (LUT). +//! +//! See the paper at for more details. +//! +//! # Author (original C code) +//! +//! Alexandre Adomnicai, Nanyang Technological University, Singapore +//! +//! +//! Originally licensed MIT. Relicensed as Apache 2.0+MIT with permission. + +#![allow(clippy::unreadable_literal)] + +use crate::Block; +use cipher::{array::Array, consts::U2}; + +/// AES block batch size for this implementation +pub(crate) type FixsliceBlocks = U2; + +pub(crate) type BatchBlocks = Array; + +/// AES-128 round keys +pub(crate) type FixsliceKeys128 = [u32; 88]; + +/// AES-192 round keys +pub(crate) type FixsliceKeys192 = [u32; 104]; + +/// AES-256 round keys +pub(crate) type FixsliceKeys256 = [u32; 120]; + +/// 256-bit internal state +pub(crate) type State = [u32; 8]; + +/// Fully bitsliced AES-128 key schedule to match the fully-fixsliced representation. 
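// [Editor's note] A hypothetical check, not part of this patch: the sizes of
// the FixsliceKeys* aliases above follow from each round key being bitsliced
// across 8 words, with rounds + 1 round keys per schedule.
fn fixslice_key_words(rounds: usize) -> usize {
    8 * (rounds + 1)
}

fn main() {
    assert_eq!(fixslice_key_words(10), 88); // FixsliceKeys128
    assert_eq!(fixslice_key_words(12), 104); // FixsliceKeys192
    assert_eq!(fixslice_key_words(14), 120); // FixsliceKeys256
}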
+pub(crate) fn aes128_key_schedule(key: &[u8; 16]) -> FixsliceKeys128 { + let mut rkeys = [0u32; 88]; + + bitslice(&mut rkeys[..8], key, key); + + let mut rk_off = 0; + for rcon in 0..10 { + memshift32(&mut rkeys, rk_off); + rk_off += 8; + + sub_bytes(&mut rkeys[rk_off..(rk_off + 8)]); + sub_bytes_nots(&mut rkeys[rk_off..(rk_off + 8)]); + + if rcon < 8 { + add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon); + } else { + add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon - 8); + add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon - 7); + add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon - 5); + add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon - 4); + } + + xor_columns(&mut rkeys, rk_off, 8, ror_distance(1, 3)); + } + + // Adjust to match fixslicing format + #[cfg(aes_compact)] + { + for i in (8..88).step_by(16) { + inv_shift_rows_1(&mut rkeys[i..(i + 8)]); + } + } + #[cfg(not(aes_compact))] + { + for i in (8..72).step_by(32) { + inv_shift_rows_1(&mut rkeys[i..(i + 8)]); + inv_shift_rows_2(&mut rkeys[(i + 8)..(i + 16)]); + inv_shift_rows_3(&mut rkeys[(i + 16)..(i + 24)]); + } + inv_shift_rows_1(&mut rkeys[72..80]); + } + + // Account for NOTs removed from sub_bytes + for i in 1..11 { + sub_bytes_nots(&mut rkeys[(i * 8)..(i * 8 + 8)]); + } + + rkeys +} + +/// Fully bitsliced AES-192 key schedule to match the fully-fixsliced representation. +#[allow(dead_code)] +pub(crate) fn aes192_key_schedule(key: &[u8; 24]) -> FixsliceKeys192 { + let mut rkeys = [0u32; 104]; + let mut tmp = [0u32; 8]; + + bitslice(&mut rkeys[..8], &key[..16], &key[..16]); + bitslice(&mut tmp, &key[8..], &key[8..]); + + let mut rcon = 0; + let mut rk_off = 8; + + loop { + for i in 0..8 { + rkeys[rk_off + i] = + (0x0f0f0f0f & (tmp[i] >> 4)) | (0xf0f0f0f0 & (rkeys[(rk_off - 8) + i] << 4)); + } + + sub_bytes(&mut tmp); + sub_bytes_nots(&mut tmp); + + add_round_constant_bit(&mut tmp, rcon); + rcon += 1; + + for i in 0..8 { + let mut ti = rkeys[rk_off + i]; + ti ^= 0x30303030 & ror(tmp[i], ror_distance(1, 1)); + ti ^= 0xc0c0c0c0 & (ti << 2); + tmp[i] = ti; + } + rkeys[rk_off..(rk_off + 8)].copy_from_slice(&tmp); + rk_off += 8; + + for i in 0..8 { + let ui = tmp[i]; + let mut ti = (0x0f0f0f0f & (rkeys[(rk_off - 16) + i] >> 4)) | (0xf0f0f0f0 & (ui << 4)); + ti ^= 0x03030303 & (ui >> 6); + tmp[i] = + ti ^ (0xfcfcfcfc & (ti << 2)) ^ (0xf0f0f0f0 & (ti << 4)) ^ (0xc0c0c0c0 & (ti << 6)); + } + rkeys[rk_off..(rk_off + 8)].copy_from_slice(&tmp); + rk_off += 8; + + sub_bytes(&mut tmp); + sub_bytes_nots(&mut tmp); + + add_round_constant_bit(&mut tmp, rcon); + rcon += 1; + + for i in 0..8 { + let mut ti = (0x0f0f0f0f & (rkeys[(rk_off - 16) + i] >> 4)) + | (0xf0f0f0f0 & (rkeys[(rk_off - 8) + i] << 4)); + ti ^= 0x03030303 & ror(tmp[i], ror_distance(1, 3)); + rkeys[rk_off + i] = + ti ^ (0xfcfcfcfc & (ti << 2)) ^ (0xf0f0f0f0 & (ti << 4)) ^ (0xc0c0c0c0 & (ti << 6)); + } + rk_off += 8; + + if rcon >= 8 { + break; + } + + for i in 0..8 { + let ui = rkeys[(rk_off - 8) + i]; + let mut ti = rkeys[(rk_off - 16) + i]; + ti ^= 0x30303030 & (ui >> 2); + ti ^= 0xc0c0c0c0 & (ti << 2); + tmp[i] = ti; + } + } + + // Adjust to match fixslicing format + #[cfg(aes_compact)] + { + for i in (8..104).step_by(16) { + inv_shift_rows_1(&mut rkeys[i..(i + 8)]); + } + } + #[cfg(not(aes_compact))] + { + for i in (0..96).step_by(32) { + inv_shift_rows_1(&mut rkeys[(i + 8)..(i + 16)]); + inv_shift_rows_2(&mut rkeys[(i + 16)..(i + 24)]); + inv_shift_rows_3(&mut rkeys[(i + 24)..(i + 32)]); + } + } + + // Account for 
NOTs removed from sub_bytes + for i in 1..13 { + sub_bytes_nots(&mut rkeys[(i * 8)..(i * 8 + 8)]); + } + + rkeys +} + +/// Fully bitsliced AES-256 key schedule to match the fully-fixsliced representation. +pub(crate) fn aes256_key_schedule(key: &[u8; 32]) -> FixsliceKeys256 { + let mut rkeys = [0u32; 120]; + + bitslice(&mut rkeys[..8], &key[..16], &key[..16]); + bitslice(&mut rkeys[8..16], &key[16..], &key[16..]); + + let mut rk_off = 8; + + let mut rcon = 0; + loop { + memshift32(&mut rkeys, rk_off); + rk_off += 8; + + sub_bytes(&mut rkeys[rk_off..(rk_off + 8)]); + sub_bytes_nots(&mut rkeys[rk_off..(rk_off + 8)]); + + add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon); + xor_columns(&mut rkeys, rk_off, 16, ror_distance(1, 3)); + rcon += 1; + + if rcon == 7 { + break; + } + + memshift32(&mut rkeys, rk_off); + rk_off += 8; + + sub_bytes(&mut rkeys[rk_off..(rk_off + 8)]); + sub_bytes_nots(&mut rkeys[rk_off..(rk_off + 8)]); + + xor_columns(&mut rkeys, rk_off, 16, ror_distance(0, 3)); + } + + // Adjust to match fixslicing format + #[cfg(aes_compact)] + { + for i in (8..120).step_by(16) { + inv_shift_rows_1(&mut rkeys[i..(i + 8)]); + } + } + #[cfg(not(aes_compact))] + { + for i in (8..104).step_by(32) { + inv_shift_rows_1(&mut rkeys[i..(i + 8)]); + inv_shift_rows_2(&mut rkeys[(i + 8)..(i + 16)]); + inv_shift_rows_3(&mut rkeys[(i + 16)..(i + 24)]); + } + inv_shift_rows_1(&mut rkeys[104..112]); + } + + // Account for NOTs removed from sub_bytes + for i in 1..15 { + sub_bytes_nots(&mut rkeys[(i * 8)..(i * 8 + 8)]); + } + + rkeys +} + +/// Fully-fixsliced AES-128 decryption (the InvShiftRows is completely omitted). +/// +/// Decrypts four blocks in-place and in parallel. +#[allow(dead_code)] +pub(crate) fn aes128_decrypt(rkeys: &FixsliceKeys128, blocks: &BatchBlocks) -> BatchBlocks { + let mut state = State::default(); + + bitslice(&mut state, &blocks[0], &blocks[1]); + + add_round_key(&mut state, &rkeys[80..]); + inv_sub_bytes(&mut state); + + #[cfg(not(aes_compact))] + { + inv_shift_rows_2(&mut state); + } + + let mut rk_off = 72; + loop { + #[cfg(aes_compact)] + { + inv_shift_rows_2(&mut state); + } + + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_1(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + + if rk_off == 0 { + break; + } + + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_0(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + + #[cfg(not(aes_compact))] + { + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_3(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_2(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + } + } + + add_round_key(&mut state, &rkeys[..8]); + + inv_bitslice(&state) +} + +/// Fully-fixsliced AES-128 encryption (the ShiftRows is completely omitted). +/// +/// Encrypts four blocks in-place and in parallel. 
+pub(crate) fn aes128_encrypt(rkeys: &FixsliceKeys128, blocks: &BatchBlocks) -> BatchBlocks { + let mut state = State::default(); + + bitslice(&mut state, &blocks[0], &blocks[1]); + + add_round_key(&mut state, &rkeys[..8]); + + let mut rk_off = 8; + loop { + sub_bytes(&mut state); + mix_columns_1(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + + #[cfg(aes_compact)] + { + shift_rows_2(&mut state); + } + + if rk_off == 80 { + break; + } + + #[cfg(not(aes_compact))] + { + sub_bytes(&mut state); + mix_columns_2(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + + sub_bytes(&mut state); + mix_columns_3(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + } + + sub_bytes(&mut state); + mix_columns_0(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + } + + #[cfg(not(aes_compact))] + { + shift_rows_2(&mut state); + } + + sub_bytes(&mut state); + add_round_key(&mut state, &rkeys[80..]); + + inv_bitslice(&state) +} + +/// Fully-fixsliced AES-192 decryption (the InvShiftRows is completely omitted). +/// +/// Decrypts four blocks in-place and in parallel. +#[allow(dead_code)] +pub(crate) fn aes192_decrypt(rkeys: &FixsliceKeys192, blocks: &BatchBlocks) -> BatchBlocks { + let mut state = State::default(); + + bitslice(&mut state, &blocks[0], &blocks[1]); + + add_round_key(&mut state, &rkeys[96..]); + inv_sub_bytes(&mut state); + + let mut rk_off = 88; + loop { + #[cfg(aes_compact)] + { + inv_shift_rows_2(&mut state); + } + #[cfg(not(aes_compact))] + { + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_3(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_2(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + } + + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_1(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + + if rk_off == 0 { + break; + } + + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_0(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + } + + add_round_key(&mut state, &rkeys[..8]); + + inv_bitslice(&state) +} + +/// Fully-fixsliced AES-192 encryption (the ShiftRows is completely omitted). +/// +/// Encrypts four blocks in-place and in parallel. +#[allow(dead_code)] +pub(crate) fn aes192_encrypt(rkeys: &FixsliceKeys192, blocks: &BatchBlocks) -> BatchBlocks { + let mut state = State::default(); + + bitslice(&mut state, &blocks[0], &blocks[1]); + + add_round_key(&mut state, &rkeys[..8]); + + let mut rk_off = 8; + loop { + sub_bytes(&mut state); + mix_columns_1(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + + #[cfg(aes_compact)] + { + shift_rows_2(&mut state); + } + #[cfg(not(aes_compact))] + { + sub_bytes(&mut state); + mix_columns_2(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + + sub_bytes(&mut state); + mix_columns_3(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + } + + if rk_off == 96 { + break; + } + + sub_bytes(&mut state); + mix_columns_0(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + } + + sub_bytes(&mut state); + add_round_key(&mut state, &rkeys[96..]); + + inv_bitslice(&state) +} + +/// Fully-fixsliced AES-256 decryption (the InvShiftRows is completely omitted). 
+/// +/// Decrypts four blocks in-place and in parallel. +pub(crate) fn aes256_decrypt(rkeys: &FixsliceKeys256, blocks: &BatchBlocks) -> BatchBlocks { + let mut state = State::default(); + + bitslice(&mut state, &blocks[0], &blocks[1]); + + add_round_key(&mut state, &rkeys[112..]); + inv_sub_bytes(&mut state); + + #[cfg(not(aes_compact))] + { + inv_shift_rows_2(&mut state); + } + + let mut rk_off = 104; + loop { + #[cfg(aes_compact)] + { + inv_shift_rows_2(&mut state); + } + + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_1(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + + if rk_off == 0 { + break; + } + + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_0(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + + #[cfg(not(aes_compact))] + { + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_3(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_2(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + } + } + + add_round_key(&mut state, &rkeys[..8]); + + inv_bitslice(&state) +} + +/// Fully-fixsliced AES-256 encryption (the ShiftRows is completely omitted). +/// +/// Encrypts four blocks in-place and in parallel. +pub(crate) fn aes256_encrypt(rkeys: &FixsliceKeys256, blocks: &BatchBlocks) -> BatchBlocks { + let mut state = State::default(); + + bitslice(&mut state, &blocks[0], &blocks[1]); + + add_round_key(&mut state, &rkeys[..8]); + + let mut rk_off = 8; + loop { + sub_bytes(&mut state); + mix_columns_1(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + + #[cfg(aes_compact)] + { + shift_rows_2(&mut state); + } + + if rk_off == 112 { + break; + } + + #[cfg(not(aes_compact))] + { + sub_bytes(&mut state); + mix_columns_2(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + + sub_bytes(&mut state); + mix_columns_3(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + } + + sub_bytes(&mut state); + mix_columns_0(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + } + + #[cfg(not(aes_compact))] + { + shift_rows_2(&mut state); + } + + sub_bytes(&mut state); + add_round_key(&mut state, &rkeys[112..]); + + inv_bitslice(&state) +} + +/// Note that the 4 bitwise NOT (^= 0xffffffff) are accounted for here so that it is a true +/// inverse of 'sub_bytes'. 
+fn inv_sub_bytes(state: &mut [u32]) { + debug_assert_eq!(state.len(), 8); + + // Scheduled using https://github.com/Ko-/aes-armcortexm/tree/public/scheduler + // Inline "stack" comments reflect suggested stores and loads (ARM Cortex-M3 and M4) + + let u7 = state[0]; + let u6 = state[1]; + let u5 = state[2]; + let u4 = state[3]; + let u3 = state[4]; + let u2 = state[5]; + let u1 = state[6]; + let u0 = state[7]; + + let t23 = u0 ^ u3; + let t8 = u1 ^ t23; + let m2 = t23 & t8; + let t4 = u4 ^ t8; + let t22 = u1 ^ u3; + let t2 = u0 ^ u1; + let t1 = u3 ^ u4; + // t23 -> stack + let t9 = u7 ^ t1; + // t8 -> stack + let m7 = t22 & t9; + // t9 -> stack + let t24 = u4 ^ u7; + // m7 -> stack + let t10 = t2 ^ t24; + // u4 -> stack + let m14 = t2 & t10; + let r5 = u6 ^ u7; + // m2 -> stack + let t3 = t1 ^ r5; + // t2 -> stack + let t13 = t2 ^ r5; + let t19 = t22 ^ r5; + // t3 -> stack + let t17 = u2 ^ t19; + // t4 -> stack + let t25 = u2 ^ t1; + let r13 = u1 ^ u6; + // t25 -> stack + let t20 = t24 ^ r13; + // t17 -> stack + let m9 = t20 & t17; + // t20 -> stack + let r17 = u2 ^ u5; + // t22 -> stack + let t6 = t22 ^ r17; + // t13 -> stack + let m1 = t13 & t6; + let y5 = u0 ^ r17; + let m4 = t19 & y5; + let m5 = m4 ^ m1; + let m17 = m5 ^ t24; + let r18 = u5 ^ u6; + let t27 = t1 ^ r18; + let t15 = t10 ^ t27; + // t6 -> stack + let m11 = t1 & t15; + let m15 = m14 ^ m11; + let m21 = m17 ^ m15; + // t1 -> stack + // t4 <- stack + let m12 = t4 & t27; + let m13 = m12 ^ m11; + let t14 = t10 ^ r18; + let m3 = t14 ^ m1; + // m2 <- stack + let m16 = m3 ^ m2; + let m20 = m16 ^ m13; + // u4 <- stack + let r19 = u2 ^ u4; + let t16 = r13 ^ r19; + // t3 <- stack + let t26 = t3 ^ t16; + let m6 = t3 & t16; + let m8 = t26 ^ m6; + // t10 -> stack + // m7 <- stack + let m18 = m8 ^ m7; + let m22 = m18 ^ m13; + let m25 = m22 & m20; + let m26 = m21 ^ m25; + let m10 = m9 ^ m6; + let m19 = m10 ^ m15; + // t25 <- stack + let m23 = m19 ^ t25; + let m28 = m23 ^ m25; + let m24 = m22 ^ m23; + let m30 = m26 & m24; + let m39 = m23 ^ m30; + let m48 = m39 & y5; + let m57 = m39 & t19; + // m48 -> stack + let m36 = m24 ^ m25; + let m31 = m20 & m23; + let m27 = m20 ^ m21; + let m32 = m27 & m31; + let m29 = m28 & m27; + let m37 = m21 ^ m29; + // m39 -> stack + let m42 = m37 ^ m39; + let m52 = m42 & t15; + // t27 -> stack + // t1 <- stack + let m61 = m42 & t1; + let p0 = m52 ^ m61; + let p16 = m57 ^ m61; + // m57 -> stack + // t20 <- stack + let m60 = m37 & t20; + // p16 -> stack + // t17 <- stack + let m51 = m37 & t17; + let m33 = m27 ^ m25; + let m38 = m32 ^ m33; + let m43 = m37 ^ m38; + let m49 = m43 & t16; + let p6 = m49 ^ m60; + let p13 = m49 ^ m51; + let m58 = m43 & t3; + // t9 <- stack + let m50 = m38 & t9; + // t22 <- stack + let m59 = m38 & t22; + // p6 -> stack + let p1 = m58 ^ m59; + let p7 = p0 ^ p1; + let m34 = m21 & m22; + let m35 = m24 & m34; + let m40 = m35 ^ m36; + let m41 = m38 ^ m40; + let m45 = m42 ^ m41; + // t27 <- stack + let m53 = m45 & t27; + let p8 = m50 ^ m53; + let p23 = p7 ^ p8; + // t4 <- stack + let m62 = m45 & t4; + let p14 = m49 ^ m62; + let s6 = p14 ^ p23; + // t10 <- stack + let m54 = m41 & t10; + let p2 = m54 ^ m62; + let p22 = p2 ^ p7; + let s0 = p13 ^ p22; + let p17 = m58 ^ p2; + let p15 = m54 ^ m59; + // t2 <- stack + let m63 = m41 & t2; + // m39 <- stack + let m44 = m39 ^ m40; + // p17 -> stack + // t6 <- stack + let m46 = m44 & t6; + let p5 = m46 ^ m51; + // p23 -> stack + let p18 = m63 ^ p5; + let p24 = p5 ^ p7; + // m48 <- stack + let p12 = m46 ^ m48; + let s3 = p12 ^ p22; + // t13 <- stack + let 
m55 = m44 & t13; + let p9 = m55 ^ m63; + // p16 <- stack + let s7 = p9 ^ p16; + // t8 <- stack + let m47 = m40 & t8; + let p3 = m47 ^ m50; + let p19 = p2 ^ p3; + let s5 = p19 ^ p24; + let p11 = p0 ^ p3; + let p26 = p9 ^ p11; + // t23 <- stack + let m56 = m40 & t23; + let p4 = m48 ^ m56; + // p6 <- stack + let p20 = p4 ^ p6; + let p29 = p15 ^ p20; + let s1 = p26 ^ p29; + // m57 <- stack + let p10 = m57 ^ p4; + let p27 = p10 ^ p18; + // p23 <- stack + let s4 = p23 ^ p27; + let p25 = p6 ^ p10; + let p28 = p11 ^ p25; + // p17 <- stack + let s2 = p17 ^ p28; + + state[0] = s7; + state[1] = s6; + state[2] = s5; + state[3] = s4; + state[4] = s3; + state[5] = s2; + state[6] = s1; + state[7] = s0; +} + +/// Bitsliced implementation of the AES Sbox based on Boyar, Peralta and Calik. +/// +/// See: +/// +/// Note that the 4 bitwise NOT (^= 0xffffffff) are moved to the key schedule. +fn sub_bytes(state: &mut [u32]) { + debug_assert_eq!(state.len(), 8); + + // Scheduled using https://github.com/Ko-/aes-armcortexm/tree/public/scheduler + // Inline "stack" comments reflect suggested stores and loads (ARM Cortex-M3 and M4) + + let u7 = state[0]; + let u6 = state[1]; + let u5 = state[2]; + let u4 = state[3]; + let u3 = state[4]; + let u2 = state[5]; + let u1 = state[6]; + let u0 = state[7]; + + let y14 = u3 ^ u5; + let y13 = u0 ^ u6; + let y12 = y13 ^ y14; + let t1 = u4 ^ y12; + let y15 = t1 ^ u5; + let t2 = y12 & y15; + let y6 = y15 ^ u7; + let y20 = t1 ^ u1; + // y12 -> stack + let y9 = u0 ^ u3; + // y20 -> stack + let y11 = y20 ^ y9; + // y9 -> stack + let t12 = y9 & y11; + // y6 -> stack + let y7 = u7 ^ y11; + let y8 = u0 ^ u5; + let t0 = u1 ^ u2; + let y10 = y15 ^ t0; + // y15 -> stack + let y17 = y10 ^ y11; + // y14 -> stack + let t13 = y14 & y17; + let t14 = t13 ^ t12; + // y17 -> stack + let y19 = y10 ^ y8; + // y10 -> stack + let t15 = y8 & y10; + let t16 = t15 ^ t12; + let y16 = t0 ^ y11; + // y11 -> stack + let y21 = y13 ^ y16; + // y13 -> stack + let t7 = y13 & y16; + // y16 -> stack + let y18 = u0 ^ y16; + let y1 = t0 ^ u7; + let y4 = y1 ^ u3; + // u7 -> stack + let t5 = y4 & u7; + let t6 = t5 ^ t2; + let t18 = t6 ^ t16; + let t22 = t18 ^ y19; + let y2 = y1 ^ u0; + let t10 = y2 & y7; + let t11 = t10 ^ t7; + let t20 = t11 ^ t16; + let t24 = t20 ^ y18; + let y5 = y1 ^ u6; + let t8 = y5 & y1; + let t9 = t8 ^ t7; + let t19 = t9 ^ t14; + let t23 = t19 ^ y21; + let y3 = y5 ^ y8; + // y6 <- stack + let t3 = y3 & y6; + let t4 = t3 ^ t2; + // y20 <- stack + let t17 = t4 ^ y20; + let t21 = t17 ^ t14; + let t26 = t21 & t23; + let t27 = t24 ^ t26; + let t31 = t22 ^ t26; + let t25 = t21 ^ t22; + // y4 -> stack + let t28 = t25 & t27; + let t29 = t28 ^ t22; + let z14 = t29 & y2; + let z5 = t29 & y7; + let t30 = t23 ^ t24; + let t32 = t31 & t30; + let t33 = t32 ^ t24; + let t35 = t27 ^ t33; + let t36 = t24 & t35; + let t38 = t27 ^ t36; + let t39 = t29 & t38; + let t40 = t25 ^ t39; + let t43 = t29 ^ t40; + // y16 <- stack + let z3 = t43 & y16; + let tc12 = z3 ^ z5; + // tc12 -> stack + // y13 <- stack + let z12 = t43 & y13; + let z13 = t40 & y5; + let z4 = t40 & y1; + let tc6 = z3 ^ z4; + let t34 = t23 ^ t33; + let t37 = t36 ^ t34; + let t41 = t40 ^ t37; + // y10 <- stack + let z8 = t41 & y10; + let z17 = t41 & y8; + let t44 = t33 ^ t37; + // y15 <- stack + let z0 = t44 & y15; + // z17 -> stack + // y12 <- stack + let z9 = t44 & y12; + let z10 = t37 & y3; + let z1 = t37 & y6; + let tc5 = z1 ^ z0; + let tc11 = tc6 ^ tc5; + // y4 <- stack + let z11 = t33 & y4; + let t42 = t29 ^ t33; + let t45 = t42 ^ t41; + // 
y17 <- stack + let z7 = t45 & y17; + let tc8 = z7 ^ tc6; + // y14 <- stack + let z16 = t45 & y14; + // y11 <- stack + let z6 = t42 & y11; + let tc16 = z6 ^ tc8; + // z14 -> stack + // y9 <- stack + let z15 = t42 & y9; + let tc20 = z15 ^ tc16; + let tc1 = z15 ^ z16; + let tc2 = z10 ^ tc1; + let tc21 = tc2 ^ z11; + let tc3 = z9 ^ tc2; + let s0 = tc3 ^ tc16; + let s3 = tc3 ^ tc11; + let s1 = s3 ^ tc16; + let tc13 = z13 ^ tc1; + // u7 <- stack + let z2 = t33 & u7; + let tc4 = z0 ^ z2; + let tc7 = z12 ^ tc4; + let tc9 = z8 ^ tc7; + let tc10 = tc8 ^ tc9; + // z14 <- stack + let tc17 = z14 ^ tc10; + let s5 = tc21 ^ tc17; + let tc26 = tc17 ^ tc20; + // z17 <- stack + let s2 = tc26 ^ z17; + // tc12 <- stack + let tc14 = tc4 ^ tc12; + let tc18 = tc13 ^ tc14; + let s6 = tc10 ^ tc18; + let s7 = z12 ^ tc18; + let s4 = tc14 ^ s3; + + state[0] = s7; + state[1] = s6; + state[2] = s5; + state[3] = s4; + state[4] = s3; + state[5] = s2; + state[6] = s1; + state[7] = s0; +} + +/// NOT operations that are omitted in S-box +#[inline] +fn sub_bytes_nots(state: &mut [u32]) { + debug_assert_eq!(state.len(), 8); + state[0] ^= 0xffffffff; + state[1] ^= 0xffffffff; + state[5] ^= 0xffffffff; + state[6] ^= 0xffffffff; +} + +/// Computation of the MixColumns transformation in the fixsliced representation, with different +/// rotations used according to the round number mod 4. +/// +/// Based on Käsper-Schwabe, similar to https://github.com/Ko-/aes-armcortexm. +macro_rules! define_mix_columns { + ( + $name:ident, + $name_inv:ident, + $first_rotate:path, + $second_rotate:path + ) => { + #[rustfmt::skip] + fn $name(state: &mut State) { + let (a0, a1, a2, a3, a4, a5, a6, a7) = ( + state[0], state[1], state[2], state[3], state[4], state[5], state[6], state[7] + ); + let (b0, b1, b2, b3, b4, b5, b6, b7) = ( + $first_rotate(a0), + $first_rotate(a1), + $first_rotate(a2), + $first_rotate(a3), + $first_rotate(a4), + $first_rotate(a5), + $first_rotate(a6), + $first_rotate(a7), + ); + let (c0, c1, c2, c3, c4, c5, c6, c7) = ( + a0 ^ b0, + a1 ^ b1, + a2 ^ b2, + a3 ^ b3, + a4 ^ b4, + a5 ^ b5, + a6 ^ b6, + a7 ^ b7, + ); + state[0] = b0 ^ c7 ^ $second_rotate(c0); + state[1] = b1 ^ c0 ^ c7 ^ $second_rotate(c1); + state[2] = b2 ^ c1 ^ $second_rotate(c2); + state[3] = b3 ^ c2 ^ c7 ^ $second_rotate(c3); + state[4] = b4 ^ c3 ^ c7 ^ $second_rotate(c4); + state[5] = b5 ^ c4 ^ $second_rotate(c5); + state[6] = b6 ^ c5 ^ $second_rotate(c6); + state[7] = b7 ^ c6 ^ $second_rotate(c7); + } + + #[rustfmt::skip] + fn $name_inv(state: &mut State) { + let (a0, a1, a2, a3, a4, a5, a6, a7) = ( + state[0], state[1], state[2], state[3], state[4], state[5], state[6], state[7] + ); + let (b0, b1, b2, b3, b4, b5, b6, b7) = ( + $first_rotate(a0), + $first_rotate(a1), + $first_rotate(a2), + $first_rotate(a3), + $first_rotate(a4), + $first_rotate(a5), + $first_rotate(a6), + $first_rotate(a7), + ); + let (c0, c1, c2, c3, c4, c5, c6, c7) = ( + a0 ^ b0, + a1 ^ b1, + a2 ^ b2, + a3 ^ b3, + a4 ^ b4, + a5 ^ b5, + a6 ^ b6, + a7 ^ b7, + ); + let (d0, d1, d2, d3, d4, d5, d6, d7) = ( + a0 ^ c7, + a1 ^ c0 ^ c7, + a2 ^ c1, + a3 ^ c2 ^ c7, + a4 ^ c3 ^ c7, + a5 ^ c4, + a6 ^ c5, + a7 ^ c6, + ); + let (e0, e1, e2, e3, e4, e5, e6, e7) = ( + c0 ^ d6, + c1 ^ d6 ^ d7, + c2 ^ d0 ^ d7, + c3 ^ d1 ^ d6, + c4 ^ d2 ^ d6 ^ d7, + c5 ^ d3 ^ d7, + c6 ^ d4, + c7 ^ d5, + ); + state[0] = d0 ^ e0 ^ $second_rotate(e0); + state[1] = d1 ^ e1 ^ $second_rotate(e1); + state[2] = d2 ^ e2 ^ $second_rotate(e2); + state[3] = d3 ^ e3 ^ $second_rotate(e3); + state[4] = d4 ^ e4 ^ $second_rotate(e4); + state[5] 
= d5 ^ e5 ^ $second_rotate(e5); + state[6] = d6 ^ e6 ^ $second_rotate(e6); + state[7] = d7 ^ e7 ^ $second_rotate(e7); + } + } +} + +define_mix_columns!( + mix_columns_0, + inv_mix_columns_0, + rotate_rows_1, + rotate_rows_2 +); + +define_mix_columns!( + mix_columns_1, + inv_mix_columns_1, + rotate_rows_and_columns_1_1, + rotate_rows_and_columns_2_2 +); + +#[cfg(not(aes_compact))] +define_mix_columns!( + mix_columns_2, + inv_mix_columns_2, + rotate_rows_and_columns_1_2, + rotate_rows_2 +); + +#[cfg(not(aes_compact))] +define_mix_columns!( + mix_columns_3, + inv_mix_columns_3, + rotate_rows_and_columns_1_3, + rotate_rows_and_columns_2_2 +); + +#[inline] +fn delta_swap_1(a: &mut u32, shift: u32, mask: u32) { + let t = (*a ^ ((*a) >> shift)) & mask; + *a ^= t ^ (t << shift); +} + +#[inline] +fn delta_swap_2(a: &mut u32, b: &mut u32, shift: u32, mask: u32) { + let t = (*a ^ ((*b) >> shift)) & mask; + *a ^= t; + *b ^= t << shift; +} + +/// Applies ShiftRows once on an AES state (or key). +#[cfg(any(not(aes_compact), feature = "hazmat"))] +#[inline] +fn shift_rows_1(state: &mut [u32]) { + debug_assert_eq!(state.len(), 8); + for x in state.iter_mut() { + delta_swap_1(x, 4, 0x0c0f0300); + delta_swap_1(x, 2, 0x33003300); + } +} + +/// Applies ShiftRows twice on an AES state (or key). +#[inline] +fn shift_rows_2(state: &mut [u32]) { + debug_assert_eq!(state.len(), 8); + for x in state.iter_mut() { + delta_swap_1(x, 4, 0x0f000f00); + } +} + +/// Applies ShiftRows three times on an AES state (or key). +#[inline] +fn shift_rows_3(state: &mut [u32]) { + debug_assert_eq!(state.len(), 8); + for x in state.iter_mut() { + delta_swap_1(x, 4, 0x030f0c00); + delta_swap_1(x, 2, 0x33003300); + } +} + +#[inline(always)] +fn inv_shift_rows_1(state: &mut [u32]) { + shift_rows_3(state); +} + +#[inline(always)] +fn inv_shift_rows_2(state: &mut [u32]) { + shift_rows_2(state); +} + +#[cfg(not(aes_compact))] +#[inline(always)] +fn inv_shift_rows_3(state: &mut [u32]) { + shift_rows_1(state); +} + +/// XOR the columns after the S-box during the key schedule round function. +/// +/// The `idx_xor` parameter refers to the index of the previous round key that is +/// involved in the XOR computation (should be 8 and 16 for AES-128 and AES-256, +/// respectively). +/// +/// The `idx_ror` parameter refers to the rotation value, which varies between the +/// different key schedules. +fn xor_columns(rkeys: &mut [u32], offset: usize, idx_xor: usize, idx_ror: u32) { + for i in 0..8 { + let off_i = offset + i; + let rk = rkeys[off_i - idx_xor] ^ (0x03030303 & ror(rkeys[off_i], idx_ror)); + rkeys[off_i] = + rk ^ (0xfcfcfcfc & (rk << 2)) ^ (0xf0f0f0f0 & (rk << 4)) ^ (0xc0c0c0c0 & (rk << 6)); + } +} + +/// Bitslice two 128-bit input blocks input0, input1 into a 256-bit internal state. +fn bitslice(output: &mut [u32], input0: &[u8], input1: &[u8]) { + debug_assert_eq!(output.len(), 8); + debug_assert_eq!(input0.len(), 16); + debug_assert_eq!(input1.len(), 16); + + // Bitslicing is a bit index manipulation. 256 bits of data means each bit is positioned at an + // 8-bit index. 
AES data is 2 blocks, each one a 4x4 column-major matrix of bytes, so the + // index is initially ([b]lock, [c]olumn, [r]ow, [p]osition): + // b0 c1 c0 r1 r0 p2 p1 p0 + // + // The desired bitsliced data groups first by bit position, then row, column, block: + // p2 p1 p0 r1 r0 c1 c0 b0 + + // Interleave the columns on input (note the order of input) + // b0 c1 c0 __ __ __ __ __ => c1 c0 b0 __ __ __ __ __ + let mut t0 = u32::from_le_bytes(input0[0x00..0x04].try_into().unwrap()); + let mut t2 = u32::from_le_bytes(input0[0x04..0x08].try_into().unwrap()); + let mut t4 = u32::from_le_bytes(input0[0x08..0x0c].try_into().unwrap()); + let mut t6 = u32::from_le_bytes(input0[0x0c..0x10].try_into().unwrap()); + let mut t1 = u32::from_le_bytes(input1[0x00..0x04].try_into().unwrap()); + let mut t3 = u32::from_le_bytes(input1[0x04..0x08].try_into().unwrap()); + let mut t5 = u32::from_le_bytes(input1[0x08..0x0c].try_into().unwrap()); + let mut t7 = u32::from_le_bytes(input1[0x0c..0x10].try_into().unwrap()); + + // Bit Index Swap 5 <-> 0: + // __ __ b0 __ __ __ __ p0 => __ __ p0 __ __ __ __ b0 + let m0 = 0x55555555; + delta_swap_2(&mut t1, &mut t0, 1, m0); + delta_swap_2(&mut t3, &mut t2, 1, m0); + delta_swap_2(&mut t5, &mut t4, 1, m0); + delta_swap_2(&mut t7, &mut t6, 1, m0); + + // Bit Index Swap 6 <-> 1: + // __ c0 __ __ __ __ p1 __ => __ p1 __ __ __ __ c0 __ + let m1 = 0x33333333; + delta_swap_2(&mut t2, &mut t0, 2, m1); + delta_swap_2(&mut t3, &mut t1, 2, m1); + delta_swap_2(&mut t6, &mut t4, 2, m1); + delta_swap_2(&mut t7, &mut t5, 2, m1); + + // Bit Index Swap 7 <-> 2: + // c1 __ __ __ __ p2 __ __ => p2 __ __ __ __ c1 __ __ + let m2 = 0x0f0f0f0f; + delta_swap_2(&mut t4, &mut t0, 4, m2); + delta_swap_2(&mut t5, &mut t1, 4, m2); + delta_swap_2(&mut t6, &mut t2, 4, m2); + delta_swap_2(&mut t7, &mut t3, 4, m2); + + // Final bitsliced bit index, as desired: + // p2 p1 p0 r1 r0 c1 c0 b0 + output[0] = t0; + output[1] = t1; + output[2] = t2; + output[3] = t3; + output[4] = t4; + output[5] = t5; + output[6] = t6; + output[7] = t7; +} + +/// Un-bitslice a 256-bit internal state into two 128-bit blocks of output. +fn inv_bitslice(input: &[u32]) -> BatchBlocks { + debug_assert_eq!(input.len(), 8); + + // Unbitslicing is a bit index manipulation. 256 bits of data means each bit is positioned at + // an 8-bit index. 
AES data is 2 blocks, each one a 4x4 column-major matrix of bytes, so the + // desired index for the output is ([b]lock, [c]olumn, [r]ow, [p]osition): + // b0 c1 c0 r1 r0 p2 p1 p0 + // + // The initially bitsliced data groups first by bit position, then row, column, block: + // p2 p1 p0 r1 r0 c1 c0 b0 + + let mut t0 = input[0]; + let mut t1 = input[1]; + let mut t2 = input[2]; + let mut t3 = input[3]; + let mut t4 = input[4]; + let mut t5 = input[5]; + let mut t6 = input[6]; + let mut t7 = input[7]; + + // TODO: these bit index swaps are identical to those in 'packing' + + // Bit Index Swap 5 <-> 0: + // __ __ p0 __ __ __ __ b0 => __ __ b0 __ __ __ __ p0 + let m0 = 0x55555555; + delta_swap_2(&mut t1, &mut t0, 1, m0); + delta_swap_2(&mut t3, &mut t2, 1, m0); + delta_swap_2(&mut t5, &mut t4, 1, m0); + delta_swap_2(&mut t7, &mut t6, 1, m0); + + // Bit Index Swap 6 <-> 1: + // __ p1 __ __ __ __ c0 __ => __ c0 __ __ __ __ p1 __ + let m1 = 0x33333333; + delta_swap_2(&mut t2, &mut t0, 2, m1); + delta_swap_2(&mut t3, &mut t1, 2, m1); + delta_swap_2(&mut t6, &mut t4, 2, m1); + delta_swap_2(&mut t7, &mut t5, 2, m1); + + // Bit Index Swap 7 <-> 2: + // p2 __ __ __ __ c1 __ __ => c1 __ __ __ __ p2 __ __ + let m2 = 0x0f0f0f0f; + delta_swap_2(&mut t4, &mut t0, 4, m2); + delta_swap_2(&mut t5, &mut t1, 4, m2); + delta_swap_2(&mut t6, &mut t2, 4, m2); + delta_swap_2(&mut t7, &mut t3, 4, m2); + + let mut output = BatchBlocks::default(); + // De-interleave the columns on output (note the order of output) + // c1 c0 b0 __ __ __ __ __ => b0 c1 c0 __ __ __ __ __ + output[0][0x00..0x04].copy_from_slice(&t0.to_le_bytes()); + output[0][0x04..0x08].copy_from_slice(&t2.to_le_bytes()); + output[0][0x08..0x0c].copy_from_slice(&t4.to_le_bytes()); + output[0][0x0c..0x10].copy_from_slice(&t6.to_le_bytes()); + output[1][0x00..0x04].copy_from_slice(&t1.to_le_bytes()); + output[1][0x04..0x08].copy_from_slice(&t3.to_le_bytes()); + output[1][0x08..0x0c].copy_from_slice(&t5.to_le_bytes()); + output[1][0x0c..0x10].copy_from_slice(&t7.to_le_bytes()); + + // Final AES bit index, as desired: + // b0 c1 c0 r1 r0 p2 p1 p0 + output +} + +/// Copy 32-bytes within the provided slice to an 8-byte offset +fn memshift32(buffer: &mut [u32], src_offset: usize) { + debug_assert_eq!(src_offset % 8, 0); + + let dst_offset = src_offset + 8; + debug_assert!(dst_offset + 8 <= buffer.len()); + + for i in (0..8).rev() { + buffer[dst_offset + i] = buffer[src_offset + i]; + } +} + +/// XOR the round key to the internal state. The round keys are expected to be +/// pre-computed and to be packed in the fixsliced representation. 
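+///
+/// Each round key is an 8-word slice of the expanded key, so one fixsliced
+/// round follows this pattern (minimal sketch of the encryption loop body):
+///
+/// ```ignore
+/// sub_bytes(&mut state);
+/// mix_columns_0(&mut state);
+/// add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
+/// rk_off += 8;
+/// ```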
+#[inline] +fn add_round_key(state: &mut State, rkey: &[u32]) { + debug_assert_eq!(rkey.len(), 8); + for (a, b) in state.iter_mut().zip(rkey) { + *a ^= b; + } +} + +#[inline(always)] +fn add_round_constant_bit(state: &mut [u32], bit: usize) { + state[bit] ^= 0x0000c000; +} + +#[inline(always)] +fn ror(x: u32, y: u32) -> u32 { + x.rotate_right(y) +} + +#[inline(always)] +fn ror_distance(rows: u32, cols: u32) -> u32 { + (rows << 3) + (cols << 1) +} + +#[inline(always)] +fn rotate_rows_1(x: u32) -> u32 { + ror(x, ror_distance(1, 0)) +} + +#[inline(always)] +fn rotate_rows_2(x: u32) -> u32 { + ror(x, ror_distance(2, 0)) +} + +#[inline(always)] +#[rustfmt::skip] +fn rotate_rows_and_columns_1_1(x: u32) -> u32 { + (ror(x, ror_distance(1, 1)) & 0x3f3f3f3f) | + (ror(x, ror_distance(0, 1)) & 0xc0c0c0c0) +} + +#[cfg(not(aes_compact))] +#[inline(always)] +#[rustfmt::skip] +fn rotate_rows_and_columns_1_2(x: u32) -> u32 { + (ror(x, ror_distance(1, 2)) & 0x0f0f0f0f) | + (ror(x, ror_distance(0, 2)) & 0xf0f0f0f0) +} + +#[cfg(not(aes_compact))] +#[inline(always)] +#[rustfmt::skip] +fn rotate_rows_and_columns_1_3(x: u32) -> u32 { + (ror(x, ror_distance(1, 3)) & 0x03030303) | + (ror(x, ror_distance(0, 3)) & 0xfcfcfcfc) +} + +#[inline(always)] +#[rustfmt::skip] +fn rotate_rows_and_columns_2_2(x: u32) -> u32 { + (ror(x, ror_distance(2, 2)) & 0x0f0f0f0f) | + (ror(x, ror_distance(1, 2)) & 0xf0f0f0f0) +} diff --git a/src/rust_crypto/aes/soft/fixslice64.rs b/src/rust_crypto/aes/soft/fixslice64.rs new file mode 100644 index 0000000000..23556e3c58 --- /dev/null +++ b/src/rust_crypto/aes/soft/fixslice64.rs @@ -0,0 +1,1536 @@ +//! Fixsliced implementations of AES-128, AES-192 and AES-256 (64-bit) +//! adapted from the C implementation. +//! +//! All implementations are fully bitsliced and do not rely on any +//! Look-Up Table (LUT). +//! +//! See the paper at for more details. +//! +//! # Author (original C code) +//! +//! Alexandre Adomnicai, Nanyang Technological University, Singapore +//! +//! +//! Originally licensed MIT. Relicensed as Apache 2.0+MIT with permission. + +#![allow(clippy::unreadable_literal)] + +mod block; +use block::Block; + +type BatchBlocks = [Block; 4]; + +/// AES-128 round keys +type FixsliceKeys128 = [u64; 88]; + +/// AES-192 round keys +type FixsliceKeys192 = [u64; 104]; + +/// AES-256 round keys +type FixsliceKeys256 = [u64; 120]; + +/// 512-bit internal state +pub(crate) type State = [u64; 8]; + +/// Fully bitsliced AES-128 key schedule to match the fully-fixsliced representation. 
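+///
+/// Produces the 11 round keys of AES-128 packed as 11 * 8 = 88 bitsliced words.
+/// Minimal usage sketch (all-zero placeholder key, not a test vector):
+///
+/// ```ignore
+/// let rkeys = aes128_key_schedule(&[0u8; 16]);
+/// let blocks = BatchBlocks::default();
+/// let ct = aes128_encrypt(&rkeys, &blocks);
+/// let pt = aes128_decrypt(&rkeys, &ct); // recovers `blocks`
+/// ```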
+pub(crate) fn aes128_key_schedule(key: &[u8; 16]) -> FixsliceKeys128 { + let mut rkeys = [0u64; 88]; + + bitslice(&mut rkeys[..8], key, key, key, key); + + let mut rk_off = 0; + for rcon in 0..10 { + memshift32(&mut rkeys, rk_off); + rk_off += 8; + + sub_bytes(&mut rkeys[rk_off..(rk_off + 8)]); + sub_bytes_nots(&mut rkeys[rk_off..(rk_off + 8)]); + + if rcon < 8 { + add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon); + } else { + add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon - 8); + add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon - 7); + add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon - 5); + add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon - 4); + } + + xor_columns(&mut rkeys, rk_off, 8, ror_distance(1, 3)); + } + + // Adjust to match fixslicing format + #[cfg(aes_compact)] + { + for i in (8..88).step_by(16) { + inv_shift_rows_1(&mut rkeys[i..(i + 8)]); + } + } + #[cfg(not(aes_compact))] + { + for i in (8..72).step_by(32) { + inv_shift_rows_1(&mut rkeys[i..(i + 8)]); + inv_shift_rows_2(&mut rkeys[(i + 8)..(i + 16)]); + inv_shift_rows_3(&mut rkeys[(i + 16)..(i + 24)]); + } + inv_shift_rows_1(&mut rkeys[72..80]); + } + + // Account for NOTs removed from sub_bytes + for i in 1..11 { + sub_bytes_nots(&mut rkeys[(i * 8)..(i * 8 + 8)]); + } + + rkeys +} + +/// Fully bitsliced AES-192 key schedule to match the fully-fixsliced representation. +#[allow(dead_code)] +pub(crate) fn aes192_key_schedule(key: &[u8; 24]) -> FixsliceKeys192 { + let mut rkeys = [0u64; 104]; + let mut tmp = [0u64; 8]; + + bitslice( + &mut rkeys[..8], + &key[..16], + &key[..16], + &key[..16], + &key[..16], + ); + bitslice(&mut tmp, &key[8..], &key[8..], &key[8..], &key[8..]); + + let mut rcon = 0; + let mut rk_off = 8; + + loop { + for i in 0..8 { + rkeys[rk_off + i] = (0x00ff00ff00ff00ff & (tmp[i] >> 8)) + | (0xff00ff00ff00ff00 & (rkeys[(rk_off - 8) + i] << 8)); + } + + sub_bytes(&mut tmp); + sub_bytes_nots(&mut tmp); + + add_round_constant_bit(&mut tmp, rcon); + rcon += 1; + + for i in 0..8 { + let mut ti = rkeys[rk_off + i]; + ti ^= 0x0f000f000f000f00 & ror(tmp[i], ror_distance(1, 1)); + ti ^= 0xf000f000f000f000 & (ti << 4); + tmp[i] = ti; + } + rkeys[rk_off..(rk_off + 8)].copy_from_slice(&tmp); + rk_off += 8; + + for i in 0..8 { + let ui = tmp[i]; + let mut ti = (0x00ff00ff00ff00ff & (rkeys[(rk_off - 16) + i] >> 8)) + | (0xff00ff00ff00ff00 & (ui << 8)); + ti ^= 0x000f000f000f000f & (ui >> 12); + tmp[i] = ti + ^ (0xfff0fff0fff0fff0 & (ti << 4)) + ^ (0xff00ff00ff00ff00 & (ti << 8)) + ^ (0xf000f000f000f000 & (ti << 12)); + } + rkeys[rk_off..(rk_off + 8)].copy_from_slice(&tmp); + rk_off += 8; + + sub_bytes(&mut tmp); + sub_bytes_nots(&mut tmp); + + add_round_constant_bit(&mut tmp, rcon); + rcon += 1; + + for i in 0..8 { + let mut ti = (0x00ff00ff00ff00ff & (rkeys[(rk_off - 16) + i] >> 8)) + | (0xff00ff00ff00ff00 & (rkeys[(rk_off - 8) + i] << 8)); + ti ^= 0x000f000f000f000f & ror(tmp[i], ror_distance(1, 3)); + rkeys[rk_off + i] = ti + ^ (0xfff0fff0fff0fff0 & (ti << 4)) + ^ (0xff00ff00ff00ff00 & (ti << 8)) + ^ (0xf000f000f000f000 & (ti << 12)); + } + rk_off += 8; + + if rcon >= 8 { + break; + } + + for i in 0..8 { + let ui = rkeys[(rk_off - 8) + i]; + let mut ti = rkeys[(rk_off - 16) + i]; + ti ^= 0x0f000f000f000f00 & (ui >> 4); + ti ^= 0xf000f000f000f000 & (ti << 4); + tmp[i] = ti; + } + } + + // Adjust to match fixslicing format + #[cfg(aes_compact)] + { + for i in (8..104).step_by(16) { + inv_shift_rows_1(&mut rkeys[i..(i + 8)]); + } + } + 
#[cfg(not(aes_compact))] + { + for i in (0..96).step_by(32) { + inv_shift_rows_1(&mut rkeys[(i + 8)..(i + 16)]); + inv_shift_rows_2(&mut rkeys[(i + 16)..(i + 24)]); + inv_shift_rows_3(&mut rkeys[(i + 24)..(i + 32)]); + } + } + + // Account for NOTs removed from sub_bytes + for i in 1..13 { + sub_bytes_nots(&mut rkeys[(i * 8)..(i * 8 + 8)]); + } + + rkeys +} + +/// Fully bitsliced AES-256 key schedule to match the fully-fixsliced representation. +pub(crate) fn aes256_key_schedule(key: &[u8; 32]) -> FixsliceKeys256 { + let mut rkeys = [0u64; 120]; + + bitslice( + &mut rkeys[..8], + &key[..16], + &key[..16], + &key[..16], + &key[..16], + ); + bitslice( + &mut rkeys[8..16], + &key[16..], + &key[16..], + &key[16..], + &key[16..], + ); + + let mut rk_off = 8; + + let mut rcon = 0; + loop { + memshift32(&mut rkeys, rk_off); + rk_off += 8; + + sub_bytes(&mut rkeys[rk_off..(rk_off + 8)]); + sub_bytes_nots(&mut rkeys[rk_off..(rk_off + 8)]); + + add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon); + xor_columns(&mut rkeys, rk_off, 16, ror_distance(1, 3)); + rcon += 1; + + if rcon == 7 { + break; + } + + memshift32(&mut rkeys, rk_off); + rk_off += 8; + + sub_bytes(&mut rkeys[rk_off..(rk_off + 8)]); + sub_bytes_nots(&mut rkeys[rk_off..(rk_off + 8)]); + + xor_columns(&mut rkeys, rk_off, 16, ror_distance(0, 3)); + } + + // Adjust to match fixslicing format + #[cfg(aes_compact)] + { + for i in (8..120).step_by(16) { + inv_shift_rows_1(&mut rkeys[i..(i + 8)]); + } + } + #[cfg(not(aes_compact))] + { + for i in (8..104).step_by(32) { + inv_shift_rows_1(&mut rkeys[i..(i + 8)]); + inv_shift_rows_2(&mut rkeys[(i + 8)..(i + 16)]); + inv_shift_rows_3(&mut rkeys[(i + 16)..(i + 24)]); + } + inv_shift_rows_1(&mut rkeys[104..112]); + } + + // Account for NOTs removed from sub_bytes + for i in 1..15 { + sub_bytes_nots(&mut rkeys[(i * 8)..(i * 8 + 8)]); + } + + rkeys +} + +/// Fully-fixsliced AES-128 decryption (the InvShiftRows is completely omitted). +/// +/// Decrypts four blocks in-place and in parallel. +#[allow(dead_code)] +pub(crate) fn aes128_decrypt(rkeys: &FixsliceKeys128, blocks: &BatchBlocks) -> BatchBlocks { + let mut state = State::default(); + + bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]); + + add_round_key(&mut state, &rkeys[80..]); + inv_sub_bytes(&mut state); + + #[cfg(not(aes_compact))] + { + inv_shift_rows_2(&mut state); + } + + let mut rk_off = 72; + loop { + #[cfg(aes_compact)] + { + inv_shift_rows_2(&mut state); + } + + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_1(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + + if rk_off == 0 { + break; + } + + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_0(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + + #[cfg(not(aes_compact))] + { + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_3(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_2(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + } + } + + add_round_key(&mut state, &rkeys[..8]); + + inv_bitslice(&state) +} + +/// Fully-fixsliced AES-128 encryption (the ShiftRows is completely omitted). +/// +/// Encrypts four blocks in-place and in parallel. 
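+///
+/// Because ShiftRows is never applied to the state, each round pairs SubBytes
+/// with the MixColumns variant for the current round number mod 4. One
+/// iteration of the non-compact round loop boils down to (sketch; `rk1`..`rk4`
+/// stand for the next four 8-word round-key slices):
+///
+/// ```ignore
+/// sub_bytes(&mut state); mix_columns_1(&mut state); add_round_key(&mut state, rk1);
+/// sub_bytes(&mut state); mix_columns_2(&mut state); add_round_key(&mut state, rk2);
+/// sub_bytes(&mut state); mix_columns_3(&mut state); add_round_key(&mut state, rk3);
+/// sub_bytes(&mut state); mix_columns_0(&mut state); add_round_key(&mut state, rk4);
+/// ```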
+pub(crate) fn aes128_encrypt(rkeys: &FixsliceKeys128, blocks: &BatchBlocks) -> BatchBlocks { + let mut state = State::default(); + + bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]); + + add_round_key(&mut state, &rkeys[..8]); + + let mut rk_off = 8; + loop { + sub_bytes(&mut state); + mix_columns_1(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + + #[cfg(aes_compact)] + { + shift_rows_2(&mut state); + } + + if rk_off == 80 { + break; + } + + #[cfg(not(aes_compact))] + { + sub_bytes(&mut state); + mix_columns_2(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + + sub_bytes(&mut state); + mix_columns_3(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + } + + sub_bytes(&mut state); + mix_columns_0(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + } + + #[cfg(not(aes_compact))] + { + shift_rows_2(&mut state); + } + + sub_bytes(&mut state); + add_round_key(&mut state, &rkeys[80..]); + + inv_bitslice(&state) +} + +/// Fully-fixsliced AES-192 decryption (the InvShiftRows is completely omitted). +/// +/// Decrypts four blocks in-place and in parallel. +#[allow(dead_code)] +pub(crate) fn aes192_decrypt(rkeys: &FixsliceKeys192, blocks: &BatchBlocks) -> BatchBlocks { + let mut state = State::default(); + + bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]); + + add_round_key(&mut state, &rkeys[96..]); + inv_sub_bytes(&mut state); + + let mut rk_off = 88; + loop { + #[cfg(aes_compact)] + { + inv_shift_rows_2(&mut state); + } + #[cfg(not(aes_compact))] + { + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_3(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_2(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + } + + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_1(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + + if rk_off == 0 { + break; + } + + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_0(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + } + + add_round_key(&mut state, &rkeys[..8]); + + inv_bitslice(&state) +} + +/// Fully-fixsliced AES-192 encryption (the ShiftRows is completely omitted). +/// +/// Encrypts four blocks in-place and in parallel. 
+#[allow(dead_code)] +pub(crate) fn aes192_encrypt(rkeys: &FixsliceKeys192, blocks: &BatchBlocks) -> BatchBlocks { + let mut state = State::default(); + + bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]); + + add_round_key(&mut state, &rkeys[..8]); + + let mut rk_off = 8; + loop { + sub_bytes(&mut state); + mix_columns_1(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + + #[cfg(aes_compact)] + { + shift_rows_2(&mut state); + } + #[cfg(not(aes_compact))] + { + sub_bytes(&mut state); + mix_columns_2(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + + sub_bytes(&mut state); + mix_columns_3(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + } + + if rk_off == 96 { + break; + } + + sub_bytes(&mut state); + mix_columns_0(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + } + + sub_bytes(&mut state); + add_round_key(&mut state, &rkeys[96..]); + + inv_bitslice(&state) +} + +/// Fully-fixsliced AES-256 decryption (the InvShiftRows is completely omitted). +/// +/// Decrypts four blocks in-place and in parallel. +#[allow(dead_code)] +pub(crate) fn aes256_decrypt(rkeys: &FixsliceKeys256, blocks: &BatchBlocks) -> BatchBlocks { + let mut state = State::default(); + + bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]); + + add_round_key(&mut state, &rkeys[112..]); + inv_sub_bytes(&mut state); + + #[cfg(not(aes_compact))] + { + inv_shift_rows_2(&mut state); + } + + let mut rk_off = 104; + loop { + #[cfg(aes_compact)] + { + inv_shift_rows_2(&mut state); + } + + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_1(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + + if rk_off == 0 { + break; + } + + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_0(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + + #[cfg(not(aes_compact))] + { + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_3(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_2(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + } + } + + add_round_key(&mut state, &rkeys[..8]); + + inv_bitslice(&state) +} + +/// Fully-fixsliced AES-256 encryption (the ShiftRows is completely omitted). +/// +/// Encrypts four blocks in-place and in parallel. 
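+///
+/// AES-256 runs 14 rounds, so `rkeys` holds 15 round keys packed as
+/// 15 * 8 = 120 words and the final AddRoundKey uses `rkeys[112..]`.
+/// Minimal usage sketch (all-zero placeholder key, not a test vector):
+///
+/// ```ignore
+/// let rkeys = aes256_key_schedule(&[0u8; 32]);
+/// let ct = aes256_encrypt(&rkeys, &BatchBlocks::default());
+/// ```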
+pub(crate) fn aes256_encrypt(rkeys: &FixsliceKeys256, blocks: &BatchBlocks) -> BatchBlocks { + let mut state = State::default(); + + bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]); + + add_round_key(&mut state, &rkeys[..8]); + + let mut rk_off = 8; + loop { + sub_bytes(&mut state); + mix_columns_1(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + + #[cfg(aes_compact)] + { + shift_rows_2(&mut state); + } + + if rk_off == 112 { + break; + } + + #[cfg(not(aes_compact))] + { + sub_bytes(&mut state); + mix_columns_2(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + + sub_bytes(&mut state); + mix_columns_3(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + } + + sub_bytes(&mut state); + mix_columns_0(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + } + + #[cfg(not(aes_compact))] + { + shift_rows_2(&mut state); + } + + sub_bytes(&mut state); + add_round_key(&mut state, &rkeys[112..]); + + inv_bitslice(&state) +} + +/// Note that the 4 bitwise NOT (^= 0xffffffffffffffff) are accounted for here so that it is a true +/// inverse of 'sub_bytes'. +fn inv_sub_bytes(state: &mut [u64]) { + debug_assert_eq!(state.len(), 8); + + // Scheduled using https://github.com/Ko-/aes-armcortexm/tree/public/scheduler + // Inline "stack" comments reflect suggested stores and loads (ARM Cortex-M3 and M4) + + let u7 = state[0]; + let u6 = state[1]; + let u5 = state[2]; + let u4 = state[3]; + let u3 = state[4]; + let u2 = state[5]; + let u1 = state[6]; + let u0 = state[7]; + + let t23 = u0 ^ u3; + let t8 = u1 ^ t23; + let m2 = t23 & t8; + let t4 = u4 ^ t8; + let t22 = u1 ^ u3; + let t2 = u0 ^ u1; + let t1 = u3 ^ u4; + // t23 -> stack + let t9 = u7 ^ t1; + // t8 -> stack + let m7 = t22 & t9; + // t9 -> stack + let t24 = u4 ^ u7; + // m7 -> stack + let t10 = t2 ^ t24; + // u4 -> stack + let m14 = t2 & t10; + let r5 = u6 ^ u7; + // m2 -> stack + let t3 = t1 ^ r5; + // t2 -> stack + let t13 = t2 ^ r5; + let t19 = t22 ^ r5; + // t3 -> stack + let t17 = u2 ^ t19; + // t4 -> stack + let t25 = u2 ^ t1; + let r13 = u1 ^ u6; + // t25 -> stack + let t20 = t24 ^ r13; + // t17 -> stack + let m9 = t20 & t17; + // t20 -> stack + let r17 = u2 ^ u5; + // t22 -> stack + let t6 = t22 ^ r17; + // t13 -> stack + let m1 = t13 & t6; + let y5 = u0 ^ r17; + let m4 = t19 & y5; + let m5 = m4 ^ m1; + let m17 = m5 ^ t24; + let r18 = u5 ^ u6; + let t27 = t1 ^ r18; + let t15 = t10 ^ t27; + // t6 -> stack + let m11 = t1 & t15; + let m15 = m14 ^ m11; + let m21 = m17 ^ m15; + // t1 -> stack + // t4 <- stack + let m12 = t4 & t27; + let m13 = m12 ^ m11; + let t14 = t10 ^ r18; + let m3 = t14 ^ m1; + // m2 <- stack + let m16 = m3 ^ m2; + let m20 = m16 ^ m13; + // u4 <- stack + let r19 = u2 ^ u4; + let t16 = r13 ^ r19; + // t3 <- stack + let t26 = t3 ^ t16; + let m6 = t3 & t16; + let m8 = t26 ^ m6; + // t10 -> stack + // m7 <- stack + let m18 = m8 ^ m7; + let m22 = m18 ^ m13; + let m25 = m22 & m20; + let m26 = m21 ^ m25; + let m10 = m9 ^ m6; + let m19 = m10 ^ m15; + // t25 <- stack + let m23 = m19 ^ t25; + let m28 = m23 ^ m25; + let m24 = m22 ^ m23; + let m30 = m26 & m24; + let m39 = m23 ^ m30; + let m48 = m39 & y5; + let m57 = m39 & t19; + // m48 -> stack + let m36 = m24 ^ m25; + let m31 = m20 & m23; + let m27 = m20 ^ m21; + let m32 = m27 & m31; + let m29 = m28 & m27; + let m37 = m21 ^ m29; + // m39 -> stack + let m42 = m37 ^ m39; + let m52 = m42 & t15; + // t27 -> stack + // t1 <- 
stack + let m61 = m42 & t1; + let p0 = m52 ^ m61; + let p16 = m57 ^ m61; + // m57 -> stack + // t20 <- stack + let m60 = m37 & t20; + // p16 -> stack + // t17 <- stack + let m51 = m37 & t17; + let m33 = m27 ^ m25; + let m38 = m32 ^ m33; + let m43 = m37 ^ m38; + let m49 = m43 & t16; + let p6 = m49 ^ m60; + let p13 = m49 ^ m51; + let m58 = m43 & t3; + // t9 <- stack + let m50 = m38 & t9; + // t22 <- stack + let m59 = m38 & t22; + // p6 -> stack + let p1 = m58 ^ m59; + let p7 = p0 ^ p1; + let m34 = m21 & m22; + let m35 = m24 & m34; + let m40 = m35 ^ m36; + let m41 = m38 ^ m40; + let m45 = m42 ^ m41; + // t27 <- stack + let m53 = m45 & t27; + let p8 = m50 ^ m53; + let p23 = p7 ^ p8; + // t4 <- stack + let m62 = m45 & t4; + let p14 = m49 ^ m62; + let s6 = p14 ^ p23; + // t10 <- stack + let m54 = m41 & t10; + let p2 = m54 ^ m62; + let p22 = p2 ^ p7; + let s0 = p13 ^ p22; + let p17 = m58 ^ p2; + let p15 = m54 ^ m59; + // t2 <- stack + let m63 = m41 & t2; + // m39 <- stack + let m44 = m39 ^ m40; + // p17 -> stack + // t6 <- stack + let m46 = m44 & t6; + let p5 = m46 ^ m51; + // p23 -> stack + let p18 = m63 ^ p5; + let p24 = p5 ^ p7; + // m48 <- stack + let p12 = m46 ^ m48; + let s3 = p12 ^ p22; + // t13 <- stack + let m55 = m44 & t13; + let p9 = m55 ^ m63; + // p16 <- stack + let s7 = p9 ^ p16; + // t8 <- stack + let m47 = m40 & t8; + let p3 = m47 ^ m50; + let p19 = p2 ^ p3; + let s5 = p19 ^ p24; + let p11 = p0 ^ p3; + let p26 = p9 ^ p11; + // t23 <- stack + let m56 = m40 & t23; + let p4 = m48 ^ m56; + // p6 <- stack + let p20 = p4 ^ p6; + let p29 = p15 ^ p20; + let s1 = p26 ^ p29; + // m57 <- stack + let p10 = m57 ^ p4; + let p27 = p10 ^ p18; + // p23 <- stack + let s4 = p23 ^ p27; + let p25 = p6 ^ p10; + let p28 = p11 ^ p25; + // p17 <- stack + let s2 = p17 ^ p28; + + state[0] = s7; + state[1] = s6; + state[2] = s5; + state[3] = s4; + state[4] = s3; + state[5] = s2; + state[6] = s1; + state[7] = s0; +} + +/// Bitsliced implementation of the AES Sbox based on Boyar, Peralta and Calik. +/// +/// See: +/// +/// Note that the 4 bitwise NOT (^= 0xffffffffffffffff) are moved to the key schedule. 
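+///
+/// Applying the omitted NOTs afterwards restores the standard SubBytes, which
+/// is how the `hazmat::cipher_round` helper below uses it (sketch):
+///
+/// ```ignore
+/// sub_bytes(&mut state);      // NOT-free Boyar-Peralta circuit
+/// sub_bytes_nots(&mut state); // re-apply the 4 omitted NOTs
+/// ```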
+fn sub_bytes(state: &mut [u64]) { + debug_assert_eq!(state.len(), 8); + + // Scheduled using https://github.com/Ko-/aes-armcortexm/tree/public/scheduler + // Inline "stack" comments reflect suggested stores and loads (ARM Cortex-M3 and M4) + + let u7 = state[0]; + let u6 = state[1]; + let u5 = state[2]; + let u4 = state[3]; + let u3 = state[4]; + let u2 = state[5]; + let u1 = state[6]; + let u0 = state[7]; + + let y14 = u3 ^ u5; + let y13 = u0 ^ u6; + let y12 = y13 ^ y14; + let t1 = u4 ^ y12; + let y15 = t1 ^ u5; + let t2 = y12 & y15; + let y6 = y15 ^ u7; + let y20 = t1 ^ u1; + // y12 -> stack + let y9 = u0 ^ u3; + // y20 -> stack + let y11 = y20 ^ y9; + // y9 -> stack + let t12 = y9 & y11; + // y6 -> stack + let y7 = u7 ^ y11; + let y8 = u0 ^ u5; + let t0 = u1 ^ u2; + let y10 = y15 ^ t0; + // y15 -> stack + let y17 = y10 ^ y11; + // y14 -> stack + let t13 = y14 & y17; + let t14 = t13 ^ t12; + // y17 -> stack + let y19 = y10 ^ y8; + // y10 -> stack + let t15 = y8 & y10; + let t16 = t15 ^ t12; + let y16 = t0 ^ y11; + // y11 -> stack + let y21 = y13 ^ y16; + // y13 -> stack + let t7 = y13 & y16; + // y16 -> stack + let y18 = u0 ^ y16; + let y1 = t0 ^ u7; + let y4 = y1 ^ u3; + // u7 -> stack + let t5 = y4 & u7; + let t6 = t5 ^ t2; + let t18 = t6 ^ t16; + let t22 = t18 ^ y19; + let y2 = y1 ^ u0; + let t10 = y2 & y7; + let t11 = t10 ^ t7; + let t20 = t11 ^ t16; + let t24 = t20 ^ y18; + let y5 = y1 ^ u6; + let t8 = y5 & y1; + let t9 = t8 ^ t7; + let t19 = t9 ^ t14; + let t23 = t19 ^ y21; + let y3 = y5 ^ y8; + // y6 <- stack + let t3 = y3 & y6; + let t4 = t3 ^ t2; + // y20 <- stack + let t17 = t4 ^ y20; + let t21 = t17 ^ t14; + let t26 = t21 & t23; + let t27 = t24 ^ t26; + let t31 = t22 ^ t26; + let t25 = t21 ^ t22; + // y4 -> stack + let t28 = t25 & t27; + let t29 = t28 ^ t22; + let z14 = t29 & y2; + let z5 = t29 & y7; + let t30 = t23 ^ t24; + let t32 = t31 & t30; + let t33 = t32 ^ t24; + let t35 = t27 ^ t33; + let t36 = t24 & t35; + let t38 = t27 ^ t36; + let t39 = t29 & t38; + let t40 = t25 ^ t39; + let t43 = t29 ^ t40; + // y16 <- stack + let z3 = t43 & y16; + let tc12 = z3 ^ z5; + // tc12 -> stack + // y13 <- stack + let z12 = t43 & y13; + let z13 = t40 & y5; + let z4 = t40 & y1; + let tc6 = z3 ^ z4; + let t34 = t23 ^ t33; + let t37 = t36 ^ t34; + let t41 = t40 ^ t37; + // y10 <- stack + let z8 = t41 & y10; + let z17 = t41 & y8; + let t44 = t33 ^ t37; + // y15 <- stack + let z0 = t44 & y15; + // z17 -> stack + // y12 <- stack + let z9 = t44 & y12; + let z10 = t37 & y3; + let z1 = t37 & y6; + let tc5 = z1 ^ z0; + let tc11 = tc6 ^ tc5; + // y4 <- stack + let z11 = t33 & y4; + let t42 = t29 ^ t33; + let t45 = t42 ^ t41; + // y17 <- stack + let z7 = t45 & y17; + let tc8 = z7 ^ tc6; + // y14 <- stack + let z16 = t45 & y14; + // y11 <- stack + let z6 = t42 & y11; + let tc16 = z6 ^ tc8; + // z14 -> stack + // y9 <- stack + let z15 = t42 & y9; + let tc20 = z15 ^ tc16; + let tc1 = z15 ^ z16; + let tc2 = z10 ^ tc1; + let tc21 = tc2 ^ z11; + let tc3 = z9 ^ tc2; + let s0 = tc3 ^ tc16; + let s3 = tc3 ^ tc11; + let s1 = s3 ^ tc16; + let tc13 = z13 ^ tc1; + // u7 <- stack + let z2 = t33 & u7; + let tc4 = z0 ^ z2; + let tc7 = z12 ^ tc4; + let tc9 = z8 ^ tc7; + let tc10 = tc8 ^ tc9; + // z14 <- stack + let tc17 = z14 ^ tc10; + let s5 = tc21 ^ tc17; + let tc26 = tc17 ^ tc20; + // z17 <- stack + let s2 = tc26 ^ z17; + // tc12 <- stack + let tc14 = tc4 ^ tc12; + let tc18 = tc13 ^ tc14; + let s6 = tc10 ^ tc18; + let s7 = z12 ^ tc18; + let s4 = tc14 ^ s3; + + state[0] = s7; + state[1] = s6; + state[2] = s5; + 
state[3] = s4; + state[4] = s3; + state[5] = s2; + state[6] = s1; + state[7] = s0; +} + +/// NOT operations that are omitted in S-box +#[inline] +fn sub_bytes_nots(state: &mut [u64]) { + debug_assert_eq!(state.len(), 8); + state[0] ^= 0xffffffffffffffff; + state[1] ^= 0xffffffffffffffff; + state[5] ^= 0xffffffffffffffff; + state[6] ^= 0xffffffffffffffff; +} + +/// Computation of the MixColumns transformation in the fixsliced representation, with different +/// rotations used according to the round number mod 4. +/// +/// Based on Käsper-Schwabe, similar to https://github.com/Ko-/aes-armcortexm. +macro_rules! define_mix_columns { + ( + $name:ident, + $name_inv:ident, + $first_rotate:path, + $second_rotate:path + ) => { + #[rustfmt::skip] + fn $name(state: &mut State) { + let (a0, a1, a2, a3, a4, a5, a6, a7) = ( + state[0], state[1], state[2], state[3], state[4], state[5], state[6], state[7] + ); + let (b0, b1, b2, b3, b4, b5, b6, b7) = ( + $first_rotate(a0), + $first_rotate(a1), + $first_rotate(a2), + $first_rotate(a3), + $first_rotate(a4), + $first_rotate(a5), + $first_rotate(a6), + $first_rotate(a7), + ); + let (c0, c1, c2, c3, c4, c5, c6, c7) = ( + a0 ^ b0, + a1 ^ b1, + a2 ^ b2, + a3 ^ b3, + a4 ^ b4, + a5 ^ b5, + a6 ^ b6, + a7 ^ b7, + ); + state[0] = b0 ^ c7 ^ $second_rotate(c0); + state[1] = b1 ^ c0 ^ c7 ^ $second_rotate(c1); + state[2] = b2 ^ c1 ^ $second_rotate(c2); + state[3] = b3 ^ c2 ^ c7 ^ $second_rotate(c3); + state[4] = b4 ^ c3 ^ c7 ^ $second_rotate(c4); + state[5] = b5 ^ c4 ^ $second_rotate(c5); + state[6] = b6 ^ c5 ^ $second_rotate(c6); + state[7] = b7 ^ c6 ^ $second_rotate(c7); + } + + #[rustfmt::skip] + fn $name_inv(state: &mut State) { + let (a0, a1, a2, a3, a4, a5, a6, a7) = ( + state[0], state[1], state[2], state[3], state[4], state[5], state[6], state[7] + ); + let (b0, b1, b2, b3, b4, b5, b6, b7) = ( + $first_rotate(a0), + $first_rotate(a1), + $first_rotate(a2), + $first_rotate(a3), + $first_rotate(a4), + $first_rotate(a5), + $first_rotate(a6), + $first_rotate(a7), + ); + let (c0, c1, c2, c3, c4, c5, c6, c7) = ( + a0 ^ b0, + a1 ^ b1, + a2 ^ b2, + a3 ^ b3, + a4 ^ b4, + a5 ^ b5, + a6 ^ b6, + a7 ^ b7, + ); + let (d0, d1, d2, d3, d4, d5, d6, d7) = ( + a0 ^ c7, + a1 ^ c0 ^ c7, + a2 ^ c1, + a3 ^ c2 ^ c7, + a4 ^ c3 ^ c7, + a5 ^ c4, + a6 ^ c5, + a7 ^ c6, + ); + let (e0, e1, e2, e3, e4, e5, e6, e7) = ( + c0 ^ d6, + c1 ^ d6 ^ d7, + c2 ^ d0 ^ d7, + c3 ^ d1 ^ d6, + c4 ^ d2 ^ d6 ^ d7, + c5 ^ d3 ^ d7, + c6 ^ d4, + c7 ^ d5, + ); + state[0] = d0 ^ e0 ^ $second_rotate(e0); + state[1] = d1 ^ e1 ^ $second_rotate(e1); + state[2] = d2 ^ e2 ^ $second_rotate(e2); + state[3] = d3 ^ e3 ^ $second_rotate(e3); + state[4] = d4 ^ e4 ^ $second_rotate(e4); + state[5] = d5 ^ e5 ^ $second_rotate(e5); + state[6] = d6 ^ e6 ^ $second_rotate(e6); + state[7] = d7 ^ e7 ^ $second_rotate(e7); + } + } +} + +define_mix_columns!( + mix_columns_0, + inv_mix_columns_0, + rotate_rows_1, + rotate_rows_2 +); + +define_mix_columns!( + mix_columns_1, + inv_mix_columns_1, + rotate_rows_and_columns_1_1, + rotate_rows_and_columns_2_2 +); + +#[cfg(not(aes_compact))] +define_mix_columns!( + mix_columns_2, + inv_mix_columns_2, + rotate_rows_and_columns_1_2, + rotate_rows_2 +); + +#[cfg(not(aes_compact))] +define_mix_columns!( + mix_columns_3, + inv_mix_columns_3, + rotate_rows_and_columns_1_3, + rotate_rows_and_columns_2_2 +); + +#[inline] +fn delta_swap_1(a: &mut u64, shift: u32, mask: u64) { + let t = (*a ^ ((*a) >> shift)) & mask; + *a ^= t ^ (t << shift); +} + +#[inline] +fn delta_swap_2(a: &mut u64, b: &mut u64, shift: 
u32, mask: u64) { + let t = (*a ^ ((*b) >> shift)) & mask; + *a ^= t; + *b ^= t << shift; +} + +/// Applies ShiftRows once on an AES state (or key). +#[cfg(any(not(aes_compact), feature = "hazmat"))] +#[inline] +fn shift_rows_1(state: &mut [u64]) { + debug_assert_eq!(state.len(), 8); + for x in state.iter_mut() { + delta_swap_1(x, 8, 0x00f000ff000f0000); + delta_swap_1(x, 4, 0x0f0f00000f0f0000); + } +} + +/// Applies ShiftRows twice on an AES state (or key). +#[inline] +fn shift_rows_2(state: &mut [u64]) { + debug_assert_eq!(state.len(), 8); + for x in state.iter_mut() { + delta_swap_1(x, 8, 0x00ff000000ff0000); + } +} + +/// Applies ShiftRows three times on an AES state (or key). +#[inline] +fn shift_rows_3(state: &mut [u64]) { + debug_assert_eq!(state.len(), 8); + for x in state.iter_mut() { + delta_swap_1(x, 8, 0x000f00ff00f00000); + delta_swap_1(x, 4, 0x0f0f00000f0f0000); + } +} + +#[inline(always)] +fn inv_shift_rows_1(state: &mut [u64]) { + shift_rows_3(state); +} + +#[inline(always)] +fn inv_shift_rows_2(state: &mut [u64]) { + shift_rows_2(state); +} + +#[cfg(not(aes_compact))] +#[inline(always)] +fn inv_shift_rows_3(state: &mut [u64]) { + shift_rows_1(state); +} + +/// XOR the columns after the S-box during the key schedule round function. +/// +/// The `idx_xor` parameter refers to the index of the previous round key that is +/// involved in the XOR computation (should be 8 and 16 for AES-128 and AES-256, +/// respectively). +/// +/// The `idx_ror` parameter refers to the rotation value, which varies between the +/// different key schedules. +fn xor_columns(rkeys: &mut [u64], offset: usize, idx_xor: usize, idx_ror: u32) { + for i in 0..8 { + let off_i = offset + i; + let rk = rkeys[off_i - idx_xor] ^ (0x000f000f000f000f & ror(rkeys[off_i], idx_ror)); + rkeys[off_i] = rk + ^ (0xfff0fff0fff0fff0 & (rk << 4)) + ^ (0xff00ff00ff00ff00 & (rk << 8)) + ^ (0xf000f000f000f000 & (rk << 12)); + } +} + +/// Bitslice four 128-bit input blocks input0, input1, input2, input3 into a 512-bit internal state. +fn bitslice(output: &mut [u64], input0: &[u8], input1: &[u8], input2: &[u8], input3: &[u8]) { + debug_assert_eq!(output.len(), 8); + debug_assert_eq!(input0.len(), 16); + debug_assert_eq!(input1.len(), 16); + debug_assert_eq!(input2.len(), 16); + debug_assert_eq!(input3.len(), 16); + + // Bitslicing is a bit index manipulation. 512 bits of data means each bit is positioned at a + // 9-bit index. 
AES data is 4 blocks, each one a 4x4 column-major matrix of bytes, so the + // index is initially ([b]lock, [c]olumn, [r]ow, [p]osition): + // b1 b0 c1 c0 r1 r0 p2 p1 p0 + // + // The desired bitsliced data groups first by bit position, then row, column, block: + // p2 p1 p0 r1 r0 c1 c0 b1 b0 + + #[rustfmt::skip] + fn read_reordered(input: &[u8]) -> u64 { + (u64::from(input[0x0]) ) | + (u64::from(input[0x1]) << 0x10) | + (u64::from(input[0x2]) << 0x20) | + (u64::from(input[0x3]) << 0x30) | + (u64::from(input[0x8]) << 0x08) | + (u64::from(input[0x9]) << 0x18) | + (u64::from(input[0xa]) << 0x28) | + (u64::from(input[0xb]) << 0x38) + } + + // Reorder each block's bytes on input + // __ __ c1 c0 r1 r0 __ __ __ => __ __ c0 r1 r0 c1 __ __ __ + // Reorder by relabeling (note the order of input) + // b1 b0 c0 __ __ __ __ __ __ => c0 b1 b0 __ __ __ __ __ __ + let mut t0 = read_reordered(&input0[0x00..0x0c]); + let mut t4 = read_reordered(&input0[0x04..0x10]); + let mut t1 = read_reordered(&input1[0x00..0x0c]); + let mut t5 = read_reordered(&input1[0x04..0x10]); + let mut t2 = read_reordered(&input2[0x00..0x0c]); + let mut t6 = read_reordered(&input2[0x04..0x10]); + let mut t3 = read_reordered(&input3[0x00..0x0c]); + let mut t7 = read_reordered(&input3[0x04..0x10]); + + // Bit Index Swap 6 <-> 0: + // __ __ b0 __ __ __ __ __ p0 => __ __ p0 __ __ __ __ __ b0 + let m0 = 0x5555555555555555; + delta_swap_2(&mut t1, &mut t0, 1, m0); + delta_swap_2(&mut t3, &mut t2, 1, m0); + delta_swap_2(&mut t5, &mut t4, 1, m0); + delta_swap_2(&mut t7, &mut t6, 1, m0); + + // Bit Index Swap 7 <-> 1: + // __ b1 __ __ __ __ __ p1 __ => __ p1 __ __ __ __ __ b1 __ + let m1 = 0x3333333333333333; + delta_swap_2(&mut t2, &mut t0, 2, m1); + delta_swap_2(&mut t3, &mut t1, 2, m1); + delta_swap_2(&mut t6, &mut t4, 2, m1); + delta_swap_2(&mut t7, &mut t5, 2, m1); + + // Bit Index Swap 8 <-> 2: + // c0 __ __ __ __ __ p2 __ __ => p2 __ __ __ __ __ c0 __ __ + let m2 = 0x0f0f0f0f0f0f0f0f; + delta_swap_2(&mut t4, &mut t0, 4, m2); + delta_swap_2(&mut t5, &mut t1, 4, m2); + delta_swap_2(&mut t6, &mut t2, 4, m2); + delta_swap_2(&mut t7, &mut t3, 4, m2); + + // Final bitsliced bit index, as desired: + // p2 p1 p0 r1 r0 c1 c0 b1 b0 + output[0] = t0; + output[1] = t1; + output[2] = t2; + output[3] = t3; + output[4] = t4; + output[5] = t5; + output[6] = t6; + output[7] = t7; +} + +/// Un-bitslice a 512-bit internal state into four 128-bit blocks of output. +fn inv_bitslice(input: &[u64]) -> BatchBlocks { + debug_assert_eq!(input.len(), 8); + + // Unbitslicing is a bit index manipulation. 512 bits of data means each bit is positioned at + // a 9-bit index. 
AES data is 4 blocks, each one a 4x4 column-major matrix of bytes, so the + // desired index for the output is ([b]lock, [c]olumn, [r]ow, [p]osition): + // b1 b0 c1 c0 r1 r0 p2 p1 p0 + // + // The initially bitsliced data groups first by bit position, then row, column, block: + // p2 p1 p0 r1 r0 c1 c0 b1 b0 + + let mut t0 = input[0]; + let mut t1 = input[1]; + let mut t2 = input[2]; + let mut t3 = input[3]; + let mut t4 = input[4]; + let mut t5 = input[5]; + let mut t6 = input[6]; + let mut t7 = input[7]; + + // TODO: these bit index swaps are identical to those in 'packing' + + // Bit Index Swap 6 <-> 0: + // __ __ p0 __ __ __ __ __ b0 => __ __ b0 __ __ __ __ __ p0 + let m0 = 0x5555555555555555; + delta_swap_2(&mut t1, &mut t0, 1, m0); + delta_swap_2(&mut t3, &mut t2, 1, m0); + delta_swap_2(&mut t5, &mut t4, 1, m0); + delta_swap_2(&mut t7, &mut t6, 1, m0); + + // Bit Index Swap 7 <-> 1: + // __ p1 __ __ __ __ __ b1 __ => __ b1 __ __ __ __ __ p1 __ + let m1 = 0x3333333333333333; + delta_swap_2(&mut t2, &mut t0, 2, m1); + delta_swap_2(&mut t3, &mut t1, 2, m1); + delta_swap_2(&mut t6, &mut t4, 2, m1); + delta_swap_2(&mut t7, &mut t5, 2, m1); + + // Bit Index Swap 8 <-> 2: + // p2 __ __ __ __ __ c0 __ __ => c0 __ __ __ __ __ p2 __ __ + let m2 = 0x0f0f0f0f0f0f0f0f; + delta_swap_2(&mut t4, &mut t0, 4, m2); + delta_swap_2(&mut t5, &mut t1, 4, m2); + delta_swap_2(&mut t6, &mut t2, 4, m2); + delta_swap_2(&mut t7, &mut t3, 4, m2); + + #[rustfmt::skip] + fn write_reordered(columns: u64, output: &mut [u8]) { + output[0x0] = (columns ) as u8; + output[0x1] = (columns >> 0x10) as u8; + output[0x2] = (columns >> 0x20) as u8; + output[0x3] = (columns >> 0x30) as u8; + output[0x8] = (columns >> 0x08) as u8; + output[0x9] = (columns >> 0x18) as u8; + output[0xa] = (columns >> 0x28) as u8; + output[0xb] = (columns >> 0x38) as u8; + } + + let mut output = BatchBlocks::default(); + // Reorder by relabeling (note the order of output) + // c0 b1 b0 __ __ __ __ __ __ => b1 b0 c0 __ __ __ __ __ __ + // Reorder each block's bytes on output + // __ __ c0 r1 r0 c1 __ __ __ => __ __ c1 c0 r1 r0 __ __ __ + write_reordered(t0, &mut output[0][0x00..0x0c]); + write_reordered(t4, &mut output[0][0x04..0x10]); + write_reordered(t1, &mut output[1][0x00..0x0c]); + write_reordered(t5, &mut output[1][0x04..0x10]); + write_reordered(t2, &mut output[2][0x00..0x0c]); + write_reordered(t6, &mut output[2][0x04..0x10]); + write_reordered(t3, &mut output[3][0x00..0x0c]); + write_reordered(t7, &mut output[3][0x04..0x10]); + + // Final AES bit index, as desired: + // b1 b0 c1 c0 r1 r0 p2 p1 p0 + output +} + +/// Copy 32-bytes within the provided slice to an 8-byte offset +fn memshift32(buffer: &mut [u64], src_offset: usize) { + debug_assert_eq!(src_offset % 8, 0); + + let dst_offset = src_offset + 8; + debug_assert!(dst_offset + 8 <= buffer.len()); + + for i in (0..8).rev() { + buffer[dst_offset + i] = buffer[src_offset + i]; + } +} + +/// XOR the round key to the internal state. The round keys are expected to be +/// pre-computed and to be packed in the fixsliced representation. 
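+///
+/// Round key `r` occupies the 8 consecutive words `rkeys[8 * r..8 * r + 8]`, so
+/// applying it looks like (illustrative indexing sketch):
+///
+/// ```ignore
+/// add_round_key(&mut state, &rkeys[(8 * r)..(8 * r + 8)]);
+/// ```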
+#[inline] +fn add_round_key(state: &mut State, rkey: &[u64]) { + debug_assert_eq!(rkey.len(), 8); + for (a, b) in state.iter_mut().zip(rkey) { + *a ^= b; + } +} + +#[inline(always)] +fn add_round_constant_bit(state: &mut [u64], bit: usize) { + state[bit] ^= 0x00000000f0000000; +} + +#[inline(always)] +fn ror(x: u64, y: u32) -> u64 { + x.rotate_right(y) +} + +#[inline(always)] +fn ror_distance(rows: u32, cols: u32) -> u32 { + (rows << 4) + (cols << 2) +} + +#[inline(always)] +fn rotate_rows_1(x: u64) -> u64 { + ror(x, ror_distance(1, 0)) +} + +#[inline(always)] +fn rotate_rows_2(x: u64) -> u64 { + ror(x, ror_distance(2, 0)) +} + +#[inline(always)] +#[rustfmt::skip] +fn rotate_rows_and_columns_1_1(x: u64) -> u64 { + (ror(x, ror_distance(1, 1)) & 0x0fff0fff0fff0fff) | + (ror(x, ror_distance(0, 1)) & 0xf000f000f000f000) +} + +#[cfg(not(aes_compact))] +#[inline(always)] +#[rustfmt::skip] +fn rotate_rows_and_columns_1_2(x: u64) -> u64 { + (ror(x, ror_distance(1, 2)) & 0x00ff00ff00ff00ff) | + (ror(x, ror_distance(0, 2)) & 0xff00ff00ff00ff00) +} + +#[cfg(not(aes_compact))] +#[inline(always)] +#[rustfmt::skip] +fn rotate_rows_and_columns_1_3(x: u64) -> u64 { + (ror(x, ror_distance(1, 3)) & 0x000f000f000f000f) | + (ror(x, ror_distance(0, 3)) & 0xfff0fff0fff0fff0) +} + +#[inline(always)] +#[rustfmt::skip] +fn rotate_rows_and_columns_2_2(x: u64) -> u64 { + (ror(x, ror_distance(2, 2)) & 0x00ff00ff00ff00ff) | + (ror(x, ror_distance(1, 2)) & 0xff00ff00ff00ff00) +} + +/// Low-level "hazmat" AES functions. +/// +/// Note: this isn't actually used in the `Aes128`/`Aes192`/`Aes256` +/// implementations in this crate, but instead provides raw access to +/// the AES round function gated under the `hazmat` crate feature. +#[cfg(feature = "hazmat")] +pub(crate) mod hazmat { + use super::{ + bitslice, inv_bitslice, inv_mix_columns_0, inv_shift_rows_1, inv_sub_bytes, mix_columns_0, + shift_rows_1, sub_bytes, sub_bytes_nots, State, + }; + use crate::{Block, Block8}; + + /// XOR the `src` block into the `dst` block in-place. + fn xor_in_place(dst: &mut Block, src: &Block) { + for (a, b) in dst.iter_mut().zip(src.as_slice()) { + *a ^= *b; + } + } + + /// Perform a bitslice operation, loading a single block. + fn bitslice_block(block: &Block) -> State { + let mut state = State::default(); + bitslice(&mut state, block, block, block, block); + state + } + + /// Perform an inverse bitslice operation, extracting a single block. + fn inv_bitslice_block(block: &mut Block, state: &State) { + block.copy_from_slice(&inv_bitslice(state)[0]); + } + + /// AES cipher (encrypt) round function. + #[inline] + pub(crate) fn cipher_round(block: &mut Block, round_key: &Block) { + let mut state = bitslice_block(block); + sub_bytes(&mut state); + sub_bytes_nots(&mut state); + shift_rows_1(&mut state); + mix_columns_0(&mut state); + inv_bitslice_block(block, &state); + xor_in_place(block, round_key); + } + + /// AES cipher (encrypt) round function: parallel version. + #[inline] + pub(crate) fn cipher_round_par(blocks: &mut Block8, round_keys: &Block8) { + for (chunk, keys) in blocks.chunks_exact_mut(4).zip(round_keys.chunks_exact(4)) { + let mut state = State::default(); + bitslice(&mut state, &chunk[0], &chunk[1], &chunk[2], &chunk[3]); + sub_bytes(&mut state); + sub_bytes_nots(&mut state); + shift_rows_1(&mut state); + mix_columns_0(&mut state); + let res = inv_bitslice(&state); + + for i in 0..4 { + chunk[i] = res[i]; + xor_in_place(&mut chunk[i], &keys[i]); + } + } + } + + /// AES cipher (encrypt) round function. 
+    #[inline]
+    pub(crate) fn equiv_inv_cipher_round(block: &mut Block, round_key: &Block) {
+        let mut state = State::default();
+        bitslice(&mut state, block, block, block, block);
+        sub_bytes_nots(&mut state);
+        inv_sub_bytes(&mut state);
+        inv_shift_rows_1(&mut state);
+        inv_mix_columns_0(&mut state);
+        inv_bitslice_block(block, &state);
+        xor_in_place(block, round_key);
+    }
+
+    /// AES equivalent inverse cipher (decrypt) round function: parallel version.
+    #[inline]
+    pub(crate) fn equiv_inv_cipher_round_par(blocks: &mut Block8, round_keys: &Block8) {
+        for (chunk, keys) in blocks.chunks_exact_mut(4).zip(round_keys.chunks_exact(4)) {
+            let mut state = State::default();
+            bitslice(&mut state, &chunk[0], &chunk[1], &chunk[2], &chunk[3]);
+            sub_bytes_nots(&mut state);
+            inv_sub_bytes(&mut state);
+            inv_shift_rows_1(&mut state);
+            inv_mix_columns_0(&mut state);
+            let res = inv_bitslice(&state);
+
+            for i in 0..4 {
+                chunk[i] = res[i];
+                xor_in_place(&mut chunk[i], &keys[i]);
+            }
+        }
+    }
+
+    /// AES mix columns function.
+    #[inline]
+    pub(crate) fn mix_columns(block: &mut Block) {
+        let mut state = bitslice_block(block);
+        mix_columns_0(&mut state);
+        inv_bitslice_block(block, &state);
+    }
+
+    /// AES inverse mix columns function.
+    #[inline]
+    pub(crate) fn inv_mix_columns(block: &mut Block) {
+        let mut state = bitslice_block(block);
+        inv_mix_columns_0(&mut state);
+        inv_bitslice_block(block, &state);
+    }
+}