diff --git a/Cargo.toml b/Cargo.toml index 085137f5f6..fdd4178a3d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -46,7 +46,6 @@ include = [ "crypto/curve25519/curve25519_64_adx.c", "crypto/curve25519/curve25519_tables.h", "crypto/curve25519/internal.h", - "crypto/fipsmodule/aes/aes_nohw.c", "crypto/fipsmodule/aes/asm/aesni-x86.pl", "crypto/fipsmodule/aes/asm/aesni-x86_64.pl", "crypto/fipsmodule/aes/asm/aesv8-armx.pl", diff --git a/build.rs b/build.rs index 01ed161d14..863444c664 100644 --- a/build.rs +++ b/build.rs @@ -54,7 +54,6 @@ const WASM32: &str = "wasm32"; #[rustfmt::skip] const RING_SRCS: &[(&[&str], &str)] = &[ (&[], "crypto/curve25519/curve25519.c"), - (&[], "crypto/fipsmodule/aes/aes_nohw.c"), (&[], "crypto/fipsmodule/bn/montgomery.c"), (&[], "crypto/fipsmodule/bn/montgomery_inv.c"), (&[], "crypto/fipsmodule/ec/ecp_nistz.c"), @@ -884,9 +883,6 @@ fn prefix_all_symbols(pp: char, prefix_prefix: &str, prefix: &str) -> String { "aes_hw_ctr32_encrypt_blocks", "aes_hw_encrypt", "aes_hw_set_encrypt_key", - "aes_nohw_ctr32_encrypt_blocks", - "aes_nohw_encrypt", - "aes_nohw_set_encrypt_key", "aesni_gcm_decrypt", "aesni_gcm_encrypt", "bn_from_montgomery_in_place", diff --git a/crypto/fipsmodule/aes/aes_nohw.c b/crypto/fipsmodule/aes/aes_nohw.c deleted file mode 100644 index 731178516d..0000000000 --- a/crypto/fipsmodule/aes/aes_nohw.c +++ /dev/null @@ -1,959 +0,0 @@ -/* Copyright (c) 2019, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - -#include - -#include "../../internal.h" - -#if defined(OPENSSL_SSE2) -#include -#endif - - -// This file contains a constant-time implementation of AES, bitsliced with -// 32-bit, 64-bit, or 128-bit words, operating on two-, four-, and eight-block -// batches, respectively. The 128-bit implementation requires SSE2 intrinsics. -// -// This implementation is based on the algorithms described in the following -// references: -// - https://bearssl.org/constanttime.html#aes -// - https://eprint.iacr.org/2009/129.pdf -// - https://eprint.iacr.org/2009/191.pdf - - -// Word operations. -// -// An aes_word_t is the word used for this AES implementation. Throughout this -// file, bits and bytes are ordered little-endian, though "left" and "right" -// shifts match the operations themselves, which makes them reversed in a -// little-endian, left-to-right reading. -// -// Eight |aes_word_t|s contain |AES_NOHW_BATCH_SIZE| blocks. The bits in an -// |aes_word_t| are divided into 16 consecutive groups of |AES_NOHW_BATCH_SIZE| -// bits each, each corresponding to a byte in an AES block in column-major -// order (AES's byte order). We refer to these as "logical bytes". Note, in the -// 32-bit and 64-bit implementations, they are smaller than a byte. (The -// contents of a logical byte will be described later.) 
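// [Editor's note] A hypothetical sketch, not part of this patch, to make the
// "logical byte" layout described above concrete for the 64-bit variant, where
// AES_NOHW_BATCH_SIZE is 4: a word is 16 logical bytes of 4 bits each, and
// logical byte |j| occupies bits j*4 .. j*4 + 4 of the word.
fn logical_byte(word: u64, j: u32) -> u64 {
    const BATCH_SIZE: u32 = 4; // the 64-bit variant
    (word >> (j * BATCH_SIZE)) & ((1u64 << BATCH_SIZE) - 1)
}

fn main() {
    let w = 0x0123_4567_89ab_cdefu64;
    assert_eq!(logical_byte(w, 0), 0xf); // least-significant logical byte
    assert_eq!(logical_byte(w, 15), 0x0); // most-significant logical byte
}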
-// -// MSVC does not support C bit operators on |__m128i|, so the wrapper functions -// |aes_nohw_and|, etc., should be used instead. Note |aes_nohw_shift_left| and -// |aes_nohw_shift_right| measure the shift in logical bytes. That is, the shift -// value ranges from 0 to 15 independent of |aes_word_t| and -// |AES_NOHW_BATCH_SIZE|. -// -// This ordering is different from https://eprint.iacr.org/2009/129.pdf, which -// uses row-major order. Matching the AES order was easier to reason about, and -// we do not have PSHUFB available to arbitrarily permute bytes. - -#if defined(OPENSSL_SSE2) -typedef __m128i aes_word_t; -// AES_NOHW_WORD_SIZE is sizeof(aes_word_t). alignas(sizeof(T)) does not work in -// MSVC, so we define a constant. -#define AES_NOHW_WORD_SIZE 16 -#define AES_NOHW_BATCH_SIZE 8 -#define AES_NOHW_ROW0_MASK \ - _mm_set_epi32(0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff) -#define AES_NOHW_ROW1_MASK \ - _mm_set_epi32(0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00) -#define AES_NOHW_ROW2_MASK \ - _mm_set_epi32(0x00ff0000, 0x00ff0000, 0x00ff0000, 0x00ff0000) -#define AES_NOHW_ROW3_MASK \ - _mm_set_epi32(0xff000000, 0xff000000, 0xff000000, 0xff000000) - -static inline aes_word_t aes_nohw_and(aes_word_t a, aes_word_t b) { - return _mm_and_si128(a, b); -} - -static inline aes_word_t aes_nohw_or(aes_word_t a, aes_word_t b) { - return _mm_or_si128(a, b); -} - -static inline aes_word_t aes_nohw_xor(aes_word_t a, aes_word_t b) { - return _mm_xor_si128(a, b); -} - -static inline aes_word_t aes_nohw_not(aes_word_t a) { - return _mm_xor_si128( - a, _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff)); -} - -// These are macros because parameters to |_mm_slli_si128| and |_mm_srli_si128| -// must be constants. -#define aes_nohw_shift_left(/* aes_word_t */ a, /* const */ i) \ - _mm_slli_si128((a), (i)) -#define aes_nohw_shift_right(/* aes_word_t */ a, /* const */ i) \ - _mm_srli_si128((a), (i)) -#else // !OPENSSL_SSE2 -#if defined(OPENSSL_64_BIT) -typedef uint64_t aes_word_t; -#define AES_NOHW_WORD_SIZE 8 -#define AES_NOHW_BATCH_SIZE 4 -#define AES_NOHW_ROW0_MASK UINT64_C(0x000f000f000f000f) -#define AES_NOHW_ROW1_MASK UINT64_C(0x00f000f000f000f0) -#define AES_NOHW_ROW2_MASK UINT64_C(0x0f000f000f000f00) -#define AES_NOHW_ROW3_MASK UINT64_C(0xf000f000f000f000) -#else // !OPENSSL_64_BIT -typedef uint32_t aes_word_t; -#define AES_NOHW_WORD_SIZE 4 -#define AES_NOHW_BATCH_SIZE 2 -#define AES_NOHW_ROW0_MASK 0x03030303 -#define AES_NOHW_ROW1_MASK 0x0c0c0c0c -#define AES_NOHW_ROW2_MASK 0x30303030 -#define AES_NOHW_ROW3_MASK 0xc0c0c0c0 -#endif // OPENSSL_64_BIT - -static inline aes_word_t aes_nohw_and(aes_word_t a, aes_word_t b) { - return a & b; -} - -static inline aes_word_t aes_nohw_or(aes_word_t a, aes_word_t b) { - return a | b; -} - -static inline aes_word_t aes_nohw_xor(aes_word_t a, aes_word_t b) { - return a ^ b; -} - -static inline aes_word_t aes_nohw_not(aes_word_t a) { return ~a; } - -static inline aes_word_t aes_nohw_shift_left(aes_word_t a, aes_word_t i) { - return a << (i * AES_NOHW_BATCH_SIZE); -} - -static inline aes_word_t aes_nohw_shift_right(aes_word_t a, aes_word_t i) { - return a >> (i * AES_NOHW_BATCH_SIZE); -} -#endif // OPENSSL_SSE2 - -OPENSSL_STATIC_ASSERT(AES_NOHW_BATCH_SIZE * 128 == 8 * 8 * sizeof(aes_word_t), - "batch size does not match word size"); -OPENSSL_STATIC_ASSERT(AES_NOHW_WORD_SIZE == sizeof(aes_word_t), - "AES_NOHW_WORD_SIZE is incorrect"); - - -// Block representations. -// -// This implementation uses three representations for AES blocks. 
First, the -// public API represents blocks as uint8_t[16] in the usual way. Second, most -// AES steps are evaluated in bitsliced form, stored in an |AES_NOHW_BATCH|. -// This stores |AES_NOHW_BATCH_SIZE| blocks in bitsliced order. For 64-bit words -// containing bitsliced blocks a, b, c, d, this would be as follows (vertical -// bars divide logical bytes): -// -// batch.w[0] = a0 b0 c0 d0 | a8 b8 c8 d8 | a16 b16 c16 d16 ... -// batch.w[1] = a1 b1 c1 d1 | a9 b9 c9 d9 | a17 b17 c17 d17 ... -// batch.w[2] = a2 b2 c2 d2 | a10 b10 c10 d10 | a18 b18 c18 d18 ... -// batch.w[3] = a3 b3 c3 d3 | a11 b11 c11 d11 | a19 b19 c19 d19 ... -// ... -// -// Finally, an individual block may be stored as an intermediate form in an -// aes_word_t[AES_NOHW_BLOCK_WORDS]. In this form, we permute the bits in each -// block, so that block[0]'s ith logical byte contains least-significant -// |AES_NOHW_BATCH_SIZE| bits of byte i, block[1] contains the next group of -// |AES_NOHW_BATCH_SIZE| bits, and so on. We refer to this transformation as -// "compacting" the block. Note this is no-op with 128-bit words because then -// |AES_NOHW_BLOCK_WORDS| is one and |AES_NOHW_BATCH_SIZE| is eight. For 64-bit -// words, one block would be stored in two words: -// -// block[0] = a0 a1 a2 a3 | a8 a9 a10 a11 | a16 a17 a18 a19 ... -// block[1] = a4 a5 a6 a7 | a12 a13 a14 a15 | a20 a21 a22 a23 ... -// -// Observe that the distances between corresponding bits in bitsliced and -// compact bit orders match. If we line up corresponding words of each block, -// the bitsliced and compact representations may be converted by tranposing bits -// in corresponding logical bytes. Continuing the 64-bit example: -// -// block_a[0] = a0 a1 a2 a3 | a8 a9 a10 a11 | a16 a17 a18 a19 ... -// block_b[0] = b0 b1 b2 b3 | b8 b9 b10 b11 | b16 b17 b18 b19 ... -// block_c[0] = c0 c1 c2 c3 | c8 c9 c10 c11 | c16 c17 c18 c19 ... -// block_d[0] = d0 d1 d2 d3 | d8 d9 d10 d11 | d16 d17 d18 d19 ... -// -// batch.w[0] = a0 b0 c0 d0 | a8 b8 c8 d8 | a16 b16 c16 d16 ... -// batch.w[1] = a1 b1 c1 d1 | a9 b9 c9 d9 | a17 b17 c17 d17 ... -// batch.w[2] = a2 b2 c2 d2 | a10 b10 c10 d10 | a18 b18 c18 d18 ... -// batch.w[3] = a3 b3 c3 d3 | a11 b11 c11 d11 | a19 b19 c19 d19 ... -// -// Note also that bitwise operations and (logical) byte permutations on an -// |aes_word_t| work equally for the bitsliced and compact words. -// -// We use the compact form in the |AES_KEY| representation to save work -// inflating round keys into |AES_NOHW_BATCH|. The compact form also exists -// temporarily while moving blocks in or out of an |AES_NOHW_BATCH|, immediately -// before or after |aes_nohw_transpose|. - -#define AES_NOHW_BLOCK_WORDS (16 / sizeof(aes_word_t)) - -// An AES_NOHW_BATCH stores |AES_NOHW_BATCH_SIZE| blocks. Unless otherwise -// specified, it is in bitsliced form. -typedef struct { - aes_word_t w[8]; -} AES_NOHW_BATCH; - -// An AES_NOHW_SCHEDULE is an expanded bitsliced AES key schedule. It is -// suitable for encryption or decryption. It is as large as |AES_NOHW_BATCH| -// |AES_KEY|s so it should not be used as a long-term key representation. -typedef struct { - // keys is an array of batches, one for each round key. Each batch stores - // |AES_NOHW_BATCH_SIZE| copies of the round key in bitsliced form. - AES_NOHW_BATCH keys[AES_MAXNR + 1]; -} AES_NOHW_SCHEDULE; - -// aes_nohw_batch_set sets the |i|th block of |batch| to |in|. |batch| is in -// compact form. 
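// [Editor's note] A hypothetical sketch, not part of this patch, of what the
// conversion between compact and bitsliced order amounts to for a single
// AES_NOHW_BATCH_SIZE x AES_NOHW_BATCH_SIZE square of bits (64-bit variant,
// so 4 x 4), written out naively. The real aes_nohw_transpose below performs
// the same bit transposition on every square of every word at once, using
// constant-time swaps.
fn transpose4(square: [u8; 4]) -> [u8; 4] {
    // square[r] holds one 4-bit row; bit (r, c) moves to (c, r).
    let mut out = [0u8; 4];
    for r in 0..4 {
        for c in 0..4 {
            let bit = (square[r] >> c) & 1;
            out[c] |= bit << r;
        }
    }
    out
}

fn main() {
    // A diagonal is its own transpose; a single row becomes a single column.
    assert_eq!(transpose4([0b0001, 0b0010, 0b0100, 0b1000]),
               [0b0001, 0b0010, 0b0100, 0b1000]);
    assert_eq!(transpose4([0b1111, 0, 0, 0]), [1, 1, 1, 1]);
}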
-static inline void aes_nohw_batch_set(AES_NOHW_BATCH *batch, - const aes_word_t in[AES_NOHW_BLOCK_WORDS], - size_t i) { - // Note the words are interleaved. The order comes from |aes_nohw_transpose|. - // If |i| is zero and this is the 64-bit implementation, in[0] contains bits - // 0-3 and in[1] contains bits 4-7. We place in[0] at w[0] and in[1] at - // w[4] so that bits 0 and 4 are in the correct position. (In general, bits - // along diagonals of |AES_NOHW_BATCH_SIZE| by |AES_NOHW_BATCH_SIZE| squares - // will be correctly placed.) - dev_assert_secret(i < AES_NOHW_BATCH_SIZE); -#if defined(OPENSSL_SSE2) - batch->w[i] = in[0]; -#elif defined(OPENSSL_64_BIT) - batch->w[i] = in[0]; - batch->w[i + 4] = in[1]; -#else - batch->w[i] = in[0]; - batch->w[i + 2] = in[1]; - batch->w[i + 4] = in[2]; - batch->w[i + 6] = in[3]; -#endif -} - -// aes_nohw_batch_get writes the |i|th block of |batch| to |out|. |batch| is in -// compact form. -static inline void aes_nohw_batch_get(const AES_NOHW_BATCH *batch, - aes_word_t out[AES_NOHW_BLOCK_WORDS], - size_t i) { - dev_assert_secret(i < AES_NOHW_BATCH_SIZE); -#if defined(OPENSSL_SSE2) - out[0] = batch->w[i]; -#elif defined(OPENSSL_64_BIT) - out[0] = batch->w[i]; - out[1] = batch->w[i + 4]; -#else - out[0] = batch->w[i]; - out[1] = batch->w[i + 2]; - out[2] = batch->w[i + 4]; - out[3] = batch->w[i + 6]; -#endif -} - -#if !defined(OPENSSL_SSE2) -// aes_nohw_delta_swap returns |a| with bits |a & mask| and -// |a & (mask << shift)| swapped. |mask| and |mask << shift| may not overlap. -static inline aes_word_t aes_nohw_delta_swap(aes_word_t a, aes_word_t mask, - aes_word_t shift) { - // See - // https://reflectionsonsecurity.wordpress.com/2014/05/11/efficient-bit-permutation-using-delta-swaps/ - aes_word_t b = (a ^ (a >> shift)) & mask; - return a ^ b ^ (b << shift); -} - -// In the 32-bit and 64-bit implementations, a block spans multiple words. -// |aes_nohw_compact_block| must permute bits across different words. First we -// implement |aes_nohw_compact_word| which performs a smaller version of the -// transformation which stays within a single word. -// -// These transformations are generalizations of the output of -// http://programming.sirrida.de/calcperm.php on smaller inputs. -#if defined(OPENSSL_64_BIT) -static inline uint64_t aes_nohw_compact_word(uint64_t a) { -#if defined(RING_BIG_ENDIAN) - a = CRYPTO_bswap8(a); -#endif - // Numbering the 64/2 = 16 4-bit chunks, least to most significant, we swap - // quartets of those chunks: - // 0 1 2 3 | 4 5 6 7 | 8 9 10 11 | 12 13 14 15 => - // 0 2 1 3 | 4 6 5 7 | 8 10 9 11 | 12 14 13 15 - a = aes_nohw_delta_swap(a, UINT64_C(0x00f000f000f000f0), 4); - // Swap quartets of 8-bit chunks (still numbering by 4-bit chunks): - // 0 2 1 3 | 4 6 5 7 | 8 10 9 11 | 12 14 13 15 => - // 0 2 4 6 | 1 3 5 7 | 8 10 12 14 | 9 11 13 15 - a = aes_nohw_delta_swap(a, UINT64_C(0x0000ff000000ff00), 8); - // Swap quartets of 16-bit chunks (still numbering by 4-bit chunks): - // 0 2 4 6 | 1 3 5 7 | 8 10 12 14 | 9 11 13 15 => - // 0 2 4 6 | 8 10 12 14 | 1 3 5 7 | 9 11 13 15 - a = aes_nohw_delta_swap(a, UINT64_C(0x00000000ffff0000), 16); - return a; -} - -static inline uint64_t aes_nohw_uncompact_word(uint64_t a) { - // Reverse the steps of |aes_nohw_uncompact_word|. 
- a = aes_nohw_delta_swap(a, UINT64_C(0x00000000ffff0000), 16); - a = aes_nohw_delta_swap(a, UINT64_C(0x0000ff000000ff00), 8); - a = aes_nohw_delta_swap(a, UINT64_C(0x00f000f000f000f0), 4); -#if defined(RING_BIG_ENDIAN) - a = CRYPTO_bswap8(a); -#endif - return a; -} -#else // !OPENSSL_64_BIT -static inline uint32_t aes_nohw_compact_word(uint32_t a) { -#if defined(RING_BIG_ENDIAN) - a = CRYPTO_bswap4(a); -#endif - // Numbering the 32/2 = 16 pairs of bits, least to most significant, we swap: - // 0 1 2 3 | 4 5 6 7 | 8 9 10 11 | 12 13 14 15 => - // 0 4 2 6 | 1 5 3 7 | 8 12 10 14 | 9 13 11 15 - // Note: 0x00cc = 0b0000_0000_1100_1100 - // 0x00cc << 6 = 0b0011_0011_0000_0000 - a = aes_nohw_delta_swap(a, 0x00cc00cc, 6); - // Now we swap groups of four bits (still numbering by pairs): - // 0 4 2 6 | 1 5 3 7 | 8 12 10 14 | 9 13 11 15 => - // 0 4 8 12 | 1 5 9 13 | 2 6 10 14 | 3 7 11 15 - // Note: 0x0000_f0f0 << 12 = 0x0f0f_0000 - a = aes_nohw_delta_swap(a, 0x0000f0f0, 12); - return a; -} - -static inline uint32_t aes_nohw_uncompact_word(uint32_t a) { - // Reverse the steps of |aes_nohw_uncompact_word|. - a = aes_nohw_delta_swap(a, 0x0000f0f0, 12); - a = aes_nohw_delta_swap(a, 0x00cc00cc, 6); -#if defined(RING_BIG_ENDIAN) - a = CRYPTO_bswap4(a); -#endif - return a; -} - -static inline uint32_t aes_nohw_word_from_bytes(uint8_t a0, uint8_t a1, - uint8_t a2, uint8_t a3) { - return (uint32_t)a0 | ((uint32_t)a1 << 8) | ((uint32_t)a2 << 16) | - ((uint32_t)a3 << 24); -} - -static inline uint8_t lo(uint32_t a) { - return (uint8_t)a; -} - -#endif // OPENSSL_64_BIT -#endif // !OPENSSL_SSE2 - -static inline void aes_nohw_compact_block(aes_word_t out[AES_NOHW_BLOCK_WORDS], - const uint8_t in[16]) { - OPENSSL_memcpy(out, in, 16); -#if defined(OPENSSL_SSE2) - // No conversions needed. -#elif defined(OPENSSL_64_BIT) - uint64_t a0 = aes_nohw_compact_word(out[0]); - uint64_t a1 = aes_nohw_compact_word(out[1]); - out[0] = (a0 & UINT64_C(0x00000000ffffffff)) | (a1 << 32); - out[1] = (a1 & UINT64_C(0xffffffff00000000)) | (a0 >> 32); -#else - uint32_t a0 = aes_nohw_compact_word(out[0]); - uint32_t a1 = aes_nohw_compact_word(out[1]); - uint32_t a2 = aes_nohw_compact_word(out[2]); - uint32_t a3 = aes_nohw_compact_word(out[3]); - // Note clang, when building for ARM Thumb2, will sometimes miscompile - // expressions such as (a0 & 0x0000ff00) << 8, particularly when building - // without optimizations. This bug was introduced in - // https://reviews.llvm.org/rL340261 and fixed in - // https://reviews.llvm.org/rL351310. The following is written to avoid this. - out[0] = aes_nohw_word_from_bytes(lo(a0), lo(a1), lo(a2), lo(a3)); - out[1] = aes_nohw_word_from_bytes(lo(a0 >> 8), lo(a1 >> 8), lo(a2 >> 8), lo(a3 >> 8)); - out[2] = aes_nohw_word_from_bytes(lo(a0 >> 16), lo(a1 >> 16), lo(a2 >> 16), lo(a3 >> 16)); - out[3] = aes_nohw_word_from_bytes(lo(a0 >> 24), lo(a1 >> 24), lo(a2 >> 24), lo(a3 >> 24)); -#endif -} - -static inline void aes_nohw_uncompact_block( - uint8_t out[16], const aes_word_t in[AES_NOHW_BLOCK_WORDS]) { -#if defined(OPENSSL_SSE2) - OPENSSL_memcpy(out, in, 16); // No conversions needed. 
-#elif defined(OPENSSL_64_BIT) - uint64_t a0 = in[0]; - uint64_t a1 = in[1]; - uint64_t b0 = - aes_nohw_uncompact_word((a0 & UINT64_C(0x00000000ffffffff)) | (a1 << 32)); - uint64_t b1 = - aes_nohw_uncompact_word((a1 & UINT64_C(0xffffffff00000000)) | (a0 >> 32)); - OPENSSL_memcpy(out, &b0, 8); - OPENSSL_memcpy(out + 8, &b1, 8); -#else - uint32_t a0 = in[0]; - uint32_t a1 = in[1]; - uint32_t a2 = in[2]; - uint32_t a3 = in[3]; - // Note clang, when building for ARM Thumb2, will sometimes miscompile - // expressions such as (a0 & 0x0000ff00) << 8, particularly when building - // without optimizations. This bug was introduced in - // https://reviews.llvm.org/rL340261 and fixed in - // https://reviews.llvm.org/rL351310. The following is written to avoid this. - uint32_t b0 = aes_nohw_word_from_bytes(lo(a0), lo(a1), lo(a2), lo(a3)); - uint32_t b1 = aes_nohw_word_from_bytes(lo(a0 >> 8), lo(a1 >> 8), lo(a2 >> 8), lo(a3 >> 8)); - uint32_t b2 = - aes_nohw_word_from_bytes(lo(a0 >> 16), lo(a1 >> 16), lo(a2 >> 16), lo(a3 >> 16)); - uint32_t b3 = - aes_nohw_word_from_bytes(lo(a0 >> 24), lo(a1 >> 24), lo(a2 >> 24), lo(a3 >> 24)); - b0 = aes_nohw_uncompact_word(b0); - b1 = aes_nohw_uncompact_word(b1); - b2 = aes_nohw_uncompact_word(b2); - b3 = aes_nohw_uncompact_word(b3); - OPENSSL_memcpy(out, &b0, 4); - OPENSSL_memcpy(out + 4, &b1, 4); - OPENSSL_memcpy(out + 8, &b2, 4); - OPENSSL_memcpy(out + 12, &b3, 4); -#endif -} - -// aes_nohw_swap_bits is a variation on a delta swap. It swaps the bits in -// |*a & (mask << shift)| with the bits in |*b & mask|. |mask| and -// |mask << shift| must not overlap. |mask| is specified as a |uint32_t|, but it -// is repeated to the full width of |aes_word_t|. -#if defined(OPENSSL_SSE2) -// This must be a macro because |_mm_srli_epi32| and |_mm_slli_epi32| require -// constant shift values. -#define aes_nohw_swap_bits(/*__m128i* */ a, /*__m128i* */ b, \ - /* uint32_t */ mask, /* const */ shift) \ - do { \ - __m128i swap = \ - _mm_and_si128(_mm_xor_si128(_mm_srli_epi32(*(a), (shift)), *(b)), \ - _mm_set_epi32((mask), (mask), (mask), (mask))); \ - *(a) = _mm_xor_si128(*(a), _mm_slli_epi32(swap, (shift))); \ - *(b) = _mm_xor_si128(*(b), swap); \ - \ - } while (0) -#else -static inline void aes_nohw_swap_bits(aes_word_t *a, aes_word_t *b, - uint32_t mask, aes_word_t shift) { -#if defined(OPENSSL_64_BIT) - aes_word_t mask_w = (((uint64_t)mask) << 32) | mask; -#else - aes_word_t mask_w = mask; -#endif - // This is a variation on a delta swap. - aes_word_t swap = ((*a >> shift) ^ *b) & mask_w; - *a ^= swap << shift; - *b ^= swap; -} -#endif // OPENSSL_SSE2 - -// aes_nohw_transpose converts |batch| to and from bitsliced form. It divides -// the 8 × word_size bits into AES_NOHW_BATCH_SIZE × AES_NOHW_BATCH_SIZE squares -// and transposes each square. -static void aes_nohw_transpose(AES_NOHW_BATCH *batch) { - // Swap bits with index 0 and 1 mod 2 (0x55 = 0b01010101). - aes_nohw_swap_bits(&batch->w[0], &batch->w[1], 0x55555555, 1); - aes_nohw_swap_bits(&batch->w[2], &batch->w[3], 0x55555555, 1); - aes_nohw_swap_bits(&batch->w[4], &batch->w[5], 0x55555555, 1); - aes_nohw_swap_bits(&batch->w[6], &batch->w[7], 0x55555555, 1); - -#if AES_NOHW_BATCH_SIZE >= 4 - // Swap bits with index 0-1 and 2-3 mod 4 (0x33 = 0b00110011). 
- aes_nohw_swap_bits(&batch->w[0], &batch->w[2], 0x33333333, 2); - aes_nohw_swap_bits(&batch->w[1], &batch->w[3], 0x33333333, 2); - aes_nohw_swap_bits(&batch->w[4], &batch->w[6], 0x33333333, 2); - aes_nohw_swap_bits(&batch->w[5], &batch->w[7], 0x33333333, 2); -#endif - -#if AES_NOHW_BATCH_SIZE >= 8 - // Swap bits with index 0-3 and 4-7 mod 8 (0x0f = 0b00001111). - aes_nohw_swap_bits(&batch->w[0], &batch->w[4], 0x0f0f0f0f, 4); - aes_nohw_swap_bits(&batch->w[1], &batch->w[5], 0x0f0f0f0f, 4); - aes_nohw_swap_bits(&batch->w[2], &batch->w[6], 0x0f0f0f0f, 4); - aes_nohw_swap_bits(&batch->w[3], &batch->w[7], 0x0f0f0f0f, 4); -#endif -} - -// aes_nohw_to_batch initializes |out| with the |num_blocks| blocks from |in|. -// |num_blocks| must be at most |AES_NOHW_BATCH|. -static void aes_nohw_to_batch(AES_NOHW_BATCH *out, const uint8_t *in, - size_t num_blocks) { - // Don't leave unused blocks uninitialized. - OPENSSL_memset(out, 0, sizeof(AES_NOHW_BATCH)); - debug_assert_nonsecret(num_blocks <= AES_NOHW_BATCH_SIZE); - for (size_t i = 0; i < num_blocks; i++) { - aes_word_t block[AES_NOHW_BLOCK_WORDS]; - aes_nohw_compact_block(block, in + 16 * i); - aes_nohw_batch_set(out, block, i); - } - - aes_nohw_transpose(out); -} - -// aes_nohw_to_batch writes the first |num_blocks| blocks in |batch| to |out|. -// |num_blocks| must be at most |AES_NOHW_BATCH|. -static void aes_nohw_from_batch(uint8_t *out, size_t num_blocks, - const AES_NOHW_BATCH *batch) { - AES_NOHW_BATCH copy = *batch; - aes_nohw_transpose(©); - - debug_assert_nonsecret(num_blocks <= AES_NOHW_BATCH_SIZE); - for (size_t i = 0; i < num_blocks; i++) { - aes_word_t block[AES_NOHW_BLOCK_WORDS]; - aes_nohw_batch_get(©, block, i); - aes_nohw_uncompact_block(out + 16 * i, block); - } -} - - -// AES round steps. - -static void aes_nohw_add_round_key(AES_NOHW_BATCH *batch, - const AES_NOHW_BATCH *key) { - for (size_t i = 0; i < 8; i++) { - batch->w[i] = aes_nohw_xor(batch->w[i], key->w[i]); - } -} - -static void aes_nohw_sub_bytes(AES_NOHW_BATCH *batch) { - // See https://eprint.iacr.org/2009/191.pdf, Appendix C. - aes_word_t x0 = batch->w[7]; - aes_word_t x1 = batch->w[6]; - aes_word_t x2 = batch->w[5]; - aes_word_t x3 = batch->w[4]; - aes_word_t x4 = batch->w[3]; - aes_word_t x5 = batch->w[2]; - aes_word_t x6 = batch->w[1]; - aes_word_t x7 = batch->w[0]; - - // Figure 2, the top linear transformation. - aes_word_t y14 = aes_nohw_xor(x3, x5); - aes_word_t y13 = aes_nohw_xor(x0, x6); - aes_word_t y9 = aes_nohw_xor(x0, x3); - aes_word_t y8 = aes_nohw_xor(x0, x5); - aes_word_t t0 = aes_nohw_xor(x1, x2); - aes_word_t y1 = aes_nohw_xor(t0, x7); - aes_word_t y4 = aes_nohw_xor(y1, x3); - aes_word_t y12 = aes_nohw_xor(y13, y14); - aes_word_t y2 = aes_nohw_xor(y1, x0); - aes_word_t y5 = aes_nohw_xor(y1, x6); - aes_word_t y3 = aes_nohw_xor(y5, y8); - aes_word_t t1 = aes_nohw_xor(x4, y12); - aes_word_t y15 = aes_nohw_xor(t1, x5); - aes_word_t y20 = aes_nohw_xor(t1, x1); - aes_word_t y6 = aes_nohw_xor(y15, x7); - aes_word_t y10 = aes_nohw_xor(y15, t0); - aes_word_t y11 = aes_nohw_xor(y20, y9); - aes_word_t y7 = aes_nohw_xor(x7, y11); - aes_word_t y17 = aes_nohw_xor(y10, y11); - aes_word_t y19 = aes_nohw_xor(y10, y8); - aes_word_t y16 = aes_nohw_xor(t0, y11); - aes_word_t y21 = aes_nohw_xor(y13, y16); - aes_word_t y18 = aes_nohw_xor(x0, y16); - - // Figure 3, the middle non-linear section. 
- aes_word_t t2 = aes_nohw_and(y12, y15); - aes_word_t t3 = aes_nohw_and(y3, y6); - aes_word_t t4 = aes_nohw_xor(t3, t2); - aes_word_t t5 = aes_nohw_and(y4, x7); - aes_word_t t6 = aes_nohw_xor(t5, t2); - aes_word_t t7 = aes_nohw_and(y13, y16); - aes_word_t t8 = aes_nohw_and(y5, y1); - aes_word_t t9 = aes_nohw_xor(t8, t7); - aes_word_t t10 = aes_nohw_and(y2, y7); - aes_word_t t11 = aes_nohw_xor(t10, t7); - aes_word_t t12 = aes_nohw_and(y9, y11); - aes_word_t t13 = aes_nohw_and(y14, y17); - aes_word_t t14 = aes_nohw_xor(t13, t12); - aes_word_t t15 = aes_nohw_and(y8, y10); - aes_word_t t16 = aes_nohw_xor(t15, t12); - aes_word_t t17 = aes_nohw_xor(t4, t14); - aes_word_t t18 = aes_nohw_xor(t6, t16); - aes_word_t t19 = aes_nohw_xor(t9, t14); - aes_word_t t20 = aes_nohw_xor(t11, t16); - aes_word_t t21 = aes_nohw_xor(t17, y20); - aes_word_t t22 = aes_nohw_xor(t18, y19); - aes_word_t t23 = aes_nohw_xor(t19, y21); - aes_word_t t24 = aes_nohw_xor(t20, y18); - aes_word_t t25 = aes_nohw_xor(t21, t22); - aes_word_t t26 = aes_nohw_and(t21, t23); - aes_word_t t27 = aes_nohw_xor(t24, t26); - aes_word_t t28 = aes_nohw_and(t25, t27); - aes_word_t t29 = aes_nohw_xor(t28, t22); - aes_word_t t30 = aes_nohw_xor(t23, t24); - aes_word_t t31 = aes_nohw_xor(t22, t26); - aes_word_t t32 = aes_nohw_and(t31, t30); - aes_word_t t33 = aes_nohw_xor(t32, t24); - aes_word_t t34 = aes_nohw_xor(t23, t33); - aes_word_t t35 = aes_nohw_xor(t27, t33); - aes_word_t t36 = aes_nohw_and(t24, t35); - aes_word_t t37 = aes_nohw_xor(t36, t34); - aes_word_t t38 = aes_nohw_xor(t27, t36); - aes_word_t t39 = aes_nohw_and(t29, t38); - aes_word_t t40 = aes_nohw_xor(t25, t39); - aes_word_t t41 = aes_nohw_xor(t40, t37); - aes_word_t t42 = aes_nohw_xor(t29, t33); - aes_word_t t43 = aes_nohw_xor(t29, t40); - aes_word_t t44 = aes_nohw_xor(t33, t37); - aes_word_t t45 = aes_nohw_xor(t42, t41); - aes_word_t z0 = aes_nohw_and(t44, y15); - aes_word_t z1 = aes_nohw_and(t37, y6); - aes_word_t z2 = aes_nohw_and(t33, x7); - aes_word_t z3 = aes_nohw_and(t43, y16); - aes_word_t z4 = aes_nohw_and(t40, y1); - aes_word_t z5 = aes_nohw_and(t29, y7); - aes_word_t z6 = aes_nohw_and(t42, y11); - aes_word_t z7 = aes_nohw_and(t45, y17); - aes_word_t z8 = aes_nohw_and(t41, y10); - aes_word_t z9 = aes_nohw_and(t44, y12); - aes_word_t z10 = aes_nohw_and(t37, y3); - aes_word_t z11 = aes_nohw_and(t33, y4); - aes_word_t z12 = aes_nohw_and(t43, y13); - aes_word_t z13 = aes_nohw_and(t40, y5); - aes_word_t z14 = aes_nohw_and(t29, y2); - aes_word_t z15 = aes_nohw_and(t42, y9); - aes_word_t z16 = aes_nohw_and(t45, y14); - aes_word_t z17 = aes_nohw_and(t41, y8); - - // Figure 4, bottom linear transformation. 
- aes_word_t t46 = aes_nohw_xor(z15, z16); - aes_word_t t47 = aes_nohw_xor(z10, z11); - aes_word_t t48 = aes_nohw_xor(z5, z13); - aes_word_t t49 = aes_nohw_xor(z9, z10); - aes_word_t t50 = aes_nohw_xor(z2, z12); - aes_word_t t51 = aes_nohw_xor(z2, z5); - aes_word_t t52 = aes_nohw_xor(z7, z8); - aes_word_t t53 = aes_nohw_xor(z0, z3); - aes_word_t t54 = aes_nohw_xor(z6, z7); - aes_word_t t55 = aes_nohw_xor(z16, z17); - aes_word_t t56 = aes_nohw_xor(z12, t48); - aes_word_t t57 = aes_nohw_xor(t50, t53); - aes_word_t t58 = aes_nohw_xor(z4, t46); - aes_word_t t59 = aes_nohw_xor(z3, t54); - aes_word_t t60 = aes_nohw_xor(t46, t57); - aes_word_t t61 = aes_nohw_xor(z14, t57); - aes_word_t t62 = aes_nohw_xor(t52, t58); - aes_word_t t63 = aes_nohw_xor(t49, t58); - aes_word_t t64 = aes_nohw_xor(z4, t59); - aes_word_t t65 = aes_nohw_xor(t61, t62); - aes_word_t t66 = aes_nohw_xor(z1, t63); - aes_word_t s0 = aes_nohw_xor(t59, t63); - aes_word_t s6 = aes_nohw_xor(t56, aes_nohw_not(t62)); - aes_word_t s7 = aes_nohw_xor(t48, aes_nohw_not(t60)); - aes_word_t t67 = aes_nohw_xor(t64, t65); - aes_word_t s3 = aes_nohw_xor(t53, t66); - aes_word_t s4 = aes_nohw_xor(t51, t66); - aes_word_t s5 = aes_nohw_xor(t47, t65); - aes_word_t s1 = aes_nohw_xor(t64, aes_nohw_not(s3)); - aes_word_t s2 = aes_nohw_xor(t55, aes_nohw_not(t67)); - - batch->w[0] = s7; - batch->w[1] = s6; - batch->w[2] = s5; - batch->w[3] = s4; - batch->w[4] = s3; - batch->w[5] = s2; - batch->w[6] = s1; - batch->w[7] = s0; -} - -// aes_nohw_rotate_cols_right returns |v| with the columns in each row rotated -// to the right by |n|. This is a macro because |aes_nohw_shift_*| require -// constant shift counts in the SSE2 implementation. -#define aes_nohw_rotate_cols_right(/* aes_word_t */ v, /* const */ n) \ - (aes_nohw_or(aes_nohw_shift_right((v), (n)*4), \ - aes_nohw_shift_left((v), 16 - (n)*4))) - -static void aes_nohw_shift_rows(AES_NOHW_BATCH *batch) { - for (size_t i = 0; i < 8; i++) { - aes_word_t row0 = aes_nohw_and(batch->w[i], AES_NOHW_ROW0_MASK); - aes_word_t row1 = aes_nohw_and(batch->w[i], AES_NOHW_ROW1_MASK); - aes_word_t row2 = aes_nohw_and(batch->w[i], AES_NOHW_ROW2_MASK); - aes_word_t row3 = aes_nohw_and(batch->w[i], AES_NOHW_ROW3_MASK); - row1 = aes_nohw_rotate_cols_right(row1, 1); - row2 = aes_nohw_rotate_cols_right(row2, 2); - row3 = aes_nohw_rotate_cols_right(row3, 3); - batch->w[i] = aes_nohw_or(aes_nohw_or(row0, row1), aes_nohw_or(row2, row3)); - } -} - -// aes_nohw_rotate_rows_down returns |v| with the rows in each column rotated -// down by one. -static inline aes_word_t aes_nohw_rotate_rows_down(aes_word_t v) { -#if defined(OPENSSL_SSE2) - return _mm_or_si128(_mm_srli_epi32(v, 8), _mm_slli_epi32(v, 24)); -#elif defined(OPENSSL_64_BIT) - return ((v >> 4) & UINT64_C(0x0fff0fff0fff0fff)) | - ((v << 12) & UINT64_C(0xf000f000f000f000)); -#else - return ((v >> 2) & 0x3f3f3f3f) | ((v << 6) & 0xc0c0c0c0); -#endif -} - -// aes_nohw_rotate_rows_twice returns |v| with the rows in each column rotated -// by two. -static inline aes_word_t aes_nohw_rotate_rows_twice(aes_word_t v) { -#if defined(OPENSSL_SSE2) - return _mm_or_si128(_mm_srli_epi32(v, 16), _mm_slli_epi32(v, 16)); -#elif defined(OPENSSL_64_BIT) - return ((v >> 8) & UINT64_C(0x00ff00ff00ff00ff)) | - ((v << 8) & UINT64_C(0xff00ff00ff00ff00)); -#else - return ((v >> 4) & 0x0f0f0f0f) | ((v << 4) & 0xf0f0f0f0); -#endif -} - -static void aes_nohw_mix_columns(AES_NOHW_BATCH *batch) { - // See https://eprint.iacr.org/2009/129.pdf, section 4.4 and appendix A. 
- aes_word_t a0 = batch->w[0]; - aes_word_t a1 = batch->w[1]; - aes_word_t a2 = batch->w[2]; - aes_word_t a3 = batch->w[3]; - aes_word_t a4 = batch->w[4]; - aes_word_t a5 = batch->w[5]; - aes_word_t a6 = batch->w[6]; - aes_word_t a7 = batch->w[7]; - - aes_word_t r0 = aes_nohw_rotate_rows_down(a0); - aes_word_t a0_r0 = aes_nohw_xor(a0, r0); - aes_word_t r1 = aes_nohw_rotate_rows_down(a1); - aes_word_t a1_r1 = aes_nohw_xor(a1, r1); - aes_word_t r2 = aes_nohw_rotate_rows_down(a2); - aes_word_t a2_r2 = aes_nohw_xor(a2, r2); - aes_word_t r3 = aes_nohw_rotate_rows_down(a3); - aes_word_t a3_r3 = aes_nohw_xor(a3, r3); - aes_word_t r4 = aes_nohw_rotate_rows_down(a4); - aes_word_t a4_r4 = aes_nohw_xor(a4, r4); - aes_word_t r5 = aes_nohw_rotate_rows_down(a5); - aes_word_t a5_r5 = aes_nohw_xor(a5, r5); - aes_word_t r6 = aes_nohw_rotate_rows_down(a6); - aes_word_t a6_r6 = aes_nohw_xor(a6, r6); - aes_word_t r7 = aes_nohw_rotate_rows_down(a7); - aes_word_t a7_r7 = aes_nohw_xor(a7, r7); - - batch->w[0] = - aes_nohw_xor(aes_nohw_xor(a7_r7, r0), aes_nohw_rotate_rows_twice(a0_r0)); - batch->w[1] = - aes_nohw_xor(aes_nohw_xor(a0_r0, a7_r7), - aes_nohw_xor(r1, aes_nohw_rotate_rows_twice(a1_r1))); - batch->w[2] = - aes_nohw_xor(aes_nohw_xor(a1_r1, r2), aes_nohw_rotate_rows_twice(a2_r2)); - batch->w[3] = - aes_nohw_xor(aes_nohw_xor(a2_r2, a7_r7), - aes_nohw_xor(r3, aes_nohw_rotate_rows_twice(a3_r3))); - batch->w[4] = - aes_nohw_xor(aes_nohw_xor(a3_r3, a7_r7), - aes_nohw_xor(r4, aes_nohw_rotate_rows_twice(a4_r4))); - batch->w[5] = - aes_nohw_xor(aes_nohw_xor(a4_r4, r5), aes_nohw_rotate_rows_twice(a5_r5)); - batch->w[6] = - aes_nohw_xor(aes_nohw_xor(a5_r5, r6), aes_nohw_rotate_rows_twice(a6_r6)); - batch->w[7] = - aes_nohw_xor(aes_nohw_xor(a6_r6, r7), aes_nohw_rotate_rows_twice(a7_r7)); -} - -static void aes_nohw_encrypt_batch(const AES_NOHW_SCHEDULE *key, - size_t num_rounds, AES_NOHW_BATCH *batch) { - aes_nohw_add_round_key(batch, &key->keys[0]); - for (size_t i = 1; i < num_rounds; i++) { - aes_nohw_sub_bytes(batch); - aes_nohw_shift_rows(batch); - aes_nohw_mix_columns(batch); - aes_nohw_add_round_key(batch, &key->keys[i]); - } - aes_nohw_sub_bytes(batch); - aes_nohw_shift_rows(batch); - aes_nohw_add_round_key(batch, &key->keys[num_rounds]); -} - -// Key schedule. - -static void aes_nohw_expand_round_keys(AES_NOHW_SCHEDULE *out, - const AES_KEY *key) { - for (size_t i = 0; i <= key->rounds; i++) { - // Copy the round key into each block in the batch. - for (size_t j = 0; j < AES_NOHW_BATCH_SIZE; j++) { - aes_word_t tmp[AES_NOHW_BLOCK_WORDS]; - OPENSSL_memcpy(tmp, key->rd_key + 4 * i, 16); - aes_nohw_batch_set(&out->keys[i], tmp, j); - } - aes_nohw_transpose(&out->keys[i]); - } -} - -static const uint8_t aes_nohw_rcon[10] = {0x01, 0x02, 0x04, 0x08, 0x10, - 0x20, 0x40, 0x80, 0x1b, 0x36}; - -// aes_nohw_rcon_slice returns the |i|th group of |AES_NOHW_BATCH_SIZE| bits in -// |rcon|, stored in a |aes_word_t|. 
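// [Editor's note] A hypothetical check, not part of this patch: the
// aes_nohw_rcon table above is the standard AES round-constant sequence,
// produced by repeated doubling in GF(2^8) modulo x^8 + x^4 + x^3 + x + 1.
fn xtime(b: u8) -> u8 {
    let reduce = if b & 0x80 != 0 { 0x1b } else { 0x00 };
    (b << 1) ^ reduce
}

fn main() {
    let mut rcon = [0u8; 10];
    rcon[0] = 0x01;
    for i in 1..10 {
        rcon[i] = xtime(rcon[i - 1]);
    }
    assert_eq!(
        rcon,
        [0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36]
    );
}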
-static inline aes_word_t aes_nohw_rcon_slice(uint8_t rcon, size_t i) { - rcon = (rcon >> (i * AES_NOHW_BATCH_SIZE)) & ((1 << AES_NOHW_BATCH_SIZE) - 1); -#if defined(OPENSSL_SSE2) - return _mm_set_epi32(0, 0, 0, rcon); -#else - return ((aes_word_t)rcon); -#endif -} - -static void aes_nohw_sub_block(aes_word_t out[AES_NOHW_BLOCK_WORDS], - const aes_word_t in[AES_NOHW_BLOCK_WORDS]) { - AES_NOHW_BATCH batch; - OPENSSL_memset(&batch, 0, sizeof(batch)); - aes_nohw_batch_set(&batch, in, 0); - aes_nohw_transpose(&batch); - aes_nohw_sub_bytes(&batch); - aes_nohw_transpose(&batch); - aes_nohw_batch_get(&batch, out, 0); -} - -static void aes_nohw_setup_key_128(AES_KEY *key, const uint8_t in[16]) { - key->rounds = 10; - - aes_word_t block[AES_NOHW_BLOCK_WORDS]; - aes_nohw_compact_block(block, in); - OPENSSL_memcpy(key->rd_key, block, 16); - - for (size_t i = 1; i <= 10; i++) { - aes_word_t sub[AES_NOHW_BLOCK_WORDS]; - aes_nohw_sub_block(sub, block); - uint8_t rcon = aes_nohw_rcon[i - 1]; - for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) { - // Incorporate |rcon| and the transformed word into the first word. - block[j] = aes_nohw_xor(block[j], aes_nohw_rcon_slice(rcon, j)); - block[j] = aes_nohw_xor( - block[j], - aes_nohw_shift_right(aes_nohw_rotate_rows_down(sub[j]), 12)); - // Propagate to the remaining words. Note this is reordered from the usual - // formulation to avoid needing masks. - aes_word_t v = block[j]; - block[j] = aes_nohw_xor(block[j], aes_nohw_shift_left(v, 4)); - block[j] = aes_nohw_xor(block[j], aes_nohw_shift_left(v, 8)); - block[j] = aes_nohw_xor(block[j], aes_nohw_shift_left(v, 12)); - } - OPENSSL_memcpy(key->rd_key + 4 * i, block, 16); - } -} - -static void aes_nohw_setup_key_256(AES_KEY *key, const uint8_t in[32]) { - key->rounds = 14; - - // Each key schedule iteration produces two round keys. - aes_word_t block1[AES_NOHW_BLOCK_WORDS], block2[AES_NOHW_BLOCK_WORDS]; - aes_nohw_compact_block(block1, in); - OPENSSL_memcpy(key->rd_key, block1, 16); - - aes_nohw_compact_block(block2, in + 16); - OPENSSL_memcpy(key->rd_key + 4, block2, 16); - - for (size_t i = 2; i <= 14; i += 2) { - aes_word_t sub[AES_NOHW_BLOCK_WORDS]; - aes_nohw_sub_block(sub, block2); - uint8_t rcon = aes_nohw_rcon[i / 2 - 1]; - for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) { - // Incorporate |rcon| and the transformed word into the first word. - block1[j] = aes_nohw_xor(block1[j], aes_nohw_rcon_slice(rcon, j)); - block1[j] = aes_nohw_xor( - block1[j], - aes_nohw_shift_right(aes_nohw_rotate_rows_down(sub[j]), 12)); - // Propagate to the remaining words. - aes_word_t v = block1[j]; - block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 4)); - block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 8)); - block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 12)); - } - OPENSSL_memcpy(key->rd_key + 4 * i, block1, 16); - - if (i == 14) { - break; - } - - aes_nohw_sub_block(sub, block1); - for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) { - // Incorporate the transformed word into the first word. - block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_right(sub[j], 12)); - // Propagate to the remaining words. - aes_word_t v = block2[j]; - block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 4)); - block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 8)); - block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 12)); - } - OPENSSL_memcpy(key->rd_key + 4 * (i + 1), block2, 16); - } -} - - -// External API. 
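// [Editor's note] A hypothetical check, not part of this patch: the key-setup
// routines above emit the classic round-key layout of 4 32-bit words per round
// key and rounds + 1 round keys, which is also where the original
// rd_key[4 * (AES_MAXNR + 1)] sizing in include/ring-core/aes.h comes from.
fn rd_key_words(rounds: usize) -> usize {
    4 * (rounds + 1)
}

fn main() {
    assert_eq!(rd_key_words(10), 44); // filled in by aes_nohw_setup_key_128
    assert_eq!(rd_key_words(14), 60); // filled in by aes_nohw_setup_key_256
}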
- -int aes_nohw_set_encrypt_key(const uint8_t *key, unsigned bits, - AES_KEY *aeskey) { - switch (bits) { - case 128: - aes_nohw_setup_key_128(aeskey, key); - return 0; - case 256: - aes_nohw_setup_key_256(aeskey, key); - return 0; - } - return 1; -} - -void aes_nohw_encrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key) { - AES_NOHW_SCHEDULE sched; - aes_nohw_expand_round_keys(&sched, key); - AES_NOHW_BATCH batch; - aes_nohw_to_batch(&batch, in, /*num_blocks=*/1); - aes_nohw_encrypt_batch(&sched, key->rounds, &batch); - aes_nohw_from_batch(out, /*num_blocks=*/1, &batch); -} - -static inline void aes_nohw_xor_block(uint8_t out[16], const uint8_t a[16], - const uint8_t b[16]) { - for (size_t i = 0; i < 16; i += sizeof(aes_word_t)) { - aes_word_t x, y; - OPENSSL_memcpy(&x, a + i, sizeof(aes_word_t)); - OPENSSL_memcpy(&y, b + i, sizeof(aes_word_t)); - x = aes_nohw_xor(x, y); - OPENSSL_memcpy(out + i, &x, sizeof(aes_word_t)); - } -} - -void aes_nohw_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, - size_t blocks, const AES_KEY *key, - const uint8_t ivec[16]) { - if (blocks == 0) { - return; - } - - AES_NOHW_SCHEDULE sched; - aes_nohw_expand_round_keys(&sched, key); - - // Make |AES_NOHW_BATCH_SIZE| copies of |ivec|. - alignas(AES_NOHW_WORD_SIZE) uint8_t ivs[AES_NOHW_BATCH_SIZE * 16]; - alignas(AES_NOHW_WORD_SIZE) uint8_t enc_ivs[AES_NOHW_BATCH_SIZE * 16]; - for (size_t i = 0; i < AES_NOHW_BATCH_SIZE; i++) { - OPENSSL_memcpy(ivs + 16 * i, ivec, 16); - } - - uint32_t ctr = CRYPTO_load_u32_be(ivs + 12); - for (;;) { - // Update counters. - for (size_t i = 0; i < AES_NOHW_BATCH_SIZE; i++) { - CRYPTO_store_u32_be(ivs + 16 * i + 12, ctr + (uint32_t)i); - } - - size_t todo = blocks >= AES_NOHW_BATCH_SIZE ? AES_NOHW_BATCH_SIZE : blocks; - AES_NOHW_BATCH batch; - aes_nohw_to_batch(&batch, ivs, todo); - aes_nohw_encrypt_batch(&sched, key->rounds, &batch); - aes_nohw_from_batch(enc_ivs, todo, &batch); - - for (size_t i = 0; i < todo; i++) { - aes_nohw_xor_block(out + 16 * i, in + 16 * i, enc_ivs + 16 * i); - } - - blocks -= todo; - if (blocks == 0) { - break; - } - - in += 16 * AES_NOHW_BATCH_SIZE; - out += 16 * AES_NOHW_BATCH_SIZE; - ctr += AES_NOHW_BATCH_SIZE; - } -} diff --git a/include/ring-core/aes.h b/include/ring-core/aes.h index 5b5130dad7..94827f6500 100644 --- a/include/ring-core/aes.h +++ b/include/ring-core/aes.h @@ -60,7 +60,7 @@ // aes_key_st should be an opaque type, but EVP requires that the size be // known. 
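// [Editor's note] A hypothetical sketch, not part of this patch, of the
// counter-mode shape that the removed aes_nohw_ctr32_encrypt_blocks
// implemented: encrypt successive counter blocks (32-bit big-endian counter in
// the last four bytes of the IV) and XOR the keystream into the input.
// `encrypt_block` stands in for any single-block AES encryption.
fn ctr32_xor(
    encrypt_block: impl Fn(&[u8; 16]) -> [u8; 16],
    ivec: &[u8; 16],
    input: &[u8],
    out: &mut [u8],
) {
    assert_eq!(input.len() % 16, 0);
    assert_eq!(input.len(), out.len());
    let mut iv = *ivec;
    let mut ctr = u32::from_be_bytes([iv[12], iv[13], iv[14], iv[15]]);
    for (block_in, block_out) in input.chunks_exact(16).zip(out.chunks_exact_mut(16)) {
        iv[12..].copy_from_slice(&ctr.to_be_bytes());
        let keystream = encrypt_block(&iv);
        for (o, (i, k)) in block_out.iter_mut().zip(block_in.iter().zip(keystream.iter())) {
            *o = i ^ k;
        }
        ctr = ctr.wrapping_add(1);
    }
}

fn main() {
    // With an identity "cipher" the keystream is just the counter block itself.
    let mut out = [0u8; 32];
    ctr32_xor(|b| *b, &[0u8; 16], &[0u8; 32], &mut out);
    assert_eq!(&out[28..32], &[0, 0, 0, 1]);
}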
struct aes_key_st { - uint32_t rd_key[4 * (AES_MAXNR + 1)]; + uint32_t rd_key[240]; unsigned rounds; }; typedef struct aes_key_st AES_KEY; diff --git a/src/aead/aes.rs b/src/aead/aes.rs index 4076df1b25..9834b71577 100644 --- a/src/aead/aes.rs +++ b/src/aead/aes.rs @@ -19,6 +19,7 @@ use crate::{ endian::BigEndian, error, polyfill::{self, ArrayFlatten as _, ArraySplitMap as _}, + rust_crypto::aes, }; use core::ops::RangeFrom; @@ -120,6 +121,25 @@ fn ctr32_encrypt_blocks_( ctr.increment_by_less_safe(blocks_u32); } +fn u32_array_from_u64_array(from: &[u64], to: &mut [u32]) { + for (i, byte) in from.iter().enumerate() { + let idx = 2 * i; + if idx + 1 >= to.len() { + break; + } + let lhsu32 = (byte >> 32) as u32; + let rhsu32 = (byte & 0xffffffff) as u32; + to[idx] = lhsu32; + to[idx + 1] = rhsu32; + } +} +fn u64_array_from_u32_array(from: &[u32], to: &mut [u64]) { + for (i, _) in from.iter().enumerate().step_by(2) { + if i / 2 >= to.len() { break; } + to[i / 2] = (from[i] as u64) << 32 | from[i + 1] as u64; + } +} + impl Key { #[inline] pub fn new( @@ -136,7 +156,7 @@ impl Key { } let mut key = AES_KEY { - rd_key: [0u32; 4 * (MAX_ROUNDS + 1)], + rd_key: [0u32; 240], rounds: 0, }; @@ -162,7 +182,19 @@ impl Key { } Implementation::NOHW => { - set_encrypt_key!(aes_nohw_set_encrypt_key, bytes, key_bits, &mut key)? + match key_bits.as_bits() { + 128 => { + let sched = aes::fixslice::aes128_key_schedule(bytes.try_into()?); + u32_array_from_u64_array(&sched, &mut key.rd_key); + key.rounds = 10; + } + 256 => { + let sched = aes::fixslice::aes256_key_schedule(bytes.try_into()?); + u32_array_from_u64_array(&sched, &mut key.rd_key); + key.rounds = 14; + } + _ => unreachable!(), + }; } }; @@ -188,7 +220,21 @@ impl Key { ))] Implementation::VPAES_BSAES => encrypt_block!(vpaes_encrypt, a, self), - Implementation::NOHW => encrypt_block!(aes_nohw_encrypt, a, self), + Implementation::NOHW => match self.inner.rounds { + 10 => { + let mut enc_key: [u64; 88] = [0; 88]; + u64_array_from_u32_array(&self.inner.rd_key, &mut enc_key); + let blocks: [Block; 4] = [a, [0; 16], [0; 16], [0; 16]]; + aes::fixslice::aes128_encrypt(&enc_key, &blocks)[0] + } + 14 => { + let mut enc_key: [u64; 120] = [0; 120]; + u64_array_from_u32_array(&self.inner.rd_key, &mut enc_key); + let blocks: [Block; 4] = [a, [0; 16], [0; 16], [0; 16]]; + aes::fixslice::aes256_encrypt(&enc_key, &blocks)[0] + } + _ => unimplemented!() + }, } } @@ -267,7 +313,76 @@ impl Key { } Implementation::NOHW => { - ctr32_encrypt_blocks!(aes_nohw_ctr32_encrypt_blocks, in_out, src, &self.inner, ctr) + let in_out_len = in_out[src.clone()].len(); + assert_eq!(in_out_len % BLOCK_LEN, 0); + + let blocks = in_out_len / BLOCK_LEN; + #[allow(clippy::cast_possible_truncation)] + let blocks_u32 = blocks as u32; + assert_eq!(blocks, polyfill::usize_from_u32(blocks_u32)); + const MAX_BATCH_SIZE: usize = 4; + fn encrypt( + encryptor_fn: F, + mut blocks: usize, + key: &Key, + ctr: &mut Counter, + in_out: &mut [u8], + src: RangeFrom, + ) where + F: Fn(&[u64; N], &[Block; MAX_BATCH_SIZE]) -> [[u8; BLOCK_LEN]; MAX_BATCH_SIZE], + { + let mut offset = src.clone(); + let mut out_range = RangeFrom { start: 0 }; + loop { + let todo = if blocks > MAX_BATCH_SIZE { + MAX_BATCH_SIZE + } else { + blocks + }; + let mut enc_key: [u64; N] = [0; N]; + u64_array_from_u32_array(&key.inner.rd_key, &mut enc_key); + let mut b: [Block; MAX_BATCH_SIZE] = [[0; BLOCK_LEN]; MAX_BATCH_SIZE]; + for i in 0..todo { + b[i] = ctr.increment().into_block_less_safe() + } + let enc_ivs = encryptor_fn(&enc_key, 
&b); + for i in 0..todo { + for j in 0..BLOCK_LEN { + in_out[out_range.clone()][j] = + in_out[offset.clone()][j] ^ enc_ivs[i][j]; + } + offset.start = offset.start.checked_add(BLOCK_LEN).unwrap(); + out_range.start = out_range.start.checked_add(BLOCK_LEN).unwrap(); + } + blocks -= todo; + if blocks <= 0 { + break; + } + } + } + match self.inner.rounds { + 10 => { + encrypt( + aes::fixslice::aes128_encrypt, + blocks, + self, + ctr, + in_out, + src, + ); + } + 14 => { + encrypt( + aes::fixslice::aes256_encrypt, + blocks, + self, + ctr, + in_out, + src, + ); + } + _ => unreachable!(), + }; } } } @@ -294,7 +409,7 @@ impl Key { #[repr(C)] #[derive(Clone)] pub(super) struct AES_KEY { - pub rd_key: [u32; 4 * (MAX_ROUNDS + 1)], + pub rd_key: [u32; 240], pub rounds: c::uint, } diff --git a/src/lib.rs b/src/lib.rs index 64a68e4e67..f5bcd4a772 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -122,6 +122,8 @@ pub mod rand; #[cfg(feature = "alloc")] pub mod rsa; +pub mod rust_crypto; + pub mod signature; #[cfg(test)] diff --git a/src/rust_crypto.rs b/src/rust_crypto.rs new file mode 100644 index 0000000000..5657404aa8 --- /dev/null +++ b/src/rust_crypto.rs @@ -0,0 +1 @@ +pub mod aes; diff --git a/src/rust_crypto/aes/mod.rs b/src/rust_crypto/aes/mod.rs new file mode 100644 index 0000000000..db1bce79be --- /dev/null +++ b/src/rust_crypto/aes/mod.rs @@ -0,0 +1,3 @@ +#[cfg_attr(not(target_pointer_width = "64"), path = "soft/fixslice32.rs")] +#[cfg_attr(target_pointer_width = "64", path = "soft/fixslice64.rs")] +pub(crate) mod fixslice; diff --git a/src/rust_crypto/aes/soft/block.rs b/src/rust_crypto/aes/soft/block.rs new file mode 100644 index 0000000000..7e088d9a89 --- /dev/null +++ b/src/rust_crypto/aes/soft/block.rs @@ -0,0 +1 @@ +pub type Block = [u8; 16]; diff --git a/src/rust_crypto/aes/soft/fixslice32.rs b/src/rust_crypto/aes/soft/fixslice32.rs new file mode 100644 index 0000000000..80453a7c25 --- /dev/null +++ b/src/rust_crypto/aes/soft/fixslice32.rs @@ -0,0 +1,1371 @@ +//! Fixsliced implementations of AES-128, AES-192 and AES-256 (32-bit) +//! adapted from the C implementation +//! +//! All implementations are fully bitsliced and do not rely on any +//! Look-Up Table (LUT). +//! +//! See the paper at for more details. +//! +//! # Author (original C code) +//! +//! Alexandre Adomnicai, Nanyang Technological University, Singapore +//! +//! +//! Originally licensed MIT. Relicensed as Apache 2.0+MIT with permission. + +#![allow(clippy::unreadable_literal)] + +use crate::Block; +use cipher::{array::Array, consts::U2}; + +/// AES block batch size for this implementation +pub(crate) type FixsliceBlocks = U2; + +pub(crate) type BatchBlocks = Array; + +/// AES-128 round keys +pub(crate) type FixsliceKeys128 = [u32; 88]; + +/// AES-192 round keys +pub(crate) type FixsliceKeys192 = [u32; 104]; + +/// AES-256 round keys +pub(crate) type FixsliceKeys256 = [u32; 120]; + +/// 256-bit internal state +pub(crate) type State = [u32; 8]; + +/// Fully bitsliced AES-128 key schedule to match the fully-fixsliced representation. 
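// [Editor's note] A hypothetical check, not part of this patch: the sizes of
// the FixsliceKeys* aliases above follow from each round key being bitsliced
// across 8 words, with rounds + 1 round keys per schedule.
fn fixslice_key_words(rounds: usize) -> usize {
    8 * (rounds + 1)
}

fn main() {
    assert_eq!(fixslice_key_words(10), 88); // FixsliceKeys128
    assert_eq!(fixslice_key_words(12), 104); // FixsliceKeys192
    assert_eq!(fixslice_key_words(14), 120); // FixsliceKeys256
}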
+pub(crate) fn aes128_key_schedule(key: &[u8; 16]) -> FixsliceKeys128 { + let mut rkeys = [0u32; 88]; + + bitslice(&mut rkeys[..8], key, key); + + let mut rk_off = 0; + for rcon in 0..10 { + memshift32(&mut rkeys, rk_off); + rk_off += 8; + + sub_bytes(&mut rkeys[rk_off..(rk_off + 8)]); + sub_bytes_nots(&mut rkeys[rk_off..(rk_off + 8)]); + + if rcon < 8 { + add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon); + } else { + add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon - 8); + add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon - 7); + add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon - 5); + add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon - 4); + } + + xor_columns(&mut rkeys, rk_off, 8, ror_distance(1, 3)); + } + + // Adjust to match fixslicing format + #[cfg(aes_compact)] + { + for i in (8..88).step_by(16) { + inv_shift_rows_1(&mut rkeys[i..(i + 8)]); + } + } + #[cfg(not(aes_compact))] + { + for i in (8..72).step_by(32) { + inv_shift_rows_1(&mut rkeys[i..(i + 8)]); + inv_shift_rows_2(&mut rkeys[(i + 8)..(i + 16)]); + inv_shift_rows_3(&mut rkeys[(i + 16)..(i + 24)]); + } + inv_shift_rows_1(&mut rkeys[72..80]); + } + + // Account for NOTs removed from sub_bytes + for i in 1..11 { + sub_bytes_nots(&mut rkeys[(i * 8)..(i * 8 + 8)]); + } + + rkeys +} + +/// Fully bitsliced AES-192 key schedule to match the fully-fixsliced representation. +#[allow(dead_code)] +pub(crate) fn aes192_key_schedule(key: &[u8; 24]) -> FixsliceKeys192 { + let mut rkeys = [0u32; 104]; + let mut tmp = [0u32; 8]; + + bitslice(&mut rkeys[..8], &key[..16], &key[..16]); + bitslice(&mut tmp, &key[8..], &key[8..]); + + let mut rcon = 0; + let mut rk_off = 8; + + loop { + for i in 0..8 { + rkeys[rk_off + i] = + (0x0f0f0f0f & (tmp[i] >> 4)) | (0xf0f0f0f0 & (rkeys[(rk_off - 8) + i] << 4)); + } + + sub_bytes(&mut tmp); + sub_bytes_nots(&mut tmp); + + add_round_constant_bit(&mut tmp, rcon); + rcon += 1; + + for i in 0..8 { + let mut ti = rkeys[rk_off + i]; + ti ^= 0x30303030 & ror(tmp[i], ror_distance(1, 1)); + ti ^= 0xc0c0c0c0 & (ti << 2); + tmp[i] = ti; + } + rkeys[rk_off..(rk_off + 8)].copy_from_slice(&tmp); + rk_off += 8; + + for i in 0..8 { + let ui = tmp[i]; + let mut ti = (0x0f0f0f0f & (rkeys[(rk_off - 16) + i] >> 4)) | (0xf0f0f0f0 & (ui << 4)); + ti ^= 0x03030303 & (ui >> 6); + tmp[i] = + ti ^ (0xfcfcfcfc & (ti << 2)) ^ (0xf0f0f0f0 & (ti << 4)) ^ (0xc0c0c0c0 & (ti << 6)); + } + rkeys[rk_off..(rk_off + 8)].copy_from_slice(&tmp); + rk_off += 8; + + sub_bytes(&mut tmp); + sub_bytes_nots(&mut tmp); + + add_round_constant_bit(&mut tmp, rcon); + rcon += 1; + + for i in 0..8 { + let mut ti = (0x0f0f0f0f & (rkeys[(rk_off - 16) + i] >> 4)) + | (0xf0f0f0f0 & (rkeys[(rk_off - 8) + i] << 4)); + ti ^= 0x03030303 & ror(tmp[i], ror_distance(1, 3)); + rkeys[rk_off + i] = + ti ^ (0xfcfcfcfc & (ti << 2)) ^ (0xf0f0f0f0 & (ti << 4)) ^ (0xc0c0c0c0 & (ti << 6)); + } + rk_off += 8; + + if rcon >= 8 { + break; + } + + for i in 0..8 { + let ui = rkeys[(rk_off - 8) + i]; + let mut ti = rkeys[(rk_off - 16) + i]; + ti ^= 0x30303030 & (ui >> 2); + ti ^= 0xc0c0c0c0 & (ti << 2); + tmp[i] = ti; + } + } + + // Adjust to match fixslicing format + #[cfg(aes_compact)] + { + for i in (8..104).step_by(16) { + inv_shift_rows_1(&mut rkeys[i..(i + 8)]); + } + } + #[cfg(not(aes_compact))] + { + for i in (0..96).step_by(32) { + inv_shift_rows_1(&mut rkeys[(i + 8)..(i + 16)]); + inv_shift_rows_2(&mut rkeys[(i + 16)..(i + 24)]); + inv_shift_rows_3(&mut rkeys[(i + 24)..(i + 32)]); + } + } + + // Account for 
NOTs removed from sub_bytes + for i in 1..13 { + sub_bytes_nots(&mut rkeys[(i * 8)..(i * 8 + 8)]); + } + + rkeys +} + +/// Fully bitsliced AES-256 key schedule to match the fully-fixsliced representation. +pub(crate) fn aes256_key_schedule(key: &[u8; 32]) -> FixsliceKeys256 { + let mut rkeys = [0u32; 120]; + + bitslice(&mut rkeys[..8], &key[..16], &key[..16]); + bitslice(&mut rkeys[8..16], &key[16..], &key[16..]); + + let mut rk_off = 8; + + let mut rcon = 0; + loop { + memshift32(&mut rkeys, rk_off); + rk_off += 8; + + sub_bytes(&mut rkeys[rk_off..(rk_off + 8)]); + sub_bytes_nots(&mut rkeys[rk_off..(rk_off + 8)]); + + add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon); + xor_columns(&mut rkeys, rk_off, 16, ror_distance(1, 3)); + rcon += 1; + + if rcon == 7 { + break; + } + + memshift32(&mut rkeys, rk_off); + rk_off += 8; + + sub_bytes(&mut rkeys[rk_off..(rk_off + 8)]); + sub_bytes_nots(&mut rkeys[rk_off..(rk_off + 8)]); + + xor_columns(&mut rkeys, rk_off, 16, ror_distance(0, 3)); + } + + // Adjust to match fixslicing format + #[cfg(aes_compact)] + { + for i in (8..120).step_by(16) { + inv_shift_rows_1(&mut rkeys[i..(i + 8)]); + } + } + #[cfg(not(aes_compact))] + { + for i in (8..104).step_by(32) { + inv_shift_rows_1(&mut rkeys[i..(i + 8)]); + inv_shift_rows_2(&mut rkeys[(i + 8)..(i + 16)]); + inv_shift_rows_3(&mut rkeys[(i + 16)..(i + 24)]); + } + inv_shift_rows_1(&mut rkeys[104..112]); + } + + // Account for NOTs removed from sub_bytes + for i in 1..15 { + sub_bytes_nots(&mut rkeys[(i * 8)..(i * 8 + 8)]); + } + + rkeys +} + +/// Fully-fixsliced AES-128 decryption (the InvShiftRows is completely omitted). +/// +/// Decrypts four blocks in-place and in parallel. +#[allow(dead_code)] +pub(crate) fn aes128_decrypt(rkeys: &FixsliceKeys128, blocks: &BatchBlocks) -> BatchBlocks { + let mut state = State::default(); + + bitslice(&mut state, &blocks[0], &blocks[1]); + + add_round_key(&mut state, &rkeys[80..]); + inv_sub_bytes(&mut state); + + #[cfg(not(aes_compact))] + { + inv_shift_rows_2(&mut state); + } + + let mut rk_off = 72; + loop { + #[cfg(aes_compact)] + { + inv_shift_rows_2(&mut state); + } + + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_1(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + + if rk_off == 0 { + break; + } + + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_0(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + + #[cfg(not(aes_compact))] + { + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_3(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_2(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + } + } + + add_round_key(&mut state, &rkeys[..8]); + + inv_bitslice(&state) +} + +/// Fully-fixsliced AES-128 encryption (the ShiftRows is completely omitted). +/// +/// Encrypts four blocks in-place and in parallel. 
+pub(crate) fn aes128_encrypt(rkeys: &FixsliceKeys128, blocks: &BatchBlocks) -> BatchBlocks { + let mut state = State::default(); + + bitslice(&mut state, &blocks[0], &blocks[1]); + + add_round_key(&mut state, &rkeys[..8]); + + let mut rk_off = 8; + loop { + sub_bytes(&mut state); + mix_columns_1(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + + #[cfg(aes_compact)] + { + shift_rows_2(&mut state); + } + + if rk_off == 80 { + break; + } + + #[cfg(not(aes_compact))] + { + sub_bytes(&mut state); + mix_columns_2(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + + sub_bytes(&mut state); + mix_columns_3(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + } + + sub_bytes(&mut state); + mix_columns_0(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + } + + #[cfg(not(aes_compact))] + { + shift_rows_2(&mut state); + } + + sub_bytes(&mut state); + add_round_key(&mut state, &rkeys[80..]); + + inv_bitslice(&state) +} + +/// Fully-fixsliced AES-192 decryption (the InvShiftRows is completely omitted). +/// +/// Decrypts four blocks in-place and in parallel. +#[allow(dead_code)] +pub(crate) fn aes192_decrypt(rkeys: &FixsliceKeys192, blocks: &BatchBlocks) -> BatchBlocks { + let mut state = State::default(); + + bitslice(&mut state, &blocks[0], &blocks[1]); + + add_round_key(&mut state, &rkeys[96..]); + inv_sub_bytes(&mut state); + + let mut rk_off = 88; + loop { + #[cfg(aes_compact)] + { + inv_shift_rows_2(&mut state); + } + #[cfg(not(aes_compact))] + { + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_3(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_2(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + } + + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_1(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + + if rk_off == 0 { + break; + } + + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_0(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + } + + add_round_key(&mut state, &rkeys[..8]); + + inv_bitslice(&state) +} + +/// Fully-fixsliced AES-192 encryption (the ShiftRows is completely omitted). +/// +/// Encrypts four blocks in-place and in parallel. +#[allow(dead_code)] +pub(crate) fn aes192_encrypt(rkeys: &FixsliceKeys192, blocks: &BatchBlocks) -> BatchBlocks { + let mut state = State::default(); + + bitslice(&mut state, &blocks[0], &blocks[1]); + + add_round_key(&mut state, &rkeys[..8]); + + let mut rk_off = 8; + loop { + sub_bytes(&mut state); + mix_columns_1(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + + #[cfg(aes_compact)] + { + shift_rows_2(&mut state); + } + #[cfg(not(aes_compact))] + { + sub_bytes(&mut state); + mix_columns_2(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + + sub_bytes(&mut state); + mix_columns_3(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + } + + if rk_off == 96 { + break; + } + + sub_bytes(&mut state); + mix_columns_0(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + } + + sub_bytes(&mut state); + add_round_key(&mut state, &rkeys[96..]); + + inv_bitslice(&state) +} + +/// Fully-fixsliced AES-256 decryption (the InvShiftRows is completely omitted). 
+/// +/// Decrypts four blocks in-place and in parallel. +pub(crate) fn aes256_decrypt(rkeys: &FixsliceKeys256, blocks: &BatchBlocks) -> BatchBlocks { + let mut state = State::default(); + + bitslice(&mut state, &blocks[0], &blocks[1]); + + add_round_key(&mut state, &rkeys[112..]); + inv_sub_bytes(&mut state); + + #[cfg(not(aes_compact))] + { + inv_shift_rows_2(&mut state); + } + + let mut rk_off = 104; + loop { + #[cfg(aes_compact)] + { + inv_shift_rows_2(&mut state); + } + + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_1(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + + if rk_off == 0 { + break; + } + + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_0(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + + #[cfg(not(aes_compact))] + { + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_3(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_2(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + } + } + + add_round_key(&mut state, &rkeys[..8]); + + inv_bitslice(&state) +} + +/// Fully-fixsliced AES-256 encryption (the ShiftRows is completely omitted). +/// +/// Encrypts four blocks in-place and in parallel. +pub(crate) fn aes256_encrypt(rkeys: &FixsliceKeys256, blocks: &BatchBlocks) -> BatchBlocks { + let mut state = State::default(); + + bitslice(&mut state, &blocks[0], &blocks[1]); + + add_round_key(&mut state, &rkeys[..8]); + + let mut rk_off = 8; + loop { + sub_bytes(&mut state); + mix_columns_1(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + + #[cfg(aes_compact)] + { + shift_rows_2(&mut state); + } + + if rk_off == 112 { + break; + } + + #[cfg(not(aes_compact))] + { + sub_bytes(&mut state); + mix_columns_2(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + + sub_bytes(&mut state); + mix_columns_3(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + } + + sub_bytes(&mut state); + mix_columns_0(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + } + + #[cfg(not(aes_compact))] + { + shift_rows_2(&mut state); + } + + sub_bytes(&mut state); + add_round_key(&mut state, &rkeys[112..]); + + inv_bitslice(&state) +} + +/// Note that the 4 bitwise NOT (^= 0xffffffff) are accounted for here so that it is a true +/// inverse of 'sub_bytes'. 
+fn inv_sub_bytes(state: &mut [u32]) { + debug_assert_eq!(state.len(), 8); + + // Scheduled using https://github.com/Ko-/aes-armcortexm/tree/public/scheduler + // Inline "stack" comments reflect suggested stores and loads (ARM Cortex-M3 and M4) + + let u7 = state[0]; + let u6 = state[1]; + let u5 = state[2]; + let u4 = state[3]; + let u3 = state[4]; + let u2 = state[5]; + let u1 = state[6]; + let u0 = state[7]; + + let t23 = u0 ^ u3; + let t8 = u1 ^ t23; + let m2 = t23 & t8; + let t4 = u4 ^ t8; + let t22 = u1 ^ u3; + let t2 = u0 ^ u1; + let t1 = u3 ^ u4; + // t23 -> stack + let t9 = u7 ^ t1; + // t8 -> stack + let m7 = t22 & t9; + // t9 -> stack + let t24 = u4 ^ u7; + // m7 -> stack + let t10 = t2 ^ t24; + // u4 -> stack + let m14 = t2 & t10; + let r5 = u6 ^ u7; + // m2 -> stack + let t3 = t1 ^ r5; + // t2 -> stack + let t13 = t2 ^ r5; + let t19 = t22 ^ r5; + // t3 -> stack + let t17 = u2 ^ t19; + // t4 -> stack + let t25 = u2 ^ t1; + let r13 = u1 ^ u6; + // t25 -> stack + let t20 = t24 ^ r13; + // t17 -> stack + let m9 = t20 & t17; + // t20 -> stack + let r17 = u2 ^ u5; + // t22 -> stack + let t6 = t22 ^ r17; + // t13 -> stack + let m1 = t13 & t6; + let y5 = u0 ^ r17; + let m4 = t19 & y5; + let m5 = m4 ^ m1; + let m17 = m5 ^ t24; + let r18 = u5 ^ u6; + let t27 = t1 ^ r18; + let t15 = t10 ^ t27; + // t6 -> stack + let m11 = t1 & t15; + let m15 = m14 ^ m11; + let m21 = m17 ^ m15; + // t1 -> stack + // t4 <- stack + let m12 = t4 & t27; + let m13 = m12 ^ m11; + let t14 = t10 ^ r18; + let m3 = t14 ^ m1; + // m2 <- stack + let m16 = m3 ^ m2; + let m20 = m16 ^ m13; + // u4 <- stack + let r19 = u2 ^ u4; + let t16 = r13 ^ r19; + // t3 <- stack + let t26 = t3 ^ t16; + let m6 = t3 & t16; + let m8 = t26 ^ m6; + // t10 -> stack + // m7 <- stack + let m18 = m8 ^ m7; + let m22 = m18 ^ m13; + let m25 = m22 & m20; + let m26 = m21 ^ m25; + let m10 = m9 ^ m6; + let m19 = m10 ^ m15; + // t25 <- stack + let m23 = m19 ^ t25; + let m28 = m23 ^ m25; + let m24 = m22 ^ m23; + let m30 = m26 & m24; + let m39 = m23 ^ m30; + let m48 = m39 & y5; + let m57 = m39 & t19; + // m48 -> stack + let m36 = m24 ^ m25; + let m31 = m20 & m23; + let m27 = m20 ^ m21; + let m32 = m27 & m31; + let m29 = m28 & m27; + let m37 = m21 ^ m29; + // m39 -> stack + let m42 = m37 ^ m39; + let m52 = m42 & t15; + // t27 -> stack + // t1 <- stack + let m61 = m42 & t1; + let p0 = m52 ^ m61; + let p16 = m57 ^ m61; + // m57 -> stack + // t20 <- stack + let m60 = m37 & t20; + // p16 -> stack + // t17 <- stack + let m51 = m37 & t17; + let m33 = m27 ^ m25; + let m38 = m32 ^ m33; + let m43 = m37 ^ m38; + let m49 = m43 & t16; + let p6 = m49 ^ m60; + let p13 = m49 ^ m51; + let m58 = m43 & t3; + // t9 <- stack + let m50 = m38 & t9; + // t22 <- stack + let m59 = m38 & t22; + // p6 -> stack + let p1 = m58 ^ m59; + let p7 = p0 ^ p1; + let m34 = m21 & m22; + let m35 = m24 & m34; + let m40 = m35 ^ m36; + let m41 = m38 ^ m40; + let m45 = m42 ^ m41; + // t27 <- stack + let m53 = m45 & t27; + let p8 = m50 ^ m53; + let p23 = p7 ^ p8; + // t4 <- stack + let m62 = m45 & t4; + let p14 = m49 ^ m62; + let s6 = p14 ^ p23; + // t10 <- stack + let m54 = m41 & t10; + let p2 = m54 ^ m62; + let p22 = p2 ^ p7; + let s0 = p13 ^ p22; + let p17 = m58 ^ p2; + let p15 = m54 ^ m59; + // t2 <- stack + let m63 = m41 & t2; + // m39 <- stack + let m44 = m39 ^ m40; + // p17 -> stack + // t6 <- stack + let m46 = m44 & t6; + let p5 = m46 ^ m51; + // p23 -> stack + let p18 = m63 ^ p5; + let p24 = p5 ^ p7; + // m48 <- stack + let p12 = m46 ^ m48; + let s3 = p12 ^ p22; + // t13 <- stack + let 
m55 = m44 & t13; + let p9 = m55 ^ m63; + // p16 <- stack + let s7 = p9 ^ p16; + // t8 <- stack + let m47 = m40 & t8; + let p3 = m47 ^ m50; + let p19 = p2 ^ p3; + let s5 = p19 ^ p24; + let p11 = p0 ^ p3; + let p26 = p9 ^ p11; + // t23 <- stack + let m56 = m40 & t23; + let p4 = m48 ^ m56; + // p6 <- stack + let p20 = p4 ^ p6; + let p29 = p15 ^ p20; + let s1 = p26 ^ p29; + // m57 <- stack + let p10 = m57 ^ p4; + let p27 = p10 ^ p18; + // p23 <- stack + let s4 = p23 ^ p27; + let p25 = p6 ^ p10; + let p28 = p11 ^ p25; + // p17 <- stack + let s2 = p17 ^ p28; + + state[0] = s7; + state[1] = s6; + state[2] = s5; + state[3] = s4; + state[4] = s3; + state[5] = s2; + state[6] = s1; + state[7] = s0; +} + +/// Bitsliced implementation of the AES Sbox based on Boyar, Peralta and Calik. +/// +/// See: +/// +/// Note that the 4 bitwise NOT (^= 0xffffffff) are moved to the key schedule. +fn sub_bytes(state: &mut [u32]) { + debug_assert_eq!(state.len(), 8); + + // Scheduled using https://github.com/Ko-/aes-armcortexm/tree/public/scheduler + // Inline "stack" comments reflect suggested stores and loads (ARM Cortex-M3 and M4) + + let u7 = state[0]; + let u6 = state[1]; + let u5 = state[2]; + let u4 = state[3]; + let u3 = state[4]; + let u2 = state[5]; + let u1 = state[6]; + let u0 = state[7]; + + let y14 = u3 ^ u5; + let y13 = u0 ^ u6; + let y12 = y13 ^ y14; + let t1 = u4 ^ y12; + let y15 = t1 ^ u5; + let t2 = y12 & y15; + let y6 = y15 ^ u7; + let y20 = t1 ^ u1; + // y12 -> stack + let y9 = u0 ^ u3; + // y20 -> stack + let y11 = y20 ^ y9; + // y9 -> stack + let t12 = y9 & y11; + // y6 -> stack + let y7 = u7 ^ y11; + let y8 = u0 ^ u5; + let t0 = u1 ^ u2; + let y10 = y15 ^ t0; + // y15 -> stack + let y17 = y10 ^ y11; + // y14 -> stack + let t13 = y14 & y17; + let t14 = t13 ^ t12; + // y17 -> stack + let y19 = y10 ^ y8; + // y10 -> stack + let t15 = y8 & y10; + let t16 = t15 ^ t12; + let y16 = t0 ^ y11; + // y11 -> stack + let y21 = y13 ^ y16; + // y13 -> stack + let t7 = y13 & y16; + // y16 -> stack + let y18 = u0 ^ y16; + let y1 = t0 ^ u7; + let y4 = y1 ^ u3; + // u7 -> stack + let t5 = y4 & u7; + let t6 = t5 ^ t2; + let t18 = t6 ^ t16; + let t22 = t18 ^ y19; + let y2 = y1 ^ u0; + let t10 = y2 & y7; + let t11 = t10 ^ t7; + let t20 = t11 ^ t16; + let t24 = t20 ^ y18; + let y5 = y1 ^ u6; + let t8 = y5 & y1; + let t9 = t8 ^ t7; + let t19 = t9 ^ t14; + let t23 = t19 ^ y21; + let y3 = y5 ^ y8; + // y6 <- stack + let t3 = y3 & y6; + let t4 = t3 ^ t2; + // y20 <- stack + let t17 = t4 ^ y20; + let t21 = t17 ^ t14; + let t26 = t21 & t23; + let t27 = t24 ^ t26; + let t31 = t22 ^ t26; + let t25 = t21 ^ t22; + // y4 -> stack + let t28 = t25 & t27; + let t29 = t28 ^ t22; + let z14 = t29 & y2; + let z5 = t29 & y7; + let t30 = t23 ^ t24; + let t32 = t31 & t30; + let t33 = t32 ^ t24; + let t35 = t27 ^ t33; + let t36 = t24 & t35; + let t38 = t27 ^ t36; + let t39 = t29 & t38; + let t40 = t25 ^ t39; + let t43 = t29 ^ t40; + // y16 <- stack + let z3 = t43 & y16; + let tc12 = z3 ^ z5; + // tc12 -> stack + // y13 <- stack + let z12 = t43 & y13; + let z13 = t40 & y5; + let z4 = t40 & y1; + let tc6 = z3 ^ z4; + let t34 = t23 ^ t33; + let t37 = t36 ^ t34; + let t41 = t40 ^ t37; + // y10 <- stack + let z8 = t41 & y10; + let z17 = t41 & y8; + let t44 = t33 ^ t37; + // y15 <- stack + let z0 = t44 & y15; + // z17 -> stack + // y12 <- stack + let z9 = t44 & y12; + let z10 = t37 & y3; + let z1 = t37 & y6; + let tc5 = z1 ^ z0; + let tc11 = tc6 ^ tc5; + // y4 <- stack + let z11 = t33 & y4; + let t42 = t29 ^ t33; + let t45 = t42 ^ t41; + // 
y17 <- stack + let z7 = t45 & y17; + let tc8 = z7 ^ tc6; + // y14 <- stack + let z16 = t45 & y14; + // y11 <- stack + let z6 = t42 & y11; + let tc16 = z6 ^ tc8; + // z14 -> stack + // y9 <- stack + let z15 = t42 & y9; + let tc20 = z15 ^ tc16; + let tc1 = z15 ^ z16; + let tc2 = z10 ^ tc1; + let tc21 = tc2 ^ z11; + let tc3 = z9 ^ tc2; + let s0 = tc3 ^ tc16; + let s3 = tc3 ^ tc11; + let s1 = s3 ^ tc16; + let tc13 = z13 ^ tc1; + // u7 <- stack + let z2 = t33 & u7; + let tc4 = z0 ^ z2; + let tc7 = z12 ^ tc4; + let tc9 = z8 ^ tc7; + let tc10 = tc8 ^ tc9; + // z14 <- stack + let tc17 = z14 ^ tc10; + let s5 = tc21 ^ tc17; + let tc26 = tc17 ^ tc20; + // z17 <- stack + let s2 = tc26 ^ z17; + // tc12 <- stack + let tc14 = tc4 ^ tc12; + let tc18 = tc13 ^ tc14; + let s6 = tc10 ^ tc18; + let s7 = z12 ^ tc18; + let s4 = tc14 ^ s3; + + state[0] = s7; + state[1] = s6; + state[2] = s5; + state[3] = s4; + state[4] = s3; + state[5] = s2; + state[6] = s1; + state[7] = s0; +} + +/// NOT operations that are omitted in S-box +#[inline] +fn sub_bytes_nots(state: &mut [u32]) { + debug_assert_eq!(state.len(), 8); + state[0] ^= 0xffffffff; + state[1] ^= 0xffffffff; + state[5] ^= 0xffffffff; + state[6] ^= 0xffffffff; +} + +/// Computation of the MixColumns transformation in the fixsliced representation, with different +/// rotations used according to the round number mod 4. +/// +/// Based on Käsper-Schwabe, similar to https://github.com/Ko-/aes-armcortexm. +macro_rules! define_mix_columns { + ( + $name:ident, + $name_inv:ident, + $first_rotate:path, + $second_rotate:path + ) => { + #[rustfmt::skip] + fn $name(state: &mut State) { + let (a0, a1, a2, a3, a4, a5, a6, a7) = ( + state[0], state[1], state[2], state[3], state[4], state[5], state[6], state[7] + ); + let (b0, b1, b2, b3, b4, b5, b6, b7) = ( + $first_rotate(a0), + $first_rotate(a1), + $first_rotate(a2), + $first_rotate(a3), + $first_rotate(a4), + $first_rotate(a5), + $first_rotate(a6), + $first_rotate(a7), + ); + let (c0, c1, c2, c3, c4, c5, c6, c7) = ( + a0 ^ b0, + a1 ^ b1, + a2 ^ b2, + a3 ^ b3, + a4 ^ b4, + a5 ^ b5, + a6 ^ b6, + a7 ^ b7, + ); + state[0] = b0 ^ c7 ^ $second_rotate(c0); + state[1] = b1 ^ c0 ^ c7 ^ $second_rotate(c1); + state[2] = b2 ^ c1 ^ $second_rotate(c2); + state[3] = b3 ^ c2 ^ c7 ^ $second_rotate(c3); + state[4] = b4 ^ c3 ^ c7 ^ $second_rotate(c4); + state[5] = b5 ^ c4 ^ $second_rotate(c5); + state[6] = b6 ^ c5 ^ $second_rotate(c6); + state[7] = b7 ^ c6 ^ $second_rotate(c7); + } + + #[rustfmt::skip] + fn $name_inv(state: &mut State) { + let (a0, a1, a2, a3, a4, a5, a6, a7) = ( + state[0], state[1], state[2], state[3], state[4], state[5], state[6], state[7] + ); + let (b0, b1, b2, b3, b4, b5, b6, b7) = ( + $first_rotate(a0), + $first_rotate(a1), + $first_rotate(a2), + $first_rotate(a3), + $first_rotate(a4), + $first_rotate(a5), + $first_rotate(a6), + $first_rotate(a7), + ); + let (c0, c1, c2, c3, c4, c5, c6, c7) = ( + a0 ^ b0, + a1 ^ b1, + a2 ^ b2, + a3 ^ b3, + a4 ^ b4, + a5 ^ b5, + a6 ^ b6, + a7 ^ b7, + ); + let (d0, d1, d2, d3, d4, d5, d6, d7) = ( + a0 ^ c7, + a1 ^ c0 ^ c7, + a2 ^ c1, + a3 ^ c2 ^ c7, + a4 ^ c3 ^ c7, + a5 ^ c4, + a6 ^ c5, + a7 ^ c6, + ); + let (e0, e1, e2, e3, e4, e5, e6, e7) = ( + c0 ^ d6, + c1 ^ d6 ^ d7, + c2 ^ d0 ^ d7, + c3 ^ d1 ^ d6, + c4 ^ d2 ^ d6 ^ d7, + c5 ^ d3 ^ d7, + c6 ^ d4, + c7 ^ d5, + ); + state[0] = d0 ^ e0 ^ $second_rotate(e0); + state[1] = d1 ^ e1 ^ $second_rotate(e1); + state[2] = d2 ^ e2 ^ $second_rotate(e2); + state[3] = d3 ^ e3 ^ $second_rotate(e3); + state[4] = d4 ^ e4 ^ $second_rotate(e4); + state[5] 
= d5 ^ e5 ^ $second_rotate(e5); + state[6] = d6 ^ e6 ^ $second_rotate(e6); + state[7] = d7 ^ e7 ^ $second_rotate(e7); + } + } +} + +define_mix_columns!( + mix_columns_0, + inv_mix_columns_0, + rotate_rows_1, + rotate_rows_2 +); + +define_mix_columns!( + mix_columns_1, + inv_mix_columns_1, + rotate_rows_and_columns_1_1, + rotate_rows_and_columns_2_2 +); + +#[cfg(not(aes_compact))] +define_mix_columns!( + mix_columns_2, + inv_mix_columns_2, + rotate_rows_and_columns_1_2, + rotate_rows_2 +); + +#[cfg(not(aes_compact))] +define_mix_columns!( + mix_columns_3, + inv_mix_columns_3, + rotate_rows_and_columns_1_3, + rotate_rows_and_columns_2_2 +); + +#[inline] +fn delta_swap_1(a: &mut u32, shift: u32, mask: u32) { + let t = (*a ^ ((*a) >> shift)) & mask; + *a ^= t ^ (t << shift); +} + +#[inline] +fn delta_swap_2(a: &mut u32, b: &mut u32, shift: u32, mask: u32) { + let t = (*a ^ ((*b) >> shift)) & mask; + *a ^= t; + *b ^= t << shift; +} + +/// Applies ShiftRows once on an AES state (or key). +#[cfg(any(not(aes_compact), feature = "hazmat"))] +#[inline] +fn shift_rows_1(state: &mut [u32]) { + debug_assert_eq!(state.len(), 8); + for x in state.iter_mut() { + delta_swap_1(x, 4, 0x0c0f0300); + delta_swap_1(x, 2, 0x33003300); + } +} + +/// Applies ShiftRows twice on an AES state (or key). +#[inline] +fn shift_rows_2(state: &mut [u32]) { + debug_assert_eq!(state.len(), 8); + for x in state.iter_mut() { + delta_swap_1(x, 4, 0x0f000f00); + } +} + +/// Applies ShiftRows three times on an AES state (or key). +#[inline] +fn shift_rows_3(state: &mut [u32]) { + debug_assert_eq!(state.len(), 8); + for x in state.iter_mut() { + delta_swap_1(x, 4, 0x030f0c00); + delta_swap_1(x, 2, 0x33003300); + } +} + +#[inline(always)] +fn inv_shift_rows_1(state: &mut [u32]) { + shift_rows_3(state); +} + +#[inline(always)] +fn inv_shift_rows_2(state: &mut [u32]) { + shift_rows_2(state); +} + +#[cfg(not(aes_compact))] +#[inline(always)] +fn inv_shift_rows_3(state: &mut [u32]) { + shift_rows_1(state); +} + +/// XOR the columns after the S-box during the key schedule round function. +/// +/// The `idx_xor` parameter refers to the index of the previous round key that is +/// involved in the XOR computation (should be 8 and 16 for AES-128 and AES-256, +/// respectively). +/// +/// The `idx_ror` parameter refers to the rotation value, which varies between the +/// different key schedules. +fn xor_columns(rkeys: &mut [u32], offset: usize, idx_xor: usize, idx_ror: u32) { + for i in 0..8 { + let off_i = offset + i; + let rk = rkeys[off_i - idx_xor] ^ (0x03030303 & ror(rkeys[off_i], idx_ror)); + rkeys[off_i] = + rk ^ (0xfcfcfcfc & (rk << 2)) ^ (0xf0f0f0f0 & (rk << 4)) ^ (0xc0c0c0c0 & (rk << 6)); + } +} + +/// Bitslice two 128-bit input blocks input0, input1 into a 256-bit internal state. +fn bitslice(output: &mut [u32], input0: &[u8], input1: &[u8]) { + debug_assert_eq!(output.len(), 8); + debug_assert_eq!(input0.len(), 16); + debug_assert_eq!(input1.len(), 16); + + // Bitslicing is a bit index manipulation. 256 bits of data means each bit is positioned at an + // 8-bit index. 
AES data is 2 blocks, each one a 4x4 column-major matrix of bytes, so the + // index is initially ([b]lock, [c]olumn, [r]ow, [p]osition): + // b0 c1 c0 r1 r0 p2 p1 p0 + // + // The desired bitsliced data groups first by bit position, then row, column, block: + // p2 p1 p0 r1 r0 c1 c0 b0 + + // Interleave the columns on input (note the order of input) + // b0 c1 c0 __ __ __ __ __ => c1 c0 b0 __ __ __ __ __ + let mut t0 = u32::from_le_bytes(input0[0x00..0x04].try_into().unwrap()); + let mut t2 = u32::from_le_bytes(input0[0x04..0x08].try_into().unwrap()); + let mut t4 = u32::from_le_bytes(input0[0x08..0x0c].try_into().unwrap()); + let mut t6 = u32::from_le_bytes(input0[0x0c..0x10].try_into().unwrap()); + let mut t1 = u32::from_le_bytes(input1[0x00..0x04].try_into().unwrap()); + let mut t3 = u32::from_le_bytes(input1[0x04..0x08].try_into().unwrap()); + let mut t5 = u32::from_le_bytes(input1[0x08..0x0c].try_into().unwrap()); + let mut t7 = u32::from_le_bytes(input1[0x0c..0x10].try_into().unwrap()); + + // Bit Index Swap 5 <-> 0: + // __ __ b0 __ __ __ __ p0 => __ __ p0 __ __ __ __ b0 + let m0 = 0x55555555; + delta_swap_2(&mut t1, &mut t0, 1, m0); + delta_swap_2(&mut t3, &mut t2, 1, m0); + delta_swap_2(&mut t5, &mut t4, 1, m0); + delta_swap_2(&mut t7, &mut t6, 1, m0); + + // Bit Index Swap 6 <-> 1: + // __ c0 __ __ __ __ p1 __ => __ p1 __ __ __ __ c0 __ + let m1 = 0x33333333; + delta_swap_2(&mut t2, &mut t0, 2, m1); + delta_swap_2(&mut t3, &mut t1, 2, m1); + delta_swap_2(&mut t6, &mut t4, 2, m1); + delta_swap_2(&mut t7, &mut t5, 2, m1); + + // Bit Index Swap 7 <-> 2: + // c1 __ __ __ __ p2 __ __ => p2 __ __ __ __ c1 __ __ + let m2 = 0x0f0f0f0f; + delta_swap_2(&mut t4, &mut t0, 4, m2); + delta_swap_2(&mut t5, &mut t1, 4, m2); + delta_swap_2(&mut t6, &mut t2, 4, m2); + delta_swap_2(&mut t7, &mut t3, 4, m2); + + // Final bitsliced bit index, as desired: + // p2 p1 p0 r1 r0 c1 c0 b0 + output[0] = t0; + output[1] = t1; + output[2] = t2; + output[3] = t3; + output[4] = t4; + output[5] = t5; + output[6] = t6; + output[7] = t7; +} + +/// Un-bitslice a 256-bit internal state into two 128-bit blocks of output. +fn inv_bitslice(input: &[u32]) -> BatchBlocks { + debug_assert_eq!(input.len(), 8); + + // Unbitslicing is a bit index manipulation. 256 bits of data means each bit is positioned at + // an 8-bit index. 
AES data is 2 blocks, each one a 4x4 column-major matrix of bytes, so the + // desired index for the output is ([b]lock, [c]olumn, [r]ow, [p]osition): + // b0 c1 c0 r1 r0 p2 p1 p0 + // + // The initially bitsliced data groups first by bit position, then row, column, block: + // p2 p1 p0 r1 r0 c1 c0 b0 + + let mut t0 = input[0]; + let mut t1 = input[1]; + let mut t2 = input[2]; + let mut t3 = input[3]; + let mut t4 = input[4]; + let mut t5 = input[5]; + let mut t6 = input[6]; + let mut t7 = input[7]; + + // TODO: these bit index swaps are identical to those in 'packing' + + // Bit Index Swap 5 <-> 0: + // __ __ p0 __ __ __ __ b0 => __ __ b0 __ __ __ __ p0 + let m0 = 0x55555555; + delta_swap_2(&mut t1, &mut t0, 1, m0); + delta_swap_2(&mut t3, &mut t2, 1, m0); + delta_swap_2(&mut t5, &mut t4, 1, m0); + delta_swap_2(&mut t7, &mut t6, 1, m0); + + // Bit Index Swap 6 <-> 1: + // __ p1 __ __ __ __ c0 __ => __ c0 __ __ __ __ p1 __ + let m1 = 0x33333333; + delta_swap_2(&mut t2, &mut t0, 2, m1); + delta_swap_2(&mut t3, &mut t1, 2, m1); + delta_swap_2(&mut t6, &mut t4, 2, m1); + delta_swap_2(&mut t7, &mut t5, 2, m1); + + // Bit Index Swap 7 <-> 2: + // p2 __ __ __ __ c1 __ __ => c1 __ __ __ __ p2 __ __ + let m2 = 0x0f0f0f0f; + delta_swap_2(&mut t4, &mut t0, 4, m2); + delta_swap_2(&mut t5, &mut t1, 4, m2); + delta_swap_2(&mut t6, &mut t2, 4, m2); + delta_swap_2(&mut t7, &mut t3, 4, m2); + + let mut output = BatchBlocks::default(); + // De-interleave the columns on output (note the order of output) + // c1 c0 b0 __ __ __ __ __ => b0 c1 c0 __ __ __ __ __ + output[0][0x00..0x04].copy_from_slice(&t0.to_le_bytes()); + output[0][0x04..0x08].copy_from_slice(&t2.to_le_bytes()); + output[0][0x08..0x0c].copy_from_slice(&t4.to_le_bytes()); + output[0][0x0c..0x10].copy_from_slice(&t6.to_le_bytes()); + output[1][0x00..0x04].copy_from_slice(&t1.to_le_bytes()); + output[1][0x04..0x08].copy_from_slice(&t3.to_le_bytes()); + output[1][0x08..0x0c].copy_from_slice(&t5.to_le_bytes()); + output[1][0x0c..0x10].copy_from_slice(&t7.to_le_bytes()); + + // Final AES bit index, as desired: + // b0 c1 c0 r1 r0 p2 p1 p0 + output +} + +/// Copy 32-bytes within the provided slice to an 8-byte offset +fn memshift32(buffer: &mut [u32], src_offset: usize) { + debug_assert_eq!(src_offset % 8, 0); + + let dst_offset = src_offset + 8; + debug_assert!(dst_offset + 8 <= buffer.len()); + + for i in (0..8).rev() { + buffer[dst_offset + i] = buffer[src_offset + i]; + } +} + +/// XOR the round key to the internal state. The round keys are expected to be +/// pre-computed and to be packed in the fixsliced representation. 
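+///
+/// Each round key is an 8-word slice of the expanded key, so one fixsliced
+/// round follows this pattern (minimal sketch of the encryption loop body):
+///
+/// ```ignore
+/// sub_bytes(&mut state);
+/// mix_columns_0(&mut state);
+/// add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
+/// rk_off += 8;
+/// ```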
+#[inline] +fn add_round_key(state: &mut State, rkey: &[u32]) { + debug_assert_eq!(rkey.len(), 8); + for (a, b) in state.iter_mut().zip(rkey) { + *a ^= b; + } +} + +#[inline(always)] +fn add_round_constant_bit(state: &mut [u32], bit: usize) { + state[bit] ^= 0x0000c000; +} + +#[inline(always)] +fn ror(x: u32, y: u32) -> u32 { + x.rotate_right(y) +} + +#[inline(always)] +fn ror_distance(rows: u32, cols: u32) -> u32 { + (rows << 3) + (cols << 1) +} + +#[inline(always)] +fn rotate_rows_1(x: u32) -> u32 { + ror(x, ror_distance(1, 0)) +} + +#[inline(always)] +fn rotate_rows_2(x: u32) -> u32 { + ror(x, ror_distance(2, 0)) +} + +#[inline(always)] +#[rustfmt::skip] +fn rotate_rows_and_columns_1_1(x: u32) -> u32 { + (ror(x, ror_distance(1, 1)) & 0x3f3f3f3f) | + (ror(x, ror_distance(0, 1)) & 0xc0c0c0c0) +} + +#[cfg(not(aes_compact))] +#[inline(always)] +#[rustfmt::skip] +fn rotate_rows_and_columns_1_2(x: u32) -> u32 { + (ror(x, ror_distance(1, 2)) & 0x0f0f0f0f) | + (ror(x, ror_distance(0, 2)) & 0xf0f0f0f0) +} + +#[cfg(not(aes_compact))] +#[inline(always)] +#[rustfmt::skip] +fn rotate_rows_and_columns_1_3(x: u32) -> u32 { + (ror(x, ror_distance(1, 3)) & 0x03030303) | + (ror(x, ror_distance(0, 3)) & 0xfcfcfcfc) +} + +#[inline(always)] +#[rustfmt::skip] +fn rotate_rows_and_columns_2_2(x: u32) -> u32 { + (ror(x, ror_distance(2, 2)) & 0x0f0f0f0f) | + (ror(x, ror_distance(1, 2)) & 0xf0f0f0f0) +} diff --git a/src/rust_crypto/aes/soft/fixslice64.rs b/src/rust_crypto/aes/soft/fixslice64.rs new file mode 100644 index 0000000000..23556e3c58 --- /dev/null +++ b/src/rust_crypto/aes/soft/fixslice64.rs @@ -0,0 +1,1536 @@ +//! Fixsliced implementations of AES-128, AES-192 and AES-256 (64-bit) +//! adapted from the C implementation. +//! +//! All implementations are fully bitsliced and do not rely on any +//! Look-Up Table (LUT). +//! +//! See the paper at for more details. +//! +//! # Author (original C code) +//! +//! Alexandre Adomnicai, Nanyang Technological University, Singapore +//! +//! +//! Originally licensed MIT. Relicensed as Apache 2.0+MIT with permission. + +#![allow(clippy::unreadable_literal)] + +mod block; +use block::Block; + +type BatchBlocks = [Block; 4]; + +/// AES-128 round keys +type FixsliceKeys128 = [u64; 88]; + +/// AES-192 round keys +type FixsliceKeys192 = [u64; 104]; + +/// AES-256 round keys +type FixsliceKeys256 = [u64; 120]; + +/// 512-bit internal state +pub(crate) type State = [u64; 8]; + +/// Fully bitsliced AES-128 key schedule to match the fully-fixsliced representation. 
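+///
+/// Produces the 11 round keys of AES-128 packed as 11 * 8 = 88 bitsliced words.
+/// Minimal usage sketch (all-zero placeholder key, not a test vector):
+///
+/// ```ignore
+/// let rkeys = aes128_key_schedule(&[0u8; 16]);
+/// let blocks = BatchBlocks::default();
+/// let ct = aes128_encrypt(&rkeys, &blocks);
+/// let pt = aes128_decrypt(&rkeys, &ct); // recovers `blocks`
+/// ```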
+pub(crate) fn aes128_key_schedule(key: &[u8; 16]) -> FixsliceKeys128 { + let mut rkeys = [0u64; 88]; + + bitslice(&mut rkeys[..8], key, key, key, key); + + let mut rk_off = 0; + for rcon in 0..10 { + memshift32(&mut rkeys, rk_off); + rk_off += 8; + + sub_bytes(&mut rkeys[rk_off..(rk_off + 8)]); + sub_bytes_nots(&mut rkeys[rk_off..(rk_off + 8)]); + + if rcon < 8 { + add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon); + } else { + add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon - 8); + add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon - 7); + add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon - 5); + add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon - 4); + } + + xor_columns(&mut rkeys, rk_off, 8, ror_distance(1, 3)); + } + + // Adjust to match fixslicing format + #[cfg(aes_compact)] + { + for i in (8..88).step_by(16) { + inv_shift_rows_1(&mut rkeys[i..(i + 8)]); + } + } + #[cfg(not(aes_compact))] + { + for i in (8..72).step_by(32) { + inv_shift_rows_1(&mut rkeys[i..(i + 8)]); + inv_shift_rows_2(&mut rkeys[(i + 8)..(i + 16)]); + inv_shift_rows_3(&mut rkeys[(i + 16)..(i + 24)]); + } + inv_shift_rows_1(&mut rkeys[72..80]); + } + + // Account for NOTs removed from sub_bytes + for i in 1..11 { + sub_bytes_nots(&mut rkeys[(i * 8)..(i * 8 + 8)]); + } + + rkeys +} + +/// Fully bitsliced AES-192 key schedule to match the fully-fixsliced representation. +#[allow(dead_code)] +pub(crate) fn aes192_key_schedule(key: &[u8; 24]) -> FixsliceKeys192 { + let mut rkeys = [0u64; 104]; + let mut tmp = [0u64; 8]; + + bitslice( + &mut rkeys[..8], + &key[..16], + &key[..16], + &key[..16], + &key[..16], + ); + bitslice(&mut tmp, &key[8..], &key[8..], &key[8..], &key[8..]); + + let mut rcon = 0; + let mut rk_off = 8; + + loop { + for i in 0..8 { + rkeys[rk_off + i] = (0x00ff00ff00ff00ff & (tmp[i] >> 8)) + | (0xff00ff00ff00ff00 & (rkeys[(rk_off - 8) + i] << 8)); + } + + sub_bytes(&mut tmp); + sub_bytes_nots(&mut tmp); + + add_round_constant_bit(&mut tmp, rcon); + rcon += 1; + + for i in 0..8 { + let mut ti = rkeys[rk_off + i]; + ti ^= 0x0f000f000f000f00 & ror(tmp[i], ror_distance(1, 1)); + ti ^= 0xf000f000f000f000 & (ti << 4); + tmp[i] = ti; + } + rkeys[rk_off..(rk_off + 8)].copy_from_slice(&tmp); + rk_off += 8; + + for i in 0..8 { + let ui = tmp[i]; + let mut ti = (0x00ff00ff00ff00ff & (rkeys[(rk_off - 16) + i] >> 8)) + | (0xff00ff00ff00ff00 & (ui << 8)); + ti ^= 0x000f000f000f000f & (ui >> 12); + tmp[i] = ti + ^ (0xfff0fff0fff0fff0 & (ti << 4)) + ^ (0xff00ff00ff00ff00 & (ti << 8)) + ^ (0xf000f000f000f000 & (ti << 12)); + } + rkeys[rk_off..(rk_off + 8)].copy_from_slice(&tmp); + rk_off += 8; + + sub_bytes(&mut tmp); + sub_bytes_nots(&mut tmp); + + add_round_constant_bit(&mut tmp, rcon); + rcon += 1; + + for i in 0..8 { + let mut ti = (0x00ff00ff00ff00ff & (rkeys[(rk_off - 16) + i] >> 8)) + | (0xff00ff00ff00ff00 & (rkeys[(rk_off - 8) + i] << 8)); + ti ^= 0x000f000f000f000f & ror(tmp[i], ror_distance(1, 3)); + rkeys[rk_off + i] = ti + ^ (0xfff0fff0fff0fff0 & (ti << 4)) + ^ (0xff00ff00ff00ff00 & (ti << 8)) + ^ (0xf000f000f000f000 & (ti << 12)); + } + rk_off += 8; + + if rcon >= 8 { + break; + } + + for i in 0..8 { + let ui = rkeys[(rk_off - 8) + i]; + let mut ti = rkeys[(rk_off - 16) + i]; + ti ^= 0x0f000f000f000f00 & (ui >> 4); + ti ^= 0xf000f000f000f000 & (ti << 4); + tmp[i] = ti; + } + } + + // Adjust to match fixslicing format + #[cfg(aes_compact)] + { + for i in (8..104).step_by(16) { + inv_shift_rows_1(&mut rkeys[i..(i + 8)]); + } + } + 
#[cfg(not(aes_compact))] + { + for i in (0..96).step_by(32) { + inv_shift_rows_1(&mut rkeys[(i + 8)..(i + 16)]); + inv_shift_rows_2(&mut rkeys[(i + 16)..(i + 24)]); + inv_shift_rows_3(&mut rkeys[(i + 24)..(i + 32)]); + } + } + + // Account for NOTs removed from sub_bytes + for i in 1..13 { + sub_bytes_nots(&mut rkeys[(i * 8)..(i * 8 + 8)]); + } + + rkeys +} + +/// Fully bitsliced AES-256 key schedule to match the fully-fixsliced representation. +pub(crate) fn aes256_key_schedule(key: &[u8; 32]) -> FixsliceKeys256 { + let mut rkeys = [0u64; 120]; + + bitslice( + &mut rkeys[..8], + &key[..16], + &key[..16], + &key[..16], + &key[..16], + ); + bitslice( + &mut rkeys[8..16], + &key[16..], + &key[16..], + &key[16..], + &key[16..], + ); + + let mut rk_off = 8; + + let mut rcon = 0; + loop { + memshift32(&mut rkeys, rk_off); + rk_off += 8; + + sub_bytes(&mut rkeys[rk_off..(rk_off + 8)]); + sub_bytes_nots(&mut rkeys[rk_off..(rk_off + 8)]); + + add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon); + xor_columns(&mut rkeys, rk_off, 16, ror_distance(1, 3)); + rcon += 1; + + if rcon == 7 { + break; + } + + memshift32(&mut rkeys, rk_off); + rk_off += 8; + + sub_bytes(&mut rkeys[rk_off..(rk_off + 8)]); + sub_bytes_nots(&mut rkeys[rk_off..(rk_off + 8)]); + + xor_columns(&mut rkeys, rk_off, 16, ror_distance(0, 3)); + } + + // Adjust to match fixslicing format + #[cfg(aes_compact)] + { + for i in (8..120).step_by(16) { + inv_shift_rows_1(&mut rkeys[i..(i + 8)]); + } + } + #[cfg(not(aes_compact))] + { + for i in (8..104).step_by(32) { + inv_shift_rows_1(&mut rkeys[i..(i + 8)]); + inv_shift_rows_2(&mut rkeys[(i + 8)..(i + 16)]); + inv_shift_rows_3(&mut rkeys[(i + 16)..(i + 24)]); + } + inv_shift_rows_1(&mut rkeys[104..112]); + } + + // Account for NOTs removed from sub_bytes + for i in 1..15 { + sub_bytes_nots(&mut rkeys[(i * 8)..(i * 8 + 8)]); + } + + rkeys +} + +/// Fully-fixsliced AES-128 decryption (the InvShiftRows is completely omitted). +/// +/// Decrypts four blocks in-place and in parallel. +#[allow(dead_code)] +pub(crate) fn aes128_decrypt(rkeys: &FixsliceKeys128, blocks: &BatchBlocks) -> BatchBlocks { + let mut state = State::default(); + + bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]); + + add_round_key(&mut state, &rkeys[80..]); + inv_sub_bytes(&mut state); + + #[cfg(not(aes_compact))] + { + inv_shift_rows_2(&mut state); + } + + let mut rk_off = 72; + loop { + #[cfg(aes_compact)] + { + inv_shift_rows_2(&mut state); + } + + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_1(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + + if rk_off == 0 { + break; + } + + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_0(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + + #[cfg(not(aes_compact))] + { + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_3(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_2(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + } + } + + add_round_key(&mut state, &rkeys[..8]); + + inv_bitslice(&state) +} + +/// Fully-fixsliced AES-128 encryption (the ShiftRows is completely omitted). +/// +/// Encrypts four blocks in-place and in parallel. 
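+///
+/// Because ShiftRows is never applied to the state, each round pairs SubBytes
+/// with the MixColumns variant for the current round number mod 4. One
+/// iteration of the non-compact round loop boils down to (sketch; `rk1`..`rk4`
+/// stand for the next four 8-word round-key slices):
+///
+/// ```ignore
+/// sub_bytes(&mut state); mix_columns_1(&mut state); add_round_key(&mut state, rk1);
+/// sub_bytes(&mut state); mix_columns_2(&mut state); add_round_key(&mut state, rk2);
+/// sub_bytes(&mut state); mix_columns_3(&mut state); add_round_key(&mut state, rk3);
+/// sub_bytes(&mut state); mix_columns_0(&mut state); add_round_key(&mut state, rk4);
+/// ```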
+pub(crate) fn aes128_encrypt(rkeys: &FixsliceKeys128, blocks: &BatchBlocks) -> BatchBlocks { + let mut state = State::default(); + + bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]); + + add_round_key(&mut state, &rkeys[..8]); + + let mut rk_off = 8; + loop { + sub_bytes(&mut state); + mix_columns_1(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + + #[cfg(aes_compact)] + { + shift_rows_2(&mut state); + } + + if rk_off == 80 { + break; + } + + #[cfg(not(aes_compact))] + { + sub_bytes(&mut state); + mix_columns_2(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + + sub_bytes(&mut state); + mix_columns_3(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + } + + sub_bytes(&mut state); + mix_columns_0(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + } + + #[cfg(not(aes_compact))] + { + shift_rows_2(&mut state); + } + + sub_bytes(&mut state); + add_round_key(&mut state, &rkeys[80..]); + + inv_bitslice(&state) +} + +/// Fully-fixsliced AES-192 decryption (the InvShiftRows is completely omitted). +/// +/// Decrypts four blocks in-place and in parallel. +#[allow(dead_code)] +pub(crate) fn aes192_decrypt(rkeys: &FixsliceKeys192, blocks: &BatchBlocks) -> BatchBlocks { + let mut state = State::default(); + + bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]); + + add_round_key(&mut state, &rkeys[96..]); + inv_sub_bytes(&mut state); + + let mut rk_off = 88; + loop { + #[cfg(aes_compact)] + { + inv_shift_rows_2(&mut state); + } + #[cfg(not(aes_compact))] + { + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_3(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_2(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + } + + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_1(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + + if rk_off == 0 { + break; + } + + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_0(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + } + + add_round_key(&mut state, &rkeys[..8]); + + inv_bitslice(&state) +} + +/// Fully-fixsliced AES-192 encryption (the ShiftRows is completely omitted). +/// +/// Encrypts four blocks in-place and in parallel. 
+#[allow(dead_code)] +pub(crate) fn aes192_encrypt(rkeys: &FixsliceKeys192, blocks: &BatchBlocks) -> BatchBlocks { + let mut state = State::default(); + + bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]); + + add_round_key(&mut state, &rkeys[..8]); + + let mut rk_off = 8; + loop { + sub_bytes(&mut state); + mix_columns_1(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + + #[cfg(aes_compact)] + { + shift_rows_2(&mut state); + } + #[cfg(not(aes_compact))] + { + sub_bytes(&mut state); + mix_columns_2(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + + sub_bytes(&mut state); + mix_columns_3(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + } + + if rk_off == 96 { + break; + } + + sub_bytes(&mut state); + mix_columns_0(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + } + + sub_bytes(&mut state); + add_round_key(&mut state, &rkeys[96..]); + + inv_bitslice(&state) +} + +/// Fully-fixsliced AES-256 decryption (the InvShiftRows is completely omitted). +/// +/// Decrypts four blocks in-place and in parallel. +#[allow(dead_code)] +pub(crate) fn aes256_decrypt(rkeys: &FixsliceKeys256, blocks: &BatchBlocks) -> BatchBlocks { + let mut state = State::default(); + + bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]); + + add_round_key(&mut state, &rkeys[112..]); + inv_sub_bytes(&mut state); + + #[cfg(not(aes_compact))] + { + inv_shift_rows_2(&mut state); + } + + let mut rk_off = 104; + loop { + #[cfg(aes_compact)] + { + inv_shift_rows_2(&mut state); + } + + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_1(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + + if rk_off == 0 { + break; + } + + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_0(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + + #[cfg(not(aes_compact))] + { + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_3(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + inv_mix_columns_2(&mut state); + inv_sub_bytes(&mut state); + rk_off -= 8; + } + } + + add_round_key(&mut state, &rkeys[..8]); + + inv_bitslice(&state) +} + +/// Fully-fixsliced AES-256 encryption (the ShiftRows is completely omitted). +/// +/// Encrypts four blocks in-place and in parallel. 
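+///
+/// AES-256 runs 14 rounds, so `rkeys` holds 15 round keys packed as
+/// 15 * 8 = 120 words and the final AddRoundKey uses `rkeys[112..]`.
+/// Minimal usage sketch (all-zero placeholder key, not a test vector):
+///
+/// ```ignore
+/// let rkeys = aes256_key_schedule(&[0u8; 32]);
+/// let ct = aes256_encrypt(&rkeys, &BatchBlocks::default());
+/// ```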
+pub(crate) fn aes256_encrypt(rkeys: &FixsliceKeys256, blocks: &BatchBlocks) -> BatchBlocks { + let mut state = State::default(); + + bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]); + + add_round_key(&mut state, &rkeys[..8]); + + let mut rk_off = 8; + loop { + sub_bytes(&mut state); + mix_columns_1(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + + #[cfg(aes_compact)] + { + shift_rows_2(&mut state); + } + + if rk_off == 112 { + break; + } + + #[cfg(not(aes_compact))] + { + sub_bytes(&mut state); + mix_columns_2(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + + sub_bytes(&mut state); + mix_columns_3(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + } + + sub_bytes(&mut state); + mix_columns_0(&mut state); + add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); + rk_off += 8; + } + + #[cfg(not(aes_compact))] + { + shift_rows_2(&mut state); + } + + sub_bytes(&mut state); + add_round_key(&mut state, &rkeys[112..]); + + inv_bitslice(&state) +} + +/// Note that the 4 bitwise NOT (^= 0xffffffffffffffff) are accounted for here so that it is a true +/// inverse of 'sub_bytes'. +fn inv_sub_bytes(state: &mut [u64]) { + debug_assert_eq!(state.len(), 8); + + // Scheduled using https://github.com/Ko-/aes-armcortexm/tree/public/scheduler + // Inline "stack" comments reflect suggested stores and loads (ARM Cortex-M3 and M4) + + let u7 = state[0]; + let u6 = state[1]; + let u5 = state[2]; + let u4 = state[3]; + let u3 = state[4]; + let u2 = state[5]; + let u1 = state[6]; + let u0 = state[7]; + + let t23 = u0 ^ u3; + let t8 = u1 ^ t23; + let m2 = t23 & t8; + let t4 = u4 ^ t8; + let t22 = u1 ^ u3; + let t2 = u0 ^ u1; + let t1 = u3 ^ u4; + // t23 -> stack + let t9 = u7 ^ t1; + // t8 -> stack + let m7 = t22 & t9; + // t9 -> stack + let t24 = u4 ^ u7; + // m7 -> stack + let t10 = t2 ^ t24; + // u4 -> stack + let m14 = t2 & t10; + let r5 = u6 ^ u7; + // m2 -> stack + let t3 = t1 ^ r5; + // t2 -> stack + let t13 = t2 ^ r5; + let t19 = t22 ^ r5; + // t3 -> stack + let t17 = u2 ^ t19; + // t4 -> stack + let t25 = u2 ^ t1; + let r13 = u1 ^ u6; + // t25 -> stack + let t20 = t24 ^ r13; + // t17 -> stack + let m9 = t20 & t17; + // t20 -> stack + let r17 = u2 ^ u5; + // t22 -> stack + let t6 = t22 ^ r17; + // t13 -> stack + let m1 = t13 & t6; + let y5 = u0 ^ r17; + let m4 = t19 & y5; + let m5 = m4 ^ m1; + let m17 = m5 ^ t24; + let r18 = u5 ^ u6; + let t27 = t1 ^ r18; + let t15 = t10 ^ t27; + // t6 -> stack + let m11 = t1 & t15; + let m15 = m14 ^ m11; + let m21 = m17 ^ m15; + // t1 -> stack + // t4 <- stack + let m12 = t4 & t27; + let m13 = m12 ^ m11; + let t14 = t10 ^ r18; + let m3 = t14 ^ m1; + // m2 <- stack + let m16 = m3 ^ m2; + let m20 = m16 ^ m13; + // u4 <- stack + let r19 = u2 ^ u4; + let t16 = r13 ^ r19; + // t3 <- stack + let t26 = t3 ^ t16; + let m6 = t3 & t16; + let m8 = t26 ^ m6; + // t10 -> stack + // m7 <- stack + let m18 = m8 ^ m7; + let m22 = m18 ^ m13; + let m25 = m22 & m20; + let m26 = m21 ^ m25; + let m10 = m9 ^ m6; + let m19 = m10 ^ m15; + // t25 <- stack + let m23 = m19 ^ t25; + let m28 = m23 ^ m25; + let m24 = m22 ^ m23; + let m30 = m26 & m24; + let m39 = m23 ^ m30; + let m48 = m39 & y5; + let m57 = m39 & t19; + // m48 -> stack + let m36 = m24 ^ m25; + let m31 = m20 & m23; + let m27 = m20 ^ m21; + let m32 = m27 & m31; + let m29 = m28 & m27; + let m37 = m21 ^ m29; + // m39 -> stack + let m42 = m37 ^ m39; + let m52 = m42 & t15; + // t27 -> stack + // t1 <- 
stack + let m61 = m42 & t1; + let p0 = m52 ^ m61; + let p16 = m57 ^ m61; + // m57 -> stack + // t20 <- stack + let m60 = m37 & t20; + // p16 -> stack + // t17 <- stack + let m51 = m37 & t17; + let m33 = m27 ^ m25; + let m38 = m32 ^ m33; + let m43 = m37 ^ m38; + let m49 = m43 & t16; + let p6 = m49 ^ m60; + let p13 = m49 ^ m51; + let m58 = m43 & t3; + // t9 <- stack + let m50 = m38 & t9; + // t22 <- stack + let m59 = m38 & t22; + // p6 -> stack + let p1 = m58 ^ m59; + let p7 = p0 ^ p1; + let m34 = m21 & m22; + let m35 = m24 & m34; + let m40 = m35 ^ m36; + let m41 = m38 ^ m40; + let m45 = m42 ^ m41; + // t27 <- stack + let m53 = m45 & t27; + let p8 = m50 ^ m53; + let p23 = p7 ^ p8; + // t4 <- stack + let m62 = m45 & t4; + let p14 = m49 ^ m62; + let s6 = p14 ^ p23; + // t10 <- stack + let m54 = m41 & t10; + let p2 = m54 ^ m62; + let p22 = p2 ^ p7; + let s0 = p13 ^ p22; + let p17 = m58 ^ p2; + let p15 = m54 ^ m59; + // t2 <- stack + let m63 = m41 & t2; + // m39 <- stack + let m44 = m39 ^ m40; + // p17 -> stack + // t6 <- stack + let m46 = m44 & t6; + let p5 = m46 ^ m51; + // p23 -> stack + let p18 = m63 ^ p5; + let p24 = p5 ^ p7; + // m48 <- stack + let p12 = m46 ^ m48; + let s3 = p12 ^ p22; + // t13 <- stack + let m55 = m44 & t13; + let p9 = m55 ^ m63; + // p16 <- stack + let s7 = p9 ^ p16; + // t8 <- stack + let m47 = m40 & t8; + let p3 = m47 ^ m50; + let p19 = p2 ^ p3; + let s5 = p19 ^ p24; + let p11 = p0 ^ p3; + let p26 = p9 ^ p11; + // t23 <- stack + let m56 = m40 & t23; + let p4 = m48 ^ m56; + // p6 <- stack + let p20 = p4 ^ p6; + let p29 = p15 ^ p20; + let s1 = p26 ^ p29; + // m57 <- stack + let p10 = m57 ^ p4; + let p27 = p10 ^ p18; + // p23 <- stack + let s4 = p23 ^ p27; + let p25 = p6 ^ p10; + let p28 = p11 ^ p25; + // p17 <- stack + let s2 = p17 ^ p28; + + state[0] = s7; + state[1] = s6; + state[2] = s5; + state[3] = s4; + state[4] = s3; + state[5] = s2; + state[6] = s1; + state[7] = s0; +} + +/// Bitsliced implementation of the AES Sbox based on Boyar, Peralta and Calik. +/// +/// See: +/// +/// Note that the 4 bitwise NOT (^= 0xffffffffffffffff) are moved to the key schedule. 
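+///
+/// Applying the omitted NOTs afterwards restores the standard SubBytes, which
+/// is how the `hazmat::cipher_round` helper below uses it (sketch):
+///
+/// ```ignore
+/// sub_bytes(&mut state);      // NOT-free Boyar-Peralta circuit
+/// sub_bytes_nots(&mut state); // re-apply the 4 omitted NOTs
+/// ```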
+fn sub_bytes(state: &mut [u64]) { + debug_assert_eq!(state.len(), 8); + + // Scheduled using https://github.com/Ko-/aes-armcortexm/tree/public/scheduler + // Inline "stack" comments reflect suggested stores and loads (ARM Cortex-M3 and M4) + + let u7 = state[0]; + let u6 = state[1]; + let u5 = state[2]; + let u4 = state[3]; + let u3 = state[4]; + let u2 = state[5]; + let u1 = state[6]; + let u0 = state[7]; + + let y14 = u3 ^ u5; + let y13 = u0 ^ u6; + let y12 = y13 ^ y14; + let t1 = u4 ^ y12; + let y15 = t1 ^ u5; + let t2 = y12 & y15; + let y6 = y15 ^ u7; + let y20 = t1 ^ u1; + // y12 -> stack + let y9 = u0 ^ u3; + // y20 -> stack + let y11 = y20 ^ y9; + // y9 -> stack + let t12 = y9 & y11; + // y6 -> stack + let y7 = u7 ^ y11; + let y8 = u0 ^ u5; + let t0 = u1 ^ u2; + let y10 = y15 ^ t0; + // y15 -> stack + let y17 = y10 ^ y11; + // y14 -> stack + let t13 = y14 & y17; + let t14 = t13 ^ t12; + // y17 -> stack + let y19 = y10 ^ y8; + // y10 -> stack + let t15 = y8 & y10; + let t16 = t15 ^ t12; + let y16 = t0 ^ y11; + // y11 -> stack + let y21 = y13 ^ y16; + // y13 -> stack + let t7 = y13 & y16; + // y16 -> stack + let y18 = u0 ^ y16; + let y1 = t0 ^ u7; + let y4 = y1 ^ u3; + // u7 -> stack + let t5 = y4 & u7; + let t6 = t5 ^ t2; + let t18 = t6 ^ t16; + let t22 = t18 ^ y19; + let y2 = y1 ^ u0; + let t10 = y2 & y7; + let t11 = t10 ^ t7; + let t20 = t11 ^ t16; + let t24 = t20 ^ y18; + let y5 = y1 ^ u6; + let t8 = y5 & y1; + let t9 = t8 ^ t7; + let t19 = t9 ^ t14; + let t23 = t19 ^ y21; + let y3 = y5 ^ y8; + // y6 <- stack + let t3 = y3 & y6; + let t4 = t3 ^ t2; + // y20 <- stack + let t17 = t4 ^ y20; + let t21 = t17 ^ t14; + let t26 = t21 & t23; + let t27 = t24 ^ t26; + let t31 = t22 ^ t26; + let t25 = t21 ^ t22; + // y4 -> stack + let t28 = t25 & t27; + let t29 = t28 ^ t22; + let z14 = t29 & y2; + let z5 = t29 & y7; + let t30 = t23 ^ t24; + let t32 = t31 & t30; + let t33 = t32 ^ t24; + let t35 = t27 ^ t33; + let t36 = t24 & t35; + let t38 = t27 ^ t36; + let t39 = t29 & t38; + let t40 = t25 ^ t39; + let t43 = t29 ^ t40; + // y16 <- stack + let z3 = t43 & y16; + let tc12 = z3 ^ z5; + // tc12 -> stack + // y13 <- stack + let z12 = t43 & y13; + let z13 = t40 & y5; + let z4 = t40 & y1; + let tc6 = z3 ^ z4; + let t34 = t23 ^ t33; + let t37 = t36 ^ t34; + let t41 = t40 ^ t37; + // y10 <- stack + let z8 = t41 & y10; + let z17 = t41 & y8; + let t44 = t33 ^ t37; + // y15 <- stack + let z0 = t44 & y15; + // z17 -> stack + // y12 <- stack + let z9 = t44 & y12; + let z10 = t37 & y3; + let z1 = t37 & y6; + let tc5 = z1 ^ z0; + let tc11 = tc6 ^ tc5; + // y4 <- stack + let z11 = t33 & y4; + let t42 = t29 ^ t33; + let t45 = t42 ^ t41; + // y17 <- stack + let z7 = t45 & y17; + let tc8 = z7 ^ tc6; + // y14 <- stack + let z16 = t45 & y14; + // y11 <- stack + let z6 = t42 & y11; + let tc16 = z6 ^ tc8; + // z14 -> stack + // y9 <- stack + let z15 = t42 & y9; + let tc20 = z15 ^ tc16; + let tc1 = z15 ^ z16; + let tc2 = z10 ^ tc1; + let tc21 = tc2 ^ z11; + let tc3 = z9 ^ tc2; + let s0 = tc3 ^ tc16; + let s3 = tc3 ^ tc11; + let s1 = s3 ^ tc16; + let tc13 = z13 ^ tc1; + // u7 <- stack + let z2 = t33 & u7; + let tc4 = z0 ^ z2; + let tc7 = z12 ^ tc4; + let tc9 = z8 ^ tc7; + let tc10 = tc8 ^ tc9; + // z14 <- stack + let tc17 = z14 ^ tc10; + let s5 = tc21 ^ tc17; + let tc26 = tc17 ^ tc20; + // z17 <- stack + let s2 = tc26 ^ z17; + // tc12 <- stack + let tc14 = tc4 ^ tc12; + let tc18 = tc13 ^ tc14; + let s6 = tc10 ^ tc18; + let s7 = z12 ^ tc18; + let s4 = tc14 ^ s3; + + state[0] = s7; + state[1] = s6; + state[2] = s5; + 
state[3] = s4; + state[4] = s3; + state[5] = s2; + state[6] = s1; + state[7] = s0; +} + +/// NOT operations that are omitted in S-box +#[inline] +fn sub_bytes_nots(state: &mut [u64]) { + debug_assert_eq!(state.len(), 8); + state[0] ^= 0xffffffffffffffff; + state[1] ^= 0xffffffffffffffff; + state[5] ^= 0xffffffffffffffff; + state[6] ^= 0xffffffffffffffff; +} + +/// Computation of the MixColumns transformation in the fixsliced representation, with different +/// rotations used according to the round number mod 4. +/// +/// Based on Käsper-Schwabe, similar to https://github.com/Ko-/aes-armcortexm. +macro_rules! define_mix_columns { + ( + $name:ident, + $name_inv:ident, + $first_rotate:path, + $second_rotate:path + ) => { + #[rustfmt::skip] + fn $name(state: &mut State) { + let (a0, a1, a2, a3, a4, a5, a6, a7) = ( + state[0], state[1], state[2], state[3], state[4], state[5], state[6], state[7] + ); + let (b0, b1, b2, b3, b4, b5, b6, b7) = ( + $first_rotate(a0), + $first_rotate(a1), + $first_rotate(a2), + $first_rotate(a3), + $first_rotate(a4), + $first_rotate(a5), + $first_rotate(a6), + $first_rotate(a7), + ); + let (c0, c1, c2, c3, c4, c5, c6, c7) = ( + a0 ^ b0, + a1 ^ b1, + a2 ^ b2, + a3 ^ b3, + a4 ^ b4, + a5 ^ b5, + a6 ^ b6, + a7 ^ b7, + ); + state[0] = b0 ^ c7 ^ $second_rotate(c0); + state[1] = b1 ^ c0 ^ c7 ^ $second_rotate(c1); + state[2] = b2 ^ c1 ^ $second_rotate(c2); + state[3] = b3 ^ c2 ^ c7 ^ $second_rotate(c3); + state[4] = b4 ^ c3 ^ c7 ^ $second_rotate(c4); + state[5] = b5 ^ c4 ^ $second_rotate(c5); + state[6] = b6 ^ c5 ^ $second_rotate(c6); + state[7] = b7 ^ c6 ^ $second_rotate(c7); + } + + #[rustfmt::skip] + fn $name_inv(state: &mut State) { + let (a0, a1, a2, a3, a4, a5, a6, a7) = ( + state[0], state[1], state[2], state[3], state[4], state[5], state[6], state[7] + ); + let (b0, b1, b2, b3, b4, b5, b6, b7) = ( + $first_rotate(a0), + $first_rotate(a1), + $first_rotate(a2), + $first_rotate(a3), + $first_rotate(a4), + $first_rotate(a5), + $first_rotate(a6), + $first_rotate(a7), + ); + let (c0, c1, c2, c3, c4, c5, c6, c7) = ( + a0 ^ b0, + a1 ^ b1, + a2 ^ b2, + a3 ^ b3, + a4 ^ b4, + a5 ^ b5, + a6 ^ b6, + a7 ^ b7, + ); + let (d0, d1, d2, d3, d4, d5, d6, d7) = ( + a0 ^ c7, + a1 ^ c0 ^ c7, + a2 ^ c1, + a3 ^ c2 ^ c7, + a4 ^ c3 ^ c7, + a5 ^ c4, + a6 ^ c5, + a7 ^ c6, + ); + let (e0, e1, e2, e3, e4, e5, e6, e7) = ( + c0 ^ d6, + c1 ^ d6 ^ d7, + c2 ^ d0 ^ d7, + c3 ^ d1 ^ d6, + c4 ^ d2 ^ d6 ^ d7, + c5 ^ d3 ^ d7, + c6 ^ d4, + c7 ^ d5, + ); + state[0] = d0 ^ e0 ^ $second_rotate(e0); + state[1] = d1 ^ e1 ^ $second_rotate(e1); + state[2] = d2 ^ e2 ^ $second_rotate(e2); + state[3] = d3 ^ e3 ^ $second_rotate(e3); + state[4] = d4 ^ e4 ^ $second_rotate(e4); + state[5] = d5 ^ e5 ^ $second_rotate(e5); + state[6] = d6 ^ e6 ^ $second_rotate(e6); + state[7] = d7 ^ e7 ^ $second_rotate(e7); + } + } +} + +define_mix_columns!( + mix_columns_0, + inv_mix_columns_0, + rotate_rows_1, + rotate_rows_2 +); + +define_mix_columns!( + mix_columns_1, + inv_mix_columns_1, + rotate_rows_and_columns_1_1, + rotate_rows_and_columns_2_2 +); + +#[cfg(not(aes_compact))] +define_mix_columns!( + mix_columns_2, + inv_mix_columns_2, + rotate_rows_and_columns_1_2, + rotate_rows_2 +); + +#[cfg(not(aes_compact))] +define_mix_columns!( + mix_columns_3, + inv_mix_columns_3, + rotate_rows_and_columns_1_3, + rotate_rows_and_columns_2_2 +); + +#[inline] +fn delta_swap_1(a: &mut u64, shift: u32, mask: u64) { + let t = (*a ^ ((*a) >> shift)) & mask; + *a ^= t ^ (t << shift); +} + +#[inline] +fn delta_swap_2(a: &mut u64, b: &mut u64, shift: 
u32, mask: u64) { + let t = (*a ^ ((*b) >> shift)) & mask; + *a ^= t; + *b ^= t << shift; +} + +/// Applies ShiftRows once on an AES state (or key). +#[cfg(any(not(aes_compact), feature = "hazmat"))] +#[inline] +fn shift_rows_1(state: &mut [u64]) { + debug_assert_eq!(state.len(), 8); + for x in state.iter_mut() { + delta_swap_1(x, 8, 0x00f000ff000f0000); + delta_swap_1(x, 4, 0x0f0f00000f0f0000); + } +} + +/// Applies ShiftRows twice on an AES state (or key). +#[inline] +fn shift_rows_2(state: &mut [u64]) { + debug_assert_eq!(state.len(), 8); + for x in state.iter_mut() { + delta_swap_1(x, 8, 0x00ff000000ff0000); + } +} + +/// Applies ShiftRows three times on an AES state (or key). +#[inline] +fn shift_rows_3(state: &mut [u64]) { + debug_assert_eq!(state.len(), 8); + for x in state.iter_mut() { + delta_swap_1(x, 8, 0x000f00ff00f00000); + delta_swap_1(x, 4, 0x0f0f00000f0f0000); + } +} + +#[inline(always)] +fn inv_shift_rows_1(state: &mut [u64]) { + shift_rows_3(state); +} + +#[inline(always)] +fn inv_shift_rows_2(state: &mut [u64]) { + shift_rows_2(state); +} + +#[cfg(not(aes_compact))] +#[inline(always)] +fn inv_shift_rows_3(state: &mut [u64]) { + shift_rows_1(state); +} + +/// XOR the columns after the S-box during the key schedule round function. +/// +/// The `idx_xor` parameter refers to the index of the previous round key that is +/// involved in the XOR computation (should be 8 and 16 for AES-128 and AES-256, +/// respectively). +/// +/// The `idx_ror` parameter refers to the rotation value, which varies between the +/// different key schedules. +fn xor_columns(rkeys: &mut [u64], offset: usize, idx_xor: usize, idx_ror: u32) { + for i in 0..8 { + let off_i = offset + i; + let rk = rkeys[off_i - idx_xor] ^ (0x000f000f000f000f & ror(rkeys[off_i], idx_ror)); + rkeys[off_i] = rk + ^ (0xfff0fff0fff0fff0 & (rk << 4)) + ^ (0xff00ff00ff00ff00 & (rk << 8)) + ^ (0xf000f000f000f000 & (rk << 12)); + } +} + +/// Bitslice four 128-bit input blocks input0, input1, input2, input3 into a 512-bit internal state. +fn bitslice(output: &mut [u64], input0: &[u8], input1: &[u8], input2: &[u8], input3: &[u8]) { + debug_assert_eq!(output.len(), 8); + debug_assert_eq!(input0.len(), 16); + debug_assert_eq!(input1.len(), 16); + debug_assert_eq!(input2.len(), 16); + debug_assert_eq!(input3.len(), 16); + + // Bitslicing is a bit index manipulation. 512 bits of data means each bit is positioned at a + // 9-bit index. 
AES data is 4 blocks, each one a 4x4 column-major matrix of bytes, so the + // index is initially ([b]lock, [c]olumn, [r]ow, [p]osition): + // b1 b0 c1 c0 r1 r0 p2 p1 p0 + // + // The desired bitsliced data groups first by bit position, then row, column, block: + // p2 p1 p0 r1 r0 c1 c0 b1 b0 + + #[rustfmt::skip] + fn read_reordered(input: &[u8]) -> u64 { + (u64::from(input[0x0]) ) | + (u64::from(input[0x1]) << 0x10) | + (u64::from(input[0x2]) << 0x20) | + (u64::from(input[0x3]) << 0x30) | + (u64::from(input[0x8]) << 0x08) | + (u64::from(input[0x9]) << 0x18) | + (u64::from(input[0xa]) << 0x28) | + (u64::from(input[0xb]) << 0x38) + } + + // Reorder each block's bytes on input + // __ __ c1 c0 r1 r0 __ __ __ => __ __ c0 r1 r0 c1 __ __ __ + // Reorder by relabeling (note the order of input) + // b1 b0 c0 __ __ __ __ __ __ => c0 b1 b0 __ __ __ __ __ __ + let mut t0 = read_reordered(&input0[0x00..0x0c]); + let mut t4 = read_reordered(&input0[0x04..0x10]); + let mut t1 = read_reordered(&input1[0x00..0x0c]); + let mut t5 = read_reordered(&input1[0x04..0x10]); + let mut t2 = read_reordered(&input2[0x00..0x0c]); + let mut t6 = read_reordered(&input2[0x04..0x10]); + let mut t3 = read_reordered(&input3[0x00..0x0c]); + let mut t7 = read_reordered(&input3[0x04..0x10]); + + // Bit Index Swap 6 <-> 0: + // __ __ b0 __ __ __ __ __ p0 => __ __ p0 __ __ __ __ __ b0 + let m0 = 0x5555555555555555; + delta_swap_2(&mut t1, &mut t0, 1, m0); + delta_swap_2(&mut t3, &mut t2, 1, m0); + delta_swap_2(&mut t5, &mut t4, 1, m0); + delta_swap_2(&mut t7, &mut t6, 1, m0); + + // Bit Index Swap 7 <-> 1: + // __ b1 __ __ __ __ __ p1 __ => __ p1 __ __ __ __ __ b1 __ + let m1 = 0x3333333333333333; + delta_swap_2(&mut t2, &mut t0, 2, m1); + delta_swap_2(&mut t3, &mut t1, 2, m1); + delta_swap_2(&mut t6, &mut t4, 2, m1); + delta_swap_2(&mut t7, &mut t5, 2, m1); + + // Bit Index Swap 8 <-> 2: + // c0 __ __ __ __ __ p2 __ __ => p2 __ __ __ __ __ c0 __ __ + let m2 = 0x0f0f0f0f0f0f0f0f; + delta_swap_2(&mut t4, &mut t0, 4, m2); + delta_swap_2(&mut t5, &mut t1, 4, m2); + delta_swap_2(&mut t6, &mut t2, 4, m2); + delta_swap_2(&mut t7, &mut t3, 4, m2); + + // Final bitsliced bit index, as desired: + // p2 p1 p0 r1 r0 c1 c0 b1 b0 + output[0] = t0; + output[1] = t1; + output[2] = t2; + output[3] = t3; + output[4] = t4; + output[5] = t5; + output[6] = t6; + output[7] = t7; +} + +/// Un-bitslice a 512-bit internal state into four 128-bit blocks of output. +fn inv_bitslice(input: &[u64]) -> BatchBlocks { + debug_assert_eq!(input.len(), 8); + + // Unbitslicing is a bit index manipulation. 512 bits of data means each bit is positioned at + // a 9-bit index. 
AES data is 4 blocks, each one a 4x4 column-major matrix of bytes, so the + // desired index for the output is ([b]lock, [c]olumn, [r]ow, [p]osition): + // b1 b0 c1 c0 r1 r0 p2 p1 p0 + // + // The initially bitsliced data groups first by bit position, then row, column, block: + // p2 p1 p0 r1 r0 c1 c0 b1 b0 + + let mut t0 = input[0]; + let mut t1 = input[1]; + let mut t2 = input[2]; + let mut t3 = input[3]; + let mut t4 = input[4]; + let mut t5 = input[5]; + let mut t6 = input[6]; + let mut t7 = input[7]; + + // TODO: these bit index swaps are identical to those in 'packing' + + // Bit Index Swap 6 <-> 0: + // __ __ p0 __ __ __ __ __ b0 => __ __ b0 __ __ __ __ __ p0 + let m0 = 0x5555555555555555; + delta_swap_2(&mut t1, &mut t0, 1, m0); + delta_swap_2(&mut t3, &mut t2, 1, m0); + delta_swap_2(&mut t5, &mut t4, 1, m0); + delta_swap_2(&mut t7, &mut t6, 1, m0); + + // Bit Index Swap 7 <-> 1: + // __ p1 __ __ __ __ __ b1 __ => __ b1 __ __ __ __ __ p1 __ + let m1 = 0x3333333333333333; + delta_swap_2(&mut t2, &mut t0, 2, m1); + delta_swap_2(&mut t3, &mut t1, 2, m1); + delta_swap_2(&mut t6, &mut t4, 2, m1); + delta_swap_2(&mut t7, &mut t5, 2, m1); + + // Bit Index Swap 8 <-> 2: + // p2 __ __ __ __ __ c0 __ __ => c0 __ __ __ __ __ p2 __ __ + let m2 = 0x0f0f0f0f0f0f0f0f; + delta_swap_2(&mut t4, &mut t0, 4, m2); + delta_swap_2(&mut t5, &mut t1, 4, m2); + delta_swap_2(&mut t6, &mut t2, 4, m2); + delta_swap_2(&mut t7, &mut t3, 4, m2); + + #[rustfmt::skip] + fn write_reordered(columns: u64, output: &mut [u8]) { + output[0x0] = (columns ) as u8; + output[0x1] = (columns >> 0x10) as u8; + output[0x2] = (columns >> 0x20) as u8; + output[0x3] = (columns >> 0x30) as u8; + output[0x8] = (columns >> 0x08) as u8; + output[0x9] = (columns >> 0x18) as u8; + output[0xa] = (columns >> 0x28) as u8; + output[0xb] = (columns >> 0x38) as u8; + } + + let mut output = BatchBlocks::default(); + // Reorder by relabeling (note the order of output) + // c0 b1 b0 __ __ __ __ __ __ => b1 b0 c0 __ __ __ __ __ __ + // Reorder each block's bytes on output + // __ __ c0 r1 r0 c1 __ __ __ => __ __ c1 c0 r1 r0 __ __ __ + write_reordered(t0, &mut output[0][0x00..0x0c]); + write_reordered(t4, &mut output[0][0x04..0x10]); + write_reordered(t1, &mut output[1][0x00..0x0c]); + write_reordered(t5, &mut output[1][0x04..0x10]); + write_reordered(t2, &mut output[2][0x00..0x0c]); + write_reordered(t6, &mut output[2][0x04..0x10]); + write_reordered(t3, &mut output[3][0x00..0x0c]); + write_reordered(t7, &mut output[3][0x04..0x10]); + + // Final AES bit index, as desired: + // b1 b0 c1 c0 r1 r0 p2 p1 p0 + output +} + +/// Copy 32-bytes within the provided slice to an 8-byte offset +fn memshift32(buffer: &mut [u64], src_offset: usize) { + debug_assert_eq!(src_offset % 8, 0); + + let dst_offset = src_offset + 8; + debug_assert!(dst_offset + 8 <= buffer.len()); + + for i in (0..8).rev() { + buffer[dst_offset + i] = buffer[src_offset + i]; + } +} + +/// XOR the round key to the internal state. The round keys are expected to be +/// pre-computed and to be packed in the fixsliced representation. 
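+///
+/// Round key `r` occupies the 8 consecutive words `rkeys[8 * r..8 * r + 8]`, so
+/// applying it looks like (illustrative indexing sketch):
+///
+/// ```ignore
+/// add_round_key(&mut state, &rkeys[(8 * r)..(8 * r + 8)]);
+/// ```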
+#[inline] +fn add_round_key(state: &mut State, rkey: &[u64]) { + debug_assert_eq!(rkey.len(), 8); + for (a, b) in state.iter_mut().zip(rkey) { + *a ^= b; + } +} + +#[inline(always)] +fn add_round_constant_bit(state: &mut [u64], bit: usize) { + state[bit] ^= 0x00000000f0000000; +} + +#[inline(always)] +fn ror(x: u64, y: u32) -> u64 { + x.rotate_right(y) +} + +#[inline(always)] +fn ror_distance(rows: u32, cols: u32) -> u32 { + (rows << 4) + (cols << 2) +} + +#[inline(always)] +fn rotate_rows_1(x: u64) -> u64 { + ror(x, ror_distance(1, 0)) +} + +#[inline(always)] +fn rotate_rows_2(x: u64) -> u64 { + ror(x, ror_distance(2, 0)) +} + +#[inline(always)] +#[rustfmt::skip] +fn rotate_rows_and_columns_1_1(x: u64) -> u64 { + (ror(x, ror_distance(1, 1)) & 0x0fff0fff0fff0fff) | + (ror(x, ror_distance(0, 1)) & 0xf000f000f000f000) +} + +#[cfg(not(aes_compact))] +#[inline(always)] +#[rustfmt::skip] +fn rotate_rows_and_columns_1_2(x: u64) -> u64 { + (ror(x, ror_distance(1, 2)) & 0x00ff00ff00ff00ff) | + (ror(x, ror_distance(0, 2)) & 0xff00ff00ff00ff00) +} + +#[cfg(not(aes_compact))] +#[inline(always)] +#[rustfmt::skip] +fn rotate_rows_and_columns_1_3(x: u64) -> u64 { + (ror(x, ror_distance(1, 3)) & 0x000f000f000f000f) | + (ror(x, ror_distance(0, 3)) & 0xfff0fff0fff0fff0) +} + +#[inline(always)] +#[rustfmt::skip] +fn rotate_rows_and_columns_2_2(x: u64) -> u64 { + (ror(x, ror_distance(2, 2)) & 0x00ff00ff00ff00ff) | + (ror(x, ror_distance(1, 2)) & 0xff00ff00ff00ff00) +} + +/// Low-level "hazmat" AES functions. +/// +/// Note: this isn't actually used in the `Aes128`/`Aes192`/`Aes256` +/// implementations in this crate, but instead provides raw access to +/// the AES round function gated under the `hazmat` crate feature. +#[cfg(feature = "hazmat")] +pub(crate) mod hazmat { + use super::{ + bitslice, inv_bitslice, inv_mix_columns_0, inv_shift_rows_1, inv_sub_bytes, mix_columns_0, + shift_rows_1, sub_bytes, sub_bytes_nots, State, + }; + use crate::{Block, Block8}; + + /// XOR the `src` block into the `dst` block in-place. + fn xor_in_place(dst: &mut Block, src: &Block) { + for (a, b) in dst.iter_mut().zip(src.as_slice()) { + *a ^= *b; + } + } + + /// Perform a bitslice operation, loading a single block. + fn bitslice_block(block: &Block) -> State { + let mut state = State::default(); + bitslice(&mut state, block, block, block, block); + state + } + + /// Perform an inverse bitslice operation, extracting a single block. + fn inv_bitslice_block(block: &mut Block, state: &State) { + block.copy_from_slice(&inv_bitslice(state)[0]); + } + + /// AES cipher (encrypt) round function. + #[inline] + pub(crate) fn cipher_round(block: &mut Block, round_key: &Block) { + let mut state = bitslice_block(block); + sub_bytes(&mut state); + sub_bytes_nots(&mut state); + shift_rows_1(&mut state); + mix_columns_0(&mut state); + inv_bitslice_block(block, &state); + xor_in_place(block, round_key); + } + + /// AES cipher (encrypt) round function: parallel version. + #[inline] + pub(crate) fn cipher_round_par(blocks: &mut Block8, round_keys: &Block8) { + for (chunk, keys) in blocks.chunks_exact_mut(4).zip(round_keys.chunks_exact(4)) { + let mut state = State::default(); + bitslice(&mut state, &chunk[0], &chunk[1], &chunk[2], &chunk[3]); + sub_bytes(&mut state); + sub_bytes_nots(&mut state); + shift_rows_1(&mut state); + mix_columns_0(&mut state); + let res = inv_bitslice(&state); + + for i in 0..4 { + chunk[i] = res[i]; + xor_in_place(&mut chunk[i], &keys[i]); + } + } + } + + /// AES cipher (encrypt) round function. 
+    #[inline]
+    pub(crate) fn equiv_inv_cipher_round(block: &mut Block, round_key: &Block) {
+        let mut state = State::default();
+        bitslice(&mut state, block, block, block, block);
+        sub_bytes_nots(&mut state);
+        inv_sub_bytes(&mut state);
+        inv_shift_rows_1(&mut state);
+        inv_mix_columns_0(&mut state);
+        inv_bitslice_block(block, &state);
+        xor_in_place(block, round_key);
+    }
+
+    /// AES equivalent inverse cipher (decrypt) round function: parallel version.
+    #[inline]
+    pub(crate) fn equiv_inv_cipher_round_par(blocks: &mut Block8, round_keys: &Block8) {
+        for (chunk, keys) in blocks.chunks_exact_mut(4).zip(round_keys.chunks_exact(4)) {
+            let mut state = State::default();
+            bitslice(&mut state, &chunk[0], &chunk[1], &chunk[2], &chunk[3]);
+            sub_bytes_nots(&mut state);
+            inv_sub_bytes(&mut state);
+            inv_shift_rows_1(&mut state);
+            inv_mix_columns_0(&mut state);
+            let res = inv_bitslice(&state);
+
+            for i in 0..4 {
+                chunk[i] = res[i];
+                xor_in_place(&mut chunk[i], &keys[i]);
+            }
+        }
+    }
+
+    /// AES mix columns function.
+    #[inline]
+    pub(crate) fn mix_columns(block: &mut Block) {
+        let mut state = bitslice_block(block);
+        mix_columns_0(&mut state);
+        inv_bitslice_block(block, &state);
+    }
+
+    /// AES inverse mix columns function.
+    #[inline]
+    pub(crate) fn inv_mix_columns(block: &mut Block) {
+        let mut state = bitslice_block(block);
+        inv_mix_columns_0(&mut state);
+        inv_bitslice_block(block, &state);
+    }
+}