Merge pull request #280 from ocheron/gcm-small-table

More optimizations for AES GCM and CCM
This commit is contained in:
Olivier Chéron 2019-06-15 09:27:48 +02:00
commit f121d1b8d1
8 changed files with 162 additions and 68 deletions

View File

@ -128,7 +128,7 @@ newtype AESCCM = AESCCM ScrubbedBytes
deriving (NFData) deriving (NFData)
sizeGCM :: Int sizeGCM :: Int
sizeGCM = 80 sizeGCM = 320
sizeOCB :: Int sizeOCB :: Int
sizeOCB = 160 sizeOCB = 160

View File

@ -34,33 +34,6 @@
#include <aes/gf.h> #include <aes/gf.h>
#include <aes/x86ni.h> #include <aes/x86ni.h>
/* this is a really inefficient way to GF multiply.
* the alternative without hw accel is building small tables
* to speed up the multiplication.
* TODO: optimise with tables
*/
/* Bit-by-bit GF(2^128) multiplication for GHASH: a = a * b.
 * Walks the 128 bits of b MSB-first; (v0,v1) holds a copy of a that is
 * shifted right one bit per step and reduced with the GHASH polynomial
 * constant (0xE1 << 56) whenever a bit falls off the low end.
 * (a0,a1) accumulates the XOR of every shifted copy selected by a set
 * bit of b. Constant number of iterations regardless of operand values. */
void cryptonite_aes_generic_gf_mul(block128 *a, block128 *b)
{
uint64_t a0, a1, v0, v1;
int i, j;
a0 = a1 = 0;
/* load the qwords of a converted to big-endian bit/byte order */
v0 = cpu_to_be64(a->q[0]);
v1 = cpu_to_be64(a->q[1]);
for (i = 0; i < 16; i++)
for (j = 0x80; j != 0; j >>= 1) {
uint8_t x = b->b[i] & j;
/* conditionally accumulate the current shifted copy of a */
a0 ^= x ? v0 : 0;
a1 ^= x ? v1 : 0;
/* shift (v0,v1) right by one; reduce if a bit was shifted out */
x = (uint8_t) v1 & 1;
v1 = (v1 >> 1) | (v0 << 63);
v0 = (v0 >> 1) ^ (x ? (0xe1ULL << 56) : 0);
}
/* store the product back in big-endian order */
a->q[0] = cpu_to_be64(a0);
a->q[1] = cpu_to_be64(a1);
}
/* inplace GFMUL for xts mode */ /* inplace GFMUL for xts mode */
void cryptonite_aes_generic_gf_mulx(block128 *a) void cryptonite_aes_generic_gf_mulx(block128 *a)
{ {
@ -70,3 +43,104 @@ void cryptonite_aes_generic_gf_mulx(block128 *a)
a->q[0] = cpu_to_le64(le64_to_cpu(a->q[0]) << 1) ^ r; a->q[0] = cpu_to_le64(le64_to_cpu(a->q[0]) << 1) ^ r;
} }
/*
* GF multiplication with Shoup's method and 4-bit table.
*
* We precompute the products of H with all 4-bit polynomials and store them in
* a 'table_4bit' array. To avoid unnecessary byte swapping, the 16 blocks are
* written to the table with qwords already converted to CPU order. Table
* indices use the reflected bit ordering, i.e. polynomials X^0, X^1, X^2, X^3
* map to bit positions 3, 2, 1, 0 respectively.
*
* To multiply an arbitrary block with H, the input block is decomposed in 4-bit
* segments. We get the final result after 32 table lookups and additions, one
* for each segment, interleaving multiplication by P(X)=X^4.
*/
/* Byte-swap both qwords of a block between big-endian and CPU order
 * (the conversion is its own inverse, so one helper serves both ways). */
static inline void block128_cpu_swap_be(block128 *a, const block128 *b)
{
    a->q[0] = cpu_to_be64(b->q[0]);
    a->q[1] = cpu_to_be64(b->q[1]);
}
/* Multiply b by P(X)=X into a, qwords assumed already in CPU order.
 * A one-bit right shift across the 128-bit value; when the dropped bit
 * was set, reduce with the GHASH constant 0xE1 << 56. */
static inline void cpu_gf_mulx(block128 *a, const block128 *b)
{
    const uint64_t hi = b->q[0];
    const uint64_t lo = b->q[1];
    const uint64_t reduce = (0 - (lo & 1)) & 0xe100000000000000ULL;
    a->q[1] = (lo >> 1) | (hi << 63);
    a->q[0] = (hi >> 1) ^ reduce;
}
/* Reduction constants for a 4-bit right shift: r4_0[n] is the value to
 * XOR into the high qword when the 4-bit polynomial n is shifted out of
 * the low end (precomputed multiples of the GHASH constant 0xE1 << 56).
 * Note: storage-class specifier placed first (`static const`), as the
 * `const static` order is obsolescent per C11 6.11.5. */
static const uint64_t r4_0[] =
{ 0x0000000000000000ULL, 0x1c20000000000000ULL
, 0x3840000000000000ULL, 0x2460000000000000ULL
, 0x7080000000000000ULL, 0x6ca0000000000000ULL
, 0x48c0000000000000ULL, 0x54e0000000000000ULL
, 0xe100000000000000ULL, 0xfd20000000000000ULL
, 0xd940000000000000ULL, 0xc560000000000000ULL
, 0x9180000000000000ULL, 0x8da0000000000000ULL
, 0xa9c0000000000000ULL, 0xb5e0000000000000ULL
};
/* Multiply b by P(X)=X^4 into a, qwords assumed already in CPU order.
 * A four-bit right shift across the 128 bits; the four bits shifted out
 * are folded back in via the precomputed reduction table r4_0. */
static inline void cpu_gf_mulx4(block128 *a, const block128 *b)
{
    const uint64_t hi = b->q[0];
    const uint64_t lo = b->q[1];
    a->q[1] = (lo >> 4) | (hi << 60);
    a->q[0] = (hi >> 4) ^ r4_0[lo & 0xf];
}
/* Build the 4-bit Shoup table for H: htable[n] = H * n for every 4-bit
 * polynomial n, stored with qwords in CPU order and reflected indices
 * (X^0..X^3 at bit positions 3..0). */
void cryptonite_aes_generic_hinit(table_4bit htable, const block128 *h)
{
    int i, j;

    /* index 0: product with the zero polynomial */
    block128_zero(&htable[0]);

    /* index 8 = 2^3 holds H.X^0 = H, converted to CPU qword order;
     * the remaining powers of two follow by repeated multiplication
     * by P(X)=X (chain unrolled: 8 -> 4 -> 2 -> 1) */
    block128_cpu_swap_be(&htable[8], h);
    cpu_gf_mulx(&htable[4], &htable[8]);
    cpu_gf_mulx(&htable[2], &htable[4]);
    cpu_gf_mulx(&htable[1], &htable[2]);

    /* every other index i+j (with i a power of two and 0 < j < i) is
     * the linear combination htable[i] ^ htable[j]; htable[i] itself is
     * never written inside the inner loop, so no temporary is needed */
    for (i = 2; i < 16; i <<= 1) {
        for (j = 1; j < i; j++) {
            htable[i + j] = htable[i];
            block128_xor_aligned(&htable[i + j], &htable[j]);
        }
    }
}
/* Multiply block a by H in place, using the precomputed 4-bit table.
 * The block is consumed one byte at a time from the last byte to the
 * first; each byte contributes two table lookups (one per nibble) with
 * a multiplication by P(X)=X^4 interleaved between every lookup. */
void cryptonite_aes_generic_gf_mul(block128 *a, const table_4bit htable)
{
    block128 acc;
    int i;

    block128_zero(&acc);
    for (i = 15; ; i--)
    {
        uint8_t byte = a->b[i];
        block128_xor_aligned(&acc, &htable[byte & 0xf]); /* high bits (reflected) */
        cpu_gf_mulx4(&acc, &acc);
        block128_xor_aligned(&acc, &htable[byte >> 4]);  /* low bits (reflected) */
        if (i == 0)
            break;
        cpu_gf_mulx4(&acc, &acc);
    }
    /* convert the accumulator back to big-endian qword order */
    block128_cpu_swap_be(a, &acc);
}

View File

@ -32,7 +32,11 @@
#include "aes/block128.h" #include "aes/block128.h"
void cryptonite_aes_generic_gf_mul(block128 *a, block128 *b); typedef block128 table_4bit[16];
void cryptonite_aes_generic_gf_mulx(block128 *a); void cryptonite_aes_generic_gf_mulx(block128 *a);
void cryptonite_aes_generic_hinit(table_4bit htable, const block128 *h);
void cryptonite_aes_generic_gf_mul(block128 *a, const table_4bit htable);
#endif #endif

View File

@ -158,33 +158,32 @@ static __m128i gfmulx(__m128i v)
return v; return v;
} }
static __m128i gfmul_generic(__m128i tag, __m128i h) static __m128i gfmul_generic(__m128i tag, const table_4bit htable)
{ {
aes_block _t, _h; aes_block _t;
_mm_store_si128((__m128i *) &_t, tag); _mm_store_si128((__m128i *) &_t, tag);
_mm_store_si128((__m128i *) &_h, h); cryptonite_aes_generic_gf_mul(&_t, htable);
cryptonite_aes_generic_gf_mul(&_t, &_h);
tag = _mm_load_si128((__m128i *) &_t); tag = _mm_load_si128((__m128i *) &_t);
return tag; return tag;
} }
#ifdef WITH_PCLMUL #ifdef WITH_PCLMUL
__m128i (*gfmul_branch_ptr)(__m128i a, __m128i b) = gfmul_generic; __m128i (*gfmul_branch_ptr)(__m128i a, const table_4bit t) = gfmul_generic;
#define gfmul(a,b) ((*gfmul_branch_ptr)(a,b)) #define gfmul(a,t) ((*gfmul_branch_ptr)(a,t))
/* See Intel carry-less-multiplication-instruction-in-gcm-mode-paper.pdf /* See Intel carry-less-multiplication-instruction-in-gcm-mode-paper.pdf
* *
* Adapted from figure 5, with additional byte swapping so that interface * Adapted from figure 5, with additional byte swapping so that interface
* is similar to cryptonite_aes_generic_gf_mul. * is similar to cryptonite_aes_generic_gf_mul.
*/ */
static __m128i gfmul_pclmuldq(__m128i a, __m128i b) static __m128i gfmul_pclmuldq(__m128i a, const table_4bit htable)
{ {
__m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9; __m128i b, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9;
__m128i bswap_mask = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15); __m128i bswap_mask = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15);
a = _mm_shuffle_epi8(a, bswap_mask); a = _mm_shuffle_epi8(a, bswap_mask);
b = _mm_shuffle_epi8(b, bswap_mask); b = _mm_loadu_si128((__m128i *) htable);
tmp3 = _mm_clmulepi64_si128(a, b, 0x00); tmp3 = _mm_clmulepi64_si128(a, b, 0x00);
tmp4 = _mm_clmulepi64_si128(a, b, 0x10); tmp4 = _mm_clmulepi64_si128(a, b, 0x10);
@ -231,28 +230,37 @@ static __m128i gfmul_pclmuldq(__m128i a, __m128i b)
return _mm_shuffle_epi8(tmp6, bswap_mask); return _mm_shuffle_epi8(tmp6, bswap_mask);
} }
void cryptonite_aesni_gf_mul(block128 *a, block128 *b) void cryptonite_aesni_hinit_pclmul(table_4bit htable, const block128 *h)
{ {
__m128i _a, _b, _c; /* When pclmul is active we don't need to fill the table. Instead we just
_a = _mm_loadu_si128((__m128i *) a); * store H at index 0. It is written in reverse order, so function
_b = _mm_loadu_si128((__m128i *) b); * gfmul_pclmuldq will not byte-swap this value.
_c = gfmul_pclmuldq(_a, _b); */
_mm_storeu_si128((__m128i *) a, _c); htable->q[0] = bitfn_swap64(h->q[1]);
htable->q[1] = bitfn_swap64(h->q[0]);
} }
void cryptonite_aesni_init_pclmul() void cryptonite_aesni_gf_mul_pclmul(block128 *a, const table_4bit htable)
{
__m128i _a, _b;
_a = _mm_loadu_si128((__m128i *) a);
_b = gfmul_pclmuldq(_a, htable);
_mm_storeu_si128((__m128i *) a, _b);
}
void cryptonite_aesni_init_pclmul(void)
{ {
gfmul_branch_ptr = gfmul_pclmuldq; gfmul_branch_ptr = gfmul_pclmuldq;
} }
#else #else
#define gfmul(a,b) (gfmul_generic(a,b)) #define gfmul(a,t) (gfmul_generic(a,t))
#endif #endif
static inline __m128i ghash_add(__m128i tag, __m128i h, __m128i m) static inline __m128i ghash_add(__m128i tag, const table_4bit htable, __m128i m)
{ {
tag = _mm_xor_si128(tag, m); tag = _mm_xor_si128(tag, m);
return gfmul(tag, h); return gfmul(tag, htable);
} }
#define PRELOAD_ENC_KEYS128(k) \ #define PRELOAD_ENC_KEYS128(k) \

View File

@ -73,8 +73,9 @@ void cryptonite_aesni_gcm_encrypt128(uint8_t *out, aes_gcm *gcm, aes_key *key, u
void cryptonite_aesni_gcm_encrypt256(uint8_t *out, aes_gcm *gcm, aes_key *key, uint8_t *in, uint32_t length); void cryptonite_aesni_gcm_encrypt256(uint8_t *out, aes_gcm *gcm, aes_key *key, uint8_t *in, uint32_t length);
#ifdef WITH_PCLMUL #ifdef WITH_PCLMUL
void cryptonite_aesni_init_pclmul(); void cryptonite_aesni_init_pclmul(void);
void cryptonite_aesni_gf_mul(block128 *a, block128 *b); void cryptonite_aesni_hinit_pclmul(table_4bit htable, const block128 *h);
void cryptonite_aesni_gf_mul_pclmul(block128 *a, const table_4bit htable);
#endif #endif
#endif #endif

View File

@ -191,7 +191,6 @@ void SIZED(cryptonite_aesni_gcm_encrypt)(uint8_t *output, aes_gcm *gcm, aes_key
gcm->length_input += length; gcm->length_input += length;
__m128i h = _mm_loadu_si128((__m128i *) &gcm->h);
__m128i tag = _mm_loadu_si128((__m128i *) &gcm->tag); __m128i tag = _mm_loadu_si128((__m128i *) &gcm->tag);
__m128i iv = _mm_loadu_si128((__m128i *) &gcm->civ); __m128i iv = _mm_loadu_si128((__m128i *) &gcm->civ);
iv = _mm_shuffle_epi8(iv, bswap_mask); iv = _mm_shuffle_epi8(iv, bswap_mask);
@ -209,7 +208,7 @@ void SIZED(cryptonite_aesni_gcm_encrypt)(uint8_t *output, aes_gcm *gcm, aes_key
__m128i m = _mm_loadu_si128((__m128i *) input); __m128i m = _mm_loadu_si128((__m128i *) input);
m = _mm_xor_si128(m, tmp); m = _mm_xor_si128(m, tmp);
tag = ghash_add(tag, h, m); tag = ghash_add(tag, gcm->htable, m);
/* store it out */ /* store it out */
_mm_storeu_si128((__m128i *) output, m); _mm_storeu_si128((__m128i *) output, m);
@ -250,7 +249,7 @@ void SIZED(cryptonite_aesni_gcm_encrypt)(uint8_t *output, aes_gcm *gcm, aes_key
m = _mm_xor_si128(m, tmp); m = _mm_xor_si128(m, tmp);
m = _mm_shuffle_epi8(m, mask); m = _mm_shuffle_epi8(m, mask);
tag = ghash_add(tag, h, m); tag = ghash_add(tag, gcm->htable, m);
/* make output */ /* make output */
_mm_storeu_si128((__m128i *) &block.b, m); _mm_storeu_si128((__m128i *) &block.b, m);

View File

@ -82,7 +82,7 @@ enum {
ENCRYPT_CCM_128, ENCRYPT_CCM_192, ENCRYPT_CCM_256, ENCRYPT_CCM_128, ENCRYPT_CCM_192, ENCRYPT_CCM_256,
DECRYPT_CCM_128, DECRYPT_CCM_192, DECRYPT_CCM_256, DECRYPT_CCM_128, DECRYPT_CCM_192, DECRYPT_CCM_256,
/* ghash */ /* ghash */
GHASH_GF_MUL, GHASH_HINIT, GHASH_GF_MUL,
}; };
void *cryptonite_aes_branch_table[] = { void *cryptonite_aes_branch_table[] = {
@ -144,6 +144,7 @@ void *cryptonite_aes_branch_table[] = {
[DECRYPT_CCM_192] = cryptonite_aes_generic_ccm_decrypt, [DECRYPT_CCM_192] = cryptonite_aes_generic_ccm_decrypt,
[DECRYPT_CCM_256] = cryptonite_aes_generic_ccm_decrypt, [DECRYPT_CCM_256] = cryptonite_aes_generic_ccm_decrypt,
/* GHASH */ /* GHASH */
[GHASH_HINIT] = cryptonite_aes_generic_hinit,
[GHASH_GF_MUL] = cryptonite_aes_generic_gf_mul, [GHASH_GF_MUL] = cryptonite_aes_generic_gf_mul,
}; };
@ -156,7 +157,8 @@ typedef void (*gcm_crypt_f)(uint8_t *output, aes_gcm *gcm, aes_key *key, uint8_t
typedef void (*ocb_crypt_f)(uint8_t *output, aes_ocb *ocb, aes_key *key, uint8_t *input, uint32_t length); typedef void (*ocb_crypt_f)(uint8_t *output, aes_ocb *ocb, aes_key *key, uint8_t *input, uint32_t length);
typedef void (*ccm_crypt_f)(uint8_t *output, aes_ccm *ccm, aes_key *key, uint8_t *input, uint32_t length); typedef void (*ccm_crypt_f)(uint8_t *output, aes_ccm *ccm, aes_key *key, uint8_t *input, uint32_t length);
typedef void (*block_f)(aes_block *output, aes_key *key, aes_block *input); typedef void (*block_f)(aes_block *output, aes_key *key, aes_block *input);
typedef void (*gf_mul_f)(aes_block *a, aes_block *b); typedef void (*hinit_f)(table_4bit htable, const block128 *h);
typedef void (*gf_mul_f)(block128 *a, const table_4bit htable);
#ifdef WITH_AESNI #ifdef WITH_AESNI
#define GET_INIT(strength) \ #define GET_INIT(strength) \
@ -191,8 +193,10 @@ typedef void (*gf_mul_f)(aes_block *a, aes_block *b);
(((block_f) (cryptonite_aes_branch_table[ENCRYPT_BLOCK_128 + k->strength]))(o,k,i)) (((block_f) (cryptonite_aes_branch_table[ENCRYPT_BLOCK_128 + k->strength]))(o,k,i))
#define cryptonite_aes_decrypt_block(o,k,i) \ #define cryptonite_aes_decrypt_block(o,k,i) \
(((block_f) (cryptonite_aes_branch_table[DECRYPT_BLOCK_128 + k->strength]))(o,k,i)) (((block_f) (cryptonite_aes_branch_table[DECRYPT_BLOCK_128 + k->strength]))(o,k,i))
#define cryptonite_gf_mul(a,b) \ #define cryptonite_hinit(t,h) \
(((gf_mul_f) (cryptonite_aes_branch_table[GHASH_GF_MUL]))(a,b)) (((hinit_f) (cryptonite_aes_branch_table[GHASH_HINIT]))(t,h))
#define cryptonite_gf_mul(a,t) \
(((gf_mul_f) (cryptonite_aes_branch_table[GHASH_GF_MUL]))(a,t))
#else #else
#define GET_INIT(strenght) cryptonite_aes_generic_init #define GET_INIT(strenght) cryptonite_aes_generic_init
#define GET_ECB_ENCRYPT(strength) cryptonite_aes_generic_encrypt_ecb #define GET_ECB_ENCRYPT(strength) cryptonite_aes_generic_encrypt_ecb
@ -210,7 +214,8 @@ typedef void (*gf_mul_f)(aes_block *a, aes_block *b);
#define GET_CCM_DECRYPT(strength) cryptonite_aes_generic_ccm_decrypt #define GET_CCM_DECRYPT(strength) cryptonite_aes_generic_ccm_decrypt
#define cryptonite_aes_encrypt_block(o,k,i) cryptonite_aes_generic_encrypt_block(o,k,i) #define cryptonite_aes_encrypt_block(o,k,i) cryptonite_aes_generic_encrypt_block(o,k,i)
#define cryptonite_aes_decrypt_block(o,k,i) cryptonite_aes_generic_decrypt_block(o,k,i) #define cryptonite_aes_decrypt_block(o,k,i) cryptonite_aes_generic_decrypt_block(o,k,i)
#define cryptonite_gf_mul(a,b) cryptonite_aes_generic_gf_mul(a,b) #define cryptonite_hinit(t,h) cryptonite_aes_generic_hinit(t,h)
#define cryptonite_gf_mul(a,t) cryptonite_aes_generic_gf_mul(a,t)
#endif #endif
#if defined(ARCH_X86) && defined(WITH_AESNI) #if defined(ARCH_X86) && defined(WITH_AESNI)
@ -253,7 +258,8 @@ static void initialize_table_ni(int aesni, int pclmul)
if (!pclmul) if (!pclmul)
return; return;
/* GHASH */ /* GHASH */
cryptonite_aes_branch_table[GHASH_GF_MUL] = cryptonite_aesni_gf_mul; cryptonite_aes_branch_table[GHASH_HINIT] = cryptonite_aesni_hinit_pclmul,
cryptonite_aes_branch_table[GHASH_GF_MUL] = cryptonite_aesni_gf_mul_pclmul,
cryptonite_aesni_init_pclmul(); cryptonite_aesni_init_pclmul();
#endif #endif
} }
@ -382,20 +388,22 @@ void cryptonite_aes_ocb_decrypt(uint8_t *output, aes_ocb *ocb, aes_key *key, uin
static void gcm_ghash_add(aes_gcm *gcm, block128 *b) static void gcm_ghash_add(aes_gcm *gcm, block128 *b)
{ {
block128_xor(&gcm->tag, b); block128_xor(&gcm->tag, b);
cryptonite_gf_mul(&gcm->tag, &gcm->h); cryptonite_gf_mul(&gcm->tag, gcm->htable);
} }
void cryptonite_aes_gcm_init(aes_gcm *gcm, aes_key *key, uint8_t *iv, uint32_t len) void cryptonite_aes_gcm_init(aes_gcm *gcm, aes_key *key, uint8_t *iv, uint32_t len)
{ {
block128 h;
gcm->length_aad = 0; gcm->length_aad = 0;
gcm->length_input = 0; gcm->length_input = 0;
block128_zero(&gcm->h); block128_zero(&h);
block128_zero(&gcm->tag); block128_zero(&gcm->tag);
block128_zero(&gcm->iv); block128_zero(&gcm->iv);
/* prepare H : encrypt_K(0^128) */ /* prepare H : encrypt_K(0^128) */
cryptonite_aes_encrypt_block(&gcm->h, key, &gcm->h); cryptonite_aes_encrypt_block(&h, key, &h);
cryptonite_hinit(gcm->htable, &h);
if (len == 12) { if (len == 12) {
block128_copy_bytes(&gcm->iv, iv, 12); block128_copy_bytes(&gcm->iv, iv, 12);
@ -405,15 +413,15 @@ void cryptonite_aes_gcm_init(aes_gcm *gcm, aes_key *key, uint8_t *iv, uint32_t l
int i; int i;
for (; len >= 16; len -= 16, iv += 16) { for (; len >= 16; len -= 16, iv += 16) {
block128_xor(&gcm->iv, (block128 *) iv); block128_xor(&gcm->iv, (block128 *) iv);
cryptonite_gf_mul(&gcm->iv, &gcm->h); cryptonite_gf_mul(&gcm->iv, gcm->htable);
} }
if (len > 0) { if (len > 0) {
block128_xor_bytes(&gcm->iv, iv, len); block128_xor_bytes(&gcm->iv, iv, len);
cryptonite_gf_mul(&gcm->iv, &gcm->h); cryptonite_gf_mul(&gcm->iv, gcm->htable);
} }
for (i = 15; origlen; --i, origlen >>= 8) for (i = 15; origlen; --i, origlen >>= 8)
gcm->iv.b[i] ^= (uint8_t) origlen; gcm->iv.b[i] ^= (uint8_t) origlen;
cryptonite_gf_mul(&gcm->iv, &gcm->h); cryptonite_gf_mul(&gcm->iv, gcm->htable);
} }
block128_copy_aligned(&gcm->civ, &gcm->iv); block128_copy_aligned(&gcm->civ, &gcm->iv);
@ -507,7 +515,7 @@ static void ccm_encode_ctr(block128* out, aes_ccm* ccm, unsigned int cnt)
static void ccm_cbcmac_add(aes_ccm* ccm, aes_key* key, block128* bi) static void ccm_cbcmac_add(aes_ccm* ccm, aes_key* key, block128* bi)
{ {
block128_xor_aligned(&ccm->xi, bi); block128_xor_aligned(&ccm->xi, bi);
cryptonite_aes_generic_encrypt_block(&ccm->xi, key, &ccm->xi); cryptonite_aes_encrypt_block(&ccm->xi, key, &ccm->xi);
} }
/* even though it is possible to support message size as large as 2^64, we support up to 2^32 only */ /* even though it is possible to support message size as large as 2^64, we support up to 2^32 only */

View File

@ -45,10 +45,10 @@ typedef struct {
uint8_t data[16*14*2]; uint8_t data[16*14*2];
} aes_key; } aes_key;
/* size = 4*16+2*8= 80 */ /* size = 19*16+2*8= 320 */
typedef struct { typedef struct {
aes_block tag; aes_block tag;
aes_block h; aes_block htable[16];
aes_block iv; aes_block iv;
aes_block civ; aes_block civ;
uint64_t length_aad; uint64_t length_aad;