diff --git a/Crypto/Cipher/AES/Primitive.hs b/Crypto/Cipher/AES/Primitive.hs index 8241013..d8a8490 100644 --- a/Crypto/Cipher/AES/Primitive.hs +++ b/Crypto/Cipher/AES/Primitive.hs @@ -128,7 +128,7 @@ newtype AESCCM = AESCCM ScrubbedBytes deriving (NFData) sizeGCM :: Int -sizeGCM = 80 +sizeGCM = 320 sizeOCB :: Int sizeOCB = 160 diff --git a/cbits/aes/gf.c b/cbits/aes/gf.c index 7dcc12c..b750e3c 100644 --- a/cbits/aes/gf.c +++ b/cbits/aes/gf.c @@ -34,33 +34,6 @@ #include #include -/* this is a really inefficient way to GF multiply. - * the alternative without hw accel is building small tables - * to speed up the multiplication. - * TODO: optimise with tables - */ -void cryptonite_aes_generic_gf_mul(block128 *a, block128 *b) -{ - uint64_t a0, a1, v0, v1; - int i, j; - - a0 = a1 = 0; - v0 = cpu_to_be64(a->q[0]); - v1 = cpu_to_be64(a->q[1]); - - for (i = 0; i < 16; i++) - for (j = 0x80; j != 0; j >>= 1) { - uint8_t x = b->b[i] & j; - a0 ^= x ? v0 : 0; - a1 ^= x ? v1 : 0; - x = (uint8_t) v1 & 1; - v1 = (v1 >> 1) | (v0 << 63); - v0 = (v0 >> 1) ^ (x ? (0xe1ULL << 56) : 0); - } - a->q[0] = cpu_to_be64(a0); - a->q[1] = cpu_to_be64(a1); -} - /* inplace GFMUL for xts mode */ void cryptonite_aes_generic_gf_mulx(block128 *a) { @@ -70,3 +43,104 @@ void cryptonite_aes_generic_gf_mulx(block128 *a) a->q[0] = cpu_to_le64(le64_to_cpu(a->q[0]) << 1) ^ r; } + +/* + * GF multiplication with Shoup's method and 4-bit table. + * + * We precompute the products of H with all 4-bit polynomials and store them in + * a 'table_4bit' array. To avoid unnecessary byte swapping, the 16 blocks are + * written to the table with qwords already converted to CPU order. Table + * indices use the reflected bit ordering, i.e. polynomials X^0, X^1, X^2, X^3 + * map to bit positions 3, 2, 1, 0 respectively. + * + * To multiply an arbitrary block with H, the input block is decomposed in 4-bit + * segments. 
We get the final result after 32 table lookups and additions, one + * for each segment, interleaving multiplication by P(X)=X^4. + */ + +/* convert block128 qwords between BE and CPU order */ +static inline void block128_cpu_swap_be(block128 *a, const block128 *b) +{ + a->q[1] = cpu_to_be64(b->q[1]); + a->q[0] = cpu_to_be64(b->q[0]); +} + +/* multiplication by P(X)=X, assuming qwords already in CPU order */ +static inline void cpu_gf_mulx(block128 *a, const block128 *b) +{ + uint64_t v0 = b->q[0]; + uint64_t v1 = b->q[1]; + a->q[1] = v1 >> 1 | v0 << 63; + a->q[0] = v0 >> 1 ^ ((0-(v1 & 1)) & 0xe100000000000000ULL); +} + +const static uint64_t r4_0[] = + { 0x0000000000000000ULL, 0x1c20000000000000ULL + , 0x3840000000000000ULL, 0x2460000000000000ULL + , 0x7080000000000000ULL, 0x6ca0000000000000ULL + , 0x48c0000000000000ULL, 0x54e0000000000000ULL + , 0xe100000000000000ULL, 0xfd20000000000000ULL + , 0xd940000000000000ULL, 0xc560000000000000ULL + , 0x9180000000000000ULL, 0x8da0000000000000ULL + , 0xa9c0000000000000ULL, 0xb5e0000000000000ULL + }; + +/* multiplication by P(X)=X^4, assuming qwords already in CPU order */ +static inline void cpu_gf_mulx4(block128 *a, const block128 *b) +{ + uint64_t v0 = b->q[0]; + uint64_t v1 = b->q[1]; + a->q[1] = v1 >> 4 | v0 << 60; + a->q[0] = v0 >> 4 ^ r4_0[v1 & 0xf]; +} + +/* initialize the 4-bit table given H */ +void cryptonite_aes_generic_hinit(table_4bit htable, const block128 *h) +{ + block128 v, *p; + int i, j; + + /* multiplication by 0 is 0 */ + block128_zero(&htable[0]); + + /* at index 8=2^3 we have H.X^0 = H */ + i = 8; + block128_cpu_swap_be(&htable[i], h); /* in CPU order */ + p = &htable[i]; + + /* for other powers of 2, repeat multiplication by P(X)=X */ + for (i = 4; i > 0; i >>= 1) + { + cpu_gf_mulx(&htable[i], p); + p = &htable[i]; + } + + /* remaining elements are linear combinations */ + for (i = 2; i < 16; i <<= 1) { + p = &htable[i]; + v = *p; + for (j = 1; j < i; j++) { + p[j] = v; + block128_xor_aligned(&p[j], 
&htable[j]); + } + } +} + +/* multiply a block with H */ +void cryptonite_aes_generic_gf_mul(block128 *a, const table_4bit htable) +{ + block128 b; + int i; + block128_zero(&b); + for (i = 15; i >= 0; i--) + { + uint8_t v = a->b[i]; + block128_xor_aligned(&b, &htable[v & 0xf]); /* high bits (reflected) */ + cpu_gf_mulx4(&b, &b); + block128_xor_aligned(&b, &htable[v >> 4]); /* low bits (reflected) */ + if (i > 0) + cpu_gf_mulx4(&b, &b); + else + block128_cpu_swap_be(a, &b); /* restore BE order when done */ + } +} diff --git a/cbits/aes/gf.h b/cbits/aes/gf.h index 21b542c..59f73bc 100644 --- a/cbits/aes/gf.h +++ b/cbits/aes/gf.h @@ -32,7 +32,11 @@ #include "aes/block128.h" -void cryptonite_aes_generic_gf_mul(block128 *a, block128 *b); +typedef block128 table_4bit[16]; + void cryptonite_aes_generic_gf_mulx(block128 *a); +void cryptonite_aes_generic_hinit(table_4bit htable, const block128 *h); +void cryptonite_aes_generic_gf_mul(block128 *a, const table_4bit htable); + #endif diff --git a/cbits/aes/x86ni.c b/cbits/aes/x86ni.c index 556bde1..590a897 100644 --- a/cbits/aes/x86ni.c +++ b/cbits/aes/x86ni.c @@ -158,33 +158,32 @@ static __m128i gfmulx(__m128i v) return v; } -static __m128i gfmul_generic(__m128i tag, __m128i h) +static __m128i gfmul_generic(__m128i tag, const table_4bit htable) { - aes_block _t, _h; + aes_block _t; _mm_store_si128((__m128i *) &_t, tag); - _mm_store_si128((__m128i *) &_h, h); - cryptonite_aes_generic_gf_mul(&_t, &_h); + cryptonite_aes_generic_gf_mul(&_t, htable); tag = _mm_load_si128((__m128i *) &_t); return tag; } #ifdef WITH_PCLMUL -__m128i (*gfmul_branch_ptr)(__m128i a, __m128i b) = gfmul_generic; -#define gfmul(a,b) ((*gfmul_branch_ptr)(a,b)) +__m128i (*gfmul_branch_ptr)(__m128i a, const table_4bit t) = gfmul_generic; +#define gfmul(a,t) ((*gfmul_branch_ptr)(a,t)) /* See Intel carry-less-multiplication-instruction-in-gcm-mode-paper.pdf * * Adapted from figure 5, with additional byte swapping so that interface * is simimar to 
cryptonite_aes_generic_gf_mul. */ -static __m128i gfmul_pclmuldq(__m128i a, __m128i b) +static __m128i gfmul_pclmuldq(__m128i a, const table_4bit htable) { - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9; + __m128i b, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9; __m128i bswap_mask = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15); a = _mm_shuffle_epi8(a, bswap_mask); - b = _mm_shuffle_epi8(b, bswap_mask); + b = _mm_loadu_si128((__m128i *) htable); tmp3 = _mm_clmulepi64_si128(a, b, 0x00); tmp4 = _mm_clmulepi64_si128(a, b, 0x10); @@ -231,28 +230,37 @@ static __m128i gfmul_pclmuldq(__m128i a, __m128i b) return _mm_shuffle_epi8(tmp6, bswap_mask); } -void cryptonite_aesni_gf_mul(block128 *a, block128 *b) +void cryptonite_aesni_hinit_pclmul(table_4bit htable, const block128 *h) { - __m128i _a, _b, _c; - _a = _mm_loadu_si128((__m128i *) a); - _b = _mm_loadu_si128((__m128i *) b); - _c = gfmul_pclmuldq(_a, _b); - _mm_storeu_si128((__m128i *) a, _c); + /* When pclmul is active we don't need to fill the table. Instead we just + * store H at index 0. It is written in reverse order, so function + * gfmul_pclmuldq will not byte-swap this value. 
+ */ + htable->q[0] = bitfn_swap64(h->q[1]); + htable->q[1] = bitfn_swap64(h->q[0]); } -void cryptonite_aesni_init_pclmul() +void cryptonite_aesni_gf_mul_pclmul(block128 *a, const table_4bit htable) +{ + __m128i _a, _b; + _a = _mm_loadu_si128((__m128i *) a); + _b = gfmul_pclmuldq(_a, htable); + _mm_storeu_si128((__m128i *) a, _b); +} + +void cryptonite_aesni_init_pclmul(void) { gfmul_branch_ptr = gfmul_pclmuldq; } #else -#define gfmul(a,b) (gfmul_generic(a,b)) +#define gfmul(a,t) (gfmul_generic(a,t)) #endif -static inline __m128i ghash_add(__m128i tag, __m128i h, __m128i m) +static inline __m128i ghash_add(__m128i tag, const table_4bit htable, __m128i m) { tag = _mm_xor_si128(tag, m); - return gfmul(tag, h); + return gfmul(tag, htable); } #define PRELOAD_ENC_KEYS128(k) \ diff --git a/cbits/aes/x86ni.h b/cbits/aes/x86ni.h index b9a568a..2ef2615 100644 --- a/cbits/aes/x86ni.h +++ b/cbits/aes/x86ni.h @@ -73,8 +73,9 @@ void cryptonite_aesni_gcm_encrypt128(uint8_t *out, aes_gcm *gcm, aes_key *key, u void cryptonite_aesni_gcm_encrypt256(uint8_t *out, aes_gcm *gcm, aes_key *key, uint8_t *in, uint32_t length); #ifdef WITH_PCLMUL -void cryptonite_aesni_init_pclmul(); -void cryptonite_aesni_gf_mul(block128 *a, block128 *b); +void cryptonite_aesni_init_pclmul(void); +void cryptonite_aesni_hinit_pclmul(table_4bit htable, const block128 *h); +void cryptonite_aesni_gf_mul_pclmul(block128 *a, const table_4bit htable); #endif #endif diff --git a/cbits/aes/x86ni_impl.c b/cbits/aes/x86ni_impl.c index d019a2a..3eedaff 100644 --- a/cbits/aes/x86ni_impl.c +++ b/cbits/aes/x86ni_impl.c @@ -191,7 +191,6 @@ void SIZED(cryptonite_aesni_gcm_encrypt)(uint8_t *output, aes_gcm *gcm, aes_key gcm->length_input += length; - __m128i h = _mm_loadu_si128((__m128i *) &gcm->h); __m128i tag = _mm_loadu_si128((__m128i *) &gcm->tag); __m128i iv = _mm_loadu_si128((__m128i *) &gcm->civ); iv = _mm_shuffle_epi8(iv, bswap_mask); @@ -209,7 +208,7 @@ void SIZED(cryptonite_aesni_gcm_encrypt)(uint8_t *output, 
aes_gcm *gcm, aes_key __m128i m = _mm_loadu_si128((__m128i *) input); m = _mm_xor_si128(m, tmp); - tag = ghash_add(tag, h, m); + tag = ghash_add(tag, gcm->htable, m); /* store it out */ _mm_storeu_si128((__m128i *) output, m); @@ -250,7 +249,7 @@ void SIZED(cryptonite_aesni_gcm_encrypt)(uint8_t *output, aes_gcm *gcm, aes_key m = _mm_xor_si128(m, tmp); m = _mm_shuffle_epi8(m, mask); - tag = ghash_add(tag, h, m); + tag = ghash_add(tag, gcm->htable, m); /* make output */ _mm_storeu_si128((__m128i *) &block.b, m); diff --git a/cbits/cryptonite_aes.c b/cbits/cryptonite_aes.c index b5ce004..8859a23 100644 --- a/cbits/cryptonite_aes.c +++ b/cbits/cryptonite_aes.c @@ -82,7 +82,7 @@ enum { ENCRYPT_CCM_128, ENCRYPT_CCM_192, ENCRYPT_CCM_256, DECRYPT_CCM_128, DECRYPT_CCM_192, DECRYPT_CCM_256, /* ghash */ - GHASH_GF_MUL, + GHASH_HINIT, GHASH_GF_MUL, }; void *cryptonite_aes_branch_table[] = { @@ -144,6 +144,7 @@ void *cryptonite_aes_branch_table[] = { [DECRYPT_CCM_192] = cryptonite_aes_generic_ccm_decrypt, [DECRYPT_CCM_256] = cryptonite_aes_generic_ccm_decrypt, /* GHASH */ + [GHASH_HINIT] = cryptonite_aes_generic_hinit, [GHASH_GF_MUL] = cryptonite_aes_generic_gf_mul, }; @@ -156,7 +157,8 @@ typedef void (*gcm_crypt_f)(uint8_t *output, aes_gcm *gcm, aes_key *key, uint8_t typedef void (*ocb_crypt_f)(uint8_t *output, aes_ocb *ocb, aes_key *key, uint8_t *input, uint32_t length); typedef void (*ccm_crypt_f)(uint8_t *output, aes_ccm *ccm, aes_key *key, uint8_t *input, uint32_t length); typedef void (*block_f)(aes_block *output, aes_key *key, aes_block *input); -typedef void (*gf_mul_f)(aes_block *a, aes_block *b); +typedef void (*hinit_f)(table_4bit htable, const block128 *h); +typedef void (*gf_mul_f)(block128 *a, const table_4bit htable); #ifdef WITH_AESNI #define GET_INIT(strength) \ @@ -191,8 +193,10 @@ typedef void (*gf_mul_f)(aes_block *a, aes_block *b); (((block_f) (cryptonite_aes_branch_table[ENCRYPT_BLOCK_128 + k->strength]))(o,k,i)) #define cryptonite_aes_decrypt_block(o,k,i) 
\ (((block_f) (cryptonite_aes_branch_table[DECRYPT_BLOCK_128 + k->strength]))(o,k,i)) -#define cryptonite_gf_mul(a,b) \ - (((gf_mul_f) (cryptonite_aes_branch_table[GHASH_GF_MUL]))(a,b)) +#define cryptonite_hinit(t,h) \ + (((hinit_f) (cryptonite_aes_branch_table[GHASH_HINIT]))(t,h)) +#define cryptonite_gf_mul(a,t) \ + (((gf_mul_f) (cryptonite_aes_branch_table[GHASH_GF_MUL]))(a,t)) #else #define GET_INIT(strenght) cryptonite_aes_generic_init #define GET_ECB_ENCRYPT(strength) cryptonite_aes_generic_encrypt_ecb @@ -210,7 +214,8 @@ typedef void (*gf_mul_f)(aes_block *a, aes_block *b); #define GET_CCM_DECRYPT(strength) cryptonite_aes_generic_ccm_decrypt #define cryptonite_aes_encrypt_block(o,k,i) cryptonite_aes_generic_encrypt_block(o,k,i) #define cryptonite_aes_decrypt_block(o,k,i) cryptonite_aes_generic_decrypt_block(o,k,i) -#define cryptonite_gf_mul(a,b) cryptonite_aes_generic_gf_mul(a,b) +#define cryptonite_hinit(t,h) cryptonite_aes_generic_hinit(t,h) +#define cryptonite_gf_mul(a,t) cryptonite_aes_generic_gf_mul(a,t) #endif #if defined(ARCH_X86) && defined(WITH_AESNI) @@ -253,7 +258,8 @@ static void initialize_table_ni(int aesni, int pclmul) if (!pclmul) return; /* GHASH */ - cryptonite_aes_branch_table[GHASH_GF_MUL] = cryptonite_aesni_gf_mul; + cryptonite_aes_branch_table[GHASH_HINIT] = cryptonite_aesni_hinit_pclmul; + cryptonite_aes_branch_table[GHASH_GF_MUL] = cryptonite_aesni_gf_mul_pclmul; + cryptonite_aesni_init_pclmul(); #endif } @@ -382,20 +388,22 @@ void cryptonite_aes_ocb_decrypt(uint8_t *output, aes_ocb *ocb, aes_key *key, uin static void gcm_ghash_add(aes_gcm *gcm, block128 *b) { block128_xor(&gcm->tag, b); - cryptonite_gf_mul(&gcm->tag, &gcm->h); + cryptonite_gf_mul(&gcm->tag, gcm->htable); } void cryptonite_aes_gcm_init(aes_gcm *gcm, aes_key *key, uint8_t *iv, uint32_t len) { + block128 h; gcm->length_aad = 0; gcm->length_input = 0; - block128_zero(&gcm->h); + block128_zero(&h); block128_zero(&gcm->tag); block128_zero(&gcm->iv); /* prepare H : 
encrypt_K(0^128) */ - cryptonite_aes_encrypt_block(&gcm->h, key, &gcm->h); + cryptonite_aes_encrypt_block(&h, key, &h); + cryptonite_hinit(gcm->htable, &h); if (len == 12) { block128_copy_bytes(&gcm->iv, iv, 12); @@ -405,15 +413,15 @@ void cryptonite_aes_gcm_init(aes_gcm *gcm, aes_key *key, uint8_t *iv, uint32_t l int i; for (; len >= 16; len -= 16, iv += 16) { block128_xor(&gcm->iv, (block128 *) iv); - cryptonite_gf_mul(&gcm->iv, &gcm->h); + cryptonite_gf_mul(&gcm->iv, gcm->htable); } if (len > 0) { block128_xor_bytes(&gcm->iv, iv, len); - cryptonite_gf_mul(&gcm->iv, &gcm->h); + cryptonite_gf_mul(&gcm->iv, gcm->htable); } for (i = 15; origlen; --i, origlen >>= 8) gcm->iv.b[i] ^= (uint8_t) origlen; - cryptonite_gf_mul(&gcm->iv, &gcm->h); + cryptonite_gf_mul(&gcm->iv, gcm->htable); } block128_copy_aligned(&gcm->civ, &gcm->iv); @@ -507,7 +515,7 @@ static void ccm_encode_ctr(block128* out, aes_ccm* ccm, unsigned int cnt) static void ccm_cbcmac_add(aes_ccm* ccm, aes_key* key, block128* bi) { block128_xor_aligned(&ccm->xi, bi); - cryptonite_aes_generic_encrypt_block(&ccm->xi, key, &ccm->xi); + cryptonite_aes_encrypt_block(&ccm->xi, key, &ccm->xi); } /* even though it is possible to support message size as large as 2^64, we support up to 2^32 only */ diff --git a/cbits/cryptonite_aes.h b/cbits/cryptonite_aes.h index 05e147d..fd648f3 100644 --- a/cbits/cryptonite_aes.h +++ b/cbits/cryptonite_aes.h @@ -45,10 +45,10 @@ typedef struct { uint8_t data[16*14*2]; } aes_key; -/* size = 4*16+2*8= 80 */ +/* size = 19*16+2*8= 320 */ typedef struct { aes_block tag; - aes_block h; + aes_block htable[16]; aes_block iv; aes_block civ; uint64_t length_aad;