Add GHASH implementation with PCLMULQDQ
This commit is contained in:
parent
cddbc2cef9
commit
d25e44ea61
@ -39,7 +39,7 @@
|
|||||||
* to speed up the multiplication.
|
* to speed up the multiplication.
|
||||||
* TODO: optimise with tables
|
* TODO: optimise with tables
|
||||||
*/
|
*/
|
||||||
void cryptonite_gf_mul(block128 *a, block128 *b)
|
void cryptonite_aes_generic_gf_mul(block128 *a, block128 *b)
|
||||||
{
|
{
|
||||||
uint64_t a0, a1, v0, v1;
|
uint64_t a0, a1, v0, v1;
|
||||||
int i, j;
|
int i, j;
|
||||||
@ -62,7 +62,7 @@ void cryptonite_gf_mul(block128 *a, block128 *b)
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* inplace GFMUL for xts mode */
|
/* inplace GFMUL for xts mode */
|
||||||
void cryptonite_gf_mulx(block128 *a)
|
void cryptonite_aes_generic_gf_mulx(block128 *a)
|
||||||
{
|
{
|
||||||
const uint64_t gf_mask = cpu_to_le64(0x8000000000000000ULL);
|
const uint64_t gf_mask = cpu_to_le64(0x8000000000000000ULL);
|
||||||
uint64_t r = ((a->q[1] & gf_mask) ? cpu_to_le64(0x87) : 0);
|
uint64_t r = ((a->q[1] & gf_mask) ? cpu_to_le64(0x87) : 0);
|
||||||
|
|||||||
@ -32,7 +32,7 @@
|
|||||||
|
|
||||||
#include "aes/block128.h"
|
#include "aes/block128.h"
|
||||||
|
|
||||||
void cryptonite_gf_mul(block128 *a, block128 *b);
|
void cryptonite_aes_generic_gf_mul(block128 *a, block128 *b);
|
||||||
void cryptonite_gf_mulx(block128 *a);
|
void cryptonite_aes_generic_gf_mulx(block128 *a);
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@ -158,18 +158,103 @@ static __m128i gfmulx(__m128i v)
|
|||||||
return v;
|
return v;
|
||||||
}
|
}
|
||||||
|
|
||||||
static __m128i ghash_add(__m128i tag, __m128i h, __m128i m)
|
static __m128i gfmul_generic(__m128i tag, __m128i h)
|
||||||
{
|
{
|
||||||
aes_block _t, _h;
|
aes_block _t, _h;
|
||||||
tag = _mm_xor_si128(tag, m);
|
|
||||||
|
|
||||||
_mm_store_si128((__m128i *) &_t, tag);
|
_mm_store_si128((__m128i *) &_t, tag);
|
||||||
_mm_store_si128((__m128i *) &_h, h);
|
_mm_store_si128((__m128i *) &_h, h);
|
||||||
cryptonite_gf_mul(&_t, &_h);
|
cryptonite_aes_generic_gf_mul(&_t, &_h);
|
||||||
tag = _mm_load_si128((__m128i *) &_t);
|
tag = _mm_load_si128((__m128i *) &_t);
|
||||||
return tag;
|
return tag;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef WITH_PCLMUL
|
||||||
|
|
||||||
|
__m128i (*gfmul_branch_ptr)(__m128i a, __m128i b) = gfmul_generic;
|
||||||
|
#define gfmul(a,b) ((*gfmul_branch_ptr)(a,b))
|
||||||
|
|
||||||
|
/* See Intel carry-less-multiplication-instruction-in-gcm-mode-paper.pdf
|
||||||
|
*
|
||||||
|
* Adapted from figure 5, with additional byte swapping so that interface
|
||||||
|
* is similar to cryptonite_aes_generic_gf_mul.
|
||||||
|
*/
|
||||||
|
/* Multiply a by b with PCLMULQDQ and return the 128-bit result.
 *
 * Both operands are byte-reversed on entry and the result is byte-reversed
 * again on return, using the same shuffle mask each time. In between, four
 * 64x64-bit carry-less products are combined into a 256-bit value held in
 * tmp6 (upper half) and tmp3 (lower half), that value is shifted left by one
 * bit, and the lower half is folded into the upper half.
 *
 * NOTE(review): tmp0 and tmp1 are declared but never used below.
 */
static __m128i gfmul_pclmuldq(__m128i a, __m128i b)
|
||||||
|
{
|
||||||
|
__m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9;
|
||||||
|
/* shuffle control that reverses the order of the 16 bytes */
__m128i bswap_mask = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15);
|
||||||
|
|
||||||
|
a = _mm_shuffle_epi8(a, bswap_mask);
|
||||||
|
b = _mm_shuffle_epi8(b, bswap_mask);
|
||||||
|
|
||||||
|
/* four partial products, one per combination of 64-bit halves of a and b */
tmp3 = _mm_clmulepi64_si128(a, b, 0x00);
|
||||||
|
tmp4 = _mm_clmulepi64_si128(a, b, 0x10);
|
||||||
|
tmp5 = _mm_clmulepi64_si128(a, b, 0x01);
|
||||||
|
tmp6 = _mm_clmulepi64_si128(a, b, 0x11);
|
||||||
|
|
||||||
|
/* sum the two cross products and split them across the low (tmp3) and
 * high (tmp6) halves of the 256-bit product */
tmp4 = _mm_xor_si128(tmp4, tmp5);
|
||||||
|
tmp5 = _mm_slli_si128(tmp4, 8);
|
||||||
|
tmp4 = _mm_srli_si128(tmp4, 8);
|
||||||
|
tmp3 = _mm_xor_si128(tmp3, tmp5);
|
||||||
|
tmp6 = _mm_xor_si128(tmp6, tmp4);
|
||||||
|
|
||||||
|
/* shift tmp6:tmp3 left by one bit: save the top bit of every 32-bit lane,
 * shift each lane, then OR the saved bits into the next lane up */
tmp7 = _mm_srli_epi32(tmp3, 31);
|
||||||
|
tmp8 = _mm_srli_epi32(tmp6, 31);
|
||||||
|
tmp3 = _mm_slli_epi32(tmp3, 1);
|
||||||
|
tmp6 = _mm_slli_epi32(tmp6, 1);
|
||||||
|
|
||||||
|
/* tmp9 carries the top bit of tmp3 into the lowest lane of tmp6 */
tmp9 = _mm_srli_si128(tmp7, 12);
|
||||||
|
tmp8 = _mm_slli_si128(tmp8, 4);
|
||||||
|
tmp7 = _mm_slli_si128(tmp7, 4);
|
||||||
|
tmp3 = _mm_or_si128(tmp3, tmp7);
|
||||||
|
tmp6 = _mm_or_si128(tmp6, tmp8);
|
||||||
|
tmp6 = _mm_or_si128(tmp6, tmp9);
|
||||||
|
|
||||||
|
/* first folding step on the low half (left shifts by 31, 30 and 25);
 * presumably the first reduction phase of the referenced Intel figure
 * -- TODO confirm against the paper */
tmp7 = _mm_slli_epi32(tmp3, 31);
|
||||||
|
tmp8 = _mm_slli_epi32(tmp3, 30);
|
||||||
|
tmp9 = _mm_slli_epi32(tmp3, 25);
|
||||||
|
|
||||||
|
tmp7 = _mm_xor_si128(tmp7, tmp8);
|
||||||
|
tmp7 = _mm_xor_si128(tmp7, tmp9);
|
||||||
|
/* tmp8 keeps the part shifted out of tmp7; it is XORed into tmp2 below */
tmp8 = _mm_srli_si128(tmp7, 4);
|
||||||
|
tmp7 = _mm_slli_si128(tmp7, 12);
|
||||||
|
tmp3 = _mm_xor_si128(tmp3, tmp7);
|
||||||
|
|
||||||
|
/* second folding step (right shifts by 1, 2 and 7), then XOR the folded
 * low half into the high half, which becomes the result */
tmp2 = _mm_srli_epi32(tmp3, 1);
|
||||||
|
tmp4 = _mm_srli_epi32(tmp3, 2);
|
||||||
|
tmp5 = _mm_srli_epi32(tmp3, 7);
|
||||||
|
tmp2 = _mm_xor_si128(tmp2, tmp4);
|
||||||
|
tmp2 = _mm_xor_si128(tmp2, tmp5);
|
||||||
|
tmp2 = _mm_xor_si128(tmp2, tmp8);
|
||||||
|
tmp3 = _mm_xor_si128(tmp3, tmp2);
|
||||||
|
tmp6 = _mm_xor_si128(tmp6, tmp3);
|
||||||
|
|
||||||
|
/* undo the byte reversal applied to the inputs */
return _mm_shuffle_epi8(tmp6, bswap_mask);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* In-place multiplication on block128 values: a = gfmul_pclmuldq(a, b).
 *
 * Both blocks are read with unaligned loads and the product is written back
 * over *a with an unaligned store; *b is only read.
 */
void cryptonite_aesni_gf_mul(block128 *a, block128 *b)
|
||||||
|
{
|
||||||
|
__m128i _a, _b, _c;
|
||||||
|
_a = _mm_loadu_si128((__m128i *) a);
|
||||||
|
_b = _mm_loadu_si128((__m128i *) b);
|
||||||
|
_c = gfmul_pclmuldq(_a, _b);
|
||||||
|
/* overwrite the first operand with the product */
_mm_storeu_si128((__m128i *) a, _c);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Point gfmul_branch_ptr at gfmul_pclmuldq, so that later uses of the
 * gfmul() macro call the PCLMULQDQ implementation instead of the
 * gfmul_generic default.
 */
void cryptonite_aesni_init_pclmul()
|
||||||
|
{
|
||||||
|
gfmul_branch_ptr = gfmul_pclmuldq;
|
||||||
|
}
|
||||||
|
|
||||||
|
#else
|
||||||
|
#define gfmul(a,b) (gfmul_generic(a,b))
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* Return (tag XOR m) multiplied by h.
 *
 * The multiplication goes through the gfmul() macro, so the implementation
 * used depends on how that macro is defined for this build.
 */
static inline __m128i ghash_add(__m128i tag, __m128i h, __m128i m)
|
||||||
|
{
|
||||||
|
/* mix the new block into the running tag before multiplying */
tag = _mm_xor_si128(tag, m);
|
||||||
|
return gfmul(tag, h);
|
||||||
|
}
|
||||||
|
|
||||||
#define PRELOAD_ENC_KEYS128(k) \
|
#define PRELOAD_ENC_KEYS128(k) \
|
||||||
__m128i K0 = _mm_loadu_si128(((__m128i *) k)+0); \
|
__m128i K0 = _mm_loadu_si128(((__m128i *) k)+0); \
|
||||||
__m128i K1 = _mm_loadu_si128(((__m128i *) k)+1); \
|
__m128i K1 = _mm_loadu_si128(((__m128i *) k)+1); \
|
||||||
|
|||||||
@ -72,7 +72,10 @@ void cryptonite_aesni_encrypt_xts256(aes_block *out, aes_key *key1, aes_key *key
|
|||||||
void cryptonite_aesni_gcm_encrypt128(uint8_t *out, aes_gcm *gcm, aes_key *key, uint8_t *in, uint32_t length);
|
void cryptonite_aesni_gcm_encrypt128(uint8_t *out, aes_gcm *gcm, aes_key *key, uint8_t *in, uint32_t length);
|
||||||
void cryptonite_aesni_gcm_encrypt256(uint8_t *out, aes_gcm *gcm, aes_key *key, uint8_t *in, uint32_t length);
|
void cryptonite_aesni_gcm_encrypt256(uint8_t *out, aes_gcm *gcm, aes_key *key, uint8_t *in, uint32_t length);
|
||||||
|
|
||||||
void gf_mul_x86ni(block128 *res, block128 *a_, block128 *b_);
|
#ifdef WITH_PCLMUL
|
||||||
|
void cryptonite_aesni_init_pclmul();
|
||||||
|
void cryptonite_aesni_gf_mul(block128 *a, block128 *b);
|
||||||
|
#endif
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|||||||
@ -81,6 +81,8 @@ enum {
|
|||||||
/* ccm */
|
/* ccm */
|
||||||
ENCRYPT_CCM_128, ENCRYPT_CCM_192, ENCRYPT_CCM_256,
|
ENCRYPT_CCM_128, ENCRYPT_CCM_192, ENCRYPT_CCM_256,
|
||||||
DECRYPT_CCM_128, DECRYPT_CCM_192, DECRYPT_CCM_256,
|
DECRYPT_CCM_128, DECRYPT_CCM_192, DECRYPT_CCM_256,
|
||||||
|
/* ghash */
|
||||||
|
GHASH_GF_MUL,
|
||||||
};
|
};
|
||||||
|
|
||||||
void *cryptonite_aes_branch_table[] = {
|
void *cryptonite_aes_branch_table[] = {
|
||||||
@ -141,6 +143,8 @@ void *cryptonite_aes_branch_table[] = {
|
|||||||
[DECRYPT_CCM_128] = cryptonite_aes_generic_ccm_decrypt,
|
[DECRYPT_CCM_128] = cryptonite_aes_generic_ccm_decrypt,
|
||||||
[DECRYPT_CCM_192] = cryptonite_aes_generic_ccm_decrypt,
|
[DECRYPT_CCM_192] = cryptonite_aes_generic_ccm_decrypt,
|
||||||
[DECRYPT_CCM_256] = cryptonite_aes_generic_ccm_decrypt,
|
[DECRYPT_CCM_256] = cryptonite_aes_generic_ccm_decrypt,
|
||||||
|
/* GHASH */
|
||||||
|
[GHASH_GF_MUL] = cryptonite_aes_generic_gf_mul,
|
||||||
};
|
};
|
||||||
|
|
||||||
typedef void (*init_f)(aes_key *, uint8_t *, uint8_t);
|
typedef void (*init_f)(aes_key *, uint8_t *, uint8_t);
|
||||||
@ -152,6 +156,7 @@ typedef void (*gcm_crypt_f)(uint8_t *output, aes_gcm *gcm, aes_key *key, uint8_t
|
|||||||
typedef void (*ocb_crypt_f)(uint8_t *output, aes_ocb *ocb, aes_key *key, uint8_t *input, uint32_t length);
|
typedef void (*ocb_crypt_f)(uint8_t *output, aes_ocb *ocb, aes_key *key, uint8_t *input, uint32_t length);
|
||||||
typedef void (*ccm_crypt_f)(uint8_t *output, aes_ccm *ccm, aes_key *key, uint8_t *input, uint32_t length);
|
typedef void (*ccm_crypt_f)(uint8_t *output, aes_ccm *ccm, aes_key *key, uint8_t *input, uint32_t length);
|
||||||
typedef void (*block_f)(aes_block *output, aes_key *key, aes_block *input);
|
typedef void (*block_f)(aes_block *output, aes_key *key, aes_block *input);
|
||||||
|
typedef void (*gf_mul_f)(aes_block *a, aes_block *b);
|
||||||
|
|
||||||
#ifdef WITH_AESNI
|
#ifdef WITH_AESNI
|
||||||
#define GET_INIT(strength) \
|
#define GET_INIT(strength) \
|
||||||
@ -186,6 +191,8 @@ typedef void (*block_f)(aes_block *output, aes_key *key, aes_block *input);
|
|||||||
(((block_f) (cryptonite_aes_branch_table[ENCRYPT_BLOCK_128 + k->strength]))(o,k,i))
|
(((block_f) (cryptonite_aes_branch_table[ENCRYPT_BLOCK_128 + k->strength]))(o,k,i))
|
||||||
#define cryptonite_aes_decrypt_block(o,k,i) \
|
#define cryptonite_aes_decrypt_block(o,k,i) \
|
||||||
(((block_f) (cryptonite_aes_branch_table[DECRYPT_BLOCK_128 + k->strength]))(o,k,i))
|
(((block_f) (cryptonite_aes_branch_table[DECRYPT_BLOCK_128 + k->strength]))(o,k,i))
|
||||||
|
#define cryptonite_gf_mul(a,b) \
|
||||||
|
(((gf_mul_f) (cryptonite_aes_branch_table[GHASH_GF_MUL]))(a,b))
|
||||||
#else
|
#else
|
||||||
#define GET_INIT(strenght) cryptonite_aes_generic_init
|
#define GET_INIT(strenght) cryptonite_aes_generic_init
|
||||||
#define GET_ECB_ENCRYPT(strength) cryptonite_aes_generic_encrypt_ecb
|
#define GET_ECB_ENCRYPT(strength) cryptonite_aes_generic_encrypt_ecb
|
||||||
@ -203,6 +210,7 @@ typedef void (*block_f)(aes_block *output, aes_key *key, aes_block *input);
|
|||||||
#define GET_CCM_DECRYPT(strength) cryptonite_aes_generic_ccm_decrypt
|
#define GET_CCM_DECRYPT(strength) cryptonite_aes_generic_ccm_decrypt
|
||||||
#define cryptonite_aes_encrypt_block(o,k,i) cryptonite_aes_generic_encrypt_block(o,k,i)
|
#define cryptonite_aes_encrypt_block(o,k,i) cryptonite_aes_generic_encrypt_block(o,k,i)
|
||||||
#define cryptonite_aes_decrypt_block(o,k,i) cryptonite_aes_generic_decrypt_block(o,k,i)
|
#define cryptonite_aes_decrypt_block(o,k,i) cryptonite_aes_generic_decrypt_block(o,k,i)
|
||||||
|
#define cryptonite_gf_mul(a,b) cryptonite_aes_generic_gf_mul(a,b)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(ARCH_X86) && defined(WITH_AESNI)
|
#if defined(ARCH_X86) && defined(WITH_AESNI)
|
||||||
@ -241,6 +249,13 @@ static void initialize_table_ni(int aesni, int pclmul)
|
|||||||
cryptonite_aes_branch_table[ENCRYPT_OCB_128] = cryptonite_aesni_ocb_encrypt128;
|
cryptonite_aes_branch_table[ENCRYPT_OCB_128] = cryptonite_aesni_ocb_encrypt128;
|
||||||
cryptonite_aes_branch_table[ENCRYPT_OCB_256] = cryptonite_aesni_ocb_encrypt256;
|
cryptonite_aes_branch_table[ENCRYPT_OCB_256] = cryptonite_aesni_ocb_encrypt256;
|
||||||
*/
|
*/
|
||||||
|
#ifdef WITH_PCLMUL
|
||||||
|
if (!pclmul)
|
||||||
|
return;
|
||||||
|
/* GHASH */
|
||||||
|
cryptonite_aes_branch_table[GHASH_GF_MUL] = cryptonite_aesni_gf_mul;
|
||||||
|
cryptonite_aesni_init_pclmul();
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@ -761,9 +776,9 @@ void cryptonite_aes_generic_encrypt_xts(aes_block *output, aes_key *k1, aes_key
|
|||||||
|
|
||||||
/* TO OPTIMISE: this is really inefficient way to do that */
|
/* TO OPTIMISE: this is really inefficient way to do that */
|
||||||
while (spoint-- > 0)
|
while (spoint-- > 0)
|
||||||
cryptonite_gf_mulx(&tweak);
|
cryptonite_aes_generic_gf_mulx(&tweak);
|
||||||
|
|
||||||
for ( ; nb_blocks-- > 0; input++, output++, cryptonite_gf_mulx(&tweak)) {
|
for ( ; nb_blocks-- > 0; input++, output++, cryptonite_aes_generic_gf_mulx(&tweak)) {
|
||||||
block128_vxor(&block, input, &tweak);
|
block128_vxor(&block, input, &tweak);
|
||||||
cryptonite_aes_encrypt_block(&block, k1, &block);
|
cryptonite_aes_encrypt_block(&block, k1, &block);
|
||||||
block128_vxor(output, &block, &tweak);
|
block128_vxor(output, &block, &tweak);
|
||||||
@ -781,9 +796,9 @@ void cryptonite_aes_generic_decrypt_xts(aes_block *output, aes_key *k1, aes_key
|
|||||||
|
|
||||||
/* TO OPTIMISE: this is really inefficient way to do that */
|
/* TO OPTIMISE: this is really inefficient way to do that */
|
||||||
while (spoint-- > 0)
|
while (spoint-- > 0)
|
||||||
cryptonite_gf_mulx(&tweak);
|
cryptonite_aes_generic_gf_mulx(&tweak);
|
||||||
|
|
||||||
for ( ; nb_blocks-- > 0; input++, output++, cryptonite_gf_mulx(&tweak)) {
|
for ( ; nb_blocks-- > 0; input++, output++, cryptonite_aes_generic_gf_mulx(&tweak)) {
|
||||||
block128_vxor(&block, input, &tweak);
|
block128_vxor(&block, input, &tweak);
|
||||||
cryptonite_aes_decrypt_block(&block, k1, &block);
|
cryptonite_aes_decrypt_block(&block, k1, &block);
|
||||||
block128_vxor(output, &block, &tweak);
|
block128_vxor(output, &block, &tweak);
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user