Merge pull request #278 from ocheron/gcm-pclmul

Faster AES GCM with PCLMULQDQ
This commit is contained in:
Vincent Hanquez 2019-05-19 21:53:57 +01:00 committed by GitHub
commit 982ded8ad5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 135 additions and 39 deletions

View File

@ -22,6 +22,7 @@ module Crypto.Cipher.Types.Base
import Data.Word
import Crypto.Internal.ByteArray (Bytes, ByteArrayAccess, ByteArray)
import qualified Crypto.Internal.ByteArray as B
import Crypto.Internal.DeepSeq
import Crypto.Error
-- | Different specifier for key size in bytes
@ -36,7 +37,7 @@ type DataUnitOffset = Word32
-- | Authentication Tag for AE cipher mode
newtype AuthTag = AuthTag { unAuthTag :: Bytes }
    deriving (Show, ByteArrayAccess, NFData)

-- | Constant-time equality so comparing a computed tag against an
-- attacker-supplied one does not leak the position of the first
-- mismatching byte.
instance Eq AuthTag where
    (AuthTag a) == (AuthTag b) = B.constEq a b

View File

@ -162,14 +162,27 @@ benchBlockCipher =
iv16 = maybe (error "iv size 16") id $ makeIV key16
-- | AEAD benchmarks: seal a payload with 64 bytes of AAD using
-- ChaChaPoly1305, AES-GCM and AES-CCM, each producing (tag, ciphertext).
benchAE =
    [ bench "ChaChaPoly1305" $ nf (cp key32) (input64, input1024)
    , bench "AES-GCM" $ nf (gcm key32) (input64, input1024)
    , bench "AES-CCM" $ nf (ccm key32) (input64, input1024)
    ]
  where
    -- ChaChaPoly1305 has no AEAD instance, so drive its incremental
    -- API by hand: init -> AAD -> encrypt -> finalize.
    cp k (ini, plain) =
        let iniState = throwCryptoError $ CP.initialize k (throwCryptoError $ CP.nonce12 nonce12)
            afterAAD = CP.finalizeAAD (CP.appendAAD ini iniState)
            (out, afterEncrypt) = CP.encrypt plain afterAAD
            outtag = CP.finalize afterEncrypt
         in (outtag, out)
    gcm k (ini, plain) =
        let ctx = throwCryptoError (cipherInit k) :: AES256
            state = throwCryptoError $ aeadInit AEAD_GCM ctx nonce12
         in aeadSimpleEncrypt state ini plain 16
    ccm k (ini, plain) =
        let ctx = throwCryptoError (cipherInit k) :: AES256
            -- CCM parameters must be fixed up front: 1024-byte message,
            -- 16-byte tag (M16), 3-byte length field (L3).
            mode = AEAD_CCM 1024 CCM_M16 CCM_L3
            state = throwCryptoError $ aeadInit mode ctx nonce12
         in aeadSimpleEncrypt state ini plain 16
    input64 = B.replicate 64 0
    input1024 = B.replicate 1024 0

View File

@ -39,7 +39,7 @@
* to speed up the multiplication.
* TODO: optimise with tables
*/
void cryptonite_gf_mul(block128 *a, block128 *b)
void cryptonite_aes_generic_gf_mul(block128 *a, block128 *b)
{
uint64_t a0, a1, v0, v1;
int i, j;
@ -62,7 +62,7 @@ void cryptonite_gf_mul(block128 *a, block128 *b)
}
/* inplace GFMUL for xts mode */
void cryptonite_gf_mulx(block128 *a)
void cryptonite_aes_generic_gf_mulx(block128 *a)
{
const uint64_t gf_mask = cpu_to_le64(0x8000000000000000ULL);
uint64_t r = ((a->q[1] & gf_mask) ? cpu_to_le64(0x87) : 0);

View File

@ -32,7 +32,7 @@
#include "aes/block128.h"
void cryptonite_gf_mul(block128 *a, block128 *b);
void cryptonite_gf_mulx(block128 *a);
void cryptonite_aes_generic_gf_mul(block128 *a, block128 *b);
void cryptonite_aes_generic_gf_mulx(block128 *a);
#endif

View File

@ -35,6 +35,7 @@
#include <string.h>
#include <cryptonite_aes.h>
#include <cryptonite_cpu.h>
#include <aes/gf.h>
#include <aes/x86ni.h>
#include <aes/block128.h>
@ -157,40 +158,103 @@ static __m128i gfmulx(__m128i v)
return v;
}
/* Portable GF(2^128) multiplication as used by GHASH: computes
 * a <- a * b in the GCM field, one bit of b at a time.
 * 0xe1 << 56 is the GCM reduction constant (the field polynomial
 * x^128 + x^7 + x^2 + x + 1 in GHASH's reflected bit order). */
static void unopt_gf_mul(block128 *a, block128 *b)
{
	uint64_t a0, a1, v0, v1;
	int i, j;

	/* accumulator for the product */
	a0 = a1 = 0;
	/* v = a, byte-swapped into host order so the shifts below work */
	v0 = cpu_to_be64(a->q[0]);
	v1 = cpu_to_be64(a->q[1]);

	/* scan b bit by bit, most significant bit of each byte first */
	for (i = 0; i < 16; i++)
		for (j = 0x80; j != 0; j >>= 1) {
			uint8_t x = b->b[i] & j;
			/* if the current bit of b is set, add (xor) v into the product;
			 * the branchless ?: keeps the access pattern data-independent */
			a0 ^= x ? v0 : 0;
			a1 ^= x ? v1 : 0;
			/* v <- v * x (one-bit right shift across the 128-bit value),
			 * folding the carried-out bit back in with the reduction constant */
			x = (uint8_t) v1 & 1;
			v1 = (v1 >> 1) | (v0 << 63);
			v0 = (v0 >> 1) ^ (x ? (0xe1ULL << 56) : 0);
		}

	/* store the result back into a, restoring big-endian layout */
	a->q[0] = cpu_to_be64(a0);
	a->q[1] = cpu_to_be64(a1);
}
static __m128i ghash_add(__m128i tag, __m128i h, __m128i m)
static __m128i gfmul_generic(__m128i tag, __m128i h)
{
aes_block _t, _h;
tag = _mm_xor_si128(tag, m);
_mm_store_si128((__m128i *) &_t, tag);
_mm_store_si128((__m128i *) &_h, h);
unopt_gf_mul(&_t, &_h);
cryptonite_aes_generic_gf_mul(&_t, &_h);
tag = _mm_load_si128((__m128i *) &_t);
return tag;
}
#ifdef WITH_PCLMUL
__m128i (*gfmul_branch_ptr)(__m128i a, __m128i b) = gfmul_generic;
#define gfmul(a,b) ((*gfmul_branch_ptr)(a,b))
/* See Intel carry-less-multiplication-instruction-in-gcm-mode-paper.pdf
 *
 * Adapted from figure 5, with additional byte swapping so that interface
 * is similar to cryptonite_aes_generic_gf_mul.
 *
 * Returns a * b in the GHASH GF(2^128) field using the PCLMULQDQ
 * instruction.  (Unused locals tmp0/tmp1 from the original figure
 * have been dropped.) */
static __m128i gfmul_pclmuldq(__m128i a, __m128i b)
{
	__m128i tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9;

	/* GHASH state is kept big-endian; reverse the bytes so the
	 * carry-less multiply sees the bits in the order figure 5 expects */
	__m128i bswap_mask = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15);
	a = _mm_shuffle_epi8(a, bswap_mask);
	b = _mm_shuffle_epi8(b, bswap_mask);

	/* 128x128 -> 256 bit carry-less multiply via four 64x64 products */
	tmp3 = _mm_clmulepi64_si128(a, b, 0x00);
	tmp4 = _mm_clmulepi64_si128(a, b, 0x10);
	tmp5 = _mm_clmulepi64_si128(a, b, 0x01);
	tmp6 = _mm_clmulepi64_si128(a, b, 0x11);

	/* fold the two cross products into the low (tmp3) and high (tmp6) halves */
	tmp4 = _mm_xor_si128(tmp4, tmp5);
	tmp5 = _mm_slli_si128(tmp4, 8);
	tmp4 = _mm_srli_si128(tmp4, 8);
	tmp3 = _mm_xor_si128(tmp3, tmp5);
	tmp6 = _mm_xor_si128(tmp6, tmp4);

	/* shift the 256-bit product left by one bit (compensates for
	 * GHASH's reflected bit order), carrying across the 32-bit lanes */
	tmp7 = _mm_srli_epi32(tmp3, 31);
	tmp8 = _mm_srli_epi32(tmp6, 31);
	tmp3 = _mm_slli_epi32(tmp3, 1);
	tmp6 = _mm_slli_epi32(tmp6, 1);
	tmp9 = _mm_srli_si128(tmp7, 12);
	tmp8 = _mm_slli_si128(tmp8, 4);
	tmp7 = _mm_slli_si128(tmp7, 4);
	tmp3 = _mm_or_si128(tmp3, tmp7);
	tmp6 = _mm_or_si128(tmp6, tmp8);
	tmp6 = _mm_or_si128(tmp6, tmp9);

	/* reduce modulo the field polynomial x^128 + x^7 + x^2 + x + 1:
	 * first the left-shift phase... */
	tmp7 = _mm_slli_epi32(tmp3, 31);
	tmp8 = _mm_slli_epi32(tmp3, 30);
	tmp9 = _mm_slli_epi32(tmp3, 25);
	tmp7 = _mm_xor_si128(tmp7, tmp8);
	tmp7 = _mm_xor_si128(tmp7, tmp9);
	tmp8 = _mm_srli_si128(tmp7, 4);
	tmp7 = _mm_slli_si128(tmp7, 12);
	tmp3 = _mm_xor_si128(tmp3, tmp7);

	/* ...then the right-shift phase, and fold into the high half */
	tmp2 = _mm_srli_epi32(tmp3, 1);
	tmp4 = _mm_srli_epi32(tmp3, 2);
	tmp5 = _mm_srli_epi32(tmp3, 7);
	tmp2 = _mm_xor_si128(tmp2, tmp4);
	tmp2 = _mm_xor_si128(tmp2, tmp5);
	tmp2 = _mm_xor_si128(tmp2, tmp8);
	tmp3 = _mm_xor_si128(tmp3, tmp2);
	tmp6 = _mm_xor_si128(tmp6, tmp3);

	/* restore big-endian byte order for the caller */
	return _mm_shuffle_epi8(tmp6, bswap_mask);
}
/* PCLMULQDQ-backed GHASH multiplication with the same in-place interface
 * as cryptonite_aes_generic_gf_mul: a <- a * b in GF(2^128). */
void cryptonite_aesni_gf_mul(block128 *a, block128 *b)
{
	__m128i va = _mm_loadu_si128((__m128i *) a);
	__m128i vb = _mm_loadu_si128((__m128i *) b);

	_mm_storeu_si128((__m128i *) a, gfmul_pclmuldq(va, vb));
}
/* Switch the gfmul() dispatch pointer from the generic fallback to the
 * PCLMULQDQ implementation.  Presumably called once after runtime CPU
 * detection confirms PCLMUL support — confirm against initialize_table_ni.
 * Declared (void) rather than the K&R-style () so the definition is a
 * proper prototype. */
void cryptonite_aesni_init_pclmul(void)
{
	gfmul_branch_ptr = gfmul_pclmuldq;
}
#else
#define gfmul(a,b) (gfmul_generic(a,b))
#endif
/* One GHASH iteration: fold block m into the running tag, then multiply
 * by the hash key h (through the gfmul dispatch macro). */
static inline __m128i ghash_add(__m128i tag, __m128i h, __m128i m)
{
	return gfmul(_mm_xor_si128(tag, m), h);
}
#define PRELOAD_ENC_KEYS128(k) \
__m128i K0 = _mm_loadu_si128(((__m128i *) k)+0); \
__m128i K1 = _mm_loadu_si128(((__m128i *) k)+1); \

View File

@ -72,7 +72,10 @@ void cryptonite_aesni_encrypt_xts256(aes_block *out, aes_key *key1, aes_key *key
void cryptonite_aesni_gcm_encrypt128(uint8_t *out, aes_gcm *gcm, aes_key *key, uint8_t *in, uint32_t length);
void cryptonite_aesni_gcm_encrypt256(uint8_t *out, aes_gcm *gcm, aes_key *key, uint8_t *in, uint32_t length);
void gf_mul_x86ni(block128 *res, block128 *a_, block128 *b_);
#ifdef WITH_PCLMUL
void cryptonite_aesni_init_pclmul();
void cryptonite_aesni_gf_mul(block128 *a, block128 *b);
#endif
#endif

View File

@ -81,6 +81,8 @@ enum {
/* ccm */
ENCRYPT_CCM_128, ENCRYPT_CCM_192, ENCRYPT_CCM_256,
DECRYPT_CCM_128, DECRYPT_CCM_192, DECRYPT_CCM_256,
/* ghash */
GHASH_GF_MUL,
};
void *cryptonite_aes_branch_table[] = {
@ -141,6 +143,8 @@ void *cryptonite_aes_branch_table[] = {
[DECRYPT_CCM_128] = cryptonite_aes_generic_ccm_decrypt,
[DECRYPT_CCM_192] = cryptonite_aes_generic_ccm_decrypt,
[DECRYPT_CCM_256] = cryptonite_aes_generic_ccm_decrypt,
/* GHASH */
[GHASH_GF_MUL] = cryptonite_aes_generic_gf_mul,
};
typedef void (*init_f)(aes_key *, uint8_t *, uint8_t);
@ -152,6 +156,7 @@ typedef void (*gcm_crypt_f)(uint8_t *output, aes_gcm *gcm, aes_key *key, uint8_t
typedef void (*ocb_crypt_f)(uint8_t *output, aes_ocb *ocb, aes_key *key, uint8_t *input, uint32_t length);
typedef void (*ccm_crypt_f)(uint8_t *output, aes_ccm *ccm, aes_key *key, uint8_t *input, uint32_t length);
typedef void (*block_f)(aes_block *output, aes_key *key, aes_block *input);
typedef void (*gf_mul_f)(aes_block *a, aes_block *b);
#ifdef WITH_AESNI
#define GET_INIT(strength) \
@ -186,6 +191,8 @@ typedef void (*block_f)(aes_block *output, aes_key *key, aes_block *input);
(((block_f) (cryptonite_aes_branch_table[ENCRYPT_BLOCK_128 + k->strength]))(o,k,i))
#define cryptonite_aes_decrypt_block(o,k,i) \
(((block_f) (cryptonite_aes_branch_table[DECRYPT_BLOCK_128 + k->strength]))(o,k,i))
#define cryptonite_gf_mul(a,b) \
(((gf_mul_f) (cryptonite_aes_branch_table[GHASH_GF_MUL]))(a,b))
#else
#define GET_INIT(strenght) cryptonite_aes_generic_init
#define GET_ECB_ENCRYPT(strength) cryptonite_aes_generic_encrypt_ecb
@ -203,6 +210,7 @@ typedef void (*block_f)(aes_block *output, aes_key *key, aes_block *input);
#define GET_CCM_DECRYPT(strength) cryptonite_aes_generic_ccm_decrypt
#define cryptonite_aes_encrypt_block(o,k,i) cryptonite_aes_generic_encrypt_block(o,k,i)
#define cryptonite_aes_decrypt_block(o,k,i) cryptonite_aes_generic_decrypt_block(o,k,i)
#define cryptonite_gf_mul(a,b) cryptonite_aes_generic_gf_mul(a,b)
#endif
#if defined(ARCH_X86) && defined(WITH_AESNI)
@ -241,6 +249,13 @@ static void initialize_table_ni(int aesni, int pclmul)
cryptonite_aes_branch_table[ENCRYPT_OCB_128] = cryptonite_aesni_ocb_encrypt128;
cryptonite_aes_branch_table[ENCRYPT_OCB_256] = cryptonite_aesni_ocb_encrypt256;
*/
#ifdef WITH_PCLMUL
if (!pclmul)
return;
/* GHASH */
cryptonite_aes_branch_table[GHASH_GF_MUL] = cryptonite_aesni_gf_mul;
cryptonite_aesni_init_pclmul();
#endif
}
#endif
@ -761,9 +776,9 @@ void cryptonite_aes_generic_encrypt_xts(aes_block *output, aes_key *k1, aes_key
/* TO OPTIMISE: this is really inefficient way to do that */
while (spoint-- > 0)
cryptonite_gf_mulx(&tweak);
cryptonite_aes_generic_gf_mulx(&tweak);
for ( ; nb_blocks-- > 0; input++, output++, cryptonite_gf_mulx(&tweak)) {
for ( ; nb_blocks-- > 0; input++, output++, cryptonite_aes_generic_gf_mulx(&tweak)) {
block128_vxor(&block, input, &tweak);
cryptonite_aes_encrypt_block(&block, k1, &block);
block128_vxor(output, &block, &tweak);
@ -781,9 +796,9 @@ void cryptonite_aes_generic_decrypt_xts(aes_block *output, aes_key *k1, aes_key
/* TO OPTIMISE: this is really inefficient way to do that */
while (spoint-- > 0)
cryptonite_gf_mulx(&tweak);
cryptonite_aes_generic_gf_mulx(&tweak);
for ( ; nb_blocks-- > 0; input++, output++, cryptonite_gf_mulx(&tweak)) {
for ( ; nb_blocks-- > 0; input++, output++, cryptonite_aes_generic_gf_mulx(&tweak)) {
block128_vxor(&block, input, &tweak);
cryptonite_aes_decrypt_block(&block, k1, &block);
block128_vxor(output, &block, &tweak);