
Commit f7f31d5

Added x86 SIMD optimizations to crypto datatypes.
- The v128 operations are optimized for SSE2/SSSE3/SSE4.1.
- srtp_octet_string_is_eq is optimized for SSE2. When SSE2 is not available, a pair of 32-bit accumulators speeds up the bulk of the operation; two accumulators are used to exploit the instruction-level parallelism available on most modern CPUs (see the first sketch below).
- In srtp_cleanse, use memset and keep it from being optimized away with a dummy asm statement that is declared to potentially consume the contents of the memory (see the second sketch below).
- Endian conversion functions use gcc-style intrinsics when possible.
- In base64_block_to_octet_triple, prefer memchr to strchr, as it explicitly accepts the string length, which is known at compile time.

The SIMD code uses intrinsics, which are available on all modern compilers. For MSVC, config_in_cmake.h is modified to define gcc/clang-style SSE macros based on MSVC's predefined macros: all SSE versions are enabled when MSVC indicates that AVX is enabled, and SSE2 is always enabled for x86-64, or for x86 when SSE2 floating-point math is enabled.
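A minimal sketch of the non-SSE2 fallback described above (hypothetical code, not the committed implementation; the function name and exact loop structure are assumptions). Two independent 32-bit accumulators each OR together the XOR of alternating words, so the two dependency chains can execute in parallel on a superscalar CPU.

/*
 * Illustrative sketch only, not the committed implementation.
 * Non-zero return means the octet strings differ.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int octet_string_is_eq_sketch(const uint8_t *a, const uint8_t *b, size_t len)
{
    uint32_t acc0 = 0, acc1 = 0;
    size_t i = 0;

    /* Bulk of the comparison: 8 bytes per iteration, two accumulators. */
    for (; i + 8 <= len; i += 8) {
        uint32_t a0, b0, a1, b1;
        memcpy(&a0, a + i, 4);
        memcpy(&b0, b + i, 4);
        memcpy(&a1, a + i + 4, 4);
        memcpy(&b1, b + i + 4, 4);
        acc0 |= a0 ^ b0;
        acc1 |= a1 ^ b1;
    }
    /* Remaining tail bytes, one at a time. */
    for (; i < len; i++) {
        acc0 |= (uint32_t)(a[i] ^ b[i]);
    }
    /* Non-zero result means the buffers differ somewhere. */
    return (acc0 | acc1) != 0;
}

int main(void)
{
    uint8_t x[20] = { 0 }, y[20] = { 0 };
    y[19] = 1;
    printf("equal buffers differ: %d\n", octet_string_is_eq_sketch(x, x, 20));
    printf("unequal buffers differ: %d\n", octet_string_is_eq_sketch(x, y, 20));
    return 0;
}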
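And a minimal sketch of the srtp_cleanse approach described above (again hypothetical, not the committed code). The empty asm statement takes the pointer as an input operand and clobbers "memory", so the compiler must assume the zeroed bytes may still be read and cannot drop the memset as a dead store.

/* Illustrative sketch only; the function name is hypothetical. */
#include <stddef.h>
#include <string.h>

void cleanse_sketch(void *s, size_t len)
{
    memset(s, 0, len);
#if defined(__GNUC__) || defined(__clang__)
    /* Pretend the zeroed memory is consumed, defeating dead-store elimination. */
    __asm__ __volatile__("" : : "r"(s) : "memory");
#endif
}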
1 parent 7d351de commit f7f31d5

File tree: 3 files changed, +316 -12 lines changed


config_in_cmake.h

Lines changed: 11 additions & 0 deletions
@@ -113,3 +113,14 @@
 #define inline
 #endif
 #endif
+
+/* Define gcc/clang-style SSE macros on compilers that don't define them (primarilly, MSVC). */
+#if !defined(__SSE2__) && (defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2))
+#define __SSE2__
+#endif
+#if !defined(__SSSE3__) && defined(__AVX__)
+#define __SSSE3__
+#endif
+#if !defined(__SSE4_1__) && defined(__AVX__)
+#define __SSE4_1__
+#endif
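A quick way to see the effect of the fallback block above (hypothetical standalone check, not part of the commit): repeat the same logic in a small translation unit and report which gcc/clang-style macros end up defined. Under MSVC, build it for x64 or for x86 with /arch:AVX to exercise the different branches.

/* Hypothetical check, not part of the commit. */
#include <stdio.h>

#if !defined(__SSE2__) && (defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2))
#define __SSE2__
#endif
#if !defined(__SSSE3__) && defined(__AVX__)
#define __SSSE3__
#endif
#if !defined(__SSE4_1__) && defined(__AVX__)
#define __SSE4_1__
#endif

int main(void)
{
#ifdef __SSE2__
    puts("__SSE2__ is enabled");
#endif
#ifdef __SSSE3__
    puts("__SSSE3__ is enabled");
#endif
#ifdef __SSE4_1__
    puts("__SSE4_1__ is enabled");
#endif
    return 0;
}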

crypto/include/datatypes.h

Lines changed: 67 additions & 5 deletions
@@ -62,6 +62,10 @@
 #error "Platform not recognized"
 #endif
 
+#if defined(__SSE2__)
+#include <smmintrin.h>
+#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -137,6 +141,62 @@ void v128_right_shift(v128_t *x, int shift_index);
  * (and the compiler provides better warnings).
  */
 
+#if defined(__SSE2__)
+
+#define _v128_set_to_zero(x) \
+    (_mm_storeu_si128((__m128i *)(x), _mm_setzero_si128()))
+
+#define _v128_copy(x, y) \
+    (_mm_storeu_si128((__m128i *)(x), _mm_loadu_si128((const __m128i *)(y))))
+
+#define _v128_xor(z, x, y) \
+    (_mm_storeu_si128((__m128i *)(z), \
+                      _mm_xor_si128(_mm_loadu_si128((const __m128i *)(x)), \
+                                    _mm_loadu_si128((const __m128i *)(y)))))
+
+#define _v128_and(z, x, y) \
+    (_mm_storeu_si128((__m128i *)(z), \
+                      _mm_and_si128(_mm_loadu_si128((const __m128i *)(x)), \
+                                    _mm_loadu_si128((const __m128i *)(y)))))
+
+#define _v128_or(z, x, y) \
+    (_mm_storeu_si128((__m128i *)(z), \
+                      _mm_or_si128(_mm_loadu_si128((const __m128i *)(x)), \
+                                   _mm_loadu_si128((const __m128i *)(y)))))
+
+#define _v128_complement(x) \
+    ({ \
+        __m128i _mm = _mm_undefined_si128(); \
+        _mm_storeu_si128((__m128i *)(x), \
+                         _mm_xor_si128(_mm_loadu_si128((const __m128i *)(x)), \
+                                       _mm_cmpeq_epi32(_mm, _mm))); \
+    })
+
+#if defined(__SSE4_1__)
+
+#define _v128_is_eq(x, y) \
+    ({ \
+        __m128i _mm = _mm_xor_si128(_mm_loadu_si128((const __m128i *)(x)), \
+                                    _mm_loadu_si128((const __m128i *)(y))); \
+        _mm_testz_si128(_mm, _mm); \
+    })
+
+#else /* defined(__SSE4_1__) */
+
+#define _v128_is_eq(x, y) \
+    (_mm_movemask_epi8(_mm_cmpeq_epi32( \
+        _mm_loadu_si128((const __m128i *)(x)), \
+        _mm_loadu_si128((const __m128i *)(y)))) == 0x0000ffff)
+
+#endif /* defined(__SSE4_1__) */
+
+#define _v128_xor_eq(z, x) \
+    (_mm_storeu_si128((__m128i *)(z), \
+                      _mm_xor_si128(_mm_loadu_si128((const __m128i *)(x)), \
+                                    _mm_loadu_si128((const __m128i *)(z)))))
+
+#else /* defined(__SSE2__) */
+
 #define _v128_set_to_zero(x) \
     ((x)->v32[0] = 0, (x)->v32[1] = 0, (x)->v32[2] = 0, (x)->v32[3] = 0)
 
@@ -179,6 +239,8 @@ void v128_right_shift(v128_t *x, int shift_index);
     ((z)->v64[0] ^= (x)->v64[0], (z)->v64[1] ^= (x)->v64[1])
 #endif
 
+#endif /* defined(__SSE2__) */
+
 /* NOTE! This assumes an odd ordering! */
 /* This will not be compatible directly with math on some processors */
 /* bit 0 is first 32-bit word, low order bit. in little-endian, that's
@@ -278,13 +340,11 @@ void octet_string_set_to_zero(void *s, size_t len);
 #define be64_to_cpu(x) bswap_64((x))
 #else /* WORDS_BIGENDIAN */
 
-#if defined(__GNUC__) && defined(HAVE_X86)
+#if defined(__GNUC__)
 /* Fall back. */
 static inline uint32_t be32_to_cpu(uint32_t v)
 {
-    /* optimized for x86. */
-    asm("bswap %0" : "=r"(v) : "0"(v));
-    return v;
+    return __builtin_bswap32(v);
 }
 #else /* HAVE_X86 */
 #ifdef HAVE_NETINET_IN_H
@@ -297,7 +357,9 @@ static inline uint32_t be32_to_cpu(uint32_t v)
 
 static inline uint64_t be64_to_cpu(uint64_t v)
 {
-#ifdef NO_64BIT_MATH
+#if defined(__GNUC__)
+    v = __builtin_bswap64(v);
+#elif defined(NO_64BIT_MATH)
     /* use the make64 functions to do 64-bit math */
     v = make64(htonl(low32(v)), htonl(high32(v)));
 #else /* NO_64BIT_MATH */
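For context on the two _v128_is_eq variants in the hunk above, here is a small standalone example (hypothetical, not part of the commit) that compares two 16-byte blocks both ways: _mm_cmpeq_epi32 produces all-ones in each matching 32-bit lane, so _mm_movemask_epi8 yields 0xffff exactly when every byte matched, while on SSE4.1 _mm_testz_si128 applied to the XOR of the inputs answers the same question directly.

/* Hypothetical usage sketch; assumes a build with SSE4.1 enabled (e.g. -msse4.1). */
#include <stdint.h>
#include <stdio.h>
#include <smmintrin.h>

int main(void)
{
    const uint32_t a[4] = { 1, 2, 3, 4 };
    const uint32_t b[4] = { 1, 2, 3, 4 };

    __m128i va = _mm_loadu_si128((const __m128i *)a);
    __m128i vb = _mm_loadu_si128((const __m128i *)b);

    /* SSE2 path: lane-wise compare, then collapse to a 16-bit byte mask. */
    int eq_sse2 = _mm_movemask_epi8(_mm_cmpeq_epi32(va, vb)) == 0x0000ffff;

    /* SSE4.1 path: the XOR of the inputs is all-zero exactly when they are equal. */
    __m128i x = _mm_xor_si128(va, vb);
    int eq_sse41 = _mm_testz_si128(x, x);

    printf("sse2: %d, sse4.1: %d\n", eq_sse2, eq_sse41);
    return 0;
}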
