
Commit f7f31d5

Added x86 SIMD optimizations to crypto datatypes.
- The v128 operations are optimized for SSE2/SSSE3/SSE4.1.
- srtp_octet_string_is_eq is optimized for SSE2. When SSE2 is not available, a pair of 32-bit accumulators speeds up the bulk of the operation; two accumulators are used to exploit the instruction-level parallelism available on most modern CPUs (see the first sketch below).
- In srtp_cleanse, use memset and keep it from being optimized away with a dummy asm statement that is declared to potentially consume the contents of the memory (see the second sketch below).
- Endian conversion functions use gcc-style intrinsics when possible.
- In base64_block_to_octet_triple, prefer memchr to strchr, as it explicitly accepts the string length, which is known at compile time.

The SIMD code uses intrinsics, which are available on all modern compilers. For MSVC, config_in_cmake.h is modified to define gcc/clang-style SSE macros based on MSVC's predefined macros: all SSE versions are enabled when MSVC indicates that AVX is enabled, and SSE2 is always enabled for x86-64, or for x86 when SSE2 floating-point math is enabled.
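A minimal sketch of the non-SSE2 fallback described above (hypothetical code, not the committed implementation; the function name and exact loop structure are assumptions). Two independent 32-bit accumulators each OR together the XOR of alternating words, so the two dependency chains can execute in parallel on a superscalar CPU.

/*
 * Illustrative sketch only, not the committed implementation.
 * Non-zero return means the octet strings differ.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int octet_string_is_eq_sketch(const uint8_t *a, const uint8_t *b, size_t len)
{
    uint32_t acc0 = 0, acc1 = 0;
    size_t i = 0;

    /* Bulk of the comparison: 8 bytes per iteration, two accumulators. */
    for (; i + 8 <= len; i += 8) {
        uint32_t a0, b0, a1, b1;
        memcpy(&a0, a + i, 4);
        memcpy(&b0, b + i, 4);
        memcpy(&a1, a + i + 4, 4);
        memcpy(&b1, b + i + 4, 4);
        acc0 |= a0 ^ b0;
        acc1 |= a1 ^ b1;
    }
    /* Remaining tail bytes, one at a time. */
    for (; i < len; i++) {
        acc0 |= (uint32_t)(a[i] ^ b[i]);
    }
    /* Non-zero result means the buffers differ somewhere. */
    return (acc0 | acc1) != 0;
}

int main(void)
{
    uint8_t x[20] = { 0 }, y[20] = { 0 };
    y[19] = 1;
    printf("equal buffers differ: %d\n", octet_string_is_eq_sketch(x, x, 20));
    printf("unequal buffers differ: %d\n", octet_string_is_eq_sketch(x, y, 20));
    return 0;
}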
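And a minimal sketch of the srtp_cleanse approach described above (again hypothetical, not the committed code). The empty asm statement takes the pointer as an input operand and clobbers "memory", so the compiler must assume the zeroed bytes may still be read and cannot drop the memset as a dead store.

/* Illustrative sketch only; the function name is hypothetical. */
#include <stddef.h>
#include <string.h>

void cleanse_sketch(void *s, size_t len)
{
    memset(s, 0, len);
#if defined(__GNUC__) || defined(__clang__)
    /* Pretend the zeroed memory is consumed, defeating dead-store elimination. */
    __asm__ __volatile__("" : : "r"(s) : "memory");
#endif
}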
1 parent 7d351de commit f7f31d5

File tree: 3 files changed, +316 -12 lines changed


config_in_cmake.h

Lines changed: 11 additions & 0 deletions
@@ -113,3 +113,14 @@
 #define inline
 #endif
 #endif
+
+/* Define gcc/clang-style SSE macros on compilers that don't define them (primarilly, MSVC). */
+#if !defined(__SSE2__) && (defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2))
+#define __SSE2__
+#endif
+#if !defined(__SSSE3__) && defined(__AVX__)
+#define __SSSE3__
+#endif
+#if !defined(__SSE4_1__) && defined(__AVX__)
+#define __SSE4_1__
+#endif
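A quick way to see the effect of the fallback block above (hypothetical standalone check, not part of the commit): repeat the same logic in a small translation unit and report which gcc/clang-style macros end up defined. Under MSVC, build it for x64 or for x86 with /arch:AVX to exercise the different branches.

/* Hypothetical check, not part of the commit. */
#include <stdio.h>

#if !defined(__SSE2__) && (defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2))
#define __SSE2__
#endif
#if !defined(__SSSE3__) && defined(__AVX__)
#define __SSSE3__
#endif
#if !defined(__SSE4_1__) && defined(__AVX__)
#define __SSE4_1__
#endif

int main(void)
{
#ifdef __SSE2__
    puts("__SSE2__ is enabled");
#endif
#ifdef __SSSE3__
    puts("__SSSE3__ is enabled");
#endif
#ifdef __SSE4_1__
    puts("__SSE4_1__ is enabled");
#endif
    return 0;
}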

crypto/include/datatypes.h

Lines changed: 67 additions & 5 deletions
@@ -62,6 +62,10 @@
 #error "Platform not recognized"
 #endif
 
+#if defined(__SSE2__)
+#include <smmintrin.h>
+#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -137,6 +141,62 @@ void v128_right_shift(v128_t *x, int shift_index);
  * (and the compiler provides better warnings).
  */
 
+#if defined(__SSE2__)
+
+#define _v128_set_to_zero(x) \
+    (_mm_storeu_si128((__m128i *)(x), _mm_setzero_si128()))
+
+#define _v128_copy(x, y) \
+    (_mm_storeu_si128((__m128i *)(x), _mm_loadu_si128((const __m128i *)(y))))
+
+#define _v128_xor(z, x, y) \
+    (_mm_storeu_si128((__m128i *)(z), \
+                      _mm_xor_si128(_mm_loadu_si128((const __m128i *)(x)), \
+                                    _mm_loadu_si128((const __m128i *)(y)))))
+
+#define _v128_and(z, x, y) \
+    (_mm_storeu_si128((__m128i *)(z), \
+                      _mm_and_si128(_mm_loadu_si128((const __m128i *)(x)), \
+                                    _mm_loadu_si128((const __m128i *)(y)))))
+
+#define _v128_or(z, x, y) \
+    (_mm_storeu_si128((__m128i *)(z), \
+                      _mm_or_si128(_mm_loadu_si128((const __m128i *)(x)), \
+                                   _mm_loadu_si128((const __m128i *)(y)))))
+
+#define _v128_complement(x) \
+    ({ \
+        __m128i _mm = _mm_undefined_si128(); \
+        _mm_storeu_si128((__m128i *)(x), \
+                         _mm_xor_si128(_mm_loadu_si128((const __m128i *)(x)), \
+                                       _mm_cmpeq_epi32(_mm, _mm))); \
+    })
+
+#if defined(__SSE4_1__)
+
+#define _v128_is_eq(x, y) \
+    ({ \
+        __m128i _mm = _mm_xor_si128(_mm_loadu_si128((const __m128i *)(x)), \
+                                    _mm_loadu_si128((const __m128i *)(y))); \
+        _mm_testz_si128(_mm, _mm); \
+    })
+
+#else /* defined(__SSE4_1__) */
+
+#define _v128_is_eq(x, y) \
+    (_mm_movemask_epi8(_mm_cmpeq_epi32( \
+        _mm_loadu_si128((const __m128i *)(x)), \
+        _mm_loadu_si128((const __m128i *)(y)))) == 0x0000ffff)
+
+#endif /* defined(__SSE4_1__) */
+
+#define _v128_xor_eq(z, x) \
+    (_mm_storeu_si128((__m128i *)(z), \
+                      _mm_xor_si128(_mm_loadu_si128((const __m128i *)(x)), \
+                                    _mm_loadu_si128((const __m128i *)(z)))))
+
+#else /* defined(__SSE2__) */
+
 #define _v128_set_to_zero(x) \
     ((x)->v32[0] = 0, (x)->v32[1] = 0, (x)->v32[2] = 0, (x)->v32[3] = 0)
 
@@ -179,6 +239,8 @@ void v128_right_shift(v128_t *x, int shift_index);
     ((z)->v64[0] ^= (x)->v64[0], (z)->v64[1] ^= (x)->v64[1])
 #endif
 
+#endif /* defined(__SSE2__) */
+
 /* NOTE! This assumes an odd ordering! */
 /* This will not be compatible directly with math on some processors */
 /* bit 0 is first 32-bit word, low order bit. in little-endian, that's
@@ -278,13 +340,11 @@ void octet_string_set_to_zero(void *s, size_t len);
 #define be64_to_cpu(x) bswap_64((x))
 #else /* WORDS_BIGENDIAN */
 
-#if defined(__GNUC__) && defined(HAVE_X86)
+#if defined(__GNUC__)
 /* Fall back. */
 static inline uint32_t be32_to_cpu(uint32_t v)
 {
-    /* optimized for x86. */
-    asm("bswap %0" : "=r"(v) : "0"(v));
-    return v;
+    return __builtin_bswap32(v);
 }
 #else /* HAVE_X86 */
 #ifdef HAVE_NETINET_IN_H
@@ -297,7 +357,9 @@ static inline uint32_t be32_to_cpu(uint32_t v)
 
 static inline uint64_t be64_to_cpu(uint64_t v)
 {
-#ifdef NO_64BIT_MATH
+#if defined(__GNUC__)
+    v = __builtin_bswap64(v);
+#elif defined(NO_64BIT_MATH)
     /* use the make64 functions to do 64-bit math */
     v = make64(htonl(low32(v)), htonl(high32(v)));
 #else /* NO_64BIT_MATH */
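For context on the two _v128_is_eq variants in the hunk above, here is a small standalone example (hypothetical, not part of the commit) that compares two 16-byte blocks both ways: _mm_cmpeq_epi32 produces all-ones in each matching 32-bit lane, so _mm_movemask_epi8 yields 0xffff exactly when every byte matched, while on SSE4.1 _mm_testz_si128 applied to the XOR of the inputs answers the same question directly.

/* Hypothetical usage sketch; assumes a build with SSE4.1 enabled (e.g. -msse4.1). */
#include <stdint.h>
#include <stdio.h>
#include <smmintrin.h>

int main(void)
{
    const uint32_t a[4] = { 1, 2, 3, 4 };
    const uint32_t b[4] = { 1, 2, 3, 4 };

    __m128i va = _mm_loadu_si128((const __m128i *)a);
    __m128i vb = _mm_loadu_si128((const __m128i *)b);

    /* SSE2 path: lane-wise compare, then collapse to a 16-bit byte mask. */
    int eq_sse2 = _mm_movemask_epi8(_mm_cmpeq_epi32(va, vb)) == 0x0000ffff;

    /* SSE4.1 path: the XOR of the inputs is all-zero exactly when they are equal. */
    __m128i x = _mm_xor_si128(va, vb);
    int eq_sse41 = _mm_testz_si128(x, x);

    printf("sse2: %d, sse4.1: %d\n", eq_sse2, eq_sse41);
    return 0;
}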
