#if (CRYPTOPP_SSSE3_AVAILABLE)
# include <pmmintrin.h>
# include <tmmintrin.h>
#endif

#if (CRYPTOPP_SSE41_AVAILABLE)
# include <smmintrin.h>
#endif

#if defined(__AVX512F__) && defined(__AVX512VL__)
# define CRYPTOPP_AVX512_ROTATE 1
# include <immintrin.h>
#endif

#if (CRYPTOPP_ARM_NEON_AVAILABLE)
# include <arm_neon.h>
#endif

#if defined(CRYPTOPP_ARM_ACLE_AVAILABLE)
# include <arm_acle.h>
#endif

#if (CRYPTOPP_GCC_VERSION >= 40900)
# define GCC_NO_UBSAN __attribute__ ((no_sanitize_undefined))
#else
# define GCC_NO_UBSAN
#endif

ANONYMOUS_NAMESPACE_BEGIN
using CryptoPP::word32;
using CryptoPP::word64;
#if (CRYPTOPP_ARM_NEON_AVAILABLE)

template <class T>
inline T UnpackHigh32(const T& a, const T& b)
{
    const uint32x2_t x(vget_high_u32((uint32x4_t)a));
    const uint32x2_t y(vget_high_u32((uint32x4_t)b));
    const uint32x2x2_t r = vzip_u32(x, y);
    return (T)vcombine_u32(r.val[0], r.val[1]);
}
template <class T>
inline T UnpackLow32(const T& a, const T& b)
{
    const uint32x2_t x(vget_low_u32((uint32x4_t)a));
    const uint32x2_t y(vget_low_u32((uint32x4_t)b));
    const uint32x2x2_t r = vzip_u32(x, y);
    return (T)vcombine_u32(r.val[0], r.val[1]);
}
template <unsigned int R>
inline uint32x4_t RotateLeft32(const uint32x4_t& val)
{
    const uint32x4_t a(vshlq_n_u32(val, R));
    const uint32x4_t b(vshrq_n_u32(val, 32 - R));
    return vorrq_u32(a, b);
}
template <unsigned int R>
inline uint32x4_t RotateRight32(const uint32x4_t& val)
{
    const uint32x4_t a(vshlq_n_u32(val, 32 - R));
    const uint32x4_t b(vshrq_n_u32(val, R));
    return vorrq_u32(a, b);
}
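// On Aarch32/Aarch64 a rotate by 8 is a byte-aligned permutation, so the
// specializations below use a single table lookup (vqtbl1q_u8) instead of
// two shifts and an OR.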
#if defined(__aarch32__) || defined(__aarch64__)
template <>
inline uint32x4_t RotateLeft32<8>(const uint32x4_t& val)
{
#if defined(CRYPTOPP_BIG_ENDIAN)
    const uint8_t maskb[16] = { 14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3 };
    const uint8x16_t mask = vld1q_u8(maskb);
#else
    const uint8_t maskb[16] = { 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 };
    const uint8x16_t mask = vld1q_u8(maskb);
#endif

    return vreinterpretq_u32_u8(
        vqtbl1q_u8(vreinterpretq_u8_u32(val), mask));
}
template <>
inline uint32x4_t RotateRight32<8>(const uint32x4_t& val)
{
#if defined(CRYPTOPP_BIG_ENDIAN)
    const uint8_t maskb[16] = { 12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1 };
    const uint8x16_t mask = vld1q_u8(maskb);
#else
    const uint8_t maskb[16] = { 1,2,3,0, 5,6,7,4, 9,10,11,8, 13,14,15,12 };
    const uint8x16_t mask = vld1q_u8(maskb);
#endif

    return vreinterpretq_u32_u8(
        vqtbl1q_u8(vreinterpretq_u8_u32(val), mask));
}
#endif // Aarch32 or Aarch64

inline void SPECK64_Enc_Block(uint32x4_t &block0, uint32x4_t &block1,
    const word32 *subkeys, unsigned int rounds)
{
    // De-interleave the two blocks: odd 32-bit words into x1, even words into y1
    uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
    uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];

    for (int i=0; i < static_cast<int>(rounds); ++i)
    {
        const uint32x4_t rk = vdupq_n_u32(subkeys[i]);

        x1 = RotateRight32<8>(x1);
        x1 = vaddq_u32(x1, y1);
        x1 = veorq_u32(x1, rk);
        y1 = RotateLeft32<3>(y1);
        y1 = veorq_u32(y1, x1);
    }

    // Re-interleave the x and y words back into the block layout
    block0 = UnpackLow32(y1, x1);
    block1 = UnpackHigh32(y1, x1);
}
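// Decryption runs the rounds in reverse and inverts each step:
// y = (y ^ x) >>> 3, then x = ((x ^ k) - y) <<< 8.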
inline void SPECK64_Dec_Block(uint32x4_t &block0, uint32x4_t &block1,
    const word32 *subkeys, unsigned int rounds)
{
    uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
    uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];

    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
    {
        const uint32x4_t rk = vdupq_n_u32(subkeys[i]);

        y1 = veorq_u32(y1, x1);
        y1 = RotateRight32<3>(y1);
        x1 = veorq_u32(x1, rk);
        x1 = vsubq_u32(x1, y1);
        x1 = RotateLeft32<8>(x1);
    }

    block0 = UnpackLow32(y1, x1);
    block1 = UnpackHigh32(y1, x1);
}
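// The *_6_Blocks variants run three independent two-block lanes through the
// same key schedule so the adds, XORs and rotates of different lanes can
// overlap in the pipeline.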
inline void SPECK64_Enc_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
    uint32x4_t &block2, uint32x4_t &block3, uint32x4_t &block4, uint32x4_t &block5,
    const word32 *subkeys, unsigned int rounds)
{
    uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
    uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
    uint32x4_t x2 = vuzpq_u32(block2, block3).val[1];
    uint32x4_t y2 = vuzpq_u32(block2, block3).val[0];
    uint32x4_t x3 = vuzpq_u32(block4, block5).val[1];
    uint32x4_t y3 = vuzpq_u32(block4, block5).val[0];

    for (int i=0; i < static_cast<int>(rounds); ++i)
    {
        const uint32x4_t rk = vdupq_n_u32(subkeys[i]);

        x1 = RotateRight32<8>(x1);
        x2 = RotateRight32<8>(x2);
        x3 = RotateRight32<8>(x3);
        x1 = vaddq_u32(x1, y1);
        x2 = vaddq_u32(x2, y2);
        x3 = vaddq_u32(x3, y3);
        x1 = veorq_u32(x1, rk);
        x2 = veorq_u32(x2, rk);
        x3 = veorq_u32(x3, rk);
        y1 = RotateLeft32<3>(y1);
        y2 = RotateLeft32<3>(y2);
        y3 = RotateLeft32<3>(y3);
        y1 = veorq_u32(y1, x1);
        y2 = veorq_u32(y2, x2);
        y3 = veorq_u32(y3, x3);
    }

    block0 = UnpackLow32(y1, x1);
    block1 = UnpackHigh32(y1, x1);
    block2 = UnpackLow32(y2, x2);
    block3 = UnpackHigh32(y2, x2);
    block4 = UnpackLow32(y3, x3);
    block5 = UnpackHigh32(y3, x3);
}
inline void SPECK64_Dec_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
    uint32x4_t &block2, uint32x4_t &block3, uint32x4_t &block4, uint32x4_t &block5,
    const word32 *subkeys, unsigned int rounds)
{
    uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
    uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
    uint32x4_t x2 = vuzpq_u32(block2, block3).val[1];
    uint32x4_t y2 = vuzpq_u32(block2, block3).val[0];
    uint32x4_t x3 = vuzpq_u32(block4, block5).val[1];
    uint32x4_t y3 = vuzpq_u32(block4, block5).val[0];

    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
    {
        const uint32x4_t rk = vdupq_n_u32(subkeys[i]);

        y1 = veorq_u32(y1, x1);
        y2 = veorq_u32(y2, x2);
        y3 = veorq_u32(y3, x3);
        y1 = RotateRight32<3>(y1);
        y2 = RotateRight32<3>(y2);
        y3 = RotateRight32<3>(y3);
        x1 = veorq_u32(x1, rk);
        x2 = veorq_u32(x2, rk);
        x3 = veorq_u32(x3, rk);
        x1 = vsubq_u32(x1, y1);
        x2 = vsubq_u32(x2, y2);
        x3 = vsubq_u32(x3, y3);
        x1 = RotateLeft32<8>(x1);
        x2 = RotateLeft32<8>(x2);
        x3 = RotateLeft32<8>(x3);
    }

    block0 = UnpackLow32(y1, x1);
    block1 = UnpackHigh32(y1, x1);
    block2 = UnpackLow32(y2, x2);
    block3 = UnpackHigh32(y2, x2);
    block4 = UnpackLow32(y3, x3);
    block5 = UnpackHigh32(y3, x3);
}
#endif // CRYPTOPP_ARM_NEON_AVAILABLE

#if (CRYPTOPP_ARM_NEON_AVAILABLE)

template <class T>
inline T UnpackHigh64(const T& a, const T& b)
{
    const uint64x1_t x(vget_high_u64((uint64x2_t)a));
    const uint64x1_t y(vget_high_u64((uint64x2_t)b));
    return (T)vcombine_u64(x, y);
}
template <class T>
inline T UnpackLow64(const T& a, const T& b)
{
    const uint64x1_t x(vget_low_u64((uint64x2_t)a));
    const uint64x1_t y(vget_low_u64((uint64x2_t)b));
    return (T)vcombine_u64(x, y);
}
template <unsigned int R>
inline uint64x2_t RotateLeft64(const uint64x2_t& val)
{
    const uint64x2_t a(vshlq_n_u64(val, R));
    const uint64x2_t b(vshrq_n_u64(val, 64 - R));
    return vorrq_u64(a, b);
}
template <unsigned int R>
inline uint64x2_t RotateRight64(const uint64x2_t& val)
{
    const uint64x2_t a(vshlq_n_u64(val, 64 - R));
    const uint64x2_t b(vshrq_n_u64(val, R));
    return vorrq_u64(a, b);
}
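// As with the 32-bit lanes, a rotate by 8 of each 64-bit lane is a single
// byte shuffle on Aarch32/Aarch64.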
#if defined(__aarch32__) || defined(__aarch64__)
template <>
inline uint64x2_t RotateLeft64<8>(const uint64x2_t& val)
{
#if defined(CRYPTOPP_BIG_ENDIAN)
    const uint8_t maskb[16] = { 14,13,12,11, 10,9,8,15, 6,5,4,3, 2,1,0,7 };
    const uint8x16_t mask = vld1q_u8(maskb);
#else
    const uint8_t maskb[16] = { 7,0,1,2, 3,4,5,6, 15,8,9,10, 11,12,13,14 };
    const uint8x16_t mask = vld1q_u8(maskb);
#endif

    return vreinterpretq_u64_u8(
        vqtbl1q_u8(vreinterpretq_u8_u64(val), mask));
}
template <>
inline uint64x2_t RotateRight64<8>(const uint64x2_t& val)
{
#if defined(CRYPTOPP_BIG_ENDIAN)
    const uint8_t maskb[16] = { 8,15,14,13, 12,11,10,9, 0,7,6,5, 4,3,2,1 };
    const uint8x16_t mask = vld1q_u8(maskb);
#else
    const uint8_t maskb[16] = { 1,2,3,4, 5,6,7,0, 9,10,11,12, 13,14,15,8 };
    const uint8x16_t mask = vld1q_u8(maskb);
#endif

    return vreinterpretq_u64_u8(
        vqtbl1q_u8(vreinterpretq_u8_u64(val), mask));
}
#endif // Aarch32 or Aarch64
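// SPECK-128 uses the same round structure as SPECK-64, but on 64-bit words:
// x = ((x >>> 8) + y) ^ k, then y = (y <<< 3) ^ x.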
inline void SPECK128_Enc_Block(uint64x2_t &block0, uint64x2_t &block1,
    const word64 *subkeys, unsigned int rounds)
{
    uint64x2_t x1 = UnpackHigh64(block0, block1);
    uint64x2_t y1 = UnpackLow64(block0, block1);

    for (int i=0; i < static_cast<int>(rounds); ++i)
    {
        const uint64x2_t rk = vld1q_dup_u64(subkeys+i);

        x1 = RotateRight64<8>(x1);
        x1 = vaddq_u64(x1, y1);
        x1 = veorq_u64(x1, rk);
        y1 = RotateLeft64<3>(y1);
        y1 = veorq_u64(y1, x1);
    }

    block0 = UnpackLow64(y1, x1);
    block1 = UnpackHigh64(y1, x1);
}
inline void SPECK128_Enc_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
    uint64x2_t &block2, uint64x2_t &block3, uint64x2_t &block4, uint64x2_t &block5,
    const word64 *subkeys, unsigned int rounds)
{
    uint64x2_t x1 = UnpackHigh64(block0, block1);
    uint64x2_t y1 = UnpackLow64(block0, block1);
    uint64x2_t x2 = UnpackHigh64(block2, block3);
    uint64x2_t y2 = UnpackLow64(block2, block3);
    uint64x2_t x3 = UnpackHigh64(block4, block5);
    uint64x2_t y3 = UnpackLow64(block4, block5);

    for (int i=0; i < static_cast<int>(rounds); ++i)
    {
        const uint64x2_t rk = vld1q_dup_u64(subkeys+i);

        x1 = RotateRight64<8>(x1);
        x2 = RotateRight64<8>(x2);
        x3 = RotateRight64<8>(x3);
        x1 = vaddq_u64(x1, y1);
        x2 = vaddq_u64(x2, y2);
        x3 = vaddq_u64(x3, y3);
        x1 = veorq_u64(x1, rk);
        x2 = veorq_u64(x2, rk);
        x3 = veorq_u64(x3, rk);
        y1 = RotateLeft64<3>(y1);
        y2 = RotateLeft64<3>(y2);
        y3 = RotateLeft64<3>(y3);
        y1 = veorq_u64(y1, x1);
        y2 = veorq_u64(y2, x2);
        y3 = veorq_u64(y3, x3);
    }

    block0 = UnpackLow64(y1, x1);
    block1 = UnpackHigh64(y1, x1);
    block2 = UnpackLow64(y2, x2);
    block3 = UnpackHigh64(y2, x2);
    block4 = UnpackLow64(y3, x3);
    block5 = UnpackHigh64(y3, x3);
}
inline void SPECK128_Dec_Block(uint64x2_t &block0, uint64x2_t &block1,
    const word64 *subkeys, unsigned int rounds)
{
    uint64x2_t x1 = UnpackHigh64(block0, block1);
    uint64x2_t y1 = UnpackLow64(block0, block1);

    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
    {
        const uint64x2_t rk = vld1q_dup_u64(subkeys+i);

        y1 = veorq_u64(y1, x1);
        y1 = RotateRight64<3>(y1);
        x1 = veorq_u64(x1, rk);
        x1 = vsubq_u64(x1, y1);
        x1 = RotateLeft64<8>(x1);
    }

    block0 = UnpackLow64(y1, x1);
    block1 = UnpackHigh64(y1, x1);
}
inline void SPECK128_Dec_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
    uint64x2_t &block2, uint64x2_t &block3, uint64x2_t &block4, uint64x2_t &block5,
    const word64 *subkeys, unsigned int rounds)
{
    uint64x2_t x1 = UnpackHigh64(block0, block1);
    uint64x2_t y1 = UnpackLow64(block0, block1);
    uint64x2_t x2 = UnpackHigh64(block2, block3);
    uint64x2_t y2 = UnpackLow64(block2, block3);
    uint64x2_t x3 = UnpackHigh64(block4, block5);
    uint64x2_t y3 = UnpackLow64(block4, block5);

    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
    {
        const uint64x2_t rk = vld1q_dup_u64(subkeys+i);

        y1 = veorq_u64(y1, x1);
        y2 = veorq_u64(y2, x2);
        y3 = veorq_u64(y3, x3);
        y1 = RotateRight64<3>(y1);
        y2 = RotateRight64<3>(y2);
        y3 = RotateRight64<3>(y3);
        x1 = veorq_u64(x1, rk);
        x2 = veorq_u64(x2, rk);
        x3 = veorq_u64(x3, rk);
        x1 = vsubq_u64(x1, y1);
        x2 = vsubq_u64(x2, y2);
        x3 = vsubq_u64(x3, y3);
        x1 = RotateLeft64<8>(x1);
        x2 = RotateLeft64<8>(x2);
        x3 = RotateLeft64<8>(x3);
    }

    block0 = UnpackLow64(y1, x1);
    block1 = UnpackHigh64(y1, x1);
    block2 = UnpackLow64(y2, x2);
    block3 = UnpackHigh64(y2, x2);
    block4 = UnpackLow64(y3, x3);
    block5 = UnpackHigh64(y3, x3);
}
#endif // CRYPTOPP_ARM_NEON_AVAILABLE

#if defined(CRYPTOPP_SSSE3_AVAILABLE)

#ifndef M128_CAST
# define M128_CAST(x) ((__m128i *)(void *)(x))
#endif
#ifndef CONST_M128_CAST
# define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
#endif

#ifndef DOUBLE_CAST
# define DOUBLE_CAST(x) ((double *)(void *)(x))
#endif
#ifndef CONST_DOUBLE_CAST
# define CONST_DOUBLE_CAST(x) ((const double *)(const void *)(x))
#endif

#if defined(CRYPTOPP_AVX512_ROTATE)
template <unsigned int R>
inline __m128i RotateLeft64(const __m128i& val)
{
    return _mm_rol_epi64(val, R);
}
template <unsigned int R>
inline __m128i RotateRight64(const __m128i& val)
{
    return _mm_ror_epi64(val, R);
}
#else
template <unsigned int R>
inline __m128i RotateLeft64(const __m128i& val)
{
    return _mm_or_si128(
        _mm_slli_epi64(val, R), _mm_srli_epi64(val, 64-R));
}
template <unsigned int R>
inline __m128i RotateRight64(const __m128i& val)
{
    return _mm_or_si128(
        _mm_slli_epi64(val, 64-R), _mm_srli_epi64(val, R));
}
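// With SSSE3 available, a rotate by 8 of each 64-bit lane is a single
// _mm_shuffle_epi8 rather than two shifts and an OR.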
template <>
inline __m128i RotateLeft64<8>(const __m128i& val)
{
    const __m128i mask = _mm_set_epi8(14,13,12,11, 10,9,8,15, 6,5,4,3, 2,1,0,7);
    return _mm_shuffle_epi8(val, mask);
}
template <>
inline __m128i RotateRight64<8>(const __m128i& val)
{
    const __m128i mask = _mm_set_epi8(8,15,14,13, 12,11,10,9, 0,7,6,5, 4,3,2,1);
    return _mm_shuffle_epi8(val, mask);
}
#endif // CRYPTOPP_AVX512_ROTATE

inline void GCC_NO_UBSAN SPECK128_Enc_Block(__m128i &block0, __m128i &block1,
    const word64 *subkeys, unsigned int rounds)
{
    __m128i x1 = _mm_unpackhi_epi64(block0, block1);
    __m128i y1 = _mm_unpacklo_epi64(block0, block1);

    for (int i=0; i < static_cast<int>(rounds); ++i)
    {
        // Broadcast the 64-bit subkey into both lanes via a double load-dup
        const __m128i rk = _mm_castpd_si128(
            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i)));

        x1 = RotateRight64<8>(x1);
        x1 = _mm_add_epi64(x1, y1);
        x1 = _mm_xor_si128(x1, rk);
        y1 = RotateLeft64<3>(y1);
        y1 = _mm_xor_si128(y1, x1);
    }

    block0 = _mm_unpacklo_epi64(y1, x1);
    block1 = _mm_unpackhi_epi64(y1, x1);
}
inline void GCC_NO_UBSAN SPECK128_Enc_6_Blocks(__m128i &block0, __m128i &block1,
    __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
    const word64 *subkeys, unsigned int rounds)
{
    __m128i x1 = _mm_unpackhi_epi64(block0, block1);
    __m128i y1 = _mm_unpacklo_epi64(block0, block1);
    __m128i x2 = _mm_unpackhi_epi64(block2, block3);
    __m128i y2 = _mm_unpacklo_epi64(block2, block3);
    __m128i x3 = _mm_unpackhi_epi64(block4, block5);
    __m128i y3 = _mm_unpacklo_epi64(block4, block5);

    for (int i=0; i < static_cast<int>(rounds); ++i)
    {
        const __m128i rk = _mm_castpd_si128(
            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i)));

        x1 = RotateRight64<8>(x1);
        x2 = RotateRight64<8>(x2);
        x3 = RotateRight64<8>(x3);
        x1 = _mm_add_epi64(x1, y1);
        x2 = _mm_add_epi64(x2, y2);
        x3 = _mm_add_epi64(x3, y3);
        x1 = _mm_xor_si128(x1, rk);
        x2 = _mm_xor_si128(x2, rk);
        x3 = _mm_xor_si128(x3, rk);
        y1 = RotateLeft64<3>(y1);
        y2 = RotateLeft64<3>(y2);
        y3 = RotateLeft64<3>(y3);
        y1 = _mm_xor_si128(y1, x1);
        y2 = _mm_xor_si128(y2, x2);
        y3 = _mm_xor_si128(y3, x3);
    }

    block0 = _mm_unpacklo_epi64(y1, x1);
    block1 = _mm_unpackhi_epi64(y1, x1);
    block2 = _mm_unpacklo_epi64(y2, x2);
    block3 = _mm_unpackhi_epi64(y2, x2);
    block4 = _mm_unpacklo_epi64(y3, x3);
    block5 = _mm_unpackhi_epi64(y3, x3);
}
inline void GCC_NO_UBSAN SPECK128_Dec_Block(__m128i &block0, __m128i &block1,
    const word64 *subkeys, unsigned int rounds)
{
    __m128i x1 = _mm_unpackhi_epi64(block0, block1);
    __m128i y1 = _mm_unpacklo_epi64(block0, block1);

    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
    {
        const __m128i rk = _mm_castpd_si128(
            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i)));

        y1 = _mm_xor_si128(y1, x1);
        y1 = RotateRight64<3>(y1);
        x1 = _mm_xor_si128(x1, rk);
        x1 = _mm_sub_epi64(x1, y1);
        x1 = RotateLeft64<8>(x1);
    }

    block0 = _mm_unpacklo_epi64(y1, x1);
    block1 = _mm_unpackhi_epi64(y1, x1);
}
inline void GCC_NO_UBSAN SPECK128_Dec_6_Blocks(__m128i &block0, __m128i &block1,
    __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
    const word64 *subkeys, unsigned int rounds)
{
    __m128i x1 = _mm_unpackhi_epi64(block0, block1);
    __m128i y1 = _mm_unpacklo_epi64(block0, block1);
    __m128i x2 = _mm_unpackhi_epi64(block2, block3);
    __m128i y2 = _mm_unpacklo_epi64(block2, block3);
    __m128i x3 = _mm_unpackhi_epi64(block4, block5);
    __m128i y3 = _mm_unpacklo_epi64(block4, block5);

    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
    {
        const __m128i rk = _mm_castpd_si128(
            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i)));

        y1 = _mm_xor_si128(y1, x1);
        y2 = _mm_xor_si128(y2, x2);
        y3 = _mm_xor_si128(y3, x3);
        y1 = RotateRight64<3>(y1);
        y2 = RotateRight64<3>(y2);
        y3 = RotateRight64<3>(y3);
        x1 = _mm_xor_si128(x1, rk);
        x2 = _mm_xor_si128(x2, rk);
        x3 = _mm_xor_si128(x3, rk);
        x1 = _mm_sub_epi64(x1, y1);
        x2 = _mm_sub_epi64(x2, y2);
        x3 = _mm_sub_epi64(x3, y3);
        x1 = RotateLeft64<8>(x1);
        x2 = RotateLeft64<8>(x2);
        x3 = RotateLeft64<8>(x3);
    }

    block0 = _mm_unpacklo_epi64(y1, x1);
    block1 = _mm_unpackhi_epi64(y1, x1);
    block2 = _mm_unpacklo_epi64(y2, x2);
    block3 = _mm_unpackhi_epi64(y2, x2);
    block4 = _mm_unpacklo_epi64(y3, x3);
    block5 = _mm_unpackhi_epi64(y3, x3);
}
#endif // CRYPTOPP_SSSE3_AVAILABLE

#if defined(CRYPTOPP_SSE41_AVAILABLE)

template <unsigned int R>
inline __m128i RotateLeft32(const __m128i& val)
{
    return _mm_or_si128(
        _mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R));
}
template <unsigned int R>
inline __m128i RotateRight32(const __m128i& val)
{
    return _mm_or_si128(
        _mm_slli_epi32(val, 32-R), _mm_srli_epi32(val, R));
}
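// Rotate each 32-bit lane by 8 with a single byte shuffle.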
template <>
inline __m128i RotateLeft32<8>(const __m128i& val)
{
    const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
    return _mm_shuffle_epi8(val, mask);
}
template <>
inline __m128i RotateRight32<8>(const __m128i& val)
{
    const __m128i mask = _mm_set_epi8(12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1);
    return _mm_shuffle_epi8(val, mask);
}
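// The SSE4.1 SPECK-64 routines de-interleave two blocks with _mm_shuffle_ps:
// the odd 32-bit words of both blocks become x and the even words become y,
// mirroring the NEON vuzpq_u32 split above.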
inline void GCC_NO_UBSAN SPECK64_Enc_Block(__m128i &block0, __m128i &block1,
    const word32 *subkeys, unsigned int rounds)
{
    const __m128 t0 = _mm_castsi128_ps(block0);
    const __m128 t1 = _mm_castsi128_ps(block1);
    __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
    __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));

    for (int i=0; i < static_cast<int>(rounds); ++i)
    {
        const __m128i rk = _mm_set1_epi32(subkeys[i]);

        x1 = RotateRight32<8>(x1);
        x1 = _mm_add_epi32(x1, y1);
        x1 = _mm_xor_si128(x1, rk);
        y1 = RotateLeft32<3>(y1);
        y1 = _mm_xor_si128(y1, x1);
    }

    block0 = _mm_unpacklo_epi32(y1, x1);
    block1 = _mm_unpackhi_epi32(y1, x1);
}
inline void GCC_NO_UBSAN SPECK64_Dec_Block(__m128i &block0, __m128i &block1,
    const word32 *subkeys, unsigned int rounds)
{
    const __m128 t0 = _mm_castsi128_ps(block0);
    const __m128 t1 = _mm_castsi128_ps(block1);
    __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
    __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));

    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
    {
        const __m128i rk = _mm_set1_epi32(subkeys[i]);

        y1 = _mm_xor_si128(y1, x1);
        y1 = RotateRight32<3>(y1);
        x1 = _mm_xor_si128(x1, rk);
        x1 = _mm_sub_epi32(x1, y1);
        x1 = RotateLeft32<8>(x1);
    }

    block0 = _mm_unpacklo_epi32(y1, x1);
    block1 = _mm_unpackhi_epi32(y1, x1);
}
inline void GCC_NO_UBSAN SPECK64_Enc_6_Blocks(__m128i &block0, __m128i &block1,
    __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
    const word32 *subkeys, unsigned int rounds)
{
    const __m128 t0 = _mm_castsi128_ps(block0);
    const __m128 t1 = _mm_castsi128_ps(block1);
    __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
    __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));

    const __m128 t2 = _mm_castsi128_ps(block2);
    const __m128 t3 = _mm_castsi128_ps(block3);
    __m128i x2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(3,1,3,1)));
    __m128i y2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(2,0,2,0)));

    const __m128 t4 = _mm_castsi128_ps(block4);
    const __m128 t5 = _mm_castsi128_ps(block5);
    __m128i x3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(3,1,3,1)));
    __m128i y3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(2,0,2,0)));

    for (int i=0; i < static_cast<int>(rounds); ++i)
    {
        const __m128i rk = _mm_set1_epi32(subkeys[i]);

        x1 = RotateRight32<8>(x1);
        x2 = RotateRight32<8>(x2);
        x3 = RotateRight32<8>(x3);
        x1 = _mm_add_epi32(x1, y1);
        x2 = _mm_add_epi32(x2, y2);
        x3 = _mm_add_epi32(x3, y3);
        x1 = _mm_xor_si128(x1, rk);
        x2 = _mm_xor_si128(x2, rk);
        x3 = _mm_xor_si128(x3, rk);
        y1 = RotateLeft32<3>(y1);
        y2 = RotateLeft32<3>(y2);
        y3 = RotateLeft32<3>(y3);
        y1 = _mm_xor_si128(y1, x1);
        y2 = _mm_xor_si128(y2, x2);
        y3 = _mm_xor_si128(y3, x3);
    }

    block0 = _mm_unpacklo_epi32(y1, x1);
    block1 = _mm_unpackhi_epi32(y1, x1);
    block2 = _mm_unpacklo_epi32(y2, x2);
    block3 = _mm_unpackhi_epi32(y2, x2);
    block4 = _mm_unpacklo_epi32(y3, x3);
    block5 = _mm_unpackhi_epi32(y3, x3);
}
inline void GCC_NO_UBSAN SPECK64_Dec_6_Blocks(__m128i &block0, __m128i &block1,
    __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
    const word32 *subkeys, unsigned int rounds)
{
    const __m128 t0 = _mm_castsi128_ps(block0);
    const __m128 t1 = _mm_castsi128_ps(block1);
    __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
    __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));

    const __m128 t2 = _mm_castsi128_ps(block2);
    const __m128 t3 = _mm_castsi128_ps(block3);
    __m128i x2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(3,1,3,1)));
    __m128i y2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(2,0,2,0)));

    const __m128 t4 = _mm_castsi128_ps(block4);
    const __m128 t5 = _mm_castsi128_ps(block5);
    __m128i x3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(3,1,3,1)));
    __m128i y3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(2,0,2,0)));

    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
    {
        const __m128i rk = _mm_set1_epi32(subkeys[i]);

        y1 = _mm_xor_si128(y1, x1);
        y2 = _mm_xor_si128(y2, x2);
        y3 = _mm_xor_si128(y3, x3);
        y1 = RotateRight32<3>(y1);
        y2 = RotateRight32<3>(y2);
        y3 = RotateRight32<3>(y3);
        x1 = _mm_xor_si128(x1, rk);
        x2 = _mm_xor_si128(x2, rk);
        x3 = _mm_xor_si128(x3, rk);
        x1 = _mm_sub_epi32(x1, y1);
        x2 = _mm_sub_epi32(x2, y2);
        x3 = _mm_sub_epi32(x3, y3);
        x1 = RotateLeft32<8>(x1);
        x2 = RotateLeft32<8>(x2);
        x3 = RotateLeft32<8>(x3);
    }

    block0 = _mm_unpacklo_epi32(y1, x1);
    block1 = _mm_unpackhi_epi32(y1, x1);
    block2 = _mm_unpacklo_epi32(y2, x2);
    block3 = _mm_unpackhi_epi32(y2, x2);
    block4 = _mm_unpacklo_epi32(y3, x3);
    block5 = _mm_unpackhi_epi32(y3, x3);
}
#endif // CRYPTOPP_SSE41_AVAILABLE

ANONYMOUS_NAMESPACE_END

NAMESPACE_BEGIN(CryptoPP)
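// The exported wrappers below forward the single-block and six-block kernels,
// together with the key schedule and buffer arguments, to the generic 6x2
// AdvancedProcessBlocks dispatchers.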
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
size_t SPECK64_Enc_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks64_6x2_NEON(SPECK64_Enc_Block, SPECK64_Enc_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
size_t SPECK64_Dec_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks64_6x2_NEON(SPECK64_Dec_Block, SPECK64_Dec_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif // CRYPTOPP_ARM_NEON_AVAILABLE
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
size_t SPECK128_Enc_AdvancedProcessBlocks_NEON(const word64* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks128_6x2_NEON(SPECK128_Enc_Block, SPECK128_Enc_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
size_t SPECK128_Dec_AdvancedProcessBlocks_NEON(const word64* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks128_6x2_NEON(SPECK128_Dec_Block, SPECK128_Dec_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif // CRYPTOPP_ARM_NEON_AVAILABLE

#if defined(CRYPTOPP_SSE41_AVAILABLE)
size_t SPECK64_Enc_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks64_6x2_SSE(SPECK64_Enc_Block, SPECK64_Enc_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
size_t SPECK64_Dec_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks64_6x2_SSE(SPECK64_Dec_Block, SPECK64_Dec_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif // CRYPTOPP_SSE41_AVAILABLE
#if defined(CRYPTOPP_SSSE3_AVAILABLE)
size_t SPECK128_Enc_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks128_6x2_SSE(SPECK128_Enc_Block, SPECK128_Enc_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
size_t SPECK128_Dec_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks128_6x2_SSE(SPECK128_Dec_Block, SPECK128_Dec_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif // CRYPTOPP_SSSE3_AVAILABLE

NAMESPACE_END