#if (CRYPTOPP_SHANI_AVAILABLE)
# include <nmmintrin.h>
# include <immintrin.h>
#endif

#if (CRYPTOPP_ARM_SHA_AVAILABLE)
# include <arm_neon.h>
#endif

#if defined(CRYPTOPP_ARM_ACLE_AVAILABLE)
# include <arm_acle.h>
#endif

#if CRYPTOPP_POWER8_SHA_AVAILABLE
# include <altivec.h>
#endif

#ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY
# include <signal.h>
# include <setjmp.h>
#endif

#ifndef EXCEPTION_EXECUTE_HANDLER
# define EXCEPTION_EXECUTE_HANDLER 1
#endif

#define M128_CAST(x) ((__m128i *)(void *)(x))
#define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))

NAMESPACE_BEGIN(CryptoPP)

#ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY
typedef void (*SigHandler)(int);
static jmp_buf s_jmpSIGILL;
static void SigIllHandler(int)
{
    longjmp(s_jmpSIGILL, 1);
}
#endif  // Not CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY

#if (CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARM64)
bool CPU_ProbeSHA1()
{
#if defined(CRYPTOPP_NO_CPU_FEATURE_PROBES)
    return false;
#elif (CRYPTOPP_ARM_SHA_AVAILABLE)
# if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY)
    volatile bool result = true;
    __try
    {
        uint32x4_t data1 = {1,2,3,4}, data2 = {5,6,7,8}, data3 = {9,10,11,12};

        uint32x4_t r1 = vsha1cq_u32 (data1, 0, data2);
        uint32x4_t r2 = vsha1mq_u32 (data1, 0, data2);
        uint32x4_t r3 = vsha1pq_u32 (data1, 0, data2);
        uint32x4_t r4 = vsha1su0q_u32 (data1, data2, data3);
        uint32x4_t r5 = vsha1su1q_u32 (data1, data2);

        result = !!(vgetq_lane_u32(r1,0) | vgetq_lane_u32(r2,1) | vgetq_lane_u32(r3,2) | vgetq_lane_u32(r4,3) | vgetq_lane_u32(r5,0));
    }
    __except (EXCEPTION_EXECUTE_HANDLER)
    {
        return false;
    }
    return result;
# else
    // Volatile is required to avoid longjmp clobber warnings
    volatile bool result = true;

    volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler);
    if (oldHandler == SIG_ERR)
        return false;

    volatile sigset_t oldMask;
    if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask))
        return false;

    if (setjmp(s_jmpSIGILL))
        result = false;
    else
    {
        uint32x4_t data1 = {1,2,3,4}, data2 = {5,6,7,8}, data3 = {9,10,11,12};

        uint32x4_t r1 = vsha1cq_u32 (data1, 0, data2);
        uint32x4_t r2 = vsha1mq_u32 (data1, 0, data2);
        uint32x4_t r3 = vsha1pq_u32 (data1, 0, data2);
        uint32x4_t r4 = vsha1su0q_u32 (data1, data2, data3);
        uint32x4_t r5 = vsha1su1q_u32 (data1, data2);

        result = !!(vgetq_lane_u32(r1,0) | vgetq_lane_u32(r2,1) | vgetq_lane_u32(r3,2) | vgetq_lane_u32(r4,3) | vgetq_lane_u32(r5,0));
    }

    sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR);
    signal(SIGILL, oldHandler);
    return result;
# endif
#else
    return false;
#endif  // CRYPTOPP_ARM_SHA_AVAILABLE
}

bool CPU_ProbeSHA2()
{
#if defined(CRYPTOPP_NO_CPU_FEATURE_PROBES)
    return false;
#elif (CRYPTOPP_ARM_SHA_AVAILABLE)
# if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY)
    volatile bool result = true;
    __try
    {
        uint32x4_t data1 = {1,2,3,4}, data2 = {5,6,7,8}, data3 = {9,10,11,12};

        uint32x4_t r1 = vsha256hq_u32 (data1, data2, data3);
        uint32x4_t r2 = vsha256h2q_u32 (data1, data2, data3);
        uint32x4_t r3 = vsha256su0q_u32 (data1, data2);
        uint32x4_t r4 = vsha256su1q_u32 (data1, data2, data3);

        result = !!(vgetq_lane_u32(r1,0) | vgetq_lane_u32(r2,1) | vgetq_lane_u32(r3,2) | vgetq_lane_u32(r4,3));
    }
    __except (EXCEPTION_EXECUTE_HANDLER)
    {
        return false;
    }
    return result;
# else
    // Volatile is required to avoid longjmp clobber warnings
    volatile bool result = true;

    volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler);
    if (oldHandler == SIG_ERR)
        return false;

    volatile sigset_t oldMask;
    if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask))
        return false;

    if (setjmp(s_jmpSIGILL))
        result = false;
    else
    {
        uint32x4_t data1 = {1,2,3,4}, data2 = {5,6,7,8}, data3 = {9,10,11,12};

        uint32x4_t r1 = vsha256hq_u32 (data1, data2, data3);
        uint32x4_t r2 = vsha256h2q_u32 (data1, data2, data3);
        uint32x4_t r3 = vsha256su0q_u32 (data1, data2);
        uint32x4_t r4 = vsha256su1q_u32 (data1, data2, data3);

        result = !!(vgetq_lane_u32(r1,0) | vgetq_lane_u32(r2,1) | vgetq_lane_u32(r3,2) | vgetq_lane_u32(r4,3));
    }

    sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR);
    signal(SIGILL, oldHandler);
    return result;
# endif
#else
    return false;
#endif  // CRYPTOPP_ARM_SHA_AVAILABLE
}
#endif  // ARM32 or ARM64

// SHA-256 and SHA-512 round constants, defined in sha.cpp
extern const word32 SHA256_K[64];
extern const word64 SHA512_K[80];

#if CRYPTOPP_SHANI_AVAILABLE

void SHA1_HashMultipleBlocks_SHANI(word32 *state, const word32 *data, size_t length, ByteOrder order)
{
    __m128i ABCD, ABCD_SAVE, E0, E0_SAVE, E1;
    __m128i MASK, MSG0, MSG1, MSG2, MSG3;

    // Load initial values
    ABCD = _mm_loadu_si128(CONST_M128_CAST(state));
    E0 = _mm_set_epi32(state[4], 0, 0, 0);
    ABCD = _mm_shuffle_epi32(ABCD, 0x1B);

    // Data arrangement
    MASK = order == BIG_ENDIAN_ORDER ?
        _mm_set_epi8(0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15) :
        _mm_set_epi8(3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12) ;

    while (length >= SHA1::BLOCKSIZE)
    {
        // Save current hash
        ABCD_SAVE = ABCD;
        E0_SAVE = E0;

        // Rounds 0-3
        MSG0 = _mm_loadu_si128(CONST_M128_CAST(data+0));
        MSG0 = _mm_shuffle_epi8(MSG0, MASK);
        E0 = _mm_add_epi32(E0, MSG0);
        E1 = ABCD;
        ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);

        // Rounds 4-7
        MSG1 = _mm_loadu_si128(CONST_M128_CAST(data+4));
        MSG1 = _mm_shuffle_epi8(MSG1, MASK);
        E1 = _mm_sha1nexte_epu32(E1, MSG1);
        E0 = ABCD;
        ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0);
        MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);

        // Rounds 8-11
        MSG2 = _mm_loadu_si128(CONST_M128_CAST(data+8));
        MSG2 = _mm_shuffle_epi8(MSG2, MASK);
        E0 = _mm_sha1nexte_epu32(E0, MSG2);
        E1 = ABCD;
        ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
        MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
        MSG0 = _mm_xor_si128(MSG0, MSG2);

        // Rounds 12-15
        MSG3 = _mm_loadu_si128(CONST_M128_CAST(data+12));
        MSG3 = _mm_shuffle_epi8(MSG3, MASK);
        E1 = _mm_sha1nexte_epu32(E1, MSG3);
        E0 = ABCD;
        MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
        ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0);
        MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
        MSG1 = _mm_xor_si128(MSG1, MSG3);

        // Rounds 16-19
        E0 = _mm_sha1nexte_epu32(E0, MSG0);
        E1 = ABCD;
        MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
        ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
        MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
        MSG2 = _mm_xor_si128(MSG2, MSG0);

        // Rounds 20-23
        E1 = _mm_sha1nexte_epu32(E1, MSG1);
        E0 = ABCD;
        MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
        ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
        MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
        MSG3 = _mm_xor_si128(MSG3, MSG1);

        // Rounds 24-27
        E0 = _mm_sha1nexte_epu32(E0, MSG2);
        E1 = ABCD;
        MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
        ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1);
        MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
        MSG0 = _mm_xor_si128(MSG0, MSG2);

        // Rounds 28-31
        E1 = _mm_sha1nexte_epu32(E1, MSG3);
        E0 = ABCD;
        MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
        ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
        MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
        MSG1 = _mm_xor_si128(MSG1, MSG3);

        // Rounds 32-35
        E0 = _mm_sha1nexte_epu32(E0, MSG0);
        E1 = ABCD;
        MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
        ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1);
        MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
        MSG2 = _mm_xor_si128(MSG2, MSG0);

        // Rounds 36-39
        E1 = _mm_sha1nexte_epu32(E1, MSG1);
        E0 = ABCD;
        MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
        ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
        MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
        MSG3 = _mm_xor_si128(MSG3, MSG1);

        // Rounds 40-43
        E0 = _mm_sha1nexte_epu32(E0, MSG2);
        E1 = ABCD;
        MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
        ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
        MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
        MSG0 = _mm_xor_si128(MSG0, MSG2);

        // Rounds 44-47
        E1 = _mm_sha1nexte_epu32(E1, MSG3);
        E0 = ABCD;
        MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
        ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2);
        MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
        MSG1 = _mm_xor_si128(MSG1, MSG3);

        // Rounds 48-51
        E0 = _mm_sha1nexte_epu32(E0, MSG0);
        E1 = ABCD;
        MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
        ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
        MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
        MSG2 = _mm_xor_si128(MSG2, MSG0);

        // Rounds 52-55
        E1 = _mm_sha1nexte_epu32(E1, MSG1);
        E0 = ABCD;
        MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
        ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2);
        MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
        MSG3 = _mm_xor_si128(MSG3, MSG1);

        // Rounds 56-59
        E0 = _mm_sha1nexte_epu32(E0, MSG2);
        E1 = ABCD;
        MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
        ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
        MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
        MSG0 = _mm_xor_si128(MSG0, MSG2);

        // Rounds 60-63
        E1 = _mm_sha1nexte_epu32(E1, MSG3);
        E0 = ABCD;
        MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
        ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
        MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
        MSG1 = _mm_xor_si128(MSG1, MSG3);

        // Rounds 64-67
        E0 = _mm_sha1nexte_epu32(E0, MSG0);
        E1 = ABCD;
        MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
        ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3);
        MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
        MSG2 = _mm_xor_si128(MSG2, MSG0);

        // Rounds 68-71
        E1 = _mm_sha1nexte_epu32(E1, MSG1);
        E0 = ABCD;
        MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
        ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
        MSG3 = _mm_xor_si128(MSG3, MSG1);

        // Rounds 72-75
        E0 = _mm_sha1nexte_epu32(E0, MSG2);
        E1 = ABCD;
        MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
        ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3);

        // Rounds 76-79
        E1 = _mm_sha1nexte_epu32(E1, MSG3);
        E0 = ABCD;
        ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);

        // Add values back to state
        E0 = _mm_sha1nexte_epu32(E0, E0_SAVE);
        ABCD = _mm_add_epi32(ABCD, ABCD_SAVE);

        data += SHA1::BLOCKSIZE/sizeof(word32);
        length -= SHA1::BLOCKSIZE;
    }

    // Save state
    ABCD = _mm_shuffle_epi32(ABCD, 0x1B);
    _mm_storeu_si128(M128_CAST(state), ABCD);
    state[4] = _mm_extract_epi32(E0, 3);
}
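
// Illustrative sketch, not part of the library: one way a caller could drive the
// routine above over whole 64-byte blocks. The wrapper name is hypothetical; the
// state layout (H0..H4) and the BIG_ENDIAN_ORDER flag for raw big-endian message
// bytes follow the conventions of SHA1_HashMultipleBlocks_SHANI itself.
static inline void SHA1_CompressBlocks_Sketch(word32 state[5], const byte* input, size_t blocks)
{
    // 'length' is a byte count; any partial tail block is left to the caller.
    SHA1_HashMultipleBlocks_SHANI(state, reinterpret_cast<const word32*>(input),
                                  blocks*SHA1::BLOCKSIZE, BIG_ENDIAN_ORDER);
}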
void SHA256_HashMultipleBlocks_SHANI(word32 *state, const word32 *data, size_t length, ByteOrder order)
{
    __m128i STATE0, STATE1;
    __m128i MSG, TMP, MASK;
    __m128i TMSG0, TMSG1, TMSG2, TMSG3;
    __m128i ABEF_SAVE, CDGH_SAVE;

    // Load initial values
    TMP = _mm_loadu_si128(M128_CAST(&state[0]));
    STATE1 = _mm_loadu_si128(M128_CAST(&state[4]));

    // Data arrangement
    MASK = order == BIG_ENDIAN_ORDER ?
        _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3) :
        _mm_set_epi8(15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0) ;

    TMP = _mm_shuffle_epi32(TMP, 0xB1);          // CDAB
    STATE1 = _mm_shuffle_epi32(STATE1, 0x1B);    // EFGH
    STATE0 = _mm_alignr_epi8(TMP, STATE1, 8);    // ABEF
    STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); // CDGH

    while (length >= SHA256::BLOCKSIZE)
    {
        // Save current hash
        ABEF_SAVE = STATE0;
        CDGH_SAVE = STATE1;

        // Rounds 0-3
        MSG = _mm_loadu_si128(CONST_M128_CAST(data+0));
        TMSG0 = _mm_shuffle_epi8(MSG, MASK);
        MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0xE9B5DBA5B5C0FBCF), W64LIT(0x71374491428A2F98)));
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);

        // Rounds 4-7
        TMSG1 = _mm_loadu_si128(CONST_M128_CAST(data+4));
        TMSG1 = _mm_shuffle_epi8(TMSG1, MASK);
        MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0xAB1C5ED5923F82A4), W64LIT(0x59F111F13956C25B)));
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
        TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);

        // Rounds 8-11
        TMSG2 = _mm_loadu_si128(CONST_M128_CAST(data+8));
        TMSG2 = _mm_shuffle_epi8(TMSG2, MASK);
        MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0x550C7DC3243185BE), W64LIT(0x12835B01D807AA98)));
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
        TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);

        // Rounds 12-15
        TMSG3 = _mm_loadu_si128(CONST_M128_CAST(data+12));
        TMSG3 = _mm_shuffle_epi8(TMSG3, MASK);
        MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0xC19BF1749BDC06A7), W64LIT(0x80DEB1FE72BE5D74)));
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
        TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
        TMSG0 = _mm_add_epi32(TMSG0, TMP);
        TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
        TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);

        // Rounds 16-19
        MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0x240CA1CC0FC19DC6), W64LIT(0xEFBE4786E49B69C1)));
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
        TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
        TMSG1 = _mm_add_epi32(TMSG1, TMP);
        TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
        TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);

        // Rounds 20-23
        MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0x76F988DA5CB0A9DC), W64LIT(0x4A7484AA2DE92C6F)));
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
        TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
        TMSG2 = _mm_add_epi32(TMSG2, TMP);
        TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
        TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);

        // Rounds 24-27
        MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0xBF597FC7B00327C8), W64LIT(0xA831C66D983E5152)));
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
        TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
        TMSG3 = _mm_add_epi32(TMSG3, TMP);
        TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
        TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);

        // Rounds 28-31
        MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0x1429296706CA6351), W64LIT(0xD5A79147C6E00BF3)));
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
        TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
        TMSG0 = _mm_add_epi32(TMSG0, TMP);
        TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
        TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);

        // Rounds 32-35
        MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0x53380D134D2C6DFC), W64LIT(0x2E1B213827B70A85)));
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
        TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
        TMSG1 = _mm_add_epi32(TMSG1, TMP);
        TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
        TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);

        // Rounds 36-39
        MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0x92722C8581C2C92E), W64LIT(0x766A0ABB650A7354)));
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
        TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
        TMSG2 = _mm_add_epi32(TMSG2, TMP);
        TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
        TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);

        // Rounds 40-43
        MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0xC76C51A3C24B8B70), W64LIT(0xA81A664BA2BFE8A1)));
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
        TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
        TMSG3 = _mm_add_epi32(TMSG3, TMP);
        TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
        TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);

        // Rounds 44-47
        MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0x106AA070F40E3585), W64LIT(0xD6990624D192E819)));
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
        TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
        TMSG0 = _mm_add_epi32(TMSG0, TMP);
        TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
        TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);

        // Rounds 48-51
        MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0x34B0BCB52748774C), W64LIT(0x1E376C0819A4C116)));
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
        TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
        TMSG1 = _mm_add_epi32(TMSG1, TMP);
        TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
        TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);

        // Rounds 52-55
        MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0x682E6FF35B9CCA4F), W64LIT(0x4ED8AA4A391C0CB3)));
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
        TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
        TMSG2 = _mm_add_epi32(TMSG2, TMP);
        TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);

        // Rounds 56-59
        MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0x8CC7020884C87814), W64LIT(0x78A5636F748F82EE)));
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
        TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
        TMSG3 = _mm_add_epi32(TMSG3, TMP);
        TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);

        // Rounds 60-63
        MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0xC67178F2BEF9A3F7), W64LIT(0xA4506CEB90BEFFFA)));
        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
        MSG = _mm_shuffle_epi32(MSG, 0x0E);
        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);

        // Add values back to state
        STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE);
        STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE);

        data += SHA256::BLOCKSIZE/sizeof(word32);
        length -= SHA256::BLOCKSIZE;
    }

    TMP = _mm_shuffle_epi32(STATE0, 0x1B);       // FEBA
    STATE1 = _mm_shuffle_epi32(STATE1, 0xB1);    // DCHG
    STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); // DCBA
    STATE1 = _mm_alignr_epi8(STATE1, TMP, 8);    // HGFE

    // Save state
    _mm_storeu_si128(M128_CAST(&state[0]), STATE0);
    _mm_storeu_si128(M128_CAST(&state[4]), STATE1);
}
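
// Illustrative sketch, not part of the library: the SHA-NI round instructions work
// on the state packed as ABEF and CDGH rather than ABCD/EFGH. The helper below
// (hypothetical name) repeats the prologue shuffle of the function above in
// isolation, which can be handy when checking the lane mapping in a debugger.
static inline void SHA256_LoadStateABEF_Sketch(const word32 state[8], __m128i& ABEF, __m128i& CDGH)
{
    __m128i TMP    = _mm_loadu_si128(CONST_M128_CAST(&state[0]));  // DCBA
    __m128i STATE1 = _mm_loadu_si128(CONST_M128_CAST(&state[4]));  // HGFE

    TMP    = _mm_shuffle_epi32(TMP, 0xB1);       // CDAB
    STATE1 = _mm_shuffle_epi32(STATE1, 0x1B);    // EFGH
    ABEF   = _mm_alignr_epi8(TMP, STATE1, 8);    // ABEF
    CDGH   = _mm_blend_epi16(STATE1, TMP, 0xF0); // CDGH
}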
#endif  // CRYPTOPP_SHANI_AVAILABLE

#if CRYPTOPP_ARM_SHA_AVAILABLE
void SHA1_HashMultipleBlocks_ARMV8(word32 *state, const word32 *data, size_t length, ByteOrder order)
{
    uint32x4_t C0, C1, C2, C3;
    uint32x4_t ABCD, ABCD_SAVED;
    uint32x4_t MSG0, MSG1, MSG2, MSG3;
    uint32x4_t TMP0, TMP1;
    uint32_t   E0, E0_SAVED, E1;

    // Load initial values
    C0 = vdupq_n_u32(0x5A827999);
    C1 = vdupq_n_u32(0x6ED9EBA1);
    C2 = vdupq_n_u32(0x8F1BBCDC);
    C3 = vdupq_n_u32(0xCA62C1D6);

    ABCD = vld1q_u32(&state[0]);
    E0 = state[4];

    while (length >= SHA1::BLOCKSIZE)
    {
        // Save current hash
        ABCD_SAVED = ABCD;
        E0_SAVED = E0;

        MSG0 = vld1q_u32(data +  0);
        MSG1 = vld1q_u32(data +  4);
        MSG2 = vld1q_u32(data +  8);
        MSG3 = vld1q_u32(data + 12);

        if (order == BIG_ENDIAN_ORDER)  // Data arrangement
        {
            MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG0)));
            MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG1)));
            MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG2)));
            MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG3)));
        }

        TMP0 = vaddq_u32(MSG0, C0);
        TMP1 = vaddq_u32(MSG1, C0);

        // Rounds 0-3
        E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
        ABCD = vsha1cq_u32(ABCD, E0, TMP0);
        TMP0 = vaddq_u32(MSG2, C0);
        MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);

        // Rounds 4-7
        E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
        ABCD = vsha1cq_u32(ABCD, E1, TMP1);
        TMP1 = vaddq_u32(MSG3, C0);
        MSG0 = vsha1su1q_u32(MSG0, MSG3);
        MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);

        // Rounds 8-11
        E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
        ABCD = vsha1cq_u32(ABCD, E0, TMP0);
        TMP0 = vaddq_u32(MSG0, C0);
        MSG1 = vsha1su1q_u32(MSG1, MSG0);
        MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);

        // Rounds 12-15
        E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
        ABCD = vsha1cq_u32(ABCD, E1, TMP1);
        TMP1 = vaddq_u32(MSG1, C1);
        MSG2 = vsha1su1q_u32(MSG2, MSG1);
        MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);

        // Rounds 16-19
        E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
        ABCD = vsha1cq_u32(ABCD, E0, TMP0);
        TMP0 = vaddq_u32(MSG2, C1);
        MSG3 = vsha1su1q_u32(MSG3, MSG2);
        MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);

        // Rounds 20-23
        E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
        ABCD = vsha1pq_u32(ABCD, E1, TMP1);
        TMP1 = vaddq_u32(MSG3, C1);
        MSG0 = vsha1su1q_u32(MSG0, MSG3);
        MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);

        // Rounds 24-27
        E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
        ABCD = vsha1pq_u32(ABCD, E0, TMP0);
        TMP0 = vaddq_u32(MSG0, C1);
        MSG1 = vsha1su1q_u32(MSG1, MSG0);
        MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);

        // Rounds 28-31
        E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
        ABCD = vsha1pq_u32(ABCD, E1, TMP1);
        TMP1 = vaddq_u32(MSG1, C1);
        MSG2 = vsha1su1q_u32(MSG2, MSG1);
        MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);

        // Rounds 32-35
        E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
        ABCD = vsha1pq_u32(ABCD, E0, TMP0);
        TMP0 = vaddq_u32(MSG2, C2);
        MSG3 = vsha1su1q_u32(MSG3, MSG2);
        MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);

        // Rounds 36-39
        E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
        ABCD = vsha1pq_u32(ABCD, E1, TMP1);
        TMP1 = vaddq_u32(MSG3, C2);
        MSG0 = vsha1su1q_u32(MSG0, MSG3);
        MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);

        // Rounds 40-43
        E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
        ABCD = vsha1mq_u32(ABCD, E0, TMP0);
        TMP0 = vaddq_u32(MSG0, C2);
        MSG1 = vsha1su1q_u32(MSG1, MSG0);
        MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);

        // Rounds 44-47
        E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
        ABCD = vsha1mq_u32(ABCD, E1, TMP1);
        TMP1 = vaddq_u32(MSG1, C2);
        MSG2 = vsha1su1q_u32(MSG2, MSG1);
        MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);

        // Rounds 48-51
        E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
        ABCD = vsha1mq_u32(ABCD, E0, TMP0);
        TMP0 = vaddq_u32(MSG2, C2);
        MSG3 = vsha1su1q_u32(MSG3, MSG2);
        MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);

        // Rounds 52-55
        E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
        ABCD = vsha1mq_u32(ABCD, E1, TMP1);
        TMP1 = vaddq_u32(MSG3, C3);
        MSG0 = vsha1su1q_u32(MSG0, MSG3);
        MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);

        // Rounds 56-59
        E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
        ABCD = vsha1mq_u32(ABCD, E0, TMP0);
        TMP0 = vaddq_u32(MSG0, C3);
        MSG1 = vsha1su1q_u32(MSG1, MSG0);
        MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);

        // Rounds 60-63
        E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
        ABCD = vsha1pq_u32(ABCD, E1, TMP1);
        TMP1 = vaddq_u32(MSG1, C3);
        MSG2 = vsha1su1q_u32(MSG2, MSG1);
        MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);

        // Rounds 64-67
        E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
        ABCD = vsha1pq_u32(ABCD, E0, TMP0);
        TMP0 = vaddq_u32(MSG2, C3);
        MSG3 = vsha1su1q_u32(MSG3, MSG2);
        MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);

        // Rounds 68-71
        E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
        ABCD = vsha1pq_u32(ABCD, E1, TMP1);
        TMP1 = vaddq_u32(MSG3, C3);
        MSG0 = vsha1su1q_u32(MSG0, MSG3);

        // Rounds 72-75
        E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
        ABCD = vsha1pq_u32(ABCD, E0, TMP0);

        // Rounds 76-79
        E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
        ABCD = vsha1pq_u32(ABCD, E1, TMP1);

        // Add values back to state
        E0 += E0_SAVED;
        ABCD = vaddq_u32(ABCD_SAVED, ABCD);

        data += SHA1::BLOCKSIZE/sizeof(word32);
        length -= SHA1::BLOCKSIZE;
    }

    // Save state
    vst1q_u32(&state[0], ABCD);
    state[4] = E0;
}
void SHA256_HashMultipleBlocks_ARMV8(word32 *state, const word32 *data, size_t length, ByteOrder order)
{
    uint32x4_t STATE0, STATE1, ABEF_SAVE, CDGH_SAVE;
    uint32x4_t MSG0, MSG1, MSG2, MSG3;
    uint32x4_t TMP0, TMP1, TMP2;

    // Load initial values
    STATE0 = vld1q_u32(&state[0]);
    STATE1 = vld1q_u32(&state[4]);

    while (length >= SHA256::BLOCKSIZE)
    {
        // Save current hash
        ABEF_SAVE = STATE0;
        CDGH_SAVE = STATE1;

        MSG0 = vld1q_u32(data +  0);
        MSG1 = vld1q_u32(data +  4);
        MSG2 = vld1q_u32(data +  8);
        MSG3 = vld1q_u32(data + 12);

        if (order == BIG_ENDIAN_ORDER)  // Data arrangement
        {
            MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG0)));
            MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG1)));
            MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG2)));
            MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG3)));
        }

        TMP0 = vaddq_u32(MSG0, vld1q_u32(&SHA256_K[0x00]));

        // Rounds 0-3
        MSG0 = vsha256su0q_u32(MSG0, MSG1);
        TMP2 = STATE0;
        TMP1 = vaddq_u32(MSG1, vld1q_u32(&SHA256_K[0x04]));
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);

        // Rounds 4-7
        MSG1 = vsha256su0q_u32(MSG1, MSG2);
        TMP2 = STATE0;
        TMP0 = vaddq_u32(MSG2, vld1q_u32(&SHA256_K[0x08]));
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
        MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);

        // Rounds 8-11
        MSG2 = vsha256su0q_u32(MSG2, MSG3);
        TMP2 = STATE0;
        TMP1 = vaddq_u32(MSG3, vld1q_u32(&SHA256_K[0x0c]));
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);

        // Rounds 12-15
        MSG3 = vsha256su0q_u32(MSG3, MSG0);
        TMP2 = STATE0;
        TMP0 = vaddq_u32(MSG0, vld1q_u32(&SHA256_K[0x10]));
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
        MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);

        // Rounds 16-19
        MSG0 = vsha256su0q_u32(MSG0, MSG1);
        TMP2 = STATE0;
        TMP1 = vaddq_u32(MSG1, vld1q_u32(&SHA256_K[0x14]));
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);

        // Rounds 20-23
        MSG1 = vsha256su0q_u32(MSG1, MSG2);
        TMP2 = STATE0;
        TMP0 = vaddq_u32(MSG2, vld1q_u32(&SHA256_K[0x18]));
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
        MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);

        // Rounds 24-27
        MSG2 = vsha256su0q_u32(MSG2, MSG3);
        TMP2 = STATE0;
        TMP1 = vaddq_u32(MSG3, vld1q_u32(&SHA256_K[0x1c]));
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);

        // Rounds 28-31
        MSG3 = vsha256su0q_u32(MSG3, MSG0);
        TMP2 = STATE0;
        TMP0 = vaddq_u32(MSG0, vld1q_u32(&SHA256_K[0x20]));
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
        MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);

        // Rounds 32-35
        MSG0 = vsha256su0q_u32(MSG0, MSG1);
        TMP2 = STATE0;
        TMP1 = vaddq_u32(MSG1, vld1q_u32(&SHA256_K[0x24]));
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);

        // Rounds 36-39
        MSG1 = vsha256su0q_u32(MSG1, MSG2);
        TMP2 = STATE0;
        TMP0 = vaddq_u32(MSG2, vld1q_u32(&SHA256_K[0x28]));
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
        MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);

        // Rounds 40-43
        MSG2 = vsha256su0q_u32(MSG2, MSG3);
        TMP2 = STATE0;
        TMP1 = vaddq_u32(MSG3, vld1q_u32(&SHA256_K[0x2c]));
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);

        // Rounds 44-47
        MSG3 = vsha256su0q_u32(MSG3, MSG0);
        TMP2 = STATE0;
        TMP0 = vaddq_u32(MSG0, vld1q_u32(&SHA256_K[0x30]));
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
        MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);

        // Rounds 48-51
        TMP2 = STATE0;
        TMP1 = vaddq_u32(MSG1, vld1q_u32(&SHA256_K[0x34]));
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);

        // Rounds 52-55
        TMP2 = STATE0;
        TMP0 = vaddq_u32(MSG2, vld1q_u32(&SHA256_K[0x38]));
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);

        // Rounds 56-59
        TMP2 = STATE0;
        TMP1 = vaddq_u32(MSG3, vld1q_u32(&SHA256_K[0x3c]));
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);

        // Rounds 60-63
        TMP2 = STATE0;
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);

        // Add values back to state
        STATE0 = vaddq_u32(STATE0, ABEF_SAVE);
        STATE1 = vaddq_u32(STATE1, CDGH_SAVE);

        data += SHA256::BLOCKSIZE/sizeof(word32);
        length -= SHA256::BLOCKSIZE;
    }

    // Save state
    vst1q_u32(&state[0], STATE0);
    vst1q_u32(&state[4], STATE1);
}
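
// Illustrative sketch, not part of the library: on a little-endian ARM core the
// BIG_ENDIAN_ORDER branch above is equivalent to loading raw message bytes and
// byte-reversing each 32-bit lane, as shown by the hypothetical helper below.
// vld1q_u8 has no alignment requirement.
static inline uint32x4_t SHA_LoadBE32x4_Sketch(const byte* p)
{
    return vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(p)));
}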
#endif  // CRYPTOPP_ARM_SHA_AVAILABLE

#if CRYPTOPP_POWER8_SHA_AVAILABLE

// Indexes into the S[] array
enum {A=0, B=1, C, D, E, F, G, H};

typedef __vector unsigned char      uint8x16_p8;
typedef __vector unsigned int       uint32x4_p8;
typedef __vector unsigned long long uint64x2_p8;

uint32x4_p8 VEC_XL_BE(int offset, const uint8_t* data)
{
#if defined(CRYPTOPP_XLC_VERSION)
    return vec_xl_be(offset, data);
#else
    uint32x4_p8 res;
    __asm(" lxvd2x %x0, %1, %2    \n\t"
          : "=wa" (res)
          : "b" (data), "r" (offset));
    return res;
#endif
}
#endif  // CRYPTOPP_POWER8_SHA_AVAILABLE

#if CRYPTOPP_POWER8_SHA_AVAILABLE

// Aligned load
template <class T> static inline
uint32x4_p8 VectorLoad32x4(const T* data, int offset)
{
    return (uint32x4_p8)vec_ld(offset, data);
}

// Unaligned load
template <class T> static inline
uint32x4_p8 VectorLoad32x4u(const T* data, int offset)
{
#if defined(CRYPTOPP_XLC_VERSION)
    return (uint32x4_p8)vec_xl(offset, data);
#else
    return (uint32x4_p8)vec_vsx_ld(offset, data);
#endif
}

// Aligned store
template <class T> static inline
void VectorStore32x4(const uint32x4_p8 val, T* data, int offset)
{
    vec_st((uint8x16_p8)val, offset, data);
}

// Unaligned store
template <class T> static inline
void VectorStore32x4u(const uint32x4_p8 val, T* data, int offset)
{
#if defined(CRYPTOPP_XLC_VERSION)
    vec_xst((uint8x16_p8)val, offset, (uint8_t*)data);
#else
    vec_vsx_st((uint8x16_p8)val, offset, (uint8_t*)data);
#endif
}

// Unaligned load of a big-endian message word group
template <class T> static inline
uint32x4_p8 VectorLoadMsg32x4(const T* data, int offset)
{
#if defined(CRYPTOPP_LITTLE_ENDIAN)
    const uint8x16_p8 mask = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};
    const uint32x4_p8 r = VectorLoad32x4u(data, offset);
    return (uint32x4_p8)vec_perm(r, r, mask);
#else
    return VectorLoad32x4u(data, offset);
#endif
}

static inline
uint32x4_p8 VectorCh(const uint32x4_p8 x, const uint32x4_p8 y, const uint32x4_p8 z)
{
    return vec_sel(z,y,x);
}

static inline
uint32x4_p8 VectorMaj(const uint32x4_p8 x, const uint32x4_p8 y, const uint32x4_p8 z)
{
    return vec_sel(y, z, vec_xor(x, y));
}

static inline
uint32x4_p8 Vector_sigma0(const uint32x4_p8 val)
{
#if defined(CRYPTOPP_XLC_VERSION)
    return __vshasigmaw(val, 0, 0);
#else
    return __builtin_crypto_vshasigmaw(val, 0, 0);
#endif
}

static inline
uint32x4_p8 Vector_sigma1(const uint32x4_p8 val)
{
#if defined(CRYPTOPP_XLC_VERSION)
    return __vshasigmaw(val, 0, 0xf);
#else
    return __builtin_crypto_vshasigmaw(val, 0, 0xf);
#endif
}

static inline
uint32x4_p8 VectorSigma0(const uint32x4_p8 val)
{
#if defined(CRYPTOPP_XLC_VERSION)
    return __vshasigmaw(val, 1, 0);
#else
    return __builtin_crypto_vshasigmaw(val, 1, 0);
#endif
}

static inline
uint32x4_p8 VectorSigma1(const uint32x4_p8 val)
{
#if defined(CRYPTOPP_XLC_VERSION)
    return __vshasigmaw(val, 1, 0xf);
#else
    return __builtin_crypto_vshasigmaw(val, 1, 0xf);
#endif
}

static inline
uint32x4_p8 VectorPack(const uint32x4_p8 a, const uint32x4_p8 b,
                       const uint32x4_p8 c, const uint32x4_p8 d)
{
    const uint8x16_p8 m1 = {0,1,2,3, 16,17,18,19, 0,0,0,0, 0,0,0,0};
    const uint8x16_p8 m2 = {0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
    return vec_perm(vec_perm(a,b,m1), vec_perm(c,d,m1), m2);
}
template <unsigned int L> static inline
uint32x4_p8 VectorShiftLeft(const uint32x4_p8 val)
{
#if (defined(CRYPTOPP_LITTLE_ENDIAN))
    return (uint32x4_p8)vec_sld((uint8x16_p8)val, (uint8x16_p8)val, (16-L)&0xf);
#else
    return (uint32x4_p8)vec_sld((uint8x16_p8)val, (uint8x16_p8)val, L&0xf);
#endif
}

template <>
uint32x4_p8 VectorShiftLeft<0>(const uint32x4_p8 val) { return val; }

template <>
uint32x4_p8 VectorShiftLeft<16>(const uint32x4_p8 val) { return val; }

template <unsigned int R> static inline
void SHA256_ROUND1(uint32x4_p8 W[16], uint32x4_p8 S[8], const uint32x4_p8 K, const uint32x4_p8 M)
{
    uint32x4_p8 T1, T2;

    W[R] = M;
    T1 = S[H] + VectorSigma1(S[E]) + VectorCh(S[E],S[F],S[G]) + K + M;
    T2 = VectorSigma0(S[A]) + VectorMaj(S[A],S[B],S[C]);

    S[H] = S[G]; S[G] = S[F]; S[F] = S[E];
    S[E] = S[D] + T1;
    S[D] = S[C]; S[C] = S[B]; S[B] = S[A];
    S[A] = T1 + T2;
}

template <unsigned int R> static inline
void SHA256_ROUND2(uint32x4_p8 W[16], uint32x4_p8 S[8], const uint32x4_p8 K)
{
    // Indexes into the 16-entry circular message schedule
    enum {IDX0=(R+0)&0xf, IDX1=(R+1)&0xf, IDX9=(R+9)&0xf, IDX14=(R+14)&0xf};

    const uint32x4_p8 s0 = Vector_sigma0(W[IDX1]);
    const uint32x4_p8 s1 = Vector_sigma1(W[IDX14]);

    uint32x4_p8 T1 = (W[IDX0] += s0 + s1 + W[IDX9]);
    T1 += S[H] + VectorSigma1(S[E]) + VectorCh(S[E],S[F],S[G]) + K;
    uint32x4_p8 T2 = VectorSigma0(S[A]) + VectorMaj(S[A],S[B],S[C]);

    S[H] = S[G]; S[G] = S[F]; S[F] = S[E];
    S[E] = S[D] + T1;
    S[D] = S[C]; S[C] = S[B]; S[B] = S[A];
    S[A] = T1 + T2;
}
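
// Illustrative sketch, not part of the library: SHA256_ROUND1/SHA256_ROUND2 above
// are a transcription of the scalar FIPS 180-4 round with each working variable
// kept in its own vector register. A plain word32 version of the same step, useful
// as a cross-check, is shown below; both helper names are hypothetical.
static inline word32 SHA256_Rotr_Sketch(word32 x, unsigned int n)
{
    return (x >> n) | (x << (32 - n));
}

static inline void SHA256_ScalarRound_Sketch(word32 S[8], word32 k, word32 w)
{
    const word32 a = S[0], e = S[4];
    const word32 Sig1 = SHA256_Rotr_Sketch(e,6) ^ SHA256_Rotr_Sketch(e,11) ^ SHA256_Rotr_Sketch(e,25);
    const word32 Sig0 = SHA256_Rotr_Sketch(a,2) ^ SHA256_Rotr_Sketch(a,13) ^ SHA256_Rotr_Sketch(a,22);
    const word32 Ch   = (e & S[5]) ^ (~e & S[6]);
    const word32 Maj  = (a & S[1]) ^ (a & S[2]) ^ (S[1] & S[2]);
    const word32 T1   = S[7] + Sig1 + Ch + k + w;
    const word32 T2   = Sig0 + Maj;

    S[7] = S[6]; S[6] = S[5]; S[5] = S[4]; S[4] = S[3] + T1;
    S[3] = S[2]; S[2] = S[1]; S[1] = S[0]; S[0] = T1 + T2;
}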
void SHA256_HashMultipleBlocks_POWER8(word32 *state, const word32 *data, size_t length, ByteOrder order)
{
    CRYPTOPP_UNUSED(order);

    const uint32_t* k = reinterpret_cast<const uint32_t*>(SHA256_K);
    const uint32_t* m = reinterpret_cast<const uint32_t*>(data);

    uint32x4_p8 abcd = VectorLoad32x4u(state+0, 0);
    uint32x4_p8 efgh = VectorLoad32x4u(state+4, 0);
    uint32x4_p8 W[16], S[8], vm, vk;

    size_t blocks = length / SHA256::BLOCKSIZE;
    while (blocks--)
    {
        unsigned int i, offset=0;

        S[A] = abcd; S[E] = efgh;
        S[B] = VectorShiftLeft<4>(S[A]);
        S[F] = VectorShiftLeft<4>(S[E]);
        S[C] = VectorShiftLeft<4>(S[B]);
        S[G] = VectorShiftLeft<4>(S[F]);
        S[D] = VectorShiftLeft<4>(S[C]);
        S[H] = VectorShiftLeft<4>(S[G]);

        // Rounds 0-15
        vk = VectorLoad32x4(k, offset);
        vm = VectorLoadMsg32x4(m, offset);
        SHA256_ROUND1<0>(W,S, vk,vm);
        offset+=16;

        vk = VectorShiftLeft<4>(vk);
        vm = VectorShiftLeft<4>(vm);
        SHA256_ROUND1<1>(W,S, vk,vm);

        vk = VectorShiftLeft<4>(vk);
        vm = VectorShiftLeft<4>(vm);
        SHA256_ROUND1<2>(W,S, vk,vm);

        vk = VectorShiftLeft<4>(vk);
        vm = VectorShiftLeft<4>(vm);
        SHA256_ROUND1<3>(W,S, vk,vm);

        vk = VectorLoad32x4(k, offset);
        vm = VectorLoadMsg32x4(m, offset);
        SHA256_ROUND1<4>(W,S, vk,vm);
        offset+=16;

        vk = VectorShiftLeft<4>(vk);
        vm = VectorShiftLeft<4>(vm);
        SHA256_ROUND1<5>(W,S, vk,vm);

        vk = VectorShiftLeft<4>(vk);
        vm = VectorShiftLeft<4>(vm);
        SHA256_ROUND1<6>(W,S, vk,vm);

        vk = VectorShiftLeft<4>(vk);
        vm = VectorShiftLeft<4>(vm);
        SHA256_ROUND1<7>(W,S, vk,vm);

        vk = VectorLoad32x4(k, offset);
        vm = VectorLoadMsg32x4(m, offset);
        SHA256_ROUND1<8>(W,S, vk,vm);
        offset+=16;

        vk = VectorShiftLeft<4>(vk);
        vm = VectorShiftLeft<4>(vm);
        SHA256_ROUND1<9>(W,S, vk,vm);

        vk = VectorShiftLeft<4>(vk);
        vm = VectorShiftLeft<4>(vm);
        SHA256_ROUND1<10>(W,S, vk,vm);

        vk = VectorShiftLeft<4>(vk);
        vm = VectorShiftLeft<4>(vm);
        SHA256_ROUND1<11>(W,S, vk,vm);

        vk = VectorLoad32x4(k, offset);
        vm = VectorLoadMsg32x4(m, offset);
        SHA256_ROUND1<12>(W,S, vk,vm);
        offset+=16;

        vk = VectorShiftLeft<4>(vk);
        vm = VectorShiftLeft<4>(vm);
        SHA256_ROUND1<13>(W,S, vk,vm);

        vk = VectorShiftLeft<4>(vk);
        vm = VectorShiftLeft<4>(vm);
        SHA256_ROUND1<14>(W,S, vk,vm);

        vk = VectorShiftLeft<4>(vk);
        vm = VectorShiftLeft<4>(vm);
        SHA256_ROUND1<15>(W,S, vk,vm);

        // Rounds 16-63
        for (i=16; i<64; i+=16)
        {
            vk = VectorLoad32x4(k, offset);
            SHA256_ROUND2<0>(W,S, vk);
            SHA256_ROUND2<1>(W,S, VectorShiftLeft<4>(vk));
            SHA256_ROUND2<2>(W,S, VectorShiftLeft<8>(vk));
            SHA256_ROUND2<3>(W,S, VectorShiftLeft<12>(vk));
            offset+=16;

            vk = VectorLoad32x4(k, offset);
            SHA256_ROUND2<4>(W,S, vk);
            SHA256_ROUND2<5>(W,S, VectorShiftLeft<4>(vk));
            SHA256_ROUND2<6>(W,S, VectorShiftLeft<8>(vk));
            SHA256_ROUND2<7>(W,S, VectorShiftLeft<12>(vk));
            offset+=16;

            vk = VectorLoad32x4(k, offset);
            SHA256_ROUND2<8>(W,S, vk);
            SHA256_ROUND2<9>(W,S, VectorShiftLeft<4>(vk));
            SHA256_ROUND2<10>(W,S, VectorShiftLeft<8>(vk));
            SHA256_ROUND2<11>(W,S, VectorShiftLeft<12>(vk));
            offset+=16;

            vk = VectorLoad32x4(k, offset);
            SHA256_ROUND2<12>(W,S, vk);
            SHA256_ROUND2<13>(W,S, VectorShiftLeft<4>(vk));
            SHA256_ROUND2<14>(W,S, VectorShiftLeft<8>(vk));
            SHA256_ROUND2<15>(W,S, VectorShiftLeft<12>(vk));
            offset+=16;
        }

        abcd += VectorPack(S[A],S[B],S[C],S[D]);
        efgh += VectorPack(S[E],S[F],S[G],S[H]);
        m += 16;
    }

    VectorStore32x4u(abcd, state+0, 0);
    VectorStore32x4u(efgh, state+4, 0);
}
static inline
uint64x2_p8 VectorPermute64x2(const uint64x2_p8 val, const uint8x16_p8 mask)
{
    return (uint64x2_p8)vec_perm(val, val, mask);
}

// Aligned load
template <class T> static inline
uint64x2_p8 VectorLoad64x2(const T* data, int offset)
{
    return (uint64x2_p8)vec_ld(offset, (const uint8_t*)data);
}

// Unaligned load
template <class T> static inline
uint64x2_p8 VectorLoad64x2u(const T* data, int offset)
{
#if defined(CRYPTOPP_XLC_VERSION)
    return (uint64x2_p8)vec_xl(offset, (const uint8_t*)data);
#else
    return (uint64x2_p8)vec_vsx_ld(offset, (const uint8_t*)data);
#endif
}

// Aligned store
template <class T> static inline
void VectorStore64x2(const uint64x2_p8 val, T* data, int offset)
{
    vec_st((uint8x16_p8)val, offset, (uint8_t*)data);
}

// Unaligned store
template <class T> static inline
void VectorStore64x2u(const uint64x2_p8 val, T* data, int offset)
{
#if defined(CRYPTOPP_XLC_VERSION)
    vec_xst((uint8x16_p8)val, offset, (uint8_t*)data);
#else
    vec_vsx_st((uint8x16_p8)val, offset, (uint8_t*)data);
#endif
}

// Unaligned load of a big-endian message word group
template <class T> static inline
uint64x2_p8 VectorLoadMsg64x2(const T* data, int offset)
{
#if defined(CRYPTOPP_LITTLE_ENDIAN)
    const uint8x16_p8 mask = {0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15};
    return VectorPermute64x2(VectorLoad64x2u(data, offset), mask);
#else
    return VectorLoad64x2u(data, offset);
#endif
}

static inline
uint64x2_p8 VectorCh(const uint64x2_p8 x, const uint64x2_p8 y, const uint64x2_p8 z)
{
    return vec_sel(z,y,x);
}

static inline
uint64x2_p8 VectorMaj(const uint64x2_p8 x, const uint64x2_p8 y, const uint64x2_p8 z)
{
    return vec_sel(y, z, vec_xor(x, y));
}

static inline
uint64x2_p8 Vector_sigma0(const uint64x2_p8 val)
{
#if defined(CRYPTOPP_XLC_VERSION)
    return __vshasigmad(val, 0, 0);
#else
    return __builtin_crypto_vshasigmad(val, 0, 0);
#endif
}

static inline
uint64x2_p8 Vector_sigma1(const uint64x2_p8 val)
{
#if defined(CRYPTOPP_XLC_VERSION)
    return __vshasigmad(val, 0, 0xf);
#else
    return __builtin_crypto_vshasigmad(val, 0, 0xf);
#endif
}

static inline
uint64x2_p8 VectorSigma0(const uint64x2_p8 val)
{
#if defined(CRYPTOPP_XLC_VERSION)
    return __vshasigmad(val, 1, 0);
#else
    return __builtin_crypto_vshasigmad(val, 1, 0);
#endif
}

static inline
uint64x2_p8 VectorSigma1(const uint64x2_p8 val)
{
#if defined(CRYPTOPP_XLC_VERSION)
    return __vshasigmad(val, 1, 0xf);
#else
    return __builtin_crypto_vshasigmad(val, 1, 0xf);
#endif
}

static inline
uint64x2_p8 VectorPack(const uint64x2_p8 x, const uint64x2_p8 y)
{
    const uint8x16_p8 m = {0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
    return vec_perm(x,y,m);
}

template <unsigned int L> static inline
uint64x2_p8 VectorShiftLeft(const uint64x2_p8 val)
{
#if (defined(CRYPTOPP_LITTLE_ENDIAN))
    return (uint64x2_p8)vec_sld((uint8x16_p8)val, (uint8x16_p8)val, (16-L)&0xf);
#else
    return (uint64x2_p8)vec_sld((uint8x16_p8)val, (uint8x16_p8)val, L&0xf);
#endif
}

template <>
uint64x2_p8 VectorShiftLeft<0>(const uint64x2_p8 val) { return val; }

template <>
uint64x2_p8 VectorShiftLeft<16>(const uint64x2_p8 val) { return val; }
template <unsigned int R> static inline
void SHA512_ROUND1(uint64x2_p8 W[16], uint64x2_p8 S[8], const uint64x2_p8 K, const uint64x2_p8 M)
{
    uint64x2_p8 T1, T2;

    W[R] = M;
    T1 = S[H] + VectorSigma1(S[E]) + VectorCh(S[E],S[F],S[G]) + K + M;
    T2 = VectorSigma0(S[A]) + VectorMaj(S[A],S[B],S[C]);

    S[H] = S[G]; S[G] = S[F]; S[F] = S[E];
    S[E] = S[D] + T1;
    S[D] = S[C]; S[C] = S[B]; S[B] = S[A];
    S[A] = T1 + T2;
}

template <unsigned int R> static inline
void SHA512_ROUND2(uint64x2_p8 W[16], uint64x2_p8 S[8], const uint64x2_p8 K)
{
    // Indexes into the 16-entry circular message schedule
    enum {IDX0=(R+0)&0xf, IDX1=(R+1)&0xf, IDX9=(R+9)&0xf, IDX14=(R+14)&0xf};

    const uint64x2_p8 s0 = Vector_sigma0(W[IDX1]);
    const uint64x2_p8 s1 = Vector_sigma1(W[IDX14]);

    uint64x2_p8 T1 = (W[IDX0] += s0 + s1 + W[IDX9]);
    T1 += S[H] + VectorSigma1(S[E]) + VectorCh(S[E],S[F],S[G]) + K;
    uint64x2_p8 T2 = VectorSigma0(S[A]) + VectorMaj(S[A],S[B],S[C]);

    S[H] = S[G]; S[G] = S[F]; S[F] = S[E];
    S[E] = S[D] + T1;
    S[D] = S[C]; S[C] = S[B]; S[B] = S[A];
    S[A] = T1 + T2;
}
void SHA512_HashMultipleBlocks_POWER8(word64 *state, const word64 *data, size_t length, ByteOrder order)
{
    CRYPTOPP_UNUSED(order);

    const uint64_t* k = reinterpret_cast<const uint64_t*>(SHA512_K);
    const uint64_t* m = reinterpret_cast<const uint64_t*>(data);

    uint64x2_p8 ab = VectorLoad64x2u(state+0, 0);
    uint64x2_p8 cd = VectorLoad64x2u(state+2, 0);
    uint64x2_p8 ef = VectorLoad64x2u(state+4, 0);
    uint64x2_p8 gh = VectorLoad64x2u(state+6, 0);
    uint64x2_p8 W[16], S[8], vm, vk;

    size_t blocks = length / SHA512::BLOCKSIZE;
    while (blocks--)
    {
        unsigned int i, offset=0;

        S[A] = ab; S[C] = cd;
        S[E] = ef; S[G] = gh;
        S[B] = VectorShiftLeft<8>(S[A]);
        S[D] = VectorShiftLeft<8>(S[C]);
        S[F] = VectorShiftLeft<8>(S[E]);
        S[H] = VectorShiftLeft<8>(S[G]);

        // Rounds 0-15
        vk = VectorLoad64x2(k, offset);
        vm = VectorLoadMsg64x2(m, offset);
        SHA512_ROUND1<0>(W,S, vk,vm);
        offset+=16;

        vk = VectorShiftLeft<8>(vk);
        vm = VectorShiftLeft<8>(vm);
        SHA512_ROUND1<1>(W,S, vk,vm);

        vk = VectorLoad64x2(k, offset);
        vm = VectorLoadMsg64x2(m, offset);
        SHA512_ROUND1<2>(W,S, vk,vm);
        offset+=16;

        vk = VectorShiftLeft<8>(vk);
        vm = VectorShiftLeft<8>(vm);
        SHA512_ROUND1<3>(W,S, vk,vm);

        vk = VectorLoad64x2(k, offset);
        vm = VectorLoadMsg64x2(m, offset);
        SHA512_ROUND1<4>(W,S, vk,vm);
        offset+=16;

        vk = VectorShiftLeft<8>(vk);
        vm = VectorShiftLeft<8>(vm);
        SHA512_ROUND1<5>(W,S, vk,vm);

        vk = VectorLoad64x2(k, offset);
        vm = VectorLoadMsg64x2(m, offset);
        SHA512_ROUND1<6>(W,S, vk,vm);
        offset+=16;

        vk = VectorShiftLeft<8>(vk);
        vm = VectorShiftLeft<8>(vm);
        SHA512_ROUND1<7>(W,S, vk,vm);

        vk = VectorLoad64x2(k, offset);
        vm = VectorLoadMsg64x2(m, offset);
        SHA512_ROUND1<8>(W,S, vk,vm);
        offset+=16;

        vk = VectorShiftLeft<8>(vk);
        vm = VectorShiftLeft<8>(vm);
        SHA512_ROUND1<9>(W,S, vk,vm);

        vk = VectorLoad64x2(k, offset);
        vm = VectorLoadMsg64x2(m, offset);
        SHA512_ROUND1<10>(W,S, vk,vm);
        offset+=16;

        vk = VectorShiftLeft<8>(vk);
        vm = VectorShiftLeft<8>(vm);
        SHA512_ROUND1<11>(W,S, vk,vm);

        vk = VectorLoad64x2(k, offset);
        vm = VectorLoadMsg64x2(m, offset);
        SHA512_ROUND1<12>(W,S, vk,vm);
        offset+=16;

        vk = VectorShiftLeft<8>(vk);
        vm = VectorShiftLeft<8>(vm);
        SHA512_ROUND1<13>(W,S, vk,vm);

        vk = VectorLoad64x2(k, offset);
        vm = VectorLoadMsg64x2(m, offset);
        SHA512_ROUND1<14>(W,S, vk,vm);
        offset+=16;

        vk = VectorShiftLeft<8>(vk);
        vm = VectorShiftLeft<8>(vm);
        SHA512_ROUND1<15>(W,S, vk,vm);

        // Rounds 16-79
        for (i=16; i<80; i+=16)
        {
            vk = VectorLoad64x2(k, offset);
            SHA512_ROUND2<0>(W,S, vk);
            SHA512_ROUND2<1>(W,S, VectorShiftLeft<8>(vk));
            offset+=16;

            vk = VectorLoad64x2(k, offset);
            SHA512_ROUND2<2>(W,S, vk);
            SHA512_ROUND2<3>(W,S, VectorShiftLeft<8>(vk));
            offset+=16;

            vk = VectorLoad64x2(k, offset);
            SHA512_ROUND2<4>(W,S, vk);
            SHA512_ROUND2<5>(W,S, VectorShiftLeft<8>(vk));
            offset+=16;

            vk = VectorLoad64x2(k, offset);
            SHA512_ROUND2<6>(W,S, vk);
            SHA512_ROUND2<7>(W,S, VectorShiftLeft<8>(vk));
            offset+=16;

            vk = VectorLoad64x2(k, offset);
            SHA512_ROUND2<8>(W,S, vk);
            SHA512_ROUND2<9>(W,S, VectorShiftLeft<8>(vk));
            offset+=16;

            vk = VectorLoad64x2(k, offset);
            SHA512_ROUND2<10>(W,S, vk);
            SHA512_ROUND2<11>(W,S, VectorShiftLeft<8>(vk));
            offset+=16;

            vk = VectorLoad64x2(k, offset);
            SHA512_ROUND2<12>(W,S, vk);
            SHA512_ROUND2<13>(W,S, VectorShiftLeft<8>(vk));
            offset+=16;

            vk = VectorLoad64x2(k, offset);
            SHA512_ROUND2<14>(W,S, vk);
            SHA512_ROUND2<15>(W,S, VectorShiftLeft<8>(vk));
            offset+=16;
        }

        ab += VectorPack(S[A],S[B]);
        cd += VectorPack(S[C],S[D]);
        ef += VectorPack(S[E],S[F]);
        gh += VectorPack(S[G],S[H]);
        m += 16;
    }

    VectorStore64x2u(ab, state+0, 0);
    VectorStore64x2u(cd, state+2, 0);
    VectorStore64x2u(ef, state+4, 0);
    VectorStore64x2u(gh, state+6, 0);
}
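
// Illustrative sketch, not part of the library: driving the routine above over
// whole 128-byte blocks. The state is the eight 64-bit chaining values H0..H7
// and 'length' is a byte count; the wrapper name is hypothetical. The order
// argument is ignored by the POWER8 path, which always loads the message
// big-endian (see VectorLoadMsg64x2).
static inline void SHA512_CompressBlocks_Sketch(word64 state[8], const byte* input, size_t blocks)
{
    SHA512_HashMultipleBlocks_POWER8(state, reinterpret_cast<const word64*>(input),
                                     blocks*SHA512::BLOCKSIZE, BIG_ENDIAN_ORDER);
}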
#endif  // CRYPTOPP_POWER8_SHA_AVAILABLE

NAMESPACE_END