/* SSE-accelerated kernels that apply a single Givens rotation to a pair of
   columns (a1, a2). When vector intrinsics are disabled, the names simply
   alias the scalar ("ops") implementations. */
#if FLA_VECTOR_INTRINSIC_TYPE == FLA_NO_INTRINSICS

#define MAC_Apply_G_mx2_ass MAC_Apply_G_mx2_ops
#define MAC_Apply_G_mx2_asd MAC_Apply_G_mx2_opd
#define MAC_Apply_G_mx2_asc MAC_Apply_G_mx2_opc
#define MAC_Apply_G_mx2_asz MAC_Apply_G_mx2_opz

#elif FLA_VECTOR_INTRINSIC_TYPE == FLA_SSE_INTRINSICS
/* MAC_Apply_G_mx2_ass: apply the Givens rotation given by the scalars
   *gamma12 and *sigma12 to the m_A-row single-precision column pair
   (a1, a2), i.e. a1 := gamma12*a1 + sigma12*a2 and a2 := gamma12*a2 -
   sigma12*a1, processing four rows per SSE vector with the main loop
   unrolled 8x (32 rows per iteration). */
#define MAC_Apply_G_mx2_ass( m_A, \
                             gamma12, \
                             sigma12, \
                             a1, inc_a1, \
                             a2, inc_a2 ) \
{ \
    int             n_iter32 = m_A / ( 4 * 8 ); \
    int             n_left32 = m_A % ( 4 * 8 ); \
    int             n_iter4  = n_left32 / ( 4 * 1 ); \
    int             n_left   = n_left32 % ( 4 * 1 ); \
    int             i; \
\
    const int       step_a1 = inc_a1 * 4; \
    const int       step_a2 = inc_a2 * 4; \
\
    float* restrict alpha1 = a1; \
    float* restrict alpha2 = a2; \
\
    v4sf_t          a1v, a2v, g12v, s12v, t1v; \
\
    g12v.v = _mm_load1_ps( gamma12 ); \
    s12v.v = _mm_load1_ps( sigma12 ); \
\
    for ( i = 0; i < n_iter32; ++i ) \
    { \
        /* 8x unrolled: each block rotates four rows of the column pair. */ \
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
    } \
\
    for ( i = 0; i < n_iter4; ++i ) \
    { \
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
    } \
\
    for ( i = 0; i < n_left; ++i ) \
    { \
        float ga12  = *gamma12; \
        float si12  = *sigma12; \
        float temp1 = *alpha1; \
        float temp2 = *alpha2; \
\
        *alpha1 = temp1 * ga12 + temp2 * si12; \
        *alpha2 = temp2 * ga12 - temp1 * si12; \
\
        alpha1 += inc_a1; \
        alpha2 += inc_a2; \
    } \
}
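
For reference, the update that each of these SSE macros vectorizes is the plain two-column Givens rotation below. This is a minimal scalar sketch; the function name apply_g_mx2_ref is illustrative only and does not appear in libflame.

/* Hypothetical scalar reference for the kernels in this header: rotate the
   m-row column pair (a1, a2) by (gamma, sigma), one row at a time. The SSE
   macro above performs the same update four rows per vector operation. */
static void apply_g_mx2_ref( int m, float gamma, float sigma,
                             float* a1, int inc_a1,
                             float* a2, int inc_a2 )
{
    int i;
    for ( i = 0; i < m; ++i )
    {
        float temp1 = a1[ i * inc_a1 ];
        float temp2 = a2[ i * inc_a2 ];

        a1[ i * inc_a1 ] = temp1 * gamma + temp2 * sigma;
        a2[ i * inc_a2 ] = temp2 * gamma - temp1 * sigma;
    }
}
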
/* MAC_Apply_G_mx2_asd: apply the Givens rotation given by *gamma12 and
   *sigma12 to the m_A-row double-precision column pair (a1, a2), i.e.
   a1 := gamma12*a1 + sigma12*a2 and a2 := gamma12*a2 - sigma12*a1,
   processing two rows per SSE vector with the main loop unrolled 8x
   (16 rows per iteration). */
#define MAC_Apply_G_mx2_asd( m_A, \
                             gamma12, \
                             sigma12, \
                             a1, inc_a1, \
                             a2, inc_a2 ) \
{ \
    int              n_iter16 = m_A / ( 2 * 8 ); \
    int              n_left16 = m_A % ( 2 * 8 ); \
    int              n_iter2  = n_left16 / ( 2 * 1 ); \
    int              n_left   = n_left16 % ( 2 * 1 ); \
    int              i; \
\
    const int        step_a1 = inc_a1 * 2; \
    const int        step_a2 = inc_a2 * 2; \
\
    double* restrict alpha1 = a1; \
    double* restrict alpha2 = a2; \
\
    v2df_t           a1v, a2v, g12v, s12v, t1v; \
\
    g12v.v = _mm_loaddup_pd( gamma12 ); \
    s12v.v = _mm_loaddup_pd( sigma12 ); \
\
    for ( i = 0; i < n_iter16; ++i ) \
    { \
        /* 8x unrolled: each block rotates two rows of the column pair. */ \
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
    } \
\
    for ( i = 0; i < n_iter2; ++i ) \
    { \
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
    } \
\
    if ( n_left == 1 ) \
    { \
        double ga12  = *gamma12; \
        double si12  = *sigma12; \
        double temp1 = *alpha1; \
        double temp2 = *alpha2; \
\
        *alpha1 = temp1 * ga12 + temp2 * si12; \
        *alpha2 = temp2 * ga12 - temp1 * si12; \
    } \
}
/* MAC_Apply_G_mx2_asc: apply the (real) Givens rotation given by *gamma12
   and *sigma12 to the m_A-row single-precision complex column pair
   (a1, a2). Two scomplex elements (four floats) are processed per SSE
   vector, with the main loop unrolled 8x (16 rows per iteration). */
#define MAC_Apply_G_mx2_asc( m_A, \
                             gamma12, \
                             sigma12, \
                             a1, inc_a1, \
                             a2, inc_a2 ) \
{ \
    int                n_iter16 = m_A / ( 2 * 8 ); \
    int                n_left16 = m_A % ( 2 * 8 ); \
    int                n_iter2  = n_left16 / ( 2 * 1 ); \
    int                n_left   = n_left16 % ( 2 * 1 ); \
    int                i; \
\
    const int          step_a1 = inc_a1 * 2; \
    const int          step_a2 = inc_a2 * 2; \
\
    scomplex* restrict alpha1 = a1; \
    scomplex* restrict alpha2 = a2; \
\
    v4sf_t             a1v, a2v, g12v, s12v, t1v; \
\
    /* The rotation is real; broadcasting gamma and sigma across all four  */ \
    /* float lanes applies the same update to interleaved real/imag parts. */ \
    g12v.v = _mm_load1_ps( gamma12 ); \
    s12v.v = _mm_load1_ps( sigma12 ); \
\
    for ( i = 0; i < n_iter16; ++i ) \
    { \
        /* 8x unrolled: each block rotates two complex rows of the pair. */ \
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
    } \
\
    for ( i = 0; i < n_iter2; ++i ) \
    { \
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
    } \
\
    if ( n_left == 1 ) \
    { \
        float    ga12  = *gamma12; \
        float    si12  = *sigma12; \
        scomplex temp1 = *alpha1; \
        scomplex temp2 = *alpha2; \
\
        alpha1->real = temp1.real * ga12 + temp2.real * si12; \
        alpha2->real = temp2.real * ga12 - temp1.real * si12; \
\
        alpha1->imag = temp1.imag * ga12 + temp2.imag * si12; \
        alpha2->imag = temp2.imag * ga12 - temp1.imag * si12; \
    } \
}
/* MAC_Apply_G_mx2_asz: apply the (real) Givens rotation given by *gamma12
   and *sigma12 to the m_A-row double-precision complex column pair
   (a1, a2). One dcomplex element (real, imag) fills an SSE vector, and the
   main loop is unrolled 8x; gamma and sigma are broadcast to both lanes,
   so the real and imaginary parts are scaled identically. */
#define MAC_Apply_G_mx2_asz( m_A, \
                             gamma12, \
                             sigma12, \
                             a1, inc_a1, \
                             a2, inc_a2 ) \
{ \
    int                n_iter = m_A / 8; \
    int                n_left = m_A % 8; \
    int                i; \
\
    const int          step_a1 = inc_a1 * 1; \
    const int          step_a2 = inc_a2 * 1; \
\
    dcomplex* restrict alpha1 = a1; \
    dcomplex* restrict alpha2 = a2; \
\
    v2df_t             a1v, a2v, g12v, s12v, t1v; \
\
    g12v.v = _mm_loaddup_pd( gamma12 ); \
    s12v.v = _mm_loaddup_pd( sigma12 ); \
\
    for ( i = 0; i < n_iter; ++i ) \
    { \
        /* 8x unrolled: each block rotates one double-complex row. */ \
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
    } \
\
    for ( i = 0; i < n_left; ++i ) \
    { \
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
    } \
}

#endif
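
A call site for these macros might look like the sketch below. The matrix names buff_A, cs_A, rs_A, and the column index j are assumptions for illustration only, and the argument order follows the signature reconstructed above; because the kernels use aligned SSE loads and stores on consecutive elements, each column is expected to be contiguous (unit row stride) and 16-byte aligned.

/* Hypothetical call site: apply one rotation (gamma, sigma) to columns j and
   j+1 of an m_A-row, column-major matrix stored in buff_A with row stride
   rs_A (assumed to be 1) and column stride cs_A. */
double  gamma = 0.8;
double  sigma = 0.6;
double* a1    = buff_A + ( j     ) * cs_A;
double* a2    = buff_A + ( j + 1 ) * cs_A;

MAC_Apply_G_mx2_asd( m_A,
                     &gamma,
                     &sigma,
                     a1, rs_A,
                     a2, rs_A );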