#if FLA_VECTOR_INTRINSIC_TYPE == FLA_NO_INTRINSICS

#define MAC_Apply_G_mx3b_ass MAC_Apply_G_mx3b_ops
#define MAC_Apply_G_mx3b_asd MAC_Apply_G_mx3b_opd
#define MAC_Apply_G_mx3b_asc MAC_Apply_G_mx3b_opc
#define MAC_Apply_G_mx3b_asz MAC_Apply_G_mx3b_opz
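
/* When no vector intrinsics are available, the "as" (intrinsic/assembly) names
   above simply alias the scalar reference kernels (the "op" suffix).  Every
   variant in this file applies the same pair of real Givens rotations across
   three columns a1, a2, a3 of an m_A-row matrix.  The per-row update,
   reconstructed here from the SSE bodies below, is sketched by this
   illustrative helper; the helper itself is not part of the library and its
   name is hypothetical. */
static inline void apply_G_mx3b_row_sketch( float gamma12, float sigma12,
                                            float gamma23, float sigma23,
                                            float* a1, float* a2, float* a3 )
{
    /* Rotate the column pair (a2, a3) by ( gamma23, sigma23 )... */
    float t2 = *a2;
    *a2 = t2  * gamma23 + *a3 * sigma23;
    *a3 = *a3 * gamma23 - t2  * sigma23;

    /* ...then rotate the column pair (a1, a2) by ( gamma12, sigma12 ). */
    float t1 = *a1;
    *a1 = t1  * gamma12 + *a2 * sigma12;
    *a2 = *a2 * gamma12 - t1  * sigma12;
}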

#elif FLA_VECTOR_INTRINSIC_TYPE == FLA_SSE_INTRINSICS
#define MAC_Apply_G_mx3b_ass( m_A, \
                              gamma12, sigma12, gamma23, sigma23, \
                              a1, inc_a1, a2, inc_a2, a3, inc_a3 ) \
{ \
  int n_iter32 = m_A / ( 4 * 8 ); \
  int n_left32 = m_A % ( 4 * 8 ); \
  int n_iter4 = n_left32 / ( 4 * 1 ); \
  int n_left = n_left32 % ( 4 * 1 ); \
  int i; \
\
  const int step_a1 = inc_a1 * 4; \
  const int step_a2 = inc_a2 * 4; \
  const int step_a3 = inc_a3 * 4; \
\
  float* restrict alpha1 = a1; \
  float* restrict alpha2 = a2; \
  float* restrict alpha3 = a3; \
\
  v4sf_t a1v, a2v, a3v; \
  v4sf_t g12v, s12v; \
  v4sf_t g23v, s23v; \
  v4sf_t t1v, t2v; \
\
  g12v.v = _mm_load1_ps( gamma12 ); \
  s12v.v = _mm_load1_ps( sigma12 ); \
  g23v.v = _mm_load1_ps( gamma23 ); \
  s23v.v = _mm_load1_ps( sigma23 ); \
\
  /* Main loop: apply the (2,3) rotation and then the (1,2) rotation to \
     32 rows per iteration (eight 4-wide SSE vectors per column). */ \
  for ( i = 0; i < n_iter32; ++i ) \
  { \
    a2v.v = _mm_load_ps( ( float* )alpha2 ); \
    a3v.v = _mm_load_ps( ( float* )alpha3 ); \
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
    _mm_store_ps( ( float* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
\
    a2v.v = _mm_load_ps( ( float* )alpha2 ); \
    a3v.v = _mm_load_ps( ( float* )alpha3 ); \
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
    _mm_store_ps( ( float* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
\
    a2v.v = _mm_load_ps( ( float* )alpha2 ); \
    a3v.v = _mm_load_ps( ( float* )alpha3 ); \
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
    _mm_store_ps( ( float* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
\
    a2v.v = _mm_load_ps( ( float* )alpha2 ); \
    a3v.v = _mm_load_ps( ( float* )alpha3 ); \
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
    _mm_store_ps( ( float* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
\
    a2v.v = _mm_load_ps( ( float* )alpha2 ); \
    a3v.v = _mm_load_ps( ( float* )alpha3 ); \
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
    _mm_store_ps( ( float* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
\
    a2v.v = _mm_load_ps( ( float* )alpha2 ); \
    a3v.v = _mm_load_ps( ( float* )alpha3 ); \
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
    _mm_store_ps( ( float* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
\
    a2v.v = _mm_load_ps( ( float* )alpha2 ); \
    a3v.v = _mm_load_ps( ( float* )alpha3 ); \
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
    _mm_store_ps( ( float* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
\
    a2v.v = _mm_load_ps( ( float* )alpha2 ); \
    a3v.v = _mm_load_ps( ( float* )alpha3 ); \
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
    _mm_store_ps( ( float* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
  } \
\
  /* Cleanup loop: four rows (one SSE vector per column) at a time. */ \
  for ( i = 0; i < n_iter4; ++i ) \
  { \
    a2v.v = _mm_load_ps( ( float* )alpha2 ); \
    a3v.v = _mm_load_ps( ( float* )alpha3 ); \
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
    _mm_store_ps( ( float* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
  } \
\
  /* Remaining 1-3 rows, if any: scalar code. */ \
  for ( i = 0; i < n_left; ++i ) \
  { \
    float ga12 = *gamma12; \
    float si12 = *sigma12; \
    float ga23 = *gamma23; \
    float si23 = *sigma23; \
    float temp1; \
    float temp2; \
    float temp3; \
\
    temp2 = *alpha2; \
    temp3 = *alpha3; \
\
    *alpha2 = temp2 * ga23 + temp3 * si23; \
    *alpha3 = temp3 * ga23 - temp2 * si23; \
\
    temp1 = *alpha1; \
    temp2 = *alpha2; \
\
    *alpha1 = temp1 * ga12 + temp2 * si12; \
    *alpha2 = temp2 * ga12 - temp1 * si12; \
\
    alpha1 += 1; \
    alpha2 += 1; \
    alpha3 += 1; \
  } \
}
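
/* Illustrative only (not part of the library): one unrolled block of
   MAC_Apply_G_mx3b_ass above, written with explicit SSE intrinsics instead of
   the v4sf_t vector-extension arithmetic.  The helper name is hypothetical;
   it assumes <xmmintrin.h> is in scope and that the three column pointers are
   16-byte aligned with unit stride, as _mm_load_ps/_mm_store_ps require. */
static inline void apply_G_mx3b_4rows_sketch( const float* gamma12, const float* sigma12,
                                              const float* gamma23, const float* sigma23,
                                              float* alpha1, float* alpha2, float* alpha3 )
{
    __m128 g12 = _mm_load1_ps( gamma12 );
    __m128 s12 = _mm_load1_ps( sigma12 );
    __m128 g23 = _mm_load1_ps( gamma23 );
    __m128 s23 = _mm_load1_ps( sigma23 );

    __m128 a2 = _mm_load_ps( alpha2 );
    __m128 a3 = _mm_load_ps( alpha3 );

    /* ( a2, a3 ) <- ( g23*a2 + s23*a3, g23*a3 - s23*a2 ), four rows at once. */
    __m128 t2 = a2;
    a2 = _mm_add_ps( _mm_mul_ps( t2, g23 ), _mm_mul_ps( a3, s23 ) );
    a3 = _mm_sub_ps( _mm_mul_ps( a3, g23 ), _mm_mul_ps( t2, s23 ) );
    _mm_store_ps( alpha3, a3 );

    __m128 a1 = _mm_load_ps( alpha1 );

    /* ( a1, a2 ) <- ( g12*a1 + s12*a2, g12*a2 - s12*a1 ). */
    __m128 t1 = a1;
    a1 = _mm_add_ps( _mm_mul_ps( t1, g12 ), _mm_mul_ps( a2, s12 ) );
    a2 = _mm_sub_ps( _mm_mul_ps( a2, g12 ), _mm_mul_ps( t1, s12 ) );
    _mm_store_ps( alpha1, a1 );
    _mm_store_ps( alpha2, a2 );
}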

#define MAC_Apply_G_mx3b_asd( m_A, \
                              gamma12, sigma12, gamma23, sigma23, \
                              a1, inc_a1, a2, inc_a2, a3, inc_a3 ) \
{ \
  int n_iter16 = m_A / ( 2 * 8 ); \
  int n_left16 = m_A % ( 2 * 8 ); \
  int n_iter2 = n_left16 / ( 2 * 1 ); \
  int n_left = n_left16 % ( 2 * 1 ); \
  int i; \
\
  const int step_a1 = inc_a1 * 2; \
  const int step_a2 = inc_a2 * 2; \
  const int step_a3 = inc_a3 * 2; \
\
  double* restrict alpha1 = a1; \
  double* restrict alpha2 = a2; \
  double* restrict alpha3 = a3; \
\
  v2df_t a1v, a2v, a3v; \
  v2df_t g12v, s12v; \
  v2df_t g23v, s23v; \
  v2df_t t1v, t2v; \
\
  g12v.v = _mm_loaddup_pd( gamma12 ); \
  s12v.v = _mm_loaddup_pd( sigma12 ); \
  g23v.v = _mm_loaddup_pd( gamma23 ); \
  s23v.v = _mm_loaddup_pd( sigma23 ); \
\
  /* Main loop: apply the (2,3) rotation and then the (1,2) rotation to \
     16 rows per iteration (eight 2-wide SSE vectors per column). */ \
  for ( i = 0; i < n_iter16; ++i ) \
  { \
    a2v.v = _mm_load_pd( ( double* )alpha2 ); \
    a3v.v = _mm_load_pd( ( double* )alpha3 ); \
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
\
    a2v.v = _mm_load_pd( ( double* )alpha2 ); \
    a3v.v = _mm_load_pd( ( double* )alpha3 ); \
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
\
    a2v.v = _mm_load_pd( ( double* )alpha2 ); \
    a3v.v = _mm_load_pd( ( double* )alpha3 ); \
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
\
    a2v.v = _mm_load_pd( ( double* )alpha2 ); \
    a3v.v = _mm_load_pd( ( double* )alpha3 ); \
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
\
    a2v.v = _mm_load_pd( ( double* )alpha2 ); \
    a3v.v = _mm_load_pd( ( double* )alpha3 ); \
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
\
    a2v.v = _mm_load_pd( ( double* )alpha2 ); \
    a3v.v = _mm_load_pd( ( double* )alpha3 ); \
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
\
    a2v.v = _mm_load_pd( ( double* )alpha2 ); \
    a3v.v = _mm_load_pd( ( double* )alpha3 ); \
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
\
    a2v.v = _mm_load_pd( ( double* )alpha2 ); \
    a3v.v = _mm_load_pd( ( double* )alpha3 ); \
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
  } \
\
  /* Cleanup loop: two rows (one SSE vector per column) at a time. */ \
  for ( i = 0; i < n_iter2; ++i ) \
  { \
    a2v.v = _mm_load_pd( ( double* )alpha2 ); \
    a3v.v = _mm_load_pd( ( double* )alpha3 ); \
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
  } \
\
  /* Remaining row, if any: scalar code. */ \
  for ( i = 0; i < n_left; ++i ) \
  { \
    double ga12 = *gamma12; \
    double si12 = *sigma12; \
    double ga23 = *gamma23; \
    double si23 = *sigma23; \
    double temp1; \
    double temp2; \
    double temp3; \
\
    temp2 = *alpha2; \
    temp3 = *alpha3; \
\
    *alpha2 = temp2 * ga23 + temp3 * si23; \
    *alpha3 = temp3 * ga23 - temp2 * si23; \
\
    temp1 = *alpha1; \
    temp2 = *alpha2; \
\
    *alpha1 = temp1 * ga12 + temp2 * si12; \
    *alpha2 = temp2 * ga12 - temp1 * si12; \
\
    alpha1 += 1; \
    alpha2 += 1; \
    alpha3 += 1; \
  } \
}
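
/* Blocking arithmetic used by the kernels above (a worked example with a
   hypothetical m_A = 37 for the double-precision variant): the main loop runs
   n_iter16 = 37 / 16 = 2 times over 16 rows each, the cleanup loop runs
   n_iter2 = ( 37 % 16 ) / 2 = 2 times over 2 rows each, and the scalar loop
   handles n_left = ( 37 % 16 ) % 2 = 1 remaining row; 2*16 + 2*2 + 1 = 37. */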

#define MAC_Apply_G_mx3b_asc( m_A, \
                              gamma12, sigma12, gamma23, sigma23, \
                              a1, inc_a1, a2, inc_a2, a3, inc_a3 ) \
{ \
  int n_iter16 = m_A / ( 2 * 8 ); \
  int n_left16 = m_A % ( 2 * 8 ); \
  int n_iter2 = n_left16 / ( 2 * 1 ); \
  int n_left = n_left16 % ( 2 * 1 ); \
  int i; \
\
  const int step_a1 = inc_a1 * 2; \
  const int step_a2 = inc_a2 * 2; \
  const int step_a3 = inc_a3 * 2; \
\
  scomplex* restrict alpha1 = a1; \
  scomplex* restrict alpha2 = a2; \
  scomplex* restrict alpha3 = a3; \
\
  v4sf_t a1v, a2v, a3v; \
  v4sf_t g12v, s12v; \
  v4sf_t g23v, s23v; \
  v4sf_t t1v, t2v; \
\
  g12v.v = _mm_load1_ps( gamma12 ); \
  s12v.v = _mm_load1_ps( sigma12 ); \
  g23v.v = _mm_load1_ps( gamma23 ); \
  s23v.v = _mm_load1_ps( sigma23 ); \
\
  /* Main loop: the real coefficients act elementwise on the interleaved \
     re/im data, so each 4-wide float vector carries two scomplex rows; \
     16 rows are updated per iteration. */ \
  for ( i = 0; i < n_iter16; ++i ) \
  { \
    a2v.v = _mm_load_ps( ( float* )alpha2 ); \
    a3v.v = _mm_load_ps( ( float* )alpha3 ); \
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
    _mm_store_ps( ( float* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
\
    a2v.v = _mm_load_ps( ( float* )alpha2 ); \
    a3v.v = _mm_load_ps( ( float* )alpha3 ); \
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
    _mm_store_ps( ( float* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
\
    a2v.v = _mm_load_ps( ( float* )alpha2 ); \
    a3v.v = _mm_load_ps( ( float* )alpha3 ); \
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
    _mm_store_ps( ( float* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
\
    a2v.v = _mm_load_ps( ( float* )alpha2 ); \
    a3v.v = _mm_load_ps( ( float* )alpha3 ); \
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
    _mm_store_ps( ( float* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
\
    a2v.v = _mm_load_ps( ( float* )alpha2 ); \
    a3v.v = _mm_load_ps( ( float* )alpha3 ); \
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
    _mm_store_ps( ( float* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
\
    a2v.v = _mm_load_ps( ( float* )alpha2 ); \
    a3v.v = _mm_load_ps( ( float* )alpha3 ); \
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
    _mm_store_ps( ( float* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
\
    a2v.v = _mm_load_ps( ( float* )alpha2 ); \
    a3v.v = _mm_load_ps( ( float* )alpha3 ); \
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
    _mm_store_ps( ( float* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
\
    a2v.v = _mm_load_ps( ( float* )alpha2 ); \
    a3v.v = _mm_load_ps( ( float* )alpha3 ); \
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
    _mm_store_ps( ( float* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
  } \
\
  /* Cleanup loop: two rows (one SSE vector per column) at a time. */ \
  for ( i = 0; i < n_iter2; ++i ) \
  { \
    a2v.v = _mm_load_ps( ( float* )alpha2 ); \
    a3v.v = _mm_load_ps( ( float* )alpha3 ); \
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
    _mm_store_ps( ( float* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
  } \
\
  /* Remaining row, if any: scalar code.  Note that this tail applies the \
     (1,2) rotation before the (2,3) rotation, unlike the vector path. */ \
  for ( i = 0; i < n_left; ++i ) \
  { \
    float ga12 = *gamma12; \
    float si12 = *sigma12; \
    float ga23 = *gamma23; \
    float si23 = *sigma23; \
    scomplex temp1; \
    scomplex temp2; \
    scomplex temp3; \
\
    temp1 = *alpha1; \
    temp2 = *alpha2; \
\
    alpha1->real = temp1.real * ga12 + temp2.real * si12; \
    alpha2->real = temp2.real * ga12 - temp1.real * si12; \
    alpha1->imag = temp1.imag * ga12 + temp2.imag * si12; \
    alpha2->imag = temp2.imag * ga12 - temp1.imag * si12; \
\
    temp2 = *alpha2; \
    temp3 = *alpha3; \
\
    alpha2->real = temp2.real * ga23 + temp3.real * si23; \
    alpha3->real = temp3.real * ga23 - temp2.real * si23; \
    alpha2->imag = temp2.imag * ga23 + temp3.imag * si23; \
    alpha3->imag = temp3.imag * ga23 - temp2.imag * si23; \
\
    alpha1 += 1; \
    alpha2 += 1; \
    alpha3 += 1; \
  } \
}
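
/* Note on the complex kernels: the rotation coefficients are real, so they
   act on real and imaginary parts independently.  A 128-bit vector holds two
   scomplex values (or one dcomplex value) in interleaved re/im order, which
   is why the asc body reuses the packed-float arithmetic of the real kernel
   and only the pointer steps differ ( inc * 2 for scomplex, inc * 1 for
   dcomplex ). */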

#define MAC_Apply_G_mx3b_asz( m_A, \
                              gamma12, sigma12, gamma23, sigma23, \
                              a1, inc_a1, a2, inc_a2, a3, inc_a3 ) \
{ \
  int n_iter = m_A / 8; \
  int n_left = m_A % 8; \
  int i; \
\
  const int step_a1 = inc_a1 * 1; \
  const int step_a2 = inc_a2 * 1; \
  const int step_a3 = inc_a3 * 1; \
\
  dcomplex* restrict alpha1 = a1; \
  dcomplex* restrict alpha2 = a2; \
  dcomplex* restrict alpha3 = a3; \
\
  v2df_t a1v, a2v, a3v; \
  v2df_t g12v, s12v; \
  v2df_t g23v, s23v; \
  v2df_t t1v, t2v; \
\
  g12v.v = _mm_loaddup_pd( gamma12 ); \
  s12v.v = _mm_loaddup_pd( sigma12 ); \
  g23v.v = _mm_loaddup_pd( gamma23 ); \
  s23v.v = _mm_loaddup_pd( sigma23 ); \
\
  /* Main loop: each 2-wide double vector holds a single dcomplex row, and \
     the real coefficients act elementwise on re/im; eight rows per \
     iteration, unrolled. */ \
  for ( i = 0; i < n_iter; ++i ) \
  { \
    a2v.v = _mm_load_pd( ( double* )alpha2 ); \
    a3v.v = _mm_load_pd( ( double* )alpha3 ); \
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
\
    a2v.v = _mm_load_pd( ( double* )alpha2 ); \
    a3v.v = _mm_load_pd( ( double* )alpha3 ); \
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
\
    a2v.v = _mm_load_pd( ( double* )alpha2 ); \
    a3v.v = _mm_load_pd( ( double* )alpha3 ); \
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
\
    a2v.v = _mm_load_pd( ( double* )alpha2 ); \
    a3v.v = _mm_load_pd( ( double* )alpha3 ); \
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
\
    a2v.v = _mm_load_pd( ( double* )alpha2 ); \
    a3v.v = _mm_load_pd( ( double* )alpha3 ); \
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
\
    a2v.v = _mm_load_pd( ( double* )alpha2 ); \
    a3v.v = _mm_load_pd( ( double* )alpha3 ); \
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
\
    a2v.v = _mm_load_pd( ( double* )alpha2 ); \
    a3v.v = _mm_load_pd( ( double* )alpha3 ); \
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
\
    a2v.v = _mm_load_pd( ( double* )alpha2 ); \
    a3v.v = _mm_load_pd( ( double* )alpha3 ); \
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
  } \
\
  /* Remaining rows: one dcomplex row (one SSE vector per column) at a time. */ \
  for ( i = 0; i < n_left; ++i ) \
  { \
    a2v.v = _mm_load_pd( ( double* )alpha2 ); \
    a3v.v = _mm_load_pd( ( double* )alpha3 ); \
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
  } \
}
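
/* Illustrative usage sketch (not from the library; the wrapper name is
   hypothetical and the macro parameter order follows the reconstruction
   above): apply the fused rotation pair to columns j, j+1, j+2 of a
   column-major float matrix with leading dimension lda.  The rotation
   coefficients are passed by address because the kernels broadcast them with
   _mm_load1_ps/_mm_loaddup_pd, and the SSE paths assume 16-byte aligned,
   contiguous (unit-stride) columns. */
static inline void apply_G_mx3b_cols_sketch( int m, float* A, int lda, int j,
                                             float gamma12, float sigma12,
                                             float gamma23, float sigma23 )
{
    MAC_Apply_G_mx3b_ass( m, &gamma12, &sigma12, &gamma23, &sigma23,
                          A + ( j     ) * lda, 1,
                          A + ( j + 1 ) * lda, 1,
                          A + ( j + 2 ) * lda, 1 );
}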