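/*
   MAC_Apply_G_mx3_as? applies two Givens rotations, G12 = ( gamma12,
   sigma12 ) and G23 = ( gamma23, sigma23 ), to three length-m_A column
   vectors a1, a2, and a3, with one variant per datatype suffix
   ( s = float, d = double, c = scomplex, z = dcomplex ). When SSE
   intrinsics are available the loops are vectorized and unrolled;
   otherwise the macros alias the scalar "op" kernels.
*/
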
#if FLA_VECTOR_INTRINSIC_TYPE == FLA_NO_INTRINSICS
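
/* No vector intrinsics: alias each "as" kernel to its scalar "op" kernel. */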
#define MAC_Apply_G_mx3_ass MAC_Apply_G_mx3_ops
#define MAC_Apply_G_mx3_asd MAC_Apply_G_mx3_opd
#define MAC_Apply_G_mx3_asc MAC_Apply_G_mx3_opc
#define MAC_Apply_G_mx3_asz MAC_Apply_G_mx3_opz

#elif FLA_VECTOR_INTRINSIC_TYPE == FLA_SSE_INTRINSICS

#define MAC_Apply_G_mx3_ass( m_A, \
                             gamma12, \
                             sigma12, \
                             gamma23, \
                             sigma23, \
                             a1, inc_a1, \
                             a2, inc_a2, \
                             a3, inc_a3 ) \
{ \
    int n_iter32 = m_A / ( 4 * 8 ); \
    int n_left32 = m_A % ( 4 * 8 ); \
    int n_iter4  = n_left32 / ( 4 * 1 ); \
    int n_left   = n_left32 % ( 4 * 1 ); \
    int i; \
\
    const int step_a1 = inc_a1 * 4; \
    const int step_a2 = inc_a2 * 4; \
    const int step_a3 = inc_a3 * 4; \
\
    float* restrict alpha1 = a1; \
    float* restrict alpha2 = a2; \
    float* restrict alpha3 = a3; \
\
    v4sf_t a1v, a2v, a3v; \
    v4sf_t g12v, s12v; \
    v4sf_t g23v, s23v; \
    v4sf_t t1v, t2v; \
\
    /* Broadcast each rotation scalar across all four vector lanes. */ \
    g12v.v = _mm_load1_ps( gamma12 ); \
    s12v.v = _mm_load1_ps( sigma12 ); \
    g23v.v = _mm_load1_ps( gamma23 ); \
    s23v.v = _mm_load1_ps( sigma23 ); \
\
    /* Main loop, unrolled 8x: 8 vectors of 4 floats = 32 rows per pass. */ \
    for ( i = 0; i < n_iter32; ++i ) \
    { \
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
    } \
\
    /* Vector cleanup: 4 rows at a time. */ \
    for ( i = 0; i < n_iter4; ++i ) \
    { \
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
    } \
\
    /* Scalar cleanup for the final n_left ( < 4 ) rows. */ \
    for ( i = 0; i < n_left; ++i ) \
    { \
        float ga12 = *gamma12; \
        float si12 = *sigma12; \
        float ga23 = *gamma23; \
        float si23 = *sigma23; \
        float temp1; \
        float temp2; \
        float temp3; \
\
        temp1 = *alpha1; \
        temp2 = *alpha2; \
        *alpha1 = temp1 * ga12 + temp2 * si12; \
        *alpha2 = temp2 * ga12 - temp1 * si12; \
\
        temp2 = *alpha2; \
        temp3 = *alpha3; \
        *alpha2 = temp2 * ga23 + temp3 * si23; \
        *alpha3 = temp3 * ga23 - temp2 * si23; \
\
        alpha1 += 1; \
        alpha2 += 1; \
        alpha3 += 1; \
    } \
}
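
/* Double-precision real: a v2df_t vector holds two doubles, so the main
   loop covers 2 * 8 = 16 rows per pass and the scalar cleanup handles at
   most one leftover row. */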
#define MAC_Apply_G_mx3_asd( m_A, \
                             gamma12, \
                             sigma12, \
                             gamma23, \
                             sigma23, \
                             a1, inc_a1, \
                             a2, inc_a2, \
                             a3, inc_a3 ) \
{ \
    int n_iter16 = m_A / ( 2 * 8 ); \
    int n_left16 = m_A % ( 2 * 8 ); \
    int n_iter2  = n_left16 / ( 2 * 1 ); \
    int n_left   = n_left16 % ( 2 * 1 ); \
    int i; \
\
    const int step_a1 = inc_a1 * 2; \
    const int step_a2 = inc_a2 * 2; \
    const int step_a3 = inc_a3 * 2; \
\
    double* restrict alpha1 = a1; \
    double* restrict alpha2 = a2; \
    double* restrict alpha3 = a3; \
\
    v2df_t a1v, a2v, a3v; \
    v2df_t g12v, s12v; \
    v2df_t g23v, s23v; \
    v2df_t t1v, t2v; \
\
    /* Broadcast each rotation scalar across both vector lanes. */ \
    g12v.v = _mm_loaddup_pd( gamma12 ); \
    s12v.v = _mm_loaddup_pd( sigma12 ); \
    g23v.v = _mm_loaddup_pd( gamma23 ); \
    s23v.v = _mm_loaddup_pd( sigma23 ); \
\
    /* Main loop, unrolled 8x: 8 vectors of 2 doubles = 16 rows per pass. */ \
    for ( i = 0; i < n_iter16; ++i ) \
    { \
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
    } \
\
    /* Vector cleanup: 2 rows at a time. */ \
    for ( i = 0; i < n_iter2; ++i ) \
    { \
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
    } \
\
    /* Scalar cleanup for a possible final odd row. */ \
    if ( n_left == 1 ) \
    { \
        double ga12 = *gamma12; \
        double si12 = *sigma12; \
        double ga23 = *gamma23; \
        double si23 = *sigma23; \
        double temp1; \
        double temp2; \
        double temp3; \
\
        temp1 = *alpha1; \
        temp2 = *alpha2; \
        *alpha1 = temp1 * ga12 + temp2 * si12; \
        *alpha2 = temp2 * ga12 - temp1 * si12; \
\
        temp2 = *alpha2; \
        temp3 = *alpha3; \
        *alpha2 = temp2 * ga23 + temp3 * si23; \
        *alpha3 = temp3 * ga23 - temp2 * si23; \
    } \
}
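
/* Single-precision complex: one v4sf_t vector holds two scomplex values
   ( four floats ). Broadcasting the real scalars gamma and sigma scales
   the real and imaginary parts identically, which is exactly the complex
   rotation when gamma and sigma are real. */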
#define MAC_Apply_G_mx3_asc( m_A, \
                             gamma12, \
                             sigma12, \
                             gamma23, \
                             sigma23, \
                             a1, inc_a1, \
                             a2, inc_a2, \
                             a3, inc_a3 ) \
{ \
    int n_iter16 = m_A / ( 2 * 8 ); \
    int n_left16 = m_A % ( 2 * 8 ); \
    int n_iter2  = n_left16 / ( 2 * 1 ); \
    int n_left   = n_left16 % ( 2 * 1 ); \
    int i; \
\
    const int step_a1 = inc_a1 * 2; \
    const int step_a2 = inc_a2 * 2; \
    const int step_a3 = inc_a3 * 2; \
\
    scomplex* restrict alpha1 = a1; \
    scomplex* restrict alpha2 = a2; \
    scomplex* restrict alpha3 = a3; \
\
    v4sf_t a1v, a2v, a3v; \
    v4sf_t g12v, s12v; \
    v4sf_t g23v, s23v; \
    v4sf_t t1v, t2v; \
\
    /* Broadcast each rotation scalar across all four vector lanes. */ \
    g12v.v = _mm_load1_ps( gamma12 ); \
    s12v.v = _mm_load1_ps( sigma12 ); \
    g23v.v = _mm_load1_ps( gamma23 ); \
    s23v.v = _mm_load1_ps( sigma23 ); \
\
    /* Main loop, unrolled 8x: 8 vectors of 2 scomplex = 16 rows per pass. */ \
    for ( i = 0; i < n_iter16; ++i ) \
    { \
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
    } \
\
    /* Vector cleanup: 2 rows at a time. */ \
    for ( i = 0; i < n_iter2; ++i ) \
    { \
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
    } \
\
    /* Scalar cleanup for a possible final odd row. */ \
    if ( n_left == 1 ) \
    { \
        float ga12 = *gamma12; \
        float si12 = *sigma12; \
        float ga23 = *gamma23; \
        float si23 = *sigma23; \
        scomplex temp1; \
        scomplex temp2; \
        scomplex temp3; \
\
        temp1 = *alpha1; \
        temp2 = *alpha2; \
        alpha1->real = temp1.real * ga12 + temp2.real * si12; \
        alpha2->real = temp2.real * ga12 - temp1.real * si12; \
        alpha1->imag = temp1.imag * ga12 + temp2.imag * si12; \
        alpha2->imag = temp2.imag * ga12 - temp1.imag * si12; \
\
        temp2 = *alpha2; \
        temp3 = *alpha3; \
        alpha2->real = temp2.real * ga23 + temp3.real * si23; \
        alpha3->real = temp3.real * ga23 - temp2.real * si23; \
        alpha2->imag = temp2.imag * ga23 + temp3.imag * si23; \
        alpha3->imag = temp3.imag * ga23 - temp2.imag * si23; \
    } \
}
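
/* Double-precision complex: one dcomplex exactly fills a v2df_t, so both
   the unrolled main loop and the leftover loop use vector code. */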
#define MAC_Apply_G_mx3_asz( m_A, \
                             gamma12, \
                             sigma12, \
                             gamma23, \
                             sigma23, \
                             a1, inc_a1, \
                             a2, inc_a2, \
                             a3, inc_a3 ) \
{ \
    int n_iter = m_A / 8; \
    int n_left = m_A % 8; \
    int i; \
\
    const int step_a1 = inc_a1 * 1; \
    const int step_a2 = inc_a2 * 1; \
    const int step_a3 = inc_a3 * 1; \
\
    dcomplex* restrict alpha1 = a1; \
    dcomplex* restrict alpha2 = a2; \
    dcomplex* restrict alpha3 = a3; \
\
    v2df_t a1v, a2v, a3v; \
    v2df_t g12v, s12v; \
    v2df_t g23v, s23v; \
    v2df_t t1v, t2v; \
\
    /* Broadcast each rotation scalar across both vector lanes. */ \
    g12v.v = _mm_loaddup_pd( gamma12 ); \
    s12v.v = _mm_loaddup_pd( sigma12 ); \
    g23v.v = _mm_loaddup_pd( gamma23 ); \
    s23v.v = _mm_loaddup_pd( sigma23 ); \
\
    /* Main loop, unrolled 8x: one dcomplex per vector = 8 rows per pass. */ \
    for ( i = 0; i < n_iter; ++i ) \
    { \
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha2 += step_a2; \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha2 += step_a2; \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha2 += step_a2; \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha2 += step_a2; \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha2 += step_a2; \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha2 += step_a2; \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha2 += step_a2; \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha2 += step_a2; \
        alpha3 += step_a3; \
    } \
\
    /* Cleanup: one dcomplex ( one vector ) per iteration. */ \
    for ( i = 0; i < n_left; ++i ) \
    { \
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
\
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
    } \
}

#endif
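
/*
   Usage sketch ( hypothetical caller, not part of this header ): apply one
   G12/G23 rotation pair down all m rows of three unit-stride columns of a
   column-major double matrix A with leading dimension ldim. The names m, A,
   ldim, gamma12, sigma12, gamma23, and sigma23 are illustrative; the vector
   loads require unit column increments.

     double* a1 = A + 0 * ldim;
     double* a2 = A + 1 * ldim;
     double* a3 = A + 2 * ldim;

     MAC_Apply_G_mx3_asd( m, &gamma12, &sigma12, &gamma23, &sigma23,
                          a1, 1, a2, 1, a3, 1 );
*/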