#if FLA_VECTOR_INTRINSIC_TYPE == FLA_NO_INTRINSICS

#define MAC_Apply_G_mx4s_ass MAC_Apply_G_mx4s_ops
#define MAC_Apply_G_mx4s_asd MAC_Apply_G_mx4s_opd
#define MAC_Apply_G_mx4s_asc MAC_Apply_G_mx4s_opc
#define MAC_Apply_G_mx4s_asz MAC_Apply_G_mx4s_opz

#elif FLA_VECTOR_INTRINSIC_TYPE == FLA_SSE_INTRINSICS
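/*
   Each MAC_Apply_G_mx4s_as? macro below applies two successive sets of Givens
   rotations (G23 and G34 from iteration k1, then G12 and G23 from iteration k2)
   to four columns a1..a4 of length m_A, using SSE vector arithmetic. A scalar
   sketch of the per-row update, matching the scalar cleanup code further down
   (gamma is assumed to be the rotation cosine and sigma the sine, following the
   naming convention used in these kernels):

     temp2  = alpha2;  temp3 = alpha3;
     alpha2 = temp2 * gamma23_k1 + temp3 * sigma23_k1;
     alpha3 = temp3 * gamma23_k1 - temp2 * sigma23_k1;
     ... (and likewise for G34_k1, G12_k2, G23_k2) ...
*/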
#define MAC_Apply_G_mx4s_ass( m_A, \
int n_iter32 = m_A / ( 4 * 8 ); \
int n_left32 = m_A % ( 4 * 8 ); \
int n_iter4 = n_left32 / ( 4 * 1 ); \
int n_left = n_left32 % ( 4 * 1 ); \
const int step_a1 = inc_a1 * 4; \
const int step_a2 = inc_a2 * 4; \
const int step_a3 = inc_a3 * 4; \
const int step_a4 = inc_a4 * 4; \
float* restrict alpha1 = a1; \
float* restrict alpha2 = a2; \
float* restrict alpha3 = a3; \
float* restrict alpha4 = a4; \
v4sf_t a1v, a2v, a3v, a4v; \
v4sf_t b1v, b2v, b3v, b4v; \
v4sf_t g23_k1v, s23_k1v; \
v4sf_t g34_k1v, s34_k1v; \
v4sf_t g12_k2v, s12_k2v; \
v4sf_t g23_k2v, s23_k2v; \
v4sf_t t1v, t2v, t3v; \
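/* Broadcast each rotation coefficient (gamma, sigma) into all four lanes of an SSE register. */ \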
g23_k1v.v = _mm_load1_ps( gamma23_k1 ); \
s23_k1v.v = _mm_load1_ps( sigma23_k1 ); \
g34_k1v.v = _mm_load1_ps( gamma34_k1 ); \
s34_k1v.v = _mm_load1_ps( sigma34_k1 ); \
g12_k2v.v = _mm_load1_ps( gamma12_k2 ); \
s12_k2v.v = _mm_load1_ps( sigma12_k2 ); \
g23_k2v.v = _mm_load1_ps( gamma23_k2 ); \
s23_k2v.v = _mm_load1_ps( sigma23_k2 ); \
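/* Main loop: each iteration handles 32 rows as eight unrolled 4-wide vector steps, alternating between the a*v and b*v register sets so loads and stores overlap. */ \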
for ( i = 0; i < n_iter32; ++i ) \
a2v.v = _mm_load_ps( ( float* )alpha2 ); \
a3v.v = _mm_load_ps( ( float* )alpha3 ); \
a4v.v = _mm_load_ps( ( float* )alpha4 ); \
a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
a1v.v = _mm_load_ps( ( float* )alpha1 ); \
a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
_mm_store_ps( ( float* )alpha4, a4v.v ); \
a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
_mm_store_ps( ( float* )alpha1, a1v.v ); \
b2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
_mm_store_ps( ( float* )alpha2, a2v.v ); \
b3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
b4v.v = _mm_load_ps( ( float* )alpha4 ); \
b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
_mm_store_ps( ( float* )alpha3, a3v.v ); \
b1v.v = _mm_load_ps( ( float* )alpha1 ); \
b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
_mm_store_ps( ( float* )alpha4, b4v.v ); \
b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
_mm_store_ps( ( float* )alpha1, b1v.v ); \
a2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
_mm_store_ps( ( float* )alpha2, b2v.v ); \
a3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
a4v.v = _mm_load_ps( ( float* )alpha4 ); \
a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
_mm_store_ps( ( float* )alpha3, b3v.v ); \
a1v.v = _mm_load_ps( ( float* )alpha1 ); \
a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
_mm_store_ps( ( float* )alpha4, a4v.v ); \
a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
_mm_store_ps( ( float* )alpha1, a1v.v ); \
b2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
_mm_store_ps( ( float* )alpha2, a2v.v ); \
b3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
b4v.v = _mm_load_ps( ( float* )alpha4 ); \
b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
_mm_store_ps( ( float* )alpha3, a3v.v ); \
b1v.v = _mm_load_ps( ( float* )alpha1 ); \
b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
_mm_store_ps( ( float* )alpha4, b4v.v ); \
b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
_mm_store_ps( ( float* )alpha1, b1v.v ); \
a2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
_mm_store_ps( ( float* )alpha2, b2v.v ); \
a3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
a4v.v = _mm_load_ps( ( float* )alpha4 ); \
a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
_mm_store_ps( ( float* )alpha3, b3v.v ); \
a1v.v = _mm_load_ps( ( float* )alpha1 ); \
a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
_mm_store_ps( ( float* )alpha4, a4v.v ); \
a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
_mm_store_ps( ( float* )alpha1, a1v.v ); \
b2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
_mm_store_ps( ( float* )alpha2, a2v.v ); \
b3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
b4v.v = _mm_load_ps( ( float* )alpha4 ); \
b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
_mm_store_ps( ( float* )alpha3, a3v.v ); \
b1v.v = _mm_load_ps( ( float* )alpha1 ); \
b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
_mm_store_ps( ( float* )alpha4, b4v.v ); \
b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
_mm_store_ps( ( float* )alpha1, b1v.v ); \
a2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
_mm_store_ps( ( float* )alpha2, b2v.v ); \
a3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
a4v.v = _mm_load_ps( ( float* )alpha4 ); \
a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
_mm_store_ps( ( float* )alpha3, b3v.v ); \
a1v.v = _mm_load_ps( ( float* )alpha1 ); \
a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
_mm_store_ps( ( float* )alpha4, a4v.v ); \
a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
_mm_store_ps( ( float* )alpha1, a1v.v ); \
b2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
_mm_store_ps( ( float* )alpha2, a2v.v ); \
b3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
b4v.v = _mm_load_ps( ( float* )alpha4 ); \
b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
_mm_store_ps( ( float* )alpha3, a3v.v ); \
b1v.v = _mm_load_ps( ( float* )alpha1 ); \
b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
_mm_store_ps( ( float* )alpha4, b4v.v ); \
b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
_mm_store_ps( ( float* )alpha1, b1v.v ); \
b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
_mm_store_ps( ( float* )alpha2, b2v.v ); \
_mm_store_ps( ( float* )alpha3, b3v.v ); \
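/* Cleanup: apply the rotations to any remaining full 4-row vectors, one vector per iteration. */ \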
for ( i = 0; i < n_iter4; ++i ) \
a2v.v = _mm_load_ps( ( float* )alpha2 ); \
a3v.v = _mm_load_ps( ( float* )alpha3 ); \
a4v.v = _mm_load_ps( ( float* )alpha4 ); \
a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
a1v.v = _mm_load_ps( ( float* )alpha1 ); \
a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
_mm_store_ps( ( float* )alpha4, a4v.v ); \
a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
_mm_store_ps( ( float* )alpha1, a1v.v ); \
a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
_mm_store_ps( ( float* )alpha2, a2v.v ); \
_mm_store_ps( ( float* )alpha3, a3v.v ); \
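/* Final rows (fewer than 4): fall back to scalar arithmetic. */ \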
for ( i = 0; i < n_left; ++i ) \
float ga23_k1 = *gamma23_k1; \
float si23_k1 = *sigma23_k1; \
float ga34_k1 = *gamma34_k1; \
float si34_k1 = *sigma34_k1; \
float ga12_k2 = *gamma12_k2; \
float si12_k2 = *sigma12_k2; \
float ga23_k2 = *gamma23_k2; \
float si23_k2 = *sigma23_k2; \
*alpha2 = temp2 * ga23_k1 + temp3 * si23_k1; \
*alpha3 = temp3 * ga23_k1 - temp2 * si23_k1; \
*alpha3 = temp3 * ga34_k1 + temp4 * si34_k1; \
*alpha4 = temp4 * ga34_k1 - temp3 * si34_k1; \
*alpha1 = temp1 * ga12_k2 + temp2 * si12_k2; \
*alpha2 = temp2 * ga12_k2 - temp1 * si12_k2; \
*alpha2 = temp2 * ga23_k2 + temp3 * si23_k2; \
*alpha3 = temp3 * ga23_k2 - temp2 * si23_k2; \
#define MAC_Apply_G_mx4s_asd( m_A, \
int n_iter16 = m_A / ( 2 * 8 ); \
int n_left16 = m_A % ( 2 * 8 ); \
int n_iter2 = n_left16 / ( 2 * 1 ); \
int n_left = n_left16 % ( 2 * 1 ); \
const int step_a1 = inc_a1 * 2; \
const int step_a2 = inc_a2 * 2; \
const int step_a3 = inc_a3 * 2; \
const int step_a4 = inc_a4 * 2; \
double* restrict alpha1 = a1; \
double* restrict alpha2 = a2; \
double* restrict alpha3 = a3; \
double* restrict alpha4 = a4; \
v2df_t a1v, a2v, a3v, a4v; \
v2df_t b1v, b2v, b3v, b4v; \
v2df_t g23_k1v, s23_k1v; \
v2df_t g34_k1v, s34_k1v; \
v2df_t g12_k2v, s12_k2v; \
v2df_t g23_k2v, s23_k2v; \
v2df_t t1v, t2v, t3v; \
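/* Broadcast each rotation coefficient into both lanes of a double-precision SSE register. */ \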
g23_k1v.v = _mm_loaddup_pd( gamma23_k1 ); \
s23_k1v.v = _mm_loaddup_pd( sigma23_k1 ); \
g34_k1v.v = _mm_loaddup_pd( gamma34_k1 ); \
s34_k1v.v = _mm_loaddup_pd( sigma34_k1 ); \
g12_k2v.v = _mm_loaddup_pd( gamma12_k2 ); \
s12_k2v.v = _mm_loaddup_pd( sigma12_k2 ); \
g23_k2v.v = _mm_loaddup_pd( gamma23_k2 ); \
s23_k2v.v = _mm_loaddup_pd( sigma23_k2 ); \
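/* Main loop: each iteration handles 16 rows as eight unrolled 2-wide vector steps, alternating between the a*v and b*v register sets. */ \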
for ( i = 0; i < n_iter16; ++i ) \
a2v.v = _mm_load_pd( ( double* )alpha2 ); \
a3v.v = _mm_load_pd( ( double* )alpha3 ); \
a4v.v = _mm_load_pd( ( double* )alpha4 ); \
a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
a1v.v = _mm_load_pd( ( double* )alpha1 ); \
a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
_mm_store_pd( ( double* )alpha4, a4v.v ); \
a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
_mm_store_pd( ( double* )alpha1, a1v.v ); \
b2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
_mm_store_pd( ( double* )alpha2, a2v.v ); \
b3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
b4v.v = _mm_load_pd( ( double* )alpha4 ); \
b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
_mm_store_pd( ( double* )alpha3, a3v.v ); \
b1v.v = _mm_load_pd( ( double* )alpha1 ); \
b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
_mm_store_pd( ( double* )alpha4, b4v.v ); \
b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
_mm_store_pd( ( double* )alpha1, b1v.v ); \
a2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
_mm_store_pd( ( double* )alpha2, b2v.v ); \
a3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
a4v.v = _mm_load_pd( ( double* )alpha4 ); \
a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
_mm_store_pd( ( double* )alpha3, b3v.v ); \
a1v.v = _mm_load_pd( ( double* )alpha1 ); \
a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
_mm_store_pd( ( double* )alpha4, a4v.v ); \
a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
_mm_store_pd( ( double* )alpha1, a1v.v ); \
b2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
_mm_store_pd( ( double* )alpha2, a2v.v ); \
b3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
b4v.v = _mm_load_pd( ( double* )alpha4 ); \
b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
_mm_store_pd( ( double* )alpha3, a3v.v ); \
b1v.v = _mm_load_pd( ( double* )alpha1 ); \
b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
_mm_store_pd( ( double* )alpha4, b4v.v ); \
b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
_mm_store_pd( ( double* )alpha1, b1v.v ); \
a2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
_mm_store_pd( ( double* )alpha2, b2v.v ); \
a3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
a4v.v = _mm_load_pd( ( double* )alpha4 ); \
a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
_mm_store_pd( ( double* )alpha3, b3v.v ); \
a1v.v = _mm_load_pd( ( double* )alpha1 ); \
a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
_mm_store_pd( ( double* )alpha4, a4v.v ); \
a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
_mm_store_pd( ( double* )alpha1, a1v.v ); \
b2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
_mm_store_pd( ( double* )alpha2, a2v.v ); \
b3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
b4v.v = _mm_load_pd( ( double* )alpha4 ); \
b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
_mm_store_pd( ( double* )alpha3, a3v.v ); \
b1v.v = _mm_load_pd( ( double* )alpha1 ); \
b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
_mm_store_pd( ( double* )alpha4, b4v.v ); \
b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
_mm_store_pd( ( double* )alpha1, b1v.v ); \
a2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
_mm_store_pd( ( double* )alpha2, b2v.v ); \
a3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
a4v.v = _mm_load_pd( ( double* )alpha4 ); \
a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
_mm_store_pd( ( double* )alpha3, b3v.v ); \
a1v.v = _mm_load_pd( ( double* )alpha1 ); \
a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
_mm_store_pd( ( double* )alpha4, a4v.v ); \
a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
_mm_store_pd( ( double* )alpha1, a1v.v ); \
b2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
_mm_store_pd( ( double* )alpha2, a2v.v ); \
b3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
b4v.v = _mm_load_pd( ( double* )alpha4 ); \
b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
_mm_store_pd( ( double* )alpha3, a3v.v ); \
b1v.v = _mm_load_pd( ( double* )alpha1 ); \
b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
_mm_store_pd( ( double* )alpha4, b4v.v ); \
b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
_mm_store_pd( ( double* )alpha1, b1v.v ); \
b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
_mm_store_pd( ( double* )alpha2, b2v.v ); \
_mm_store_pd( ( double* )alpha3, b3v.v ); \
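/* Cleanup: apply the rotations to any remaining full 2-row vectors. */ \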
for ( i = 0; i < n_iter2; ++i ) \
a2v.v = _mm_load_pd( ( double* )alpha2 ); \
a3v.v = _mm_load_pd( ( double* )alpha3 ); \
a4v.v = _mm_load_pd( ( double* )alpha4 ); \
a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
a1v.v = _mm_load_pd( ( double* )alpha1 ); \
a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
_mm_store_pd( ( double* )alpha4, a4v.v ); \
a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
_mm_store_pd( ( double* )alpha1, a1v.v ); \
a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
_mm_store_pd( ( double* )alpha2, a2v.v ); \
_mm_store_pd( ( double* )alpha3, a3v.v ); \
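/* If a single row remains, apply the rotations with scalar arithmetic. */ \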
double ga23_k1 = *gamma23_k1; \
double si23_k1 = *sigma23_k1; \
double ga34_k1 = *gamma34_k1; \
double si34_k1 = *sigma34_k1; \
double ga12_k2 = *gamma12_k2; \
double si12_k2 = *sigma12_k2; \
double ga23_k2 = *gamma23_k2; \
double si23_k2 = *sigma23_k2; \
*alpha2 = temp2 * ga23_k1 + temp3 * si23_k1; \
*alpha3 = temp3 * ga23_k1 - temp2 * si23_k1; \
*alpha3 = temp3 * ga34_k1 + temp4 * si34_k1; \
*alpha4 = temp4 * ga34_k1 - temp3 * si34_k1; \
*alpha1 = temp1 * ga12_k2 + temp2 * si12_k2; \
*alpha2 = temp2 * ga12_k2 - temp1 * si12_k2; \
*alpha2 = temp2 * ga23_k2 + temp3 * si23_k2; \
*alpha3 = temp3 * ga23_k2 - temp2 * si23_k2; \
#define MAC_Apply_G_mx4s_asc( m_A, \
int n_iter16 = m_A / ( 2 * 8 ); \
int n_left16 = m_A % ( 2 * 8 ); \
int n_iter2 = n_left16 / ( 2 * 1 ); \
int n_left = n_left16 % ( 2 * 1 ); \
const int step_a1 = inc_a1 * 2; \
const int step_a2 = inc_a2 * 2; \
const int step_a3 = inc_a3 * 2; \
const int step_a4 = inc_a4 * 2; \
scomplex* restrict alpha1 = a1; \
scomplex* restrict alpha2 = a2; \
scomplex* restrict alpha3 = a3; \
scomplex* restrict alpha4 = a4; \
v4sf_t a1v, a2v, a3v, a4v; \
v4sf_t b1v, b2v, b3v, b4v; \
v4sf_t g23_k1v, s23_k1v; \
v4sf_t g34_k1v, s34_k1v; \
v4sf_t g12_k2v, s12_k2v; \
v4sf_t g23_k2v, s23_k2v; \
v4sf_t t1v, t2v, t3v; \
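/* Broadcast each real-valued rotation coefficient into all four lanes; each 128-bit vector then updates two scomplex elements at once. */ \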
g23_k1v.v = _mm_load1_ps( gamma23_k1 ); \
s23_k1v.v = _mm_load1_ps( sigma23_k1 ); \
g34_k1v.v = _mm_load1_ps( gamma34_k1 ); \
s34_k1v.v = _mm_load1_ps( sigma34_k1 ); \
g12_k2v.v = _mm_load1_ps( gamma12_k2 ); \
s12_k2v.v = _mm_load1_ps( sigma12_k2 ); \
g23_k2v.v = _mm_load1_ps( gamma23_k2 ); \
s23_k2v.v = _mm_load1_ps( sigma23_k2 ); \
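/* Main loop: each iteration handles 16 rows (two scomplex elements per vector), alternating between the a*v and b*v register sets. */ \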
for ( i = 0; i < n_iter16; ++i ) \
a2v.v = _mm_load_ps( ( float* )alpha2 ); \
a3v.v = _mm_load_ps( ( float* )alpha3 ); \
a4v.v = _mm_load_ps( ( float* )alpha4 ); \
a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
a1v.v = _mm_load_ps( ( float* )alpha1 ); \
a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
_mm_store_ps( ( float* )alpha4, a4v.v ); \
a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
_mm_store_ps( ( float* )alpha1, a1v.v ); \
b2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
_mm_store_ps( ( float* )alpha2, a2v.v ); \
b3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
b4v.v = _mm_load_ps( ( float* )alpha4 ); \
b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
_mm_store_ps( ( float* )alpha3, a3v.v ); \
b1v.v = _mm_load_ps( ( float* )alpha1 ); \
b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
_mm_store_ps( ( float* )alpha4, b4v.v ); \
b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
_mm_store_ps( ( float* )alpha1, b1v.v ); \
a2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
_mm_store_ps( ( float* )alpha2, b2v.v ); \
a3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
a4v.v = _mm_load_ps( ( float* )alpha4 ); \
a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
_mm_store_ps( ( float* )alpha3, b3v.v ); \
a1v.v = _mm_load_ps( ( float* )alpha1 ); \
a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
_mm_store_ps( ( float* )alpha4, a4v.v ); \
a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
_mm_store_ps( ( float* )alpha1, a1v.v ); \
b2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
_mm_store_ps( ( float* )alpha2, a2v.v ); \
b3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
b4v.v = _mm_load_ps( ( float* )alpha4 ); \
b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
_mm_store_ps( ( float* )alpha3, a3v.v ); \
alpha3 += step_a3; \
b1v.v = _mm_load_ps( ( float* )alpha1 ); \
b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
_mm_store_ps( ( float* )alpha4, b4v.v ); \
alpha4 += step_a4; \
b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
_mm_store_ps( ( float* )alpha1, b1v.v ); \
alpha1 += step_a1; \
a2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
_mm_store_ps( ( float* )alpha2, b2v.v ); \
alpha2 += step_a2; \
a3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
a4v.v = _mm_load_ps( ( float* )alpha4 ); \
a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
_mm_store_ps( ( float* )alpha3, b3v.v ); \
alpha3 += step_a3; \
a1v.v = _mm_load_ps( ( float* )alpha1 ); \
a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
_mm_store_ps( ( float* )alpha4, a4v.v ); \
alpha4 += step_a4; \
a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
_mm_store_ps( ( float* )alpha1, a1v.v ); \
alpha1 += step_a1; \
b2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
_mm_store_ps( ( float* )alpha2, a2v.v ); \
alpha2 += step_a2; \
b3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
b4v.v = _mm_load_ps( ( float* )alpha4 ); \
b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
_mm_store_ps( ( float* )alpha3, a3v.v ); \
alpha3 += step_a3; \
b1v.v = _mm_load_ps( ( float* )alpha1 ); \
b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
_mm_store_ps( ( float* )alpha4, b4v.v ); \
alpha4 += step_a4; \
b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
_mm_store_ps( ( float* )alpha1, b1v.v ); \
alpha1 += step_a1; \
a2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
_mm_store_ps( ( float* )alpha2, b2v.v ); \
alpha2 += step_a2; \
a3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
a4v.v = _mm_load_ps( ( float* )alpha4 ); \
a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
_mm_store_ps( ( float* )alpha3, b3v.v ); \
alpha3 += step_a3; \
a1v.v = _mm_load_ps( ( float* )alpha1 ); \
a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
_mm_store_ps( ( float* )alpha4, a4v.v ); \
alpha4 += step_a4; \
a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
_mm_store_ps( ( float* )alpha1, a1v.v ); \
alpha1 += step_a1; \
b2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
_mm_store_ps( ( float* )alpha2, a2v.v ); \
alpha2 += step_a2; \
b3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
b4v.v = _mm_load_ps( ( float* )alpha4 ); \
b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
_mm_store_ps( ( float* )alpha3, a3v.v ); \
alpha3 += step_a3; \
b1v.v = _mm_load_ps( ( float* )alpha1 ); \
b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
_mm_store_ps( ( float* )alpha4, b4v.v ); \
alpha4 += step_a4; \
b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
_mm_store_ps( ( float* )alpha1, b1v.v ); \
alpha1 += step_a1; \
b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
_mm_store_ps( ( float* )alpha2, b2v.v ); \
alpha2 += step_a2; \
_mm_store_ps( ( float* )alpha3, b3v.v ); \
alpha3 += step_a3; \
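/* Cleanup: process any remaining pairs of rows, one vector per iteration. */ \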
for ( i = 0; i < n_iter2; ++i ) \
a2v.v = _mm_load_ps( ( float* )alpha2 ); \
a3v.v = _mm_load_ps( ( float* )alpha3 ); \
a4v.v = _mm_load_ps( ( float* )alpha4 ); \
a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
a1v.v = _mm_load_ps( ( float* )alpha1 ); \
a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
_mm_store_ps( ( float* )alpha4, a4v.v ); \
alpha4 += step_a4; \
a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
_mm_store_ps( ( float* )alpha1, a1v.v ); \
alpha1 += step_a1; \
a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
_mm_store_ps( ( float* )alpha2, a2v.v ); \
alpha2 += step_a2; \
_mm_store_ps( ( float* )alpha3, a3v.v ); \
alpha3 += step_a3; \
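/* If a single row remains, apply the rotations to its real and imaginary parts with scalar arithmetic. */ \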
if ( n_left == 1 ) \
float ga23_k1 = *gamma23_k1; \
float si23_k1 = *sigma23_k1; \
float ga34_k1 = *gamma34_k1; \
float si34_k1 = *sigma34_k1; \
float ga12_k2 = *gamma12_k2; \
float si12_k2 = *sigma12_k2; \
float ga23_k2 = *gamma23_k2; \
float si23_k2 = *sigma23_k2; \
alpha2->real = temp2.real * ga23_k1 + temp3.real * si23_k1; \
alpha3->real = temp3.real * ga23_k1 - temp2.real * si23_k1; \
alpha2->imag = temp2.imag * ga23_k1 + temp3.imag * si23_k1; \
alpha3->imag = temp3.imag * ga23_k1 - temp2.imag * si23_k1; \
alpha3->real = temp3.real * ga34_k1 + temp4.real * si34_k1; \
alpha4->real = temp4.real * ga34_k1 - temp3.real * si34_k1; \
alpha3->imag = temp3.imag * ga34_k1 + temp4.imag * si34_k1; \
alpha4->imag = temp4.imag * ga34_k1 - temp3.imag * si34_k1; \
alpha1->real = temp1.real * ga12_k2 + temp2.real * si12_k2; \
alpha2->real = temp2.real * ga12_k2 - temp1.real * si12_k2; \
alpha1->imag = temp1.imag * ga12_k2 + temp2.imag * si12_k2; \
alpha2->imag = temp2.imag * ga12_k2 - temp1.imag * si12_k2; \
alpha2->real = temp2.real * ga23_k2 + temp3.real * si23_k2; \
alpha3->real = temp3.real * ga23_k2 - temp2.real * si23_k2; \
alpha2->imag = temp2.imag * ga23_k2 + temp3.imag * si23_k2; \
alpha3->imag = temp3.imag * ga23_k2 - temp2.imag * si23_k2; \
#define MAC_Apply_G_mx4s_asz( m_A, \
int n_iter = m_A / 8; \
int n_left = m_A % 8; \
const int step_a1 = inc_a1 * 1; \
const int step_a2 = inc_a2 * 1; \
const int step_a3 = inc_a3 * 1; \
const int step_a4 = inc_a4 * 1; \
dcomplex* restrict alpha1 = a1; \
dcomplex* restrict alpha2 = a2; \
dcomplex* restrict alpha3 = a3; \
dcomplex* restrict alpha4 = a4; \
v2df_t a1v, a2v, a3v, a4v; \
v2df_t b1v, b2v, b3v, b4v; \
v2df_t g23_k1v, s23_k1v; \
v2df_t g34_k1v, s34_k1v; \
v2df_t g12_k2v, s12_k2v; \
v2df_t g23_k2v, s23_k2v; \
v2df_t t1v, t2v, t3v; \
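/* Broadcast each real-valued rotation coefficient into both lanes; each 128-bit vector holds one dcomplex element. */ \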
g23_k1v.v = _mm_loaddup_pd( gamma23_k1 ); \
s23_k1v.v = _mm_loaddup_pd( sigma23_k1 ); \
g34_k1v.v = _mm_loaddup_pd( gamma34_k1 ); \
s34_k1v.v = _mm_loaddup_pd( sigma34_k1 ); \
g12_k2v.v = _mm_loaddup_pd( gamma12_k2 ); \
s12_k2v.v = _mm_loaddup_pd( sigma12_k2 ); \
g23_k2v.v = _mm_loaddup_pd( gamma23_k2 ); \
s23_k2v.v = _mm_loaddup_pd( sigma23_k2 ); \
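/* Main loop: eight-way unrolled, one dcomplex element (two doubles) per vector step, alternating between the a*v and b*v register sets. */ \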
for ( i = 0; i < n_iter; ++i ) \
a2v.v = _mm_load_pd( ( double* )alpha2 ); \
a3v.v = _mm_load_pd( ( double* )alpha3 ); \
a4v.v = _mm_load_pd( ( double* )alpha4 ); \
a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
a1v.v = _mm_load_pd( ( double* )alpha1 ); \
a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
_mm_store_pd( ( double* )alpha4, a4v.v ); \
alpha4 += step_a4; \
a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
_mm_store_pd( ( double* )alpha1, a1v.v ); \
alpha1 += step_a1; \
b2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
_mm_store_pd( ( double* )alpha2, a2v.v ); \
alpha2 += step_a2; \
b3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
b4v.v = _mm_load_pd( ( double* )alpha4 ); \
b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
_mm_store_pd( ( double* )alpha3, a3v.v ); \
alpha3 += step_a3; \
b1v.v = _mm_load_pd( ( double* )alpha1 ); \
b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
_mm_store_pd( ( double* )alpha4, b4v.v ); \
alpha4 += step_a4; \
b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
_mm_store_pd( ( double* )alpha1, b1v.v ); \
alpha1 += step_a1; \
a2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
_mm_store_pd( ( double* )alpha2, b2v.v ); \
alpha2 += step_a2; \
a3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
a4v.v = _mm_load_pd( ( double* )alpha4 ); \
a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
_mm_store_pd( ( double* )alpha3, b3v.v ); \
alpha3 += step_a3; \
a1v.v = _mm_load_pd( ( double* )alpha1 ); \
a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
_mm_store_pd( ( double* )alpha4, a4v.v ); \
alpha4 += step_a4; \
a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
_mm_store_pd( ( double* )alpha1, a1v.v ); \
alpha1 += step_a1; \
b2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
_mm_store_pd( ( double* )alpha2, a2v.v ); \
alpha2 += step_a2; \
b3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
b4v.v = _mm_load_pd( ( double* )alpha4 ); \
b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
_mm_store_pd( ( double* )alpha3, a3v.v ); \
alpha3 += step_a3; \
b1v.v = _mm_load_pd( ( double* )alpha1 ); \
b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
_mm_store_pd( ( double* )alpha4, b4v.v ); \
alpha4 += step_a4; \
b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
_mm_store_pd( ( double* )alpha1, b1v.v ); \
alpha1 += step_a1; \
a2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
_mm_store_pd( ( double* )alpha2, b2v.v ); \
alpha2 += step_a2; \
a3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
a4v.v = _mm_load_pd( ( double* )alpha4 ); \
a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
_mm_store_pd( ( double* )alpha3, b3v.v ); \
alpha3 += step_a3; \
a1v.v = _mm_load_pd( ( double* )alpha1 ); \
a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
_mm_store_pd( ( double* )alpha4, a4v.v ); \
alpha4 += step_a4; \
a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
_mm_store_pd( ( double* )alpha1, a1v.v ); \
alpha1 += step_a1; \
b2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
_mm_store_pd( ( double* )alpha2, a2v.v ); \
alpha2 += step_a2; \
b3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
b4v.v = _mm_load_pd( ( double* )alpha4 ); \
b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
_mm_store_pd( ( double* )alpha3, a3v.v ); \
alpha3 += step_a3; \
b1v.v = _mm_load_pd( ( double* )alpha1 ); \
b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
_mm_store_pd( ( double* )alpha4, b4v.v ); \
alpha4 += step_a4; \
b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
_mm_store_pd( ( double* )alpha1, b1v.v ); \
alpha1 += step_a1; \
a2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
_mm_store_pd( ( double* )alpha2, b2v.v ); \
alpha2 += step_a2; \
a3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
a4v.v = _mm_load_pd( ( double* )alpha4 ); \
a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
_mm_store_pd( ( double* )alpha3, b3v.v ); \
alpha3 += step_a3; \
a1v.v = _mm_load_pd( ( double* )alpha1 ); \
a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
_mm_store_pd( ( double* )alpha4, a4v.v ); \
alpha4 += step_a4; \
a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
_mm_store_pd( ( double* )alpha1, a1v.v ); \
alpha1 += step_a1; \
b2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
_mm_store_pd( ( double* )alpha2, a2v.v ); \
alpha2 += step_a2; \
b3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
b4v.v = _mm_load_pd( ( double* )alpha4 ); \
b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
_mm_store_pd( ( double* )alpha3, a3v.v ); \
alpha3 += step_a3; \
b1v.v = _mm_load_pd( ( double* )alpha1 ); \
b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
_mm_store_pd( ( double* )alpha4, b4v.v ); \
alpha4 += step_a4; \
b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
_mm_store_pd( ( double* )alpha1, b1v.v ); \
alpha1 += step_a1; \
b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
_mm_store_pd( ( double* )alpha2, b2v.v ); \
alpha2 += step_a2; \
_mm_store_pd( ( double* )alpha3, b3v.v ); \
alpha3 += step_a3; \
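/* Remaining rows: one dcomplex element per iteration, still using vector arithmetic. */ \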
for ( i = 0; i < n_left; ++i ) \
a2v.v = _mm_load_pd( ( double* )alpha2 ); \
a3v.v = _mm_load_pd( ( double* )alpha3 ); \
a4v.v = _mm_load_pd( ( double* )alpha4 ); \
a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
a1v.v = _mm_load_pd( ( double* )alpha1 ); \
a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
_mm_store_pd( ( double* )alpha4, a4v.v ); \
alpha4 += step_a4; \
a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
_mm_store_pd( ( double* )alpha1, a1v.v ); \
alpha1 += step_a1; \
a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
_mm_store_pd( ( double* )alpha2, a2v.v ); \
alpha2 += step_a2; \
_mm_store_pd( ( double* )alpha3, a3v.v ); \
alpha3 += step_a3; \