libflame
FLA_Apply_G_mx4s_asm.h
00001 /*
00002    libflame
00003    An object-based infrastructure for developing high-performance
00004    dense linear algebra libraries.
00005 
00006    Copyright (C) 2011, The University of Texas
00007 
00008    libflame is free software; you can redistribute it and/or modify
00009    it under the terms of the GNU Lesser General Public License as
00010    published by the Free Software Foundation; either version 2.1 of
00011    the License, or (at your option) any later version.
00012 
00013    libflame is distributed in the hope that it will be useful, but
00014    WITHOUT ANY WARRANTY; without even the implied warranty of
00015    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
00016    Lesser General Public License for more details.
00017 
00018    You should have received a copy of the GNU Lesser General Public
00019    License along with libflame; if you did not receive a copy, see
00020    http://www.gnu.org/licenses/.
00021 
00022    For more information, please contact us at flame@cs.utexas.edu or
00023    send mail to:
00024 
00025    Field G. Van Zee and/or
00026    Robert A. van de Geijn
00027    The University of Texas at Austin
00028    Department of Computer Sciences
00029    1 University Station C0500
00030    Austin TX 78712
00031 */
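/*
   The MAC_Apply_G_mx4s_as{s,d,c,z} macros defined below apply two
   overlapping pairs of Givens rotations -- G23 and G34 from one
   iteration (k1), followed by G12 and G23 from the next (k2) -- to four
   columns a1, a2, a3, a4 of length m_A. Each rotation, given by a
   (gamma, sigma) pair, updates two columns elementwise as

       x' = gamma * x + sigma * y
       y' = gamma * y - sigma * x

   When FLA_VECTOR_INTRINSIC_TYPE is FLA_NO_INTRINSICS, the macros alias
   the reference MAC_Apply_G_mx4s_op* kernels; otherwise SSE
   implementations are provided for single, double, single-complex, and
   double-complex data.
*/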
00032 
00033 
00034 #if FLA_VECTOR_INTRINSIC_TYPE == FLA_NO_INTRINSICS
00035 
00036 #define MAC_Apply_G_mx4s_ass MAC_Apply_G_mx4s_ops
00037 #define MAC_Apply_G_mx4s_asd MAC_Apply_G_mx4s_opd
00038 #define MAC_Apply_G_mx4s_asc MAC_Apply_G_mx4s_opc
00039 #define MAC_Apply_G_mx4s_asz MAC_Apply_G_mx4s_opz
00040 
00041 #elif FLA_VECTOR_INTRINSIC_TYPE == FLA_SSE_INTRINSICS
00042 
00043 #define MAC_Apply_G_mx4s_ass( m_A, \
00044                               gamma23_k1, \
00045                               sigma23_k1, \
00046                               gamma34_k1, \
00047                               sigma34_k1, \
00048                               gamma12_k2, \
00049                               sigma12_k2, \
00050                               gamma23_k2, \
00051                               sigma23_k2, \
00052                               a1, inc_a1, \
00053                               a2, inc_a2, \
00054                               a3, inc_a3, \
00055                               a4, inc_a4 ) \
00056 {\
00057     int                n_iter32 = m_A / ( 4 * 8 ); \
00058     int                n_left32 = m_A % ( 4 * 8 ); \
00059     int                n_iter4  = n_left32 / ( 4 * 1 ); \
00060     int                n_left   = n_left32 % ( 4 * 1 ); \
00061     int                i; \
00062 \
00063     const int          step_a1 = inc_a1 * 4; \
00064     const int          step_a2 = inc_a2 * 4; \
00065     const int          step_a3 = inc_a3 * 4; \
00066     const int          step_a4 = inc_a4 * 4; \
00067 \
00068     float*    restrict alpha1 = a1; \
00069     float*    restrict alpha2 = a2; \
00070     float*    restrict alpha3 = a3; \
00071     float*    restrict alpha4 = a4; \
00072 \
00073     v4sf_t             a1v, a2v, a3v, a4v; \
00074     v4sf_t             b1v, b2v, b3v, b4v; \
00075     v4sf_t             g23_k1v, s23_k1v; \
00076     v4sf_t             g34_k1v, s34_k1v; \
00077     v4sf_t             g12_k2v, s12_k2v; \
00078     v4sf_t             g23_k2v, s23_k2v; \
00079     v4sf_t             t1v, t2v, t3v; \
00080 \
00081     g23_k1v.v = _mm_load1_ps( gamma23_k1 ); \
00082     s23_k1v.v = _mm_load1_ps( sigma23_k1 ); \
00083     g34_k1v.v = _mm_load1_ps( gamma34_k1 ); \
00084     s34_k1v.v = _mm_load1_ps( sigma34_k1 ); \
00085     g12_k2v.v = _mm_load1_ps( gamma12_k2 ); \
00086     s12_k2v.v = _mm_load1_ps( sigma12_k2 ); \
00087     g23_k2v.v = _mm_load1_ps( gamma23_k2 ); \
00088     s23_k2v.v = _mm_load1_ps( sigma23_k2 ); \
00089 \
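    /* Main loop: each iteration updates 32 rows (eight 4-float SSE    */ \
    /* blocks). The work is software-pipelined across the a*v and b*v  */ \
    /* register sets so that loads for the next block overlap the      */ \
    /* arithmetic and stores of the current one.                       */ \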
00090     for ( i = 0; i < n_iter32; ++i ) \
00091     { \
00092 \
00093         a2v.v = _mm_load_ps( ( float* )alpha2 ); \
00094         a3v.v = _mm_load_ps( ( float* )alpha3 ); \
00095         a4v.v = _mm_load_ps( ( float* )alpha4 ); \
00096 \
00097         t2v.v = a2v.v; \
00098         a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
00099         a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
00100 \
00101         a1v.v = _mm_load_ps( ( float* )alpha1 ); \
00102 \
00103         t3v.v = a3v.v; \
00104         a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
00105         a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
00106 \
00107         _mm_store_ps( ( float* )alpha4, a4v.v ); \
00108         alpha4 += step_a4; \
00109 \
00110         t1v.v = a1v.v; \
00111         a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
00112         a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
00113 \
00114         _mm_store_ps( ( float* )alpha1, a1v.v ); \
00115         alpha1 += step_a1; \
00116         b2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
00117 \
00118         t2v.v = a2v.v; \
00119         a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
00120         a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
00121 \
00122         _mm_store_ps( ( float* )alpha2, a2v.v ); \
00123         alpha2 += step_a2; \
00124         b3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
00125 \
00126 /* ----------------------------------------------------------- */ \
00127 \
00128         b4v.v = _mm_load_ps( ( float* )alpha4 ); \
00129 \
00130         t2v.v = b2v.v; \
00131         b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
00132         b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
00133 \
00134         _mm_store_ps( ( float* )alpha3, a3v.v ); \
00135         alpha3 += step_a3; \
00136         b1v.v = _mm_load_ps( ( float* )alpha1 ); \
00137 \
00138         t3v.v = b3v.v; \
00139         b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
00140         b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
00141 \
00142         _mm_store_ps( ( float* )alpha4, b4v.v ); \
00143         alpha4 += step_a4; \
00144 \
00145         t1v.v = b1v.v; \
00146         b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
00147         b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
00148 \
00149         _mm_store_ps( ( float* )alpha1, b1v.v ); \
00150         alpha1 += step_a1; \
00151         a2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
00152 \
00153         t2v.v = b2v.v; \
00154         b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
00155         b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
00156 \
00157         _mm_store_ps( ( float* )alpha2, b2v.v ); \
00158         alpha2 += step_a2; \
00159         a3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
00160 \
00161 /* ----------------------------------------------------------- */ \
00162 \
00163         a4v.v = _mm_load_ps( ( float* )alpha4 ); \
00164 \
00165         t2v.v = a2v.v; \
00166         a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
00167         a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
00168 \
00169         _mm_store_ps( ( float* )alpha3, b3v.v ); \
00170         alpha3 += step_a3; \
00171         a1v.v = _mm_load_ps( ( float* )alpha1 ); \
00172 \
00173         t3v.v = a3v.v; \
00174         a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
00175         a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
00176 \
00177         _mm_store_ps( ( float* )alpha4, a4v.v ); \
00178         alpha4 += step_a4; \
00179 \
00180         t1v.v = a1v.v; \
00181         a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
00182         a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
00183 \
00184         _mm_store_ps( ( float* )alpha1, a1v.v ); \
00185         alpha1 += step_a1; \
00186         b2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
00187 \
00188         t2v.v = a2v.v; \
00189         a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
00190         a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
00191 \
00192         _mm_store_ps( ( float* )alpha2, a2v.v ); \
00193         alpha2 += step_a2; \
00194         b3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
00195 \
00196 /* ----------------------------------------------------------- */ \
00197 \
00198         b4v.v = _mm_load_ps( ( float* )alpha4 ); \
00199 \
00200         t2v.v = b2v.v; \
00201         b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
00202         b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
00203 \
00204         _mm_store_ps( ( float* )alpha3, a3v.v ); \
00205         alpha3 += step_a3; \
00206         b1v.v = _mm_load_ps( ( float* )alpha1 ); \
00207 \
00208         t3v.v = b3v.v; \
00209         b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
00210         b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
00211 \
00212         _mm_store_ps( ( float* )alpha4, b4v.v ); \
00213         alpha4 += step_a4; \
00214 \
00215         t1v.v = b1v.v; \
00216         b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
00217         b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
00218 \
00219         _mm_store_ps( ( float* )alpha1, b1v.v ); \
00220         alpha1 += step_a1; \
00221         a2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
00222 \
00223         t2v.v = b2v.v; \
00224         b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
00225         b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
00226 \
00227         _mm_store_ps( ( float* )alpha2, b2v.v ); \
00228         alpha2 += step_a2; \
00229         a3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
00230 \
00231 \
00232 /* ----------------------------------------------------------- */ \
00233 \
00234         a4v.v = _mm_load_ps( ( float* )alpha4 ); \
00235 \
00236         t2v.v = a2v.v; \
00237         a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
00238         a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
00239 \
00240         _mm_store_ps( ( float* )alpha3, b3v.v ); \
00241         alpha3 += step_a3; \
00242         a1v.v = _mm_load_ps( ( float* )alpha1 ); \
00243 \
00244         t3v.v = a3v.v; \
00245         a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
00246         a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
00247 \
00248         _mm_store_ps( ( float* )alpha4, a4v.v ); \
00249         alpha4 += step_a4; \
00250 \
00251         t1v.v = a1v.v; \
00252         a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
00253         a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
00254 \
00255         _mm_store_ps( ( float* )alpha1, a1v.v ); \
00256         alpha1 += step_a1; \
00257         b2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
00258 \
00259         t2v.v = a2v.v; \
00260         a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
00261         a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
00262 \
00263         _mm_store_ps( ( float* )alpha2, a2v.v ); \
00264         alpha2 += step_a2; \
00265         b3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
00266 \
00267 /* ----------------------------------------------------------- */ \
00268 \
00269         b4v.v = _mm_load_ps( ( float* )alpha4 ); \
00270 \
00271         t2v.v = b2v.v; \
00272         b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
00273         b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
00274 \
00275         _mm_store_ps( ( float* )alpha3, a3v.v ); \
00276         alpha3 += step_a3; \
00277         b1v.v = _mm_load_ps( ( float* )alpha1 ); \
00278 \
00279         t3v.v = b3v.v; \
00280         b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
00281         b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
00282 \
00283         _mm_store_ps( ( float* )alpha4, b4v.v ); \
00284         alpha4 += step_a4; \
00285 \
00286         t1v.v = b1v.v; \
00287         b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
00288         b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
00289 \
00290         _mm_store_ps( ( float* )alpha1, b1v.v ); \
00291         alpha1 += step_a1; \
00292         a2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
00293 \
00294         t2v.v = b2v.v; \
00295         b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
00296         b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
00297 \
00298         _mm_store_ps( ( float* )alpha2, b2v.v ); \
00299         alpha2 += step_a2; \
00300         a3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
00301 \
00302 /* ----------------------------------------------------------- */ \
00303 \
00304         a4v.v = _mm_load_ps( ( float* )alpha4 ); \
00305 \
00306         t2v.v = a2v.v; \
00307         a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
00308         a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
00309 \
00310         _mm_store_ps( ( float* )alpha3, b3v.v ); \
00311         alpha3 += step_a3; \
00312         a1v.v = _mm_load_ps( ( float* )alpha1 ); \
00313 \
00314         t3v.v = a3v.v; \
00315         a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
00316         a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
00317 \
00318         _mm_store_ps( ( float* )alpha4, a4v.v ); \
00319         alpha4 += step_a4; \
00320 \
00321         t1v.v = a1v.v; \
00322         a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
00323         a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
00324 \
00325         _mm_store_ps( ( float* )alpha1, a1v.v ); \
00326         alpha1 += step_a1; \
00327         b2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
00328 \
00329         t2v.v = a2v.v; \
00330         a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
00331         a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
00332 \
00333         _mm_store_ps( ( float* )alpha2, a2v.v ); \
00334         alpha2 += step_a2; \
00335         b3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
00336 \
00337 /* ----------------------------------------------------------- */ \
00338 \
00339         b4v.v = _mm_load_ps( ( float* )alpha4 ); \
00340 \
00341         t2v.v = b2v.v; \
00342         b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
00343         b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
00344 \
00345         _mm_store_ps( ( float* )alpha3, a3v.v ); \
00346         alpha3 += step_a3; \
00347         b1v.v = _mm_load_ps( ( float* )alpha1 ); \
00348 \
00349         t3v.v = b3v.v; \
00350         b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
00351         b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
00352 \
00353         _mm_store_ps( ( float* )alpha4, b4v.v ); \
00354         alpha4 += step_a4; \
00355 \
00356         t1v.v = b1v.v; \
00357         b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
00358         b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
00359 \
00360         _mm_store_ps( ( float* )alpha1, b1v.v ); \
00361         alpha1 += step_a1; \
00362 \
00363         t2v.v = b2v.v; \
00364         b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
00365         b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
00366 \
00367         _mm_store_ps( ( float* )alpha2, b2v.v ); \
00368         alpha2 += step_a2; \
00369 \
00370         _mm_store_ps( ( float* )alpha3, b3v.v ); \
00371         alpha3 += step_a3; \
00372 \
00373 /* ----------------------------------------------------------- */ \
00374     } \
00375 \
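    /* Cleanup: first handle any remaining groups of four rows with a  */ \
    /* single vectorized pass, then process the final n_left rows with */ \
    /* scalar arithmetic.                                              */ \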
00376     for ( i = 0; i < n_iter4; ++i ) \
00377     { \
00378 \
00379         a2v.v = _mm_load_ps( ( float* )alpha2 ); \
00380         a3v.v = _mm_load_ps( ( float* )alpha3 ); \
00381         a4v.v = _mm_load_ps( ( float* )alpha4 ); \
00382 \
00383         t2v.v = a2v.v; \
00384         a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
00385         a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
00386 \
00387         a1v.v = _mm_load_ps( ( float* )alpha1 ); \
00388 \
00389         t3v.v = a3v.v; \
00390         a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
00391         a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
00392 \
00393         _mm_store_ps( ( float* )alpha4, a4v.v ); \
00394         alpha4 += step_a4; \
00395 \
00396         t1v.v = a1v.v; \
00397         a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
00398         a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
00399 \
00400         _mm_store_ps( ( float* )alpha1, a1v.v ); \
00401         alpha1 += step_a1; \
00402 \
00403         t2v.v = a2v.v; \
00404         a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
00405         a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
00406 \
00407         _mm_store_ps( ( float* )alpha2, a2v.v ); \
00408         alpha2 += step_a2; \
00409         _mm_store_ps( ( float* )alpha3, a3v.v ); \
00410         alpha3 += step_a3; \
00411     } \
00412 \
00413     for ( i = 0; i < n_left; ++i ) \
00414     { \
00415         float              ga23_k1 = *gamma23_k1; \
00416         float              si23_k1 = *sigma23_k1; \
00417         float              ga34_k1 = *gamma34_k1; \
00418         float              si34_k1 = *sigma34_k1; \
00419         float              ga12_k2 = *gamma12_k2; \
00420         float              si12_k2 = *sigma12_k2; \
00421         float              ga23_k2 = *gamma23_k2; \
00422         float              si23_k2 = *sigma23_k2; \
00423         float              temp1; \
00424         float              temp2; \
00425         float              temp3; \
00426         float              temp4; \
00427 \
00428         temp2 = *alpha2; \
00429         temp3 = *alpha3; \
00430 \
00431         *alpha2 = temp2 * ga23_k1 + temp3 * si23_k1; \
00432         *alpha3 = temp3 * ga23_k1 - temp2 * si23_k1; \
00433 \
00434         temp3 = *alpha3; \
00435         temp4 = *alpha4; \
00436 \
00437         *alpha3 = temp3 * ga34_k1 + temp4 * si34_k1; \
00438         *alpha4 = temp4 * ga34_k1 - temp3 * si34_k1; \
00439 \
00440         temp1 = *alpha1; \
00441         temp2 = *alpha2; \
00442 \
00443         *alpha1 = temp1 * ga12_k2 + temp2 * si12_k2; \
00444         *alpha2 = temp2 * ga12_k2 - temp1 * si12_k2; \
00445 \
00446         temp2 = *alpha2; \
00447         temp3 = *alpha3; \
00448 \
00449         *alpha2 = temp2 * ga23_k2 + temp3 * si23_k2; \
00450         *alpha3 = temp3 * ga23_k2 - temp2 * si23_k2; \
00451 \
00452         alpha1 += 1; \
00453         alpha2 += 1; \
00454         alpha3 += 1; \
00455         alpha4 += 1; \
00456     } \
00457 }
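/*
   Illustrative sketch (not part of the library): one way the single-
   precision macro above might be invoked on four 16-byte-aligned,
   unit-stride float columns. The variable names used here (m, g23_1,
   s23_1, c1, ...) are hypothetical.

       float  g23_1, s23_1, g34_1, s34_1;   // rotation pair from iteration k1
       float  g12_2, s12_2, g23_2, s23_2;   // rotation pair from iteration k2
       float *c1, *c2, *c3, *c4;            // columns of length m
       int    m;

       MAC_Apply_G_mx4s_ass( m,
                             &g23_1, &s23_1, &g34_1, &s34_1,
                             &g12_2, &s12_2, &g23_2, &s23_2,
                             c1, 1, c2, 1, c3, 1, c4, 1 );

   Because the SSE path uses aligned loads and stores (_mm_load_ps,
   _mm_store_ps), the column storage must be 16-byte aligned.
*/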
00458 
00459 #define MAC_Apply_G_mx4s_asd( m_A, \
00460                               gamma23_k1, \
00461                               sigma23_k1, \
00462                               gamma34_k1, \
00463                               sigma34_k1, \
00464                               gamma12_k2, \
00465                               sigma12_k2, \
00466                               gamma23_k2, \
00467                               sigma23_k2, \
00468                               a1, inc_a1, \
00469                               a2, inc_a2, \
00470                               a3, inc_a3, \
00471                               a4, inc_a4 ) \
00472 {\
00473     int                n_iter16 = m_A / ( 2 * 8 ); \
00474     int                n_left16 = m_A % ( 2 * 8 ); \
00475     int                n_iter2  = n_left16 / ( 2 * 1 ); \
00476     int                n_left   = n_left16 % ( 2 * 1 ); \
00477     int                i; \
00478 \
00479     const int          step_a1 = inc_a1 * 2; \
00480     const int          step_a2 = inc_a2 * 2; \
00481     const int          step_a3 = inc_a3 * 2; \
00482     const int          step_a4 = inc_a4 * 2; \
00483 \
00484     double*   restrict alpha1 = a1; \
00485     double*   restrict alpha2 = a2; \
00486     double*   restrict alpha3 = a3; \
00487     double*   restrict alpha4 = a4; \
00488 \
00489     v2df_t             a1v, a2v, a3v, a4v; \
00490     v2df_t             b1v, b2v, b3v, b4v; \
00491     v2df_t             g23_k1v, s23_k1v; \
00492     v2df_t             g34_k1v, s34_k1v; \
00493     v2df_t             g12_k2v, s12_k2v; \
00494     v2df_t             g23_k2v, s23_k2v; \
00495     v2df_t             t1v, t2v, t3v; \
00496 \
00497     g23_k1v.v = _mm_loaddup_pd( gamma23_k1 ); \
00498     s23_k1v.v = _mm_loaddup_pd( sigma23_k1 ); \
00499     g34_k1v.v = _mm_loaddup_pd( gamma34_k1 ); \
00500     s34_k1v.v = _mm_loaddup_pd( sigma34_k1 ); \
00501     g12_k2v.v = _mm_loaddup_pd( gamma12_k2 ); \
00502     s12_k2v.v = _mm_loaddup_pd( sigma12_k2 ); \
00503     g23_k2v.v = _mm_loaddup_pd( gamma23_k2 ); \
00504     s23_k2v.v = _mm_loaddup_pd( sigma23_k2 ); \
00505 \
00506     for ( i = 0; i < n_iter16; ++i ) \
00507     { \
00508 \
00509         a2v.v = _mm_load_pd( ( double* )alpha2 ); \
00510         a3v.v = _mm_load_pd( ( double* )alpha3 ); \
00511         a4v.v = _mm_load_pd( ( double* )alpha4 ); \
00512 \
00513         t2v.v = a2v.v; \
00514         a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
00515         a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
00516 \
00517         a1v.v = _mm_load_pd( ( double* )alpha1 ); \
00518 \
00519         t3v.v = a3v.v; \
00520         a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
00521         a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
00522 \
00523         _mm_store_pd( ( double* )alpha4, a4v.v ); \
00524         alpha4 += step_a4; \
00525 \
00526         t1v.v = a1v.v; \
00527         a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
00528         a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
00529 \
00530         _mm_store_pd( ( double* )alpha1, a1v.v ); \
00531         alpha1 += step_a1; \
00532         b2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
00533 \
00534         t2v.v = a2v.v; \
00535         a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
00536         a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
00537 \
00538         _mm_store_pd( ( double* )alpha2, a2v.v ); \
00539         alpha2 += step_a2; \
00540         b3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
00541 \
00542 /* ----------------------------------------------------------- */ \
00543 \
00544         b4v.v = _mm_load_pd( ( double* )alpha4 ); \
00545 \
00546         t2v.v = b2v.v; \
00547         b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
00548         b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
00549 \
00550         _mm_store_pd( ( double* )alpha3, a3v.v ); \
00551         alpha3 += step_a3; \
00552         b1v.v = _mm_load_pd( ( double* )alpha1 ); \
00553 \
00554         t3v.v = b3v.v; \
00555         b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
00556         b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
00557 \
00558         _mm_store_pd( ( double* )alpha4, b4v.v ); \
00559         alpha4 += step_a4; \
00560 \
00561         t1v.v = b1v.v; \
00562         b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
00563         b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
00564 \
00565         _mm_store_pd( ( double* )alpha1, b1v.v ); \
00566         alpha1 += step_a1; \
00567         a2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
00568 \
00569         t2v.v = b2v.v; \
00570         b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
00571         b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
00572 \
00573         _mm_store_pd( ( double* )alpha2, b2v.v ); \
00574         alpha2 += step_a2; \
00575         a3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
00576 \
00577 /* ----------------------------------------------------------- */ \
00578 \
00579         a4v.v = _mm_load_pd( ( double* )alpha4 ); \
00580 \
00581         t2v.v = a2v.v; \
00582         a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
00583         a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
00584 \
00585         _mm_store_pd( ( double* )alpha3, b3v.v ); \
00586         alpha3 += step_a3; \
00587         a1v.v = _mm_load_pd( ( double* )alpha1 ); \
00588 \
00589         t3v.v = a3v.v; \
00590         a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
00591         a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
00592 \
00593         _mm_store_pd( ( double* )alpha4, a4v.v ); \
00594         alpha4 += step_a4; \
00595 \
00596         t1v.v = a1v.v; \
00597         a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
00598         a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
00599 \
00600         _mm_store_pd( ( double* )alpha1, a1v.v ); \
00601         alpha1 += step_a1; \
00602         b2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
00603 \
00604         t2v.v = a2v.v; \
00605         a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
00606         a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
00607 \
00608         _mm_store_pd( ( double* )alpha2, a2v.v ); \
00609         alpha2 += step_a2; \
00610         b3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
00611 \
00612 /* ----------------------------------------------------------- */ \
00613 \
00614         b4v.v = _mm_load_pd( ( double* )alpha4 ); \
00615 \
00616         t2v.v = b2v.v; \
00617         b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
00618         b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
00619 \
00620         _mm_store_pd( ( double* )alpha3, a3v.v ); \
00621         alpha3 += step_a3; \
00622         b1v.v = _mm_load_pd( ( double* )alpha1 ); \
00623 \
00624         t3v.v = b3v.v; \
00625         b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
00626         b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
00627 \
00628         _mm_store_pd( ( double* )alpha4, b4v.v ); \
00629         alpha4 += step_a4; \
00630 \
00631         t1v.v = b1v.v; \
00632         b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
00633         b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
00634 \
00635         _mm_store_pd( ( double* )alpha1, b1v.v ); \
00636         alpha1 += step_a1; \
00637         a2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
00638 \
00639         t2v.v = b2v.v; \
00640         b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
00641         b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
00642 \
00643         _mm_store_pd( ( double* )alpha2, b2v.v ); \
00644         alpha2 += step_a2; \
00645         a3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
00646 \
00647 \
00648 /* ----------------------------------------------------------- */ \
00649 \
00650         a4v.v = _mm_load_pd( ( double* )alpha4 ); \
00651 \
00652         t2v.v = a2v.v; \
00653         a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
00654         a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
00655 \
00656         _mm_store_pd( ( double* )alpha3, b3v.v ); \
00657         alpha3 += step_a3; \
00658         a1v.v = _mm_load_pd( ( double* )alpha1 ); \
00659 \
00660         t3v.v = a3v.v; \
00661         a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
00662         a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
00663 \
00664         _mm_store_pd( ( double* )alpha4, a4v.v ); \
00665         alpha4 += step_a4; \
00666 \
00667         t1v.v = a1v.v; \
00668         a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
00669         a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
00670 \
00671         _mm_store_pd( ( double* )alpha1, a1v.v ); \
00672         alpha1 += step_a1; \
00673         b2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
00674 \
00675         t2v.v = a2v.v; \
00676         a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
00677         a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
00678 \
00679         _mm_store_pd( ( double* )alpha2, a2v.v ); \
00680         alpha2 += step_a2; \
00681         b3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
00682 \
00683 /* ----------------------------------------------------------- */ \
00684 \
00685         b4v.v = _mm_load_pd( ( double* )alpha4 ); \
00686 \
00687         t2v.v = b2v.v; \
00688         b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
00689         b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
00690 \
00691         _mm_store_pd( ( double* )alpha3, a3v.v ); \
00692         alpha3 += step_a3; \
00693         b1v.v = _mm_load_pd( ( double* )alpha1 ); \
00694 \
00695         t3v.v = b3v.v; \
00696         b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
00697         b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
00698 \
00699         _mm_store_pd( ( double* )alpha4, b4v.v ); \
00700         alpha4 += step_a4; \
00701 \
00702         t1v.v = b1v.v; \
00703         b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
00704         b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
00705 \
00706         _mm_store_pd( ( double* )alpha1, b1v.v ); \
00707         alpha1 += step_a1; \
00708         a2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
00709 \
00710         t2v.v = b2v.v; \
00711         b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
00712         b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
00713 \
00714         _mm_store_pd( ( double* )alpha2, b2v.v ); \
00715         alpha2 += step_a2; \
00716         a3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
00717 \
00718 /* ----------------------------------------------------------- */ \
00719 \
00720         a4v.v = _mm_load_pd( ( double* )alpha4 ); \
00721 \
00722         t2v.v = a2v.v; \
00723         a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
00724         a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
00725 \
00726         _mm_store_pd( ( double* )alpha3, b3v.v ); \
00727         alpha3 += step_a3; \
00728         a1v.v = _mm_load_pd( ( double* )alpha1 ); \
00729 \
00730         t3v.v = a3v.v; \
00731         a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
00732         a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
00733 \
00734         _mm_store_pd( ( double* )alpha4, a4v.v ); \
00735         alpha4 += step_a4; \
00736 \
00737         t1v.v = a1v.v; \
00738         a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
00739         a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
00740 \
00741         _mm_store_pd( ( double* )alpha1, a1v.v ); \
00742         alpha1 += step_a1; \
00743         b2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
00744 \
00745         t2v.v = a2v.v; \
00746         a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
00747         a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
00748 \
00749         _mm_store_pd( ( double* )alpha2, a2v.v ); \
00750         alpha2 += step_a2; \
00751         b3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
00752 \
00753 /* ----------------------------------------------------------- */ \
00754 \
00755         b4v.v = _mm_load_pd( ( double* )alpha4 ); \
00756 \
00757         t2v.v = b2v.v; \
00758         b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
00759         b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
00760 \
00761         _mm_store_pd( ( double* )alpha3, a3v.v ); \
00762         alpha3 += step_a3; \
00763         b1v.v = _mm_load_pd( ( double* )alpha1 ); \
00764 \
00765         t3v.v = b3v.v; \
00766         b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
00767         b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
00768 \
00769         _mm_store_pd( ( double* )alpha4, b4v.v ); \
00770         alpha4 += step_a4; \
00771 \
00772         t1v.v = b1v.v; \
00773         b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
00774         b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
00775 \
00776         _mm_store_pd( ( double* )alpha1, b1v.v ); \
00777         alpha1 += step_a1; \
00778 \
00779         t2v.v = b2v.v; \
00780         b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
00781         b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
00782 \
00783         _mm_store_pd( ( double* )alpha2, b2v.v ); \
00784         alpha2 += step_a2; \
00785 \
00786         _mm_store_pd( ( double* )alpha3, b3v.v ); \
00787         alpha3 += step_a3; \
00788 \
00789 /* ----------------------------------------------------------- */ \
00790     } \
00791 \
00792     for ( i = 0; i < n_iter2; ++i ) \
00793     { \
00794 \
00795         a2v.v = _mm_load_pd( ( double* )alpha2 ); \
00796         a3v.v = _mm_load_pd( ( double* )alpha3 ); \
00797         a4v.v = _mm_load_pd( ( double* )alpha4 ); \
00798 \
00799         t2v.v = a2v.v; \
00800         a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
00801         a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
00802 \
00803         a1v.v = _mm_load_pd( ( double* )alpha1 ); \
00804 \
00805         t3v.v = a3v.v; \
00806         a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
00807         a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
00808 \
00809         _mm_store_pd( ( double* )alpha4, a4v.v ); \
00810         alpha4 += step_a4; \
00811 \
00812         t1v.v = a1v.v; \
00813         a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
00814         a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
00815 \
00816         _mm_store_pd( ( double* )alpha1, a1v.v ); \
00817         alpha1 += step_a1; \
00818 \
00819         t2v.v = a2v.v; \
00820         a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
00821         a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
00822 \
00823         _mm_store_pd( ( double* )alpha2, a2v.v ); \
00824         alpha2 += step_a2; \
00825         _mm_store_pd( ( double* )alpha3, a3v.v ); \
00826         alpha3 += step_a3; \
00827     } \
00828 \
00829     if ( n_left == 1 ) \
00830     { \
00831         double             ga23_k1 = *gamma23_k1; \
00832         double             si23_k1 = *sigma23_k1; \
00833         double             ga34_k1 = *gamma34_k1; \
00834         double             si34_k1 = *sigma34_k1; \
00835         double             ga12_k2 = *gamma12_k2; \
00836         double             si12_k2 = *sigma12_k2; \
00837         double             ga23_k2 = *gamma23_k2; \
00838         double             si23_k2 = *sigma23_k2; \
00839         double             temp1; \
00840         double             temp2; \
00841         double             temp3; \
00842         double             temp4; \
00843 \
00844         temp2 = *alpha2; \
00845         temp3 = *alpha3; \
00846 \
00847         *alpha2 = temp2 * ga23_k1 + temp3 * si23_k1; \
00848         *alpha3 = temp3 * ga23_k1 - temp2 * si23_k1; \
00849 \
00850         temp3 = *alpha3; \
00851         temp4 = *alpha4; \
00852 \
00853         *alpha3 = temp3 * ga34_k1 + temp4 * si34_k1; \
00854         *alpha4 = temp4 * ga34_k1 - temp3 * si34_k1; \
00855 \
00856         temp1 = *alpha1; \
00857         temp2 = *alpha2; \
00858 \
00859         *alpha1 = temp1 * ga12_k2 + temp2 * si12_k2; \
00860         *alpha2 = temp2 * ga12_k2 - temp1 * si12_k2; \
00861 \
00862         temp2 = *alpha2; \
00863         temp3 = *alpha3; \
00864 \
00865         *alpha2 = temp2 * ga23_k2 + temp3 * si23_k2; \
00866         *alpha3 = temp3 * ga23_k2 - temp2 * si23_k2; \
00867 \
00868     } \
00869 }
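/*
   The single-precision complex variant below packs two scomplex values
   (four floats) per SSE register. Since the rotations are real, the
   same elementwise update is applied to the real and imaginary parts,
   so the kernel mirrors the real single-precision case above.
*/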
00870 
00871 #define MAC_Apply_G_mx4s_asc( m_A, \
00872                               gamma23_k1, \
00873                               sigma23_k1, \
00874                               gamma34_k1, \
00875                               sigma34_k1, \
00876                               gamma12_k2, \
00877                               sigma12_k2, \
00878                               gamma23_k2, \
00879                               sigma23_k2, \
00880                               a1, inc_a1, \
00881                               a2, inc_a2, \
00882                               a3, inc_a3, \
00883                               a4, inc_a4 ) \
00884 {\
00885     int                n_iter16 = m_A / ( 2 * 8 ); \
00886     int                n_left16 = m_A % ( 2 * 8 ); \
00887     int                n_iter2  = n_left16 / ( 2 * 1 ); \
00888     int                n_left   = n_left16 % ( 2 * 1 ); \
00889     int                i; \
00890 \
00891     const int          step_a1 = inc_a1 * 2; \
00892     const int          step_a2 = inc_a2 * 2; \
00893     const int          step_a3 = inc_a3 * 2; \
00894     const int          step_a4 = inc_a4 * 2; \
00895 \
00896     scomplex* restrict alpha1 = a1; \
00897     scomplex* restrict alpha2 = a2; \
00898     scomplex* restrict alpha3 = a3; \
00899     scomplex* restrict alpha4 = a4; \
00900 \
00901     v4sf_t             a1v, a2v, a3v, a4v; \
00902     v4sf_t             b1v, b2v, b3v, b4v; \
00903     v4sf_t             g23_k1v, s23_k1v; \
00904     v4sf_t             g34_k1v, s34_k1v; \
00905     v4sf_t             g12_k2v, s12_k2v; \
00906     v4sf_t             g23_k2v, s23_k2v; \
00907     v4sf_t             t1v, t2v, t3v; \
00908 \
00909     g23_k1v.v = _mm_load1_ps( gamma23_k1 ); \
00910     s23_k1v.v = _mm_load1_ps( sigma23_k1 ); \
00911     g34_k1v.v = _mm_load1_ps( gamma34_k1 ); \
00912     s34_k1v.v = _mm_load1_ps( sigma34_k1 ); \
00913     g12_k2v.v = _mm_load1_ps( gamma12_k2 ); \
00914     s12_k2v.v = _mm_load1_ps( sigma12_k2 ); \
00915     g23_k2v.v = _mm_load1_ps( gamma23_k2 ); \
00916     s23_k2v.v = _mm_load1_ps( sigma23_k2 ); \
00917 \
00918     for ( i = 0; i < n_iter16; ++i ) \
00919     { \
00920 \
00921         a2v.v = _mm_load_ps( ( float* )alpha2 ); \
00922         a3v.v = _mm_load_ps( ( float* )alpha3 ); \
00923         a4v.v = _mm_load_ps( ( float* )alpha4 ); \
00924 \
00925         t2v.v = a2v.v; \
00926         a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
00927         a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
00928 \
00929         a1v.v = _mm_load_ps( ( float* )alpha1 ); \
00930 \
00931         t3v.v = a3v.v; \
00932         a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
00933         a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
00934 \
00935         _mm_store_ps( ( float* )alpha4, a4v.v ); \
00936         alpha4 += step_a4; \
00937 \
00938         t1v.v = a1v.v; \
00939         a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
00940         a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
00941 \
00942         _mm_store_ps( ( float* )alpha1, a1v.v ); \
00943         alpha1 += step_a1; \
00944         b2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
00945 \
00946         t2v.v = a2v.v; \
00947         a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
00948         a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
00949 \
00950         _mm_store_ps( ( float* )alpha2, a2v.v ); \
00951         alpha2 += step_a2; \
00952         b3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
00953 \
00954 /* ----------------------------------------------------------- */ \
00955 \
00956         b4v.v = _mm_load_ps( ( float* )alpha4 ); \
00957 \
00958         t2v.v = b2v.v; \
00959         b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
00960         b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
00961 \
00962         _mm_store_ps( ( float* )alpha3, a3v.v ); \
00963         alpha3 += step_a3; \
00964         b1v.v = _mm_load_ps( ( float* )alpha1 ); \
00965 \
00966         t3v.v = b3v.v; \
00967         b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
00968         b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
00969 \
00970         _mm_store_ps( ( float* )alpha4, b4v.v ); \
00971         alpha4 += step_a4; \
00972 \
00973         t1v.v = b1v.v; \
00974         b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
00975         b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
00976 \
00977         _mm_store_ps( ( float* )alpha1, b1v.v ); \
00978         alpha1 += step_a1; \
00979         a2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
00980 \
00981         t2v.v = b2v.v; \
00982         b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
00983         b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
00984 \
00985         _mm_store_ps( ( float* )alpha2, b2v.v ); \
00986         alpha2 += step_a2; \
00987         a3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
00988 \
00989 /* ----------------------------------------------------------- */ \
00990 \
00991         a4v.v = _mm_load_ps( ( float* )alpha4 ); \
00992 \
00993         t2v.v = a2v.v; \
00994         a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
00995         a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
00996 \
00997         _mm_store_ps( ( float* )alpha3, b3v.v ); \
00998         alpha3 += step_a3; \
00999         a1v.v = _mm_load_ps( ( float* )alpha1 ); \
01000 \
01001         t3v.v = a3v.v; \
01002         a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
01003         a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
01004 \
01005         _mm_store_ps( ( float* )alpha4, a4v.v ); \
01006         alpha4 += step_a4; \
01007 \
01008         t1v.v = a1v.v; \
01009         a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
01010         a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
01011 \
01012         _mm_store_ps( ( float* )alpha1, a1v.v ); \
01013         alpha1 += step_a1; \
01014         b2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
01015 \
01016         t2v.v = a2v.v; \
01017         a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
01018         a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
01019 \
01020         _mm_store_ps( ( float* )alpha2, a2v.v ); \
01021         alpha2 += step_a2; \
01022         b3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
01023 \
01024 /* ----------------------------------------------------------- */ \
01025 \
01026         b4v.v = _mm_load_ps( ( float* )alpha4 ); \
01027 \
01028         t2v.v = b2v.v; \
01029         b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
01030         b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
01031 \
01032         _mm_store_ps( ( float* )alpha3, a3v.v ); \
01033         alpha3 += step_a3; \
01034         b1v.v = _mm_load_ps( ( float* )alpha1 ); \
01035 \
01036         t3v.v = b3v.v; \
01037         b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
01038         b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
01039 \
01040         _mm_store_ps( ( float* )alpha4, b4v.v ); \
01041         alpha4 += step_a4; \
01042 \
01043         t1v.v = b1v.v; \
01044         b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
01045         b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
01046 \
01047         _mm_store_ps( ( float* )alpha1, b1v.v ); \
01048         alpha1 += step_a1; \
01049         a2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
01050 \
01051         t2v.v = b2v.v; \
01052         b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
01053         b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
01054 \
01055         _mm_store_ps( ( float* )alpha2, b2v.v ); \
01056         alpha2 += step_a2; \
01057         a3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
01058 \
01059 \
01060 /* ----------------------------------------------------------- */ \
01061 \
01062         a4v.v = _mm_load_ps( ( float* )alpha4 ); \
01063 \
01064         t2v.v = a2v.v; \
01065         a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
01066         a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
01067 \
01068         _mm_store_ps( ( float* )alpha3, b3v.v ); \
01069         alpha3 += step_a3; \
01070         a1v.v = _mm_load_ps( ( float* )alpha1 ); \
01071 \
01072         t3v.v = a3v.v; \
01073         a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
01074         a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
01075 \
01076         _mm_store_ps( ( float* )alpha4, a4v.v ); \
01077         alpha4 += step_a4; \
01078 \
01079         t1v.v = a1v.v; \
01080         a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
01081         a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
01082 \
01083         _mm_store_ps( ( float* )alpha1, a1v.v ); \
01084         alpha1 += step_a1; \
01085         b2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
01086 \
01087         t2v.v = a2v.v; \
01088         a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
01089         a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
01090 \
01091         _mm_store_ps( ( float* )alpha2, a2v.v ); \
01092         alpha2 += step_a2; \
01093         b3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
01094 \
01095 /* ----------------------------------------------------------- */ \
01096 \
01097         b4v.v = _mm_load_ps( ( float* )alpha4 ); \
01098 \
01099         t2v.v = b2v.v; \
01100         b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
01101         b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
01102 \
01103         _mm_store_ps( ( float* )alpha3, a3v.v ); \
01104         alpha3 += step_a3; \
01105         b1v.v = _mm_load_ps( ( float* )alpha1 ); \
01106 \
01107         t3v.v = b3v.v; \
01108         b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
01109         b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
01110 \
01111         _mm_store_ps( ( float* )alpha4, b4v.v ); \
01112         alpha4 += step_a4; \
01113 \
01114         t1v.v = b1v.v; \
01115         b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
01116         b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
01117 \
01118         _mm_store_ps( ( float* )alpha1, b1v.v ); \
01119         alpha1 += step_a1; \
01120         a2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
01121 \
01122         t2v.v = b2v.v; \
01123         b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
01124         b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
01125 \
01126         _mm_store_ps( ( float* )alpha2, b2v.v ); \
01127         alpha2 += step_a2; \
01128         a3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
01129 \
01130 /* ----------------------------------------------------------- */ \
01131 \
01132         a4v.v = _mm_load_ps( ( float* )alpha4 ); \
01133 \
01134         t2v.v = a2v.v; \
01135         a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
01136         a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
01137 \
01138         _mm_store_ps( ( float* )alpha3, b3v.v ); \
01139         alpha3 += step_a3; \
01140         a1v.v = _mm_load_ps( ( float* )alpha1 ); \
01141 \
01142         t3v.v = a3v.v; \
01143         a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
01144         a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
01145 \
01146         _mm_store_ps( ( float* )alpha4, a4v.v ); \
01147         alpha4 += step_a4; \
01148 \
01149         t1v.v = a1v.v; \
01150         a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
01151         a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
01152 \
01153         _mm_store_ps( ( float* )alpha1, a1v.v ); \
01154         alpha1 += step_a1; \
01155         b2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
01156 \
01157         t2v.v = a2v.v; \
01158         a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
01159         a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
01160 \
01161         _mm_store_ps( ( float* )alpha2, a2v.v ); \
01162         alpha2 += step_a2; \
01163         b3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
01164 \
01165 /* ----------------------------------------------------------- */ \
01166 \
01167         b4v.v = _mm_load_ps( ( float* )alpha4 ); \
01168 \
01169         t2v.v = b2v.v; \
01170         b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
01171         b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
01172 \
01173         _mm_store_ps( ( float* )alpha3, a3v.v ); \
01174         alpha3 += step_a3; \
01175         b1v.v = _mm_load_ps( ( float* )alpha1 ); \
01176 \
01177         t3v.v = b3v.v; \
01178         b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
01179         b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
01180 \
01181         _mm_store_ps( ( float* )alpha4, b4v.v ); \
01182         alpha4 += step_a4; \
01183 \
01184         t1v.v = b1v.v; \
01185         b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
01186         b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
01187 \
01188         _mm_store_ps( ( float* )alpha1, b1v.v ); \
01189         alpha1 += step_a1; \
01190 \
01191         t2v.v = b2v.v; \
01192         b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
01193         b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
01194 \
01195         _mm_store_ps( ( float* )alpha2, b2v.v ); \
01196         alpha2 += step_a2; \
01197 \
01198         _mm_store_ps( ( float* )alpha3, b3v.v ); \
01199         alpha3 += step_a3; \
01200 \
01201 /* ----------------------------------------------------------- */ \
01202     } \
01203 \
01204     for ( i = 0; i < n_iter2; ++i ) \
01205     { \
01206 \
01207         a2v.v = _mm_load_ps( ( float* )alpha2 ); \
01208         a3v.v = _mm_load_ps( ( float* )alpha3 ); \
01209         a4v.v = _mm_load_ps( ( float* )alpha4 ); \
01210 \
01211         t2v.v = a2v.v; \
01212         a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
01213         a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
01214 \
01215         a1v.v = _mm_load_ps( ( float* )alpha1 ); \
01216 \
01217         t3v.v = a3v.v; \
01218         a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
01219         a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
01220 \
01221         _mm_store_ps( ( float* )alpha4, a4v.v ); \
01222         alpha4 += step_a4; \
01223 \
01224         t1v.v = a1v.v; \
01225         a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
01226         a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
01227 \
01228         _mm_store_ps( ( float* )alpha1, a1v.v ); \
01229         alpha1 += step_a1; \
01230 \
01231         t2v.v = a2v.v; \
01232         a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
01233         a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
01234 \
01235         _mm_store_ps( ( float* )alpha2, a2v.v ); \
01236         alpha2 += step_a2; \
01237         _mm_store_ps( ( float* )alpha3, a3v.v ); \
01238         alpha3 += step_a3; \
01239     } \
01240 \
01241     if ( n_left == 1 ) \
01242     { \
01243         float             ga23_k1 = *gamma23_k1; \
01244         float             si23_k1 = *sigma23_k1; \
01245         float             ga34_k1 = *gamma34_k1; \
01246         float             si34_k1 = *sigma34_k1; \
01247         float             ga12_k2 = *gamma12_k2; \
01248         float             si12_k2 = *sigma12_k2; \
01249         float             ga23_k2 = *gamma23_k2; \
01250         float             si23_k2 = *sigma23_k2; \
01251         scomplex          temp1; \
01252         scomplex          temp2; \
01253         scomplex          temp3; \
01254         scomplex          temp4; \
01255 \
01256         temp2 = *alpha2; \
01257         temp3 = *alpha3; \
01258 \
01259         alpha2->real = temp2.real * ga23_k1 + temp3.real * si23_k1; \
01260         alpha3->real = temp3.real * ga23_k1 - temp2.real * si23_k1; \
01261 \
01262         alpha2->imag = temp2.imag * ga23_k1 + temp3.imag * si23_k1; \
01263         alpha3->imag = temp3.imag * ga23_k1 - temp2.imag * si23_k1; \
01264 \
01265         temp3 = *alpha3; \
01266         temp4 = *alpha4; \
01267 \
01268         alpha3->real = temp3.real * ga34_k1 + temp4.real * si34_k1; \
01269         alpha4->real = temp4.real * ga34_k1 - temp3.real * si34_k1; \
01270 \
01271         alpha3->imag = temp3.imag * ga34_k1 + temp4.imag * si34_k1; \
01272         alpha4->imag = temp4.imag * ga34_k1 - temp3.imag * si34_k1; \
01273 \
01274         temp1 = *alpha1; \
01275         temp2 = *alpha2; \
01276 \
01277         alpha1->real = temp1.real * ga12_k2 + temp2.real * si12_k2; \
01278         alpha2->real = temp2.real * ga12_k2 - temp1.real * si12_k2; \
01279 \
01280         alpha1->imag = temp1.imag * ga12_k2 + temp2.imag * si12_k2; \
01281         alpha2->imag = temp2.imag * ga12_k2 - temp1.imag * si12_k2; \
01282 \
01283         temp2 = *alpha2; \
01284         temp3 = *alpha3; \
01285 \
01286         alpha2->real = temp2.real * ga23_k2 + temp3.real * si23_k2; \
01287         alpha3->real = temp3.real * ga23_k2 - temp2.real * si23_k2; \
01288 \
01289         alpha2->imag = temp2.imag * ga23_k2 + temp3.imag * si23_k2; \
01290         alpha3->imag = temp3.imag * ga23_k2 - temp2.imag * si23_k2; \
01291 \
01292     } \
01293 }
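/*
   The double-precision complex variant below holds one dcomplex value
   (two doubles) per SSE register, so each vector operation updates a
   single row; the main loop is unrolled by eight rows per iteration.
*/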
01294 
01295 #define MAC_Apply_G_mx4s_asz( m_A, \
01296                               gamma23_k1, \
01297                               sigma23_k1, \
01298                               gamma34_k1, \
01299                               sigma34_k1, \
01300                               gamma12_k2, \
01301                               sigma12_k2, \
01302                               gamma23_k2, \
01303                               sigma23_k2, \
01304                               a1, inc_a1, \
01305                               a2, inc_a2, \
01306                               a3, inc_a3, \
01307                               a4, inc_a4 ) \
01308 {\
01309     int                n_iter = m_A / 8; \
01310     int                n_left = m_A % 8; \
01311     int                i; \
01312 \
01313     const int          step_a1 = inc_a1 * 1; \
01314     const int          step_a2 = inc_a2 * 1; \
01315     const int          step_a3 = inc_a3 * 1; \
01316     const int          step_a4 = inc_a4 * 1; \
01317 \
01318     dcomplex* restrict alpha1 = a1; \
01319     dcomplex* restrict alpha2 = a2; \
01320     dcomplex* restrict alpha3 = a3; \
01321     dcomplex* restrict alpha4 = a4; \
01322 \
01323     v2df_t             a1v, a2v, a3v, a4v; \
01324     v2df_t             b1v, b2v, b3v, b4v; \
01325     v2df_t             g23_k1v, s23_k1v; \
01326     v2df_t             g34_k1v, s34_k1v; \
01327     v2df_t             g12_k2v, s12_k2v; \
01328     v2df_t             g23_k2v, s23_k2v; \
01329     v2df_t             t1v, t2v, t3v; \
01330 \
01331     g23_k1v.v = _mm_loaddup_pd( gamma23_k1 ); \
01332     s23_k1v.v = _mm_loaddup_pd( sigma23_k1 ); \
01333     g34_k1v.v = _mm_loaddup_pd( gamma34_k1 ); \
01334     s34_k1v.v = _mm_loaddup_pd( sigma34_k1 ); \
01335     g12_k2v.v = _mm_loaddup_pd( gamma12_k2 ); \
01336     s12_k2v.v = _mm_loaddup_pd( sigma12_k2 ); \
01337     g23_k2v.v = _mm_loaddup_pd( gamma23_k2 ); \
01338     s23_k2v.v = _mm_loaddup_pd( sigma23_k2 ); \
01339 \
01340     for ( i = 0; i < n_iter; ++i ) \
01341     { \
01342 \
01343         a2v.v = _mm_load_pd( ( double* )alpha2 ); \
01344         a3v.v = _mm_load_pd( ( double* )alpha3 ); \
01345         a4v.v = _mm_load_pd( ( double* )alpha4 ); \
01346 \
01347         t2v.v = a2v.v; \
01348         a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
01349         a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
01350 \
01351         a1v.v = _mm_load_pd( ( double* )alpha1 ); \
01352 \
01353         t3v.v = a3v.v; \
01354         a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
01355         a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
01356 \
01357         _mm_store_pd( ( double* )alpha4, a4v.v ); \
01358         alpha4 += step_a4; \
01359 \
01360         t1v.v = a1v.v; \
01361         a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
01362         a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
01363 \
01364         _mm_store_pd( ( double* )alpha1, a1v.v ); \
01365         alpha1 += step_a1; \
01366         b2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
01367 \
01368         t2v.v = a2v.v; \
01369         a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
01370         a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
01371 \
01372         _mm_store_pd( ( double* )alpha2, a2v.v ); \
01373         alpha2 += step_a2; \
01374         b3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
01375 \
01376 /* ----------------------------------------------------------- */ \
01377 \
01378         b4v.v = _mm_load_pd( ( double* )alpha4 ); \
01379 \
01380         t2v.v = b2v.v; \
01381         b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
01382         b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
01383 \
01384         _mm_store_pd( ( double* )alpha3, a3v.v ); \
01385         alpha3 += step_a3; \
01386         b1v.v = _mm_load_pd( ( double* )alpha1 ); \
01387 \
01388         t3v.v = b3v.v; \
01389         b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
01390         b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
01391 \
01392         _mm_store_pd( ( double* )alpha4, b4v.v ); \
01393         alpha4 += step_a4; \
01394 \
01395         t1v.v = b1v.v; \
01396         b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
01397         b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
01398 \
01399         _mm_store_pd( ( double* )alpha1, b1v.v ); \
01400         alpha1 += step_a1; \
01401         a2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
01402 \
01403         t2v.v = b2v.v; \
01404         b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
01405         b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
01406 \
01407         _mm_store_pd( ( double* )alpha2, b2v.v ); \
01408         alpha2 += step_a2; \
01409         a3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
01410 \
01411 /* ----------------------------------------------------------- */ \
01412 \
01413         a4v.v = _mm_load_pd( ( double* )alpha4 ); \
01414 \
01415         t2v.v = a2v.v; \
01416         a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
01417         a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
01418 \
01419         _mm_store_pd( ( double* )alpha3, b3v.v ); \
01420         alpha3 += step_a3; \
01421         a1v.v = _mm_load_pd( ( double* )alpha1 ); \
01422 \
01423         t3v.v = a3v.v; \
01424         a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
01425         a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
01426 \
01427         _mm_store_pd( ( double* )alpha4, a4v.v ); \
01428         alpha4 += step_a4; \
01429 \
01430         t1v.v = a1v.v; \
01431         a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
01432         a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
01433 \
01434         _mm_store_pd( ( double* )alpha1, a1v.v ); \
01435         alpha1 += step_a1; \
01436         b2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
01437 \
01438         t2v.v = a2v.v; \
01439         a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
01440         a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
01441 \
01442         _mm_store_pd( ( double* )alpha2, a2v.v ); \
01443         alpha2 += step_a2; \
01444         b3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
01445 \
01446 /* ----------------------------------------------------------- */ \
01447 \
01448         b4v.v = _mm_load_pd( ( double* )alpha4 ); \
01449 \
01450         t2v.v = b2v.v; \
01451         b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
01452         b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
01453 \
01454         _mm_store_pd( ( double* )alpha3, a3v.v ); \
01455         alpha3 += step_a3; \
01456         b1v.v = _mm_load_pd( ( double* )alpha1 ); \
01457 \
01458         t3v.v = b3v.v; \
01459         b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
01460         b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
01461 \
01462         _mm_store_pd( ( double* )alpha4, b4v.v ); \
01463         alpha4 += step_a4; \
01464 \
01465         t1v.v = b1v.v; \
01466         b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
01467         b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
01468 \
01469         _mm_store_pd( ( double* )alpha1, b1v.v ); \
01470         alpha1 += step_a1; \
01471         a2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
01472 \
01473         t2v.v = b2v.v; \
01474         b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
01475         b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
01476 \
01477         _mm_store_pd( ( double* )alpha2, b2v.v ); \
01478         alpha2 += step_a2; \
01479         a3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
01480 \
01481 /* ----------------------------------------------------------- */ \
01482 \
01483         a4v.v = _mm_load_pd( ( double* )alpha4 ); \
01484 \
01485         t2v.v = a2v.v; \
01486         a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
01487         a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
01488 \
01489         _mm_store_pd( ( double* )alpha3, b3v.v ); \
01490         alpha3 += step_a3; \
01491         a1v.v = _mm_load_pd( ( double* )alpha1 ); \
01492 \
01493         t3v.v = a3v.v; \
01494         a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
01495         a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
01496 \
01497         _mm_store_pd( ( double* )alpha4, a4v.v ); \
01498         alpha4 += step_a4; \
01499 \
01500         t1v.v = a1v.v; \
01501         a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
01502         a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
01503 \
01504         _mm_store_pd( ( double* )alpha1, a1v.v ); \
01505         alpha1 += step_a1; \
01506         b2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
01507 \
01508         t2v.v = a2v.v; \
01509         a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
01510         a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
01511 \
01512         _mm_store_pd( ( double* )alpha2, a2v.v ); \
01513         alpha2 += step_a2; \
01514         b3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
01515 \
01516 /* ----------------------------------------------------------- */ \
01517 \
01518         b4v.v = _mm_load_pd( ( double* )alpha4 ); \
01519 \
01520         t2v.v = b2v.v; \
01521         b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
01522         b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
01523 \
01524         _mm_store_pd( ( double* )alpha3, a3v.v ); \
01525         alpha3 += step_a3; \
01526         b1v.v = _mm_load_pd( ( double* )alpha1 ); \
01527 \
01528         t3v.v = b3v.v; \
01529         b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
01530         b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
01531 \
01532         _mm_store_pd( ( double* )alpha4, b4v.v ); \
01533         alpha4 += step_a4; \
01534 \
01535         t1v.v = b1v.v; \
01536         b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
01537         b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
01538 \
01539         _mm_store_pd( ( double* )alpha1, b1v.v ); \
01540         alpha1 += step_a1; \
01541         a2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
01542 \
01543         t2v.v = b2v.v; \
01544         b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
01545         b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
01546 \
01547         _mm_store_pd( ( double* )alpha2, b2v.v ); \
01548         alpha2 += step_a2; \
01549         a3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
01550 \
01551 /* ----------------------------------------------------------- */ \
01552 \
01553         a4v.v = _mm_load_pd( ( double* )alpha4 ); \
01554 \
01555         t2v.v = a2v.v; \
01556         a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
01557         a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
01558 \
01559         _mm_store_pd( ( double* )alpha3, b3v.v ); \
01560         alpha3 += step_a3; \
01561         a1v.v = _mm_load_pd( ( double* )alpha1 ); \
01562 \
01563         t3v.v = a3v.v; \
01564         a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
01565         a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
01566 \
01567         _mm_store_pd( ( double* )alpha4, a4v.v ); \
01568         alpha4 += step_a4; \
01569 \
01570         t1v.v = a1v.v; \
01571         a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
01572         a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
01573 \
01574         _mm_store_pd( ( double* )alpha1, a1v.v ); \
01575         alpha1 += step_a1; \
01576         b2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
01577 \
01578         t2v.v = a2v.v; \
01579         a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
01580         a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
01581 \
01582         _mm_store_pd( ( double* )alpha2, a2v.v ); \
01583         alpha2 += step_a2; \
01584         b3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
01585 \
01586 /* ----------------------------------------------------------- */ \
01587 \
01588         b4v.v = _mm_load_pd( ( double* )alpha4 ); \
01589 \
01590         t2v.v = b2v.v; \
01591         b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
01592         b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
01593 \
01594         _mm_store_pd( ( double* )alpha3, a3v.v ); \
01595         alpha3 += step_a3; \
01596         b1v.v = _mm_load_pd( ( double* )alpha1 ); \
01597 \
01598         t3v.v = b3v.v; \
01599         b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
01600         b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
01601 \
01602         _mm_store_pd( ( double* )alpha4, b4v.v ); \
01603         alpha4 += step_a4; \
01604 \
01605         t1v.v = b1v.v; \
01606         b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
01607         b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
01608 \
01609         _mm_store_pd( ( double* )alpha1, b1v.v ); \
01610         alpha1 += step_a1; \
01611 \
01612         t2v.v = b2v.v; \
01613         b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
01614         b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
01615 \
01616         _mm_store_pd( ( double* )alpha2, b2v.v ); \
01617         alpha2 += step_a2; \
01618 \
01619         _mm_store_pd( ( double* )alpha3, b3v.v ); \
01620         alpha3 += step_a3; \
01621 \
01622 /* ----------------------------------------------------------- */ \
01623     } \
01624 \
01625     for ( i = 0; i < n_left; ++i ) /* remainder loop: two rows per pass */ \
01626     { \
01627 \
01628         a2v.v = _mm_load_pd( ( double* )alpha2 ); \
01629         a3v.v = _mm_load_pd( ( double* )alpha3 ); \
01630         a4v.v = _mm_load_pd( ( double* )alpha4 ); \
01631 \
01632         t2v.v = a2v.v; \
01633         a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
01634         a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
01635 \
01636         a1v.v = _mm_load_pd( ( double* )alpha1 ); \
01637 \
01638         t3v.v = a3v.v; \
01639         a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
01640         a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
01641 \
01642         _mm_store_pd( ( double* )alpha4, a4v.v ); \
01643         alpha4 += step_a4; \
01644 \
01645         t1v.v = a1v.v; \
01646         a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
01647         a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
01648 \
01649         _mm_store_pd( ( double* )alpha1, a1v.v ); \
01650         alpha1 += step_a1; \
01651 \
01652         t2v.v = a2v.v; \
01653         a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
01654         a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
01655 \
01656         _mm_store_pd( ( double* )alpha2, a2v.v ); \
01657         alpha2 += step_a2; \
01658         _mm_store_pd( ( double* )alpha3, a3v.v ); \
01659         alpha3 += step_a3; \
01660     } \
01661 }
01662 
01663 #endif
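
/*
   For reference (not part of libflame): each vectorized block in the loops
   above applies the same four plane rotations to the four column entries of
   a row. A minimal scalar equivalent of that per-row update, assuming the
   gamma/sigma arguments carry the cosine/sine of each Givens rotation, could
   be written as follows (the function name is illustrative only):

       static void apply_g_mx4s_row_ref( double g23_k1, double s23_k1,
                                         double g34_k1, double s34_k1,
                                         double g12_k2, double s12_k2,
                                         double g23_k2, double s23_k2,
                                         double* a1, double* a2,
                                         double* a3, double* a4 )
       {
           double t;

           // Rotation from sweep k-1 applied to columns 2 and 3.
           t = *a2; *a2 = t * g23_k1 + *a3 * s23_k1; *a3 = *a3 * g23_k1 - t * s23_k1;

           // Rotation from sweep k-1 applied to columns 3 and 4.
           t = *a3; *a3 = t * g34_k1 + *a4 * s34_k1; *a4 = *a4 * g34_k1 - t * s34_k1;

           // Rotation from sweep k applied to columns 1 and 2.
           t = *a1; *a1 = t * g12_k2 + *a2 * s12_k2; *a2 = *a2 * g12_k2 - t * s12_k2;

           // Rotation from sweep k applied to columns 2 and 3.
           t = *a2; *a2 = t * g23_k2 + *a3 * s23_k2; *a3 = *a3 * g23_k2 - t * s23_k2;
       }

   The SSE macro above performs exactly this arithmetic two rows at a time
   (one _mm_load_pd per column), unrolled eight-fold per loop pass and
   software-pipelined so that the loads for the next pair of rows overlap
   the stores of the previous pair.
*/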