FLA_Apply_G_mx2_asm.h
/*
   libflame
   An object-based infrastructure for developing high-performance
   dense linear algebra libraries.

   Copyright (C) 2011, The University of Texas

   libflame is free software; you can redistribute it and/or modify
   it under the terms of the GNU Lesser General Public License as
   published by the Free Software Foundation; either version 2.1 of
   the License, or (at your option) any later version.

   libflame is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with libflame; if you did not receive a copy, see
   http://www.gnu.org/licenses/.

   For more information, please contact us at flame@cs.utexas.edu or
   send mail to:

   Field G. Van Zee and/or
   Robert A. van de Geijn
   The University of Texas at Austin
   Department of Computer Sciences
   1 University Station C0500
   Austin TX 78712
*/

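/*
   The MAC_Apply_G_mx2_* macros in this file apply a single Givens
   rotation

       [ alpha1 ]    [  gamma12  sigma12 ] [ alpha1 ]
       [ alpha2 ] := [ -sigma12  gamma12 ] [ alpha2 ]

   elementwise down two length-m_A columns a1 and a2, in single real (s),
   double real (d), single complex (c), and double complex (z) flavors;
   this is the update computed literally by the scalar cleanup code below.
   The SSE paths use aligned loads and stores (_mm_load_ps/_mm_load_pd,
   _mm_store_ps/_mm_store_pd), so a1 and a2 appear to be assumed 16-byte
   aligned with unit increment.
*/
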
#if FLA_VECTOR_INTRINSIC_TYPE == FLA_NO_INTRINSICS

/* Without vector intrinsics, fall back to the portable
   MAC_Apply_G_mx2_op* variants. */
#define MAC_Apply_G_mx2_ass MAC_Apply_G_mx2_ops
#define MAC_Apply_G_mx2_asd MAC_Apply_G_mx2_opd
#define MAC_Apply_G_mx2_asc MAC_Apply_G_mx2_opc
#define MAC_Apply_G_mx2_asz MAC_Apply_G_mx2_opz

#elif FLA_VECTOR_INTRINSIC_TYPE == FLA_SSE_INTRINSICS

/* Single-precision real flavor: an 8x-unrolled loop of 4-wide SSE
   updates (32 elements per iteration), then a 4-wide loop, then a
   scalar cleanup loop. */
#define MAC_Apply_G_mx2_ass( m_A, \
                             gamma12, \
                             sigma12, \
                             a1, inc_a1, \
                             a2, inc_a2 ) \
{\
    int              n_iter32  = m_A / ( 4 * 8 ); \
    int              n_left32  = m_A % ( 4 * 8 ); \
    int              n_iter4   = n_left32 / ( 4 * 1 ); \
    int              n_left    = n_left32 % ( 4 * 1 ); \
    int              i; \
\
    const int        step_a1 = inc_a1 * 4; \
    const int        step_a2 = inc_a2 * 4; \
\
    float*  restrict alpha1 = a1; \
    float*  restrict alpha2 = a2; \
\
    v4sf_t           a1v, a2v; \
    v4sf_t           g12v, s12v; \
    v4sf_t           t1v; \
\
    g12v.v = _mm_load1_ps( gamma12 ); \
    s12v.v = _mm_load1_ps( sigma12 ); \
\
    for ( i = 0; i < n_iter32; ++i ) \
    { \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        /* a1 := gamma12 * a1 + sigma12 * a2;  a2 := gamma12 * a2 - sigma12 * a1 */ \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
    } \
\
    for ( i = 0; i < n_iter4; ++i ) \
    { \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
    } \
\
    for ( i = 0; i < n_left; ++i ) \
    { \
        float ga12 = *gamma12; \
        float si12 = *sigma12; \
        float temp1; \
        float temp2; \
\
        temp1 = *alpha1; \
        temp2 = *alpha2; \
\
        *alpha1 = temp1 * ga12 + temp2 * si12; \
        *alpha2 = temp2 * ga12 - temp1 * si12; \
\
        alpha1 += 1; \
        alpha2 += 1; \
    } \
}

/* Double-precision real flavor: an 8x-unrolled loop of 2-wide SSE
   updates (16 elements per iteration), then a 2-wide loop, then at
   most one scalar update. */
#define MAC_Apply_G_mx2_asd( m_A, \
                             gamma12, \
                             sigma12, \
                             a1, inc_a1, \
                             a2, inc_a2 ) \
{\
    int              n_iter16  = m_A / ( 2 * 8 ); \
    int              n_left16  = m_A % ( 2 * 8 ); \
    int              n_iter2   = n_left16 / ( 2 * 1 ); \
    int              n_left    = n_left16 % ( 2 * 1 ); \
    int              i; \
\
    const int        step_a1 = inc_a1 * 2; \
    const int        step_a2 = inc_a2 * 2; \
\
    double* restrict alpha1 = a1; \
    double* restrict alpha2 = a2; \
\
    v2df_t           a1v, a2v; \
    v2df_t           g12v, s12v; \
    v2df_t           t1v; \
\
    g12v.v = _mm_loaddup_pd( gamma12 ); \
    s12v.v = _mm_loaddup_pd( sigma12 ); \
\
    for ( i = 0; i < n_iter16; ++i ) \
    { \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        /* a1 := gamma12 * a1 + sigma12 * a2;  a2 := gamma12 * a2 - sigma12 * a1 */ \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
    } \
\
    for ( i = 0; i < n_iter2; ++i ) \
    { \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
    } \
\
    if ( n_left == 1 ) \
    { \
        double ga12 = *gamma12; \
        double si12 = *sigma12; \
        double temp1; \
        double temp2; \
\
        temp1 = *alpha1; \
        temp2 = *alpha2; \
\
        *alpha1 = temp1 * ga12 + temp2 * si12; \
        *alpha2 = temp2 * ga12 - temp1 * si12; \
    } \
}

/* Single-precision complex flavor: each 4-wide SSE vector holds two
   scomplex values, and the broadcast gamma12/sigma12 scale real and
   imaginary parts alike; an 8x-unrolled loop (16 elements per
   iteration), then a 2-element loop, then at most one scalar update. */
#define MAC_Apply_G_mx2_asc( m_A, \
                             gamma12, \
                             sigma12, \
                             a1, inc_a1, \
                             a2, inc_a2 ) \
{\
    int                n_iter16  = m_A / ( 2 * 8 ); \
    int                n_left16  = m_A % ( 2 * 8 ); \
    int                n_iter2   = n_left16 / ( 2 * 1 ); \
    int                n_left    = n_left16 % ( 2 * 1 ); \
    int                i; \
\
    const int          step_a1 = inc_a1 * 2; \
    const int          step_a2 = inc_a2 * 2; \
\
    scomplex* restrict alpha1 = a1; \
    scomplex* restrict alpha2 = a2; \
\
    v4sf_t             a1v, a2v; \
    v4sf_t             g12v, s12v; \
    v4sf_t             t1v; \
\
    g12v.v = _mm_load1_ps( gamma12 ); \
    s12v.v = _mm_load1_ps( sigma12 ); \
\
    for ( i = 0; i < n_iter16; ++i ) \
    { \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        /* a1 := gamma12 * a1 + sigma12 * a2;  a2 := gamma12 * a2 - sigma12 * a1 */ \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
    } \
\
    for ( i = 0; i < n_iter2; ++i ) \
    { \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
    } \
\
    if ( n_left == 1 ) \
    { \
        float    ga12 = *gamma12; \
        float    si12 = *sigma12; \
        scomplex temp1; \
        scomplex temp2; \
\
        temp1 = *alpha1; \
        temp2 = *alpha2; \
\
        alpha1->real = temp1.real * ga12 + temp2.real * si12; \
        alpha2->real = temp2.real * ga12 - temp1.real * si12; \
\
        alpha1->imag = temp1.imag * ga12 + temp2.imag * si12; \
        alpha2->imag = temp2.imag * ga12 - temp1.imag * si12; \
    } \
}

/* Double-precision complex flavor: each 2-wide SSE vector holds one
   dcomplex value, so both the 8x-unrolled main loop and the cleanup
   loop operate on whole vectors. */
#define MAC_Apply_G_mx2_asz( m_A, \
                             gamma12, \
                             sigma12, \
                             a1, inc_a1, \
                             a2, inc_a2 ) \
{\
    int                n_iter  = m_A / 8; \
    int                n_left  = m_A % 8; \
    int                i; \
\
    const int          step_a1 = inc_a1 * 1; \
    const int          step_a2 = inc_a2 * 1; \
\
    dcomplex* restrict alpha1 = a1; \
    dcomplex* restrict alpha2 = a2; \
\
    v2df_t             a1v, a2v; \
    v2df_t             g12v, s12v; \
    v2df_t             t1v; \
\
    g12v.v = _mm_loaddup_pd( gamma12 ); \
    s12v.v = _mm_loaddup_pd( sigma12 ); \
\
    for ( i = 0; i < n_iter; ++i ) \
    { \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        /* a1 := gamma12 * a1 + sigma12 * a2;  a2 := gamma12 * a2 - sigma12 * a1 */ \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
    } \
\
    for ( i = 0; i < n_left; ++i ) \
    { \
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
    } \
}

#endif
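
A minimal usage sketch for MAC_Apply_G_mx2_asd, assuming a build in which FLA_VECTOR_INTRINSIC_TYPE == FLA_SSE_INTRINSICS (so the SSE definitions above are active), libflame's v2df_t union type is in scope, and a GCC/Clang-style compiler that supports arithmetic operators on vector types (e.g. -std=c11 -msse3). The values chosen and the aligned_alloc calls are illustrative assumptions, not libflame API:

#include <stdio.h>
#include <stdlib.h>
#include <pmmintrin.h> /* _mm_loaddup_pd (SSE3), _mm_load_pd, _mm_store_pd;
                          normally pulled in via libflame's own headers */

int main( void )
{
    int     m = 37;      /* exercises the 16-, 2-, and 1-element code paths */
    double  gamma = 0.8; /* gamma^2 + sigma^2 = 1 for a proper Givens rotation */
    double  sigma = 0.6;
    int     i;

    /* 16-byte alignment for _mm_load_pd/_mm_store_pd; the size is padded
       to a multiple of the alignment, as C11 aligned_alloc requires. */
    double* a1 = aligned_alloc( 16, 40 * sizeof( double ) );
    double* a2 = aligned_alloc( 16, 40 * sizeof( double ) );

    for ( i = 0; i < m; ++i ) { a1[ i ] = 1.0; a2[ i ] = 2.0; }

    /* Unit increments: the vector loads consume consecutive elements. */
    MAC_Apply_G_mx2_asd( m, &gamma, &sigma, a1, 1, a2, 1 );

    /* Each pair becomes ( 0.8*1 + 0.6*2, 0.8*2 - 0.6*1 ) = ( 2.0, 1.0 ). */
    printf( "a1[0] = %4.1f, a2[0] = %4.1f\n", a1[ 0 ], a2[ 0 ] );

    free( a1 );
    free( a2 );
    return 0;
}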