libflame
FLA_Apply_G_mx3b_asm.h
/*
   libflame
   An object-based infrastructure for developing high-performance
   dense linear algebra libraries.

   Copyright (C) 2011, The University of Texas

   libflame is free software; you can redistribute it and/or modify
   it under the terms of the GNU Lesser General Public License as
   published by the Free Software Foundation; either version 2.1 of
   the License, or (at your option) any later version.

   libflame is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with libflame; if you did not receive a copy, see
   http://www.gnu.org/licenses/.

   For more information, please contact us at flame@cs.utexas.edu or
   send mail to:

   Field G. Van Zee and/or
   Robert A. van de Geijn
   The University of Texas at Austin
   Department of Computer Sciences
   1 University Station C0500
   Austin TX 78712
*/

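/*
   The MAC_Apply_G_mx3b_as? macros below apply two Givens rotations to the
   m_A rows of three column vectors a1, a2, and a3, where each gamma is the
   cosine and each sigma the sine of a rotation angle. For every row, the
   (2,3) rotation is applied first and the (1,2) rotation second, using the
   updated a2:

     t2 = a2;  a2 = gamma23 * t2 + sigma23 * a3;  a3 = gamma23 * a3 - sigma23 * t2;
     t1 = a1;  a1 = gamma12 * t1 + sigma12 * a2;  a2 = gamma12 * a2 - sigma12 * t1;

   The suffix letter selects the element type: s (float), d (double),
   c (scomplex), and z (dcomplex). Despite the "as" (assembly) naming, the
   SSE branch below is written with compiler intrinsics.
*/
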
#if FLA_VECTOR_INTRINSIC_TYPE == FLA_NO_INTRINSICS

#define MAC_Apply_G_mx3b_ass MAC_Apply_G_mx3b_ops
#define MAC_Apply_G_mx3b_asd MAC_Apply_G_mx3b_opd
#define MAC_Apply_G_mx3b_asc MAC_Apply_G_mx3b_opc
#define MAC_Apply_G_mx3b_asz MAC_Apply_G_mx3b_opz

#elif FLA_VECTOR_INTRINSIC_TYPE == FLA_SSE_INTRINSICS

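/*
   Single-precision real variant: four rows are processed per 128-bit SSE
   register. The first loop below is unrolled eight times (32 rows per
   iteration), the second loop handles remaining groups of four rows, and
   the final loop applies the rotations to any leftover rows one at a time
   in scalar arithmetic.
*/
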
#define MAC_Apply_G_mx3b_ass( m_A, \
                              gamma12, \
                              sigma12, \
                              gamma23, \
                              sigma23, \
                              a1, inc_a1, \
                              a2, inc_a2, \
                              a3, inc_a3 ) \
{\
    int              n_iter32 = m_A / ( 4 * 8 ); \
    int              n_left32 = m_A % ( 4 * 8 ); \
    int              n_iter4  = n_left32 / ( 4 * 1 ); \
    int              n_left   = n_left32 % ( 4 * 1 ); \
    int              i; \
\
    const int        step_a1 = inc_a1 * 4; \
    const int        step_a2 = inc_a2 * 4; \
    const int        step_a3 = inc_a3 * 4; \
\
    float*  restrict alpha1 = a1; \
    float*  restrict alpha2 = a2; \
    float*  restrict alpha3 = a3; \
\
    v4sf_t           a1v, a2v, a3v; \
    v4sf_t           g12v, s12v; \
    v4sf_t           g23v, s23v; \
    v4sf_t           t1v, t2v; \
\
    g12v.v = _mm_load1_ps( gamma12 ); \
    s12v.v = _mm_load1_ps( sigma12 ); \
    g23v.v = _mm_load1_ps( gamma23 ); \
    s23v.v = _mm_load1_ps( sigma23 ); \
\
    for ( i = 0; i < n_iter32; ++i ) \
    { \
\
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
\
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
\
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
\
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
\
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
\
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
\
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
\
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
    } \
\
    for ( i = 0; i < n_iter4; ++i ) \
    { \
\
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
    } \
\
    for ( i = 0; i < n_left; ++i ) \
    { \
        float ga12 = *gamma12; \
        float si12 = *sigma12; \
        float ga23 = *gamma23; \
        float si23 = *sigma23; \
        float temp1; \
        float temp2; \
        float temp3; \
\
        temp2 = *alpha2; \
        temp3 = *alpha3; \
\
        *alpha2 = temp2 * ga23 + temp3 * si23; \
        *alpha3 = temp3 * ga23 - temp2 * si23; \
\
        temp1 = *alpha1; \
        temp2 = *alpha2; \
\
        *alpha1 = temp1 * ga12 + temp2 * si12; \
        *alpha2 = temp2 * ga12 - temp1 * si12; \
\
        alpha1 += 1; \
        alpha2 += 1; \
        alpha3 += 1; \
    } \
}

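/*
   Double-precision real variant: two rows are processed per SSE register.
   The first loop is unrolled eight times (16 rows per iteration), the
   second loop handles remaining pairs of rows, and at most one leftover
   row is handled in scalar arithmetic.
*/
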
#define MAC_Apply_G_mx3b_asd( m_A, \
                              gamma12, \
                              sigma12, \
                              gamma23, \
                              sigma23, \
                              a1, inc_a1, \
                              a2, inc_a2, \
                              a3, inc_a3 ) \
{\
    int              n_iter16 = m_A / ( 2 * 8 ); \
    int              n_left16 = m_A % ( 2 * 8 ); \
    int              n_iter2  = n_left16 / ( 2 * 1 ); \
    int              n_left   = n_left16 % ( 2 * 1 ); \
    int              i; \
\
    const int        step_a1 = inc_a1 * 2; \
    const int        step_a2 = inc_a2 * 2; \
    const int        step_a3 = inc_a3 * 2; \
\
    double* restrict alpha1 = a1; \
    double* restrict alpha2 = a2; \
    double* restrict alpha3 = a3; \
\
    v2df_t           a1v, a2v, a3v; \
    v2df_t           g12v, s12v; \
    v2df_t           g23v, s23v; \
    v2df_t           t1v, t2v; \
\
    g12v.v = _mm_loaddup_pd( gamma12 ); \
    s12v.v = _mm_loaddup_pd( sigma12 ); \
    g23v.v = _mm_loaddup_pd( gamma23 ); \
    s23v.v = _mm_loaddup_pd( sigma23 ); \
\
    for ( i = 0; i < n_iter16; ++i ) \
    { \
\
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
\
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
\
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
\
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
\
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
\
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
\
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
\
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
    } \
\
    for ( i = 0; i < n_iter2; ++i ) \
    { \
\
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
    } \
\
    if ( n_left == 1 ) \
    { \
        double ga12 = *gamma12; \
        double si12 = *sigma12; \
        double ga23 = *gamma23; \
        double si23 = *sigma23; \
        double temp1; \
        double temp2; \
        double temp3; \
\
        temp2 = *alpha2; \
        temp3 = *alpha3; \
\
        *alpha2 = temp2 * ga23 + temp3 * si23; \
        *alpha3 = temp3 * ga23 - temp2 * si23; \
\
        temp1 = *alpha1; \
        temp2 = *alpha2; \
\
        *alpha1 = temp1 * ga12 + temp2 * si12; \
        *alpha2 = temp2 * ga12 - temp1 * si12; \
    } \
}

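/*
   Single-precision complex variant: two rows (four floats) are processed
   per SSE register. The real-valued rotation coefficients are broadcast to
   all four lanes, so the real and imaginary parts of each element are
   scaled identically. The first loop is unrolled eight times (16 rows per
   iteration), the second loop handles remaining pairs of rows, and at most
   one leftover row is handled in scalar arithmetic.
*/
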
#define MAC_Apply_G_mx3b_asc( m_A, \
                              gamma12, \
                              sigma12, \
                              gamma23, \
                              sigma23, \
                              a1, inc_a1, \
                              a2, inc_a2, \
                              a3, inc_a3 ) \
{\
    int                n_iter16 = m_A / ( 2 * 8 ); \
    int                n_left16 = m_A % ( 2 * 8 ); \
    int                n_iter2  = n_left16 / ( 2 * 1 ); \
    int                n_left   = n_left16 % ( 2 * 1 ); \
    int                i; \
\
    const int          step_a1 = inc_a1 * 2; \
    const int          step_a2 = inc_a2 * 2; \
    const int          step_a3 = inc_a3 * 2; \
\
    scomplex* restrict alpha1 = a1; \
    scomplex* restrict alpha2 = a2; \
    scomplex* restrict alpha3 = a3; \
\
    v4sf_t             a1v, a2v, a3v; \
    v4sf_t             g12v, s12v; \
    v4sf_t             g23v, s23v; \
    v4sf_t             t1v, t2v; \
\
    g12v.v = _mm_load1_ps( gamma12 ); \
    s12v.v = _mm_load1_ps( sigma12 ); \
    g23v.v = _mm_load1_ps( gamma23 ); \
    s23v.v = _mm_load1_ps( sigma23 ); \
\
    for ( i = 0; i < n_iter16; ++i ) \
    { \
\
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
\
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
\
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
\
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
\
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
\
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
\
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
\
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
    } \
\
    for ( i = 0; i < n_iter2; ++i ) \
    { \
\
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
    } \
\
    if ( n_left == 1 ) \
    { \
        float ga12 = *gamma12; \
        float si12 = *sigma12; \
        float ga23 = *gamma23; \
        float si23 = *sigma23; \
        scomplex temp1; \
        scomplex temp2; \
        scomplex temp3; \
\
        temp2 = *alpha2; \
        temp3 = *alpha3; \
\
        alpha2->real = temp2.real * ga23 + temp3.real * si23; \
        alpha3->real = temp3.real * ga23 - temp2.real * si23; \
\
        alpha2->imag = temp2.imag * ga23 + temp3.imag * si23; \
        alpha3->imag = temp3.imag * ga23 - temp2.imag * si23; \
\
        temp1 = *alpha1; \
        temp2 = *alpha2; \
\
        alpha1->real = temp1.real * ga12 + temp2.real * si12; \
        alpha2->real = temp2.real * ga12 - temp1.real * si12; \
\
        alpha1->imag = temp1.imag * ga12 + temp2.imag * si12; \
        alpha2->imag = temp2.imag * ga12 - temp1.imag * si12; \
    } \
}

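/*
   Double-precision complex variant: one row (one dcomplex) is processed
   per SSE register, with the real-valued rotation coefficients duplicated
   into both lanes. The first loop is unrolled eight times (8 rows per
   iteration); the second loop applies the rotations to the remaining rows
   one at a time, still using vector arithmetic.
*/
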
#define MAC_Apply_G_mx3b_asz( m_A, \
                              gamma12, \
                              sigma12, \
                              gamma23, \
                              sigma23, \
                              a1, inc_a1, \
                              a2, inc_a2, \
                              a3, inc_a3 ) \
{\
    int                n_iter = m_A / 8; \
    int                n_left = m_A % 8; \
    int                i; \
\
    const int          step_a1 = inc_a1 * 1; \
    const int          step_a2 = inc_a2 * 1; \
    const int          step_a3 = inc_a3 * 1; \
\
    dcomplex* restrict alpha1 = a1; \
    dcomplex* restrict alpha2 = a2; \
    dcomplex* restrict alpha3 = a3; \
\
    v2df_t             a1v, a2v, a3v; \
    v2df_t             g12v, s12v; \
    v2df_t             g23v, s23v; \
    v2df_t             t1v, t2v; \
\
    g12v.v = _mm_loaddup_pd( gamma12 ); \
    s12v.v = _mm_loaddup_pd( sigma12 ); \
    g23v.v = _mm_loaddup_pd( gamma23 ); \
    s23v.v = _mm_loaddup_pd( sigma23 ); \
\
    for ( i = 0; i < n_iter; ++i ) \
    { \
\
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
\
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
\
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
\
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
\
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
\
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
\
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
\
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
    } \
\
    for ( i = 0; i < n_left; ++i ) \
    { \
\
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
    } \
}

#endif
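
For reference, the following sketch reduces the macros above to plain scalar C, showing the operation they implement on three double-precision columns. It is only an illustration under the assumption of unit column stride; the function name is invented for this example and is not part of the libflame API.

#include <stddef.h>

/* Illustrative scalar equivalent of MAC_Apply_G_mx3b_asd: apply the (2,3)
   rotation and then the (1,2) rotation to every row of three columns.
   This helper exists only for exposition. */
static void apply_g_mx3b_ref( size_t m_A,
                              double gamma12, double sigma12,
                              double gamma23, double sigma23,
                              double* a1, double* a2, double* a3 )
{
    size_t i;

    for ( i = 0; i < m_A; ++i )
    {
        double temp1, temp2, temp3;

        /* Rotate the ( a2, a3 ) pair. */
        temp2 = a2[ i ];
        temp3 = a3[ i ];
        a2[ i ] = temp2 * gamma23 + temp3 * sigma23;
        a3[ i ] = temp3 * gamma23 - temp2 * sigma23;

        /* Rotate the ( a1, a2 ) pair, using the updated a2. */
        temp1 = a1[ i ];
        temp2 = a2[ i ];
        a1[ i ] = temp1 * gamma12 + temp2 * sigma12;
        a2[ i ] = temp2 * gamma12 - temp1 * sigma12;
    }
}

Invoking one of the real macros, e.g. MAC_Apply_G_mx3b_asd( m, &gamma12, &sigma12, &gamma23, &sigma23, a1, 1, a2, 1, a3, 1 ), should produce the same result as this reference loop, provided the columns satisfy the alignment requirements of the aligned SSE loads and stores used above.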