libflame
FLA_Apply_G_mx3_asm.h
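This header defines the "asm" variants of the MAC_Apply_G_mx3 kernels, which apply two Givens rotations, G12 (gamma12, sigma12) and G23 (gamma23, sigma23), across three columns a1, a2, a3 of length m_A. When FLA_VECTOR_INTRINSIC_TYPE is FLA_NO_INTRINSICS the names simply alias the portable "op" kernels; when it is FLA_SSE_INTRINSICS, SSE kernels are defined for single real (ass), double real (asd), single complex (asc), and double complex (asz) elements, each processing the bulk of the rows with unrolled vector loops and handling any remainder with scalar or single-vector code.

For reference, a minimal scalar sketch of the per-row update each kernel performs (a hypothetical helper for illustration only, not part of libflame; the real kernels below operate on whole columns):

    static void apply_g_mx3_row( double gamma12, double sigma12,
                                 double gamma23, double sigma23,
                                 double* alpha1, double* alpha2, double* alpha3 )
    {
        double temp1 = *alpha1;
        double temp2 = *alpha2;
        double temp3;

        /* Apply G12 to the pair ( alpha1, alpha2 ). */
        *alpha1 = temp1 * gamma12 + temp2 * sigma12;
        *alpha2 = temp2 * gamma12 - temp1 * sigma12;

        temp2 = *alpha2;
        temp3 = *alpha3;

        /* Apply G23 to the pair ( alpha2, alpha3 ). */
        *alpha2 = temp2 * gamma23 + temp3 * sigma23;
        *alpha3 = temp3 * gamma23 - temp2 * sigma23;
    }
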
00001 /*
00002    libflame
00003    An object-based infrastructure for developing high-performance
00004    dense linear algebra libraries.
00005 
00006    Copyright (C) 2011, The University of Texas
00007 
00008    libflame is free software; you can redistribute it and/or modify
00009    it under the terms of the GNU Lesser General Public License as
00010    published by the Free Software Foundation; either version 2.1 of
00011    the License, or (at your option) any later version.
00012 
00013    libflame is distributed in the hope that it will be useful, but
00014    WITHOUT ANY WARRANTY; without even the implied warranty of
00015    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
00016    Lesser General Public License for more details.
00017 
00018    You should have received a copy of the GNU Lesser General Public
00019    License along with libflame; if you did not receive a copy, see
00020    http://www.gnu.org/licenses/.
00021 
00022    For more information, please contact us at flame@cs.utexas.edu or
00023    send mail to:
00024 
00025    Field G. Van Zee and/or
00026    Robert A. van de Geijn
00027    The University of Texas at Austin
00028    Department of Computer Sciences
00029    1 University Station C0500
00030    Austin TX 78712
00031 */
00032 
00033 
00034 #if FLA_VECTOR_INTRINSIC_TYPE == FLA_NO_INTRINSICS
00035 
00036 #define MAC_Apply_G_mx3_ass MAC_Apply_G_mx3_ops
00037 #define MAC_Apply_G_mx3_asd MAC_Apply_G_mx3_opd
00038 #define MAC_Apply_G_mx3_asc MAC_Apply_G_mx3_opc
00039 #define MAC_Apply_G_mx3_asz MAC_Apply_G_mx3_opz
00040 
00041 #elif FLA_VECTOR_INTRINSIC_TYPE == FLA_SSE_INTRINSICS
00042 
00043 #define MAC_Apply_G_mx3_ass( m_A, \
00044                              gamma12, \
00045                              sigma12, \
00046                              gamma23, \
00047                              sigma23, \
00048                              a1, inc_a1, \
00049                              a2, inc_a2, \
00050                              a3, inc_a3 ) \
00051 {\
00052     int              n_iter32 = m_A / ( 4 * 8 ); \
00053     int              n_left32 = m_A % ( 4 * 8 ); \
00054     int              n_iter4  = n_left32 / ( 4 * 1 ); \
00055     int              n_left   = n_left32 % ( 4 * 1 ); \
00056     int              i; \
00057 \
00058     const int        step_a1 = inc_a1 * 4; \
00059     const int        step_a2 = inc_a2 * 4; \
00060     const int        step_a3 = inc_a3 * 4; \
00061 \
00062     float* restrict alpha1 = a1; \
00063     float* restrict alpha2 = a2; \
00064     float* restrict alpha3 = a3; \
00065 \
00066     v4sf_t    a1v, a2v, a3v; \
00067     v4sf_t    g12v, s12v; \
00068     v4sf_t    g23v, s23v; \
00069     v4sf_t    t1v, t2v; \
00070 \
00071     g12v.v = _mm_load1_ps( gamma12 ); \
00072     s12v.v = _mm_load1_ps( sigma12 ); \
00073     g23v.v = _mm_load1_ps( gamma23 ); \
00074     s23v.v = _mm_load1_ps( sigma23 ); \
00075 \
00076     for ( i = 0; i < n_iter32; ++i ) \
00077     { \
00078 \
00079         a1v.v = _mm_load_ps( ( float* )alpha1 ); \
00080         a2v.v = _mm_load_ps( ( float* )alpha2 ); \
00081 \
00082         t1v.v = a1v.v; \
00083         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00084         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00085 \
00086         a3v.v = _mm_load_ps( ( float* )alpha3 ); \
00087         _mm_store_ps( ( float* )alpha1, a1v.v ); \
00088         alpha1 += step_a1; \
00089 \
00090         t2v.v = a2v.v; \
00091         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00092         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00093 \
00094         _mm_store_ps( ( float* )alpha2, a2v.v ); \
00095         alpha2 += step_a2; \
00096         _mm_store_ps( ( float* )alpha3, a3v.v ); \
00097         alpha3 += step_a3; \
00098 \
00099         a1v.v = _mm_load_ps( ( float* )alpha1 ); \
00100         a2v.v = _mm_load_ps( ( float* )alpha2 ); \
00101 \
00102         t1v.v = a1v.v; \
00103         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00104         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00105 \
00106         a3v.v = _mm_load_ps( ( float* )alpha3 ); \
00107         _mm_store_ps( ( float* )alpha1, a1v.v ); \
00108         alpha1 += step_a1; \
00109 \
00110         t2v.v = a2v.v; \
00111         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00112         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00113 \
00114         _mm_store_ps( ( float* )alpha2, a2v.v ); \
00115         alpha2 += step_a2; \
00116         _mm_store_ps( ( float* )alpha3, a3v.v ); \
00117         alpha3 += step_a3; \
00118 \
00119         a1v.v = _mm_load_ps( ( float* )alpha1 ); \
00120         a2v.v = _mm_load_ps( ( float* )alpha2 ); \
00121 \
00122         t1v.v = a1v.v; \
00123         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00124         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00125 \
00126         a3v.v = _mm_load_ps( ( float* )alpha3 ); \
00127         _mm_store_ps( ( float* )alpha1, a1v.v ); \
00128         alpha1 += step_a1; \
00129 \
00130         t2v.v = a2v.v; \
00131         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00132         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00133 \
00134         _mm_store_ps( ( float* )alpha2, a2v.v ); \
00135         alpha2 += step_a2; \
00136         _mm_store_ps( ( float* )alpha3, a3v.v ); \
00137         alpha3 += step_a3; \
00138 \
00139         a1v.v = _mm_load_ps( ( float* )alpha1 ); \
00140         a2v.v = _mm_load_ps( ( float* )alpha2 ); \
00141 \
00142         t1v.v = a1v.v; \
00143         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00144         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00145 \
00146         a3v.v = _mm_load_ps( ( float* )alpha3 ); \
00147         _mm_store_ps( ( float* )alpha1, a1v.v ); \
00148         alpha1 += step_a1; \
00149 \
00150         t2v.v = a2v.v; \
00151         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00152         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00153 \
00154         _mm_store_ps( ( float* )alpha2, a2v.v ); \
00155         alpha2 += step_a2; \
00156         _mm_store_ps( ( float* )alpha3, a3v.v ); \
00157         alpha3 += step_a3; \
00158 \
00159         a1v.v = _mm_load_ps( ( float* )alpha1 ); \
00160         a2v.v = _mm_load_ps( ( float* )alpha2 ); \
00161 \
00162         t1v.v = a1v.v; \
00163         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00164         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00165 \
00166         a3v.v = _mm_load_ps( ( float* )alpha3 ); \
00167         _mm_store_ps( ( float* )alpha1, a1v.v ); \
00168         alpha1 += step_a1; \
00169 \
00170         t2v.v = a2v.v; \
00171         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00172         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00173 \
00174         _mm_store_ps( ( float* )alpha2, a2v.v ); \
00175         alpha2 += step_a2; \
00176         _mm_store_ps( ( float* )alpha3, a3v.v ); \
00177         alpha3 += step_a3; \
00178 \
00179         a1v.v = _mm_load_ps( ( float* )alpha1 ); \
00180         a2v.v = _mm_load_ps( ( float* )alpha2 ); \
00181 \
00182         t1v.v = a1v.v; \
00183         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00184         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00185 \
00186         a3v.v = _mm_load_ps( ( float* )alpha3 ); \
00187         _mm_store_ps( ( float* )alpha1, a1v.v ); \
00188         alpha1 += step_a1; \
00189 \
00190         t2v.v = a2v.v; \
00191         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00192         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00193 \
00194         _mm_store_ps( ( float* )alpha2, a2v.v ); \
00195         alpha2 += step_a2; \
00196         _mm_store_ps( ( float* )alpha3, a3v.v ); \
00197         alpha3 += step_a3; \
00198 \
00199         a1v.v = _mm_load_ps( ( float* )alpha1 ); \
00200         a2v.v = _mm_load_ps( ( float* )alpha2 ); \
00201 \
00202         t1v.v = a1v.v; \
00203         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00204         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00205 \
00206         a3v.v = _mm_load_ps( ( float* )alpha3 ); \
00207         _mm_store_ps( ( float* )alpha1, a1v.v ); \
00208         alpha1 += step_a1; \
00209 \
00210         t2v.v = a2v.v; \
00211         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00212         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00213 \
00214         _mm_store_ps( ( float* )alpha2, a2v.v ); \
00215         alpha2 += step_a2; \
00216         _mm_store_ps( ( float* )alpha3, a3v.v ); \
00217         alpha3 += step_a3; \
00218 \
00219         a1v.v = _mm_load_ps( ( float* )alpha1 ); \
00220         a2v.v = _mm_load_ps( ( float* )alpha2 ); \
00221 \
00222         t1v.v = a1v.v; \
00223         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00224         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00225 \
00226         a3v.v = _mm_load_ps( ( float* )alpha3 ); \
00227         _mm_store_ps( ( float* )alpha1, a1v.v ); \
00228         alpha1 += step_a1; \
00229 \
00230         t2v.v = a2v.v; \
00231         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00232         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00233 \
00234         _mm_store_ps( ( float* )alpha2, a2v.v ); \
00235         alpha2 += step_a2; \
00236         _mm_store_ps( ( float* )alpha3, a3v.v ); \
00237         alpha3 += step_a3; \
00238     } \
00239 \
00240     for ( i = 0; i < n_iter4; ++i ) \
00241     { \
00242 \
00243         a1v.v = _mm_load_ps( ( float* )alpha1 ); \
00244         a2v.v = _mm_load_ps( ( float* )alpha2 ); \
00245 \
00246         t1v.v = a1v.v; \
00247         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00248         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00249 \
00250         a3v.v = _mm_load_ps( ( float* )alpha3 ); \
00251         _mm_store_ps( ( float* )alpha1, a1v.v ); \
00252         alpha1 += step_a1; \
00253 \
00254         t2v.v = a2v.v; \
00255         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00256         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00257 \
00258         _mm_store_ps( ( float* )alpha2, a2v.v ); \
00259         alpha2 += step_a2; \
00260         _mm_store_ps( ( float* )alpha3, a3v.v ); \
00261         alpha3 += step_a3; \
00262     } \
00263 \
00264     for ( i = 0; i < n_left; ++i ) \
00265     { \
00266         float ga12 = *gamma12; \
00267         float si12 = *sigma12; \
00268         float ga23 = *gamma23; \
00269         float si23 = *sigma23; \
00270         float temp1; \
00271         float temp2; \
00272         float temp3; \
00273 \
00274         temp1 = *alpha1; \
00275         temp2 = *alpha2; \
00276 \
00277         *alpha1 = temp1 * ga12 + temp2 * si12; \
00278         *alpha2 = temp2 * ga12 - temp1 * si12; \
00279 \
00280         temp2 = *alpha2; \
00281         temp3 = *alpha3; \
00282 \
00283         *alpha2 = temp2 * ga23 + temp3 * si23; \
00284         *alpha3 = temp3 * ga23 - temp2 * si23; \
00285 \
00286         alpha1 += 1; \
00287         alpha2 += 1; \
00288         alpha3 += 1; \
00289     } \
00290 }
00291 
00292 #define MAC_Apply_G_mx3_asd( m_A, \
00293                              gamma12, \
00294                              sigma12, \
00295                              gamma23, \
00296                              sigma23, \
00297                              a1, inc_a1, \
00298                              a2, inc_a2, \
00299                              a3, inc_a3 ) \
00300 {\
00301     int              n_iter16 = m_A / ( 2 * 8 ); \
00302     int              n_left16 = m_A % ( 2 * 8 ); \
00303     int              n_iter2  = n_left16 / ( 2 * 1 ); \
00304     int              n_left   = n_left16 % ( 2 * 1 ); \
00305     int              i; \
00306 \
00307     const int        step_a1 = inc_a1 * 2; \
00308     const int        step_a2 = inc_a2 * 2; \
00309     const int        step_a3 = inc_a3 * 2; \
00310 \
00311     double* restrict alpha1 = a1; \
00312     double* restrict alpha2 = a2; \
00313     double* restrict alpha3 = a3; \
00314 \
00315     v2df_t           a1v, a2v, a3v; \
00316     v2df_t           g12v, s12v; \
00317     v2df_t           g23v, s23v; \
00318     v2df_t           t1v, t2v; \
00319 \
00320     g12v.v = _mm_loaddup_pd( gamma12 ); \
00321     s12v.v = _mm_loaddup_pd( sigma12 ); \
00322     g23v.v = _mm_loaddup_pd( gamma23 ); \
00323     s23v.v = _mm_loaddup_pd( sigma23 ); \
00324 \
00325     for ( i = 0; i < n_iter16; ++i ) \
00326     { \
00327 \
00328         a1v.v = _mm_load_pd( ( double* )alpha1 ); \
00329         a2v.v = _mm_load_pd( ( double* )alpha2 ); \
00330 \
00331         t1v.v = a1v.v; \
00332         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00333         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00334 \
00335         a3v.v = _mm_load_pd( ( double* )alpha3 ); \
00336         _mm_store_pd( ( double* )alpha1, a1v.v ); \
00337         alpha1 += step_a1; \
00338 \
00339         t2v.v = a2v.v; \
00340         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00341         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00342 \
00343         _mm_store_pd( ( double* )alpha2, a2v.v ); \
00344         alpha2 += step_a2; \
00345         _mm_store_pd( ( double* )alpha3, a3v.v ); \
00346         alpha3 += step_a3; \
00347 \
00348         a1v.v = _mm_load_pd( ( double* )alpha1 ); \
00349         a2v.v = _mm_load_pd( ( double* )alpha2 ); \
00350 \
00351         t1v.v = a1v.v; \
00352         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00353         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00354 \
00355         a3v.v = _mm_load_pd( ( double* )alpha3 ); \
00356         _mm_store_pd( ( double* )alpha1, a1v.v ); \
00357         alpha1 += step_a1; \
00358 \
00359         t2v.v = a2v.v; \
00360         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00361         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00362 \
00363         _mm_store_pd( ( double* )alpha2, a2v.v ); \
00364         alpha2 += step_a2; \
00365         _mm_store_pd( ( double* )alpha3, a3v.v ); \
00366         alpha3 += step_a3; \
00367 \
00368         a1v.v = _mm_load_pd( ( double* )alpha1 ); \
00369         a2v.v = _mm_load_pd( ( double* )alpha2 ); \
00370 \
00371         t1v.v = a1v.v; \
00372         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00373         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00374 \
00375         a3v.v = _mm_load_pd( ( double* )alpha3 ); \
00376         _mm_store_pd( ( double* )alpha1, a1v.v ); \
00377         alpha1 += step_a1; \
00378 \
00379         t2v.v = a2v.v; \
00380         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00381         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00382 \
00383         _mm_store_pd( ( double* )alpha2, a2v.v ); \
00384         alpha2 += step_a2; \
00385         _mm_store_pd( ( double* )alpha3, a3v.v ); \
00386         alpha3 += step_a3; \
00387 \
00388         a1v.v = _mm_load_pd( ( double* )alpha1 ); \
00389         a2v.v = _mm_load_pd( ( double* )alpha2 ); \
00390 \
00391         t1v.v = a1v.v; \
00392         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00393         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00394 \
00395         a3v.v = _mm_load_pd( ( double* )alpha3 ); \
00396         _mm_store_pd( ( double* )alpha1, a1v.v ); \
00397         alpha1 += step_a1; \
00398 \
00399         t2v.v = a2v.v; \
00400         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00401         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00402 \
00403         _mm_store_pd( ( double* )alpha2, a2v.v ); \
00404         alpha2 += step_a2; \
00405         _mm_store_pd( ( double* )alpha3, a3v.v ); \
00406         alpha3 += step_a3; \
00407 \
00408         a1v.v = _mm_load_pd( ( double* )alpha1 ); \
00409         a2v.v = _mm_load_pd( ( double* )alpha2 ); \
00410 \
00411         t1v.v = a1v.v; \
00412         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00413         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00414 \
00415         a3v.v = _mm_load_pd( ( double* )alpha3 ); \
00416         _mm_store_pd( ( double* )alpha1, a1v.v ); \
00417         alpha1 += step_a1; \
00418 \
00419         t2v.v = a2v.v; \
00420         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00421         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00422 \
00423         _mm_store_pd( ( double* )alpha2, a2v.v ); \
00424         alpha2 += step_a2; \
00425         _mm_store_pd( ( double* )alpha3, a3v.v ); \
00426         alpha3 += step_a3; \
00427 \
00428         a1v.v = _mm_load_pd( ( double* )alpha1 ); \
00429         a2v.v = _mm_load_pd( ( double* )alpha2 ); \
00430 \
00431         t1v.v = a1v.v; \
00432         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00433         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00434 \
00435         a3v.v = _mm_load_pd( ( double* )alpha3 ); \
00436         _mm_store_pd( ( double* )alpha1, a1v.v ); \
00437         alpha1 += step_a1; \
00438 \
00439         t2v.v = a2v.v; \
00440         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00441         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00442 \
00443         _mm_store_pd( ( double* )alpha2, a2v.v ); \
00444         alpha2 += step_a2; \
00445         _mm_store_pd( ( double* )alpha3, a3v.v ); \
00446         alpha3 += step_a3; \
00447 \
00448         a1v.v = _mm_load_pd( ( double* )alpha1 ); \
00449         a2v.v = _mm_load_pd( ( double* )alpha2 ); \
00450 \
00451         t1v.v = a1v.v; \
00452         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00453         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00454 \
00455         a3v.v = _mm_load_pd( ( double* )alpha3 ); \
00456         _mm_store_pd( ( double* )alpha1, a1v.v ); \
00457         alpha1 += step_a1; \
00458 \
00459         t2v.v = a2v.v; \
00460         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00461         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00462 \
00463         _mm_store_pd( ( double* )alpha2, a2v.v ); \
00464         alpha2 += step_a2; \
00465         _mm_store_pd( ( double* )alpha3, a3v.v ); \
00466         alpha3 += step_a3; \
00467 \
00468         a1v.v = _mm_load_pd( ( double* )alpha1 ); \
00469         a2v.v = _mm_load_pd( ( double* )alpha2 ); \
00470 \
00471         t1v.v = a1v.v; \
00472         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00473         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00474 \
00475         a3v.v = _mm_load_pd( ( double* )alpha3 ); \
00476         _mm_store_pd( ( double* )alpha1, a1v.v ); \
00477         alpha1 += step_a1; \
00478 \
00479         t2v.v = a2v.v; \
00480         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00481         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00482 \
00483         _mm_store_pd( ( double* )alpha2, a2v.v ); \
00484         alpha2 += step_a2; \
00485         _mm_store_pd( ( double* )alpha3, a3v.v ); \
00486         alpha3 += step_a3; \
00487     } \
00488 \
00489     for ( i = 0; i < n_iter2; ++i ) \
00490     { \
00491 \
00492         a1v.v = _mm_load_pd( ( double* )alpha1 ); \
00493         a2v.v = _mm_load_pd( ( double* )alpha2 ); \
00494 \
00495         t1v.v = a1v.v; \
00496         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00497         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00498 \
00499         a3v.v = _mm_load_pd( ( double* )alpha3 ); \
00500         _mm_store_pd( ( double* )alpha1, a1v.v ); \
00501         alpha1 += step_a1; \
00502 \
00503         t2v.v = a2v.v; \
00504         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00505         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00506 \
00507         _mm_store_pd( ( double* )alpha2, a2v.v ); \
00508         alpha2 += step_a2; \
00509         _mm_store_pd( ( double* )alpha3, a3v.v ); \
00510         alpha3 += step_a3; \
00511     } \
00512 \
00513     if ( n_left == 1 ) \
00514     { \
00515         double ga12 = *gamma12; \
00516         double si12 = *sigma12; \
00517         double ga23 = *gamma23; \
00518         double si23 = *sigma23; \
00519         double temp1; \
00520         double temp2; \
00521         double temp3; \
00522 \
00523         temp1 = *alpha1; \
00524         temp2 = *alpha2; \
00525 \
00526         *alpha1 = temp1 * ga12 + temp2 * si12; \
00527         *alpha2 = temp2 * ga12 - temp1 * si12; \
00528 \
00529         temp2 = *alpha2; \
00530         temp3 = *alpha3; \
00531 \
00532         *alpha2 = temp2 * ga23 + temp3 * si23; \
00533         *alpha3 = temp3 * ga23 - temp2 * si23; \
00534     } \
00535 }
00536 
00537 #define MAC_Apply_G_mx3_asc( m_A, \
00538                              gamma12, \
00539                              sigma12, \
00540                              gamma23, \
00541                              sigma23, \
00542                              a1, inc_a1, \
00543                              a2, inc_a2, \
00544                              a3, inc_a3 ) \
00545 { \
00546     int                n_iter16 = m_A / ( 2 * 8 ); \
00547     int                n_left16 = m_A % ( 2 * 8 ); \
00548     int                n_iter2  = n_left16 / ( 2 * 1 ); \
00549     int                n_left   = n_left16 % ( 2 * 1 ); \
00550     int                i; \
00551 \
00552     const int          step_a1 = inc_a1 * 2; \
00553     const int          step_a2 = inc_a2 * 2; \
00554     const int          step_a3 = inc_a3 * 2; \
00555 \
00556     scomplex* restrict alpha1 = a1; \
00557     scomplex* restrict alpha2 = a2; \
00558     scomplex* restrict alpha3 = a3; \
00559 \
00560     v4sf_t             a1v, a2v, a3v; \
00561     v4sf_t             g12v, s12v; \
00562     v4sf_t             g23v, s23v; \
00563     v4sf_t             t1v, t2v; \
00564 \
00565     g12v.v = _mm_load1_ps( gamma12 ); \
00566     s12v.v = _mm_load1_ps( sigma12 ); \
00567     g23v.v = _mm_load1_ps( gamma23 ); \
00568     s23v.v = _mm_load1_ps( sigma23 ); \
00569 \
00570     for ( i = 0; i < n_iter16; ++i ) \
00571     { \
00572 \
00573         a1v.v = _mm_load_ps( ( float* )alpha1 ); \
00574         a2v.v = _mm_load_ps( ( float* )alpha2 ); \
00575 \
00576         t1v.v = a1v.v; \
00577         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00578         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00579 \
00580         a3v.v = _mm_load_ps( ( float* )alpha3 ); \
00581         _mm_store_ps( ( float* )alpha1, a1v.v ); \
00582         alpha1 += step_a1; \
00583 \
00584         t2v.v = a2v.v; \
00585         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00586         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00587 \
00588         _mm_store_ps( ( float* )alpha2, a2v.v ); \
00589         alpha2 += step_a2; \
00590         _mm_store_ps( ( float* )alpha3, a3v.v ); \
00591         alpha3 += step_a3; \
00592 \
00593         a1v.v = _mm_load_ps( ( float* )alpha1 ); \
00594         a2v.v = _mm_load_ps( ( float* )alpha2 ); \
00595 \
00596         t1v.v = a1v.v; \
00597         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00598         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00599 \
00600         a3v.v = _mm_load_ps( ( float* )alpha3 ); \
00601         _mm_store_ps( ( float* )alpha1, a1v.v ); \
00602         alpha1 += step_a1; \
00603 \
00604         t2v.v = a2v.v; \
00605         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00606         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00607 \
00608         _mm_store_ps( ( float* )alpha2, a2v.v ); \
00609         alpha2 += step_a2; \
00610         _mm_store_ps( ( float* )alpha3, a3v.v ); \
00611         alpha3 += step_a3; \
00612 \
00613         a1v.v = _mm_load_ps( ( float* )alpha1 ); \
00614         a2v.v = _mm_load_ps( ( float* )alpha2 ); \
00615 \
00616         t1v.v = a1v.v; \
00617         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00618         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00619 \
00620         a3v.v = _mm_load_ps( ( float* )alpha3 ); \
00621         _mm_store_ps( ( float* )alpha1, a1v.v ); \
00622         alpha1 += step_a1; \
00623 \
00624         t2v.v = a2v.v; \
00625         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00626         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00627 \
00628         _mm_store_ps( ( float* )alpha2, a2v.v ); \
00629         alpha2 += step_a2; \
00630         _mm_store_ps( ( float* )alpha3, a3v.v ); \
00631         alpha3 += step_a3; \
00632 \
00633         a1v.v = _mm_load_ps( ( float* )alpha1 ); \
00634         a2v.v = _mm_load_ps( ( float* )alpha2 ); \
00635 \
00636         t1v.v = a1v.v; \
00637         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00638         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00639 \
00640         a3v.v = _mm_load_ps( ( float* )alpha3 ); \
00641         _mm_store_ps( ( float* )alpha1, a1v.v ); \
00642         alpha1 += step_a1; \
00643 \
00644         t2v.v = a2v.v; \
00645         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00646         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00647 \
00648         _mm_store_ps( ( float* )alpha2, a2v.v ); \
00649         alpha2 += step_a2; \
00650         _mm_store_ps( ( float* )alpha3, a3v.v ); \
00651         alpha3 += step_a3; \
00652 \
00653         a1v.v = _mm_load_ps( ( float* )alpha1 ); \
00654         a2v.v = _mm_load_ps( ( float* )alpha2 ); \
00655 \
00656         t1v.v = a1v.v; \
00657         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00658         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00659 \
00660         a3v.v = _mm_load_ps( ( float* )alpha3 ); \
00661         _mm_store_ps( ( float* )alpha1, a1v.v ); \
00662         alpha1 += step_a1; \
00663 \
00664         t2v.v = a2v.v; \
00665         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00666         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00667 \
00668         _mm_store_ps( ( float* )alpha2, a2v.v ); \
00669         alpha2 += step_a2; \
00670         _mm_store_ps( ( float* )alpha3, a3v.v ); \
00671         alpha3 += step_a3; \
00672 \
00673         a1v.v = _mm_load_ps( ( float* )alpha1 ); \
00674         a2v.v = _mm_load_ps( ( float* )alpha2 ); \
00675 \
00676         t1v.v = a1v.v; \
00677         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00678         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00679 \
00680         a3v.v = _mm_load_ps( ( float* )alpha3 ); \
00681         _mm_store_ps( ( float* )alpha1, a1v.v ); \
00682         alpha1 += step_a1; \
00683 \
00684         t2v.v = a2v.v; \
00685         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00686         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00687 \
00688         _mm_store_ps( ( float* )alpha2, a2v.v ); \
00689         alpha2 += step_a2; \
00690         _mm_store_ps( ( float* )alpha3, a3v.v ); \
00691         alpha3 += step_a3; \
00692 \
00693         a1v.v = _mm_load_ps( ( float* )alpha1 ); \
00694         a2v.v = _mm_load_ps( ( float* )alpha2 ); \
00695 \
00696         t1v.v = a1v.v; \
00697         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00698         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00699 \
00700         a3v.v = _mm_load_ps( ( float* )alpha3 ); \
00701         _mm_store_ps( ( float* )alpha1, a1v.v ); \
00702         alpha1 += step_a1; \
00703 \
00704         t2v.v = a2v.v; \
00705         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00706         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00707 \
00708         _mm_store_ps( ( float* )alpha2, a2v.v ); \
00709         alpha2 += step_a2; \
00710         _mm_store_ps( ( float* )alpha3, a3v.v ); \
00711         alpha3 += step_a3; \
00712 \
00713         a1v.v = _mm_load_ps( ( float* )alpha1 ); \
00714         a2v.v = _mm_load_ps( ( float* )alpha2 ); \
00715 \
00716         t1v.v = a1v.v; \
00717         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00718         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00719 \
00720         a3v.v = _mm_load_ps( ( float* )alpha3 ); \
00721         _mm_store_ps( ( float* )alpha1, a1v.v ); \
00722         alpha1 += step_a1; \
00723 \
00724         t2v.v = a2v.v; \
00725         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00726         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00727 \
00728         _mm_store_ps( ( float* )alpha2, a2v.v ); \
00729         alpha2 += step_a2; \
00730         _mm_store_ps( ( float* )alpha3, a3v.v ); \
00731         alpha3 += step_a3; \
00732     } \
00733 \
00734     for ( i = 0; i < n_iter2; ++i ) \
00735     { \
00736 \
00737         a1v.v = _mm_load_ps( ( float* )alpha1 ); \
00738         a2v.v = _mm_load_ps( ( float* )alpha2 ); \
00739 \
00740         t1v.v = a1v.v; \
00741         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00742         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00743 \
00744         a3v.v = _mm_load_ps( ( float* )alpha3 ); \
00745         _mm_store_ps( ( float* )alpha1, a1v.v ); \
00746         alpha1 += step_a1; \
00747 \
00748         t2v.v = a2v.v; \
00749         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00750         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00751 \
00752         _mm_store_ps( ( float* )alpha2, a2v.v ); \
00753         alpha2 += step_a2; \
00754         _mm_store_ps( ( float* )alpha3, a3v.v ); \
00755         alpha3 += step_a3; \
00756     } \
00757 \
00758     if ( n_left == 1 ) \
00759     { \
00760         float ga12 = *gamma12; \
00761         float si12 = *sigma12; \
00762         float ga23 = *gamma23; \
00763         float si23 = *sigma23; \
00764         scomplex temp1; \
00765         scomplex temp2; \
00766         scomplex temp3; \
00767 \
00768         temp1 = *alpha1; \
00769         temp2 = *alpha2; \
00770 \
00771         alpha1->real = temp1.real * ga12 + temp2.real * si12; \
00772         alpha2->real = temp2.real * ga12 - temp1.real * si12; \
00773 \
00774         alpha1->imag = temp1.imag * ga12 + temp2.imag * si12; \
00775         alpha2->imag = temp2.imag * ga12 - temp1.imag * si12; \
00776 \
00777         temp2 = *alpha2; \
00778         temp3 = *alpha3; \
00779 \
00780         alpha2->real = temp2.real * ga23 + temp3.real * si23; \
00781         alpha3->real = temp3.real * ga23 - temp2.real * si23; \
00782 \
00783         alpha2->imag = temp2.imag * ga23 + temp3.imag * si23; \
00784         alpha3->imag = temp3.imag * ga23 - temp2.imag * si23; \
00785     } \
00786 }
00787 
00788 #define MAC_Apply_G_mx3_asz( m_A, \
00789                              gamma12, \
00790                              sigma12, \
00791                              gamma23, \
00792                              sigma23, \
00793                              a1, inc_a1, \
00794                              a2, inc_a2, \
00795                              a3, inc_a3 ) \
00796 {\
00797     int                n_iter = m_A / 8; \
00798     int                n_left = m_A % 8; \
00799     int                i; \
00800 \
00801     const int          step_a1 = inc_a1 * 1; \
00802     const int          step_a2 = inc_a2 * 1; \
00803     const int          step_a3 = inc_a3 * 1; \
00804 \
00805     dcomplex* restrict alpha1 = a1; \
00806     dcomplex* restrict alpha2 = a2; \
00807     dcomplex* restrict alpha3 = a3; \
00808 \
00809     v2df_t             a1v, a2v, a3v; \
00810     v2df_t             g12v, s12v; \
00811     v2df_t             g23v, s23v; \
00812     v2df_t             t1v, t2v; \
00813 \
00814     g12v.v = _mm_loaddup_pd( gamma12 ); \
00815     s12v.v = _mm_loaddup_pd( sigma12 ); \
00816     g23v.v = _mm_loaddup_pd( gamma23 ); \
00817     s23v.v = _mm_loaddup_pd( sigma23 ); \
00818 \
00819     for ( i = 0; i < n_iter; ++i ) \
00820     { \
00821 \
00822         a1v.v = _mm_load_pd( ( double* )alpha1 ); \
00823         a2v.v = _mm_load_pd( ( double* )alpha2 ); \
00824 \
00825         t1v.v = a1v.v; \
00826         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00827         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00828 \
00829         a3v.v = _mm_load_pd( ( double* )alpha3 ); \
00830         _mm_store_pd( ( double* )alpha1, a1v.v ); \
00831         alpha1 += step_a1; \
00832 \
00833         t2v.v = a2v.v; \
00834         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00835         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00836 \
00837         _mm_store_pd( ( double* )alpha2, a2v.v ); \
00838         _mm_store_pd( ( double* )alpha3, a3v.v ); \
00839         alpha2 += step_a2; \
00840         alpha3 += step_a3; \
00841 \
00842         a1v.v = _mm_load_pd( ( double* )alpha1 ); \
00843         a2v.v = _mm_load_pd( ( double* )alpha2 ); \
00844 \
00845         t1v.v = a1v.v; \
00846         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00847         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00848 \
00849         a3v.v = _mm_load_pd( ( double* )alpha3 ); \
00850         _mm_store_pd( ( double* )alpha1, a1v.v ); \
00851         alpha1 += step_a1; \
00852 \
00853         t2v.v = a2v.v; \
00854         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00855         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00856 \
00857         _mm_store_pd( ( double* )alpha2, a2v.v ); \
00858         _mm_store_pd( ( double* )alpha3, a3v.v ); \
00859         alpha2 += step_a2; \
00860         alpha3 += step_a3; \
00861 \
00862         a1v.v = _mm_load_pd( ( double* )alpha1 ); \
00863         a2v.v = _mm_load_pd( ( double* )alpha2 ); \
00864 \
00865         t1v.v = a1v.v; \
00866         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00867         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00868 \
00869         a3v.v = _mm_load_pd( ( double* )alpha3 ); \
00870         _mm_store_pd( ( double* )alpha1, a1v.v ); \
00871         alpha1 += step_a1; \
00872 \
00873         t2v.v = a2v.v; \
00874         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00875         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00876 \
00877         _mm_store_pd( ( double* )alpha2, a2v.v ); \
00878         _mm_store_pd( ( double* )alpha3, a3v.v ); \
00879         alpha2 += step_a2; \
00880         alpha3 += step_a3; \
00881 \
00882         a1v.v = _mm_load_pd( ( double* )alpha1 ); \
00883         a2v.v = _mm_load_pd( ( double* )alpha2 ); \
00884 \
00885         t1v.v = a1v.v; \
00886         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00887         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00888 \
00889         a3v.v = _mm_load_pd( ( double* )alpha3 ); \
00890         _mm_store_pd( ( double* )alpha1, a1v.v ); \
00891         alpha1 += step_a1; \
00892 \
00893         t2v.v = a2v.v; \
00894         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00895         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00896 \
00897         _mm_store_pd( ( double* )alpha2, a2v.v ); \
00898         _mm_store_pd( ( double* )alpha3, a3v.v ); \
00899         alpha2 += step_a2; \
00900         alpha3 += step_a3; \
00901 \
00902         a1v.v = _mm_load_pd( ( double* )alpha1 ); \
00903         a2v.v = _mm_load_pd( ( double* )alpha2 ); \
00904 \
00905         t1v.v = a1v.v; \
00906         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00907         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00908 \
00909         a3v.v = _mm_load_pd( ( double* )alpha3 ); \
00910         _mm_store_pd( ( double* )alpha1, a1v.v ); \
00911         alpha1 += step_a1; \
00912 \
00913         t2v.v = a2v.v; \
00914         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00915         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00916 \
00917         _mm_store_pd( ( double* )alpha2, a2v.v ); \
00918         _mm_store_pd( ( double* )alpha3, a3v.v ); \
00919         alpha2 += step_a2; \
00920         alpha3 += step_a3; \
00921 \
00922         a1v.v = _mm_load_pd( ( double* )alpha1 ); \
00923         a2v.v = _mm_load_pd( ( double* )alpha2 ); \
00924 \
00925         t1v.v = a1v.v; \
00926         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00927         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00928 \
00929         a3v.v = _mm_load_pd( ( double* )alpha3 ); \
00930         _mm_store_pd( ( double* )alpha1, a1v.v ); \
00931         alpha1 += step_a1; \
00932 \
00933         t2v.v = a2v.v; \
00934         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00935         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00936 \
00937         _mm_store_pd( ( double* )alpha2, a2v.v ); \
00938         _mm_store_pd( ( double* )alpha3, a3v.v ); \
00939         alpha2 += step_a2; \
00940         alpha3 += step_a3; \
00941 \
00942         a1v.v = _mm_load_pd( ( double* )alpha1 ); \
00943         a2v.v = _mm_load_pd( ( double* )alpha2 ); \
00944 \
00945         t1v.v = a1v.v; \
00946         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00947         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00948 \
00949         a3v.v = _mm_load_pd( ( double* )alpha3 ); \
00950         _mm_store_pd( ( double* )alpha1, a1v.v ); \
00951         alpha1 += step_a1; \
00952 \
00953         t2v.v = a2v.v; \
00954         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00955         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00956 \
00957         _mm_store_pd( ( double* )alpha2, a2v.v ); \
00958         _mm_store_pd( ( double* )alpha3, a3v.v ); \
00959         alpha2 += step_a2; \
00960         alpha3 += step_a3; \
00961 \
00962         a1v.v = _mm_load_pd( ( double* )alpha1 ); \
00963         a2v.v = _mm_load_pd( ( double* )alpha2 ); \
00964 \
00965         t1v.v = a1v.v; \
00966         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00967         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00968 \
00969         a3v.v = _mm_load_pd( ( double* )alpha3 ); \
00970         _mm_store_pd( ( double* )alpha1, a1v.v ); \
00971         alpha1 += step_a1; \
00972 \
00973         t2v.v = a2v.v; \
00974         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00975         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00976 \
00977         _mm_store_pd( ( double* )alpha2, a2v.v ); \
00978         _mm_store_pd( ( double* )alpha3, a3v.v ); \
00979         alpha2 += step_a2; \
00980         alpha3 += step_a3; \
00981     } \
00982 \
00983     for ( i = 0; i < n_left; ++i ) \
00984     { \
00985         a1v.v = _mm_load_pd( ( double* )alpha1 ); \
00986         a2v.v = _mm_load_pd( ( double* )alpha2 ); \
00987 \
00988         t1v.v = a1v.v; \
00989         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00990         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00991 \
00992         _mm_store_pd( ( double* )alpha1, a1v.v ); \
00993         alpha1 += step_a1; \
00994         a3v.v = _mm_load_pd( ( double* )alpha3 ); \
00995 \
00996         t2v.v = a2v.v; \
00997         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00998         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00999 \
01000         _mm_store_pd( ( double* )alpha2, a2v.v ); \
01001         alpha2 += step_a2; \
01002         _mm_store_pd( ( double* )alpha3, a3v.v ); \
01003         alpha3 += step_a3; \
01004     } \
01005 }
01006 
01007 #endif
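
A minimal usage sketch, assuming the kernels are reachable through libflame's umbrella header FLAME.h and that the three columns are contiguous (unit row stride) and 16-byte aligned, since the SSE variants use aligned loads and stores; apply_G_mx3_example is a hypothetical wrapper, not a libflame routine:

    #include "FLAME.h"

    /* Hypothetical wrapper: apply the rotation pair ( G12, G23 ) to three
       aligned columns of length m. The gamma/sigma arguments are passed by
       address because the macros load them via _mm_loaddup_pd()/_mm_load1_ps(). */
    void apply_G_mx3_example( int m, double* a1, double* a2, double* a3,
                              double gamma12, double sigma12,
                              double gamma23, double sigma23 )
    {
        MAC_Apply_G_mx3_asd( m, &gamma12, &sigma12,
                                &gamma23, &sigma23,
                                a1, 1, a2, 1, a3, 1 );
    }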