libflame
FLA_Apply_G_mx4s_opt.h
/*
   libflame
   An object-based infrastructure for developing high-performance
   dense linear algebra libraries.

   Copyright (C) 2011, The University of Texas

   libflame is free software; you can redistribute it and/or modify
   it under the terms of the GNU Lesser General Public License as
   published by the Free Software Foundation; either version 2.1 of
   the License, or (at your option) any later version.

   libflame is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with libflame; if you did not receive a copy, see
   http://www.gnu.org/licenses/.

   For more information, please contact us at flame@cs.utexas.edu or
   send mail to:

   Field G. Van Zee and/or
   Robert A. van de Geijn
   The University of Texas at Austin
   Department of Computer Sciences
   1 University Station C0500
   Austin TX 78712
*/

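/*
   The four macros in this file apply two groups of Givens rotations, in
   place, to four vectors a1, a2, a3, a4 of length m_A, traversed with
   strides inc_a1 through inc_a4. For each element index i the rotations
   are applied in the order G23(k1), G34(k1), G12(k2), G23(k2), where
   Gpq rotates the pair (alpha_p, alpha_q) using the scalars gamma and
   sigma (the cosine/sine pair of the rotation), which are passed by
   address. The four variants differ only in datatype:

     _ops  float       _opd  double
     _opc  scomplex    _opz  dcomplex

   In the complex variants the rotation scalars are real, so the same
   update is applied independently to the real and imaginary parts.
*/
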
#define MAC_Apply_G_mx4s_ops( m_A, \
                              gamma23_k1, \
                              sigma23_k1, \
                              gamma34_k1, \
                              sigma34_k1, \
                              gamma12_k2, \
                              sigma12_k2, \
                              gamma23_k2, \
                              sigma23_k2, \
                              a1, inc_a1, \
                              a2, inc_a2, \
                              a3, inc_a3, \
                              a4, inc_a4 ) \
{ \
    float              ga23_k1 = *gamma23_k1; \
    float              si23_k1 = *sigma23_k1; \
    float              ga34_k1 = *gamma34_k1; \
    float              si34_k1 = *sigma34_k1; \
    float              ga12_k2 = *gamma12_k2; \
    float              si12_k2 = *sigma12_k2; \
    float              ga23_k2 = *gamma23_k2; \
    float              si23_k2 = *sigma23_k2; \
    float*    restrict alpha1 = a1; \
    float*    restrict alpha2 = a2; \
    float*    restrict alpha3 = a3; \
    float*    restrict alpha4 = a4; \
    float              temp1; \
    float              temp2; \
    float              temp3; \
    float              temp4; \
    int                i; \
\
    for ( i = 0; i < m_A; ++i ) \
    { \
        temp2 = *alpha2; \
        temp3 = *alpha3; \
\
        *alpha2 = temp2 * ga23_k1 + temp3 * si23_k1; \
        *alpha3 = temp3 * ga23_k1 - temp2 * si23_k1; \
\
        temp3 = *alpha3; \
        temp4 = *alpha4; \
\
        *alpha3 = temp3 * ga34_k1 + temp4 * si34_k1; \
        *alpha4 = temp4 * ga34_k1 - temp3 * si34_k1; \
\
        temp1 = *alpha1; \
        temp2 = *alpha2; \
\
        *alpha1 = temp1 * ga12_k2 + temp2 * si12_k2; \
        *alpha2 = temp2 * ga12_k2 - temp1 * si12_k2; \
\
        temp2 = *alpha2; \
        temp3 = *alpha3; \
\
        *alpha2 = temp2 * ga23_k2 + temp3 * si23_k2; \
        *alpha3 = temp3 * ga23_k2 - temp2 * si23_k2; \
\
        alpha1 += inc_a1; \
        alpha2 += inc_a2; \
        alpha3 += inc_a3; \
        alpha4 += inc_a4; \
    } \
}
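
As a usage illustration for the single-precision real macro above, the sketch below applies one pair of rotation groups to the four columns of a small column-major matrix. Everything in it (the matrix contents, the rotation values, and the column-major layout) is an assumption chosen for the example; it is not taken from libflame's own calling code. A C99 compiler is required because the macros use restrict.

#include <stdio.h>
#include "FLA_Apply_G_mx4s_opt.h"   /* the header documented on this page */

int main( void )
{
    /* A 3x4 matrix stored column-major; its columns play the roles of
       a1 through a4. Values are illustrative only. */
    int   m_A = 3;
    float A[ 3 * 4 ] =
    {
         1.0F,  2.0F,  3.0F,   /* column 1 */
         4.0F,  5.0F,  6.0F,   /* column 2 */
         7.0F,  8.0F,  9.0F,   /* column 3 */
        10.0F, 11.0F, 12.0F    /* column 4 */
    };

    /* Two groups of rotation scalars (gamma = cosine, sigma = sine). */
    float g23_1 = 0.8F, s23_1 = 0.6F;
    float g34_1 = 0.6F, s34_1 = 0.8F;
    float g12_2 = 1.0F, s12_2 = 0.0F;
    float g23_2 = 0.8F, s23_2 = 0.6F;

    /* Elements within a column are contiguous, so every increment is 1. */
    MAC_Apply_G_mx4s_ops( m_A,
                          &g23_1, &s23_1,
                          &g34_1, &s34_1,
                          &g12_2, &s12_2,
                          &g23_2, &s23_2,
                          &A[ 0*3 ], 1,
                          &A[ 1*3 ], 1,
                          &A[ 2*3 ], 1,
                          &A[ 3*3 ], 1 );

    /* Print the rotated matrix, row by row. */
    for ( int i = 0; i < m_A; ++i )
        printf( "%8.4f %8.4f %8.4f %8.4f\n",
                A[ i + 0*3 ], A[ i + 1*3 ], A[ i + 2*3 ], A[ i + 3*3 ] );

    return 0;
}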

#define MAC_Apply_G_mx4s_opc( m_A, \
                              gamma23_k1, \
                              sigma23_k1, \
                              gamma34_k1, \
                              sigma34_k1, \
                              gamma12_k2, \
                              sigma12_k2, \
                              gamma23_k2, \
                              sigma23_k2, \
                              a1, inc_a1, \
                              a2, inc_a2, \
                              a3, inc_a3, \
                              a4, inc_a4 ) \
{ \
    float              ga23_k1 = *gamma23_k1; \
    float              si23_k1 = *sigma23_k1; \
    float              ga34_k1 = *gamma34_k1; \
    float              si34_k1 = *sigma34_k1; \
    float              ga12_k2 = *gamma12_k2; \
    float              si12_k2 = *sigma12_k2; \
    float              ga23_k2 = *gamma23_k2; \
    float              si23_k2 = *sigma23_k2; \
    scomplex* restrict alpha1 = a1; \
    scomplex* restrict alpha2 = a2; \
    scomplex* restrict alpha3 = a3; \
    scomplex* restrict alpha4 = a4; \
    scomplex           temp1; \
    scomplex           temp2; \
    scomplex           temp3; \
    scomplex           temp4; \
    int                i; \
\
    for ( i = 0; i < m_A; ++i ) \
    { \
\
        temp2 = *alpha2; \
        temp3 = *alpha3; \
\
        alpha2->real = temp2.real * ga23_k1 + temp3.real * si23_k1; \
        alpha2->imag = temp2.imag * ga23_k1 + temp3.imag * si23_k1; \
\
        alpha3->real = temp3.real * ga23_k1 - temp2.real * si23_k1; \
        alpha3->imag = temp3.imag * ga23_k1 - temp2.imag * si23_k1; \
\
        temp3 = *alpha3; \
        temp4 = *alpha4; \
\
        alpha3->real = temp3.real * ga34_k1 + temp4.real * si34_k1; \
        alpha3->imag = temp3.imag * ga34_k1 + temp4.imag * si34_k1; \
\
        alpha4->real = temp4.real * ga34_k1 - temp3.real * si34_k1; \
        alpha4->imag = temp4.imag * ga34_k1 - temp3.imag * si34_k1; \
\
        temp1 = *alpha1; \
        temp2 = *alpha2; \
\
        alpha1->real = temp1.real * ga12_k2 + temp2.real * si12_k2; \
        alpha1->imag = temp1.imag * ga12_k2 + temp2.imag * si12_k2; \
\
        alpha2->real = temp2.real * ga12_k2 - temp1.real * si12_k2; \
        alpha2->imag = temp2.imag * ga12_k2 - temp1.imag * si12_k2; \
\
        temp2 = *alpha2; \
        temp3 = *alpha3; \
\
        alpha2->real = temp2.real * ga23_k2 + temp3.real * si23_k2; \
        alpha2->imag = temp2.imag * ga23_k2 + temp3.imag * si23_k2; \
\
        alpha3->real = temp3.real * ga23_k2 - temp2.real * si23_k2; \
        alpha3->imag = temp3.imag * ga23_k2 - temp2.imag * si23_k2; \
\
        alpha1 += inc_a1; \
        alpha2 += inc_a2; \
        alpha3 += inc_a3; \
        alpha4 += inc_a4; \
    } \
}
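
A similar sketch for the single-precision complex macro follows. libflame defines scomplex in its own headers; the stand-in typedef here only mirrors the real and imag fields that the macro accesses, and the data and rotation values are again illustrative assumptions.

#include <stdio.h>

/* Stand-in for libflame's scomplex type, used here only so the sketch is
   self-contained; the library's own definition should normally be used. */
typedef struct { float real; float imag; } scomplex;

#include "FLA_Apply_G_mx4s_opt.h"

int main( void )
{
    int      m_A = 2;
    scomplex a1[ 2 ] = { { 1.0F, 1.0F }, { 2.0F, 0.0F } };
    scomplex a2[ 2 ] = { { 0.0F, 3.0F }, { 1.0F, 1.0F } };
    scomplex a3[ 2 ] = { { 2.0F, 2.0F }, { 0.0F, 4.0F } };
    scomplex a4[ 2 ] = { { 1.0F, 0.0F }, { 3.0F, 3.0F } };

    /* The rotation scalars stay real even for complex data. */
    float g23_1 = 0.8F, s23_1 = 0.6F;
    float g34_1 = 0.6F, s34_1 = 0.8F;
    float g12_2 = 0.8F, s12_2 = 0.6F;
    float g23_2 = 1.0F, s23_2 = 0.0F;

    MAC_Apply_G_mx4s_opc( m_A,
                          &g23_1, &s23_1,
                          &g34_1, &s34_1,
                          &g12_2, &s12_2,
                          &g23_2, &s23_2,
                          a1, 1, a2, 1, a3, 1, a4, 1 );

    /* Show the updated first vector; the others are updated in place too. */
    for ( int i = 0; i < m_A; ++i )
        printf( "a1[%d] = %6.3f + %6.3fi\n", i, a1[ i ].real, a1[ i ].imag );

    return 0;
}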

#define MAC_Apply_G_mx4s_opd( m_A, \
                              gamma23_k1, \
                              sigma23_k1, \
                              gamma34_k1, \
                              sigma34_k1, \
                              gamma12_k2, \
                              sigma12_k2, \
                              gamma23_k2, \
                              sigma23_k2, \
                              a1, inc_a1, \
                              a2, inc_a2, \
                              a3, inc_a3, \
                              a4, inc_a4 ) \
{ \
    double             ga23_k1 = *gamma23_k1; \
    double             si23_k1 = *sigma23_k1; \
    double             ga34_k1 = *gamma34_k1; \
    double             si34_k1 = *sigma34_k1; \
    double             ga12_k2 = *gamma12_k2; \
    double             si12_k2 = *sigma12_k2; \
    double             ga23_k2 = *gamma23_k2; \
    double             si23_k2 = *sigma23_k2; \
    double*   restrict alpha1 = a1; \
    double*   restrict alpha2 = a2; \
    double*   restrict alpha3 = a3; \
    double*   restrict alpha4 = a4; \
    double             temp1; \
    double             temp2; \
    double             temp3; \
    double             temp4; \
    int                i; \
\
    for ( i = 0; i < m_A; ++i ) \
    { \
        temp2 = *alpha2; \
        temp3 = *alpha3; \
\
        *alpha2 = temp2 * ga23_k1 + temp3 * si23_k1; \
        *alpha3 = temp3 * ga23_k1 - temp2 * si23_k1; \
\
        temp3 = *alpha3; \
        temp4 = *alpha4; \
\
        *alpha3 = temp3 * ga34_k1 + temp4 * si34_k1; \
        *alpha4 = temp4 * ga34_k1 - temp3 * si34_k1; \
\
        temp1 = *alpha1; \
        temp2 = *alpha2; \
\
        *alpha1 = temp1 * ga12_k2 + temp2 * si12_k2; \
        *alpha2 = temp2 * ga12_k2 - temp1 * si12_k2; \
\
        temp2 = *alpha2; \
        temp3 = *alpha3; \
\
        *alpha2 = temp2 * ga23_k2 + temp3 * si23_k2; \
        *alpha3 = temp3 * ga23_k2 - temp2 * si23_k2; \
\
        alpha1 += inc_a1; \
        alpha2 += inc_a2; \
        alpha3 += inc_a3; \
        alpha4 += inc_a4; \
    } \
}
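
Each pair of assignment statements in these macros is one application of a 2-by-2 Givens rotation. The helper below is expository only and is not part of libflame; it performs, on a single element pair, the same double-precision update that MAC_Apply_G_mx4s_opd performs in-line, first on (alpha2, alpha3) with the k1 scalars, then on (alpha3, alpha4), (alpha1, alpha2), and (alpha2, alpha3).

/* Expository helper, not part of libflame: rotate the pair (*x, *y) by the
   Givens rotation with cosine gamma and sine sigma, matching one update
   step of MAC_Apply_G_mx4s_opd:

       [ x ]      [  gamma  sigma ] [ x ]
       [ y ]  <-  [ -sigma  gamma ] [ y ]
*/
static void apply_givens_pair_d( double gamma, double sigma,
                                 double* x, double* y )
{
    double tempx = *x;
    double tempy = *y;

    *x = tempx * gamma + tempy * sigma;
    *y = tempy * gamma - tempx * sigma;
}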

#define MAC_Apply_G_mx4s_opz( m_A, \
                              gamma23_k1, \
                              sigma23_k1, \
                              gamma34_k1, \
                              sigma34_k1, \
                              gamma12_k2, \
                              sigma12_k2, \
                              gamma23_k2, \
                              sigma23_k2, \
                              a1, inc_a1, \
                              a2, inc_a2, \
                              a3, inc_a3, \
                              a4, inc_a4 ) \
{ \
    double             ga23_k1 = *gamma23_k1; \
    double             si23_k1 = *sigma23_k1; \
    double             ga34_k1 = *gamma34_k1; \
    double             si34_k1 = *sigma34_k1; \
    double             ga12_k2 = *gamma12_k2; \
    double             si12_k2 = *sigma12_k2; \
    double             ga23_k2 = *gamma23_k2; \
    double             si23_k2 = *sigma23_k2; \
    dcomplex* restrict alpha1 = a1; \
    dcomplex* restrict alpha2 = a2; \
    dcomplex* restrict alpha3 = a3; \
    dcomplex* restrict alpha4 = a4; \
    dcomplex           temp1; \
    dcomplex           temp2; \
    dcomplex           temp3; \
    dcomplex           temp4; \
    int                i; \
\
    for ( i = 0; i < m_A; ++i ) \
    { \
\
        temp2 = *alpha2; \
        temp3 = *alpha3; \
\
        alpha2->real = temp2.real * ga23_k1 + temp3.real * si23_k1; \
        alpha2->imag = temp2.imag * ga23_k1 + temp3.imag * si23_k1; \
\
        alpha3->real = temp3.real * ga23_k1 - temp2.real * si23_k1; \
        alpha3->imag = temp3.imag * ga23_k1 - temp2.imag * si23_k1; \
\
        temp3 = *alpha3; \
        temp4 = *alpha4; \
\
        alpha3->real = temp3.real * ga34_k1 + temp4.real * si34_k1; \
        alpha3->imag = temp3.imag * ga34_k1 + temp4.imag * si34_k1; \
\
        alpha4->real = temp4.real * ga34_k1 - temp3.real * si34_k1; \
        alpha4->imag = temp4.imag * ga34_k1 - temp3.imag * si34_k1; \
\
        temp1 = *alpha1; \
        temp2 = *alpha2; \
\
        alpha1->real = temp1.real * ga12_k2 + temp2.real * si12_k2; \
        alpha1->imag = temp1.imag * ga12_k2 + temp2.imag * si12_k2; \
\
        alpha2->real = temp2.real * ga12_k2 - temp1.real * si12_k2; \
        alpha2->imag = temp2.imag * ga12_k2 - temp1.imag * si12_k2; \
\
        temp2 = *alpha2; \
        temp3 = *alpha3; \
\
        alpha2->real = temp2.real * ga23_k2 + temp3.real * si23_k2; \
        alpha2->imag = temp2.imag * ga23_k2 + temp3.imag * si23_k2; \
\
        alpha3->real = temp3.real * ga23_k2 - temp2.real * si23_k2; \
        alpha3->imag = temp3.imag * ga23_k2 - temp2.imag * si23_k2; \
\
        alpha1 += inc_a1; \
        alpha2 += inc_a2; \
        alpha3 += inc_a3; \
        alpha4 += inc_a4; \
    } \
}
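
The double-precision complex macro above mirrors the single-precision complex one, with dcomplex (double-precision real and imag fields) in place of scomplex. Note also that all four variants declare the alpha pointers with restrict, so a1 through a4 must refer to non-overlapping vectors; passing aliased columns leads to undefined behavior.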