/* libflame | revision_anchor */
/*
   libflame
   An object-based infrastructure for developing high-performance
   dense linear algebra libraries.

   Copyright (C) 2011, The University of Texas

   libflame is free software; you can redistribute it and/or modify
   it under the terms of the GNU Lesser General Public License as
   published by the Free Software Foundation; either version 2.1 of
   the License, or (at your option) any later version.

   libflame is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with libflame; if you did not receive a copy, see
   http://www.gnu.org/licenses/.

   For more information, please contact us at flame@cs.utexas.edu or
   send mail to:

   Field G. Van Zee and/or
   Robert A. van de Geijn
   The University of Texas at Austin
   Department of Computer Sciences
   1 University Station C0500
   Austin TX 78712
*/

/*
   MAC_Apply_G_mx4s_op{s,c,d,z}

   Each macro walks m_A elements of four strided vectors a1..a4 and, for
   every element position, applies four 2x2 transformations in this order:

     1. ( a2, a3 ) using gamma23_k1 / sigma23_k1
     2. ( a3, a4 ) using gamma34_k1 / sigma34_k1
     3. ( a1, a2 ) using gamma12_k2 / sigma12_k2
     4. ( a2, a3 ) using gamma23_k2 / sigma23_k2

   where one transformation with coefficients ( gamma, sigma ) maps

     ( x, y )  ->  ( gamma * x + sigma * y,  gamma * y - sigma * x ).

   NOTE(review): the "G" in the name presumably stands for Givens rotations;
   the macros themselves do not require gamma^2 + sigma^2 == 1.

   Parameters (identical for all four variants):
     m_A                 number of element positions to process.
     gamma.., sigma..    pointers to the real coefficients; each is
                         dereferenced once, before the loop.
     a1..a4              pointers to the first element of each vector;
                         the vectors are updated in place.
     inc_a1..inc_a4      element strides used to advance a1..a4.

   Variants:
     _ops  float  vectors, float  coefficients
     _opc  scomplex vectors, float  coefficients
     _opd  double vectors, double coefficients
     _opz  dcomplex vectors, double coefficients

   The complex variants access the elements through .real / .imag members
   (scomplex and dcomplex are declared elsewhere) and apply the real
   coefficients to the real and imaginary parts independently.

   Usage notes:
     - Every macro argument is parenthesized at its point of use, so
       arguments such as "g + 1" or "cond ? m : n" expand as intended.
     - The expansion is a brace-enclosed compound statement (not
       do { } while ( 0 )), kept that way for compatibility with existing
       call sites.
     - m_A and inc_a1..inc_a4 are evaluated once per loop iteration; avoid
       arguments with side effects.
     - The expansion declares locals named ga*, si*, alpha1..alpha4,
       temp1..temp4 and i; arguments must not refer to caller variables
       with those names.
     - alpha1..alpha4 are restrict-qualified, so the four vectors must not
       overlap in the elements that are accessed.
*/

/* Single-precision real variant. */
#define MAC_Apply_G_mx4s_ops( m_A, \
                              gamma23_k1, \
                              sigma23_k1, \
                              gamma34_k1, \
                              sigma34_k1, \
                              gamma12_k2, \
                              sigma12_k2, \
                              gamma23_k2, \
                              sigma23_k2, \
                              a1, inc_a1, \
                              a2, inc_a2, \
                              a3, inc_a3, \
                              a4, inc_a4 ) \
{ \
	float           ga23_k1 = *(gamma23_k1); \
	float           si23_k1 = *(sigma23_k1); \
	float           ga34_k1 = *(gamma34_k1); \
	float           si34_k1 = *(sigma34_k1); \
	float           ga12_k2 = *(gamma12_k2); \
	float           si12_k2 = *(sigma12_k2); \
	float           ga23_k2 = *(gamma23_k2); \
	float           si23_k2 = *(sigma23_k2); \
	float* restrict alpha1  = (a1); \
	float* restrict alpha2  = (a2); \
	float* restrict alpha3  = (a3); \
	float* restrict alpha4  = (a4); \
	float           temp1; \
	float           temp2; \
	float           temp3; \
	float           temp4; \
	int             i; \
\
	for ( i = 0; i < (m_A); ++i ) \
	{ \
		/* Step 1: ( a2, a3 ) with the 23_k1 coefficients. */ \
		temp2 = *alpha2; \
		temp3 = *alpha3; \
\
		*alpha2 = temp2 * ga23_k1 + temp3 * si23_k1; \
		*alpha3 = temp3 * ga23_k1 - temp2 * si23_k1; \
\
		/* Step 2: ( a3, a4 ) with the 34_k1 coefficients. */ \
		temp3 = *alpha3; \
		temp4 = *alpha4; \
\
		*alpha3 = temp3 * ga34_k1 + temp4 * si34_k1; \
		*alpha4 = temp4 * ga34_k1 - temp3 * si34_k1; \
\
		/* Step 3: ( a1, a2 ) with the 12_k2 coefficients. */ \
		temp1 = *alpha1; \
		temp2 = *alpha2; \
\
		*alpha1 = temp1 * ga12_k2 + temp2 * si12_k2; \
		*alpha2 = temp2 * ga12_k2 - temp1 * si12_k2; \
\
		/* Step 4: ( a2, a3 ) with the 23_k2 coefficients. */ \
		temp2 = *alpha2; \
		temp3 = *alpha3; \
\
		*alpha2 = temp2 * ga23_k2 + temp3 * si23_k2; \
		*alpha3 = temp3 * ga23_k2 - temp2 * si23_k2; \
\
		alpha1 += (inc_a1); \
		alpha2 += (inc_a2); \
		alpha3 += (inc_a3); \
		alpha4 += (inc_a4); \
	} \
}

/* Single-precision complex variant (real coefficients, scomplex data). */
#define MAC_Apply_G_mx4s_opc( m_A, \
                              gamma23_k1, \
                              sigma23_k1, \
                              gamma34_k1, \
                              sigma34_k1, \
                              gamma12_k2, \
                              sigma12_k2, \
                              gamma23_k2, \
                              sigma23_k2, \
                              a1, inc_a1, \
                              a2, inc_a2, \
                              a3, inc_a3, \
                              a4, inc_a4 ) \
{ \
	float              ga23_k1 = *(gamma23_k1); \
	float              si23_k1 = *(sigma23_k1); \
	float              ga34_k1 = *(gamma34_k1); \
	float              si34_k1 = *(sigma34_k1); \
	float              ga12_k2 = *(gamma12_k2); \
	float              si12_k2 = *(sigma12_k2); \
	float              ga23_k2 = *(gamma23_k2); \
	float              si23_k2 = *(sigma23_k2); \
	scomplex* restrict alpha1  = (a1); \
	scomplex* restrict alpha2  = (a2); \
	scomplex* restrict alpha3  = (a3); \
	scomplex* restrict alpha4  = (a4); \
	scomplex           temp1; \
	scomplex           temp2; \
	scomplex           temp3; \
	scomplex           temp4; \
	int                i; \
\
	for ( i = 0; i < (m_A); ++i ) \
	{ \
		/* Step 1: ( a2, a3 ) with the 23_k1 coefficients. */ \
		temp2 = *alpha2; \
		temp3 = *alpha3; \
\
		alpha2->real = temp2.real * ga23_k1 + temp3.real * si23_k1; \
		alpha2->imag = temp2.imag * ga23_k1 + temp3.imag * si23_k1; \
\
		alpha3->real = temp3.real * ga23_k1 - temp2.real * si23_k1; \
		alpha3->imag = temp3.imag * ga23_k1 - temp2.imag * si23_k1; \
\
		/* Step 2: ( a3, a4 ) with the 34_k1 coefficients. */ \
		temp3 = *alpha3; \
		temp4 = *alpha4; \
\
		alpha3->real = temp3.real * ga34_k1 + temp4.real * si34_k1; \
		alpha3->imag = temp3.imag * ga34_k1 + temp4.imag * si34_k1; \
\
		alpha4->real = temp4.real * ga34_k1 - temp3.real * si34_k1; \
		alpha4->imag = temp4.imag * ga34_k1 - temp3.imag * si34_k1; \
\
		/* Step 3: ( a1, a2 ) with the 12_k2 coefficients. */ \
		temp1 = *alpha1; \
		temp2 = *alpha2; \
\
		alpha1->real = temp1.real * ga12_k2 + temp2.real * si12_k2; \
		alpha1->imag = temp1.imag * ga12_k2 + temp2.imag * si12_k2; \
\
		alpha2->real = temp2.real * ga12_k2 - temp1.real * si12_k2; \
		alpha2->imag = temp2.imag * ga12_k2 - temp1.imag * si12_k2; \
\
		/* Step 4: ( a2, a3 ) with the 23_k2 coefficients. */ \
		temp2 = *alpha2; \
		temp3 = *alpha3; \
\
		alpha2->real = temp2.real * ga23_k2 + temp3.real * si23_k2; \
		alpha2->imag = temp2.imag * ga23_k2 + temp3.imag * si23_k2; \
\
		alpha3->real = temp3.real * ga23_k2 - temp2.real * si23_k2; \
		alpha3->imag = temp3.imag * ga23_k2 - temp2.imag * si23_k2; \
\
		alpha1 += (inc_a1); \
		alpha2 += (inc_a2); \
		alpha3 += (inc_a3); \
		alpha4 += (inc_a4); \
	} \
}

/* Double-precision real variant. */
#define MAC_Apply_G_mx4s_opd( m_A, \
                              gamma23_k1, \
                              sigma23_k1, \
                              gamma34_k1, \
                              sigma34_k1, \
                              gamma12_k2, \
                              sigma12_k2, \
                              gamma23_k2, \
                              sigma23_k2, \
                              a1, inc_a1, \
                              a2, inc_a2, \
                              a3, inc_a3, \
                              a4, inc_a4 ) \
{ \
	double           ga23_k1 = *(gamma23_k1); \
	double           si23_k1 = *(sigma23_k1); \
	double           ga34_k1 = *(gamma34_k1); \
	double           si34_k1 = *(sigma34_k1); \
	double           ga12_k2 = *(gamma12_k2); \
	double           si12_k2 = *(sigma12_k2); \
	double           ga23_k2 = *(gamma23_k2); \
	double           si23_k2 = *(sigma23_k2); \
	double* restrict alpha1  = (a1); \
	double* restrict alpha2  = (a2); \
	double* restrict alpha3  = (a3); \
	double* restrict alpha4  = (a4); \
	double           temp1; \
	double           temp2; \
	double           temp3; \
	double           temp4; \
	int              i; \
\
	for ( i = 0; i < (m_A); ++i ) \
	{ \
		/* Step 1: ( a2, a3 ) with the 23_k1 coefficients. */ \
		temp2 = *alpha2; \
		temp3 = *alpha3; \
\
		*alpha2 = temp2 * ga23_k1 + temp3 * si23_k1; \
		*alpha3 = temp3 * ga23_k1 - temp2 * si23_k1; \
\
		/* Step 2: ( a3, a4 ) with the 34_k1 coefficients. */ \
		temp3 = *alpha3; \
		temp4 = *alpha4; \
\
		*alpha3 = temp3 * ga34_k1 + temp4 * si34_k1; \
		*alpha4 = temp4 * ga34_k1 - temp3 * si34_k1; \
\
		/* Step 3: ( a1, a2 ) with the 12_k2 coefficients. */ \
		temp1 = *alpha1; \
		temp2 = *alpha2; \
\
		*alpha1 = temp1 * ga12_k2 + temp2 * si12_k2; \
		*alpha2 = temp2 * ga12_k2 - temp1 * si12_k2; \
\
		/* Step 4: ( a2, a3 ) with the 23_k2 coefficients. */ \
		temp2 = *alpha2; \
		temp3 = *alpha3; \
\
		*alpha2 = temp2 * ga23_k2 + temp3 * si23_k2; \
		*alpha3 = temp3 * ga23_k2 - temp2 * si23_k2; \
\
		alpha1 += (inc_a1); \
		alpha2 += (inc_a2); \
		alpha3 += (inc_a3); \
		alpha4 += (inc_a4); \
	} \
}

/* Double-precision complex variant (real coefficients, dcomplex data). */
#define MAC_Apply_G_mx4s_opz( m_A, \
                              gamma23_k1, \
                              sigma23_k1, \
                              gamma34_k1, \
                              sigma34_k1, \
                              gamma12_k2, \
                              sigma12_k2, \
                              gamma23_k2, \
                              sigma23_k2, \
                              a1, inc_a1, \
                              a2, inc_a2, \
                              a3, inc_a3, \
                              a4, inc_a4 ) \
{ \
	double             ga23_k1 = *(gamma23_k1); \
	double             si23_k1 = *(sigma23_k1); \
	double             ga34_k1 = *(gamma34_k1); \
	double             si34_k1 = *(sigma34_k1); \
	double             ga12_k2 = *(gamma12_k2); \
	double             si12_k2 = *(sigma12_k2); \
	double             ga23_k2 = *(gamma23_k2); \
	double             si23_k2 = *(sigma23_k2); \
	dcomplex* restrict alpha1  = (a1); \
	dcomplex* restrict alpha2  = (a2); \
	dcomplex* restrict alpha3  = (a3); \
	dcomplex* restrict alpha4  = (a4); \
	dcomplex           temp1; \
	dcomplex           temp2; \
	dcomplex           temp3; \
	dcomplex           temp4; \
	int                i; \
\
	for ( i = 0; i < (m_A); ++i ) \
	{ \
		/* Step 1: ( a2, a3 ) with the 23_k1 coefficients. */ \
		temp2 = *alpha2; \
		temp3 = *alpha3; \
\
		alpha2->real = temp2.real * ga23_k1 + temp3.real * si23_k1; \
		alpha2->imag = temp2.imag * ga23_k1 + temp3.imag * si23_k1; \
\
		alpha3->real = temp3.real * ga23_k1 - temp2.real * si23_k1; \
		alpha3->imag = temp3.imag * ga23_k1 - temp2.imag * si23_k1; \
\
		/* Step 2: ( a3, a4 ) with the 34_k1 coefficients. */ \
		temp3 = *alpha3; \
		temp4 = *alpha4; \
\
		alpha3->real = temp3.real * ga34_k1 + temp4.real * si34_k1; \
		alpha3->imag = temp3.imag * ga34_k1 + temp4.imag * si34_k1; \
\
		alpha4->real = temp4.real * ga34_k1 - temp3.real * si34_k1; \
		alpha4->imag = temp4.imag * ga34_k1 - temp3.imag * si34_k1; \
\
		/* Step 3: ( a1, a2 ) with the 12_k2 coefficients. */ \
		temp1 = *alpha1; \
		temp2 = *alpha2; \
\
		alpha1->real = temp1.real * ga12_k2 + temp2.real * si12_k2; \
		alpha1->imag = temp1.imag * ga12_k2 + temp2.imag * si12_k2; \
\
		alpha2->real = temp2.real * ga12_k2 - temp1.real * si12_k2; \
		alpha2->imag = temp2.imag * ga12_k2 - temp1.imag * si12_k2; \
\
		/* Step 4: ( a2, a3 ) with the 23_k2 coefficients. */ \
		temp2 = *alpha2; \
		temp3 = *alpha3; \
\
		alpha2->real = temp2.real * ga23_k2 + temp3.real * si23_k2; \
		alpha2->imag = temp2.imag * ga23_k2 + temp3.imag * si23_k2; \
\
		alpha3->real = temp3.real * ga23_k2 - temp2.real * si23_k2; \
		alpha3->imag = temp3.imag * ga23_k2 - temp2.imag * si23_k2; \
\
		alpha1 += (inc_a1); \
		alpha2 += (inc_a2); \
		alpha3 += (inc_a3); \
		alpha4 += (inc_a4); \
	} \
}