libflame
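The macros below are written in terms of the vector types v4sf_t and v2df_t, whose .v members are loaded and stored with SSE intrinsics and combined with element-wise *, +, and - (GCC/Clang vector-arithmetic extensions). Those types are defined elsewhere in libflame; purely as a reading aid, here is a minimal sketch of what such wrappers could look like, with every member name other than .v being an assumption rather than the library's actual definition:

#include <xmmintrin.h>   /* __m128:  _mm_load1_ps(), _mm_load_ps(), _mm_store_ps() */
#include <emmintrin.h>   /* __m128d: _mm_load_pd(),  _mm_store_pd()                */
#include <pmmintrin.h>   /* _mm_loaddup_pd() (SSE3)                                */

/* Hypothetical stand-ins for the SSE vector unions used by the macros below;
   only the .v member is exercised in this header. */
typedef union
{
    __m128 v;      /* four packed single-precision elements */
    float  f[4];
} v4sf_t;

typedef union
{
    __m128d v;     /* two packed double-precision elements */
    double  d[2];
} v2df_t;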
00001 /* 00002 libflame 00003 An object-based infrastructure for developing high-performance 00004 dense linear algebra libraries. 00005 00006 Copyright (C) 2011, The University of Texas 00007 00008 libflame is free software; you can redistribute it and/or modify 00009 it under the terms of the GNU Lesser General Public License as 00010 published by the Free Software Foundation; either version 2.1 of 00011 the License, or (at your option) any later version. 00012 00013 libflame is distributed in the hope that it will be useful, but 00014 WITHOUT ANY WARRANTY; without even the implied warranty of 00015 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00016 Lesser General Public License for more details. 00017 00018 You should have received a copy of the GNU Lesser General Public 00019 License along with libflame; if you did not receive a copy, see 00020 http://www.gnu.org/licenses/. 00021 00022 For more information, please contact us at flame@cs.utexas.edu or 00023 send mail to: 00024 00025 Field G. Van Zee and/or 00026 Robert A. van de Geijn 00027 The University of Texas at Austin 00028 Department of Computer Sciences 00029 1 University Station C0500 00030 Austin TX 78712 00031 */ 00032 00033 00034 #if FLA_VECTOR_INTRINSIC_TYPE == FLA_NO_INTRINSICS 00035 00036 #define MAC_Apply_G_mx4s_ass MAC_Apply_G_mx4s_ops 00037 #define MAC_Apply_G_mx4s_asd MAC_Apply_G_mx4s_opd 00038 #define MAC_Apply_G_mx4s_asc MAC_Apply_G_mx4s_opc 00039 #define MAC_Apply_G_mx4s_asz MAC_Apply_G_mx4s_opz 00040 00041 #elif FLA_VECTOR_INTRINSIC_TYPE == FLA_SSE_INTRINSICS 00042 00043 #define MAC_Apply_G_mx4s_ass( m_A, \ 00044 gamma23_k1, \ 00045 sigma23_k1, \ 00046 gamma34_k1, \ 00047 sigma34_k1, \ 00048 gamma12_k2, \ 00049 sigma12_k2, \ 00050 gamma23_k2, \ 00051 sigma23_k2, \ 00052 a1, inc_a1, \ 00053 a2, inc_a2, \ 00054 a3, inc_a3, \ 00055 a4, inc_a4 ) \ 00056 {\ 00057 int n_iter32 = m_A / ( 4 * 8 ); \ 00058 int n_left32 = m_A % ( 4 * 8 ); \ 00059 int n_iter4 = n_left32 / ( 4 * 1 ); \ 00060 int n_left = n_left32 % ( 4 * 1 ); \ 00061 int i; \ 00062 \ 00063 const int step_a1 = inc_a1 * 4; \ 00064 const int step_a2 = inc_a2 * 4; \ 00065 const int step_a3 = inc_a3 * 4; \ 00066 const int step_a4 = inc_a4 * 4; \ 00067 \ 00068 float* restrict alpha1 = a1; \ 00069 float* restrict alpha2 = a2; \ 00070 float* restrict alpha3 = a3; \ 00071 float* restrict alpha4 = a4; \ 00072 \ 00073 v4sf_t a1v, a2v, a3v, a4v; \ 00074 v4sf_t b1v, b2v, b3v, b4v; \ 00075 v4sf_t g23_k1v, s23_k1v; \ 00076 v4sf_t g34_k1v, s34_k1v; \ 00077 v4sf_t g12_k2v, s12_k2v; \ 00078 v4sf_t g23_k2v, s23_k2v; \ 00079 v4sf_t t1v, t2v, t3v; \ 00080 \ 00081 g23_k1v.v = _mm_load1_ps( gamma23_k1 ); \ 00082 s23_k1v.v = _mm_load1_ps( sigma23_k1 ); \ 00083 g34_k1v.v = _mm_load1_ps( gamma34_k1 ); \ 00084 s34_k1v.v = _mm_load1_ps( sigma34_k1 ); \ 00085 g12_k2v.v = _mm_load1_ps( gamma12_k2 ); \ 00086 s12_k2v.v = _mm_load1_ps( sigma12_k2 ); \ 00087 g23_k2v.v = _mm_load1_ps( gamma23_k2 ); \ 00088 s23_k2v.v = _mm_load1_ps( sigma23_k2 ); \ 00089 \ 00090 for ( i = 0; i < n_iter32; ++i ) \ 00091 { \ 00092 \ 00093 a2v.v = _mm_load_ps( ( float* )alpha2 ); \ 00094 a3v.v = _mm_load_ps( ( float* )alpha3 ); \ 00095 a4v.v = _mm_load_ps( ( float* )alpha4 ); \ 00096 \ 00097 t2v.v = a2v.v; \ 00098 a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \ 00099 a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \ 00100 \ 00101 a1v.v = _mm_load_ps( ( float* )alpha1 ); \ 00102 \ 00103 t3v.v = a3v.v; \ 00104 a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \ 00105 a4v.v = a4v.v * g34_k1v.v - 
t3v.v * s34_k1v.v; \ 00106 \ 00107 _mm_store_ps( ( float* )alpha4, a4v.v ); \ 00108 alpha4 += step_a4; \ 00109 \ 00110 t1v.v = a1v.v; \ 00111 a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \ 00112 a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \ 00113 \ 00114 _mm_store_ps( ( float* )alpha1, a1v.v ); \ 00115 alpha1 += step_a1; \ 00116 b2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \ 00117 \ 00118 t2v.v = a2v.v; \ 00119 a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \ 00120 a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \ 00121 \ 00122 _mm_store_ps( ( float* )alpha2, a2v.v ); \ 00123 alpha2 += step_a2; \ 00124 b3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \ 00125 \ 00126 /* ----------------------------------------------------------- */ \ 00127 \ 00128 b4v.v = _mm_load_ps( ( float* )alpha4 ); \ 00129 \ 00130 t2v.v = b2v.v; \ 00131 b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \ 00132 b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \ 00133 \ 00134 _mm_store_ps( ( float* )alpha3, a3v.v ); \ 00135 alpha3 += step_a3; \ 00136 b1v.v = _mm_load_ps( ( float* )alpha1 ); \ 00137 \ 00138 t3v.v = b3v.v; \ 00139 b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \ 00140 b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \ 00141 \ 00142 _mm_store_ps( ( float* )alpha4, b4v.v ); \ 00143 alpha4 += step_a4; \ 00144 \ 00145 t1v.v = b1v.v; \ 00146 b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \ 00147 b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \ 00148 \ 00149 _mm_store_ps( ( float* )alpha1, b1v.v ); \ 00150 alpha1 += step_a1; \ 00151 a2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \ 00152 \ 00153 t2v.v = b2v.v; \ 00154 b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \ 00155 b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \ 00156 \ 00157 _mm_store_ps( ( float* )alpha2, b2v.v ); \ 00158 alpha2 += step_a2; \ 00159 a3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \ 00160 \ 00161 /* ----------------------------------------------------------- */ \ 00162 \ 00163 a4v.v = _mm_load_ps( ( float* )alpha4 ); \ 00164 \ 00165 t2v.v = a2v.v; \ 00166 a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \ 00167 a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \ 00168 \ 00169 _mm_store_ps( ( float* )alpha3, b3v.v ); \ 00170 alpha3 += step_a3; \ 00171 a1v.v = _mm_load_ps( ( float* )alpha1 ); \ 00172 \ 00173 t3v.v = a3v.v; \ 00174 a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \ 00175 a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \ 00176 \ 00177 _mm_store_ps( ( float* )alpha4, a4v.v ); \ 00178 alpha4 += step_a4; \ 00179 \ 00180 t1v.v = a1v.v; \ 00181 a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \ 00182 a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \ 00183 \ 00184 _mm_store_ps( ( float* )alpha1, a1v.v ); \ 00185 alpha1 += step_a1; \ 00186 b2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \ 00187 \ 00188 t2v.v = a2v.v; \ 00189 a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \ 00190 a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \ 00191 \ 00192 _mm_store_ps( ( float* )alpha2, a2v.v ); \ 00193 alpha2 += step_a2; \ 00194 b3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \ 00195 \ 00196 /* ----------------------------------------------------------- */ \ 00197 \ 00198 b4v.v = _mm_load_ps( ( float* )alpha4 ); \ 00199 \ 00200 t2v.v = b2v.v; \ 00201 b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \ 00202 b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \ 00203 \ 00204 _mm_store_ps( ( float* )alpha3, a3v.v ); \ 00205 alpha3 += step_a3; \ 00206 b1v.v = _mm_load_ps( ( float* )alpha1 ); \ 00207 \ 00208 t3v.v = b3v.v; \ 00209 b3v.v = 
t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \ 00210 b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \ 00211 \ 00212 _mm_store_ps( ( float* )alpha4, b4v.v ); \ 00213 alpha4 += step_a4; \ 00214 \ 00215 t1v.v = b1v.v; \ 00216 b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \ 00217 b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \ 00218 \ 00219 _mm_store_ps( ( float* )alpha1, b1v.v ); \ 00220 alpha1 += step_a1; \ 00221 a2v.v = _mm_load_ps( ( float* )(alpha2 + step_a3) ); \ 00222 \ 00223 t2v.v = b2v.v; \ 00224 b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \ 00225 b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \ 00226 \ 00227 _mm_store_ps( ( float* )alpha2, b2v.v ); \ 00228 alpha2 += step_a2; \ 00229 a3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \ 00230 \ 00231 \ 00232 /* ----------------------------------------------------------- */ \ 00233 \ 00234 a4v.v = _mm_load_ps( ( float* )alpha4 ); \ 00235 \ 00236 t2v.v = a2v.v; \ 00237 a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \ 00238 a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \ 00239 \ 00240 _mm_store_ps( ( float* )alpha3, b3v.v ); \ 00241 alpha3 += step_a3; \ 00242 a1v.v = _mm_load_ps( ( float* )alpha1 ); \ 00243 \ 00244 t3v.v = a3v.v; \ 00245 a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \ 00246 a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \ 00247 \ 00248 _mm_store_ps( ( float* )alpha4, a4v.v ); \ 00249 alpha4 += step_a4; \ 00250 \ 00251 t1v.v = a1v.v; \ 00252 a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \ 00253 a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \ 00254 \ 00255 _mm_store_ps( ( float* )alpha1, a1v.v ); \ 00256 alpha1 += step_a1; \ 00257 b2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \ 00258 \ 00259 t2v.v = a2v.v; \ 00260 a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \ 00261 a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \ 00262 \ 00263 _mm_store_ps( ( float* )alpha2, a2v.v ); \ 00264 alpha2 += step_a2; \ 00265 b3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \ 00266 \ 00267 /* ----------------------------------------------------------- */ \ 00268 \ 00269 b4v.v = _mm_load_ps( ( float* )alpha4 ); \ 00270 \ 00271 t2v.v = b2v.v; \ 00272 b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \ 00273 b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \ 00274 \ 00275 _mm_store_ps( ( float* )alpha3, a3v.v ); \ 00276 alpha3 += step_a3; \ 00277 b1v.v = _mm_load_ps( ( float* )alpha1 ); \ 00278 \ 00279 t3v.v = b3v.v; \ 00280 b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \ 00281 b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \ 00282 \ 00283 _mm_store_ps( ( float* )alpha4, b4v.v ); \ 00284 alpha4 += step_a4; \ 00285 \ 00286 t1v.v = b1v.v; \ 00287 b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \ 00288 b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \ 00289 \ 00290 _mm_store_ps( ( float* )alpha1, b1v.v ); \ 00291 alpha1 += step_a1; \ 00292 a2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \ 00293 \ 00294 t2v.v = b2v.v; \ 00295 b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \ 00296 b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \ 00297 \ 00298 _mm_store_ps( ( float* )alpha2, b2v.v ); \ 00299 alpha2 += step_a2; \ 00300 a3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \ 00301 \ 00302 /* ----------------------------------------------------------- */ \ 00303 \ 00304 a4v.v = _mm_load_ps( ( float* )alpha4 ); \ 00305 \ 00306 t2v.v = a2v.v; \ 00307 a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \ 00308 a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \ 00309 \ 00310 _mm_store_ps( ( float* )alpha3, b3v.v ); \ 00311 alpha3 += step_a3; \ 00312 
a1v.v = _mm_load_ps( ( float* )alpha1 ); \ 00313 \ 00314 t3v.v = a3v.v; \ 00315 a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \ 00316 a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \ 00317 \ 00318 _mm_store_ps( ( float* )alpha4, a4v.v ); \ 00319 alpha4 += step_a4; \ 00320 \ 00321 t1v.v = a1v.v; \ 00322 a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \ 00323 a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \ 00324 \ 00325 _mm_store_ps( ( float* )alpha1, a1v.v ); \ 00326 alpha1 += step_a1; \ 00327 b2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \ 00328 \ 00329 t2v.v = a2v.v; \ 00330 a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \ 00331 a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \ 00332 \ 00333 _mm_store_ps( ( float* )alpha2, a2v.v ); \ 00334 alpha2 += step_a2; \ 00335 b3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \ 00336 \ 00337 /* ----------------------------------------------------------- */ \ 00338 \ 00339 b4v.v = _mm_load_ps( ( float* )alpha4 ); \ 00340 \ 00341 t2v.v = b2v.v; \ 00342 b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \ 00343 b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \ 00344 \ 00345 _mm_store_ps( ( float* )alpha3, a3v.v ); \ 00346 alpha3 += step_a3; \ 00347 b1v.v = _mm_load_ps( ( float* )alpha1 ); \ 00348 \ 00349 t3v.v = b3v.v; \ 00350 b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \ 00351 b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \ 00352 \ 00353 _mm_store_ps( ( float* )alpha4, b4v.v ); \ 00354 alpha4 += step_a4; \ 00355 \ 00356 t1v.v = b1v.v; \ 00357 b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \ 00358 b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \ 00359 \ 00360 _mm_store_ps( ( float* )alpha1, b1v.v ); \ 00361 alpha1 += step_a1; \ 00362 \ 00363 t2v.v = b2v.v; \ 00364 b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \ 00365 b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \ 00366 \ 00367 _mm_store_ps( ( float* )alpha2, b2v.v ); \ 00368 alpha2 += step_a2; \ 00369 \ 00370 _mm_store_ps( ( float* )alpha3, b3v.v ); \ 00371 alpha3 += step_a3; \ 00372 \ 00373 /* ----------------------------------------------------------- */ \ 00374 } \ 00375 \ 00376 for ( i = 0; i < n_iter4; ++i ) \ 00377 { \ 00378 \ 00379 a2v.v = _mm_load_ps( ( float* )alpha2 ); \ 00380 a3v.v = _mm_load_ps( ( float* )alpha3 ); \ 00381 a4v.v = _mm_load_ps( ( float* )alpha4 ); \ 00382 \ 00383 t2v.v = a2v.v; \ 00384 a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \ 00385 a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \ 00386 \ 00387 a1v.v = _mm_load_ps( ( float* )alpha1 ); \ 00388 \ 00389 t3v.v = a3v.v; \ 00390 a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \ 00391 a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \ 00392 \ 00393 _mm_store_ps( ( float* )alpha4, a4v.v ); \ 00394 alpha4 += step_a4; \ 00395 \ 00396 t1v.v = a1v.v; \ 00397 a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \ 00398 a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \ 00399 \ 00400 _mm_store_ps( ( float* )alpha1, a1v.v ); \ 00401 alpha1 += step_a1; \ 00402 \ 00403 t2v.v = a2v.v; \ 00404 a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \ 00405 a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \ 00406 \ 00407 _mm_store_ps( ( float* )alpha2, a2v.v ); \ 00408 alpha2 += step_a2; \ 00409 _mm_store_ps( ( float* )alpha3, a3v.v ); \ 00410 alpha3 += step_a3; \ 00411 } \ 00412 \ 00413 for ( i = 0; i < n_left; ++i ) \ 00414 { \ 00415 float ga23_k1 = *gamma23_k1; \ 00416 float si23_k1 = *sigma23_k1; \ 00417 float ga34_k1 = *gamma34_k1; \ 00418 float si34_k1 = *sigma34_k1; \ 00419 float ga12_k2 = *gamma12_k2; \ 00420 float si12_k2 = *sigma12_k2; \ 
00421 float ga23_k2 = *gamma23_k2; \ 00422 float si23_k2 = *sigma23_k2; \ 00423 float temp1; \ 00424 float temp2; \ 00425 float temp3; \ 00426 float temp4; \ 00427 \ 00428 temp2 = *alpha2; \ 00429 temp3 = *alpha3; \ 00430 \ 00431 *alpha2 = temp2 * ga23_k1 + temp3 * si23_k1; \ 00432 *alpha3 = temp3 * ga23_k1 - temp2 * si23_k1; \ 00433 \ 00434 temp3 = *alpha3; \ 00435 temp4 = *alpha4; \ 00436 \ 00437 *alpha3 = temp3 * ga34_k1 + temp4 * si34_k1; \ 00438 *alpha4 = temp4 * ga34_k1 - temp3 * si34_k1; \ 00439 \ 00440 temp1 = *alpha1; \ 00441 temp2 = *alpha2; \ 00442 \ 00443 *alpha1 = temp1 * ga12_k2 + temp2 * si12_k2; \ 00444 *alpha2 = temp2 * ga12_k2 - temp1 * si12_k2; \ 00445 \ 00446 temp2 = *alpha2; \ 00447 temp3 = *alpha3; \ 00448 \ 00449 *alpha2 = temp2 * ga23_k2 + temp3 * si23_k2; \ 00450 *alpha3 = temp3 * ga23_k2 - temp2 * si23_k2; \ 00451 \ 00452 alpha1 += 1; \ 00453 alpha2 += 1; \ 00454 alpha3 += 1; \ 00455 alpha4 += 1; \ 00456 } \ 00457 } 00458 00459 #define MAC_Apply_G_mx4s_asd( m_A, \ 00460 gamma23_k1, \ 00461 sigma23_k1, \ 00462 gamma34_k1, \ 00463 sigma34_k1, \ 00464 gamma12_k2, \ 00465 sigma12_k2, \ 00466 gamma23_k2, \ 00467 sigma23_k2, \ 00468 a1, inc_a1, \ 00469 a2, inc_a2, \ 00470 a3, inc_a3, \ 00471 a4, inc_a4 ) \ 00472 {\ 00473 int n_iter16 = m_A / ( 2 * 8 ); \ 00474 int n_left16 = m_A % ( 2 * 8 ); \ 00475 int n_iter2 = n_left16 / ( 2 * 1 ); \ 00476 int n_left = n_left16 % ( 2 * 1 ); \ 00477 int i; \ 00478 \ 00479 const int step_a1 = inc_a1 * 2; \ 00480 const int step_a2 = inc_a2 * 2; \ 00481 const int step_a3 = inc_a3 * 2; \ 00482 const int step_a4 = inc_a4 * 2; \ 00483 \ 00484 double* restrict alpha1 = a1; \ 00485 double* restrict alpha2 = a2; \ 00486 double* restrict alpha3 = a3; \ 00487 double* restrict alpha4 = a4; \ 00488 \ 00489 v2df_t a1v, a2v, a3v, a4v; \ 00490 v2df_t b1v, b2v, b3v, b4v; \ 00491 v2df_t g23_k1v, s23_k1v; \ 00492 v2df_t g34_k1v, s34_k1v; \ 00493 v2df_t g12_k2v, s12_k2v; \ 00494 v2df_t g23_k2v, s23_k2v; \ 00495 v2df_t t1v, t2v, t3v; \ 00496 \ 00497 g23_k1v.v = _mm_loaddup_pd( gamma23_k1 ); \ 00498 s23_k1v.v = _mm_loaddup_pd( sigma23_k1 ); \ 00499 g34_k1v.v = _mm_loaddup_pd( gamma34_k1 ); \ 00500 s34_k1v.v = _mm_loaddup_pd( sigma34_k1 ); \ 00501 g12_k2v.v = _mm_loaddup_pd( gamma12_k2 ); \ 00502 s12_k2v.v = _mm_loaddup_pd( sigma12_k2 ); \ 00503 g23_k2v.v = _mm_loaddup_pd( gamma23_k2 ); \ 00504 s23_k2v.v = _mm_loaddup_pd( sigma23_k2 ); \ 00505 \ 00506 for ( i = 0; i < n_iter16; ++i ) \ 00507 { \ 00508 \ 00509 a2v.v = _mm_load_pd( ( double* )alpha2 ); \ 00510 a3v.v = _mm_load_pd( ( double* )alpha3 ); \ 00511 a4v.v = _mm_load_pd( ( double* )alpha4 ); \ 00512 \ 00513 t2v.v = a2v.v; \ 00514 a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \ 00515 a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \ 00516 \ 00517 a1v.v = _mm_load_pd( ( double* )alpha1 ); \ 00518 \ 00519 t3v.v = a3v.v; \ 00520 a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \ 00521 a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \ 00522 \ 00523 _mm_store_pd( ( double* )alpha4, a4v.v ); \ 00524 alpha4 += step_a4; \ 00525 \ 00526 t1v.v = a1v.v; \ 00527 a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \ 00528 a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \ 00529 \ 00530 _mm_store_pd( ( double* )alpha1, a1v.v ); \ 00531 alpha1 += step_a1; \ 00532 b2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \ 00533 \ 00534 t2v.v = a2v.v; \ 00535 a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \ 00536 a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \ 00537 \ 00538 _mm_store_pd( ( double* )alpha2, a2v.v ); \ 00539 alpha2 
+= step_a2; \ 00540 b3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \ 00541 \ 00542 /* ----------------------------------------------------------- */ \ 00543 \ 00544 b4v.v = _mm_load_pd( ( double* )alpha4 ); \ 00545 \ 00546 t2v.v = b2v.v; \ 00547 b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \ 00548 b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \ 00549 \ 00550 _mm_store_pd( ( double* )alpha3, a3v.v ); \ 00551 alpha3 += step_a3; \ 00552 b1v.v = _mm_load_pd( ( double* )alpha1 ); \ 00553 \ 00554 t3v.v = b3v.v; \ 00555 b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \ 00556 b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \ 00557 \ 00558 _mm_store_pd( ( double* )alpha4, b4v.v ); \ 00559 alpha4 += step_a4; \ 00560 \ 00561 t1v.v = b1v.v; \ 00562 b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \ 00563 b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \ 00564 \ 00565 _mm_store_pd( ( double* )alpha1, b1v.v ); \ 00566 alpha1 += step_a1; \ 00567 a2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \ 00568 \ 00569 t2v.v = b2v.v; \ 00570 b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \ 00571 b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \ 00572 \ 00573 _mm_store_pd( ( double* )alpha2, b2v.v ); \ 00574 alpha2 += step_a2; \ 00575 a3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \ 00576 \ 00577 /* ----------------------------------------------------------- */ \ 00578 \ 00579 a4v.v = _mm_load_pd( ( double* )alpha4 ); \ 00580 \ 00581 t2v.v = a2v.v; \ 00582 a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \ 00583 a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \ 00584 \ 00585 _mm_store_pd( ( double* )alpha3, b3v.v ); \ 00586 alpha3 += step_a3; \ 00587 a1v.v = _mm_load_pd( ( double* )alpha1 ); \ 00588 \ 00589 t3v.v = a3v.v; \ 00590 a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \ 00591 a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \ 00592 \ 00593 _mm_store_pd( ( double* )alpha4, a4v.v ); \ 00594 alpha4 += step_a4; \ 00595 \ 00596 t1v.v = a1v.v; \ 00597 a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \ 00598 a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \ 00599 \ 00600 _mm_store_pd( ( double* )alpha1, a1v.v ); \ 00601 alpha1 += step_a1; \ 00602 b2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \ 00603 \ 00604 t2v.v = a2v.v; \ 00605 a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \ 00606 a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \ 00607 \ 00608 _mm_store_pd( ( double* )alpha2, a2v.v ); \ 00609 alpha2 += step_a2; \ 00610 b3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \ 00611 \ 00612 /* ----------------------------------------------------------- */ \ 00613 \ 00614 b4v.v = _mm_load_pd( ( double* )alpha4 ); \ 00615 \ 00616 t2v.v = b2v.v; \ 00617 b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \ 00618 b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \ 00619 \ 00620 _mm_store_pd( ( double* )alpha3, a3v.v ); \ 00621 alpha3 += step_a3; \ 00622 b1v.v = _mm_load_pd( ( double* )alpha1 ); \ 00623 \ 00624 t3v.v = b3v.v; \ 00625 b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \ 00626 b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \ 00627 \ 00628 _mm_store_pd( ( double* )alpha4, b4v.v ); \ 00629 alpha4 += step_a4; \ 00630 \ 00631 t1v.v = b1v.v; \ 00632 b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \ 00633 b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \ 00634 \ 00635 _mm_store_pd( ( double* )alpha1, b1v.v ); \ 00636 alpha1 += step_a1; \ 00637 a2v.v = _mm_load_pd( ( double* )(alpha2 + step_a3) ); \ 00638 \ 00639 t2v.v = b2v.v; \ 00640 b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \ 00641 b3v.v = b3v.v * 
g23_k2v.v - t2v.v * s23_k2v.v; \ 00642 \ 00643 _mm_store_pd( ( double* )alpha2, b2v.v ); \ 00644 alpha2 += step_a2; \ 00645 a3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \ 00646 \ 00647 \ 00648 /* ----------------------------------------------------------- */ \ 00649 \ 00650 a4v.v = _mm_load_pd( ( double* )alpha4 ); \ 00651 \ 00652 t2v.v = a2v.v; \ 00653 a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \ 00654 a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \ 00655 \ 00656 _mm_store_pd( ( double* )alpha3, b3v.v ); \ 00657 alpha3 += step_a3; \ 00658 a1v.v = _mm_load_pd( ( double* )alpha1 ); \ 00659 \ 00660 t3v.v = a3v.v; \ 00661 a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \ 00662 a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \ 00663 \ 00664 _mm_store_pd( ( double* )alpha4, a4v.v ); \ 00665 alpha4 += step_a4; \ 00666 \ 00667 t1v.v = a1v.v; \ 00668 a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \ 00669 a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \ 00670 \ 00671 _mm_store_pd( ( double* )alpha1, a1v.v ); \ 00672 alpha1 += step_a1; \ 00673 b2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \ 00674 \ 00675 t2v.v = a2v.v; \ 00676 a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \ 00677 a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \ 00678 \ 00679 _mm_store_pd( ( double* )alpha2, a2v.v ); \ 00680 alpha2 += step_a2; \ 00681 b3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \ 00682 \ 00683 /* ----------------------------------------------------------- */ \ 00684 \ 00685 b4v.v = _mm_load_pd( ( double* )alpha4 ); \ 00686 \ 00687 t2v.v = b2v.v; \ 00688 b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \ 00689 b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \ 00690 \ 00691 _mm_store_pd( ( double* )alpha3, a3v.v ); \ 00692 alpha3 += step_a3; \ 00693 b1v.v = _mm_load_pd( ( double* )alpha1 ); \ 00694 \ 00695 t3v.v = b3v.v; \ 00696 b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \ 00697 b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \ 00698 \ 00699 _mm_store_pd( ( double* )alpha4, b4v.v ); \ 00700 alpha4 += step_a4; \ 00701 \ 00702 t1v.v = b1v.v; \ 00703 b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \ 00704 b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \ 00705 \ 00706 _mm_store_pd( ( double* )alpha1, b1v.v ); \ 00707 alpha1 += step_a1; \ 00708 a2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \ 00709 \ 00710 t2v.v = b2v.v; \ 00711 b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \ 00712 b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \ 00713 \ 00714 _mm_store_pd( ( double* )alpha2, b2v.v ); \ 00715 alpha2 += step_a2; \ 00716 a3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \ 00717 \ 00718 /* ----------------------------------------------------------- */ \ 00719 \ 00720 a4v.v = _mm_load_pd( ( double* )alpha4 ); \ 00721 \ 00722 t2v.v = a2v.v; \ 00723 a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \ 00724 a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \ 00725 \ 00726 _mm_store_pd( ( double* )alpha3, b3v.v ); \ 00727 alpha3 += step_a3; \ 00728 a1v.v = _mm_load_pd( ( double* )alpha1 ); \ 00729 \ 00730 t3v.v = a3v.v; \ 00731 a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \ 00732 a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \ 00733 \ 00734 _mm_store_pd( ( double* )alpha4, a4v.v ); \ 00735 alpha4 += step_a4; \ 00736 \ 00737 t1v.v = a1v.v; \ 00738 a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \ 00739 a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \ 00740 \ 00741 _mm_store_pd( ( double* )alpha1, a1v.v ); \ 00742 alpha1 += step_a1; \ 00743 b2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \ 
00744 \ 00745 t2v.v = a2v.v; \ 00746 a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \ 00747 a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \ 00748 \ 00749 _mm_store_pd( ( double* )alpha2, a2v.v ); \ 00750 alpha2 += step_a2; \ 00751 b3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \ 00752 \ 00753 /* ----------------------------------------------------------- */ \ 00754 \ 00755 b4v.v = _mm_load_pd( ( double* )alpha4 ); \ 00756 \ 00757 t2v.v = b2v.v; \ 00758 b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \ 00759 b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \ 00760 \ 00761 _mm_store_pd( ( double* )alpha3, a3v.v ); \ 00762 alpha3 += step_a3; \ 00763 b1v.v = _mm_load_pd( ( double* )alpha1 ); \ 00764 \ 00765 t3v.v = b3v.v; \ 00766 b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \ 00767 b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \ 00768 \ 00769 _mm_store_pd( ( double* )alpha4, b4v.v ); \ 00770 alpha4 += step_a4; \ 00771 \ 00772 t1v.v = b1v.v; \ 00773 b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \ 00774 b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \ 00775 \ 00776 _mm_store_pd( ( double* )alpha1, b1v.v ); \ 00777 alpha1 += step_a1; \ 00778 \ 00779 t2v.v = b2v.v; \ 00780 b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \ 00781 b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \ 00782 \ 00783 _mm_store_pd( ( double* )alpha2, b2v.v ); \ 00784 alpha2 += step_a2; \ 00785 \ 00786 _mm_store_pd( ( double* )alpha3, b3v.v ); \ 00787 alpha3 += step_a3; \ 00788 \ 00789 /* ----------------------------------------------------------- */ \ 00790 } \ 00791 \ 00792 for ( i = 0; i < n_iter2; ++i ) \ 00793 { \ 00794 \ 00795 a2v.v = _mm_load_pd( ( double* )alpha2 ); \ 00796 a3v.v = _mm_load_pd( ( double* )alpha3 ); \ 00797 a4v.v = _mm_load_pd( ( double* )alpha4 ); \ 00798 \ 00799 t2v.v = a2v.v; \ 00800 a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \ 00801 a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \ 00802 \ 00803 a1v.v = _mm_load_pd( ( double* )alpha1 ); \ 00804 \ 00805 t3v.v = a3v.v; \ 00806 a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \ 00807 a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \ 00808 \ 00809 _mm_store_pd( ( double* )alpha4, a4v.v ); \ 00810 alpha4 += step_a4; \ 00811 \ 00812 t1v.v = a1v.v; \ 00813 a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \ 00814 a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \ 00815 \ 00816 _mm_store_pd( ( double* )alpha1, a1v.v ); \ 00817 alpha1 += step_a1; \ 00818 \ 00819 t2v.v = a2v.v; \ 00820 a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \ 00821 a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \ 00822 \ 00823 _mm_store_pd( ( double* )alpha2, a2v.v ); \ 00824 alpha2 += step_a2; \ 00825 _mm_store_pd( ( double* )alpha3, a3v.v ); \ 00826 alpha3 += step_a3; \ 00827 } \ 00828 \ 00829 if ( n_left == 1 ) \ 00830 { \ 00831 double ga23_k1 = *gamma23_k1; \ 00832 double si23_k1 = *sigma23_k1; \ 00833 double ga34_k1 = *gamma34_k1; \ 00834 double si34_k1 = *sigma34_k1; \ 00835 double ga12_k2 = *gamma12_k2; \ 00836 double si12_k2 = *sigma12_k2; \ 00837 double ga23_k2 = *gamma23_k2; \ 00838 double si23_k2 = *sigma23_k2; \ 00839 double temp1; \ 00840 double temp2; \ 00841 double temp3; \ 00842 double temp4; \ 00843 \ 00844 temp2 = *alpha2; \ 00845 temp3 = *alpha3; \ 00846 \ 00847 *alpha2 = temp2 * ga23_k1 + temp3 * si23_k1; \ 00848 *alpha3 = temp3 * ga23_k1 - temp2 * si23_k1; \ 00849 \ 00850 temp3 = *alpha3; \ 00851 temp4 = *alpha4; \ 00852 \ 00853 *alpha3 = temp3 * ga34_k1 + temp4 * si34_k1; \ 00854 *alpha4 = temp4 * ga34_k1 - temp3 * si34_k1; \ 00855 \ 00856 temp1 = *alpha1; \ 
00857 temp2 = *alpha2; \ 00858 \ 00859 *alpha1 = temp1 * ga12_k2 + temp2 * si12_k2; \ 00860 *alpha2 = temp2 * ga12_k2 - temp1 * si12_k2; \ 00861 \ 00862 temp2 = *alpha2; \ 00863 temp3 = *alpha3; \ 00864 \ 00865 *alpha2 = temp2 * ga23_k2 + temp3 * si23_k2; \ 00866 *alpha3 = temp3 * ga23_k2 - temp2 * si23_k2; \ 00867 \ 00868 } \ 00869 } 00870 00871 #define MAC_Apply_G_mx4s_asc( m_A, \ 00872 gamma23_k1, \ 00873 sigma23_k1, \ 00874 gamma34_k1, \ 00875 sigma34_k1, \ 00876 gamma12_k2, \ 00877 sigma12_k2, \ 00878 gamma23_k2, \ 00879 sigma23_k2, \ 00880 a1, inc_a1, \ 00881 a2, inc_a2, \ 00882 a3, inc_a3, \ 00883 a4, inc_a4 ) \ 00884 {\ 00885 int n_iter16 = m_A / ( 2 * 8 ); \ 00886 int n_left16 = m_A % ( 2 * 8 ); \ 00887 int n_iter2 = n_left16 / ( 2 * 1 ); \ 00888 int n_left = n_left16 % ( 2 * 1 ); \ 00889 int i; \ 00890 \ 00891 const int step_a1 = inc_a1 * 2; \ 00892 const int step_a2 = inc_a2 * 2; \ 00893 const int step_a3 = inc_a3 * 2; \ 00894 const int step_a4 = inc_a4 * 2; \ 00895 \ 00896 scomplex* restrict alpha1 = a1; \ 00897 scomplex* restrict alpha2 = a2; \ 00898 scomplex* restrict alpha3 = a3; \ 00899 scomplex* restrict alpha4 = a4; \ 00900 \ 00901 v4sf_t a1v, a2v, a3v, a4v; \ 00902 v4sf_t b1v, b2v, b3v, b4v; \ 00903 v4sf_t g23_k1v, s23_k1v; \ 00904 v4sf_t g34_k1v, s34_k1v; \ 00905 v4sf_t g12_k2v, s12_k2v; \ 00906 v4sf_t g23_k2v, s23_k2v; \ 00907 v4sf_t t1v, t2v, t3v; \ 00908 \ 00909 g23_k1v.v = _mm_load1_ps( gamma23_k1 ); \ 00910 s23_k1v.v = _mm_load1_ps( sigma23_k1 ); \ 00911 g34_k1v.v = _mm_load1_ps( gamma34_k1 ); \ 00912 s34_k1v.v = _mm_load1_ps( sigma34_k1 ); \ 00913 g12_k2v.v = _mm_load1_ps( gamma12_k2 ); \ 00914 s12_k2v.v = _mm_load1_ps( sigma12_k2 ); \ 00915 g23_k2v.v = _mm_load1_ps( gamma23_k2 ); \ 00916 s23_k2v.v = _mm_load1_ps( sigma23_k2 ); \ 00917 \ 00918 for ( i = 0; i < n_iter16; ++i ) \ 00919 { \ 00920 \ 00921 a2v.v = _mm_load_ps( ( float* )alpha2 ); \ 00922 a3v.v = _mm_load_ps( ( float* )alpha3 ); \ 00923 a4v.v = _mm_load_ps( ( float* )alpha4 ); \ 00924 \ 00925 t2v.v = a2v.v; \ 00926 a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \ 00927 a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \ 00928 \ 00929 a1v.v = _mm_load_ps( ( float* )alpha1 ); \ 00930 \ 00931 t3v.v = a3v.v; \ 00932 a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \ 00933 a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \ 00934 \ 00935 _mm_store_ps( ( float* )alpha4, a4v.v ); \ 00936 alpha4 += step_a4; \ 00937 \ 00938 t1v.v = a1v.v; \ 00939 a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \ 00940 a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \ 00941 \ 00942 _mm_store_ps( ( float* )alpha1, a1v.v ); \ 00943 alpha1 += step_a1; \ 00944 b2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \ 00945 \ 00946 t2v.v = a2v.v; \ 00947 a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \ 00948 a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \ 00949 \ 00950 _mm_store_ps( ( float* )alpha2, a2v.v ); \ 00951 alpha2 += step_a2; \ 00952 b3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \ 00953 \ 00954 /* ----------------------------------------------------------- */ \ 00955 \ 00956 b4v.v = _mm_load_ps( ( float* )alpha4 ); \ 00957 \ 00958 t2v.v = b2v.v; \ 00959 b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \ 00960 b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \ 00961 \ 00962 _mm_store_ps( ( float* )alpha3, a3v.v ); \ 00963 alpha3 += step_a3; \ 00964 b1v.v = _mm_load_ps( ( float* )alpha1 ); \ 00965 \ 00966 t3v.v = b3v.v; \ 00967 b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \ 00968 b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \ 00969 
\ 00970 _mm_store_ps( ( float* )alpha4, b4v.v ); \ 00971 alpha4 += step_a4; \ 00972 \ 00973 t1v.v = b1v.v; \ 00974 b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \ 00975 b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \ 00976 \ 00977 _mm_store_ps( ( float* )alpha1, b1v.v ); \ 00978 alpha1 += step_a1; \ 00979 a2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \ 00980 \ 00981 t2v.v = b2v.v; \ 00982 b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \ 00983 b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \ 00984 \ 00985 _mm_store_ps( ( float* )alpha2, b2v.v ); \ 00986 alpha2 += step_a2; \ 00987 a3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \ 00988 \ 00989 /* ----------------------------------------------------------- */ \ 00990 \ 00991 a4v.v = _mm_load_ps( ( float* )alpha4 ); \ 00992 \ 00993 t2v.v = a2v.v; \ 00994 a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \ 00995 a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \ 00996 \ 00997 _mm_store_ps( ( float* )alpha3, b3v.v ); \ 00998 alpha3 += step_a3; \ 00999 a1v.v = _mm_load_ps( ( float* )alpha1 ); \ 01000 \ 01001 t3v.v = a3v.v; \ 01002 a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \ 01003 a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \ 01004 \ 01005 _mm_store_ps( ( float* )alpha4, a4v.v ); \ 01006 alpha4 += step_a4; \ 01007 \ 01008 t1v.v = a1v.v; \ 01009 a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \ 01010 a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \ 01011 \ 01012 _mm_store_ps( ( float* )alpha1, a1v.v ); \ 01013 alpha1 += step_a1; \ 01014 b2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \ 01015 \ 01016 t2v.v = a2v.v; \ 01017 a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \ 01018 a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \ 01019 \ 01020 _mm_store_ps( ( float* )alpha2, a2v.v ); \ 01021 alpha2 += step_a2; \ 01022 b3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \ 01023 \ 01024 /* ----------------------------------------------------------- */ \ 01025 \ 01026 b4v.v = _mm_load_ps( ( float* )alpha4 ); \ 01027 \ 01028 t2v.v = b2v.v; \ 01029 b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \ 01030 b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \ 01031 \ 01032 _mm_store_ps( ( float* )alpha3, a3v.v ); \ 01033 alpha3 += step_a3; \ 01034 b1v.v = _mm_load_ps( ( float* )alpha1 ); \ 01035 \ 01036 t3v.v = b3v.v; \ 01037 b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \ 01038 b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \ 01039 \ 01040 _mm_store_ps( ( float* )alpha4, b4v.v ); \ 01041 alpha4 += step_a4; \ 01042 \ 01043 t1v.v = b1v.v; \ 01044 b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \ 01045 b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \ 01046 \ 01047 _mm_store_ps( ( float* )alpha1, b1v.v ); \ 01048 alpha1 += step_a1; \ 01049 a2v.v = _mm_load_ps( ( float* )(alpha2 + step_a3) ); \ 01050 \ 01051 t2v.v = b2v.v; \ 01052 b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \ 01053 b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \ 01054 \ 01055 _mm_store_ps( ( float* )alpha2, b2v.v ); \ 01056 alpha2 += step_a2; \ 01057 a3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \ 01058 \ 01059 \ 01060 /* ----------------------------------------------------------- */ \ 01061 \ 01062 a4v.v = _mm_load_ps( ( float* )alpha4 ); \ 01063 \ 01064 t2v.v = a2v.v; \ 01065 a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \ 01066 a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \ 01067 \ 01068 _mm_store_ps( ( float* )alpha3, b3v.v ); \ 01069 alpha3 += step_a3; \ 01070 a1v.v = _mm_load_ps( ( float* )alpha1 ); \ 01071 \ 01072 t3v.v = a3v.v; \ 01073 a3v.v = t3v.v * g34_k1v.v 
+ a4v.v * s34_k1v.v; \ 01074 a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \ 01075 \ 01076 _mm_store_ps( ( float* )alpha4, a4v.v ); \ 01077 alpha4 += step_a4; \ 01078 \ 01079 t1v.v = a1v.v; \ 01080 a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \ 01081 a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \ 01082 \ 01083 _mm_store_ps( ( float* )alpha1, a1v.v ); \ 01084 alpha1 += step_a1; \ 01085 b2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \ 01086 \ 01087 t2v.v = a2v.v; \ 01088 a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \ 01089 a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \ 01090 \ 01091 _mm_store_ps( ( float* )alpha2, a2v.v ); \ 01092 alpha2 += step_a2; \ 01093 b3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \ 01094 \ 01095 /* ----------------------------------------------------------- */ \ 01096 \ 01097 b4v.v = _mm_load_ps( ( float* )alpha4 ); \ 01098 \ 01099 t2v.v = b2v.v; \ 01100 b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \ 01101 b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \ 01102 \ 01103 _mm_store_ps( ( float* )alpha3, a3v.v ); \ 01104 alpha3 += step_a3; \ 01105 b1v.v = _mm_load_ps( ( float* )alpha1 ); \ 01106 \ 01107 t3v.v = b3v.v; \ 01108 b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \ 01109 b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \ 01110 \ 01111 _mm_store_ps( ( float* )alpha4, b4v.v ); \ 01112 alpha4 += step_a4; \ 01113 \ 01114 t1v.v = b1v.v; \ 01115 b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \ 01116 b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \ 01117 \ 01118 _mm_store_ps( ( float* )alpha1, b1v.v ); \ 01119 alpha1 += step_a1; \ 01120 a2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \ 01121 \ 01122 t2v.v = b2v.v; \ 01123 b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \ 01124 b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \ 01125 \ 01126 _mm_store_ps( ( float* )alpha2, b2v.v ); \ 01127 alpha2 += step_a2; \ 01128 a3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \ 01129 \ 01130 /* ----------------------------------------------------------- */ \ 01131 \ 01132 a4v.v = _mm_load_ps( ( float* )alpha4 ); \ 01133 \ 01134 t2v.v = a2v.v; \ 01135 a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \ 01136 a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \ 01137 \ 01138 _mm_store_ps( ( float* )alpha3, b3v.v ); \ 01139 alpha3 += step_a3; \ 01140 a1v.v = _mm_load_ps( ( float* )alpha1 ); \ 01141 \ 01142 t3v.v = a3v.v; \ 01143 a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \ 01144 a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \ 01145 \ 01146 _mm_store_ps( ( float* )alpha4, a4v.v ); \ 01147 alpha4 += step_a4; \ 01148 \ 01149 t1v.v = a1v.v; \ 01150 a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \ 01151 a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \ 01152 \ 01153 _mm_store_ps( ( float* )alpha1, a1v.v ); \ 01154 alpha1 += step_a1; \ 01155 b2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \ 01156 \ 01157 t2v.v = a2v.v; \ 01158 a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \ 01159 a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \ 01160 \ 01161 _mm_store_ps( ( float* )alpha2, a2v.v ); \ 01162 alpha2 += step_a2; \ 01163 b3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \ 01164 \ 01165 /* ----------------------------------------------------------- */ \ 01166 \ 01167 b4v.v = _mm_load_ps( ( float* )alpha4 ); \ 01168 \ 01169 t2v.v = b2v.v; \ 01170 b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \ 01171 b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \ 01172 \ 01173 _mm_store_ps( ( float* )alpha3, a3v.v ); \ 01174 alpha3 += step_a3; \ 01175 b1v.v = _mm_load_ps( ( float* 
)alpha1 ); \ 01176 \ 01177 t3v.v = b3v.v; \ 01178 b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \ 01179 b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \ 01180 \ 01181 _mm_store_ps( ( float* )alpha4, b4v.v ); \ 01182 alpha4 += step_a4; \ 01183 \ 01184 t1v.v = b1v.v; \ 01185 b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \ 01186 b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \ 01187 \ 01188 _mm_store_ps( ( float* )alpha1, b1v.v ); \ 01189 alpha1 += step_a1; \ 01190 \ 01191 t2v.v = b2v.v; \ 01192 b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \ 01193 b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \ 01194 \ 01195 _mm_store_ps( ( float* )alpha2, b2v.v ); \ 01196 alpha2 += step_a2; \ 01197 \ 01198 _mm_store_ps( ( float* )alpha3, b3v.v ); \ 01199 alpha3 += step_a3; \ 01200 \ 01201 /* ----------------------------------------------------------- */ \ 01202 } \ 01203 \ 01204 for ( i = 0; i < n_iter2; ++i ) \ 01205 { \ 01206 \ 01207 a2v.v = _mm_load_ps( ( float* )alpha2 ); \ 01208 a3v.v = _mm_load_ps( ( float* )alpha3 ); \ 01209 a4v.v = _mm_load_ps( ( float* )alpha4 ); \ 01210 \ 01211 t2v.v = a2v.v; \ 01212 a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \ 01213 a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \ 01214 \ 01215 a1v.v = _mm_load_ps( ( float* )alpha1 ); \ 01216 \ 01217 t3v.v = a3v.v; \ 01218 a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \ 01219 a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \ 01220 \ 01221 _mm_store_ps( ( float* )alpha4, a4v.v ); \ 01222 alpha4 += step_a4; \ 01223 \ 01224 t1v.v = a1v.v; \ 01225 a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \ 01226 a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \ 01227 \ 01228 _mm_store_ps( ( float* )alpha1, a1v.v ); \ 01229 alpha1 += step_a1; \ 01230 \ 01231 t2v.v = a2v.v; \ 01232 a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \ 01233 a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \ 01234 \ 01235 _mm_store_ps( ( float* )alpha2, a2v.v ); \ 01236 alpha2 += step_a2; \ 01237 _mm_store_ps( ( float* )alpha3, a3v.v ); \ 01238 alpha3 += step_a3; \ 01239 } \ 01240 \ 01241 if ( n_left == 1 ) \ 01242 { \ 01243 float ga23_k1 = *gamma23_k1; \ 01244 float si23_k1 = *sigma23_k1; \ 01245 float ga34_k1 = *gamma34_k1; \ 01246 float si34_k1 = *sigma34_k1; \ 01247 float ga12_k2 = *gamma12_k2; \ 01248 float si12_k2 = *sigma12_k2; \ 01249 float ga23_k2 = *gamma23_k2; \ 01250 float si23_k2 = *sigma23_k2; \ 01251 scomplex temp1; \ 01252 scomplex temp2; \ 01253 scomplex temp3; \ 01254 scomplex temp4; \ 01255 \ 01256 temp2 = *alpha2; \ 01257 temp3 = *alpha3; \ 01258 \ 01259 alpha2->real = temp2.real * ga23_k1 + temp3.real * si23_k1; \ 01260 alpha3->real = temp3.real * ga23_k1 - temp2.real * si23_k1; \ 01261 \ 01262 alpha2->imag = temp2.imag * ga23_k1 + temp3.imag * si23_k1; \ 01263 alpha3->imag = temp3.imag * ga23_k1 - temp2.imag * si23_k1; \ 01264 \ 01265 temp3 = *alpha3; \ 01266 temp4 = *alpha4; \ 01267 \ 01268 alpha3->real = temp3.real * ga34_k1 + temp4.real * si34_k1; \ 01269 alpha4->real = temp4.real * ga34_k1 - temp3.real * si34_k1; \ 01270 \ 01271 alpha3->imag = temp3.imag * ga34_k1 + temp4.imag * si34_k1; \ 01272 alpha4->imag = temp4.imag * ga34_k1 - temp3.imag * si34_k1; \ 01273 \ 01274 temp1 = *alpha1; \ 01275 temp2 = *alpha2; \ 01276 \ 01277 alpha1->real = temp1.real * ga12_k2 + temp2.real * si12_k2; \ 01278 alpha2->real = temp2.real * ga12_k2 - temp1.real * si12_k2; \ 01279 \ 01280 alpha1->imag = temp1.imag * ga12_k2 + temp2.imag * si12_k2; \ 01281 alpha2->imag = temp2.imag * ga12_k2 - temp1.imag * si12_k2; \ 01282 \ 01283 temp2 = *alpha2; \ 01284 
temp3 = *alpha3; \ 01285 \ 01286 alpha2->real = temp2.real * ga23_k2 + temp3.real * si23_k2; \ 01287 alpha3->real = temp3.real * ga23_k2 - temp2.real * si23_k2; \ 01288 \ 01289 alpha2->imag = temp2.imag * ga23_k2 + temp3.imag * si23_k2; \ 01290 alpha3->imag = temp3.imag * ga23_k2 - temp2.imag * si23_k2; \ 01291 \ 01292 } \ 01293 } 01294 01295 #define MAC_Apply_G_mx4s_asz( m_A, \ 01296 gamma23_k1, \ 01297 sigma23_k1, \ 01298 gamma34_k1, \ 01299 sigma34_k1, \ 01300 gamma12_k2, \ 01301 sigma12_k2, \ 01302 gamma23_k2, \ 01303 sigma23_k2, \ 01304 a1, inc_a1, \ 01305 a2, inc_a2, \ 01306 a3, inc_a3, \ 01307 a4, inc_a4 ) \ 01308 {\ 01309 int n_iter = m_A / 8; \ 01310 int n_left = m_A % 8; \ 01311 int i; \ 01312 \ 01313 const int step_a1 = inc_a1 * 1; \ 01314 const int step_a2 = inc_a2 * 1; \ 01315 const int step_a3 = inc_a3 * 1; \ 01316 const int step_a4 = inc_a4 * 1; \ 01317 \ 01318 dcomplex* restrict alpha1 = a1; \ 01319 dcomplex* restrict alpha2 = a2; \ 01320 dcomplex* restrict alpha3 = a3; \ 01321 dcomplex* restrict alpha4 = a4; \ 01322 \ 01323 v2df_t a1v, a2v, a3v, a4v; \ 01324 v2df_t b1v, b2v, b3v, b4v; \ 01325 v2df_t g23_k1v, s23_k1v; \ 01326 v2df_t g34_k1v, s34_k1v; \ 01327 v2df_t g12_k2v, s12_k2v; \ 01328 v2df_t g23_k2v, s23_k2v; \ 01329 v2df_t t1v, t2v, t3v; \ 01330 \ 01331 g23_k1v.v = _mm_loaddup_pd( gamma23_k1 ); \ 01332 s23_k1v.v = _mm_loaddup_pd( sigma23_k1 ); \ 01333 g34_k1v.v = _mm_loaddup_pd( gamma34_k1 ); \ 01334 s34_k1v.v = _mm_loaddup_pd( sigma34_k1 ); \ 01335 g12_k2v.v = _mm_loaddup_pd( gamma12_k2 ); \ 01336 s12_k2v.v = _mm_loaddup_pd( sigma12_k2 ); \ 01337 g23_k2v.v = _mm_loaddup_pd( gamma23_k2 ); \ 01338 s23_k2v.v = _mm_loaddup_pd( sigma23_k2 ); \ 01339 \ 01340 for ( i = 0; i < n_iter; ++i ) \ 01341 { \ 01342 \ 01343 a2v.v = _mm_load_pd( ( double* )alpha2 ); \ 01344 a3v.v = _mm_load_pd( ( double* )alpha3 ); \ 01345 a4v.v = _mm_load_pd( ( double* )alpha4 ); \ 01346 \ 01347 t2v.v = a2v.v; \ 01348 a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \ 01349 a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \ 01350 \ 01351 a1v.v = _mm_load_pd( ( double* )alpha1 ); \ 01352 \ 01353 t3v.v = a3v.v; \ 01354 a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \ 01355 a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \ 01356 \ 01357 _mm_store_pd( ( double* )alpha4, a4v.v ); \ 01358 alpha4 += step_a4; \ 01359 \ 01360 t1v.v = a1v.v; \ 01361 a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \ 01362 a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \ 01363 \ 01364 _mm_store_pd( ( double* )alpha1, a1v.v ); \ 01365 alpha1 += step_a1; \ 01366 b2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \ 01367 \ 01368 t2v.v = a2v.v; \ 01369 a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \ 01370 a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \ 01371 \ 01372 _mm_store_pd( ( double* )alpha2, a2v.v ); \ 01373 alpha2 += step_a2; \ 01374 b3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \ 01375 \ 01376 /* ----------------------------------------------------------- */ \ 01377 \ 01378 b4v.v = _mm_load_pd( ( double* )alpha4 ); \ 01379 \ 01380 t2v.v = b2v.v; \ 01381 b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \ 01382 b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \ 01383 \ 01384 _mm_store_pd( ( double* )alpha3, a3v.v ); \ 01385 alpha3 += step_a3; \ 01386 b1v.v = _mm_load_pd( ( double* )alpha1 ); \ 01387 \ 01388 t3v.v = b3v.v; \ 01389 b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \ 01390 b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \ 01391 \ 01392 _mm_store_pd( ( double* )alpha4, b4v.v ); \ 01393 alpha4 += step_a4; \ 01394 
\ 01395 t1v.v = b1v.v; \ 01396 b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \ 01397 b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \ 01398 \ 01399 _mm_store_pd( ( double* )alpha1, b1v.v ); \ 01400 alpha1 += step_a1; \ 01401 a2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \ 01402 \ 01403 t2v.v = b2v.v; \ 01404 b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \ 01405 b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \ 01406 \ 01407 _mm_store_pd( ( double* )alpha2, b2v.v ); \ 01408 alpha2 += step_a2; \ 01409 a3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \ 01410 \ 01411 /* ----------------------------------------------------------- */ \ 01412 \ 01413 a4v.v = _mm_load_pd( ( double* )alpha4 ); \ 01414 \ 01415 t2v.v = a2v.v; \ 01416 a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \ 01417 a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \ 01418 \ 01419 _mm_store_pd( ( double* )alpha3, b3v.v ); \ 01420 alpha3 += step_a3; \ 01421 a1v.v = _mm_load_pd( ( double* )alpha1 ); \ 01422 \ 01423 t3v.v = a3v.v; \ 01424 a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \ 01425 a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \ 01426 \ 01427 _mm_store_pd( ( double* )alpha4, a4v.v ); \ 01428 alpha4 += step_a4; \ 01429 \ 01430 t1v.v = a1v.v; \ 01431 a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \ 01432 a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \ 01433 \ 01434 _mm_store_pd( ( double* )alpha1, a1v.v ); \ 01435 alpha1 += step_a1; \ 01436 b2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \ 01437 \ 01438 t2v.v = a2v.v; \ 01439 a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \ 01440 a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \ 01441 \ 01442 _mm_store_pd( ( double* )alpha2, a2v.v ); \ 01443 alpha2 += step_a2; \ 01444 b3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \ 01445 \ 01446 /* ----------------------------------------------------------- */ \ 01447 \ 01448 b4v.v = _mm_load_pd( ( double* )alpha4 ); \ 01449 \ 01450 t2v.v = b2v.v; \ 01451 b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \ 01452 b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \ 01453 \ 01454 _mm_store_pd( ( double* )alpha3, a3v.v ); \ 01455 alpha3 += step_a3; \ 01456 b1v.v = _mm_load_pd( ( double* )alpha1 ); \ 01457 \ 01458 t3v.v = b3v.v; \ 01459 b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \ 01460 b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \ 01461 \ 01462 _mm_store_pd( ( double* )alpha4, b4v.v ); \ 01463 alpha4 += step_a4; \ 01464 \ 01465 t1v.v = b1v.v; \ 01466 b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \ 01467 b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \ 01468 \ 01469 _mm_store_pd( ( double* )alpha1, b1v.v ); \ 01470 alpha1 += step_a1; \ 01471 a2v.v = _mm_load_pd( ( double* )(alpha2 + step_a3) ); \ 01472 \ 01473 t2v.v = b2v.v; \ 01474 b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \ 01475 b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \ 01476 \ 01477 _mm_store_pd( ( double* )alpha2, b2v.v ); \ 01478 alpha2 += step_a2; \ 01479 a3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \ 01480 \ 01481 /* ----------------------------------------------------------- */ \ 01482 \ 01483 a4v.v = _mm_load_pd( ( double* )alpha4 ); \ 01484 \ 01485 t2v.v = a2v.v; \ 01486 a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \ 01487 a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \ 01488 \ 01489 _mm_store_pd( ( double* )alpha3, b3v.v ); \ 01490 alpha3 += step_a3; \ 01491 a1v.v = _mm_load_pd( ( double* )alpha1 ); \ 01492 \ 01493 t3v.v = a3v.v; \ 01494 a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \ 01495 a4v.v = a4v.v * g34_k1v.v - t3v.v * 
s34_k1v.v; \ 01496 \ 01497 _mm_store_pd( ( double* )alpha4, a4v.v ); \ 01498 alpha4 += step_a4; \ 01499 \ 01500 t1v.v = a1v.v; \ 01501 a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \ 01502 a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \ 01503 \ 01504 _mm_store_pd( ( double* )alpha1, a1v.v ); \ 01505 alpha1 += step_a1; \ 01506 b2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \ 01507 \ 01508 t2v.v = a2v.v; \ 01509 a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \ 01510 a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \ 01511 \ 01512 _mm_store_pd( ( double* )alpha2, a2v.v ); \ 01513 alpha2 += step_a2; \ 01514 b3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \ 01515 \ 01516 /* ----------------------------------------------------------- */ \ 01517 \ 01518 b4v.v = _mm_load_pd( ( double* )alpha4 ); \ 01519 \ 01520 t2v.v = b2v.v; \ 01521 b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \ 01522 b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \ 01523 \ 01524 _mm_store_pd( ( double* )alpha3, a3v.v ); \ 01525 alpha3 += step_a3; \ 01526 b1v.v = _mm_load_pd( ( double* )alpha1 ); \ 01527 \ 01528 t3v.v = b3v.v; \ 01529 b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \ 01530 b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \ 01531 \ 01532 _mm_store_pd( ( double* )alpha4, b4v.v ); \ 01533 alpha4 += step_a4; \ 01534 \ 01535 t1v.v = b1v.v; \ 01536 b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \ 01537 b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \ 01538 \ 01539 _mm_store_pd( ( double* )alpha1, b1v.v ); \ 01540 alpha1 += step_a1; \ 01541 a2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \ 01542 \ 01543 t2v.v = b2v.v; \ 01544 b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \ 01545 b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \ 01546 \ 01547 _mm_store_pd( ( double* )alpha2, b2v.v ); \ 01548 alpha2 += step_a2; \ 01549 a3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \ 01550 \ 01551 /* ----------------------------------------------------------- */ \ 01552 \ 01553 a4v.v = _mm_load_pd( ( double* )alpha4 ); \ 01554 \ 01555 t2v.v = a2v.v; \ 01556 a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \ 01557 a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \ 01558 \ 01559 _mm_store_pd( ( double* )alpha3, b3v.v ); \ 01560 alpha3 += step_a3; \ 01561 a1v.v = _mm_load_pd( ( double* )alpha1 ); \ 01562 \ 01563 t3v.v = a3v.v; \ 01564 a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \ 01565 a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \ 01566 \ 01567 _mm_store_pd( ( double* )alpha4, a4v.v ); \ 01568 alpha4 += step_a4; \ 01569 \ 01570 t1v.v = a1v.v; \ 01571 a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \ 01572 a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \ 01573 \ 01574 _mm_store_pd( ( double* )alpha1, a1v.v ); \ 01575 alpha1 += step_a1; \ 01576 b2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \ 01577 \ 01578 t2v.v = a2v.v; \ 01579 a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \ 01580 a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \ 01581 \ 01582 _mm_store_pd( ( double* )alpha2, a2v.v ); \ 01583 alpha2 += step_a2; \ 01584 b3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \ 01585 \ 01586 /* ----------------------------------------------------------- */ \ 01587 \ 01588 b4v.v = _mm_load_pd( ( double* )alpha4 ); \ 01589 \ 01590 t2v.v = b2v.v; \ 01591 b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \ 01592 b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \ 01593 \ 01594 _mm_store_pd( ( double* )alpha3, a3v.v ); \ 01595 alpha3 += step_a3; \ 01596 b1v.v = _mm_load_pd( ( double* )alpha1 ); \ 01597 \ 01598 t3v.v = b3v.v; 
\ 01599 b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \ 01600 b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \ 01601 \ 01602 _mm_store_pd( ( double* )alpha4, b4v.v ); \ 01603 alpha4 += step_a4; \ 01604 \ 01605 t1v.v = b1v.v; \ 01606 b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \ 01607 b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \ 01608 \ 01609 _mm_store_pd( ( double* )alpha1, b1v.v ); \ 01610 alpha1 += step_a1; \ 01611 \ 01612 t2v.v = b2v.v; \ 01613 b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \ 01614 b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \ 01615 \ 01616 _mm_store_pd( ( double* )alpha2, b2v.v ); \ 01617 alpha2 += step_a2; \ 01618 \ 01619 _mm_store_pd( ( double* )alpha3, b3v.v ); \ 01620 alpha3 += step_a3; \ 01621 \ 01622 /* ----------------------------------------------------------- */ \ 01623 } \ 01624 \ 01625 for ( i = 0; i < n_left; ++i ) \ 01626 { \ 01627 \ 01628 a2v.v = _mm_load_pd( ( double* )alpha2 ); \ 01629 a3v.v = _mm_load_pd( ( double* )alpha3 ); \ 01630 a4v.v = _mm_load_pd( ( double* )alpha4 ); \ 01631 \ 01632 t2v.v = a2v.v; \ 01633 a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \ 01634 a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \ 01635 \ 01636 a1v.v = _mm_load_pd( ( double* )alpha1 ); \ 01637 \ 01638 t3v.v = a3v.v; \ 01639 a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \ 01640 a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \ 01641 \ 01642 _mm_store_pd( ( double* )alpha4, a4v.v ); \ 01643 alpha4 += step_a4; \ 01644 \ 01645 t1v.v = a1v.v; \ 01646 a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \ 01647 a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \ 01648 \ 01649 _mm_store_pd( ( double* )alpha1, a1v.v ); \ 01650 alpha1 += step_a1; \ 01651 \ 01652 t2v.v = a2v.v; \ 01653 a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \ 01654 a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \ 01655 \ 01656 _mm_store_pd( ( double* )alpha2, a2v.v ); \ 01657 alpha2 += step_a2; \ 01658 _mm_store_pd( ( double* )alpha3, a3v.v ); \ 01659 alpha3 += step_a3; \ 01660 } \ 01661 } 01662 01663 #endif
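The computation that each MAC_Apply_G_mx4s_as? variant performs can be read off its scalar clean-up loop: every row of the four columns a1..a4 is hit by two Givens rotations left over from iteration k1 (in the (2,3) and (3,4) planes) followed by two rotations from iteration k2 (in the (1,2) and (2,3) planes), each rotation G = [ gamma  -sigma ; sigma  gamma ] applied from the right to a pair of columns. The sketch below restates that per-row update as a plain scalar routine; the name apply_g_mx4s_ref and its by-value gamma/sigma parameters are illustrative only, not part of the libflame API. The SSE paths perform the same update on several rows at once, with the main loops unrolled and the loads and stores software-pipelined across consecutive row blocks.

/* Reference (scalar) form of the fused update performed by the
   MAC_Apply_G_mx4s_as? macros: for each of m_A rows, apply

       [ a2 a3 ] <- [ a2 a3 ] * G23_k1,   [ a3 a4 ] <- [ a3 a4 ] * G34_k1,
       [ a1 a2 ] <- [ a1 a2 ] * G12_k2,   [ a2 a3 ] <- [ a2 a3 ] * G23_k2,

   where each G is a 2x2 Givens rotation [ gamma -sigma ; sigma gamma ].
   Names and signature are illustrative only.                            */
void apply_g_mx4s_ref( int m_A,
                       double gamma23_k1, double sigma23_k1,
                       double gamma34_k1, double sigma34_k1,
                       double gamma12_k2, double sigma12_k2,
                       double gamma23_k2, double sigma23_k2,
                       double* a1, int inc_a1,
                       double* a2, int inc_a2,
                       double* a3, int inc_a3,
                       double* a4, int inc_a4 )
{
    int i;

    for ( i = 0; i < m_A; ++i )
    {
        double temp1, temp2, temp3, temp4;

        /* Rotation in the (2,3) plane from iteration k1. */
        temp2 = *a2;
        temp3 = *a3;
        *a2   = temp2 * gamma23_k1 + temp3 * sigma23_k1;
        *a3   = temp3 * gamma23_k1 - temp2 * sigma23_k1;

        /* Rotation in the (3,4) plane from iteration k1. */
        temp3 = *a3;
        temp4 = *a4;
        *a3   = temp3 * gamma34_k1 + temp4 * sigma34_k1;
        *a4   = temp4 * gamma34_k1 - temp3 * sigma34_k1;

        /* Rotation in the (1,2) plane from iteration k2. */
        temp1 = *a1;
        temp2 = *a2;
        *a1   = temp1 * gamma12_k2 + temp2 * sigma12_k2;
        *a2   = temp2 * gamma12_k2 - temp1 * sigma12_k2;

        /* Rotation in the (2,3) plane from iteration k2. */
        temp2 = *a2;
        temp3 = *a3;
        *a2   = temp2 * gamma23_k2 + temp3 * sigma23_k2;
        *a3   = temp3 * gamma23_k2 - temp2 * sigma23_k2;

        /* Advance to the next row of each column. */
        a1 += inc_a1;
        a2 += inc_a2;
        a3 += inc_a3;
        a4 += inc_a4;
    }
}

In the single-precision variant, for example, the row count m_A is split into n_iter32 chunks of 32 rows (eight 4-wide SSE blocks per loop iteration), then n_iter4 chunks of 4 rows, with the final n_left rows handled by the scalar clean-up loop; the double-precision and complex variants follow the same structure at their respective vector widths.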