libflame
/*
   libflame
   An object-based infrastructure for developing high-performance
   dense linear algebra libraries.

   Copyright (C) 2011, The University of Texas

   libflame is free software; you can redistribute it and/or modify
   it under the terms of the GNU Lesser General Public License as
   published by the Free Software Foundation; either version 2.1 of
   the License, or (at your option) any later version.

   libflame is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with libflame; if you did not receive a copy, see
   http://www.gnu.org/licenses/.

   For more information, please contact us at flame@cs.utexas.edu or
   send mail to:

   Field G. Van Zee and/or
   Robert A. van de Geijn
   The University of Texas at Austin
   Department of Computer Sciences
   1 University Station C0500
   Austin TX 78712
*/


#if FLA_VECTOR_INTRINSIC_TYPE == FLA_NO_INTRINSICS

#define MAC_Apply_G_mx3b_ass MAC_Apply_G_mx3b_ops
#define MAC_Apply_G_mx3b_asd MAC_Apply_G_mx3b_opd
#define MAC_Apply_G_mx3b_asc MAC_Apply_G_mx3b_opc
#define MAC_Apply_G_mx3b_asz MAC_Apply_G_mx3b_opz

#elif FLA_VECTOR_INTRINSIC_TYPE == FLA_SSE_INTRINSICS

#define MAC_Apply_G_mx3b_ass( m_A, \
                              gamma12, \
                              sigma12, \
                              gamma23, \
                              sigma23, \
                              a1, inc_a1, \
                              a2, inc_a2, \
                              a3, inc_a3 ) \
{\
  int       n_iter32 = m_A / ( 4 * 8 ); \
  int       n_left32 = m_A % ( 4 * 8 ); \
  int       n_iter4  = n_left32 / ( 4 * 1 ); \
  int       n_left   = n_left32 % ( 4 * 1 ); \
  int       i; \
\
  const int step_a1 = inc_a1 * 4; \
  const int step_a2 = inc_a2 * 4; \
  const int step_a3 = inc_a3 * 4; \
\
  float* restrict alpha1 = a1; \
  float* restrict alpha2 = a2; \
  float* restrict alpha3 = a3; \
\
  v4sf_t a1v, a2v, a3v; \
  v4sf_t g12v, s12v; \
  v4sf_t g23v, s23v; \
  v4sf_t t1v, t2v; \
\
  g12v.v = _mm_load1_ps( gamma12 ); \
  s12v.v = _mm_load1_ps( sigma12 ); \
  g23v.v = _mm_load1_ps( gamma23 ); \
  s23v.v = _mm_load1_ps( sigma23 ); \
\
  for ( i = 0; i < n_iter32; ++i ) \
  { \
\
    a2v.v = _mm_load_ps( ( float* )alpha2 ); \
    a3v.v = _mm_load_ps( ( float* )alpha3 ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
    _mm_store_ps( ( float* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
\
    a2v.v = _mm_load_ps( ( float* )alpha2 ); \
    a3v.v = _mm_load_ps( ( float* )alpha3 ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
    _mm_store_ps( ( float* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
\
    a2v.v = _mm_load_ps( ( float* )alpha2 ); \
    a3v.v = _mm_load_ps( ( float* )alpha3 ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
    _mm_store_ps( ( float* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
\
    a2v.v = _mm_load_ps( ( float* )alpha2 ); \
    a3v.v = _mm_load_ps( ( float* )alpha3 ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
    _mm_store_ps( ( float* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
\
    a2v.v = _mm_load_ps( ( float* )alpha2 ); \
    a3v.v = _mm_load_ps( ( float* )alpha3 ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
    _mm_store_ps( ( float* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
\
    a2v.v = _mm_load_ps( ( float* )alpha2 ); \
    a3v.v = _mm_load_ps( ( float* )alpha3 ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
    _mm_store_ps( ( float* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
\
    a2v.v = _mm_load_ps( ( float* )alpha2 ); \
    a3v.v = _mm_load_ps( ( float* )alpha3 ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
    _mm_store_ps( ( float* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
\
    a2v.v = _mm_load_ps( ( float* )alpha2 ); \
    a3v.v = _mm_load_ps( ( float* )alpha3 ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
    _mm_store_ps( ( float* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
  } \
\
  for ( i = 0; i < n_iter4; ++i ) \
  { \
\
    a2v.v = _mm_load_ps( ( float* )alpha2 ); \
    a3v.v = _mm_load_ps( ( float* )alpha3 ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
    _mm_store_ps( ( float* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
  } \
\
  for ( i = 0; i < n_left; ++i ) \
  { \
    float ga12 = *gamma12; \
    float si12 = *sigma12; \
    float ga23 = *gamma23; \
    float si23 = *sigma23; \
    float temp1; \
    float temp2; \
    float temp3; \
\
    temp2 = *alpha2; \
    temp3 = *alpha3; \
\
    *alpha2 = temp2 * ga23 + temp3 * si23; \
    *alpha3 = temp3 * ga23 - temp2 * si23; \
\
    temp1 = *alpha1; \
    temp2 = *alpha2; \
\
    *alpha1 = temp1 * ga12 + temp2 * si12; \
    *alpha2 = temp2 * ga12 - temp1 * si12; \
\
    alpha1 += 1; \
    alpha2 += 1; \
    alpha3 += 1; \
  } \
}

#define MAC_Apply_G_mx3b_asd( m_A, \
                              gamma12, \
                              sigma12, \
                              gamma23, \
                              sigma23, \
                              a1, inc_a1, \
                              a2, inc_a2, \
                              a3, inc_a3 ) \
{\
  int       n_iter16 = m_A / ( 2 * 8 ); \
  int       n_left16 = m_A % ( 2 * 8 ); \
  int       n_iter2  = n_left16 / ( 2 * 1 ); \
  int       n_left   = n_left16 % ( 2 * 1 ); \
  int       i; \
\
  const int step_a1 = inc_a1 * 2; \
  const int step_a2 = inc_a2 * 2; \
  const int step_a3 = inc_a3 * 2; \
\
  double* restrict alpha1 = a1; \
  double* restrict alpha2 = a2; \
  double* restrict alpha3 = a3; \
\
  v2df_t a1v, a2v, a3v; \
  v2df_t g12v, s12v; \
  v2df_t g23v, s23v; \
  v2df_t t1v, t2v; \
\
  g12v.v = _mm_loaddup_pd( gamma12 ); \
  s12v.v = _mm_loaddup_pd( sigma12 ); \
  g23v.v = _mm_loaddup_pd( gamma23 ); \
  s23v.v = _mm_loaddup_pd( sigma23 ); \
\
  for ( i = 0; i < n_iter16; ++i ) \
  { \
\
    a2v.v = _mm_load_pd( ( double* )alpha2 ); \
    a3v.v = _mm_load_pd( ( double* )alpha3 ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
\
    a2v.v = _mm_load_pd( ( double* )alpha2 ); \
    a3v.v = _mm_load_pd( ( double* )alpha3 ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
\
    a2v.v = _mm_load_pd( ( double* )alpha2 ); \
    a3v.v = _mm_load_pd( ( double* )alpha3 ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
\
    a2v.v = _mm_load_pd( ( double* )alpha2 ); \
    a3v.v = _mm_load_pd( ( double* )alpha3 ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
\
    a2v.v = _mm_load_pd( ( double* )alpha2 ); \
    a3v.v = _mm_load_pd( ( double* )alpha3 ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
\
    a2v.v = _mm_load_pd( ( double* )alpha2 ); \
    a3v.v = _mm_load_pd( ( double* )alpha3 ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
\
    a2v.v = _mm_load_pd( ( double* )alpha2 ); \
    a3v.v = _mm_load_pd( ( double* )alpha3 ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
\
    a2v.v = _mm_load_pd( ( double* )alpha2 ); \
    a3v.v = _mm_load_pd( ( double* )alpha3 ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
  } \
\
  for ( i = 0; i < n_iter2; ++i ) \
  { \
\
    a2v.v = _mm_load_pd( ( double* )alpha2 ); \
    a3v.v = _mm_load_pd( ( double* )alpha3 ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
  } \
\
  if ( n_left == 1 ) \
  { \
    double ga12 = *gamma12; \
    double si12 = *sigma12; \
    double ga23 = *gamma23; \
    double si23 = *sigma23; \
    double temp1; \
    double temp2; \
    double temp3; \
\
    temp2 = *alpha2; \
    temp3 = *alpha3; \
\
    *alpha2 = temp2 * ga23 + temp3 * si23; \
    *alpha3 = temp3 * ga23 - temp2 * si23; \
\
    temp1 = *alpha1; \
    temp2 = *alpha2; \
\
    *alpha1 = temp1 * ga12 + temp2 * si12; \
    *alpha2 = temp2 * ga12 - temp1 * si12; \
  } \
}

#define MAC_Apply_G_mx3b_asc( m_A, \
                              gamma12, \
                              sigma12, \
                              gamma23, \
                              sigma23, \
                              a1, inc_a1, \
                              a2, inc_a2, \
                              a3, inc_a3 ) \
{\
  int       n_iter16 = m_A / ( 2 * 8 ); \
  int       n_left16 = m_A % ( 2 * 8 ); \
  int       n_iter2  = n_left16 / ( 2 * 1 ); \
  int       n_left   = n_left16 % ( 2 * 1 ); \
  int       i; \
\
  const int step_a1 = inc_a1 * 2; \
  const int step_a2 = inc_a2 * 2; \
  const int step_a3 = inc_a3 * 2; \
\
  scomplex* restrict alpha1 = a1; \
  scomplex* restrict alpha2 = a2; \
  scomplex* restrict alpha3 = a3; \
\
  v4sf_t a1v, a2v, a3v; \
  v4sf_t g12v, s12v; \
  v4sf_t g23v, s23v; \
  v4sf_t t1v, t2v; \
\
  g12v.v = _mm_load1_ps( gamma12 ); \
  s12v.v = _mm_load1_ps( sigma12 ); \
  g23v.v = _mm_load1_ps( gamma23 ); \
  s23v.v = _mm_load1_ps( sigma23 ); \
\
  for ( i = 0; i < n_iter16; ++i ) \
  { \
\
    a2v.v = _mm_load_ps( ( float* )alpha2 ); \
    a3v.v = _mm_load_ps( ( float* )alpha3 ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
    _mm_store_ps( ( float* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
\
    a2v.v = _mm_load_ps( ( float* )alpha2 ); \
    a3v.v = _mm_load_ps( ( float* )alpha3 ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
    _mm_store_ps( ( float* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
\
    a2v.v = _mm_load_ps( ( float* )alpha2 ); \
    a3v.v = _mm_load_ps( ( float* )alpha3 ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
    _mm_store_ps( ( float* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
\
    a2v.v = _mm_load_ps( ( float* )alpha2 ); \
    a3v.v = _mm_load_ps( ( float* )alpha3 ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
    _mm_store_ps( ( float* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
\
    a2v.v = _mm_load_ps( ( float* )alpha2 ); \
    a3v.v = _mm_load_ps( ( float* )alpha3 ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
    _mm_store_ps( ( float* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
\
    a2v.v = _mm_load_ps( ( float* )alpha2 ); \
    a3v.v = _mm_load_ps( ( float* )alpha3 ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
    _mm_store_ps( ( float* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
\
    a2v.v = _mm_load_ps( ( float* )alpha2 ); \
    a3v.v = _mm_load_ps( ( float* )alpha3 ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
    _mm_store_ps( ( float* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
\
    a2v.v = _mm_load_ps( ( float* )alpha2 ); \
    a3v.v = _mm_load_ps( ( float* )alpha3 ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
    _mm_store_ps( ( float* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
  } \
\
  for ( i = 0; i < n_iter2; ++i ) \
  { \
\
    a2v.v = _mm_load_ps( ( float* )alpha2 ); \
    a3v.v = _mm_load_ps( ( float* )alpha3 ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
    _mm_store_ps( ( float* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
  } \
\
  if ( n_left == 1 ) \
  { \
    float ga12 = *gamma12; \
    float si12 = *sigma12; \
    float ga23 = *gamma23; \
    float si23 = *sigma23; \
    scomplex temp1; \
    scomplex temp2; \
    scomplex temp3; \
\
    temp1 = *alpha1; \
    temp2 = *alpha2; \
\
    alpha1->real = temp1.real * ga12 + temp2.real * si12; \
    alpha2->real = temp2.real * ga12 - temp1.real * si12; \
\
    alpha1->imag = temp1.imag * ga12 + temp2.imag * si12; \
    alpha2->imag = temp2.imag * ga12 - temp1.imag * si12; \
\
    temp2 = *alpha2; \
    temp3 = *alpha3; \
\
    alpha2->real = temp2.real * ga23 + temp3.real * si23; \
    alpha3->real = temp3.real * ga23 - temp2.real * si23; \
\
    alpha2->imag = temp2.imag * ga23 + temp3.imag * si23; \
    alpha3->imag = temp3.imag * ga23 - temp2.imag * si23; \
  } \
}

#define MAC_Apply_G_mx3b_asz( m_A, \
                              gamma12, \
                              sigma12, \
                              gamma23, \
                              sigma23, \
                              a1, inc_a1, \
                              a2, inc_a2, \
                              a3, inc_a3 ) \
{\
  int       n_iter = m_A / 8; \
  int       n_left = m_A % 8; \
  int       i; \
\
  const int step_a1 = inc_a1 * 1; \
  const int step_a2 = inc_a2 * 1; \
  const int step_a3 = inc_a3 * 1; \
\
  dcomplex* restrict alpha1 = a1; \
  dcomplex* restrict alpha2 = a2; \
  dcomplex* restrict alpha3 = a3; \
\
  v2df_t a1v, a2v, a3v; \
  v2df_t g12v, s12v; \
  v2df_t g23v, s23v; \
  v2df_t t1v, t2v; \
\
  g12v.v = _mm_loaddup_pd( gamma12 ); \
  s12v.v = _mm_loaddup_pd( sigma12 ); \
  g23v.v = _mm_loaddup_pd( gamma23 ); \
  s23v.v = _mm_loaddup_pd( sigma23 ); \
\
  for ( i = 0; i < n_iter; ++i ) \
  { \
\
    a2v.v = _mm_load_pd( ( double* )alpha2 ); \
    a3v.v = _mm_load_pd( ( double* )alpha3 ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
\
    a2v.v = _mm_load_pd( ( double* )alpha2 ); \
    a3v.v = _mm_load_pd( ( double* )alpha3 ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
\
    a2v.v = _mm_load_pd( ( double* )alpha2 ); \
    a3v.v = _mm_load_pd( ( double* )alpha3 ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
\
    a2v.v = _mm_load_pd( ( double* )alpha2 ); \
    a3v.v = _mm_load_pd( ( double* )alpha3 ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
\
    a2v.v = _mm_load_pd( ( double* )alpha2 ); \
    a3v.v = _mm_load_pd( ( double* )alpha3 ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
\
    a2v.v = _mm_load_pd( ( double* )alpha2 ); \
    a3v.v = _mm_load_pd( ( double* )alpha3 ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
\
    a2v.v = _mm_load_pd( ( double* )alpha2 ); \
    a3v.v = _mm_load_pd( ( double* )alpha3 ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
\
    a2v.v = _mm_load_pd( ( double* )alpha2 ); \
    a3v.v = _mm_load_pd( ( double* )alpha3 ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
  } \
\
  for ( i = 0; i < n_left; ++i ) \
  { \
\
    a2v.v = _mm_load_pd( ( double* )alpha2 ); \
    a3v.v = _mm_load_pd( ( double* )alpha3 ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
  } \
}

#endif
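Each of the macros above performs the same update: a Givens rotation defined by ( *gamma23, *sigma23 ) is applied to columns a2 and a3, and then a rotation defined by ( *gamma12, *sigma12 ) is applied to columns a1 and a2, row by row over m_A elements. The SSE variants merely process two or four rows per vector operation and unroll that by a factor of eight; the scalar cleanup loops in the listing spell out the arithmetic. The sketch below restates that per-row update as a plain, self-contained C routine for the real double-precision case. It is only an illustration of the computation performed; the function name and the small driver are not part of libflame.

#include <stdio.h>

/* Illustrative scalar equivalent of the update performed by
   MAC_Apply_G_mx3b_asd: for each row i, apply the G23 rotation to
   ( a2[i], a3[i] ), then the G12 rotation to ( a1[i], a2[i] ),
   as in the n_left cleanup code of the _asd macro above.          */
void apply_G_mx3b_ref( int m_A,
                       double gamma12, double sigma12,
                       double gamma23, double sigma23,
                       double* a1, double* a2, double* a3 )
{
    int i;
    for ( i = 0; i < m_A; ++i )
    {
        double temp1, temp2, temp3;

        temp2 = a2[ i ];
        temp3 = a3[ i ];

        a2[ i ] = temp2 * gamma23 + temp3 * sigma23;  /* apply G23 */
        a3[ i ] = temp3 * gamma23 - temp2 * sigma23;

        temp1 = a1[ i ];
        temp2 = a2[ i ];

        a1[ i ] = temp1 * gamma12 + temp2 * sigma12;  /* apply G12 */
        a2[ i ] = temp2 * gamma12 - temp1 * sigma12;
    }
}

int main( void )
{
    /* Apply two 45-degree rotations to three 4-element columns. */
    double c = 0.7071067811865476, s = 0.7071067811865476;
    double a1[ 4 ] = {  1.0,  2.0,  3.0,  4.0 };
    double a2[ 4 ] = {  5.0,  6.0,  7.0,  8.0 };
    double a3[ 4 ] = {  9.0, 10.0, 11.0, 12.0 };
    int    i;

    apply_G_mx3b_ref( 4, c, s, c, s, a1, a2, a3 );

    for ( i = 0; i < 4; ++i )
        printf( "%8.4f %8.4f %8.4f\n", a1[ i ], a2[ i ], a3[ i ] );

    return 0;
}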
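The SSE branch also relies on several definitions that live elsewhere in libflame and are not shown in this header: the FLA_VECTOR_INTRINSIC_TYPE configuration macro with its FLA_NO_INTRINSICS / FLA_SSE_INTRINSICS values, the scomplex and dcomplex struct types with real and imag fields, and the v4sf_t / v2df_t vector types. The sketch below shows one plausible shape for those vector types, assuming they are unions over the __m128 / __m128d intrinsic types; this is an assumption for illustration, not the verbatim libflame definitions. Under that assumption, the element-wise *, + and - expressions on the .v members compile with GCC/Clang vector-extension arithmetic. Note also that the macros use aligned loads and stores (_mm_load_ps / _mm_store_ps and their _pd counterparts), so the column buffers passed as a1, a2 and a3 must be 16-byte aligned, and the gamma and sigma arguments are pointers to the rotation scalars.

#include <xmmintrin.h>   /* __m128,  _mm_load_ps, _mm_load1_ps, ...      */
#include <pmmintrin.h>   /* SSE3 _mm_loaddup_pd; pulls in __m128d (SSE2) */

/* Assumed shape of the vector types used above (illustrative only). */
typedef union
{
    __m128 v;       /* four packed single-precision values */
    float  f[ 4 ];
} v4sf_t;

typedef union
{
    __m128d v;      /* two packed double-precision values  */
    double  d[ 2 ];
} v2df_t;

With definitions of this kind in scope, a call such as MAC_Apply_G_mx3b_asd( m, &gamma12, &sigma12, &gamma23, &sigma23, a1, 1, a2, 1, a3, 1 ) would apply the two rotations to three contiguous, 16-byte-aligned double columns of length m; the inc_* arguments scale the pointer step taken between iterations.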