libflame
/*
   libflame
   An object-based infrastructure for developing high-performance
   dense linear algebra libraries.

   Copyright (C) 2011, The University of Texas

   libflame is free software; you can redistribute it and/or modify
   it under the terms of the GNU Lesser General Public License as
   published by the Free Software Foundation; either version 2.1 of
   the License, or (at your option) any later version.

   libflame is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with libflame; if you did not receive a copy, see
   http://www.gnu.org/licenses/.

   For more information, please contact us at flame@cs.utexas.edu or
   send mail to:

   Field G. Van Zee and/or
   Robert A. van de Geijn
   The University of Texas at Austin
   Department of Computer Sciences
   1 University Station C0500
   Austin TX 78712
*/


#if FLA_VECTOR_INTRINSIC_TYPE == FLA_NO_INTRINSICS

#define MAC_Apply_G_mx3_ass MAC_Apply_G_mx3_ops
#define MAC_Apply_G_mx3_asd MAC_Apply_G_mx3_opd
#define MAC_Apply_G_mx3_asc MAC_Apply_G_mx3_opc
#define MAC_Apply_G_mx3_asz MAC_Apply_G_mx3_opz

#elif FLA_VECTOR_INTRINSIC_TYPE == FLA_SSE_INTRINSICS

#define MAC_Apply_G_mx3_ass( m_A, \
                             gamma12, \
                             sigma12, \
                             gamma23, \
                             sigma23, \
                             a1, inc_a1, \
                             a2, inc_a2, \
                             a3, inc_a3 ) \
{\
    int n_iter32 = m_A / ( 4 * 8 ); \
    int n_left32 = m_A % ( 4 * 8 ); \
    int n_iter4  = n_left32 / ( 4 * 1 ); \
    int n_left   = n_left32 % ( 4 * 1 ); \
    int i; \
\
    const int step_a1 = inc_a1 * 4; \
    const int step_a2 = inc_a1 * 4; \
    const int step_a3 = inc_a1 * 4; \
\
    float* restrict alpha1 = a1; \
    float* restrict alpha2 = a2; \
    float* restrict alpha3 = a3; \
\
    v4sf_t a1v, a2v, a3v; \
    v4sf_t g12v, s12v; \
    v4sf_t g23v, s23v; \
    v4sf_t t1v, t2v; \
\
    g12v.v = _mm_load1_ps( gamma12 ); \
    s12v.v = _mm_load1_ps( sigma12 ); \
    g23v.v = _mm_load1_ps( gamma23 ); \
    s23v.v = _mm_load1_ps( sigma23 ); \
\
    for ( i = 0; i < n_iter32; ++i ) \
    { \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
    } \
\
    for ( i = 0; i < n_iter4; ++i ) \
    { \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
    } \
\
    for ( i = 0; i < n_left; ++i ) \
    { \
        float ga12 = *gamma12; \
        float si12 = *sigma12; \
        float ga23 = *gamma23; \
        float si23 = *sigma23; \
        float temp1; \
        float temp2; \
        float temp3; \
\
        temp1 = *alpha1; \
        temp2 = *alpha2; \
\
        *alpha1 = temp1 * ga12 + temp2 * si12; \
        *alpha2 = temp2 * ga12 - temp1 * si12; \
\
        temp2 = *alpha2; \
        temp3 = *alpha3; \
\
        *alpha2 = temp2 * ga23 + temp3 * si23; \
        *alpha3 = temp3 * ga23 - temp2 * si23; \
\
        alpha1 += 1; \
        alpha2 += 1; \
        alpha3 += 1; \
    } \
}

#define MAC_Apply_G_mx3_asd( m_A, \
                             gamma12, \
                             sigma12, \
                             gamma23, \
                             sigma23, \
                             a1, inc_a1, \
                             a2, inc_a2, \
                             a3, inc_a3 ) \
{\
    int n_iter16 = m_A / ( 2 * 8 ); \
    int n_left16 = m_A % ( 2 * 8 ); \
    int n_iter2  = n_left16 / ( 2 * 1 ); \
    int n_left   = n_left16 % ( 2 * 1 ); \
    int i; \
\
    const int step_a1 = inc_a1 * 2; \
    const int step_a2 = inc_a1 * 2; \
    const int step_a3 = inc_a1 * 2; \
\
    double* restrict alpha1 = a1; \
    double* restrict alpha2 = a2; \
    double* restrict alpha3 = a3; \
\
    v2df_t a1v, a2v, a3v; \
    v2df_t g12v, s12v; \
    v2df_t g23v, s23v; \
    v2df_t t1v, t2v; \
\
    g12v.v = _mm_loaddup_pd( gamma12 ); \
    s12v.v = _mm_loaddup_pd( sigma12 ); \
    g23v.v = _mm_loaddup_pd( gamma23 ); \
    s23v.v = _mm_loaddup_pd( sigma23 ); \
\
    for ( i = 0; i < n_iter16; ++i ) \
    { \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
    } \
\
    for ( i = 0; i < n_iter2; ++i ) \
    { \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
    } \
\
    if ( n_left == 1 ) \
    { \
        double ga12 = *gamma12; \
        double si12 = *sigma12; \
        double ga23 = *gamma23; \
        double si23 = *sigma23; \
        double temp1; \
        double temp2; \
        double temp3; \
\
        temp1 = *alpha1; \
        temp2 = *alpha2; \
\
        *alpha1 = temp1 * ga12 + temp2 * si12; \
        *alpha2 = temp2 * ga12 - temp1 * si12; \
\
        temp2 = *alpha2; \
        temp3 = *alpha3; \
\
        *alpha2 = temp2 * ga23 + temp3 * si23; \
        *alpha3 = temp3 * ga23 - temp2 * si23; \
    } \
}

#define MAC_Apply_G_mx3_asc( m_A, \
                             gamma12, \
                             sigma12, \
                             gamma23, \
                             sigma23, \
                             a1, inc_a1, \
                             a2, inc_a2, \
                             a3, inc_a3 ) \
{ \
    int n_iter16 = m_A / ( 2 * 8 ); \
    int n_left16 = m_A % ( 2 * 8 ); \
    int n_iter2  = n_left16 / ( 2 * 1 ); \
    int n_left   = n_left16 % ( 2 * 1 ); \
    int i; \
\
    const int step_a1 = inc_a1 * 2; \
    const int step_a2 = inc_a1 * 2; \
    const int step_a3 = inc_a1 * 2; \
\
    scomplex* restrict alpha1 = a1; \
    scomplex* restrict alpha2 = a2; \
    scomplex* restrict alpha3 = a3; \
\
    v4sf_t a1v, a2v, a3v; \
    v4sf_t g12v, s12v; \
    v4sf_t g23v, s23v; \
    v4sf_t t1v, t2v; \
\
    g12v.v = _mm_load1_ps( gamma12 ); \
    s12v.v = _mm_load1_ps( sigma12 ); \
    g23v.v = _mm_load1_ps( gamma23 ); \
    s23v.v = _mm_load1_ps( sigma23 ); \
\
    for ( i = 0; i < n_iter16; ++i ) \
    { \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
    } \
\
    for ( i = 0; i < n_iter2; ++i ) \
    { \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
    } \
\
    if ( n_left == 1 ) \
    { \
        float ga12 = *gamma12; \
        float si12 = *sigma12; \
        float ga23 = *gamma23; \
        float si23 = *sigma23; \
        scomplex temp1; \
        scomplex temp2; \
        scomplex temp3; \
\
        temp1 = *alpha1; \
        temp2 = *alpha2; \
\
        alpha1->real = temp1.real * ga12 + temp2.real * si12; \
        alpha2->real = temp2.real * ga12 - temp1.real * si12; \
\
        alpha1->imag = temp1.imag * ga12 + temp2.imag * si12; \
        alpha2->imag = temp2.imag * ga12 - temp1.imag * si12; \
\
        temp2 = *alpha2; \
        temp3 = *alpha3; \
\
        alpha2->real = temp2.real * ga23 + temp3.real * si23; \
        alpha3->real = temp3.real * ga23 - temp2.real * si23; \
\
        alpha2->imag = temp2.imag * ga23 + temp3.imag * si23; \
        alpha3->imag = temp3.imag * ga23 - temp2.imag * si23; \
    } \
}

#define MAC_Apply_G_mx3_asz( m_A, \
                             gamma12, \
                             sigma12, \
                             gamma23, \
                             sigma23, \
                             a1, inc_a1, \
                             a2, inc_a2, \
                             a3, inc_a3 ) \
{\
    int n_iter = m_A / 8; \
    int n_left = m_A % 8; \
    int i; \
\
    const int step_a1 = inc_a1 * 1; \
    const int step_a2 = inc_a1 * 1; \
    const int step_a3 = inc_a1 * 1; \
\
    dcomplex* restrict alpha1 = a1; \
    dcomplex* restrict alpha2 = a2; \
    dcomplex* restrict alpha3 = a3; \
\
    v2df_t a1v, a2v, a3v; \
    v2df_t g12v, s12v; \
    v2df_t g23v, s23v; \
    v2df_t t1v, t2v; \
\
    g12v.v = _mm_loaddup_pd( gamma12 ); \
    s12v.v = _mm_loaddup_pd( sigma12 ); \
    g23v.v = _mm_loaddup_pd( gamma23 ); \
    s23v.v = _mm_loaddup_pd( sigma23 ); \
\
    for ( i = 0; i < n_iter; ++i ) \
    { \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha2 += step_a2; \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha2 += step_a2; \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha2 += step_a2; \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha2 += step_a2; \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha2 += step_a2; \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha2 += step_a2; \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha2 += step_a2; \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha2 += step_a2; \
        alpha3 += step_a3; \
    } \
\
    for ( i = 0; i < n_left; ++i ) \
    { \
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
    } \
}

#endif
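
The four macros above differ only in element type and vector width; each applies the same pair of Givens rotations to three columns, with gamma and sigma playing the roles of the cosine and sine of each rotation angle: a rotation in the (1,2) plane to columns a1 and a2, followed by a rotation in the (2,3) plane to the updated a2 and a3, exactly as in the scalar cleanup loops. The standalone sketch below restates that scalar update in plain C for the real double-precision case. It is only an illustration under those assumptions; the routine name apply_g_mx3_ref and the sample data in main are hypothetical and not part of libflame.

#include <stdio.h>

/* Hypothetical reference routine: applies the same two Givens rotations
   that the vectorized macros apply, but one element at a time.        */
static void apply_g_mx3_ref( int m_A,
                             double gamma12, double sigma12,
                             double gamma23, double sigma23,
                             double* a1, double* a2, double* a3 )
{
    for ( int i = 0; i < m_A; ++i )
    {
        double temp1 = a1[ i ];
        double temp2 = a2[ i ];
        double temp3;

        /* Rotation in the (1,2) plane applied to columns a1 and a2. */
        a1[ i ] = temp1 * gamma12 + temp2 * sigma12;
        a2[ i ] = temp2 * gamma12 - temp1 * sigma12;

        /* Rotation in the (2,3) plane applied to the updated a2 and a3. */
        temp2 = a2[ i ];
        temp3 = a3[ i ];

        a2[ i ] = temp2 * gamma23 + temp3 * sigma23;
        a3[ i ] = temp3 * gamma23 - temp2 * sigma23;
    }
}

int main( void )
{
    double a1[4] = { 1.0, 2.0,  3.0,  4.0 };
    double a2[4] = { 5.0, 6.0,  7.0,  8.0 };
    double a3[4] = { 9.0, 10.0, 11.0, 12.0 };

    /* Example rotation values chosen so that gamma^2 + sigma^2 = 1. */
    apply_g_mx3_ref( 4, 0.8, 0.6, 0.6, 0.8, a1, a2, a3 );

    for ( int i = 0; i < 4; ++i )
        printf( "%f %f %f\n", a1[ i ], a2[ i ], a3[ i ] );

    return 0;
}

This is the computation that MAC_Apply_G_mx3_asd performs two elements at a time with SSE, MAC_Apply_G_mx3_ass four elements at a time, and the complex variants once per real and imaginary pair.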