libflame
/*
   libflame
   An object-based infrastructure for developing high-performance
   dense linear algebra libraries.

   Copyright (C) 2011, The University of Texas

   libflame is free software; you can redistribute it and/or modify
   it under the terms of the GNU Lesser General Public License as
   published by the Free Software Foundation; either version 2.1 of
   the License, or (at your option) any later version.

   libflame is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with libflame; if you did not receive a copy, see
   http://www.gnu.org/licenses/.

   For more information, please contact us at flame@cs.utexas.edu or
   send mail to:

   Field G. Van Zee and/or
   Robert A. van de Geijn
   The University of Texas at Austin
   Department of Computer Sciences
   1 University Station C0500
   Austin TX 78712
*/


/*
   The MAC_Apply_G_mx2_as? macros apply a single Givens rotation, given by
   the scalars gamma12 and sigma12, to a pair of m_A-element vectors a1 and
   a2:

     a1 := gamma12 * a1 + sigma12 * a2
     a2 := gamma12 * a2 - sigma12 * a1

   When vector intrinsics are disabled, the macros alias the portable C
   (op?) variants; otherwise SSE kernels are defined below for single (s),
   double (d), single complex (c), and double complex (z) data.
*/

#if FLA_VECTOR_INTRINSIC_TYPE == FLA_NO_INTRINSICS

#define MAC_Apply_G_mx2_ass MAC_Apply_G_mx2_ops
#define MAC_Apply_G_mx2_asd MAC_Apply_G_mx2_opd
#define MAC_Apply_G_mx2_asc MAC_Apply_G_mx2_opc
#define MAC_Apply_G_mx2_asz MAC_Apply_G_mx2_opz

#elif FLA_VECTOR_INTRINSIC_TYPE == FLA_SSE_INTRINSICS

/* Single-precision real kernel: 4 floats per SSE register, unrolled 8x. */
#define MAC_Apply_G_mx2_ass( m_A, \
                             gamma12, \
                             sigma12, \
                             a1, inc_a1, \
                             a2, inc_a2 ) \
{\
    int n_iter32 = m_A / ( 4 * 8 ); \
    int n_left32 = m_A % ( 4 * 8 ); \
    int n_iter4  = n_left32 / ( 4 * 1 ); \
    int n_left   = n_left32 % ( 4 * 1 ); \
    int i; \
\
    const int step_a1 = inc_a1 * 4; \
    const int step_a2 = inc_a2 * 4; \
\
    float* restrict alpha1 = a1; \
    float* restrict alpha2 = a2; \
\
    v4sf_t a1v, a2v; \
    v4sf_t g12v, s12v; \
    v4sf_t t1v; \
\
    g12v.v = _mm_load1_ps( gamma12 ); \
    s12v.v = _mm_load1_ps( sigma12 ); \
\
    for ( i = 0; i < n_iter32; ++i ) \
    { \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
    } \
\
    for ( i = 0; i < n_iter4; ++i ) \
    { \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
    } \
\
    for ( i = 0; i < n_left; ++i ) \
    { \
        float ga12 = *gamma12; \
        float si12 = *sigma12; \
        float temp1; \
        float temp2; \
\
        temp1 = *alpha1; \
        temp2 = *alpha2; \
\
        *alpha1 = temp1 * ga12 + temp2 * si12; \
        *alpha2 = temp2 * ga12 - temp1 * si12; \
\
        alpha1 += 1; \
        alpha2 += 1; \
    } \
}

/* Double-precision real kernel: 2 doubles per SSE register, unrolled 8x. */
#define MAC_Apply_G_mx2_asd( m_A, \
                             gamma12, \
                             sigma12, \
                             a1, inc_a1, \
                             a2, inc_a2 ) \
{\
    int n_iter16 = m_A / ( 2 * 8 ); \
    int n_left16 = m_A % ( 2 * 8 ); \
    int n_iter2  = n_left16 / ( 2 * 1 ); \
    int n_left   = n_left16 % ( 2 * 1 ); \
    int i; \
\
    const int step_a1 = inc_a1 * 2; \
    const int step_a2 = inc_a2 * 2; \
\
    double* restrict alpha1 = a1; \
    double* restrict alpha2 = a2; \
\
    v2df_t a1v, a2v; \
    v2df_t g12v, s12v; \
    v2df_t t1v; \
\
    g12v.v = _mm_loaddup_pd( gamma12 ); \
    s12v.v = _mm_loaddup_pd( sigma12 ); \
\
    for ( i = 0; i < n_iter16; ++i ) \
    { \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
    } \
\
    for ( i = 0; i < n_iter2; ++i ) \
    { \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
    } \
\
    if ( n_left == 1 ) \
    { \
        double ga12 = *gamma12; \
        double si12 = *sigma12; \
        double temp1; \
        double temp2; \
\
        temp1 = *alpha1; \
        temp2 = *alpha2; \
\
        *alpha1 = temp1 * ga12 + temp2 * si12; \
        *alpha2 = temp2 * ga12 - temp1 * si12; \
    } \
}

/* Single-precision complex kernel: 2 scomplex elements per SSE register. */
#define MAC_Apply_G_mx2_asc( m_A, \
                             gamma12, \
                             sigma12, \
                             a1, inc_a1, \
                             a2, inc_a2 ) \
{\
    int n_iter16 = m_A / ( 2 * 8 ); \
    int n_left16 = m_A % ( 2 * 8 ); \
    int n_iter2  = n_left16 / ( 2 * 1 ); \
    int n_left   = n_left16 % ( 2 * 1 ); \
    int i; \
\
    const int step_a1 = inc_a1 * 2; \
    const int step_a2 = inc_a2 * 2; \
\
    scomplex* restrict alpha1 = a1; \
    scomplex* restrict alpha2 = a2; \
\
    v4sf_t a1v, a2v; \
    v4sf_t g12v, s12v; \
    v4sf_t t1v; \
\
    g12v.v = _mm_load1_ps( gamma12 ); \
    s12v.v = _mm_load1_ps( sigma12 ); \
\
    for ( i = 0; i < n_iter16; ++i ) \
    { \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
    } \
\
    for ( i = 0; i < n_iter2; ++i ) \
    { \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
    } \
\
    if ( n_left == 1 ) \
    { \
        float ga12 = *gamma12; \
        float si12 = *sigma12; \
        scomplex temp1; \
        scomplex temp2; \
\
        temp1 = *alpha1; \
        temp2 = *alpha2; \
\
        alpha1->real = temp1.real * ga12 + temp2.real * si12; \
        alpha2->real = temp2.real * ga12 - temp1.real * si12; \
\
        alpha1->imag = temp1.imag * ga12 + temp2.imag * si12; \
        alpha2->imag = temp2.imag * ga12 - temp1.imag * si12; \
    } \
}

/* Double-precision complex kernel: 1 dcomplex element per SSE register, unrolled 8x. */
#define MAC_Apply_G_mx2_asz( m_A, \
                             gamma12, \
                             sigma12, \
                             a1, inc_a1, \
                             a2, inc_a2 ) \
{\
    int n_iter = m_A / 8; \
    int n_left = m_A % 8; \
    int i; \
\
    const int step_a1 = inc_a1 * 1; \
    const int step_a2 = inc_a2 * 1; \
\
    dcomplex* restrict alpha1 = a1; \
    dcomplex* restrict alpha2 = a2; \
\
    v2df_t a1v, a2v; \
    v2df_t g12v, s12v; \
    v2df_t t1v; \
\
    g12v.v = _mm_loaddup_pd( gamma12 ); \
    s12v.v = _mm_loaddup_pd( sigma12 ); \
\
    for ( i = 0; i < n_iter; ++i ) \
    { \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
    } \
\
    for ( i = 0; i < n_left; ++i ) \
    { \
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
    } \
}

#endif
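
For reference, each kernel above computes the same elementwise Givens-rotation update, a1 := gamma12*a1 + sigma12*a2 and a2 := gamma12*a2 - sigma12*a1, only vectorized and unrolled. The standalone sketch below expresses that update in portable scalar C for the double-precision real case. It is illustrative only: the function name apply_givens_mx2_d is hypothetical and not part of libflame, and the sketch assumes unit element stride and ignores the 16-byte alignment that the aligned SSE loads and stores above require.

#include <stdio.h>

/* Hypothetical scalar sketch of the update performed by MAC_Apply_G_mx2_asd:
   a1 := gamma*a1 + sigma*a2,  a2 := gamma*a2 - sigma*a1, elementwise. */
static void apply_givens_mx2_d( int m, double gamma, double sigma,
                                double* a1, double* a2 )
{
    int i;
    for ( i = 0; i < m; ++i )
    {
        double temp1 = a1[ i ];
        double temp2 = a2[ i ];
        a1[ i ] = temp1 * gamma + temp2 * sigma;
        a2[ i ] = temp2 * gamma - temp1 * sigma;
    }
}

int main( void )
{
    /* gamma = cos(theta), sigma = sin(theta) for some rotation angle theta. */
    double gamma = 0.8, sigma = 0.6;
    double a1[ 4 ] = { 1.0, 2.0, 3.0, 4.0 };
    double a2[ 4 ] = { 5.0, 6.0, 7.0, 8.0 };
    int    i;

    apply_givens_mx2_d( 4, gamma, sigma, a1, a2 );

    for ( i = 0; i < 4; ++i )
        printf( "%f  %f\n", a1[ i ], a2[ i ] );

    return 0;
}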