|
void | bl1_sdotsv3 (conj1_t conjxyw, int n, float *x, int inc_x, float *y, int inc_y, float *w, int inc_w, float *z, int inc_z, float *beta, float *rho_xz, float *rho_yz, float *rho_wz) |
|
void | bl1_ddotsv3 (conj1_t conjxyw, int n, double *x, int inc_x, double *y, int inc_y, double *w, int inc_w, double *z, int inc_z, double *beta, double *rho_xz, double *rho_yz, double *rho_wz) |
|
| if (inc_x !=1||inc_y !=1||inc_w !=1||inc_z !=1) |
|
| for (i=0;i< n_run;++i) |
|
| if (n_left > 0) |
|
void | bl1_cdotsv3 (conj1_t conjxyw, int n, scomplex *x, int inc_x, scomplex *y, int inc_y, scomplex *w, int inc_w, scomplex *z, int inc_z, scomplex *beta, scomplex *rho_xz, scomplex *rho_yz, scomplex *rho_wz) |
|
void | bl1_zdotsv3 (conj1_t conjxyw, int n, dcomplex *x, int inc_x, dcomplex *y, int inc_y, dcomplex *w, int inc_w, dcomplex *z, int inc_z, dcomplex *beta, dcomplex *rho_xz, dcomplex *rho_yz, dcomplex *rho_wz) |
|
| if (bl1_is_conj(conjxyw)) |
|
| bl1_zscals (beta, rho_yz) |
|
| bl1_zscals (beta, rho_wz) |
|
void bl1_ddotsv3 |
( |
conj1_t |
conjxyw, |
|
|
int |
n, |
|
|
double * |
x, |
|
|
int |
inc_x, |
|
|
double * |
y, |
|
|
int |
inc_y, |
|
|
double * |
w, |
|
|
int |
inc_w, |
|
|
double * |
z, |
|
|
int |
inc_z, |
|
|
double * |
beta, |
|
|
double * |
rho_xz, |
|
|
double * |
rho_yz, |
|
|
double * |
rho_wz |
|
) |
| |
62 v2df_t rho1v, rho2v, rho3v;
72 if ( ( unsigned long ) z % 16 != 0 )
74 if ( ( unsigned long ) x % 16 == 0 ||
75 ( unsigned long ) y % 16 == 0 ||
76 ( unsigned long ) w % 16 == 0 )
bl1_abort();
110 rho1v.v = _mm_setzero_pd();
111 rho2v.v = _mm_setzero_pd();
112 rho3v.v = _mm_setzero_pd();
116 x1v.v = _mm_load_pd( ( double* )x1 );
117 y1v.v = _mm_load_pd( ( double* )y1 );
118 w1v.v = _mm_load_pd( ( double* )w1 );
119 z1v.v = _mm_load_pd( ( double* )z1 );
121 rho1v.v += x1v.v * z1v.v;
122 rho2v.v += y1v.v * z1v.v;
123 rho3v.v += w1v.v * z1v.v;
125 x2v.v = _mm_load_pd( ( double* )( x1 + 2 ) );
126 y2v.v = _mm_load_pd( ( double* )( y1 + 2 ) );
127 w2v.v = _mm_load_pd( ( double* )( w1 + 2 ) );
128 z2v.v = _mm_load_pd( ( double* )( z1 + 2 ) );
130 rho1v.v += x2v.v * z2v.v;
131 rho2v.v += y2v.v * z2v.v;
132 rho3v.v += w2v.v * z2v.v;
140 rho1 += rho1v.d[0] + rho1v.d[1];
141 rho2 += rho2v.d[0] + rho2v.d[1];
142 rho3 += rho3v.d[0] + rho3v.d[1];
double *restrict z1
Definition: bl1_dotsv3.c:173
int n_left
Definition: bl1_dotsv3.c:181
* rho_wz
Definition: bl1_dotsv3.c:270
int n_pre
Definition: bl1_dotsv3.c:179
double *restrict y1
Definition: bl1_dotsv3.c:169
double *restrict w1
Definition: bl1_dotsv3.c:172
double z1c
Definition: bl1_dotsv3.c:175
int n_run
Definition: bl1_dotsv3.c:180
double rho1
Definition: bl1_dotsv3.c:174
double rho3
Definition: bl1_dotsv3.c:174
double y1c
Definition: bl1_dotsv3.c:175
double rho2
Definition: bl1_dotsv3.c:174
* rho_xz
Definition: bl1_dotsv3.c:268
x1
Definition: bl1_dotsv3.c:452
int i
Definition: bl1_dotsv3.c:177
double x1c
Definition: bl1_dotsv3.c:175
* rho_yz
Definition: bl1_dotsv3.c:269
double w1c
Definition: bl1_dotsv3.c:175
v2df_t
Definition: blis_type_defs.h:117
double d[2]
Definition: blis_type_defs.h:119
__m128d v
Definition: blis_type_defs.h:118
References bl1_abort(), v2df_t::d, i, n_left, n_pre, n_run, rho1, rho2, rho3, rho_wz, rho_xz, rho_yz, v2df_t::v, w1, w1c, x1, x1c, y1, y1c, z1, and z1c.
void bl1_zdotsv3 |
( |
conj1_t |
conjxyw, |
|
|
int |
n, |
|
|
dcomplex * |
x, |
|
|
int |
inc_x, |
|
|
dcomplex * |
y, |
|
|
int |
inc_y, |
|
|
dcomplex * |
w, |
|
|
int |
inc_w, |
|
|
dcomplex * |
z, |
|
|
int |
inc_z, |
|
|
dcomplex * |
beta, |
|
|
dcomplex * |
rho_xz, |
|
|
dcomplex * |
rho_yz, |
|
|
dcomplex * |
rho_wz |
|
) |
| |
320 rho1v.v = _mm_setzero_pd();
321 rho2v.v = _mm_setzero_pd();
322 rho3v.v = _mm_setzero_pd();
328 for ( i = 0; i < n; ++i )
330 z11v.v = _mm_loaddup_pd( ( double* )&( z1->real ) );
331 z12v.v = _mm_loaddup_pd( ( double* )&( z1->imag ) );
333 x1v.v = _mm_load_pd( ( double* )x1 );
334 x1rv.v = _mm_shuffle_pd( x1v.v, x1v.v, _MM_SHUFFLE2 (0,1) );
335 bcac.v = x1rv.v * z11v.v;
336 adbd.v = x1v.v * z12v.v;
337 rho1v.v = rho1v.v + _mm_addsub_pd( bcac.v, adbd.v );
339 y1v.v = _mm_load_pd( ( double* )y1 );
340 y1rv.v = _mm_shuffle_pd( y1v.v, y1v.v, _MM_SHUFFLE2 (0,1) );
341 bcac.v = y1rv.v * z11v.v;
342 adbd.v = y1v.v * z12v.v;
343 rho2v.v = rho2v.v + _mm_addsub_pd( bcac.v, adbd.v );
345 w1v.v = _mm_load_pd( ( double* )w1 );
346 w1rv.v = _mm_shuffle_pd( w1v.v, w1v.v, _MM_SHUFFLE2 (0,1) );
347 bcac.v = w1rv.v * z11v.v;
348 adbd.v = w1v.v * z12v.v;
349 rho3v.v = rho3v.v + _mm_addsub_pd( bcac.v, adbd.v );
357 rho1v.v = _mm_shuffle_pd( rho1v.v, rho1v.v, _MM_SHUFFLE2 (0,1) );
358 rho2v.v = _mm_shuffle_pd( rho2v.v, rho2v.v, _MM_SHUFFLE2 (0,1) );
359 rho3v.v = _mm_shuffle_pd( rho3v.v, rho3v.v, _MM_SHUFFLE2 (0,1) );
361 rho1v.d[1] = -rho1v.d[1];
362 rho2v.d[1] = -rho2v.d[1];
363 rho3v.d[1] = -rho3v.d[1];
369 for ( i = 0; i < n; ++i )
371 z11v.v = _mm_loaddup_pd( ( double* )&( z1->real ) );
372 z12v.v = _mm_loaddup_pd( ( double* )&( z1->imag ) );
374 x1v.v = _mm_load_pd( ( double* )x1 );
375 x1rv.v = _mm_shuffle_pd( x1v.v, x1v.v, _MM_SHUFFLE2 (0,1) );
376 cada.v = x1v.v * z11v.v;
377 dbcb.v = x1rv.v * z12v.v;
378 rho1v.v = rho1v.v + _mm_addsub_pd( cada.v, dbcb.v );
380 y1v.v = _mm_load_pd( ( double* )y1 );
381 y1rv.v = _mm_shuffle_pd( y1v.v, y1v.v, _MM_SHUFFLE2 (0,1) );
382 cada.v = y1v.v * z11v.v;
383 dbcb.v = y1rv.v * z12v.v;
384 rho2v.v = rho2v.v + _mm_addsub_pd( cada.v, dbcb.v );
386 w1v.v = _mm_load_pd( ( double* )w1 );
387 w1rv.v = _mm_shuffle_pd( w1v.v, w1v.v, _MM_SHUFFLE2 (0,1) );
388 cada.v = w1v.v * z11v.v;
389 dbcb.v = w1rv.v * z12v.v;
390 rho3v.v = rho3v.v + _mm_addsub_pd( cada.v, dbcb.v );
403 v2df_t ab, ba, cc, dd, acbc, bdad;
405 ab.v = _mm_load_pd( ( double* )beta );
406 ba.v = _mm_shuffle_pd( ab.v, ab.v, _MM_SHUFFLE2 (0,1) );
408 cc.v = _mm_loaddup_pd( ( double* )&( rho_xz->real ) );
409 dd.v = _mm_loaddup_pd( ( double* )&( rho_xz->imag ) );
410 acbc.v = ab.v * cc.v;
411 bdad.v = ba.v * dd.v;
412 r1v.v = _mm_addsub_pd( acbc.v, bdad.v );
414 cc.v = _mm_loaddup_pd( ( double* )&( rho_yz->real ) );
415 dd.v = _mm_loaddup_pd( ( double* )&( rho_yz->imag ) );
416 acbc.v = ab.v * cc.v;
417 bdad.v = ba.v * dd.v;
418 r2v.v = _mm_addsub_pd( acbc.v, bdad.v );
420 cc.v = _mm_loaddup_pd( ( double* )&( rho_wz->real ) );
421 dd.v = _mm_loaddup_pd( ( double* )&( rho_wz->imag ) );
422 acbc.v = ab.v * cc.v;
423 bdad.v = ba.v * dd.v;
424 r3v.v = _mm_addsub_pd( acbc.v, bdad.v );
429 rho1v.v = r1v.v + rho1v.v;
430 _mm_store_pd( ( double* )rho_xz, rho1v.v );
434 rho2v.v = r2v.v + rho2v.v;
435 _mm_store_pd( ( double* )rho_yz, rho2v.v );
439 rho3v.v = r3v.v + rho3v.v;
440 _mm_store_pd( ( double* )rho_wz, rho3v.v );
int bl1_is_conj(conj1_t conj)
Definition: bl1_is.c:42
Definition: blis_type_defs.h:138
References bl1_is_conj(), v2df_t::d, i, rho_wz, rho_xz, rho_yz, v2df_t::v, w1, x1, y1, and z1.
Referenced by FLA_Fused_Uhu_Yhu_Zhu_opz_var1().
double x2c
Definition: bl1_dotsv3.c:176
double z2c
Definition: bl1_dotsv3.c:176
double w2c
Definition: bl1_dotsv3.c:176
double y2c
Definition: bl1_dotsv3.c:176
References rho1, rho2, rho3, w1, w1c, w2c, x1, x1c, x2c, y1, y1c, y2c, z1, z1c, and z2c.