libflame  revision_anchor
Functions | Variables
bl1_axpyv2bdotaxpy.c File Reference

(r)

Functions

void bl1_saxpyv2bdotaxpy (int n, float *beta, float *u, int inc_u, float *gamma, float *z, int inc_z, float *a, int inc_a, float *x, int inc_x, float *kappa, float *rho, float *w, int inc_w)
 
void bl1_daxpyv2bdotaxpy (int n, double *beta, double *u, int inc_u, double *gamma, double *z, int inc_z, double *a, int inc_a, double *x, int inc_x, double *kappa, double *rho, double *w, int inc_w)
 
 if (n_pre==1)
 
 for (i=0;i< n_run;++i)
 
 if (n_left > 0)
 
void bl1_caxpyv2bdotaxpy (int n, scomplex *beta, scomplex *u, int inc_u, scomplex *gamma, scomplex *z, int inc_z, scomplex *a, int inc_a, scomplex *x, int inc_x, scomplex *kappa, scomplex *rho, scomplex *w, int inc_w)
 
void bl1_zaxpyv2bdotaxpy (int n, dcomplex *beta, dcomplex *u, int inc_u, dcomplex *gamma, dcomplex *z, int inc_z, dcomplex *a, int inc_a, dcomplex *x, int inc_x, dcomplex *kappa, dcomplex *rho, dcomplex *w, int inc_w)
 

Variables

double *restrict zeta1
 
double *restrict alpha1 = a
 
double *restrict chi1 = x
 
double *restrict omega1 = w
 
double beta_c = *beta
 
double gamma_c = *gamma
 
double kappa_c = *kappa
 
double rho_c = 0.0
 
int i
 
int n_pre = 0
 
int n_run = ( n - n_pre ) / 2
 
int n_left = ( n - n_pre ) % 2
 
 upsilon1 = u
 
rho = rho_c
 
rho_c real = 0.0
 
rho_c imag = 0.0
 

Function Documentation

◆ bl1_caxpyv2bdotaxpy()

void bl1_caxpyv2bdotaxpy ( int  n,
scomplex beta,
scomplex u,
int  inc_u,
scomplex gamma,
scomplex z,
int  inc_z,
scomplex a,
int  inc_a,
scomplex x,
int  inc_x,
scomplex kappa,
scomplex rho,
scomplex w,
int  inc_w 
)
337 {
338  bl1_abort();
339 }
void bl1_abort(void)
Definition: bl1_abort.c:13

References bl1_abort().

◆ bl1_daxpyv2bdotaxpy()

void bl1_daxpyv2bdotaxpy ( int  n,
double *  beta,
double *  u,
int  inc_u,
double *  gamma,
double *  z,
int  inc_z,
double *  a,
int  inc_a,
double *  x,
int  inc_x,
double *  kappa,
double *  rho,
double *  w,
int  inc_w 
)
47 {
48  double* restrict upsilon1;
49  double* restrict zeta1;
50  double* restrict alpha1;
51  double* restrict chi1;
52  double* restrict omega1;
53  double rho_c;
54  int i;
55  v2df_t b1v, g1v, k1v;
56  v2df_t rhov;
57  v2df_t u1v, z1v, a1v;
58  v2df_t u2v, z2v, a2v;
59  v2df_t x1v, w1v;
60  v2df_t x2v, w2v;
61 
62  int n_pre;
63  int n_run;
64  int n_left;
65 
66  n_pre = 0;
67  if ( ( unsigned long ) a % 16 != 0 )
68  {
69  if ( ( unsigned long ) u % 16 == 0 ||
70  ( unsigned long ) z % 16 == 0 ||
71  ( unsigned long ) x % 16 == 0 ||
72  ( unsigned long ) w % 16 == 0 ) bl1_abort();
73 
74  n_pre = 1;
75  }
76 
77  n_run = ( n - n_pre ) / 4;
78  n_left = ( n - n_pre ) % 4;
79 
80  upsilon1 = u;
81  zeta1 = z;
82  alpha1 = a;
83  chi1 = x;
84  omega1 = w;
85 
86 
87  rho_c = 0.0;
88 
89  if ( n_pre == 1 )
90  {
91  double beta_c = *beta;
92  double gamma_c = *gamma;
93  double kappa_c = *kappa;
94 
95  double upsilon1_c = *upsilon1;
96  double zeta1_c = *zeta1;
97  double alpha1_c = *alpha1;
98  double chi1_c = *chi1;
99  double omega1_c = *omega1;
100 
101  alpha1_c += beta_c * upsilon1_c + gamma_c * zeta1_c;
102  rho_c += alpha1_c * chi1_c;
103  omega1_c += kappa_c * alpha1_c;
104 
105  *alpha1 = alpha1_c;
106  *omega1 = omega1_c;
107 
108  upsilon1 += inc_u;
109  zeta1 += inc_z;
110  alpha1 += inc_a;
111  chi1 += inc_x;
112  omega1 += inc_w;
113  }
114 
115  b1v.v = _mm_loaddup_pd( ( double* )beta );
116  g1v.v = _mm_loaddup_pd( ( double* )gamma );
117  k1v.v = _mm_loaddup_pd( ( double* )kappa );
118 
119  rhov.v = _mm_setzero_pd();
120 
121  for ( i = 0; i < n_run; ++i )
122  {
123  u1v.v = _mm_load_pd( ( double* )upsilon1 );
124  z1v.v = _mm_load_pd( ( double* )zeta1 );
125  a1v.v = _mm_load_pd( ( double* )alpha1 );
126 
127  a1v.v += b1v.v * u1v.v + g1v.v * z1v.v;
128 
129  u2v.v = _mm_load_pd( ( double* )(upsilon1 + 2) );
130  z2v.v = _mm_load_pd( ( double* )(zeta1 + 2) );
131  a2v.v = _mm_load_pd( ( double* )(alpha1 + 2) );
132 
133  a2v.v += b1v.v * u2v.v + g1v.v * z2v.v;
134 
135  x1v.v = _mm_load_pd( ( double* )chi1 );
136  x2v.v = _mm_load_pd( ( double* )(chi1 + 2) );
137 
138  w1v.v = _mm_load_pd( ( double* )omega1 );
139  w2v.v = _mm_load_pd( ( double* )(omega1 + 2) );
140 
141  rhov.v += a1v.v * x1v.v;
142  rhov.v += a2v.v * x2v.v;
143 
144  w1v.v += k1v.v * a1v.v;
145  w2v.v += k1v.v * a2v.v;
146 
147  _mm_store_pd( ( double* )alpha1, a1v.v );
148  _mm_store_pd( ( double* )(alpha1 + 2), a2v.v );
149 
150  _mm_store_pd( ( double* )omega1, w1v.v );
151  _mm_store_pd( ( double* )(omega1 + 2), w2v.v );
152 
153 
154  upsilon1 += 4;
155  zeta1 += 4;
156  alpha1 += 4;
157  chi1 += 4;
158  omega1 += 4;
159  }
160 
161  rho_c += rhov.d[0] + rhov.d[1];
162 
163  if ( n_left > 0 )
164  {
165  double beta_c = *beta;
166  double gamma_c = *gamma;
167  double kappa_c = *kappa;
168 
169  for ( i = 0; i < n_left; ++i )
170  {
171  double upsilon1_c = *upsilon1;
172  double zeta1_c = *zeta1;
173  double alpha1_c = *alpha1;
174  double chi1_c = *chi1;
175  double omega1_c = *omega1;
176 
177  alpha1_c += beta_c * upsilon1_c + gamma_c * zeta1_c;
178  rho_c += alpha1_c * chi1_c;
179  omega1_c += kappa_c * alpha1_c;
180 
181  *alpha1 = alpha1_c;
182  *omega1 = omega1_c;
183 
184  upsilon1 += inc_u;
185  zeta1 += inc_z;
186  alpha1 += inc_a;
187  chi1 += inc_x;
188  omega1 += inc_w;
189  }
190  }
191 
192  *rho = rho_c;
193 }
double alpha1_c
Definition: bl1_axpyv2b.c:144
int n_left
Definition: bl1_axpyv2bdotaxpy.c:209
double *restrict chi1
Definition: bl1_axpyv2bdotaxpy.c:199
upsilon1
Definition: bl1_axpyv2bdotaxpy.c:225
double beta_c
Definition: bl1_axpyv2bdotaxpy.c:201
double rho_c
Definition: bl1_axpyv2bdotaxpy.c:204
double *restrict alpha1
Definition: bl1_axpyv2bdotaxpy.c:198
double kappa_c
Definition: bl1_axpyv2bdotaxpy.c:203
* rho
Definition: bl1_axpyv2bdotaxpy.c:322
double *restrict zeta1
Definition: bl1_axpyv2bdotaxpy.c:195
int i
Definition: bl1_axpyv2bdotaxpy.c:205
int n_pre
Definition: bl1_axpyv2bdotaxpy.c:207
double gamma_c
Definition: bl1_axpyv2bdotaxpy.c:202
double *restrict omega1
Definition: bl1_axpyv2bdotaxpy.c:200
int n_run
Definition: bl1_axpyv2bdotaxpy.c:208
Definition: blis_type_defs.h:117
double d[2]
Definition: blis_type_defs.h:119
__m128d v
Definition: blis_type_defs.h:118

References alpha1, alpha1_c, beta_c, bl1_abort(), chi1, v2df_t::d, gamma_c, i, kappa_c, n_left, n_pre, n_run, omega1, rho, rho_c, upsilon1, v2df_t::v, and zeta1.

Referenced by FLA_Fused_Gerc2_Ahx_Ax_opd_var1(), and FLA_Fused_Her2_Ax_l_opd_var1().

◆ bl1_saxpyv2bdotaxpy()

void bl1_saxpyv2bdotaxpy ( int  n,
float *  beta,
float *  u,
int  inc_u,
float *  gamma,
float *  z,
int  inc_z,
float *  a,
int  inc_a,
float *  x,
int  inc_x,
float *  kappa,
float *  rho,
float *  w,
int  inc_w 
)
31 {
32  bl1_abort();
33 }

References bl1_abort().

◆ bl1_zaxpyv2bdotaxpy()

void bl1_zaxpyv2bdotaxpy ( int  n,
dcomplex beta,
dcomplex u,
int  inc_u,
dcomplex gamma,
dcomplex z,
int  inc_z,
dcomplex a,
int  inc_a,
dcomplex x,
int  inc_x,
dcomplex kappa,
dcomplex rho,
dcomplex w,
int  inc_w 
)
353 {
354  dcomplex* restrict upsilon1;
355  dcomplex* restrict zeta1;
356  dcomplex* restrict alpha1;
357  dcomplex* restrict chi1;
358  dcomplex* restrict omega1;
359  int i;
360 
361  //v2df_t beta1v, beta1rv;
362  //v2df_t gamma1v, gamma1rv;
363  //v2df_t kappa1v, kappa1rv;
364  v2df_t rho1v;
365  //v2df_t u11v, u12v;
366  //v2df_t z11v, z12v;
367  v2df_t a11v, a12v;
368  v2df_t x1v, x1rv;
369  v2df_t w1v;
370  v2df_t acbc, bdad;
371  v2df_t adac, bcbd;
372 
373  v2df_t a1v, a1rv;
374  v2df_t u1v, u1rv;
375  v2df_t z1v, z1rv;
376  v2df_t beta11v, gamma11v, kappa11v;
377  v2df_t beta12v, gamma12v, kappa12v;
378 
379  upsilon1 = u;
380  zeta1 = z;
381  alpha1 = a;
382  chi1 = x;
383  omega1 = w;
384 
385  if ( inc_u != 1 ||
386  inc_z != 1 ||
387  inc_a != 1 ||
388  inc_x != 1 ||
389  inc_w != 1 ) bl1_abort();
390 
391 
392  beta11v.v = _mm_loaddup_pd( ( double* )&(beta->real) );
393  beta12v.v = _mm_loaddup_pd( ( double* )&(beta->imag) );
394  gamma11v.v = _mm_loaddup_pd( ( double* )&(gamma->real) );
395  gamma12v.v = _mm_loaddup_pd( ( double* )&(gamma->imag) );
396  kappa11v.v = _mm_loaddup_pd( ( double* )&(kappa->real) );
397  kappa12v.v = _mm_loaddup_pd( ( double* )&(kappa->imag) );
398 
399  rho1v.v = _mm_setzero_pd();
400 
401  for ( i = 0; i < n; ++i )
402  {
403  //alpha_c = *alpha1;
404  a1v.v = _mm_load_pd( ( double* )alpha1 );
405 
406  //alpha1_c.real += beta_c.real * upsilon1_c.real - beta_c.imag * upsilon1_c.imag;
407  //alpha1_c.imag += beta_c.real * upsilon1_c.imag + beta_c.imag * upsilon1_c.real;
408  u1v.v = _mm_load_pd( ( double* )upsilon1 );
409  u1rv.v = _mm_shuffle_pd( u1v.v, u1v.v, _MM_SHUFFLE2 (0,1) );
410  acbc.v = beta11v.v * u1v.v;
411  bdad.v = beta12v.v * u1rv.v;
412  a1v.v += _mm_addsub_pd( acbc.v, bdad.v );
413 
414  //alpha1_c.real += gamma_c.real * zeta1_c.real - gamma_c.imag * zeta1_c.imag;
415  //alpha1_c.imag += gamma_c.real * zeta1_c.imag + gamma_c.imag * zeta1_c.real;
416  z1v.v = _mm_load_pd( ( double* )zeta1 );
417  z1rv.v = _mm_shuffle_pd( z1v.v, z1v.v, _MM_SHUFFLE2 (0,1) );
418  acbc.v = gamma11v.v * z1v.v;
419  bdad.v = gamma12v.v * z1rv.v;
420  a1v.v += _mm_addsub_pd( acbc.v, bdad.v );
421 
422  //*alpha1 = alpha1_c;
423  _mm_store_pd( ( double* )alpha1, a1v.v );
424 
425  //rho_c.real += alpha1_c.real * chi1_c.real - -alpha1_c.imag * chi1_c.imag;
426  //rho_c.imag += alpha1_c.real * chi1_c.imag + -alpha1_c.imag * chi1_c.real;
427  x1v.v = _mm_load_pd( ( double* )chi1 );
428  x1rv.v = _mm_shuffle_pd( x1v.v, x1v.v, _MM_SHUFFLE2 (0,1) );
429  a11v.v = a1v.v;
430  a12v.v = _mm_shuffle_pd( a11v.v, a11v.v, _MM_SHUFFLE2 (1,1) );
431  a11v.v = _mm_shuffle_pd( a11v.v, a11v.v, _MM_SHUFFLE2 (0,0) );
432  adac.v = a11v.v * x1rv.v;
433  bcbd.v = a12v.v * x1v.v;
434  rho1v.v = rho1v.v + _mm_addsub_pd( adac.v, bcbd.v );
435 
436  //omega_c = *omega1;
437  w1v.v = _mm_load_pd( ( double* )omega1 );
438 
439  //omega1_c.real += kappa_c.real * alpha1_c.real - kappa_c.imag * alpha1_c.imag;
440  //omega1_c.imag += kappa_c.real * alpha1_c.imag + kappa_c.imag * alpha1_c.real;
441  a1rv.v = _mm_shuffle_pd( a1v.v, a1v.v, _MM_SHUFFLE2 (0,1) );
442  acbc.v = kappa11v.v * a1v.v;
443  bdad.v = kappa12v.v * a1rv.v;
444  w1v.v += _mm_addsub_pd( acbc.v, bdad.v );
445 
446  // *omega1 = omega1_c;
447  _mm_store_pd( ( double* )omega1, w1v.v );
448 
449 
450  upsilon1 += 1;
451  zeta1 += 1;
452  alpha1 += 1;
453  chi1 += 1;
454  omega1 += 1;
455  }
456 
457  rho1v.v = _mm_shuffle_pd( rho1v.v, rho1v.v, _MM_SHUFFLE2 (0,1) );
458 
459  //rho->real = rho_c.real;
460  //rho->imag = rho_c.imag;
461  _mm_store_pd( ( double* )rho, rho1v.v );
462 }
Definition: blis_type_defs.h:138
double real
Definition: blis_type_defs.h:139
double imag
Definition: blis_type_defs.h:139

References alpha1, bl1_abort(), chi1, i, dcomplex::imag, omega1, dcomplex::real, rho, upsilon1, v2df_t::v, and zeta1.

◆ for()

for ( )
260  {
261  double upsilon1_c = *upsilon1;
262  double upsilon2_c = *(upsilon1 + 1);
263  double zeta1_c = *zeta1;
264  double zeta2_c = *(zeta1 + 1);
265  double alpha1_c = *alpha1;
266  double alpha2_c = *(alpha1 + 1);
267  double chi1_c = *chi1;
268  double chi2_c = *(chi1 + 1);
269  double omega1_c = *omega1;
270  double omega2_c = *(omega1 + 1);
271 
272  // alpha1 += beta * upsilon1 + gamma * zeta1;
273  alpha1_c += beta_c * upsilon1_c + gamma_c * zeta1_c;
274  alpha2_c += beta_c * upsilon2_c + gamma_c * zeta2_c;
275 
276  // rho += conj(alpha1) * chi1 +
277  // conj(alpha2) * chi2;
278  rho_c += alpha1_c * chi1_c + alpha2_c * chi2_c;
279 
280  // omega1 += kappa * alpha1;
281  omega1_c += kappa_c * alpha1_c;
282  omega2_c += kappa_c * alpha2_c;
283 
284  *alpha1 = alpha1_c;
285  *(alpha1 + 1) = alpha2_c;
286  *omega1 = omega1_c;
287  *(omega1 + 1) = omega2_c;
288 
289  upsilon1 += 2*inc_u;
290  zeta1 += 2*inc_z;
291  alpha1 += 2*inc_a;
292  chi1 += 2*inc_x;
293  omega1 += 2*inc_w;
294  }
double alpha2_c
Definition: bl1_axpyv2b.c:145

References alpha1, alpha1_c, alpha2_c, beta_c, chi1, gamma_c, kappa_c, omega1, rho_c, upsilon1, and zeta1.

◆ if() [1/2]

if ( n_left  ,
 
)
297  {
298 
299  for ( i = 0; i < n_left; ++i )
300  {
301  double upsilon1_c = *upsilon1;
302  double zeta1_c = *zeta1;
303  double alpha1_c = *alpha1;
304  double chi1_c = *chi1;
305  double omega1_c = *omega1;
306 
307  alpha1_c += beta_c * upsilon1_c + gamma_c * zeta1_c;
308  rho_c += alpha1_c * chi1_c;
309  omega1_c += kappa_c * alpha1_c;
310 
311  *alpha1 = alpha1_c;
312  *omega1 = omega1_c;
313 
314  upsilon1 += inc_u;
315  zeta1 += inc_z;
316  alpha1 += inc_a;
317  chi1 += inc_x;
318  omega1 += inc_w;
319  }
320  }

References alpha1, alpha1_c, beta_c, chi1, gamma_c, i, kappa_c, n_left, omega1, rho_c, upsilon1, and zeta1.

◆ if() [2/2]

if ( n_pre  = = 1)
238  {
239  double upsilon1_c = *upsilon1;
240  double zeta1_c = *zeta1;
241  double alpha1_c = *alpha1;
242  double chi1_c = *chi1;
243  double omega1_c = *omega1;
244 
245  alpha1_c += beta_c * upsilon1_c + gamma_c * zeta1_c;
246  rho_c += alpha1_c * chi1_c;
247  omega1_c += kappa_c * alpha1_c;
248 
249  *alpha1 = alpha1_c;
250  *omega1 = omega1_c;
251 
252  upsilon1 += inc_u;
253  zeta1 += inc_z;
254  alpha1 += inc_a;
255  chi1 += inc_x;
256  omega1 += inc_w;
257  }

References alpha1, alpha1_c, beta_c, chi1, gamma_c, kappa_c, omega1, rho_c, upsilon1, and zeta1.

Variable Documentation

◆ alpha1

dcomplex *restrict alpha1 = a

◆ beta_c

beta_c = *beta

Referenced by bl1_daxpyv2bdotaxpy(), for(), and if().

◆ chi1

chi1 = x

◆ gamma_c

dcomplex gamma_c = *gamma

Referenced by bl1_daxpyv2bdotaxpy(), for(), and if().

◆ i

int i

◆ imag

rho2_c imag = 0.0

◆ kappa_c

dcomplex kappa_c = *kappa

Referenced by bl1_daxpyv2bdotaxpy(), for(), and if().

◆ n_left

n_left = ( n - n_pre ) % 2

Referenced by bl1_daxpyv2bdotaxpy(), and if().

◆ n_pre

n_pre = 0

Referenced by bl1_daxpyv2bdotaxpy().

◆ n_run

n_run = ( n - n_pre ) / 2

Referenced by bl1_daxpyv2bdotaxpy().

◆ omega1

dcomplex *restrict omega1 = w

◆ real

rho real = 0.0

◆ rho

* rho = rho_c

◆ rho_c

dcomplex rho_c = 0.0

Referenced by bl1_daxpyv2bdotaxpy(), for(), and if().

◆ upsilon1

dcomplex *restrict upsilon1 = u

◆ zeta1

zeta1
Initial value:
{
double* restrict upsilon1

Referenced by bl1_daxpyv2bdotaxpy(), bl1_zaxpyv2bdotaxpy(), for(), and if().