libflame  revision_anchor
Functions
blis_prototypes_fused1.h File Reference

(r)

Go to the source code of this file.

Functions

void bl1_saxmyv2 (conj1_t conjx, int n, float *alpha, float *beta, float *x, int inc_x, float *y, int inc_y, float *z, int inc_z)
 
void bl1_daxmyv2 (conj1_t conjx, int n, double *alpha, double *beta, double *x, int inc_x, double *y, int inc_y, double *z, int inc_z)
 
void bl1_caxmyv2 (conj1_t conjx, int n, scomplex *alpha, scomplex *beta, scomplex *x, int inc_x, scomplex *y, int inc_y, scomplex *z, int inc_z)
 
void bl1_zaxmyv2 (conj1_t conjx, int n, dcomplex *alpha, dcomplex *beta, dcomplex *x, int inc_x, dcomplex *y, int inc_y, dcomplex *z, int inc_z)
 
void bl1_saxpyv2b (int n, float *beta1, float *beta2, float *a1, int inc_a1, float *a2, int inc_a2, float *w, int inc_w)
 
void bl1_daxpyv2b (int n, double *beta1, double *beta2, double *a1, int inc_a1, double *a2, int inc_a2, double *w, int inc_w)
 
void bl1_caxpyv2b (int n, scomplex *beta1, scomplex *beta2, scomplex *a1, int inc_a1, scomplex *a2, int inc_a2, scomplex *w, int inc_w)
 
void bl1_zaxpyv2b (int n, dcomplex *beta1, dcomplex *beta2, dcomplex *a1, int inc_a1, dcomplex *a2, int inc_a2, dcomplex *w, int inc_w)
 
void bl1_saxpyv3b (int n, float *beta1, float *beta2, float *beta3, float *a1, int inc_a1, float *a2, int inc_a2, float *a3, int inc_a3, float *w, int inc_w)
 
void bl1_daxpyv3b (int n, double *beta1, double *beta2, double *beta3, double *a1, int inc_a1, double *a2, int inc_a2, double *a3, int inc_a3, double *w, int inc_w)
 
void bl1_caxpyv3b (int n, scomplex *beta1, scomplex *beta2, scomplex *beta3, scomplex *a1, int inc_a1, scomplex *a2, int inc_a2, scomplex *a3, int inc_a3, scomplex *w, int inc_w)
 
void bl1_zaxpyv3b (int n, dcomplex *beta1, dcomplex *beta2, dcomplex *beta3, dcomplex *a1, int inc_a1, dcomplex *a2, int inc_a2, dcomplex *a3, int inc_a3, dcomplex *w, int inc_w)
 
void bl1_saxpyv2bdotaxpy (int n, float *beta, float *u, int inc_u, float *gamma, float *z, int inc_z, float *a, int inc_a, float *x, int inc_x, float *kappa, float *rho, float *w, int inc_w)
 
void bl1_daxpyv2bdotaxpy (int n, double *beta, double *u, int inc_u, double *gamma, double *z, int inc_z, double *a, int inc_a, double *x, int inc_x, double *kappa, double *rho, double *w, int inc_w)
 
void bl1_caxpyv2bdotaxpy (int n, scomplex *beta, scomplex *u, int inc_u, scomplex *gamma, scomplex *z, int inc_z, scomplex *a, int inc_a, scomplex *x, int inc_x, scomplex *kappa, scomplex *rho, scomplex *w, int inc_w)
 
void bl1_zaxpyv2bdotaxpy (int n, dcomplex *beta, dcomplex *u, int inc_u, dcomplex *gamma, dcomplex *z, int inc_z, dcomplex *a, int inc_a, dcomplex *x, int inc_x, dcomplex *kappa, dcomplex *rho, dcomplex *w, int inc_w)
 
void bl1_sdotsv2 (conj1_t conjxy, int n, float *x, int inc_x, float *y, int inc_y, float *z, int inc_z, float *beta, float *rho_xz, float *rho_yz)
 
void bl1_ddotsv2 (conj1_t conjxy, int n, double *x, int inc_x, double *y, int inc_y, double *z, int inc_z, double *beta, double *rho_xz, double *rho_yz)
 
void bl1_cdotsv2 (conj1_t conjxy, int n, scomplex *x, int inc_x, scomplex *y, int inc_y, scomplex *z, int inc_z, scomplex *beta, scomplex *rho_xz, scomplex *rho_yz)
 
void bl1_zdotsv2 (conj1_t conjxy, int n, dcomplex *x, int inc_x, dcomplex *y, int inc_y, dcomplex *z, int inc_z, dcomplex *beta, dcomplex *rho_xz, dcomplex *rho_yz)
 
void bl1_sdotsv3 (conj1_t conjxyw, int n, float *x, int inc_x, float *y, int inc_y, float *w, int inc_w, float *z, int inc_z, float *beta, float *rho_xz, float *rho_yz, float *rho_wz)
 
void bl1_ddotsv3 (conj1_t conjxyw, int n, double *x, int inc_x, double *y, int inc_y, double *w, int inc_w, double *z, int inc_z, double *beta, double *rho_xz, double *rho_yz, double *rho_wz)
 
void bl1_cdotsv3 (conj1_t conjxyw, int n, scomplex *x, int inc_x, scomplex *y, int inc_y, scomplex *w, int inc_w, scomplex *z, int inc_z, scomplex *beta, scomplex *rho_xz, scomplex *rho_yz, scomplex *rho_wz)
 
void bl1_zdotsv3 (conj1_t conjxyw, int n, dcomplex *x, int inc_x, dcomplex *y, int inc_y, dcomplex *w, int inc_w, dcomplex *z, int inc_z, dcomplex *beta, dcomplex *rho_xz, dcomplex *rho_yz, dcomplex *rho_wz)
 
void bl1_sdotaxpy (int n, float *a, int inc_a, float *x, int inc_x, float *kappa, float *rho, float *w, int inc_w)
 
void bl1_ddotaxpy (int n, double *a, int inc_a, double *x, int inc_x, double *kappa, double *rho, double *w, int inc_w)
 
void bl1_cdotaxpy (int n, scomplex *a, int inc_a, scomplex *x, int inc_x, scomplex *kappa, scomplex *rho, scomplex *w, int inc_w)
 
void bl1_zdotaxpy (int n, dcomplex *a, int inc_a, dcomplex *x, int inc_x, dcomplex *kappa, dcomplex *rho, dcomplex *w, int inc_w)
 
void bl1_sdotaxmyv2 (int n, float *alpha, float *beta, float *x, int inc_x, float *u, int inc_u, float *rho, float *y, int inc_y, float *z, int inc_z)
 
void bl1_ddotaxmyv2 (int n, double *alpha, double *beta, double *x, int inc_x, double *u, int inc_u, double *rho, double *y, int inc_y, double *z, int inc_z)
 
void bl1_cdotaxmyv2 (int n, scomplex *alpha, scomplex *beta, scomplex *x, int inc_x, scomplex *u, int inc_u, scomplex *rho, scomplex *y, int inc_y, scomplex *z, int inc_z)
 
void bl1_zdotaxmyv2 (int n, dcomplex *alpha, dcomplex *beta, dcomplex *x, int inc_x, dcomplex *u, int inc_u, dcomplex *rho, dcomplex *y, int inc_y, dcomplex *z, int inc_z)
 
void bl1_sdotv2axpyv2b (int n, float *a1, int inc_a1, float *a2, int inc_a2, float *x, int inc_x, float *kappa1, float *kappa2, float *rho1, float *rho2, float *w, int inc_w)
 
void bl1_ddotv2axpyv2b (int n, double *a1, int inc_a1, double *a2, int inc_a2, double *x, int inc_x, double *kappa1, double *kappa2, double *rho1, double *rho2, double *w, int inc_w)
 
void bl1_cdotv2axpyv2b (int n, scomplex *a1, int inc_a1, scomplex *a2, int inc_a2, scomplex *x, int inc_x, scomplex *kappa1, scomplex *kappa2, scomplex *rho1, scomplex *rho2, scomplex *w, int inc_w)
 
void bl1_zdotv2axpyv2b (int n, dcomplex *a1, int inc_a1, dcomplex *a2, int inc_a2, dcomplex *x, int inc_x, dcomplex *kappa1, dcomplex *kappa2, dcomplex *rho1, dcomplex *rho2, dcomplex *w, int inc_w)
 
void bl1_zaxpyv2bdots (int n, dcomplex *alpha1, dcomplex *alpha2, dcomplex *x1, int inc_x1, dcomplex *x2, int inc_x2, dcomplex *y, int inc_y, dcomplex *u, int inc_u, dcomplex *beta, dcomplex *rho)
 

Function Documentation

◆ bl1_caxmyv2()

void bl1_caxmyv2 ( conj1_t  conjx,
int  n,
scomplex *  alpha,
scomplex *  beta,
scomplex *  x,
int  inc_x,
scomplex *  y,
int  inc_y,
scomplex *  z,
int  inc_z 
)
245 {
246  bl1_abort();
247 }
void bl1_abort(void)
Definition: bl1_abort.c:13

References bl1_abort().

◆ bl1_caxpyv2b()

void bl1_caxpyv2b ( int  n,
scomplex *  beta1,
scomplex *  beta2,
scomplex *  a1,
int  inc_a1,
scomplex *  a2,
int  inc_a2,
scomplex *  w,
int  inc_w 
)
205 {
206  bl1_abort();
207 }

References bl1_abort().

◆ bl1_caxpyv2bdotaxpy()

void bl1_caxpyv2bdotaxpy ( int  n,
scomplex *  beta,
scomplex *  u,
int  inc_u,
scomplex *  gamma,
scomplex *  z,
int  inc_z,
scomplex *  a,
int  inc_a,
scomplex *  x,
int  inc_x,
scomplex *  kappa,
scomplex *  rho,
scomplex *  w,
int  inc_w 
)
337 {
338  bl1_abort();
339 }

References bl1_abort().

◆ bl1_caxpyv3b()

void bl1_caxpyv3b ( int  n,
scomplex *  beta1,
scomplex *  beta2,
scomplex *  beta3,
scomplex *  a1,
int  inc_a1,
scomplex *  a2,
int  inc_a2,
scomplex *  a3,
int  inc_a3,
scomplex *  w,
int  inc_w 
)
219 {
220  bl1_abort();
221 }

References bl1_abort().

◆ bl1_cdotaxmyv2()

void bl1_cdotaxmyv2 ( int  n,
scomplex *  alpha,
scomplex *  beta,
scomplex *  x,
int  inc_x,
scomplex *  u,
int  inc_u,
scomplex *  rho,
scomplex *  y,
int  inc_y,
scomplex *  z,
int  inc_z 
)
271 {
272  bl1_abort();
273 }

References bl1_abort().

◆ bl1_cdotaxpy()

void bl1_cdotaxpy ( int  n,
scomplex *  a,
int  inc_a,
scomplex *  x,
int  inc_x,
scomplex *  kappa,
scomplex *  rho,
scomplex *  w,
int  inc_w 
)
253 {
254  bl1_abort();
255 }

References bl1_abort().

◆ bl1_cdotsv2()

void bl1_cdotsv2 ( conj1_t  conjxy,
int  n,
scomplex *  x,
int  inc_x,
scomplex *  y,
int  inc_y,
scomplex *  z,
int  inc_z,
scomplex *  beta,
scomplex *  rho_xz,
scomplex *  rho_yz 
)
243 {
244  bl1_abort();
245 }

References bl1_abort().

◆ bl1_cdotsv3()

void bl1_cdotsv3 ( conj1_t  conjxyw,
int  n,
scomplex *  x,
int  inc_x,
scomplex *  y,
int  inc_y,
scomplex *  w,
int  inc_w,
scomplex *  z,
int  inc_z,
scomplex *  beta,
scomplex *  rho_xz,
scomplex *  rho_yz,
scomplex *  rho_wz 
)
285 {
286  bl1_abort();
287 }

References bl1_abort().

◆ bl1_cdotv2axpyv2b()

void bl1_cdotv2axpyv2b ( int  n,
scomplex *  a1,
int  inc_a1,
scomplex *  a2,
int  inc_a2,
scomplex *  x,
int  inc_x,
scomplex *  kappa1,
scomplex *  kappa2,
scomplex *  rho1,
scomplex *  rho2,
scomplex *  w,
int  inc_w 
)
326 {
327  bl1_abort();
328 }

References bl1_abort().

◆ bl1_daxmyv2()

void bl1_daxmyv2 ( conj1_t  conjx,
int  n,
double *  alpha,
double *  beta,
double *  x,
int  inc_x,
double *  y,
int  inc_y,
double *  z,
int  inc_z 
)
42 {
43  double* restrict chi1;
44  double* restrict psi1;
45  double* restrict zeta1;
46  int i;
47 
48  int n_pre;
49  int n_run;
50  int n_left;
51 
52  v2df_t a1v, b1v;
53  v2df_t x1v, y1v, z1v;
54  v2df_t x2v, y2v, z2v;
55 
56  if ( inc_x != 1 ||
57  inc_y != 1 ||
58  inc_z != 1 ) bl1_abort();
59 
60  n_pre = 0;
61  if ( ( unsigned long ) z % 16 != 0 )
62  {
63  if ( ( unsigned long ) x % 16 == 0 ||
64  ( unsigned long ) y % 16 == 0 ) bl1_abort();
65 
66  n_pre = 1;
67  }
68 
69  n_run = ( n - n_pre ) / 4;
70  n_left = ( n - n_pre ) % 4;
71 
72  chi1 = x;
73  psi1 = y;
74  zeta1 = z;
75 
76  if ( n_pre == 1 )
77  {
78  double alpha_c = *alpha;
79  double beta_c = *beta;
80  double chi1_c = *chi1;
81 
82  *psi1 -= alpha_c * chi1_c;
83  *zeta1 -= beta_c * chi1_c;
84 
85  chi1 += inc_x;
86  psi1 += inc_y;
87  zeta1 += inc_z;
88  }
89 
90  a1v.v = _mm_loaddup_pd( ( double* )alpha );
91  b1v.v = _mm_loaddup_pd( ( double* )beta );
92 
93  for ( i = 0; i < n_run; ++i )
94  {
95  x1v.v = _mm_load_pd( ( double* )chi1 );
96  y1v.v = _mm_load_pd( ( double* )psi1 );
97  z1v.v = _mm_load_pd( ( double* )zeta1 );
98 
99  x2v.v = _mm_load_pd( ( double* )(chi1 + 2) );
100  y2v.v = _mm_load_pd( ( double* )(psi1 + 2) );
101  z2v.v = _mm_load_pd( ( double* )(zeta1 + 2) );
102 
103  y1v.v = y1v.v - a1v.v * x1v.v;
104  z1v.v = z1v.v - b1v.v * x1v.v;
105 
106  _mm_store_pd( ( double* )psi1, y1v.v );
107  _mm_store_pd( ( double* )zeta1, z1v.v );
108 
109  y2v.v = y2v.v - a1v.v * x2v.v;
110  z2v.v = z2v.v - b1v.v * x2v.v;
111 
112  _mm_store_pd( ( double* )(psi1 + 2), y2v.v );
113  _mm_store_pd( ( double* )(zeta1 + 2), z2v.v );
114 
115  chi1 += 4;
116  psi1 += 4;
117  zeta1 += 4;
118  }
119 
120  if ( n_left > 0 )
121  {
122  double alpha_c = *alpha;
123  double beta_c = *beta;
124 
125  for( i = 0; i < n_left; ++i )
126  {
127  double chi1_c = *chi1;
128 
129  *psi1 -= alpha_c * chi1_c;
130  *zeta1 -= beta_c * chi1_c;
131 
132  chi1 += inc_x;
133  psi1 += inc_y;
134  zeta1 += inc_z;
135  }
136  }
137 }
double *restrict zeta1
Definition: bl1_axmyv2.c:142
double *restrict psi1
Definition: bl1_axmyv2.c:139
double beta_c
Definition: bl1_axmyv2.c:144
double alpha_c
Definition: bl1_axmyv2.c:143
int n_left
Definition: bl1_axmyv2.c:149
int n_pre
Definition: bl1_axmyv2.c:147
int n_run
Definition: bl1_axmyv2.c:148
int i
Definition: bl1_axmyv2.c:145
chi1
Definition: bl1_axmyv2.c:366
v2df_t
Definition: blis_type_defs.h:117
__m128d v
Definition: blis_type_defs.h:118

References alpha_c, beta_c, bl1_abort(), chi1, i, n_left, n_pre, n_run, psi1, v2df_t::v, and zeta1.

Referenced by FLA_Fused_UYx_ZVx_opd_var1().

◆ bl1_daxpyv2b()

void bl1_daxpyv2b ( int  n,
double *  beta1,
double *  beta2,
double *  a1,
int  inc_a1,
double *  a2,
int  inc_a2,
double *  w,
int  inc_w 
)
38 {
39  double* restrict chi1;
40  double* restrict chi2;
41  double* restrict psi1;
42  int i;
43 
44  int n_pre;
45  int n_run;
46  int n_left;
47 
48  v2df_t a1v, a2v;
49  v2df_t x11v, x12v;
50  v2df_t x21v, x22v;
51  v2df_t y1v;
52  v2df_t y2v;
53 
54  if ( inc_x1 != 1 ||
55  inc_x2 != 1 ||
56  inc_y != 1 ) bl1_abort();
57 
58  n_pre = 0;
59  if ( ( unsigned long ) y % 16 != 0 )
60  {
61  if ( ( unsigned long ) x1 % 16 == 0 ||
62  ( unsigned long ) x2 % 16 == 0 ) bl1_abort();
63 
64  n_pre = 1;
65  }
66 
67  n_run = ( n - n_pre ) / 4;
68  n_left = ( n - n_pre ) % 4;
69 
70  chi1 = x1;
71  chi2 = x2;
72  psi1 = y;
73 
74  if ( n_pre == 1 )
75  {
76  double alpha1_c = *alpha1;
77  double alpha2_c = *alpha2;
78  double chi11_c = *chi1;
79  double chi12_c = *chi2;
80  double temp1;
81 
82  // psi1 = psi1 + alpha1 * chi11 + alpha2 * chi12;
83  temp1 = alpha1_c * chi11_c + alpha2_c * chi12_c;
84  *psi1 = *psi1 + temp1;
85 
86  chi1 += inc_x1;
87  chi2 += inc_x2;
88  psi1 += inc_y;
89  }
90 
91  a1v.v = _mm_loaddup_pd( ( double* )alpha1 );
92  a2v.v = _mm_loaddup_pd( ( double* )alpha2 );
93 
94  for ( i = 0; i < n_run; ++i )
95  {
96  x11v.v = _mm_load_pd( ( double* )chi1 );
97  x12v.v = _mm_load_pd( ( double* )chi2 );
98  y1v.v = _mm_load_pd( ( double* )psi1 );
99 
100  x21v.v = _mm_load_pd( ( double* )(chi1 + 2) );
101  x22v.v = _mm_load_pd( ( double* )(chi2 + 2) );
102  y2v.v = _mm_load_pd( ( double* )(psi1 + 2) );
103 
104  y1v.v += a1v.v * x11v.v + a2v.v * x12v.v;
105  y2v.v += a1v.v * x21v.v + a2v.v * x22v.v;
106 
107  _mm_store_pd( ( double* )psi1, y1v.v );
108  _mm_store_pd( ( double* )(psi1 + 2), y2v.v );
109 
110  //chi1 += step_x1;
111  //chi2 += step_x2;
112  //psi1 += step_y;
113  chi1 += 4;
114  chi2 += 4;
115  psi1 += 4;
116  }
117 
118  if ( n_left > 0 )
119  {
120  double alpha1_c = *alpha1;
121  double alpha2_c = *alpha2;
122 
123  for ( i = 0; i < n_left; ++i )
124  {
125  double chi11_c = *chi1;
126  double chi12_c = *chi2;
127  double psi1_c = *psi1;
128  double temp1;
129 
130  temp1 = alpha1_c * chi11_c + alpha2_c * chi12_c;
131  *psi1 = psi1_c + temp1;
132 
133  chi1 += inc_x1;
134  chi2 += inc_x2;
135  psi1 += inc_y;
136  }
137  }
138 }
chi1
Definition: bl1_axpyv2b.c:156
int n_left
Definition: bl1_axpyv2b.c:151
int n_run
Definition: bl1_axpyv2b.c:150
double *restrict psi1
Definition: bl1_axpyv2b.c:143
int i
Definition: bl1_axpyv2b.c:148
double temp1
Definition: bl1_axpyv2b.c:146
double *restrict chi2
Definition: bl1_axpyv2b.c:140
double alpha1_c
Definition: bl1_axpyv2b.c:144
double alpha2_c
Definition: bl1_axpyv2b.c:145
double *restrict alpha1
Definition: bl1_axpyv2bdotaxpy.c:198
x1
Definition: bl1_dotsv2.c:374
double *restrict alpha2
Definition: bl1_dotv2axpyv2b.c:186

References alpha1, alpha1_c, alpha2, alpha2_c, bl1_abort(), chi1, chi2, i, n_left, n_pre, n_run, psi1, temp1, v2df_t::v, and x1.

Referenced by FLA_Fused_Ahx_Axpy_Ax_opd_var1(), FLA_Fused_Gerc2_Ahx_Axpy_Ax_opd_var1(), FLA_Fused_Gerc2_opd_var1(), and FLA_Fused_UZhu_ZUhu_opd_var1().

◆ bl1_daxpyv2bdotaxpy()

void bl1_daxpyv2bdotaxpy ( int  n,
double *  beta,
double *  u,
int  inc_u,
double *  gamma,
double *  z,
int  inc_z,
double *  a,
int  inc_a,
double *  x,
int  inc_x,
double *  kappa,
double *  rho,
double *  w,
int  inc_w 
)
47 {
48  double* restrict upsilon1;
49  double* restrict zeta1;
50  double* restrict alpha1;
51  double* restrict chi1;
52  double* restrict omega1;
53  double rho_c;
54  int i;
55  v2df_t b1v, g1v, k1v;
56  v2df_t rhov;
57  v2df_t u1v, z1v, a1v;
58  v2df_t u2v, z2v, a2v;
59  v2df_t x1v, w1v;
60  v2df_t x2v, w2v;
61 
62  int n_pre;
63  int n_run;
64  int n_left;
65 
66  n_pre = 0;
67  if ( ( unsigned long ) a % 16 != 0 )
68  {
69  if ( ( unsigned long ) u % 16 == 0 ||
70  ( unsigned long ) z % 16 == 0 ||
71  ( unsigned long ) x % 16 == 0 ||
72  ( unsigned long ) w % 16 == 0 ) bl1_abort();
73 
74  n_pre = 1;
75  }
76 
77  n_run = ( n - n_pre ) / 4;
78  n_left = ( n - n_pre ) % 4;
79 
80  upsilon1 = u;
81  zeta1 = z;
82  alpha1 = a;
83  chi1 = x;
84  omega1 = w;
85 
86 
87  rho_c = 0.0;
88 
89  if ( n_pre == 1 )
90  {
91  double beta_c = *beta;
92  double gamma_c = *gamma;
93  double kappa_c = *kappa;
94 
95  double upsilon1_c = *upsilon1;
96  double zeta1_c = *zeta1;
97  double alpha1_c = *alpha1;
98  double chi1_c = *chi1;
99  double omega1_c = *omega1;
100 
101  alpha1_c += beta_c * upsilon1_c + gamma_c * zeta1_c;
102  rho_c += alpha1_c * chi1_c;
103  omega1_c += kappa_c * alpha1_c;
104 
105  *alpha1 = alpha1_c;
106  *omega1 = omega1_c;
107 
108  upsilon1 += inc_u;
109  zeta1 += inc_z;
110  alpha1 += inc_a;
111  chi1 += inc_x;
112  omega1 += inc_w;
113  }
114 
115  b1v.v = _mm_loaddup_pd( ( double* )beta );
116  g1v.v = _mm_loaddup_pd( ( double* )gamma );
117  k1v.v = _mm_loaddup_pd( ( double* )kappa );
118 
119  rhov.v = _mm_setzero_pd();
120 
121  for ( i = 0; i < n_run; ++i )
122  {
123  u1v.v = _mm_load_pd( ( double* )upsilon1 );
124  z1v.v = _mm_load_pd( ( double* )zeta1 );
125  a1v.v = _mm_load_pd( ( double* )alpha1 );
126 
127  a1v.v += b1v.v * u1v.v + g1v.v * z1v.v;
128 
129  u2v.v = _mm_load_pd( ( double* )(upsilon1 + 2) );
130  z2v.v = _mm_load_pd( ( double* )(zeta1 + 2) );
131  a2v.v = _mm_load_pd( ( double* )(alpha1 + 2) );
132 
133  a2v.v += b1v.v * u2v.v + g1v.v * z2v.v;
134 
135  x1v.v = _mm_load_pd( ( double* )chi1 );
136  x2v.v = _mm_load_pd( ( double* )(chi1 + 2) );
137 
138  w1v.v = _mm_load_pd( ( double* )omega1 );
139  w2v.v = _mm_load_pd( ( double* )(omega1 + 2) );
140 
141  rhov.v += a1v.v * x1v.v;
142  rhov.v += a2v.v * x2v.v;
143 
144  w1v.v += k1v.v * a1v.v;
145  w2v.v += k1v.v * a2v.v;
146 
147  _mm_store_pd( ( double* )alpha1, a1v.v );
148  _mm_store_pd( ( double* )(alpha1 + 2), a2v.v );
149 
150  _mm_store_pd( ( double* )omega1, w1v.v );
151  _mm_store_pd( ( double* )(omega1 + 2), w2v.v );
152 
153 
154  upsilon1 += 4;
155  zeta1 += 4;
156  alpha1 += 4;
157  chi1 += 4;
158  omega1 += 4;
159  }
160 
161  rho_c += rhov.d[0] + rhov.d[1];
162 
163  if ( n_left > 0 )
164  {
165  double beta_c = *beta;
166  double gamma_c = *gamma;
167  double kappa_c = *kappa;
168 
169  for ( i = 0; i < n_left; ++i )
170  {
171  double upsilon1_c = *upsilon1;
172  double zeta1_c = *zeta1;
173  double alpha1_c = *alpha1;
174  double chi1_c = *chi1;
175  double omega1_c = *omega1;
176 
177  alpha1_c += beta_c * upsilon1_c + gamma_c * zeta1_c;
178  rho_c += alpha1_c * chi1_c;
179  omega1_c += kappa_c * alpha1_c;
180 
181  *alpha1 = alpha1_c;
182  *omega1 = omega1_c;
183 
184  upsilon1 += inc_u;
185  zeta1 += inc_z;
186  alpha1 += inc_a;
187  chi1 += inc_x;
188  omega1 += inc_w;
189  }
190  }
191 
192  *rho = rho_c;
193 }
int n_left
Definition: bl1_axpyv2bdotaxpy.c:209
double *restrict chi1
Definition: bl1_axpyv2bdotaxpy.c:199
upsilon1
Definition: bl1_axpyv2bdotaxpy.c:225
double beta_c
Definition: bl1_axpyv2bdotaxpy.c:201
double rho_c
Definition: bl1_axpyv2bdotaxpy.c:204
double kappa_c
Definition: bl1_axpyv2bdotaxpy.c:203
* rho
Definition: bl1_axpyv2bdotaxpy.c:322
double *restrict zeta1
Definition: bl1_axpyv2bdotaxpy.c:195
int i
Definition: bl1_axpyv2bdotaxpy.c:205
int n_pre
Definition: bl1_axpyv2bdotaxpy.c:207
double gamma_c
Definition: bl1_axpyv2bdotaxpy.c:202
double *restrict omega1
Definition: bl1_axpyv2bdotaxpy.c:200
int n_run
Definition: bl1_axpyv2bdotaxpy.c:208
double d[2]
Definition: blis_type_defs.h:119

References alpha1, alpha1_c, beta_c, bl1_abort(), chi1, v2df_t::d, gamma_c, i, kappa_c, n_left, n_pre, n_run, omega1, rho, rho_c, upsilon1, v2df_t::v, and zeta1.

Referenced by FLA_Fused_Gerc2_Ahx_Ax_opd_var1(), and FLA_Fused_Her2_Ax_l_opd_var1().

◆ bl1_daxpyv3b()

void bl1_daxpyv3b ( int  n,
double *  beta1,
double *  beta2,
double *  beta3,
double *  a1,
int  inc_a1,
double *  a2,
int  inc_a2,
double *  a3,
int  inc_a3,
double *  w,
int  inc_w 
)
43 {
44  double* restrict chi1;
45  double* restrict chi2;
46  double* restrict chi3;
47  double* restrict psi1;
48  int i;
49 
50  int n_pre;
51  int n_run;
52  int n_left;
53 
54  v2df_t a1v, a2v, a3v;
55  v2df_t x11v, x12v, x13v;
56  v2df_t x21v, x22v, x23v;
57  v2df_t y1v;
58  v2df_t y2v;
59 
60  if ( inc_x1 != 1 ||
61  inc_x2 != 1 ||
62  inc_x3 != 1 ||
63  inc_y != 1 ) bl1_abort();
64 
65  n_pre = 0;
66  if ( ( unsigned long ) y % 16 != 0 )
67  {
68  if ( ( unsigned long ) x1 % 16 == 0 ||
69  ( unsigned long ) x2 % 16 == 0 ||
70  ( unsigned long ) x3 % 16 == 0 ) bl1_abort();
71 
72  n_pre = 1;
73  }
74 
75  n_run = ( n - n_pre ) / 4;
76  n_left = ( n - n_pre ) % 4;
77 
78  chi1 = x1;
79  chi2 = x2;
80  chi3 = x3;
81  psi1 = y;
82 
83  if ( n_pre == 1 )
84  {
85  double alpha1_c = *alpha1;
86  double alpha2_c = *alpha2;
87  double alpha3_c = *alpha3;
88  double chi11_c = *chi1;
89  double chi12_c = *chi2;
90  double chi13_c = *chi3;
91 
92  *psi1 += alpha1_c * chi11_c + alpha2_c * chi12_c + alpha3_c * chi13_c;
93 
94  chi1 += inc_x1;
95  chi2 += inc_x2;
96  chi3 += inc_x3;
97  psi1 += inc_y;
98  }
99 
100  a1v.v = _mm_loaddup_pd( ( double* )alpha1 );
101  a2v.v = _mm_loaddup_pd( ( double* )alpha2 );
102  a3v.v = _mm_loaddup_pd( ( double* )alpha3 );
103 
104  for ( i = 0; i < n_run; ++i )
105  {
106  x11v.v = _mm_load_pd( ( double* )chi1 );
107  x12v.v = _mm_load_pd( ( double* )chi2 );
108  x13v.v = _mm_load_pd( ( double* )chi3 );
109  y1v.v = _mm_load_pd( ( double* )psi1 );
110 
111  y1v.v += a1v.v * x11v.v + a2v.v * x12v.v + a3v.v * x13v.v;
112 
113  _mm_store_pd( ( double* )psi1, y1v.v );
114 
115  x21v.v = _mm_load_pd( ( double* )(chi1 + 2) );
116  x22v.v = _mm_load_pd( ( double* )(chi2 + 2) );
117  x23v.v = _mm_load_pd( ( double* )(chi3 + 2) );
118  y2v.v = _mm_load_pd( ( double* )(psi1 + 2) );
119 
120  y2v.v += a1v.v * x21v.v + a2v.v * x22v.v + a3v.v * x23v.v;
121 
122  _mm_store_pd( ( double* )(psi1 + 2), y2v.v );
123 
124  chi1 += 4;
125  chi2 += 4;
126  chi3 += 4;
127  psi1 += 4;
128  }
129 
130  if ( n_left > 0 )
131  {
132  double alpha1_c = *alpha1;
133  double alpha2_c = *alpha2;
134  double alpha3_c = *alpha3;
135 
136  for ( i = 0; i < n_left; ++i )
137  {
138  double chi11_c = *chi1;
139  double chi12_c = *chi2;
140  double chi13_c = *chi3;
141 
142  *psi1 += alpha1_c * chi11_c + alpha2_c * chi12_c + alpha3_c * chi13_c;
143 
144  chi1 += inc_x1;
145  chi2 += inc_x2;
146  chi3 += inc_x3;
147  psi1 += inc_y;
148  }
149  }
150 }
chi1
Definition: bl1_axpyv3b.c:168
int n_left
Definition: bl1_axpyv3b.c:163
double *restrict chi2
Definition: bl1_axpyv3b.c:152
double alpha1_c
Definition: bl1_axpyv3b.c:156
int n_run
Definition: bl1_axpyv3b.c:162
double *restrict psi1
Definition: bl1_axpyv3b.c:155
int i
Definition: bl1_axpyv3b.c:160
double alpha2_c
Definition: bl1_axpyv3b.c:157

References alpha1, alpha1_c, alpha2, alpha2_c, bl1_abort(), chi1, chi2, i, n_left, n_pre, n_run, psi1, v2df_t::v, and x1.

◆ bl1_ddotaxmyv2()

void bl1_ddotaxmyv2 ( int  n,
double *  alpha,
double *  beta,
double *  x,
int  inc_x,
double *  u,
int  inc_u,
double *  rho,
double *  y,
int  inc_y,
double *  z,
int  inc_z 
)
43 {
44  double* restrict chi1;
45  double* restrict upsilon1;
46  double* restrict psi1;
47  double* restrict zeta1;
48  double rho_c;
49  int i;
50 
51  int n_pre;
52  int n_run;
53  int n_left;
54 
55  v2df_t a1v, b1v;
56  v2df_t rho1v;
57  v2df_t x1v, u1v, y1v, z1v;
58 
59  if ( inc_x != 1 ||
60  inc_u != 1 ||
61  inc_y != 1 ||
62  inc_z != 1 ) bl1_abort();
63 
64  n_pre = 0;
65  if ( ( unsigned long ) z % 16 != 0 )
66  {
67  if ( ( unsigned long ) x % 16 == 0 ||
68  ( unsigned long ) u % 16 == 0 ||
69  ( unsigned long ) y % 16 == 0 ) bl1_abort();
70 
71  n_pre = 1;
72  }
73 
74  n_run = ( n - n_pre ) / 2;
75  n_left = ( n - n_pre ) % 2;
76 
77  chi1 = x;
78  upsilon1 = u;
79  psi1 = y;
80  zeta1 = z;
81 
82  rho_c = 0.0;
83 
84  if ( n_pre == 1 )
85  {
86  double alpha_c = *alpha;
87  double beta_c = *beta;
88  double chi1_c = *chi1;
89  double upsilon_c = *upsilon1;
90 
91  rho_c += chi1_c * upsilon_c;
92  *psi1 -= alpha_c * chi1_c;
93  *zeta1 -= beta_c * chi1_c;
94 
95  chi1 += inc_x;
96  upsilon1 += inc_u;
97  psi1 += inc_y;
98  zeta1 += inc_z;
99  }
100 
101  a1v.v = _mm_loaddup_pd( ( double* )alpha );
102  b1v.v = _mm_loaddup_pd( ( double* )beta );
103 
104  rho1v.v = _mm_setzero_pd();
105 
106  for ( i = 0; i < n_run; ++i )
107  {
108  x1v.v = _mm_load_pd( ( double* )chi1 );
109  u1v.v = _mm_load_pd( ( double* )upsilon1 );
110  y1v.v = _mm_load_pd( ( double* )psi1 );
111  z1v.v = _mm_load_pd( ( double* )zeta1 );
112 
113  rho1v.v += x1v.v * u1v.v;
114  y1v.v -= a1v.v * x1v.v;
115  z1v.v -= b1v.v * x1v.v;
116 
117  _mm_store_pd( ( double* )psi1, y1v.v );
118  _mm_store_pd( ( double* )zeta1, z1v.v );
119 
120  chi1 += 2;
121  upsilon1 += 2;
122  psi1 += 2;
123  zeta1 += 2;
124  }
125 
126  rho_c += rho1v.d[0] + rho1v.d[1];
127 
128  if ( n_left > 0 )
129  {
130  double alpha_c = *alpha;
131  double beta_c = *beta;
132 
133  for( i = 0; i < n_left; ++i )
134  {
135  double chi1_c = *chi1;
136  double upsilon_c = *upsilon1;
137 
138  rho_c += chi1_c * upsilon_c;
139  *psi1 -= alpha_c * chi1_c;
140  *zeta1 -= beta_c * chi1_c;
141 
142  chi1 += inc_x;
143  upsilon1 += inc_u;
144  psi1 += inc_y;
145  zeta1 += inc_z;
146  }
147  }
148 
149  *rho = rho_c;
150 }
double beta_c
Definition: bl1_dotaxmyv2.c:158
double alpha_c
Definition: bl1_dotaxmyv2.c:157
int n_left
Definition: bl1_dotaxmyv2.c:164
double *restrict upsilon1
Definition: bl1_dotaxmyv2.c:152
int n_pre
Definition: bl1_dotaxmyv2.c:162
double rho_c
Definition: bl1_dotaxmyv2.c:159
double *restrict psi1
Definition: bl1_dotaxmyv2.c:155
int n_run
Definition: bl1_dotaxmyv2.c:163
* rho
Definition: bl1_dotaxmyv2.c:258
int i
Definition: bl1_dotaxmyv2.c:160
double *restrict zeta1
Definition: bl1_dotaxmyv2.c:156

References alpha_c, beta_c, bl1_abort(), chi1, v2df_t::d, i, n_left, n_pre, n_run, psi1, rho, rho_c, upsilon1, v2df_t::v, and zeta1.

Referenced by FLA_Fused_Uhu_Yhu_Zhu_opd_var1().

◆ bl1_ddotaxpy()

void bl1_ddotaxpy ( int  n,
double *  a,
int  inc_a,
double *  x,
int  inc_x,
double *  kappa,
double *  rho,
double *  w,
int  inc_w 
)
38 {
39  double* restrict alpha1;
40  double* restrict chi1;
41  double* restrict omega1;
42  double rho_c;
43  int i;
44 
45  int n_pre;
46  int n_run;
47  int n_left;
48 
49  v2df_t k1v, rho1v;
50  v2df_t a1v, x1v, w1v;
51  v2df_t a2v, x2v, w2v;
52 
53  if ( inc_a != 1 ||
54  inc_x != 1 ||
55  inc_w != 1 ) bl1_abort();
56 
57  n_pre = 0;
58  if ( ( unsigned long ) a % 16 != 0 )
59  {
60  if ( ( unsigned long ) x % 16 == 0 ||
61  ( unsigned long ) w % 16 == 0 ) bl1_abort();
62 
63  n_pre = 1;
64  }
65 
66  n_run = ( n - n_pre ) / 4;
67  n_left = ( n - n_pre ) % 4;
68 
69  alpha1 = a;
70  chi1 = x;
71  omega1 = w;
72 
73  rho_c = 0.0;
74 
75  if ( n_pre == 1 )
76  {
77  double kappa_c = *kappa;
78  double alpha1_c = *alpha1;
79  double chi1_c = *chi1;
80  double omega1_c = *omega1;
81 
82  rho_c += alpha1_c * chi1_c;
83  omega1_c += kappa_c * alpha1_c;
84 
85  *omega1 = omega1_c;
86 
87  alpha1 += inc_a;
88  chi1 += inc_x;
89  omega1 += inc_w;
90  }
91 
92  rho1v.v = _mm_setzero_pd();
93 
94  k1v.v = _mm_loaddup_pd( ( double* )kappa );
95 
96  for ( i = 0; i < n_run; ++i )
97  {
98  a1v.v = _mm_load_pd( ( double* )alpha1 );
99  x1v.v = _mm_load_pd( ( double* )chi1 );
100  w1v.v = _mm_load_pd( ( double* )omega1 );
101 
102  a2v.v = _mm_load_pd( ( double* )(alpha1 + 2) );
103  x2v.v = _mm_load_pd( ( double* )(chi1 + 2) );
104  w2v.v = _mm_load_pd( ( double* )(omega1 + 2) );
105 
106  rho1v.v += a1v.v * x1v.v;
107  w1v.v += k1v.v * a1v.v;
108 
109  _mm_store_pd( ( double* )omega1, w1v.v );
110 
111  rho1v.v += a2v.v * x2v.v;
112  w2v.v += k1v.v * a2v.v;
113 
114  _mm_store_pd( ( double* )(omega1 + 2), w2v.v );
115 
116  alpha1 += 4;
117  chi1 += 4;
118  omega1 += 4;
119  }
120 
121  if ( n_left > 0 )
122  {
123  for ( i = 0; i < n_left; ++i )
124  {
125  double kappa_c = *kappa;
126  double alpha1_c = *alpha1;
127  double chi1_c = *chi1;
128  double omega1_c = *omega1;
129 
130  rho_c += alpha1_c * chi1_c;
131  omega1_c += kappa_c * alpha1_c;
132 
133  *omega1 = omega1_c;
134 
135  alpha1 += inc_a;
136  chi1 += inc_x;
137  omega1 += inc_w;
138  }
139  }
140 
141  rho_c += rho1v.d[0] + rho1v.d[1];
142 
143  *rho = rho_c;
144 }
double *restrict omega1
Definition: bl1_dotaxpy.c:149
double *restrict chi1
Definition: bl1_dotaxpy.c:146
alpha1
Definition: bl1_dotaxpy.c:338
int n_left
Definition: bl1_dotaxpy.c:156
int n_pre
Definition: bl1_dotaxpy.c:154
double rho_c
Definition: bl1_dotaxpy.c:151
double kappa_c
Definition: bl1_dotaxpy.c:150
int n_run
Definition: bl1_dotaxpy.c:155
* rho
Definition: bl1_dotaxpy.c:242
int i
Definition: bl1_dotaxpy.c:152

References alpha1, alpha1_c, bl1_abort(), chi1, v2df_t::d, i, kappa_c, n_left, n_pre, n_run, omega1, rho, rho_c, and v2df_t::v.

Referenced by FLA_Fused_Ahx_Ax_opd_var1(), and FLA_Fused_UZhu_ZUhu_opd_var1().

◆ bl1_ddotsv2()

void bl1_ddotsv2 ( conj1_t  conjxy,
int  n,
double *  x,
int  inc_x,
double *  y,
int  inc_y,
double *  z,
int  inc_z,
double *  beta,
double *  rho_xz,
double *  rho_yz 
)
44 {
45  double* restrict x1;
46  double* restrict y1;
47  double* restrict z1;
48  double rho1, rho2;
49  double x1c, y1c, z1c;
50  int i;
51 
52  int n_pre;
53  int n_run;
54  int n_left;
55 
56  v2df_t rho1v, rho2v;
57  v2df_t x1v, y1v, z1v;
58  v2df_t x2v, y2v, z2v;
59 
60  if ( inc_x != 1 ||
61  inc_y != 1 ||
62  inc_z != 1 ) bl1_abort();
63 
64  n_pre = 0;
65  if ( ( unsigned long ) z % 16 != 0 )
66  {
67  if ( ( unsigned long ) x % 16 == 0 ||
68  ( unsigned long ) y % 16 == 0 ) bl1_abort();
69 
70  n_pre = 1;
71  }
72 
73  n_run = ( n - n_pre ) / 4;
74  n_left = ( n - n_pre ) % 4;
75 
76  x1 = x;
77  y1 = y;
78  z1 = z;
79 
80  rho1 = 0.0;
81  rho2 = 0.0;
82 
83  if ( n_pre == 1 )
84  {
85  x1c = *x1;
86  y1c = *y1;
87  z1c = *z1;
88 
89  rho1 += x1c * z1c;
90  rho2 += y1c * z1c;
91 
92  x1 += inc_x;
93  y1 += inc_y;
94  z1 += inc_z;
95  }
96 
97  rho1v.v = _mm_setzero_pd();
98  rho2v.v = _mm_setzero_pd();
99 
100  for ( i = 0; i < n_run; ++i )
101  {
102  x1v.v = _mm_load_pd( ( double* )x1 );
103  y1v.v = _mm_load_pd( ( double* )y1 );
104  z1v.v = _mm_load_pd( ( double* )z1 );
105 
106  x2v.v = _mm_load_pd( ( double* )(x1 + 2) );
107  y2v.v = _mm_load_pd( ( double* )(y1 + 2) );
108  z2v.v = _mm_load_pd( ( double* )(z1 + 2) );
109 
110  rho1v.v += x1v.v * z1v.v;
111  rho2v.v += y1v.v * z1v.v;
112 
113  rho1v.v += x2v.v * z2v.v;
114  rho2v.v += y2v.v * z2v.v;
115 
116  x1 += 4;
117  y1 += 4;
118  z1 += 4;
119  }
120 
121  rho1 += rho1v.d[0] + rho1v.d[1];
122  rho2 += rho2v.d[0] + rho2v.d[1];
123 
124  if ( n_left > 0 )
125  {
126  for ( i = 0; i < n_left; ++i )
127  {
128  x1c = *x1;
129  y1c = *y1;
130  z1c = *z1;
131 
132  rho1 += x1c * z1c;
133  rho2 += y1c * z1c;
134 
135  x1 += inc_x;
136  y1 += inc_y;
137  z1 += inc_z;
138  }
139  }
140 
141  *rho_xz = *beta * *rho_xz + rho1;
142  *rho_yz = *beta * *rho_yz + rho2;
143 }
double *restrict z1
Definition: bl1_dotsv2.c:148
double rho2
Definition: bl1_dotsv2.c:149
int n_left
Definition: bl1_dotsv2.c:156
int n_pre
Definition: bl1_dotsv2.c:154
double rho1
Definition: bl1_dotsv2.c:149
double z1c
Definition: bl1_dotsv2.c:150
int n_run
Definition: bl1_dotsv2.c:155
double y1c
Definition: bl1_dotsv2.c:150
* rho_xz
Definition: bl1_dotsv2.c:229
double *restrict y1
Definition: bl1_dotsv2.c:145
int i
Definition: bl1_dotsv2.c:152
double x1c
Definition: bl1_dotsv2.c:150
* rho_yz
Definition: bl1_dotsv2.c:230

References bl1_abort(), v2df_t::d, i, n_left, n_pre, n_run, rho1, rho2, rho_xz, rho_yz, v2df_t::v, x1, x1c, y1, y1c, z1, and z1c.

Referenced by FLA_Fused_Ahx_Axpy_Ax_opd_var1(), FLA_Fused_Gerc2_Ahx_Axpy_Ax_opd_var1(), FLA_Fused_Uhu_Yhu_Zhu_opd_var1(), FLA_Fused_UYx_ZVx_opd_var1(), and FLA_Fused_UZhu_ZUhu_opd_var1().

◆ bl1_ddotsv3()

/*
 * bl1_ddotsv3
 *
 * Fused triple dot product on real double-precision vectors sharing the
 * right-hand operand z:
 *
 *   *rho_xz := *beta * *rho_xz + sum_i x[i] * z[i]
 *   *rho_yz := *beta * *rho_yz + sum_i y[i] * z[i]
 *   *rho_wz := *beta * *rho_wz + sum_i w[i] * z[i]
 *
 * Parameters
 *   conjxyw            not read by this routine.
 *   n                  element count; for n <= 0 no vector element is
 *                      touched and only the beta scaling is applied.
 *   x, y, w, z         input vectors.
 *   inc_x .. inc_z     strides; anything other than 1 calls bl1_abort().
 *   beta               scalar applied to the incoming rho values.
 *   rho_xz/yz/wz       in/out results.
 *
 * Alignment
 *   The main loop uses _mm_load_pd, which needs 16-byte aligned addresses.
 *   At most one leading element is peeled, so all four vectors must sit at
 *   the same offset modulo 16 and that offset must be 0 or sizeof(double).
 *   Any other combination calls bl1_abort().  (The earlier check only fired
 *   when z was misaligned, letting an aligned z with a misaligned x/y/w
 *   reach the aligned loads.)
 */
void bl1_ddotsv3( conj1_t conjxyw,
                  int     n,
                  double* x, int inc_x,
                  double* y, int inc_y,
                  double* w, int inc_w,
                  double* z, int inc_z,
                  double* beta,
                  double* rho_xz,
                  double* rho_yz,
                  double* rho_wz )
{
    double* restrict x1 = x;
    double* restrict y1 = y;
    double* restrict w1 = w;
    double* restrict z1 = z;
    double           rho1 = 0.0;
    double           rho2 = 0.0;
    double           rho3 = 0.0;
    double           x1c, y1c, w1c, z1c;
    int              i;

    int              n_pre  = 0;
    int              n_run  = 0;
    int              n_left = 0;

    v2df_t           rho1v, rho2v, rho3v;
    v2df_t           xv, yv, wv, zv;

    if ( inc_x != 1 ||
         inc_y != 1 ||
         inc_w != 1 ||
         inc_z != 1 ) bl1_abort();

    if ( n > 0 )
    {
        /* Byte offset of each operand inside its 16-byte line. */
        unsigned long off_x = ( unsigned long ) x % 16;
        unsigned long off_y = ( unsigned long ) y % 16;
        unsigned long off_w = ( unsigned long ) w % 16;
        unsigned long off_z = ( unsigned long ) z % 16;

        /* One shared peel can only align operands that start at the same
           offset, and only if that offset is exactly one double. */
        if ( off_x != off_z ||
             off_y != off_z ||
             off_w != off_z ) bl1_abort();

        if ( off_z != 0 &&
             off_z != sizeof( double ) ) bl1_abort();

        n_pre  = ( off_z != 0 ) ? 1 : 0;
        n_run  = ( n - n_pre ) / 4;
        n_left = ( n - n_pre ) % 4;
    }

    /* Scalar peel: consume one element so the remainder is 16-byte aligned. */
    if ( n_pre == 1 )
    {
        x1c = *x1;
        y1c = *y1;
        w1c = *w1;
        z1c = *z1;

        rho1 += x1c * z1c;
        rho2 += y1c * z1c;
        rho3 += w1c * z1c;

        x1 += inc_x;
        y1 += inc_y;
        w1 += inc_w;
        z1 += inc_z;
    }

    rho1v.v = _mm_setzero_pd();
    rho2v.v = _mm_setzero_pd();
    rho3v.v = _mm_setzero_pd();

    /* Vector body: four elements per iteration as two 2-wide steps. */
    for ( i = 0; i < n_run; ++i )
    {
        xv.v = _mm_load_pd( ( double* )x1 );
        yv.v = _mm_load_pd( ( double* )y1 );
        wv.v = _mm_load_pd( ( double* )w1 );
        zv.v = _mm_load_pd( ( double* )z1 );

        rho1v.v += xv.v * zv.v;
        rho2v.v += yv.v * zv.v;
        rho3v.v += wv.v * zv.v;

        xv.v = _mm_load_pd( ( double* )(x1 + 2) );
        yv.v = _mm_load_pd( ( double* )(y1 + 2) );
        wv.v = _mm_load_pd( ( double* )(w1 + 2) );
        zv.v = _mm_load_pd( ( double* )(z1 + 2) );

        rho1v.v += xv.v * zv.v;
        rho2v.v += yv.v * zv.v;
        rho3v.v += wv.v * zv.v;

        x1 += 4;
        y1 += 4;
        w1 += 4;
        z1 += 4;
    }

    /* Fold the two lanes of each vector accumulator into the scalar sums. */
    rho1 += rho1v.d[0] + rho1v.d[1];
    rho2 += rho2v.d[0] + rho2v.d[1];
    rho3 += rho3v.d[0] + rho3v.d[1];

    /* Scalar tail: up to three leftover elements. */
    for ( i = 0; i < n_left; ++i )
    {
        x1c = *x1;
        y1c = *y1;
        w1c = *w1;
        z1c = *z1;

        rho1 += x1c * z1c;
        rho2 += y1c * z1c;
        rho3 += w1c * z1c;

        x1 += inc_x;
        y1 += inc_y;
        w1 += inc_w;
        z1 += inc_z;
    }

    *rho_xz = *beta * *rho_xz + rho1;
    *rho_yz = *beta * *rho_yz + rho2;
    *rho_wz = *beta * *rho_wz + rho3;
}
double *restrict z1
Definition: bl1_dotsv3.c:173
int n_left
Definition: bl1_dotsv3.c:181
* rho_wz
Definition: bl1_dotsv3.c:270
int n_pre
Definition: bl1_dotsv3.c:179
double *restrict y1
Definition: bl1_dotsv3.c:169
double *restrict w1
Definition: bl1_dotsv3.c:172
double z1c
Definition: bl1_dotsv3.c:175
int n_run
Definition: bl1_dotsv3.c:180
double rho1
Definition: bl1_dotsv3.c:174
double rho3
Definition: bl1_dotsv3.c:174
double y1c
Definition: bl1_dotsv3.c:175
double rho2
Definition: bl1_dotsv3.c:174
* rho_xz
Definition: bl1_dotsv3.c:268
x1
Definition: bl1_dotsv3.c:452
int i
Definition: bl1_dotsv3.c:177
double x1c
Definition: bl1_dotsv3.c:175
* rho_yz
Definition: bl1_dotsv3.c:269
double w1c
Definition: bl1_dotsv3.c:175

References bl1_abort(), v2df_t::d, i, n_left, n_pre, n_run, rho1, rho2, rho3, rho_wz, rho_xz, rho_yz, v2df_t::v, w1, w1c, x1, x1c, y1, y1c, z1, and z1c.

◆ bl1_ddotv2axpyv2b()

void bl1_ddotv2axpyv2b ( int  n,
double *  a1,
int  inc_a1,
double *  a2,
int  inc_a2,
double *  x,
int  inc_x,
double *  kappa1,
double *  kappa2,
double *  rho1,
double *  rho2,
double *  w,
int  inc_w 
)
46 {
47  double* restrict alpha1;
48  double* restrict alpha2;
49  double* restrict chi1;
50  double* restrict omega1;
51  double rho1_c;
52  double rho2_c;
53  int i;
54 
55  int n_pre;
56  int n_run;
57  int n_left;
58 
59  v2df_t k1v, rho1v;
60  v2df_t k2v, rho2v;
61  v2df_t a11v, a12v, x1v, w1v;
62  v2df_t a21v, a22v, x2v, w2v;
63 
64  if ( inc_a1 != 1 ||
65  inc_a2 != 1 ||
66  inc_x != 1 ||
67  inc_w != 1 ) bl1_abort();
68 
69  n_pre = 0;
70  if ( ( unsigned long ) a1 % 16 != 0 )
71  {
72  if ( ( unsigned long ) a2 % 16 == 0 ||
73  ( unsigned long ) x % 16 == 0 ||
74  ( unsigned long ) w % 16 == 0 ) bl1_abort();
75 
76  n_pre = 1;
77  }
78 
79  n_run = ( n - n_pre ) / 4;
80  n_left = ( n - n_pre ) % 4;
81 
82  alpha1 = a1;
83  alpha2 = a2;
84  chi1 = x;
85  omega1 = w;
86 
87  rho1_c = 0.0;
88  rho2_c = 0.0;
89 
90  if ( n_pre == 1 )
91  {
92  double kappa1_c = *kappa1;
93  double kappa2_c = *kappa2;
94  double alpha1_c = *alpha1;
95  double alpha2_c = *alpha2;
96  double chi1_c = *chi1;
97  double omega1_c = *omega1;
98 
99  rho1_c += alpha1_c * chi1_c;
100  omega1_c += kappa1_c * alpha1_c;
101 
102  rho2_c += alpha2_c * chi1_c;
103  omega1_c += kappa2_c * alpha2_c;
104 
105  *omega1 = omega1_c;
106 
107  alpha1 += inc_a1;
108  alpha2 += inc_a2;
109  chi1 += inc_x;
110  omega1 += inc_w;
111  }
112 
113  rho1v.v = _mm_setzero_pd();
114  rho2v.v = _mm_setzero_pd();
115 
116  k1v.v = _mm_loaddup_pd( ( double* )kappa1 );
117  k2v.v = _mm_loaddup_pd( ( double* )kappa2 );
118 
119  for ( i = 0; i < n_run; ++i )
120  {
121  a11v.v = _mm_load_pd( ( double* )alpha1 );
122  a12v.v = _mm_load_pd( ( double* )alpha2 );
123  x1v.v = _mm_load_pd( ( double* )chi1 );
124  w1v.v = _mm_load_pd( ( double* )omega1 );
125 
126  rho1v.v += a11v.v * x1v.v;
127  w1v.v += k1v.v * a11v.v;
128 
129  rho2v.v += a12v.v * x1v.v;
130  w1v.v += k2v.v * a12v.v;
131 
132  _mm_store_pd( ( double* )omega1, w1v.v );
133 
134  a21v.v = _mm_load_pd( ( double* )(alpha1 + 2) );
135  a22v.v = _mm_load_pd( ( double* )(alpha2 + 2) );
136  x2v.v = _mm_load_pd( ( double* )(chi1 + 2) );
137  w2v.v = _mm_load_pd( ( double* )(omega1 + 2) );
138 
139  rho1v.v += a21v.v * x2v.v;
140  w2v.v += k1v.v * a21v.v;
141 
142  rho2v.v += a22v.v * x2v.v;
143  w2v.v += k2v.v * a22v.v;
144 
145  _mm_store_pd( ( double* )(omega1 + 2), w2v.v );
146 
147  alpha1 += 4;
148  alpha2 += 4;
149  chi1 += 4;
150  omega1 += 4;
151  }
152 
153  if ( n_left > 0 )
154  {
155  for ( i = 0; i < n_left; ++i )
156  {
157  double kappa1_c = *kappa1;
158  double kappa2_c = *kappa2;
159  double alpha1_c = *alpha1;
160  double alpha2_c = *alpha2;
161  double chi1_c = *chi1;
162  double omega1_c = *omega1;
163 
164  rho1_c += alpha1_c * chi1_c;
165  omega1_c += kappa1_c * alpha1_c;
166 
167  rho2_c += alpha2_c * chi1_c;
168  omega1_c += kappa2_c * alpha2_c;
169 
170  *omega1 = omega1_c;
171 
172  alpha1 += inc_a1;
173  alpha2 += inc_a2;
174  chi1 += inc_x;
175  omega1 += inc_w;
176  }
177  }
178 
179  rho1_c += rho1v.d[0] + rho1v.d[1];
180  rho2_c += rho2v.d[0] + rho2v.d[1];
181 
182  *rho1 = rho1_c;
183  *rho2 = rho2_c;
184 }
double *restrict omega1
Definition: bl1_dotv2axpyv2b.c:190
double rho2_c
Definition: bl1_dotv2axpyv2b.c:194
double *restrict chi1
Definition: bl1_dotv2axpyv2b.c:189
double kappa1_c
Definition: bl1_dotv2axpyv2b.c:191
alpha1
Definition: bl1_dotv2axpyv2b.c:456
* rho2
Definition: bl1_dotv2axpyv2b.c:312
int n_left
Definition: bl1_dotv2axpyv2b.c:199
double rho1_c
Definition: bl1_dotv2axpyv2b.c:193
int n_pre
Definition: bl1_dotv2axpyv2b.c:197
* rho1
Definition: bl1_dotv2axpyv2b.c:311
int n_run
Definition: bl1_dotv2axpyv2b.c:198
double kappa2_c
Definition: bl1_dotv2axpyv2b.c:192
int i
Definition: bl1_dotv2axpyv2b.c:195

References alpha1, alpha1_c, alpha2, alpha2_c, bl1_abort(), chi1, v2df_t::d, i, kappa1_c, kappa2_c, n_left, n_pre, n_run, omega1, rho1, rho1_c, rho2, rho2_c, and v2df_t::v.

Referenced by FLA_Fused_Ahx_Ax_opd_var1(), and FLA_Fused_UZhu_ZUhu_opd_var1().

◆ bl1_saxmyv2()

/*
 * bl1_saxmyv2
 *
 * float-typed entry point.  The body consists solely of a call to
 * bl1_abort(); none of the arguments are examined.
 */
void bl1_saxmyv2( conj1_t conjx,
                  int     n,
                  float*  alpha,
                  float*  beta,
                  float*  x, int inc_x,
                  float*  y, int inc_y,
                  float*  z, int inc_z )
{
    bl1_abort();
}

References bl1_abort().

◆ bl1_saxpyv2b()

/*
 * bl1_saxpyv2b
 *
 * float-typed entry point.  The body consists solely of a call to
 * bl1_abort(); none of the arguments are examined.
 */
void bl1_saxpyv2b( int    n,
                   float* beta1,
                   float* beta2,
                   float* a1, int inc_a1,
                   float* a2, int inc_a2,
                   float* w,  int inc_w )
{
    bl1_abort();
}

References bl1_abort().

◆ bl1_saxpyv2bdotaxpy()

/*
 * bl1_saxpyv2bdotaxpy
 *
 * float-typed entry point.  The body consists solely of a call to
 * bl1_abort(); none of the arguments are examined.
 */
void bl1_saxpyv2bdotaxpy( int    n,
                          float* beta,
                          float* u, int inc_u,
                          float* gamma,
                          float* z, int inc_z,
                          float* a, int inc_a,
                          float* x, int inc_x,
                          float* kappa,
                          float* rho,
                          float* w, int inc_w )
{
    bl1_abort();
}

References bl1_abort().

◆ bl1_saxpyv3b()

/*
 * bl1_saxpyv3b
 *
 * float-typed entry point.  The body consists solely of a call to
 * bl1_abort(); none of the arguments are examined.
 */
void bl1_saxpyv3b( int    n,
                   float* beta1,
                   float* beta2,
                   float* beta3,
                   float* a1, int inc_a1,
                   float* a2, int inc_a2,
                   float* a3, int inc_a3,
                   float* w,  int inc_w )
{
    bl1_abort();
}

References bl1_abort().

◆ bl1_sdotaxmyv2()

/*
 * bl1_sdotaxmyv2
 *
 * float-typed entry point.  The body consists solely of a call to
 * bl1_abort(); none of the arguments are examined.
 */
void bl1_sdotaxmyv2( int    n,
                     float* alpha,
                     float* beta,
                     float* x, int inc_x,
                     float* u, int inc_u,
                     float* rho,
                     float* y, int inc_y,
                     float* z, int inc_z )
{
    bl1_abort();
}

References bl1_abort().

◆ bl1_sdotaxpy()

/*
 * bl1_sdotaxpy
 *
 * float-typed entry point.  The body consists solely of a call to
 * bl1_abort(); none of the arguments are examined.
 */
void bl1_sdotaxpy( int    n,
                   float* a, int inc_a,
                   float* x, int inc_x,
                   float* kappa,
                   float* rho,
                   float* w, int inc_w )
{
    bl1_abort();
}

References bl1_abort().

◆ bl1_sdotsv2()

/*
 * bl1_sdotsv2
 *
 * float-typed entry point.  The body consists solely of a call to
 * bl1_abort(); none of the arguments are examined.
 */
void bl1_sdotsv2( conj1_t conjxy,
                  int     n,
                  float*  x, int inc_x,
                  float*  y, int inc_y,
                  float*  z, int inc_z,
                  float*  beta,
                  float*  rho_xz,
                  float*  rho_yz )
{
    bl1_abort();
}

References bl1_abort().

◆ bl1_sdotsv3()

/*
 * bl1_sdotsv3
 *
 * float-typed entry point.  The body consists solely of a call to
 * bl1_abort(); none of the arguments are examined.
 */
void bl1_sdotsv3( conj1_t conjxyw,
                  int     n,
                  float*  x, int inc_x,
                  float*  y, int inc_y,
                  float*  w, int inc_w,
                  float*  z, int inc_z,
                  float*  beta,
                  float*  rho_xz,
                  float*  rho_yz,
                  float*  rho_wz )
{
    bl1_abort();
}

References bl1_abort().

◆ bl1_sdotv2axpyv2b()

/*
 * bl1_sdotv2axpyv2b
 *
 * float-typed entry point.  The body consists solely of a call to
 * bl1_abort(); none of the arguments are examined.
 */
void bl1_sdotv2axpyv2b( int    n,
                        float* a1, int inc_a1,
                        float* a2, int inc_a2,
                        float* x,  int inc_x,
                        float* kappa1,
                        float* kappa2,
                        float* rho1,
                        float* rho2,
                        float* w,  int inc_w )
{
    bl1_abort();
}

References bl1_abort().

◆ bl1_zaxmyv2()

/*
 * bl1_zaxmyv2
 *
 * Fused update of two double-complex vectors by scaled copies of x.
 *
 *   conjx requests conjugation (scalar path):
 *       y[i] := y[i] + alpha * conj( x[i] )
 *       z[i] := z[i] + beta  * conj( x[i] )
 *   otherwise (SSE3 path):
 *       y[i] := y[i] - alpha * x[i]
 *       z[i] := z[i] - beta  * x[i]
 *
 * NOTE(review): the two paths differ in sign (the conjugate path adds, the
 * other subtracts).  That is what the code did before this rewrite and it is
 * preserved here; confirm against the callers which sign is intended.
 *
 * Parameters
 *   conjx          selects the path via bl1_is_conj().
 *   n              element count.
 *   alpha, beta    scalars.
 *   x              input vector, stride inc_x.
 *   y, z           vectors updated in place, strides inc_y / inc_z.
 *
 * The SSE3 path uses aligned 16-byte loads and stores, so alpha, beta and
 * every visited element of x, y and z must be 16-byte aligned on that path.
 */
void bl1_zaxmyv2( conj1_t   conjx,
                  int       n,
                  dcomplex* alpha,
                  dcomplex* beta,
                  dcomplex* x, int inc_x,
                  dcomplex* y, int inc_y,
                  dcomplex* z, int inc_z )
{
    dcomplex* restrict chi1  = x;
    dcomplex* restrict psi1  = y;
    dcomplex* restrict zeta1 = z;
    int                i;

    if ( bl1_is_conj( conjx ) )
    {
        dcomplex alpha_c = *alpha;
        dcomplex beta_c  = *beta;

        for ( i = 0; i < n; ++i )
        {
            dcomplex chi1_c = *chi1;

            /* psi1 = psi1 + alpha * conj( chi1 ) */
            psi1->real += alpha_c.real *  chi1_c.real - alpha_c.imag * -chi1_c.imag;
            psi1->imag += alpha_c.real * -chi1_c.imag + alpha_c.imag *  chi1_c.real;

            /* zeta1 = zeta1 + beta * conj( chi1 ) */
            zeta1->real += beta_c.real *  chi1_c.real - beta_c.imag * -chi1_c.imag;
            zeta1->imag += beta_c.real * -chi1_c.imag + beta_c.imag *  chi1_c.real;

            chi1  += inc_x;
            psi1  += inc_y;
            zeta1 += inc_z;
        }
    }
    else
    {
        v2df_t alphav, alpharv;   /* ( re, im ) and ( im, re ) of alpha */
        v2df_t betav,  betarv;    /* ( re, im ) and ( im, re ) of beta  */
        v2df_t xrv, xiv;          /* ( x.re, x.re ) and ( x.im, x.im )  */
        v2df_t y1v, z1v;
        v2df_t acbc, bdad;

        alphav.v  = _mm_load_pd( ( double* )alpha );
        betav.v   = _mm_load_pd( ( double* )beta );
        alpharv.v = _mm_shuffle_pd( alphav.v, alphav.v, _MM_SHUFFLE2 (0,1) );
        betarv.v  = _mm_shuffle_pd( betav.v,  betav.v,  _MM_SHUFFLE2 (0,1) );

        /* The unit-stride and general-stride loops were identical apart
           from the increment, so a single loop serves both. */
        for ( i = 0; i < n; ++i )
        {
            xrv.v = _mm_load_pd( ( double* )chi1 );
            xiv.v = _mm_shuffle_pd( xrv.v, xrv.v, _MM_SHUFFLE2 (1,1) );
            xrv.v = _mm_shuffle_pd( xrv.v, xrv.v, _MM_SHUFFLE2 (0,0) );

            /* addsub yields ( a.re*x.re - a.im*x.im, a.im*x.re + a.re*x.im ),
               i.e. the complex product alpha * chi1. */
            acbc.v = alphav.v  * xrv.v;
            bdad.v = alpharv.v * xiv.v;
            y1v.v  = _mm_load_pd( ( double* )psi1 );
            y1v.v  = y1v.v - _mm_addsub_pd( acbc.v, bdad.v );
            _mm_store_pd( ( double* )psi1, y1v.v );

            acbc.v = betav.v  * xrv.v;
            bdad.v = betarv.v * xiv.v;
            z1v.v  = _mm_load_pd( ( double* )zeta1 );
            z1v.v  = z1v.v - _mm_addsub_pd( acbc.v, bdad.v );
            _mm_store_pd( ( double* )zeta1, z1v.v );

            chi1  += inc_x;
            psi1  += inc_y;
            zeta1 += inc_z;
        }
    }
}
int bl1_is_conj(conj1_t conj)
Definition: bl1_is.c:42
dcomplex
Definition: blis_type_defs.h:138
double real
Definition: blis_type_defs.h:139
double imag
Definition: blis_type_defs.h:139

References alpha_c, beta_c, bl1_is_conj(), chi1, i, dcomplex::imag, psi1, dcomplex::real, v2df_t::v, and zeta1.

Referenced by FLA_Fused_UYx_ZVx_opz_var1().

◆ bl1_zaxpyv2b()

/*
 * bl1_zaxpyv2b
 *
 * Two-term axpy on double-complex vectors:
 *
 *   w[i] := w[i] + beta1 * a1[i] + beta2 * a2[i]
 *
 * Parameters
 *   n              element count.
 *   beta1, beta2   scalars.
 *   a1, a2         input vectors, strides inc_a1 / inc_a2.
 *   w              vector updated in place, stride inc_w.
 *
 * The rendered listing declared the parameters as (beta1, beta2, a1, a2, w)
 * while its body referred to (alpha1, alpha2, x1, x2, y).  The names are
 * reconciled here by position.  NOTE(review): positional correspondence is
 * assumed; confirm against the definition in bl1_axpyv2b.c.
 *
 * Aligned 16-byte loads and stores are used, so beta1, beta2 and every
 * visited element of a1, a2 and w must be 16-byte aligned.
 */
void bl1_zaxpyv2b( int       n,
                   dcomplex* beta1,
                   dcomplex* beta2,
                   dcomplex* a1, int inc_a1,
                   dcomplex* a2, int inc_a2,
                   dcomplex* w,  int inc_w )
{
    dcomplex* restrict chi1 = a1;
    dcomplex* restrict chi2 = a2;
    dcomplex* restrict psi1 = w;
    int                i;
    v2df_t             s1v, s1rv;   /* ( re, im ) and ( im, re ) of beta1 */
    v2df_t             s2v, s2rv;   /* ( re, im ) and ( im, re ) of beta2 */
    v2df_t             xrv, xiv;    /* ( x.re, x.re ) and ( x.im, x.im )  */
    v2df_t             t1v, y1v;
    v2df_t             acbc, bdad;

    s1v.v  = _mm_load_pd( ( double* )beta1 );
    s2v.v  = _mm_load_pd( ( double* )beta2 );
    s1rv.v = _mm_shuffle_pd( s1v.v, s1v.v, _MM_SHUFFLE2 (0,1) );
    s2rv.v = _mm_shuffle_pd( s2v.v, s2v.v, _MM_SHUFFLE2 (0,1) );

    /* The unit-stride and general-stride loops were identical apart from
       the increment, so a single loop serves both. */
    for ( i = 0; i < n; ++i )
    {
        /* t1 = beta1 * chi1 */
        xrv.v  = _mm_load_pd( ( double* )chi1 );
        xiv.v  = _mm_shuffle_pd( xrv.v, xrv.v, _MM_SHUFFLE2 (1,1) );
        xrv.v  = _mm_shuffle_pd( xrv.v, xrv.v, _MM_SHUFFLE2 (0,0) );
        acbc.v = s1v.v  * xrv.v;
        bdad.v = s1rv.v * xiv.v;
        t1v.v  = _mm_addsub_pd( acbc.v, bdad.v );

        /* t1 = t1 + beta2 * chi2 */
        xrv.v  = _mm_load_pd( ( double* )chi2 );
        xiv.v  = _mm_shuffle_pd( xrv.v, xrv.v, _MM_SHUFFLE2 (1,1) );
        xrv.v  = _mm_shuffle_pd( xrv.v, xrv.v, _MM_SHUFFLE2 (0,0) );
        acbc.v = s2v.v  * xrv.v;
        bdad.v = s2rv.v * xiv.v;
        t1v.v  = t1v.v + _mm_addsub_pd( acbc.v, bdad.v );

        /* psi1 = psi1 + t1 */
        y1v.v = _mm_load_pd( ( double* )psi1 );
        y1v.v = y1v.v + t1v.v;
        _mm_store_pd( ( double* )psi1, y1v.v );

        chi1 += inc_a1;
        chi2 += inc_a2;
        psi1 += inc_w;
    }
}

References alpha1, alpha2, chi1, chi2, i, psi1, v2df_t::v, and x1.

Referenced by FLA_Fused_Ahx_Axpy_Ax_opz_var1(), FLA_Fused_Gerc2_Ahx_Ax_opz_var1(), FLA_Fused_Gerc2_Ahx_Axpy_Ax_opz_var1(), FLA_Fused_Gerc2_opz_var1(), FLA_Fused_Her2_Ax_l_opz_var1(), and FLA_Fused_Uhu_Yhu_Zhu_opz_var1().

◆ bl1_zaxpyv2bdotaxpy()

/*
 * bl1_zaxpyv2bdotaxpy
 *
 * Fused kernel on double-complex vectors.  For every element i:
 *
 *   a[i] := a[i] + beta * u[i] + gamma * z[i]
 *   acc  := acc  + conj( a[i] ) * x[i]        (using the updated a[i])
 *   w[i] := w[i] + kappa * a[i]               (using the updated a[i])
 *
 * and on exit *rho := acc.  The previous value of *rho is not read.
 *
 * Parameters
 *   n                      element count.
 *   beta, gamma, kappa     scalars.
 *   u, z, x                input vectors.
 *   a, w                   vectors updated in place.
 *   rho                    output scalar.
 *   inc_*                  strides; anything other than 1 calls bl1_abort().
 *
 * Aligned 16-byte loads and stores are used, so rho and every element of
 * u, z, a, x and w must be 16-byte aligned.
 */
void bl1_zaxpyv2bdotaxpy( int       n,
                          dcomplex* beta,
                          dcomplex* u, int inc_u,
                          dcomplex* gamma,
                          dcomplex* z, int inc_z,
                          dcomplex* a, int inc_a,
                          dcomplex* x, int inc_x,
                          dcomplex* kappa,
                          dcomplex* rho,
                          dcomplex* w, int inc_w )
{
    dcomplex* restrict upsilon1 = u;
    dcomplex* restrict zeta1    = z;
    dcomplex* restrict alpha1   = a;
    dcomplex* restrict chi1     = x;
    dcomplex* restrict omega1   = w;
    int                i;

    v2df_t             beta_re,  beta_im;    /* broadcast real / imag parts */
    v2df_t             gamma_re, gamma_im;
    v2df_t             kappa_re, kappa_im;
    v2df_t             accv;                 /* holds ( imag, real ) of acc */
    v2df_t             av, av_sw, a_re, a_im;
    v2df_t             uv, uv_sw;
    v2df_t             zv, zv_sw;
    v2df_t             xv, xv_sw;
    v2df_t             wv;
    v2df_t             t0, t1;

    if ( inc_u != 1 ||
         inc_z != 1 ||
         inc_a != 1 ||
         inc_x != 1 ||
         inc_w != 1 ) bl1_abort();

    beta_re.v  = _mm_loaddup_pd( ( double* )&(beta->real) );
    beta_im.v  = _mm_loaddup_pd( ( double* )&(beta->imag) );
    gamma_re.v = _mm_loaddup_pd( ( double* )&(gamma->real) );
    gamma_im.v = _mm_loaddup_pd( ( double* )&(gamma->imag) );
    kappa_re.v = _mm_loaddup_pd( ( double* )&(kappa->real) );
    kappa_im.v = _mm_loaddup_pd( ( double* )&(kappa->imag) );

    accv.v = _mm_setzero_pd();

    for ( i = 0; i < n; ++i )
    {
        av.v = _mm_load_pd( ( double* )alpha1 );

        /* alpha1 += beta * upsilon1 */
        uv.v    = _mm_load_pd( ( double* )upsilon1 );
        uv_sw.v = _mm_shuffle_pd( uv.v, uv.v, _MM_SHUFFLE2 (0,1) );
        t0.v    = beta_re.v * uv.v;
        t1.v    = beta_im.v * uv_sw.v;
        av.v   += _mm_addsub_pd( t0.v, t1.v );

        /* alpha1 += gamma * zeta1 */
        zv.v    = _mm_load_pd( ( double* )zeta1 );
        zv_sw.v = _mm_shuffle_pd( zv.v, zv.v, _MM_SHUFFLE2 (0,1) );
        t0.v    = gamma_re.v * zv.v;
        t1.v    = gamma_im.v * zv_sw.v;
        av.v   += _mm_addsub_pd( t0.v, t1.v );

        _mm_store_pd( ( double* )alpha1, av.v );

        /* acc += conj( alpha1 ) * chi1.  addsub produces the pair
           ( imag, real ), which is swapped back after the loop. */
        xv.v    = _mm_load_pd( ( double* )chi1 );
        xv_sw.v = _mm_shuffle_pd( xv.v, xv.v, _MM_SHUFFLE2 (0,1) );
        a_im.v  = _mm_shuffle_pd( av.v, av.v, _MM_SHUFFLE2 (1,1) );
        a_re.v  = _mm_shuffle_pd( av.v, av.v, _MM_SHUFFLE2 (0,0) );
        t0.v    = a_re.v * xv_sw.v;
        t1.v    = a_im.v * xv.v;
        accv.v  = accv.v + _mm_addsub_pd( t0.v, t1.v );

        /* omega1 += kappa * alpha1 */
        wv.v    = _mm_load_pd( ( double* )omega1 );
        av_sw.v = _mm_shuffle_pd( av.v, av.v, _MM_SHUFFLE2 (0,1) );
        t0.v    = kappa_re.v * av.v;
        t1.v    = kappa_im.v * av_sw.v;
        wv.v   += _mm_addsub_pd( t0.v, t1.v );

        _mm_store_pd( ( double* )omega1, wv.v );

        upsilon1 += 1;
        zeta1    += 1;
        alpha1   += 1;
        chi1     += 1;
        omega1   += 1;
    }

    /* Swap lanes so the stored pair is ( real, imag ). */
    accv.v = _mm_shuffle_pd( accv.v, accv.v, _MM_SHUFFLE2 (0,1) );

    _mm_store_pd( ( double* )rho, accv.v );
}

References alpha1, bl1_abort(), chi1, i, dcomplex::imag, omega1, dcomplex::real, rho, upsilon1, v2df_t::v, and zeta1.

◆ bl1_zaxpyv2bdots()

void bl1_zaxpyv2bdots ( int  n,
dcomplex *  alpha1,
dcomplex *  alpha2,
dcomplex *  x1,
int  inc_x1,
dcomplex *  x2,
int  inc_x2,
dcomplex *  y,
int  inc_y,
dcomplex *  u,
int  inc_u,
dcomplex *  beta,
dcomplex *  rho 
)

◆ bl1_zaxpyv3b()

/*
 * bl1_zaxpyv3b
 *
 * dcomplex-typed entry point.  The body consists solely of a call to
 * bl1_abort(); none of the arguments are examined.
 *
 * The scalar and vector arguments are pointers; the `*` on each dcomplex
 * parameter had been dropped from the rendered signature.
 */
void bl1_zaxpyv3b( int       n,
                   dcomplex* beta1,
                   dcomplex* beta2,
                   dcomplex* beta3,
                   dcomplex* a1, int inc_a1,
                   dcomplex* a2, int inc_a2,
                   dcomplex* a3, int inc_a3,
                   dcomplex* w,  int inc_w )
{
    bl1_abort();
}

References bl1_abort().

◆ bl1_zdotaxmyv2()

/*
 * bl1_zdotaxmyv2
 *
 * Fused kernel on double-complex vectors.  For every element i:
 *
 *   y[i] := y[i] - alpha * x[i]
 *   z[i] := z[i] - beta  * x[i]
 *   acc  := acc  + conj( x[i] ) * u[i]
 *
 * and on exit *rho := acc.  The previous value of *rho is not read.
 *
 * Parameters
 *   n              element count.
 *   alpha, beta    scalars.
 *   x, u           input vectors.
 *   y, z           vectors updated in place.
 *   rho            output scalar.
 *   inc_*          strides; anything other than 1 calls bl1_abort().
 *
 * Aligned 16-byte loads and stores are used, so rho and every element of
 * x, y and z must be 16-byte aligned.
 */
void bl1_zdotaxmyv2( int       n,
                     dcomplex* alpha,
                     dcomplex* beta,
                     dcomplex* x, int inc_x,
                     dcomplex* u, int inc_u,
                     dcomplex* rho,
                     dcomplex* y, int inc_y,
                     dcomplex* z, int inc_z )
{
    dcomplex* restrict chi1     = x;
    dcomplex* restrict upsilon1 = u;
    dcomplex* restrict psi1     = y;
    dcomplex* restrict zeta1    = z;
    int                i;

    v2df_t             alpha_re, alpha_im;   /* broadcast real / imag parts */
    v2df_t             beta_re,  beta_im;
    v2df_t             accv;                 /* holds ( imag, -real-swapped ) until fixed up */
    v2df_t             xv, xv_sw;
    v2df_t             yv;
    v2df_t             zv;
    v2df_t             u_re, u_im;
    v2df_t             t0, t1;

    if ( inc_x != 1 ||
         inc_u != 1 ||
         inc_y != 1 ||
         inc_z != 1 ) bl1_abort();

    accv.v = _mm_setzero_pd();

    alpha_re.v = _mm_loaddup_pd( ( double* )&(alpha->real) );
    alpha_im.v = _mm_loaddup_pd( ( double* )&(alpha->imag) );
    beta_re.v  = _mm_loaddup_pd( ( double* )&(beta->real) );
    beta_im.v  = _mm_loaddup_pd( ( double* )&(beta->imag) );

    for ( i = 0; i < n; ++i )
    {
        /* chi1 as ( re, im ) and, lane-swapped, as ( im, re ); both are
           reused by all three updates below. */
        xv.v    = _mm_load_pd( ( double* )chi1 );
        xv_sw.v = _mm_shuffle_pd( xv.v, xv.v, _MM_SHUFFLE2 (0,1) );

        /* psi1 -= alpha * chi1 */
        t0.v = alpha_re.v * xv.v;
        t1.v = alpha_im.v * xv_sw.v;
        yv.v = _mm_load_pd( ( double* )psi1 );
        yv.v = yv.v - _mm_addsub_pd( t0.v, t1.v );
        _mm_store_pd( ( double* )psi1, yv.v );

        /* zeta1 -= beta * chi1 */
        t0.v = beta_re.v * xv.v;
        t1.v = beta_im.v * xv_sw.v;
        zv.v = _mm_load_pd( ( double* )zeta1 );
        zv.v = zv.v - _mm_addsub_pd( t0.v, t1.v );
        _mm_store_pd( ( double* )zeta1, zv.v );

        /* acc += conj( chi1 ) * upsilon1, accumulated as
           ( x.im*u.re - x.re*u.im, x.re*u.re + x.im*u.im ); the lane swap
           and sign flip after the loop turn this into ( real, imag ). */
        u_re.v = _mm_loaddup_pd( ( double* )&(upsilon1->real) );
        u_im.v = _mm_loaddup_pd( ( double* )&(upsilon1->imag) );
        t0.v   = xv_sw.v * u_re.v;
        t1.v   = xv.v    * u_im.v;
        accv.v = accv.v + _mm_addsub_pd( t0.v, t1.v );

        chi1     += 1;
        upsilon1 += 1;
        psi1     += 1;
        zeta1    += 1;
    }

    accv.v = _mm_shuffle_pd( accv.v, accv.v, _MM_SHUFFLE2 (0,1) );

    accv.d[1] = -accv.d[1];

    _mm_store_pd( ( double* )rho, accv.v );
}

References bl1_abort(), chi1, v2df_t::d, i, dcomplex::imag, psi1, dcomplex::real, rho, upsilon1, v2df_t::v, and zeta1.

◆ bl1_zdotaxpy()

/*
 * bl1_zdotaxpy
 *
 * Fused dot product and axpy on double-complex vectors.  For every
 * element i:
 *
 *   acc  := acc  + conj( a[i] ) * x[i]
 *   w[i] := w[i] + kappa * a[i]
 *
 * and on exit *rho := acc.  The previous value of *rho is not read.
 *
 * Parameters
 *   n          element count.
 *   a, x       input vectors.
 *   kappa      axpy scalar.
 *   rho        output scalar.
 *   w          vector updated in place.
 *   inc_*      strides; anything other than 1 calls bl1_abort().
 *
 * Aligned 16-byte loads and stores are used, so kappa, rho and every
 * element of x and w must be 16-byte aligned.
 */
void bl1_zdotaxpy( int       n,
                   dcomplex* a, int inc_a,
                   dcomplex* x, int inc_x,
                   dcomplex* kappa,
                   dcomplex* rho,
                   dcomplex* w, int inc_w )
{
    dcomplex* restrict alpha1 = a;
    dcomplex* restrict chi1   = x;
    dcomplex* restrict omega1 = w;
    int                i;

    v2df_t             kv, kv_sw;     /* ( re, im ) and ( im, re ) of kappa */
    v2df_t             accv;          /* holds ( imag, real ) of acc        */
    v2df_t             a_re, a_im;    /* broadcast real / imag of alpha1    */
    v2df_t             xv, xv_sw;
    v2df_t             wv;
    v2df_t             t0, t1;

    if ( inc_a != 1 ||
         inc_x != 1 ||
         inc_w != 1 ) bl1_abort();

    kv.v    = _mm_load_pd( ( double* )kappa );
    kv_sw.v = _mm_shuffle_pd( kv.v, kv.v, _MM_SHUFFLE2 (0,1) );

    accv.v = _mm_setzero_pd();

    for ( i = 0; i < n; ++i )
    {
        a_re.v = _mm_loaddup_pd( ( double* )&(alpha1->real) );
        a_im.v = _mm_loaddup_pd( ( double* )&(alpha1->imag) );

        /* acc += conj( alpha1 ) * chi1.  addsub produces the pair
           ( imag, real ), which is swapped back after the loop. */
        xv.v    = _mm_load_pd( ( double* )chi1 );
        xv_sw.v = _mm_shuffle_pd( xv.v, xv.v, _MM_SHUFFLE2 (0,1) );
        t0.v    = a_re.v * xv_sw.v;
        t1.v    = a_im.v * xv.v;
        accv.v  = accv.v + _mm_addsub_pd( t0.v, t1.v );

        /* omega1 += kappa * alpha1 */
        wv.v  = _mm_load_pd( ( double* )omega1 );
        t0.v  = kv.v    * a_re.v;
        t1.v  = kv_sw.v * a_im.v;
        wv.v += _mm_addsub_pd( t0.v, t1.v );

        _mm_store_pd( ( double* )omega1, wv.v );

        alpha1 += 1;
        chi1   += 1;
        omega1 += 1;
    }

    /* Swap lanes so the stored pair is ( real, imag ). */
    accv.v = _mm_shuffle_pd( accv.v, accv.v, _MM_SHUFFLE2 (0,1) );

    _mm_store_pd( ( double* )rho, accv.v );
}

References alpha1, bl1_abort(), chi1, i, omega1, rho, and v2df_t::v.

Referenced by FLA_Fused_Ahx_Ax_opz_var1(), FLA_Fused_Gerc2_Ahx_Ax_opz_var1(), FLA_Fused_Her2_Ax_l_opz_var1(), and FLA_Fused_UZhu_ZUhu_opz_var1().

◆ bl1_zdotsv2()

/*
 * bl1_zdotsv2
 *
 * Fused pair of double-complex dot products sharing the operand z:
 *
 *   *rho_xz := *beta * *rho_xz + sum_i op( x[i] ) * z[i]
 *   *rho_yz := *beta * *rho_yz + sum_i op( y[i] ) * z[i]
 *
 * where op() is complex conjugation when bl1_is_conj( conjxy ) holds and
 * the identity otherwise.
 *
 * Parameters
 *   conjxy             selects op().
 *   n                  element count.
 *   x, y, z            input vectors with strides inc_x, inc_y, inc_z.
 *   beta               scalar applied to the incoming rho values.
 *   rho_xz, rho_yz     in/out results.
 *
 * Aligned 16-byte loads and stores are used, so beta, rho_xz, rho_yz and
 * every visited element of x and y must be 16-byte aligned.
 */
void bl1_zdotsv2( conj1_t   conjxy,
                  int       n,
                  dcomplex* x, int inc_x,
                  dcomplex* y, int inc_y,
                  dcomplex* z, int inc_z,
                  dcomplex* beta,
                  dcomplex* rho_xz,
                  dcomplex* rho_yz )
{
    dcomplex* restrict x1 = x;
    dcomplex* restrict y1 = y;
    dcomplex* restrict z1 = z;
    int                i;
    v2df_t             r1v, acc1v;    /* beta * rho_xz, and the x.z sum */
    v2df_t             r2v, acc2v;    /* beta * rho_yz, and the y.z sum */
    v2df_t             z_re, z_im;    /* broadcast real / imag of z1    */
    v2df_t             xv, xv_sw;
    v2df_t             yv, yv_sw;
    v2df_t             t0, t1;

    acc1v.v = _mm_setzero_pd();
    acc2v.v = _mm_setzero_pd();

    if ( bl1_is_conj( conjxy ) )
    {
        for ( i = 0; i < n; ++i )
        {
            z_re.v = _mm_loaddup_pd( ( double* )&(z1->real) );
            z_im.v = _mm_loaddup_pd( ( double* )&(z1->imag) );

            xv.v    = _mm_load_pd( ( double* )x1 );
            xv_sw.v = _mm_shuffle_pd( xv.v, xv.v, _MM_SHUFFLE2 (0,1) );
            t0.v    = xv_sw.v * z_re.v;
            t1.v    = xv.v    * z_im.v;
            acc1v.v = acc1v.v + _mm_addsub_pd( t0.v, t1.v );

            yv.v    = _mm_load_pd( ( double* )y1 );
            yv_sw.v = _mm_shuffle_pd( yv.v, yv.v, _MM_SHUFFLE2 (0,1) );
            t0.v    = yv_sw.v * z_re.v;
            t1.v    = yv.v    * z_im.v;
            acc2v.v = acc2v.v + _mm_addsub_pd( t0.v, t1.v );

            x1 += inc_x;
            y1 += inc_y;
            z1 += inc_z;
        }

        /* The loop accumulated ( x.im*z.re - x.re*z.im, x.re*z.re + x.im*z.im );
           swap the lanes and negate the imaginary lane to obtain
           conj( x ) * z as ( real, imag ). */
        acc1v.v = _mm_shuffle_pd( acc1v.v, acc1v.v, _MM_SHUFFLE2 (0,1) );
        acc2v.v = _mm_shuffle_pd( acc2v.v, acc2v.v, _MM_SHUFFLE2 (0,1) );

        acc1v.d[1] = -acc1v.d[1];
        acc2v.d[1] = -acc2v.d[1];
    }
    else
    {
        for ( i = 0; i < n; ++i )
        {
            z_re.v = _mm_loaddup_pd( ( double* )&(z1->real) );
            z_im.v = _mm_loaddup_pd( ( double* )&(z1->imag) );

            /* addsub directly yields x * z as ( real, imag ). */
            xv.v    = _mm_load_pd( ( double* )x1 );
            xv_sw.v = _mm_shuffle_pd( xv.v, xv.v, _MM_SHUFFLE2 (0,1) );
            t0.v    = xv.v    * z_re.v;
            t1.v    = xv_sw.v * z_im.v;
            acc1v.v = acc1v.v + _mm_addsub_pd( t0.v, t1.v );

            yv.v    = _mm_load_pd( ( double* )y1 );
            yv_sw.v = _mm_shuffle_pd( yv.v, yv.v, _MM_SHUFFLE2 (0,1) );
            t0.v    = yv.v    * z_re.v;
            t1.v    = yv_sw.v * z_im.v;
            acc2v.v = acc2v.v + _mm_addsub_pd( t0.v, t1.v );

            x1 += inc_x;
            y1 += inc_y;
            z1 += inc_z;
        }
    }

    /* r1 = beta * rho_xz, r2 = beta * rho_yz (complex products). */
    {
        v2df_t bv, bv_sw, r_re, r_im;

        bv.v    = _mm_load_pd( ( double* )beta );
        bv_sw.v = _mm_shuffle_pd( bv.v, bv.v, _MM_SHUFFLE2 (0,1) );

        r_re.v = _mm_loaddup_pd( ( double* )&(rho_xz->real) );
        r_im.v = _mm_loaddup_pd( ( double* )&(rho_xz->imag) );
        t0.v   = bv.v    * r_re.v;
        t1.v   = bv_sw.v * r_im.v;
        r1v.v  = _mm_addsub_pd( t0.v, t1.v );

        r_re.v = _mm_loaddup_pd( ( double* )&(rho_yz->real) );
        r_im.v = _mm_loaddup_pd( ( double* )&(rho_yz->imag) );
        t0.v   = bv.v    * r_re.v;
        t1.v   = bv_sw.v * r_im.v;
        r2v.v  = _mm_addsub_pd( t0.v, t1.v );
    }

    /* rho_xz = beta * rho_xz + sum */
    acc1v.v = r1v.v + acc1v.v;
    _mm_store_pd( ( double* )rho_xz, acc1v.v );

    /* rho_yz = beta * rho_yz + sum */
    acc2v.v = r2v.v + acc2v.v;
    _mm_store_pd( ( double* )rho_yz, acc2v.v );
}

References bl1_is_conj(), v2df_t::d, i, rho_xz, rho_yz, v2df_t::v, x1, y1, and z1.

Referenced by FLA_Fused_Ahx_Axpy_Ax_opz_var1(), FLA_Fused_Gerc2_Ahx_Axpy_Ax_opz_var1(), and FLA_Fused_UYx_ZVx_opz_var1().

◆ bl1_zdotsv3()

/*
 * bl1_zdotsv3
 *
 * Fused triple of double-complex dot products sharing the operand z:
 *
 *   *rho_xz := *beta * *rho_xz + sum_i op( x[i] ) * z[i]
 *   *rho_yz := *beta * *rho_yz + sum_i op( y[i] ) * z[i]
 *   *rho_wz := *beta * *rho_wz + sum_i op( w[i] ) * z[i]
 *
 * where op() is complex conjugation when bl1_is_conj( conjxyw ) holds and
 * the identity otherwise.
 *
 * Parameters
 *   conjxyw                    selects op().
 *   n                          element count.
 *   x, y, w, z                 input vectors with strides inc_x .. inc_z.
 *   beta                       scalar applied to the incoming rho values.
 *   rho_xz, rho_yz, rho_wz     in/out results.
 *
 * Aligned 16-byte loads and stores are used, so beta, the three rho
 * locations and every visited element of x, y and w must be 16-byte aligned.
 */
void bl1_zdotsv3( conj1_t   conjxyw,
                  int       n,
                  dcomplex* x, int inc_x,
                  dcomplex* y, int inc_y,
                  dcomplex* w, int inc_w,
                  dcomplex* z, int inc_z,
                  dcomplex* beta,
                  dcomplex* rho_xz,
                  dcomplex* rho_yz,
                  dcomplex* rho_wz )
{
    dcomplex* restrict x1 = x;
    dcomplex* restrict y1 = y;
    dcomplex* restrict w1 = w;
    dcomplex* restrict z1 = z;
    int                i;
    v2df_t             r1v, acc1v;    /* beta * rho_xz, and the x.z sum */
    v2df_t             r2v, acc2v;    /* beta * rho_yz, and the y.z sum */
    v2df_t             r3v, acc3v;    /* beta * rho_wz, and the w.z sum */
    v2df_t             z_re, z_im;    /* broadcast real / imag of z1    */
    v2df_t             xv, xv_sw;
    v2df_t             yv, yv_sw;
    v2df_t             wv, wv_sw;
    v2df_t             t0, t1;

    acc1v.v = _mm_setzero_pd();
    acc2v.v = _mm_setzero_pd();
    acc3v.v = _mm_setzero_pd();

    if ( bl1_is_conj( conjxyw ) )
    {
        for ( i = 0; i < n; ++i )
        {
            z_re.v = _mm_loaddup_pd( ( double* )&(z1->real) );
            z_im.v = _mm_loaddup_pd( ( double* )&(z1->imag) );

            xv.v    = _mm_load_pd( ( double* )x1 );
            xv_sw.v = _mm_shuffle_pd( xv.v, xv.v, _MM_SHUFFLE2 (0,1) );
            t0.v    = xv_sw.v * z_re.v;
            t1.v    = xv.v    * z_im.v;
            acc1v.v = acc1v.v + _mm_addsub_pd( t0.v, t1.v );

            yv.v    = _mm_load_pd( ( double* )y1 );
            yv_sw.v = _mm_shuffle_pd( yv.v, yv.v, _MM_SHUFFLE2 (0,1) );
            t0.v    = yv_sw.v * z_re.v;
            t1.v    = yv.v    * z_im.v;
            acc2v.v = acc2v.v + _mm_addsub_pd( t0.v, t1.v );

            wv.v    = _mm_load_pd( ( double* )w1 );
            wv_sw.v = _mm_shuffle_pd( wv.v, wv.v, _MM_SHUFFLE2 (0,1) );
            t0.v    = wv_sw.v * z_re.v;
            t1.v    = wv.v    * z_im.v;
            acc3v.v = acc3v.v + _mm_addsub_pd( t0.v, t1.v );

            x1 += inc_x;
            y1 += inc_y;
            w1 += inc_w;
            z1 += inc_z;
        }

        /* The loop accumulated ( v.im*z.re - v.re*z.im, v.re*z.re + v.im*z.im );
           swap the lanes and negate the imaginary lane to obtain
           conj( v ) * z as ( real, imag ). */
        acc1v.v = _mm_shuffle_pd( acc1v.v, acc1v.v, _MM_SHUFFLE2 (0,1) );
        acc2v.v = _mm_shuffle_pd( acc2v.v, acc2v.v, _MM_SHUFFLE2 (0,1) );
        acc3v.v = _mm_shuffle_pd( acc3v.v, acc3v.v, _MM_SHUFFLE2 (0,1) );

        acc1v.d[1] = -acc1v.d[1];
        acc2v.d[1] = -acc2v.d[1];
        acc3v.d[1] = -acc3v.d[1];
    }
    else
    {
        for ( i = 0; i < n; ++i )
        {
            z_re.v = _mm_loaddup_pd( ( double* )&(z1->real) );
            z_im.v = _mm_loaddup_pd( ( double* )&(z1->imag) );

            /* addsub directly yields v * z as ( real, imag ). */
            xv.v    = _mm_load_pd( ( double* )x1 );
            xv_sw.v = _mm_shuffle_pd( xv.v, xv.v, _MM_SHUFFLE2 (0,1) );
            t0.v    = xv.v    * z_re.v;
            t1.v    = xv_sw.v * z_im.v;
            acc1v.v = acc1v.v + _mm_addsub_pd( t0.v, t1.v );

            yv.v    = _mm_load_pd( ( double* )y1 );
            yv_sw.v = _mm_shuffle_pd( yv.v, yv.v, _MM_SHUFFLE2 (0,1) );
            t0.v    = yv.v    * z_re.v;
            t1.v    = yv_sw.v * z_im.v;
            acc2v.v = acc2v.v + _mm_addsub_pd( t0.v, t1.v );

            wv.v    = _mm_load_pd( ( double* )w1 );
            wv_sw.v = _mm_shuffle_pd( wv.v, wv.v, _MM_SHUFFLE2 (0,1) );
            t0.v    = wv.v    * z_re.v;
            t1.v    = wv_sw.v * z_im.v;
            acc3v.v = acc3v.v + _mm_addsub_pd( t0.v, t1.v );

            x1 += inc_x;
            y1 += inc_y;
            w1 += inc_w;
            z1 += inc_z;
        }
    }

    /* r1..r3 = beta * rho_xz / rho_yz / rho_wz (complex products). */
    {
        v2df_t bv, bv_sw, r_re, r_im;

        bv.v    = _mm_load_pd( ( double* )beta );
        bv_sw.v = _mm_shuffle_pd( bv.v, bv.v, _MM_SHUFFLE2 (0,1) );

        r_re.v = _mm_loaddup_pd( ( double* )&(rho_xz->real) );
        r_im.v = _mm_loaddup_pd( ( double* )&(rho_xz->imag) );
        t0.v   = bv.v    * r_re.v;
        t1.v   = bv_sw.v * r_im.v;
        r1v.v  = _mm_addsub_pd( t0.v, t1.v );

        r_re.v = _mm_loaddup_pd( ( double* )&(rho_yz->real) );
        r_im.v = _mm_loaddup_pd( ( double* )&(rho_yz->imag) );
        t0.v   = bv.v    * r_re.v;
        t1.v   = bv_sw.v * r_im.v;
        r2v.v  = _mm_addsub_pd( t0.v, t1.v );

        r_re.v = _mm_loaddup_pd( ( double* )&(rho_wz->real) );
        r_im.v = _mm_loaddup_pd( ( double* )&(rho_wz->imag) );
        t0.v   = bv.v    * r_re.v;
        t1.v   = bv_sw.v * r_im.v;
        r3v.v  = _mm_addsub_pd( t0.v, t1.v );
    }

    /* rho_xz = beta * rho_xz + sum */
    acc1v.v = r1v.v + acc1v.v;
    _mm_store_pd( ( double* )rho_xz, acc1v.v );

    /* rho_yz = beta * rho_yz + sum */
    acc2v.v = r2v.v + acc2v.v;
    _mm_store_pd( ( double* )rho_yz, acc2v.v );

    /* rho_wz = beta * rho_wz + sum */
    acc3v.v = r3v.v + acc3v.v;
    _mm_store_pd( ( double* )rho_wz, acc3v.v );
}

References bl1_is_conj(), v2df_t::d, i, rho_wz, rho_xz, rho_yz, v2df_t::v, w1, x1, y1, and z1.

Referenced by FLA_Fused_Uhu_Yhu_Zhu_opz_var1().

◆ bl1_zdotv2axpyv2b()

/*
 * bl1_zdotv2axpyv2b
 *
 * Fused double-precision complex kernel. In one pass over n elements it
 * forms two conjugated dot products and applies two scaled updates to w:
 *
 *     rho1 := sum_i conj( a1[i] ) * x[i]
 *     rho2 := sum_i conj( a2[i] ) * x[i]
 *     w[i] := w[i] + kappa1 * a1[i] + kappa2 * a2[i]
 *
 * Parameters:
 *     n               number of elements to process
 *     a1, a2          input vectors (only read)
 *     x               input vector (only read)
 *     kappa1, kappa2  scalars multiplying a1 and a2 in the update of w
 *     rho1, rho2      outputs; overwritten, not accumulated into
 *     w               vector updated in place
 *     inc_a1, inc_a2,
 *     inc_x, inc_w    strides; only unit stride is implemented, any other
 *                     value ends in bl1_abort()
 *
 * When n <= 0 the loop body never runs: w is left untouched and zero is
 * stored to *rho1 and *rho2.
 *
 * Every operand is a pointer: the body walks a1/a2/x/w element by element
 * and loads/stores kappa1, kappa2, rho1 and rho2 through ( double* ) casts.
 *
 * All complex values are moved with _mm_load_pd / _mm_store_pd, which are
 * the aligned 128-bit forms, so each dcomplex touched here must sit on a
 * 16-byte boundary. The packed loads also rely on dcomplex storing its
 * real member immediately followed by its imag member.
 */
void bl1_zdotv2axpyv2b( int       n,
                        dcomplex* a1, int inc_a1,
                        dcomplex* a2, int inc_a2,
                        dcomplex* x,  int inc_x,
                        dcomplex* kappa1,
                        dcomplex* kappa2,
                        dcomplex* rho1,
                        dcomplex* rho2,
                        dcomplex* w,  int inc_w )
{
    dcomplex* restrict alpha1;
    dcomplex* restrict alpha2;
    dcomplex* restrict chi1;
    dcomplex* restrict omega1;
    int                i;

    v2df_t kappa1v, kappa1rv;
    v2df_t kappa2v, kappa2rv;
    v2df_t rho1v;
    v2df_t rho2v;
    v2df_t a11v, a12v;
    v2df_t a21v, a22v;
    v2df_t x1v, x1rv;
    v2df_t w1v;
    v2df_t acbc, bdad;
    v2df_t adac, bcbd;

    /* Only the contiguous case is implemented. */
    if ( inc_a1 != 1 ||
         inc_a2 != 1 ||
         inc_x  != 1 ||
         inc_w  != 1 ) bl1_abort();

    alpha1 = a1;
    alpha2 = a2;
    chi1   = x;
    omega1 = w;

    /* Dot-product accumulators. Inside the loop they hold (imag, real),
       i.e. lane-swapped; they are swapped back once after the loop. */
    rho1v.v = _mm_setzero_pd();
    rho2v.v = _mm_setzero_pd();

    /* kappa?v = (real, imag), kappa?rv = (imag, real). */
    kappa1v.v  = _mm_load_pd( ( double* )kappa1 );
    kappa1rv.v = _mm_shuffle_pd( kappa1v.v, kappa1v.v, _MM_SHUFFLE2 (0,1) );
    kappa2v.v  = _mm_load_pd( ( double* )kappa2 );
    kappa2rv.v = _mm_shuffle_pd( kappa2v.v, kappa2v.v, _MM_SHUFFLE2 (0,1) );

    for ( i = 0; i < n; ++i )
    {
        /* Current elements of w and x as (real, imag). */
        w1v.v = _mm_load_pd( ( double* )omega1 );
        x1v.v = _mm_load_pd( ( double* )chi1 );

        /* (imag, real) copy of x. It feeds both dot products, so it is
           built once per iteration rather than once per dot product. */
        x1rv.v = _mm_shuffle_pd( x1v.v, x1v.v, _MM_SHUFFLE2 (0,1) );

        /* Broadcast the parts of a1[i]: (re, re) and (im, im). */
        a11v.v = _mm_loaddup_pd( ( double* )&(alpha1->real) );
        a12v.v = _mm_loaddup_pd( ( double* )&(alpha1->imag) );

        /* rho1 += conj( a1[i] ) * x[i]
             lane 0: a1.re * x.im - a1.im * x.re   (imaginary part)
             lane 1: a1.re * x.re + a1.im * x.im   (real part)       */
        adac.v  = a11v.v * x1rv.v;
        bcbd.v  = a12v.v * x1v.v;
        rho1v.v = rho1v.v + _mm_addsub_pd( adac.v, bcbd.v );

        /* w[i] += kappa1 * a1[i]
             lane 0: k1.re * a1.re - k1.im * a1.im
             lane 1: k1.im * a1.re + k1.re * a1.im */
        acbc.v = kappa1v.v * a11v.v;
        bdad.v = kappa1rv.v * a12v.v;
        w1v.v += _mm_addsub_pd( acbc.v, bdad.v );

        /* Broadcast the parts of a2[i]: (re, re) and (im, im). */
        a21v.v = _mm_loaddup_pd( ( double* )&(alpha2->real) );
        a22v.v = _mm_loaddup_pd( ( double* )&(alpha2->imag) );

        /* rho2 += conj( a2[i] ) * x[i], same lane layout as rho1. */
        adac.v  = a21v.v * x1rv.v;
        bcbd.v  = a22v.v * x1v.v;
        rho2v.v = rho2v.v + _mm_addsub_pd( adac.v, bcbd.v );

        /* w[i] += kappa2 * a2[i] */
        acbc.v = kappa2v.v * a21v.v;
        bdad.v = kappa2rv.v * a22v.v;
        w1v.v += _mm_addsub_pd( acbc.v, bdad.v );

        /* Write the updated element of w back. */
        _mm_store_pd( ( double* )omega1, w1v.v );

        /* Unit stride was enforced above, so step by one element. */
        alpha1 += 1;
        alpha2 += 1;
        chi1   += 1;
        omega1 += 1;
    }

    /* Undo the lane swap so the accumulators read (real, imag). */
    rho1v.v = _mm_shuffle_pd( rho1v.v, rho1v.v, _MM_SHUFFLE2 (0,1) );
    rho2v.v = _mm_shuffle_pd( rho2v.v, rho2v.v, _MM_SHUFFLE2 (0,1) );

    _mm_store_pd( ( double* )rho1, rho1v.v );
    _mm_store_pd( ( double* )rho2, rho2v.v );
}

References alpha1, alpha2, bl1_abort(), chi1, i, omega1, rho1, rho2, and v2df_t::v.

Referenced by FLA_Fused_Ahx_Ax_opz_var1().