libflame
revision_anchor
|
Functions | |
void | bli_saxpyv2bdotaxpy (int n, float *beta, float *u, int inc_u, float *gamma, float *z, int inc_z, float *a, int inc_a, float *x, int inc_x, float *kappa, float *rho, float *w, int inc_w) |
void | bli_daxpyv2bdotaxpy (int n, double *beta, double *u, int inc_u, double *gamma, double *z, int inc_z, double *a, int inc_a, double *x, int inc_x, double *kappa, double *rho, double *w, int inc_w) |
void bli_daxpyv2bdotaxpy ( int n, double *beta, double *u, int inc_u, double *gamma, double *z, int inc_z, double *a, int inc_a, double *x, int inc_x, double *kappa, double *rho, double *w, int inc_w )
References bli_abort(), v2df_t::d, and v2df_t::v.
Referenced by FLA_Fused_Gerc2_Ahx_Ax_opd_var1() and FLA_Fused_Her2_Ax_l_opd_var1().
{
	/*
	 * Fused double-precision kernel. For every element i in [0, n) it
	 * performs, in this order:
	 *
	 *   a[i] += (*beta) * u[i] + (*gamma) * z[i];
	 *   rho  += a[i] * x[i];        (uses the updated a[i])
	 *   w[i] += (*kappa) * a[i];    (uses the updated a[i])
	 *
	 * and finally overwrites *rho with the accumulated sum (the previous
	 * value of *rho is not read).
	 *
	 * Parameters:
	 *   n                   number of elements; n <= 0 stores 0.0 in *rho
	 *                       and touches nothing else
	 *   beta, gamma, kappa  pointers to the three scalars (read only)
	 *   u, z, x             input vectors (read only)
	 *   a, w                vectors updated in place
	 *   inc_*               element stride of the corresponding vector
	 *   rho                 output scalar
	 *
	 * Execution strategy:
	 *   - All strides equal to 1: an optional one-element scalar peel
	 *     (taken when a is not 16-byte aligned), then a two-way unrolled
	 *     SSE loop handling 4 doubles per iteration, then a scalar tail
	 *     of up to 3 elements.
	 *   - Any stride different from 1: plain scalar loop over all n
	 *     elements. The SSE loop advances every pointer by 4 contiguous
	 *     doubles, so it is only valid for unit stride.
	 *
	 * bli_abort() is called when the vectors cannot all be brought to a
	 * 16-byte boundary together, because the SSE loop uses aligned
	 * loads and stores on every vector.
	 */
	double* restrict upsilon1;
	double* restrict zeta1;
	double* restrict alpha1;
	double* restrict chi1;
	double* restrict omega1;
	double           rho_c;
	int              i;

	v2df_t           b1v, g1v, k1v;
	v2df_t           rhov;
	v2df_t           u1v, z1v, a1v;
	v2df_t           u2v, z2v, a2v;
	v2df_t           x1v, w1v;
	v2df_t           x2v, w2v;

	int              n_pre;
	int              n_run;
	int              n_left;
	int              unit_stride;

	/* Empty (or negative) length: there is no element to peel, so the
	   alignment logic below must not run. The result is an empty sum. */
	if ( n <= 0 )
	{
		*rho = 0.0;
		return;
	}

	unit_stride = ( inc_u == 1 &&
	                inc_z == 1 &&
	                inc_a == 1 &&
	                inc_x == 1 &&
	                inc_w == 1 );

	/* Default partition: everything goes through the scalar tail loop. */
	n_pre  = 0;
	n_run  = 0;
	n_left = n;

	if ( unit_stride )
	{
		if ( ( unsigned long ) a % 16 != 0 )
		{
			/* a needs one scalar step to reach a 16-byte boundary. That
			   step moves every pointer by one double, so a vector that
			   is already aligned would become misaligned. */
			if ( ( unsigned long ) u % 16 == 0 ||
			     ( unsigned long ) z % 16 == 0 ||
			     ( unsigned long ) x % 16 == 0 ||
			     ( unsigned long ) w % 16 == 0 ) bli_abort();

			n_pre = 1;
		}

		n_run  = ( n - n_pre ) / 4;
		n_left = ( n - n_pre ) % 4;

		/* a is aligned and the SSE loop will execute: the remaining
		   vectors must be aligned as well, otherwise the aligned
		   loads/stores below are invalid. */
		if ( n_pre == 0 && n_run > 0 &&
		     ( ( unsigned long ) u % 16 != 0 ||
		       ( unsigned long ) z % 16 != 0 ||
		       ( unsigned long ) x % 16 != 0 ||
		       ( unsigned long ) w % 16 != 0 ) ) bli_abort();
	}

	upsilon1 = u;
	zeta1    = z;
	alpha1   = a;
	chi1     = x;
	omega1   = w;

	rho_c = 0.0;

	/* Scalar peel: one element, to reach 16-byte alignment. */
	if ( n_pre == 1 )
	{
		double beta_c     = *beta;
		double gamma_c    = *gamma;
		double kappa_c    = *kappa;

		double upsilon1_c = *upsilon1;
		double zeta1_c    = *zeta1;
		double alpha1_c   = *alpha1;
		double chi1_c     = *chi1;
		double omega1_c   = *omega1;

		alpha1_c += beta_c * upsilon1_c + gamma_c * zeta1_c;
		rho_c    += alpha1_c * chi1_c;
		omega1_c += kappa_c * alpha1_c;

		*alpha1 = alpha1_c;
		*omega1 = omega1_c;

		upsilon1 += inc_u;
		zeta1    += inc_z;
		alpha1   += inc_a;
		chi1     += inc_x;
		omega1   += inc_w;
	}

	/* Broadcast each scalar into both lanes of a vector register. */
	b1v.v = _mm_loaddup_pd( ( double* )beta );
	g1v.v = _mm_loaddup_pd( ( double* )gamma );
	k1v.v = _mm_loaddup_pd( ( double* )kappa );

	rhov.v = _mm_setzero_pd();

	/* Main SSE loop: 4 doubles (two 2-wide vectors) per iteration.
	   n_run is 0 whenever the strides are not all 1. */
	for ( i = 0; i < n_run; ++i )
	{
		u1v.v = _mm_load_pd( ( double* )upsilon1 );
		z1v.v = _mm_load_pd( ( double* )zeta1 );
		a1v.v = _mm_load_pd( ( double* )alpha1 );

		a1v.v += b1v.v * u1v.v + g1v.v * z1v.v;

		u2v.v = _mm_load_pd( ( double* )(upsilon1 + 2) );
		z2v.v = _mm_load_pd( ( double* )(zeta1 + 2) );
		a2v.v = _mm_load_pd( ( double* )(alpha1 + 2) );

		a2v.v += b1v.v * u2v.v + g1v.v * z2v.v;

		x1v.v = _mm_load_pd( ( double* )chi1 );
		x2v.v = _mm_load_pd( ( double* )(chi1 + 2) );

		w1v.v = _mm_load_pd( ( double* )omega1 );
		w2v.v = _mm_load_pd( ( double* )(omega1 + 2) );

		rhov.v += a1v.v * x1v.v;
		rhov.v += a2v.v * x2v.v;

		w1v.v += k1v.v * a1v.v;
		w2v.v += k1v.v * a2v.v;

		_mm_store_pd( ( double* )alpha1,       a1v.v );
		_mm_store_pd( ( double* )(alpha1 + 2), a2v.v );

		_mm_store_pd( ( double* )omega1,       w1v.v );
		_mm_store_pd( ( double* )(omega1 + 2), w2v.v );

		upsilon1 += 4;
		zeta1    += 4;
		alpha1   += 4;
		chi1     += 4;
		omega1   += 4;
	}

	/* Fold the two vector lanes into the scalar accumulator. */
	rho_c += rhov.d[0] + rhov.d[1];

	/* Scalar tail: the remaining elements (all n of them when the
	   strides are not all 1). */
	if ( n_left > 0 )
	{
		double beta_c  = *beta;
		double gamma_c = *gamma;
		double kappa_c = *kappa;

		for ( i = 0; i < n_left; ++i )
		{
			double upsilon1_c = *upsilon1;
			double zeta1_c    = *zeta1;
			double alpha1_c   = *alpha1;
			double chi1_c     = *chi1;
			double omega1_c   = *omega1;

			alpha1_c += beta_c * upsilon1_c + gamma_c * zeta1_c;
			rho_c    += alpha1_c * chi1_c;
			omega1_c += kappa_c * alpha1_c;

			*alpha1 = alpha1_c;
			*omega1 = omega1_c;

			upsilon1 += inc_u;
			zeta1    += inc_z;
			alpha1   += inc_a;
			chi1     += inc_x;
			omega1   += inc_w;
		}
	}

	*rho = rho_c;
}
void bli_saxpyv2bdotaxpy ( int n, float *beta, float *u, int inc_u, float *gamma, float *z, int inc_z, float *a, int inc_a, float *x, int inc_x, float *kappa, float *rho, float *w, int inc_w )
References bli_abort().
{
	/* The single-precision variant performs no computation: its entire
	   body is this one call to bli_abort(). None of the arguments are
	   read and *rho is not written. */
	bli_abort();
}