libflame  revision_anchor
Functions
bli_dotv2axpyv2b.c File Reference

(r)

Functions

void bli_sdotv2axpyv2b (int n, float *a1, int inc_a1, float *a2, int inc_a2, float *x, int inc_x, float *kappa1, float *kappa2, float *rho1, float *rho2, float *w, int inc_w)
void bli_ddotv2axpyv2b (int n, double *a1, int inc_a1, double *a2, int inc_a2, double *x, int inc_x, double *kappa1, double *kappa2, double *rho1, double *rho2, double *w, int inc_w)

Function Documentation

void bli_ddotv2axpyv2b ( int  n,
double *  a1,
int  inc_a1,
double *  a2,
int  inc_a2,
double *  x,
int  inc_x,
double *  kappa1,
double *  kappa2,
double *  rho1,
double *  rho2,
double *  w,
int  inc_w 
)

References bli_abort(), v2df_t::d, and v2df_t::v.

Referenced by FLA_Fused_Ahx_Ax_opd_var1() and FLA_Fused_UZhu_ZUhu_opd_var1().

{
    /*
     * Fused double-precision kernel over n contiguous elements. In a single
     * pass over the data it computes
     *
     *     *rho1  = sum_i a1[i] * x[i]
     *     *rho2  = sum_i a2[i] * x[i]
     *     w[i]  += (*kappa1) * a1[i] + (*kappa2) * a2[i]
     *
     * Only unit strides are supported; any other increment calls bli_abort().
     * The main loop consumes four elements per iteration as two 2-wide SSE
     * vectors fetched with aligned loads. To get there, at most one leading
     * element is handled in scalar code (the "pre" step) and up to three
     * trailing elements are finished in scalar code (the "left" step).
     */
    double*   restrict alpha1;
    double*   restrict alpha2;
    double*   restrict chi1;
    double*   restrict omega1;
    double             rho1_c;
    double             rho2_c;
    int                i;

    int                n_pre;
    int                n_run;
    int                n_left;
    
    v2df_t    k1v, rho1v;
    v2df_t    k2v, rho2v;
    v2df_t    a11v, a12v, x1v, w1v;
    v2df_t    a21v, a22v, x2v, w2v;
    
    /* Non-unit strides are not implemented by this kernel. */
    if ( inc_a1 != 1 ||
         inc_a2 != 1 ||
         inc_x  != 1 ||
         inc_w  != 1 ) bli_abort();

    /* Empty input: both dot products are zero and w is left untouched.
       Returning here also prevents the scalar "pre" step below from
       reading and writing element 0 of empty vectors when a1 happens to
       be misaligned. */
    if ( n <= 0 )
    {
        *rho1 = 0.0;
        *rho2 = 0.0;
        return;
    }

    /* If a1 does not start on a 16-byte boundary, one scalar iteration is
       peeled off so the vector loop starts aligned. That only works when
       a2, x and w are misaligned as well, hence the abort if any of them
       is already aligned. */
    /* NOTE(review): when a1 IS 16-byte aligned, a2, x and w are not
       checked at all, yet the vector loop uses aligned loads/stores on
       them. Presumably callers guarantee matching alignment - confirm. */
    n_pre = 0;
    if ( ( unsigned long ) a1 % 16 != 0 )
    {
        if ( ( unsigned long ) a2 % 16 == 0 ||
             ( unsigned long ) x % 16 == 0 ||
             ( unsigned long ) w % 16 == 0 ) bli_abort();

        n_pre = 1;
    }

    /* Split the remaining elements into groups of four plus a remainder. */
    n_run       = ( n - n_pre ) / 4;
    n_left      = ( n - n_pre ) % 4;

    alpha1   = a1;
    alpha2   = a2;
    chi1     = x;
    omega1   = w;

    /* Scalar accumulators for the elements handled outside the vector loop. */
    rho1_c = 0.0;
    rho2_c = 0.0;

    /* Scalar "pre" step: process the single leading element. */
    if ( n_pre == 1 )
    {
        double   kappa1_c = *kappa1;
        double   kappa2_c = *kappa2;
        double   alpha1_c   = *alpha1;
        double   alpha2_c   = *alpha2;
        double   chi1_c     = *chi1;
        double   omega1_c   = *omega1;

        rho1_c   += alpha1_c * chi1_c;
        omega1_c += kappa1_c * alpha1_c;

        rho2_c   += alpha2_c * chi1_c;
        omega1_c += kappa2_c * alpha2_c;

        *omega1 = omega1_c;

        alpha1   += inc_a1;
        alpha2   += inc_a2;
        chi1     += inc_x;
        omega1   += inc_w;
    }

    /* Vector accumulators for the two dot products. */
    rho1v.v = _mm_setzero_pd();
    rho2v.v = _mm_setzero_pd();

    /* Broadcast each scalar multiplier into both lanes. */
    k1v.v = _mm_loaddup_pd( ( double* )kappa1 );
    k2v.v = _mm_loaddup_pd( ( double* )kappa2 );

    /* Main loop: four elements per iteration, as two 2-wide halves. */
    for ( i = 0; i < n_run; ++i )
    {
        /* First half: elements 0 and 1 of this group. */
        a11v.v = _mm_load_pd( ( double* )alpha1 );
        a12v.v = _mm_load_pd( ( double* )alpha2 );
        x1v.v  = _mm_load_pd( ( double* )chi1 );
        w1v.v  = _mm_load_pd( ( double* )omega1 );

        rho1v.v += a11v.v * x1v.v;
        w1v.v += k1v.v * a11v.v;

        rho2v.v += a12v.v * x1v.v;
        w1v.v += k2v.v * a12v.v;

        _mm_store_pd( ( double* )omega1, w1v.v );

        /* Second half: elements 2 and 3 of this group. */
        a21v.v = _mm_load_pd( ( double* )(alpha1 + 2) );
        a22v.v = _mm_load_pd( ( double* )(alpha2 + 2) );
        x2v.v  = _mm_load_pd( ( double* )(chi1 + 2) );
        w2v.v  = _mm_load_pd( ( double* )(omega1 + 2) );

        rho1v.v += a21v.v * x2v.v;
        w2v.v += k1v.v * a21v.v;

        rho2v.v += a22v.v * x2v.v;
        w2v.v += k2v.v * a22v.v;

        _mm_store_pd( ( double* )(omega1 + 2), w2v.v );

        alpha1   += 4;
        alpha2   += 4;
        chi1     += 4;
        omega1   += 4;
    }

    /* Scalar "left" step: finish the elements that did not fill a group
       of four. */
    if ( n_left > 0 )
    {
        for ( i = 0; i < n_left; ++i )
        {
            double   kappa1_c = *kappa1;
            double   kappa2_c = *kappa2;
            double   alpha1_c   = *alpha1;
            double   alpha2_c   = *alpha2;
            double   chi1_c     = *chi1;
            double   omega1_c   = *omega1;

            rho1_c   += alpha1_c * chi1_c;
            omega1_c += kappa1_c * alpha1_c;

            rho2_c   += alpha2_c * chi1_c;
            omega1_c += kappa2_c * alpha2_c;

            *omega1 = omega1_c;

            alpha1   += inc_a1;
            alpha2   += inc_a2;
            chi1     += inc_x;
            omega1   += inc_w;
        }
    }

    /* Fold both lanes of each vector accumulator into the scalar totals. */
    rho1_c += rho1v.d[0] + rho1v.d[1];
    rho2_c += rho2v.d[0] + rho2v.d[1];

    *rho1 = rho1_c;
    *rho2 = rho2_c;
}
void bli_sdotv2axpyv2b ( int  n,
float *  a1,
int  inc_a1,
float *  a2,
int  inc_a2,
float *  x,
int  inc_x,
float *  kappa1,
float *  kappa2,
float *  rho1,
float *  rho2,
float *  w,
int  inc_w 
)

References bli_abort().

{
    /* The single-precision variant has no implementation here: every
       call, regardless of its arguments, goes straight to bli_abort(). */
    bli_abort();
}