libflame  revision_anchor
Functions
bli_dotaxpy.c File Reference

(r)

Functions

void bli_sdotaxpy (int n, float *a, int inc_a, float *x, int inc_x, float *kappa, float *rho, float *w, int inc_w)
void bli_ddotaxpy (int n, double *a, int inc_a, double *x, int inc_x, double *kappa, double *rho, double *w, int inc_w)

Function Documentation

void bli_ddotaxpy ( int  n,
double *  a,
int  inc_a,
double *  x,
int  inc_x,
double *  kappa,
double *  rho,
double *  w,
int  inc_w 
)

References bli_abort(), v2df_t::d, and v2df_t::v.

Referenced by FLA_Fused_Ahx_Ax_opd_var1(), and FLA_Fused_UZhu_ZUhu_opd_var1().

{
    // Fused dot-product / axpy kernel over n contiguous doubles:
    //
    //   *rho  = sum_i  a[i] * x[i]
    //   w[i] += (*kappa) * a[i]
    //
    // Only unit strides are supported; any other stride calls bli_abort().
    // The main loop processes four elements per iteration with 16-byte
    // aligned SSE loads/stores, so a, x and w must share the same 16-byte
    // alignment phase whenever that loop executes. If all three start on
    // an odd 8-byte boundary, one scalar element is peeled off first to
    // bring them onto a 16-byte boundary.
    double*   restrict alpha1;
    double*   restrict chi1;
    double*   restrict omega1;
    double             kappa_c;
    double             rho_c;
    int                i;

    int                n_pre;
    int                n_run;
    int                n_left;

    int                a_unaligned;
    int                x_unaligned;
    int                w_unaligned;

    v2df_t    k1v, rho1v;
    v2df_t    a1v, x1v, w1v;
    v2df_t    a2v, x2v, w2v;

    if ( inc_a != 1 ||
         inc_x != 1 ||
         inc_w != 1 ) bli_abort();

    // Quick return for an empty vector. Without this, a misaligned a
    // would still trigger the one-element peel below, touching a[0],
    // x[0] and w[0] even though no elements were requested.
    if ( n <= 0 )
    {
        *rho = 0.0;
        return;
    }

    // Only the low four address bits matter for the alignment test.
    a_unaligned = ( ( unsigned long ) a % 16 != 0 );
    x_unaligned = ( ( unsigned long ) x % 16 != 0 );
    w_unaligned = ( ( unsigned long ) w % 16 != 0 );

    n_pre = 0;
    if ( a_unaligned )
    {
        // Peeling one element realigns a; x and w must need the same
        // treatment or they would end up misaligned afterwards.
        if ( !x_unaligned || !w_unaligned ) bli_abort();

        n_pre = 1;
    }

    n_run       = ( n - n_pre ) / 4;
    n_left      = ( n - n_pre ) % 4;

    // Symmetric case: a is aligned but x or w is not. The aligned
    // loads/stores in the vector loop would fault, so refuse up front.
    // Short vectors that never enter the vector loop are handled
    // entirely by scalar code and remain permitted.
    if ( n_pre == 0 && n_run > 0 &&
         ( x_unaligned || w_unaligned ) ) bli_abort();

    alpha1   = a;
    chi1     = x;
    omega1   = w;

    kappa_c = *kappa;
    rho_c   = 0.0;

    // Scalar prologue: consume one element to reach 16-byte alignment.
    if ( n_pre == 1 )
    {
        double   alpha1_c   = *alpha1;
        double   chi1_c     = *chi1;
        double   omega1_c   = *omega1;

        rho_c += alpha1_c * chi1_c;
        omega1_c += kappa_c * alpha1_c;

        *omega1 = omega1_c;

        alpha1   += inc_a;
        chi1     += inc_x;
        omega1   += inc_w;
    }

    // Two-lane accumulator for the dot product; reduced at the end.
    rho1v.v = _mm_setzero_pd();

    // Broadcast kappa into both lanes.
    k1v.v = _mm_loaddup_pd( ( double* )kappa );

    // Vector body: two 2-wide operations per iteration (4 elements).
    for ( i = 0; i < n_run; ++i )
    {
        a1v.v = _mm_load_pd( ( double* )alpha1 );
        x1v.v = _mm_load_pd( ( double* )chi1 );
        w1v.v = _mm_load_pd( ( double* )omega1 );

        a2v.v = _mm_load_pd( ( double* )(alpha1 + 2) );
        x2v.v = _mm_load_pd( ( double* )(chi1 + 2) );
        w2v.v = _mm_load_pd( ( double* )(omega1 + 2) );

        rho1v.v += a1v.v * x1v.v;
        w1v.v += k1v.v * a1v.v;

        _mm_store_pd( ( double* )omega1, w1v.v );

        rho1v.v += a2v.v * x2v.v;
        w2v.v += k1v.v * a2v.v;

        _mm_store_pd( ( double* )(omega1 + 2), w2v.v );

        alpha1   += 4;
        chi1     += 4;
        omega1   += 4;
    }

    // Scalar epilogue: the 0..3 elements left after the vector body.
    for ( i = 0; i < n_left; ++i )
    {
        double   alpha1_c   = *alpha1;
        double   chi1_c     = *chi1;
        double   omega1_c   = *omega1;

        rho_c += alpha1_c * chi1_c;
        omega1_c += kappa_c * alpha1_c;

        *omega1 = omega1_c;

        alpha1   += inc_a;
        chi1     += inc_x;
        omega1   += inc_w;
    }

    // Fold the two vector lanes into the scalar partial sum.
    rho_c += rho1v.d[0] + rho1v.d[1];

    *rho = rho_c;
}
void bli_sdotaxpy ( int  n,
float *  a,
int  inc_a,
float *  x,
int  inc_x,
float *  kappa,
float *  rho,
float *  w,
int  inc_w 
)

References bli_abort().

{
    // The single-precision variant has no implementation here: every
    // call unconditionally invokes bli_abort() and none of the arguments
    // are read.
    bli_abort();
}