libflame  revision_anchor
Functions
bli_axpyv3b.c File Reference

(r)

Functions

void bli_saxpyv3b (int n, float *alpha1, float *alpha2, float *alpha3, float *x1, int inc_x1, float *x2, int inc_x2, float *x3, int inc_x3, float *y, int inc_y)
void bli_daxpyv3b (int n, double *alpha1, double *alpha2, double *alpha3, double *x1, int inc_x1, double *x2, int inc_x2, double *x3, int inc_x3, double *y, int inc_y)

Function Documentation

void bli_daxpyv3b ( int  n,
double *  alpha1,
double *  alpha2,
double *  alpha3,
double *  x1,
int  inc_x1,
double *  x2,
int  inc_x2,
double *  x3,
int  inc_x3,
double *  y,
int  inc_y 
)

References bli_abort(), and v2df_t::v.

{
    double*   restrict chi1;
    double*   restrict chi2;
    double*   restrict chi3;
    double*   restrict psi1;
    int       i;

    int       n_pre;
    int       n_run;
    int       n_left;

    v2df_t    a1v, a2v, a3v;
    v2df_t    x11v, x12v, x13v;
    v2df_t    x21v, x22v, x23v;
    v2df_t    y1v;
    v2df_t    y2v;

    if ( inc_x1 != 1 ||
         inc_x2 != 1 ||
         inc_x3 != 1 ||
         inc_y  != 1 ) bli_abort();

    n_pre = 0;
    if ( ( unsigned long ) y % 16 != 0 )
    {
        if ( ( unsigned long ) x1 % 16 == 0 ||
             ( unsigned long ) x2 % 16 == 0 ||
             ( unsigned long ) x3 % 16 == 0 ) bli_abort();

        n_pre = 1;
    }

    n_run       = ( n - n_pre ) / 4;
    n_left      = ( n - n_pre ) % 4;

    chi1 = x1;
    chi2 = x2;
    chi3 = x3;
    psi1 = y;

    if ( n_pre == 1 )
    {
        double   alpha1_c = *alpha1;
        double   alpha2_c = *alpha2;
        double   alpha3_c = *alpha3;
        double   chi11_c = *chi1;
        double   chi12_c = *chi2;
        double   chi13_c = *chi3;

        *psi1 += alpha1_c * chi11_c + alpha2_c * chi12_c + alpha3_c * chi13_c;

        chi1 += inc_x1;
        chi2 += inc_x2;
        chi3 += inc_x3;
        psi1 += inc_y;
    }

    a1v.v = _mm_loaddup_pd( ( double* )alpha1 );
    a2v.v = _mm_loaddup_pd( ( double* )alpha2 );
    a3v.v = _mm_loaddup_pd( ( double* )alpha3 );

    for ( i = 0; i < n_run; ++i )
    {
        x11v.v = _mm_load_pd( ( double* )chi1 );
        x12v.v = _mm_load_pd( ( double* )chi2 );
        x13v.v = _mm_load_pd( ( double* )chi3 );
        y1v.v  = _mm_load_pd( ( double* )psi1 );

        y1v.v += a1v.v * x11v.v + a2v.v * x12v.v + a3v.v * x13v.v;

        _mm_store_pd( ( double* )psi1, y1v.v );

        x21v.v = _mm_load_pd( ( double* )(chi1 + 2) );
        x22v.v = _mm_load_pd( ( double* )(chi2 + 2) );
        x23v.v = _mm_load_pd( ( double* )(chi3 + 2) );
        y2v.v  = _mm_load_pd( ( double* )(psi1 + 2) );

        y2v.v += a1v.v * x21v.v + a2v.v * x22v.v + a3v.v * x23v.v;

        _mm_store_pd( ( double* )(psi1 + 2), y2v.v );

        chi1 += 4;
        chi2 += 4;
        chi3 += 4;
        psi1 += 4;
    }

    if ( n_left > 0 )
    {
        double   alpha1_c = *alpha1;
        double   alpha2_c = *alpha2;
        double   alpha3_c = *alpha3;

        for ( i = 0; i < n_left; ++i )
        {
            double   chi11_c = *chi1;
            double   chi12_c = *chi2;
            double   chi13_c = *chi3;

            *psi1 += alpha1_c * chi11_c + alpha2_c * chi12_c + alpha3_c * chi13_c;

            chi1 += inc_x1;
            chi2 += inc_x2;
            chi3 += inc_x3;
            psi1 += inc_y;
        }
    }
}
void bli_saxpyv3b ( int  n,
float *  alpha1,
float *  alpha2,
float *  alpha3,
float *  x1,
int  inc_x1,
float *  x2,
int  inc_x2,
float *  x3,
int  inc_x3,
float *  y,
int  inc_y 
)

References bli_abort().

{
    bli_abort();
}