libflame  revision_anchor
Functions
bli_dotsv2.c File Reference

(r)

Functions

void bli_sdotsv2 (conj_t conjxy, int n, float *x, int inc_x, float *y, int inc_y, float *z, int inc_z, float *beta, float *rho_xz, float *rho_yz)
void bli_ddotsv2 (conj_t conjxy, int n, double *x, int inc_x, double *y, int inc_y, double *z, int inc_z, double *beta, double *rho_xz, double *rho_yz)

Function Documentation

void bli_ddotsv2 ( conj_t  conjxy,
int  n,
double *  x,
int  inc_x,
double *  y,
int  inc_y,
double *  z,
int  inc_z,
double *  beta,
double *  rho_xz,
double *  rho_yz 
)

References bli_abort(), v2df_t::d, and v2df_t::v.

Referenced by FLA_Fused_Ahx_Axpy_Ax_opd_var1(), FLA_Fused_Gerc2_Ahx_Axpy_Ax_opd_var1(), FLA_Fused_Uhu_Yhu_Zhu_opd_var1(), FLA_Fused_UYx_ZVx_opd_var1(), and FLA_Fused_UZhu_ZUhu_opd_var1().

{
    /*
     * Fused pair of double-precision dot products sharing the vector z:
     *
     *     rho_xz := beta * rho_xz + sum_i x[i] * z[i]
     *     rho_yz := beta * rho_yz + sum_i y[i] * z[i]
     *
     * n       number of elements in x, y and z; n <= 0 touches no vector
     *         element and leaves only the beta scaling of the two results.
     * inc_x, inc_y, inc_z
     *         must all be 1; any other stride calls bli_abort().
     * beta    pointer to the scalar applied to the incoming *rho_xz / *rho_yz.
     * rho_xz, rho_yz
     *         read and then overwritten with the updated values.
     * conjxy  not referenced by this routine.
     *
     * The main loop uses aligned 16-byte loads (_mm_load_pd), so x, y and z
     * must share the same alignment relative to a 16-byte boundary. When all
     * three are off the boundary, one leading element is handled in scalar
     * code first.
     * NOTE(review): that single-element peel only reaches a 16-byte boundary
     * if the pointers are at least 8-byte aligned -- confirm callers
     * guarantee this.
     */
    double*   restrict x1;
    double*   restrict y1;
    double*   restrict z1;
    double             rho1, rho2;
    double             x1c, y1c, z1c;
    int                i;

    int                n_pre;
    int                n_run;
    int                n_left;

    int                x_unaligned;
    int                y_unaligned;
    int                z_unaligned;

    v2df_t             rho1v, rho2v;
    v2df_t             x1v, y1v, z1v;
    v2df_t             x2v, y2v, z2v;

    if ( inc_x != 1 ||
         inc_y != 1 ||
         inc_z != 1 ) bli_abort();

    /* Only inspect alignment and peel when there is at least one element;
       with n <= 0 the peel would read x[0], y[0] and z[0] out of bounds. */
    n_pre = 0;
    if ( n > 0 )
    {
        x_unaligned = ( ( unsigned long ) x % 16 != 0 );
        y_unaligned = ( ( unsigned long ) y % 16 != 0 );
        z_unaligned = ( ( unsigned long ) z % 16 != 0 );

        /* Aligned loads are issued at the same element offset for all three
           vectors, so a mismatch in either direction cannot be handled. */
        if ( x_unaligned != z_unaligned ||
             y_unaligned != z_unaligned ) bli_abort();

        if ( z_unaligned ) n_pre = 1;
    }

    /* Four elements per vector iteration; the remainder is done in scalar. */
    n_run       = ( n - n_pre ) / 4;
    n_left      = ( n - n_pre ) % 4;

    x1 = x;
    y1 = y;
    z1 = z;

    rho1 = 0.0;
    rho2 = 0.0;

    /* Scalar peel of the leading element to reach a 16-byte boundary. */
    if ( n_pre == 1 )
    {
        x1c = *x1;
        y1c = *y1;
        z1c = *z1;

        rho1 += x1c * z1c;
        rho2 += y1c * z1c;

        x1 += inc_x;
        y1 += inc_y;
        z1 += inc_z;
    }

    rho1v.v = _mm_setzero_pd();
    rho2v.v = _mm_setzero_pd();

    /* Each iteration loads two 2-wide vectors from every operand and
       accumulates the element-wise products into rho1v / rho2v. */
    for ( i = 0; i < n_run; ++i )
    {
        x1v.v = _mm_load_pd( x1 );
        y1v.v = _mm_load_pd( y1 );
        z1v.v = _mm_load_pd( z1 );

        x2v.v = _mm_load_pd( x1 + 2 );
        y2v.v = _mm_load_pd( y1 + 2 );
        z2v.v = _mm_load_pd( z1 + 2 );

        rho1v.v += x1v.v * z1v.v;
        rho2v.v += y1v.v * z1v.v;

        rho1v.v += x2v.v * z2v.v;
        rho2v.v += y2v.v * z2v.v;

        x1 += 4;
        y1 += 4;
        z1 += 4;
    }

    /* Horizontal reduction of the two vector lanes into the scalar sums. */
    rho1 += rho1v.d[0] + rho1v.d[1];
    rho2 += rho2v.d[0] + rho2v.d[1];

    /* Scalar tail: the 0-3 elements left over after the vector loop. */
    for ( i = 0; i < n_left; ++i )
    {
        x1c = *x1;
        y1c = *y1;
        z1c = *z1;

        rho1 += x1c * z1c;
        rho2 += y1c * z1c;

        x1 += inc_x;
        y1 += inc_y;
        z1 += inc_z;
    }

    *rho_xz = *beta * *rho_xz + rho1;
    *rho_yz = *beta * *rho_yz + rho2;
}
void bli_sdotsv2 ( conj_t  conjxy,
int  n,
float *  x,
int  inc_x,
float *  y,
int  inc_y,
float *  z,
int  inc_z,
float *  beta,
float *  rho_xz,
float *  rho_yz 
)

References bli_abort().

{
    /* The single-precision variant contains no computation: every call
       goes straight to bli_abort() and none of the arguments are read. */
    bli_abort();
}