libflame
revision_anchor
|
Go to the source code of this file.
Functions | |
FLA_Error | FLA_Hess_UT_blk_var1 (FLA_Obj A, FLA_Obj T) |
FLA_Error | FLA_Hess_UT_unb_var1 (FLA_Obj A, FLA_Obj T) |
FLA_Error | FLA_Hess_UT_step_unb_var1 (FLA_Obj A, FLA_Obj T) |
FLA_Error | FLA_Hess_UT_blk_var2 (FLA_Obj A, FLA_Obj T) |
FLA_Error | FLA_Hess_UT_blf_var2 (FLA_Obj A, FLA_Obj T) |
FLA_Error | FLA_Hess_UT_unb_var2 (FLA_Obj A, FLA_Obj T) |
FLA_Error | FLA_Hess_UT_step_unb_var2 (FLA_Obj A, FLA_Obj T) |
FLA_Error | FLA_Hess_UT_blk_var3 (FLA_Obj A, FLA_Obj T) |
FLA_Error | FLA_Hess_UT_blf_var3 (FLA_Obj A, FLA_Obj T) |
FLA_Error | FLA_Hess_UT_unb_var3 (FLA_Obj A, FLA_Obj T) |
FLA_Error | FLA_Hess_UT_step_unb_var3 (FLA_Obj A, FLA_Obj T) |
FLA_Error | FLA_Hess_UT_blk_var4 (FLA_Obj A, FLA_Obj T) |
FLA_Error | FLA_Hess_UT_blf_var4 (FLA_Obj A, FLA_Obj T) |
FLA_Error | FLA_Hess_UT_unb_var4 (FLA_Obj A, FLA_Obj T) |
FLA_Error | FLA_Hess_UT_step_unb_var4 (FLA_Obj A, FLA_Obj Y, FLA_Obj Z, FLA_Obj T) |
FLA_Error | FLA_Hess_UT_blk_var5 (FLA_Obj A, FLA_Obj T) |
FLA_Error | FLA_Hess_UT_unb_var5 (FLA_Obj A, FLA_Obj T) |
FLA_Error | FLA_Hess_UT_step_unb_var5 (FLA_Obj A, FLA_Obj U, FLA_Obj Z, FLA_Obj T) |
FLA_Error | FLA_Hess_UT_opt_var1 (FLA_Obj A, FLA_Obj T) |
FLA_Error | FLA_Hess_UT_step_opt_var1 (FLA_Obj A, FLA_Obj T) |
FLA_Error | FLA_Hess_UT_step_ops_var1 (int m_A, int m_T, float *buff_A, int rs_A, int cs_A, float *buff_T, int rs_T, int cs_T) |
FLA_Error | FLA_Hess_UT_step_opd_var1 (int m_A, int m_T, double *buff_A, int rs_A, int cs_A, double *buff_T, int rs_T, int cs_T) |
FLA_Error | FLA_Hess_UT_step_opc_var1 (int m_A, int m_T, scomplex *buff_A, int rs_A, int cs_A, scomplex *buff_T, int rs_T, int cs_T) |
FLA_Error | FLA_Hess_UT_step_opz_var1 (int m_A, int m_T, dcomplex *buff_A, int rs_A, int cs_A, dcomplex *buff_T, int rs_T, int cs_T) |
FLA_Error | FLA_Hess_UT_opt_var2 (FLA_Obj A, FLA_Obj T) |
FLA_Error | FLA_Hess_UT_step_opt_var2 (FLA_Obj A, FLA_Obj T) |
FLA_Error | FLA_Hess_UT_step_ops_var2 (int m_A, int m_T, float *buff_A, int rs_A, int cs_A, float *buff_T, int rs_T, int cs_T) |
FLA_Error | FLA_Hess_UT_step_opd_var2 (int m_A, int m_T, double *buff_A, int rs_A, int cs_A, double *buff_T, int rs_T, int cs_T) |
FLA_Error | FLA_Hess_UT_step_opc_var2 (int m_A, int m_T, scomplex *buff_A, int rs_A, int cs_A, scomplex *buff_T, int rs_T, int cs_T) |
FLA_Error | FLA_Hess_UT_step_opz_var2 (int m_A, int m_T, dcomplex *buff_A, int rs_A, int cs_A, dcomplex *buff_T, int rs_T, int cs_T) |
FLA_Error | FLA_Hess_UT_opt_var3 (FLA_Obj A, FLA_Obj T) |
FLA_Error | FLA_Hess_UT_step_opt_var3 (FLA_Obj A, FLA_Obj T) |
FLA_Error | FLA_Hess_UT_step_ops_var3 (int m_A, int m_T, float *buff_A, int rs_A, int cs_A, float *buff_T, int rs_T, int cs_T) |
FLA_Error | FLA_Hess_UT_step_opd_var3 (int m_A, int m_T, double *buff_A, int rs_A, int cs_A, double *buff_T, int rs_T, int cs_T) |
FLA_Error | FLA_Hess_UT_step_opc_var3 (int m_A, int m_T, scomplex *buff_A, int rs_A, int cs_A, scomplex *buff_T, int rs_T, int cs_T) |
FLA_Error | FLA_Hess_UT_step_opz_var3 (int m_A, int m_T, dcomplex *buff_A, int rs_A, int cs_A, dcomplex *buff_T, int rs_T, int cs_T) |
FLA_Error | FLA_Hess_UT_opt_var4 (FLA_Obj A, FLA_Obj T) |
FLA_Error | FLA_Hess_UT_step_opt_var4 (FLA_Obj A, FLA_Obj Y, FLA_Obj Z, FLA_Obj T) |
FLA_Error | FLA_Hess_UT_step_ops_var4 (int m_A, int m_T, float *buff_A, int rs_A, int cs_A, float *buff_Y, int rs_Y, int cs_Y, float *buff_Z, int rs_Z, int cs_Z, float *buff_T, int rs_T, int cs_T) |
FLA_Error | FLA_Hess_UT_step_opd_var4 (int m_A, int m_T, double *buff_A, int rs_A, int cs_A, double *buff_Y, int rs_Y, int cs_Y, double *buff_Z, int rs_Z, int cs_Z, double *buff_T, int rs_T, int cs_T) |
FLA_Error | FLA_Hess_UT_step_opc_var4 (int m_A, int m_T, scomplex *buff_A, int rs_A, int cs_A, scomplex *buff_Y, int rs_Y, int cs_Y, scomplex *buff_Z, int rs_Z, int cs_Z, scomplex *buff_T, int rs_T, int cs_T) |
FLA_Error | FLA_Hess_UT_step_opz_var4 (int m_A, int m_T, dcomplex *buff_A, int rs_A, int cs_A, dcomplex *buff_Y, int rs_Y, int cs_Y, dcomplex *buff_Z, int rs_Z, int cs_Z, dcomplex *buff_T, int rs_T, int cs_T) |
FLA_Error | FLA_Hess_UT_opt_var5 (FLA_Obj A, FLA_Obj T) |
FLA_Error | FLA_Hess_UT_step_opt_var5 (FLA_Obj A, FLA_Obj U, FLA_Obj Z, FLA_Obj T) |
FLA_Error | FLA_Hess_UT_step_ops_var5 (int m_A, int m_T, float *buff_A, int rs_A, int cs_A, float *buff_U, int rs_U, int cs_U, float *buff_Z, int rs_Z, int cs_Z, float *buff_T, int rs_T, int cs_T) |
FLA_Error | FLA_Hess_UT_step_opd_var5 (int m_A, int m_T, double *buff_A, int rs_A, int cs_A, double *buff_U, int rs_U, int cs_U, double *buff_Z, int rs_Z, int cs_Z, double *buff_T, int rs_T, int cs_T) |
FLA_Error | FLA_Hess_UT_step_opc_var5 (int m_A, int m_T, scomplex *buff_A, int rs_A, int cs_A, scomplex *buff_U, int rs_U, int cs_U, scomplex *buff_Z, int rs_Z, int cs_Z, scomplex *buff_T, int rs_T, int cs_T) |
FLA_Error | FLA_Hess_UT_step_opz_var5 (int m_A, int m_T, dcomplex *buff_A, int rs_A, int cs_A, dcomplex *buff_U, int rs_U, int cs_U, dcomplex *buff_Z, int rs_Z, int cs_Z, dcomplex *buff_T, int rs_T, int cs_T) |
FLA_Error | FLA_Hess_UT_ofu_var1 (FLA_Obj A, FLA_Obj T) |
FLA_Error | FLA_Hess_UT_step_ofu_var1 (FLA_Obj A, FLA_Obj T) |
FLA_Error | FLA_Hess_UT_step_ofs_var1 (int m_A, int m_T, float *buff_A, int rs_A, int cs_A, float *buff_T, int rs_T, int cs_T) |
FLA_Error | FLA_Hess_UT_step_ofd_var1 (int m_A, int m_T, double *buff_A, int rs_A, int cs_A, double *buff_T, int rs_T, int cs_T) |
FLA_Error | FLA_Hess_UT_step_ofc_var1 (int m_A, int m_T, scomplex *buff_A, int rs_A, int cs_A, scomplex *buff_T, int rs_T, int cs_T) |
FLA_Error | FLA_Hess_UT_step_ofz_var1 (int m_A, int m_T, dcomplex *buff_A, int rs_A, int cs_A, dcomplex *buff_T, int rs_T, int cs_T) |
FLA_Error | FLA_Hess_UT_ofu_var2 (FLA_Obj A, FLA_Obj T) |
FLA_Error | FLA_Hess_UT_step_ofu_var2 (FLA_Obj A, FLA_Obj T) |
FLA_Error | FLA_Hess_UT_step_ofs_var2 (int m_A, int m_T, float *buff_A, int rs_A, int cs_A, float *buff_T, int rs_T, int cs_T) |
FLA_Error | FLA_Hess_UT_step_ofd_var2 (int m_A, int m_T, double *buff_A, int rs_A, int cs_A, double *buff_T, int rs_T, int cs_T) |
FLA_Error | FLA_Hess_UT_step_ofc_var2 (int m_A, int m_T, scomplex *buff_A, int rs_A, int cs_A, scomplex *buff_T, int rs_T, int cs_T) |
FLA_Error | FLA_Hess_UT_step_ofz_var2 (int m_A, int m_T, dcomplex *buff_A, int rs_A, int cs_A, dcomplex *buff_T, int rs_T, int cs_T) |
FLA_Error | FLA_Hess_UT_ofu_var3 (FLA_Obj A, FLA_Obj T) |
FLA_Error | FLA_Hess_UT_step_ofu_var3 (FLA_Obj A, FLA_Obj T) |
FLA_Error | FLA_Hess_UT_step_ofs_var3 (int m_A, int m_T, float *buff_A, int rs_A, int cs_A, float *buff_T, int rs_T, int cs_T) |
FLA_Error | FLA_Hess_UT_step_ofd_var3 (int m_A, int m_T, double *buff_A, int rs_A, int cs_A, double *buff_T, int rs_T, int cs_T) |
FLA_Error | FLA_Hess_UT_step_ofc_var3 (int m_A, int m_T, scomplex *buff_A, int rs_A, int cs_A, scomplex *buff_T, int rs_T, int cs_T) |
FLA_Error | FLA_Hess_UT_step_ofz_var3 (int m_A, int m_T, dcomplex *buff_A, int rs_A, int cs_A, dcomplex *buff_T, int rs_T, int cs_T) |
FLA_Error | FLA_Hess_UT_ofu_var4 (FLA_Obj A, FLA_Obj T) |
FLA_Error | FLA_Hess_UT_step_ofu_var4 (FLA_Obj A, FLA_Obj Y, FLA_Obj Z, FLA_Obj T) |
FLA_Error | FLA_Hess_UT_step_ofs_var4 (int m_A, int m_T, float *buff_A, int rs_A, int cs_A, float *buff_Y, int rs_Y, int cs_Y, float *buff_Z, int rs_Z, int cs_Z, float *buff_T, int rs_T, int cs_T) |
FLA_Error | FLA_Hess_UT_step_ofd_var4 (int m_A, int m_T, double *buff_A, int rs_A, int cs_A, double *buff_Y, int rs_Y, int cs_Y, double *buff_Z, int rs_Z, int cs_Z, double *buff_T, int rs_T, int cs_T) |
FLA_Error | FLA_Hess_UT_step_ofc_var4 (int m_A, int m_T, scomplex *buff_A, int rs_A, int cs_A, scomplex *buff_Y, int rs_Y, int cs_Y, scomplex *buff_Z, int rs_Z, int cs_Z, scomplex *buff_T, int rs_T, int cs_T) |
FLA_Error | FLA_Hess_UT_step_ofz_var4 (int m_A, int m_T, dcomplex *buff_A, int rs_A, int cs_A, dcomplex *buff_Y, int rs_Y, int cs_Y, dcomplex *buff_Z, int rs_Z, int cs_Z, dcomplex *buff_T, int rs_T, int cs_T) |
FLA_Error | FLA_Fused_Ahx_Ax_ops_var1 (int m_A, int n_A, float *buff_A, int rs_A, int cs_A, float *buff_x, int inc_x, float *buff_v, int inc_v, float *buff_w, int inc_w) |
FLA_Error | FLA_Fused_Ahx_Ax_opd_var1 (int m_A, int n_A, double *buff_A, int rs_A, int cs_A, double *buff_x, int inc_x, double *buff_v, int inc_v, double *buff_w, int inc_w) |
FLA_Error | FLA_Fused_Ahx_Ax_opc_var1 (int m_A, int n_A, scomplex *buff_A, int rs_A, int cs_A, scomplex *buff_x, int inc_x, scomplex *buff_v, int inc_v, scomplex *buff_w, int inc_w) |
FLA_Error | FLA_Fused_Ahx_Ax_opz_var1 (int m_A, int n_A, dcomplex *buff_A, int rs_A, int cs_A, dcomplex *buff_x, int inc_x, dcomplex *buff_v, int inc_v, dcomplex *buff_w, int inc_w) |
FLA_Error | FLA_Fused_Gerc2_Ahx_Ax_ops_var1 (int m_A, int n_A, float *buff_alpha, float *buff_u, int inc_u, float *buff_y, int inc_y, float *buff_z, int inc_z, float *buff_A, int rs_A, int cs_A, float *buff_x, int inc_x, float *buff_v, int inc_v, float *buff_w, int inc_w) |
FLA_Error | FLA_Fused_Gerc2_Ahx_Ax_opd_var1 (int m_A, int n_A, double *buff_alpha, double *buff_u, int inc_u, double *buff_y, int inc_y, double *buff_z, int inc_z, double *buff_A, int rs_A, int cs_A, double *buff_x, int inc_x, double *buff_v, int inc_v, double *buff_w, int inc_w) |
FLA_Error | FLA_Fused_Gerc2_Ahx_Ax_opc_var1 (int m_A, int n_A, scomplex *buff_alpha, scomplex *buff_u, int inc_u, scomplex *buff_y, int inc_y, scomplex *buff_z, int inc_z, scomplex *buff_A, int rs_A, int cs_A, scomplex *buff_x, int inc_x, scomplex *buff_v, int inc_v, scomplex *buff_w, int inc_w) |
FLA_Error | FLA_Fused_Gerc2_Ahx_Ax_opz_var1 (int m_A, int n_A, dcomplex *buff_alpha, dcomplex *buff_u, int inc_u, dcomplex *buff_y, int inc_y, dcomplex *buff_z, int inc_z, dcomplex *buff_A, int rs_A, int cs_A, dcomplex *buff_x, int inc_x, dcomplex *buff_v, int inc_v, dcomplex *buff_w, int inc_w) |
FLA_Error | FLA_Fused_Uhu_Yhu_Zhu_ops_var1 (int m_U, int n_U, float *buff_delta, float *buff_U, int rs_U, int cs_U, float *buff_Y, int rs_Y, int cs_Y, float *buff_Z, int rs_Z, int cs_Z, float *buff_t, int inc_t, float *buff_u, int inc_u, float *buff_y, int inc_y, float *buff_z, int inc_z) |
FLA_Error | FLA_Fused_Uhu_Yhu_Zhu_opd_var1 (int m_U, int n_U, double *buff_delta, double *buff_U, int rs_U, int cs_U, double *buff_Y, int rs_Y, int cs_Y, double *buff_Z, int rs_Z, int cs_Z, double *buff_t, int inc_t, double *buff_u, int inc_u, double *buff_y, int inc_y, double *buff_z, int inc_z) |
FLA_Error | FLA_Fused_Uhu_Yhu_Zhu_opc_var1 (int m_U, int n_U, scomplex *buff_delta, scomplex *buff_U, int rs_U, int cs_U, scomplex *buff_Y, int rs_Y, int cs_Y, scomplex *buff_Z, int rs_Z, int cs_Z, scomplex *buff_t, int inc_t, scomplex *buff_u, int inc_u, scomplex *buff_y, int inc_y, scomplex *buff_z, int inc_z) |
FLA_Error | FLA_Fused_Uhu_Yhu_Zhu_opz_var1 (int m_U, int n_U, dcomplex *buff_delta, dcomplex *buff_U, int rs_U, int cs_U, dcomplex *buff_Y, int rs_Y, int cs_Y, dcomplex *buff_Z, int rs_Z, int cs_Z, dcomplex *buff_t, int inc_t, dcomplex *buff_u, int inc_u, dcomplex *buff_y, int inc_y, dcomplex *buff_z, int inc_z) |
FLA_Error FLA_Fused_Ahx_Ax_opc_var1 | ( | int | m_A, |
int | n_A, | ||
scomplex * | buff_A, | ||
int | rs_A, | ||
int | cs_A, | ||
scomplex * | buff_x, | ||
int | inc_x, | ||
scomplex * | buff_v, | ||
int | inc_v, | ||
scomplex * | buff_w, | ||
int | inc_w | ||
) |
References bli_cdot(), bli_csetv(), BLIS_CONJUGATE, F77_caxpy(), and FLA_ZERO.
Referenced by FLA_Fused_Ahx_Ax_opt_var1(), FLA_Hess_UT_step_ofc_var2(), FLA_Hess_UT_step_ofc_var3(), and FLA_Hess_UT_step_ofc_var4().
{ scomplex* buff_0 = FLA_COMPLEX_PTR( FLA_ZERO ); int i; bli_csetv( m_A, buff_0, buff_w, inc_w ); for ( i = 0; i < n_A; ++i ) { scomplex* a1 = buff_A + (i )*cs_A + (0 )*rs_A; scomplex* nu1 = buff_v + (i )*inc_v; scomplex* x = buff_x; scomplex* chi1 = buff_x + (i )*inc_x; scomplex* w = buff_w; /*------------------------------------------------------------*/ bli_cdot( BLIS_CONJUGATE, m_A, a1, rs_A, x, inc_x, nu1 ); // bli_caxpyv( BLIS_NO_CONJUGATE, // m_A, // chi1, // a1, rs_A, // w, inc_w ); F77_caxpy( &m_A, chi1, a1, &rs_A, w, &inc_w ); /*------------------------------------------------------------*/ } return FLA_SUCCESS; }
FLA_Error FLA_Fused_Ahx_Ax_opd_var1 | ( | int | m_A, |
int | n_A, | ||
double * | buff_A, | ||
int | rs_A, | ||
int | cs_A, | ||
double * | buff_x, | ||
int | inc_x, | ||
double * | buff_v, | ||
int | inc_v, | ||
double * | buff_w, | ||
int | inc_w | ||
) |
References bli_d0(), bli_ddotaxpy(), bli_ddotv2axpyv2b(), and bli_dsetv().
Referenced by FLA_Fused_Ahx_Ax_opt_var1(), FLA_Hess_UT_step_ofd_var2(), FLA_Hess_UT_step_ofd_var3(), and FLA_Hess_UT_step_ofd_var4().
{ double zero = bli_d0(); int i; double* restrict w = buff_w; double* restrict x = buff_x; double* restrict a1; double* restrict a2; double* restrict nu1; double* restrict nu2; double* restrict chi1; double* restrict chi2; int n_run = n_A / 2; int n_left = n_A % 2; int step_a1 = 2*cs_A; int step_nu1 = 2*inc_v; int step_chi1 = 2*inc_x; bli_dsetv( m_A, &zero, buff_w, inc_w ); a1 = buff_A; a2 = buff_A + cs_A; nu1 = buff_v; nu2 = buff_v + inc_v; chi1 = buff_x; chi2 = buff_x + inc_x; for ( i = 0; i < n_run; ++i ) { /*------------------------------------------------------------*/ bli_ddotv2axpyv2b( m_A, a1, rs_A, a2, rs_A, x, inc_x, chi1, chi2, nu1, nu2, w, inc_w ); /*------------------------------------------------------------*/ a1 += step_a1; a2 += step_a1; nu1 += step_nu1; nu2 += step_nu1; chi1 += step_chi1; chi2 += step_chi1; } if ( n_left > 0 ) { for ( i = 0; i < n_left; ++i ) { bli_ddotaxpy( m_A, a1, rs_A, x, inc_x, chi1, nu1, w, inc_w ); a1 += rs_A; nu1 += inc_v; chi1 += inc_x; } } return FLA_SUCCESS; }
FLA_Error FLA_Fused_Ahx_Ax_ops_var1 | ( | int | m_A, |
int | n_A, | ||
float * | buff_A, | ||
int | rs_A, | ||
int | cs_A, | ||
float * | buff_x, | ||
int | inc_x, | ||
float * | buff_v, | ||
int | inc_v, | ||
float * | buff_w, | ||
int | inc_w | ||
) |
References bli_ssetv(), F77_saxpy(), F77_sdot(), and FLA_ZERO.
Referenced by FLA_Fused_Ahx_Ax_opt_var1(), FLA_Hess_UT_step_ofs_var2(), FLA_Hess_UT_step_ofs_var3(), and FLA_Hess_UT_step_ofs_var4().
{ float* buff_0 = FLA_FLOAT_PTR( FLA_ZERO ); int i; bli_ssetv( m_A, buff_0, buff_w, inc_w ); for ( i = 0; i < n_A; ++i ) { float* a1 = buff_A + (i )*cs_A + (0 )*rs_A; float* nu1 = buff_v + (i )*inc_v; float* x = buff_x; float* chi1 = buff_x + (i )*inc_x; float* w = buff_w; /*------------------------------------------------------------*/ // bli_sdot( BLIS_CONJUGATE, // m_A, // a1, rs_A, // x, inc_x, // nu1 ); *nu1 = F77_sdot( &m_A, a1, &rs_A, x, &inc_x ); // bli_saxpyv( BLIS_NO_CONJUGATE, // m_A, // chi1, // a1, rs_A, // w, inc_w ); F77_saxpy( &m_A, chi1, a1, &rs_A, w, &inc_w ); /*------------------------------------------------------------*/ } return FLA_SUCCESS; }
FLA_Error FLA_Fused_Ahx_Ax_opz_var1 | ( | int | m_A, |
int | n_A, | ||
dcomplex * | buff_A, | ||
int | rs_A, | ||
int | cs_A, | ||
dcomplex * | buff_x, | ||
int | inc_x, | ||
dcomplex * | buff_v, | ||
int | inc_v, | ||
dcomplex * | buff_w, | ||
int | inc_w | ||
) |
References bli_z0(), bli_zdotaxpy(), bli_zdotv2axpyv2b(), and bli_zsetv().
Referenced by FLA_Fused_Ahx_Ax_opt_var1(), FLA_Hess_UT_step_ofz_var2(), FLA_Hess_UT_step_ofz_var3(), and FLA_Hess_UT_step_ofz_var4().
{ dcomplex zero = bli_z0(); int i; dcomplex* restrict w = buff_w; dcomplex* restrict x = buff_x; dcomplex* restrict a1; dcomplex* restrict a2; dcomplex* restrict nu1; dcomplex* restrict nu2; dcomplex* restrict chi1; dcomplex* restrict chi2; int n_run = n_A / 2; int n_left = n_A % 2; int step_a1 = 2*cs_A; int step_nu1 = 2*inc_v; int step_chi1 = 2*inc_x; bli_zsetv( m_A, &zero, buff_w, inc_w ); a1 = buff_A; a2 = buff_A + cs_A; nu1 = buff_v; nu2 = buff_v + inc_v; chi1 = buff_x; chi2 = buff_x + inc_x; for ( i = 0; i < n_run; ++i ) { /*------------------------------------------------------------*/ /* bli_zdotaxpy( m_A, a1, rs_A, x, inc_x, chi1, nu1, w, inc_w ); */ bli_zdotv2axpyv2b( m_A, a1, rs_A, a2, rs_A, x, inc_x, chi1, chi2, nu1, nu2, w, inc_w ); /*------------------------------------------------------------*/ a1 += step_a1; a2 += step_a1; nu1 += step_nu1; nu2 += step_nu1; chi1 += step_chi1; chi2 += step_chi1; } if ( n_left > 0 ) { for ( i = 0; i < n_left; ++i ) { bli_zdotaxpy( m_A, a1, rs_A, x, inc_x, chi1, nu1, w, inc_w ); a1 += rs_A; nu1 += inc_v; chi1 += inc_x; } } return FLA_SUCCESS; }
FLA_Error FLA_Fused_Gerc2_Ahx_Ax_opc_var1 | ( | int | m_A, |
int | n_A, | ||
scomplex * | buff_alpha, | ||
scomplex * | buff_u, | ||
int | inc_u, | ||
scomplex * | buff_y, | ||
int | inc_y, | ||
scomplex * | buff_z, | ||
int | inc_z, | ||
scomplex * | buff_A, | ||
int | rs_A, | ||
int | cs_A, | ||
scomplex * | buff_x, | ||
int | inc_x, | ||
scomplex * | buff_v, | ||
int | inc_v, | ||
scomplex * | buff_w, | ||
int | inc_w | ||
) |
References bli_cdot(), bli_csetv(), BLIS_CONJUGATE, F77_caxpy(), and FLA_ZERO.
Referenced by FLA_Fused_Gerc2_Ahx_Ax_opt_var1(), and FLA_Hess_UT_step_ofc_var3().
{ scomplex* buff_0 = FLA_COMPLEX_PTR( FLA_ZERO ); int i; bli_csetv( m_A, buff_0, buff_w, inc_w ); for ( i = 0; i < n_A; ++i ) { scomplex* a1 = buff_A + (i )*cs_A + (0 )*rs_A; scomplex* nu1 = buff_v + (i )*inc_v; scomplex* x = buff_x; scomplex* chi1 = buff_x + (i )*inc_x; scomplex* psi1 = buff_y + (i )*inc_y; scomplex* u = buff_u; scomplex* upsilon1 = buff_u + (i )*inc_u; scomplex* w = buff_w; scomplex* z = buff_z; scomplex* alpha = buff_alpha; scomplex temp1; scomplex temp2; scomplex conj_psi1; scomplex conj_upsilon1; /*------------------------------------------------------------*/ bli_ccopyconj( psi1, &conj_psi1 ); bli_cmult3( alpha, &conj_psi1, &temp1 ); bli_ccopyconj( upsilon1, &conj_upsilon1 ); bli_cmult3( alpha, &conj_upsilon1, &temp2 ); // bli_caxpyv( BLIS_NO_CONJUGATE, // m_A, // &temp1, // u, inc_u, // a1, rs_A ); F77_caxpy( &m_A, &temp1, u, &inc_u, a1, &rs_A ); // bli_caxpyv( BLIS_NO_CONJUGATE, // m_A, // &temp2, // z, inc_z, // a1, rs_A ); F77_caxpy( &m_A, &temp2, z, &inc_z, a1, &rs_A ); bli_cdot( BLIS_CONJUGATE, m_A, a1, rs_A, x, inc_x, nu1 ); // bli_caxpyv( BLIS_NO_CONJUGATE, // m_A, // chi1, // a1, rs_A, // w, inc_w ); F77_caxpy( &m_A, chi1, a1, &rs_A, w, &inc_w ); /*------------------------------------------------------------*/ } return FLA_SUCCESS; }
FLA_Error FLA_Fused_Gerc2_Ahx_Ax_opd_var1 | ( | int | m_A, |
int | n_A, | ||
double * | buff_alpha, | ||
double * | buff_u, | ||
int | inc_u, | ||
double * | buff_y, | ||
int | inc_y, | ||
double * | buff_z, | ||
int | inc_z, | ||
double * | buff_A, | ||
int | rs_A, | ||
int | cs_A, | ||
double * | buff_x, | ||
int | inc_x, | ||
double * | buff_v, | ||
int | inc_v, | ||
double * | buff_w, | ||
int | inc_w | ||
) |
References bli_d0(), bli_daxpyv2bdotaxpy(), and bli_dsetv().
Referenced by FLA_Fused_Gerc2_Ahx_Ax_opt_var1(), and FLA_Hess_UT_step_ofd_var3().
{ double zero = bli_d0(); double* restrict alpha = buff_alpha; double* restrict u = buff_u; double* restrict z = buff_z; double* restrict x = buff_x; double* restrict w = buff_w; double* restrict a1; double* restrict nu1; double* restrict chi1; double* restrict psi1; double* restrict upsilon1; double alpha_psi1; double alpha_upsilon1; int n_run = n_A / 1; //int n_left = n_A % 1; int step_a1 = 1*cs_A; int step_nu1 = 1*inc_v; int step_chi1 = 1*inc_x; int step_psi1 = 1*inc_y; int step_upsilon1 = 1*inc_u; int i; bli_dsetv( m_A, &zero, buff_w, inc_w ); a1 = buff_A; nu1 = buff_v; chi1 = buff_x; psi1 = buff_y; upsilon1 = buff_u; for ( i = 0; i < n_run; ++i ) { /*------------------------------------------------------------*/ bli_dmult3( alpha, psi1, &alpha_psi1 ); bli_dmult3( alpha, upsilon1, &alpha_upsilon1 ); bli_daxpyv2bdotaxpy( m_A, &alpha_psi1, u, inc_u, &alpha_upsilon1, z, inc_z, a1, rs_A, x, inc_x, chi1, nu1, w, inc_w ); /*------------------------------------------------------------*/ a1 += step_a1; nu1 += step_nu1; chi1 += step_chi1; psi1 += step_psi1; upsilon1 += step_upsilon1; } return FLA_SUCCESS; }
FLA_Error FLA_Fused_Gerc2_Ahx_Ax_ops_var1 | ( | int | m_A, |
int | n_A, | ||
float * | buff_alpha, | ||
float * | buff_u, | ||
int | inc_u, | ||
float * | buff_y, | ||
int | inc_y, | ||
float * | buff_z, | ||
int | inc_z, | ||
float * | buff_A, | ||
int | rs_A, | ||
int | cs_A, | ||
float * | buff_x, | ||
int | inc_x, | ||
float * | buff_v, | ||
int | inc_v, | ||
float * | buff_w, | ||
int | inc_w | ||
) |
References bli_ssetv(), F77_saxpy(), F77_sdot(), and FLA_ZERO.
Referenced by FLA_Fused_Gerc2_Ahx_Ax_opt_var1(), and FLA_Hess_UT_step_ofs_var3().
{ float* buff_0 = FLA_FLOAT_PTR( FLA_ZERO ); int i; bli_ssetv( m_A, buff_0, buff_w, inc_w ); for ( i = 0; i < n_A; ++i ) { float* a1 = buff_A + (i )*cs_A + (0 )*rs_A; float* nu1 = buff_v + (i )*inc_v; float* x = buff_x; float* chi1 = buff_x + (i )*inc_x; float* psi1 = buff_y + (i )*inc_y; float* u = buff_u; float* upsilon1 = buff_u + (i )*inc_u; float* w = buff_w; float* z = buff_z; float* alpha = buff_alpha; float temp1; float temp2; /*------------------------------------------------------------*/ // bli_scopyconj( psi1, &conj_psi1 ); // bli_smult3( alpha, &conj_psi1, &temp1 ); temp1 = *alpha * *psi1; // bli_scopyconj( upsilon1, &conj_upsilon1 ); // bli_smult3( alpha, &conj_upsilon1, &temp2 ); temp2 = *alpha * *upsilon1; // bli_saxpyv( BLIS_NO_CONJUGATE, // m_A, // &temp1, // u, inc_u, // a1, rs_A ); F77_saxpy( &m_A, &temp1, u, &inc_u, a1, &rs_A ); // bli_saxpyv( BLIS_NO_CONJUGATE, // m_A, // &temp2, // z, inc_z, // a1, rs_A ); F77_saxpy( &m_A, &temp2, z, &inc_z, a1, &rs_A ); // bli_sdot( BLIS_CONJUGATE, // m_A, // a1, rs_A, // x, inc_x, // nu1 ); *nu1 = F77_sdot( &m_A, a1, &rs_A, x, &inc_x ); // bli_saxpyv( BLIS_NO_CONJUGATE, // m_A, // chi1, // a1, rs_A, // w, inc_w ); F77_saxpy( &m_A, chi1, a1, &rs_A, w, &inc_w ); /*------------------------------------------------------------*/ } return FLA_SUCCESS; }
FLA_Error FLA_Fused_Gerc2_Ahx_Ax_opz_var1 | ( | int | m_A, |
int | n_A, | ||
dcomplex * | buff_alpha, | ||
dcomplex * | buff_u, | ||
int | inc_u, | ||
dcomplex * | buff_y, | ||
int | inc_y, | ||
dcomplex * | buff_z, | ||
int | inc_z, | ||
dcomplex * | buff_A, | ||
int | rs_A, | ||
int | cs_A, | ||
dcomplex * | buff_x, | ||
int | inc_x, | ||
dcomplex * | buff_v, | ||
int | inc_v, | ||
dcomplex * | buff_w, | ||
int | inc_w | ||
) |
References bli_z0(), bli_zaxpyv2b(), bli_zdotaxpy(), and bli_zsetv().
Referenced by FLA_Fused_Gerc2_Ahx_Ax_opt_var1(), and FLA_Hess_UT_step_ofz_var3().
{ dcomplex zero = bli_z0(); dcomplex* restrict alpha = buff_alpha; dcomplex* restrict u = buff_u; dcomplex* restrict z = buff_z; dcomplex* restrict x = buff_x; dcomplex* restrict w = buff_w; dcomplex* restrict a1; dcomplex* restrict nu1; dcomplex* restrict chi1; dcomplex* restrict psi1; dcomplex* restrict upsilon1; dcomplex temp1; dcomplex temp2; dcomplex conj_psi1; dcomplex conj_upsilon1; int n_run = n_A / 1; //int n_left = n_A % 1; int step_a1 = 1*cs_A; int step_nu1 = 1*inc_v; int step_chi1 = 1*inc_x; int step_psi1 = 1*inc_y; int step_upsilon1 = 1*inc_u; int i; bli_zsetv( m_A, &zero, buff_w, inc_w ); a1 = buff_A; nu1 = buff_v; chi1 = buff_x; psi1 = buff_y; upsilon1 = buff_u; for ( i = 0; i < n_run; ++i ) { /*------------------------------------------------------------*/ bli_zcopyconj( psi1, &conj_psi1 ); bli_zmult3( alpha, &conj_psi1, &temp1 ); bli_zcopyconj( upsilon1, &conj_upsilon1 ); bli_zmult3( alpha, &conj_upsilon1, &temp2 ); /* bli_zaxpyv2bdotaxpy( m_A, &temp1, u, inc_u, &temp2, z, inc_z, a1, rs_A, x, inc_x, chi1, nu1, w, inc_w ); */ bli_zaxpyv2b( m_A, &temp1, &temp2, u, inc_u, z, inc_z, a1, rs_A ); bli_zdotaxpy( m_A, a1, rs_A, x, inc_x, chi1, nu1, w, inc_w ); /*------------------------------------------------------------*/ a1 += step_a1; nu1 += step_nu1; chi1 += step_chi1; psi1 += step_psi1; upsilon1 += step_upsilon1; } return FLA_SUCCESS; }
FLA_Error FLA_Fused_Uhu_Yhu_Zhu_opc_var1 | ( | int | m_U, |
int | n_U, | ||
scomplex * | buff_delta, | ||
scomplex * | buff_U, | ||
int | rs_U, | ||
int | cs_U, | ||
scomplex * | buff_Y, | ||
int | rs_Y, | ||
int | cs_Y, | ||
scomplex * | buff_Z, | ||
int | rs_Z, | ||
int | cs_Z, | ||
scomplex * | buff_t, | ||
int | inc_t, | ||
scomplex * | buff_u, | ||
int | inc_u, | ||
scomplex * | buff_y, | ||
int | inc_y, | ||
scomplex * | buff_z, | ||
int | inc_z | ||
) |
References bli_cdot(), BLIS_CONJUGATE, and F77_caxpy().
Referenced by FLA_Fused_Uhu_Yhu_Zhu_opt_var1(), and FLA_Hess_UT_step_ofc_var4().
{ int i; for ( i = 0; i < n_U; ++i ) { scomplex* u1 = buff_U + (i )*cs_U + (0 )*rs_U; scomplex* y1 = buff_Y + (i )*cs_Y + (0 )*rs_Y; scomplex* z1 = buff_Z + (i )*cs_Z + (0 )*rs_Z; scomplex* delta = buff_delta; scomplex* tau1 = buff_t + (i )*inc_t; scomplex* u = buff_u; scomplex* y = buff_y; scomplex* z = buff_z; scomplex alpha; scomplex beta; scomplex gamma; /*------------------------------------------------------------*/ bli_cdot( BLIS_CONJUGATE, m_U, u1, rs_U, u, inc_u, &alpha ); bli_cdot( BLIS_CONJUGATE, m_U, z1, rs_Z, u, inc_u, &beta ); bli_cdot( BLIS_CONJUGATE, m_U, y1, rs_Y, u, inc_u, &gamma ); *tau1 = alpha; bli_cscals( delta, &alpha ); bli_cscals( delta, &beta ); bli_cscals( delta, &gamma ); // bli_caxpyv( BLIS_NO_CONJUGATE, // m_U, // &alpha, // y1, rs_Y, // y, inc_y ); F77_caxpy( &m_U, &alpha, y1, &rs_Y, y, &inc_y ); // bli_caxpyv( BLIS_NO_CONJUGATE, // m_U, // &beta, // u1, rs_U, // y, inc_y ); F77_caxpy( &m_U, &beta, u1, &rs_U, y, &inc_y ); // bli_caxpyv( BLIS_NO_CONJUGATE, // m_U, // &alpha, // z1, rs_Z, // z, inc_z ); F77_caxpy( &m_U, &alpha, z1, &rs_Z, z, &inc_z ); // bli_caxpyv( BLIS_NO_CONJUGATE, // m_U, // &gamma, // u1, rs_U, // z, inc_z ); F77_caxpy( &m_U, &gamma, u1, &rs_U, z, &inc_z ); /*------------------------------------------------------------*/ } return FLA_SUCCESS; }
FLA_Error FLA_Fused_Uhu_Yhu_Zhu_opd_var1 | ( | int | m_U, |
int | n_U, | ||
double * | buff_delta, | ||
double * | buff_U, | ||
int | rs_U, | ||
int | cs_U, | ||
double * | buff_Y, | ||
int | rs_Y, | ||
int | cs_Y, | ||
double * | buff_Z, | ||
int | rs_Z, | ||
int | cs_Z, | ||
double * | buff_t, | ||
int | inc_t, | ||
double * | buff_u, | ||
int | inc_u, | ||
double * | buff_y, | ||
int | inc_y, | ||
double * | buff_z, | ||
int | inc_z | ||
) |
References bli_d0(), bli_daxpyv(), bli_ddotaxmyv2(), bli_ddotsv2(), BLIS_CONJUGATE, and BLIS_NO_CONJUGATE.
Referenced by FLA_Fused_Uhu_Yhu_Zhu_opt_var1(), and FLA_Hess_UT_step_ofd_var4().
{ double zero = bli_d0(); double* restrict delta = buff_delta; double* restrict u = buff_u; double* restrict y = buff_y; double* restrict z = buff_z; double* restrict u1; double* restrict y1; double* restrict z1; double* restrict upsilon1; double* restrict tau1; double alpha; double beta; double gamma; int i; int n_run = n_U / 1; //int n_left = n_U % 1; int step_u1 = 1*cs_U; int step_y1 = 1*cs_Y; int step_z1 = 1*cs_Z; int step_upsilon1 = 1*inc_u; int step_tau1 = 1*inc_t; u1 = buff_U; y1 = buff_Y; z1 = buff_Z; upsilon1 = buff_u; tau1 = buff_t; for ( i = 0; i < n_run; ++i ) { /*------------------------------------------------------------*/ /* bli_ddotsv3( BLIS_CONJUGATE, m_U, u1, rs_U, z1, rs_Z, y1, rs_Y, u, inc_u, &zero, &alpha, &beta, &gamma ); *tau1 = alpha; bli_dscals( delta, &alpha ); bli_dscals( delta, &beta ); bli_dscals( delta, &gamma ); bli_daxpyv2b( m_U, &alpha, &beta, y1, rs_Y, u1, rs_U, y, inc_y ); bli_daxpyv2b( m_U, &alpha, &gamma, z1, rs_Z, u1, rs_U, z, inc_z ); */ bli_ddotsv2( BLIS_CONJUGATE, m_U, y1, rs_Y, z1, rs_Z, u, inc_u, &zero, &beta, &gamma ); bli_ddotaxmyv2( m_U, &gamma, &beta, u1, rs_U, u, inc_u, &alpha, y, inc_y, z, inc_z ); *tau1 = alpha; bli_dscals( delta, &alpha ); bli_daxpyv( BLIS_NO_CONJUGATE, m_U, &alpha, y1, rs_Y, y, inc_y ); bli_daxpyv( BLIS_NO_CONJUGATE, m_U, &alpha, z1, rs_Z, z, inc_z ); /*------------------------------------------------------------*/ u1 += step_u1; y1 += step_y1; z1 += step_z1; upsilon1 += step_upsilon1; tau1 += step_tau1; } return FLA_SUCCESS; }
FLA_Error FLA_Fused_Uhu_Yhu_Zhu_ops_var1 | ( | int | m_U, |
int | n_U, | ||
float * | buff_delta, | ||
float * | buff_U, | ||
int | rs_U, | ||
int | cs_U, | ||
float * | buff_Y, | ||
int | rs_Y, | ||
int | cs_Y, | ||
float * | buff_Z, | ||
int | rs_Z, | ||
int | cs_Z, | ||
float * | buff_t, | ||
int | inc_t, | ||
float * | buff_u, | ||
int | inc_u, | ||
float * | buff_y, | ||
int | inc_y, | ||
float * | buff_z, | ||
int | inc_z | ||
) |
References F77_saxpy(), and F77_sdot().
Referenced by FLA_Fused_Uhu_Yhu_Zhu_opt_var1(), and FLA_Hess_UT_step_ofs_var4().
{ int i; for ( i = 0; i < n_U; ++i ) { float* u1 = buff_U + (i )*cs_U + (0 )*rs_U; float* y1 = buff_Y + (i )*cs_Y + (0 )*rs_Y; float* z1 = buff_Z + (i )*cs_Z + (0 )*rs_Z; float* delta = buff_delta; float* tau1 = buff_t + (i )*inc_t; float* u = buff_u; float* y = buff_y; float* z = buff_z; float alpha; float beta; float gamma; /*------------------------------------------------------------*/ // bli_sdot( BLIS_CONJUGATE, // m_U, // u1, rs_U, // u, inc_u, // &alpha ); alpha = F77_sdot( &m_U, u1, &rs_U, u, &inc_u ); // bli_sdot( BLIS_CONJUGATE, // m_U, // z1, rs_Z, // u, inc_u, // &beta ); beta = F77_sdot( &m_U, z1, &rs_Z, u, &inc_u ); // bli_sdot( BLIS_CONJUGATE, // m_U, // y1, rs_Y, // u, inc_u, // &gamma ); gamma = F77_sdot( &m_U, y1, &rs_Y, u, &inc_u ); *tau1 = alpha; // bli_sscals( delta, &alpha ); // bli_sscals( delta, &beta ); // bli_sscals( delta, &gamma ); alpha *= *delta; beta *= *delta; gamma *= *delta; // bli_saxpyv( BLIS_NO_CONJUGATE, // m_U, // &alpha, // y1, rs_Y, // y, inc_y ); F77_saxpy( &m_U, &alpha, y1, &rs_Y, y, &inc_y ); // bli_saxpyv( BLIS_NO_CONJUGATE, // m_U, // &beta, // u1, rs_U, // y, inc_y ); F77_saxpy( &m_U, &beta, u1, &rs_U, y, &inc_y ); // bli_saxpyv( BLIS_NO_CONJUGATE, // m_U, // &alpha, // z1, rs_Z, // z, inc_z ); F77_saxpy( &m_U, &alpha, z1, &rs_Z, z, &inc_z ); // bli_saxpyv( BLIS_NO_CONJUGATE, // m_U, // &gamma, // u1, rs_U, // z, inc_z ); F77_saxpy( &m_U, &gamma, u1, &rs_U, z, &inc_z ); /*------------------------------------------------------------*/ } return FLA_SUCCESS; }
FLA_Error FLA_Fused_Uhu_Yhu_Zhu_opz_var1 | ( | int | m_U, |
int | n_U, | ||
dcomplex * | buff_delta, | ||
dcomplex * | buff_U, | ||
int | rs_U, | ||
int | cs_U, | ||
dcomplex * | buff_Y, | ||
int | rs_Y, | ||
int | cs_Y, | ||
dcomplex * | buff_Z, | ||
int | rs_Z, | ||
int | cs_Z, | ||
dcomplex * | buff_t, | ||
int | inc_t, | ||
dcomplex * | buff_u, | ||
int | inc_u, | ||
dcomplex * | buff_y, | ||
int | inc_y, | ||
dcomplex * | buff_z, | ||
int | inc_z | ||
) |
References bli_z0(), bli_zaxpyv2b(), bli_zdotsv3(), and BLIS_CONJUGATE.
Referenced by FLA_Fused_Uhu_Yhu_Zhu_opt_var1(), and FLA_Hess_UT_step_ofz_var4().
/*
 * Body of FLA_Fused_Uhu_Yhu_Zhu_opz_var1 (dcomplex).
 *
 * Walks the n_U columns of U, Y and Z (column strides cs_U, cs_Y, cs_Z;
 * row strides rs_U, rs_Y, rs_Z) together with the entries of t (stride
 * inc_t).  For the current columns u1, y1, z1 and the current entry tau1:
 *
 *   bli_zdotsv3  : fills alpha, beta, gamma from u1, z1, y1 (in that
 *                  order) paired with the vector u, BLIS_CONJUGATE mode
 *   *tau1        = alpha, stored BEFORE any scaling
 *   bli_zscals   : alpha, beta, gamma each scaled by *delta
 *   bli_zaxpyv2b : y updated from ( alpha, y1 ) and ( beta,  u1 )
 *   bli_zaxpyv2b : z updated from ( alpha, z1 ) and ( gamma, u1 )
 *
 * NOTE(review): the fused bli_z* kernels are opaque from here; the summary
 * assumes they behave like the separate dot / axpy calls used by the
 * single-precision variant of this routine -- confirm against BLIS.
 *
 * The vectors u, y and z are always passed whole (m_U elements, strides
 * inc_u, inc_y, inc_z); only the column cursors and tau1 advance.
 *
 * Always returns FLA_SUCCESS.
 */
{
  dcomplex           zero  = bli_z0();

  // Operands that stay fixed across all columns.
  dcomplex* restrict delta = buff_delta;
  dcomplex* restrict u     = buff_u;
  dcomplex* restrict y     = buff_y;
  dcomplex* restrict z     = buff_z;

  // Cursors: current column of U, Y, Z and current entry of t.
  dcomplex* restrict u1    = buff_U;
  dcomplex* restrict y1    = buff_Y;
  dcomplex* restrict z1    = buff_Z;
  dcomplex* restrict tau1  = buff_t;

  dcomplex           alpha;
  dcomplex           beta;
  dcomplex           gamma;

  int                i;

  for ( i = 0; i < n_U; ++i )
  {
    /*------------------------------------------------------------*/

    // One fused pass over u produces all three scalars.
    bli_zdotsv3( BLIS_CONJUGATE,
                 m_U,
                 u1, rs_U,
                 z1, rs_Z,
                 y1, rs_Y,
                 u,  inc_u,
                 &zero,
                 &alpha,
                 &beta,
                 &gamma );

    // t receives the unscaled alpha.
    *tau1 = alpha;

    bli_zscals( delta, &alpha );
    bli_zscals( delta, &beta );
    bli_zscals( delta, &gamma );

    bli_zaxpyv2b( m_U,
                  &alpha,
                  &beta,
                  y1, rs_Y,
                  u1, rs_U,
                  y,  inc_y );

    bli_zaxpyv2b( m_U,
                  &alpha,
                  &gamma,
                  z1, rs_Z,
                  u1, rs_U,
                  z,  inc_z );

    /*------------------------------------------------------------*/

    // Advance to the next column / next entry of t.
    u1   += cs_U;
    y1   += cs_Y;
    z1   += cs_Z;
    tau1 += inc_t;
  }

  return FLA_SUCCESS;
}
FLA_Error FLA_Hess_UT_blf_var2 | ( | FLA_Obj | A, |
FLA_Obj | T | ||
) |
References FLA_Cont_with_1x3_to_1x2(), FLA_Cont_with_3x1_to_2x1(), FLA_Cont_with_3x3_to_2x2(), FLA_Copy_external(), FLA_Gemm_external(), FLA_Hess_UT_step_ofu_var2(), FLA_MINUS_ONE, FLA_Obj_create(), FLA_Obj_datatype(), FLA_Obj_free(), FLA_Obj_length(), FLA_ONE, FLA_Part_1x2(), FLA_Part_2x1(), FLA_Part_2x2(), FLA_Repart_1x2_to_1x3(), FLA_Repart_2x1_to_3x1(), FLA_Repart_2x2_to_3x3(), FLA_Set(), FLA_Triangularize(), FLA_Trsm_external(), and FLA_ZERO.
Referenced by FLA_Hess_UT_internal().
/*
 * Body of FLA_Hess_UT_blf_var2: blocked driver whose per-panel work is done
 * by FLA_Hess_UT_step_ofu_var2().
 *
 * The algorithmic block size b_alg is FLA_Obj_length( T ).  Two
 * m_A x b_alg work matrices U and Z (datatype of A) are created here and
 * freed before returning.  Each pass of the loop
 *   1. exposes the next b x b diagonal block of A plus the matching
 *      pieces of U, Z and T;
 *   2. runs the step routine on ABR with the leading b x b corner of T1;
 *   3. copies the left bb columns of ABR into UB and makes the unit
 *      subdiagonal and the zeros explicit;
 *   4. if ATR has rows, applies
 *        ATR = ATR - ATR * UB * inv( triu( T1 ) ) * UB'
 *      with the left bb columns of ZT as scratch space.
 *
 * Always returns FLA_SUCCESS; results of the called FLA routines are not
 * inspected.
 */
{
  // Owned workspace, created below and released before returning.
  FLA_Obj U, Z;

  // Partition views of A, U, Z and T.
  FLA_Obj ATL, ATR, ABL, ABR;
  FLA_Obj A00, A01, A02, A10, A11, A12, A20, A21, A22;
  FLA_Obj UT, UB, U0, U1, U2;
  FLA_Obj ZT, ZB, Z0, Z1, Z2;
  FLA_Obj TL, TR, T0, T1, T2;

  // Sub-views that are rebuilt on every iteration.
  FLA_Obj ABR_l, UB_l, UB_tl, UB_bl, WT_l, T1_tl;

  // Sinks for partitions that are never read.
  FLA_Obj unused0, unused1, unused2;

  FLA_Datatype datatype_A = FLA_Obj_datatype( A );
  dim_t        m_A        = FLA_Obj_length( A );
  dim_t        b_alg      = FLA_Obj_length( T );

  FLA_Obj_create( datatype_A, m_A, b_alg, 0, 0, &U );
  FLA_Obj_create( datatype_A, m_A, b_alg, 0, 0, &Z );

  FLA_Part_2x2( A, &ATL, &ATR,
                   &ABL, &ABR, 0, 0, FLA_TL );
  FLA_Part_2x1( U, &UT,
                   &UB, 0, FLA_TOP );
  FLA_Part_2x1( Z, &ZT,
                   &ZB, 0, FLA_TOP );
  FLA_Part_1x2( T, &TL, &TR, 0, FLA_LEFT );

  while ( FLA_Obj_length( ATL ) < m_A )
  {
    // ABR is only reassigned by FLA_Cont_with_3x3_to_2x2() at the bottom of
    // the loop, so a single length query serves both block sizes.
    dim_t m_ABR = FLA_Obj_length( ABR );
    dim_t b     = min( m_ABR, b_alg );
    dim_t bb    = min( m_ABR - 1, b_alg );

    FLA_Repart_2x2_to_3x3( ATL, ATR,  &A00, &A01, &A02,
                                      &A10, &A11, &A12,
                           ABL, ABR,  &A20, &A21, &A22,
                           b, b, FLA_BR );
    FLA_Repart_2x1_to_3x1( UT,  &U0,
                                &U1,
                           UB,  &U2,
                           b, FLA_BOTTOM );
    FLA_Repart_2x1_to_3x1( ZT,  &Z0,
                                &Z1,
                           ZB,  &Z2,
                           b, FLA_BOTTOM );
    FLA_Repart_1x2_to_1x3( TL, TR,  &T0, &T1, &T2,
                           b, FLA_RIGHT );

    /*------------------------------------------------------------*/

    // Leading b x b corner of T1, handed to the step routine.
    FLA_Part_2x2( T1, &T1_tl,   &unused0,
                      &unused1, &unused2, b, b, FLA_TL );

    // Left bb columns of ABR and of UB.
    FLA_Part_1x2( ABR, &ABR_l, &unused0, bb, FLA_LEFT );
    FLA_Part_1x2( UB,  &UB_l,  &unused0, bb, FLA_LEFT );

    // [ ABR, T1 ] = step( ABR, T1 ).  The unb and opt step routines of
    // variant 2 take the same two arguments.
    FLA_Hess_UT_step_ofu_var2( ABR, T1_tl );

    // Build UB from ABR, with explicit unit subdiagonal and zeros.
    FLA_Copy_external( ABR_l, UB_l );
    FLA_Part_2x1( UB_l, &UB_tl,
                        &UB_bl, 1, FLA_TOP );
    FLA_Triangularize( FLA_LOWER_TRIANGULAR, FLA_UNIT_DIAG, UB_bl );
    FLA_Set( FLA_ZERO, UB_tl );

    // ATR = ATR - ATR * UB * inv( triu( T1 ) ) * UB'
    if ( FLA_Obj_length( ATR ) > 0 )
    {
      // NOTE: ZT serves as temporary workspace.
      FLA_Part_1x2( ZT, &WT_l, &unused0, bb, FLA_LEFT );
      FLA_Part_2x2( T1, &T1_tl,   &unused0,
                        &unused1, &unused2, bb, bb, FLA_TL );

      // WT_l = ATR * UB_l * inv( triu( T1 ) )
      FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
                         FLA_ONE, ATR, UB_l, FLA_ZERO, WT_l );
      FLA_Trsm_external( FLA_RIGHT, FLA_UPPER_TRIANGULAR,
                         FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG,
                         FLA_ONE, T1_tl, WT_l );

      // ATR = ATR - WT_l * UB_l'
      FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
                         FLA_MINUS_ONE, WT_l, UB_l, FLA_ONE, ATR );
    }

    /*------------------------------------------------------------*/

    FLA_Cont_with_3x3_to_2x2( &ATL, &ATR,  A00, A01, A02,
                                           A10, A11, A12,
                              &ABL, &ABR,  A20, A21, A22,
                              FLA_TL );
    FLA_Cont_with_3x1_to_2x1( &UT,  U0,
                                    U1,
                              &UB,  U2,
                              FLA_TOP );
    FLA_Cont_with_3x1_to_2x1( &ZT,  Z0,
                                    Z1,
                              &ZB,  Z2,
                              FLA_TOP );
    FLA_Cont_with_1x3_to_1x2( &TL, &TR,  T0, T1, T2,
                              FLA_LEFT );
  }

  FLA_Obj_free( &U );
  FLA_Obj_free( &Z );

  return FLA_SUCCESS;
}
FLA_Error FLA_Hess_UT_blf_var3 | ( | FLA_Obj | A, |
FLA_Obj | T | ||
) |
References FLA_Cont_with_1x3_to_1x2(), FLA_Cont_with_3x1_to_2x1(), FLA_Cont_with_3x3_to_2x2(), FLA_Copy_external(), FLA_Gemm_external(), FLA_Hess_UT_step_ofu_var3(), FLA_MINUS_ONE, FLA_Obj_create(), FLA_Obj_datatype(), FLA_Obj_free(), FLA_Obj_length(), FLA_ONE, FLA_Part_1x2(), FLA_Part_2x1(), FLA_Part_2x2(), FLA_Repart_1x2_to_1x3(), FLA_Repart_2x1_to_3x1(), FLA_Repart_2x2_to_3x3(), FLA_Set(), FLA_Triangularize(), FLA_Trsm_external(), and FLA_ZERO.
Referenced by FLA_Hess_UT_internal().
/*
 * Body of FLA_Hess_UT_blf_var3: blocked driver whose per-panel work is done
 * by FLA_Hess_UT_step_ofu_var3().
 *
 * The algorithmic block size b_alg is FLA_Obj_length( T ).  Two
 * m_A x b_alg work matrices U and Z (datatype of A) are created here and
 * freed before returning.  Each pass of the loop
 *   1. exposes the next b x b diagonal block of A plus the matching
 *      pieces of U, Z and T;
 *   2. runs the step routine on ABR with the leading b x b corner of T1;
 *   3. copies the left bb columns of ABR into UB and makes the unit
 *      subdiagonal and the zeros explicit;
 *   4. if ATR has rows, applies
 *        ATR = ATR - ATR * UB * inv( triu( T1 ) ) * UB'
 *      with the left bb columns of ZT as scratch space.
 *
 * Always returns FLA_SUCCESS; results of the called FLA routines are not
 * inspected.
 */
{
  // Owned workspace, created below and released before returning.
  FLA_Obj U, Z;

  // Partition views of A, U, Z and T.
  FLA_Obj ATL, ATR, ABL, ABR;
  FLA_Obj A00, A01, A02, A10, A11, A12, A20, A21, A22;
  FLA_Obj UT, UB, U0, U1, U2;
  FLA_Obj ZT, ZB, Z0, Z1, Z2;
  FLA_Obj TL, TR, T0, T1, T2;

  // Sub-views that are rebuilt on every iteration.
  FLA_Obj ABR_l, UB_l, UB_tl, UB_bl, WT_l, T1_tl;

  // Sinks for partitions that are never read.
  FLA_Obj unused0, unused1, unused2;

  FLA_Datatype datatype_A = FLA_Obj_datatype( A );
  dim_t        m_A        = FLA_Obj_length( A );
  dim_t        b_alg      = FLA_Obj_length( T );

  FLA_Obj_create( datatype_A, m_A, b_alg, 0, 0, &U );
  FLA_Obj_create( datatype_A, m_A, b_alg, 0, 0, &Z );

  FLA_Part_2x2( A, &ATL, &ATR,
                   &ABL, &ABR, 0, 0, FLA_TL );
  FLA_Part_2x1( U, &UT,
                   &UB, 0, FLA_TOP );
  FLA_Part_2x1( Z, &ZT,
                   &ZB, 0, FLA_TOP );
  FLA_Part_1x2( T, &TL, &TR, 0, FLA_LEFT );

  while ( FLA_Obj_length( ATL ) < m_A )
  {
    // ABR is only reassigned by FLA_Cont_with_3x3_to_2x2() at the bottom of
    // the loop, so a single length query serves both block sizes.
    dim_t m_ABR = FLA_Obj_length( ABR );
    dim_t b     = min( m_ABR, b_alg );
    dim_t bb    = min( m_ABR - 1, b_alg );

    FLA_Repart_2x2_to_3x3( ATL, ATR,  &A00, &A01, &A02,
                                      &A10, &A11, &A12,
                           ABL, ABR,  &A20, &A21, &A22,
                           b, b, FLA_BR );
    FLA_Repart_2x1_to_3x1( UT,  &U0,
                                &U1,
                           UB,  &U2,
                           b, FLA_BOTTOM );
    FLA_Repart_2x1_to_3x1( ZT,  &Z0,
                                &Z1,
                           ZB,  &Z2,
                           b, FLA_BOTTOM );
    FLA_Repart_1x2_to_1x3( TL, TR,  &T0, &T1, &T2,
                           b, FLA_RIGHT );

    /*------------------------------------------------------------*/

    // Leading b x b corner of T1, handed to the step routine.
    FLA_Part_2x2( T1, &T1_tl,   &unused0,
                      &unused1, &unused2, b, b, FLA_TL );

    // Left bb columns of ABR and of UB.
    FLA_Part_1x2( ABR, &ABR_l, &unused0, bb, FLA_LEFT );
    FLA_Part_1x2( UB,  &UB_l,  &unused0, bb, FLA_LEFT );

    // [ ABR, T1 ] = step( ABR, T1 ).  The unb and opt step routines of
    // variant 3 take the same two arguments.
    FLA_Hess_UT_step_ofu_var3( ABR, T1_tl );

    // Build UB from ABR, with explicit unit subdiagonal and zeros.
    FLA_Copy_external( ABR_l, UB_l );
    FLA_Part_2x1( UB_l, &UB_tl,
                        &UB_bl, 1, FLA_TOP );
    FLA_Triangularize( FLA_LOWER_TRIANGULAR, FLA_UNIT_DIAG, UB_bl );
    FLA_Set( FLA_ZERO, UB_tl );

    // ATR = ATR - ATR * UB * inv( triu( T1 ) ) * UB'
    if ( FLA_Obj_length( ATR ) > 0 )
    {
      // NOTE: ZT serves as temporary workspace.
      FLA_Part_1x2( ZT, &WT_l, &unused0, bb, FLA_LEFT );
      FLA_Part_2x2( T1, &T1_tl,   &unused0,
                        &unused1, &unused2, bb, bb, FLA_TL );

      // WT_l = ATR * UB_l * inv( triu( T1 ) )
      FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
                         FLA_ONE, ATR, UB_l, FLA_ZERO, WT_l );
      FLA_Trsm_external( FLA_RIGHT, FLA_UPPER_TRIANGULAR,
                         FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG,
                         FLA_ONE, T1_tl, WT_l );

      // ATR = ATR - WT_l * UB_l'
      FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
                         FLA_MINUS_ONE, WT_l, UB_l, FLA_ONE, ATR );
    }

    /*------------------------------------------------------------*/

    FLA_Cont_with_3x3_to_2x2( &ATL, &ATR,  A00, A01, A02,
                                           A10, A11, A12,
                              &ABL, &ABR,  A20, A21, A22,
                              FLA_TL );
    FLA_Cont_with_3x1_to_2x1( &UT,  U0,
                                    U1,
                              &UB,  U2,
                              FLA_TOP );
    FLA_Cont_with_3x1_to_2x1( &ZT,  Z0,
                                    Z1,
                              &ZB,  Z2,
                              FLA_TOP );
    FLA_Cont_with_1x3_to_1x2( &TL, &TR,  T0, T1, T2,
                              FLA_LEFT );
  }

  FLA_Obj_free( &U );
  FLA_Obj_free( &Z );

  return FLA_SUCCESS;
}
FLA_Error FLA_Hess_UT_blf_var4 | ( | FLA_Obj | A, |
FLA_Obj | T | ||
) |
References FLA_Cont_with_1x3_to_1x2(), FLA_Cont_with_3x1_to_2x1(), FLA_Cont_with_3x3_to_2x2(), FLA_Copy_external(), FLA_Gemm_external(), FLA_Hess_UT_step_ofu_var4(), FLA_MINUS_ONE, FLA_Obj_create(), FLA_Obj_datatype(), FLA_Obj_free(), FLA_Obj_length(), FLA_ONE, FLA_Part_1x2(), FLA_Part_2x1(), FLA_Part_2x2(), FLA_Repart_1x2_to_1x3(), FLA_Repart_2x1_to_3x1(), FLA_Repart_2x2_to_3x3(), FLA_Set(), FLA_Triangularize(), FLA_Trsm_external(), and FLA_ZERO.
Referenced by FLA_Hess_UT_internal().
/*
 * Body of FLA_Hess_UT_blf_var4: blocked driver whose per-panel work is done
 * by FLA_Hess_UT_step_ofu_var4().
 *
 * The algorithmic block size b_alg is FLA_Obj_length( T ).  Three
 * m_A x b_alg work matrices U, Y and Z (datatype of A) are created here and
 * freed before returning.  Each pass of the loop
 *   1. exposes the next b x b diagonal block of A plus the matching
 *      pieces of U, Y, Z and T;
 *   2. runs the step routine on ABR, YB, ZB with the leading b x b corner
 *      of T1;
 *   3. copies the left bb columns of ABR into UB and makes the unit
 *      subdiagonal and the zeros explicit;
 *   4. if ATR has rows, applies
 *        ATR = ATR - ATR * UB * inv( triu( T ) ) * UB'
 *      with the left bb columns of ZT as scratch space;
 *   5. applies A22 = A22 - U2 * Y2' - Z2 * U2' using the left bb columns
 *      of the rows of UB, YB, ZB below the first b.
 *
 * Always returns FLA_SUCCESS; results of the called FLA routines are not
 * inspected.
 */
{
  // Owned workspace, created below and released before returning.
  FLA_Obj U, Y, Z;

  // Partition views of A, U, Y, Z and T.
  FLA_Obj ATL, ATR, ABL, ABR;
  FLA_Obj A00, A01, A02, A10, A11, A12, A20, A21, A22;
  FLA_Obj UT, UB, U0, U1, U2;
  FLA_Obj YT, YB, Y0, Y1, Y2;
  FLA_Obj ZT, ZB, Z0, Z1, Z2;
  FLA_Obj TL, TR, T0, T1, T2;

  // Sub-views that are rebuilt on every iteration.
  FLA_Obj ABR_l, WT_l, T1_tl;
  FLA_Obj UB_l, UB_tl, UB_bl, U2_l;
  FLA_Obj YB_l, Y2_l;
  FLA_Obj ZB_l, Z2_l;

  // Sinks for partitions that are never read.
  FLA_Obj unused0, unused1, unused2;

  FLA_Datatype datatype_A = FLA_Obj_datatype( A );
  dim_t        m_A        = FLA_Obj_length( A );
  dim_t        b_alg      = FLA_Obj_length( T );

  FLA_Obj_create( datatype_A, m_A, b_alg, 0, 0, &U );
  FLA_Obj_create( datatype_A, m_A, b_alg, 0, 0, &Y );
  FLA_Obj_create( datatype_A, m_A, b_alg, 0, 0, &Z );

  FLA_Part_2x2( A, &ATL, &ATR,
                   &ABL, &ABR, 0, 0, FLA_TL );
  FLA_Part_2x1( U, &UT,
                   &UB, 0, FLA_TOP );
  FLA_Part_2x1( Y, &YT,
                   &YB, 0, FLA_TOP );
  FLA_Part_2x1( Z, &ZT,
                   &ZB, 0, FLA_TOP );
  FLA_Part_1x2( T, &TL, &TR, 0, FLA_LEFT );

  while ( FLA_Obj_length( ATL ) < m_A )
  {
    // ABR is only reassigned by FLA_Cont_with_3x3_to_2x2() at the bottom of
    // the loop, so a single length query serves both block sizes.
    dim_t m_ABR = FLA_Obj_length( ABR );
    dim_t b     = min( m_ABR, b_alg );
    dim_t bb    = min( m_ABR - 1, b_alg );

    FLA_Repart_2x2_to_3x3( ATL, ATR,  &A00, &A01, &A02,
                                      &A10, &A11, &A12,
                           ABL, ABR,  &A20, &A21, &A22,
                           b, b, FLA_BR );
    FLA_Repart_2x1_to_3x1( UT,  &U0,
                                &U1,
                           UB,  &U2,
                           b, FLA_BOTTOM );
    FLA_Repart_2x1_to_3x1( YT,  &Y0,
                                &Y1,
                           YB,  &Y2,
                           b, FLA_BOTTOM );
    FLA_Repart_2x1_to_3x1( ZT,  &Z0,
                                &Z1,
                           ZB,  &Z2,
                           b, FLA_BOTTOM );
    FLA_Repart_1x2_to_1x3( TL, TR,  &T0, &T1, &T2,
                           b, FLA_RIGHT );

    /*------------------------------------------------------------*/

    // Leading b x b corner of T1, handed to the step routine.
    FLA_Part_2x2( T1, &T1_tl,   &unused0,
                      &unused1, &unused2, b, b, FLA_TL );

    // Left bb columns of ABR, UB, YB and ZB.
    FLA_Part_1x2( ABR, &ABR_l, &unused0, bb, FLA_LEFT );
    FLA_Part_1x2( UB,  &UB_l,  &unused0, bb, FLA_LEFT );
    FLA_Part_1x2( YB,  &YB_l,  &unused0, bb, FLA_LEFT );
    FLA_Part_1x2( ZB,  &ZB_l,  &unused0, bb, FLA_LEFT );

    // Rows of those column panels below the first b rows.
    FLA_Part_2x1( UB_l, &unused0,
                        &U2_l, b, FLA_TOP );
    FLA_Part_2x1( YB_l, &unused0,
                        &Y2_l, b, FLA_TOP );
    FLA_Part_2x1( ZB_l, &unused0,
                        &Z2_l, b, FLA_TOP );

    // [ ABR, YB, ZB, T1 ] = step( ABR, YB, ZB, T1 ).  The unb and opt step
    // routines of variant 4 take the same four arguments.
    FLA_Hess_UT_step_ofu_var4( ABR, YB, ZB, T1_tl );

    // Build UB from ABR, with explicit unit subdiagonal and zeros.
    FLA_Copy_external( ABR_l, UB_l );
    FLA_Part_2x1( UB_l, &UB_tl,
                        &UB_bl, 1, FLA_TOP );
    FLA_Triangularize( FLA_LOWER_TRIANGULAR, FLA_UNIT_DIAG, UB_bl );
    FLA_Set( FLA_ZERO, UB_tl );

    // ATR = ATR - ATR * UB * inv( triu( T ) ) * UB'
    if ( FLA_Obj_length( ATR ) > 0 )
    {
      // NOTE: ZT serves as temporary workspace.
      FLA_Part_1x2( ZT, &WT_l, &unused0, bb, FLA_LEFT );
      FLA_Part_2x2( T1, &T1_tl,   &unused0,
                        &unused1, &unused2, bb, bb, FLA_TL );

      // WT_l = ATR * UB_l * inv( triu( T ) )
      FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
                         FLA_ONE, ATR, UB_l, FLA_ZERO, WT_l );
      FLA_Trsm_external( FLA_RIGHT, FLA_UPPER_TRIANGULAR,
                         FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG,
                         FLA_ONE, T1_tl, WT_l );

      // ATR = ATR - WT_l * UB_l'
      FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
                         FLA_MINUS_ONE, WT_l, UB_l, FLA_ONE, ATR );
    }

    // A22 = A22 - U2 * Y2' - Z2 * U2'
    FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
                       FLA_MINUS_ONE, U2_l, Y2_l, FLA_ONE, A22 );
    FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
                       FLA_MINUS_ONE, Z2_l, U2_l, FLA_ONE, A22 );

    /*------------------------------------------------------------*/

    FLA_Cont_with_3x3_to_2x2( &ATL, &ATR,  A00, A01, A02,
                                           A10, A11, A12,
                              &ABL, &ABR,  A20, A21, A22,
                              FLA_TL );
    FLA_Cont_with_3x1_to_2x1( &UT,  U0,
                                    U1,
                              &UB,  U2,
                              FLA_TOP );
    FLA_Cont_with_3x1_to_2x1( &YT,  Y0,
                                    Y1,
                              &YB,  Y2,
                              FLA_TOP );
    FLA_Cont_with_3x1_to_2x1( &ZT,  Z0,
                                    Z1,
                              &ZB,  Z2,
                              FLA_TOP );
    FLA_Cont_with_1x3_to_1x2( &TL, &TR,  T0, T1, T2,
                              FLA_LEFT );
  }

  FLA_Obj_free( &U );
  FLA_Obj_free( &Y );
  FLA_Obj_free( &Z );

  return FLA_SUCCESS;
}
FLA_Error FLA_Hess_UT_blk_var1 | ( | FLA_Obj | A, |
FLA_Obj | T | ||
) |
References FLA_Cont_with_1x3_to_1x2(), FLA_Cont_with_3x1_to_2x1(), FLA_Cont_with_3x3_to_2x2(), FLA_Copy_external(), FLA_Gemm_external(), FLA_Hess_UT_step_opt_var1(), FLA_MINUS_ONE, FLA_Obj_create(), FLA_Obj_datatype(), FLA_Obj_free(), FLA_Obj_length(), FLA_ONE, FLA_Part_1x2(), FLA_Part_2x1(), FLA_Part_2x2(), FLA_Repart_1x2_to_1x3(), FLA_Repart_2x1_to_3x1(), FLA_Repart_2x2_to_3x3(), FLA_Set(), FLA_Triangularize(), FLA_Trsm_external(), and FLA_ZERO.
Referenced by FLA_Hess_UT_internal().
/*
 * Body of FLA_Hess_UT_blk_var1: blocked driver whose per-panel work is done
 * by FLA_Hess_UT_step_opt_var1().
 *
 * The algorithmic block size b_alg is FLA_Obj_length( T ).  Two
 * m_A x b_alg work matrices U and Z (datatype of A) are created here and
 * freed before returning.  Each pass of the loop
 *   1. exposes the next b x b diagonal block of A plus the matching
 *      pieces of U, Z and T;
 *   2. runs the step routine on ABR with the leading b x b corner of T1;
 *   3. copies the left bb columns of ABR into UB and makes the unit
 *      subdiagonal and the zeros explicit;
 *   4. if ATR has rows, applies
 *        ATR = ATR - ATR * UB * inv( triu( T1 ) ) * UB'
 *      with the left bb columns of ZT as scratch space.
 *
 * Always returns FLA_SUCCESS; results of the called FLA routines are not
 * inspected.
 */
{
  // Owned workspace, created below and released before returning.
  FLA_Obj U, Z;

  // Partition views of A, U, Z and T.
  FLA_Obj ATL, ATR, ABL, ABR;
  FLA_Obj A00, A01, A02, A10, A11, A12, A20, A21, A22;
  FLA_Obj UT, UB, U0, U1, U2;
  FLA_Obj ZT, ZB, Z0, Z1, Z2;
  FLA_Obj TL, TR, T0, T1, T2;

  // Sub-views that are rebuilt on every iteration.
  FLA_Obj ABR_l, UB_l, UB_tl, UB_bl, WT_l, T1_tl;

  // Sinks for partitions that are never read.
  FLA_Obj unused0, unused1, unused2;

  FLA_Datatype datatype_A = FLA_Obj_datatype( A );
  dim_t        m_A        = FLA_Obj_length( A );
  dim_t        b_alg      = FLA_Obj_length( T );

  FLA_Obj_create( datatype_A, m_A, b_alg, 0, 0, &U );
  FLA_Obj_create( datatype_A, m_A, b_alg, 0, 0, &Z );

  FLA_Part_2x2( A, &ATL, &ATR,
                   &ABL, &ABR, 0, 0, FLA_TL );
  FLA_Part_2x1( U, &UT,
                   &UB, 0, FLA_TOP );
  FLA_Part_2x1( Z, &ZT,
                   &ZB, 0, FLA_TOP );
  FLA_Part_1x2( T, &TL, &TR, 0, FLA_LEFT );

  while ( FLA_Obj_length( ATL ) < m_A )
  {
    // ABR is only reassigned by FLA_Cont_with_3x3_to_2x2() at the bottom of
    // the loop, so a single length query serves both block sizes.
    dim_t m_ABR = FLA_Obj_length( ABR );
    dim_t b     = min( m_ABR, b_alg );
    dim_t bb    = min( m_ABR - 1, b_alg );

    FLA_Repart_2x2_to_3x3( ATL, ATR,  &A00, &A01, &A02,
                                      &A10, &A11, &A12,
                           ABL, ABR,  &A20, &A21, &A22,
                           b, b, FLA_BR );
    FLA_Repart_2x1_to_3x1( UT,  &U0,
                                &U1,
                           UB,  &U2,
                           b, FLA_BOTTOM );
    FLA_Repart_2x1_to_3x1( ZT,  &Z0,
                                &Z1,
                           ZB,  &Z2,
                           b, FLA_BOTTOM );
    FLA_Repart_1x2_to_1x3( TL, TR,  &T0, &T1, &T2,
                           b, FLA_RIGHT );

    /*------------------------------------------------------------*/

    // Leading b x b corner of T1, handed to the step routine.
    FLA_Part_2x2( T1, &T1_tl,   &unused0,
                      &unused1, &unused2, b, b, FLA_TL );

    // Left bb columns of ABR and of UB.
    FLA_Part_1x2( ABR, &ABR_l, &unused0, bb, FLA_LEFT );
    FLA_Part_1x2( UB,  &UB_l,  &unused0, bb, FLA_LEFT );

    // [ ABR, T1 ] = step( ABR, T1 ).  The unb step routine of variant 1
    // takes the same two arguments.
    FLA_Hess_UT_step_opt_var1( ABR, T1_tl );

    // Build UB from ABR, with explicit unit subdiagonal and zeros.
    FLA_Copy_external( ABR_l, UB_l );
    FLA_Part_2x1( UB_l, &UB_tl,
                        &UB_bl, 1, FLA_TOP );
    FLA_Triangularize( FLA_LOWER_TRIANGULAR, FLA_UNIT_DIAG, UB_bl );
    FLA_Set( FLA_ZERO, UB_tl );

    // ATR = ATR - ATR * UB * inv( triu( T1 ) ) * UB'
    if ( FLA_Obj_length( ATR ) > 0 )
    {
      // NOTE: ZT serves as temporary workspace.
      FLA_Part_1x2( ZT, &WT_l, &unused0, bb, FLA_LEFT );
      FLA_Part_2x2( T1, &T1_tl,   &unused0,
                        &unused1, &unused2, bb, bb, FLA_TL );

      // WT_l = ATR * UB_l * inv( triu( T1 ) )
      FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
                         FLA_ONE, ATR, UB_l, FLA_ZERO, WT_l );
      FLA_Trsm_external( FLA_RIGHT, FLA_UPPER_TRIANGULAR,
                         FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG,
                         FLA_ONE, T1_tl, WT_l );

      // ATR = ATR - WT_l * UB_l'
      FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
                         FLA_MINUS_ONE, WT_l, UB_l, FLA_ONE, ATR );
    }

    /*------------------------------------------------------------*/

    FLA_Cont_with_3x3_to_2x2( &ATL, &ATR,  A00, A01, A02,
                                           A10, A11, A12,
                              &ABL, &ABR,  A20, A21, A22,
                              FLA_TL );
    FLA_Cont_with_3x1_to_2x1( &UT,  U0,
                                    U1,
                              &UB,  U2,
                              FLA_TOP );
    FLA_Cont_with_3x1_to_2x1( &ZT,  Z0,
                                    Z1,
                              &ZB,  Z2,
                              FLA_TOP );
    FLA_Cont_with_1x3_to_1x2( &TL, &TR,  T0, T1, T2,
                              FLA_LEFT );
  }

  FLA_Obj_free( &U );
  FLA_Obj_free( &Z );

  return FLA_SUCCESS;
}
FLA_Error FLA_Hess_UT_blk_var2 | ( | FLA_Obj | A, |
FLA_Obj | T | ||
) |
References FLA_Cont_with_1x3_to_1x2(), FLA_Cont_with_3x1_to_2x1(), FLA_Cont_with_3x3_to_2x2(), FLA_Copy_external(), FLA_Gemm_external(), FLA_Hess_UT_step_opt_var2(), FLA_MINUS_ONE, FLA_Obj_create(), FLA_Obj_datatype(), FLA_Obj_free(), FLA_Obj_length(), FLA_ONE, FLA_Part_1x2(), FLA_Part_2x1(), FLA_Part_2x2(), FLA_Repart_1x2_to_1x3(), FLA_Repart_2x1_to_3x1(), FLA_Repart_2x2_to_3x3(), FLA_Set(), FLA_Triangularize(), FLA_Trsm_external(), and FLA_ZERO.
Referenced by FLA_Hess_UT_internal().
/*
 * Body of FLA_Hess_UT_blk_var2: blocked driver whose per-panel work is done
 * by FLA_Hess_UT_step_opt_var2().
 *
 * The algorithmic block size b_alg is FLA_Obj_length( T ).  Two
 * m_A x b_alg work matrices U and Z (datatype of A) are created here and
 * freed before returning.  Each pass of the loop
 *   1. exposes the next b x b diagonal block of A plus the matching
 *      pieces of U, Z and T;
 *   2. runs the step routine on ABR with the leading b x b corner of T1;
 *   3. copies the left bb columns of ABR into UB and makes the unit
 *      subdiagonal and the zeros explicit;
 *   4. if ATR has rows, applies
 *        ATR = ATR - ATR * UB * inv( triu( T1 ) ) * UB'
 *      with the left bb columns of ZT as scratch space.
 *
 * Always returns FLA_SUCCESS; results of the called FLA routines are not
 * inspected.
 */
{
  // Owned workspace, created below and released before returning.
  FLA_Obj U, Z;

  // Partition views of A, U, Z and T.
  FLA_Obj ATL, ATR, ABL, ABR;
  FLA_Obj A00, A01, A02, A10, A11, A12, A20, A21, A22;
  FLA_Obj UT, UB, U0, U1, U2;
  FLA_Obj ZT, ZB, Z0, Z1, Z2;
  FLA_Obj TL, TR, T0, T1, T2;

  // Sub-views that are rebuilt on every iteration.
  FLA_Obj ABR_l, UB_l, UB_tl, UB_bl, WT_l, T1_tl;

  // Sinks for partitions that are never read.
  FLA_Obj unused0, unused1, unused2;

  FLA_Datatype datatype_A = FLA_Obj_datatype( A );
  dim_t        m_A        = FLA_Obj_length( A );
  dim_t        b_alg      = FLA_Obj_length( T );

  FLA_Obj_create( datatype_A, m_A, b_alg, 0, 0, &U );
  FLA_Obj_create( datatype_A, m_A, b_alg, 0, 0, &Z );

  FLA_Part_2x2( A, &ATL, &ATR,
                   &ABL, &ABR, 0, 0, FLA_TL );
  FLA_Part_2x1( U, &UT,
                   &UB, 0, FLA_TOP );
  FLA_Part_2x1( Z, &ZT,
                   &ZB, 0, FLA_TOP );
  FLA_Part_1x2( T, &TL, &TR, 0, FLA_LEFT );

  while ( FLA_Obj_length( ATL ) < m_A )
  {
    // ABR is only reassigned by FLA_Cont_with_3x3_to_2x2() at the bottom of
    // the loop, so a single length query serves both block sizes.
    dim_t m_ABR = FLA_Obj_length( ABR );
    dim_t b     = min( m_ABR, b_alg );
    dim_t bb    = min( m_ABR - 1, b_alg );

    FLA_Repart_2x2_to_3x3( ATL, ATR,  &A00, &A01, &A02,
                                      &A10, &A11, &A12,
                           ABL, ABR,  &A20, &A21, &A22,
                           b, b, FLA_BR );
    FLA_Repart_2x1_to_3x1( UT,  &U0,
                                &U1,
                           UB,  &U2,
                           b, FLA_BOTTOM );
    FLA_Repart_2x1_to_3x1( ZT,  &Z0,
                                &Z1,
                           ZB,  &Z2,
                           b, FLA_BOTTOM );
    FLA_Repart_1x2_to_1x3( TL, TR,  &T0, &T1, &T2,
                           b, FLA_RIGHT );

    /*------------------------------------------------------------*/

    // Leading b x b corner of T1, handed to the step routine.
    FLA_Part_2x2( T1, &T1_tl,   &unused0,
                      &unused1, &unused2, b, b, FLA_TL );

    // Left bb columns of ABR and of UB.
    FLA_Part_1x2( ABR, &ABR_l, &unused0, bb, FLA_LEFT );
    FLA_Part_1x2( UB,  &UB_l,  &unused0, bb, FLA_LEFT );

    // [ ABR, T1 ] = step( ABR, T1 ).  The unb and ofu step routines of
    // variant 2 take the same two arguments.
    FLA_Hess_UT_step_opt_var2( ABR, T1_tl );

    // Build UB from ABR, with explicit unit subdiagonal and zeros.
    FLA_Copy_external( ABR_l, UB_l );
    FLA_Part_2x1( UB_l, &UB_tl,
                        &UB_bl, 1, FLA_TOP );
    FLA_Triangularize( FLA_LOWER_TRIANGULAR, FLA_UNIT_DIAG, UB_bl );
    FLA_Set( FLA_ZERO, UB_tl );

    // ATR = ATR - ATR * UB * inv( triu( T1 ) ) * UB'
    if ( FLA_Obj_length( ATR ) > 0 )
    {
      // NOTE: ZT serves as temporary workspace.
      FLA_Part_1x2( ZT, &WT_l, &unused0, bb, FLA_LEFT );
      FLA_Part_2x2( T1, &T1_tl,   &unused0,
                        &unused1, &unused2, bb, bb, FLA_TL );

      // WT_l = ATR * UB_l * inv( triu( T1 ) )
      FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
                         FLA_ONE, ATR, UB_l, FLA_ZERO, WT_l );
      FLA_Trsm_external( FLA_RIGHT, FLA_UPPER_TRIANGULAR,
                         FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG,
                         FLA_ONE, T1_tl, WT_l );

      // ATR = ATR - WT_l * UB_l'
      FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
                         FLA_MINUS_ONE, WT_l, UB_l, FLA_ONE, ATR );
    }

    /*------------------------------------------------------------*/

    FLA_Cont_with_3x3_to_2x2( &ATL, &ATR,  A00, A01, A02,
                                           A10, A11, A12,
                              &ABL, &ABR,  A20, A21, A22,
                              FLA_TL );
    FLA_Cont_with_3x1_to_2x1( &UT,  U0,
                                    U1,
                              &UB,  U2,
                              FLA_TOP );
    FLA_Cont_with_3x1_to_2x1( &ZT,  Z0,
                                    Z1,
                              &ZB,  Z2,
                              FLA_TOP );
    FLA_Cont_with_1x3_to_1x2( &TL, &TR,  T0, T1, T2,
                              FLA_LEFT );
  }

  FLA_Obj_free( &U );
  FLA_Obj_free( &Z );

  return FLA_SUCCESS;
}
FLA_Error FLA_Hess_UT_blk_var3 | ( | FLA_Obj | A, |
FLA_Obj | T | ||
) |
References FLA_Cont_with_1x3_to_1x2(), FLA_Cont_with_3x1_to_2x1(), FLA_Cont_with_3x3_to_2x2(), FLA_Copy_external(), FLA_Gemm_external(), FLA_Hess_UT_step_opt_var3(), FLA_MINUS_ONE, FLA_Obj_create(), FLA_Obj_datatype(), FLA_Obj_free(), FLA_Obj_length(), FLA_ONE, FLA_Part_1x2(), FLA_Part_2x1(), FLA_Part_2x2(), FLA_Repart_1x2_to_1x3(), FLA_Repart_2x1_to_3x1(), FLA_Repart_2x2_to_3x3(), FLA_Set(), FLA_Triangularize(), FLA_Trsm_external(), and FLA_ZERO.
Referenced by FLA_Hess_UT_internal().
/*
 * Body of FLA_Hess_UT_blk_var3: blocked driver whose per-panel work is done
 * by FLA_Hess_UT_step_opt_var3().
 *
 * The algorithmic block size b_alg is FLA_Obj_length( T ).  Two
 * m_A x b_alg work matrices U and Z (datatype of A) are created here and
 * freed before returning.  Each pass of the loop
 *   1. exposes the next b x b diagonal block of A plus the matching
 *      pieces of U, Z and T;
 *   2. runs the step routine on ABR with the leading b x b corner of T1;
 *   3. copies the left bb columns of ABR into UB and makes the unit
 *      subdiagonal and the zeros explicit;
 *   4. if ATR has rows, applies
 *        ATR = ATR - ATR * UB * inv( triu( T1 ) ) * UB'
 *      with the left bb columns of ZT as scratch space.
 *
 * Always returns FLA_SUCCESS; results of the called FLA routines are not
 * inspected.
 */
{
  // Owned workspace, created below and released before returning.
  FLA_Obj U, Z;

  // Partition views of A, U, Z and T.
  FLA_Obj ATL, ATR, ABL, ABR;
  FLA_Obj A00, A01, A02, A10, A11, A12, A20, A21, A22;
  FLA_Obj UT, UB, U0, U1, U2;
  FLA_Obj ZT, ZB, Z0, Z1, Z2;
  FLA_Obj TL, TR, T0, T1, T2;

  // Sub-views that are rebuilt on every iteration.
  FLA_Obj ABR_l, UB_l, UB_tl, UB_bl, WT_l, T1_tl;

  // Sinks for partitions that are never read.
  FLA_Obj unused0, unused1, unused2;

  FLA_Datatype datatype_A = FLA_Obj_datatype( A );
  dim_t        m_A        = FLA_Obj_length( A );
  dim_t        b_alg      = FLA_Obj_length( T );

  FLA_Obj_create( datatype_A, m_A, b_alg, 0, 0, &U );
  FLA_Obj_create( datatype_A, m_A, b_alg, 0, 0, &Z );

  FLA_Part_2x2( A, &ATL, &ATR,
                   &ABL, &ABR, 0, 0, FLA_TL );
  FLA_Part_2x1( U, &UT,
                   &UB, 0, FLA_TOP );
  FLA_Part_2x1( Z, &ZT,
                   &ZB, 0, FLA_TOP );
  FLA_Part_1x2( T, &TL, &TR, 0, FLA_LEFT );

  while ( FLA_Obj_length( ATL ) < m_A )
  {
    // ABR is only reassigned by FLA_Cont_with_3x3_to_2x2() at the bottom of
    // the loop, so a single length query serves both block sizes.
    dim_t m_ABR = FLA_Obj_length( ABR );
    dim_t b     = min( m_ABR, b_alg );
    dim_t bb    = min( m_ABR - 1, b_alg );

    FLA_Repart_2x2_to_3x3( ATL, ATR,  &A00, &A01, &A02,
                                      &A10, &A11, &A12,
                           ABL, ABR,  &A20, &A21, &A22,
                           b, b, FLA_BR );
    FLA_Repart_2x1_to_3x1( UT,  &U0,
                                &U1,
                           UB,  &U2,
                           b, FLA_BOTTOM );
    FLA_Repart_2x1_to_3x1( ZT,  &Z0,
                                &Z1,
                           ZB,  &Z2,
                           b, FLA_BOTTOM );
    FLA_Repart_1x2_to_1x3( TL, TR,  &T0, &T1, &T2,
                           b, FLA_RIGHT );

    /*------------------------------------------------------------*/

    // Leading b x b corner of T1, handed to the step routine.
    FLA_Part_2x2( T1, &T1_tl,   &unused0,
                      &unused1, &unused2, b, b, FLA_TL );

    // Left bb columns of ABR and of UB.
    FLA_Part_1x2( ABR, &ABR_l, &unused0, bb, FLA_LEFT );
    FLA_Part_1x2( UB,  &UB_l,  &unused0, bb, FLA_LEFT );

    // [ ABR, T1 ] = step( ABR, T1 ).  The unb and ofu step routines of
    // variant 3 take the same two arguments.
    FLA_Hess_UT_step_opt_var3( ABR, T1_tl );

    // Build UB from ABR, with explicit unit subdiagonal and zeros.
    FLA_Copy_external( ABR_l, UB_l );
    FLA_Part_2x1( UB_l, &UB_tl,
                        &UB_bl, 1, FLA_TOP );
    FLA_Triangularize( FLA_LOWER_TRIANGULAR, FLA_UNIT_DIAG, UB_bl );
    FLA_Set( FLA_ZERO, UB_tl );

    // ATR = ATR - ATR * UB * inv( triu( T1 ) ) * UB'
    if ( FLA_Obj_length( ATR ) > 0 )
    {
      // NOTE: ZT serves as temporary workspace.
      FLA_Part_1x2( ZT, &WT_l, &unused0, bb, FLA_LEFT );
      FLA_Part_2x2( T1, &T1_tl,   &unused0,
                        &unused1, &unused2, bb, bb, FLA_TL );

      // WT_l = ATR * UB_l * inv( triu( T1 ) )
      FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
                         FLA_ONE, ATR, UB_l, FLA_ZERO, WT_l );
      FLA_Trsm_external( FLA_RIGHT, FLA_UPPER_TRIANGULAR,
                         FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG,
                         FLA_ONE, T1_tl, WT_l );

      // ATR = ATR - WT_l * UB_l'
      FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
                         FLA_MINUS_ONE, WT_l, UB_l, FLA_ONE, ATR );
    }

    /*------------------------------------------------------------*/

    FLA_Cont_with_3x3_to_2x2( &ATL, &ATR,  A00, A01, A02,
                                           A10, A11, A12,
                              &ABL, &ABR,  A20, A21, A22,
                              FLA_TL );
    FLA_Cont_with_3x1_to_2x1( &UT,  U0,
                                    U1,
                              &UB,  U2,
                              FLA_TOP );
    FLA_Cont_with_3x1_to_2x1( &ZT,  Z0,
                                    Z1,
                              &ZB,  Z2,
                              FLA_TOP );
    FLA_Cont_with_1x3_to_1x2( &TL, &TR,  T0, T1, T2,
                              FLA_LEFT );
  }

  FLA_Obj_free( &U );
  FLA_Obj_free( &Z );

  return FLA_SUCCESS;
}
FLA_Error FLA_Hess_UT_blk_var4 | ( | FLA_Obj | A, |
FLA_Obj | T | ||
) |
References FLA_Cont_with_1x3_to_1x2(), FLA_Cont_with_3x1_to_2x1(), FLA_Cont_with_3x3_to_2x2(), FLA_Copy_external(), FLA_Gemm_external(), FLA_Hess_UT_step_opt_var4(), FLA_MINUS_ONE, FLA_Obj_create(), FLA_Obj_datatype(), FLA_Obj_free(), FLA_Obj_length(), FLA_ONE, FLA_Part_1x2(), FLA_Part_2x1(), FLA_Part_2x2(), FLA_Repart_1x2_to_1x3(), FLA_Repart_2x1_to_3x1(), FLA_Repart_2x2_to_3x3(), FLA_Set(), FLA_Triangularize(), FLA_Trsm_external(), and FLA_ZERO.
Referenced by FLA_Hess_UT_internal().
/*
 * Body of FLA_Hess_UT_blk_var4: blocked driver whose per-panel work is done
 * by FLA_Hess_UT_step_opt_var4().
 *
 * The algorithmic block size b_alg is FLA_Obj_length( T ).  Three
 * m_A x b_alg work matrices U, Y and Z (datatype of A) are created here and
 * freed before returning.  Each pass of the loop
 *   1. exposes the next b x b diagonal block of A plus the matching
 *      pieces of U, Y, Z and T;
 *   2. runs the step routine on ABR, YB, ZB with the leading b x b corner
 *      of T1;
 *   3. copies the left bb columns of ABR into UB and makes the unit
 *      subdiagonal and the zeros explicit;
 *   4. if ATR has rows, applies
 *        ATR = ATR - ATR * UB * inv( triu( T ) ) * UB'
 *      with the left bb columns of ZT as scratch space;
 *   5. applies A22 = A22 - U2 * Y2' - Z2 * U2' using the left bb columns
 *      of the rows of UB, YB, ZB below the first b.
 *
 * Always returns FLA_SUCCESS; results of the called FLA routines are not
 * inspected.
 */
{
  // Owned workspace, created below and released before returning.
  FLA_Obj U, Y, Z;

  // Partition views of A, U, Y, Z and T.
  FLA_Obj ATL, ATR, ABL, ABR;
  FLA_Obj A00, A01, A02, A10, A11, A12, A20, A21, A22;
  FLA_Obj UT, UB, U0, U1, U2;
  FLA_Obj YT, YB, Y0, Y1, Y2;
  FLA_Obj ZT, ZB, Z0, Z1, Z2;
  FLA_Obj TL, TR, T0, T1, T2;

  // Sub-views that are rebuilt on every iteration.
  FLA_Obj ABR_l, WT_l, T1_tl;
  FLA_Obj UB_l, UB_tl, UB_bl, U2_l;
  FLA_Obj YB_l, Y2_l;
  FLA_Obj ZB_l, Z2_l;

  // Sinks for partitions that are never read.
  FLA_Obj unused0, unused1, unused2;

  FLA_Datatype datatype_A = FLA_Obj_datatype( A );
  dim_t        m_A        = FLA_Obj_length( A );
  dim_t        b_alg      = FLA_Obj_length( T );

  FLA_Obj_create( datatype_A, m_A, b_alg, 0, 0, &U );
  FLA_Obj_create( datatype_A, m_A, b_alg, 0, 0, &Y );
  FLA_Obj_create( datatype_A, m_A, b_alg, 0, 0, &Z );

  FLA_Part_2x2( A, &ATL, &ATR,
                   &ABL, &ABR, 0, 0, FLA_TL );
  FLA_Part_2x1( U, &UT,
                   &UB, 0, FLA_TOP );
  FLA_Part_2x1( Y, &YT,
                   &YB, 0, FLA_TOP );
  FLA_Part_2x1( Z, &ZT,
                   &ZB, 0, FLA_TOP );
  FLA_Part_1x2( T, &TL, &TR, 0, FLA_LEFT );

  while ( FLA_Obj_length( ATL ) < m_A )
  {
    // ABR is only reassigned by FLA_Cont_with_3x3_to_2x2() at the bottom of
    // the loop, so a single length query serves both block sizes.
    dim_t m_ABR = FLA_Obj_length( ABR );
    dim_t b     = min( m_ABR, b_alg );
    dim_t bb    = min( m_ABR - 1, b_alg );

    FLA_Repart_2x2_to_3x3( ATL, ATR,  &A00, &A01, &A02,
                                      &A10, &A11, &A12,
                           ABL, ABR,  &A20, &A21, &A22,
                           b, b, FLA_BR );
    FLA_Repart_2x1_to_3x1( UT,  &U0,
                                &U1,
                           UB,  &U2,
                           b, FLA_BOTTOM );
    FLA_Repart_2x1_to_3x1( YT,  &Y0,
                                &Y1,
                           YB,  &Y2,
                           b, FLA_BOTTOM );
    FLA_Repart_2x1_to_3x1( ZT,  &Z0,
                                &Z1,
                           ZB,  &Z2,
                           b, FLA_BOTTOM );
    FLA_Repart_1x2_to_1x3( TL, TR,  &T0, &T1, &T2,
                           b, FLA_RIGHT );

    /*------------------------------------------------------------*/

    // Leading b x b corner of T1, handed to the step routine.
    FLA_Part_2x2( T1, &T1_tl,   &unused0,
                      &unused1, &unused2, b, b, FLA_TL );

    // Left bb columns of ABR, UB, YB and ZB.
    FLA_Part_1x2( ABR, &ABR_l, &unused0, bb, FLA_LEFT );
    FLA_Part_1x2( UB,  &UB_l,  &unused0, bb, FLA_LEFT );
    FLA_Part_1x2( YB,  &YB_l,  &unused0, bb, FLA_LEFT );
    FLA_Part_1x2( ZB,  &ZB_l,  &unused0, bb, FLA_LEFT );

    // Rows of those column panels below the first b rows.
    FLA_Part_2x1( UB_l, &unused0,
                        &U2_l, b, FLA_TOP );
    FLA_Part_2x1( YB_l, &unused0,
                        &Y2_l, b, FLA_TOP );
    FLA_Part_2x1( ZB_l, &unused0,
                        &Z2_l, b, FLA_TOP );

    // [ ABR, YB, ZB, T1 ] = step( ABR, YB, ZB, T1 ).  The unb and ofu step
    // routines of variant 4 take the same four arguments.
    FLA_Hess_UT_step_opt_var4( ABR, YB, ZB, T1_tl );

    // Build UB from ABR, with explicit unit subdiagonal and zeros.
    FLA_Copy_external( ABR_l, UB_l );
    FLA_Part_2x1( UB_l, &UB_tl,
                        &UB_bl, 1, FLA_TOP );
    FLA_Triangularize( FLA_LOWER_TRIANGULAR, FLA_UNIT_DIAG, UB_bl );
    FLA_Set( FLA_ZERO, UB_tl );

    // ATR = ATR - ATR * UB * inv( triu( T ) ) * UB'
    if ( FLA_Obj_length( ATR ) > 0 )
    {
      // NOTE: ZT serves as temporary workspace.
      FLA_Part_1x2( ZT, &WT_l, &unused0, bb, FLA_LEFT );
      FLA_Part_2x2( T1, &T1_tl,   &unused0,
                        &unused1, &unused2, bb, bb, FLA_TL );

      // WT_l = ATR * UB_l * inv( triu( T ) )
      FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
                         FLA_ONE, ATR, UB_l, FLA_ZERO, WT_l );
      FLA_Trsm_external( FLA_RIGHT, FLA_UPPER_TRIANGULAR,
                         FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG,
                         FLA_ONE, T1_tl, WT_l );

      // ATR = ATR - WT_l * UB_l'
      FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
                         FLA_MINUS_ONE, WT_l, UB_l, FLA_ONE, ATR );
    }

    // A22 = A22 - U2 * Y2' - Z2 * U2'
    FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
                       FLA_MINUS_ONE, U2_l, Y2_l, FLA_ONE, A22 );
    FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
                       FLA_MINUS_ONE, Z2_l, U2_l, FLA_ONE, A22 );

    /*------------------------------------------------------------*/

    FLA_Cont_with_3x3_to_2x2( &ATL, &ATR,  A00, A01, A02,
                                           A10, A11, A12,
                              &ABL, &ABR,  A20, A21, A22,
                              FLA_TL );
    FLA_Cont_with_3x1_to_2x1( &UT,  U0,
                                    U1,
                              &UB,  U2,
                              FLA_TOP );
    FLA_Cont_with_3x1_to_2x1( &YT,  Y0,
                                    Y1,
                              &YB,  Y2,
                              FLA_TOP );
    FLA_Cont_with_3x1_to_2x1( &ZT,  Z0,
                                    Z1,
                              &ZB,  Z2,
                              FLA_TOP );
    FLA_Cont_with_1x3_to_1x2( &TL, &TR,  T0, T1, T2,
                              FLA_LEFT );
  }

  FLA_Obj_free( &U );
  FLA_Obj_free( &Y );
  FLA_Obj_free( &Z );

  return FLA_SUCCESS;
}
FLA_Error FLA_Hess_UT_blk_var5 | ( | FLA_Obj | A, |
FLA_Obj | T | ||
) |
References FLA_Apply_Q_UT(), FLA_Cont_with_1x3_to_1x2(), FLA_Cont_with_3x1_to_2x1(), FLA_Cont_with_3x3_to_2x2(), FLA_Copyt_external(), FLA_Gemm_external(), FLA_Hess_UT_step_opt_var5(), FLA_Merge_2x1(), FLA_MINUS_ONE, FLA_Obj_create(), FLA_Obj_datatype(), FLA_Obj_free(), FLA_Obj_length(), FLA_Obj_width(), FLA_ONE, FLA_Part_1x2(), FLA_Part_2x1(), FLA_Part_2x2(), FLA_Repart_1x2_to_1x3(), FLA_Repart_2x1_to_3x1(), FLA_Repart_2x2_to_3x3(), FLA_Trsm_external(), and FLA_ZERO.
Referenced by FLA_Hess_UT_internal().
/*
 * Body of FLA_Hess_UT_blk_var5: blocked driver whose per-panel work is done
 * by FLA_Hess_UT_step_opt_var5().
 *
 * The algorithmic block size b_alg is FLA_Obj_length( T ).  Two
 * m_A x b_alg work matrices U and Z (datatype of A) are created here and
 * freed before returning.  The columns of T to the right of the current
 * block T1 are exposed as W12 and overwritten as workspace.  Each pass of
 * the loop
 *   1. exposes the next b x b diagonal block of A plus the matching
 *      pieces of U, Z and T;
 *   2. runs the step routine on ABR, UB, ZB with the leading b x b corner
 *      of T1;
 *   3. if ATR has rows, applies
 *        ATR = ATR - ATR * UB * inv( triu( T1 ) ) * UB'
 *      with the left bb columns of ZT as scratch space;
 *   4. if A12 has columns, updates the merged [ A12; A22 ] panel: first
 *      subtracts ZB * inv( triu( T1 ) ) * U2', then applies Q11' from the
 *      left via FLA_Apply_Q_UT().
 *
 * Always returns FLA_SUCCESS; results of the called FLA routines are not
 * inspected.
 */
{
  // Owned workspace, created below and released before returning.
  FLA_Obj U, Z;

  // Partition views of A, U, Z and T (W12 is the trailing part of T).
  FLA_Obj ATL, ATR, ABL, ABR;
  FLA_Obj A00, A01, A02, A10, A11, A12, A20, A21, A22;
  FLA_Obj UT, UB, U0, U1, U2;
  FLA_Obj ZT, ZB, Z0, Z1, Z2;
  FLA_Obj TL, TR, T0, T1, W12;

  // Sub-views that are rebuilt on every iteration.
  FLA_Obj UB_l, ZB_l, WT_l, T1_tl;

  // Sinks for partitions that are never read.
  FLA_Obj unused0, unused1, unused2;

  FLA_Datatype datatype_A = FLA_Obj_datatype( A );
  dim_t        m_A        = FLA_Obj_length( A );
  dim_t        b_alg      = FLA_Obj_length( T );

  FLA_Obj_create( datatype_A, m_A, b_alg, 0, 0, &U );
  FLA_Obj_create( datatype_A, m_A, b_alg, 0, 0, &Z );

  FLA_Part_2x2( A, &ATL, &ATR,
                   &ABL, &ABR, 0, 0, FLA_TL );
  FLA_Part_2x1( U, &UT,
                   &UB, 0, FLA_TOP );
  FLA_Part_2x1( Z, &ZT,
                   &ZB, 0, FLA_TOP );
  FLA_Part_1x2( T, &TL, &TR, 0, FLA_LEFT );

  while ( FLA_Obj_length( ATL ) < m_A )
  {
    // ABR is only reassigned by FLA_Cont_with_3x3_to_2x2() at the bottom of
    // the loop, so a single length query serves both block sizes.
    dim_t m_ABR = FLA_Obj_length( ABR );
    dim_t b     = min( m_ABR, b_alg );
    dim_t bb    = min( m_ABR - 1, b_alg );

    FLA_Repart_2x2_to_3x3( ATL, ATR,  &A00, &A01, &A02,
                                      &A10, &A11, &A12,
                           ABL, ABR,  &A20, &A21, &A22,
                           b, b, FLA_BR );
    FLA_Repart_2x1_to_3x1( UT,  &U0,
                                &U1,
                           UB,  &U2,
                           b, FLA_BOTTOM );
    FLA_Repart_2x1_to_3x1( ZT,  &Z0,
                                &Z1,
                           ZB,  &Z2,
                           b, FLA_BOTTOM );
    FLA_Repart_1x2_to_1x3( TL, TR,  &T0, &T1, &W12,
                           b, FLA_RIGHT );

    /*------------------------------------------------------------*/

    // Leading b x b corner of T1, handed to the step routine.
    FLA_Part_2x2( T1, &T1_tl,   &unused0,
                      &unused1, &unused2, b, b, FLA_TL );

    // Left bb columns of UB and ZB.
    FLA_Part_1x2( UB, &UB_l, &unused0, bb, FLA_LEFT );
    FLA_Part_1x2( ZB, &ZB_l, &unused0, bb, FLA_LEFT );

    // [ ABR, UB, ZB, T1 ] = step( ABR, UB, ZB, T1 ).  The unb step routine
    // of variant 5 takes the same four arguments.
    FLA_Hess_UT_step_opt_var5( ABR, UB, ZB, T1_tl );

    // ATR = ATR - ATR * UB * inv( triu( T1 ) ) * UB'
    if ( FLA_Obj_length( ATR ) > 0 )
    {
      // NOTE: ZT serves as temporary workspace.
      FLA_Part_1x2( ZT, &WT_l, &unused0, bb, FLA_LEFT );
      FLA_Part_2x2( T1, &T1_tl,   &unused0,
                        &unused1, &unused2, bb, bb, FLA_TL );

      // WT_l = ATR * UB_l * inv( triu( T1 ) )
      FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
                         FLA_ONE, ATR, UB_l, FLA_ZERO, WT_l );
      FLA_Trsm_external( FLA_RIGHT, FLA_UPPER_TRIANGULAR,
                         FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG,
                         FLA_ONE, T1_tl, WT_l );

      // ATR = ATR - WT_l * UB_l'
      FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
                         FLA_MINUS_ONE, WT_l, UB_l, FLA_ONE, ATR );
    }

    // / A12 \ = Q11' * / / A12 \ - / Z1 \ * inv( triu( T1 ) ) * U2' \
    // \ A22 /          \ \ A22 /   \ Z2 /                           /
    //
    // Q11 is the block Householder transformation associated with UB
    // and T1.
    if ( FLA_Obj_width( A12 ) > 0 )
    {
      FLA_Obj ABR2, ABR2_b;
      FLA_Obj UB_b;

      // NOTE: A12.n > 0 rules out the edge case bb = b - 1 = ABR.m - 1, so
      // the full partitions (ZB rather than ZB_l, and so on) may be used
      // inside this scope.

      // W12 = U2'
      // W12 = inv( triu( T1 ) ) * W12
      FLA_Copyt_external( FLA_CONJ_TRANSPOSE, U2, W12 );
      FLA_Trsm_external( FLA_LEFT, FLA_UPPER_TRIANGULAR,
                         FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG,
                         FLA_ONE, T1_tl, W12 );

      FLA_Merge_2x1( A12,
                     A22, &ABR2 );

      // / A12 \ = / A12 \ - / Z1 \ * W12
      // \ A22 /   \ A22 /   \ Z2 /
      FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
                         FLA_MINUS_ONE, ZB, W12, FLA_ONE, ABR2 );

      // Drop the top row of UB so that it has an [implicit] unit diagonal;
      // FLA_Apply_Q_UT() can then apply the block Householder
      // transformation for UB and T1.  This is valid because the top row
      // of ABR2 would be left unchanged by the transformation anyway
      // (ie: multiplied by identity).
      FLA_Part_2x1( UB,   &unused0,
                          &UB_b, 1, FLA_TOP );
      FLA_Part_2x1( ABR2, &unused0,
                          &ABR2_b, 1, FLA_TOP );

      // Apply Q11' to A12 and A22 from the left:
      //
      // / A12 \ = / I - / U1 \ * inv( triu( T1 ) ) * / U1 \' \' / A12 \
      // \ A22 /   \     \ U2 /                       \ U2 /  /  \ A22 /
      //
      FLA_Apply_Q_UT( FLA_LEFT, FLA_CONJ_TRANSPOSE,
                      FLA_FORWARD, FLA_COLUMNWISE,
                      UB_b, T1_tl, W12, ABR2_b );
    }

    /*------------------------------------------------------------*/

    FLA_Cont_with_3x3_to_2x2( &ATL, &ATR,  A00, A01, A02,
                                           A10, A11, A12,
                              &ABL, &ABR,  A20, A21, A22,
                              FLA_TL );
    FLA_Cont_with_3x1_to_2x1( &UT,  U0,
                                    U1,
                              &UB,  U2,
                              FLA_TOP );
    FLA_Cont_with_3x1_to_2x1( &ZT,  Z0,
                                    Z1,
                              &ZB,  Z2,
                              FLA_TOP );
    FLA_Cont_with_1x3_to_1x2( &TL, &TR,  T0, T1, W12,
                              FLA_LEFT );
  }

  FLA_Obj_free( &U );
  FLA_Obj_free( &Z );

  return FLA_SUCCESS;
}
FLA_Error FLA_Hess_UT_ofu_var1( FLA_Obj A, FLA_Obj T )
FLA_Error FLA_Hess_UT_ofu_var2( FLA_Obj A, FLA_Obj T )
References FLA_Hess_UT_step_ofu_var2().
{
  /* Thin entry point: all work is delegated to
     FLA_Hess_UT_step_ofu_var2(), whose status is returned unchanged. */
  FLA_Error status = FLA_Hess_UT_step_ofu_var2( A, T );

  return status;
}
FLA_Error FLA_Hess_UT_ofu_var3( FLA_Obj A, FLA_Obj T )
References FLA_Hess_UT_step_ofu_var3().
{
  /* Thin entry point: all work is delegated to
     FLA_Hess_UT_step_ofu_var3(), whose status is returned unchanged. */
  FLA_Error status = FLA_Hess_UT_step_ofu_var3( A, T );

  return status;
}
FLA_Error FLA_Hess_UT_ofu_var4( FLA_Obj A, FLA_Obj T )
References FLA_Hess_UT_step_ofu_var4(), FLA_Obj_create_conf_to(), and FLA_Obj_free().
{
  FLA_Obj   Y;
  FLA_Obj   Z;
  FLA_Error status;

  /* Temporary objects Y and Z, each created conformal to A
     (no transposition). */
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &Y );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &Z );

  /* The step routine does the actual work using Y and Z. */
  status = FLA_Hess_UT_step_ofu_var4( A, Y, Z, T );

  /* Release the temporaries before handing back the step routine's status. */
  FLA_Obj_free( &Y );
  FLA_Obj_free( &Z );

  return status;
}
FLA_Error FLA_Hess_UT_opt_var1( FLA_Obj A, FLA_Obj T )
References FLA_Hess_UT_step_opt_var1().
Referenced by FLA_Hess_UT_internal().
{
  /* Thin entry point: all work is delegated to
     FLA_Hess_UT_step_opt_var1(), whose status is returned unchanged. */
  FLA_Error status = FLA_Hess_UT_step_opt_var1( A, T );

  return status;
}
FLA_Error FLA_Hess_UT_opt_var2( FLA_Obj A, FLA_Obj T )
References FLA_Hess_UT_step_opt_var2().
Referenced by FLA_Hess_UT_internal().
{
  /* Thin entry point: all work is delegated to
     FLA_Hess_UT_step_opt_var2(), whose status is returned unchanged. */
  FLA_Error status = FLA_Hess_UT_step_opt_var2( A, T );

  return status;
}
FLA_Error FLA_Hess_UT_opt_var3( FLA_Obj A, FLA_Obj T )
References FLA_Hess_UT_step_opt_var3().
Referenced by FLA_Hess_UT_internal().
{
  /* Thin entry point: all work is delegated to
     FLA_Hess_UT_step_opt_var3(), whose status is returned unchanged. */
  FLA_Error status = FLA_Hess_UT_step_opt_var3( A, T );

  return status;
}
FLA_Error FLA_Hess_UT_opt_var4( FLA_Obj A, FLA_Obj T )
References FLA_Hess_UT_step_opt_var4(), FLA_Obj_create_conf_to(), and FLA_Obj_free().
Referenced by FLA_Hess_UT_internal().
{
  FLA_Obj   Y;
  FLA_Obj   Z;
  FLA_Error status;

  /* Temporary objects Y and Z, each created conformal to A
     (no transposition). */
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &Y );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &Z );

  /* The step routine does the actual work using Y and Z. */
  status = FLA_Hess_UT_step_opt_var4( A, Y, Z, T );

  /* Release the temporaries before handing back the step routine's status. */
  FLA_Obj_free( &Y );
  FLA_Obj_free( &Z );

  return status;
}
FLA_Error FLA_Hess_UT_opt_var5( FLA_Obj A, FLA_Obj T )
References FLA_Hess_UT_step_opt_var5(), FLA_Obj_create_conf_to(), and FLA_Obj_free().
Referenced by FLA_Hess_UT_internal().
{
  FLA_Obj   U;
  FLA_Obj   Z;
  FLA_Error status;

  /* Temporary objects U and Z, each created conformal to A
     (no transposition). */
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &U );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &Z );

  /* The step routine does the actual work using U and Z. */
  status = FLA_Hess_UT_step_opt_var5( A, U, Z, T );

  /* Release the temporaries before handing back the step routine's status. */
  FLA_Obj_free( &U );
  FLA_Obj_free( &Z );

  return status;
}
FLA_Error FLA_Hess_UT_step_ofc_var1( int m_A, int m_T, scomplex *buff_A, int rs_A, int cs_A, scomplex *buff_T, int rs_T, int cs_T )
FLA_Error FLA_Hess_UT_step_ofc_var2( int m_A, int m_T, scomplex *buff_A, int rs_A, int cs_A, scomplex *buff_T, int rs_T, int cs_T )
References bli_caxpyv(), bli_cdot(), bli_cgemv(), bli_cger(), bli_cscalv(), BLIS_CONJ_TRANSPOSE, BLIS_CONJUGATE, BLIS_NO_CONJUGATE, BLIS_NO_TRANSPOSE, FLA_free(), FLA_Fused_Ahx_Ax_opc_var1(), FLA_Fused_Gerc2_opc_var1(), FLA_Househ2_UT_l_opc(), FLA_malloc(), FLA_MINUS_ONE, FLA_ONE, FLA_TWO, and FLA_ZERO.
Referenced by FLA_Hess_UT_step_ofu_var2().
{
  /*
   * scomplex kernel, variant 2 (uses the fused Ahx_Ax and Gerc2 helpers).
   * Performs m_T steps on the m_A-by-m_A matrix stored in buff_A with
   * strides rs_A/cs_A.  In each step a Householder transform is computed
   * from the sub-diagonal part of column i, applied to the remaining
   * parts of A, and column i of T is filled in.  Always returns
   * FLA_SUCCESS.
   */

  /* Constants 2, 1, 0 and -1 obtained through FLA_COMPLEX_PTR(). */
  scomplex *buff_2  = FLA_COMPLEX_PTR( FLA_TWO );
  scomplex *buff_1  = FLA_COMPLEX_PTR( FLA_ONE );
  scomplex *buff_0  = FLA_COMPLEX_PTR( FLA_ZERO );
  scomplex *buff_m1 = FLA_COMPLEX_PTR( FLA_MINUS_ONE );

  scomplex  head_save;
  scomplex  rho;
  scomplex  beta, conj_beta;
  scomplex  inv_tau11;
  scomplex  minus_inv_tau11;

  /* Number of steps; FLA_Obj_length( T ) in the object-based API. */
  int       n_steps = m_T;
  int       i;

  /* Unit-stride scratch vectors y and z, m_A elements each. */
  scomplex *buff_y = ( scomplex* ) FLA_malloc( m_A * sizeof( *buff_A ) );
  scomplex *buff_z = ( scomplex* ) FLA_malloc( m_A * sizeof( *buff_A ) );
  int       inc_y  = 1;
  int       inc_z  = 1;

  for ( i = 0; i < n_steps; ++i )
  {
    /* Views into A, T and the scratch vectors for step i. */
    scomplex *A20   = buff_A + (i+1)*rs_A;
    scomplex *a21   = buff_A + i*cs_A + (i+1)*rs_A;
    scomplex *A02   = buff_A + (i+1)*cs_A;
    scomplex *a12t  = buff_A + (i+1)*cs_A + i*rs_A;
    scomplex *A22   = buff_A + (i+1)*cs_A + (i+1)*rs_A;

    scomplex *t01   = buff_T + i*cs_T;
    scomplex *tau11 = buff_T + i*cs_T + i*rs_T;

    scomplex *y0    = buff_y;
    scomplex *y2    = buff_y + (i+1)*inc_y;
    scomplex *z2    = buff_z + (i+1)*inc_z;

    /* a21 split into its first element and the rest. */
    scomplex *a21_t = a21;
    scomplex *a21_b = a21 + rs_A;

    int       m_ahead  = m_A - i - 1;
    int       n_ahead  = m_A - i - 1;
    int       m_behind = i;
    int       n_behind = i;

    /* Nothing below the diagonal: nothing to do for this step. */
    if ( m_ahead <= 0 )
      continue;

    /* Householder transform from ( a21_t; a21_b ); its scalar goes to tau11. */
    FLA_Househ2_UT_l_opc( m_ahead - 1,
                          a21_t,
                          a21_b, rs_A,
                          tau11 );

    /* inv_tau11 = 1 / tau11;  minus_inv_tau11 = -inv_tau11. */
    bli_cdiv3( buff_1, tau11, &inv_tau11 );
    bli_cneg2( &inv_tau11, &minus_inv_tau11 );

    /* Temporarily overwrite the head of a21 with 1; restored at the end. */
    head_save = *a21_t;
    *a21_t    = *buff_1;

    /* y2 = A22' * a21;  z2 = A22 * a21  (fused). */
    FLA_Fused_Ahx_Ax_opc_var1( m_ahead,
                               n_ahead,
                               A22, rs_A, cs_A,
                               a21, rs_A,
                               y2,  inc_y,
                               z2,  inc_z );

    /* beta = ( a21' * z2 ) / 2;  conj_beta = conj( beta ). */
    bli_cdot( BLIS_CONJUGATE,
              m_ahead,
              a21, rs_A,
              z2,  inc_z,
              &beta );
    bli_cinvscals( buff_2, &beta );
    bli_ccopyconj( &beta, &conj_beta );

    /* y2 = ( y2 - ( conj_beta / tau11 ) * a21 ) / tau11. */
    bli_cscals( &minus_inv_tau11, &conj_beta );
    bli_caxpyv( BLIS_NO_CONJUGATE,
                m_ahead,
                &conj_beta,
                a21, rs_A,
                y2,  inc_y );
    bli_cscalv( BLIS_NO_CONJUGATE,
                m_ahead,
                &inv_tau11,
                y2, inc_y );

    /* z2 = ( z2 - ( beta / tau11 ) * a21 ) / tau11. */
    bli_cscals( &minus_inv_tau11, &beta );
    bli_caxpyv( BLIS_NO_CONJUGATE,
                m_ahead,
                &beta,
                a21, rs_A,
                z2,  inc_z );
    bli_cscalv( BLIS_NO_CONJUGATE,
                m_ahead,
                &inv_tau11,
                z2, inc_z );

    /* a12t = a12t - ( ( a12t * a21 ) / tau11 ) * a21'. */
    bli_cdot( BLIS_NO_CONJUGATE,
              m_ahead,
              a12t, cs_A,
              a21,  rs_A,
              &rho );
    bli_cscals( &minus_inv_tau11, &rho );
    bli_caxpyv( BLIS_CONJUGATE,
                m_ahead,
                &rho,
                a21,  rs_A,
                a12t, cs_A );

    /* y0 = A02 * a21;  A02 = A02 - ( 1 / tau11 ) * y0 * a21'. */
    bli_cgemv( BLIS_NO_TRANSPOSE,
               BLIS_NO_CONJUGATE,
               m_behind,
               n_ahead,
               buff_1,
               A02, rs_A, cs_A,
               a21, rs_A,
               buff_0,
               y0, inc_y );
    bli_cger( BLIS_NO_CONJUGATE,
              BLIS_CONJUGATE,
              m_behind,
              n_ahead,
              &minus_inv_tau11,
              y0,  inc_y,
              a21, rs_A,
              A02, rs_A, cs_A );

    /* A22 = A22 - a21 * y2' - z2 * a21'  (fused). */
    FLA_Fused_Gerc2_opc_var1( m_ahead,
                              n_ahead,
                              buff_m1,
                              a21, rs_A,
                              y2,  inc_y,
                              z2,  inc_z,
                              a21, rs_A,
                              A22, rs_A, cs_A );

    /* t01 = A20' * a21. */
    bli_cgemv( BLIS_CONJ_TRANSPOSE,
               BLIS_NO_CONJUGATE,
               m_ahead,
               n_behind,
               buff_1,
               A20, rs_A, cs_A,
               a21, rs_A,
               buff_0,
               t01, rs_T );

    /* Put back the original head of a21. */
    *a21_t = head_save;
  }

  FLA_free( buff_y );
  FLA_free( buff_z );

  return FLA_SUCCESS;
}
FLA_Error FLA_Hess_UT_step_ofc_var3( int m_A, int m_T, scomplex *buff_A, int rs_A, int cs_A, scomplex *buff_T, int rs_T, int cs_T )
References bli_caxpyv(), bli_ccopyv(), bli_cdot(), bli_cgemv(), bli_cger(), bli_cscalv(), BLIS_CONJ_TRANSPOSE, BLIS_CONJUGATE, BLIS_NO_CONJUGATE, BLIS_NO_TRANSPOSE, FLA_free(), FLA_Fused_Ahx_Ax_opc_var1(), FLA_Fused_Gerc2_Ahx_Ax_opc_var1(), FLA_Fused_Gerc2_opc_var1(), FLA_Househ2_UT_l_opc(), FLA_malloc(), FLA_MINUS_ONE, FLA_ONE, FLA_TWO, and FLA_ZERO.
Referenced by FLA_Hess_UT_step_ofu_var3().
{
  /*
   * scomplex kernel, variant 3.  Like variant 2 it performs m_T steps on
   * the m_A-by-m_A matrix in buff_A, but the rank-2 update of A22 is
   * deferred: the vectors u, y, z of one step are kept in scratch storage
   * and applied at the start of the next step (fused with the products
   * A22' * a21 and A22 * a21), with one final update after the last step.
   * Always returns FLA_SUCCESS.
   */

  /* Constants 2, 1, 0 and -1 obtained through FLA_COMPLEX_PTR(). */
  scomplex *buff_2  = FLA_COMPLEX_PTR( FLA_TWO );
  scomplex *buff_1  = FLA_COMPLEX_PTR( FLA_ONE );
  scomplex *buff_0  = FLA_COMPLEX_PTR( FLA_ZERO );
  scomplex *buff_m1 = FLA_COMPLEX_PTR( FLA_MINUS_ONE );

  scomplex  head_save;
  scomplex  rho;
  scomplex  beta, conj_beta;
  scomplex  inv_tau11;
  scomplex  minus_inv_tau11;
  scomplex  minus_upsilon1, minus_conj_upsilon1;
  scomplex  minus_psi1, minus_conj_psi1;
  scomplex  minus_zeta1;

  /* Number of steps; FLA_Obj_length( T ) in the object-based API. */
  int       n_steps = m_T;
  int       i;

  /* Unit-stride scratch vectors u, y, z, v, w, m_A elements each. */
  scomplex *buff_u = ( scomplex* ) FLA_malloc( m_A * sizeof( *buff_A ) );
  scomplex *buff_y = ( scomplex* ) FLA_malloc( m_A * sizeof( *buff_A ) );
  scomplex *buff_z = ( scomplex* ) FLA_malloc( m_A * sizeof( *buff_A ) );
  scomplex *buff_v = ( scomplex* ) FLA_malloc( m_A * sizeof( *buff_A ) );
  scomplex *buff_w = ( scomplex* ) FLA_malloc( m_A * sizeof( *buff_A ) );
  int       inc_u  = 1;
  int       inc_y  = 1;
  int       inc_z  = 1;
  int       inc_v  = 1;
  int       inc_w  = 1;

  for ( i = 0; i < n_steps; ++i )
  {
    /* Views into A, T and the scratch vectors for step i. */
    scomplex *A20      = buff_A + (i+1)*rs_A;
    scomplex *alpha11  = buff_A + i*cs_A + i*rs_A;
    scomplex *a21      = buff_A + i*cs_A + (i+1)*rs_A;
    scomplex *A02      = buff_A + (i+1)*cs_A;
    scomplex *a12t     = buff_A + (i+1)*cs_A + i*rs_A;
    scomplex *A22      = buff_A + (i+1)*cs_A + (i+1)*rs_A;

    scomplex *t01      = buff_T + i*cs_T;
    scomplex *tau11    = buff_T + i*cs_T + i*rs_T;

    scomplex *upsilon1 = buff_u + i*inc_u;
    scomplex *u2       = buff_u + (i+1)*inc_u;

    scomplex *y0       = buff_y;
    scomplex *psi1     = buff_y + i*inc_y;
    scomplex *y2       = buff_y + (i+1)*inc_y;

    scomplex *zeta1    = buff_z + i*inc_z;
    scomplex *z2       = buff_z + (i+1)*inc_z;

    scomplex *v2       = buff_v + (i+1)*inc_v;
    scomplex *w2       = buff_w + (i+1)*inc_w;

    /* a21 split into its first element and the rest. */
    scomplex *a21_t    = a21;
    scomplex *a21_b    = a21 + rs_A;

    int       m_ahead  = m_A - i - 1;
    int       n_ahead  = m_A - i - 1;
    int       m_behind = i;
    int       n_behind = i;

    if ( m_behind > 0 )
    {
      /* Apply the update left over from the previous step to alpha11,
         a12t and a21, using the stored u, y and z vectors. */

      /* minus_upsilon1 = -upsilon1, plus its conjugate. */
      bli_cmult3( buff_m1, upsilon1, &minus_upsilon1 );
      bli_ccopyconj( &minus_upsilon1, &minus_conj_upsilon1 );

      /* minus_psi1 = -psi1, plus its conjugate. */
      bli_cmult3( buff_m1, psi1, &minus_psi1 );
      bli_ccopyconj( &minus_psi1, &minus_conj_psi1 );

      /* minus_zeta1 = -zeta1. */
      bli_cmult3( buff_m1, zeta1, &minus_zeta1 );

      /* alpha11 = alpha11 - upsilon1 * conj( psi1 ) - zeta1 * conj( upsilon1 ). */
      bli_caxpyv( BLIS_CONJUGATE,
                  1,
                  &minus_upsilon1,
                  psi1,    1,
                  alpha11, 1 );
      bli_caxpyv( BLIS_CONJUGATE,
                  1,
                  &minus_zeta1,
                  upsilon1, 1,
                  alpha11,  1 );

      /* a12t = a12t - upsilon1 * y2' - zeta1 * u2'. */
      bli_caxpyv( BLIS_CONJUGATE,
                  m_ahead,
                  &minus_upsilon1,
                  y2,   inc_y,
                  a12t, cs_A );
      bli_caxpyv( BLIS_CONJUGATE,
                  m_ahead,
                  &minus_zeta1,
                  u2,   inc_u,
                  a12t, cs_A );

      /* a21 = a21 - conj( psi1 ) * u2 - conj( upsilon1 ) * z2. */
      bli_caxpyv( BLIS_NO_CONJUGATE,
                  m_ahead,
                  &minus_conj_psi1,
                  u2,  inc_u,
                  a21, rs_A );
      bli_caxpyv( BLIS_NO_CONJUGATE,
                  m_ahead,
                  &minus_conj_upsilon1,
                  z2,  inc_z,
                  a21, rs_A );
    }

    /* Everything that follows needs at least one row below the diagonal. */
    if ( m_ahead <= 0 )
      continue;

    /* Householder transform from ( a21_t; a21_b ); its scalar goes to tau11. */
    FLA_Househ2_UT_l_opc( m_ahead - 1,
                          a21_t,
                          a21_b, rs_A,
                          tau11 );

    /* inv_tau11 = 1 / tau11;  minus_inv_tau11 = -inv_tau11. */
    bli_cdiv3( buff_1, tau11, &inv_tau11 );
    bli_cneg2( &inv_tau11, &minus_inv_tau11 );

    /* Temporarily overwrite the head of a21 with 1; restored below. */
    head_save = *a21_t;
    *a21_t    = *buff_1;

    if ( m_behind > 0 )
    {
      /* A22 = A22 - u2 * y2' - z2 * u2', then
         v2 = A22' * a21 and w2 = A22 * a21  (all fused). */
      FLA_Fused_Gerc2_Ahx_Ax_opc_var1( m_ahead,
                                       n_ahead,
                                       buff_m1,
                                       u2,  inc_u,
                                       y2,  inc_y,
                                       z2,  inc_z,
                                       A22, rs_A, cs_A,
                                       a21, rs_A,
                                       v2,  inc_v,
                                       w2,  inc_w );
    }
    else
    {
      /* First step: no pending update, only
         v2 = A22' * a21 and w2 = A22 * a21. */
      FLA_Fused_Ahx_Ax_opc_var1( m_ahead,
                                 n_ahead,
                                 A22, rs_A, cs_A,
                                 a21, rs_A,
                                 v2,  inc_v,
                                 w2,  inc_w );
    }

    /* u2 = a21;  y2 = v2;  z2 = w2. */
    bli_ccopyv( BLIS_NO_CONJUGATE,
                m_ahead,
                a21, rs_A,
                u2,  inc_u );
    bli_ccopyv( BLIS_NO_CONJUGATE,
                m_ahead,
                v2, inc_v,
                y2, inc_y );
    bli_ccopyv( BLIS_NO_CONJUGATE,
                m_ahead,
                w2, inc_w,
                z2, inc_z );

    /* beta = ( a21' * z2 ) / 2;  conj_beta = conj( beta ). */
    bli_cdot( BLIS_CONJUGATE,
              m_ahead,
              a21, rs_A,
              z2,  inc_z,
              &beta );
    bli_cinvscals( buff_2, &beta );
    bli_ccopyconj( &beta, &conj_beta );

    /* y2 = ( y2 - ( conj_beta / tau11 ) * a21 ) / tau11. */
    bli_cscals( &minus_inv_tau11, &conj_beta );
    bli_caxpyv( BLIS_NO_CONJUGATE,
                m_ahead,
                &conj_beta,
                a21, rs_A,
                y2,  inc_y );
    bli_cscalv( BLIS_NO_CONJUGATE,
                m_ahead,
                &inv_tau11,
                y2, inc_y );

    /* z2 = ( z2 - ( beta / tau11 ) * a21 ) / tau11. */
    bli_cscals( &minus_inv_tau11, &beta );
    bli_caxpyv( BLIS_NO_CONJUGATE,
                m_ahead,
                &beta,
                a21, rs_A,
                z2,  inc_z );
    bli_cscalv( BLIS_NO_CONJUGATE,
                m_ahead,
                &inv_tau11,
                z2, inc_z );

    /* a12t = a12t - ( ( a12t * a21 ) / tau11 ) * a21'. */
    bli_cdot( BLIS_NO_CONJUGATE,
              m_ahead,
              a12t, cs_A,
              a21,  rs_A,
              &rho );
    bli_cscals( &minus_inv_tau11, &rho );
    bli_caxpyv( BLIS_CONJUGATE,
                m_ahead,
                &rho,
                a21,  rs_A,
                a12t, cs_A );

    /* y0 = A02 * a21;  A02 = A02 - ( 1 / tau11 ) * y0 * a21'. */
    bli_cgemv( BLIS_NO_TRANSPOSE,
               BLIS_NO_CONJUGATE,
               m_behind,
               n_ahead,
               buff_1,
               A02, rs_A, cs_A,
               a21, rs_A,
               buff_0,
               y0, inc_y );
    bli_cger( BLIS_NO_CONJUGATE,
              BLIS_CONJUGATE,
              m_behind,
              n_ahead,
              &minus_inv_tau11,
              y0,  inc_y,
              a21, rs_A,
              A02, rs_A, cs_A );

    /* t01 = A20' * a21. */
    bli_cgemv( BLIS_CONJ_TRANSPOSE,
               BLIS_NO_CONJUGATE,
               m_ahead,
               n_behind,
               buff_1,
               A20, rs_A, cs_A,
               a21, rs_A,
               buff_0,
               t01, rs_T );

    /* Put back the original head of a21. */
    *a21_t = head_save;

    /* Last step: no later iteration will apply the pending update, so
       do it now:  A22 = A22 - u2 * y2' - z2 * u2'. */
    if ( m_behind + 1 == n_steps )
    {
      FLA_Fused_Gerc2_opc_var1( m_ahead,
                                n_ahead,
                                buff_m1,
                                u2,  inc_u,
                                y2,  inc_y,
                                z2,  inc_z,
                                u2,  inc_u,
                                A22, rs_A, cs_A );
    }
  }

  FLA_free( buff_u );
  FLA_free( buff_y );
  FLA_free( buff_z );
  FLA_free( buff_v );
  FLA_free( buff_w );

  return FLA_SUCCESS;
}
FLA_Error FLA_Hess_UT_step_ofc_var4( int m_A, int m_T, scomplex *buff_A, int rs_A, int cs_A, scomplex *buff_Y, int rs_Y, int cs_Y, scomplex *buff_Z, int rs_Z, int cs_Z, scomplex *buff_T, int rs_T, int cs_T )
References bli_caxpyv(), bli_cdot(), bli_cgemv(), bli_cger(), bli_cscalv(), bli_csetm(), BLIS_CONJ_NO_TRANSPOSE, BLIS_CONJUGATE, BLIS_NO_CONJUGATE, BLIS_NO_TRANSPOSE, FLA_free(), FLA_Fused_Ahx_Ax_opc_var1(), FLA_Fused_Uhu_Yhu_Zhu_opc_var1(), FLA_Househ2_UT_l_opc(), FLA_malloc(), FLA_MINUS_ONE, FLA_ONE, FLA_TWO, and FLA_ZERO.
Referenced by FLA_Hess_UT_step_ofu_var4().
{
  /*
   * scomplex kernel, variant 4.  Performs m_T steps on the m_A-by-m_A
   * matrix in buff_A.  Updates are accumulated in the caller-supplied
   * m_A-by-m_T matrices Y and Z (both zeroed on entry) and applied
   * lazily: at the start of each step the current column and row of A
   * are brought up to date from the already-computed columns of A, Y
   * and Z before the next Householder transform is formed.  Column i of
   * T is filled in at step i.  Always returns FLA_SUCCESS.
   */

  /* Constants 2, 1, 0 and -1 obtained through FLA_COMPLEX_PTR(). */
  scomplex* buff_2  = FLA_COMPLEX_PTR( FLA_TWO );
  scomplex* buff_1  = FLA_COMPLEX_PTR( FLA_ONE );
  scomplex* buff_0  = FLA_COMPLEX_PTR( FLA_ZERO );
  scomplex* buff_m1 = FLA_COMPLEX_PTR( FLA_MINUS_ONE );

  scomplex  first_elem, last_elem;
  scomplex  dot_product;
  scomplex  beta, conj_beta;
  scomplex  inv_tau11;
  scomplex  minus_inv_tau11;
  int       i;

  /* Number of steps; FLA_Obj_length( T ) in the object-based API. */
  int       b_alg = m_T;

  /* Unit-stride scratch vector e of m_A elements. */
  scomplex* buff_e = ( scomplex* ) FLA_malloc( m_A * sizeof( *buff_A ) );
  int       inc_e  = 1;

  /* Y = 0;  Z = 0. */
  bli_csetm( m_A,
             b_alg,
             buff_0,
             buff_Y, rs_Y, cs_Y );
  bli_csetm( m_A,
             b_alg,
             buff_0,
             buff_Z, rs_Z, cs_Z );

  for ( i = 0; i < b_alg; ++i )
  {
    int       m_ahead  = m_A - i - 1;
    int       n_ahead  = m_A - i - 1;
    int       m_behind = i;
    int       n_behind = i;

    /* Views into A, Y, Z, T and the scratch vector for step i. */
    scomplex* a10t     = buff_A + (0  )*cs_A + (i  )*rs_A;
    scomplex* A20      = buff_A + (0  )*cs_A + (i+1)*rs_A;
    scomplex* alpha11  = buff_A + (i  )*cs_A + (i  )*rs_A;
    scomplex* a21      = buff_A + (i  )*cs_A + (i+1)*rs_A;
    scomplex* A02      = buff_A + (i+1)*cs_A + (0  )*rs_A;
    scomplex* a12t     = buff_A + (i+1)*cs_A + (i  )*rs_A;
    scomplex* A22      = buff_A + (i+1)*cs_A + (i+1)*rs_A;

    scomplex* y10t     = buff_Y + (0  )*cs_Y + (i  )*rs_Y;
    scomplex* Y20      = buff_Y + (0  )*cs_Y + (i+1)*rs_Y;
    scomplex* y21      = buff_Y + (i  )*cs_Y + (i+1)*rs_Y;

    scomplex* z10t     = buff_Z + (0  )*cs_Z + (i  )*rs_Z;
    scomplex* Z20      = buff_Z + (0  )*cs_Z + (i+1)*rs_Z;
    scomplex* z21      = buff_Z + (i  )*cs_Z + (i+1)*rs_Z;

    scomplex* t01      = buff_T + (i  )*cs_T + (0  )*rs_T;
    scomplex* tau11    = buff_T + (i  )*cs_T + (i  )*rs_T;

    scomplex* e0       = buff_e + (0  )*inc_e;

    /* Last element of a10t.  It only exists when i > 0; forming
       a10t + (i-1)*cs_A at i == 0 would be pointer arithmetic to one
       column before the start of A (undefined behaviour), so fall back
       to a10t in that case.  The pointer is dereferenced only under
       m_behind > 0 below. */
    scomplex* a10t_r   = ( m_behind > 0 ? a10t + (i-1)*cs_A : a10t );

    /* a21 split into its first element and the rest. */
    scomplex* a21_t    = a21 + (0  )*cs_A + (0  )*rs_A;
    scomplex* a21_b    = a21 + (0  )*cs_A + (1  )*rs_A;

    /* Row i and everything below it: ABL/ZBL cover the columns to the
       left of the diagonal, a2 is column i from the diagonal down. */
    scomplex* ABL      = a10t;
    scomplex* ZBL      = z10t;
    scomplex* a2       = alpha11;

    /*------------------------------------------------------------*/

    if ( m_behind > 0 )
    {
      /* Temporarily set the last element of a10t to 1; restored below. */
      last_elem = *a10t_r;
      *a10t_r   = *buff_1;
    }

    /* a2 = a2 - ABL * conj( y10t ) - ZBL * conj( a10t ). */
    bli_cgemv( BLIS_NO_TRANSPOSE,
               BLIS_CONJUGATE,
               m_ahead + 1,
               n_behind,
               buff_m1,
               ABL,  rs_A, cs_A,
               y10t, cs_Y,
               buff_1,
               a2,   rs_A );
    bli_cgemv( BLIS_NO_TRANSPOSE,
               BLIS_CONJUGATE,
               m_ahead + 1,
               n_behind,
               buff_m1,
               ZBL,  rs_Z, cs_Z,
               a10t, cs_A,
               buff_1,
               a2,   rs_A );

    /* a12t = a12t - conj( Y20 ) * a10t - conj( A20 ) * z10t. */
    bli_cgemv( BLIS_CONJ_NO_TRANSPOSE,
               BLIS_NO_CONJUGATE,
               m_ahead,
               n_behind,
               buff_m1,
               Y20,  rs_Y, cs_Y,
               a10t, cs_A,
               buff_1,
               a12t, cs_A );
    bli_cgemv( BLIS_CONJ_NO_TRANSPOSE,
               BLIS_NO_CONJUGATE,
               m_ahead,
               n_behind,
               buff_m1,
               A20,  rs_A, cs_A,
               z10t, cs_Z,
               buff_1,
               a12t, cs_A );

    if ( m_behind > 0 )
    {
      /* Put back the original last element of a10t. */
      *a10t_r = last_elem;
    }

    if ( m_ahead > 0 )
    {
      /* Householder transform from ( a21_t; a21_b ); scalar goes to tau11. */
      FLA_Househ2_UT_l_opc( m_ahead - 1,
                            a21_t,
                            a21_b, rs_A,
                            tau11 );

      /* inv_tau11 = 1 / tau11;  minus_inv_tau11 = -inv_tau11. */
      bli_cdiv3( buff_1, tau11, &inv_tau11 );
      bli_cneg2( &inv_tau11, &minus_inv_tau11 );

      /* Temporarily overwrite the head of a21 with 1; restored below. */
      first_elem = *a21_t;
      *a21_t     = *buff_1;

      /* y21 = A22' * a21;  z21 = A22 * a21  (fused). */
      FLA_Fused_Ahx_Ax_opc_var1( m_ahead,
                                 n_ahead,
                                 A22, rs_A, cs_A,
                                 a21, rs_A,
                                 y21, rs_Y,
                                 z21, rs_Z );

      /* Fused form of the reference sequence
           d0 = A20' * a21;  e0 = Y20' * a21;  f0 = Z20' * a21;
           y21 = y21 - Y20 * d0 - A20 * f0;
           z21 = z21 - A20 * e0 - Z20 * d0;
           t01 = d0. */
      FLA_Fused_Uhu_Yhu_Zhu_opc_var1( m_ahead,
                                      n_behind,
                                      buff_m1,
                                      A20, rs_A, cs_A,
                                      Y20, rs_Y, cs_Y,
                                      Z20, rs_Z, cs_Z,
                                      t01, rs_T,
                                      a21, rs_A,
                                      y21, rs_Y,
                                      z21, rs_Z );

      /* beta = ( a21' * z21 ) / 2;  conj_beta = conj( beta ). */
      bli_cdot( BLIS_CONJUGATE,
                m_ahead,
                a21, rs_A,
                z21, rs_Z,
                &beta );
      bli_cinvscals( buff_2, &beta );
      bli_ccopyconj( &beta, &conj_beta );

      /* y21 = ( y21 - ( conj_beta / tau11 ) * a21 ) / tau11. */
      bli_cscals( &minus_inv_tau11, &conj_beta );
      bli_caxpyv( BLIS_NO_CONJUGATE,
                  m_ahead,
                  &conj_beta,
                  a21, rs_A,
                  y21, rs_Y );
      bli_cscalv( BLIS_NO_CONJUGATE,
                  m_ahead,
                  &inv_tau11,
                  y21, rs_Y );

      /* z21 = ( z21 - ( beta / tau11 ) * a21 ) / tau11. */
      bli_cscals( &minus_inv_tau11, &beta );
      bli_caxpyv( BLIS_NO_CONJUGATE,
                  m_ahead,
                  &beta,
                  a21, rs_A,
                  z21, rs_Z );
      bli_cscalv( BLIS_NO_CONJUGATE,
                  m_ahead,
                  &inv_tau11,
                  z21, rs_Z );

      /* a12t = a12t - ( ( a12t * a21 ) / tau11 ) * a21'. */
      bli_cdot( BLIS_NO_CONJUGATE,
                m_ahead,
                a12t, cs_A,
                a21,  rs_A,
                &dot_product );
      bli_cscals( &minus_inv_tau11, &dot_product );
      bli_caxpyv( BLIS_CONJUGATE,
                  m_ahead,
                  &dot_product,
                  a21,  rs_A,
                  a12t, cs_A );

      /* e0 = A02 * a21;  A02 = A02 - ( 1 / tau11 ) * e0 * a21'. */
      bli_cgemv( BLIS_NO_TRANSPOSE,
                 BLIS_NO_CONJUGATE,
                 m_behind,
                 n_ahead,
                 buff_1,
                 A02, rs_A, cs_A,
                 a21, rs_A,
                 buff_0,
                 e0, inc_e );
      bli_cger( BLIS_NO_CONJUGATE,
                BLIS_CONJUGATE,
                m_behind,
                n_ahead,
                &minus_inv_tau11,
                e0,  inc_e,
                a21, rs_A,
                A02, rs_A, cs_A );

      /* Put back the original head of a21. */
      *a21_t = first_elem;
    }

    /*------------------------------------------------------------*/
  }

  FLA_free( buff_e );

  return FLA_SUCCESS;
}
FLA_Error FLA_Hess_UT_step_ofd_var1( int m_A, int m_T, double *buff_A, int rs_A, int cs_A, double *buff_T, int rs_T, int cs_T )
FLA_Error FLA_Hess_UT_step_ofd_var2( int m_A, int m_T, double *buff_A, int rs_A, int cs_A, double *buff_T, int rs_T, int cs_T )
References bli_daxpyv(), bli_ddot(), bli_dgemv(), bli_dger(), bli_dscalv(), BLIS_CONJ_TRANSPOSE, BLIS_CONJUGATE, BLIS_NO_CONJUGATE, BLIS_NO_TRANSPOSE, FLA_free(), FLA_Fused_Ahx_Ax_opd_var1(), FLA_Fused_Gerc2_opd_var1(), FLA_Househ2_UT_l_opd(), FLA_malloc(), FLA_MINUS_ONE, FLA_ONE, FLA_TWO, and FLA_ZERO.
Referenced by FLA_Hess_UT_step_ofu_var2().
{
  /*
   * double kernel, variant 2 (uses the fused Ahx_Ax and Gerc2 helpers).
   * Performs m_T steps on the m_A-by-m_A matrix stored in buff_A with
   * strides rs_A/cs_A.  In each step a Householder transform is computed
   * from the sub-diagonal part of column i, applied to the remaining
   * parts of A, and column i of T is filled in.  Always returns
   * FLA_SUCCESS.
   */

  /* Constants 2, 1, 0 and -1 obtained through FLA_DOUBLE_PTR(). */
  double *buff_2  = FLA_DOUBLE_PTR( FLA_TWO );
  double *buff_1  = FLA_DOUBLE_PTR( FLA_ONE );
  double *buff_0  = FLA_DOUBLE_PTR( FLA_ZERO );
  double *buff_m1 = FLA_DOUBLE_PTR( FLA_MINUS_ONE );

  double  head_save;
  double  rho;
  double  beta, conj_beta;
  double  inv_tau11;
  double  minus_inv_tau11;

  /* Number of steps; FLA_Obj_length( T ) in the object-based API. */
  int     n_steps = m_T;
  int     i;

  /* Unit-stride scratch vectors y and z, m_A elements each. */
  double *buff_y = ( double* ) FLA_malloc( m_A * sizeof( *buff_A ) );
  double *buff_z = ( double* ) FLA_malloc( m_A * sizeof( *buff_A ) );
  int     inc_y  = 1;
  int     inc_z  = 1;

  for ( i = 0; i < n_steps; ++i )
  {
    /* Views into A, T and the scratch vectors for step i. */
    double *A20   = buff_A + (i+1)*rs_A;
    double *a21   = buff_A + i*cs_A + (i+1)*rs_A;
    double *A02   = buff_A + (i+1)*cs_A;
    double *a12t  = buff_A + (i+1)*cs_A + i*rs_A;
    double *A22   = buff_A + (i+1)*cs_A + (i+1)*rs_A;

    double *t01   = buff_T + i*cs_T;
    double *tau11 = buff_T + i*cs_T + i*rs_T;

    double *y0    = buff_y;
    double *y2    = buff_y + (i+1)*inc_y;
    double *z2    = buff_z + (i+1)*inc_z;

    /* a21 split into its first element and the rest. */
    double *a21_t = a21;
    double *a21_b = a21 + rs_A;

    int     m_ahead  = m_A - i - 1;
    int     n_ahead  = m_A - i - 1;
    int     m_behind = i;
    int     n_behind = i;

    /* Nothing below the diagonal: nothing to do for this step. */
    if ( m_ahead <= 0 )
      continue;

    /* Householder transform from ( a21_t; a21_b ); its scalar goes to tau11. */
    FLA_Househ2_UT_l_opd( m_ahead - 1,
                          a21_t,
                          a21_b, rs_A,
                          tau11 );

    /* inv_tau11 = 1 / tau11;  minus_inv_tau11 = -inv_tau11. */
    bli_ddiv3( buff_1, tau11, &inv_tau11 );
    bli_dneg2( &inv_tau11, &minus_inv_tau11 );

    /* Temporarily overwrite the head of a21 with 1; restored at the end. */
    head_save = *a21_t;
    *a21_t    = *buff_1;

    /* y2 = A22' * a21;  z2 = A22 * a21  (fused). */
    FLA_Fused_Ahx_Ax_opd_var1( m_ahead,
                               n_ahead,
                               A22, rs_A, cs_A,
                               a21, rs_A,
                               y2,  inc_y,
                               z2,  inc_z );

    /* beta = ( a21' * z2 ) / 2;  conj_beta = conj( beta ). */
    bli_ddot( BLIS_CONJUGATE,
              m_ahead,
              a21, rs_A,
              z2,  inc_z,
              &beta );
    bli_dinvscals( buff_2, &beta );
    bli_dcopyconj( &beta, &conj_beta );

    /* y2 = ( y2 - ( conj_beta / tau11 ) * a21 ) / tau11. */
    bli_dscals( &minus_inv_tau11, &conj_beta );
    bli_daxpyv( BLIS_NO_CONJUGATE,
                m_ahead,
                &conj_beta,
                a21, rs_A,
                y2,  inc_y );
    bli_dscalv( BLIS_NO_CONJUGATE,
                m_ahead,
                &inv_tau11,
                y2, inc_y );

    /* z2 = ( z2 - ( beta / tau11 ) * a21 ) / tau11. */
    bli_dscals( &minus_inv_tau11, &beta );
    bli_daxpyv( BLIS_NO_CONJUGATE,
                m_ahead,
                &beta,
                a21, rs_A,
                z2,  inc_z );
    bli_dscalv( BLIS_NO_CONJUGATE,
                m_ahead,
                &inv_tau11,
                z2, inc_z );

    /* a12t = a12t - ( ( a12t * a21 ) / tau11 ) * a21'. */
    bli_ddot( BLIS_NO_CONJUGATE,
              m_ahead,
              a12t, cs_A,
              a21,  rs_A,
              &rho );
    bli_dscals( &minus_inv_tau11, &rho );
    bli_daxpyv( BLIS_CONJUGATE,
                m_ahead,
                &rho,
                a21,  rs_A,
                a12t, cs_A );

    /* y0 = A02 * a21;  A02 = A02 - ( 1 / tau11 ) * y0 * a21'. */
    bli_dgemv( BLIS_NO_TRANSPOSE,
               BLIS_NO_CONJUGATE,
               m_behind,
               n_ahead,
               buff_1,
               A02, rs_A, cs_A,
               a21, rs_A,
               buff_0,
               y0, inc_y );
    bli_dger( BLIS_NO_CONJUGATE,
              BLIS_CONJUGATE,
              m_behind,
              n_ahead,
              &minus_inv_tau11,
              y0,  inc_y,
              a21, rs_A,
              A02, rs_A, cs_A );

    /* A22 = A22 - a21 * y2' - z2 * a21'  (fused). */
    FLA_Fused_Gerc2_opd_var1( m_ahead,
                              n_ahead,
                              buff_m1,
                              a21, rs_A,
                              y2,  inc_y,
                              z2,  inc_z,
                              a21, rs_A,
                              A22, rs_A, cs_A );

    /* t01 = A20' * a21. */
    bli_dgemv( BLIS_CONJ_TRANSPOSE,
               BLIS_NO_CONJUGATE,
               m_ahead,
               n_behind,
               buff_1,
               A20, rs_A, cs_A,
               a21, rs_A,
               buff_0,
               t01, rs_T );

    /* Put back the original head of a21. */
    *a21_t = head_save;
  }

  FLA_free( buff_y );
  FLA_free( buff_z );

  return FLA_SUCCESS;
}
FLA_Error FLA_Hess_UT_step_ofd_var3( int m_A, int m_T, double *buff_A, int rs_A, int cs_A, double *buff_T, int rs_T, int cs_T )
References bli_daxpyv(), bli_dcopyv(), bli_ddot(), bli_dgemv(), bli_dger(), bli_dscalv(), BLIS_CONJ_TRANSPOSE, BLIS_CONJUGATE, BLIS_NO_CONJUGATE, BLIS_NO_TRANSPOSE, FLA_free(), FLA_Fused_Ahx_Ax_opd_var1(), FLA_Fused_Gerc2_Ahx_Ax_opd_var1(), FLA_Fused_Gerc2_opd_var1(), FLA_Househ2_UT_l_opd(), FLA_malloc(), FLA_MINUS_ONE, FLA_ONE, FLA_TWO, and FLA_ZERO.
Referenced by FLA_Hess_UT_step_ofu_var3().
{
  /*
   * double kernel, variant 3.  Like variant 2 it performs m_T steps on
   * the m_A-by-m_A matrix in buff_A, but the rank-2 update of A22 is
   * deferred: the vectors u, y, z of one step are kept in scratch storage
   * and applied at the start of the next step (fused with the products
   * A22' * a21 and A22 * a21), with one final update after the last step.
   * Always returns FLA_SUCCESS.
   */

  /* Constants 2, 1, 0 and -1 obtained through FLA_DOUBLE_PTR(). */
  double *buff_2  = FLA_DOUBLE_PTR( FLA_TWO );
  double *buff_1  = FLA_DOUBLE_PTR( FLA_ONE );
  double *buff_0  = FLA_DOUBLE_PTR( FLA_ZERO );
  double *buff_m1 = FLA_DOUBLE_PTR( FLA_MINUS_ONE );

  double  head_save;
  double  rho;
  double  beta, conj_beta;
  double  inv_tau11;
  double  minus_inv_tau11;
  double  minus_upsilon1, minus_conj_upsilon1;
  double  minus_psi1, minus_conj_psi1;
  double  minus_zeta1;

  /* Number of steps; FLA_Obj_length( T ) in the object-based API. */
  int     n_steps = m_T;
  int     i;

  /* Unit-stride scratch vectors u, y, z, v, w, m_A elements each. */
  double *buff_u = ( double* ) FLA_malloc( m_A * sizeof( *buff_A ) );
  double *buff_y = ( double* ) FLA_malloc( m_A * sizeof( *buff_A ) );
  double *buff_z = ( double* ) FLA_malloc( m_A * sizeof( *buff_A ) );
  double *buff_v = ( double* ) FLA_malloc( m_A * sizeof( *buff_A ) );
  double *buff_w = ( double* ) FLA_malloc( m_A * sizeof( *buff_A ) );
  int     inc_u  = 1;
  int     inc_y  = 1;
  int     inc_z  = 1;
  int     inc_v  = 1;
  int     inc_w  = 1;

  for ( i = 0; i < n_steps; ++i )
  {
    /* Views into A, T and the scratch vectors for step i. */
    double *A20      = buff_A + (i+1)*rs_A;
    double *alpha11  = buff_A + i*cs_A + i*rs_A;
    double *a21      = buff_A + i*cs_A + (i+1)*rs_A;
    double *A02      = buff_A + (i+1)*cs_A;
    double *a12t     = buff_A + (i+1)*cs_A + i*rs_A;
    double *A22      = buff_A + (i+1)*cs_A + (i+1)*rs_A;

    double *t01      = buff_T + i*cs_T;
    double *tau11    = buff_T + i*cs_T + i*rs_T;

    double *upsilon1 = buff_u + i*inc_u;
    double *u2       = buff_u + (i+1)*inc_u;

    double *y0       = buff_y;
    double *psi1     = buff_y + i*inc_y;
    double *y2       = buff_y + (i+1)*inc_y;

    double *zeta1    = buff_z + i*inc_z;
    double *z2       = buff_z + (i+1)*inc_z;

    double *v2       = buff_v + (i+1)*inc_v;
    double *w2       = buff_w + (i+1)*inc_w;

    /* a21 split into its first element and the rest. */
    double *a21_t    = a21;
    double *a21_b    = a21 + rs_A;

    int     m_ahead  = m_A - i - 1;
    int     n_ahead  = m_A - i - 1;
    int     m_behind = i;
    int     n_behind = i;

    if ( m_behind > 0 )
    {
      /* Apply the update left over from the previous step to alpha11,
         a12t and a21, using the stored u, y and z vectors. */

      /* minus_upsilon1 = -upsilon1, plus its conjugate. */
      bli_dmult3( buff_m1, upsilon1, &minus_upsilon1 );
      bli_dcopyconj( &minus_upsilon1, &minus_conj_upsilon1 );

      /* minus_psi1 = -psi1, plus its conjugate. */
      bli_dmult3( buff_m1, psi1, &minus_psi1 );
      bli_dcopyconj( &minus_psi1, &minus_conj_psi1 );

      /* minus_zeta1 = -zeta1. */
      bli_dmult3( buff_m1, zeta1, &minus_zeta1 );

      /* alpha11 = alpha11 - upsilon1 * conj( psi1 ) - zeta1 * conj( upsilon1 ). */
      bli_daxpyv( BLIS_CONJUGATE,
                  1,
                  &minus_upsilon1,
                  psi1,    1,
                  alpha11, 1 );
      bli_daxpyv( BLIS_CONJUGATE,
                  1,
                  &minus_zeta1,
                  upsilon1, 1,
                  alpha11,  1 );

      /* a12t = a12t - upsilon1 * y2' - zeta1 * u2'. */
      bli_daxpyv( BLIS_CONJUGATE,
                  m_ahead,
                  &minus_upsilon1,
                  y2,   inc_y,
                  a12t, cs_A );
      bli_daxpyv( BLIS_CONJUGATE,
                  m_ahead,
                  &minus_zeta1,
                  u2,   inc_u,
                  a12t, cs_A );

      /* a21 = a21 - conj( psi1 ) * u2 - conj( upsilon1 ) * z2. */
      bli_daxpyv( BLIS_NO_CONJUGATE,
                  m_ahead,
                  &minus_conj_psi1,
                  u2,  inc_u,
                  a21, rs_A );
      bli_daxpyv( BLIS_NO_CONJUGATE,
                  m_ahead,
                  &minus_conj_upsilon1,
                  z2,  inc_z,
                  a21, rs_A );
    }

    /* Everything that follows needs at least one row below the diagonal. */
    if ( m_ahead <= 0 )
      continue;

    /* Householder transform from ( a21_t; a21_b ); its scalar goes to tau11. */
    FLA_Househ2_UT_l_opd( m_ahead - 1,
                          a21_t,
                          a21_b, rs_A,
                          tau11 );

    /* inv_tau11 = 1 / tau11;  minus_inv_tau11 = -inv_tau11. */
    bli_ddiv3( buff_1, tau11, &inv_tau11 );
    bli_dneg2( &inv_tau11, &minus_inv_tau11 );

    /* Temporarily overwrite the head of a21 with 1; restored below. */
    head_save = *a21_t;
    *a21_t    = *buff_1;

    if ( m_behind > 0 )
    {
      /* A22 = A22 - u2 * y2' - z2 * u2', then
         v2 = A22' * a21 and w2 = A22 * a21  (all fused). */
      FLA_Fused_Gerc2_Ahx_Ax_opd_var1( m_ahead,
                                       n_ahead,
                                       buff_m1,
                                       u2,  inc_u,
                                       y2,  inc_y,
                                       z2,  inc_z,
                                       A22, rs_A, cs_A,
                                       a21, rs_A,
                                       v2,  inc_v,
                                       w2,  inc_w );
    }
    else
    {
      /* First step: no pending update, only
         v2 = A22' * a21 and w2 = A22 * a21. */
      FLA_Fused_Ahx_Ax_opd_var1( m_ahead,
                                 n_ahead,
                                 A22, rs_A, cs_A,
                                 a21, rs_A,
                                 v2,  inc_v,
                                 w2,  inc_w );
    }

    /* u2 = a21;  y2 = v2;  z2 = w2. */
    bli_dcopyv( BLIS_NO_CONJUGATE,
                m_ahead,
                a21, rs_A,
                u2,  inc_u );
    bli_dcopyv( BLIS_NO_CONJUGATE,
                m_ahead,
                v2, inc_v,
                y2, inc_y );
    bli_dcopyv( BLIS_NO_CONJUGATE,
                m_ahead,
                w2, inc_w,
                z2, inc_z );

    /* beta = ( a21' * z2 ) / 2;  conj_beta = conj( beta ). */
    bli_ddot( BLIS_CONJUGATE,
              m_ahead,
              a21, rs_A,
              z2,  inc_z,
              &beta );
    bli_dinvscals( buff_2, &beta );
    bli_dcopyconj( &beta, &conj_beta );

    /* y2 = ( y2 - ( conj_beta / tau11 ) * a21 ) / tau11. */
    bli_dscals( &minus_inv_tau11, &conj_beta );
    bli_daxpyv( BLIS_NO_CONJUGATE,
                m_ahead,
                &conj_beta,
                a21, rs_A,
                y2,  inc_y );
    bli_dscalv( BLIS_NO_CONJUGATE,
                m_ahead,
                &inv_tau11,
                y2, inc_y );

    /* z2 = ( z2 - ( beta / tau11 ) * a21 ) / tau11. */
    bli_dscals( &minus_inv_tau11, &beta );
    bli_daxpyv( BLIS_NO_CONJUGATE,
                m_ahead,
                &beta,
                a21, rs_A,
                z2,  inc_z );
    bli_dscalv( BLIS_NO_CONJUGATE,
                m_ahead,
                &inv_tau11,
                z2, inc_z );

    /* a12t = a12t - ( ( a12t * a21 ) / tau11 ) * a21'. */
    bli_ddot( BLIS_NO_CONJUGATE,
              m_ahead,
              a12t, cs_A,
              a21,  rs_A,
              &rho );
    bli_dscals( &minus_inv_tau11, &rho );
    bli_daxpyv( BLIS_CONJUGATE,
                m_ahead,
                &rho,
                a21,  rs_A,
                a12t, cs_A );

    /* y0 = A02 * a21;  A02 = A02 - ( 1 / tau11 ) * y0 * a21'. */
    bli_dgemv( BLIS_NO_TRANSPOSE,
               BLIS_NO_CONJUGATE,
               m_behind,
               n_ahead,
               buff_1,
               A02, rs_A, cs_A,
               a21, rs_A,
               buff_0,
               y0, inc_y );
    bli_dger( BLIS_NO_CONJUGATE,
              BLIS_CONJUGATE,
              m_behind,
              n_ahead,
              &minus_inv_tau11,
              y0,  inc_y,
              a21, rs_A,
              A02, rs_A, cs_A );

    /* t01 = A20' * a21. */
    bli_dgemv( BLIS_CONJ_TRANSPOSE,
               BLIS_NO_CONJUGATE,
               m_ahead,
               n_behind,
               buff_1,
               A20, rs_A, cs_A,
               a21, rs_A,
               buff_0,
               t01, rs_T );

    /* Put back the original head of a21. */
    *a21_t = head_save;

    /* Last step: no later iteration will apply the pending update, so
       do it now:  A22 = A22 - u2 * y2' - z2 * u2'. */
    if ( m_behind + 1 == n_steps )
    {
      FLA_Fused_Gerc2_opd_var1( m_ahead,
                                n_ahead,
                                buff_m1,
                                u2,  inc_u,
                                y2,  inc_y,
                                z2,  inc_z,
                                u2,  inc_u,
                                A22, rs_A, cs_A );
    }
  }

  FLA_free( buff_u );
  FLA_free( buff_y );
  FLA_free( buff_z );
  FLA_free( buff_v );
  FLA_free( buff_w );

  return FLA_SUCCESS;
}
FLA_Error FLA_Hess_UT_step_ofd_var4( int m_A, int m_T,
                                     double* buff_A, int rs_A, int cs_A,
                                     double* buff_Y, int rs_Y, int cs_Y,
                                     double* buff_Z, int rs_Z, int cs_Z,
                                     double* buff_T, int rs_T, int cs_T )
References bli_daxpyv(), bli_ddot(), bli_dgemv(), bli_dger(), bli_dscalv(), bli_dsetm(), BLIS_CONJ_NO_TRANSPOSE, BLIS_CONJUGATE, BLIS_NO_CONJUGATE, BLIS_NO_TRANSPOSE, FLA_free(), FLA_Fused_Ahx_Ax_opd_var1(), FLA_Fused_Uhu_Yhu_Zhu_opd_var1(), FLA_Househ2_UT_l_opd(), FLA_malloc(), FLA_MINUS_ONE, FLA_ONE, FLA_TWO, and FLA_ZERO.
Referenced by FLA_Hess_UT_step_ofu_var4().
{
  /*
   * Double-precision real kernel. Judging by its name this is one panel
   * step of variant 4 of the UT Hessenberg reduction (TODO confirm against
   * the libflame documentation).
   *
   * A is addressed as an m_A x m_A matrix through (rs_A, cs_A). The loop
   * visits its first b_alg = m_T columns. Y and Z (m_A x m_T) are zeroed on
   * entry; column i of each (y21, z21) is filled during iteration i, and
   * column i of T (t01, tau11) is handed to the kernels below.
   *
   * Always returns FLA_SUCCESS.
   */
  double* buff_2  = FLA_DOUBLE_PTR( FLA_TWO );
  double* buff_1  = FLA_DOUBLE_PTR( FLA_ONE );
  double* buff_0  = FLA_DOUBLE_PTR( FLA_ZERO );
  double* buff_m1 = FLA_DOUBLE_PTR( FLA_MINUS_ONE );

  double  first_elem, last_elem;
  double  dot_product;
  double  beta, conj_beta;
  double  inv_tau11;
  double  minus_inv_tau11;
  int     i;

  // b_alg = FLA_Obj_length( T );
  int     b_alg = m_T;

  // Only the "e" workspace vector (length m_A) is allocated in this routine.
  // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &e );
  double* buff_e = ( double* ) FLA_malloc( m_A * sizeof( *buff_A ) );
  int     inc_e  = 1;

  // FLA_Set( FLA_ZERO, Y );
  // FLA_Set( FLA_ZERO, Z );
  bli_dsetm( m_A, b_alg, buff_0, buff_Y, rs_Y, cs_Y );
  bli_dsetm( m_A, b_alg, buff_0, buff_Z, rs_Z, cs_Z );

  for ( i = 0; i < b_alg; ++i )
  {
    int     m_ahead  = m_A - i - 1;
    int     n_ahead  = m_A - i - 1;
    int     m_behind = i;
    int     n_behind = i;

    double* a10t     = buff_A + (0  )*cs_A + (i  )*rs_A;
    double* A20      = buff_A + (0  )*cs_A + (i+1)*rs_A;
    double* alpha11  = buff_A + (i  )*cs_A + (i  )*rs_A;
    double* a21      = buff_A + (i  )*cs_A + (i+1)*rs_A;
    double* A02      = buff_A + (i+1)*cs_A + (0  )*rs_A;
    double* a12t     = buff_A + (i+1)*cs_A + (i  )*rs_A;
    double* A22      = buff_A + (i+1)*cs_A + (i+1)*rs_A;

    double* y10t     = buff_Y + (0  )*cs_Y + (i  )*rs_Y;
    double* Y20      = buff_Y + (0  )*cs_Y + (i+1)*rs_Y;
    double* y21      = buff_Y + (i  )*cs_Y + (i+1)*rs_Y;

    double* z10t     = buff_Z + (0  )*cs_Z + (i  )*rs_Z;
    double* Z20      = buff_Z + (0  )*cs_Z + (i+1)*rs_Z;
    double* z21      = buff_Z + (i  )*cs_Z + (i+1)*rs_Z;

    double* t01      = buff_T + (i  )*cs_T + (0  )*rs_T;
    double* tau11    = buff_T + (i  )*cs_T + (i  )*rs_T;

    double* e0       = buff_e + (0  )*inc_e;

    // Last element of a10t (row i, column i-1). It only exists when there
    // is at least one column behind; for i == 0 the pointer is left at a10t
    // and never dereferenced, so no address before the panel is formed.
    double* a10t_r   = ( m_behind > 0 ? a10t + (i-1)*cs_A : a10t );

    double* a21_t    = a21 + (0  )*cs_A + (0  )*rs_A;
    double* a21_b    = a21 + (0  )*cs_A + (1  )*rs_A;

    // ABL / ZBL: rows i..m_A-1 of the first i columns of A / Z.
    // a2: column i of A from the diagonal element downward.
    double* ABL      = a10t;
    double* ZBL      = z10t;
    double* a2       = alpha11;

    /*------------------------------------------------------------*/

    if ( m_behind > 0 )
    {
      // Temporarily overwrite the last element of a10t with one; the
      // original value is restored after the four gemv updates below.
      // FLA_Copy( a10t_r, last_elem );
      // FLA_Set( FLA_ONE, a10t_r );
      last_elem = *a10t_r;
      *a10t_r   = *buff_1;
    }

    // Update the current column a2 using the Y and Z columns built so far.
    // FLA_Gemvc( FLA_NO_TRANSPOSE, FLA_CONJUGATE, FLA_MINUS_ONE, ABL, y10t, FLA_ONE, a2 );
    // FLA_Gemvc( FLA_NO_TRANSPOSE, FLA_CONJUGATE, FLA_MINUS_ONE, ZBL, a10t, FLA_ONE, a2 );
    bli_dgemv( BLIS_NO_TRANSPOSE,
               BLIS_CONJUGATE,
               m_ahead + 1,
               n_behind,
               buff_m1,
               ABL,  rs_A, cs_A,
               y10t, cs_Y,
               buff_1,
               a2,   rs_A );
    bli_dgemv( BLIS_NO_TRANSPOSE,
               BLIS_CONJUGATE,
               m_ahead + 1,
               n_behind,
               buff_m1,
               ZBL,  rs_Z, cs_Z,
               a10t, cs_A,
               buff_1,
               a2,   rs_A );

    // Update the current row a12t in the same way.
    // FLA_Gemv( FLA_CONJ_NO_TRANSPOSE, FLA_MINUS_ONE, Y20, a10t, FLA_ONE, a12t );
    // FLA_Gemv( FLA_CONJ_NO_TRANSPOSE, FLA_MINUS_ONE, A20, z10t, FLA_ONE, a12t );
    bli_dgemv( BLIS_CONJ_NO_TRANSPOSE,
               BLIS_NO_CONJUGATE,
               m_ahead,
               n_behind,
               buff_m1,
               Y20,  rs_Y, cs_Y,
               a10t, cs_A,
               buff_1,
               a12t, cs_A );
    bli_dgemv( BLIS_CONJ_NO_TRANSPOSE,
               BLIS_NO_CONJUGATE,
               m_ahead,
               n_behind,
               buff_m1,
               A20,  rs_A, cs_A,
               z10t, cs_Z,
               buff_1,
               a12t, cs_A );

    if ( m_behind > 0 )
    {
      // FLA_Copy( last_elem, a10t_r );
      *a10t_r = last_elem;
    }

    if ( m_ahead > 0 )
    {
      // FLA_Househ2_UT( FLA_LEFT,
      //                 a21_t,
      //                 a21_b, tau11 );
      FLA_Househ2_UT_l_opd( m_ahead - 1,
                            a21_t,
                            a21_b, rs_A,
                            tau11 );

      // inv_tau11 = 1 / tau11; minus_inv_tau11 = -inv_tau11.
      // FLA_Set( FLA_ONE, inv_tau11 );
      // FLA_Inv_scalc( FLA_NO_CONJUGATE, tau11, inv_tau11 );
      // FLA_Copy( inv_tau11, minus_inv_tau11 );
      // FLA_Scal( FLA_MINUS_ONE, minus_inv_tau11 );
      bli_ddiv3( buff_1, tau11, &inv_tau11 );
      bli_dneg2( &inv_tau11, &minus_inv_tau11 );

      // Save the first element of a21 and replace it with one for the
      // computations below; it is restored at the end of this branch.
      // FLA_Copy( a21_t, first_elem );
      // FLA_Set( FLA_ONE, a21_t );
      first_elem = *a21_t;
      *a21_t     = *buff_1;

      // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A22, a21, FLA_ZERO, y21 );
      // FLA_Gemv( FLA_NO_TRANSPOSE,   FLA_ONE, A22, a21, FLA_ZERO, z21 );
      FLA_Fused_Ahx_Ax_opd_var1( m_ahead,
                                 n_ahead,
                                 A22, rs_A, cs_A,
                                 a21, rs_A,
                                 y21, rs_Y,
                                 z21, rs_Z );

      // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A20, a21, FLA_ZERO, d0 );
      // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, Y20, a21, FLA_ZERO, e0 );
      // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, Z20, a21, FLA_ZERO, f0 );
      // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, Y20, d0, FLA_ONE, y21 );
      // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, A20, f0, FLA_ONE, y21 );
      // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, A20, e0, FLA_ONE, z21 );
      // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, Z20, d0, FLA_ONE, z21 );
      // FLA_Copy( d0, t01 );
      FLA_Fused_Uhu_Yhu_Zhu_opd_var1( m_ahead,
                                      n_behind,
                                      buff_m1,
                                      A20, rs_A, cs_A,
                                      Y20, rs_Y, cs_Y,
                                      Z20, rs_Z, cs_Z,
                                      t01, rs_T,
                                      a21, rs_A,
                                      y21, rs_Y,
                                      z21, rs_Z );

      // beta = ( a21 . z21 ) / 2; conj_beta = conj( beta ).
      // FLA_Dotc( FLA_CONJUGATE, a21, z21, beta );
      // FLA_Inv_scal( FLA_TWO, beta );
      // FLA_Copyt( FLA_CONJ_NO_TRANSPOSE, beta, conj_beta );
      bli_ddot( BLIS_CONJUGATE,
                m_ahead,
                a21, rs_A,
                z21, rs_Z,
                &beta );
      bli_dinvscals( buff_2, &beta );
      bli_dcopyconj( &beta, &conj_beta );

      // y21 = ( y21 - conj_beta / tau11 * a21 ) / tau11.
      // FLA_Scal( minus_inv_tau11, conj_beta );
      // FLA_Axpy( conj_beta, a21, y21 );
      // FLA_Scal( inv_tau11, y21 );
      bli_dscals( &minus_inv_tau11, &conj_beta );
      bli_daxpyv( BLIS_NO_CONJUGATE,
                  m_ahead,
                  &conj_beta,
                  a21, rs_A,
                  y21, rs_Y );
      bli_dscalv( BLIS_NO_CONJUGATE,
                  m_ahead,
                  &inv_tau11,
                  y21, rs_Y );

      // z21 = ( z21 - beta / tau11 * a21 ) / tau11.
      // FLA_Scal( minus_inv_tau11, beta );
      // FLA_Axpy( beta, a21, z21 );
      // FLA_Scal( inv_tau11, z21 );
      bli_dscals( &minus_inv_tau11, &beta );
      bli_daxpyv( BLIS_NO_CONJUGATE,
                  m_ahead,
                  &beta,
                  a21, rs_A,
                  z21, rs_Z );
      bli_dscalv( BLIS_NO_CONJUGATE,
                  m_ahead,
                  &inv_tau11,
                  z21, rs_Z );

      // a12t = a12t - ( a12t . a21 ) / tau11 * a21'.
      // FLA_Dot( a12t, a21, dot_product );
      // FLA_Scal( minus_inv_tau11, dot_product );
      // FLA_Axpyt( FLA_CONJ_TRANSPOSE, dot_product, a21, a12t );
      bli_ddot( BLIS_NO_CONJUGATE,
                m_ahead,
                a12t, cs_A,
                a21,  rs_A,
                &dot_product );
      bli_dscals( &minus_inv_tau11, &dot_product );
      bli_daxpyv( BLIS_CONJUGATE,
                  m_ahead,
                  &dot_product,
                  a21,  rs_A,
                  a12t, cs_A );

      // A02 = A02 - ( A02 * a21 ) / tau11 * a21', with e0 as scratch.
      // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_ONE, A02, a21, FLA_ZERO, e0 );
      // FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, minus_inv_tau11, e0, a21, A02 );
      bli_dgemv( BLIS_NO_TRANSPOSE,
                 BLIS_NO_CONJUGATE,
                 m_behind,
                 n_ahead,
                 buff_1,
                 A02, rs_A, cs_A,
                 a21, rs_A,
                 buff_0,
                 e0,  inc_e );
      bli_dger( BLIS_NO_CONJUGATE,
                BLIS_CONJUGATE,
                m_behind,
                n_ahead,
                &minus_inv_tau11,
                e0,  inc_e,
                a21, rs_A,
                A02, rs_A, cs_A );

      // FLA_Copy( first_elem, a21_t );
      *a21_t = first_elem;
    }

    /*------------------------------------------------------------*/
  }

  // FLA_Obj_free( &e );
  FLA_free( buff_e );

  return FLA_SUCCESS;
}
FLA_Error FLA_Hess_UT_step_ofs_var1( int m_A, int m_T,
                                     float* buff_A, int rs_A, int cs_A,
                                     float* buff_T, int rs_T, int cs_T )
FLA_Error FLA_Hess_UT_step_ofs_var2( int m_A, int m_T,
                                     float* buff_A, int rs_A, int cs_A,
                                     float* buff_T, int rs_T, int cs_T )
References bli_saxpyv(), bli_sdot(), bli_sgemv(), bli_sger(), bli_sscalv(), BLIS_CONJ_TRANSPOSE, BLIS_CONJUGATE, BLIS_NO_CONJUGATE, BLIS_NO_TRANSPOSE, FLA_free(), FLA_Fused_Ahx_Ax_ops_var1(), FLA_Fused_Gerc2_ops_var1(), FLA_Househ2_UT_l_ops(), FLA_malloc(), FLA_MINUS_ONE, FLA_ONE, FLA_TWO, and FLA_ZERO.
Referenced by FLA_Hess_UT_step_ofu_var2().
{
  /*
   * Single-precision real kernel. Judging by its name this is one panel
   * step of variant 2 of the UT Hessenberg reduction (TODO confirm against
   * the libflame documentation).
   *
   * A is addressed as an m_A x m_A matrix through (rs_A, cs_A). The loop
   * visits its first b_alg = m_T columns and, for each column that still
   * has rows below the diagonal, calls the Householder kernel on a21 and
   * then updates a12t, A02 and A22 and fills column i of T (t01, tau11).
   *
   * Always returns FLA_SUCCESS.
   */
  float*  buff_2  = FLA_FLOAT_PTR( FLA_TWO );
  float*  buff_1  = FLA_FLOAT_PTR( FLA_ONE );
  float*  buff_0  = FLA_FLOAT_PTR( FLA_ZERO );
  float*  buff_m1 = FLA_FLOAT_PTR( FLA_MINUS_ONE );

  float   first_elem;
  float   dot_product;
  float   beta, conj_beta;
  float   inv_tau11;
  float   minus_inv_tau11;
  int     i;

  // b_alg = FLA_Obj_length( T );
  int     b_alg = m_T;

  // Two workspace vectors of length m_A.
  // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &y );
  // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &z );
  float*  buff_y = ( float* ) FLA_malloc( m_A * sizeof( *buff_A ) );
  float*  buff_z = ( float* ) FLA_malloc( m_A * sizeof( *buff_A ) );
  int     inc_y  = 1;
  int     inc_z  = 1;

  for ( i = 0; i < b_alg; ++i )
  {
    float*  A20      = buff_A + (0  )*cs_A + (i+1)*rs_A;
    float*  a21      = buff_A + (i  )*cs_A + (i+1)*rs_A;
    float*  A02      = buff_A + (i+1)*cs_A + (0  )*rs_A;
    float*  a12t     = buff_A + (i+1)*cs_A + (i  )*rs_A;
    float*  A22      = buff_A + (i+1)*cs_A + (i+1)*rs_A;

    float*  t01      = buff_T + (i  )*cs_T + (0  )*rs_T;
    float*  tau11    = buff_T + (i  )*cs_T + (i  )*rs_T;

    // y0 (first i entries) and y2 (entries from i+1 on) are disjoint
    // slices of the same workspace buffer.
    float*  y0       = buff_y + (0  )*inc_y;
    float*  y2       = buff_y + (i+1)*inc_y;

    float*  z2       = buff_z + (i+1)*inc_z;

    float*  a21_t    = a21 + (0  )*cs_A + (0  )*rs_A;
    float*  a21_b    = a21 + (0  )*cs_A + (1  )*rs_A;

    int     m_ahead  = m_A - i - 1;
    int     n_ahead  = m_A - i - 1;
    int     m_behind = i;
    int     n_behind = i;

    /*------------------------------------------------------------*/

    if ( m_ahead > 0 )
    {
      // FLA_Househ2_UT( FLA_LEFT,
      //                 a21_t,
      //                 a21_b, tau11 );
      FLA_Househ2_UT_l_ops( m_ahead - 1,
                            a21_t,
                            a21_b, rs_A,
                            tau11 );

      // inv_tau11 = 1 / tau11; minus_inv_tau11 = -inv_tau11.
      // FLA_Set( FLA_ONE, inv_tau11 );
      // FLA_Inv_scalc( FLA_NO_CONJUGATE, tau11, inv_tau11 );
      // FLA_Copy( inv_tau11, minus_inv_tau11 );
      // FLA_Scal( FLA_MINUS_ONE, minus_inv_tau11 );
      bli_sdiv3( buff_1, tau11, &inv_tau11 );
      bli_sneg2( &inv_tau11, &minus_inv_tau11 );

      // Save the first element of a21 and replace it with one for the
      // computations below; it is restored at the end of this branch.
      // FLA_Copy( a21_t, first_elem );
      // FLA_Set( FLA_ONE, a21_t );
      first_elem = *a21_t;
      *a21_t     = *buff_1;

      // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A22, a21, FLA_ZERO, y2 );
      // FLA_Gemv( FLA_NO_TRANSPOSE,   FLA_ONE, A22, a21, FLA_ZERO, z2 );
      FLA_Fused_Ahx_Ax_ops_var1( m_ahead,
                                 n_ahead,
                                 A22, rs_A, cs_A,
                                 a21, rs_A,
                                 y2,  inc_y,
                                 z2,  inc_z );

      // beta = ( a21 . z2 ) / 2; conj_beta = conj( beta ).
      // FLA_Dotc( FLA_CONJUGATE, a21, z2, beta );
      // FLA_Inv_scal( FLA_TWO, beta );
      // FLA_Copyt( FLA_CONJ_NO_TRANSPOSE, beta, conj_beta );
      bli_sdot( BLIS_CONJUGATE,
                m_ahead,
                a21, rs_A,
                z2,  inc_z,
                &beta );
      bli_sinvscals( buff_2, &beta );
      bli_scopyconj( &beta, &conj_beta );

      // y2 = ( y2 - conj_beta / tau11 * a21 ) / tau11.
      // FLA_Scal( minus_inv_tau11, conj_beta );
      // FLA_Axpy( conj_beta, a21, y2 );
      // FLA_Scal( inv_tau11, y2 );
      bli_sscals( &minus_inv_tau11, &conj_beta );
      bli_saxpyv( BLIS_NO_CONJUGATE,
                  m_ahead,
                  &conj_beta,
                  a21, rs_A,
                  y2,  inc_y );
      bli_sscalv( BLIS_NO_CONJUGATE,
                  m_ahead,
                  &inv_tau11,
                  y2, inc_y );

      // z2 = ( z2 - beta / tau11 * a21 ) / tau11.
      // FLA_Scal( minus_inv_tau11, beta );
      // FLA_Axpy( beta, a21, z2 );
      // FLA_Scal( inv_tau11, z2 );
      bli_sscals( &minus_inv_tau11, &beta );
      bli_saxpyv( BLIS_NO_CONJUGATE,
                  m_ahead,
                  &beta,
                  a21, rs_A,
                  z2,  inc_z );
      bli_sscalv( BLIS_NO_CONJUGATE,
                  m_ahead,
                  &inv_tau11,
                  z2, inc_z );

      // a12t = a12t - ( a12t . a21 ) / tau11 * a21'.
      // FLA_Dot( a12t, a21, dot_product );
      // FLA_Scal( minus_inv_tau11, dot_product );
      // FLA_Axpyt( FLA_CONJ_TRANSPOSE, dot_product, a21, a12t );
      bli_sdot( BLIS_NO_CONJUGATE,
                m_ahead,
                a12t, cs_A,
                a21,  rs_A,
                &dot_product );
      bli_sscals( &minus_inv_tau11, &dot_product );
      bli_saxpyv( BLIS_CONJUGATE,
                  m_ahead,
                  &dot_product,
                  a21,  rs_A,
                  a12t, cs_A );

      // A02 = A02 - ( A02 * a21 ) / tau11 * a21', with y0 as scratch.
      // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_ONE, A02, a21, FLA_ZERO, y0 );
      // FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, minus_inv_tau11, y0, a21, A02 );
      bli_sgemv( BLIS_NO_TRANSPOSE,
                 BLIS_NO_CONJUGATE,
                 m_behind,
                 n_ahead,
                 buff_1,
                 A02, rs_A, cs_A,
                 a21, rs_A,
                 buff_0,
                 y0,  inc_y );
      bli_sger( BLIS_NO_CONJUGATE,
                BLIS_CONJUGATE,
                m_behind,
                n_ahead,
                &minus_inv_tau11,
                y0,  inc_y,
                a21, rs_A,
                A02, rs_A, cs_A );

      // Rank-2 update of the trailing matrix.
      // FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, FLA_MINUS_ONE, a21, y2, A22 );
      // FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, FLA_MINUS_ONE, z2, a21, A22 );
      FLA_Fused_Gerc2_ops_var1( m_ahead,
                                n_ahead,
                                buff_m1,
                                a21, rs_A,
                                y2,  inc_y,
                                z2,  inc_z,
                                a21, rs_A,
                                A22, rs_A, cs_A );

      // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A20, a21, FLA_ZERO, t01 );
      bli_sgemv( BLIS_CONJ_TRANSPOSE,
                 BLIS_NO_CONJUGATE,
                 m_ahead,
                 n_behind,
                 buff_1,
                 A20, rs_A, cs_A,
                 a21, rs_A,
                 buff_0,
                 t01, rs_T );

      // FLA_Copy( first_elem, a21_t );
      *a21_t = first_elem;
    }

    /*------------------------------------------------------------*/
  }

  // FLA_Obj_free( &y );
  // FLA_Obj_free( &z );
  FLA_free( buff_y );
  FLA_free( buff_z );

  return FLA_SUCCESS;
}
FLA_Error FLA_Hess_UT_step_ofs_var3( int m_A, int m_T,
                                     float* buff_A, int rs_A, int cs_A,
                                     float* buff_T, int rs_T, int cs_T )
References bli_saxpyv(), bli_scopyv(), bli_sdot(), bli_sgemv(), bli_sger(), bli_sscalv(), BLIS_CONJ_TRANSPOSE, BLIS_CONJUGATE, BLIS_NO_CONJUGATE, BLIS_NO_TRANSPOSE, FLA_free(), FLA_Fused_Ahx_Ax_ops_var1(), FLA_Fused_Gerc2_Ahx_Ax_ops_var1(), FLA_Fused_Gerc2_ops_var1(), FLA_Househ2_UT_l_ops(), FLA_malloc(), FLA_MINUS_ONE, FLA_ONE, FLA_TWO, and FLA_ZERO.
Referenced by FLA_Hess_UT_step_ofu_var3().
{
  /*
   * Single-precision real kernel. Judging by its name this is one panel
   * step of variant 3 of the UT Hessenberg reduction (TODO confirm against
   * the libflame documentation).
   *
   * A is addressed as an m_A x m_A matrix through (rs_A, cs_A). The loop
   * visits its first b_alg = m_T columns. The vectors u, y, z carry the
   * previous iteration's update into the current one: it is first applied
   * to alpha11, a12t and a21, and then to A22 inside the fused kernel.
   * The update left over from the last iteration is applied to A22 by the
   * final FLA_Fused_Gerc2_ops_var1 call.
   *
   * Always returns FLA_SUCCESS.
   */
  float*  buff_2  = FLA_FLOAT_PTR( FLA_TWO );
  float*  buff_1  = FLA_FLOAT_PTR( FLA_ONE );
  float*  buff_0  = FLA_FLOAT_PTR( FLA_ZERO );
  float*  buff_m1 = FLA_FLOAT_PTR( FLA_MINUS_ONE );

  float   first_elem;
  float   dot_product;
  float   beta, conj_beta;
  float   inv_tau11;
  float   minus_inv_tau11;
  float   minus_upsilon1, minus_conj_upsilon1;
  float   minus_psi1, minus_conj_psi1;
  float   minus_zeta1;
  int     i;

  // b_alg = FLA_Obj_length( T );
  int     b_alg = m_T;

  // Five workspace vectors of length m_A.
  // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &u );
  // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &y );
  // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &z );
  // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &v );
  // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &w );
  float*  buff_u = ( float* ) FLA_malloc( m_A * sizeof( *buff_A ) );
  float*  buff_y = ( float* ) FLA_malloc( m_A * sizeof( *buff_A ) );
  float*  buff_z = ( float* ) FLA_malloc( m_A * sizeof( *buff_A ) );
  float*  buff_v = ( float* ) FLA_malloc( m_A * sizeof( *buff_A ) );
  float*  buff_w = ( float* ) FLA_malloc( m_A * sizeof( *buff_A ) );
  int     inc_u  = 1;
  int     inc_y  = 1;
  int     inc_z  = 1;
  int     inc_v  = 1;
  int     inc_w  = 1;

  for ( i = 0; i < b_alg; ++i )
  {
    float*  A20      = buff_A + (0  )*cs_A + (i+1)*rs_A;
    float*  alpha11  = buff_A + (i  )*cs_A + (i  )*rs_A;
    float*  a21      = buff_A + (i  )*cs_A + (i+1)*rs_A;
    float*  A02      = buff_A + (i+1)*cs_A + (0  )*rs_A;
    float*  a12t     = buff_A + (i+1)*cs_A + (i  )*rs_A;
    float*  A22      = buff_A + (i+1)*cs_A + (i+1)*rs_A;

    float*  t01      = buff_T + (i  )*cs_T + (0  )*rs_T;
    float*  tau11    = buff_T + (i  )*cs_T + (i  )*rs_T;

    float*  upsilon1 = buff_u + (i  )*inc_u;
    float*  u2       = buff_u + (i+1)*inc_u;

    float*  y0       = buff_y + (0  )*inc_y;
    float*  psi1     = buff_y + (i  )*inc_y;
    float*  y2       = buff_y + (i+1)*inc_y;

    float*  zeta1    = buff_z + (i  )*inc_z;
    float*  z2       = buff_z + (i+1)*inc_z;

    float*  v2       = buff_v + (i+1)*inc_v;

    float*  w2       = buff_w + (i+1)*inc_w;

    float*  a21_t    = a21 + (0  )*cs_A + (0  )*rs_A;
    float*  a21_b    = a21 + (0  )*cs_A + (1  )*rs_A;

    int     m_ahead  = m_A - i - 1;
    int     n_ahead  = m_A - i - 1;
    int     m_behind = i;
    int     n_behind = i;

    /*------------------------------------------------------------*/

    if ( m_behind > 0 )
    {
      // Apply the update saved in u, y, z by the previous iteration to
      // the current diagonal element, row and column.

      // FLA_Copy( upsilon1, minus_upsilon1 );
      // FLA_Scal( FLA_MINUS_ONE, minus_upsilon1 );
      // FLA_Copyt( FLA_CONJ_NO_TRANSPOSE, minus_upsilon1, minus_conj_upsilon1 );
      bli_smult3( buff_m1, upsilon1, &minus_upsilon1 );
      bli_scopyconj( &minus_upsilon1, &minus_conj_upsilon1 );

      // FLA_Copy( psi1, minus_psi1 );
      // FLA_Scal( FLA_MINUS_ONE, minus_psi1 );
      // FLA_Copyt( FLA_CONJ_NO_TRANSPOSE, minus_psi1, minus_conj_psi1 );
      bli_smult3( buff_m1, psi1, &minus_psi1 );
      bli_scopyconj( &minus_psi1, &minus_conj_psi1 );

      // FLA_Copy( zeta1, minus_zeta1 );
      // FLA_Scal( FLA_MINUS_ONE, minus_zeta1 );
      bli_smult3( buff_m1, zeta1, &minus_zeta1 );

      // FLA_Axpyt( FLA_CONJ_NO_TRANSPOSE, minus_upsilon1, psi1,     alpha11 );
      // FLA_Axpyt( FLA_CONJ_NO_TRANSPOSE, minus_zeta1,    upsilon1, alpha11 );
      bli_saxpyv( BLIS_CONJUGATE,
                  1,
                  &minus_upsilon1,
                  psi1,    1,
                  alpha11, 1 );
      bli_saxpyv( BLIS_CONJUGATE,
                  1,
                  &minus_zeta1,
                  upsilon1, 1,
                  alpha11,  1 );

      // FLA_Axpyt( FLA_CONJ_TRANSPOSE, minus_upsilon1, y2, a12t );
      // FLA_Axpyt( FLA_CONJ_TRANSPOSE, minus_zeta1,    u2, a12t );
      bli_saxpyv( BLIS_CONJUGATE,
                  m_ahead,
                  &minus_upsilon1,
                  y2,   inc_y,
                  a12t, cs_A );
      bli_saxpyv( BLIS_CONJUGATE,
                  m_ahead,
                  &minus_zeta1,
                  u2,   inc_u,
                  a12t, cs_A );

      // FLA_Axpyt( FLA_NO_TRANSPOSE, minus_conj_psi1,     u2, a21 );
      // FLA_Axpyt( FLA_NO_TRANSPOSE, minus_conj_upsilon1, z2, a21 );
      bli_saxpyv( BLIS_NO_CONJUGATE,
                  m_ahead,
                  &minus_conj_psi1,
                  u2,  inc_u,
                  a21, rs_A );
      bli_saxpyv( BLIS_NO_CONJUGATE,
                  m_ahead,
                  &minus_conj_upsilon1,
                  z2,  inc_z,
                  a21, rs_A );
    }

    if ( m_ahead > 0 )
    {
      // FLA_Househ2_UT( FLA_LEFT,
      //                 a21_t,
      //                 a21_b, tau11 );
      FLA_Househ2_UT_l_ops( m_ahead - 1,
                            a21_t,
                            a21_b, rs_A,
                            tau11 );

      // inv_tau11 = 1 / tau11; minus_inv_tau11 = -inv_tau11.
      // FLA_Set( FLA_ONE, inv_tau11 );
      // FLA_Inv_scalc( FLA_NO_CONJUGATE, tau11, inv_tau11 );
      // FLA_Copy( inv_tau11, minus_inv_tau11 );
      // FLA_Scal( FLA_MINUS_ONE, minus_inv_tau11 );
      bli_sdiv3( buff_1, tau11, &inv_tau11 );
      bli_sneg2( &inv_tau11, &minus_inv_tau11 );

      // Save the first element of a21 and replace it with one for the
      // computations below; it is restored further down.
      // FLA_Copy( a21_t, first_elem );
      // FLA_Set( FLA_ONE, a21_t );
      first_elem = *a21_t;
      *a21_t     = *buff_1;
    }

    if ( m_behind > 0 && m_ahead > 0 )
    {
      // Apply the previous iteration's rank-2 update to A22 and form
      // v2 and w2 in one fused call.
      // FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, FLA_MINUS_ONE, u2, y2, A22 );
      // FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, FLA_MINUS_ONE, z2, u2, A22 );
      // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A22, a21, FLA_ZERO, v2 );
      // FLA_Gemv( FLA_NO_TRANSPOSE,   FLA_ONE, A22, a21, FLA_ZERO, w2 );
      FLA_Fused_Gerc2_Ahx_Ax_ops_var1( m_ahead,
                                       n_ahead,
                                       buff_m1,
                                       u2,  inc_u,
                                       y2,  inc_y,
                                       z2,  inc_z,
                                       A22, rs_A, cs_A,
                                       a21, rs_A,
                                       v2,  inc_v,
                                       w2,  inc_w );
    }
    else if ( m_ahead > 0 )
    {
      // First iteration: no pending update, only form v2 and w2.
      // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A22, a21, FLA_ZERO, v2 );
      // FLA_Gemv( FLA_NO_TRANSPOSE,   FLA_ONE, A22, a21, FLA_ZERO, w2 );
      FLA_Fused_Ahx_Ax_ops_var1( m_ahead,
                                 n_ahead,
                                 A22, rs_A, cs_A,
                                 a21, rs_A,
                                 v2,  inc_v,
                                 w2,  inc_w );
    }

    if ( m_ahead > 0 )
    {
      // FLA_Copy( a21, u2 );
      // FLA_Copy( v2, y2 );
      // FLA_Copy( w2, z2 );
      bli_scopyv( BLIS_NO_CONJUGATE,
                  m_ahead,
                  a21, rs_A,
                  u2,  inc_u );
      bli_scopyv( BLIS_NO_CONJUGATE,
                  m_ahead,
                  v2, inc_v,
                  y2, inc_y );
      bli_scopyv( BLIS_NO_CONJUGATE,
                  m_ahead,
                  w2, inc_w,
                  z2, inc_z );

      // beta = ( a21 . z2 ) / 2; conj_beta = conj( beta ).
      // FLA_Dotc( FLA_CONJUGATE, a21, z2, beta );
      // FLA_Inv_scal( FLA_TWO, beta );
      // FLA_Copyt( FLA_CONJ_NO_TRANSPOSE, beta, conj_beta );
      bli_sdot( BLIS_CONJUGATE,
                m_ahead,
                a21, rs_A,
                z2,  inc_z,
                &beta );
      bli_sinvscals( buff_2, &beta );
      bli_scopyconj( &beta, &conj_beta );

      // y2 = ( y2 - conj_beta / tau11 * a21 ) / tau11.
      // FLA_Scal( minus_inv_tau11, conj_beta );
      // FLA_Axpy( conj_beta, a21, y2 );
      // FLA_Scal( inv_tau11, y2 );
      bli_sscals( &minus_inv_tau11, &conj_beta );
      bli_saxpyv( BLIS_NO_CONJUGATE,
                  m_ahead,
                  &conj_beta,
                  a21, rs_A,
                  y2,  inc_y );
      bli_sscalv( BLIS_NO_CONJUGATE,
                  m_ahead,
                  &inv_tau11,
                  y2, inc_y );

      // z2 = ( z2 - beta / tau11 * a21 ) / tau11.
      // FLA_Scal( minus_inv_tau11, beta );
      // FLA_Axpy( beta, a21, z2 );
      // FLA_Scal( inv_tau11, z2 );
      bli_sscals( &minus_inv_tau11, &beta );
      bli_saxpyv( BLIS_NO_CONJUGATE,
                  m_ahead,
                  &beta,
                  a21, rs_A,
                  z2,  inc_z );
      bli_sscalv( BLIS_NO_CONJUGATE,
                  m_ahead,
                  &inv_tau11,
                  z2, inc_z );

      // a12t = a12t - ( a12t . a21 ) / tau11 * a21'.
      // FLA_Dot( a12t, a21, dot_product );
      // FLA_Scal( minus_inv_tau11, dot_product );
      // FLA_Axpyt( FLA_CONJ_TRANSPOSE, dot_product, a21, a12t );
      bli_sdot( BLIS_NO_CONJUGATE,
                m_ahead,
                a12t, cs_A,
                a21,  rs_A,
                &dot_product );
      bli_sscals( &minus_inv_tau11, &dot_product );
      bli_saxpyv( BLIS_CONJUGATE,
                  m_ahead,
                  &dot_product,
                  a21,  rs_A,
                  a12t, cs_A );

      // A02 = A02 - ( A02 * a21 ) / tau11 * a21', with y0 as scratch.
      // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_ONE, A02, a21, FLA_ZERO, y0 );
      // FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, minus_inv_tau11, y0, a21, A02 );
      bli_sgemv( BLIS_NO_TRANSPOSE,
                 BLIS_NO_CONJUGATE,
                 m_behind,
                 n_ahead,
                 buff_1,
                 A02, rs_A, cs_A,
                 a21, rs_A,
                 buff_0,
                 y0,  inc_y );
      bli_sger( BLIS_NO_CONJUGATE,
                BLIS_CONJUGATE,
                m_behind,
                n_ahead,
                &minus_inv_tau11,
                y0,  inc_y,
                a21, rs_A,
                A02, rs_A, cs_A );

      // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A20, a21, FLA_ZERO, t01 );
      bli_sgemv( BLIS_CONJ_TRANSPOSE,
                 BLIS_NO_CONJUGATE,
                 m_ahead,
                 n_behind,
                 buff_1,
                 A20, rs_A, cs_A,
                 a21, rs_A,
                 buff_0,
                 t01, rs_T );

      // FLA_Copy( first_elem, a21_t );
      *a21_t = first_elem;
    }

    if ( m_behind + 1 == b_alg && m_ahead > 0 )
    {
      // Last iteration: no later pass will consume u2, y2, z2, so apply
      // the pending rank-2 update to A22 now.
      // FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, FLA_MINUS_ONE, u2, y2, A22 );
      // FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, FLA_MINUS_ONE, z2, u2, A22 );
      FLA_Fused_Gerc2_ops_var1( m_ahead,
                                n_ahead,
                                buff_m1,
                                u2,  inc_u,
                                y2,  inc_y,
                                z2,  inc_z,
                                u2,  inc_u,
                                A22, rs_A, cs_A );
    }

    /*------------------------------------------------------------*/
  }

  // FLA_Obj_free( &u );
  // FLA_Obj_free( &y );
  // FLA_Obj_free( &z );
  // FLA_Obj_free( &v );
  // FLA_Obj_free( &w );
  FLA_free( buff_u );
  FLA_free( buff_y );
  FLA_free( buff_z );
  FLA_free( buff_v );
  FLA_free( buff_w );

  return FLA_SUCCESS;
}
FLA_Error FLA_Hess_UT_step_ofs_var4( int m_A, int m_T,
                                     float* buff_A, int rs_A, int cs_A,
                                     float* buff_Y, int rs_Y, int cs_Y,
                                     float* buff_Z, int rs_Z, int cs_Z,
                                     float* buff_T, int rs_T, int cs_T )
References bli_saxpyv(), bli_sdot(), bli_sgemv(), bli_sger(), bli_sscalv(), bli_ssetm(), BLIS_CONJ_NO_TRANSPOSE, BLIS_CONJUGATE, BLIS_NO_CONJUGATE, BLIS_NO_TRANSPOSE, FLA_free(), FLA_Fused_Ahx_Ax_ops_var1(), FLA_Fused_Uhu_Yhu_Zhu_ops_var1(), FLA_Househ2_UT_l_ops(), FLA_malloc(), FLA_MINUS_ONE, FLA_ONE, FLA_TWO, and FLA_ZERO.
Referenced by FLA_Hess_UT_step_ofu_var4().
{
  /*
   * Single-precision real kernel. Judging by its name this is one panel
   * step of variant 4 of the UT Hessenberg reduction (TODO confirm against
   * the libflame documentation).
   *
   * A is addressed as an m_A x m_A matrix through (rs_A, cs_A). The loop
   * visits its first b_alg = m_T columns. Y and Z (m_A x m_T) are zeroed on
   * entry; column i of each (y21, z21) is filled during iteration i, and
   * column i of T (t01, tau11) is handed to the kernels below.
   *
   * Always returns FLA_SUCCESS.
   */
  float*  buff_2  = FLA_FLOAT_PTR( FLA_TWO );
  float*  buff_1  = FLA_FLOAT_PTR( FLA_ONE );
  float*  buff_0  = FLA_FLOAT_PTR( FLA_ZERO );
  float*  buff_m1 = FLA_FLOAT_PTR( FLA_MINUS_ONE );

  float   first_elem, last_elem;
  float   dot_product;
  float   beta, conj_beta;
  float   inv_tau11;
  float   minus_inv_tau11;
  int     i;

  // b_alg = FLA_Obj_length( T );
  int     b_alg = m_T;

  // Only the "e" workspace vector (length m_A) is allocated in this routine.
  // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &e );
  float*  buff_e = ( float* ) FLA_malloc( m_A * sizeof( *buff_A ) );
  int     inc_e  = 1;

  // FLA_Set( FLA_ZERO, Y );
  // FLA_Set( FLA_ZERO, Z );
  bli_ssetm( m_A, b_alg, buff_0, buff_Y, rs_Y, cs_Y );
  bli_ssetm( m_A, b_alg, buff_0, buff_Z, rs_Z, cs_Z );

  for ( i = 0; i < b_alg; ++i )
  {
    int     m_ahead  = m_A - i - 1;
    int     n_ahead  = m_A - i - 1;
    int     m_behind = i;
    int     n_behind = i;

    float*  a10t     = buff_A + (0  )*cs_A + (i  )*rs_A;
    float*  A20      = buff_A + (0  )*cs_A + (i+1)*rs_A;
    float*  alpha11  = buff_A + (i  )*cs_A + (i  )*rs_A;
    float*  a21      = buff_A + (i  )*cs_A + (i+1)*rs_A;
    float*  A02      = buff_A + (i+1)*cs_A + (0  )*rs_A;
    float*  a12t     = buff_A + (i+1)*cs_A + (i  )*rs_A;
    float*  A22      = buff_A + (i+1)*cs_A + (i+1)*rs_A;

    float*  y10t     = buff_Y + (0  )*cs_Y + (i  )*rs_Y;
    float*  Y20      = buff_Y + (0  )*cs_Y + (i+1)*rs_Y;
    float*  y21      = buff_Y + (i  )*cs_Y + (i+1)*rs_Y;

    float*  z10t     = buff_Z + (0  )*cs_Z + (i  )*rs_Z;
    float*  Z20      = buff_Z + (0  )*cs_Z + (i+1)*rs_Z;
    float*  z21      = buff_Z + (i  )*cs_Z + (i+1)*rs_Z;

    float*  t01      = buff_T + (i  )*cs_T + (0  )*rs_T;
    float*  tau11    = buff_T + (i  )*cs_T + (i  )*rs_T;

    float*  e0       = buff_e + (0  )*inc_e;

    // Last element of a10t (row i, column i-1). It only exists when there
    // is at least one column behind; for i == 0 the pointer is left at a10t
    // and never dereferenced, so no address before the panel is formed.
    float*  a10t_r   = ( m_behind > 0 ? a10t + (i-1)*cs_A : a10t );

    float*  a21_t    = a21 + (0  )*cs_A + (0  )*rs_A;
    float*  a21_b    = a21 + (0  )*cs_A + (1  )*rs_A;

    // ABL / ZBL: rows i..m_A-1 of the first i columns of A / Z.
    // a2: column i of A from the diagonal element downward.
    float*  ABL      = a10t;
    float*  ZBL      = z10t;
    float*  a2       = alpha11;

    /*------------------------------------------------------------*/

    if ( m_behind > 0 )
    {
      // Temporarily overwrite the last element of a10t with one; the
      // original value is restored after the four gemv updates below.
      // FLA_Copy( a10t_r, last_elem );
      // FLA_Set( FLA_ONE, a10t_r );
      last_elem = *a10t_r;
      *a10t_r   = *buff_1;
    }

    // Update the current column a2 using the Y and Z columns built so far.
    // FLA_Gemvc( FLA_NO_TRANSPOSE, FLA_CONJUGATE, FLA_MINUS_ONE, ABL, y10t, FLA_ONE, a2 );
    // FLA_Gemvc( FLA_NO_TRANSPOSE, FLA_CONJUGATE, FLA_MINUS_ONE, ZBL, a10t, FLA_ONE, a2 );
    bli_sgemv( BLIS_NO_TRANSPOSE,
               BLIS_CONJUGATE,
               m_ahead + 1,
               n_behind,
               buff_m1,
               ABL,  rs_A, cs_A,
               y10t, cs_Y,
               buff_1,
               a2,   rs_A );
    bli_sgemv( BLIS_NO_TRANSPOSE,
               BLIS_CONJUGATE,
               m_ahead + 1,
               n_behind,
               buff_m1,
               ZBL,  rs_Z, cs_Z,
               a10t, cs_A,
               buff_1,
               a2,   rs_A );

    // Update the current row a12t in the same way.
    // FLA_Gemv( FLA_CONJ_NO_TRANSPOSE, FLA_MINUS_ONE, Y20, a10t, FLA_ONE, a12t );
    // FLA_Gemv( FLA_CONJ_NO_TRANSPOSE, FLA_MINUS_ONE, A20, z10t, FLA_ONE, a12t );
    bli_sgemv( BLIS_CONJ_NO_TRANSPOSE,
               BLIS_NO_CONJUGATE,
               m_ahead,
               n_behind,
               buff_m1,
               Y20,  rs_Y, cs_Y,
               a10t, cs_A,
               buff_1,
               a12t, cs_A );
    bli_sgemv( BLIS_CONJ_NO_TRANSPOSE,
               BLIS_NO_CONJUGATE,
               m_ahead,
               n_behind,
               buff_m1,
               A20,  rs_A, cs_A,
               z10t, cs_Z,
               buff_1,
               a12t, cs_A );

    if ( m_behind > 0 )
    {
      // FLA_Copy( last_elem, a10t_r );
      *a10t_r = last_elem;
    }

    if ( m_ahead > 0 )
    {
      // FLA_Househ2_UT( FLA_LEFT,
      //                 a21_t,
      //                 a21_b, tau11 );
      FLA_Househ2_UT_l_ops( m_ahead - 1,
                            a21_t,
                            a21_b, rs_A,
                            tau11 );

      // inv_tau11 = 1 / tau11; minus_inv_tau11 = -inv_tau11.
      // FLA_Set( FLA_ONE, inv_tau11 );
      // FLA_Inv_scalc( FLA_NO_CONJUGATE, tau11, inv_tau11 );
      // FLA_Copy( inv_tau11, minus_inv_tau11 );
      // FLA_Scal( FLA_MINUS_ONE, minus_inv_tau11 );
      bli_sdiv3( buff_1, tau11, &inv_tau11 );
      bli_sneg2( &inv_tau11, &minus_inv_tau11 );

      // Save the first element of a21 and replace it with one for the
      // computations below; it is restored at the end of this branch.
      // FLA_Copy( a21_t, first_elem );
      // FLA_Set( FLA_ONE, a21_t );
      first_elem = *a21_t;
      *a21_t     = *buff_1;

      // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A22, a21, FLA_ZERO, y21 );
      // FLA_Gemv( FLA_NO_TRANSPOSE,   FLA_ONE, A22, a21, FLA_ZERO, z21 );
      FLA_Fused_Ahx_Ax_ops_var1( m_ahead,
                                 n_ahead,
                                 A22, rs_A, cs_A,
                                 a21, rs_A,
                                 y21, rs_Y,
                                 z21, rs_Z );

      // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A20, a21, FLA_ZERO, d0 );
      // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, Y20, a21, FLA_ZERO, e0 );
      // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, Z20, a21, FLA_ZERO, f0 );
      // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, Y20, d0, FLA_ONE, y21 );
      // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, A20, f0, FLA_ONE, y21 );
      // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, A20, e0, FLA_ONE, z21 );
      // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, Z20, d0, FLA_ONE, z21 );
      // FLA_Copy( d0, t01 );
      FLA_Fused_Uhu_Yhu_Zhu_ops_var1( m_ahead,
                                      n_behind,
                                      buff_m1,
                                      A20, rs_A, cs_A,
                                      Y20, rs_Y, cs_Y,
                                      Z20, rs_Z, cs_Z,
                                      t01, rs_T,
                                      a21, rs_A,
                                      y21, rs_Y,
                                      z21, rs_Z );

      // beta = ( a21 . z21 ) / 2; conj_beta = conj( beta ).
      // FLA_Dotc( FLA_CONJUGATE, a21, z21, beta );
      // FLA_Inv_scal( FLA_TWO, beta );
      // FLA_Copyt( FLA_CONJ_NO_TRANSPOSE, beta, conj_beta );
      bli_sdot( BLIS_CONJUGATE,
                m_ahead,
                a21, rs_A,
                z21, rs_Z,
                &beta );
      bli_sinvscals( buff_2, &beta );
      bli_scopyconj( &beta, &conj_beta );

      // y21 = ( y21 - conj_beta / tau11 * a21 ) / tau11.
      // FLA_Scal( minus_inv_tau11, conj_beta );
      // FLA_Axpy( conj_beta, a21, y21 );
      // FLA_Scal( inv_tau11, y21 );
      bli_sscals( &minus_inv_tau11, &conj_beta );
      bli_saxpyv( BLIS_NO_CONJUGATE,
                  m_ahead,
                  &conj_beta,
                  a21, rs_A,
                  y21, rs_Y );
      bli_sscalv( BLIS_NO_CONJUGATE,
                  m_ahead,
                  &inv_tau11,
                  y21, rs_Y );

      // z21 = ( z21 - beta / tau11 * a21 ) / tau11.
      // FLA_Scal( minus_inv_tau11, beta );
      // FLA_Axpy( beta, a21, z21 );
      // FLA_Scal( inv_tau11, z21 );
      bli_sscals( &minus_inv_tau11, &beta );
      bli_saxpyv( BLIS_NO_CONJUGATE,
                  m_ahead,
                  &beta,
                  a21, rs_A,
                  z21, rs_Z );
      bli_sscalv( BLIS_NO_CONJUGATE,
                  m_ahead,
                  &inv_tau11,
                  z21, rs_Z );

      // a12t = a12t - ( a12t . a21 ) / tau11 * a21'.
      // FLA_Dot( a12t, a21, dot_product );
      // FLA_Scal( minus_inv_tau11, dot_product );
      // FLA_Axpyt( FLA_CONJ_TRANSPOSE, dot_product, a21, a12t );
      bli_sdot( BLIS_NO_CONJUGATE,
                m_ahead,
                a12t, cs_A,
                a21,  rs_A,
                &dot_product );
      bli_sscals( &minus_inv_tau11, &dot_product );
      bli_saxpyv( BLIS_CONJUGATE,
                  m_ahead,
                  &dot_product,
                  a21,  rs_A,
                  a12t, cs_A );

      // A02 = A02 - ( A02 * a21 ) / tau11 * a21', with e0 as scratch.
      // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_ONE, A02, a21, FLA_ZERO, e0 );
      // FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, minus_inv_tau11, e0, a21, A02 );
      bli_sgemv( BLIS_NO_TRANSPOSE,
                 BLIS_NO_CONJUGATE,
                 m_behind,
                 n_ahead,
                 buff_1,
                 A02, rs_A, cs_A,
                 a21, rs_A,
                 buff_0,
                 e0,  inc_e );
      bli_sger( BLIS_NO_CONJUGATE,
                BLIS_CONJUGATE,
                m_behind,
                n_ahead,
                &minus_inv_tau11,
                e0,  inc_e,
                a21, rs_A,
                A02, rs_A, cs_A );

      // FLA_Copy( first_elem, a21_t );
      *a21_t = first_elem;
    }

    /*------------------------------------------------------------*/
  }

  // FLA_Obj_free( &e );
  FLA_free( buff_e );

  return FLA_SUCCESS;
}
FLA_Error FLA_Hess_UT_step_ofu_var1( FLA_Obj A, FLA_Obj T )
FLA_Error FLA_Hess_UT_step_ofu_var2( FLA_Obj A, FLA_Obj T )
References FLA_Hess_UT_step_ofc_var2(), FLA_Hess_UT_step_ofd_var2(), FLA_Hess_UT_step_ofs_var2(), FLA_Hess_UT_step_ofz_var2(), FLA_Obj_col_stride(), FLA_Obj_datatype(), FLA_Obj_length(), and FLA_Obj_row_stride().
Referenced by FLA_Hess_UT_blf_var2(), and FLA_Hess_UT_ofu_var2().
{
  /*
   * Type dispatcher: reads the datatype, length and strides of A and T and
   * forwards the raw buffers to the matching precision-specific variant-2
   * kernel. A datatype that matches none of the four branches falls
   * through without calling anything. Always returns FLA_SUCCESS.
   */
  const FLA_Datatype dt  = FLA_Obj_datatype( A );

  const int          m_A = FLA_Obj_length( A );
  const int          m_T = FLA_Obj_length( T );

  const int          rs_A = FLA_Obj_row_stride( A );
  const int          cs_A = FLA_Obj_col_stride( A );

  const int          rs_T = FLA_Obj_row_stride( T );
  const int          cs_T = FLA_Obj_col_stride( T );

  if ( dt == FLA_FLOAT )
  {
    float* a_buf = FLA_FLOAT_PTR( A );
    float* t_buf = FLA_FLOAT_PTR( T );

    FLA_Hess_UT_step_ofs_var2( m_A, m_T,
                               a_buf, rs_A, cs_A,
                               t_buf, rs_T, cs_T );
  }
  else if ( dt == FLA_DOUBLE )
  {
    double* a_buf = FLA_DOUBLE_PTR( A );
    double* t_buf = FLA_DOUBLE_PTR( T );

    FLA_Hess_UT_step_ofd_var2( m_A, m_T,
                               a_buf, rs_A, cs_A,
                               t_buf, rs_T, cs_T );
  }
  else if ( dt == FLA_COMPLEX )
  {
    scomplex* a_buf = FLA_COMPLEX_PTR( A );
    scomplex* t_buf = FLA_COMPLEX_PTR( T );

    FLA_Hess_UT_step_ofc_var2( m_A, m_T,
                               a_buf, rs_A, cs_A,
                               t_buf, rs_T, cs_T );
  }
  else if ( dt == FLA_DOUBLE_COMPLEX )
  {
    dcomplex* a_buf = FLA_DOUBLE_COMPLEX_PTR( A );
    dcomplex* t_buf = FLA_DOUBLE_COMPLEX_PTR( T );

    FLA_Hess_UT_step_ofz_var2( m_A, m_T,
                               a_buf, rs_A, cs_A,
                               t_buf, rs_T, cs_T );
  }

  return FLA_SUCCESS;
}
FLA_Error FLA_Hess_UT_step_ofu_var3( FLA_Obj A, FLA_Obj T )
References FLA_Hess_UT_step_ofc_var3(), FLA_Hess_UT_step_ofd_var3(), FLA_Hess_UT_step_ofs_var3(), FLA_Hess_UT_step_ofz_var3(), FLA_Obj_col_stride(), FLA_Obj_datatype(), FLA_Obj_length(), and FLA_Obj_row_stride().
Referenced by FLA_Hess_UT_blf_var3(), and FLA_Hess_UT_ofu_var3().
{
  /*
   * Type dispatcher: reads the datatype, length and strides of A and T and
   * forwards the raw buffers to the matching precision-specific variant-3
   * kernel. A datatype that matches none of the four branches falls
   * through without calling anything. Always returns FLA_SUCCESS.
   */
  const FLA_Datatype dt  = FLA_Obj_datatype( A );

  const int          m_A = FLA_Obj_length( A );
  const int          m_T = FLA_Obj_length( T );

  const int          rs_A = FLA_Obj_row_stride( A );
  const int          cs_A = FLA_Obj_col_stride( A );

  const int          rs_T = FLA_Obj_row_stride( T );
  const int          cs_T = FLA_Obj_col_stride( T );

  if ( dt == FLA_FLOAT )
  {
    float* a_buf = FLA_FLOAT_PTR( A );
    float* t_buf = FLA_FLOAT_PTR( T );

    FLA_Hess_UT_step_ofs_var3( m_A, m_T,
                               a_buf, rs_A, cs_A,
                               t_buf, rs_T, cs_T );
  }
  else if ( dt == FLA_DOUBLE )
  {
    double* a_buf = FLA_DOUBLE_PTR( A );
    double* t_buf = FLA_DOUBLE_PTR( T );

    FLA_Hess_UT_step_ofd_var3( m_A, m_T,
                               a_buf, rs_A, cs_A,
                               t_buf, rs_T, cs_T );
  }
  else if ( dt == FLA_COMPLEX )
  {
    scomplex* a_buf = FLA_COMPLEX_PTR( A );
    scomplex* t_buf = FLA_COMPLEX_PTR( T );

    FLA_Hess_UT_step_ofc_var3( m_A, m_T,
                               a_buf, rs_A, cs_A,
                               t_buf, rs_T, cs_T );
  }
  else if ( dt == FLA_DOUBLE_COMPLEX )
  {
    dcomplex* a_buf = FLA_DOUBLE_COMPLEX_PTR( A );
    dcomplex* t_buf = FLA_DOUBLE_COMPLEX_PTR( T );

    FLA_Hess_UT_step_ofz_var3( m_A, m_T,
                               a_buf, rs_A, cs_A,
                               t_buf, rs_T, cs_T );
  }

  return FLA_SUCCESS;
}
FLA_Error FLA_Hess_UT_step_ofu_var4( FLA_Obj A, FLA_Obj Y, FLA_Obj Z, FLA_Obj T )
References FLA_Hess_UT_step_ofc_var4(), FLA_Hess_UT_step_ofd_var4(), FLA_Hess_UT_step_ofs_var4(), FLA_Hess_UT_step_ofz_var4(), FLA_Obj_col_stride(), FLA_Obj_datatype(), FLA_Obj_length(), and FLA_Obj_row_stride().
Referenced by FLA_Hess_UT_blf_var4(), and FLA_Hess_UT_ofu_var4().
{
  /*
   * Type dispatcher: reads the datatype of A, the lengths of A and T, and
   * the strides of A, Y, Z and T, then forwards the raw buffers to the
   * matching precision-specific variant-4 kernel. A datatype that matches
   * none of the four branches falls through without calling anything.
   * Always returns FLA_SUCCESS.
   */
  const FLA_Datatype dt  = FLA_Obj_datatype( A );

  const int          m_A = FLA_Obj_length( A );
  const int          m_T = FLA_Obj_length( T );

  const int          rs_A = FLA_Obj_row_stride( A );
  const int          cs_A = FLA_Obj_col_stride( A );

  const int          rs_Y = FLA_Obj_row_stride( Y );
  const int          cs_Y = FLA_Obj_col_stride( Y );

  const int          rs_Z = FLA_Obj_row_stride( Z );
  const int          cs_Z = FLA_Obj_col_stride( Z );

  const int          rs_T = FLA_Obj_row_stride( T );
  const int          cs_T = FLA_Obj_col_stride( T );

  if ( dt == FLA_FLOAT )
  {
    float* a_buf = FLA_FLOAT_PTR( A );
    float* y_buf = FLA_FLOAT_PTR( Y );
    float* z_buf = FLA_FLOAT_PTR( Z );
    float* t_buf = FLA_FLOAT_PTR( T );

    FLA_Hess_UT_step_ofs_var4( m_A, m_T,
                               a_buf, rs_A, cs_A,
                               y_buf, rs_Y, cs_Y,
                               z_buf, rs_Z, cs_Z,
                               t_buf, rs_T, cs_T );
  }
  else if ( dt == FLA_DOUBLE )
  {
    double* a_buf = FLA_DOUBLE_PTR( A );
    double* y_buf = FLA_DOUBLE_PTR( Y );
    double* z_buf = FLA_DOUBLE_PTR( Z );
    double* t_buf = FLA_DOUBLE_PTR( T );

    FLA_Hess_UT_step_ofd_var4( m_A, m_T,
                               a_buf, rs_A, cs_A,
                               y_buf, rs_Y, cs_Y,
                               z_buf, rs_Z, cs_Z,
                               t_buf, rs_T, cs_T );
  }
  else if ( dt == FLA_COMPLEX )
  {
    scomplex* a_buf = FLA_COMPLEX_PTR( A );
    scomplex* y_buf = FLA_COMPLEX_PTR( Y );
    scomplex* z_buf = FLA_COMPLEX_PTR( Z );
    scomplex* t_buf = FLA_COMPLEX_PTR( T );

    FLA_Hess_UT_step_ofc_var4( m_A, m_T,
                               a_buf, rs_A, cs_A,
                               y_buf, rs_Y, cs_Y,
                               z_buf, rs_Z, cs_Z,
                               t_buf, rs_T, cs_T );
  }
  else if ( dt == FLA_DOUBLE_COMPLEX )
  {
    dcomplex* a_buf = FLA_DOUBLE_COMPLEX_PTR( A );
    dcomplex* y_buf = FLA_DOUBLE_COMPLEX_PTR( Y );
    dcomplex* z_buf = FLA_DOUBLE_COMPLEX_PTR( Z );
    dcomplex* t_buf = FLA_DOUBLE_COMPLEX_PTR( T );

    FLA_Hess_UT_step_ofz_var4( m_A, m_T,
                               a_buf, rs_A, cs_A,
                               y_buf, rs_Y, cs_Y,
                               z_buf, rs_Z, cs_Z,
                               t_buf, rs_T, cs_T );
  }

  return FLA_SUCCESS;
}
FLA_Error FLA_Hess_UT_step_ofz_var1( int m_A, int m_T,
                                     dcomplex* buff_A, int rs_A, int cs_A,
                                     dcomplex* buff_T, int rs_T, int cs_T )
FLA_Error FLA_Hess_UT_step_ofz_var2 | ( | int | m_A, |
int | m_T, | ||
dcomplex * | buff_A, | ||
int | rs_A, | ||
int | cs_A, | ||
dcomplex * | buff_T, | ||
int | rs_T, | ||
int | cs_T | ||
) |
References bli_zaxpyv(), bli_zdot(), bli_zgemv(), bli_zger(), bli_zscalv(), BLIS_CONJ_TRANSPOSE, BLIS_CONJUGATE, BLIS_NO_CONJUGATE, BLIS_NO_TRANSPOSE, FLA_free(), FLA_Fused_Ahx_Ax_opz_var1(), FLA_Fused_Gerc2_opz_var1(), FLA_Househ2_UT_l_opz(), FLA_malloc(), FLA_MINUS_ONE, FLA_ONE, FLA_TWO, and FLA_ZERO.
Referenced by FLA_Hess_UT_step_ofu_var2().
{ dcomplex* buff_2 = FLA_DOUBLE_COMPLEX_PTR( FLA_TWO ); dcomplex* buff_1 = FLA_DOUBLE_COMPLEX_PTR( FLA_ONE ); dcomplex* buff_0 = FLA_DOUBLE_COMPLEX_PTR( FLA_ZERO ); dcomplex* buff_m1 = FLA_DOUBLE_COMPLEX_PTR( FLA_MINUS_ONE ); dcomplex first_elem; dcomplex dot_product; dcomplex beta, conj_beta; dcomplex inv_tau11; dcomplex minus_inv_tau11; int i; // b_alg = FLA_Obj_length( T ); int b_alg = m_T; // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &y ); // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &z ); dcomplex* buff_y = ( dcomplex* ) FLA_malloc( m_A * sizeof( *buff_A ) ); dcomplex* buff_z = ( dcomplex* ) FLA_malloc( m_A * sizeof( *buff_A ) ); int inc_y = 1; int inc_z = 1; for ( i = 0; i < b_alg; ++i ) { dcomplex* A20 = buff_A + (0 )*cs_A + (i+1)*rs_A; dcomplex* a21 = buff_A + (i )*cs_A + (i+1)*rs_A; dcomplex* A02 = buff_A + (i+1)*cs_A + (0 )*rs_A; dcomplex* a12t = buff_A + (i+1)*cs_A + (i )*rs_A; dcomplex* A22 = buff_A + (i+1)*cs_A + (i+1)*rs_A; dcomplex* t01 = buff_T + (i )*cs_T + (0 )*rs_T; dcomplex* tau11 = buff_T + (i )*cs_T + (i )*rs_T; dcomplex* y0 = buff_y + (0 )*inc_y; dcomplex* y2 = buff_y + (i+1)*inc_y; dcomplex* z2 = buff_z + (i+1)*inc_z; dcomplex* a21_t = a21 + (0 )*cs_A + (0 )*rs_A; dcomplex* a21_b = a21 + (0 )*cs_A + (1 )*rs_A; int m_ahead = m_A - i - 1; int n_ahead = m_A - i - 1; int m_behind = i; int n_behind = i; /*------------------------------------------------------------*/ if ( m_ahead > 0 ) { // FLA_Househ2_UT( FLA_LEFT, // a21_t, // a21_b, tau11 ); FLA_Househ2_UT_l_opz( m_ahead - 1, a21_t, a21_b, rs_A, tau11 ); // FLA_Set( FLA_ONE, inv_tau11 ); // FLA_Inv_scalc( FLA_NO_CONJUGATE, tau11, inv_tau11 ); // FLA_Copy( inv_tau11, minus_inv_tau11 ); // FLA_Scal( FLA_MINUS_ONE, minus_inv_tau11 ); bli_zdiv3( buff_1, tau11, &inv_tau11 ); bli_zneg2( &inv_tau11, &minus_inv_tau11 ); // FLA_Copy( a21_t, first_elem ); // FLA_Set( FLA_ONE, a21_t ); first_elem = *a21_t; *a21_t = *buff_1; // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A22, a21, FLA_ZERO, y2 ); // FLA_Gemv( 
FLA_NO_TRANSPOSE, FLA_ONE, A22, a21, FLA_ZERO, z2 ); FLA_Fused_Ahx_Ax_opz_var1( m_ahead, n_ahead, A22, rs_A, cs_A, a21, rs_A, y2, inc_y, z2, inc_z ); // FLA_Dotc( FLA_CONJUGATE, a21, z2, beta ); // FLA_Inv_scal( FLA_TWO, beta ); // FLA_Copyt( FLA_CONJ_NO_TRANSPOSE, beta, conj_beta ); bli_zdot( BLIS_CONJUGATE, m_ahead, a21, rs_A, z2, inc_z, &beta ); bli_zinvscals( buff_2, &beta ); bli_zcopyconj( &beta, &conj_beta ); // FLA_Scal( minus_inv_tau11, conj_beta ); // FLA_Axpy( conj_beta, a21, y2 ); // FLA_Scal( inv_tau11, y2 ); bli_zscals( &minus_inv_tau11, &conj_beta ); bli_zaxpyv( BLIS_NO_CONJUGATE, m_ahead, &conj_beta, a21, rs_A, y2, inc_y ); bli_zscalv( BLIS_NO_CONJUGATE, m_ahead, &inv_tau11, y2, inc_y ); // FLA_Scal( minus_inv_tau11, beta ); // FLA_Axpy( beta, a21, z2 ); // FLA_Scal( inv_tau11, z2 ); bli_zscals( &minus_inv_tau11, &beta ); bli_zaxpyv( BLIS_NO_CONJUGATE, m_ahead, &beta, a21, rs_A, z2, inc_z ); bli_zscalv( BLIS_NO_CONJUGATE, m_ahead, &inv_tau11, z2, inc_z ); // FLA_Dot( a12t, a21, dot_product ); // FLA_Scal( minus_inv_tau11, dot_product ); // FLA_Axpyt( FLA_CONJ_TRANSPOSE, dot_product, a21, a12t ); bli_zdot( BLIS_NO_CONJUGATE, m_ahead, a12t, cs_A, a21, rs_A, &dot_product ); bli_zscals( &minus_inv_tau11, &dot_product ); bli_zaxpyv( BLIS_CONJUGATE, m_ahead, &dot_product, a21, rs_A, a12t, cs_A ); // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_ONE, A02, a21, FLA_ZERO, y0 ); // FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, minus_inv_tau11, y0, a21, A02 ); bli_zgemv( BLIS_NO_TRANSPOSE, BLIS_NO_CONJUGATE, m_behind, n_ahead, buff_1, A02, rs_A, cs_A, a21, rs_A, buff_0, y0, inc_y ); bli_zger( BLIS_NO_CONJUGATE, BLIS_CONJUGATE, m_behind, n_ahead, &minus_inv_tau11, y0, inc_y, a21, rs_A, A02, rs_A, cs_A ); // FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, FLA_MINUS_ONE, a21, y2, A22 ); // FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, FLA_MINUS_ONE, z2, a21, A22 ); FLA_Fused_Gerc2_opz_var1( m_ahead, n_ahead, buff_m1, a21, rs_A, y2, inc_y, z2, inc_z, a21, rs_A, A22, rs_A, cs_A ); // 
FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A20, a21, FLA_ZERO, t01 ); bli_zgemv( BLIS_CONJ_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_behind, buff_1, A20, rs_A, cs_A, a21, rs_A, buff_0, t01, rs_T ); // FLA_Copy( first_elem, a21_t ); *a21_t = first_elem; } /*------------------------------------------------------------*/ } // FLA_Obj_free( &y ); // FLA_Obj_free( &z ); FLA_free( buff_y ); FLA_free( buff_z ); return FLA_SUCCESS; }
FLA_Error FLA_Hess_UT_step_ofz_var3 | ( | int | m_A, |
int | m_T, | ||
dcomplex * | buff_A, | ||
int | rs_A, | ||
int | cs_A, | ||
dcomplex * | buff_T, | ||
int | rs_T, | ||
int | cs_T | ||
) |
References bli_zaxpyv(), bli_zcopyv(), bli_zdot(), bli_zgemv(), bli_zger(), bli_zscalv(), BLIS_CONJ_TRANSPOSE, BLIS_CONJUGATE, BLIS_NO_CONJUGATE, BLIS_NO_TRANSPOSE, FLA_free(), FLA_Fused_Ahx_Ax_opz_var1(), FLA_Fused_Gerc2_Ahx_Ax_opz_var1(), FLA_Fused_Gerc2_opz_var1(), FLA_Househ2_UT_l_opz(), FLA_malloc(), FLA_MINUS_ONE, FLA_ONE, FLA_TWO, and FLA_ZERO.
Referenced by FLA_Hess_UT_step_ofu_var3().
{ dcomplex* buff_2 = FLA_DOUBLE_COMPLEX_PTR( FLA_TWO ); dcomplex* buff_1 = FLA_DOUBLE_COMPLEX_PTR( FLA_ONE ); dcomplex* buff_0 = FLA_DOUBLE_COMPLEX_PTR( FLA_ZERO ); dcomplex* buff_m1 = FLA_DOUBLE_COMPLEX_PTR( FLA_MINUS_ONE ); dcomplex first_elem; dcomplex dot_product; dcomplex beta, conj_beta; dcomplex inv_tau11; dcomplex minus_inv_tau11; dcomplex minus_upsilon1, minus_conj_upsilon1; dcomplex minus_psi1, minus_conj_psi1; dcomplex minus_zeta1; int i; // b_alg = FLA_Obj_length( T ); int b_alg = m_T; // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &u ); // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &y ); // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &z ); // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &v ); // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &w ); dcomplex* buff_u = ( dcomplex* ) FLA_malloc( m_A * sizeof( *buff_A ) ); dcomplex* buff_y = ( dcomplex* ) FLA_malloc( m_A * sizeof( *buff_A ) ); dcomplex* buff_z = ( dcomplex* ) FLA_malloc( m_A * sizeof( *buff_A ) ); dcomplex* buff_v = ( dcomplex* ) FLA_malloc( m_A * sizeof( *buff_A ) ); dcomplex* buff_w = ( dcomplex* ) FLA_malloc( m_A * sizeof( *buff_A ) ); int inc_u = 1; int inc_y = 1; int inc_z = 1; int inc_v = 1; int inc_w = 1; for ( i = 0; i < b_alg; ++i ) { dcomplex* A20 = buff_A + (0 )*cs_A + (i+1)*rs_A; dcomplex* alpha11 = buff_A + (i )*cs_A + (i )*rs_A; dcomplex* a21 = buff_A + (i )*cs_A + (i+1)*rs_A; dcomplex* A02 = buff_A + (i+1)*cs_A + (0 )*rs_A; dcomplex* a12t = buff_A + (i+1)*cs_A + (i )*rs_A; dcomplex* A22 = buff_A + (i+1)*cs_A + (i+1)*rs_A; dcomplex* t01 = buff_T + (i )*cs_T + (0 )*rs_T; dcomplex* tau11 = buff_T + (i )*cs_T + (i )*rs_T; dcomplex* upsilon1 = buff_u + (i )*inc_u; dcomplex* u2 = buff_u + (i+1)*inc_u; dcomplex* y0 = buff_y + (0 )*inc_y; dcomplex* psi1 = buff_y + (i )*inc_y; dcomplex* y2 = buff_y + (i+1)*inc_y; dcomplex* zeta1 = buff_z + (i )*inc_z; dcomplex* z2 = buff_z + (i+1)*inc_z; dcomplex* v2 = buff_v + (i+1)*inc_v; dcomplex* w2 = buff_w + (i+1)*inc_w; dcomplex* a21_t = a21 + (0 )*cs_A + 
(0 )*rs_A; dcomplex* a21_b = a21 + (0 )*cs_A + (1 )*rs_A; int m_ahead = m_A - i - 1; int n_ahead = m_A - i - 1; int m_behind = i; int n_behind = i; /*------------------------------------------------------------*/ if ( m_behind > 0 ) { // FLA_Copy( upsilon1, minus_upsilon1 ); // FLA_Scal( FLA_MINUS_ONE, minus_upsilon1 ); // FLA_Copyt( FLA_CONJ_NO_TRANSPOSE, minus_upsilon1, minus_conj_upsilon1 ); bli_zmult3( buff_m1, upsilon1, &minus_upsilon1 ); bli_zcopyconj( &minus_upsilon1, &minus_conj_upsilon1 ); // FLA_Copy( psi1, minus_psi1 ); // FLA_Scal( FLA_MINUS_ONE, minus_psi1 ); // FLA_Copyt( FLA_CONJ_NO_TRANSPOSE, minus_psi1, minus_conj_psi1 ); bli_zmult3( buff_m1, psi1, &minus_psi1 ); bli_zcopyconj( &minus_psi1, &minus_conj_psi1 ); // FLA_Copy( zeta1, minus_zeta1 ); // FLA_Scal( FLA_MINUS_ONE, minus_zeta1 ); bli_zmult3( buff_m1, zeta1, &minus_zeta1 ); // FLA_Axpyt( FLA_CONJ_NO_TRANSPOSE, minus_upsilon1, psi1, alpha11 ); // FLA_Axpyt( FLA_CONJ_NO_TRANSPOSE, minus_zeta1, upsilon1, alpha11 ); bli_zaxpyv( BLIS_CONJUGATE, 1, &minus_upsilon1, psi1, 1, alpha11, 1 ); bli_zaxpyv( BLIS_CONJUGATE, 1, &minus_zeta1, upsilon1, 1, alpha11, 1 ); // FLA_Axpyt( FLA_CONJ_TRANSPOSE, minus_upsilon1, y2, a12t ); // FLA_Axpyt( FLA_CONJ_TRANSPOSE, minus_zeta1, u2, a12t ); bli_zaxpyv( BLIS_CONJUGATE, m_ahead, &minus_upsilon1, y2, inc_y, a12t, cs_A ); bli_zaxpyv( BLIS_CONJUGATE, m_ahead, &minus_zeta1, u2, inc_u, a12t, cs_A ); // FLA_Axpyt( FLA_NO_TRANSPOSE, minus_conj_psi1, u2, a21 ); // FLA_Axpyt( FLA_NO_TRANSPOSE, minus_conj_upsilon1, z2, a21 ); bli_zaxpyv( BLIS_NO_CONJUGATE, m_ahead, &minus_conj_psi1, u2, inc_u, a21, rs_A ); bli_zaxpyv( BLIS_NO_CONJUGATE, m_ahead, &minus_conj_upsilon1, z2, inc_z, a21, rs_A ); } if ( m_ahead > 0 ) { // FLA_Househ2_UT( FLA_LEFT, // a21_t, // a21_b, tau11 ); FLA_Househ2_UT_l_opz( m_ahead - 1, a21_t, a21_b, rs_A, tau11 ); // FLA_Set( FLA_ONE, inv_tau11 ); // FLA_Inv_scalc( FLA_NO_CONJUGATE, tau11, inv_tau11 ); // FLA_Copy( inv_tau11, minus_inv_tau11 ); // 
FLA_Scal( FLA_MINUS_ONE, minus_inv_tau11 ); bli_zdiv3( buff_1, tau11, &inv_tau11 ); bli_zneg2( &inv_tau11, &minus_inv_tau11 ); // FLA_Copy( a21_t, first_elem ); // FLA_Set( FLA_ONE, a21_t ); first_elem = *a21_t; *a21_t = *buff_1; } if ( m_behind > 0 && m_ahead > 0 ) { // FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, FLA_MINUS_ONE, u2, y2, A22 ); // FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, FLA_MINUS_ONE, z2, u2, A22 ); // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A22, a21, FLA_ZERO, v2 ); // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_ONE, A22, a21, FLA_ZERO, w2 ); FLA_Fused_Gerc2_Ahx_Ax_opz_var1( m_ahead, n_ahead, buff_m1, u2, inc_u, y2, inc_y, z2, inc_z, A22, rs_A, cs_A, a21, rs_A, v2, inc_v, w2, inc_w ); } else if ( m_ahead > 0 ) { // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A22, a21, FLA_ZERO, v2 ); // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_ONE, A22, a21, FLA_ZERO, w2 ); FLA_Fused_Ahx_Ax_opz_var1( m_ahead, n_ahead, A22, rs_A, cs_A, a21, rs_A, v2, inc_v, w2, inc_w ); } if ( m_ahead > 0 ) { // FLA_Copy( a21, u2 ); // FLA_Copy( v2, y2 ); // FLA_Copy( w2, z2 ); bli_zcopyv( BLIS_NO_CONJUGATE, m_ahead, a21, rs_A, u2, inc_u ); bli_zcopyv( BLIS_NO_CONJUGATE, m_ahead, v2, inc_v, y2, inc_y ); bli_zcopyv( BLIS_NO_CONJUGATE, m_ahead, w2, inc_w, z2, inc_z ); // FLA_Dotc( FLA_CONJUGATE, a21, z2, beta ); // FLA_Inv_scal( FLA_TWO, beta ); // FLA_Copyt( FLA_CONJ_NO_TRANSPOSE, beta, conj_beta ); bli_zdot( BLIS_CONJUGATE, m_ahead, a21, rs_A, z2, inc_z, &beta ); bli_zinvscals( buff_2, &beta ); bli_zcopyconj( &beta, &conj_beta ); // FLA_Scal( minus_inv_tau11, conj_beta ); // FLA_Axpy( conj_beta, a21, y2 ); // FLA_Scal( inv_tau11, y2 ); bli_zscals( &minus_inv_tau11, &conj_beta ); bli_zaxpyv( BLIS_NO_CONJUGATE, m_ahead, &conj_beta, a21, rs_A, y2, inc_y ); bli_zscalv( BLIS_NO_CONJUGATE, m_ahead, &inv_tau11, y2, inc_y ); // FLA_Scal( minus_inv_tau11, beta ); // FLA_Axpy( beta, a21, z2 ); // FLA_Scal( inv_tau11, z2 ); bli_zscals( &minus_inv_tau11, &beta ); bli_zaxpyv( BLIS_NO_CONJUGATE, m_ahead, &beta, a21, 
rs_A, z2, inc_z ); bli_zscalv( BLIS_NO_CONJUGATE, m_ahead, &inv_tau11, z2, inc_z ); // FLA_Dot( a12t, a21, dot_product ); // FLA_Scal( minus_inv_tau11, dot_product ); // FLA_Axpyt( FLA_CONJ_TRANSPOSE, dot_product, a21, a12t ); bli_zdot( BLIS_NO_CONJUGATE, m_ahead, a12t, cs_A, a21, rs_A, &dot_product ); bli_zscals( &minus_inv_tau11, &dot_product ); bli_zaxpyv( BLIS_CONJUGATE, m_ahead, &dot_product, a21, rs_A, a12t, cs_A ); // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_ONE, A02, a21, FLA_ZERO, y0 ); // FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, minus_inv_tau11, y0, a21, A02 ); bli_zgemv( BLIS_NO_TRANSPOSE, BLIS_NO_CONJUGATE, m_behind, n_ahead, buff_1, A02, rs_A, cs_A, a21, rs_A, buff_0, y0, inc_y ); bli_zger( BLIS_NO_CONJUGATE, BLIS_CONJUGATE, m_behind, n_ahead, &minus_inv_tau11, y0, inc_y, a21, rs_A, A02, rs_A, cs_A ); // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A20, a21, FLA_ZERO, t01 ); bli_zgemv( BLIS_CONJ_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_behind, buff_1, A20, rs_A, cs_A, a21, rs_A, buff_0, t01, rs_T ); // FLA_Copy( first_elem, a21_t ); *a21_t = first_elem; } if ( m_behind + 1 == b_alg && m_ahead > 0 ) { // FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, FLA_MINUS_ONE, u2, y2, A22 ); // FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, FLA_MINUS_ONE, z2, u2, A22 ); FLA_Fused_Gerc2_opz_var1( m_ahead, n_ahead, buff_m1, u2, inc_u, y2, inc_y, z2, inc_z, u2, inc_u, A22, rs_A, cs_A ); } /*------------------------------------------------------------*/ } // FLA_Obj_free( &u ); // FLA_Obj_free( &y ); // FLA_Obj_free( &z ); // FLA_Obj_free( &v ); // FLA_Obj_free( &w ); FLA_free( buff_u ); FLA_free( buff_y ); FLA_free( buff_z ); FLA_free( buff_v ); FLA_free( buff_w ); return FLA_SUCCESS; }
FLA_Error FLA_Hess_UT_step_ofz_var4 | ( | int | m_A, |
int | m_T, | ||
dcomplex * | buff_A, | ||
int | rs_A, | ||
int | cs_A, | ||
dcomplex * | buff_Y, | ||
int | rs_Y, | ||
int | cs_Y, | ||
dcomplex * | buff_Z, | ||
int | rs_Z, | ||
int | cs_Z, | ||
dcomplex * | buff_T, | ||
int | rs_T, | ||
int | cs_T | ||
) |
References bli_zaxpyv(), bli_zdot(), bli_zgemv(), bli_zger(), bli_zscalv(), bli_zsetm(), BLIS_CONJ_NO_TRANSPOSE, BLIS_CONJUGATE, BLIS_NO_CONJUGATE, BLIS_NO_TRANSPOSE, FLA_free(), FLA_Fused_Ahx_Ax_opz_var1(), FLA_Fused_Uhu_Yhu_Zhu_opz_var1(), FLA_Househ2_UT_l_opz(), FLA_malloc(), FLA_MINUS_ONE, FLA_ONE, FLA_TWO, and FLA_ZERO.
Referenced by FLA_Hess_UT_step_ofu_var4().
{ dcomplex* buff_2 = FLA_DOUBLE_COMPLEX_PTR( FLA_TWO ); dcomplex* buff_1 = FLA_DOUBLE_COMPLEX_PTR( FLA_ONE ); dcomplex* buff_0 = FLA_DOUBLE_COMPLEX_PTR( FLA_ZERO ); dcomplex* buff_m1 = FLA_DOUBLE_COMPLEX_PTR( FLA_MINUS_ONE ); dcomplex first_elem, last_elem; dcomplex dot_product; dcomplex beta, conj_beta; dcomplex inv_tau11; dcomplex minus_inv_tau11; int i; // b_alg = FLA_Obj_length( T ); int b_alg = m_T; // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &d ); // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &e ); // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &f ); dcomplex* buff_e = ( dcomplex* ) FLA_malloc( m_A * sizeof( *buff_A ) ); int inc_e = 1; // FLA_Set( FLA_ZERO, Y ); // FLA_Set( FLA_ZERO, Z ); bli_zsetm( m_A, b_alg, buff_0, buff_Y, rs_Y, cs_Y ); bli_zsetm( m_A, b_alg, buff_0, buff_Z, rs_Z, cs_Z ); for ( i = 0; i < b_alg; ++i ) { dcomplex* a10t = buff_A + (0 )*cs_A + (i )*rs_A; dcomplex* A20 = buff_A + (0 )*cs_A + (i+1)*rs_A; dcomplex* alpha11 = buff_A + (i )*cs_A + (i )*rs_A; dcomplex* a21 = buff_A + (i )*cs_A + (i+1)*rs_A; dcomplex* A02 = buff_A + (i+1)*cs_A + (0 )*rs_A; dcomplex* a12t = buff_A + (i+1)*cs_A + (i )*rs_A; dcomplex* A22 = buff_A + (i+1)*cs_A + (i+1)*rs_A; dcomplex* y10t = buff_Y + (0 )*cs_Y + (i )*rs_Y; dcomplex* Y20 = buff_Y + (0 )*cs_Y + (i+1)*rs_Y; dcomplex* y21 = buff_Y + (i )*cs_Y + (i+1)*rs_Y; dcomplex* z10t = buff_Z + (0 )*cs_Z + (i )*rs_Z; dcomplex* Z20 = buff_Z + (0 )*cs_Z + (i+1)*rs_Z; dcomplex* z21 = buff_Z + (i )*cs_Z + (i+1)*rs_Z; dcomplex* t01 = buff_T + (i )*cs_T + (0 )*rs_T; dcomplex* tau11 = buff_T + (i )*cs_T + (i )*rs_T; dcomplex* e0 = buff_e + (0 )*inc_e; dcomplex* a10t_r = a10t + (i-1)*cs_A + (0 )*rs_A; dcomplex* a21_t = a21 + (0 )*cs_A + (0 )*rs_A; dcomplex* a21_b = a21 + (0 )*cs_A + (1 )*rs_A; dcomplex* ABL = a10t; dcomplex* ZBL = z10t; dcomplex* a2 = alpha11; int m_ahead = m_A - i - 1; int n_ahead = m_A - i - 1; int m_behind = i; int n_behind = i; /*------------------------------------------------------------*/ if ( m_behind > 
0 ) { // FLA_Copy( a10t_r, last_elem ); // FLA_Set( FLA_ONE, a10t_r ); last_elem = *a10t_r; *a10t_r = *buff_1; } // FLA_Gemvc( FLA_NO_TRANSPOSE, FLA_CONJUGATE, FLA_MINUS_ONE, ABL, y10t, FLA_ONE, a2 ); // FLA_Gemvc( FLA_NO_TRANSPOSE, FLA_CONJUGATE, FLA_MINUS_ONE, ZBL, a10t, FLA_ONE, a2 ); bli_zgemv( BLIS_NO_TRANSPOSE, BLIS_CONJUGATE, m_ahead + 1, n_behind, buff_m1, ABL, rs_A, cs_A, y10t, cs_Y, buff_1, a2, rs_A ); bli_zgemv( BLIS_NO_TRANSPOSE, BLIS_CONJUGATE, m_ahead + 1, n_behind, buff_m1, ZBL, rs_Z, cs_Z, a10t, cs_A, buff_1, a2, rs_A ); // FLA_Gemv( FLA_CONJ_NO_TRANSPOSE, FLA_MINUS_ONE, Y20, a10t, FLA_ONE, a12t ); // FLA_Gemv( FLA_CONJ_NO_TRANSPOSE, FLA_MINUS_ONE, A20, z10t, FLA_ONE, a12t ); bli_zgemv( BLIS_CONJ_NO_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_behind, buff_m1, Y20, rs_Y, cs_Y, a10t, cs_A, buff_1, a12t, cs_A ); bli_zgemv( BLIS_CONJ_NO_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_behind, buff_m1, A20, rs_A, cs_A, z10t, cs_Z, buff_1, a12t, cs_A ); if ( m_behind > 0 ) { // FLA_Copy( last_elem, a10t_r ); *a10t_r = last_elem; } if ( m_ahead > 0 ) { // FLA_Househ2_UT( FLA_LEFT, // a21_t, // a21_b, tau11 ); FLA_Househ2_UT_l_opz( m_ahead - 1, a21_t, a21_b, rs_A, tau11 ); // FLA_Set( FLA_ONE, inv_tau11 ); // FLA_Inv_scalc( FLA_NO_CONJUGATE, tau11, inv_tau11 ); // FLA_Copy( inv_tau11, minus_inv_tau11 ); // FLA_Scal( FLA_MINUS_ONE, minus_inv_tau11 ); bli_zdiv3( buff_1, tau11, &inv_tau11 ); bli_zneg2( &inv_tau11, &minus_inv_tau11 ); // FLA_Copy( a21_t, first_elem ); // FLA_Set( FLA_ONE, a21_t ); first_elem = *a21_t; *a21_t = *buff_1; // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A22, a21, FLA_ZERO, y21 ); // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_ONE, A22, a21, FLA_ZERO, z21 ); FLA_Fused_Ahx_Ax_opz_var1( m_ahead, n_ahead, A22, rs_A, cs_A, a21, rs_A, y21, rs_Y, z21, rs_Z ); // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A20, a21, FLA_ZERO, d0 ); // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, Y20, a21, FLA_ZERO, e0 ); // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, Z20, a21, FLA_ZERO, f0 ); // 
FLA_Gemv( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, Y20, d0, FLA_ONE, y21 ); // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, A20, f0, FLA_ONE, y21 ); // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, A20, e0, FLA_ONE, z21 ); // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, Z20, d0, FLA_ONE, z21 ); // FLA_Copy( d0, t01 ); FLA_Fused_Uhu_Yhu_Zhu_opz_var1( m_ahead, n_behind, buff_m1, A20, rs_A, cs_A, Y20, rs_Y, cs_Y, Z20, rs_Z, cs_Z, t01, rs_T, a21, rs_A, y21, rs_Y, z21, rs_Z ); // FLA_Dotc( FLA_CONJUGATE, a21, z21, beta ); // FLA_Inv_scal( FLA_TWO, beta ); // FLA_Copyt( FLA_CONJ_NO_TRANSPOSE, beta, conj_beta ); bli_zdot( BLIS_CONJUGATE, m_ahead, a21, rs_A, z21, rs_Z, &beta ); bli_zinvscals( buff_2, &beta ); bli_zcopyconj( &beta, &conj_beta ); // FLA_Scal( minus_inv_tau11, conj_beta ); // FLA_Axpy( conj_beta, a21, y21 ); // FLA_Scal( inv_tau11, y21 ); bli_zscals( &minus_inv_tau11, &conj_beta ); bli_zaxpyv( BLIS_NO_CONJUGATE, m_ahead, &conj_beta, a21, rs_A, y21, rs_Y ); bli_zscalv( BLIS_NO_CONJUGATE, m_ahead, &inv_tau11, y21, rs_Y ); // FLA_Scal( minus_inv_tau11, beta ); // FLA_Axpy( beta, a21, z21 ); // FLA_Scal( inv_tau11, z21 ); bli_zscals( &minus_inv_tau11, &beta ); bli_zaxpyv( BLIS_NO_CONJUGATE, m_ahead, &beta, a21, rs_A, z21, rs_Z ); bli_zscalv( BLIS_NO_CONJUGATE, m_ahead, &inv_tau11, z21, rs_Z ); // FLA_Dot( a12t, a21, dot_product ); // FLA_Scal( minus_inv_tau11, dot_product ); // FLA_Axpyt( FLA_CONJ_TRANSPOSE, dot_product, a21, a12t ); bli_zdot( BLIS_NO_CONJUGATE, m_ahead, a12t, cs_A, a21, rs_A, &dot_product ); bli_zscals( &minus_inv_tau11, &dot_product ); bli_zaxpyv( BLIS_CONJUGATE, m_ahead, &dot_product, a21, rs_A, a12t, cs_A ); // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_ONE, A02, a21, FLA_ZERO, e0 ); // FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, minus_inv_tau11, e0, a21, A02 ); bli_zgemv( BLIS_NO_TRANSPOSE, BLIS_NO_CONJUGATE, m_behind, n_ahead, buff_1, A02, rs_A, cs_A, a21, rs_A, buff_0, e0, inc_e ); bli_zger( BLIS_NO_CONJUGATE, BLIS_CONJUGATE, m_behind, n_ahead, &minus_inv_tau11, e0, 
inc_e, a21, rs_A, A02, rs_A, cs_A ); // FLA_Copy( first_elem, a21_t ); *a21_t = first_elem; } /*------------------------------------------------------------*/ } // FLA_Obj_free( &e ); FLA_free( buff_e ); return FLA_SUCCESS; }
FLA_Error FLA_Hess_UT_step_opc_var1 | ( | int | m_A, |
int | m_T, | ||
scomplex * | buff_A, | ||
int | rs_A, | ||
int | cs_A, | ||
scomplex * | buff_T, | ||
int | rs_T, | ||
int | cs_T | ||
) |
References bli_cgemv(), BLIS_CONJ_TRANSPOSE, BLIS_NO_CONJUGATE, FLA_Apply_H2_UT_l_opc_var1(), FLA_Apply_H2_UT_r_opc_var1(), FLA_Househ2_UT_l_opc(), FLA_ONE, and FLA_ZERO.
Referenced by FLA_Hess_UT_step_opt_var1().
{ scomplex* buff_1 = FLA_COMPLEX_PTR( FLA_ONE ); scomplex* buff_0 = FLA_COMPLEX_PTR( FLA_ZERO ); scomplex first_elem; int i; // b_alg = FLA_Obj_length( T ); int b_alg = m_T; for ( i = 0; i < b_alg; ++i ) { scomplex* A20 = buff_A + (0 )*cs_A + (i+1)*rs_A; scomplex* a21 = buff_A + (i )*cs_A + (i+1)*rs_A; scomplex* a21_t = buff_A + (i )*cs_A + (i+1)*rs_A; scomplex* a21_b = buff_A + (i )*cs_A + (i+2)*rs_A; scomplex* A22_t = buff_A + (i+1)*cs_A + (i+1)*rs_A; scomplex* A22_b = buff_A + (i+1)*cs_A + (i+2)*rs_A; scomplex* A2_l = buff_A + (i+1)*cs_A + (0 )*rs_A; scomplex* A2_r = buff_A + (i+2)*cs_A + (0 )*rs_A; scomplex* t01 = buff_T + (i )*cs_T + (0 )*rs_T; scomplex* tau11 = buff_T + (i )*cs_T + (i )*rs_T; int m_ahead = m_A - i - 1; int n_ahead = m_A - i - 1; int n_behind = i; /*------------------------------------------------------------*/ if ( m_ahead > 0 ) { // FLA_Househ2_UT( FLA_LEFT, // a21_t, // a21_b, tau11 ); FLA_Househ2_UT_l_opc( m_ahead - 1, a21_t, a21_b, rs_A, tau11 ); // FLA_Copy( a21_t, first_elem ); // FLA_Set( FLA_ONE, a21_t ); first_elem = *a21_t; *a21_t = *buff_1; // FLA_Apply_H2_UT( FLA_LEFT, tau11, a21_b, A22_t, // A22_b ); FLA_Apply_H2_UT_l_opc_var1( m_ahead - 1, n_ahead, tau11, a21_b, rs_A, A22_t, cs_A, A22_b, rs_A, cs_A ); // FLA_Apply_H2_UT( FLA_RIGHT, tau11, a21_b, A2_l, A2_r ); FLA_Apply_H2_UT_r_opc_var1( m_A, n_ahead - 1, tau11, a21_b, rs_A, A2_l, rs_A, A2_r, rs_A, cs_A ); // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A20, a21, FLA_ZERO, t01 ); bli_cgemv( BLIS_CONJ_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_behind, buff_1, A20, rs_A, cs_A, a21, rs_A, buff_0, t01, rs_T ); // FLA_Copy( first_elem, a21_t ); *a21_t = first_elem; } /*------------------------------------------------------------*/ } return FLA_SUCCESS; }
FLA_Error FLA_Hess_UT_step_opc_var2 | ( | int | m_A, |
int | m_T, | ||
scomplex * | buff_A, | ||
int | rs_A, | ||
int | cs_A, | ||
scomplex * | buff_T, | ||
int | rs_T, | ||
int | cs_T | ||
) |
References bli_caxpyv(), bli_cdot(), bli_cgemv(), bli_cger(), bli_cscalv(), BLIS_CONJ_TRANSPOSE, BLIS_CONJUGATE, BLIS_NO_CONJUGATE, BLIS_NO_TRANSPOSE, FLA_free(), FLA_Househ2_UT_l_opc(), FLA_malloc(), FLA_MINUS_ONE, FLA_ONE, FLA_TWO, and FLA_ZERO.
Referenced by FLA_Hess_UT_step_opt_var2().
{ scomplex* buff_2 = FLA_COMPLEX_PTR( FLA_TWO ); scomplex* buff_1 = FLA_COMPLEX_PTR( FLA_ONE ); scomplex* buff_0 = FLA_COMPLEX_PTR( FLA_ZERO ); scomplex* buff_m1 = FLA_COMPLEX_PTR( FLA_MINUS_ONE ); scomplex first_elem; scomplex dot_product; scomplex beta, conj_beta; scomplex inv_tau11; scomplex minus_inv_tau11; int i; // b_alg = FLA_Obj_length( T ); int b_alg = m_T; // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &y ); // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &z ); scomplex* buff_y = ( scomplex* ) FLA_malloc( m_A * sizeof( *buff_A ) ); scomplex* buff_z = ( scomplex* ) FLA_malloc( m_A * sizeof( *buff_A ) ); int inc_y = 1; int inc_z = 1; for ( i = 0; i < b_alg; ++i ) { scomplex* A20 = buff_A + (0 )*cs_A + (i+1)*rs_A; scomplex* a21 = buff_A + (i )*cs_A + (i+1)*rs_A; scomplex* A02 = buff_A + (i+1)*cs_A + (0 )*rs_A; scomplex* a12t = buff_A + (i+1)*cs_A + (i )*rs_A; scomplex* A22 = buff_A + (i+1)*cs_A + (i+1)*rs_A; scomplex* t01 = buff_T + (i )*cs_T + (0 )*rs_T; scomplex* tau11 = buff_T + (i )*cs_T + (i )*rs_T; scomplex* y0 = buff_y + (0 )*inc_y; scomplex* y2 = buff_y + (i+1)*inc_y; scomplex* z2 = buff_z + (i+1)*inc_z; scomplex* a21_t = a21 + (0 )*cs_A + (0 )*rs_A; scomplex* a21_b = a21 + (0 )*cs_A + (1 )*rs_A; int m_ahead = m_A - i - 1; int n_ahead = m_A - i - 1; int m_behind = i; int n_behind = i; /*------------------------------------------------------------*/ if ( m_ahead > 0 ) { // FLA_Househ2_UT( FLA_LEFT, // a21_t, // a21_b, tau11 ); FLA_Househ2_UT_l_opc( m_ahead - 1, a21_t, a21_b, rs_A, tau11 ); // FLA_Set( FLA_ONE, inv_tau11 ); // FLA_Inv_scalc( FLA_NO_CONJUGATE, tau11, inv_tau11 ); // FLA_Copy( inv_tau11, minus_inv_tau11 ); // FLA_Scal( FLA_MINUS_ONE, minus_inv_tau11 ); bli_cdiv3( buff_1, tau11, &inv_tau11 ); bli_cneg2( &inv_tau11, &minus_inv_tau11 ); // FLA_Copy( a21_t, first_elem ); // FLA_Set( FLA_ONE, a21_t ); first_elem = *a21_t; *a21_t = *buff_1; // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A22, a21, FLA_ZERO, y2 ); bli_cgemv( BLIS_CONJ_TRANSPOSE, 
BLIS_NO_CONJUGATE, m_ahead, n_ahead, buff_1, A22, rs_A, cs_A, a21, rs_A, buff_0, y2, inc_y ); // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_ONE, A22, a21, FLA_ZERO, z2 ); bli_cgemv( BLIS_NO_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_ahead, buff_1, A22, rs_A, cs_A, a21, rs_A, buff_0, z2, inc_z ); // FLA_Dotc( FLA_CONJUGATE, a21, z2, beta ); // FLA_Inv_scal( FLA_TWO, beta ); // FLA_Copyt( FLA_CONJ_NO_TRANSPOSE, beta, conj_beta ); bli_cdot( BLIS_CONJUGATE, m_ahead, a21, rs_A, z2, inc_z, &beta ); bli_cinvscals( buff_2, &beta ); bli_ccopyconj( &beta, &conj_beta ); // FLA_Scal( minus_inv_tau11, conj_beta ); // FLA_Axpy( conj_beta, a21, y2 ); // FLA_Scal( inv_tau11, y2 ); bli_cscals( &minus_inv_tau11, &conj_beta ); bli_caxpyv( BLIS_NO_CONJUGATE, m_ahead, &conj_beta, a21, rs_A, y2, inc_y ); bli_cscalv( BLIS_NO_CONJUGATE, m_ahead, &inv_tau11, y2, inc_y ); // FLA_Scal( minus_inv_tau11, beta ); // FLA_Axpy( beta, a21, z2 ); // FLA_Scal( inv_tau11, z2 ); bli_cscals( &minus_inv_tau11, &beta ); bli_caxpyv( BLIS_NO_CONJUGATE, m_ahead, &beta, a21, rs_A, z2, inc_z ); bli_cscalv( BLIS_NO_CONJUGATE, m_ahead, &inv_tau11, z2, inc_z ); // FLA_Dot( a12t, a21, dot_product ); // FLA_Scal( minus_inv_tau11, dot_product ); // FLA_Axpyt( FLA_CONJ_TRANSPOSE, dot_product, a21, a12t ); bli_cdot( BLIS_NO_CONJUGATE, m_ahead, a12t, cs_A, a21, rs_A, &dot_product ); bli_cscals( &minus_inv_tau11, &dot_product ); bli_caxpyv( BLIS_CONJUGATE, m_ahead, &dot_product, a21, rs_A, a12t, cs_A ); // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_ONE, A02, a21, FLA_ZERO, y0 ); // FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, minus_inv_tau11, y0, a21, A02 ); bli_cgemv( BLIS_NO_TRANSPOSE, BLIS_NO_CONJUGATE, m_behind, n_ahead, buff_1, A02, rs_A, cs_A, a21, rs_A, buff_0, y0, inc_y ); bli_cger( BLIS_NO_CONJUGATE, BLIS_CONJUGATE, m_behind, n_ahead, &minus_inv_tau11, y0, inc_y, a21, rs_A, A02, rs_A, cs_A ); // FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, FLA_MINUS_ONE, a21, y2, A22 ); // FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, FLA_MINUS_ONE, z2, 
a21, A22 ); bli_cger( BLIS_NO_CONJUGATE, BLIS_CONJUGATE, m_ahead, n_ahead, buff_m1, a21, rs_A, y2, inc_y, A22, rs_A, cs_A ); bli_cger( BLIS_NO_CONJUGATE, BLIS_CONJUGATE, m_ahead, n_ahead, buff_m1, z2, inc_z, a21, rs_A, A22, rs_A, cs_A ); // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A20, a21, FLA_ZERO, t01 ); bli_cgemv( BLIS_CONJ_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_behind, buff_1, A20, rs_A, cs_A, a21, rs_A, buff_0, t01, rs_T ); // FLA_Copy( first_elem, a21_t ); *a21_t = first_elem; } /*------------------------------------------------------------*/ } // FLA_Obj_free( &y ); // FLA_Obj_free( &z ); FLA_free( buff_y ); FLA_free( buff_z ); return FLA_SUCCESS; }
FLA_Error FLA_Hess_UT_step_opc_var3 | ( | int | m_A, |
int | m_T, | ||
scomplex * | buff_A, | ||
int | rs_A, | ||
int | cs_A, | ||
scomplex * | buff_T, | ||
int | rs_T, | ||
int | cs_T | ||
) |
References bli_caxpyv(), bli_ccopyv(), bli_cdot(), bli_cgemv(), bli_cger(), bli_cscalv(), BLIS_CONJ_TRANSPOSE, BLIS_CONJUGATE, BLIS_NO_CONJUGATE, BLIS_NO_TRANSPOSE, FLA_free(), FLA_Househ2_UT_l_opc(), FLA_malloc(), FLA_MINUS_ONE, FLA_ONE, FLA_TWO, and FLA_ZERO.
Referenced by FLA_Hess_UT_step_opt_var3().
{ scomplex* buff_2 = FLA_COMPLEX_PTR( FLA_TWO ); scomplex* buff_1 = FLA_COMPLEX_PTR( FLA_ONE ); scomplex* buff_0 = FLA_COMPLEX_PTR( FLA_ZERO ); scomplex* buff_m1 = FLA_COMPLEX_PTR( FLA_MINUS_ONE ); scomplex first_elem; scomplex dot_product; scomplex beta, conj_beta; scomplex inv_tau11; scomplex minus_inv_tau11; scomplex minus_upsilon1, minus_conj_upsilon1; scomplex minus_psi1, minus_conj_psi1; scomplex minus_zeta1; int i; // b_alg = FLA_Obj_length( T ); int b_alg = m_T; // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &u ); // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &y ); // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &z ); // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &v ); // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &w ); scomplex* buff_u = ( scomplex* ) FLA_malloc( m_A * sizeof( *buff_A ) ); scomplex* buff_y = ( scomplex* ) FLA_malloc( m_A * sizeof( *buff_A ) ); scomplex* buff_z = ( scomplex* ) FLA_malloc( m_A * sizeof( *buff_A ) ); scomplex* buff_v = ( scomplex* ) FLA_malloc( m_A * sizeof( *buff_A ) ); scomplex* buff_w = ( scomplex* ) FLA_malloc( m_A * sizeof( *buff_A ) ); int inc_u = 1; int inc_y = 1; int inc_z = 1; int inc_v = 1; int inc_w = 1; // Initialize some variables (only to prevent compiler warnings). 
first_elem = *buff_0; minus_inv_tau11 = *buff_0; for ( i = 0; i < b_alg; ++i ) { scomplex* A20 = buff_A + (0 )*cs_A + (i+1)*rs_A; scomplex* alpha11 = buff_A + (i )*cs_A + (i )*rs_A; scomplex* a21 = buff_A + (i )*cs_A + (i+1)*rs_A; scomplex* A02 = buff_A + (i+1)*cs_A + (0 )*rs_A; scomplex* a12t = buff_A + (i+1)*cs_A + (i )*rs_A; scomplex* A22 = buff_A + (i+1)*cs_A + (i+1)*rs_A; scomplex* t01 = buff_T + (i )*cs_T + (0 )*rs_T; scomplex* tau11 = buff_T + (i )*cs_T + (i )*rs_T; scomplex* upsilon1 = buff_u + (i )*inc_u; scomplex* u2 = buff_u + (i+1)*inc_u; scomplex* y0 = buff_y + (0 )*inc_y; scomplex* psi1 = buff_y + (i )*inc_y; scomplex* y2 = buff_y + (i+1)*inc_y; scomplex* zeta1 = buff_z + (i )*inc_z; scomplex* z2 = buff_z + (i+1)*inc_z; scomplex* v2 = buff_v + (i+1)*inc_v; scomplex* w2 = buff_w + (i+1)*inc_w; scomplex* a21_t = a21 + (0 )*cs_A + (0 )*rs_A; scomplex* a21_b = a21 + (0 )*cs_A + (1 )*rs_A; int m_ahead = m_A - i - 1; int n_ahead = m_A - i - 1; int m_behind = i; int n_behind = i; /*------------------------------------------------------------*/ if ( m_behind > 0 ) { // FLA_Copy( upsilon1, minus_upsilon1 ); // FLA_Scal( FLA_MINUS_ONE, minus_upsilon1 ); // FLA_Copyt( FLA_CONJ_NO_TRANSPOSE, minus_upsilon1, minus_conj_upsilon1 ); bli_cmult3( buff_m1, upsilon1, &minus_upsilon1 ); bli_ccopyconj( &minus_upsilon1, &minus_conj_upsilon1 ); // FLA_Copy( psi1, minus_psi1 ); // FLA_Scal( FLA_MINUS_ONE, minus_psi1 ); // FLA_Copyt( FLA_CONJ_NO_TRANSPOSE, minus_psi1, minus_conj_psi1 ); bli_cmult3( buff_m1, psi1, &minus_psi1 ); bli_ccopyconj( &minus_psi1, &minus_conj_psi1 ); // FLA_Copy( zeta1, minus_zeta1 ); // FLA_Scal( FLA_MINUS_ONE, minus_zeta1 ); bli_cmult3( buff_m1, zeta1, &minus_zeta1 ); // FLA_Axpyt( FLA_CONJ_NO_TRANSPOSE, minus_upsilon1, psi1, alpha11 ); // FLA_Axpyt( FLA_CONJ_NO_TRANSPOSE, minus_zeta1, upsilon1, alpha11 ); bli_caxpyv( BLIS_CONJUGATE, 1, &minus_upsilon1, psi1, 1, alpha11, 1 ); bli_caxpyv( BLIS_CONJUGATE, 1, &minus_zeta1, upsilon1, 1, alpha11, 1 ); // 
FLA_Axpyt( FLA_CONJ_TRANSPOSE, minus_upsilon1, y2, a12t ); // FLA_Axpyt( FLA_CONJ_TRANSPOSE, minus_zeta1, u2, a12t ); bli_caxpyv( BLIS_CONJUGATE, m_ahead, &minus_upsilon1, y2, inc_y, a12t, cs_A ); bli_caxpyv( BLIS_CONJUGATE, m_ahead, &minus_zeta1, u2, inc_u, a12t, cs_A ); // FLA_Axpyt( FLA_NO_TRANSPOSE, minus_conj_psi1, u2, a21 ); // FLA_Axpyt( FLA_NO_TRANSPOSE, minus_conj_upsilon1, z2, a21 ); bli_caxpyv( BLIS_NO_CONJUGATE, m_ahead, &minus_conj_psi1, u2, inc_u, a21, rs_A ); bli_caxpyv( BLIS_NO_CONJUGATE, m_ahead, &minus_conj_upsilon1, z2, inc_z, a21, rs_A ); } if ( m_ahead > 0 ) { // FLA_Househ2_UT( FLA_LEFT, // a21_t, // a21_b, tau11 ); FLA_Househ2_UT_l_opc( m_ahead - 1, a21_t, a21_b, rs_A, tau11 ); // FLA_Set( FLA_ONE, inv_tau11 ); // FLA_Inv_scalc( FLA_NO_CONJUGATE, tau11, inv_tau11 ); // FLA_Copy( inv_tau11, minus_inv_tau11 ); // FLA_Scal( FLA_MINUS_ONE, minus_inv_tau11 ); bli_cdiv3( buff_1, tau11, &inv_tau11 ); bli_cneg2( &inv_tau11, &minus_inv_tau11 ); // FLA_Copy( a21_t, first_elem ); // FLA_Set( FLA_ONE, a21_t ); first_elem = *a21_t; *a21_t = *buff_1; } if ( m_behind > 0 ) { // FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, FLA_MINUS_ONE, u2, y2, A22 ); // FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, FLA_MINUS_ONE, z2, u2, A22 ); bli_cger( BLIS_NO_CONJUGATE, BLIS_CONJUGATE, m_ahead, n_ahead, buff_m1, u2, inc_u, y2, inc_y, A22, rs_A, cs_A ); bli_cger( BLIS_NO_CONJUGATE, BLIS_CONJUGATE, m_ahead, n_ahead, buff_m1, z2, inc_z, u2, inc_u, A22, rs_A, cs_A ); } if ( m_ahead > 0 ) { // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A22, a21, FLA_ZERO, v2 ); bli_cgemv( BLIS_CONJ_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_ahead, buff_1, A22, rs_A, cs_A, a21, rs_A, buff_0, v2, inc_v ); // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_ONE, A22, a21, FLA_ZERO, w2 ); bli_cgemv( BLIS_NO_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_ahead, buff_1, A22, rs_A, cs_A, a21, rs_A, buff_0, w2, inc_w ); // FLA_Copy( a21, u2 ); // FLA_Copy( v2, y2 ); // FLA_Copy( w2, z2 ); bli_ccopyv( BLIS_NO_CONJUGATE, m_ahead, a21, 
rs_A, u2, inc_u ); bli_ccopyv( BLIS_NO_CONJUGATE, m_ahead, v2, inc_v, y2, inc_y ); bli_ccopyv( BLIS_NO_CONJUGATE, m_ahead, w2, inc_w, z2, inc_z ); // FLA_Dotc( FLA_CONJUGATE, a21, z2, beta ); // FLA_Inv_scal( FLA_TWO, beta ); // FLA_Copyt( FLA_CONJ_NO_TRANSPOSE, beta, conj_beta ); bli_cdot( BLIS_CONJUGATE, m_ahead, a21, rs_A, z2, inc_z, &beta ); bli_cinvscals( buff_2, &beta ); bli_ccopyconj( &beta, &conj_beta ); // FLA_Scal( minus_inv_tau11, conj_beta ); // FLA_Axpy( conj_beta, a21, y2 ); // FLA_Scal( inv_tau11, y2 ); bli_cscals( &minus_inv_tau11, &conj_beta ); bli_caxpyv( BLIS_NO_CONJUGATE, m_ahead, &conj_beta, a21, rs_A, y2, inc_y ); bli_cscalv( BLIS_NO_CONJUGATE, m_ahead, &inv_tau11, y2, inc_y ); // FLA_Scal( minus_inv_tau11, beta ); // FLA_Axpy( beta, a21, z2 ); // FLA_Scal( inv_tau11, z2 ); bli_cscals( &minus_inv_tau11, &beta ); bli_caxpyv( BLIS_NO_CONJUGATE, m_ahead, &beta, a21, rs_A, z2, inc_z ); bli_cscalv( BLIS_NO_CONJUGATE, m_ahead, &inv_tau11, z2, inc_z ); // FLA_Dot( a12t, a21, dot_product ); // FLA_Scal( minus_inv_tau11, dot_product ); // FLA_Axpyt( FLA_CONJ_TRANSPOSE, dot_product, a21, a12t ); bli_cdot( BLIS_NO_CONJUGATE, m_ahead, a12t, cs_A, a21, rs_A, &dot_product ); bli_cscals( &minus_inv_tau11, &dot_product ); bli_caxpyv( BLIS_CONJUGATE, m_ahead, &dot_product, a21, rs_A, a12t, cs_A ); // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_ONE, A02, a21, FLA_ZERO, y0 ); // FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, minus_inv_tau11, y0, a21, A02 ); bli_cgemv( BLIS_NO_TRANSPOSE, BLIS_NO_CONJUGATE, m_behind, n_ahead, buff_1, A02, rs_A, cs_A, a21, rs_A, buff_0, y0, inc_y ); bli_cger( BLIS_NO_CONJUGATE, BLIS_CONJUGATE, m_behind, n_ahead, &minus_inv_tau11, y0, inc_y, a21, rs_A, A02, rs_A, cs_A ); // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A20, a21, FLA_ZERO, t01 ); bli_cgemv( BLIS_CONJ_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_behind, buff_1, A20, rs_A, cs_A, a21, rs_A, buff_0, t01, rs_T ); // FLA_Copy( first_elem, a21_t ); *a21_t = first_elem; } if ( m_behind + 1 == b_alg && 
m_ahead > 0 ) { // FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, FLA_MINUS_ONE, u2, y2, A22 ); // FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, FLA_MINUS_ONE, z2, u2, A22 ); bli_cger( BLIS_NO_CONJUGATE, BLIS_CONJUGATE, m_ahead, n_ahead, buff_m1, u2, inc_u, y2, inc_y, A22, rs_A, cs_A ); bli_cger( BLIS_NO_CONJUGATE, BLIS_CONJUGATE, m_ahead, n_ahead, buff_m1, z2, inc_z, u2, inc_u, A22, rs_A, cs_A ); } /*------------------------------------------------------------*/ } // FLA_Obj_free( &u ); // FLA_Obj_free( &y ); // FLA_Obj_free( &z ); // FLA_Obj_free( &v ); // FLA_Obj_free( &w ); FLA_free( buff_u ); FLA_free( buff_y ); FLA_free( buff_z ); FLA_free( buff_v ); FLA_free( buff_w ); return FLA_SUCCESS; }
FLA_Error FLA_Hess_UT_step_opc_var4 | ( | int | m_A, |
int | m_T, | ||
scomplex * | buff_A, | ||
int | rs_A, | ||
int | cs_A, | ||
scomplex * | buff_Y, | ||
int | rs_Y, | ||
int | cs_Y, | ||
scomplex * | buff_Z, | ||
int | rs_Z, | ||
int | cs_Z, | ||
scomplex * | buff_T, | ||
int | rs_T, | ||
int | cs_T | ||
) |
References bli_caxpyv(), bli_ccopyv(), bli_cdot(), bli_cgemv(), bli_cger(), bli_cscalv(), bli_csetm(), BLIS_CONJ_NO_TRANSPOSE, BLIS_CONJ_TRANSPOSE, BLIS_CONJUGATE, BLIS_NO_CONJUGATE, BLIS_NO_TRANSPOSE, FLA_free(), FLA_Househ2_UT_l_opc(), FLA_malloc(), FLA_MINUS_ONE, FLA_ONE, FLA_TWO, and FLA_ZERO.
Referenced by FLA_Hess_UT_step_opt_var4().
{ scomplex* buff_2 = FLA_COMPLEX_PTR( FLA_TWO ); scomplex* buff_1 = FLA_COMPLEX_PTR( FLA_ONE ); scomplex* buff_0 = FLA_COMPLEX_PTR( FLA_ZERO ); scomplex* buff_m1 = FLA_COMPLEX_PTR( FLA_MINUS_ONE ); scomplex first_elem, last_elem; scomplex dot_product; scomplex beta, conj_beta; scomplex inv_tau11; scomplex minus_inv_tau11; int i; // b_alg = FLA_Obj_length( T ); int b_alg = m_T; // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &d ); // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &e ); // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &f ); scomplex* buff_d = ( scomplex* ) FLA_malloc( m_A * sizeof( *buff_A ) ); scomplex* buff_e = ( scomplex* ) FLA_malloc( m_A * sizeof( *buff_A ) ); scomplex* buff_f = ( scomplex* ) FLA_malloc( m_A * sizeof( *buff_A ) ); int inc_d = 1; int inc_e = 1; int inc_f = 1; // FLA_Set( FLA_ZERO, Y ); // FLA_Set( FLA_ZERO, Z ); bli_csetm( m_A, b_alg, buff_0, buff_Y, rs_Y, cs_Y ); bli_csetm( m_A, b_alg, buff_0, buff_Z, rs_Z, cs_Z ); for ( i = 0; i < b_alg; ++i ) { scomplex* a10t = buff_A + (0 )*cs_A + (i )*rs_A; scomplex* A20 = buff_A + (0 )*cs_A + (i+1)*rs_A; scomplex* alpha11 = buff_A + (i )*cs_A + (i )*rs_A; scomplex* a21 = buff_A + (i )*cs_A + (i+1)*rs_A; scomplex* A02 = buff_A + (i+1)*cs_A + (0 )*rs_A; scomplex* a12t = buff_A + (i+1)*cs_A + (i )*rs_A; scomplex* A22 = buff_A + (i+1)*cs_A + (i+1)*rs_A; scomplex* y10t = buff_Y + (0 )*cs_Y + (i )*rs_Y; scomplex* Y20 = buff_Y + (0 )*cs_Y + (i+1)*rs_Y; scomplex* y21 = buff_Y + (i )*cs_Y + (i+1)*rs_Y; scomplex* z10t = buff_Z + (0 )*cs_Z + (i )*rs_Z; scomplex* Z20 = buff_Z + (0 )*cs_Z + (i+1)*rs_Z; scomplex* z21 = buff_Z + (i )*cs_Z + (i+1)*rs_Z; scomplex* t01 = buff_T + (i )*cs_T + (0 )*rs_T; scomplex* tau11 = buff_T + (i )*cs_T + (i )*rs_T; scomplex* d0 = buff_d + (0 )*inc_d; scomplex* e0 = buff_e + (0 )*inc_e; scomplex* f0 = buff_f + (0 )*inc_f; scomplex* a10t_r = a10t + (i-1)*cs_A + (0 )*rs_A; scomplex* a21_t = a21 + (0 )*cs_A + (0 )*rs_A; scomplex* a21_b = a21 + (0 )*cs_A + (1 )*rs_A; scomplex* ABL = a10t; 
scomplex* ZBL = z10t; scomplex* a2 = alpha11; int m_ahead = m_A - i - 1; int n_ahead = m_A - i - 1; int m_behind = i; int n_behind = i; /*------------------------------------------------------------*/ if ( m_behind > 0 ) { // FLA_Copy( a10t_r, last_elem ); // FLA_Set( FLA_ONE, a10t_r ); last_elem = *a10t_r; *a10t_r = *buff_1; } // FLA_Gemvc( FLA_NO_TRANSPOSE, FLA_CONJUGATE, FLA_MINUS_ONE, ABL, y10t, FLA_ONE, a2 ); // FLA_Gemvc( FLA_NO_TRANSPOSE, FLA_CONJUGATE, FLA_MINUS_ONE, ZBL, a10t, FLA_ONE, a2 ); bli_cgemv( BLIS_NO_TRANSPOSE, BLIS_CONJUGATE, m_ahead + 1, n_behind, buff_m1, ABL, rs_A, cs_A, y10t, cs_Y, buff_1, a2, rs_A ); bli_cgemv( BLIS_NO_TRANSPOSE, BLIS_CONJUGATE, m_ahead + 1, n_behind, buff_m1, ZBL, rs_Z, cs_Z, a10t, cs_A, buff_1, a2, rs_A ); // FLA_Gemv( FLA_CONJ_NO_TRANSPOSE, FLA_MINUS_ONE, Y20, a10t, FLA_ONE, a12t ); // FLA_Gemv( FLA_CONJ_NO_TRANSPOSE, FLA_MINUS_ONE, A20, z10t, FLA_ONE, a12t ); bli_cgemv( BLIS_CONJ_NO_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_behind, buff_m1, Y20, rs_Y, cs_Y, a10t, cs_A, buff_1, a12t, cs_A ); bli_cgemv( BLIS_CONJ_NO_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_behind, buff_m1, A20, rs_A, cs_A, z10t, cs_Z, buff_1, a12t, cs_A ); if ( m_behind > 0 ) { // FLA_Copy( last_elem, a10t_r ); *a10t_r = last_elem; } if ( m_ahead > 0 ) { // FLA_Househ2_UT( FLA_LEFT, // a21_t, // a21_b, tau11 ); FLA_Househ2_UT_l_opc( m_ahead - 1, a21_t, a21_b, rs_A, tau11 ); // FLA_Set( FLA_ONE, inv_tau11 ); // FLA_Inv_scalc( FLA_NO_CONJUGATE, tau11, inv_tau11 ); // FLA_Copy( inv_tau11, minus_inv_tau11 ); // FLA_Scal( FLA_MINUS_ONE, minus_inv_tau11 ); bli_cdiv3( buff_1, tau11, &inv_tau11 ); bli_cneg2( &inv_tau11, &minus_inv_tau11 ); // FLA_Copy( a21_t, first_elem ); // FLA_Set( FLA_ONE, a21_t ); first_elem = *a21_t; *a21_t = *buff_1; // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A22, a21, FLA_ZERO, y21 ); bli_cgemv( BLIS_CONJ_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_ahead, buff_1, A22, rs_A, cs_A, a21, rs_A, buff_0, y21, rs_Y ); // FLA_Gemv( FLA_NO_TRANSPOSE, 
FLA_ONE, A22, a21, FLA_ZERO, z21 ); bli_cgemv( BLIS_NO_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_ahead, buff_1, A22, rs_A, cs_A, a21, rs_A, buff_0, z21, rs_Z ); // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A20, a21, FLA_ZERO, d0 ); // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, Y20, a21, FLA_ZERO, e0 ); // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, Z20, a21, FLA_ZERO, f0 ); bli_cgemv( BLIS_CONJ_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_behind, buff_1, A20, rs_A, cs_A, a21, rs_A, buff_0, d0, inc_d ); bli_cgemv( BLIS_CONJ_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_behind, buff_1, Y20, rs_Y, cs_Y, a21, rs_A, buff_0, e0, inc_e ); bli_cgemv( BLIS_CONJ_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_behind, buff_1, Z20, rs_Z, cs_Z, a21, rs_A, buff_0, f0, inc_f ); // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, Y20, d0, FLA_ONE, y21 ); // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, A20, f0, FLA_ONE, y21 ); bli_cgemv( BLIS_NO_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_behind, buff_m1, Y20, rs_Y, cs_Y, d0, inc_d, buff_1, y21, rs_Y ); bli_cgemv( BLIS_NO_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_behind, buff_m1, A20, rs_A, cs_A, f0, inc_f, buff_1, y21, rs_Y ); // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, A20, e0, FLA_ONE, z21 ); // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, Z20, d0, FLA_ONE, z21 ); bli_cgemv( BLIS_NO_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_behind, buff_m1, A20, rs_A, cs_A, e0, inc_e, buff_1, z21, rs_Z ); bli_cgemv( BLIS_NO_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_behind, buff_m1, Z20, rs_Z, cs_Z, d0, inc_d, buff_1, z21, rs_Z ); // FLA_Copy( d0, t01 ); bli_ccopyv( BLIS_NO_CONJUGATE, n_behind, d0, inc_d, t01, rs_T ); // FLA_Dotc( FLA_CONJUGATE, a21, z21, beta ); // FLA_Inv_scal( FLA_TWO, beta ); // FLA_Copyt( FLA_CONJ_NO_TRANSPOSE, beta, conj_beta ); bli_cdot( BLIS_CONJUGATE, m_ahead, a21, rs_A, z21, rs_Z, &beta ); bli_cinvscals( buff_2, &beta ); bli_ccopyconj( &beta, &conj_beta ); // FLA_Scal( minus_inv_tau11, conj_beta ); // FLA_Axpy( conj_beta, a21, y21 ); // FLA_Scal( inv_tau11, y21 
); bli_cscals( &minus_inv_tau11, &conj_beta ); bli_caxpyv( BLIS_NO_CONJUGATE, m_ahead, &conj_beta, a21, rs_A, y21, rs_Y ); bli_cscalv( BLIS_NO_CONJUGATE, m_ahead, &inv_tau11, y21, rs_Y ); // FLA_Scal( minus_inv_tau11, beta ); // FLA_Axpy( beta, a21, z21 ); // FLA_Scal( inv_tau11, z21 ); bli_cscals( &minus_inv_tau11, &beta ); bli_caxpyv( BLIS_NO_CONJUGATE, m_ahead, &beta, a21, rs_A, z21, rs_Z ); bli_cscalv( BLIS_NO_CONJUGATE, m_ahead, &inv_tau11, z21, rs_Z ); // FLA_Dot( a12t, a21, dot_product ); // FLA_Scal( minus_inv_tau11, dot_product ); // FLA_Axpyt( FLA_CONJ_TRANSPOSE, dot_product, a21, a12t ); bli_cdot( BLIS_NO_CONJUGATE, m_ahead, a12t, cs_A, a21, rs_A, &dot_product ); bli_cscals( &minus_inv_tau11, &dot_product ); bli_caxpyv( BLIS_CONJUGATE, m_ahead, &dot_product, a21, rs_A, a12t, cs_A ); // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_ONE, A02, a21, FLA_ZERO, e0 ); // FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, minus_inv_tau11, e0, a21, A02 ); bli_cgemv( BLIS_NO_TRANSPOSE, BLIS_NO_CONJUGATE, m_behind, n_ahead, buff_1, A02, rs_A, cs_A, a21, rs_A, buff_0, e0, inc_e ); bli_cger( BLIS_NO_CONJUGATE, BLIS_CONJUGATE, m_behind, n_ahead, &minus_inv_tau11, e0, inc_e, a21, rs_A, A02, rs_A, cs_A ); // FLA_Copy( first_elem, a21_t ); *a21_t = first_elem; } /*------------------------------------------------------------*/ } // FLA_Obj_free( &d ); // FLA_Obj_free( &e ); // FLA_Obj_free( &f ); FLA_free( buff_d ); FLA_free( buff_e ); FLA_free( buff_f ); return FLA_SUCCESS; }
FLA_Error FLA_Hess_UT_step_opc_var5 | ( | int | m_A, |
int | m_T, | ||
scomplex * | buff_A, | ||
int | rs_A, | ||
int | cs_A, | ||
scomplex * | buff_U, | ||
int | rs_U, | ||
int | cs_U, | ||
scomplex * | buff_Z, | ||
int | rs_Z, | ||
int | cs_Z, | ||
scomplex * | buff_T, | ||
int | rs_T, | ||
int | cs_T | ||
) |
References bli_caxpyv(), bli_ccopyv(), bli_cdot(), bli_cdots(), bli_cgemv(), bli_csetm(), bli_ctrmv(), bli_ctrmvsx(), bli_ctrsv(), BLIS_CONJ_TRANSPOSE, BLIS_CONJUGATE, BLIS_LOWER_TRIANGULAR, BLIS_NO_CONJUGATE, BLIS_NO_TRANSPOSE, BLIS_NONUNIT_DIAG, BLIS_UPPER_TRIANGULAR, FLA_free(), FLA_Househ2_UT_l_opc(), FLA_malloc(), FLA_MINUS_ONE, FLA_ONE, and FLA_ZERO.
Referenced by FLA_Hess_UT_step_opt_var5().
{ scomplex* buff_1 = FLA_COMPLEX_PTR( FLA_ONE ); scomplex* buff_0 = FLA_COMPLEX_PTR( FLA_ZERO ); scomplex* buff_m1 = FLA_COMPLEX_PTR( FLA_MINUS_ONE ); int i; // b_alg = FLA_Obj_length( T ); int b_alg = m_T; // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &w ); scomplex* buff_w = ( scomplex* ) FLA_malloc( m_A * sizeof( *buff_A ) ); int inc_w = 1; // FLA_Set( FLA_ZERO, U ); // FLA_Set( FLA_ZERO, Z ); bli_csetm( m_A, b_alg, buff_0, buff_U, rs_U, cs_U ); bli_csetm( m_A, b_alg, buff_0, buff_Z, rs_Z, cs_Z ); for ( i = 0; i < b_alg; ++i ) { scomplex* a01 = buff_A + (i )*cs_A + (0 )*rs_A; scomplex* alpha11 = buff_A + (i )*cs_A + (i )*rs_A; scomplex* a21 = buff_A + (i )*cs_A + (i+1)*rs_A; scomplex* A02 = buff_A + (i+1)*cs_A + (0 )*rs_A; scomplex* a12t = buff_A + (i+1)*cs_A + (i )*rs_A; scomplex* A22 = buff_A + (i+1)*cs_A + (i+1)*rs_A; scomplex* U00 = buff_U + (0 )*cs_U + (0 )*rs_U; scomplex* u10t = buff_U + (0 )*cs_U + (i )*rs_U; scomplex* U20 = buff_U + (0 )*cs_U + (i+1)*rs_U; scomplex* u21 = buff_U + (i )*cs_U + (i+1)*rs_U; scomplex* Z00 = buff_Z + (0 )*cs_Z + (0 )*rs_Z; scomplex* z10t = buff_Z + (0 )*cs_Z + (i )*rs_Z; scomplex* Z20 = buff_Z + (0 )*cs_Z + (i+1)*rs_Z; scomplex* z01 = buff_Z + (i )*cs_Z + (0 )*rs_Z; scomplex* zeta11 = buff_Z + (i )*cs_Z + (i )*rs_Z; scomplex* z21 = buff_Z + (i )*cs_Z + (i+1)*rs_Z; scomplex* T00 = buff_T + (0 )*cs_T + (0 )*rs_T; scomplex* t01 = buff_T + (i )*cs_T + (0 )*rs_T; scomplex* tau11 = buff_T + (i )*cs_T + (i )*rs_T; scomplex* w0 = buff_w + (0 )*inc_w; scomplex* a21_t = a21 + (0 )*cs_A + (0 )*rs_A; scomplex* a21_b = a21 + (0 )*cs_A + (1 )*rs_A; scomplex* u21_t = u21 + (0 )*cs_U + (0 )*rs_U; int m_ahead = m_A - i - 1; int n_ahead = m_A - i - 1; int m_behind = i; int n_behind = i; /*------------------------------------------------------------*/ if ( m_behind > 0 ) { // FLA_Copyt( FLA_CONJ_TRANSPOSE, u10t, w0 ); // FLA_Trsv( FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG, // T00, w0 ); bli_ccopyv( BLIS_CONJUGATE, m_behind, u10t, 
cs_U, w0, inc_w ); bli_ctrsv( BLIS_UPPER_TRIANGULAR, BLIS_NO_TRANSPOSE, BLIS_NONUNIT_DIAG, m_behind, T00, rs_T, cs_T, w0, inc_w ); // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, Z00, w0, FLA_ONE, a01 ); // FLA_Dots( FLA_MINUS_ONE, z10t, w0, FLA_ONE, alpha11 ); // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, Z20, w0, FLA_ONE, a21 ); bli_cgemv( BLIS_NO_TRANSPOSE, BLIS_NO_CONJUGATE, m_behind, n_behind, buff_m1, Z00, rs_Z, cs_Z, w0, inc_w, buff_1, a01, rs_A ); bli_cdots( BLIS_NO_CONJUGATE, m_behind, buff_m1, z10t, cs_Z, w0, inc_w, buff_1, alpha11 ); bli_cgemv( BLIS_NO_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_behind, buff_m1, Z20, rs_Z, cs_Z, w0, inc_w, buff_1, a21, rs_A ); // FLA_Trmvsx( FLA_LOWER_TRIANGULAR, FLA_CONJ_TRANSPOSE, FLA_NONUNIT_DIAG, // FLA_ONE, U00, a01, FLA_ZERO, w0 ); // FLA_Axpyt( FLA_CONJ_TRANSPOSE, alpha11, u10t, w0 ); // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, U20, a21, FLA_ONE, w0 ); bli_ccopyv( BLIS_NO_CONJUGATE, m_behind, a01, rs_A, w0, inc_w ); bli_ctrmv( BLIS_LOWER_TRIANGULAR, BLIS_CONJ_TRANSPOSE, BLIS_NONUNIT_DIAG, m_behind, U00, rs_U, cs_U, w0, inc_w ); bli_caxpyv( BLIS_CONJUGATE, m_behind, alpha11, u10t, cs_U, w0, inc_w ); bli_cgemv( BLIS_CONJ_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_behind, buff_1, U20, rs_U, cs_U, a21, rs_A, buff_1, w0, inc_w ); // FLA_Trsv( FLA_UPPER_TRIANGULAR, FLA_CONJ_TRANSPOSE, FLA_NONUNIT_DIAG, // T00, w0 ); bli_ctrsv( BLIS_UPPER_TRIANGULAR, BLIS_CONJ_TRANSPOSE, BLIS_NONUNIT_DIAG, m_behind, T00, rs_T, cs_T, w0, inc_w ); // FLA_Trmvsx( FLA_LOWER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG, // FLA_MINUS_ONE, U00, w0, FLA_ONE, a01 ); // FLA_Dots( FLA_MINUS_ONE, u10t, w0, FLA_ONE, alpha11 ); // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, U20, w0, FLA_ONE, a21 ); bli_ctrmvsx( BLIS_LOWER_TRIANGULAR, BLIS_NO_TRANSPOSE, BLIS_NONUNIT_DIAG, m_behind, buff_m1, U00, rs_U, cs_U, w0, inc_w, buff_1, a01, rs_A ); bli_cdots( BLIS_NO_CONJUGATE, m_behind, buff_m1, u10t, cs_U, w0, inc_w, buff_1, alpha11 ); bli_cgemv( BLIS_NO_TRANSPOSE, 
BLIS_NO_CONJUGATE, m_ahead, n_behind, buff_m1, U20, rs_U, cs_U, w0, inc_w, buff_1, a21, rs_A ); } if ( m_ahead > 0 ) { // FLA_Househ2_UT( FLA_LEFT, // a21_t, // a21_b, tau11 ); FLA_Househ2_UT_l_opc( m_ahead - 1, a21_t, a21_b, rs_A, tau11 ); // FLA_Copy( a21, u21 ); bli_ccopyv( BLIS_NO_CONJUGATE, m_ahead, a21, rs_A, u21, rs_U ); // FLA_Set( FLA_ONE, u21_t ); *u21_t = *buff_1; // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_ONE, A02, u21, FLA_ZERO, z01 ); // FLA_Dot( a12t, u21, zeta11 ); // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_ONE, A22, u21, FLA_ZERO, z21 ); bli_cgemv( BLIS_NO_TRANSPOSE, BLIS_NO_CONJUGATE, m_behind, n_ahead, buff_1, A02, rs_A, cs_A, u21, rs_U, buff_0, z01, rs_Z ); bli_cdot( BLIS_NO_CONJUGATE, m_ahead, a12t, cs_A, u21, rs_U, zeta11 ); bli_cgemv( BLIS_NO_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_ahead, buff_1, A22, rs_A, cs_A, u21, rs_U, buff_0, z21, rs_Z ); // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, U20, u21, FLA_ZERO, t01 ); bli_cgemv( BLIS_CONJ_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_behind, buff_1, U20, rs_U, cs_U, u21, rs_U, buff_0, t01, rs_T ); } /*------------------------------------------------------------*/ } // FLA_Obj_free( &w ); FLA_free( buff_w ); return FLA_SUCCESS; }
FLA_Error FLA_Hess_UT_step_opd_var1 | ( | int | m_A, |
int | m_T, | ||
double * | buff_A, | ||
int | rs_A, | ||
int | cs_A, | ||
double * | buff_T, | ||
int | rs_T, | ||
int | cs_T | ||
) |
References bli_dgemv(), BLIS_CONJ_TRANSPOSE, BLIS_NO_CONJUGATE, FLA_Apply_H2_UT_l_opd_var1(), FLA_Apply_H2_UT_r_opd_var1(), FLA_Househ2_UT_l_opd(), FLA_ONE, and FLA_ZERO.
Referenced by FLA_Hess_UT_step_opt_var1().
{ double* buff_1 = FLA_DOUBLE_PTR( FLA_ONE ); double* buff_0 = FLA_DOUBLE_PTR( FLA_ZERO ); double first_elem; int i; // b_alg = FLA_Obj_length( T ); int b_alg = m_T; for ( i = 0; i < b_alg; ++i ) { double* A20 = buff_A + (0 )*cs_A + (i+1)*rs_A; double* a21 = buff_A + (i )*cs_A + (i+1)*rs_A; double* a21_t = buff_A + (i )*cs_A + (i+1)*rs_A; double* a21_b = buff_A + (i )*cs_A + (i+2)*rs_A; double* A22_t = buff_A + (i+1)*cs_A + (i+1)*rs_A; double* A22_b = buff_A + (i+1)*cs_A + (i+2)*rs_A; double* A2_l = buff_A + (i+1)*cs_A + (0 )*rs_A; double* A2_r = buff_A + (i+2)*cs_A + (0 )*rs_A; double* t01 = buff_T + (i )*cs_T + (0 )*rs_T; double* tau11 = buff_T + (i )*cs_T + (i )*rs_T; int m_ahead = m_A - i - 1; int n_ahead = m_A - i - 1; int n_behind = i; /*------------------------------------------------------------*/ if ( m_ahead > 0 ) { // FLA_Househ2_UT( FLA_LEFT, // a21_t, // a21_b, tau11 ); FLA_Househ2_UT_l_opd( m_ahead - 1, a21_t, a21_b, rs_A, tau11 ); // FLA_Copy( a21_t, first_elem ); // FLA_Set( FLA_ONE, a21_t ); first_elem = *a21_t; *a21_t = *buff_1; // FLA_Apply_H2_UT( FLA_LEFT, tau11, a21_b, A22_t, // A22_b ); FLA_Apply_H2_UT_l_opd_var1( m_ahead - 1, n_ahead, tau11, a21_b, rs_A, A22_t, cs_A, A22_b, rs_A, cs_A ); // FLA_Apply_H2_UT( FLA_RIGHT, tau11, a21_b, A2_l, A2_r ); FLA_Apply_H2_UT_r_opd_var1( m_A, n_ahead - 1, tau11, a21_b, rs_A, A2_l, rs_A, A2_r, rs_A, cs_A ); // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A20, a21, FLA_ZERO, t01 ); bli_dgemv( BLIS_CONJ_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_behind, buff_1, A20, rs_A, cs_A, a21, rs_A, buff_0, t01, rs_T ); // FLA_Copy( first_elem, a21_t ); *a21_t = first_elem; } /*------------------------------------------------------------*/ } return FLA_SUCCESS; }
FLA_Error FLA_Hess_UT_step_opd_var2 | ( | int | m_A, |
int | m_T, | ||
double * | buff_A, | ||
int | rs_A, | ||
int | cs_A, | ||
double * | buff_T, | ||
int | rs_T, | ||
int | cs_T | ||
) |
References bli_daxpyv(), bli_ddot(), bli_dgemv(), bli_dger(), bli_dscalv(), BLIS_CONJ_TRANSPOSE, BLIS_CONJUGATE, BLIS_NO_CONJUGATE, BLIS_NO_TRANSPOSE, FLA_free(), FLA_Househ2_UT_l_opd(), FLA_malloc(), FLA_MINUS_ONE, FLA_ONE, FLA_TWO, and FLA_ZERO.
Referenced by FLA_Hess_UT_step_opt_var2().
{ double* buff_2 = FLA_DOUBLE_PTR( FLA_TWO ); double* buff_1 = FLA_DOUBLE_PTR( FLA_ONE ); double* buff_0 = FLA_DOUBLE_PTR( FLA_ZERO ); double* buff_m1 = FLA_DOUBLE_PTR( FLA_MINUS_ONE ); double first_elem; double dot_product; double beta, conj_beta; double inv_tau11; double minus_inv_tau11; int i; // b_alg = FLA_Obj_length( T ); int b_alg = m_T; // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &y ); // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &z ); double* buff_y = ( double* ) FLA_malloc( m_A * sizeof( *buff_A ) ); double* buff_z = ( double* ) FLA_malloc( m_A * sizeof( *buff_A ) ); int inc_y = 1; int inc_z = 1; for ( i = 0; i < b_alg; ++i ) { double* A20 = buff_A + (0 )*cs_A + (i+1)*rs_A; double* a21 = buff_A + (i )*cs_A + (i+1)*rs_A; double* A02 = buff_A + (i+1)*cs_A + (0 )*rs_A; double* a12t = buff_A + (i+1)*cs_A + (i )*rs_A; double* A22 = buff_A + (i+1)*cs_A + (i+1)*rs_A; double* t01 = buff_T + (i )*cs_T + (0 )*rs_T; double* tau11 = buff_T + (i )*cs_T + (i )*rs_T; double* y0 = buff_y + (0 )*inc_y; double* y2 = buff_y + (i+1)*inc_y; double* z2 = buff_z + (i+1)*inc_z; double* a21_t = a21 + (0 )*cs_A + (0 )*rs_A; double* a21_b = a21 + (0 )*cs_A + (1 )*rs_A; int m_ahead = m_A - i - 1; int n_ahead = m_A - i - 1; int m_behind = i; int n_behind = i; /*------------------------------------------------------------*/ if ( m_ahead > 0 ) { // FLA_Househ2_UT( FLA_LEFT, // a21_t, // a21_b, tau11 ); FLA_Househ2_UT_l_opd( m_ahead - 1, a21_t, a21_b, rs_A, tau11 ); // FLA_Set( FLA_ONE, inv_tau11 ); // FLA_Inv_scalc( FLA_NO_CONJUGATE, tau11, inv_tau11 ); // FLA_Copy( inv_tau11, minus_inv_tau11 ); // FLA_Scal( FLA_MINUS_ONE, minus_inv_tau11 ); bli_ddiv3( buff_1, tau11, &inv_tau11 ); bli_dneg2( &inv_tau11, &minus_inv_tau11 ); // FLA_Copy( a21_t, first_elem ); // FLA_Set( FLA_ONE, a21_t ); first_elem = *a21_t; *a21_t = *buff_1; // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A22, a21, FLA_ZERO, y2 ); bli_dgemv( BLIS_CONJ_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_ahead, buff_1, A22, rs_A, cs_A, 
a21, rs_A, buff_0, y2, inc_y ); // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_ONE, A22, a21, FLA_ZERO, z2 ); bli_dgemv( BLIS_NO_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_ahead, buff_1, A22, rs_A, cs_A, a21, rs_A, buff_0, z2, inc_z ); // FLA_Dotc( FLA_CONJUGATE, a21, z2, beta ); // FLA_Inv_scal( FLA_TWO, beta ); // FLA_Copyt( FLA_CONJ_NO_TRANSPOSE, beta, conj_beta ); bli_ddot( BLIS_CONJUGATE, m_ahead, a21, rs_A, z2, inc_z, &beta ); bli_dinvscals( buff_2, &beta ); bli_dcopyconj( &beta, &conj_beta ); // FLA_Scal( minus_inv_tau11, conj_beta ); // FLA_Axpy( conj_beta, a21, y2 ); // FLA_Scal( inv_tau11, y2 ); bli_dscals( &minus_inv_tau11, &conj_beta ); bli_daxpyv( BLIS_NO_CONJUGATE, m_ahead, &conj_beta, a21, rs_A, y2, inc_y ); bli_dscalv( BLIS_NO_CONJUGATE, m_ahead, &inv_tau11, y2, inc_y ); // FLA_Scal( minus_inv_tau11, beta ); // FLA_Axpy( beta, a21, z2 ); // FLA_Scal( inv_tau11, z2 ); bli_dscals( &minus_inv_tau11, &beta ); bli_daxpyv( BLIS_NO_CONJUGATE, m_ahead, &beta, a21, rs_A, z2, inc_z ); bli_dscalv( BLIS_NO_CONJUGATE, m_ahead, &inv_tau11, z2, inc_z ); // FLA_Dot( a12t, a21, dot_product ); // FLA_Scal( minus_inv_tau11, dot_product ); // FLA_Axpyt( FLA_CONJ_TRANSPOSE, dot_product, a21, a12t ); bli_ddot( BLIS_NO_CONJUGATE, m_ahead, a12t, cs_A, a21, rs_A, &dot_product ); bli_dscals( &minus_inv_tau11, &dot_product ); bli_daxpyv( BLIS_CONJUGATE, m_ahead, &dot_product, a21, rs_A, a12t, cs_A ); // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_ONE, A02, a21, FLA_ZERO, y0 ); // FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, minus_inv_tau11, y0, a21, A02 ); bli_dgemv( BLIS_NO_TRANSPOSE, BLIS_NO_CONJUGATE, m_behind, n_ahead, buff_1, A02, rs_A, cs_A, a21, rs_A, buff_0, y0, inc_y ); bli_dger( BLIS_NO_CONJUGATE, BLIS_CONJUGATE, m_behind, n_ahead, &minus_inv_tau11, y0, inc_y, a21, rs_A, A02, rs_A, cs_A ); // FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, FLA_MINUS_ONE, a21, y2, A22 ); // FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, FLA_MINUS_ONE, z2, a21, A22 ); bli_dger( BLIS_NO_CONJUGATE, BLIS_CONJUGATE, m_ahead, 
n_ahead, buff_m1, a21, rs_A, y2, inc_y, A22, rs_A, cs_A ); bli_dger( BLIS_NO_CONJUGATE, BLIS_CONJUGATE, m_ahead, n_ahead, buff_m1, z2, inc_z, a21, rs_A, A22, rs_A, cs_A ); // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A20, a21, FLA_ZERO, t01 ); bli_dgemv( BLIS_CONJ_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_behind, buff_1, A20, rs_A, cs_A, a21, rs_A, buff_0, t01, rs_T ); // FLA_Copy( first_elem, a21_t ); *a21_t = first_elem; } /*------------------------------------------------------------*/ } // FLA_Obj_free( &y ); // FLA_Obj_free( &z ); FLA_free( buff_y ); FLA_free( buff_z ); return FLA_SUCCESS; }
FLA_Error FLA_Hess_UT_step_opd_var3 | ( | int | m_A, |
int | m_T, | ||
double * | buff_A, | ||
int | rs_A, | ||
int | cs_A, | ||
double * | buff_T, | ||
int | rs_T, | ||
int | cs_T | ||
) |
References bli_daxpyv(), bli_dcopyv(), bli_ddot(), bli_dgemv(), bli_dger(), bli_dscalv(), BLIS_CONJ_TRANSPOSE, BLIS_CONJUGATE, BLIS_NO_CONJUGATE, BLIS_NO_TRANSPOSE, FLA_free(), FLA_Househ2_UT_l_opd(), FLA_malloc(), FLA_MINUS_ONE, FLA_ONE, FLA_TWO, and FLA_ZERO.
Referenced by FLA_Hess_UT_step_opt_var3().
{ double* buff_2 = FLA_DOUBLE_PTR( FLA_TWO ); double* buff_1 = FLA_DOUBLE_PTR( FLA_ONE ); double* buff_0 = FLA_DOUBLE_PTR( FLA_ZERO ); double* buff_m1 = FLA_DOUBLE_PTR( FLA_MINUS_ONE ); double first_elem; double dot_product; double beta, conj_beta; double inv_tau11; double minus_inv_tau11; double minus_upsilon1, minus_conj_upsilon1; double minus_psi1, minus_conj_psi1; double minus_zeta1; int i; // b_alg = FLA_Obj_length( T ); int b_alg = m_T; // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &u ); // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &y ); // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &z ); // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &v ); // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &w ); double* buff_u = ( double* ) FLA_malloc( m_A * sizeof( *buff_A ) ); double* buff_y = ( double* ) FLA_malloc( m_A * sizeof( *buff_A ) ); double* buff_z = ( double* ) FLA_malloc( m_A * sizeof( *buff_A ) ); double* buff_v = ( double* ) FLA_malloc( m_A * sizeof( *buff_A ) ); double* buff_w = ( double* ) FLA_malloc( m_A * sizeof( *buff_A ) ); int inc_u = 1; int inc_y = 1; int inc_z = 1; int inc_v = 1; int inc_w = 1; // Initialize some variables (only to prevent compiler warnings). 
first_elem = *buff_0; minus_inv_tau11 = *buff_0; for ( i = 0; i < b_alg; ++i ) { double* A20 = buff_A + (0 )*cs_A + (i+1)*rs_A; double* alpha11 = buff_A + (i )*cs_A + (i )*rs_A; double* a21 = buff_A + (i )*cs_A + (i+1)*rs_A; double* A02 = buff_A + (i+1)*cs_A + (0 )*rs_A; double* a12t = buff_A + (i+1)*cs_A + (i )*rs_A; double* A22 = buff_A + (i+1)*cs_A + (i+1)*rs_A; double* t01 = buff_T + (i )*cs_T + (0 )*rs_T; double* tau11 = buff_T + (i )*cs_T + (i )*rs_T; double* upsilon1 = buff_u + (i )*inc_u; double* u2 = buff_u + (i+1)*inc_u; double* y0 = buff_y + (0 )*inc_y; double* psi1 = buff_y + (i )*inc_y; double* y2 = buff_y + (i+1)*inc_y; double* zeta1 = buff_z + (i )*inc_z; double* z2 = buff_z + (i+1)*inc_z; double* v2 = buff_v + (i+1)*inc_v; double* w2 = buff_w + (i+1)*inc_w; double* a21_t = a21 + (0 )*cs_A + (0 )*rs_A; double* a21_b = a21 + (0 )*cs_A + (1 )*rs_A; int m_ahead = m_A - i - 1; int n_ahead = m_A - i - 1; int m_behind = i; int n_behind = i; /*------------------------------------------------------------*/ if ( m_behind > 0 ) { // FLA_Copy( upsilon1, minus_upsilon1 ); // FLA_Scal( FLA_MINUS_ONE, minus_upsilon1 ); // FLA_Copyt( FLA_CONJ_NO_TRANSPOSE, minus_upsilon1, minus_conj_upsilon1 ); bli_dmult3( buff_m1, upsilon1, &minus_upsilon1 ); bli_dcopyconj( &minus_upsilon1, &minus_conj_upsilon1 ); // FLA_Copy( psi1, minus_psi1 ); // FLA_Scal( FLA_MINUS_ONE, minus_psi1 ); // FLA_Copyt( FLA_CONJ_NO_TRANSPOSE, minus_psi1, minus_conj_psi1 ); bli_dmult3( buff_m1, psi1, &minus_psi1 ); bli_dcopyconj( &minus_psi1, &minus_conj_psi1 ); // FLA_Copy( zeta1, minus_zeta1 ); // FLA_Scal( FLA_MINUS_ONE, minus_zeta1 ); bli_dmult3( buff_m1, zeta1, &minus_zeta1 ); // FLA_Axpyt( FLA_CONJ_NO_TRANSPOSE, minus_upsilon1, psi1, alpha11 ); // FLA_Axpyt( FLA_CONJ_NO_TRANSPOSE, minus_zeta1, upsilon1, alpha11 ); bli_daxpyv( BLIS_CONJUGATE, 1, &minus_upsilon1, psi1, 1, alpha11, 1 ); bli_daxpyv( BLIS_CONJUGATE, 1, &minus_zeta1, upsilon1, 1, alpha11, 1 ); // FLA_Axpyt( FLA_CONJ_TRANSPOSE, 
minus_upsilon1, y2, a12t ); // FLA_Axpyt( FLA_CONJ_TRANSPOSE, minus_zeta1, u2, a12t ); bli_daxpyv( BLIS_CONJUGATE, m_ahead, &minus_upsilon1, y2, inc_y, a12t, cs_A ); bli_daxpyv( BLIS_CONJUGATE, m_ahead, &minus_zeta1, u2, inc_u, a12t, cs_A ); // FLA_Axpyt( FLA_NO_TRANSPOSE, minus_conj_psi1, u2, a21 ); // FLA_Axpyt( FLA_NO_TRANSPOSE, minus_conj_upsilon1, z2, a21 ); bli_daxpyv( BLIS_NO_CONJUGATE, m_ahead, &minus_conj_psi1, u2, inc_u, a21, rs_A ); bli_daxpyv( BLIS_NO_CONJUGATE, m_ahead, &minus_conj_upsilon1, z2, inc_z, a21, rs_A ); } if ( m_ahead > 0 ) { // FLA_Househ2_UT( FLA_LEFT, // a21_t, // a21_b, tau11 ); FLA_Househ2_UT_l_opd( m_ahead - 1, a21_t, a21_b, rs_A, tau11 ); // FLA_Set( FLA_ONE, inv_tau11 ); // FLA_Inv_scalc( FLA_NO_CONJUGATE, tau11, inv_tau11 ); // FLA_Copy( inv_tau11, minus_inv_tau11 ); // FLA_Scal( FLA_MINUS_ONE, minus_inv_tau11 ); bli_ddiv3( buff_1, tau11, &inv_tau11 ); bli_dneg2( &inv_tau11, &minus_inv_tau11 ); // FLA_Copy( a21_t, first_elem ); // FLA_Set( FLA_ONE, a21_t ); first_elem = *a21_t; *a21_t = *buff_1; } if ( m_behind > 0 ) { // FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, FLA_MINUS_ONE, u2, y2, A22 ); // FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, FLA_MINUS_ONE, z2, u2, A22 ); bli_dger( BLIS_NO_CONJUGATE, BLIS_CONJUGATE, m_ahead, n_ahead, buff_m1, u2, inc_u, y2, inc_y, A22, rs_A, cs_A ); bli_dger( BLIS_NO_CONJUGATE, BLIS_CONJUGATE, m_ahead, n_ahead, buff_m1, z2, inc_z, u2, inc_u, A22, rs_A, cs_A ); } if ( m_ahead > 0 ) { // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A22, a21, FLA_ZERO, v2 ); bli_dgemv( BLIS_CONJ_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_ahead, buff_1, A22, rs_A, cs_A, a21, rs_A, buff_0, v2, inc_v ); // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_ONE, A22, a21, FLA_ZERO, w2 ); bli_dgemv( BLIS_NO_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_ahead, buff_1, A22, rs_A, cs_A, a21, rs_A, buff_0, w2, inc_w ); // FLA_Copy( a21, u2 ); // FLA_Copy( v2, y2 ); // FLA_Copy( w2, z2 ); bli_dcopyv( BLIS_NO_CONJUGATE, m_ahead, a21, rs_A, u2, inc_u ); bli_dcopyv( 
BLIS_NO_CONJUGATE, m_ahead, v2, inc_v, y2, inc_y ); bli_dcopyv( BLIS_NO_CONJUGATE, m_ahead, w2, inc_w, z2, inc_z ); // FLA_Dotc( FLA_CONJUGATE, a21, z2, beta ); // FLA_Inv_scal( FLA_TWO, beta ); // FLA_Copyt( FLA_CONJ_NO_TRANSPOSE, beta, conj_beta ); bli_ddot( BLIS_CONJUGATE, m_ahead, a21, rs_A, z2, inc_z, &beta ); bli_dinvscals( buff_2, &beta ); bli_dcopyconj( &beta, &conj_beta ); // FLA_Scal( minus_inv_tau11, conj_beta ); // FLA_Axpy( conj_beta, a21, y2 ); // FLA_Scal( inv_tau11, y2 ); bli_dscals( &minus_inv_tau11, &conj_beta ); bli_daxpyv( BLIS_NO_CONJUGATE, m_ahead, &conj_beta, a21, rs_A, y2, inc_y ); bli_dscalv( BLIS_NO_CONJUGATE, m_ahead, &inv_tau11, y2, inc_y ); // FLA_Scal( minus_inv_tau11, beta ); // FLA_Axpy( beta, a21, z2 ); // FLA_Scal( inv_tau11, z2 ); bli_dscals( &minus_inv_tau11, &beta ); bli_daxpyv( BLIS_NO_CONJUGATE, m_ahead, &beta, a21, rs_A, z2, inc_z ); bli_dscalv( BLIS_NO_CONJUGATE, m_ahead, &inv_tau11, z2, inc_z ); // FLA_Dot( a12t, a21, dot_product ); // FLA_Scal( minus_inv_tau11, dot_product ); // FLA_Axpyt( FLA_CONJ_TRANSPOSE, dot_product, a21, a12t ); bli_ddot( BLIS_NO_CONJUGATE, m_ahead, a12t, cs_A, a21, rs_A, &dot_product ); bli_dscals( &minus_inv_tau11, &dot_product ); bli_daxpyv( BLIS_CONJUGATE, m_ahead, &dot_product, a21, rs_A, a12t, cs_A ); // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_ONE, A02, a21, FLA_ZERO, y0 ); // FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, minus_inv_tau11, y0, a21, A02 ); bli_dgemv( BLIS_NO_TRANSPOSE, BLIS_NO_CONJUGATE, m_behind, n_ahead, buff_1, A02, rs_A, cs_A, a21, rs_A, buff_0, y0, inc_y ); bli_dger( BLIS_NO_CONJUGATE, BLIS_CONJUGATE, m_behind, n_ahead, &minus_inv_tau11, y0, inc_y, a21, rs_A, A02, rs_A, cs_A ); // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A20, a21, FLA_ZERO, t01 ); bli_dgemv( BLIS_CONJ_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_behind, buff_1, A20, rs_A, cs_A, a21, rs_A, buff_0, t01, rs_T ); // FLA_Copy( first_elem, a21_t ); *a21_t = first_elem; } if ( m_behind + 1 == b_alg && m_ahead > 0 ) { // FLA_Gerc( 
FLA_NO_CONJUGATE, FLA_CONJUGATE, FLA_MINUS_ONE, u2, y2, A22 ); // FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, FLA_MINUS_ONE, z2, u2, A22 ); bli_dger( BLIS_NO_CONJUGATE, BLIS_CONJUGATE, m_ahead, n_ahead, buff_m1, u2, inc_u, y2, inc_y, A22, rs_A, cs_A ); bli_dger( BLIS_NO_CONJUGATE, BLIS_CONJUGATE, m_ahead, n_ahead, buff_m1, z2, inc_z, u2, inc_u, A22, rs_A, cs_A ); } /*------------------------------------------------------------*/ } // FLA_Obj_free( &u ); // FLA_Obj_free( &y ); // FLA_Obj_free( &z ); // FLA_Obj_free( &v ); // FLA_Obj_free( &w ); FLA_free( buff_u ); FLA_free( buff_y ); FLA_free( buff_z ); FLA_free( buff_v ); FLA_free( buff_w ); return FLA_SUCCESS; }
FLA_Error FLA_Hess_UT_step_opd_var4 | ( | int | m_A, |
int | m_T, | ||
double * | buff_A, | ||
int | rs_A, | ||
int | cs_A, | ||
double * | buff_Y, | ||
int | rs_Y, | ||
int | cs_Y, | ||
double * | buff_Z, | ||
int | rs_Z, | ||
int | cs_Z, | ||
double * | buff_T, | ||
int | rs_T, | ||
int | cs_T | ||
) |
References bli_daxpyv(), bli_dcopyv(), bli_ddot(), bli_dgemv(), bli_dger(), bli_dscalv(), bli_dsetm(), BLIS_CONJ_NO_TRANSPOSE, BLIS_CONJ_TRANSPOSE, BLIS_CONJUGATE, BLIS_NO_CONJUGATE, BLIS_NO_TRANSPOSE, FLA_free(), FLA_Househ2_UT_l_opd(), FLA_malloc(), FLA_MINUS_ONE, FLA_ONE, FLA_TWO, and FLA_ZERO.
Referenced by FLA_Hess_UT_step_opt_var4().
{ double* buff_2 = FLA_DOUBLE_PTR( FLA_TWO ); double* buff_1 = FLA_DOUBLE_PTR( FLA_ONE ); double* buff_0 = FLA_DOUBLE_PTR( FLA_ZERO ); double* buff_m1 = FLA_DOUBLE_PTR( FLA_MINUS_ONE ); double first_elem, last_elem; double dot_product; double beta, conj_beta; double inv_tau11; double minus_inv_tau11; int i; // b_alg = FLA_Obj_length( T ); int b_alg = m_T; // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &d ); // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &e ); // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &f ); double* buff_d = ( double* ) FLA_malloc( m_A * sizeof( *buff_A ) ); double* buff_e = ( double* ) FLA_malloc( m_A * sizeof( *buff_A ) ); double* buff_f = ( double* ) FLA_malloc( m_A * sizeof( *buff_A ) ); int inc_d = 1; int inc_e = 1; int inc_f = 1; // FLA_Set( FLA_ZERO, Y ); // FLA_Set( FLA_ZERO, Z ); bli_dsetm( m_A, b_alg, buff_0, buff_Y, rs_Y, cs_Y ); bli_dsetm( m_A, b_alg, buff_0, buff_Z, rs_Z, cs_Z ); for ( i = 0; i < b_alg; ++i ) { double* a10t = buff_A + (0 )*cs_A + (i )*rs_A; double* A20 = buff_A + (0 )*cs_A + (i+1)*rs_A; double* alpha11 = buff_A + (i )*cs_A + (i )*rs_A; double* a21 = buff_A + (i )*cs_A + (i+1)*rs_A; double* A02 = buff_A + (i+1)*cs_A + (0 )*rs_A; double* a12t = buff_A + (i+1)*cs_A + (i )*rs_A; double* A22 = buff_A + (i+1)*cs_A + (i+1)*rs_A; double* y10t = buff_Y + (0 )*cs_Y + (i )*rs_Y; double* Y20 = buff_Y + (0 )*cs_Y + (i+1)*rs_Y; double* y21 = buff_Y + (i )*cs_Y + (i+1)*rs_Y; double* z10t = buff_Z + (0 )*cs_Z + (i )*rs_Z; double* Z20 = buff_Z + (0 )*cs_Z + (i+1)*rs_Z; double* z21 = buff_Z + (i )*cs_Z + (i+1)*rs_Z; double* t01 = buff_T + (i )*cs_T + (0 )*rs_T; double* tau11 = buff_T + (i )*cs_T + (i )*rs_T; double* d0 = buff_d + (0 )*inc_d; double* e0 = buff_e + (0 )*inc_e; double* f0 = buff_f + (0 )*inc_f; double* a10t_r = a10t + (i-1)*cs_A + (0 )*rs_A; double* a21_t = a21 + (0 )*cs_A + (0 )*rs_A; double* a21_b = a21 + (0 )*cs_A + (1 )*rs_A; double* ABL = a10t; double* ZBL = z10t; double* a2 = alpha11; int m_ahead = m_A - i - 1; int 
n_ahead = m_A - i - 1; int m_behind = i; int n_behind = i; /*------------------------------------------------------------*/ if ( m_behind > 0 ) { // FLA_Copy( a10t_r, last_elem ); // FLA_Set( FLA_ONE, a10t_r ); last_elem = *a10t_r; *a10t_r = *buff_1; } // FLA_Gemvc( FLA_NO_TRANSPOSE, FLA_CONJUGATE, FLA_MINUS_ONE, ABL, y10t, FLA_ONE, a2 ); // FLA_Gemvc( FLA_NO_TRANSPOSE, FLA_CONJUGATE, FLA_MINUS_ONE, ZBL, a10t, FLA_ONE, a2 ); bli_dgemv( BLIS_NO_TRANSPOSE, BLIS_CONJUGATE, m_ahead + 1, n_behind, buff_m1, ABL, rs_A, cs_A, y10t, cs_Y, buff_1, a2, rs_A ); bli_dgemv( BLIS_NO_TRANSPOSE, BLIS_CONJUGATE, m_ahead + 1, n_behind, buff_m1, ZBL, rs_Z, cs_Z, a10t, cs_A, buff_1, a2, rs_A ); // FLA_Gemv( FLA_CONJ_NO_TRANSPOSE, FLA_MINUS_ONE, Y20, a10t, FLA_ONE, a12t ); // FLA_Gemv( FLA_CONJ_NO_TRANSPOSE, FLA_MINUS_ONE, A20, z10t, FLA_ONE, a12t ); bli_dgemv( BLIS_CONJ_NO_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_behind, buff_m1, Y20, rs_Y, cs_Y, a10t, cs_A, buff_1, a12t, cs_A ); bli_dgemv( BLIS_CONJ_NO_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_behind, buff_m1, A20, rs_A, cs_A, z10t, cs_Z, buff_1, a12t, cs_A ); if ( m_behind > 0 ) { // FLA_Copy( last_elem, a10t_r ); *a10t_r = last_elem; } if ( m_ahead > 0 ) { // FLA_Househ2_UT( FLA_LEFT, // a21_t, // a21_b, tau11 ); FLA_Househ2_UT_l_opd( m_ahead - 1, a21_t, a21_b, rs_A, tau11 ); // FLA_Set( FLA_ONE, inv_tau11 ); // FLA_Inv_scalc( FLA_NO_CONJUGATE, tau11, inv_tau11 ); // FLA_Copy( inv_tau11, minus_inv_tau11 ); // FLA_Scal( FLA_MINUS_ONE, minus_inv_tau11 ); bli_ddiv3( buff_1, tau11, &inv_tau11 ); bli_dneg2( &inv_tau11, &minus_inv_tau11 ); // FLA_Copy( a21_t, first_elem ); // FLA_Set( FLA_ONE, a21_t ); first_elem = *a21_t; *a21_t = *buff_1; // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A22, a21, FLA_ZERO, y21 ); bli_dgemv( BLIS_CONJ_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_ahead, buff_1, A22, rs_A, cs_A, a21, rs_A, buff_0, y21, rs_Y ); // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_ONE, A22, a21, FLA_ZERO, z21 ); bli_dgemv( BLIS_NO_TRANSPOSE, 
BLIS_NO_CONJUGATE, m_ahead, n_ahead, buff_1, A22, rs_A, cs_A, a21, rs_A, buff_0, z21, rs_Z ); // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A20, a21, FLA_ZERO, d0 ); // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, Y20, a21, FLA_ZERO, e0 ); // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, Z20, a21, FLA_ZERO, f0 ); bli_dgemv( BLIS_CONJ_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_behind, buff_1, A20, rs_A, cs_A, a21, rs_A, buff_0, d0, inc_d ); bli_dgemv( BLIS_CONJ_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_behind, buff_1, Y20, rs_Y, cs_Y, a21, rs_A, buff_0, e0, inc_e ); bli_dgemv( BLIS_CONJ_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_behind, buff_1, Z20, rs_Z, cs_Z, a21, rs_A, buff_0, f0, inc_f ); // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, Y20, d0, FLA_ONE, y21 ); // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, A20, f0, FLA_ONE, y21 ); bli_dgemv( BLIS_NO_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_behind, buff_m1, Y20, rs_Y, cs_Y, d0, inc_d, buff_1, y21, rs_Y ); bli_dgemv( BLIS_NO_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_behind, buff_m1, A20, rs_A, cs_A, f0, inc_f, buff_1, y21, rs_Y ); // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, A20, e0, FLA_ONE, z21 ); // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, Z20, d0, FLA_ONE, z21 ); bli_dgemv( BLIS_NO_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_behind, buff_m1, A20, rs_A, cs_A, e0, inc_e, buff_1, z21, rs_Z ); bli_dgemv( BLIS_NO_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_behind, buff_m1, Z20, rs_Z, cs_Z, d0, inc_d, buff_1, z21, rs_Z ); // FLA_Copy( d0, t01 ); bli_dcopyv( BLIS_NO_CONJUGATE, n_behind, d0, inc_d, t01, rs_T ); // FLA_Dotc( FLA_CONJUGATE, a21, z21, beta ); // FLA_Inv_scal( FLA_TWO, beta ); // FLA_Copyt( FLA_CONJ_NO_TRANSPOSE, beta, conj_beta ); bli_ddot( BLIS_CONJUGATE, m_ahead, a21, rs_A, z21, rs_Z, &beta ); bli_dinvscals( buff_2, &beta ); bli_dcopyconj( &beta, &conj_beta ); // FLA_Scal( minus_inv_tau11, conj_beta ); // FLA_Axpy( conj_beta, a21, y21 ); // FLA_Scal( inv_tau11, y21 ); bli_dscals( &minus_inv_tau11, &conj_beta ); bli_daxpyv( 
BLIS_NO_CONJUGATE, m_ahead, &conj_beta, a21, rs_A, y21, rs_Y ); bli_dscalv( BLIS_NO_CONJUGATE, m_ahead, &inv_tau11, y21, rs_Y ); // FLA_Scal( minus_inv_tau11, beta ); // FLA_Axpy( beta, a21, z21 ); // FLA_Scal( inv_tau11, z21 ); bli_dscals( &minus_inv_tau11, &beta ); bli_daxpyv( BLIS_NO_CONJUGATE, m_ahead, &beta, a21, rs_A, z21, rs_Z ); bli_dscalv( BLIS_NO_CONJUGATE, m_ahead, &inv_tau11, z21, rs_Z ); // FLA_Dot( a12t, a21, dot_product ); // FLA_Scal( minus_inv_tau11, dot_product ); // FLA_Axpyt( FLA_CONJ_TRANSPOSE, dot_product, a21, a12t ); bli_ddot( BLIS_NO_CONJUGATE, m_ahead, a12t, cs_A, a21, rs_A, &dot_product ); bli_dscals( &minus_inv_tau11, &dot_product ); bli_daxpyv( BLIS_CONJUGATE, m_ahead, &dot_product, a21, rs_A, a12t, cs_A ); // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_ONE, A02, a21, FLA_ZERO, e0 ); // FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, minus_inv_tau11, e0, a21, A02 ); bli_dgemv( BLIS_NO_TRANSPOSE, BLIS_NO_CONJUGATE, m_behind, n_ahead, buff_1, A02, rs_A, cs_A, a21, rs_A, buff_0, e0, inc_e ); bli_dger( BLIS_NO_CONJUGATE, BLIS_CONJUGATE, m_behind, n_ahead, &minus_inv_tau11, e0, inc_e, a21, rs_A, A02, rs_A, cs_A ); // FLA_Copy( first_elem, a21_t ); *a21_t = first_elem; } /*------------------------------------------------------------*/ } // FLA_Obj_free( &d ); // FLA_Obj_free( &e ); // FLA_Obj_free( &f ); FLA_free( buff_d ); FLA_free( buff_e ); FLA_free( buff_f ); return FLA_SUCCESS; }
FLA_Error FLA_Hess_UT_step_opd_var5 | ( | int | m_A, |
int | m_T, | ||
double * | buff_A, | ||
int | rs_A, | ||
int | cs_A, | ||
double * | buff_U, | ||
int | rs_U, | ||
int | cs_U, | ||
double * | buff_Z, | ||
int | rs_Z, | ||
int | cs_Z, | ||
double * | buff_T, | ||
int | rs_T, | ||
int | cs_T | ||
) |
References bli_daxpyv(), bli_dcopyv(), bli_ddot(), bli_ddots(), bli_dgemv(), bli_dsetm(), bli_dtrmv(), bli_dtrmvsx(), bli_dtrsv(), BLIS_CONJ_TRANSPOSE, BLIS_CONJUGATE, BLIS_LOWER_TRIANGULAR, BLIS_NO_CONJUGATE, BLIS_NO_TRANSPOSE, BLIS_NONUNIT_DIAG, BLIS_UPPER_TRIANGULAR, FLA_free(), FLA_Househ2_UT_l_opd(), FLA_malloc(), FLA_MINUS_ONE, FLA_ONE, and FLA_ZERO.
Referenced by FLA_Hess_UT_step_opt_var5().
{ double* buff_1 = FLA_DOUBLE_PTR( FLA_ONE ); double* buff_0 = FLA_DOUBLE_PTR( FLA_ZERO ); double* buff_m1 = FLA_DOUBLE_PTR( FLA_MINUS_ONE ); int i; // b_alg = FLA_Obj_length( T ); int b_alg = m_T; // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &w ); double* buff_w = ( double* ) FLA_malloc( m_A * sizeof( *buff_A ) ); int inc_w = 1; // FLA_Set( FLA_ZERO, U ); // FLA_Set( FLA_ZERO, Z ); bli_dsetm( m_A, b_alg, buff_0, buff_U, rs_U, cs_U ); bli_dsetm( m_A, b_alg, buff_0, buff_Z, rs_Z, cs_Z ); for ( i = 0; i < b_alg; ++i ) { double* a01 = buff_A + (i )*cs_A + (0 )*rs_A; double* alpha11 = buff_A + (i )*cs_A + (i )*rs_A; double* a21 = buff_A + (i )*cs_A + (i+1)*rs_A; double* A02 = buff_A + (i+1)*cs_A + (0 )*rs_A; double* a12t = buff_A + (i+1)*cs_A + (i )*rs_A; double* A22 = buff_A + (i+1)*cs_A + (i+1)*rs_A; double* U00 = buff_U + (0 )*cs_U + (0 )*rs_U; double* u10t = buff_U + (0 )*cs_U + (i )*rs_U; double* U20 = buff_U + (0 )*cs_U + (i+1)*rs_U; double* u21 = buff_U + (i )*cs_U + (i+1)*rs_U; double* Z00 = buff_Z + (0 )*cs_Z + (0 )*rs_Z; double* z10t = buff_Z + (0 )*cs_Z + (i )*rs_Z; double* Z20 = buff_Z + (0 )*cs_Z + (i+1)*rs_Z; double* z01 = buff_Z + (i )*cs_Z + (0 )*rs_Z; double* zeta11 = buff_Z + (i )*cs_Z + (i )*rs_Z; double* z21 = buff_Z + (i )*cs_Z + (i+1)*rs_Z; double* T00 = buff_T + (0 )*cs_T + (0 )*rs_T; double* t01 = buff_T + (i )*cs_T + (0 )*rs_T; double* tau11 = buff_T + (i )*cs_T + (i )*rs_T; double* w0 = buff_w + (0 )*inc_w; double* a21_t = a21 + (0 )*cs_A + (0 )*rs_A; double* a21_b = a21 + (0 )*cs_A + (1 )*rs_A; double* u21_t = u21 + (0 )*cs_U + (0 )*rs_U; int m_ahead = m_A - i - 1; int n_ahead = m_A - i - 1; int m_behind = i; int n_behind = i; /*------------------------------------------------------------*/ if ( m_behind > 0 ) { // FLA_Copyt( FLA_CONJ_TRANSPOSE, u10t, w0 ); // FLA_Trsv( FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG, // T00, w0 ); bli_dcopyv( BLIS_CONJUGATE, m_behind, u10t, cs_U, w0, inc_w ); bli_dtrsv( BLIS_UPPER_TRIANGULAR, 
BLIS_NO_TRANSPOSE, BLIS_NONUNIT_DIAG, m_behind, T00, rs_T, cs_T, w0, inc_w ); // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, Z00, w0, FLA_ONE, a01 ); // FLA_Dots( FLA_MINUS_ONE, z10t, w0, FLA_ONE, alpha11 ); // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, Z20, w0, FLA_ONE, a21 ); bli_dgemv( BLIS_NO_TRANSPOSE, BLIS_NO_CONJUGATE, m_behind, n_behind, buff_m1, Z00, rs_Z, cs_Z, w0, inc_w, buff_1, a01, rs_A ); bli_ddots( BLIS_NO_CONJUGATE, m_behind, buff_m1, z10t, cs_Z, w0, inc_w, buff_1, alpha11 ); bli_dgemv( BLIS_NO_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_behind, buff_m1, Z20, rs_Z, cs_Z, w0, inc_w, buff_1, a21, rs_A ); // FLA_Trmvsx( FLA_LOWER_TRIANGULAR, FLA_CONJ_TRANSPOSE, FLA_NONUNIT_DIAG, // FLA_ONE, U00, a01, FLA_ZERO, w0 ); // FLA_Axpyt( FLA_CONJ_TRANSPOSE, alpha11, u10t, w0 ); // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, U20, a21, FLA_ONE, w0 ); bli_dcopyv( BLIS_NO_CONJUGATE, m_behind, a01, rs_A, w0, inc_w ); bli_dtrmv( BLIS_LOWER_TRIANGULAR, BLIS_CONJ_TRANSPOSE, BLIS_NONUNIT_DIAG, m_behind, U00, rs_U, cs_U, w0, inc_w ); bli_daxpyv( BLIS_CONJUGATE, m_behind, alpha11, u10t, cs_U, w0, inc_w ); bli_dgemv( BLIS_CONJ_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_behind, buff_1, U20, rs_U, cs_U, a21, rs_A, buff_1, w0, inc_w ); // FLA_Trsv( FLA_UPPER_TRIANGULAR, FLA_CONJ_TRANSPOSE, FLA_NONUNIT_DIAG, // T00, w0 ); bli_dtrsv( BLIS_UPPER_TRIANGULAR, BLIS_CONJ_TRANSPOSE, BLIS_NONUNIT_DIAG, m_behind, T00, rs_T, cs_T, w0, inc_w ); // FLA_Trmvsx( FLA_LOWER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG, // FLA_MINUS_ONE, U00, w0, FLA_ONE, a01 ); // FLA_Dots( FLA_MINUS_ONE, u10t, w0, FLA_ONE, alpha11 ); // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, U20, w0, FLA_ONE, a21 ); bli_dtrmvsx( BLIS_LOWER_TRIANGULAR, BLIS_NO_TRANSPOSE, BLIS_NONUNIT_DIAG, m_behind, buff_m1, U00, rs_U, cs_U, w0, inc_w, buff_1, a01, rs_A ); bli_ddots( BLIS_NO_CONJUGATE, m_behind, buff_m1, u10t, cs_U, w0, inc_w, buff_1, alpha11 ); bli_dgemv( BLIS_NO_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_behind, buff_m1, U20, 
rs_U, cs_U, w0, inc_w, buff_1, a21, rs_A ); } if ( m_ahead > 0 ) { // FLA_Househ2_UT( FLA_LEFT, // a21_t, // a21_b, tau11 ); FLA_Househ2_UT_l_opd( m_ahead - 1, a21_t, a21_b, rs_A, tau11 ); // FLA_Copy( a21, u21 ); bli_dcopyv( BLIS_NO_CONJUGATE, m_ahead, a21, rs_A, u21, rs_U ); // FLA_Set( FLA_ONE, u21_t ); *u21_t = *buff_1; // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_ONE, A02, u21, FLA_ZERO, z01 ); // FLA_Dot( a12t, u21, zeta11 ); // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_ONE, A22, u21, FLA_ZERO, z21 ); bli_dgemv( BLIS_NO_TRANSPOSE, BLIS_NO_CONJUGATE, m_behind, n_ahead, buff_1, A02, rs_A, cs_A, u21, rs_U, buff_0, z01, rs_Z ); bli_ddot( BLIS_NO_CONJUGATE, m_ahead, a12t, cs_A, u21, rs_U, zeta11 ); bli_dgemv( BLIS_NO_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_ahead, buff_1, A22, rs_A, cs_A, u21, rs_U, buff_0, z21, rs_Z ); // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, U20, u21, FLA_ZERO, t01 ); bli_dgemv( BLIS_CONJ_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_behind, buff_1, U20, rs_U, cs_U, u21, rs_U, buff_0, t01, rs_T ); } /*------------------------------------------------------------*/ } // FLA_Obj_free( &w ); FLA_free( buff_w ); return FLA_SUCCESS; }
FLA_Error FLA_Hess_UT_step_ops_var1 | ( | int | m_A, |
int | m_T, | ||
float * | buff_A, | ||
int | rs_A, | ||
int | cs_A, | ||
float * | buff_T, | ||
int | rs_T, | ||
int | cs_T | ||
) |
References bli_sgemv(), BLIS_CONJ_TRANSPOSE, BLIS_NO_CONJUGATE, FLA_Apply_H2_UT_l_ops_var1(), FLA_Apply_H2_UT_r_ops_var1(), FLA_Househ2_UT_l_ops(), FLA_ONE, and FLA_ZERO.
Referenced by FLA_Hess_UT_step_opt_var1().
{ float* buff_1 = FLA_FLOAT_PTR( FLA_ONE ); float* buff_0 = FLA_FLOAT_PTR( FLA_ZERO ); float first_elem; int i; // b_alg = FLA_Obj_length( T ); int b_alg = m_T; for ( i = 0; i < b_alg; ++i ) { float* A20 = buff_A + (0 )*cs_A + (i+1)*rs_A; float* a21 = buff_A + (i )*cs_A + (i+1)*rs_A; float* a21_t = buff_A + (i )*cs_A + (i+1)*rs_A; float* a21_b = buff_A + (i )*cs_A + (i+2)*rs_A; float* A22_t = buff_A + (i+1)*cs_A + (i+1)*rs_A; float* A22_b = buff_A + (i+1)*cs_A + (i+2)*rs_A; float* A2_l = buff_A + (i+1)*cs_A + (0 )*rs_A; float* A2_r = buff_A + (i+2)*cs_A + (0 )*rs_A; float* t01 = buff_T + (i )*cs_T + (0 )*rs_T; float* tau11 = buff_T + (i )*cs_T + (i )*rs_T; int m_ahead = m_A - i - 1; int n_ahead = m_A - i - 1; int n_behind = i; /*------------------------------------------------------------*/ if ( m_ahead > 0 ) { // FLA_Househ2_UT( FLA_LEFT, // a21_t, // a21_b, tau11 ); FLA_Househ2_UT_l_ops( m_ahead - 1, a21_t, a21_b, rs_A, tau11 ); // FLA_Copy( a21_t, first_elem ); // FLA_Set( FLA_ONE, a21_t ); first_elem = *a21_t; *a21_t = *buff_1; // FLA_Apply_H2_UT( FLA_LEFT, tau11, a21_b, A22_t, // A22_b ); FLA_Apply_H2_UT_l_ops_var1( m_ahead - 1, n_ahead, tau11, a21_b, rs_A, A22_t, cs_A, A22_b, rs_A, cs_A ); // FLA_Apply_H2_UT( FLA_RIGHT, tau11, a21_b, A2_l, A2_r ); FLA_Apply_H2_UT_r_ops_var1( m_A, n_ahead - 1, tau11, a21_b, rs_A, A2_l, rs_A, A2_r, rs_A, cs_A ); // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A20, a21, FLA_ZERO, t01 ); bli_sgemv( BLIS_CONJ_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_behind, buff_1, A20, rs_A, cs_A, a21, rs_A, buff_0, t01, rs_T ); // FLA_Copy( first_elem, a21_t ); *a21_t = first_elem; } /*------------------------------------------------------------*/ } return FLA_SUCCESS; }
FLA_Error FLA_Hess_UT_step_ops_var2 | ( | int | m_A, |
int | m_T, | ||
float * | buff_A, | ||
int | rs_A, | ||
int | cs_A, | ||
float * | buff_T, | ||
int | rs_T, | ||
int | cs_T | ||
) |
References bli_saxpyv(), bli_sdot(), bli_sgemv(), bli_sger(), bli_sscalv(), BLIS_CONJ_TRANSPOSE, BLIS_CONJUGATE, BLIS_NO_CONJUGATE, BLIS_NO_TRANSPOSE, FLA_free(), FLA_Househ2_UT_l_ops(), FLA_malloc(), FLA_MINUS_ONE, FLA_ONE, FLA_TWO, and FLA_ZERO.
Referenced by FLA_Hess_UT_step_opt_var2().
{ float* buff_2 = FLA_FLOAT_PTR( FLA_TWO ); float* buff_1 = FLA_FLOAT_PTR( FLA_ONE ); float* buff_0 = FLA_FLOAT_PTR( FLA_ZERO ); float* buff_m1 = FLA_FLOAT_PTR( FLA_MINUS_ONE ); float first_elem; float dot_product; float beta, conj_beta; float inv_tau11; float minus_inv_tau11; int i; // b_alg = FLA_Obj_length( T ); int b_alg = m_T; // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &y ); // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &z ); float* buff_y = ( float* ) FLA_malloc( m_A * sizeof( *buff_A ) ); float* buff_z = ( float* ) FLA_malloc( m_A * sizeof( *buff_A ) ); int inc_y = 1; int inc_z = 1; for ( i = 0; i < b_alg; ++i ) { float* A20 = buff_A + (0 )*cs_A + (i+1)*rs_A; float* a21 = buff_A + (i )*cs_A + (i+1)*rs_A; float* A02 = buff_A + (i+1)*cs_A + (0 )*rs_A; float* a12t = buff_A + (i+1)*cs_A + (i )*rs_A; float* A22 = buff_A + (i+1)*cs_A + (i+1)*rs_A; float* t01 = buff_T + (i )*cs_T + (0 )*rs_T; float* tau11 = buff_T + (i )*cs_T + (i )*rs_T; float* y0 = buff_y + (0 )*inc_y; float* y2 = buff_y + (i+1)*inc_y; float* z2 = buff_z + (i+1)*inc_z; float* a21_t = a21 + (0 )*cs_A + (0 )*rs_A; float* a21_b = a21 + (0 )*cs_A + (1 )*rs_A; int m_ahead = m_A - i - 1; int n_ahead = m_A - i - 1; int m_behind = i; int n_behind = i; /*------------------------------------------------------------*/ if ( m_ahead > 0 ) { // FLA_Househ2_UT( FLA_LEFT, // a21_t, // a21_b, tau11 ); FLA_Househ2_UT_l_ops( m_ahead - 1, a21_t, a21_b, rs_A, tau11 ); // FLA_Set( FLA_ONE, inv_tau11 ); // FLA_Inv_scalc( FLA_NO_CONJUGATE, tau11, inv_tau11 ); // FLA_Copy( inv_tau11, minus_inv_tau11 ); // FLA_Scal( FLA_MINUS_ONE, minus_inv_tau11 ); bli_sdiv3( buff_1, tau11, &inv_tau11 ); bli_sneg2( &inv_tau11, &minus_inv_tau11 ); // FLA_Copy( a21_t, first_elem ); // FLA_Set( FLA_ONE, a21_t ); first_elem = *a21_t; *a21_t = *buff_1; // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A22, a21, FLA_ZERO, y2 ); bli_sgemv( BLIS_CONJ_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_ahead, buff_1, A22, rs_A, cs_A, a21, rs_A, buff_0, y2, inc_y 
); // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_ONE, A22, a21, FLA_ZERO, z2 ); bli_sgemv( BLIS_NO_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_ahead, buff_1, A22, rs_A, cs_A, a21, rs_A, buff_0, z2, inc_z ); // FLA_Dotc( FLA_CONJUGATE, a21, z2, beta ); // FLA_Inv_scal( FLA_TWO, beta ); // FLA_Copyt( FLA_CONJ_NO_TRANSPOSE, beta, conj_beta ); bli_sdot( BLIS_CONJUGATE, m_ahead, a21, rs_A, z2, inc_z, &beta ); bli_sinvscals( buff_2, &beta ); bli_scopyconj( &beta, &conj_beta ); // FLA_Scal( minus_inv_tau11, conj_beta ); // FLA_Axpy( conj_beta, a21, y2 ); // FLA_Scal( inv_tau11, y2 ); bli_sscals( &minus_inv_tau11, &conj_beta ); bli_saxpyv( BLIS_NO_CONJUGATE, m_ahead, &conj_beta, a21, rs_A, y2, inc_y ); bli_sscalv( BLIS_NO_CONJUGATE, m_ahead, &inv_tau11, y2, inc_y ); // FLA_Scal( minus_inv_tau11, beta ); // FLA_Axpy( beta, a21, z2 ); // FLA_Scal( inv_tau11, z2 ); bli_sscals( &minus_inv_tau11, &beta ); bli_saxpyv( BLIS_NO_CONJUGATE, m_ahead, &beta, a21, rs_A, z2, inc_z ); bli_sscalv( BLIS_NO_CONJUGATE, m_ahead, &inv_tau11, z2, inc_z ); // FLA_Dot( a12t, a21, dot_product ); // FLA_Scal( minus_inv_tau11, dot_product ); // FLA_Axpyt( FLA_CONJ_TRANSPOSE, dot_product, a21, a12t ); bli_sdot( BLIS_NO_CONJUGATE, m_ahead, a12t, cs_A, a21, rs_A, &dot_product ); bli_sscals( &minus_inv_tau11, &dot_product ); bli_saxpyv( BLIS_CONJUGATE, m_ahead, &dot_product, a21, rs_A, a12t, cs_A ); // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_ONE, A02, a21, FLA_ZERO, y0 ); // FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, minus_inv_tau11, y0, a21, A02 ); bli_sgemv( BLIS_NO_TRANSPOSE, BLIS_NO_CONJUGATE, m_behind, n_ahead, buff_1, A02, rs_A, cs_A, a21, rs_A, buff_0, y0, inc_y ); bli_sger( BLIS_NO_CONJUGATE, BLIS_CONJUGATE, m_behind, n_ahead, &minus_inv_tau11, y0, inc_y, a21, rs_A, A02, rs_A, cs_A ); // FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, FLA_MINUS_ONE, a21, y2, A22 ); // FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, FLA_MINUS_ONE, z2, a21, A22 ); bli_sger( BLIS_NO_CONJUGATE, BLIS_CONJUGATE, m_ahead, n_ahead, buff_m1, a21, rs_A, 
y2, inc_y, A22, rs_A, cs_A ); bli_sger( BLIS_NO_CONJUGATE, BLIS_CONJUGATE, m_ahead, n_ahead, buff_m1, z2, inc_z, a21, rs_A, A22, rs_A, cs_A ); // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A20, a21, FLA_ZERO, t01 ); bli_sgemv( BLIS_CONJ_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_behind, buff_1, A20, rs_A, cs_A, a21, rs_A, buff_0, t01, rs_T ); // FLA_Copy( first_elem, a21_t ); *a21_t = first_elem; } /*------------------------------------------------------------*/ } // FLA_Obj_free( &y ); // FLA_Obj_free( &z ); FLA_free( buff_y ); FLA_free( buff_z ); return FLA_SUCCESS; }
FLA_Error FLA_Hess_UT_step_ops_var3 | ( | int | m_A, |
int | m_T, | ||
float * | buff_A, | ||
int | rs_A, | ||
int | cs_A, | ||
float * | buff_T, | ||
int | rs_T, | ||
int | cs_T | ||
) |
References bli_saxpyv(), bli_scopyv(), bli_sdot(), bli_sgemv(), bli_sger(), bli_sscalv(), BLIS_CONJ_TRANSPOSE, BLIS_CONJUGATE, BLIS_NO_CONJUGATE, BLIS_NO_TRANSPOSE, FLA_free(), FLA_Househ2_UT_l_ops(), FLA_malloc(), FLA_MINUS_ONE, FLA_ONE, FLA_TWO, and FLA_ZERO.
Referenced by FLA_Hess_UT_step_opt_var3().
{ float* buff_2 = FLA_FLOAT_PTR( FLA_TWO ); float* buff_1 = FLA_FLOAT_PTR( FLA_ONE ); float* buff_0 = FLA_FLOAT_PTR( FLA_ZERO ); float* buff_m1 = FLA_FLOAT_PTR( FLA_MINUS_ONE ); float first_elem; float dot_product; float beta, conj_beta; float inv_tau11; float minus_inv_tau11; float minus_upsilon1, minus_conj_upsilon1; float minus_psi1, minus_conj_psi1; float minus_zeta1; int i; // b_alg = FLA_Obj_length( T ); int b_alg = m_T; // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &u ); // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &y ); // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &z ); // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &v ); // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &w ); float* buff_u = ( float* ) FLA_malloc( m_A * sizeof( *buff_A ) ); float* buff_y = ( float* ) FLA_malloc( m_A * sizeof( *buff_A ) ); float* buff_z = ( float* ) FLA_malloc( m_A * sizeof( *buff_A ) ); float* buff_v = ( float* ) FLA_malloc( m_A * sizeof( *buff_A ) ); float* buff_w = ( float* ) FLA_malloc( m_A * sizeof( *buff_A ) ); int inc_u = 1; int inc_y = 1; int inc_z = 1; int inc_v = 1; int inc_w = 1; // Initialize some variables (only to prevent compiler warnings). 
first_elem = *buff_0; minus_inv_tau11 = *buff_0; for ( i = 0; i < b_alg; ++i ) { float* A20 = buff_A + (0 )*cs_A + (i+1)*rs_A; float* alpha11 = buff_A + (i )*cs_A + (i )*rs_A; float* a21 = buff_A + (i )*cs_A + (i+1)*rs_A; float* A02 = buff_A + (i+1)*cs_A + (0 )*rs_A; float* a12t = buff_A + (i+1)*cs_A + (i )*rs_A; float* A22 = buff_A + (i+1)*cs_A + (i+1)*rs_A; float* t01 = buff_T + (i )*cs_T + (0 )*rs_T; float* tau11 = buff_T + (i )*cs_T + (i )*rs_T; float* upsilon1 = buff_u + (i )*inc_u; float* u2 = buff_u + (i+1)*inc_u; float* y0 = buff_y + (0 )*inc_y; float* psi1 = buff_y + (i )*inc_y; float* y2 = buff_y + (i+1)*inc_y; float* zeta1 = buff_z + (i )*inc_z; float* z2 = buff_z + (i+1)*inc_z; float* v2 = buff_v + (i+1)*inc_v; float* w2 = buff_w + (i+1)*inc_w; float* a21_t = a21 + (0 )*cs_A + (0 )*rs_A; float* a21_b = a21 + (0 )*cs_A + (1 )*rs_A; int m_ahead = m_A - i - 1; int n_ahead = m_A - i - 1; int m_behind = i; int n_behind = i; /*------------------------------------------------------------*/ if ( m_behind > 0 ) { // FLA_Copy( upsilon1, minus_upsilon1 ); // FLA_Scal( FLA_MINUS_ONE, minus_upsilon1 ); // FLA_Copyt( FLA_CONJ_NO_TRANSPOSE, minus_upsilon1, minus_conj_upsilon1 ); bli_smult3( buff_m1, upsilon1, &minus_upsilon1 ); bli_scopyconj( &minus_upsilon1, &minus_conj_upsilon1 ); // FLA_Copy( psi1, minus_psi1 ); // FLA_Scal( FLA_MINUS_ONE, minus_psi1 ); // FLA_Copyt( FLA_CONJ_NO_TRANSPOSE, minus_psi1, minus_conj_psi1 ); bli_smult3( buff_m1, psi1, &minus_psi1 ); bli_scopyconj( &minus_psi1, &minus_conj_psi1 ); // FLA_Copy( zeta1, minus_zeta1 ); // FLA_Scal( FLA_MINUS_ONE, minus_zeta1 ); bli_smult3( buff_m1, zeta1, &minus_zeta1 ); // FLA_Axpyt( FLA_CONJ_NO_TRANSPOSE, minus_upsilon1, psi1, alpha11 ); // FLA_Axpyt( FLA_CONJ_NO_TRANSPOSE, minus_zeta1, upsilon1, alpha11 ); bli_saxpyv( BLIS_CONJUGATE, 1, &minus_upsilon1, psi1, 1, alpha11, 1 ); bli_saxpyv( BLIS_CONJUGATE, 1, &minus_zeta1, upsilon1, 1, alpha11, 1 ); // FLA_Axpyt( FLA_CONJ_TRANSPOSE, minus_upsilon1, y2, a12t 
); // FLA_Axpyt( FLA_CONJ_TRANSPOSE, minus_zeta1, u2, a12t ); bli_saxpyv( BLIS_CONJUGATE, m_ahead, &minus_upsilon1, y2, inc_y, a12t, cs_A ); bli_saxpyv( BLIS_CONJUGATE, m_ahead, &minus_zeta1, u2, inc_u, a12t, cs_A ); // FLA_Axpyt( FLA_NO_TRANSPOSE, minus_conj_psi1, u2, a21 ); // FLA_Axpyt( FLA_NO_TRANSPOSE, minus_conj_upsilon1, z2, a21 ); bli_saxpyv( BLIS_NO_CONJUGATE, m_ahead, &minus_conj_psi1, u2, inc_u, a21, rs_A ); bli_saxpyv( BLIS_NO_CONJUGATE, m_ahead, &minus_conj_upsilon1, z2, inc_z, a21, rs_A ); } if ( m_ahead > 0 ) { // FLA_Househ2_UT( FLA_LEFT, // a21_t, // a21_b, tau11 ); FLA_Househ2_UT_l_ops( m_ahead - 1, a21_t, a21_b, rs_A, tau11 ); // FLA_Set( FLA_ONE, inv_tau11 ); // FLA_Inv_scalc( FLA_NO_CONJUGATE, tau11, inv_tau11 ); // FLA_Copy( inv_tau11, minus_inv_tau11 ); // FLA_Scal( FLA_MINUS_ONE, minus_inv_tau11 ); bli_sdiv3( buff_1, tau11, &inv_tau11 ); bli_sneg2( &inv_tau11, &minus_inv_tau11 ); // FLA_Copy( a21_t, first_elem ); // FLA_Set( FLA_ONE, a21_t ); first_elem = *a21_t; *a21_t = *buff_1; } if ( m_behind > 0 ) { // FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, FLA_MINUS_ONE, u2, y2, A22 ); // FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, FLA_MINUS_ONE, z2, u2, A22 ); bli_sger( BLIS_NO_CONJUGATE, BLIS_CONJUGATE, m_ahead, n_ahead, buff_m1, u2, inc_u, y2, inc_y, A22, rs_A, cs_A ); bli_sger( BLIS_NO_CONJUGATE, BLIS_CONJUGATE, m_ahead, n_ahead, buff_m1, z2, inc_z, u2, inc_u, A22, rs_A, cs_A ); } if ( m_ahead > 0 ) { // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A22, a21, FLA_ZERO, v2 ); bli_sgemv( BLIS_CONJ_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_ahead, buff_1, A22, rs_A, cs_A, a21, rs_A, buff_0, v2, inc_v ); // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_ONE, A22, a21, FLA_ZERO, w2 ); bli_sgemv( BLIS_NO_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_ahead, buff_1, A22, rs_A, cs_A, a21, rs_A, buff_0, w2, inc_w ); // FLA_Copy( a21, u2 ); // FLA_Copy( v2, y2 ); // FLA_Copy( w2, z2 ); bli_scopyv( BLIS_NO_CONJUGATE, m_ahead, a21, rs_A, u2, inc_u ); bli_scopyv( BLIS_NO_CONJUGATE, 
m_ahead, v2, inc_v, y2, inc_y ); bli_scopyv( BLIS_NO_CONJUGATE, m_ahead, w2, inc_w, z2, inc_z ); // FLA_Dotc( FLA_CONJUGATE, a21, z2, beta ); // FLA_Inv_scal( FLA_TWO, beta ); // FLA_Copyt( FLA_CONJ_NO_TRANSPOSE, beta, conj_beta ); bli_sdot( BLIS_CONJUGATE, m_ahead, a21, rs_A, z2, inc_z, &beta ); bli_sinvscals( buff_2, &beta ); bli_scopyconj( &beta, &conj_beta ); // FLA_Scal( minus_inv_tau11, conj_beta ); // FLA_Axpy( conj_beta, a21, y2 ); // FLA_Scal( inv_tau11, y2 ); bli_sscals( &minus_inv_tau11, &conj_beta ); bli_saxpyv( BLIS_NO_CONJUGATE, m_ahead, &conj_beta, a21, rs_A, y2, inc_y ); bli_sscalv( BLIS_NO_CONJUGATE, m_ahead, &inv_tau11, y2, inc_y ); // FLA_Scal( minus_inv_tau11, beta ); // FLA_Axpy( beta, a21, z2 ); // FLA_Scal( inv_tau11, z2 ); bli_sscals( &minus_inv_tau11, &beta ); bli_saxpyv( BLIS_NO_CONJUGATE, m_ahead, &beta, a21, rs_A, z2, inc_z ); bli_sscalv( BLIS_NO_CONJUGATE, m_ahead, &inv_tau11, z2, inc_z ); // FLA_Dot( a12t, a21, dot_product ); // FLA_Scal( minus_inv_tau11, dot_product ); // FLA_Axpyt( FLA_CONJ_TRANSPOSE, dot_product, a21, a12t ); bli_sdot( BLIS_NO_CONJUGATE, m_ahead, a12t, cs_A, a21, rs_A, &dot_product ); bli_sscals( &minus_inv_tau11, &dot_product ); bli_saxpyv( BLIS_CONJUGATE, m_ahead, &dot_product, a21, rs_A, a12t, cs_A ); // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_ONE, A02, a21, FLA_ZERO, y0 ); // FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, minus_inv_tau11, y0, a21, A02 ); bli_sgemv( BLIS_NO_TRANSPOSE, BLIS_NO_CONJUGATE, m_behind, n_ahead, buff_1, A02, rs_A, cs_A, a21, rs_A, buff_0, y0, inc_y ); bli_sger( BLIS_NO_CONJUGATE, BLIS_CONJUGATE, m_behind, n_ahead, &minus_inv_tau11, y0, inc_y, a21, rs_A, A02, rs_A, cs_A ); // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A20, a21, FLA_ZERO, t01 ); bli_sgemv( BLIS_CONJ_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_behind, buff_1, A20, rs_A, cs_A, a21, rs_A, buff_0, t01, rs_T ); // FLA_Copy( first_elem, a21_t ); *a21_t = first_elem; } if ( m_behind + 1 == b_alg && m_ahead > 0 ) { // FLA_Gerc( FLA_NO_CONJUGATE, 
FLA_CONJUGATE, FLA_MINUS_ONE, u2, y2, A22 ); // FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, FLA_MINUS_ONE, z2, u2, A22 ); bli_sger( BLIS_NO_CONJUGATE, BLIS_CONJUGATE, m_ahead, n_ahead, buff_m1, u2, inc_u, y2, inc_y, A22, rs_A, cs_A ); bli_sger( BLIS_NO_CONJUGATE, BLIS_CONJUGATE, m_ahead, n_ahead, buff_m1, z2, inc_z, u2, inc_u, A22, rs_A, cs_A ); } /*------------------------------------------------------------*/ } // FLA_Obj_free( &u ); // FLA_Obj_free( &y ); // FLA_Obj_free( &z ); // FLA_Obj_free( &v ); // FLA_Obj_free( &w ); FLA_free( buff_u ); FLA_free( buff_y ); FLA_free( buff_z ); FLA_free( buff_v ); FLA_free( buff_w ); return FLA_SUCCESS; }
FLA_Error FLA_Hess_UT_step_ops_var4 | ( | int | m_A, |
int | m_T, | ||
float * | buff_A, | ||
int | rs_A, | ||
int | cs_A, | ||
float * | buff_Y, | ||
int | rs_Y, | ||
int | cs_Y, | ||
float * | buff_Z, | ||
int | rs_Z, | ||
int | cs_Z, | ||
float * | buff_T, | ||
int | rs_T, | ||
int | cs_T | ||
) |
References bli_saxpyv(), bli_scopyv(), bli_sdot(), bli_sgemv(), bli_sger(), bli_sscalv(), bli_ssetm(), BLIS_CONJ_NO_TRANSPOSE, BLIS_CONJ_TRANSPOSE, BLIS_CONJUGATE, BLIS_NO_CONJUGATE, BLIS_NO_TRANSPOSE, FLA_free(), FLA_Househ2_UT_l_ops(), FLA_malloc(), FLA_MINUS_ONE, FLA_ONE, FLA_TWO, and FLA_ZERO.
Referenced by FLA_Hess_UT_step_opt_var4().
{ float* buff_2 = FLA_FLOAT_PTR( FLA_TWO ); float* buff_1 = FLA_FLOAT_PTR( FLA_ONE ); float* buff_0 = FLA_FLOAT_PTR( FLA_ZERO ); float* buff_m1 = FLA_FLOAT_PTR( FLA_MINUS_ONE ); float first_elem, last_elem; float dot_product; float beta, conj_beta; float inv_tau11; float minus_inv_tau11; int i; // b_alg = FLA_Obj_length( T ); int b_alg = m_T; // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &d ); // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &e ); // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &f ); float* buff_d = ( float* ) FLA_malloc( m_A * sizeof( *buff_A ) ); float* buff_e = ( float* ) FLA_malloc( m_A * sizeof( *buff_A ) ); float* buff_f = ( float* ) FLA_malloc( m_A * sizeof( *buff_A ) ); int inc_d = 1; int inc_e = 1; int inc_f = 1; // FLA_Set( FLA_ZERO, Y ); // FLA_Set( FLA_ZERO, Z ); bli_ssetm( m_A, b_alg, buff_0, buff_Y, rs_Y, cs_Y ); bli_ssetm( m_A, b_alg, buff_0, buff_Z, rs_Z, cs_Z ); for ( i = 0; i < b_alg; ++i ) { float* a10t = buff_A + (0 )*cs_A + (i )*rs_A; float* A20 = buff_A + (0 )*cs_A + (i+1)*rs_A; float* alpha11 = buff_A + (i )*cs_A + (i )*rs_A; float* a21 = buff_A + (i )*cs_A + (i+1)*rs_A; float* A02 = buff_A + (i+1)*cs_A + (0 )*rs_A; float* a12t = buff_A + (i+1)*cs_A + (i )*rs_A; float* A22 = buff_A + (i+1)*cs_A + (i+1)*rs_A; float* y10t = buff_Y + (0 )*cs_Y + (i )*rs_Y; float* Y20 = buff_Y + (0 )*cs_Y + (i+1)*rs_Y; float* y21 = buff_Y + (i )*cs_Y + (i+1)*rs_Y; float* z10t = buff_Z + (0 )*cs_Z + (i )*rs_Z; float* Z20 = buff_Z + (0 )*cs_Z + (i+1)*rs_Z; float* z21 = buff_Z + (i )*cs_Z + (i+1)*rs_Z; float* t01 = buff_T + (i )*cs_T + (0 )*rs_T; float* tau11 = buff_T + (i )*cs_T + (i )*rs_T; float* d0 = buff_d + (0 )*inc_d; float* e0 = buff_e + (0 )*inc_e; float* f0 = buff_f + (0 )*inc_f; float* a10t_r = a10t + (i-1)*cs_A + (0 )*rs_A; float* a21_t = a21 + (0 )*cs_A + (0 )*rs_A; float* a21_b = a21 + (0 )*cs_A + (1 )*rs_A; float* ABL = a10t; float* ZBL = z10t; float* a2 = alpha11; int m_ahead = m_A - i - 1; int n_ahead = m_A - i - 1; int m_behind = i; int 
n_behind = i; /*------------------------------------------------------------*/ if ( m_behind > 0 ) { // FLA_Copy( a10t_r, last_elem ); // FLA_Set( FLA_ONE, a10t_r ); last_elem = *a10t_r; *a10t_r = *buff_1; } // FLA_Gemvc( FLA_NO_TRANSPOSE, FLA_CONJUGATE, FLA_MINUS_ONE, ABL, y10t, FLA_ONE, a2 ); // FLA_Gemvc( FLA_NO_TRANSPOSE, FLA_CONJUGATE, FLA_MINUS_ONE, ZBL, a10t, FLA_ONE, a2 ); bli_sgemv( BLIS_NO_TRANSPOSE, BLIS_CONJUGATE, m_ahead + 1, n_behind, buff_m1, ABL, rs_A, cs_A, y10t, cs_Y, buff_1, a2, rs_A ); bli_sgemv( BLIS_NO_TRANSPOSE, BLIS_CONJUGATE, m_ahead + 1, n_behind, buff_m1, ZBL, rs_Z, cs_Z, a10t, cs_A, buff_1, a2, rs_A ); // FLA_Gemv( FLA_CONJ_NO_TRANSPOSE, FLA_MINUS_ONE, Y20, a10t, FLA_ONE, a12t ); // FLA_Gemv( FLA_CONJ_NO_TRANSPOSE, FLA_MINUS_ONE, A20, z10t, FLA_ONE, a12t ); bli_sgemv( BLIS_CONJ_NO_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_behind, buff_m1, Y20, rs_Y, cs_Y, a10t, cs_A, buff_1, a12t, cs_A ); bli_sgemv( BLIS_CONJ_NO_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_behind, buff_m1, A20, rs_A, cs_A, z10t, cs_Z, buff_1, a12t, cs_A ); if ( m_behind > 0 ) { // FLA_Copy( last_elem, a10t_r ); *a10t_r = last_elem; } if ( m_ahead > 0 ) { // FLA_Househ2_UT( FLA_LEFT, // a21_t, // a21_b, tau11 ); FLA_Househ2_UT_l_ops( m_ahead - 1, a21_t, a21_b, rs_A, tau11 ); // FLA_Set( FLA_ONE, inv_tau11 ); // FLA_Inv_scalc( FLA_NO_CONJUGATE, tau11, inv_tau11 ); // FLA_Copy( inv_tau11, minus_inv_tau11 ); // FLA_Scal( FLA_MINUS_ONE, minus_inv_tau11 ); bli_sdiv3( buff_1, tau11, &inv_tau11 ); bli_sneg2( &inv_tau11, &minus_inv_tau11 ); // FLA_Copy( a21_t, first_elem ); // FLA_Set( FLA_ONE, a21_t ); first_elem = *a21_t; *a21_t = *buff_1; // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A22, a21, FLA_ZERO, y21 ); bli_sgemv( BLIS_CONJ_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_ahead, buff_1, A22, rs_A, cs_A, a21, rs_A, buff_0, y21, rs_Y ); // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_ONE, A22, a21, FLA_ZERO, z21 ); bli_sgemv( BLIS_NO_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_ahead, buff_1, A22, rs_A, 
cs_A, a21, rs_A, buff_0, z21, rs_Z ); // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A20, a21, FLA_ZERO, d0 ); // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, Y20, a21, FLA_ZERO, e0 ); // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, Z20, a21, FLA_ZERO, f0 ); bli_sgemv( BLIS_CONJ_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_behind, buff_1, A20, rs_A, cs_A, a21, rs_A, buff_0, d0, inc_d ); bli_sgemv( BLIS_CONJ_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_behind, buff_1, Y20, rs_Y, cs_Y, a21, rs_A, buff_0, e0, inc_e ); bli_sgemv( BLIS_CONJ_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_behind, buff_1, Z20, rs_Z, cs_Z, a21, rs_A, buff_0, f0, inc_f ); // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, Y20, d0, FLA_ONE, y21 ); // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, A20, f0, FLA_ONE, y21 ); bli_sgemv( BLIS_NO_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_behind, buff_m1, Y20, rs_Y, cs_Y, d0, inc_d, buff_1, y21, rs_Y ); bli_sgemv( BLIS_NO_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_behind, buff_m1, A20, rs_A, cs_A, f0, inc_f, buff_1, y21, rs_Y ); // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, A20, e0, FLA_ONE, z21 ); // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, Z20, d0, FLA_ONE, z21 ); bli_sgemv( BLIS_NO_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_behind, buff_m1, A20, rs_A, cs_A, e0, inc_e, buff_1, z21, rs_Z ); bli_sgemv( BLIS_NO_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_behind, buff_m1, Z20, rs_Z, cs_Z, d0, inc_d, buff_1, z21, rs_Z ); // FLA_Copy( d0, t01 ); bli_scopyv( BLIS_NO_CONJUGATE, n_behind, d0, inc_d, t01, rs_T ); // FLA_Dotc( FLA_CONJUGATE, a21, z21, beta ); // FLA_Inv_scal( FLA_TWO, beta ); // FLA_Copyt( FLA_CONJ_NO_TRANSPOSE, beta, conj_beta ); bli_sdot( BLIS_CONJUGATE, m_ahead, a21, rs_A, z21, rs_Z, &beta ); bli_sinvscals( buff_2, &beta ); bli_scopyconj( &beta, &conj_beta ); // FLA_Scal( minus_inv_tau11, conj_beta ); // FLA_Axpy( conj_beta, a21, y21 ); // FLA_Scal( inv_tau11, y21 ); bli_sscals( &minus_inv_tau11, &conj_beta ); bli_saxpyv( BLIS_NO_CONJUGATE, m_ahead, &conj_beta, a21, rs_A, y21, rs_Y ); 
bli_sscalv( BLIS_NO_CONJUGATE, m_ahead, &inv_tau11, y21, rs_Y ); // FLA_Scal( minus_inv_tau11, beta ); // FLA_Axpy( beta, a21, z21 ); // FLA_Scal( inv_tau11, z21 ); bli_sscals( &minus_inv_tau11, &beta ); bli_saxpyv( BLIS_NO_CONJUGATE, m_ahead, &beta, a21, rs_A, z21, rs_Z ); bli_sscalv( BLIS_NO_CONJUGATE, m_ahead, &inv_tau11, z21, rs_Z ); // FLA_Dot( a12t, a21, dot_product ); // FLA_Scal( minus_inv_tau11, dot_product ); // FLA_Axpyt( FLA_CONJ_TRANSPOSE, dot_product, a21, a12t ); bli_sdot( BLIS_NO_CONJUGATE, m_ahead, a12t, cs_A, a21, rs_A, &dot_product ); bli_sscals( &minus_inv_tau11, &dot_product ); bli_saxpyv( BLIS_CONJUGATE, m_ahead, &dot_product, a21, rs_A, a12t, cs_A ); // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_ONE, A02, a21, FLA_ZERO, e0 ); // FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, minus_inv_tau11, e0, a21, A02 ); bli_sgemv( BLIS_NO_TRANSPOSE, BLIS_NO_CONJUGATE, m_behind, n_ahead, buff_1, A02, rs_A, cs_A, a21, rs_A, buff_0, e0, inc_e ); bli_sger( BLIS_NO_CONJUGATE, BLIS_CONJUGATE, m_behind, n_ahead, &minus_inv_tau11, e0, inc_e, a21, rs_A, A02, rs_A, cs_A ); // FLA_Copy( first_elem, a21_t ); *a21_t = first_elem; } /*------------------------------------------------------------*/ } // FLA_Obj_free( &d ); // FLA_Obj_free( &e ); // FLA_Obj_free( &f ); FLA_free( buff_d ); FLA_free( buff_e ); FLA_free( buff_f ); return FLA_SUCCESS; }
FLA_Error FLA_Hess_UT_step_ops_var5 | ( | int | m_A, |
int | m_T, | ||
float * | buff_A, | ||
int | rs_A, | ||
int | cs_A, | ||
float * | buff_U, | ||
int | rs_U, | ||
int | cs_U, | ||
float * | buff_Z, | ||
int | rs_Z, | ||
int | cs_Z, | ||
float * | buff_T, | ||
int | rs_T, | ||
int | cs_T | ||
) |
References bli_saxpyv(), bli_scopyv(), bli_sdot(), bli_sdots(), bli_sgemv(), bli_ssetm(), bli_strmv(), bli_strmvsx(), bli_strsv(), BLIS_CONJ_TRANSPOSE, BLIS_CONJUGATE, BLIS_LOWER_TRIANGULAR, BLIS_NO_CONJUGATE, BLIS_NO_TRANSPOSE, BLIS_NONUNIT_DIAG, BLIS_UPPER_TRIANGULAR, FLA_free(), FLA_Househ2_UT_l_ops(), FLA_malloc(), FLA_MINUS_ONE, FLA_ONE, and FLA_ZERO.
Referenced by FLA_Hess_UT_step_opt_var5().
{
  // Single-precision (float) step kernel, variant 5, operating directly on
  // raw buffers and strides.
  //
  //   m_A                 - extent used for the trailing parts of A, the row
  //                         count when zeroing U and Z, and the length of the
  //                         scratch vector w.
  //   m_T                 - number of loop iterations (copied into b_alg).
  //   buff_A, rs_A, cs_A  - base pointer and row/column strides of A; A is
  //                         read and written in place.
  //   buff_U, rs_U, cs_U  - workspace U; its m_A x m_T leading part is set to
  //   buff_Z, rs_Z, cs_Z    zero on entry (same for Z) and then filled one
  //                         column per iteration.
  //   buff_T, rs_T, cs_T  - T; in iteration i the entries of column i above
  //                         the diagonal (t01) are written, and the diagonal
  //                         entry (tau11) is handed to FLA_Househ2_UT_l_ops
  //                         (presumably as an output - confirm against that
  //                         routine's contract).
  //
  // Returns FLA_SUCCESS unconditionally.
  //
  // The "// FLA_xxx( ... )" comments in the body show the object-level call
  // that each group of bli_* calls stands in for.
  //
  // NOTE(review): the pointer returned by FLA_malloc is used without a NULL
  // check in this function - confirm that FLA_malloc itself handles failure.
  float* buff_1 = FLA_FLOAT_PTR( FLA_ONE );
  float* buff_0 = FLA_FLOAT_PTR( FLA_ZERO );
  float* buff_m1 = FLA_FLOAT_PTR( FLA_MINUS_ONE );
  int i;

  // b_alg = FLA_Obj_length( T );
  int b_alg = m_T;

  // Scratch vector w with m_A elements, released just before returning.
  // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &w );
  float* buff_w = ( float* ) FLA_malloc( m_A * sizeof( *buff_A ) );
  int inc_w = 1;

  // FLA_Set( FLA_ZERO, U );
  // FLA_Set( FLA_ZERO, Z );
  bli_ssetm( m_A, b_alg, buff_0, buff_U, rs_U, cs_U );
  bli_ssetm( m_A, b_alg, buff_0, buff_Z, rs_Z, cs_Z );

  for ( i = 0; i < b_alg; ++i )
  {
    // Views into A, U, Z and T relative to the current diagonal position
    // (i,i): "0" parts lie before index i, "1" is index i, "2" lies after.
    float* a01 = buff_A + (i )*cs_A + (0 )*rs_A;
    float* alpha11 = buff_A + (i )*cs_A + (i )*rs_A;
    float* a21 = buff_A + (i )*cs_A + (i+1)*rs_A;
    float* A02 = buff_A + (i+1)*cs_A + (0 )*rs_A;
    float* a12t = buff_A + (i+1)*cs_A + (i )*rs_A;
    float* A22 = buff_A + (i+1)*cs_A + (i+1)*rs_A;
    float* U00 = buff_U + (0 )*cs_U + (0 )*rs_U;
    float* u10t = buff_U + (0 )*cs_U + (i )*rs_U;
    float* U20 = buff_U + (0 )*cs_U + (i+1)*rs_U;
    float* u21 = buff_U + (i )*cs_U + (i+1)*rs_U;
    float* Z00 = buff_Z + (0 )*cs_Z + (0 )*rs_Z;
    float* z10t = buff_Z + (0 )*cs_Z + (i )*rs_Z;
    float* Z20 = buff_Z + (0 )*cs_Z + (i+1)*rs_Z;
    float* z01 = buff_Z + (i )*cs_Z + (0 )*rs_Z;
    float* zeta11 = buff_Z + (i )*cs_Z + (i )*rs_Z;
    float* z21 = buff_Z + (i )*cs_Z + (i+1)*rs_Z;
    float* T00 = buff_T + (0 )*cs_T + (0 )*rs_T;
    float* t01 = buff_T + (i )*cs_T + (0 )*rs_T;
    float* tau11 = buff_T + (i )*cs_T + (i )*rs_T;
    float* w0 = buff_w + (0 )*inc_w;
    // First element of a21 / u21 and the remainder of a21 below it.
    float* a21_t = a21 + (0 )*cs_A + (0 )*rs_A;
    float* a21_b = a21 + (0 )*cs_A + (1 )*rs_A;
    float* u21_t = u21 + (0 )*cs_U + (0 )*rs_U;
    int m_ahead = m_A - i - 1;
    int n_ahead = m_A - i - 1;
    int m_behind = i;
    int n_behind = i;

    /*------------------------------------------------------------*/

    // From the second iteration on: bring column i of A (a01, alpha11, a21)
    // up to date using the U, Z and T data stored by earlier iterations.
    if ( m_behind > 0 )
    {
      // FLA_Copyt( FLA_CONJ_TRANSPOSE, u10t, w0 );
      // FLA_Trsv( FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG,
      //           T00, w0 );
      bli_scopyv( BLIS_CONJUGATE, m_behind, u10t, cs_U, w0, inc_w );
      bli_strsv( BLIS_UPPER_TRIANGULAR, BLIS_NO_TRANSPOSE, BLIS_NONUNIT_DIAG,
                 m_behind, T00, rs_T, cs_T, w0, inc_w );

      // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, Z00, w0, FLA_ONE, a01 );
      // FLA_Dots( FLA_MINUS_ONE, z10t, w0, FLA_ONE, alpha11 );
      // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, Z20, w0, FLA_ONE, a21 );
      bli_sgemv( BLIS_NO_TRANSPOSE, BLIS_NO_CONJUGATE, m_behind, n_behind,
                 buff_m1, Z00, rs_Z, cs_Z, w0, inc_w, buff_1, a01, rs_A );
      bli_sdots( BLIS_NO_CONJUGATE, m_behind,
                 buff_m1, z10t, cs_Z, w0, inc_w, buff_1, alpha11 );
      bli_sgemv( BLIS_NO_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_behind,
                 buff_m1, Z20, rs_Z, cs_Z, w0, inc_w, buff_1, a21, rs_A );

      // The Trmvsx with beta = 0 is realized as a copy of a01 into w0
      // followed by an in-place bli_strmv.
      // FLA_Trmvsx( FLA_LOWER_TRIANGULAR, FLA_CONJ_TRANSPOSE, FLA_NONUNIT_DIAG,
      //             FLA_ONE, U00, a01, FLA_ZERO, w0 );
      // FLA_Axpyt( FLA_CONJ_TRANSPOSE, alpha11, u10t, w0 );
      // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, U20, a21, FLA_ONE, w0 );
      bli_scopyv( BLIS_NO_CONJUGATE, m_behind, a01, rs_A, w0, inc_w );
      bli_strmv( BLIS_LOWER_TRIANGULAR, BLIS_CONJ_TRANSPOSE, BLIS_NONUNIT_DIAG,
                 m_behind, U00, rs_U, cs_U, w0, inc_w );
      bli_saxpyv( BLIS_CONJUGATE, m_behind, alpha11, u10t, cs_U, w0, inc_w );
      bli_sgemv( BLIS_CONJ_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_behind,
                 buff_1, U20, rs_U, cs_U, a21, rs_A, buff_1, w0, inc_w );

      // FLA_Trsv( FLA_UPPER_TRIANGULAR, FLA_CONJ_TRANSPOSE, FLA_NONUNIT_DIAG,
      //           T00, w0 );
      bli_strsv( BLIS_UPPER_TRIANGULAR, BLIS_CONJ_TRANSPOSE, BLIS_NONUNIT_DIAG,
                 m_behind, T00, rs_T, cs_T, w0, inc_w );

      // FLA_Trmvsx( FLA_LOWER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG,
      //             FLA_MINUS_ONE, U00, w0, FLA_ONE, a01 );
      // FLA_Dots( FLA_MINUS_ONE, u10t, w0, FLA_ONE, alpha11 );
      // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, U20, w0, FLA_ONE, a21 );
      bli_strmvsx( BLIS_LOWER_TRIANGULAR, BLIS_NO_TRANSPOSE, BLIS_NONUNIT_DIAG,
                   m_behind, buff_m1, U00, rs_U, cs_U, w0, inc_w, buff_1, a01, rs_A );
      bli_sdots( BLIS_NO_CONJUGATE, m_behind,
                 buff_m1, u10t, cs_U, w0, inc_w, buff_1, alpha11 );
      bli_sgemv( BLIS_NO_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_behind,
                 buff_m1, U20, rs_U, cs_U, w0, inc_w, buff_1, a21, rs_A );
    }

    // While rows remain below the diagonal: process a21 with
    // FLA_Househ2_UT_l_ops, then record column i of U, Z and T.
    if ( m_ahead > 0 )
    {
      // FLA_Househ2_UT( FLA_LEFT,
      //                 a21_t,
      //                 a21_b, tau11 );
      FLA_Househ2_UT_l_ops( m_ahead - 1, a21_t, a21_b, rs_A, tau11 );

      // FLA_Copy( a21, u21 );
      bli_scopyv( BLIS_NO_CONJUGATE, m_ahead, a21, rs_A, u21, rs_U );

      // Only the copy in U gets its first element replaced by one; a21
      // itself is left as FLA_Househ2_UT_l_ops produced it.
      // FLA_Set( FLA_ONE, u21_t );
      *u21_t = *buff_1;

      // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_ONE, A02, u21, FLA_ZERO, z01 );
      // FLA_Dot( a12t, u21, zeta11 );
      // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_ONE, A22, u21, FLA_ZERO, z21 );
      bli_sgemv( BLIS_NO_TRANSPOSE, BLIS_NO_CONJUGATE, m_behind, n_ahead,
                 buff_1, A02, rs_A, cs_A, u21, rs_U, buff_0, z01, rs_Z );
      bli_sdot( BLIS_NO_CONJUGATE, m_ahead, a12t, cs_A, u21, rs_U, zeta11 );
      bli_sgemv( BLIS_NO_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_ahead,
                 buff_1, A22, rs_A, cs_A, u21, rs_U, buff_0, z21, rs_Z );

      // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, U20, u21, FLA_ZERO, t01 );
      bli_sgemv( BLIS_CONJ_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_behind,
                 buff_1, U20, rs_U, cs_U, u21, rs_U, buff_0, t01, rs_T );
    }

    /*------------------------------------------------------------*/
  }

  // FLA_Obj_free( &w );
  FLA_free( buff_w );

  return FLA_SUCCESS;
}
FLA_Error FLA_Hess_UT_step_opt_var1 | ( | FLA_Obj | A, |
FLA_Obj | T | ||
) |
References FLA_Hess_UT_step_opc_var1(), FLA_Hess_UT_step_opd_var1(), FLA_Hess_UT_step_ops_var1(), FLA_Hess_UT_step_opz_var1(), FLA_Obj_col_stride(), FLA_Obj_datatype(), FLA_Obj_length(), and FLA_Obj_row_stride().
Referenced by FLA_Hess_UT_blk_var1(), and FLA_Hess_UT_opt_var1().
{
  // Type-dispatching front end for the variant-1 step kernels: reads the
  // element type of A, the row counts of A and T, and their strides, then
  // forwards the raw buffers to the kernel matching that element type.
  // The kernel's return value is not inspected.  FLA_SUCCESS is returned in
  // every case, including when the datatype matches none of the four
  // branches below (in which case no kernel is invoked).
  const FLA_Datatype elem_type = FLA_Obj_datatype( A );

  const int rows_A = FLA_Obj_length( A );
  const int rows_T = FLA_Obj_length( T );

  const int row_inc_A = FLA_Obj_row_stride( A );
  const int col_inc_A = FLA_Obj_col_stride( A );
  const int row_inc_T = FLA_Obj_row_stride( T );
  const int col_inc_T = FLA_Obj_col_stride( T );

  if ( elem_type == FLA_FLOAT )
  {
    float* a_data = FLA_FLOAT_PTR( A );
    float* t_data = FLA_FLOAT_PTR( T );

    FLA_Hess_UT_step_ops_var1( rows_A, rows_T,
                               a_data, row_inc_A, col_inc_A,
                               t_data, row_inc_T, col_inc_T );
  }
  else if ( elem_type == FLA_DOUBLE )
  {
    double* a_data = FLA_DOUBLE_PTR( A );
    double* t_data = FLA_DOUBLE_PTR( T );

    FLA_Hess_UT_step_opd_var1( rows_A, rows_T,
                               a_data, row_inc_A, col_inc_A,
                               t_data, row_inc_T, col_inc_T );
  }
  else if ( elem_type == FLA_COMPLEX )
  {
    scomplex* a_data = FLA_COMPLEX_PTR( A );
    scomplex* t_data = FLA_COMPLEX_PTR( T );

    FLA_Hess_UT_step_opc_var1( rows_A, rows_T,
                               a_data, row_inc_A, col_inc_A,
                               t_data, row_inc_T, col_inc_T );
  }
  else if ( elem_type == FLA_DOUBLE_COMPLEX )
  {
    dcomplex* a_data = FLA_DOUBLE_COMPLEX_PTR( A );
    dcomplex* t_data = FLA_DOUBLE_COMPLEX_PTR( T );

    FLA_Hess_UT_step_opz_var1( rows_A, rows_T,
                               a_data, row_inc_A, col_inc_A,
                               t_data, row_inc_T, col_inc_T );
  }

  return FLA_SUCCESS;
}
FLA_Error FLA_Hess_UT_step_opt_var2 | ( | FLA_Obj | A, |
FLA_Obj | T | ||
) |
References FLA_Hess_UT_step_opc_var2(), FLA_Hess_UT_step_opd_var2(), FLA_Hess_UT_step_ops_var2(), FLA_Hess_UT_step_opz_var2(), FLA_Obj_col_stride(), FLA_Obj_datatype(), FLA_Obj_length(), and FLA_Obj_row_stride().
Referenced by FLA_Hess_UT_blk_var2(), and FLA_Hess_UT_opt_var2().
{
  // Type-dispatching front end for the variant-2 step kernels: reads the
  // element type of A, the row counts of A and T, and their strides, then
  // forwards the raw buffers to the kernel matching that element type.
  // The kernel's return value is not inspected.  FLA_SUCCESS is returned in
  // every case, including when the datatype matches none of the four
  // branches below (in which case no kernel is invoked).
  const FLA_Datatype elem_type = FLA_Obj_datatype( A );

  const int rows_A = FLA_Obj_length( A );
  const int rows_T = FLA_Obj_length( T );

  const int row_inc_A = FLA_Obj_row_stride( A );
  const int col_inc_A = FLA_Obj_col_stride( A );
  const int row_inc_T = FLA_Obj_row_stride( T );
  const int col_inc_T = FLA_Obj_col_stride( T );

  if ( elem_type == FLA_FLOAT )
  {
    float* a_data = FLA_FLOAT_PTR( A );
    float* t_data = FLA_FLOAT_PTR( T );

    FLA_Hess_UT_step_ops_var2( rows_A, rows_T,
                               a_data, row_inc_A, col_inc_A,
                               t_data, row_inc_T, col_inc_T );
  }
  else if ( elem_type == FLA_DOUBLE )
  {
    double* a_data = FLA_DOUBLE_PTR( A );
    double* t_data = FLA_DOUBLE_PTR( T );

    FLA_Hess_UT_step_opd_var2( rows_A, rows_T,
                               a_data, row_inc_A, col_inc_A,
                               t_data, row_inc_T, col_inc_T );
  }
  else if ( elem_type == FLA_COMPLEX )
  {
    scomplex* a_data = FLA_COMPLEX_PTR( A );
    scomplex* t_data = FLA_COMPLEX_PTR( T );

    FLA_Hess_UT_step_opc_var2( rows_A, rows_T,
                               a_data, row_inc_A, col_inc_A,
                               t_data, row_inc_T, col_inc_T );
  }
  else if ( elem_type == FLA_DOUBLE_COMPLEX )
  {
    dcomplex* a_data = FLA_DOUBLE_COMPLEX_PTR( A );
    dcomplex* t_data = FLA_DOUBLE_COMPLEX_PTR( T );

    FLA_Hess_UT_step_opz_var2( rows_A, rows_T,
                               a_data, row_inc_A, col_inc_A,
                               t_data, row_inc_T, col_inc_T );
  }

  return FLA_SUCCESS;
}
FLA_Error FLA_Hess_UT_step_opt_var3 | ( | FLA_Obj | A, |
FLA_Obj | T | ||
) |
References FLA_Hess_UT_step_opc_var3(), FLA_Hess_UT_step_opd_var3(), FLA_Hess_UT_step_ops_var3(), FLA_Hess_UT_step_opz_var3(), FLA_Obj_col_stride(), FLA_Obj_datatype(), FLA_Obj_length(), and FLA_Obj_row_stride().
Referenced by FLA_Hess_UT_blk_var3(), and FLA_Hess_UT_opt_var3().
{
  // Type-dispatching front end for the variant-3 step kernels: reads the
  // element type of A, the row counts of A and T, and their strides, then
  // forwards the raw buffers to the kernel matching that element type.
  // The kernel's return value is not inspected.  FLA_SUCCESS is returned in
  // every case, including when the datatype matches none of the four
  // branches below (in which case no kernel is invoked).
  const FLA_Datatype elem_type = FLA_Obj_datatype( A );

  const int rows_A = FLA_Obj_length( A );
  const int rows_T = FLA_Obj_length( T );

  const int row_inc_A = FLA_Obj_row_stride( A );
  const int col_inc_A = FLA_Obj_col_stride( A );
  const int row_inc_T = FLA_Obj_row_stride( T );
  const int col_inc_T = FLA_Obj_col_stride( T );

  if ( elem_type == FLA_FLOAT )
  {
    float* a_data = FLA_FLOAT_PTR( A );
    float* t_data = FLA_FLOAT_PTR( T );

    FLA_Hess_UT_step_ops_var3( rows_A, rows_T,
                               a_data, row_inc_A, col_inc_A,
                               t_data, row_inc_T, col_inc_T );
  }
  else if ( elem_type == FLA_DOUBLE )
  {
    double* a_data = FLA_DOUBLE_PTR( A );
    double* t_data = FLA_DOUBLE_PTR( T );

    FLA_Hess_UT_step_opd_var3( rows_A, rows_T,
                               a_data, row_inc_A, col_inc_A,
                               t_data, row_inc_T, col_inc_T );
  }
  else if ( elem_type == FLA_COMPLEX )
  {
    scomplex* a_data = FLA_COMPLEX_PTR( A );
    scomplex* t_data = FLA_COMPLEX_PTR( T );

    FLA_Hess_UT_step_opc_var3( rows_A, rows_T,
                               a_data, row_inc_A, col_inc_A,
                               t_data, row_inc_T, col_inc_T );
  }
  else if ( elem_type == FLA_DOUBLE_COMPLEX )
  {
    dcomplex* a_data = FLA_DOUBLE_COMPLEX_PTR( A );
    dcomplex* t_data = FLA_DOUBLE_COMPLEX_PTR( T );

    FLA_Hess_UT_step_opz_var3( rows_A, rows_T,
                               a_data, row_inc_A, col_inc_A,
                               t_data, row_inc_T, col_inc_T );
  }

  return FLA_SUCCESS;
}
References FLA_Hess_UT_step_opc_var4(), FLA_Hess_UT_step_opd_var4(), FLA_Hess_UT_step_ops_var4(), FLA_Hess_UT_step_opz_var4(), FLA_Obj_col_stride(), FLA_Obj_datatype(), FLA_Obj_length(), and FLA_Obj_row_stride().
Referenced by FLA_Hess_UT_blk_var4(), and FLA_Hess_UT_opt_var4().
{
  // Type-dispatching front end for the variant-4 step kernels: reads the
  // element type of A, the row counts of A and T, and the strides of A, Y, Z
  // and T, then forwards the raw buffers to the kernel matching that element
  // type.  The kernel's return value is not inspected.  FLA_SUCCESS is
  // returned in every case, including when the datatype matches none of the
  // four branches below (in which case no kernel is invoked).
  const FLA_Datatype elem_type = FLA_Obj_datatype( A );

  const int rows_A = FLA_Obj_length( A );
  const int rows_T = FLA_Obj_length( T );

  const int row_inc_A = FLA_Obj_row_stride( A );
  const int col_inc_A = FLA_Obj_col_stride( A );
  const int row_inc_Y = FLA_Obj_row_stride( Y );
  const int col_inc_Y = FLA_Obj_col_stride( Y );
  const int row_inc_Z = FLA_Obj_row_stride( Z );
  const int col_inc_Z = FLA_Obj_col_stride( Z );
  const int row_inc_T = FLA_Obj_row_stride( T );
  const int col_inc_T = FLA_Obj_col_stride( T );

  if ( elem_type == FLA_FLOAT )
  {
    float* a_data = FLA_FLOAT_PTR( A );
    float* y_data = FLA_FLOAT_PTR( Y );
    float* z_data = FLA_FLOAT_PTR( Z );
    float* t_data = FLA_FLOAT_PTR( T );

    FLA_Hess_UT_step_ops_var4( rows_A, rows_T,
                               a_data, row_inc_A, col_inc_A,
                               y_data, row_inc_Y, col_inc_Y,
                               z_data, row_inc_Z, col_inc_Z,
                               t_data, row_inc_T, col_inc_T );
  }
  else if ( elem_type == FLA_DOUBLE )
  {
    double* a_data = FLA_DOUBLE_PTR( A );
    double* y_data = FLA_DOUBLE_PTR( Y );
    double* z_data = FLA_DOUBLE_PTR( Z );
    double* t_data = FLA_DOUBLE_PTR( T );

    FLA_Hess_UT_step_opd_var4( rows_A, rows_T,
                               a_data, row_inc_A, col_inc_A,
                               y_data, row_inc_Y, col_inc_Y,
                               z_data, row_inc_Z, col_inc_Z,
                               t_data, row_inc_T, col_inc_T );
  }
  else if ( elem_type == FLA_COMPLEX )
  {
    scomplex* a_data = FLA_COMPLEX_PTR( A );
    scomplex* y_data = FLA_COMPLEX_PTR( Y );
    scomplex* z_data = FLA_COMPLEX_PTR( Z );
    scomplex* t_data = FLA_COMPLEX_PTR( T );

    FLA_Hess_UT_step_opc_var4( rows_A, rows_T,
                               a_data, row_inc_A, col_inc_A,
                               y_data, row_inc_Y, col_inc_Y,
                               z_data, row_inc_Z, col_inc_Z,
                               t_data, row_inc_T, col_inc_T );
  }
  else if ( elem_type == FLA_DOUBLE_COMPLEX )
  {
    dcomplex* a_data = FLA_DOUBLE_COMPLEX_PTR( A );
    dcomplex* y_data = FLA_DOUBLE_COMPLEX_PTR( Y );
    dcomplex* z_data = FLA_DOUBLE_COMPLEX_PTR( Z );
    dcomplex* t_data = FLA_DOUBLE_COMPLEX_PTR( T );

    FLA_Hess_UT_step_opz_var4( rows_A, rows_T,
                               a_data, row_inc_A, col_inc_A,
                               y_data, row_inc_Y, col_inc_Y,
                               z_data, row_inc_Z, col_inc_Z,
                               t_data, row_inc_T, col_inc_T );
  }

  return FLA_SUCCESS;
}
References FLA_Hess_UT_step_opc_var5(), FLA_Hess_UT_step_opd_var5(), FLA_Hess_UT_step_ops_var5(), FLA_Hess_UT_step_opz_var5(), FLA_Obj_col_stride(), FLA_Obj_datatype(), FLA_Obj_length(), and FLA_Obj_row_stride().
Referenced by FLA_Hess_UT_blk_var5(), and FLA_Hess_UT_opt_var5().
{
  // Type-dispatching front end for the variant-5 step kernels: reads the
  // element type of A, the row counts of A and T, and the strides of A, U, Z
  // and T, then forwards the raw buffers to the kernel matching that element
  // type.  The kernel's return value is not inspected.  FLA_SUCCESS is
  // returned in every case, including when the datatype matches none of the
  // four branches below (in which case no kernel is invoked).
  const FLA_Datatype elem_type = FLA_Obj_datatype( A );

  const int rows_A = FLA_Obj_length( A );
  const int rows_T = FLA_Obj_length( T );

  const int row_inc_A = FLA_Obj_row_stride( A );
  const int col_inc_A = FLA_Obj_col_stride( A );
  const int row_inc_U = FLA_Obj_row_stride( U );
  const int col_inc_U = FLA_Obj_col_stride( U );
  const int row_inc_Z = FLA_Obj_row_stride( Z );
  const int col_inc_Z = FLA_Obj_col_stride( Z );
  const int row_inc_T = FLA_Obj_row_stride( T );
  const int col_inc_T = FLA_Obj_col_stride( T );

  if ( elem_type == FLA_FLOAT )
  {
    float* a_data = FLA_FLOAT_PTR( A );
    float* u_data = FLA_FLOAT_PTR( U );
    float* z_data = FLA_FLOAT_PTR( Z );
    float* t_data = FLA_FLOAT_PTR( T );

    FLA_Hess_UT_step_ops_var5( rows_A, rows_T,
                               a_data, row_inc_A, col_inc_A,
                               u_data, row_inc_U, col_inc_U,
                               z_data, row_inc_Z, col_inc_Z,
                               t_data, row_inc_T, col_inc_T );
  }
  else if ( elem_type == FLA_DOUBLE )
  {
    double* a_data = FLA_DOUBLE_PTR( A );
    double* u_data = FLA_DOUBLE_PTR( U );
    double* z_data = FLA_DOUBLE_PTR( Z );
    double* t_data = FLA_DOUBLE_PTR( T );

    FLA_Hess_UT_step_opd_var5( rows_A, rows_T,
                               a_data, row_inc_A, col_inc_A,
                               u_data, row_inc_U, col_inc_U,
                               z_data, row_inc_Z, col_inc_Z,
                               t_data, row_inc_T, col_inc_T );
  }
  else if ( elem_type == FLA_COMPLEX )
  {
    scomplex* a_data = FLA_COMPLEX_PTR( A );
    scomplex* u_data = FLA_COMPLEX_PTR( U );
    scomplex* z_data = FLA_COMPLEX_PTR( Z );
    scomplex* t_data = FLA_COMPLEX_PTR( T );

    FLA_Hess_UT_step_opc_var5( rows_A, rows_T,
                               a_data, row_inc_A, col_inc_A,
                               u_data, row_inc_U, col_inc_U,
                               z_data, row_inc_Z, col_inc_Z,
                               t_data, row_inc_T, col_inc_T );
  }
  else if ( elem_type == FLA_DOUBLE_COMPLEX )
  {
    dcomplex* a_data = FLA_DOUBLE_COMPLEX_PTR( A );
    dcomplex* u_data = FLA_DOUBLE_COMPLEX_PTR( U );
    dcomplex* z_data = FLA_DOUBLE_COMPLEX_PTR( Z );
    dcomplex* t_data = FLA_DOUBLE_COMPLEX_PTR( T );

    FLA_Hess_UT_step_opz_var5( rows_A, rows_T,
                               a_data, row_inc_A, col_inc_A,
                               u_data, row_inc_U, col_inc_U,
                               z_data, row_inc_Z, col_inc_Z,
                               t_data, row_inc_T, col_inc_T );
  }

  return FLA_SUCCESS;
}
FLA_Error FLA_Hess_UT_step_opz_var1 | ( | int | m_A, |
int | m_T, | ||
dcomplex * | buff_A, | ||
int | rs_A, | ||
int | cs_A, | ||
dcomplex * | buff_T, | ||
int | rs_T, | ||
int | cs_T | ||
) |
References bli_zgemv(), BLIS_CONJ_TRANSPOSE, BLIS_NO_CONJUGATE, FLA_Apply_H2_UT_l_opz_var1(), FLA_Apply_H2_UT_r_opz_var1(), FLA_Househ2_UT_l_opz(), FLA_ONE, and FLA_ZERO.
Referenced by FLA_Hess_UT_step_opt_var1().
{
  // Double-complex step kernel, variant 1.  Runs m_T iterations; iteration k
  // does work only while rows remain below the diagonal of A.  In that case
  // it:
  //   1. calls FLA_Househ2_UT_l_opz on the subdiagonal part of column k,
  //      handing it the diagonal entry of T's column k,
  //   2. saves the head of that subdiagonal column and overwrites it with
  //      one,
  //   3. calls FLA_Apply_H2_UT_l_opz_var1 and FLA_Apply_H2_UT_r_opz_var1 on
  //      the parts of A to the right of column k,
  //   4. fills the above-diagonal part of T's column k via bli_zgemv,
  //   5. puts the saved head element back.
  // A and T are modified in place.  Always returns FLA_SUCCESS.
  dcomplex* const one     = FLA_DOUBLE_COMPLEX_PTR( FLA_ONE );
  dcomplex* const zero    = FLA_DOUBLE_COMPLEX_PTR( FLA_ZERO );
  const int       n_steps = m_T;   // b_alg = FLA_Obj_length( T )
  dcomplex        saved_head;
  int             k;

  for ( k = 0; k < n_steps; ++k )
  {
    const int rows_below = m_A - k - 1;
    const int cols_right = m_A - k - 1;
    const int cols_left  = k;

    if ( rows_below > 0 )
    {
      // Subdiagonal part of column k: its head and everything beneath it.
      dcomplex* col_head     = buff_A + (k  )*cs_A + (k+1)*rs_A;
      dcomplex* col_rest     = buff_A + (k  )*cs_A + (k+2)*rs_A;

      // Rows k+1.. of the columns left of k, and of the columns right of k
      // (the latter split into its first row and the rows beneath it).
      dcomplex* left_block   = buff_A + (0  )*cs_A + (k+1)*rs_A;
      dcomplex* trail_top    = buff_A + (k+1)*cs_A + (k+1)*rs_A;
      dcomplex* trail_bottom = buff_A + (k+1)*cs_A + (k+2)*rs_A;

      // All rows of the columns right of k, split into column k+1 and the
      // columns after it.
      dcomplex* right_first  = buff_A + (k+1)*cs_A + (0  )*rs_A;
      dcomplex* right_rest   = buff_A + (k+2)*cs_A + (0  )*rs_A;

      // Column k of T: entries above the diagonal, and the diagonal entry.
      dcomplex* t_above      = buff_T + (k  )*cs_T + (0  )*rs_T;
      dcomplex* t_diag       = buff_T + (k  )*cs_T + (k  )*rs_T;

      // FLA_Househ2_UT( FLA_LEFT, a21_t, a21_b, tau11 );
      FLA_Househ2_UT_l_opz( rows_below - 1,
                            col_head,
                            col_rest, rs_A,
                            t_diag );

      // FLA_Copy( a21_t, first_elem );  FLA_Set( FLA_ONE, a21_t );
      saved_head = *col_head;
      *col_head  = *one;

      // FLA_Apply_H2_UT( FLA_LEFT, tau11, a21_b, A22_t, A22_b );
      FLA_Apply_H2_UT_l_opz_var1( rows_below - 1,
                                  cols_right,
                                  t_diag,
                                  col_rest, rs_A,
                                  trail_top, cs_A,
                                  trail_bottom, rs_A, cs_A );

      // FLA_Apply_H2_UT( FLA_RIGHT, tau11, a21_b, A2_l, A2_r );
      FLA_Apply_H2_UT_r_opz_var1( m_A,
                                  cols_right - 1,
                                  t_diag,
                                  col_rest, rs_A,
                                  right_first, rs_A,
                                  right_rest, rs_A, cs_A );

      // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A20, a21, FLA_ZERO, t01 );
      bli_zgemv( BLIS_CONJ_TRANSPOSE,
                 BLIS_NO_CONJUGATE,
                 rows_below,
                 cols_left,
                 one,
                 left_block, rs_A, cs_A,
                 col_head, rs_A,
                 zero,
                 t_above, rs_T );

      // FLA_Copy( first_elem, a21_t );
      *col_head = saved_head;
    }
  }

  return FLA_SUCCESS;
}
FLA_Error FLA_Hess_UT_step_opz_var2 | ( | int | m_A, |
int | m_T, | ||
dcomplex * | buff_A, | ||
int | rs_A, | ||
int | cs_A, | ||
dcomplex * | buff_T, | ||
int | rs_T, | ||
int | cs_T | ||
) |
References bli_zaxpyv(), bli_zdot(), bli_zgemv(), bli_zger(), bli_zscalv(), BLIS_CONJ_TRANSPOSE, BLIS_CONJUGATE, BLIS_NO_CONJUGATE, BLIS_NO_TRANSPOSE, FLA_free(), FLA_Househ2_UT_l_opz(), FLA_malloc(), FLA_MINUS_ONE, FLA_ONE, FLA_TWO, and FLA_ZERO.
Referenced by FLA_Hess_UT_step_opt_var2().
{
  // Double-complex step kernel, variant 2, operating directly on raw
  // buffers and strides.
  //
  //   m_A                 - extent used for the trailing parts of A and the
  //                         length of the scratch vectors y and z.
  //   m_T                 - number of loop iterations (copied into b_alg).
  //   buff_A, rs_A, cs_A  - base pointer and row/column strides of A; A is
  //                         read and written in place.
  //   buff_T, rs_T, cs_T  - T; in iteration i the entries of column i above
  //                         the diagonal (t01) are written, and the diagonal
  //                         entry (tau11) is handed to FLA_Househ2_UT_l_opz
  //                         (presumably as an output - confirm against that
  //                         routine's contract).
  //
  // Returns FLA_SUCCESS unconditionally.
  //
  // The "// FLA_xxx( ... )" comments in the body show the object-level call
  // that each group of bli_* calls stands in for.
  //
  // NOTE(review): the pointers returned by FLA_malloc are used without a
  // NULL check in this function - confirm that FLA_malloc itself handles
  // failure.
  dcomplex* buff_2 = FLA_DOUBLE_COMPLEX_PTR( FLA_TWO );
  dcomplex* buff_1 = FLA_DOUBLE_COMPLEX_PTR( FLA_ONE );
  dcomplex* buff_0 = FLA_DOUBLE_COMPLEX_PTR( FLA_ZERO );
  dcomplex* buff_m1 = FLA_DOUBLE_COMPLEX_PTR( FLA_MINUS_ONE );
  dcomplex first_elem;
  dcomplex dot_product;
  dcomplex beta, conj_beta;
  dcomplex inv_tau11;
  dcomplex minus_inv_tau11;
  int i;

  // b_alg = FLA_Obj_length( T );
  int b_alg = m_T;

  // Scratch vectors y and z with m_A elements each, released just before
  // returning.
  // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &y );
  // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &z );
  dcomplex* buff_y = ( dcomplex* ) FLA_malloc( m_A * sizeof( *buff_A ) );
  dcomplex* buff_z = ( dcomplex* ) FLA_malloc( m_A * sizeof( *buff_A ) );
  int inc_y = 1;
  int inc_z = 1;

  for ( i = 0; i < b_alg; ++i )
  {
    // Views into A, T, y and z relative to the current diagonal position
    // (i,i): "0" parts lie before index i, "1" is index i, "2" lies after.
    dcomplex* A20 = buff_A + (0 )*cs_A + (i+1)*rs_A;
    dcomplex* a21 = buff_A + (i )*cs_A + (i+1)*rs_A;
    dcomplex* A02 = buff_A + (i+1)*cs_A + (0 )*rs_A;
    dcomplex* a12t = buff_A + (i+1)*cs_A + (i )*rs_A;
    dcomplex* A22 = buff_A + (i+1)*cs_A + (i+1)*rs_A;
    dcomplex* t01 = buff_T + (i )*cs_T + (0 )*rs_T;
    dcomplex* tau11 = buff_T + (i )*cs_T + (i )*rs_T;
    dcomplex* y0 = buff_y + (0 )*inc_y;
    dcomplex* y2 = buff_y + (i+1)*inc_y;
    dcomplex* z2 = buff_z + (i+1)*inc_z;
    // First element of a21 and the remainder of a21 below it.
    dcomplex* a21_t = a21 + (0 )*cs_A + (0 )*rs_A;
    dcomplex* a21_b = a21 + (0 )*cs_A + (1 )*rs_A;
    int m_ahead = m_A - i - 1;
    int n_ahead = m_A - i - 1;
    int m_behind = i;
    int n_behind = i;

    /*------------------------------------------------------------*/

    // Nothing is done once no rows remain below the diagonal.
    if ( m_ahead > 0 )
    {
      // FLA_Househ2_UT( FLA_LEFT,
      //                 a21_t,
      //                 a21_b, tau11 );
      FLA_Househ2_UT_l_opz( m_ahead - 1, a21_t, a21_b, rs_A, tau11 );

      // inv_tau11 = 1 / tau11 and its negation, reused by the scalings below.
      // FLA_Set( FLA_ONE, inv_tau11 );
      // FLA_Inv_scalc( FLA_NO_CONJUGATE, tau11, inv_tau11 );
      // FLA_Copy( inv_tau11, minus_inv_tau11 );
      // FLA_Scal( FLA_MINUS_ONE, minus_inv_tau11 );
      bli_zdiv3( buff_1, tau11, &inv_tau11 );
      bli_zneg2( &inv_tau11, &minus_inv_tau11 );

      // Save the head of a21 and overwrite it with one for the duration of
      // this iteration; it is restored at the end of this block.
      // FLA_Copy( a21_t, first_elem );
      // FLA_Set( FLA_ONE, a21_t );
      first_elem = *a21_t;
      *a21_t = *buff_1;

      // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A22, a21, FLA_ZERO, y2 );
      bli_zgemv( BLIS_CONJ_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_ahead,
                 buff_1, A22, rs_A, cs_A, a21, rs_A, buff_0, y2, inc_y );

      // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_ONE, A22, a21, FLA_ZERO, z2 );
      bli_zgemv( BLIS_NO_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_ahead,
                 buff_1, A22, rs_A, cs_A, a21, rs_A, buff_0, z2, inc_z );

      // FLA_Dotc( FLA_CONJUGATE, a21, z2, beta );
      // FLA_Inv_scal( FLA_TWO, beta );
      // FLA_Copyt( FLA_CONJ_NO_TRANSPOSE, beta, conj_beta );
      bli_zdot( BLIS_CONJUGATE, m_ahead, a21, rs_A, z2, inc_z, &beta );
      bli_zinvscals( buff_2, &beta );
      bli_zcopyconj( &beta, &conj_beta );

      // FLA_Scal( minus_inv_tau11, conj_beta );
      // FLA_Axpy( conj_beta, a21, y2 );
      // FLA_Scal( inv_tau11, y2 );
      bli_zscals( &minus_inv_tau11, &conj_beta );
      bli_zaxpyv( BLIS_NO_CONJUGATE, m_ahead, &conj_beta, a21, rs_A, y2, inc_y );
      bli_zscalv( BLIS_NO_CONJUGATE, m_ahead, &inv_tau11, y2, inc_y );

      // FLA_Scal( minus_inv_tau11, beta );
      // FLA_Axpy( beta, a21, z2 );
      // FLA_Scal( inv_tau11, z2 );
      bli_zscals( &minus_inv_tau11, &beta );
      bli_zaxpyv( BLIS_NO_CONJUGATE, m_ahead, &beta, a21, rs_A, z2, inc_z );
      bli_zscalv( BLIS_NO_CONJUGATE, m_ahead, &inv_tau11, z2, inc_z );

      // Update the row of A to the right of the diagonal (a12t).
      // FLA_Dot( a12t, a21, dot_product );
      // FLA_Scal( minus_inv_tau11, dot_product );
      // FLA_Axpyt( FLA_CONJ_TRANSPOSE, dot_product, a21, a12t );
      bli_zdot( BLIS_NO_CONJUGATE, m_ahead, a12t, cs_A, a21, rs_A, &dot_product );
      bli_zscals( &minus_inv_tau11, &dot_product );
      bli_zaxpyv( BLIS_CONJUGATE, m_ahead, &dot_product, a21, rs_A, a12t, cs_A );

      // Update the block above the diagonal and right of column i (A02).
      // y0 occupies the first m_behind entries of buff_y, which do not
      // overlap y2 (starting at entry i+1).
      // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_ONE, A02, a21, FLA_ZERO, y0 );
      // FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, minus_inv_tau11, y0, a21, A02 );
      bli_zgemv( BLIS_NO_TRANSPOSE, BLIS_NO_CONJUGATE, m_behind, n_ahead,
                 buff_1, A02, rs_A, cs_A, a21, rs_A, buff_0, y0, inc_y );
      bli_zger( BLIS_NO_CONJUGATE, BLIS_CONJUGATE, m_behind, n_ahead,
                &minus_inv_tau11, y0, inc_y, a21, rs_A, A02, rs_A, cs_A );

      // Two rank-1 updates of the trailing block A22.
      // FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, FLA_MINUS_ONE, a21, y2, A22 );
      // FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, FLA_MINUS_ONE, z2, a21, A22 );
      bli_zger( BLIS_NO_CONJUGATE, BLIS_CONJUGATE, m_ahead, n_ahead,
                buff_m1, a21, rs_A, y2, inc_y, A22, rs_A, cs_A );
      bli_zger( BLIS_NO_CONJUGATE, BLIS_CONJUGATE, m_ahead, n_ahead,
                buff_m1, z2, inc_z, a21, rs_A, A22, rs_A, cs_A );

      // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A20, a21, FLA_ZERO, t01 );
      bli_zgemv( BLIS_CONJ_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_behind,
                 buff_1, A20, rs_A, cs_A, a21, rs_A, buff_0, t01, rs_T );

      // FLA_Copy( first_elem, a21_t );
      *a21_t = first_elem;
    }

    /*------------------------------------------------------------*/
  }

  // FLA_Obj_free( &y );
  // FLA_Obj_free( &z );
  FLA_free( buff_y );
  FLA_free( buff_z );

  return FLA_SUCCESS;
}
FLA_Error FLA_Hess_UT_step_opz_var3 | ( | int | m_A, |
int | m_T, | ||
dcomplex * | buff_A, | ||
int | rs_A, | ||
int | cs_A, | ||
dcomplex * | buff_T, | ||
int | rs_T, | ||
int | cs_T | ||
) |
References bli_zaxpyv(), bli_zcopyv(), bli_zdot(), bli_zgemv(), bli_zger(), bli_zscalv(), BLIS_CONJ_TRANSPOSE, BLIS_CONJUGATE, BLIS_NO_CONJUGATE, BLIS_NO_TRANSPOSE, FLA_free(), FLA_Househ2_UT_l_opz(), FLA_malloc(), FLA_MINUS_ONE, FLA_ONE, FLA_TWO, and FLA_ZERO.
Referenced by FLA_Hess_UT_step_opt_var3().
{
  // Double-complex step kernel, variant 3, operating directly on raw
  // buffers and strides.
  //
  //   m_A                 - extent used for the trailing parts of A and the
  //                         length of the five scratch vectors.
  //   m_T                 - number of loop iterations (copied into b_alg).
  //   buff_A, rs_A, cs_A  - base pointer and row/column strides of A; A is
  //                         read and written in place.
  //   buff_T, rs_T, cs_T  - T; in iteration i the entries of column i above
  //                         the diagonal (t01) are written, and the diagonal
  //                         entry (tau11) is handed to FLA_Househ2_UT_l_opz
  //                         (presumably as an output - confirm against that
  //                         routine's contract).
  //
  // Structure of one iteration: the two rank-1 updates of A22 built from the
  // u, y, z vectors of the PREVIOUS iteration are applied first (guarded by
  // m_behind > 0), after column i / row i of A have been corrected with the
  // same vectors.  New u, y, z are then computed.  On the last iteration
  // (m_behind + 1 == b_alg) the pending update is applied immediately, since
  // no later iteration will do so.
  //
  // Returns FLA_SUCCESS unconditionally.
  //
  // The "// FLA_xxx( ... )" comments in the body show the object-level call
  // that each group of bli_* calls stands in for.
  //
  // NOTE(review): the pointers returned by FLA_malloc are used without a
  // NULL check in this function - confirm that FLA_malloc itself handles
  // failure.
  dcomplex* buff_2 = FLA_DOUBLE_COMPLEX_PTR( FLA_TWO );
  dcomplex* buff_1 = FLA_DOUBLE_COMPLEX_PTR( FLA_ONE );
  dcomplex* buff_0 = FLA_DOUBLE_COMPLEX_PTR( FLA_ZERO );
  dcomplex* buff_m1 = FLA_DOUBLE_COMPLEX_PTR( FLA_MINUS_ONE );
  dcomplex first_elem;
  dcomplex dot_product;
  dcomplex beta, conj_beta;
  dcomplex inv_tau11;
  dcomplex minus_inv_tau11;
  dcomplex minus_upsilon1, minus_conj_upsilon1;
  dcomplex minus_psi1, minus_conj_psi1;
  dcomplex minus_zeta1;
  int i;

  // b_alg = FLA_Obj_length( T );
  int b_alg = m_T;

  // Scratch vectors u, y, z, v, w with m_A elements each, released just
  // before returning.
  // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &u );
  // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &y );
  // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &z );
  // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &v );
  // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &w );
  dcomplex* buff_u = ( dcomplex* ) FLA_malloc( m_A * sizeof( *buff_A ) );
  dcomplex* buff_y = ( dcomplex* ) FLA_malloc( m_A * sizeof( *buff_A ) );
  dcomplex* buff_z = ( dcomplex* ) FLA_malloc( m_A * sizeof( *buff_A ) );
  dcomplex* buff_v = ( dcomplex* ) FLA_malloc( m_A * sizeof( *buff_A ) );
  dcomplex* buff_w = ( dcomplex* ) FLA_malloc( m_A * sizeof( *buff_A ) );
  int inc_u = 1;
  int inc_y = 1;
  int inc_z = 1;
  int inc_v = 1;
  int inc_w = 1;

  // Initialize some variables (only to prevent compiler warnings).
  first_elem = *buff_0;
  minus_inv_tau11 = *buff_0;

  for ( i = 0; i < b_alg; ++i )
  {
    // Views into A, T and the scratch vectors relative to the current
    // diagonal position (i,i): "0" parts lie before index i, "1" is index i,
    // "2" lies after.
    dcomplex* A20 = buff_A + (0 )*cs_A + (i+1)*rs_A;
    dcomplex* alpha11 = buff_A + (i )*cs_A + (i )*rs_A;
    dcomplex* a21 = buff_A + (i )*cs_A + (i+1)*rs_A;
    dcomplex* A02 = buff_A + (i+1)*cs_A + (0 )*rs_A;
    dcomplex* a12t = buff_A + (i+1)*cs_A + (i )*rs_A;
    dcomplex* A22 = buff_A + (i+1)*cs_A + (i+1)*rs_A;
    dcomplex* t01 = buff_T + (i )*cs_T + (0 )*rs_T;
    dcomplex* tau11 = buff_T + (i )*cs_T + (i )*rs_T;
    dcomplex* upsilon1 = buff_u + (i )*inc_u;
    dcomplex* u2 = buff_u + (i+1)*inc_u;
    dcomplex* y0 = buff_y + (0 )*inc_y;
    dcomplex* psi1 = buff_y + (i )*inc_y;
    dcomplex* y2 = buff_y + (i+1)*inc_y;
    dcomplex* zeta1 = buff_z + (i )*inc_z;
    dcomplex* z2 = buff_z + (i+1)*inc_z;
    dcomplex* v2 = buff_v + (i+1)*inc_v;
    dcomplex* w2 = buff_w + (i+1)*inc_w;
    // First element of a21 and the remainder of a21 below it.
    dcomplex* a21_t = a21 + (0 )*cs_A + (0 )*rs_A;
    dcomplex* a21_b = a21 + (0 )*cs_A + (1 )*rs_A;
    int m_ahead = m_A - i - 1;
    int n_ahead = m_A - i - 1;
    int m_behind = i;
    int n_behind = i;

    /*------------------------------------------------------------*/

    // From the second iteration on: correct alpha11, a12t and a21 using the
    // u, y, z entries left behind by the previous iteration.
    if ( m_behind > 0 )
    {
      // FLA_Copy( upsilon1, minus_upsilon1 );
      // FLA_Scal( FLA_MINUS_ONE, minus_upsilon1 );
      // FLA_Copyt( FLA_CONJ_NO_TRANSPOSE, minus_upsilon1, minus_conj_upsilon1 );
      bli_zmult3( buff_m1, upsilon1, &minus_upsilon1 );
      bli_zcopyconj( &minus_upsilon1, &minus_conj_upsilon1 );

      // FLA_Copy( psi1, minus_psi1 );
      // FLA_Scal( FLA_MINUS_ONE, minus_psi1 );
      // FLA_Copyt( FLA_CONJ_NO_TRANSPOSE, minus_psi1, minus_conj_psi1 );
      bli_zmult3( buff_m1, psi1, &minus_psi1 );
      bli_zcopyconj( &minus_psi1, &minus_conj_psi1 );

      // FLA_Copy( zeta1, minus_zeta1 );
      // FLA_Scal( FLA_MINUS_ONE, minus_zeta1 );
      bli_zmult3( buff_m1, zeta1, &minus_zeta1 );

      // Scalar updates of alpha11, expressed as length-1 axpyv calls.
      // FLA_Axpyt( FLA_CONJ_NO_TRANSPOSE, minus_upsilon1, psi1, alpha11 );
      // FLA_Axpyt( FLA_CONJ_NO_TRANSPOSE, minus_zeta1, upsilon1, alpha11 );
      bli_zaxpyv( BLIS_CONJUGATE, 1, &minus_upsilon1, psi1, 1, alpha11, 1 );
      bli_zaxpyv( BLIS_CONJUGATE, 1, &minus_zeta1, upsilon1, 1, alpha11, 1 );

      // FLA_Axpyt( FLA_CONJ_TRANSPOSE, minus_upsilon1, y2, a12t );
      // FLA_Axpyt( FLA_CONJ_TRANSPOSE, minus_zeta1, u2, a12t );
      bli_zaxpyv( BLIS_CONJUGATE, m_ahead, &minus_upsilon1, y2, inc_y, a12t, cs_A );
      bli_zaxpyv( BLIS_CONJUGATE, m_ahead, &minus_zeta1, u2, inc_u, a12t, cs_A );

      // FLA_Axpyt( FLA_NO_TRANSPOSE, minus_conj_psi1, u2, a21 );
      // FLA_Axpyt( FLA_NO_TRANSPOSE, minus_conj_upsilon1, z2, a21 );
      bli_zaxpyv( BLIS_NO_CONJUGATE, m_ahead, &minus_conj_psi1, u2, inc_u, a21, rs_A );
      bli_zaxpyv( BLIS_NO_CONJUGATE, m_ahead, &minus_conj_upsilon1, z2, inc_z, a21, rs_A );
    }

    if ( m_ahead > 0 )
    {
      // FLA_Househ2_UT( FLA_LEFT,
      //                 a21_t,
      //                 a21_b, tau11 );
      FLA_Househ2_UT_l_opz( m_ahead - 1, a21_t, a21_b, rs_A, tau11 );

      // inv_tau11 = 1 / tau11 and its negation, reused by the scalings below.
      // FLA_Set( FLA_ONE, inv_tau11 );
      // FLA_Inv_scalc( FLA_NO_CONJUGATE, tau11, inv_tau11 );
      // FLA_Copy( inv_tau11, minus_inv_tau11 );
      // FLA_Scal( FLA_MINUS_ONE, minus_inv_tau11 );
      bli_zdiv3( buff_1, tau11, &inv_tau11 );
      bli_zneg2( &inv_tau11, &minus_inv_tau11 );

      // Save the head of a21 and overwrite it with one; it is restored at
      // the end of the second "m_ahead > 0" block below.
      // FLA_Copy( a21_t, first_elem );
      // FLA_Set( FLA_ONE, a21_t );
      first_elem = *a21_t;
      *a21_t = *buff_1;
    }

    // Apply the previous iteration's pending rank-1 updates to A22.  This
    // must happen before u2, y2, z2 are overwritten further down.
    if ( m_behind > 0 )
    {
      // FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, FLA_MINUS_ONE, u2, y2, A22 );
      // FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, FLA_MINUS_ONE, z2, u2, A22 );
      bli_zger( BLIS_NO_CONJUGATE, BLIS_CONJUGATE, m_ahead, n_ahead,
                buff_m1, u2, inc_u, y2, inc_y, A22, rs_A, cs_A );
      bli_zger( BLIS_NO_CONJUGATE, BLIS_CONJUGATE, m_ahead, n_ahead,
                buff_m1, z2, inc_z, u2, inc_u, A22, rs_A, cs_A );
    }

    if ( m_ahead > 0 )
    {
      // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A22, a21, FLA_ZERO, v2 );
      bli_zgemv( BLIS_CONJ_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_ahead,
                 buff_1, A22, rs_A, cs_A, a21, rs_A, buff_0, v2, inc_v );

      // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_ONE, A22, a21, FLA_ZERO, w2 );
      bli_zgemv( BLIS_NO_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_ahead,
                 buff_1, A22, rs_A, cs_A, a21, rs_A, buff_0, w2, inc_w );

      // Start the new u, y, z from a21 and the two products just computed.
      // FLA_Copy( a21, u2 );
      // FLA_Copy( v2, y2 );
      // FLA_Copy( w2, z2 );
      bli_zcopyv( BLIS_NO_CONJUGATE, m_ahead, a21, rs_A, u2, inc_u );
      bli_zcopyv( BLIS_NO_CONJUGATE, m_ahead, v2, inc_v, y2, inc_y );
      bli_zcopyv( BLIS_NO_CONJUGATE, m_ahead, w2, inc_w, z2, inc_z );

      // FLA_Dotc( FLA_CONJUGATE, a21, z2, beta );
      // FLA_Inv_scal( FLA_TWO, beta );
      // FLA_Copyt( FLA_CONJ_NO_TRANSPOSE, beta, conj_beta );
      bli_zdot( BLIS_CONJUGATE, m_ahead, a21, rs_A, z2, inc_z, &beta );
      bli_zinvscals( buff_2, &beta );
      bli_zcopyconj( &beta, &conj_beta );

      // FLA_Scal( minus_inv_tau11, conj_beta );
      // FLA_Axpy( conj_beta, a21, y2 );
      // FLA_Scal( inv_tau11, y2 );
      bli_zscals( &minus_inv_tau11, &conj_beta );
      bli_zaxpyv( BLIS_NO_CONJUGATE, m_ahead, &conj_beta, a21, rs_A, y2, inc_y );
      bli_zscalv( BLIS_NO_CONJUGATE, m_ahead, &inv_tau11, y2, inc_y );

      // FLA_Scal( minus_inv_tau11, beta );
      // FLA_Axpy( beta, a21, z2 );
      // FLA_Scal( inv_tau11, z2 );
      bli_zscals( &minus_inv_tau11, &beta );
      bli_zaxpyv( BLIS_NO_CONJUGATE, m_ahead, &beta, a21, rs_A, z2, inc_z );
      bli_zscalv( BLIS_NO_CONJUGATE, m_ahead, &inv_tau11, z2, inc_z );

      // Update the row of A to the right of the diagonal (a12t).
      // FLA_Dot( a12t, a21, dot_product );
      // FLA_Scal( minus_inv_tau11, dot_product );
      // FLA_Axpyt( FLA_CONJ_TRANSPOSE, dot_product, a21, a12t );
      bli_zdot( BLIS_NO_CONJUGATE, m_ahead, a12t, cs_A, a21, rs_A, &dot_product );
      bli_zscals( &minus_inv_tau11, &dot_product );
      bli_zaxpyv( BLIS_CONJUGATE, m_ahead, &dot_product, a21, rs_A, a12t, cs_A );

      // Update the block above the diagonal and right of column i (A02).
      // y0 covers the first m_behind entries of buff_y, so this overwrites
      // psi1 values of earlier iterations but not y2.
      // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_ONE, A02, a21, FLA_ZERO, y0 );
      // FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, minus_inv_tau11, y0, a21, A02 );
      bli_zgemv( BLIS_NO_TRANSPOSE, BLIS_NO_CONJUGATE, m_behind, n_ahead,
                 buff_1, A02, rs_A, cs_A, a21, rs_A, buff_0, y0, inc_y );
      bli_zger( BLIS_NO_CONJUGATE, BLIS_CONJUGATE, m_behind, n_ahead,
                &minus_inv_tau11, y0, inc_y, a21, rs_A, A02, rs_A, cs_A );

      // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A20, a21, FLA_ZERO, t01 );
      bli_zgemv( BLIS_CONJ_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_behind,
                 buff_1, A20, rs_A, cs_A, a21, rs_A, buff_0, t01, rs_T );

      // FLA_Copy( first_elem, a21_t );
      *a21_t = first_elem;
    }

    // Last iteration only: no later iteration will apply the pending rank-1
    // updates to A22, so apply them now.
    if ( m_behind + 1 == b_alg && m_ahead > 0 )
    {
      // FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, FLA_MINUS_ONE, u2, y2, A22 );
      // FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, FLA_MINUS_ONE, z2, u2, A22 );
      bli_zger( BLIS_NO_CONJUGATE, BLIS_CONJUGATE, m_ahead, n_ahead,
                buff_m1, u2, inc_u, y2, inc_y, A22, rs_A, cs_A );
      bli_zger( BLIS_NO_CONJUGATE, BLIS_CONJUGATE, m_ahead, n_ahead,
                buff_m1, z2, inc_z, u2, inc_u, A22, rs_A, cs_A );
    }

    /*------------------------------------------------------------*/
  }

  // FLA_Obj_free( &u );
  // FLA_Obj_free( &y );
  // FLA_Obj_free( &z );
  // FLA_Obj_free( &v );
  // FLA_Obj_free( &w );
  FLA_free( buff_u );
  FLA_free( buff_y );
  FLA_free( buff_z );
  FLA_free( buff_v );
  FLA_free( buff_w );

  return FLA_SUCCESS;
}
FLA_Error FLA_Hess_UT_step_opz_var4 | ( | int | m_A, |
int | m_T, | ||
dcomplex * | buff_A, | ||
int | rs_A, | ||
int | cs_A, | ||
dcomplex * | buff_Y, | ||
int | rs_Y, | ||
int | cs_Y, | ||
dcomplex * | buff_Z, | ||
int | rs_Z, | ||
int | cs_Z, | ||
dcomplex * | buff_T, | ||
int | rs_T, | ||
int | cs_T | ||
) |
References bli_zaxpyv(), bli_zcopyv(), bli_zdot(), bli_zgemv(), bli_zger(), bli_zscalv(), bli_zsetm(), BLIS_CONJ_NO_TRANSPOSE, BLIS_CONJ_TRANSPOSE, BLIS_CONJUGATE, BLIS_NO_CONJUGATE, BLIS_NO_TRANSPOSE, FLA_free(), FLA_Househ2_UT_l_opz(), FLA_malloc(), FLA_MINUS_ONE, FLA_ONE, FLA_TWO, and FLA_ZERO.
Referenced by FLA_Hess_UT_step_opt_var4().
{ dcomplex* buff_2 = FLA_DOUBLE_COMPLEX_PTR( FLA_TWO ); dcomplex* buff_1 = FLA_DOUBLE_COMPLEX_PTR( FLA_ONE ); dcomplex* buff_0 = FLA_DOUBLE_COMPLEX_PTR( FLA_ZERO ); dcomplex* buff_m1 = FLA_DOUBLE_COMPLEX_PTR( FLA_MINUS_ONE ); dcomplex first_elem, last_elem; dcomplex dot_product; dcomplex beta, conj_beta; dcomplex inv_tau11; dcomplex minus_inv_tau11; int i; // b_alg = FLA_Obj_length( T ); int b_alg = m_T; // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &d ); // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &e ); // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &f ); dcomplex* buff_d = ( dcomplex* ) FLA_malloc( m_A * sizeof( *buff_A ) ); dcomplex* buff_e = ( dcomplex* ) FLA_malloc( m_A * sizeof( *buff_A ) ); dcomplex* buff_f = ( dcomplex* ) FLA_malloc( m_A * sizeof( *buff_A ) ); int inc_d = 1; int inc_e = 1; int inc_f = 1; // FLA_Set( FLA_ZERO, Y ); // FLA_Set( FLA_ZERO, Z ); bli_zsetm( m_A, b_alg, buff_0, buff_Y, rs_Y, cs_Y ); bli_zsetm( m_A, b_alg, buff_0, buff_Z, rs_Z, cs_Z ); for ( i = 0; i < b_alg; ++i ) { dcomplex* a10t = buff_A + (0 )*cs_A + (i )*rs_A; dcomplex* A20 = buff_A + (0 )*cs_A + (i+1)*rs_A; dcomplex* alpha11 = buff_A + (i )*cs_A + (i )*rs_A; dcomplex* a21 = buff_A + (i )*cs_A + (i+1)*rs_A; dcomplex* A02 = buff_A + (i+1)*cs_A + (0 )*rs_A; dcomplex* a12t = buff_A + (i+1)*cs_A + (i )*rs_A; dcomplex* A22 = buff_A + (i+1)*cs_A + (i+1)*rs_A; dcomplex* y10t = buff_Y + (0 )*cs_Y + (i )*rs_Y; dcomplex* Y20 = buff_Y + (0 )*cs_Y + (i+1)*rs_Y; dcomplex* y21 = buff_Y + (i )*cs_Y + (i+1)*rs_Y; dcomplex* z10t = buff_Z + (0 )*cs_Z + (i )*rs_Z; dcomplex* Z20 = buff_Z + (0 )*cs_Z + (i+1)*rs_Z; dcomplex* z21 = buff_Z + (i )*cs_Z + (i+1)*rs_Z; dcomplex* t01 = buff_T + (i )*cs_T + (0 )*rs_T; dcomplex* tau11 = buff_T + (i )*cs_T + (i )*rs_T; dcomplex* d0 = buff_d + (0 )*inc_d; dcomplex* e0 = buff_e + (0 )*inc_e; dcomplex* f0 = buff_f + (0 )*inc_f; dcomplex* a10t_r = a10t + (i-1)*cs_A + (0 )*rs_A; dcomplex* a21_t = a21 + (0 )*cs_A + (0 )*rs_A; dcomplex* a21_b = a21 + (0 )*cs_A + (1 
)*rs_A; dcomplex* ABL = a10t; dcomplex* ZBL = z10t; dcomplex* a2 = alpha11; int m_ahead = m_A - i - 1; int n_ahead = m_A - i - 1; int m_behind = i; int n_behind = i; /*------------------------------------------------------------*/ if ( m_behind > 0 ) { // FLA_Copy( a10t_r, last_elem ); // FLA_Set( FLA_ONE, a10t_r ); last_elem = *a10t_r; *a10t_r = *buff_1; } // FLA_Gemvc( FLA_NO_TRANSPOSE, FLA_CONJUGATE, FLA_MINUS_ONE, ABL, y10t, FLA_ONE, a2 ); // FLA_Gemvc( FLA_NO_TRANSPOSE, FLA_CONJUGATE, FLA_MINUS_ONE, ZBL, a10t, FLA_ONE, a2 ); bli_zgemv( BLIS_NO_TRANSPOSE, BLIS_CONJUGATE, m_ahead + 1, n_behind, buff_m1, ABL, rs_A, cs_A, y10t, cs_Y, buff_1, a2, rs_A ); bli_zgemv( BLIS_NO_TRANSPOSE, BLIS_CONJUGATE, m_ahead + 1, n_behind, buff_m1, ZBL, rs_Z, cs_Z, a10t, cs_A, buff_1, a2, rs_A ); // FLA_Gemv( FLA_CONJ_NO_TRANSPOSE, FLA_MINUS_ONE, Y20, a10t, FLA_ONE, a12t ); // FLA_Gemv( FLA_CONJ_NO_TRANSPOSE, FLA_MINUS_ONE, A20, z10t, FLA_ONE, a12t ); bli_zgemv( BLIS_CONJ_NO_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_behind, buff_m1, Y20, rs_Y, cs_Y, a10t, cs_A, buff_1, a12t, cs_A ); bli_zgemv( BLIS_CONJ_NO_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_behind, buff_m1, A20, rs_A, cs_A, z10t, cs_Z, buff_1, a12t, cs_A ); if ( m_behind > 0 ) { // FLA_Copy( last_elem, a10t_r ); *a10t_r = last_elem; } if ( m_ahead > 0 ) { // FLA_Househ2_UT( FLA_LEFT, // a21_t, // a21_b, tau11 ); FLA_Househ2_UT_l_opz( m_ahead - 1, a21_t, a21_b, rs_A, tau11 ); // FLA_Set( FLA_ONE, inv_tau11 ); // FLA_Inv_scalc( FLA_NO_CONJUGATE, tau11, inv_tau11 ); // FLA_Copy( inv_tau11, minus_inv_tau11 ); // FLA_Scal( FLA_MINUS_ONE, minus_inv_tau11 ); bli_zdiv3( buff_1, tau11, &inv_tau11 ); bli_zneg2( &inv_tau11, &minus_inv_tau11 ); // FLA_Copy( a21_t, first_elem ); // FLA_Set( FLA_ONE, a21_t ); first_elem = *a21_t; *a21_t = *buff_1; // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A22, a21, FLA_ZERO, y21 ); bli_zgemv( BLIS_CONJ_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_ahead, buff_1, A22, rs_A, cs_A, a21, rs_A, buff_0, y21, rs_Y ); // 
FLA_Gemv( FLA_NO_TRANSPOSE, FLA_ONE, A22, a21, FLA_ZERO, z21 ); bli_zgemv( BLIS_NO_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_ahead, buff_1, A22, rs_A, cs_A, a21, rs_A, buff_0, z21, rs_Z ); // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A20, a21, FLA_ZERO, d0 ); // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, Y20, a21, FLA_ZERO, e0 ); // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, Z20, a21, FLA_ZERO, f0 ); bli_zgemv( BLIS_CONJ_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_behind, buff_1, A20, rs_A, cs_A, a21, rs_A, buff_0, d0, inc_d ); bli_zgemv( BLIS_CONJ_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_behind, buff_1, Y20, rs_Y, cs_Y, a21, rs_A, buff_0, e0, inc_e ); bli_zgemv( BLIS_CONJ_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_behind, buff_1, Z20, rs_Z, cs_Z, a21, rs_A, buff_0, f0, inc_f ); // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, Y20, d0, FLA_ONE, y21 ); // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, A20, f0, FLA_ONE, y21 ); bli_zgemv( BLIS_NO_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_behind, buff_m1, Y20, rs_Y, cs_Y, d0, inc_d, buff_1, y21, rs_Y ); bli_zgemv( BLIS_NO_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_behind, buff_m1, A20, rs_A, cs_A, f0, inc_f, buff_1, y21, rs_Y ); // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, A20, e0, FLA_ONE, z21 ); // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, Z20, d0, FLA_ONE, z21 ); bli_zgemv( BLIS_NO_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_behind, buff_m1, A20, rs_A, cs_A, e0, inc_e, buff_1, z21, rs_Z ); bli_zgemv( BLIS_NO_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_behind, buff_m1, Z20, rs_Z, cs_Z, d0, inc_d, buff_1, z21, rs_Z ); // FLA_Copy( d0, t01 ); bli_zcopyv( BLIS_NO_CONJUGATE, n_behind, d0, inc_d, t01, rs_T ); // FLA_Dotc( FLA_CONJUGATE, a21, z21, beta ); // FLA_Inv_scal( FLA_TWO, beta ); // FLA_Copyt( FLA_CONJ_NO_TRANSPOSE, beta, conj_beta ); bli_zdot( BLIS_CONJUGATE, m_ahead, a21, rs_A, z21, rs_Z, &beta ); bli_zinvscals( buff_2, &beta ); bli_zcopyconj( &beta, &conj_beta ); // FLA_Scal( minus_inv_tau11, conj_beta ); // FLA_Axpy( conj_beta, a21, y21 ); 
// FLA_Scal( inv_tau11, y21 ); bli_zscals( &minus_inv_tau11, &conj_beta ); bli_zaxpyv( BLIS_NO_CONJUGATE, m_ahead, &conj_beta, a21, rs_A, y21, rs_Y ); bli_zscalv( BLIS_NO_CONJUGATE, m_ahead, &inv_tau11, y21, rs_Y ); // FLA_Scal( minus_inv_tau11, beta ); // FLA_Axpy( beta, a21, z21 ); // FLA_Scal( inv_tau11, z21 ); bli_zscals( &minus_inv_tau11, &beta ); bli_zaxpyv( BLIS_NO_CONJUGATE, m_ahead, &beta, a21, rs_A, z21, rs_Z ); bli_zscalv( BLIS_NO_CONJUGATE, m_ahead, &inv_tau11, z21, rs_Z ); // FLA_Dot( a12t, a21, dot_product ); // FLA_Scal( minus_inv_tau11, dot_product ); // FLA_Axpyt( FLA_CONJ_TRANSPOSE, dot_product, a21, a12t ); bli_zdot( BLIS_NO_CONJUGATE, m_ahead, a12t, cs_A, a21, rs_A, &dot_product ); bli_zscals( &minus_inv_tau11, &dot_product ); bli_zaxpyv( BLIS_CONJUGATE, m_ahead, &dot_product, a21, rs_A, a12t, cs_A ); // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_ONE, A02, a21, FLA_ZERO, e0 ); // FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, minus_inv_tau11, e0, a21, A02 ); bli_zgemv( BLIS_NO_TRANSPOSE, BLIS_NO_CONJUGATE, m_behind, n_ahead, buff_1, A02, rs_A, cs_A, a21, rs_A, buff_0, e0, inc_e ); bli_zger( BLIS_NO_CONJUGATE, BLIS_CONJUGATE, m_behind, n_ahead, &minus_inv_tau11, e0, inc_e, a21, rs_A, A02, rs_A, cs_A ); // FLA_Copy( first_elem, a21_t ); *a21_t = first_elem; } /*------------------------------------------------------------*/ } // FLA_Obj_free( &d ); // FLA_Obj_free( &e ); // FLA_Obj_free( &f ); FLA_free( buff_d ); FLA_free( buff_e ); FLA_free( buff_f ); return FLA_SUCCESS; }
FLA_Error FLA_Hess_UT_step_opz_var5 | ( | int | m_A, |
int | m_T, | ||
dcomplex * | buff_A, | ||
int | rs_A, | ||
int | cs_A, | ||
dcomplex * | buff_U, | ||
int | rs_U, | ||
int | cs_U, | ||
dcomplex * | buff_Z, | ||
int | rs_Z, | ||
int | cs_Z, | ||
dcomplex * | buff_T, | ||
int | rs_T, | ||
int | cs_T | ||
) |
References bli_zaxpyv(), bli_zcopyv(), bli_zdot(), bli_zdots(), bli_zgemv(), bli_zsetm(), bli_ztrmv(), bli_ztrmvsx(), bli_ztrsv(), BLIS_CONJ_TRANSPOSE, BLIS_CONJUGATE, BLIS_LOWER_TRIANGULAR, BLIS_NO_CONJUGATE, BLIS_NO_TRANSPOSE, BLIS_NONUNIT_DIAG, BLIS_UPPER_TRIANGULAR, FLA_free(), FLA_Househ2_UT_l_opz(), FLA_malloc(), FLA_MINUS_ONE, FLA_ONE, and FLA_ZERO.
Referenced by FLA_Hess_UT_step_opt_var5().
{ dcomplex* buff_1 = FLA_DOUBLE_COMPLEX_PTR( FLA_ONE ); dcomplex* buff_0 = FLA_DOUBLE_COMPLEX_PTR( FLA_ZERO ); dcomplex* buff_m1 = FLA_DOUBLE_COMPLEX_PTR( FLA_MINUS_ONE ); int i; // b_alg = FLA_Obj_length( T ); int b_alg = m_T; // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &w ); dcomplex* buff_w = ( dcomplex* ) FLA_malloc( m_A * sizeof( *buff_A ) ); int inc_w = 1; // FLA_Set( FLA_ZERO, U ); // FLA_Set( FLA_ZERO, Z ); bli_zsetm( m_A, b_alg, buff_0, buff_U, rs_U, cs_U ); bli_zsetm( m_A, b_alg, buff_0, buff_Z, rs_Z, cs_Z ); for ( i = 0; i < b_alg; ++i ) { dcomplex* a01 = buff_A + (i )*cs_A + (0 )*rs_A; dcomplex* alpha11 = buff_A + (i )*cs_A + (i )*rs_A; dcomplex* a21 = buff_A + (i )*cs_A + (i+1)*rs_A; dcomplex* A02 = buff_A + (i+1)*cs_A + (0 )*rs_A; dcomplex* a12t = buff_A + (i+1)*cs_A + (i )*rs_A; dcomplex* A22 = buff_A + (i+1)*cs_A + (i+1)*rs_A; dcomplex* U00 = buff_U + (0 )*cs_U + (0 )*rs_U; dcomplex* u10t = buff_U + (0 )*cs_U + (i )*rs_U; dcomplex* U20 = buff_U + (0 )*cs_U + (i+1)*rs_U; dcomplex* u21 = buff_U + (i )*cs_U + (i+1)*rs_U; dcomplex* Z00 = buff_Z + (0 )*cs_Z + (0 )*rs_Z; dcomplex* z10t = buff_Z + (0 )*cs_Z + (i )*rs_Z; dcomplex* Z20 = buff_Z + (0 )*cs_Z + (i+1)*rs_Z; dcomplex* z01 = buff_Z + (i )*cs_Z + (0 )*rs_Z; dcomplex* zeta11 = buff_Z + (i )*cs_Z + (i )*rs_Z; dcomplex* z21 = buff_Z + (i )*cs_Z + (i+1)*rs_Z; dcomplex* T00 = buff_T + (0 )*cs_T + (0 )*rs_T; dcomplex* t01 = buff_T + (i )*cs_T + (0 )*rs_T; dcomplex* tau11 = buff_T + (i )*cs_T + (i )*rs_T; dcomplex* w0 = buff_w + (0 )*inc_w; dcomplex* a21_t = a21 + (0 )*cs_A + (0 )*rs_A; dcomplex* a21_b = a21 + (0 )*cs_A + (1 )*rs_A; dcomplex* u21_t = u21 + (0 )*cs_U + (0 )*rs_U; int m_ahead = m_A - i - 1; int n_ahead = m_A - i - 1; int m_behind = i; int n_behind = i; /*------------------------------------------------------------*/ if ( m_behind > 0 ) { // FLA_Copyt( FLA_CONJ_TRANSPOSE, u10t, w0 ); // FLA_Trsv( FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG, // T00, w0 ); bli_zcopyv( 
BLIS_CONJUGATE, m_behind, u10t, cs_U, w0, inc_w ); bli_ztrsv( BLIS_UPPER_TRIANGULAR, BLIS_NO_TRANSPOSE, BLIS_NONUNIT_DIAG, m_behind, T00, rs_T, cs_T, w0, inc_w ); // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, Z00, w0, FLA_ONE, a01 ); // FLA_Dots( FLA_MINUS_ONE, z10t, w0, FLA_ONE, alpha11 ); // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, Z20, w0, FLA_ONE, a21 ); bli_zgemv( BLIS_NO_TRANSPOSE, BLIS_NO_CONJUGATE, m_behind, n_behind, buff_m1, Z00, rs_Z, cs_Z, w0, inc_w, buff_1, a01, rs_A ); bli_zdots( BLIS_NO_CONJUGATE, m_behind, buff_m1, z10t, cs_Z, w0, inc_w, buff_1, alpha11 ); bli_zgemv( BLIS_NO_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_behind, buff_m1, Z20, rs_Z, cs_Z, w0, inc_w, buff_1, a21, rs_A ); // FLA_Trmvsx( FLA_LOWER_TRIANGULAR, FLA_CONJ_TRANSPOSE, FLA_NONUNIT_DIAG, // FLA_ONE, U00, a01, FLA_ZERO, w0 ); // FLA_Axpyt( FLA_CONJ_TRANSPOSE, alpha11, u10t, w0 ); // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, U20, a21, FLA_ONE, w0 ); bli_zcopyv( BLIS_NO_CONJUGATE, m_behind, a01, rs_A, w0, inc_w ); bli_ztrmv( BLIS_LOWER_TRIANGULAR, BLIS_CONJ_TRANSPOSE, BLIS_NONUNIT_DIAG, m_behind, U00, rs_U, cs_U, w0, inc_w ); bli_zaxpyv( BLIS_CONJUGATE, m_behind, alpha11, u10t, cs_U, w0, inc_w ); bli_zgemv( BLIS_CONJ_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_behind, buff_1, U20, rs_U, cs_U, a21, rs_A, buff_1, w0, inc_w ); // FLA_Trsv( FLA_UPPER_TRIANGULAR, FLA_CONJ_TRANSPOSE, FLA_NONUNIT_DIAG, // T00, w0 ); bli_ztrsv( BLIS_UPPER_TRIANGULAR, BLIS_CONJ_TRANSPOSE, BLIS_NONUNIT_DIAG, m_behind, T00, rs_T, cs_T, w0, inc_w ); // FLA_Trmvsx( FLA_LOWER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG, // FLA_MINUS_ONE, U00, w0, FLA_ONE, a01 ); // FLA_Dots( FLA_MINUS_ONE, u10t, w0, FLA_ONE, alpha11 ); // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, U20, w0, FLA_ONE, a21 ); bli_ztrmvsx( BLIS_LOWER_TRIANGULAR, BLIS_NO_TRANSPOSE, BLIS_NONUNIT_DIAG, m_behind, buff_m1, U00, rs_U, cs_U, w0, inc_w, buff_1, a01, rs_A ); bli_zdots( BLIS_NO_CONJUGATE, m_behind, buff_m1, u10t, cs_U, w0, inc_w, buff_1, alpha11 ); 
bli_zgemv( BLIS_NO_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_behind, buff_m1, U20, rs_U, cs_U, w0, inc_w, buff_1, a21, rs_A ); } if ( m_ahead > 0 ) { // FLA_Househ2_UT( FLA_LEFT, // a21_t, // a21_b, tau11 ); FLA_Househ2_UT_l_opz( m_ahead - 1, a21_t, a21_b, rs_A, tau11 ); // FLA_Copy( a21, u21 ); bli_zcopyv( BLIS_NO_CONJUGATE, m_ahead, a21, rs_A, u21, rs_U ); // FLA_Set( FLA_ONE, u21_t ); *u21_t = *buff_1; // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_ONE, A02, u21, FLA_ZERO, z01 ); // FLA_Dot( a12t, u21, zeta11 ); // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_ONE, A22, u21, FLA_ZERO, z21 ); bli_zgemv( BLIS_NO_TRANSPOSE, BLIS_NO_CONJUGATE, m_behind, n_ahead, buff_1, A02, rs_A, cs_A, u21, rs_U, buff_0, z01, rs_Z ); bli_zdot( BLIS_NO_CONJUGATE, m_ahead, a12t, cs_A, u21, rs_U, zeta11 ); bli_zgemv( BLIS_NO_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_ahead, buff_1, A22, rs_A, cs_A, u21, rs_U, buff_0, z21, rs_Z ); // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, U20, u21, FLA_ZERO, t01 ); bli_zgemv( BLIS_CONJ_TRANSPOSE, BLIS_NO_CONJUGATE, m_ahead, n_behind, buff_1, U20, rs_U, cs_U, u21, rs_U, buff_0, t01, rs_T ); } /*------------------------------------------------------------*/ } // FLA_Obj_free( &w ); FLA_free( buff_w ); return FLA_SUCCESS; }
FLA_Error FLA_Hess_UT_step_unb_var1 | ( | FLA_Obj | A, |
FLA_Obj | T | ||
) |
References FLA_Apply_H2_UT(), FLA_Cont_with_1x3_to_1x2(), FLA_Cont_with_3x3_to_2x2(), FLA_Copy(), FLA_Gemv(), FLA_Househ2_UT(), FLA_Obj_create(), FLA_Obj_datatype(), FLA_Obj_free(), FLA_Obj_length(), FLA_ONE, FLA_Part_1x2(), FLA_Part_2x1(), FLA_Part_2x2(), FLA_Repart_1x2_to_1x3(), FLA_Repart_2x2_to_3x3(), FLA_Set(), and FLA_ZERO.
Referenced by FLA_Hess_UT_unb_var1().
{ FLA_Obj ATL, ATR, A00, a01, A02, ABL, ABR, a10t, alpha11, a12t, A20, a21, A22; FLA_Obj AL, AR, A0, a1, A2; FLA_Obj TTL, TTR, T00, t01, T02, TBL, TBR, t10t, tau11, t12t, T20, t21, T22; FLA_Obj a21_t, a21_b; FLA_Obj A22_t, A22_b; FLA_Obj A2_l, A2_r; FLA_Obj first_elem; dim_t b_alg; FLA_Datatype datatype_A; b_alg = FLA_Obj_length( T ); datatype_A = FLA_Obj_datatype( A ); FLA_Obj_create( datatype_A, 1, 1, 0, 0, &first_elem ); FLA_Part_2x2( A, &ATL, &ATR, &ABL, &ABR, 0, 0, FLA_TL ); FLA_Part_1x2( A, &AL, &AR, 0, FLA_LEFT ); FLA_Part_2x2( T, &TTL, &TTR, &TBL, &TBR, 0, 0, FLA_TL ); while ( FLA_Obj_length( ATL ) < b_alg ) { FLA_Repart_2x2_to_3x3( ATL, /**/ ATR, &A00, /**/ &a01, &A02, /* ************* */ /* ************************** */ &a10t, /**/ &alpha11, &a12t, ABL, /**/ ABR, &A20, /**/ &a21, &A22, 1, 1, FLA_BR ); FLA_Repart_1x2_to_1x3( AL, /**/ AR, &A0, /**/ &a1, &A2, 1, FLA_RIGHT ); FLA_Repart_2x2_to_3x3( TTL, /**/ TTR, &T00, /**/ &t01, &T02, /* ************* */ /* ************************** */ &t10t, /**/ &tau11, &t12t, TBL, /**/ TBR, &T20, /**/ &t21, &T22, 1, 1, FLA_BR ); /*------------------------------------------------------------*/ if ( FLA_Obj_length( A22 ) > 0 ) { FLA_Part_2x1( a21, &a21_t, &a21_b, 1, FLA_TOP ); FLA_Part_2x1( A22, &A22_t, &A22_b, 1, FLA_TOP ); FLA_Part_1x2( A2, &A2_l, &A2_r, 1, FLA_LEFT ); // [ u21, tau11, a21 ] = House( a21 ); FLA_Househ2_UT( FLA_LEFT, a21_t, a21_b, tau11 ); // Save first element of a21_t and set it to one so we can use a21 as // u21 in subsequent computations. We will restore a21_t later on. 
FLA_Copy( a21_t, first_elem ); FLA_Set( FLA_ONE, a21_t ); // A22 = ( I - inv( tau ) * u21 * u21' ) * A22; FLA_Apply_H2_UT( FLA_LEFT, tau11, a21_b, A22_t, A22_b ); // A02 = A02 * ( I - inv( tau ) * u21 * u21' ); // a12t = a12t * ( I - inv( tau ) * u21 * u21' ); // A22 = A22 * ( I - inv( tau ) * u21 * u21' ); FLA_Apply_H2_UT( FLA_RIGHT, tau11, a21_b, A2_l, A2_r ); // t01 = U20' * u21; FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A20, a21, FLA_ZERO, t01 ); // Restore first element of a21. FLA_Copy( first_elem, a21_t ); } /*------------------------------------------------------------*/ FLA_Cont_with_3x3_to_2x2( &ATL, /**/ &ATR, A00, a01, /**/ A02, a10t, alpha11, /**/ a12t, /* ************** */ /* ************************ */ &ABL, /**/ &ABR, A20, a21, /**/ A22, FLA_TL ); FLA_Cont_with_1x3_to_1x2( &AL, /**/ &AR, A0, a1, /**/ A2, FLA_LEFT ); FLA_Cont_with_3x3_to_2x2( &TTL, /**/ &TTR, T00, t01, /**/ T02, t10t, tau11, /**/ t12t, /* ************** */ /* ************************ */ &TBL, /**/ &TBR, T20, t21, /**/ T22, FLA_TL ); } FLA_Obj_free( &first_elem ); return FLA_SUCCESS; }
FLA_Error FLA_Hess_UT_step_unb_var2 | ( | FLA_Obj | A, |
FLA_Obj | T | ||
) |
References FLA_Axpy(), FLA_Axpyt(), FLA_Cont_with_3x1_to_2x1(), FLA_Cont_with_3x3_to_2x2(), FLA_Copy(), FLA_Copyt(), FLA_Dot(), FLA_Dotc(), FLA_Gemv(), FLA_Gerc(), FLA_Househ2_UT(), FLA_Inv_scal(), FLA_Inv_scalc(), FLA_MINUS_ONE, FLA_Obj_create(), FLA_Obj_datatype(), FLA_Obj_free(), FLA_Obj_length(), FLA_ONE, FLA_Part_2x1(), FLA_Part_2x2(), FLA_Repart_2x1_to_3x1(), FLA_Repart_2x2_to_3x3(), FLA_Scal(), FLA_Set(), FLA_TWO, and FLA_ZERO.
Referenced by FLA_Hess_UT_unb_var2().
{ FLA_Obj ATL, ATR, A00, a01, A02, ABL, ABR, a10t, alpha11, a12t, A20, a21, A22; FLA_Obj TTL, TTR, T00, t01, T02, TBL, TBR, t10t, tau11, t12t, T20, t21, T22; FLA_Obj yT, y0, yB, psi1, y2; FLA_Obj zT, z0, zB, zeta1, z2; FLA_Obj y, z; FLA_Obj inv_tau11; FLA_Obj minus_inv_tau11; FLA_Obj first_elem; FLA_Obj beta; FLA_Obj conj_beta; FLA_Obj dot_product; FLA_Obj a21_t, a21_b; FLA_Datatype datatype_A; dim_t m_A; dim_t b_alg; b_alg = FLA_Obj_length( T ); datatype_A = FLA_Obj_datatype( A ); m_A = FLA_Obj_length( A ); FLA_Obj_create( datatype_A, 1, 1, 0, 0, &inv_tau11 ); FLA_Obj_create( datatype_A, 1, 1, 0, 0, &minus_inv_tau11 ); FLA_Obj_create( datatype_A, 1, 1, 0, 0, &first_elem ); FLA_Obj_create( datatype_A, 1, 1, 0, 0, &beta ); FLA_Obj_create( datatype_A, 1, 1, 0, 0, &conj_beta ); FLA_Obj_create( datatype_A, 1, 1, 0, 0, &dot_product ); FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &y ); FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &z ); FLA_Part_2x2( A, &ATL, &ATR, &ABL, &ABR, 0, 0, FLA_TL ); FLA_Part_2x2( T, &TTL, &TTR, &TBL, &TBR, 0, 0, FLA_TL ); FLA_Part_2x1( y, &yT, &yB, 0, FLA_TOP ); FLA_Part_2x1( z, &zT, &zB, 0, FLA_TOP ); while ( FLA_Obj_length( ATL ) < b_alg ) { FLA_Repart_2x2_to_3x3( ATL, /**/ ATR, &A00, /**/ &a01, &A02, /* ************* */ /* ************************** */ &a10t, /**/ &alpha11, &a12t, ABL, /**/ ABR, &A20, /**/ &a21, &A22, 1, 1, FLA_BR ); FLA_Repart_2x2_to_3x3( TTL, /**/ TTR, &T00, /**/ &t01, &T02, /* ************* */ /* ************************** */ &t10t, /**/ &tau11, &t12t, TBL, /**/ TBR, &T20, /**/ &t21, &T22, 1, 1, FLA_BR ); FLA_Repart_2x1_to_3x1( yT, &y0, /* ** */ /* **** */ &psi1, yB, &y2, 1, FLA_BOTTOM ); FLA_Repart_2x1_to_3x1( zT, &z0, /* ** */ /* ***** */ &zeta1, zB, &z2, 1, FLA_BOTTOM ); /*------------------------------------------------------------*/ if ( FLA_Obj_length( A22 ) > 0 ) { FLA_Part_2x1( a21, &a21_t, &a21_b, 1, FLA_TOP ); // [ u21, tau11, a21 ] = House( a21 ); FLA_Househ2_UT( FLA_LEFT, a21_t, a21_b, tau11 ); // inv_tau11 = 1 / 
tau11; // minus_inv_tau11 = -1 / tau11; FLA_Set( FLA_ONE, inv_tau11 ); FLA_Inv_scalc( FLA_NO_CONJUGATE, tau11, inv_tau11 ); FLA_Copy( inv_tau11, minus_inv_tau11 ); FLA_Scal( FLA_MINUS_ONE, minus_inv_tau11 ); // Save first element of a21_t and set it to one so we can use a21 as // u21 in subsequent computations. We will restore a21_t later on. FLA_Copy( a21_t, first_elem ); FLA_Set( FLA_ONE, a21_t ); // y21 = A22' * u21; FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A22, a21, FLA_ZERO, y2 ); // z21 = A22 * u21; FLA_Gemv( FLA_NO_TRANSPOSE, FLA_ONE, A22, a21, FLA_ZERO, z2 ); // beta = u21' * z21 / 2; // conj_beta = conj(beta); FLA_Dotc( FLA_CONJUGATE, a21, z2, beta ); FLA_Inv_scal( FLA_TWO, beta ); FLA_Copyt( FLA_CONJ_NO_TRANSPOSE, beta, conj_beta ); // y21' = ( y21' - beta / tau * u21' ) / tau; // y21 = ( y21 - conj(beta) / tau * u21 ) / tau; FLA_Scal( minus_inv_tau11, conj_beta ); FLA_Axpy( conj_beta, a21, y2 ); FLA_Scal( inv_tau11, y2 ); // z21 = ( z21 - beta / tau * u21 ) / tau; FLA_Scal( minus_inv_tau11, beta ); FLA_Axpy( beta, a21, z2 ); FLA_Scal( inv_tau11, z2 ); // a12t = a12t * ( I - u21 * u21' / tau ); // = a12t - ( a12t * u21 ) * u21' / tau; FLA_Dot( a12t, a21, dot_product ); FLA_Scal( minus_inv_tau11, dot_product ); FLA_Axpyt( FLA_CONJ_TRANSPOSE, dot_product, a21, a12t ); // A02 = A02 * ( I - u21 * u21' / tau ); // = A02 - ( A02 * u21 ) * u21' / tau; FLA_Gemv( FLA_NO_TRANSPOSE, FLA_ONE, A02, a21, FLA_ZERO, y0 ); FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, minus_inv_tau11, y0, a21, A02 ); // A22 = A22 - u21 * y21' - z21 * u21'; FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, FLA_MINUS_ONE, a21, y2, A22 ); FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, FLA_MINUS_ONE, z2, a21, A22 ); // t01 = U20' * u21; FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A20, a21, FLA_ZERO, t01 ); // Restore first element of a21. 
FLA_Copy( first_elem, a21_t ); } /*------------------------------------------------------------*/ FLA_Cont_with_3x3_to_2x2( &ATL, /**/ &ATR, A00, a01, /**/ A02, a10t, alpha11, /**/ a12t, /* ************** */ /* ************************ */ &ABL, /**/ &ABR, A20, a21, /**/ A22, FLA_TL ); FLA_Cont_with_3x3_to_2x2( &TTL, /**/ &TTR, T00, t01, /**/ T02, t10t, tau11, /**/ t12t, /* ************** */ /* ************************ */ &TBL, /**/ &TBR, T20, t21, /**/ T22, FLA_TL ); FLA_Cont_with_3x1_to_2x1( &yT, y0, psi1, /* ** */ /* **** */ &yB, y2, FLA_TOP ); FLA_Cont_with_3x1_to_2x1( &zT, z0, zeta1, /* ** */ /* ***** */ &zB, z2, FLA_TOP ); } FLA_Obj_free( &inv_tau11 ); FLA_Obj_free( &minus_inv_tau11 ); FLA_Obj_free( &first_elem ); FLA_Obj_free( &beta ); FLA_Obj_free( &conj_beta ); FLA_Obj_free( &dot_product ); FLA_Obj_free( &y ); FLA_Obj_free( &z ); return FLA_SUCCESS; }
FLA_Error FLA_Hess_UT_step_unb_var3 | ( | FLA_Obj | A, |
FLA_Obj | T | ||
) |
References FLA_Axpy(), FLA_Axpyt(), FLA_Cont_with_3x1_to_2x1(), FLA_Cont_with_3x3_to_2x2(), FLA_Copy(), FLA_Copyt(), FLA_Dot(), FLA_Dotc(), FLA_Gemv(), FLA_Gerc(), FLA_Househ2_UT(), FLA_Inv_scal(), FLA_Inv_scalc(), FLA_MINUS_ONE, FLA_Obj_create(), FLA_Obj_datatype(), FLA_Obj_free(), FLA_Obj_length(), FLA_ONE, FLA_Part_2x1(), FLA_Part_2x2(), FLA_Repart_2x1_to_3x1(), FLA_Repart_2x2_to_3x3(), FLA_Scal(), FLA_Set(), FLA_TWO, and FLA_ZERO.
Referenced by FLA_Hess_UT_unb_var3().
{ FLA_Obj ATL, ATR, A00, a01, A02, ABL, ABR, a10t, alpha11, a12t, A20, a21, A22; FLA_Obj TTL, TTR, T00, t01, T02, TBL, TBR, t10t, tau11, t12t, T20, t21, T22; FLA_Obj uT, u0, uB, upsilon1, u2; FLA_Obj yT, y0, yB, psi1, y2; FLA_Obj zT, z0, zB, zeta1, z2; FLA_Obj vT, v0, vB, nu1, v2; FLA_Obj wT, w0, wB, omega1, w2; FLA_Obj u, y, z, v, w; FLA_Obj inv_tau11; FLA_Obj minus_inv_tau11; FLA_Obj first_elem; FLA_Obj beta; FLA_Obj conj_beta; FLA_Obj dot_product; FLA_Obj minus_upsilon1; FLA_Obj minus_conj_upsilon1; FLA_Obj minus_psi1; FLA_Obj minus_conj_psi1; FLA_Obj minus_zeta1; FLA_Obj a21_t, a21_b; FLA_Datatype datatype_A; dim_t m_A; dim_t b_alg; b_alg = FLA_Obj_length( T ); datatype_A = FLA_Obj_datatype( A ); m_A = FLA_Obj_length( A ); FLA_Obj_create( datatype_A, 1, 1, 0, 0, &inv_tau11 ); FLA_Obj_create( datatype_A, 1, 1, 0, 0, &minus_inv_tau11 ); FLA_Obj_create( datatype_A, 1, 1, 0, 0, &first_elem ); FLA_Obj_create( datatype_A, 1, 1, 0, 0, &beta ); FLA_Obj_create( datatype_A, 1, 1, 0, 0, &conj_beta ); FLA_Obj_create( datatype_A, 1, 1, 0, 0, &dot_product ); FLA_Obj_create( datatype_A, 1, 1, 0, 0, &minus_upsilon1 ); FLA_Obj_create( datatype_A, 1, 1, 0, 0, &minus_conj_upsilon1 ); FLA_Obj_create( datatype_A, 1, 1, 0, 0, &minus_psi1 ); FLA_Obj_create( datatype_A, 1, 1, 0, 0, &minus_conj_psi1 ); FLA_Obj_create( datatype_A, 1, 1, 0, 0, &minus_zeta1 ); FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &u ); FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &y ); FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &z ); FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &v ); FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &w ); FLA_Part_2x2( A, &ATL, &ATR, &ABL, &ABR, 0, 0, FLA_TL ); FLA_Part_2x2( T, &TTL, &TTR, &TBL, &TBR, 0, 0, FLA_TL ); FLA_Part_2x1( u, &uT, &uB, 0, FLA_TOP ); FLA_Part_2x1( y, &yT, &yB, 0, FLA_TOP ); FLA_Part_2x1( z, &zT, &zB, 0, FLA_TOP ); FLA_Part_2x1( v, &vT, &vB, 0, FLA_TOP ); FLA_Part_2x1( w, &wT, &wB, 0, FLA_TOP ); while ( FLA_Obj_length( ATL ) < b_alg ) { FLA_Repart_2x2_to_3x3( ATL, /**/ ATR, 
&A00, /**/ &a01, &A02, /* ************* */ /* ************************** */ &a10t, /**/ &alpha11, &a12t, ABL, /**/ ABR, &A20, /**/ &a21, &A22, 1, 1, FLA_BR ); FLA_Repart_2x2_to_3x3( TTL, /**/ TTR, &T00, /**/ &t01, &T02, /* ************* */ /* ************************** */ &t10t, /**/ &tau11, &t12t, TBL, /**/ TBR, &T20, /**/ &t21, &T22, 1, 1, FLA_BR ); FLA_Repart_2x1_to_3x1( uT, &u0, /* ** */ /* ******** */ &upsilon1, uB, &u2, 1, FLA_BOTTOM ); FLA_Repart_2x1_to_3x1( yT, &y0, /* ** */ /* **** */ &psi1, yB, &y2, 1, FLA_BOTTOM ); FLA_Repart_2x1_to_3x1( zT, &z0, /* ** */ /* ***** */ &zeta1, zB, &z2, 1, FLA_BOTTOM ); FLA_Repart_2x1_to_3x1( vT, &v0, /* ** */ /* *** */ &nu1, vB, &v2, 1, FLA_BOTTOM ); FLA_Repart_2x1_to_3x1( wT, &w0, /* ** */ /* ****** */ &omega1, wB, &w2, 1, FLA_BOTTOM ); /*------------------------------------------------------------*/ if ( FLA_Obj_length( ATL ) > 0 ) { FLA_Copy( upsilon1, minus_upsilon1 ); FLA_Scal( FLA_MINUS_ONE, minus_upsilon1 ); FLA_Copyt( FLA_CONJ_NO_TRANSPOSE, minus_upsilon1, minus_conj_upsilon1 ); FLA_Copy( psi1, minus_psi1 ); FLA_Scal( FLA_MINUS_ONE, minus_psi1 ); FLA_Copyt( FLA_CONJ_NO_TRANSPOSE, minus_psi1, minus_conj_psi1 ); FLA_Copy( zeta1, minus_zeta1 ); FLA_Scal( FLA_MINUS_ONE, minus_zeta1 ); // alpha11 = alpha11 - upsilon11 * conj(psi11) - zeta11 * conj(upsilon11); FLA_Axpyt( FLA_CONJ_NO_TRANSPOSE, minus_upsilon1, psi1, alpha11 ); FLA_Axpyt( FLA_CONJ_NO_TRANSPOSE, minus_zeta1, upsilon1, alpha11 ); // a12t = a12t - upsilon11 * y21' - zeta11 * u21'; FLA_Axpyt( FLA_CONJ_TRANSPOSE, minus_upsilon1, y2, a12t ); FLA_Axpyt( FLA_CONJ_TRANSPOSE, minus_zeta1, u2, a12t ); // a21 = a21 - conj(psi11) * u21 - conj(upsilon11) * z21; FLA_Axpyt( FLA_NO_TRANSPOSE, minus_conj_psi1, u2, a21 ); FLA_Axpyt( FLA_NO_TRANSPOSE, minus_conj_upsilon1, z2, a21 ); } if ( FLA_Obj_length( A22 ) > 0 ) { FLA_Part_2x1( a21, &a21_t, &a21_b, 1, FLA_TOP ); // [ x21, tau11, a21 ] = House( a21 ); FLA_Househ2_UT( FLA_LEFT, a21_t, a21_b, tau11 ); // inv_tau11 = 1 / 
tau11; // minus_inv_tau11 = -1 / tau11; FLA_Set( FLA_ONE, inv_tau11 ); FLA_Inv_scalc( FLA_NO_CONJUGATE, tau11, inv_tau11 ); FLA_Copy( inv_tau11, minus_inv_tau11 ); FLA_Scal( FLA_MINUS_ONE, minus_inv_tau11 ); // Save first element of a21_t and set it to one so we can use a21 as // u21 in subsequent computations. We will restore a21_t later on. FLA_Copy( a21_t, first_elem ); FLA_Set( FLA_ONE, a21_t ); } if ( FLA_Obj_length( ATL ) > 0 ) { // A22 = A22 - u21 * y21' - z21 * u21'; FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, FLA_MINUS_ONE, u2, y2, A22 ); FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, FLA_MINUS_ONE, z2, u2, A22 ); } if ( FLA_Obj_length( A22 ) > 0 ) { // v2 = A22' * x21; FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A22, a21, FLA_ZERO, v2 ); // w2 = A22 * x21; FLA_Gemv( FLA_NO_TRANSPOSE, FLA_ONE, A22, a21, FLA_ZERO, w2 ); // u21 = x21; // y21 = v2; // z21 = w2; FLA_Copy( a21, u2 ); FLA_Copy( v2, y2 ); FLA_Copy( w2, z2 ); // beta = u21' * z21 / 2; // conj_beta = conj(beta); FLA_Dotc( FLA_CONJUGATE, a21, z2, beta ); FLA_Inv_scal( FLA_TWO, beta ); FLA_Copyt( FLA_CONJ_NO_TRANSPOSE, beta, conj_beta ); // y21' = ( y21' - beta / tau * u21' ) / tau; // y21 = ( y21 - conj(beta) / tau * u21 ) / tau; FLA_Scal( minus_inv_tau11, conj_beta ); FLA_Axpy( conj_beta, a21, y2 ); FLA_Scal( inv_tau11, y2 ); // z21 = ( z21 - beta / tau * u21 ) / tau; FLA_Scal( minus_inv_tau11, beta ); FLA_Axpy( beta, a21, z2 ); FLA_Scal( inv_tau11, z2 ); // a12t = a12t * ( I - u21 * u21' / tau ); // = a12t - ( a12t * u21 ) * u21' / tau; FLA_Dot( a12t, a21, dot_product ); FLA_Scal( minus_inv_tau11, dot_product ); FLA_Axpyt( FLA_CONJ_TRANSPOSE, dot_product, a21, a12t ); // A02 = A02 * ( I - u21 * u21' / tau ); // = A02 - ( A02 * u21 ) * u21' / tau; FLA_Gemv( FLA_NO_TRANSPOSE, FLA_ONE, A02, a21, FLA_ZERO, y0 ); FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, minus_inv_tau11, y0, a21, A02 ); // t01 = U20' * u21; FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A20, a21, FLA_ZERO, t01 ); // Restore first element of a21. 
FLA_Copy( first_elem, a21_t ); } // Update A22 if this is the last iteration; this is needed when we're // being called from the blocked routine so A22 is left in a valid state. if ( FLA_Obj_length( ATL ) + 1 == b_alg && FLA_Obj_length( A22 ) > 0 ) { // A22 = A22 - u21 * y21' - z21 * u21'; FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, FLA_MINUS_ONE, u2, y2, A22 ); FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, FLA_MINUS_ONE, z2, u2, A22 ); } /*------------------------------------------------------------*/ FLA_Cont_with_3x3_to_2x2( &ATL, /**/ &ATR, A00, a01, /**/ A02, a10t, alpha11, /**/ a12t, /* ************** */ /* ************************ */ &ABL, /**/ &ABR, A20, a21, /**/ A22, FLA_TL ); FLA_Cont_with_3x3_to_2x2( &TTL, /**/ &TTR, T00, t01, /**/ T02, t10t, tau11, /**/ t12t, /* ************** */ /* ************************ */ &TBL, /**/ &TBR, T20, t21, /**/ T22, FLA_TL ); FLA_Cont_with_3x1_to_2x1( &uT, u0, upsilon1, /* ** */ /* ******** */ &uB, u2, FLA_TOP ); FLA_Cont_with_3x1_to_2x1( &yT, y0, psi1, /* ** */ /* **** */ &yB, y2, FLA_TOP ); FLA_Cont_with_3x1_to_2x1( &zT, z0, zeta1, /* ** */ /* ***** */ &zB, z2, FLA_TOP ); FLA_Cont_with_3x1_to_2x1( &vT, v0, nu1, /* ** */ /* *** */ &vB, v2, FLA_TOP ); FLA_Cont_with_3x1_to_2x1( &wT, w0, omega1, /* ** */ /* ****** */ &wB, w2, FLA_TOP ); } FLA_Obj_free( &inv_tau11 ); FLA_Obj_free( &minus_inv_tau11 ); FLA_Obj_free( &first_elem ); FLA_Obj_free( &beta ); FLA_Obj_free( &conj_beta ); FLA_Obj_free( &dot_product ); FLA_Obj_free( &minus_upsilon1 ); FLA_Obj_free( &minus_conj_upsilon1 ); FLA_Obj_free( &minus_psi1 ); FLA_Obj_free( &minus_conj_psi1 ); FLA_Obj_free( &minus_zeta1 ); FLA_Obj_free( &u ); FLA_Obj_free( &y ); FLA_Obj_free( &z ); FLA_Obj_free( &v ); FLA_Obj_free( &w ); return FLA_SUCCESS; }
References FLA_Axpy(), FLA_Axpyt(), FLA_Cont_with_3x1_to_2x1(), FLA_Cont_with_3x3_to_2x2(), FLA_Copy(), FLA_Copyt(), FLA_Dot(), FLA_Dotc(), FLA_Gemv(), FLA_Gemvc(), FLA_Gerc(), FLA_Househ2_UT(), FLA_Inv_scal(), FLA_Inv_scalc(), FLA_Merge_2x1(), FLA_MINUS_ONE, FLA_Obj_create(), FLA_Obj_datatype(), FLA_Obj_free(), FLA_Obj_length(), FLA_ONE, FLA_Part_1x2(), FLA_Part_2x1(), FLA_Part_2x2(), FLA_Repart_2x1_to_3x1(), FLA_Repart_2x2_to_3x3(), FLA_Scal(), FLA_Set(), FLA_TWO, and FLA_ZERO.
Referenced by FLA_Hess_UT_unb_var4().
{ FLA_Obj ATL, ATR, A00, a01, A02, ABL, ABR, a10t, alpha11, a12t, A20, a21, A22; FLA_Obj YTL, YTR, Y00, y01, Y02, YBL, YBR, y10t, psi11, y12t, Y20, y21, Y22; FLA_Obj ZTL, ZTR, Z00, z01, Z02, ZBL, ZBR, z10t, zeta11, z12t, Z20, z21, Z22; FLA_Obj TTL, TTR, T00, t01, T02, TBL, TBR, t10t, tau11, t12t, T20, t21, T22; FLA_Obj dT, d0, dB, delta1, d2; FLA_Obj eT, e0, eB, epsilon1, e2; FLA_Obj fT, f0, fB, phi1, f2; FLA_Obj d, e, f; FLA_Obj inv_tau11; FLA_Obj minus_inv_tau11; FLA_Obj first_elem; FLA_Obj last_elem; FLA_Obj beta; FLA_Obj conj_beta; FLA_Obj dot_product; FLA_Obj a10t_l, a10t_r; FLA_Obj a21_t, a21_b; FLA_Obj a2; FLA_Datatype datatype_A; dim_t m_A; dim_t b_alg; b_alg = FLA_Obj_length( T ); datatype_A = FLA_Obj_datatype( A ); m_A = FLA_Obj_length( A ); FLA_Obj_create( datatype_A, 1, 1, 0, 0, &inv_tau11 ); FLA_Obj_create( datatype_A, 1, 1, 0, 0, &minus_inv_tau11 ); FLA_Obj_create( datatype_A, 1, 1, 0, 0, &first_elem ); FLA_Obj_create( datatype_A, 1, 1, 0, 0, &last_elem ); FLA_Obj_create( datatype_A, 1, 1, 0, 0, &beta ); FLA_Obj_create( datatype_A, 1, 1, 0, 0, &conj_beta ); FLA_Obj_create( datatype_A, 1, 1, 0, 0, &dot_product ); FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &d ); FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &e ); FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &f ); FLA_Set( FLA_ZERO, Y ); FLA_Set( FLA_ZERO, Z ); FLA_Part_2x2( A, &ATL, &ATR, &ABL, &ABR, 0, 0, FLA_TL ); FLA_Part_2x2( Y, &YTL, &YTR, &YBL, &YBR, 0, 0, FLA_TL ); FLA_Part_2x2( Z, &ZTL, &ZTR, &ZBL, &ZBR, 0, 0, FLA_TL ); FLA_Part_2x2( T, &TTL, &TTR, &TBL, &TBR, 0, 0, FLA_TL ); FLA_Part_2x1( d, &dT, &dB, 0, FLA_TOP ); FLA_Part_2x1( e, &eT, &eB, 0, FLA_TOP ); FLA_Part_2x1( f, &fT, &fB, 0, FLA_TOP ); while ( FLA_Obj_length( ATL ) < b_alg ) { FLA_Repart_2x2_to_3x3( ATL, /**/ ATR, &A00, /**/ &a01, &A02, /* ************* */ /* ************************** */ &a10t, /**/ &alpha11, &a12t, ABL, /**/ ABR, &A20, /**/ &a21, &A22, 1, 1, FLA_BR ); FLA_Repart_2x2_to_3x3( YTL, /**/ YTR, &Y00, /**/ &y01, &Y02, /* 
************* */ /* ************************ */ &y10t, /**/ &psi11, &y12t, YBL, /**/ YBR, &Y20, /**/ &y21, &Y22, 1, 1, FLA_BR ); FLA_Repart_2x2_to_3x3( ZTL, /**/ ZTR, &Z00, /**/ &z01, &Z02, /* ************* */ /* ************************* */ &z10t, /**/ &zeta11, &z12t, ZBL, /**/ ZBR, &Z20, /**/ &z21, &Z22, 1, 1, FLA_BR ); FLA_Repart_2x2_to_3x3( TTL, /**/ TTR, &T00, /**/ &t01, &T02, /* ************* */ /* ************************** */ &t10t, /**/ &tau11, &t12t, TBL, /**/ TBR, &T20, /**/ &t21, &T22, 1, 1, FLA_BR ); FLA_Repart_2x1_to_3x1( dT, &d0, /* ** */ /* ****** */ &delta1, dB, &d2, 1, FLA_BOTTOM ); FLA_Repart_2x1_to_3x1( eT, &e0, /* ** */ /* ******** */ &epsilon1, eB, &e2, 1, FLA_BOTTOM ); FLA_Repart_2x1_to_3x1( fT, &f0, /* ** */ /* **** */ &phi1, fB, &f2, 1, FLA_BOTTOM ); /*------------------------------------------------------------*/ // Save first element of a10_r and set it to one so we can use a10t as // u10t in subsequent computations. We will restore a10_r later on. if ( FLA_Obj_length( ATL ) > 0 ) { FLA_Part_1x2( a10t, &a10t_l, &a10t_r, 1, FLA_RIGHT ); FLA_Copy( a10t_r, last_elem ); FLA_Set( FLA_ONE, a10t_r ); } FLA_Merge_2x1( alpha11, a21, &a2 ); // alpha11 = alpha11 - u10t * y10t' - z10t * u10t'; // a21 = a21 - U20 * y10t' - Z20 * u10t'; FLA_Gemvc( FLA_NO_TRANSPOSE, FLA_CONJUGATE, FLA_MINUS_ONE, ABL, y10t, FLA_ONE, a2 ); FLA_Gemvc( FLA_NO_TRANSPOSE, FLA_CONJUGATE, FLA_MINUS_ONE, ZBL, a10t, FLA_ONE, a2 ); // a12t = a12t - u10t * Y20' - z10t * U20'; FLA_Gemv( FLA_CONJ_NO_TRANSPOSE, FLA_MINUS_ONE, Y20, a10t, FLA_ONE, a12t ); FLA_Gemv( FLA_CONJ_NO_TRANSPOSE, FLA_MINUS_ONE, A20, z10t, FLA_ONE, a12t ); // Restore last element of a10t. 
if ( FLA_Obj_length( ATL ) > 0 ) { FLA_Copy( last_elem, a10t_r ); } if ( FLA_Obj_length( A22 ) > 0 ) { FLA_Part_2x1( a21, &a21_t, &a21_b, 1, FLA_TOP ); // [ u21, tau11, a21 ] = House( a21 ); FLA_Househ2_UT( FLA_LEFT, a21_t, a21_b, tau11 ); // inv_tau11 = 1 / tau11; // minus_inv_tau11 = -1 / tau11; FLA_Set( FLA_ONE, inv_tau11 ); FLA_Inv_scalc( FLA_NO_CONJUGATE, tau11, inv_tau11 ); FLA_Copy( inv_tau11, minus_inv_tau11 ); FLA_Scal( FLA_MINUS_ONE, minus_inv_tau11 ); // Save first element of a21_t and set it to one. FLA_Copy( a21_t, first_elem ); FLA_Set( FLA_ONE, a21_t ); // y21 = A22' * u21; FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A22, a21, FLA_ZERO, y21 ); // z21 = A22 * u21; FLA_Gemv( FLA_NO_TRANSPOSE, FLA_ONE, A22, a21, FLA_ZERO, z21 ); // y21 = y21 - Y20 * ( U20' * u21 ) - U20 * ( Z20' * u21 ); FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A20, a21, FLA_ZERO, d0 ); FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, Y20, a21, FLA_ZERO, e0 ); FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, Z20, a21, FLA_ZERO, f0 ); FLA_Gemv( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, Y20, d0, FLA_ONE, y21 ); FLA_Gemv( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, A20, f0, FLA_ONE, y21 ); // t01 = U20' * u21; FLA_Copy( d0, t01 ); // z21 = z21 - U20 * ( Y20' * u21 ) - Z20 * ( U20' * u21 ); FLA_Gemv( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, A20, e0, FLA_ONE, z21 ); FLA_Gemv( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, Z20, d0, FLA_ONE, z21 ); // beta = u21' * z21 / 2; // conj_beta = conj(beta); FLA_Dotc( FLA_CONJUGATE, a21, z21, beta ); FLA_Inv_scal( FLA_TWO, beta ); FLA_Copyt( FLA_CONJ_NO_TRANSPOSE, beta, conj_beta ); // y21' = ( y21' - beta / tau * u21' ) / tau; // y21 = ( y21 - conj(beta) / tau * u21 ) / tau; FLA_Scal( minus_inv_tau11, conj_beta ); FLA_Axpy( conj_beta, a21, y21 ); FLA_Scal( inv_tau11, y21 ); // z21 = ( z21 - beta / tau * u21 ) / tau; FLA_Scal( minus_inv_tau11, beta ); FLA_Axpy( beta, a21, z21 ); FLA_Scal( inv_tau11, z21 ); // a12t = a12t * ( I - u21 * u21' / tau ); // = a12t - ( a12t * u21 ) * u21' / tau; FLA_Dot( a12t, a21, 
dot_product ); FLA_Scal( minus_inv_tau11, dot_product ); FLA_Axpyt( FLA_CONJ_TRANSPOSE, dot_product, a21, a12t ); // A02 = A02 * ( I - u21 * u21' / tau ); // = A02 - ( A02 * u21 ) * u21' / tau; FLA_Gemv( FLA_NO_TRANSPOSE, FLA_ONE, A02, a21, FLA_ZERO, e0 ); FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, minus_inv_tau11, e0, a21, A02 ); // Restore first element of a21. FLA_Copy( first_elem, a21_t ); } /*------------------------------------------------------------*/ FLA_Cont_with_3x3_to_2x2( &ATL, /**/ &ATR, A00, a01, /**/ A02, a10t, alpha11, /**/ a12t, /* ************** */ /* ************************ */ &ABL, /**/ &ABR, A20, a21, /**/ A22, FLA_TL ); FLA_Cont_with_3x3_to_2x2( &YTL, /**/ &YTR, Y00, y01, /**/ Y02, y10t, psi11, /**/ y12t, /* ************** */ /* ********************** */ &YBL, /**/ &YBR, Y20, y21, /**/ Y22, FLA_TL ); FLA_Cont_with_3x3_to_2x2( &ZTL, /**/ &ZTR, Z00, z01, /**/ Z02, z10t, zeta11, /**/ z12t, /* ************** */ /* *********************** */ &ZBL, /**/ &ZBR, Z20, z21, /**/ Z22, FLA_TL ); FLA_Cont_with_3x3_to_2x2( &TTL, /**/ &TTR, T00, t01, /**/ T02, t10t, tau11, /**/ t12t, /* ************** */ /* ************************ */ &TBL, /**/ &TBR, T20, t21, /**/ T22, FLA_TL ); FLA_Cont_with_3x1_to_2x1( &dT, d0, delta1, /* ** */ /* ****** */ &dB, d2, FLA_TOP ); FLA_Cont_with_3x1_to_2x1( &eT, e0, epsilon1, /* ** */ /* ******** */ &eB, e2, FLA_TOP ); FLA_Cont_with_3x1_to_2x1( &fT, f0, phi1, /* ** */ /* **** */ &fB, f2, FLA_TOP ); } FLA_Obj_free( &inv_tau11 ); FLA_Obj_free( &minus_inv_tau11 ); FLA_Obj_free( &first_elem ); FLA_Obj_free( &last_elem ); FLA_Obj_free( &beta ); FLA_Obj_free( &conj_beta ); FLA_Obj_free( &dot_product ); FLA_Obj_free( &d ); FLA_Obj_free( &e ); FLA_Obj_free( &f ); return FLA_SUCCESS; }
References FLA_Axpyt(), FLA_Cont_with_3x1_to_2x1(), FLA_Cont_with_3x3_to_2x2(), FLA_Copy(), FLA_Copyt(), FLA_Dot(), FLA_Dots(), FLA_Gemv(), FLA_Househ2_UT(), FLA_MINUS_ONE, FLA_Obj_create(), FLA_Obj_datatype(), FLA_Obj_free(), FLA_Obj_length(), FLA_ONE, FLA_Part_2x1(), FLA_Part_2x2(), FLA_Repart_2x1_to_3x1(), FLA_Repart_2x2_to_3x3(), FLA_Set(), FLA_Trmvsx(), FLA_Trsv(), and FLA_ZERO.
Referenced by FLA_Hess_UT_unb_var5().
{
  // Body of the unblocked step routine, variant 5, invoked by
  // FLA_Hess_UT_unb_var5() as FLA_Hess_UT_step_unb_var5( A, U, Z, T ).
  //
  // The loop runs until the top-left partition of A has FLA_Obj_length( T )
  // rows, advancing one row/column per iteration.  Each iteration
  //   1. brings the current column of A (a01, alpha11, a21) up to date using
  //      the previously accumulated U, Z and T (skipped on the first pass),
  //   2. computes a Householder transform from a21, storing its scalar in
  //      tau11, and records the vector in u21 with an explicit leading one,
  //   3. fills the current column of Z with A * u21 and t01 with U20' * u21.
  //
  // U and Z are overwritten (they are set to FLA_ZERO on entry).  The routine
  // always returns FLA_SUCCESS.

  // Partition views of A, U, Z and T, laid out as they appear in the matrix.
  FLA_Obj ATL,   ATR,      A00,  a01,       A02,
          ABL,   ABR,      a10t, alpha11,   a12t,
                           A20,  a21,       A22;
  FLA_Obj UTL,   UTR,      U00,  u01,       U02,
          UBL,   UBR,      u10t, upsilon11, u12t,
                           U20,  u21,       U22;
  FLA_Obj ZTL,   ZTR,      Z00,  z01,       Z02,
          ZBL,   ZBR,      z10t, zeta11,    z12t,
                           Z20,  z21,       Z22;
  FLA_Obj TTL,   TTR,      T00,  t01,       T02,
          TBL,   TBR,      t10t, tau11,     t12t,
                           T20,  t21,       T22;

  // Partition views of the temporary vector w.
  FLA_Obj wT,              w0,
          wB,              omega1,
                           w2;

  // Temporary vector owned by this routine; released before returning.
  FLA_Obj w;

  // Top element / remainder splits of a21 and u21.
  FLA_Obj a21_t,
          a21_b;
  FLA_Obj u21_t,
          u21_b;

  // Number of steps to take, and the datatype and row count of A.
  dim_t        b_alg      = FLA_Obj_length( T );
  FLA_Datatype datatype_A = FLA_Obj_datatype( A );
  dim_t        m_A        = FLA_Obj_length( A );

  // w is an m_A-by-1 workspace of A's datatype.
  FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &w );

  // Start from zeroed U and Z.
  FLA_Set( FLA_ZERO, U );
  FLA_Set( FLA_ZERO, Z );

  // All top-left partitions start out empty (0 x 0, or length 0 for w).
  FLA_Part_2x2( A,    &ATL, &ATR,
                      &ABL, &ABR,     0, 0, FLA_TL );
  FLA_Part_2x2( U,    &UTL, &UTR,
                      &UBL, &UBR,     0, 0, FLA_TL );
  FLA_Part_2x2( Z,    &ZTL, &ZTR,
                      &ZBL, &ZBR,     0, 0, FLA_TL );
  FLA_Part_2x2( T,    &TTL, &TTR,
                      &TBL, &TBR,     0, 0, FLA_TL );
  FLA_Part_2x1( w,    &wT,
                      &wB,            0, FLA_TOP );

  while ( FLA_Obj_length( ATL ) < b_alg )
  {
    // Expose the current row and column of each operand.
    FLA_Repart_2x2_to_3x3( ATL, /**/ ATR,       &A00,  /**/ &a01,     &A02,
                        /* ************* */   /* ************************** */
                                                &a10t, /**/ &alpha11, &a12t,
                           ABL, /**/ ABR,       &A20,  /**/ &a21,     &A22,
                           1, 1, FLA_BR );
    FLA_Repart_2x2_to_3x3( UTL, /**/ UTR,       &U00,  /**/ &u01,       &U02,
                        /* ************* */   /* **************************** */
                                                &u10t, /**/ &upsilon11, &u12t,
                           UBL, /**/ UBR,       &U20,  /**/ &u21,       &U22,
                           1, 1, FLA_BR );
    FLA_Repart_2x2_to_3x3( ZTL, /**/ ZTR,       &Z00,  /**/ &z01,    &Z02,
                        /* ************* */   /* ************************* */
                                                &z10t, /**/ &zeta11, &z12t,
                           ZBL, /**/ ZBR,       &Z20,  /**/ &z21,    &Z22,
                           1, 1, FLA_BR );
    FLA_Repart_2x2_to_3x3( TTL, /**/ TTR,       &T00,  /**/ &t01,   &T02,
                        /* ************* */   /* ************************** */
                                                &t10t, /**/ &tau11, &t12t,
                           TBL, /**/ TBR,       &T20,  /**/ &t21,   &T22,
                           1, 1, FLA_BR );
    FLA_Repart_2x1_to_3x1( wT,                &w0,
                        /* ** */            /* ****** */
                                              &omega1,
                           wB,                &w2,        1, FLA_BOTTOM );

    /*------------------------------------------------------------*/

    // Nothing has been accumulated yet on the first iteration, so the
    // update of the current column is only needed once ATL is non-empty.
    if ( FLA_Obj_length( ATL ) > 0 )
    {
      // w0 = inv( triu( T00 ) ) * u10t';
      FLA_Copyt( FLA_CONJ_TRANSPOSE, u10t, w0 );
      FLA_Trsv( FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG,
                T00, w0 );

      // a01     = a01     - Z00  * w0;
      // alpha11 = alpha11 - z10t * w0;
      // a21     = a21     - Z20  * w0;
      FLA_Gemv( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, Z00, w0, FLA_ONE, a01 );
      FLA_Dots( FLA_MINUS_ONE, z10t, w0, FLA_ONE, alpha11 );
      FLA_Gemv( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, Z20, w0, FLA_ONE, a21 );

      // w0 = inv( triu( T00 ) )' * ( U00' * a01 + u10t' * alpha11 + U20' * a21 );
      FLA_Trmvsx( FLA_LOWER_TRIANGULAR, FLA_CONJ_TRANSPOSE, FLA_NONUNIT_DIAG,
                  FLA_ONE, U00, a01, FLA_ZERO, w0 );
      FLA_Axpyt( FLA_CONJ_TRANSPOSE, alpha11, u10t, w0 );
      FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, U20, a21, FLA_ONE, w0 );
      FLA_Trsv( FLA_UPPER_TRIANGULAR, FLA_CONJ_TRANSPOSE, FLA_NONUNIT_DIAG,
                T00, w0 );

      // a01     = a01     - U00  * w0;
      // alpha11 = alpha11 - u10t * w0;
      // a21     = a21     - U20  * w0;
      FLA_Trmvsx( FLA_LOWER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG,
                  FLA_MINUS_ONE, U00, w0, FLA_ONE, a01 );
      FLA_Dots( FLA_MINUS_ONE, u10t, w0, FLA_ONE, alpha11 );
      FLA_Gemv( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, U20, w0, FLA_ONE, a21 );
    }

    // A reflector is only formed while there is something below the diagonal.
    if ( FLA_Obj_length( a21 ) > 0 )
    {
      FLA_Part_2x1( a21,    &a21_t,
                            &a21_b,        1, FLA_TOP );

      // [ u21, tau11, a21 ] = House( a21 );
      FLA_Househ2_UT( FLA_LEFT,
                      a21_t,
                      a21_b, tau11 );

      // u21 := a21;
      FLA_Copy( a21, u21 );

      // Write the leading one of the Householder vector into u21 explicitly
      // so that u21 can be used directly in the products below.
      FLA_Part_2x1( u21,    &u21_t,
                            &u21_b,        1, FLA_TOP );
      FLA_Set( FLA_ONE, u21_t );

      // z01    = A02  * u21;
      // zeta11 = a12t * u21;
      // z21    = A22  * u21;
      FLA_Gemv( FLA_NO_TRANSPOSE, FLA_ONE, A02, u21, FLA_ZERO, z01 );
      FLA_Dot( a12t, u21, zeta11 );
      FLA_Gemv( FLA_NO_TRANSPOSE, FLA_ONE, A22, u21, FLA_ZERO, z21 );

      // t01 = U20' * u21;
      FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, U20, u21, FLA_ZERO, t01 );
    }

    /*------------------------------------------------------------*/

    // Fold the row/column just processed into the top-left partitions.
    FLA_Cont_with_3x3_to_2x2( &ATL, /**/ &ATR,       A00,  a01,     /**/ A02,
                                                     a10t, alpha11, /**/ a12t,
                            /* ************** */  /* ************************ */
                              &ABL, /**/ &ABR,       A20,  a21,     /**/ A22,
                              FLA_TL );
    FLA_Cont_with_3x3_to_2x2( &UTL, /**/ &UTR,       U00,  u01,       /**/ U02,
                                                     u10t, upsilon11, /**/ u12t,
                            /* ************** */  /* ************************** */
                              &UBL, /**/ &UBR,       U20,  u21,       /**/ U22,
                              FLA_TL );
    FLA_Cont_with_3x3_to_2x2( &ZTL, /**/ &ZTR,       Z00,  z01,    /**/ Z02,
                                                     z10t, zeta11, /**/ z12t,
                            /* ************** */  /* *********************** */
                              &ZBL, /**/ &ZBR,       Z20,  z21,    /**/ Z22,
                              FLA_TL );
    FLA_Cont_with_3x3_to_2x2( &TTL, /**/ &TTR,       T00,  t01,   /**/ T02,
                                                     t10t, tau11, /**/ t12t,
                            /* ************** */  /* ************************ */
                              &TBL, /**/ &TBR,       T20,  t21,   /**/ T22,
                              FLA_TL );
    FLA_Cont_with_3x1_to_2x1( &wT,                w0,
                                                  omega1,
                            /* ** */           /* ****** */
                              &wB,                w2,     FLA_TOP );
  }

  // Release the workspace vector.
  FLA_Obj_free( &w );

  return FLA_SUCCESS;
}
FLA_Error FLA_Hess_UT_unb_var1( FLA_Obj A, FLA_Obj T )
References FLA_Hess_UT_step_unb_var1().
Referenced by FLA_Hess_UT_internal().
{
  // Unblocked variant 1 does no work of its own: A and T are handed straight
  // to the variant 1 step routine and its status is passed back to the caller.
  FLA_Error r_val;

  r_val = FLA_Hess_UT_step_unb_var1( A, T );

  return r_val;
}
FLA_Error FLA_Hess_UT_unb_var2( FLA_Obj A, FLA_Obj T )
References FLA_Hess_UT_step_unb_var2().
Referenced by FLA_Hess_UT_internal().
{
  // Unblocked variant 2 does no work of its own: A and T are handed straight
  // to the variant 2 step routine and its status is passed back to the caller.
  FLA_Error r_val;

  r_val = FLA_Hess_UT_step_unb_var2( A, T );

  return r_val;
}
FLA_Error FLA_Hess_UT_unb_var3( FLA_Obj A, FLA_Obj T )
References FLA_Hess_UT_step_unb_var3().
Referenced by FLA_Hess_UT_internal().
{
  // Unblocked variant 3 does no work of its own: A and T are handed straight
  // to the variant 3 step routine and its status is passed back to the caller.
  FLA_Error r_val;

  r_val = FLA_Hess_UT_step_unb_var3( A, T );

  return r_val;
}
FLA_Error FLA_Hess_UT_unb_var4( FLA_Obj A, FLA_Obj T )
References FLA_Hess_UT_step_unb_var4(), FLA_Obj_create_conf_to(), and FLA_Obj_free().
Referenced by FLA_Hess_UT_internal().
{
  // Unblocked variant 4: supplies the two temporary objects the variant 4
  // step routine takes in addition to A and T, runs it, then releases them.
  FLA_Obj   Y, Z;
  FLA_Error r_val;

  // Y and Z are created from A via FLA_Obj_create_conf_to() with
  // FLA_NO_TRANSPOSE (presumably matching A's datatype and dimensions;
  // confirm against the libflame object API).
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &Y );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &Z );

  // Keep the step routine's status so it can be returned after cleanup.
  r_val = FLA_Hess_UT_step_unb_var4( A, Y, Z, T );

  FLA_Obj_free( &Y );
  FLA_Obj_free( &Z );

  return r_val;
}
FLA_Error FLA_Hess_UT_unb_var5( FLA_Obj A, FLA_Obj T )
References FLA_Hess_UT_step_unb_var5(), FLA_Obj_create_conf_to(), and FLA_Obj_free().
Referenced by FLA_Hess_UT_internal().
{
  // Unblocked variant 5: supplies the two temporary objects the variant 5
  // step routine takes in addition to A and T, runs it, then releases them.
  FLA_Obj   U, Z;
  FLA_Error r_val;

  // U and Z are created from A via FLA_Obj_create_conf_to() with
  // FLA_NO_TRANSPOSE (presumably matching A's datatype and dimensions;
  // confirm against the libflame object API).
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &U );
  FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &Z );

  // Keep the step routine's status so it can be returned after cleanup.
  r_val = FLA_Hess_UT_step_unb_var5( A, U, Z, T );

  FLA_Obj_free( &U );
  FLA_Obj_free( &Z );

  return r_val;
}