libflame revision_anchor
|
Go to the source code of this file.
References bli_cswap(), bli_dswap(), bli_sswap(), bli_zswap(), FLA_Obj_col_stride(), FLA_Obj_datatype(), FLA_Obj_has_zero_dim(), FLA_Obj_length(), and FLA_Obj_width().
Referenced by FLA_SA_FS_blk(), and FLA_SA_LU_blk().
/*
 * Row-interchange routine body. Its signature is not part of this listing;
 * judging by the callers it is presumably FLA_SA_Apply_pivots( C, E, p ).
 *
 * For every entry i of the pivot vector p:
 *   - a zero entry means "no interchange";
 *   - a non-zero entry selects row  p[i] - ( m_C - i )  of E, which is
 *     swapped with row i of C across all n_C columns.
 *
 * Rows are addressed with an implicit unit row stride; only the column
 * strides of C and E are queried. Returns FLA_SUCCESS; returns early when C
 * has a zero dimension. Datatypes other than the four handled below are
 * silently ignored.
 */
{
  FLA_Datatype datatype;
  int          m_C, n_C, cs_C;
  int          cs_E;
  int          m_p;
  int          i;
  int*         buff_p;

  if ( FLA_Obj_has_zero_dim( C ) ) return FLA_SUCCESS;

  datatype = FLA_Obj_datatype( C );

  m_C    = FLA_Obj_length( C );
  n_C    = FLA_Obj_width( C );
  cs_C   = FLA_Obj_col_stride( C );

  cs_E   = FLA_Obj_col_stride( E );

  m_p    = FLA_Obj_length( p );

  buff_p = ( int * ) FLA_INT_PTR( p );

  switch ( datatype ){

  case FLA_FLOAT:
  {
    float* buff_C = ( float * ) FLA_FLOAT_PTR( C );
    float* buff_E = ( float * ) FLA_FLOAT_PTR( E );

    for ( i = 0; i < m_p; ++i )
    {
      if ( buff_p[ i ] != 0 )
        bli_sswap( n_C,
                   buff_C + 0*cs_C + i,                         cs_C,
                   buff_E + 0*cs_E + buff_p[ i ] - ( m_C - i ), cs_E );
    }
    break;
  }

  case FLA_DOUBLE:
  {
    double* buff_C = ( double * ) FLA_DOUBLE_PTR( C );
    double* buff_E = ( double * ) FLA_DOUBLE_PTR( E );

    for ( i = 0; i < m_p; ++i )
    {
      if ( buff_p[ i ] != 0 )
        bli_dswap( n_C,
                   buff_C + 0*cs_C + i,                         cs_C,
                   buff_E + 0*cs_E + buff_p[ i ] - ( m_C - i ), cs_E );
    }
    break;
  }

  case FLA_COMPLEX:
  {
    scomplex* buff_C = ( scomplex * ) FLA_COMPLEX_PTR( C );
    scomplex* buff_E = ( scomplex * ) FLA_COMPLEX_PTR( E );

    for ( i = 0; i < m_p; ++i )
    {
      if ( buff_p[ i ] != 0 )
        bli_cswap( n_C,
                   buff_C + 0*cs_C + i,                         cs_C,
                   buff_E + 0*cs_E + buff_p[ i ] - ( m_C - i ), cs_E );
    }
    break;
  }

  case FLA_DOUBLE_COMPLEX:
  {
    dcomplex* buff_C = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( C );
    dcomplex* buff_E = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( E );

    for ( i = 0; i < m_p; ++i )
    {
      if ( buff_p[ i ] != 0 )
        bli_zswap( n_C,
                   buff_C + 0*cs_C + i,                         cs_C,
                   buff_E + 0*cs_E + buff_p[ i ] - ( m_C - i ), cs_E );
    }
    break;
  }

  }

  return FLA_SUCCESS;
}
References FLA_Cont_with_1x3_to_1x2(), FLA_Cont_with_3x1_to_2x1(), FLA_Gemm_external(), FLA_MINUS_ONE, FLA_Obj_length(), FLA_ONE, FLA_Part_1x2(), FLA_Part_2x1(), FLA_Repart_1x2_to_1x3(), FLA_Repart_2x1_to_3x1(), FLA_SA_Apply_pivots(), and FLA_Trsm_external().
Referenced by FLA_SA_FS_task(), and FLASH_FS_incpiv_aux2().
/*
 * Blocked forward-substitution body. The signature is not part of this
 * listing; judging by the callers it is presumably
 * FLA_SA_FS_blk( L, D, p, C, E, nb_alg ).
 *
 * L, p and C are traversed top to bottom and D left to right, in steps of at
 * most nb_alg rows/columns. For each step:
 *   1. the leading b columns of the current L panel are split off (L1_sqr),
 *   2. the pivots in p1 are applied to C1 and E,
 *   3. C1 is overwritten by a unit-lower-triangular solve with L1_sqr,
 *   4. E is updated as  E := E - D1 * C1.
 * Always returns FLA_SUCCESS.
 */
{
  FLA_Obj LT, L0,
          LB, L1,
              L2;

  FLA_Obj DL, DR, D0, D1, D2;

  FLA_Obj pT, p0,
          pB, p1,
              p2;

  FLA_Obj CT, C0,
          CB, C1,
              C2;

  /* L1_rest only receives the unused right part of the L1 split. */
  FLA_Obj L1_sqr, L1_rest;

  dim_t   b;

  FLA_Part_2x1( L, &LT,
                   &LB,      0, FLA_TOP );
  FLA_Part_1x2( D, &DL, &DR, 0, FLA_LEFT );
  FLA_Part_2x1( p, &pT,
                   &pB,      0, FLA_TOP );
  FLA_Part_2x1( C, &CT,
                   &CB,      0, FLA_TOP );

  while ( FLA_Obj_length( LT ) < FLA_Obj_length( L ) )
  {
    b = min( FLA_Obj_length( LB ), nb_alg );

    FLA_Repart_2x1_to_3x1( LT, &L0,
                               &L1,
                           LB, &L2,           b, FLA_BOTTOM );
    FLA_Repart_1x2_to_1x3( DL, DR,
                           &D0, &D1, &D2,     b, FLA_RIGHT );
    FLA_Repart_2x1_to_3x1( pT, &p0,
                               &p1,
                           pB, &p2,           b, FLA_BOTTOM );
    FLA_Repart_2x1_to_3x1( CT, &C0,
                               &C1,
                           CB, &C2,           b, FLA_BOTTOM );

    /*------------------------------------------------------------*/

    FLA_Part_1x2( L1, &L1_sqr, &L1_rest, b, FLA_LEFT );

    FLA_SA_Apply_pivots( C1, E, p1 );

    FLA_Trsm_external( FLA_LEFT, FLA_LOWER_TRIANGULAR,
                       FLA_NO_TRANSPOSE, FLA_UNIT_DIAG,
                       FLA_ONE, L1_sqr, C1 );

    FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
                       FLA_MINUS_ONE, D1, C1, FLA_ONE, E );

    /*------------------------------------------------------------*/

    FLA_Cont_with_3x1_to_2x1( &LT, L0,
                                   L1,
                              &LB, L2,        FLA_TOP );
    FLA_Cont_with_1x3_to_1x2( &DL, &DR,
                              D0, D1, D2,     FLA_LEFT );
    FLA_Cont_with_3x1_to_2x1( &pT, p0,
                                   p1,
                              &pB, p2,        FLA_TOP );
    FLA_Cont_with_3x1_to_2x1( &CT, C0,
                                   C1,
                              &CB, C2,        FLA_TOP );
  }

  return FLA_SUCCESS;
}
References FLA_Cont_with_1x3_to_1x2(), FLA_Cont_with_3x1_to_2x1(), FLA_Cont_with_3x3_to_2x2(), FLA_Gemm_external(), FLA_MINUS_ONE, FLA_Obj_length(), FLA_ONE, FLA_Part_1x2(), FLA_Part_2x1(), FLA_Part_2x2(), FLA_Repart_1x2_to_1x3(), FLA_Repart_2x1_to_3x1(), FLA_Repart_2x2_to_3x3(), FLA_SA_Apply_pivots(), FLA_SA_LU_unb(), and FLA_Trsm_external().
Referenced by FLA_SA_LU_task().
/*
 * Blocked LU body. The signature is not part of this listing; judging by
 * the callers it is presumably FLA_SA_LU_blk( U, D, p, L, nb_alg ).
 *
 * U is traversed along its diagonal, D left to right, and p and L top to
 * bottom, in steps of at most nb_alg. For each step:
 *   1. the leading b columns of the current L panel are split off (L1_sqr),
 *   2. FLA_SA_LU_unb factors the current panel ( U11, D1 ), writing pivots
 *      to p1 and using L1_sqr,
 *   3. the pivots are applied to the trailing columns ( U12, D2 ),
 *   4. U12 is overwritten by a unit-lower-triangular solve with L1_sqr,
 *   5. D2 is updated as  D2 := D2 - D1 * U12.
 * Always returns FLA_SUCCESS.
 */
{
  FLA_Obj UTL, UTR, U00, U01, U02,
          UBL, UBR, U10, U11, U12,
                    U20, U21, U22;

  FLA_Obj DL, DR, D0, D1, D2;

  FLA_Obj pT, p0,
          pB, p1,
              p2;

  FLA_Obj LT, L0,
          LB, L1,
              L2;

  /* L1_rest only receives the unused right part of the L1 split. */
  FLA_Obj L1_sqr, L1_rest;

  dim_t   b;

  FLA_Part_2x2( U, &UTL, &UTR,
                   &UBL, &UBR,   0, 0, FLA_TL );
  FLA_Part_1x2( D, &DL, &DR,     0, FLA_LEFT );
  FLA_Part_2x1( p, &pT,
                   &pB,          0, FLA_TOP );
  FLA_Part_2x1( L, &LT,
                   &LB,          0, FLA_TOP );

  while ( FLA_Obj_length( UTL ) < FLA_Obj_length( U ) )
  {
    b = min( FLA_Obj_length( UBR ), nb_alg );

    FLA_Repart_2x2_to_3x3( UTL, UTR, &U00, &U01, &U02,
                                     &U10, &U11, &U12,
                           UBL, UBR, &U20, &U21, &U22,
                           b, b, FLA_BR );
    FLA_Repart_1x2_to_1x3( DL, DR,
                           &D0, &D1, &D2,     b, FLA_RIGHT );
    FLA_Repart_2x1_to_3x1( pT, &p0,
                               &p1,
                           pB, &p2,           b, FLA_BOTTOM );
    FLA_Repart_2x1_to_3x1( LT, &L0,
                               &L1,
                           LB, &L2,           b, FLA_BOTTOM );

    /*------------------------------------------------------------*/

    FLA_Part_1x2( L1, &L1_sqr, &L1_rest, b, FLA_LEFT );

    FLA_SA_LU_unb( U11,
                   D1, p1, L1_sqr );

    FLA_SA_Apply_pivots( U12,
                         D2, p1 );

    FLA_Trsm_external( FLA_LEFT, FLA_LOWER_TRIANGULAR,
                       FLA_NO_TRANSPOSE, FLA_UNIT_DIAG,
                       FLA_ONE, L1_sqr, U12 );

    FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
                       FLA_MINUS_ONE, D1, U12, FLA_ONE, D2 );

    /*------------------------------------------------------------*/

    FLA_Cont_with_3x3_to_2x2( &UTL, &UTR, U00, U01, U02,
                                          U10, U11, U12,
                              &UBL, &UBR, U20, U21, U22,
                              FLA_TL );
    FLA_Cont_with_1x3_to_1x2( &DL, &DR,
                              D0, D1, D2,     FLA_LEFT );
    FLA_Cont_with_3x1_to_2x1( &pT, p0,
                                   p1,
                              &pB, p2,        FLA_TOP );
    FLA_Cont_with_3x1_to_2x1( &LT, L0,
                                   L1,
                              &LB, L2,        FLA_TOP );
  }

  return FLA_SUCCESS;
}
References bli_camax(), bli_ccopy(), bli_cger(), bli_cscal(), bli_cswap(), bli_damax(), bli_dcopy(), bli_dger(), bli_dscal(), bli_dswap(), bli_samax(), bli_scopy(), bli_sger(), bli_sscal(), bli_sswap(), bli_zamax(), bli_zcopy(), bli_zger(), bli_zscal(), bli_zswap(), FLA_Copy_external(), FLA_MINUS_ONE, FLA_Obj_col_stride(), FLA_Obj_datatype(), FLA_Obj_has_zero_dim(), FLA_Obj_length(), FLA_Obj_row_stride(), FLA_Triangularize(), dcomplex::imag, scomplex::imag, dcomplex::real, and scomplex::real.
Referenced by FLA_SA_LU_blk().
/*
 * Unblocked LU body. The signature is not part of this listing; judging by
 * the caller it is presumably FLA_SA_LU_unb( U, D, p, L ).
 *
 * U is first copied into L and L is made upper triangular. Then, for each
 * column j:
 *   1. bli_?amax reports in ipiv a row of column j of D (the entry of
 *      largest magnitude, per its BLAS-style contract -- verify against the
 *      BLIS wrapper);
 *   2. if that entry is larger in magnitude than the diagonal L(j,j), row j
 *      of L is swapped with row ipiv of D and p[j] is set to
 *      ipiv + m_U - j; otherwise p[j] is set to 0 ("no interchange");
 *   3. column j of D is scaled by 1 / L(j,j);
 *   4. the trailing columns of D receive the rank-1 update
 *      D(:,j+1:) -= D(:,j) * L(j,j+1:);
 *   5. row j of L, from the diagonal rightwards, is copied back into U.
 *
 * For the complex datatypes the magnitude test uses |re| + |im|. The earlier
 * form |re + im| cancels for values such as 1 - 1i, so a zero diagonal could
 * be kept in preference to a non-zero candidate and then divided by.
 *
 * Element accesses into L, U and the pivot candidate in D use an implicit
 * unit row stride. No check is made for a zero pivot. Returns FLA_SUCCESS;
 * returns early when U has a zero dimension.
 */
{
  FLA_Datatype datatype;
  int          m_U, cs_U;
  int          m_D, cs_D;
  int          cs_L;
  int          rs_D;
  int          m_U_min_j, m_U_min_j_min_1;
  int          j, ipiv;
  int*         buff_p;

  if ( FLA_Obj_has_zero_dim( U ) ) return FLA_SUCCESS;

  datatype = FLA_Obj_datatype( U );

  m_U  = FLA_Obj_length( U );
  cs_U = FLA_Obj_col_stride( U );

  m_D  = FLA_Obj_length( D );
  rs_D = FLA_Obj_row_stride( D );
  cs_D = FLA_Obj_col_stride( D );

  cs_L = FLA_Obj_col_stride( L );

  /* Work on a triangularized copy of U held in L. */
  FLA_Copy_external( U, L );
  FLA_Triangularize( FLA_UPPER_TRIANGULAR, FLA_NONUNIT_DIAG, L );

  buff_p = ( int * ) FLA_INT_PTR( p );

  switch ( datatype ){

  case FLA_FLOAT:
  {
    float* buff_U      = ( float * ) FLA_FLOAT_PTR( U );
    float* buff_D      = ( float * ) FLA_FLOAT_PTR( D );
    float* buff_L      = ( float * ) FLA_FLOAT_PTR( L );
    float* buff_minus1 = ( float * ) FLA_FLOAT_PTR( FLA_MINUS_ONE );
    float  L_tmp;
    float  D_tmp;
    float  d_inv_Ljj;

    for ( j = 0; j < m_U; ++j )
    {
      bli_samax( m_D,
                 buff_D + j*cs_D + 0*rs_D, rs_D,
                 &ipiv );

      L_tmp = buff_L[ j*cs_L + j    ];
      D_tmp = buff_D[ j*cs_D + ipiv ];

      if ( dabs( L_tmp ) < dabs( D_tmp ) )
      {
        bli_sswap( m_U,
                   buff_L + 0*cs_L + j,    cs_L,
                   buff_D + 0*cs_D + ipiv, cs_D );

        buff_p[ j ] = ipiv + m_U - j;
      }
      else
      {
        buff_p[ j ] = 0;
      }

      d_inv_Ljj = 1.0F / buff_L[ j*cs_L + j ];

      bli_sscal( m_D,
                 &d_inv_Ljj,
                 buff_D + j*cs_D + 0, rs_D );

      m_U_min_j_min_1 = m_U - j - 1;

      if ( m_U_min_j_min_1 > 0 )
      {
        bli_sger( BLIS_NO_CONJUGATE,
                  BLIS_NO_CONJUGATE,
                  m_D,
                  m_U_min_j_min_1,
                  buff_minus1,
                  buff_D + (j+0)*cs_D + 0, rs_D,
                  buff_L + (j+1)*cs_L + j, cs_L,
                  buff_D + (j+1)*cs_D + 0, rs_D, cs_D );
      }

      m_U_min_j = m_U - j;

      if ( m_U_min_j > 0 )
      {
        bli_scopy( m_U_min_j,
                   buff_L + j*cs_L + j, cs_L,
                   buff_U + j*cs_U + j, cs_U );
      }
    }
    break;
  }

  case FLA_DOUBLE:
  {
    double* buff_U      = ( double * ) FLA_DOUBLE_PTR( U );
    double* buff_D      = ( double * ) FLA_DOUBLE_PTR( D );
    double* buff_L      = ( double * ) FLA_DOUBLE_PTR( L );
    double* buff_minus1 = ( double * ) FLA_DOUBLE_PTR( FLA_MINUS_ONE );
    double  L_tmp;
    double  D_tmp;
    double  d_inv_Ljj;

    for ( j = 0; j < m_U; ++j )
    {
      bli_damax( m_D,
                 buff_D + j*cs_D + 0*rs_D, rs_D,
                 &ipiv );

      L_tmp = buff_L[ j*cs_L + j    ];
      D_tmp = buff_D[ j*cs_D + ipiv ];

      if ( dabs( L_tmp ) < dabs( D_tmp ) )
      {
        bli_dswap( m_U,
                   buff_L + 0*cs_L + j,    cs_L,
                   buff_D + 0*cs_D + ipiv, cs_D );

        buff_p[ j ] = ipiv + m_U - j;
      }
      else
      {
        buff_p[ j ] = 0;
      }

      d_inv_Ljj = 1.0 / buff_L[ j*cs_L + j ];

      bli_dscal( m_D,
                 &d_inv_Ljj,
                 buff_D + j*cs_D + 0, rs_D );

      m_U_min_j_min_1 = m_U - j - 1;

      if ( m_U_min_j_min_1 > 0 )
      {
        bli_dger( BLIS_NO_CONJUGATE,
                  BLIS_NO_CONJUGATE,
                  m_D,
                  m_U_min_j_min_1,
                  buff_minus1,
                  buff_D + (j+0)*cs_D + 0, rs_D,
                  buff_L + (j+1)*cs_L + j, cs_L,
                  buff_D + (j+1)*cs_D + 0, rs_D, cs_D );
      }

      m_U_min_j = m_U - j;

      if ( m_U_min_j > 0 )
      {
        bli_dcopy( m_U_min_j,
                   buff_L + j*cs_L + j, cs_L,
                   buff_U + j*cs_U + j, cs_U );
      }
    }
    break;
  }

  case FLA_COMPLEX:
  {
    scomplex* buff_U      = ( scomplex * ) FLA_COMPLEX_PTR( U );
    scomplex* buff_D      = ( scomplex * ) FLA_COMPLEX_PTR( D );
    scomplex* buff_L      = ( scomplex * ) FLA_COMPLEX_PTR( L );
    scomplex* buff_minus1 = ( scomplex * ) FLA_COMPLEX_PTR( FLA_MINUS_ONE );
    scomplex  L_tmp;
    scomplex  D_tmp;
    scomplex  d_inv_Ljj;
    scomplex  Ljj;
    float     temp;

    for ( j = 0; j < m_U; ++j )
    {
      bli_camax( m_D,
                 buff_D + j*cs_D + 0*rs_D, rs_D,
                 &ipiv );

      L_tmp = buff_L[ j*cs_L + j    ];
      D_tmp = buff_D[ j*cs_D + ipiv ];

      /* Compare |re| + |im|; summing before taking the absolute value
         would let opposite-signed parts cancel. */
      if ( dabs( L_tmp.real ) + dabs( L_tmp.imag ) <
           dabs( D_tmp.real ) + dabs( D_tmp.imag ) )
      {
        bli_cswap( m_U,
                   buff_L + 0*cs_L + j,    cs_L,
                   buff_D + 0*cs_D + ipiv, cs_D );

        buff_p[ j ] = ipiv + m_U - j;
      }
      else
      {
        buff_p[ j ] = 0;
      }

      Ljj = buff_L[ j*cs_L + j ];

      /* d_inv_Ljj = 1.0 / Ljj = conj( Ljj ) / |Ljj|^2 */
      temp = 1.0F / ( Ljj.real * Ljj.real +
                      Ljj.imag * Ljj.imag );
      d_inv_Ljj.real = Ljj.real *  temp;
      d_inv_Ljj.imag = Ljj.imag * -temp;

      bli_cscal( m_D,
                 &d_inv_Ljj,
                 buff_D + j*cs_D + 0, rs_D );

      m_U_min_j_min_1 = m_U - j - 1;

      if ( m_U_min_j_min_1 > 0 )
      {
        bli_cger( BLIS_NO_CONJUGATE,
                  BLIS_NO_CONJUGATE,
                  m_D,
                  m_U_min_j_min_1,
                  buff_minus1,
                  buff_D + (j+0)*cs_D + 0, rs_D,
                  buff_L + (j+1)*cs_L + j, cs_L,
                  buff_D + (j+1)*cs_D + 0, rs_D, cs_D );
      }

      m_U_min_j = m_U - j;

      if ( m_U_min_j > 0 )
      {
        bli_ccopy( m_U_min_j,
                   buff_L + j*cs_L + j, cs_L,
                   buff_U + j*cs_U + j, cs_U );
      }
    }
    break;
  }

  case FLA_DOUBLE_COMPLEX:
  {
    dcomplex* buff_U      = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( U );
    dcomplex* buff_D      = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( D );
    dcomplex* buff_L      = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( L );
    dcomplex* buff_minus1 = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( FLA_MINUS_ONE );
    dcomplex  L_tmp;
    dcomplex  D_tmp;
    dcomplex  d_inv_Ljj;
    dcomplex  Ljj;
    double    temp;

    for ( j = 0; j < m_U; ++j )
    {
      bli_zamax( m_D,
                 buff_D + j*cs_D + 0*rs_D, rs_D,
                 &ipiv );

      L_tmp = buff_L[ j*cs_L + j    ];
      D_tmp = buff_D[ j*cs_D + ipiv ];

      /* Compare |re| + |im|; summing before taking the absolute value
         would let opposite-signed parts cancel. */
      if ( dabs( L_tmp.real ) + dabs( L_tmp.imag ) <
           dabs( D_tmp.real ) + dabs( D_tmp.imag ) )
      {
        bli_zswap( m_U,
                   buff_L + 0*cs_L + j,    cs_L,
                   buff_D + 0*cs_D + ipiv, cs_D );

        buff_p[ j ] = ipiv + m_U - j;
      }
      else
      {
        buff_p[ j ] = 0;
      }

      Ljj = buff_L[ j*cs_L + j ];

      /* d_inv_Ljj = 1.0 / Ljj = conj( Ljj ) / |Ljj|^2 */
      temp = 1.0 / ( Ljj.real * Ljj.real +
                     Ljj.imag * Ljj.imag );
      d_inv_Ljj.real = Ljj.real *  temp;
      d_inv_Ljj.imag = Ljj.imag * -temp;

      bli_zscal( m_D,
                 &d_inv_Ljj,
                 buff_D + j*cs_D + 0, rs_D );

      m_U_min_j_min_1 = m_U - j - 1;

      if ( m_U_min_j_min_1 > 0 )
      {
        bli_zger( BLIS_NO_CONJUGATE,
                  BLIS_NO_CONJUGATE,
                  m_D,
                  m_U_min_j_min_1,
                  buff_minus1,
                  buff_D + (j+0)*cs_D + 0, rs_D,
                  buff_L + (j+1)*cs_L + j, cs_L,
                  buff_D + (j+1)*cs_D + 0, rs_D, cs_D );
      }

      m_U_min_j = m_U - j;

      if ( m_U_min_j > 0 )
      {
        bli_zcopy( m_U_min_j,
                   buff_L + j*cs_L + j, cs_L,
                   buff_U + j*cs_U + j, cs_U );
      }
    }
    break;
  }

  }

  return FLA_SUCCESS;
}
References FLA_Apply_pivots(), FLA_Cont_with_3x1_to_2x1(), FLA_Cont_with_3x3_to_2x2(), FLA_Obj_length(), FLA_Obj_width(), FLA_Part_2x1(), FLA_Part_2x2(), FLA_Repart_2x1_to_3x1(), FLA_Repart_2x2_to_3x3(), FLA_Trsv_external(), and FLASH_FS_incpiv_aux2().
Referenced by FLASH_FS_incpiv().
/*
 * Outer forward-substitution loop over hierarchical (FLASH) objects. The
 * signature is not part of this listing; judging by the cross-references it
 * is presumably FLASH_FS_incpiv_aux1( A, p, L, b, nb_alg ).
 *
 * A, p and L are traversed along their diagonals and b top to bottom, one
 * element (one block) at a time. For each diagonal step:
 *   1. the top rows of the pivot block p11 are split off so that their count
 *      matches the length of the block b1 (p11_conf),
 *   2. those pivots are applied to b1,
 *   3. b1 is overwritten by a unit-lower-triangular solve with block A11,
 *   4. FLASH_FS_incpiv_aux2 processes the sub-diagonal blocks
 *      ( L21, A21, p21 ) against b1 and b2.
 * The loop stops when either dimension of A is exhausted. Always returns
 * FLA_SUCCESS.
 */
{
  FLA_Obj ATL, ATR, A00, A01, A02,
          ABL, ABR, A10, A11, A12,
                    A20, A21, A22;

  FLA_Obj pTL, pTR, p00, p01, p02,
          pBL, pBR, p10, p11, p12,
                    p20, p21, p22;

  FLA_Obj LTL, LTR, L00, L01, L02,
          LBL, LBR, L10, L11, L12,
                    L20, L21, L22;

  FLA_Obj bT, b0,
          bB, b1,
              b2;

  /* p11_rest only receives the unused bottom part of the p11 split. */
  FLA_Obj p11_conf,
          p11_rest;

  FLA_Part_2x2( A, &ATL, &ATR,
                   &ABL, &ABR,   0, 0, FLA_TL );
  FLA_Part_2x2( p, &pTL, &pTR,
                   &pBL, &pBR,   0, 0, FLA_TL );
  FLA_Part_2x2( L, &LTL, &LTR,
                   &LBL, &LBR,   0, 0, FLA_TL );
  FLA_Part_2x1( b, &bT,
                   &bB,          0, FLA_TOP );

  while ( FLA_Obj_length( ATL ) < FLA_Obj_length( A ) &&
          FLA_Obj_width ( ATL ) < FLA_Obj_width ( A ) )
  {
    FLA_Repart_2x2_to_3x3( ATL, ATR, &A00, &A01, &A02,
                                     &A10, &A11, &A12,
                           ABL, ABR, &A20, &A21, &A22,
                           1, 1, FLA_BR );
    FLA_Repart_2x2_to_3x3( pTL, pTR, &p00, &p01, &p02,
                                     &p10, &p11, &p12,
                           pBL, pBR, &p20, &p21, &p22,
                           1, 1, FLA_BR );
    FLA_Repart_2x2_to_3x3( LTL, LTR, &L00, &L01, &L02,
                                     &L10, &L11, &L12,
                           LBL, LBR, &L20, &L21, &L22,
                           1, 1, FLA_BR );
    FLA_Repart_2x1_to_3x1( bT, &b0,
                               &b1,
                           bB, &b2,           1, FLA_BOTTOM );

    /*------------------------------------------------------------*/

    FLA_Part_2x1( *FLASH_OBJ_PTR_AT( p11 ), &p11_conf,
                                            &p11_rest,
                  FLA_Obj_length( *FLASH_OBJ_PTR_AT( b1 ) ), FLA_TOP );

    FLA_Apply_pivots( FLA_LEFT, FLA_NO_TRANSPOSE,
                      p11_conf,
                      *FLASH_OBJ_PTR_AT( b1 ) );

    FLA_Trsv_external( FLA_LOWER_TRIANGULAR,
                       FLA_NO_TRANSPOSE,
                       FLA_UNIT_DIAG,
                       *FLASH_OBJ_PTR_AT( A11 ),
                       *FLASH_OBJ_PTR_AT( b1 ) );

    FLASH_FS_incpiv_aux2( L21,
                          A21, p21, b1,
                                    b2, nb_alg );

    /*------------------------------------------------------------*/

    FLA_Cont_with_3x3_to_2x2( &ATL, &ATR, A00, A01, A02,
                                          A10, A11, A12,
                              &ABL, &ABR, A20, A21, A22,
                              FLA_TL );
    FLA_Cont_with_3x3_to_2x2( &pTL, &pTR, p00, p01, p02,
                                          p10, p11, p12,
                              &pBL, &pBR, p20, p21, p22,
                              FLA_TL );
    FLA_Cont_with_3x3_to_2x2( &LTL, &LTR, L00, L01, L02,
                                          L10, L11, L12,
                              &LBL, &LBR, L20, L21, L22,
                              FLA_TL );
    FLA_Cont_with_3x1_to_2x1( &bT, b0,
                                   b1,
                              &bB, b2,        FLA_TOP );
  }

  return FLA_SUCCESS;
}
FLA_Error FLASH_FS_incpiv_aux2( FLA_Obj L, FLA_Obj D, FLA_Obj p, FLA_Obj C, FLA_Obj E, dim_t nb_alg )
References FLA_Cont_with_3x1_to_2x1(), FLA_Obj_length(), FLA_Part_2x1(), FLA_Repart_2x1_to_3x1(), and FLA_SA_FS_blk().
Referenced by FLASH_FS_incpiv_aux1().
/*
 * Body of FLASH_FS_incpiv_aux2( L, D, p, C, E, nb_alg ).
 *
 * L, D, p and E are hierarchical (FLASH) objects traversed top to bottom
 * one element (one block) at a time. For every step the leaf routine
 * FLA_SA_FS_blk is invoked on the current blocks of L, D, p and E together
 * with the block referenced by C, which is the same in every iteration.
 * Always returns FLA_SUCCESS.
 */
{
  FLA_Obj LT, L0,
          LB, L1,
              L2;

  FLA_Obj DT, D0,
          DB, D1,
              D2;

  FLA_Obj pT, p0,
          pB, p1,
              p2;

  FLA_Obj ET, E0,
          EB, E1,
              E2;

  FLA_Part_2x1( L, &LT,
                   &LB,      0, FLA_TOP );
  FLA_Part_2x1( D, &DT,
                   &DB,      0, FLA_TOP );
  FLA_Part_2x1( p, &pT,
                   &pB,      0, FLA_TOP );
  FLA_Part_2x1( E, &ET,
                   &EB,      0, FLA_TOP );

  while ( FLA_Obj_length( DT ) < FLA_Obj_length( D ) )
  {
    FLA_Repart_2x1_to_3x1( LT, &L0,
                               &L1,
                           LB, &L2,           1, FLA_BOTTOM );
    FLA_Repart_2x1_to_3x1( DT, &D0,
                               &D1,
                           DB, &D2,           1, FLA_BOTTOM );
    FLA_Repart_2x1_to_3x1( pT, &p0,
                               &p1,
                           pB, &p2,           1, FLA_BOTTOM );
    FLA_Repart_2x1_to_3x1( ET, &E0,
                               &E1,
                           EB, &E2,           1, FLA_BOTTOM );

    /*------------------------------------------------------------*/

    FLA_SA_FS_blk( *FLASH_OBJ_PTR_AT( L1 ),
                   *FLASH_OBJ_PTR_AT( D1 ),
                   *FLASH_OBJ_PTR_AT( p1 ),
                   *FLASH_OBJ_PTR_AT( C ),
                   *FLASH_OBJ_PTR_AT( E1 ),
                   nb_alg );

    /*------------------------------------------------------------*/

    FLA_Cont_with_3x1_to_2x1( &LT, L0,
                                   L1,
                              &LB, L2,        FLA_TOP );
    FLA_Cont_with_3x1_to_2x1( &DT, D0,
                                   D1,
                              &DB, D2,        FLA_TOP );
    FLA_Cont_with_3x1_to_2x1( &pT, p0,
                                   p1,
                              &pB, p2,        FLA_TOP );
    FLA_Cont_with_3x1_to_2x1( &ET, E0,
                                   E1,
                              &EB, E2,        FLA_TOP );
  }

  return FLA_SUCCESS;
}
References FLA_Cont_with_3x3_to_2x2(), FLA_LU_piv_task(), FLA_Obj_length(), FLA_Obj_width(), FLA_Part_2x2(), FLA_Repart_2x2_to_3x3(), FLASH_Queue_get_enabled(), FLASH_SA_LU(), and FLASH_Trsm_piv().
Referenced by FLASH_LU_incpiv_noopt().
/*
 * LU with incremental pivoting over hierarchical (FLASH) objects. The
 * signature is not part of this listing; judging by the cross-references it
 * is presumably FLASH_LU_incpiv_var1( A, p, L, nb_alg, cntl ).
 *
 * A, p and L are traversed along their diagonals one block at a time. For
 * each diagonal step:
 *   1. the diagonal block A11 is factored with pivots p11, either by
 *      enqueueing a task (when the FLASH queue is enabled) or by running
 *      FLA_LU_piv_task directly;
 *   2. FLASH_Trsm_piv processes the block row A12 using A11 and p11;
 *   3. FLASH_SA_LU processes the blocks below and to the right
 *      ( A21, A22 ) using p21 and L21.
 * The loop stops when either dimension of A is exhausted. Always returns
 * FLA_SUCCESS.
 */
{
  FLA_Obj ATL, ATR, A00, A01, A02,
          ABL, ABR, A10, A11, A12,
                    A20, A21, A22;

  FLA_Obj pTL, pTR, p00, p01, p02,
          pBL, pBR, p10, p11, p12,
                    p20, p21, p22;

  FLA_Obj LTL, LTR, L00, L01, L02,
          LBL, LBR, L10, L11, L12,
                    L20, L21, L22;

  FLA_Part_2x2( A, &ATL, &ATR,
                   &ABL, &ABR,   0, 0, FLA_TL );
  FLA_Part_2x2( p, &pTL, &pTR,
                   &pBL, &pBR,   0, 0, FLA_TL );
  FLA_Part_2x2( L, &LTL, &LTR,
                   &LBL, &LBR,   0, 0, FLA_TL );

  while ( FLA_Obj_length( ATL ) < FLA_Obj_length( A ) &&
          FLA_Obj_width ( ATL ) < FLA_Obj_width ( A ) )
  {
    FLA_Repart_2x2_to_3x3( ATL, ATR, &A00, &A01, &A02,
                                     &A10, &A11, &A12,
                           ABL, ABR, &A20, &A21, &A22,
                           1, 1, FLA_BR );
    FLA_Repart_2x2_to_3x3( pTL, pTR, &p00, &p01, &p02,
                                     &p10, &p11, &p12,
                           pBL, pBR, &p20, &p21, &p22,
                           1, 1, FLA_BR );
    FLA_Repart_2x2_to_3x3( LTL, LTR, &L00, &L01, &L02,
                                     &L10, &L11, &L12,
                           LBL, LBR, &L20, &L21, &L22,
                           1, 1, FLA_BR );

    /*------------------------------------------------------------*/

    if ( FLASH_Queue_get_enabled( ) )
    {
      // Enqueue
      ENQUEUE_FLASH_LU_piv( *FLASH_OBJ_PTR_AT( A11 ),
                            *FLASH_OBJ_PTR_AT( p11 ),
                            FLA_Cntl_sub_lu( cntl ) );
    }
    else
    {
      // Execute leaf
      FLA_LU_piv_task( *FLASH_OBJ_PTR_AT( A11 ),
                       *FLASH_OBJ_PTR_AT( p11 ),
                       FLA_Cntl_sub_lu( cntl ) );
    }

    FLASH_Trsm_piv( A11, A12, p11,
                    FLA_Cntl_sub_trsm1( cntl ) );

    FLASH_SA_LU( A11, A12,
                 A21, A22, p21, L21, nb_alg, cntl );

    /*------------------------------------------------------------*/

    FLA_Cont_with_3x3_to_2x2( &ATL, &ATR, A00, A01, A02,
                                          A10, A11, A12,
                              &ABL, &ABR, A20, A21, A22,
                              FLA_TL );
    FLA_Cont_with_3x3_to_2x2( &pTL, &pTR, p00, p01, p02,
                                          p10, p11, p12,
                              &pBL, &pBR, p20, p21, p22,
                              FLA_TL );
    FLA_Cont_with_3x3_to_2x2( &LTL, &LTR, L00, L01, L02,
                                          L10, L11, L12,
                              &LBL, &LBR, L20, L21, L22,
                              FLA_TL );
  }

  return FLA_SUCCESS;
}
FLA_Error FLASH_LU_incpiv_var2( FLA_Obj A, FLA_Obj p, FLA_Obj L, FLA_Obj U, dim_t nb_alg, fla_lu_t * cntl )
References FLA_Cont_with_1x3_to_1x2(), FLA_Cont_with_3x3_to_2x2(), FLA_LU_piv_copy_task(), FLA_Obj_length(), FLA_Obj_width(), FLA_Part_1x2(), FLA_Part_2x2(), FLA_Repart_1x2_to_1x3(), FLA_Repart_2x2_to_3x3(), FLASH_Queue_get_enabled(), FLASH_SA_LU(), and FLASH_Trsm_piv().
Referenced by FLASH_LU_incpiv_opt1().
/*
 * Body of FLASH_LU_incpiv_var2( A, p, L, U, nb_alg, cntl ).
 *
 * A, p and L are traversed along their diagonals and U left to right, one
 * block at a time. For each diagonal step:
 *   1. the diagonal block A11 is factored with pivots p11 and block U1 is
 *      passed along as a third operand, either by enqueueing a task (when
 *      the FLASH queue is enabled) or by running FLA_LU_piv_copy_task
 *      directly;
 *   2. FLASH_Trsm_piv processes the block row A12 using U1 (not A11) and
 *      p11;
 *   3. FLASH_SA_LU processes the blocks below and to the right
 *      ( A21, A22 ) using p21 and L21.
 * The loop stops when either dimension of A is exhausted. Always returns
 * FLA_SUCCESS.
 */
{
  FLA_Obj ATL, ATR, A00, A01, A02,
          ABL, ABR, A10, A11, A12,
                    A20, A21, A22;

  FLA_Obj pTL, pTR, p00, p01, p02,
          pBL, pBR, p10, p11, p12,
                    p20, p21, p22;

  FLA_Obj LTL, LTR, L00, L01, L02,
          LBL, LBR, L10, L11, L12,
                    L20, L21, L22;

  FLA_Obj UL, UR, U0, U1, U2;

  FLA_Part_2x2( A, &ATL, &ATR,
                   &ABL, &ABR,   0, 0, FLA_TL );
  FLA_Part_2x2( p, &pTL, &pTR,
                   &pBL, &pBR,   0, 0, FLA_TL );
  FLA_Part_2x2( L, &LTL, &LTR,
                   &LBL, &LBR,   0, 0, FLA_TL );
  FLA_Part_1x2( U, &UL, &UR,     0, FLA_LEFT );

  while ( FLA_Obj_length( ATL ) < FLA_Obj_length( A ) &&
          FLA_Obj_width ( ATL ) < FLA_Obj_width ( A ) )
  {
    FLA_Repart_2x2_to_3x3( ATL, ATR, &A00, &A01, &A02,
                                     &A10, &A11, &A12,
                           ABL, ABR, &A20, &A21, &A22,
                           1, 1, FLA_BR );
    FLA_Repart_2x2_to_3x3( pTL, pTR, &p00, &p01, &p02,
                                     &p10, &p11, &p12,
                           pBL, pBR, &p20, &p21, &p22,
                           1, 1, FLA_BR );
    FLA_Repart_2x2_to_3x3( LTL, LTR, &L00, &L01, &L02,
                                     &L10, &L11, &L12,
                           LBL, LBR, &L20, &L21, &L22,
                           1, 1, FLA_BR );
    FLA_Repart_1x2_to_1x3( UL, UR,
                           &U0, &U1, &U2,     1, FLA_RIGHT );

    /*------------------------------------------------------------*/

    if ( FLASH_Queue_get_enabled( ) )
    {
      // Enqueue
      ENQUEUE_FLASH_LU_piv_copy( *FLASH_OBJ_PTR_AT( A11 ),
                                 *FLASH_OBJ_PTR_AT( p11 ),
                                 *FLASH_OBJ_PTR_AT( U1 ),
                                 FLA_Cntl_sub_lu( cntl ) );
    }
    else
    {
      // Execute leaf
      FLA_LU_piv_copy_task( *FLASH_OBJ_PTR_AT( A11 ),
                            *FLASH_OBJ_PTR_AT( p11 ),
                            *FLASH_OBJ_PTR_AT( U1 ),
                            FLA_Cntl_sub_lu( cntl ) );
    }

    FLASH_Trsm_piv( U1, A12, p11,
                    FLA_Cntl_sub_trsm1( cntl ) );

    FLASH_SA_LU( A11, A12,
                 A21, A22, p21, L21, nb_alg, cntl );

    /*------------------------------------------------------------*/

    FLA_Cont_with_3x3_to_2x2( &ATL, &ATR, A00, A01, A02,
                                          A10, A11, A12,
                              &ABL, &ABR, A20, A21, A22,
                              FLA_TL );
    FLA_Cont_with_3x3_to_2x2( &pTL, &pTR, p00, p01, p02,
                                          p10, p11, p12,
                              &pBL, &pBR, p20, p21, p22,
                              FLA_TL );
    FLA_Cont_with_3x3_to_2x2( &LTL, &LTR, L00, L01, L02,
                                          L10, L11, L12,
                              &LBL, &LBR, L20, L21, L22,
                              FLA_TL );
    FLA_Cont_with_1x3_to_1x2( &UL, &UR,
                              U0, U1, U2,     FLA_LEFT );
  }

  return FLA_SUCCESS;
}
FLA_Error FLASH_SA_FS( FLA_Obj L, FLA_Obj D, FLA_Obj p, FLA_Obj C, FLA_Obj E, dim_t nb_alg, fla_gemm_t * cntl )
References FLA_Cont_with_1x3_to_1x2(), FLA_Obj_width(), FLA_Part_1x2(), FLA_Repart_1x2_to_1x3(), FLA_SA_FS_task(), and FLASH_Queue_get_enabled().
Referenced by FLASH_SA_LU().
/*
 * Body of FLASH_SA_FS( L, D, p, C, E, nb_alg, cntl ).
 *
 * C and E are hierarchical (FLASH) objects traversed left to right one
 * block column at a time. For every step the blocks referenced by L, D and
 * p (the same in every iteration) are combined with the current blocks C1
 * and E1, either by enqueueing a task (when the FLASH queue is enabled) or
 * by running FLA_SA_FS_task directly. Always returns FLA_SUCCESS.
 */
{
  FLA_Obj CL, CR, C0, C1, C2;

  FLA_Obj EL, ER, E0, E1, E2;

  FLA_Part_1x2( C, &CL, &CR, 0, FLA_LEFT );
  FLA_Part_1x2( E, &EL, &ER, 0, FLA_LEFT );

  while ( FLA_Obj_width( CL ) < FLA_Obj_width( C ) )
  {
    FLA_Repart_1x2_to_1x3( CL, CR,
                           &C0, &C1, &C2,     1, FLA_RIGHT );
    FLA_Repart_1x2_to_1x3( EL, ER,
                           &E0, &E1, &E2,     1, FLA_RIGHT );

    /*------------------------------------------------------------*/

    if ( FLASH_Queue_get_enabled( ) )
    {
      // Enqueue
      ENQUEUE_FLASH_SA_FS( *FLASH_OBJ_PTR_AT( L ),
                           *FLASH_OBJ_PTR_AT( D ),
                           *FLASH_OBJ_PTR_AT( p ),
                           *FLASH_OBJ_PTR_AT( C1 ),
                           *FLASH_OBJ_PTR_AT( E1 ),
                           nb_alg,
                           FLA_Cntl_sub_gemm( cntl ) );
    }
    else
    {
      // Execute leaf
      FLA_SA_FS_task( *FLASH_OBJ_PTR_AT( L ),
                      *FLASH_OBJ_PTR_AT( D ),
                      *FLASH_OBJ_PTR_AT( p ),
                      *FLASH_OBJ_PTR_AT( C1 ),
                      *FLASH_OBJ_PTR_AT( E1 ),
                      nb_alg,
                      FLA_Cntl_sub_gemm( cntl ) );
    }

    /*------------------------------------------------------------*/

    FLA_Cont_with_1x3_to_1x2( &CL, &CR,
                              C0, C1, C2,     FLA_LEFT );
    FLA_Cont_with_1x3_to_1x2( &EL, &ER,
                              E0, E1, E2,     FLA_LEFT );
  }

  return FLA_SUCCESS;
}
FLA_Error FLASH_SA_LU( FLA_Obj B, FLA_Obj C, FLA_Obj D, FLA_Obj E, FLA_Obj p, FLA_Obj L, dim_t nb_alg, fla_lu_t * cntl )
References FLA_Cont_with_3x1_to_2x1(), FLA_Obj_length(), FLA_Part_2x1(), FLA_Repart_2x1_to_3x1(), FLA_SA_LU_task(), FLASH_Queue_get_enabled(), and FLASH_SA_FS().
Referenced by FLASH_LU_incpiv_var1(), and FLASH_LU_incpiv_var2().
/*
 * Body of FLASH_SA_LU( B, C, D, E, p, L, nb_alg, cntl ).
 *
 * D, E, p and L are hierarchical (FLASH) objects traversed top to bottom
 * one block row at a time. For each step:
 *   1. the block referenced by B (the same in every iteration) is combined
 *      with the current blocks D1, p1 and L1, either by enqueueing a task
 *      (when the FLASH queue is enabled) or by running FLA_SA_LU_task
 *      directly;
 *   2. FLASH_SA_FS then processes C and the current block row E1 using
 *      L1, D1 and p1.
 * Always returns FLA_SUCCESS.
 */
{
  FLA_Obj DT, D0,
          DB, D1,
              D2;

  FLA_Obj ET, E0,
          EB, E1,
              E2;

  FLA_Obj pT, p0,
          pB, p1,
              p2;

  FLA_Obj LT, L0,
          LB, L1,
              L2;

  FLA_Part_2x1( D, &DT,
                   &DB,      0, FLA_TOP );
  FLA_Part_2x1( E, &ET,
                   &EB,      0, FLA_TOP );
  FLA_Part_2x1( p, &pT,
                   &pB,      0, FLA_TOP );
  FLA_Part_2x1( L, &LT,
                   &LB,      0, FLA_TOP );

  while ( FLA_Obj_length( DT ) < FLA_Obj_length( D ) )
  {
    FLA_Repart_2x1_to_3x1( DT, &D0,
                               &D1,
                           DB, &D2,           1, FLA_BOTTOM );
    FLA_Repart_2x1_to_3x1( ET, &E0,
                               &E1,
                           EB, &E2,           1, FLA_BOTTOM );
    FLA_Repart_2x1_to_3x1( pT, &p0,
                               &p1,
                           pB, &p2,           1, FLA_BOTTOM );
    FLA_Repart_2x1_to_3x1( LT, &L0,
                               &L1,
                           LB, &L2,           1, FLA_BOTTOM );

    /*------------------------------------------------------------*/

    if ( FLASH_Queue_get_enabled( ) )
    {
      // Enqueue
      ENQUEUE_FLASH_SA_LU( *FLASH_OBJ_PTR_AT( B ),
                           *FLASH_OBJ_PTR_AT( D1 ),
                           *FLASH_OBJ_PTR_AT( p1 ),
                           *FLASH_OBJ_PTR_AT( L1 ),
                           nb_alg,
                           FLA_Cntl_sub_lu( cntl ) );
    }
    else
    {
      // Execute leaf
      FLA_SA_LU_task( *FLASH_OBJ_PTR_AT( B ),
                      *FLASH_OBJ_PTR_AT( D1 ),
                      *FLASH_OBJ_PTR_AT( p1 ),
                      *FLASH_OBJ_PTR_AT( L1 ),
                      nb_alg,
                      FLA_Cntl_sub_lu( cntl ) );
    }

    FLASH_SA_FS( L1,
                 D1, p1, C,
                         E1, nb_alg, FLA_Cntl_sub_gemm1( cntl ) );

    /*------------------------------------------------------------*/

    FLA_Cont_with_3x1_to_2x1( &DT, D0,
                                   D1,
                              &DB, D2,        FLA_TOP );
    FLA_Cont_with_3x1_to_2x1( &ET, E0,
                                   E1,
                              &EB, E2,        FLA_TOP );
    FLA_Cont_with_3x1_to_2x1( &pT, p0,
                                   p1,
                              &pB, p2,        FLA_TOP );
    FLA_Cont_with_3x1_to_2x1( &LT, L0,
                                   L1,
                              &LB, L2,        FLA_TOP );
  }

  return FLA_SUCCESS;
}
FLA_Error FLASH_Trsm_piv( FLA_Obj A, FLA_Obj B, FLA_Obj p, fla_trsm_t * cntl )
References FLA_Cont_with_1x3_to_1x2(), FLA_Obj_width(), FLA_Part_1x2(), FLA_Repart_1x2_to_1x3(), FLA_Trsm_piv_task(), and FLASH_Queue_get_enabled().
Referenced by FLASH_LU_incpiv_var1(), and FLASH_LU_incpiv_var2().
/*
 * Body of FLASH_Trsm_piv( A, B, p, cntl ).
 *
 * B is a hierarchical (FLASH) object traversed left to right one block
 * column at a time. For every step the blocks referenced by A and p (the
 * same in every iteration) are combined with the current block B1, either
 * by enqueueing a task (when the FLASH queue is enabled) or by running
 * FLA_Trsm_piv_task directly. Always returns FLA_SUCCESS.
 */
{
  FLA_Obj BL, BR, B0, B1, B2;

  FLA_Part_1x2( B, &BL, &BR, 0, FLA_LEFT );

  while ( FLA_Obj_width( BL ) < FLA_Obj_width( B ) )
  {
    FLA_Repart_1x2_to_1x3( BL, BR,
                           &B0, &B1, &B2,     1, FLA_RIGHT );

    /*------------------------------------------------------------*/

    if ( FLASH_Queue_get_enabled( ) )
    {
      // Enqueue
      ENQUEUE_FLASH_Trsm_piv( *FLASH_OBJ_PTR_AT( A ),
                              *FLASH_OBJ_PTR_AT( B1 ),
                              *FLASH_OBJ_PTR_AT( p ),
                              FLA_Cntl_sub_trsm( cntl ) );
    }
    else
    {
      // Execute leaf
      FLA_Trsm_piv_task( *FLASH_OBJ_PTR_AT( A ),
                         *FLASH_OBJ_PTR_AT( B1 ),
                         *FLASH_OBJ_PTR_AT( p ),
                         FLA_Cntl_sub_trsm( cntl ) );
    }

    /*------------------------------------------------------------*/

    FLA_Cont_with_1x3_to_1x2( &BL, &BR,
                              B0, B1, B2,     FLA_LEFT );
  }

  return FLA_SUCCESS;
}