libflame  revision_anchor
Functions
bli_symm.c File Reference

(r)

Functions

void bli_ssymm (side_t side, uplo_t uplo, int m, int n, float *alpha, float *a, int a_rs, int a_cs, float *b, int b_rs, int b_cs, float *beta, float *c, int c_rs, int c_cs)
void bli_dsymm (side_t side, uplo_t uplo, int m, int n, double *alpha, double *a, int a_rs, int a_cs, double *b, int b_rs, int b_cs, double *beta, double *c, int c_rs, int c_cs)
void bli_csymm (side_t side, uplo_t uplo, int m, int n, scomplex *alpha, scomplex *a, int a_rs, int a_cs, scomplex *b, int b_rs, int b_cs, scomplex *beta, scomplex *c, int c_rs, int c_cs)
void bli_zsymm (side_t side, uplo_t uplo, int m, int n, dcomplex *alpha, dcomplex *a, int a_rs, int a_cs, dcomplex *b, int b_rs, int b_cs, dcomplex *beta, dcomplex *c, int c_rs, int c_cs)
void bli_ssymm_blas (side_t side, uplo_t uplo, int m, int n, float *alpha, float *a, int lda, float *b, int ldb, float *beta, float *c, int ldc)
void bli_dsymm_blas (side_t side, uplo_t uplo, int m, int n, double *alpha, double *a, int lda, double *b, int ldb, double *beta, double *c, int ldc)
void bli_csymm_blas (side_t side, uplo_t uplo, int m, int n, scomplex *alpha, scomplex *a, int lda, scomplex *b, int ldb, scomplex *beta, scomplex *c, int ldc)
void bli_zsymm_blas (side_t side, uplo_t uplo, int m, int n, dcomplex *alpha, dcomplex *a, int lda, dcomplex *b, int ldb, dcomplex *beta, dcomplex *c, int ldc)

Function Documentation

void bli_csymm ( side_t  side,
uplo_t  uplo,
int  m,
int  n,
scomplex *  alpha,
scomplex *  a,
int  a_rs,
int  a_cs,
scomplex *  b,
int  b_rs,
int  b_cs,
scomplex *  beta,
scomplex *  c,
int  c_rs,
int  c_cs 
)

References bli_c0(), bli_c1(), bli_callocm(), bli_caxpymt(), bli_ccopymt(), bli_ccreate_contigm(), bli_ccreate_contigmr(), bli_cfree(), bli_cfree_contigm(), bli_cfree_saved_contigm(), bli_cscalm(), bli_csymm_blas(), bli_is_col_storage(), bli_set_dim_with_side(), bli_zero_dim2(), BLIS_NO_CONJUGATE, BLIS_NO_TRANSPOSE, and BLIS_TRANSPOSE.

Referenced by FLA_Symm_external().

{
    // General-stride front end for the scomplex symm operation. A, B, and C
    // may each be stored with arbitrary row/column strides; this routine maps
    // the request onto a column-major call to bli_csymm_blas(), swapping
    // parameters where that suffices and using temporary copies (b_copy,
    // c_trans) where it does not.
    //
    // Parameters:
    //   side, uplo  - forwarded to bli_csymm_blas(); either may be toggled
    //                 below when a row-stored operand is reinterpreted as
    //                 column-major.
    //   m, n        - dimensions of B and C.
    //   alpha       - pointer to the scalar passed to bli_csymm_blas().
    //   beta        - pointer to the scalar used to scale C.
    //   a           - dim_a-by-dim_a matrix, where dim_a is chosen by
    //                 bli_set_dim_with_side() from side, m, and n.
    //   b, c        - m-by-n matrices; c receives the result.
    //   *_rs, *_cs  - row and column strides of the corresponding matrix.

    // Keep the caller's original dimensions, pointers, and strides: the
    // working copies may be redirected to contiguous temporaries (a, b, c
    // and their strides) or swapped (m, n) further down.
    int       m_save    = m;
    int       n_save    = n;
    scomplex* a_save    = a;
    scomplex* b_save    = b;
    scomplex* c_save    = c;
    int       a_rs_save = a_rs;
    int       a_cs_save = a_cs;
    int       b_rs_save = b_rs;
    int       b_cs_save = b_cs;
    int       c_rs_save = c_rs;
    int       c_cs_save = c_cs;
    scomplex  zero = bli_c0();
    scomplex  one  = bli_c1();
    scomplex* b_copy;
    scomplex* c_trans;
    int       dim_a;
    // Leading dimension / unit increment pairs for each operand. Note that
    // inca is kept in step with lda but is never passed to any call below.
    int       lda, inca;
    int       ldb, incb;
    int       ldc, incc;
    int       ldb_copy, incb_copy;
    int       ldc_trans, incc_trans;
    // Flags selected by the storage analysis below.
    int       symm_needs_copyb  = FALSE;
    int       symm_needs_transb = FALSE;
    int       symm_needs_axpyt  = FALSE;

    // Return early if possible.
    // NOTE(review): presumably bli_zero_dim2() is true when m or n is zero;
    // confirm against its definition.
    if ( bli_zero_dim2( m, n ) ) return;

    // If necessary, allocate, initialize, and use a temporary contiguous
    // copy of each matrix rather than the original matrices.
    bli_set_dim_with_side( side, m, n, &dim_a );
    bli_ccreate_contigmr( uplo,
                          dim_a,
                          dim_a,
                          a_save, a_rs_save, a_cs_save,
                          &a,     &a_rs,     &a_cs );

    bli_ccreate_contigm( m,
                         n,
                         b_save, b_rs_save, b_cs_save,
                         &b,     &b_rs,     &b_cs );

    bli_ccreate_contigm( m,
                         n,
                         c_save, c_rs_save, c_cs_save,
                         &c,     &c_rs,     &c_cs );

    // Initialize with values assuming column-major storage.
    lda  = a_cs;
    inca = a_rs;
    ldb  = b_cs;
    incb = b_rs;
    ldc  = c_cs;
    incc = c_rs;
    
    // Adjust the parameters based on the storage of each matrix.
    // The nesting order is C, then A, then B; in the comments below the
    // suffixes _c and _r denote column and row storage respectively.
    if ( bli_is_col_storage( c_rs, c_cs ) )
    {
        if ( bli_is_col_storage( a_rs, a_cs ) )
        {
            if ( bli_is_col_storage( b_rs, b_cs ) )
            {
                // requested operation: C_c += uplo( A_c ) * B_c
                // effective operation: C_c += uplo( A_c ) * B_c
            }
            else // if ( bli_is_row_storage( b_rs, b_cs ) )
            {
                // requested operation: C_c += uplo( A_c ) * B_r
                // effective operation: C_c += uplo( A_c ) * B_c
                symm_needs_copyb = TRUE;
            }
        }
        else // if ( bli_is_row_storage( a_rs, a_cs ) )
        {
            if ( bli_is_col_storage( b_rs, b_cs ) )
            {
                // requested operation: C_c +=  uplo( A_r ) * B_c
                // effective operation: C_c += ~uplo( conj( A_c ) ) * B_c
                // NOTE(review): no conjugation is performed in this branch
                // (only the A strides are swapped and uplo is toggled); the
                // conj() in these comments appears to be notational only --
                // confirm.
                bli_swap_ints( lda, inca );

                bli_toggle_uplo( uplo );
            }
            else // if ( bli_is_row_storage( b_rs, b_cs ) )
            {
                // requested operation: C_c += uplo( A_r ) * B_r
                // effective operation: C_c += ( B_c * ~uplo( conj( A_c ) ) )^T
                bli_swap_ints( lda, inca );
                bli_swap_ints( ldb, incb );

                bli_toggle_side( side );
                bli_toggle_uplo( uplo );

                symm_needs_axpyt = TRUE;
            }
        }
    }
    else // if ( bli_is_row_storage( c_rs, c_cs ) )
    {
        if ( bli_is_col_storage( a_rs, a_cs ) )
        {
            if ( bli_is_col_storage( b_rs, b_cs ) )
            {
                // requested operation: C_r += uplo( A_c ) * B_c
                // effective operation: C_c += ( uplo( A_c ) * B_c )^T
                bli_swap_ints( ldc, incc );

                bli_swap_ints( m, n );

                symm_needs_axpyt = TRUE;
            }
            else // if ( bli_is_row_storage( b_rs, b_cs ) )
            {
                // requested operation: C_r += uplo( A_c ) * B_r
                // effective operation: C_c += B_c * ~uplo( conj( A_c ) )
                bli_swap_ints( ldc, incc );
                bli_swap_ints( ldb, incb );

                bli_swap_ints( m, n );

                bli_toggle_side( side );
            }
        }
        else // if ( bli_is_row_storage( a_rs, a_cs ) )
        {
            if ( bli_is_col_storage( b_rs, b_cs ) )
            {
                // requested operation: C_r += uplo( A_r ) * B_c
                // effective operation: C_c += B_c^T * ~uplo( A_c )
                bli_swap_ints( ldc, incc );
                bli_swap_ints( lda, inca );

                bli_swap_ints( m, n );

                bli_toggle_side( side );
                bli_toggle_uplo( uplo );

                symm_needs_copyb  = TRUE;
                symm_needs_transb = TRUE;
            }
            else // if ( bli_is_row_storage( b_rs, b_cs ) )
            {
                // requested operation: C_r += uplo( A_r ) * B_r
                // effective operation: C_c += B_c * conj( ~uplo( A_c ) )
                bli_swap_ints( ldc, incc );
                bli_swap_ints( lda, inca );
                bli_swap_ints( ldb, incb );

                bli_swap_ints( m, n );

                bli_toggle_uplo( uplo );
                bli_toggle_side( side );
            }
        }
    }

    // We need a temporary matrix for the cases where B needs to be copied.
    // By default the "copy" simply aliases B so the non-copy path can use
    // b_copy/ldb_copy unconditionally.
    b_copy    = b;
    ldb_copy  = ldb;
    incb_copy = incb;
    
    // There are two cases where we need to make a copy of B: one where the
    // copy's dimensions are transposed from the original B, and one where
    // the dimensions are not swapped.
    if ( symm_needs_copyb )
    {
        trans_t transb;

        // Set transb, which determines whether or not we need to copy from B
        // as if it needs a transposition. If a transposition is needed, then
        // m and n have already been swapped. So in either case m
        // represents the leading dimension of the copy.
        if ( symm_needs_transb ) transb = BLIS_TRANSPOSE;
        else                     transb = BLIS_NO_TRANSPOSE;
        
        b_copy    = bli_callocm( m, n );
        ldb_copy  = m;
        incb_copy = 1;

        bli_ccopymt( transb,
                     m,
                     n,
                     b,      incb,      ldb,
                     b_copy, incb_copy, ldb_copy );
    }

    // There are two cases where we need to perform the symm and then axpy
    // the result into C with a transposition. We handle those cases here.
    if ( symm_needs_axpyt )
    {
        // We need a temporary matrix for holding C^T. Notice that m and n
        // represent the dimensions of C, and thus C_trans is n-by-m
        // (interpreting both as column-major matrices). So the leading
        // dimension of the temporary matrix holding C^T is n.
        c_trans    = bli_callocm( n, m );
        ldc_trans  = n;
        incc_trans = 1;

        // Compute A * B (or B * A) and store the result in C_trans.
        // Note that there is no overlap between the axpyt cases and
        // the conja/copyb cases, hence the use of a, b, lda, and ldb.
        // A zero beta is passed here because C_trans is a fresh temporary;
        // the caller's beta is applied to C separately below.
        bli_csymm_blas( side,
                        uplo,
                        n,
                        m,
                        alpha,
                        a,       lda,
                        b,       ldb,
                        &zero,
                        c_trans, ldc_trans );

        // Scale C by beta.
        bli_cscalm( BLIS_NO_CONJUGATE,
                    m,
                    n,
                    beta,
                    c, incc, ldc );
        
        // And finally, accumulate the matrix product in C_trans into C
        // with a transpose.
        bli_caxpymt( BLIS_TRANSPOSE,
                     m,
                     n,
                     &one,
                     c_trans, incc_trans, ldc_trans,
                     c,       incc,       ldc );

        // Free the temporary matrix for C.
        bli_cfree( c_trans );
    }
    else // no extra axpyt step needed
    {
        bli_csymm_blas( side,
                        uplo,
                        m,
                        n,
                        alpha,
                        a,      lda,
                        b_copy, ldb_copy,
                        beta,
                        c,      ldc );
    }

    // b_copy owns memory only when a copy was actually made; otherwise it
    // aliases b and must not be freed here.
    if ( symm_needs_copyb )
        bli_cfree( b_copy );

    // Free any temporary contiguous matrices, copying the result back to
    // the original matrix.
    bli_cfree_contigm( a_save, a_rs_save, a_cs_save,
                       &a,     &a_rs,     &a_cs );

    bli_cfree_contigm( b_save, b_rs_save, b_cs_save,
                       &b,     &b_rs,     &b_cs );

    // The saved m and n are used here because m and n may have been swapped
    // by the storage analysis above.
    bli_cfree_saved_contigm( m_save,
                             n_save,
                             c_save, c_rs_save, c_cs_save,
                             &c,     &c_rs,     &c_cs );
}
void bli_csymm_blas ( side_t  side,
uplo_t  uplo,
int  m,
int  n,
scomplex *  alpha,
scomplex *  a,
int  lda,
scomplex *  b,
int  ldb,
scomplex *  beta,
scomplex *  c,
int  ldc 
)

References bli_param_map_to_netlib_side(), bli_param_map_to_netlib_uplo(), cblas_csymm(), CblasColMajor, and F77_csymm().

Referenced by bli_csymm().

{
    // Thin dispatch layer: convert the side/uplo parameters into the form
    // expected by whichever BLAS interface was configured, then hand the
    // column-major problem to the scomplex symm routine.
#ifdef BLIS_ENABLE_CBLAS_INTERFACES
    // CBLAS interface: integer arguments go by value, and the scalars
    // alpha/beta are forwarded as the pointers we were given.
    enum CBLAS_SIDE  side_cblas;
    enum CBLAS_UPLO  uplo_cblas;

    bli_param_map_to_netlib_side( side, &side_cblas );
    bli_param_map_to_netlib_uplo( uplo, &uplo_cblas );

    cblas_csymm( CblasColMajor, side_cblas, uplo_cblas,
                 m, n,
                 alpha,
                 a, lda,
                 b, ldb,
                 beta,
                 c, ldc );
#else
    // F77 interface: side/uplo become single characters and every integer
    // argument is passed by address.
    char side_f77;
    char uplo_f77;

    bli_param_map_to_netlib_side( side, &side_f77 );
    bli_param_map_to_netlib_uplo( uplo, &uplo_f77 );

    F77_csymm( &side_f77, &uplo_f77,
               &m, &n,
               alpha,
               a, &lda,
               b, &ldb,
               beta,
               c, &ldc );
#endif
}
void bli_dsymm ( side_t  side,
uplo_t  uplo,
int  m,
int  n,
double *  alpha,
double *  a,
int  a_rs,
int  a_cs,
double *  b,
int  b_rs,
int  b_cs,
double *  beta,
double *  c,
int  c_rs,
int  c_cs 
)

References bli_d0(), bli_d1(), bli_dallocm(), bli_daxpymt(), bli_dcopymt(), bli_dcreate_contigm(), bli_dcreate_contigmr(), bli_dfree(), bli_dfree_contigm(), bli_dfree_saved_contigm(), bli_dscalm(), bli_dsymm_blas(), bli_is_col_storage(), bli_set_dim_with_side(), bli_zero_dim2(), BLIS_NO_CONJUGATE, BLIS_NO_TRANSPOSE, and BLIS_TRANSPOSE.

Referenced by bli_dhemm(), FLA_Hemm_external(), and FLA_Symm_external().

{
    // General-stride front end for the double-precision symm operation. A,
    // B, and C may each be stored with arbitrary row/column strides; this
    // routine maps the request onto a column-major call to bli_dsymm_blas(),
    // swapping parameters where that suffices and using temporary copies
    // (b_copy, c_trans) where it does not.
    //
    // Parameters:
    //   side, uplo  - forwarded to bli_dsymm_blas(); either may be toggled
    //                 below when a row-stored operand is reinterpreted as
    //                 column-major.
    //   m, n        - dimensions of B and C.
    //   alpha       - pointer to the scalar passed to bli_dsymm_blas().
    //   beta        - pointer to the scalar used to scale C.
    //   a           - dim_a-by-dim_a matrix, where dim_a is chosen by
    //                 bli_set_dim_with_side() from side, m, and n.
    //   b, c        - m-by-n matrices; c receives the result.
    //   *_rs, *_cs  - row and column strides of the corresponding matrix.

    // Keep the caller's original dimensions, pointers, and strides: the
    // working copies may be redirected to contiguous temporaries (a, b, c
    // and their strides) or swapped (m, n) further down.
    int       m_save    = m;
    int       n_save    = n;
    double*   a_save    = a;
    double*   b_save    = b;
    double*   c_save    = c;
    int       a_rs_save = a_rs;
    int       a_cs_save = a_cs;
    int       b_rs_save = b_rs;
    int       b_cs_save = b_cs;
    int       c_rs_save = c_rs;
    int       c_cs_save = c_cs;
    double    zero = bli_d0();
    double    one  = bli_d1();
    double*   b_copy;
    double*   c_trans;
    int       dim_a;
    // Leading dimension / unit increment pairs for each operand. Note that
    // inca is kept in step with lda but is never passed to any call below.
    int       lda, inca;
    int       ldb, incb;
    int       ldc, incc;
    int       ldb_copy, incb_copy;
    int       ldc_trans, incc_trans;
    // Flags selected by the storage analysis below.
    int       symm_needs_copyb  = FALSE;
    int       symm_needs_transb = FALSE;
    int       symm_needs_axpyt  = FALSE;

    // Return early if possible.
    // NOTE(review): presumably bli_zero_dim2() is true when m or n is zero;
    // confirm against its definition.
    if ( bli_zero_dim2( m, n ) ) return;

    // If necessary, allocate, initialize, and use a temporary contiguous
    // copy of each matrix rather than the original matrices.
    bli_set_dim_with_side( side, m, n, &dim_a );
    bli_dcreate_contigmr( uplo,
                          dim_a,
                          dim_a,
                          a_save, a_rs_save, a_cs_save,
                          &a,     &a_rs,     &a_cs );

    bli_dcreate_contigm( m,
                         n,
                         b_save, b_rs_save, b_cs_save,
                         &b,     &b_rs,     &b_cs );

    bli_dcreate_contigm( m,
                         n,
                         c_save, c_rs_save, c_cs_save,
                         &c,     &c_rs,     &c_cs );

    // Initialize with values assuming column-major storage.
    lda  = a_cs;
    inca = a_rs;
    ldb  = b_cs;
    incb = b_rs;
    ldc  = c_cs;
    incc = c_rs;
    
    // Adjust the parameters based on the storage of each matrix.
    // The nesting order is C, then A, then B; in the comments below the
    // suffixes _c and _r denote column and row storage respectively.
    if ( bli_is_col_storage( c_rs, c_cs ) )
    {
        if ( bli_is_col_storage( a_rs, a_cs ) )
        {
            if ( bli_is_col_storage( b_rs, b_cs ) )
            {
                // requested operation: C_c += uplo( A_c ) * B_c
                // effective operation: C_c += uplo( A_c ) * B_c
            }
            else // if ( bli_is_row_storage( b_rs, b_cs ) )
            {
                // requested operation: C_c += uplo( A_c ) * B_r
                // effective operation: C_c += uplo( A_c ) * B_c
                symm_needs_copyb = TRUE;
            }
        }
        else // if ( bli_is_row_storage( a_rs, a_cs ) )
        {
            if ( bli_is_col_storage( b_rs, b_cs ) )
            {
                // requested operation: C_c +=  uplo( A_r ) * B_c
                // effective operation: C_c += ~uplo( conj( A_c ) ) * B_c
                // NOTE(review): no conjugation is performed in this branch
                // (only the A strides are swapped and uplo is toggled); the
                // conj() in these comments appears to be notational only --
                // confirm.
                bli_swap_ints( lda, inca );

                bli_toggle_uplo( uplo );
            }
            else // if ( bli_is_row_storage( b_rs, b_cs ) )
            {
                // requested operation: C_c += uplo( A_r ) * B_r
                // effective operation: C_c += ( B_c * ~uplo( conj( A_c ) ) )^T
                bli_swap_ints( lda, inca );
                bli_swap_ints( ldb, incb );

                bli_toggle_side( side );
                bli_toggle_uplo( uplo );

                symm_needs_axpyt = TRUE;
            }
        }
    }
    else // if ( bli_is_row_storage( c_rs, c_cs ) )
    {
        if ( bli_is_col_storage( a_rs, a_cs ) )
        {
            if ( bli_is_col_storage( b_rs, b_cs ) )
            {
                // requested operation: C_r += uplo( A_c ) * B_c
                // effective operation: C_c += ( uplo( A_c ) * B_c )^T
                bli_swap_ints( ldc, incc );

                bli_swap_ints( m, n );

                symm_needs_axpyt = TRUE;
            }
            else // if ( bli_is_row_storage( b_rs, b_cs ) )
            {
                // requested operation: C_r += uplo( A_c ) * B_r
                // effective operation: C_c += B_c * ~uplo( conj( A_c ) )
                bli_swap_ints( ldc, incc );
                bli_swap_ints( ldb, incb );

                bli_swap_ints( m, n );

                bli_toggle_side( side );
            }
        }
        else // if ( bli_is_row_storage( a_rs, a_cs ) )
        {
            if ( bli_is_col_storage( b_rs, b_cs ) )
            {
                // requested operation: C_r += uplo( A_r ) * B_c
                // effective operation: C_c += B_c^T * ~uplo( A_c )
                bli_swap_ints( ldc, incc );
                bli_swap_ints( lda, inca );

                bli_swap_ints( m, n );

                bli_toggle_side( side );
                bli_toggle_uplo( uplo );

                symm_needs_copyb  = TRUE;
                symm_needs_transb = TRUE;
            }
            else // if ( bli_is_row_storage( b_rs, b_cs ) )
            {
                // requested operation: C_r += uplo( A_r ) * B_r
                // effective operation: C_c += B_c * conj( ~uplo( A_c ) )
                bli_swap_ints( ldc, incc );
                bli_swap_ints( lda, inca );
                bli_swap_ints( ldb, incb );

                bli_swap_ints( m, n );

                bli_toggle_uplo( uplo );
                bli_toggle_side( side );
            }
        }
    }

    // We need a temporary matrix for the cases where B needs to be copied.
    // By default the "copy" simply aliases B so the non-copy path can use
    // b_copy/ldb_copy unconditionally.
    b_copy    = b;
    ldb_copy  = ldb;
    incb_copy = incb;
    
    // There are two cases where we need to make a copy of B: one where the
    // copy's dimensions are transposed from the original B, and one where
    // the dimensions are not swapped.
    if ( symm_needs_copyb )
    {
        trans_t transb;

        // Set transb, which determines whether or not we need to copy from B
        // as if it needs a transposition. If a transposition is needed, then
        // m and n have already been swapped. So in either case m
        // represents the leading dimension of the copy.
        if ( symm_needs_transb ) transb = BLIS_TRANSPOSE;
        else                     transb = BLIS_NO_TRANSPOSE;
        
        b_copy    = bli_dallocm( m, n );
        ldb_copy  = m;
        incb_copy = 1;

        bli_dcopymt( transb,
                     m,
                     n,
                     b,      incb,      ldb,
                     b_copy, incb_copy, ldb_copy );
    }

    // There are two cases where we need to perform the symm and then axpy
    // the result into C with a transposition. We handle those cases here.
    if ( symm_needs_axpyt )
    {
        // We need a temporary matrix for holding C^T. Notice that m and n
        // represent the dimensions of C, and thus C_trans is n-by-m
        // (interpreting both as column-major matrices). So the leading
        // dimension of the temporary matrix holding C^T is n.
        c_trans    = bli_dallocm( n, m );
        ldc_trans  = n;
        incc_trans = 1;

        // Compute A * B (or B * A) and store the result in C_trans.
        // Note that there is no overlap between the axpyt cases and
        // the conja/copyb cases, hence the use of a, b, lda, and ldb.
        // A zero beta is passed here because C_trans is a fresh temporary;
        // the caller's beta is applied to C separately below.
        bli_dsymm_blas( side,
                        uplo,
                        n,
                        m,
                        alpha,
                        a,       lda,
                        b,       ldb,
                        &zero,
                        c_trans, ldc_trans );

        // Scale C by beta.
        bli_dscalm( BLIS_NO_CONJUGATE,
                    m,
                    n,
                    beta,
                    c, incc, ldc );
        
        // And finally, accumulate the matrix product in C_trans into C
        // with a transpose.
        bli_daxpymt( BLIS_TRANSPOSE,
                     m,
                     n,
                     &one,
                     c_trans, incc_trans, ldc_trans,
                     c,       incc,       ldc );

        // Free the temporary matrix for C.
        bli_dfree( c_trans );
    }
    else // no extra axpyt step needed
    {
        bli_dsymm_blas( side,
                        uplo,
                        m,
                        n,
                        alpha,
                        a,      lda,
                        b_copy, ldb_copy,
                        beta,
                        c,      ldc );
    }

    // b_copy owns memory only when a copy was actually made; otherwise it
    // aliases b and must not be freed here.
    if ( symm_needs_copyb )
        bli_dfree( b_copy );

    // Free any temporary contiguous matrices, copying the result back to
    // the original matrix.
    bli_dfree_contigm( a_save, a_rs_save, a_cs_save,
                       &a,     &a_rs,     &a_cs );

    bli_dfree_contigm( b_save, b_rs_save, b_cs_save,
                       &b,     &b_rs,     &b_cs );

    // The saved m and n are used here because m and n may have been swapped
    // by the storage analysis above.
    bli_dfree_saved_contigm( m_save,
                             n_save,
                             c_save, c_rs_save, c_cs_save,
                             &c,     &c_rs,     &c_cs );
}
void bli_dsymm_blas ( side_t  side,
uplo_t  uplo,
int  m,
int  n,
double *  alpha,
double *  a,
int  lda,
double *  b,
int  ldb,
double *  beta,
double *  c,
int  ldc 
)

References bli_param_map_to_netlib_side(), bli_param_map_to_netlib_uplo(), cblas_dsymm(), CblasColMajor, and F77_dsymm().

Referenced by bli_dsymm().

{
    // Thin dispatch layer: convert the side/uplo parameters into the form
    // expected by whichever BLAS interface was configured, then hand the
    // column-major problem to the double-precision symm routine.
#ifdef BLIS_ENABLE_CBLAS_INTERFACES
    // CBLAS interface: integer arguments and the scalars alpha/beta are
    // passed by value, so the scalar pointers are dereferenced here.
    enum CBLAS_SIDE  side_cblas;
    enum CBLAS_UPLO  uplo_cblas;

    bli_param_map_to_netlib_side( side, &side_cblas );
    bli_param_map_to_netlib_uplo( uplo, &uplo_cblas );

    cblas_dsymm( CblasColMajor, side_cblas, uplo_cblas,
                 m, n,
                 *alpha,
                 a, lda,
                 b, ldb,
                 *beta,
                 c, ldc );
#else
    // F77 interface: side/uplo become single characters and every integer
    // argument is passed by address.
    char side_f77;
    char uplo_f77;

    bli_param_map_to_netlib_side( side, &side_f77 );
    bli_param_map_to_netlib_uplo( uplo, &uplo_f77 );

    F77_dsymm( &side_f77, &uplo_f77,
               &m, &n,
               alpha,
               a, &lda,
               b, &ldb,
               beta,
               c, &ldc );
#endif
}
void bli_ssymm ( side_t  side,
uplo_t  uplo,
int  m,
int  n,
float *  alpha,
float *  a,
int  a_rs,
int  a_cs,
float *  b,
int  b_rs,
int  b_cs,
float *  beta,
float *  c,
int  c_rs,
int  c_cs 
)

References bli_is_col_storage(), bli_s0(), bli_s1(), bli_sallocm(), bli_saxpymt(), bli_scopymt(), bli_screate_contigm(), bli_screate_contigmr(), bli_set_dim_with_side(), bli_sfree(), bli_sfree_contigm(), bli_sfree_saved_contigm(), bli_sscalm(), bli_ssymm_blas(), bli_zero_dim2(), BLIS_NO_CONJUGATE, BLIS_NO_TRANSPOSE, and BLIS_TRANSPOSE.

Referenced by bli_shemm(), FLA_Hemm_external(), and FLA_Symm_external().

{
    // General-stride front end for the single-precision symm operation. A,
    // B, and C may each be stored with arbitrary row/column strides; this
    // routine maps the request onto a column-major call to bli_ssymm_blas(),
    // swapping parameters where that suffices and using temporary copies
    // (b_copy, c_trans) where it does not.
    //
    // Parameters:
    //   side, uplo  - forwarded to bli_ssymm_blas(); either may be toggled
    //                 below when a row-stored operand is reinterpreted as
    //                 column-major.
    //   m, n        - dimensions of B and C.
    //   alpha       - pointer to the scalar passed to bli_ssymm_blas().
    //   beta        - pointer to the scalar used to scale C.
    //   a           - dim_a-by-dim_a matrix, where dim_a is chosen by
    //                 bli_set_dim_with_side() from side, m, and n.
    //   b, c        - m-by-n matrices; c receives the result.
    //   *_rs, *_cs  - row and column strides of the corresponding matrix.

    // Keep the caller's original dimensions, pointers, and strides: the
    // working copies may be redirected to contiguous temporaries (a, b, c
    // and their strides) or swapped (m, n) further down.
    int       m_save    = m;
    int       n_save    = n;
    float*    a_save    = a;
    float*    b_save    = b;
    float*    c_save    = c;
    int       a_rs_save = a_rs;
    int       a_cs_save = a_cs;
    int       b_rs_save = b_rs;
    int       b_cs_save = b_cs;
    int       c_rs_save = c_rs;
    int       c_cs_save = c_cs;
    float     zero = bli_s0();
    float     one  = bli_s1();
    float*    b_copy;
    float*    c_trans;
    int       dim_a;
    // Leading dimension / unit increment pairs for each operand. Note that
    // inca is kept in step with lda but is never passed to any call below.
    int       lda, inca;
    int       ldb, incb;
    int       ldc, incc;
    int       ldb_copy, incb_copy;
    int       ldc_trans, incc_trans;
    // Flags selected by the storage analysis below.
    int       symm_needs_copyb  = FALSE;
    int       symm_needs_transb = FALSE;
    int       symm_needs_axpyt  = FALSE;

    // Return early if possible.
    // NOTE(review): presumably bli_zero_dim2() is true when m or n is zero;
    // confirm against its definition.
    if ( bli_zero_dim2( m, n ) ) return;

    // If necessary, allocate, initialize, and use a temporary contiguous
    // copy of each matrix rather than the original matrices.
    bli_set_dim_with_side( side, m, n, &dim_a );
    bli_screate_contigmr( uplo,
                          dim_a,
                          dim_a,
                          a_save, a_rs_save, a_cs_save,
                          &a,     &a_rs,     &a_cs );

    bli_screate_contigm( m,
                         n,
                         b_save, b_rs_save, b_cs_save,
                         &b,     &b_rs,     &b_cs );

    bli_screate_contigm( m,
                         n,
                         c_save, c_rs_save, c_cs_save,
                         &c,     &c_rs,     &c_cs );

    // Initialize with values assuming column-major storage.
    lda  = a_cs;
    inca = a_rs;
    ldb  = b_cs;
    incb = b_rs;
    ldc  = c_cs;
    incc = c_rs;
    
    // Adjust the parameters based on the storage of each matrix.
    // The nesting order is C, then A, then B; in the comments below the
    // suffixes _c and _r denote column and row storage respectively.
    if ( bli_is_col_storage( c_rs, c_cs ) )
    {
        if ( bli_is_col_storage( a_rs, a_cs ) )
        {
            if ( bli_is_col_storage( b_rs, b_cs ) )
            {
                // requested operation: C_c += uplo( A_c ) * B_c
                // effective operation: C_c += uplo( A_c ) * B_c
            }
            else // if ( bli_is_row_storage( b_rs, b_cs ) )
            {
                // requested operation: C_c += uplo( A_c ) * B_r
                // effective operation: C_c += uplo( A_c ) * B_c
                symm_needs_copyb = TRUE;
            }
        }
        else // if ( bli_is_row_storage( a_rs, a_cs ) )
        {
            if ( bli_is_col_storage( b_rs, b_cs ) )
            {
                // requested operation: C_c +=  uplo( A_r ) * B_c
                // effective operation: C_c += ~uplo( conj( A_c ) ) * B_c
                // NOTE(review): no conjugation is performed in this branch
                // (only the A strides are swapped and uplo is toggled); the
                // conj() in these comments appears to be notational only --
                // confirm.
                bli_swap_ints( lda, inca );

                bli_toggle_uplo( uplo );
            }
            else // if ( bli_is_row_storage( b_rs, b_cs ) )
            {
                // requested operation: C_c += uplo( A_r ) * B_r
                // effective operation: C_c += ( B_c * ~uplo( conj( A_c ) ) )^T
                bli_swap_ints( lda, inca );
                bli_swap_ints( ldb, incb );

                bli_toggle_side( side );
                bli_toggle_uplo( uplo );

                symm_needs_axpyt = TRUE;
            }
        }
    }
    else // if ( bli_is_row_storage( c_rs, c_cs ) )
    {
        if ( bli_is_col_storage( a_rs, a_cs ) )
        {
            if ( bli_is_col_storage( b_rs, b_cs ) )
            {
                // requested operation: C_r += uplo( A_c ) * B_c
                // effective operation: C_c += ( uplo( A_c ) * B_c )^T
                bli_swap_ints( ldc, incc );

                bli_swap_ints( m, n );

                symm_needs_axpyt = TRUE;
            }
            else // if ( bli_is_row_storage( b_rs, b_cs ) )
            {
                // requested operation: C_r += uplo( A_c ) * B_r
                // effective operation: C_c += B_c * ~uplo( conj( A_c ) )
                bli_swap_ints( ldc, incc );
                bli_swap_ints( ldb, incb );

                bli_swap_ints( m, n );

                bli_toggle_side( side );
            }
        }
        else // if ( bli_is_row_storage( a_rs, a_cs ) )
        {
            if ( bli_is_col_storage( b_rs, b_cs ) )
            {
                // requested operation: C_r += uplo( A_r ) * B_c
                // effective operation: C_c += B_c^T * ~uplo( A_c )
                bli_swap_ints( ldc, incc );
                bli_swap_ints( lda, inca );

                bli_swap_ints( m, n );

                bli_toggle_side( side );
                bli_toggle_uplo( uplo );

                symm_needs_copyb  = TRUE;
                symm_needs_transb = TRUE;
            }
            else // if ( bli_is_row_storage( b_rs, b_cs ) )
            {
                // requested operation: C_r += uplo( A_r ) * B_r
                // effective operation: C_c += B_c * conj( ~uplo( A_c ) )
                bli_swap_ints( ldc, incc );
                bli_swap_ints( lda, inca );
                bli_swap_ints( ldb, incb );

                bli_swap_ints( m, n );

                bli_toggle_uplo( uplo );
                bli_toggle_side( side );
            }
        }
    }

    // We need a temporary matrix for the cases where B needs to be copied.
    // By default the "copy" simply aliases B so the non-copy path can use
    // b_copy/ldb_copy unconditionally.
    b_copy    = b;
    ldb_copy  = ldb;
    incb_copy = incb;
    
    // There are two cases where we need to make a copy of B: one where the
    // copy's dimensions are transposed from the original B, and one where
    // the dimensions are not swapped.
    if ( symm_needs_copyb )
    {
        trans_t transb;

        // Set transb, which determines whether or not we need to copy from B
        // as if it needs a transposition. If a transposition is needed, then
        // m and n have already been swapped. So in either case m
        // represents the leading dimension of the copy.
        if ( symm_needs_transb ) transb = BLIS_TRANSPOSE;
        else                     transb = BLIS_NO_TRANSPOSE;
        
        b_copy    = bli_sallocm( m, n );
        ldb_copy  = m;
        incb_copy = 1;

        bli_scopymt( transb,
                     m,
                     n,
                     b,      incb,      ldb,
                     b_copy, incb_copy, ldb_copy );
    }

    // There are two cases where we need to perform the symm and then axpy
    // the result into C with a transposition. We handle those cases here.
    if ( symm_needs_axpyt )
    {
        // We need a temporary matrix for holding C^T. Notice that m and n
        // represent the dimensions of C, and thus C_trans is n-by-m
        // (interpreting both as column-major matrices). So the leading
        // dimension of the temporary matrix holding C^T is n.
        c_trans    = bli_sallocm( n, m );
        ldc_trans  = n;
        incc_trans = 1;

        // Compute A * B (or B * A) and store the result in C_trans.
        // Note that there is no overlap between the axpyt cases and
        // the conja/copyb cases, hence the use of a, b, lda, and ldb.
        // A zero beta is passed here because C_trans is a fresh temporary;
        // the caller's beta is applied to C separately below.
        bli_ssymm_blas( side,
                        uplo,
                        n,
                        m,
                        alpha,
                        a,       lda,
                        b,       ldb,
                        &zero,
                        c_trans, ldc_trans );

        // Scale C by beta.
        bli_sscalm( BLIS_NO_CONJUGATE,
                    m,
                    n,
                    beta,
                    c, incc, ldc );
        
        // And finally, accumulate the matrix product in C_trans into C
        // with a transpose.
        bli_saxpymt( BLIS_TRANSPOSE,
                     m,
                     n,
                     &one,
                     c_trans, incc_trans, ldc_trans,
                     c,       incc,       ldc );

        // Free the temporary matrix for C.
        bli_sfree( c_trans );
    }
    else // no extra axpyt step needed
    {
        bli_ssymm_blas( side,
                        uplo,
                        m,
                        n,
                        alpha,
                        a,      lda,
                        b_copy, ldb_copy,
                        beta,
                        c,      ldc );
    }

    // b_copy owns memory only when a copy was actually made; otherwise it
    // aliases b and must not be freed here.
    if ( symm_needs_copyb )
        bli_sfree( b_copy );

    // Free any temporary contiguous matrices, copying the result back to
    // the original matrix.
    bli_sfree_contigm( a_save, a_rs_save, a_cs_save,
                       &a,     &a_rs,     &a_cs );

    bli_sfree_contigm( b_save, b_rs_save, b_cs_save,
                       &b,     &b_rs,     &b_cs );

    // The saved m and n are used here because m and n may have been swapped
    // by the storage analysis above.
    bli_sfree_saved_contigm( m_save,
                             n_save,
                             c_save, c_rs_save, c_cs_save,
                             &c,     &c_rs,     &c_cs );
}
void bli_ssymm_blas ( side_t  side,
uplo_t  uplo,
int  m,
int  n,
float *  alpha,
float *  a,
int  lda,
float *  b,
int  ldb,
float *  beta,
float *  c,
int  ldc 
)

References bli_param_map_to_netlib_side(), bli_param_map_to_netlib_uplo(), cblas_ssymm(), CblasColMajor, and F77_ssymm().

Referenced by bli_ssymm().

{
    // Thin dispatch layer: map the BLIS side/uplo parameters to the
    // representation used by the BLAS interface chosen at compile time,
    // then forward every other argument unchanged.
#ifdef BLIS_ENABLE_CBLAS_INTERFACES
    enum CBLAS_SIDE side_cblas;
    enum CBLAS_UPLO uplo_cblas;

    bli_param_map_to_netlib_side( side, &side_cblas );
    bli_param_map_to_netlib_uplo( uplo, &uplo_cblas );

    // The matrices are always presented as column-major here. alpha and
    // beta are dereferenced and handed over by value.
    cblas_ssymm( CblasColMajor, side_cblas, uplo_cblas,
                 m, n,
                 *alpha,
                 a, lda,
                 b, ldb,
                 *beta,
                 c, ldc );
#else
    char side_f77;
    char uplo_f77;

    bli_param_map_to_netlib_side( side, &side_f77 );
    bli_param_map_to_netlib_uplo( uplo, &uplo_f77 );

    // In this path every argument, including the dimensions and leading
    // dimensions, is passed by address.
    F77_ssymm( &side_f77, &uplo_f77,
               &m, &n,
               alpha,
               a, &lda,
               b, &ldb,
               beta,
               c, &ldc );
#endif
}
void bli_zsymm ( side_t  side,
uplo_t  uplo,
int  m,
int  n,
dcomplex *  alpha,
dcomplex *  a,
int  a_rs,
int  a_cs,
dcomplex *  b,
int  b_rs,
int  b_cs,
dcomplex *  beta,
dcomplex *  c,
int  c_rs,
int  c_cs 
)

References bli_is_col_storage(), bli_set_dim_with_side(), bli_z0(), bli_z1(), bli_zallocm(), bli_zaxpymt(), bli_zcopymt(), bli_zcreate_contigm(), bli_zcreate_contigmr(), bli_zero_dim2(), bli_zfree(), bli_zfree_contigm(), bli_zfree_saved_contigm(), bli_zscalm(), bli_zsymm_blas(), BLIS_NO_CONJUGATE, BLIS_NO_TRANSPOSE, and BLIS_TRANSPOSE.

Referenced by FLA_Symm_external().

{
    // General-stride front end for bli_zsymm_blas(). The BLAS routine is
    // only ever called with column-major operands, so this function
    // inspects the row/column strides of A, B and C and, for each of the
    // eight storage combinations, adjusts the leading dimensions, m/n,
    // side and uplo (and, where that is not enough, goes through a
    // temporary copy of B or a temporary transposed result) so that the
    // call below produces the requested update of C.

    // Snapshot of the caller's dimensions, pointers and strides. The
    // working copies (m, n, a, b, c, *_rs, *_cs) may be modified below;
    // the saved values are what the create/free_contigm helpers are
    // given to identify the original matrices.
    int       m_save    = m;
    int       n_save    = n;
    dcomplex* a_save    = a;
    dcomplex* b_save    = b;
    dcomplex* c_save    = c;
    int       a_rs_save = a_rs;
    int       a_cs_save = a_cs;
    int       b_rs_save = b_rs;
    int       b_cs_save = b_cs;
    int       c_rs_save = c_rs;
    int       c_cs_save = c_cs;
    dcomplex  zero = bli_z0();
    dcomplex  one  = bli_z1();
    dcomplex* b_copy;
    dcomplex* c_trans;
    int       dim_a;
    // Leading dimension / unit increment pairs for each operand. Note
    // that inca is kept in step with lda but is never passed to any call
    // in this function.
    int       lda, inca;
    int       ldb, incb;
    int       ldc, incc;
    int       ldb_copy, incb_copy;
    int       ldc_trans, incc_trans;
    // Flags selected by the storage analysis below:
    //   symm_needs_copyb  - B must first be copied into a temporary.
    //   symm_needs_transb - that copy is taken with a transposition.
    //   symm_needs_axpyt  - the product is formed in a temporary and then
    //                       accumulated into C with a transposition.
    int       symm_needs_copyb  = FALSE;
    int       symm_needs_transb = FALSE;
    int       symm_needs_axpyt  = FALSE;

    // Return early if possible.
    if ( bli_zero_dim2( m, n ) ) return;

    // If necessary, allocate, initialize, and use a temporary contiguous
    // copy of each matrix rather than the original matrices.
    // A is square with order dim_a, which bli_set_dim_with_side() derives
    // from side, m and n.
    bli_set_dim_with_side( side, m, n, &dim_a );
    bli_zcreate_contigmr( uplo,
                          dim_a,
                          dim_a,
                          a_save, a_rs_save, a_cs_save,
                          &a,     &a_rs,     &a_cs );

    bli_zcreate_contigm( m,
                         n,
                         b_save, b_rs_save, b_cs_save,
                         &b,     &b_rs,     &b_cs );

    bli_zcreate_contigm( m,
                         n,
                         c_save, c_rs_save, c_cs_save,
                         &c,     &c_rs,     &c_cs );

    // Initialize with values assuming column-major storage.
    lda  = a_cs;
    inca = a_rs;
    ldb  = b_cs;
    incb = b_rs;
    ldc  = c_cs;
    incc = c_rs;
    
    // Adjust the parameters based on the storage of each matrix.
    // In the annotations below, a _c / _r suffix denotes column / row
    // storage and ~uplo denotes the toggled uplo. bli_swap_ints(),
    // bli_toggle_side() and bli_toggle_uplo() modify their arguments in
    // place, so everything after this point sees the adjusted values.
    // NOTE(review): several annotations mention conj(), but nothing in
    // this function conjugates A; the wording may have been carried over
    // from a Hermitian variant -- confirm.
    if ( bli_is_col_storage( c_rs, c_cs ) )
    {
        if ( bli_is_col_storage( a_rs, a_cs ) )
        {
            if ( bli_is_col_storage( b_rs, b_cs ) )
            {
                // requested operation: C_c += uplo( A_c ) * B_c
                // effective operation: C_c += uplo( A_c ) * B_c
                // Everything is already column-major: nothing to adjust.
            }
            else // if ( bli_is_row_storage( b_rs, b_cs ) )
            {
                // requested operation: C_c += uplo( A_c ) * B_r
                // effective operation: C_c += uplo( A_c ) * B_c
                symm_needs_copyb = TRUE;
            }
        }
        else // if ( bli_is_row_storage( a_rs, a_cs ) )
        {
            if ( bli_is_col_storage( b_rs, b_cs ) )
            {
                // requested operation: C_c +=  uplo( A_r ) * B_c
                // effective operation: C_c += ~uplo( conj( A_c ) ) * B_c
                bli_swap_ints( lda, inca );

                bli_toggle_uplo( uplo );
            }
            else // if ( bli_is_row_storage( b_rs, b_cs ) )
            {
                // requested operation: C_c += uplo( A_r ) * B_r
                // effective operation: C_c += ( B_c * ~uplo( conj( A_c ) ) )^T
                bli_swap_ints( lda, inca );
                bli_swap_ints( ldb, incb );

                bli_toggle_side( side );
                bli_toggle_uplo( uplo );

                symm_needs_axpyt = TRUE;
            }
        }
    }
    else // if ( bli_is_row_storage( c_rs, c_cs ) )
    {
        // In every row-stored-C case, ldc/incc and m/n are swapped so
        // that from here on m and n describe C as read in column-major
        // order.
        if ( bli_is_col_storage( a_rs, a_cs ) )
        {
            if ( bli_is_col_storage( b_rs, b_cs ) )
            {
                // requested operation: C_r += uplo( A_c ) * B_c
                // effective operation: C_c += ( uplo( A_c ) * B_c )^T
                bli_swap_ints( ldc, incc );

                bli_swap_ints( m, n );

                symm_needs_axpyt = TRUE;
            }
            else // if ( bli_is_row_storage( b_rs, b_cs ) )
            {
                // requested operation: C_r += uplo( A_c ) * B_r
                // effective operation: C_c += B_c * ~uplo( conj( A_c ) )
                bli_swap_ints( ldc, incc );
                bli_swap_ints( ldb, incb );

                bli_swap_ints( m, n );

                bli_toggle_side( side );
            }
        }
        else // if ( bli_is_row_storage( a_rs, a_cs ) )
        {
            if ( bli_is_col_storage( b_rs, b_cs ) )
            {
                // requested operation: C_r += uplo( A_r ) * B_c
                // effective operation: C_c += B_c^T * ~uplo( A_c )
                bli_swap_ints( ldc, incc );
                bli_swap_ints( lda, inca );

                bli_swap_ints( m, n );

                bli_toggle_side( side );
                bli_toggle_uplo( uplo );

                // This is the only case that sets both copy flags: B is
                // copied, and the copy is taken with a transposition.
                symm_needs_copyb  = TRUE;
                symm_needs_transb = TRUE;
            }
            else // if ( bli_is_row_storage( b_rs, b_cs ) )
            {
                // requested operation: C_r += uplo( A_r ) * B_r
                // effective operation: C_c += B_c * conj( ~uplo( A_c ) )
                bli_swap_ints( ldc, incc );
                bli_swap_ints( lda, inca );
                bli_swap_ints( ldb, incb );

                bli_swap_ints( m, n );

                bli_toggle_uplo( uplo );
                bli_toggle_side( side );
            }
        }
    }

    // We need a temporary matrix for the cases where B needs to be copied.
    // By default b_copy simply aliases b, so the non-axpyt call below can
    // use b_copy/ldb_copy unconditionally.
    b_copy    = b;
    ldb_copy  = ldb;
    incb_copy = incb;
    
    // There are two cases where we need to make a copy of B: one where the
    // copy's dimensions are transposed from the original B, and one where
    // the dimensions are not swapped.
    if ( symm_needs_copyb )
    {
        trans_t transb;

        // Set transb, which determines whether or not we need to copy from B
        // as if it needs a transposition. If a transposition is needed, then
        // m and n have already been swapped. So in either case m
        // represents the leading dimension of the copy.
        if ( symm_needs_transb ) transb = BLIS_TRANSPOSE;
        else                     transb = BLIS_NO_TRANSPOSE;
        
        // The copy is an m-by-n matrix with unit row stride.
        b_copy    = bli_zallocm( m, n );
        ldb_copy  = m;
        incb_copy = 1;

        bli_zcopymt( transb,
                     m,
                     n,
                     b,      incb,      ldb,
                     b_copy, incb_copy, ldb_copy );
    }

    // There are two cases where we need to perform the symm and then axpy
    // the result into C with a transposition. We handle those cases here.
    if ( symm_needs_axpyt )
    {
        // We need a temporary matrix for holding C^T. Notice that m and n
        // represent the dimensions of C, and thus C_trans is n-by-m
        // (interpreting both as column-major matrices). So the leading
        // dimension of the temporary matrix holding C^T is n.
        c_trans    = bli_zallocm( n, m );
        ldc_trans  = n;
        incc_trans = 1;

        // Compute A * B (or B * A) and store the result in C_trans.
        // Note that there is no overlap between the axpyt cases and
        // the conja/copyb cases, hence the use of a, b, lda, and ldb.
        // beta is replaced by zero here so that C_trans receives only the
        // product; the caller's beta is applied to C separately below.
        bli_zsymm_blas( side,
                        uplo,
                        n,
                        m,
                        alpha,
                        a,       lda,
                        b,       ldb,
                        &zero,
                        c_trans, ldc_trans );

        // Scale C by beta.
        bli_zscalm( BLIS_NO_CONJUGATE,
                    m,
                    n,
                    beta,
                    c, incc, ldc );
        
        // And finally, accumulate the matrix product in C_trans into C
        // with a transpose.
        bli_zaxpymt( BLIS_TRANSPOSE,
                     m,
                     n,
                     &one,
                     c_trans, incc_trans, ldc_trans,
                     c,       incc,       ldc );

        // Free the temporary matrix for C.
        bli_zfree( c_trans );
    }
    else // no extra axpyt step needed
    {
        // Direct call: b_copy is either the temporary copy of B or b
        // itself, and beta is applied by the BLAS routine.
        bli_zsymm_blas( side,
                        uplo,
                        m,
                        n,
                        alpha,
                        a,      lda,
                        b_copy, ldb_copy,
                        beta,
                        c,      ldc );
    }

    // Only free b_copy when it was actually allocated above; otherwise it
    // still aliases b.
    if ( symm_needs_copyb )
        bli_zfree( b_copy );

    // Free any temporary contiguous matrices, copying the result back to
    // the original matrix.
    bli_zfree_contigm( a_save, a_rs_save, a_cs_save,
                       &a,     &a_rs,     &a_cs );

    bli_zfree_contigm( b_save, b_rs_save, b_cs_save,
                       &b,     &b_rs,     &b_cs );

    // C is handed back using the saved (unswapped) dimensions, since m
    // and n may have been exchanged above.
    bli_zfree_saved_contigm( m_save,
                             n_save,
                             c_save, c_rs_save, c_cs_save,
                             &c,     &c_rs,     &c_cs );
}
void bli_zsymm_blas ( side_t  side,
uplo_t  uplo,
int  m,
int  n,
dcomplex *  alpha,
dcomplex *  a,
int  lda,
dcomplex *  b,
int  ldb,
dcomplex *  beta,
dcomplex *  c,
int  ldc 
)

References bli_param_map_to_netlib_side(), bli_param_map_to_netlib_uplo(), cblas_zsymm(), CblasColMajor, and F77_zsymm().

Referenced by bli_zsymm().

{
    // Thin dispatch layer: map the BLIS side/uplo parameters to the
    // representation used by the BLAS interface chosen at compile time,
    // then forward every other argument unchanged.
#ifdef BLIS_ENABLE_CBLAS_INTERFACES
    enum CBLAS_SIDE side_cblas;
    enum CBLAS_UPLO uplo_cblas;

    bli_param_map_to_netlib_side( side, &side_cblas );
    bli_param_map_to_netlib_uplo( uplo, &uplo_cblas );

    // The matrices are always presented as column-major here. alpha and
    // beta are forwarded as the pointers this function received.
    cblas_zsymm( CblasColMajor, side_cblas, uplo_cblas,
                 m, n,
                 alpha,
                 a, lda,
                 b, ldb,
                 beta,
                 c, ldc );
#else
    char side_f77;
    char uplo_f77;

    bli_param_map_to_netlib_side( side, &side_f77 );
    bli_param_map_to_netlib_uplo( uplo, &uplo_f77 );

    // In this path every argument, including the dimensions and leading
    // dimensions, is passed by address.
    F77_zsymm( &side_f77, &uplo_f77,
               &m, &n,
               alpha,
               a, &lda,
               b, &ldb,
               beta,
               c, &ldc );
#endif
}