libflame  revision_anchor
Functions
FLA_LU_incpiv_aux.h File Reference

(r)

Go to the source code of this file.

Functions

FLA_Error FLA_SA_Apply_pivots (FLA_Obj C, FLA_Obj E, FLA_Obj p)
 
FLA_Error FLA_SA_LU_blk (FLA_Obj U, FLA_Obj D, FLA_Obj p, FLA_Obj L, dim_t nb_alg)
 
FLA_Error FLA_SA_LU_unb (FLA_Obj U, FLA_Obj D, FLA_Obj p, FLA_Obj L)
 
FLA_Error FLA_SA_FS_blk (FLA_Obj L, FLA_Obj D, FLA_Obj p, FLA_Obj C, FLA_Obj E, dim_t nb_alg)
 
FLA_Error FLASH_LU_incpiv_var1 (FLA_Obj A, FLA_Obj p, FLA_Obj L, dim_t nb_alg, fla_lu_t *cntl)
 
FLA_Error FLASH_LU_incpiv_var2 (FLA_Obj A, FLA_Obj p, FLA_Obj L, FLA_Obj U, dim_t nb_alg, fla_lu_t *cntl)
 
FLA_Error FLASH_Trsm_piv (FLA_Obj A, FLA_Obj B, FLA_Obj p, fla_trsm_t *cntl)
 
FLA_Error FLASH_SA_LU (FLA_Obj B, FLA_Obj C, FLA_Obj D, FLA_Obj E, FLA_Obj p, FLA_Obj L, dim_t nb_alg, fla_lu_t *cntl)
 
FLA_Error FLASH_SA_FS (FLA_Obj L, FLA_Obj D, FLA_Obj p, FLA_Obj C, FLA_Obj E, dim_t nb_alg, fla_gemm_t *cntl)
 
FLA_Error FLASH_FS_incpiv_aux1 (FLA_Obj A, FLA_Obj p, FLA_Obj L, FLA_Obj b, dim_t nb_alg)
 
FLA_Error FLASH_FS_incpiv_aux2 (FLA_Obj L, FLA_Obj D, FLA_Obj p, FLA_Obj C, FLA_Obj E, dim_t nb_alg)
 

Function Documentation

◆ FLA_SA_Apply_pivots()

FLA_Error FLA_SA_Apply_pivots ( FLA_Obj  C,
FLA_Obj  E,
FLA_Obj  p 
)
14 {
15  FLA_Datatype datatype;
16  int m_C, n_C, cs_C;
17  int cs_E;
18  // int rs_C;
19  // int rs_E;
20  int m_p;
21  int i;
22  int* buff_p;
23 
24  if ( FLA_Obj_has_zero_dim( C ) ) return FLA_SUCCESS;
25 
26  datatype = FLA_Obj_datatype( C );
27 
28  m_C = FLA_Obj_length( C );
29  n_C = FLA_Obj_width( C );
30  cs_C = FLA_Obj_col_stride( C );
31  // rs_C = FLA_Obj_row_stride( C );
32 
33  cs_E = FLA_Obj_col_stride( E );
34  // rs_E = FLA_Obj_row_stride( E );
35 
36  m_p = FLA_Obj_length( p );
37 
38  buff_p = ( int * ) FLA_INT_PTR( p );
39 
40 
41  switch ( datatype ){
42 
43  case FLA_FLOAT:
44  {
45  float* buff_C = ( float * ) FLA_FLOAT_PTR( C );
46  float* buff_E = ( float * ) FLA_FLOAT_PTR( E );
47 
48  for ( i = 0; i < m_p; ++i )
49  {
50  if ( buff_p[ i ] != 0 )
51  bl1_sswap( n_C,
52  buff_C + 0*cs_C + i, cs_C,
53  buff_E + 0*cs_E + buff_p[ i ] - ( m_C - i ), cs_E );
54  }
55  break;
56  }
57 
58  case FLA_DOUBLE:
59  {
60  double* buff_C = ( double * ) FLA_DOUBLE_PTR( C );
61  double* buff_E = ( double * ) FLA_DOUBLE_PTR( E );
62 
63  for ( i = 0; i < m_p; ++i )
64  {
65  if ( buff_p[ i ] != 0 )
66  bl1_dswap( n_C,
67  buff_C + 0*cs_C + i, cs_C,
68  buff_E + 0*cs_E + buff_p[ i ] - ( m_C - i ), cs_E );
69  }
70  break;
71  }
72 
73  case FLA_COMPLEX:
74  {
75  scomplex* buff_C = ( scomplex * ) FLA_COMPLEX_PTR( C );
76  scomplex* buff_E = ( scomplex * ) FLA_COMPLEX_PTR( E );
77 
78  for ( i = 0; i < m_p; ++i )
79  {
80  if ( buff_p[ i ] != 0 )
81  bl1_cswap( n_C,
82  buff_C + 0*cs_C + i, cs_C,
83  buff_E + 0*cs_E + buff_p[ i ] - ( m_C - i ), cs_E );
84  }
85  break;
86  }
87 
88  case FLA_DOUBLE_COMPLEX:
89  {
90  dcomplex* buff_C = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( C );
91  dcomplex* buff_E = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( E );
92 
93  for ( i = 0; i < m_p; ++i )
94  {
95  if ( buff_p[ i ] != 0 )
96  bl1_zswap( n_C,
97  buff_C + 0*cs_C + i, cs_C,
98  buff_E + 0*cs_E + buff_p[ i ] - ( m_C - i ), cs_E );
99  }
100  break;
101  }
102 
103  }
104 
105  return FLA_SUCCESS;
106 }
dim_t FLA_Obj_width(FLA_Obj obj)
Definition: FLA_Query.c:123
FLA_Bool FLA_Obj_has_zero_dim(FLA_Obj A)
Definition: FLA_Query.c:400
dim_t FLA_Obj_length(FLA_Obj obj)
Definition: FLA_Query.c:116
dim_t FLA_Obj_col_stride(FLA_Obj obj)
Definition: FLA_Query.c:174
FLA_Datatype FLA_Obj_datatype(FLA_Obj obj)
Definition: FLA_Query.c:13
int FLA_Datatype
Definition: FLA_type_defs.h:49
int i
Definition: bl1_axmyv2.c:145
void bl1_zswap(int n, dcomplex *x, int incx, dcomplex *y, int incy)
Definition: bl1_swap.c:52
void bl1_dswap(int n, double *x, int incx, double *y, int incy)
Definition: bl1_swap.c:26
void bl1_cswap(int n, scomplex *x, int incx, scomplex *y, int incy)
Definition: bl1_swap.c:39
void bl1_sswap(int n, float *x, int incx, float *y, int incy)
Definition: bl1_swap.c:13
dcomplex
Definition: blis_type_defs.h:138
scomplex
Definition: blis_type_defs.h:133

References bl1_cswap(), bl1_dswap(), bl1_sswap(), bl1_zswap(), FLA_Obj_col_stride(), FLA_Obj_datatype(), FLA_Obj_has_zero_dim(), FLA_Obj_length(), FLA_Obj_width(), and i.

Referenced by FLA_SA_FS_blk(), and FLA_SA_LU_blk().

◆ FLA_SA_FS_blk()

FLA_Error FLA_SA_FS_blk ( FLA_Obj  L,
FLA_Obj  D,
FLA_Obj  p,
FLA_Obj  C,
FLA_Obj  E,
dim_t  nb_alg 
)
16 {
17  FLA_Obj LT, L0,
18  LB, L1,
19  L2;
20 
21  FLA_Obj DL, DR, D0, D1, D2;
22 
23  FLA_Obj pT, p0,
24  pB, p1,
25  p2;
26 
27  FLA_Obj CT, C0,
28  CB, C1,
29  C2;
30 
31  FLA_Obj L1_sqr, L1_rest;
32 
33  dim_t b;
34 
35  FLA_Part_2x1( L, &LT,
36  &LB, 0, FLA_TOP );
37 
38  FLA_Part_1x2( D, &DL, &DR, 0, FLA_LEFT );
39 
40  FLA_Part_2x1( p, &pT,
41  &pB, 0, FLA_TOP );
42 
43  FLA_Part_2x1( C, &CT,
44  &CB, 0, FLA_TOP );
45 
46  while ( FLA_Obj_length( LT ) < FLA_Obj_length( L ) )
47  {
48  b = min( FLA_Obj_length( LB ), nb_alg );
49 
50  FLA_Repart_2x1_to_3x1( LT, &L0,
51  /* ** */ /* ** */
52  &L1,
53  LB, &L2, b, FLA_BOTTOM );
54 
55  FLA_Repart_1x2_to_1x3( DL, /**/ DR, &D0, /**/ &D1, &D2,
56  b, FLA_RIGHT );
57 
58  FLA_Repart_2x1_to_3x1( pT, &p0,
59  /* ** */ /* ** */
60  &p1,
61  pB, &p2, b, FLA_BOTTOM );
62 
63  FLA_Repart_2x1_to_3x1( CT, &C0,
64  /* ** */ /* ** */
65  &C1,
66  CB, &C2, b, FLA_BOTTOM );
67 
68  /*------------------------------------------------------------*/
69 
70  FLA_Part_1x2( L1, &L1_sqr, &L1_rest, b, FLA_LEFT );
71 
72 
73  FLA_SA_Apply_pivots( C1,
74  E, p1 );
75 
76  FLA_Trsm_external( FLA_LEFT, FLA_LOWER_TRIANGULAR,
77  FLA_NO_TRANSPOSE, FLA_UNIT_DIAG,
78  FLA_ONE, L1_sqr, C1 );
79 
80  FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
81  FLA_MINUS_ONE, D1, C1, FLA_ONE, E );
82 
83  /*------------------------------------------------------------*/
84 
85  FLA_Cont_with_3x1_to_2x1( &LT, L0,
86  L1,
87  /* ** */ /* ** */
88  &LB, L2, FLA_TOP );
89 
90  FLA_Cont_with_1x3_to_1x2( &DL, /**/ &DR, D0, D1, /**/ D2,
91  FLA_LEFT );
92 
93  FLA_Cont_with_3x1_to_2x1( &pT, p0,
94  p1,
95  /* ** */ /* ** */
96  &pB, p2, FLA_TOP );
97 
98  FLA_Cont_with_3x1_to_2x1( &CT, C0,
99  C1,
100  /* ** */ /* ** */
101  &CB, C2, FLA_TOP );
102  }
103 
104  return FLA_SUCCESS;
105 }
FLA_Error FLA_SA_Apply_pivots(FLA_Obj C, FLA_Obj E, FLA_Obj p)
Definition: FLA_SA_Apply_pivots.c:13
FLA_Error FLA_Gemm_external(FLA_Trans transa, FLA_Trans transb, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C)
Definition: FLA_Gemm_external.c:13
FLA_Error FLA_Trsm_external(FLA_Side side, FLA_Uplo uplo, FLA_Trans trans, FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B)
Definition: FLA_Trsm_external.c:13
FLA_Obj FLA_MINUS_ONE
Definition: FLA_Init.c:22
FLA_Obj FLA_ONE
Definition: FLA_Init.c:18
FLA_Error FLA_Cont_with_3x1_to_2x1(FLA_Obj *AT, FLA_Obj A0, FLA_Obj A1, FLA_Obj *AB, FLA_Obj A2, FLA_Side side)
Definition: FLA_View.c:428
FLA_Error FLA_Repart_2x1_to_3x1(FLA_Obj AT, FLA_Obj *A0, FLA_Obj *A1, FLA_Obj AB, FLA_Obj *A2, dim_t mb, FLA_Side side)
Definition: FLA_View.c:226
FLA_Error FLA_Cont_with_1x3_to_1x2(FLA_Obj *AL, FLA_Obj *AR, FLA_Obj A0, FLA_Obj A1, FLA_Obj A2, FLA_Side side)
Definition: FLA_View.c:475
FLA_Error FLA_Part_1x2(FLA_Obj A, FLA_Obj *A1, FLA_Obj *A2, dim_t nb, FLA_Side side)
Definition: FLA_View.c:110
FLA_Error FLA_Part_2x1(FLA_Obj A, FLA_Obj *A1, FLA_Obj *A2, dim_t mb, FLA_Side side)
Definition: FLA_View.c:76
FLA_Error FLA_Repart_1x2_to_1x3(FLA_Obj AL, FLA_Obj AR, FLA_Obj *A0, FLA_Obj *A1, FLA_Obj *A2, dim_t nb, FLA_Side side)
Definition: FLA_View.c:267
unsigned long dim_t
Definition: FLA_type_defs.h:71
Definition: FLA_type_defs.h:159

References FLA_Cont_with_1x3_to_1x2(), FLA_Cont_with_3x1_to_2x1(), FLA_Gemm_external(), FLA_MINUS_ONE, FLA_Obj_length(), FLA_ONE, FLA_Part_1x2(), FLA_Part_2x1(), FLA_Repart_1x2_to_1x3(), FLA_Repart_2x1_to_3x1(), FLA_SA_Apply_pivots(), and FLA_Trsm_external().

Referenced by FLA_SA_FS_task(), and FLASH_FS_incpiv_aux2().

◆ FLA_SA_LU_blk()

FLA_Error FLA_SA_LU_blk ( FLA_Obj  U,
FLA_Obj  D,
FLA_Obj  p,
FLA_Obj  L,
dim_t  nb_alg 
)
15 {
16  FLA_Obj UTL, UTR, U00, U01, U02,
17  UBL, UBR, U10, U11, U12,
18  U20, U21, U22;
19 
20  FLA_Obj DL, DR, D0, D1, D2;
21 
22  FLA_Obj pT, p0,
23  pB, p1,
24  p2;
25 
26  FLA_Obj LT, L0,
27  LB, L1,
28  L2;
29 
30  FLA_Obj L1_sqr, L1_rest;
31 
32  dim_t b;
33 
34  FLA_Part_2x2( U, &UTL, &UTR,
35  &UBL, &UBR, 0, 0, FLA_TL );
36 
37  FLA_Part_1x2( D, &DL, &DR, 0, FLA_LEFT );
38 
39  FLA_Part_2x1( p, &pT,
40  &pB, 0, FLA_TOP );
41 
42  FLA_Part_2x1( L, &LT,
43  &LB, 0, FLA_TOP );
44 
45  while ( FLA_Obj_length( UTL ) < FLA_Obj_length( U ) )
46  {
47  b = min( FLA_Obj_length( UBR ), nb_alg );
48 
49  FLA_Repart_2x2_to_3x3( UTL, /**/ UTR, &U00, /**/ &U01, &U02,
50  /* ************* */ /* ******************** */
51  &U10, /**/ &U11, &U12,
52  UBL, /**/ UBR, &U20, /**/ &U21, &U22,
53  b, b, FLA_BR );
54 
55  FLA_Repart_1x2_to_1x3( DL, /**/ DR, &D0, /**/ &D1, &D2,
56  b, FLA_RIGHT );
57 
58  FLA_Repart_2x1_to_3x1( pT, &p0,
59  /* ** */ /* ** */
60  &p1,
61  pB, &p2, b, FLA_BOTTOM );
62 
63  FLA_Repart_2x1_to_3x1( LT, &L0,
64  /* ** */ /* ** */
65  &L1,
66  LB, &L2, b, FLA_BOTTOM );
67 
68  /*------------------------------------------------------------*/
69 
70  FLA_Part_1x2( L1, &L1_sqr, &L1_rest, b, FLA_LEFT );
71 
72 
73  FLA_SA_LU_unb( U11,
74  D1, p1, L1_sqr );
75 
76  FLA_SA_Apply_pivots( U12,
77  D2, p1 );
78 
79  FLA_Trsm_external( FLA_LEFT, FLA_LOWER_TRIANGULAR,
80  FLA_NO_TRANSPOSE, FLA_UNIT_DIAG,
81  FLA_ONE, L1_sqr, U12 );
82 
83  FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
84  FLA_MINUS_ONE, D1, U12, FLA_ONE, D2 );
85 
86  /*------------------------------------------------------------*/
87 
88  FLA_Cont_with_3x3_to_2x2( &UTL, /**/ &UTR, U00, U01, /**/ U02,
89  U10, U11, /**/ U12,
90  /* ************** */ /* ****************** */
91  &UBL, /**/ &UBR, U20, U21, /**/ U22,
92  FLA_TL );
93 
94  FLA_Cont_with_1x3_to_1x2( &DL, /**/ &DR, D0, D1, /**/ D2,
95  FLA_LEFT );
96 
97  FLA_Cont_with_3x1_to_2x1( &pT, p0,
98  p1,
99  /* ** */ /* ** */
100  &pB, p2, FLA_TOP );
101 
102  FLA_Cont_with_3x1_to_2x1( &LT, L0,
103  L1,
104  /* ** */ /* ** */
105  &LB, L2, FLA_TOP );
106  }
107 
108  return FLA_SUCCESS;
109 }
FLA_Error FLA_SA_LU_unb(FLA_Obj U, FLA_Obj D, FLA_Obj p, FLA_Obj L)
Definition: FLA_SA_LU_unb.c:13
FLA_Error FLA_Cont_with_3x3_to_2x2(FLA_Obj *ATL, FLA_Obj *ATR, FLA_Obj A00, FLA_Obj A01, FLA_Obj A02, FLA_Obj A10, FLA_Obj A11, FLA_Obj A12, FLA_Obj *ABL, FLA_Obj *ABR, FLA_Obj A20, FLA_Obj A21, FLA_Obj A22, FLA_Quadrant quadrant)
Definition: FLA_View.c:304
FLA_Error FLA_Part_2x2(FLA_Obj A, FLA_Obj *A11, FLA_Obj *A12, FLA_Obj *A21, FLA_Obj *A22, dim_t mb, dim_t nb, FLA_Quadrant quadrant)
Definition: FLA_View.c:17
FLA_Error FLA_Repart_2x2_to_3x3(FLA_Obj ATL, FLA_Obj ATR, FLA_Obj *A00, FLA_Obj *A01, FLA_Obj *A02, FLA_Obj *A10, FLA_Obj *A11, FLA_Obj *A12, FLA_Obj ABL, FLA_Obj ABR, FLA_Obj *A20, FLA_Obj *A21, FLA_Obj *A22, dim_t mb, dim_t nb, FLA_Quadrant quadrant)
Definition: FLA_View.c:142

References FLA_Cont_with_1x3_to_1x2(), FLA_Cont_with_3x1_to_2x1(), FLA_Cont_with_3x3_to_2x2(), FLA_Gemm_external(), FLA_MINUS_ONE, FLA_Obj_length(), FLA_ONE, FLA_Part_1x2(), FLA_Part_2x1(), FLA_Part_2x2(), FLA_Repart_1x2_to_1x3(), FLA_Repart_2x1_to_3x1(), FLA_Repart_2x2_to_3x3(), FLA_SA_Apply_pivots(), FLA_SA_LU_unb(), and FLA_Trsm_external().

Referenced by FLA_SA_LU_task().

◆ FLA_SA_LU_unb()

FLA_Error FLA_SA_LU_unb ( FLA_Obj  U,
FLA_Obj  D,
FLA_Obj  p,
FLA_Obj  L 
)
14 {
15  FLA_Datatype datatype;
16  int m_U, cs_U;
17  int m_D, cs_D;
18  int cs_L;
19  // int rs_U;
20  int rs_D;
21  // int rs_L;
22  int m_U_min_j, m_U_min_j_min_1;
23  int j, ipiv;
24  int* buff_p;
25 
26  if ( FLA_Obj_has_zero_dim( U ) ) return FLA_SUCCESS;
27 
28  datatype = FLA_Obj_datatype( U );
29 
30  m_U = FLA_Obj_length( U );
31  // rs_U = FLA_Obj_row_stride( U );
32  cs_U = FLA_Obj_col_stride( U );
33 
34  m_D = FLA_Obj_length( D );
35  rs_D = FLA_Obj_row_stride( D );
36  cs_D = FLA_Obj_col_stride( D );
37 
38  // rs_L = FLA_Obj_row_stride( L );
39  cs_L = FLA_Obj_col_stride( L );
40 
41  FLA_Copy_external( U, L );
42  FLA_Triangularize( FLA_UPPER_TRIANGULAR, FLA_NONUNIT_DIAG, L );
43 
44  buff_p = ( int * ) FLA_INT_PTR( p );
45 
46  switch ( datatype ){
47 
48  case FLA_FLOAT:
49  {
50  float* buff_U = ( float * ) FLA_FLOAT_PTR( U );
51  float* buff_D = ( float * ) FLA_FLOAT_PTR( D );
52  float* buff_L = ( float * ) FLA_FLOAT_PTR( L );
53  float* buff_minus1 = ( float * ) FLA_FLOAT_PTR( FLA_MINUS_ONE );
54  float L_tmp;
55  float D_tmp;
56  float d_inv_Ljj;
57 
58  for ( j = 0; j < m_U; ++j )
59  {
60  bl1_samax( m_D,
61  buff_D + j*cs_D + 0*rs_D,
62  rs_D,
63  &ipiv );
64 
65  L_tmp = buff_L[ j*cs_L + j ];
66  D_tmp = buff_D[ j*cs_D + ipiv ];
67 
68  if ( fabsf( L_tmp ) < fabsf( D_tmp ) )
69  {
70  bl1_sswap( m_U,
71  buff_L + 0*cs_L + j, cs_L,
72  buff_D + 0*cs_D + ipiv, cs_D );
73 
74  buff_p[ j ] = ipiv + m_U - j;
75  }
76  else
77  {
78  buff_p[ j ] = 0;
79  }
80 
81  d_inv_Ljj = 1.0F / buff_L[ j*cs_L + j ];
82 
83  bl1_sscal( m_D,
84  &d_inv_Ljj,
85  buff_D + j*cs_D + 0, rs_D );
86 
87  m_U_min_j_min_1 = m_U - j - 1;
88 
89  if ( m_U_min_j_min_1 > 0 )
90  {
91  bl1_sger( BLIS1_NO_CONJUGATE,
92  BLIS1_NO_CONJUGATE,
93  m_D,
94  m_U_min_j_min_1,
95  buff_minus1,
96  buff_D + (j+0)*cs_D + 0, rs_D,
97  buff_L + (j+1)*cs_L + j, cs_L,
98  buff_D + (j+1)*cs_D + 0, rs_D, cs_D );
99  }
100 
101  m_U_min_j = m_U - j;
102 
103  if ( m_U_min_j > 0 )
104  {
105  bl1_scopy( m_U_min_j,
106  buff_L + j*cs_L + j, cs_L,
107  buff_U + j*cs_U + j, cs_U );
108  }
109  }
110  break;
111  }
112 
113  case FLA_DOUBLE:
114  {
115  double* buff_U = ( double * ) FLA_DOUBLE_PTR( U );
116  double* buff_D = ( double * ) FLA_DOUBLE_PTR( D );
117  double* buff_L = ( double * ) FLA_DOUBLE_PTR( L );
118  double* buff_minus1 = ( double * ) FLA_DOUBLE_PTR( FLA_MINUS_ONE );
119  double L_tmp;
120  double D_tmp;
121  double d_inv_Ljj;
122 
123  for ( j = 0; j < m_U; ++j )
124  {
125  bl1_damax( m_D,
126  buff_D + j*cs_D + 0*rs_D,
127  rs_D,
128  &ipiv );
129 
130  L_tmp = buff_L[ j*cs_L + j ];
131  D_tmp = buff_D[ j*cs_D + ipiv ];
132 
133  if ( fabs( L_tmp ) < fabs( D_tmp ) )
134  {
135  bl1_dswap( m_U,
136  buff_L + 0*cs_L + j, cs_L,
137  buff_D + 0*cs_D + ipiv, cs_D );
138 
139  buff_p[ j ] = ipiv + m_U - j;
140  }
141  else
142  {
143  buff_p[ j ] = 0;
144  }
145 
146  d_inv_Ljj = 1.0 / buff_L[ j*cs_L + j ];
147 
148  bl1_dscal( m_D,
149  &d_inv_Ljj,
150  buff_D + j*cs_D + 0, rs_D );
151 
152  m_U_min_j_min_1 = m_U - j - 1;
153 
154  if ( m_U_min_j_min_1 > 0 )
155  {
156  bl1_dger( BLIS1_NO_CONJUGATE,
157  BLIS1_NO_CONJUGATE,
158  m_D,
159  m_U_min_j_min_1,
160  buff_minus1,
161  buff_D + (j+0)*cs_D + 0, rs_D,
162  buff_L + (j+1)*cs_L + j, cs_L,
163  buff_D + (j+1)*cs_D + 0, rs_D, cs_D );
164  }
165 
166  m_U_min_j = m_U - j;
167 
168  if ( m_U_min_j > 0 )
169  {
170  bl1_dcopy( m_U_min_j,
171  buff_L + j*cs_L + j, cs_L,
172  buff_U + j*cs_U + j, cs_U );
173  }
174  }
175  break;
176  }
177 
178  case FLA_COMPLEX:
179  {
180  scomplex* buff_U = ( scomplex * ) FLA_COMPLEX_PTR( U );
181  scomplex* buff_D = ( scomplex * ) FLA_COMPLEX_PTR( D );
182  scomplex* buff_L = ( scomplex * ) FLA_COMPLEX_PTR( L );
183  scomplex* buff_minus1 = ( scomplex * ) FLA_COMPLEX_PTR( FLA_MINUS_ONE );
184  scomplex L_tmp;
185  scomplex D_tmp;
186  scomplex d_inv_Ljj;
187  scomplex Ljj;
188  float temp;
189 
190  for ( j = 0; j < m_U; ++j )
191  {
192  bl1_camax( m_D,
193  buff_D + j*cs_D + 0*rs_D,
194  rs_D,
195  &ipiv );
196 
197  L_tmp = buff_L[ j*cs_L + j ];
198  D_tmp = buff_D[ j*cs_D + ipiv ];
199 
200  if ( fabsf( L_tmp.real + L_tmp.imag ) < fabsf( D_tmp.real + D_tmp.imag ) )
201  {
202  bl1_cswap( m_U,
203  buff_L + 0*cs_L + j, cs_L,
204  buff_D + 0*cs_D + ipiv, cs_D );
205 
206  buff_p[ j ] = ipiv + m_U - j;
207  }
208  else
209  {
210  buff_p[ j ] = 0;
211  }
212 
213  Ljj = buff_L[ j*cs_L + j ];
214 
215  // d_inv_Ljj = 1.0 / Ljj
216  temp = 1.0F / ( Ljj.real * Ljj.real +
217  Ljj.imag * Ljj.imag );
218  d_inv_Ljj.real = Ljj.real * temp;
219  d_inv_Ljj.imag = Ljj.imag * -temp;
220 
221  bl1_cscal( m_D,
222  &d_inv_Ljj,
223  buff_D + j*cs_D + 0, rs_D );
224 
225  m_U_min_j_min_1 = m_U - j - 1;
226 
227  if ( m_U_min_j_min_1 > 0 )
228  {
229  bl1_cger( BLIS1_NO_CONJUGATE,
230  BLIS1_NO_CONJUGATE,
231  m_D,
232  m_U_min_j_min_1,
233  buff_minus1,
234  buff_D + (j+0)*cs_D + 0, rs_D,
235  buff_L + (j+1)*cs_L + j, cs_L,
236  buff_D + (j+1)*cs_D + 0, rs_D, cs_D );
237  }
238 
239  m_U_min_j = m_U - j;
240 
241  if ( m_U_min_j > 0 )
242  {
243  bl1_ccopy( m_U_min_j,
244  buff_L + j*cs_L + j, cs_L,
245  buff_U + j*cs_U + j, cs_U );
246  }
247  }
248  break;
249  }
250 
251  case FLA_DOUBLE_COMPLEX:
252  {
253  dcomplex* buff_U = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( U );
254  dcomplex* buff_D = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( D );
255  dcomplex* buff_L = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( L );
256  dcomplex* buff_minus1 = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( FLA_MINUS_ONE );
257  dcomplex L_tmp;
258  dcomplex D_tmp;
259  dcomplex d_inv_Ljj;
260  dcomplex Ljj;
261  double temp;
262 
263  for ( j = 0; j < m_U; ++j )
264  {
265  bl1_zamax( m_D,
266  buff_D + j*cs_D + 0*rs_D,
267  rs_D,
268  &ipiv );
269 
270  L_tmp = buff_L[ j*cs_L + j ];
271  D_tmp = buff_D[ j*cs_D + ipiv ];
272 
273  if ( fabs( L_tmp.real + L_tmp.imag ) < fabs( D_tmp.real + D_tmp.imag ) )
274  {
275  bl1_zswap( m_U,
276  buff_L + 0*cs_L + j, cs_L,
277  buff_D + 0*cs_D + ipiv, cs_D );
278 
279  buff_p[ j ] = ipiv + m_U - j;
280  }
281  else
282  {
283  buff_p[ j ] = 0;
284  }
285 
286  Ljj = buff_L[ j*cs_L + j ];
287 
288  // d_inv_Ljj = 1.0 / Ljj
289  temp = 1.0 / ( Ljj.real * Ljj.real +
290  Ljj.imag * Ljj.imag );
291  d_inv_Ljj.real = Ljj.real * temp;
292  d_inv_Ljj.imag = Ljj.imag * -temp;
293 
294  bl1_zscal( m_D,
295  &d_inv_Ljj,
296  buff_D + j*cs_D + 0, rs_D );
297 
298  m_U_min_j_min_1 = m_U - j - 1;
299 
300  if ( m_U_min_j_min_1 > 0 )
301  {
302  bl1_zger( BLIS1_NO_CONJUGATE,
303  BLIS1_NO_CONJUGATE,
304  m_D,
305  m_U_min_j_min_1,
306  buff_minus1,
307  buff_D + (j+0)*cs_D + 0, rs_D,
308  buff_L + (j+1)*cs_L + j, cs_L,
309  buff_D + (j+1)*cs_D + 0, rs_D, cs_D );
310  }
311 
312  m_U_min_j = m_U - j;
313 
314  if ( m_U_min_j > 0 )
315  {
316  bl1_zcopy( m_U_min_j,
317  buff_L + j*cs_L + j, cs_L,
318  buff_U + j*cs_U + j, cs_U );
319  }
320  }
321  break;
322  }
323 
324  }
325 
326  return FLA_SUCCESS;
327 }
FLA_Error FLA_Copy_external(FLA_Obj A, FLA_Obj B)
Definition: FLA_Copy_external.c:13
dim_t FLA_Obj_row_stride(FLA_Obj obj)
Definition: FLA_Query.c:167
FLA_Error FLA_Triangularize(FLA_Uplo uplo, FLA_Diag diag, FLA_Obj A)
Definition: FLA_Triangularize.c:13
void bl1_samax(int n, float *x, int incx, int *index)
Definition: bl1_amax.c:13
void bl1_zamax(int n, dcomplex *x, int incx, int *index)
Definition: bl1_amax.c:46
void bl1_damax(int n, double *x, int incx, int *index)
Definition: bl1_amax.c:24
void bl1_camax(int n, scomplex *x, int incx, int *index)
Definition: bl1_amax.c:35
dcomplex temp
Definition: bl1_axpyv2b.c:301
void bl1_zcopy(int m, dcomplex *x, int incx, dcomplex *y, int incy)
Definition: bl1_copy.c:52
void bl1_dcopy(int m, double *x, int incx, double *y, int incy)
Definition: bl1_copy.c:26
void bl1_ccopy(int m, scomplex *x, int incx, scomplex *y, int incy)
Definition: bl1_copy.c:39
void bl1_scopy(int m, float *x, int incx, float *y, int incy)
Definition: bl1_copy.c:13
void bl1_dger(conj1_t conjx, conj1_t conjy, int m, int n, double *alpha, double *x, int incx, double *y, int incy, double *a, int a_rs, int a_cs)
Definition: bl1_ger.c:62
void bl1_zger(conj1_t conjx, conj1_t conjy, int m, int n, dcomplex *alpha, dcomplex *x, int incx, dcomplex *y, int incy, dcomplex *a, int a_rs, int a_cs)
Definition: bl1_ger.c:194
void bl1_cger(conj1_t conjx, conj1_t conjy, int m, int n, scomplex *alpha, scomplex *x, int incx, scomplex *y, int incy, scomplex *a, int a_rs, int a_cs)
Definition: bl1_ger.c:111
void bl1_sger(conj1_t conjx, conj1_t conjy, int m, int n, float *alpha, float *x, int incx, float *y, int incy, float *a, int a_rs, int a_cs)
Definition: bl1_ger.c:13
void bl1_dscal(int n, double *alpha, double *x, int incx)
Definition: bl1_scal.c:26
void bl1_zscal(int n, dcomplex *alpha, dcomplex *x, int incx)
Definition: bl1_scal.c:78
void bl1_cscal(int n, scomplex *alpha, scomplex *x, int incx)
Definition: bl1_scal.c:52
void bl1_sscal(int n, float *alpha, float *x, int incx)
Definition: bl1_scal.c:13
@ BLIS1_NO_CONJUGATE
Definition: blis_type_defs.h:81
double real
Definition: blis_type_defs.h:139
double imag
Definition: blis_type_defs.h:139
float imag
Definition: blis_type_defs.h:134
float real
Definition: blis_type_defs.h:134

References bl1_camax(), bl1_ccopy(), bl1_cger(), bl1_cscal(), bl1_cswap(), bl1_damax(), bl1_dcopy(), bl1_dger(), bl1_dscal(), bl1_dswap(), bl1_samax(), bl1_scopy(), bl1_sger(), bl1_sscal(), bl1_sswap(), bl1_zamax(), bl1_zcopy(), bl1_zger(), bl1_zscal(), bl1_zswap(), BLIS1_NO_CONJUGATE, FLA_Copy_external(), FLA_MINUS_ONE, FLA_Obj_col_stride(), FLA_Obj_datatype(), FLA_Obj_has_zero_dim(), FLA_Obj_length(), FLA_Obj_row_stride(), FLA_Triangularize(), scomplex::imag, dcomplex::imag, scomplex::real, dcomplex::real, and temp.

Referenced by FLA_SA_LU_blk().

◆ FLASH_FS_incpiv_aux1()

FLA_Error FLASH_FS_incpiv_aux1 ( FLA_Obj  A,
FLA_Obj  p,
FLA_Obj  L,
FLA_Obj  b,
dim_t  nb_alg 
)
14 {
15  FLA_Obj ATL, ATR, A00, A01, A02,
16  ABL, ABR, A10, A11, A12,
17  A20, A21, A22;
18 
19  FLA_Obj pTL, pTR, p00, p01, p02,
20  pBL, pBR, p10, p11, p12,
21  p20, p21, p22;
22 
23  FLA_Obj LTL, LTR, L00, L01, L02,
24  LBL, LBR, L10, L11, L12,
25  L20, L21, L22;
26 
27  FLA_Obj bT, b0,
28  bB, b1,
29  b2;
30 
31  FLA_Obj p11_conf,
32  p11_rest;
33 
34  FLA_Part_2x2( A, &ATL, &ATR,
35  &ABL, &ABR, 0, 0, FLA_TL );
36 
37  FLA_Part_2x2( p, &pTL, &pTR,
38  &pBL, &pBR, 0, 0, FLA_TL );
39 
40  FLA_Part_2x2( L, &LTL, &LTR,
41  &LBL, &LBR, 0, 0, FLA_TL );
42 
43  FLA_Part_2x1( b, &bT,
44  &bB, 0, FLA_TOP );
45 
46  while ( FLA_Obj_length( ATL ) < FLA_Obj_length( A ) &&
47  FLA_Obj_width ( ATL ) < FLA_Obj_width ( A ) )
48  {
49  FLA_Repart_2x2_to_3x3( ATL, /**/ ATR, &A00, /**/ &A01, &A02,
50  /* ************* */ /* ******************** */
51  &A10, /**/ &A11, &A12,
52  ABL, /**/ ABR, &A20, /**/ &A21, &A22,
53  1, 1, FLA_BR );
54 
55  FLA_Repart_2x2_to_3x3( pTL, /**/ pTR, &p00, /**/ &p01, &p02,
56  /* ************* */ /* ******************** */
57  &p10, /**/ &p11, &p12,
58  pBL, /**/ pBR, &p20, /**/ &p21, &p22,
59  1, 1, FLA_BR );
60 
61  FLA_Repart_2x2_to_3x3( LTL, /**/ LTR, &L00, /**/ &L01, &L02,
62  /* ************* */ /* ******************** */
63  &L10, /**/ &L11, &L12,
64  LBL, /**/ LBR, &L20, /**/ &L21, &L22,
65  1, 1, FLA_BR );
66 
67  FLA_Repart_2x1_to_3x1( bT, &b0,
68  /* ** */ /* ** */
69  &b1,
70  bB, &b2, 1, FLA_BOTTOM );
71 
72  /*------------------------------------------------------------*/
73 
74  FLA_Part_2x1( *FLASH_OBJ_PTR_AT( p11 ), &p11_conf,
75  &p11_rest,
76  FLA_Obj_length( *FLASH_OBJ_PTR_AT( b1 ) ), FLA_TOP );
77 
78 
79  FLA_Apply_pivots( FLA_LEFT, FLA_NO_TRANSPOSE,
80  p11_conf,
81  *FLASH_OBJ_PTR_AT( b1 ) );
82 
83  FLA_Trsv_external( FLA_LOWER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_UNIT_DIAG,
84  *FLASH_OBJ_PTR_AT( A11 ),
85  *FLASH_OBJ_PTR_AT( b1 ) );
86 
87  FLASH_FS_incpiv_aux2( L21,
88  A21, p21, b1,
89  b2, nb_alg );
90 
91  /*------------------------------------------------------------*/
92 
93  FLA_Cont_with_3x3_to_2x2( &ATL, /**/ &ATR, A00, A01, /**/ A02,
94  A10, A11, /**/ A12,
95  /* ************** */ /* ****************** */
96  &ABL, /**/ &ABR, A20, A21, /**/ A22,
97  FLA_TL );
98 
99  FLA_Cont_with_3x3_to_2x2( &pTL, /**/ &pTR, p00, p01, /**/ p02,
100  p10, p11, /**/ p12,
101  /* ************** */ /* ****************** */
102  &pBL, /**/ &pBR, p20, p21, /**/ p22,
103  FLA_TL );
104 
105  FLA_Cont_with_3x3_to_2x2( &LTL, /**/ &LTR, L00, L01, /**/ L02,
106  L10, L11, /**/ L12,
107  /* ************** */ /* ****************** */
108  &LBL, /**/ &LBR, L20, L21, /**/ L22,
109  FLA_TL );
110 
111  FLA_Cont_with_3x1_to_2x1( &bT, b0,
112  b1,
113  /* ** */ /* ** */
114  &bB, b2, FLA_TOP );
115  }
116 
117  return FLA_SUCCESS;
118 }
FLA_Error FLASH_FS_incpiv_aux2(FLA_Obj L, FLA_Obj D, FLA_Obj p, FLA_Obj C, FLA_Obj E, dim_t nb_alg)
Definition: FLASH_FS_incpiv_aux2.c:13
FLA_Error FLA_Trsv_external(FLA_Uplo uplo, FLA_Trans transa, FLA_Diag diag, FLA_Obj A, FLA_Obj x)
Definition: FLA_Trsv_external.c:13
FLA_Error FLA_Apply_pivots(FLA_Side side, FLA_Trans trans, FLA_Obj p, FLA_Obj A)
Definition: FLA_Apply_pivots.c:15

References FLA_Apply_pivots(), FLA_Cont_with_3x1_to_2x1(), FLA_Cont_with_3x3_to_2x2(), FLA_Obj_length(), FLA_Obj_width(), FLA_Part_2x1(), FLA_Part_2x2(), FLA_Repart_2x1_to_3x1(), FLA_Repart_2x2_to_3x3(), FLA_Trsv_external(), and FLASH_FS_incpiv_aux2().

Referenced by FLASH_FS_incpiv().

◆ FLASH_FS_incpiv_aux2()

FLA_Error FLASH_FS_incpiv_aux2 ( FLA_Obj  L,
FLA_Obj  D,
FLA_Obj  p,
FLA_Obj  C,
FLA_Obj  E,
dim_t  nb_alg 
)
16 {
17  FLA_Obj LT, L0,
18  LB, L1,
19  L2;
20 
21  FLA_Obj DT, D0,
22  DB, D1,
23  D2;
24 
25  FLA_Obj pT, p0,
26  pB, p1,
27  p2;
28 
29  FLA_Obj ET, E0,
30  EB, E1,
31  E2;
32 
33  FLA_Part_2x1( L, &LT,
34  &LB, 0, FLA_TOP );
35 
36  FLA_Part_2x1( D, &DT,
37  &DB, 0, FLA_TOP );
38 
39  FLA_Part_2x1( p, &pT,
40  &pB, 0, FLA_TOP );
41 
42  FLA_Part_2x1( E, &ET,
43  &EB, 0, FLA_TOP );
44 
45  while ( FLA_Obj_length( DT ) < FLA_Obj_length( D ) )
46  {
47  FLA_Repart_2x1_to_3x1( LT, &L0,
48  /* ** */ /* ** */
49  &L1,
50  LB, &L2, 1, FLA_BOTTOM );
51 
52  FLA_Repart_2x1_to_3x1( DT, &D0,
53  /* ** */ /* ** */
54  &D1,
55  DB, &D2, 1, FLA_BOTTOM );
56 
57  FLA_Repart_2x1_to_3x1( pT, &p0,
58  /* ** */ /* ** */
59  &p1,
60  pB, &p2, 1, FLA_BOTTOM );
61 
62  FLA_Repart_2x1_to_3x1( ET, &E0,
63  /* ** */ /* ** */
64  &E1,
65  EB, &E2, 1, FLA_BOTTOM );
66 
67  /*------------------------------------------------------------*/
68 
69  FLA_SA_FS_blk( *FLASH_OBJ_PTR_AT( L1 ),
70  *FLASH_OBJ_PTR_AT( D1 ),
71  *FLASH_OBJ_PTR_AT( p1 ),
72  *FLASH_OBJ_PTR_AT( C ),
73  *FLASH_OBJ_PTR_AT( E1 ),
74  nb_alg );
75 
76  /*------------------------------------------------------------*/
77 
78  FLA_Cont_with_3x1_to_2x1( &LT, L0,
79  L1,
80  /* ** */ /* ** */
81  &LB, L2, FLA_TOP );
82 
83  FLA_Cont_with_3x1_to_2x1( &DT, D0,
84  D1,
85  /* ** */ /* ** */
86  &DB, D2, FLA_TOP );
87 
88  FLA_Cont_with_3x1_to_2x1( &pT, p0,
89  p1,
90  /* ** */ /* ** */
91  &pB, p2, FLA_TOP );
92 
93  FLA_Cont_with_3x1_to_2x1( &ET, E0,
94  E1,
95  /* ** */ /* ** */
96  &EB, E2, FLA_TOP );
97  }
98 
99  return FLA_SUCCESS;
100 }
FLA_Error FLA_SA_FS_blk(FLA_Obj L, FLA_Obj D, FLA_Obj p, FLA_Obj C, FLA_Obj E, dim_t nb_alg)
Definition: FLA_SA_FS_blk.c:13

References FLA_Cont_with_3x1_to_2x1(), FLA_Obj_length(), FLA_Part_2x1(), FLA_Repart_2x1_to_3x1(), and FLA_SA_FS_blk().

Referenced by FLASH_FS_incpiv_aux1().

◆ FLASH_LU_incpiv_var1()

FLA_Error FLASH_LU_incpiv_var1 ( FLA_Obj  A,
FLA_Obj  p,
FLA_Obj  L,
dim_t  nb_alg,
fla_lu_t *  cntl 
)
14 {
15  FLA_Obj ATL, ATR, A00, A01, A02,
16  ABL, ABR, A10, A11, A12,
17  A20, A21, A22;
18 
19  FLA_Obj pTL, pTR, p00, p01, p02,
20  pBL, pBR, p10, p11, p12,
21  p20, p21, p22;
22 
23  FLA_Obj LTL, LTR, L00, L01, L02,
24  LBL, LBR, L10, L11, L12,
25  L20, L21, L22;
26 
27  FLA_Part_2x2( A, &ATL, &ATR,
28  &ABL, &ABR, 0, 0, FLA_TL );
29 
30  FLA_Part_2x2( p, &pTL, &pTR,
31  &pBL, &pBR, 0, 0, FLA_TL );
32 
33  FLA_Part_2x2( L, &LTL, &LTR,
34  &LBL, &LBR, 0, 0, FLA_TL );
35 
36  while ( FLA_Obj_length( ATL ) < FLA_Obj_length( A ) &&
37  FLA_Obj_width ( ATL ) < FLA_Obj_width ( A ) )
38  {
39  FLA_Repart_2x2_to_3x3( ATL, /**/ ATR, &A00, /**/ &A01, &A02,
40  /* ************* */ /* ******************** */
41  &A10, /**/ &A11, &A12,
42  ABL, /**/ ABR, &A20, /**/ &A21, &A22,
43  1, 1, FLA_BR );
44 
45  FLA_Repart_2x2_to_3x3( pTL, /**/ pTR, &p00, /**/ &p01, &p02,
46  /* ************* */ /* ******************** */
47  &p10, /**/ &p11, &p12,
48  pBL, /**/ pBR, &p20, /**/ &p21, &p22,
49  1, 1, FLA_BR );
50 
51  FLA_Repart_2x2_to_3x3( LTL, /**/ LTR, &L00, /**/ &L01, &L02,
52  /* ************* */ /* ******************** */
53  &L10, /**/ &L11, &L12,
54  LBL, /**/ LBR, &L20, /**/ &L21, &L22,
55  1, 1, FLA_BR );
56 
57  /*------------------------------------------------------------*/
58 
59  if ( FLASH_Queue_get_enabled( ) )
60  {
61  // Enqueue
62  ENQUEUE_FLASH_LU_piv( *FLASH_OBJ_PTR_AT( A11 ),
63  *FLASH_OBJ_PTR_AT( p11 ),
64  FLA_Cntl_sub_lu( cntl ) );
65  }
66  else
67  {
68  // Execute leaf
69  FLA_LU_piv_task( *FLASH_OBJ_PTR_AT( A11 ),
70  *FLASH_OBJ_PTR_AT( p11 ),
71  FLA_Cntl_sub_lu( cntl ) );
72  }
73 
74  FLASH_Trsm_piv( A11, A12, p11,
75  FLA_Cntl_sub_trsm1( cntl ) );
76 
77  FLASH_SA_LU( A11, A12,
78  A21, A22, p21, L21, nb_alg, cntl );
79 
80  /*------------------------------------------------------------*/
81 
82  FLA_Cont_with_3x3_to_2x2( &ATL, /**/ &ATR, A00, A01, /**/ A02,
83  A10, A11, /**/ A12,
84  /* ************** */ /* ****************** */
85  &ABL, /**/ &ABR, A20, A21, /**/ A22,
86  FLA_TL );
87 
88  FLA_Cont_with_3x3_to_2x2( &pTL, /**/ &pTR, p00, p01, /**/ p02,
89  p10, p11, /**/ p12,
90  /* ************** */ /* ****************** */
91  &pBL, /**/ &pBR, p20, p21, /**/ p22,
92  FLA_TL );
93 
94  FLA_Cont_with_3x3_to_2x2( &LTL, /**/ &LTR, L00, L01, /**/ L02,
95  L10, L11, /**/ L12,
96  /* ************** */ /* ****************** */
97  &LBL, /**/ &LBR, L20, L21, /**/ L22,
98  FLA_TL );
99  }
100 
101  return FLA_SUCCESS;
102 }
FLA_Bool FLASH_Queue_get_enabled(void)
Definition: FLASH_Queue.c:171
FLA_Error FLASH_SA_LU(FLA_Obj B, FLA_Obj C, FLA_Obj D, FLA_Obj E, FLA_Obj p, FLA_Obj L, dim_t nb_alg, fla_lu_t *cntl)
Definition: FLASH_SA_LU.c:13
FLA_Error FLASH_Trsm_piv(FLA_Obj A, FLA_Obj B, FLA_Obj p, fla_trsm_t *cntl)
Definition: FLASH_Trsm_piv.c:13
FLA_Error FLA_LU_piv_task(FLA_Obj A, FLA_Obj p, fla_lu_t *cntl)
Definition: FLA_LU_piv_task.c:15

References FLA_Cont_with_3x3_to_2x2(), FLA_LU_piv_task(), FLA_Obj_length(), FLA_Obj_width(), FLA_Part_2x2(), FLA_Repart_2x2_to_3x3(), FLASH_Queue_get_enabled(), FLASH_SA_LU(), and FLASH_Trsm_piv().

Referenced by FLASH_LU_incpiv_noopt().

◆ FLASH_LU_incpiv_var2()

FLA_Error FLASH_LU_incpiv_var2 ( FLA_Obj  A,
FLA_Obj  p,
FLA_Obj  L,
FLA_Obj  U,
dim_t  nb_alg,
fla_lu_t *  cntl 
)
14 {
15  FLA_Obj ATL, ATR, A00, A01, A02,
16  ABL, ABR, A10, A11, A12,
17  A20, A21, A22;
18 
19  FLA_Obj pTL, pTR, p00, p01, p02,
20  pBL, pBR, p10, p11, p12,
21  p20, p21, p22;
22 
23  FLA_Obj LTL, LTR, L00, L01, L02,
24  LBL, LBR, L10, L11, L12,
25  L20, L21, L22;
26 
27  FLA_Obj UL, UR, U0, U1, U2;
28 
29  FLA_Part_2x2( A, &ATL, &ATR,
30  &ABL, &ABR, 0, 0, FLA_TL );
31 
32  FLA_Part_2x2( p, &pTL, &pTR,
33  &pBL, &pBR, 0, 0, FLA_TL );
34 
35  FLA_Part_2x2( L, &LTL, &LTR,
36  &LBL, &LBR, 0, 0, FLA_TL );
37 
38  FLA_Part_1x2( U, &UL, &UR, 0, FLA_LEFT );
39 
40  while ( FLA_Obj_length( ATL ) < FLA_Obj_length( A ) &&
41  FLA_Obj_width ( ATL ) < FLA_Obj_width ( A ) )
42  {
43  FLA_Repart_2x2_to_3x3( ATL, /**/ ATR, &A00, /**/ &A01, &A02,
44  /* ************* */ /* ******************** */
45  &A10, /**/ &A11, &A12,
46  ABL, /**/ ABR, &A20, /**/ &A21, &A22,
47  1, 1, FLA_BR );
48 
49  FLA_Repart_2x2_to_3x3( pTL, /**/ pTR, &p00, /**/ &p01, &p02,
50  /* ************* */ /* ******************** */
51  &p10, /**/ &p11, &p12,
52  pBL, /**/ pBR, &p20, /**/ &p21, &p22,
53  1, 1, FLA_BR );
54 
55  FLA_Repart_2x2_to_3x3( LTL, /**/ LTR, &L00, /**/ &L01, &L02,
56  /* ************* */ /* ******************** */
57  &L10, /**/ &L11, &L12,
58  LBL, /**/ LBR, &L20, /**/ &L21, &L22,
59  1, 1, FLA_BR );
60 
61  FLA_Repart_1x2_to_1x3( UL, /**/ UR, &U0, /**/ &U1, &U2,
62  1, FLA_RIGHT );
63 
64  /*------------------------------------------------------------*/
65 
66  if ( FLASH_Queue_get_enabled( ) )
67  {
68  // Enqueue
69  ENQUEUE_FLASH_LU_piv_copy( *FLASH_OBJ_PTR_AT( A11 ),
70  *FLASH_OBJ_PTR_AT( p11 ),
71  *FLASH_OBJ_PTR_AT( U1 ),
72  FLA_Cntl_sub_lu( cntl ) );
73  }
74  else
75  {
76  // Execute leaf
77  FLA_LU_piv_copy_task( *FLASH_OBJ_PTR_AT( A11 ),
78  *FLASH_OBJ_PTR_AT( p11 ),
79  *FLASH_OBJ_PTR_AT( U1 ),
80  FLA_Cntl_sub_lu( cntl ) );
81  }
82 
83  FLASH_Trsm_piv( U1, A12, p11,
84  FLA_Cntl_sub_trsm1( cntl ) );
85 
86  FLASH_SA_LU( A11, A12,
87  A21, A22, p21, L21, nb_alg, cntl );
88 
89  /*------------------------------------------------------------*/
90 
91  FLA_Cont_with_3x3_to_2x2( &ATL, /**/ &ATR, A00, A01, /**/ A02,
92  A10, A11, /**/ A12,
93  /* ************** */ /* ****************** */
94  &ABL, /**/ &ABR, A20, A21, /**/ A22,
95  FLA_TL );
96 
97  FLA_Cont_with_3x3_to_2x2( &pTL, /**/ &pTR, p00, p01, /**/ p02,
98  p10, p11, /**/ p12,
99  /* ************** */ /* ****************** */
100  &pBL, /**/ &pBR, p20, p21, /**/ p22,
101  FLA_TL );
102 
103  FLA_Cont_with_3x3_to_2x2( &LTL, /**/ &LTR, L00, L01, /**/ L02,
104  L10, L11, /**/ L12,
105  /* ************** */ /* ****************** */
106  &LBL, /**/ &LBR, L20, L21, /**/ L22,
107  FLA_TL );
108 
109  FLA_Cont_with_1x3_to_1x2( &UL, /**/ &UR, U0, U1, /**/ U2,
110  FLA_LEFT );
111  }
112 
113  return FLA_SUCCESS;
114 }
FLA_Error FLA_LU_piv_copy_task(FLA_Obj A, FLA_Obj p, FLA_Obj U, fla_lu_t *cntl)
Definition: FLA_LU_piv_copy_task.c:13

References FLA_Cont_with_1x3_to_1x2(), FLA_Cont_with_3x3_to_2x2(), FLA_LU_piv_copy_task(), FLA_Obj_length(), FLA_Obj_width(), FLA_Part_1x2(), FLA_Part_2x2(), FLA_Repart_1x2_to_1x3(), FLA_Repart_2x2_to_3x3(), FLASH_Queue_get_enabled(), FLASH_SA_LU(), and FLASH_Trsm_piv().

Referenced by FLASH_LU_incpiv_opt1().

◆ FLASH_SA_FS()

/*
 * FLASH_SA_FS
 *
 * Walks C and E from left to right in lock-step, one partition step per
 * iteration, and runs the SA_FS operation on every exposed pair
 * ( C1, E1 ) together with the fixed operands L, D and p.  The operation
 * is enqueued when the FLASH queue is enabled and executed directly as a
 * task otherwise.
 *
 * Parameters:
 *   L, D, p  Passed to every task unchanged (after FLASH_OBJ_PTR_AT
 *            dereferencing).
 *   C, E     Partitioned by columns.  Only the width of C drives the
 *            loop, so E is presumably expected to hold at least as many
 *            column blocks as C -- confirm against the caller.
 *   nb_alg   Forwarded unchanged to the task.
 *   cntl     Control tree.  Taken by pointer, as in the prototype of this
 *            function; its sub-tree is selected with FLA_Cntl_sub_gemm.
 *
 * Returns:
 *   FLA_SUCCESS, unconditionally.
 */
FLA_Error FLASH_SA_FS( FLA_Obj L,
                       FLA_Obj D, FLA_Obj p, FLA_Obj C,
                                             FLA_Obj E,
                       dim_t nb_alg, fla_gemm_t *cntl )
{
  FLA_Obj CL,    CR,       C0,  C1,  C2;

  FLA_Obj EL,    ER,       E0,  E1,  E2;

  // Start with empty left parts.
  FLA_Part_1x2( C,    &CL,  &CR,      0, FLA_LEFT );

  FLA_Part_1x2( E,    &EL,  &ER,      0, FLA_LEFT );

  while ( FLA_Obj_width( CL ) < FLA_Obj_width( C ) )
  {
    // Expose the next column block of C and of E.
    FLA_Repart_1x2_to_1x3( CL,  /**/ CR,        &C0, /**/ &C1, &C2,
                           1, FLA_RIGHT );

    FLA_Repart_1x2_to_1x3( EL,  /**/ ER,        &E0, /**/ &E1, &E2,
                           1, FLA_RIGHT );

    /*------------------------------------------------------------*/

    if ( FLASH_Queue_get_enabled( ) )
    {
      // Enqueue
      ENQUEUE_FLASH_SA_FS( *FLASH_OBJ_PTR_AT( L ),
                           *FLASH_OBJ_PTR_AT( D ),
                           *FLASH_OBJ_PTR_AT( p ),
                           *FLASH_OBJ_PTR_AT( C1 ),
                           *FLASH_OBJ_PTR_AT( E1 ),
                           nb_alg,
                           FLA_Cntl_sub_gemm( cntl ) );
    }
    else
    {
      // Execute leaf
      FLA_SA_FS_task( *FLASH_OBJ_PTR_AT( L ),
                      *FLASH_OBJ_PTR_AT( D ),
                      *FLASH_OBJ_PTR_AT( p ),
                      *FLASH_OBJ_PTR_AT( C1 ),
                      *FLASH_OBJ_PTR_AT( E1 ),
                      nb_alg,
                      FLA_Cntl_sub_gemm( cntl ) );
    }

    /*------------------------------------------------------------*/

    // Fold the processed column block into the left part.
    FLA_Cont_with_1x3_to_1x2( &CL,  /**/ &CR,        C0, C1, /**/ C2,
                              FLA_LEFT );

    FLA_Cont_with_1x3_to_1x2( &EL,  /**/ &ER,        E0, E1, /**/ E2,
                              FLA_LEFT );
  }

  return FLA_SUCCESS;
}
FLA_Error FLA_SA_FS_task(FLA_Obj L, FLA_Obj D, FLA_Obj p, FLA_Obj C, FLA_Obj E, dim_t nb_alg, fla_gemm_t *cntl)
Definition: FLA_SA_FS_task.c:13

References FLA_Cont_with_1x3_to_1x2(), FLA_Obj_width(), FLA_Part_1x2(), FLA_Repart_1x2_to_1x3(), FLA_SA_FS_task(), and FLASH_Queue_get_enabled().

Referenced by FLASH_SA_LU().

◆ FLASH_SA_LU()

/*
 * FLASH_SA_LU
 *
 * Walks D, E, p and L from top to bottom in lock-step, one partition step
 * per iteration.  For every exposed row block the routine:
 *
 *   1. runs the SA_LU operation on ( B, D1, p1, L1 ) -- enqueued when the
 *      FLASH queue is enabled, executed directly as a task otherwise;
 *   2. calls FLASH_SA_FS on ( L1, D1, p1, C, E1 ).
 *
 * Parameters:
 *   B, C     Passed to every iteration unpartitioned.
 *   D, E     Partitioned by rows.  Only the length of D drives the loop,
 *            so E, p and L are presumably expected to hold at least as
 *            many row blocks as D -- confirm against the caller.
 *   p, L     Partitioned by rows alongside D.
 *   nb_alg   Forwarded unchanged to the task and to FLASH_SA_FS.
 *   cntl     Control tree.  Taken by pointer, as in the prototype of this
 *            function; its sub-trees are selected with FLA_Cntl_sub_lu
 *            and FLA_Cntl_sub_gemm1.
 *
 * Returns:
 *   FLA_SUCCESS, unconditionally.  The return value of FLASH_SA_FS is not
 *   inspected.
 */
FLA_Error FLASH_SA_LU( FLA_Obj B, FLA_Obj C,
                       FLA_Obj D, FLA_Obj E, FLA_Obj p, FLA_Obj L,
                       dim_t nb_alg, fla_lu_t *cntl )
{
  FLA_Obj DT,              D0,
          DB,              D1,
                           D2;

  FLA_Obj ET,              E0,
          EB,              E1,
                           E2;

  FLA_Obj pT,              p0,
          pB,              p1,
                           p2;

  FLA_Obj LT,              L0,
          LB,              L1,
                           L2;

  // Start with empty top parts.
  FLA_Part_2x1( D,    &DT,
                      &DB,            0, FLA_TOP );

  FLA_Part_2x1( E,    &ET,
                      &EB,            0, FLA_TOP );

  FLA_Part_2x1( p,    &pT,
                      &pB,            0, FLA_TOP );

  FLA_Part_2x1( L,    &LT,
                      &LB,            0, FLA_TOP );

  while ( FLA_Obj_length( DT ) < FLA_Obj_length( D ) )
  {
    // Expose the next row block of each operand.
    FLA_Repart_2x1_to_3x1( DT,                &D0,
                        /* ** */            /* ** */
                                              &D1,
                           DB,                &D2,        1, FLA_BOTTOM );

    FLA_Repart_2x1_to_3x1( ET,                &E0,
                        /* ** */            /* ** */
                                              &E1,
                           EB,                &E2,        1, FLA_BOTTOM );

    FLA_Repart_2x1_to_3x1( pT,                &p0,
                        /* ** */            /* ** */
                                              &p1,
                           pB,                &p2,        1, FLA_BOTTOM );

    FLA_Repart_2x1_to_3x1( LT,                &L0,
                        /* ** */            /* ** */
                                              &L1,
                           LB,                &L2,        1, FLA_BOTTOM );

    /*------------------------------------------------------------*/

    // Stage 1: SA_LU on B and the current row block D1.
    if ( FLASH_Queue_get_enabled( ) )
    {
      // Enqueue
      ENQUEUE_FLASH_SA_LU( *FLASH_OBJ_PTR_AT( B ),
                           *FLASH_OBJ_PTR_AT( D1 ),
                           *FLASH_OBJ_PTR_AT( p1 ),
                           *FLASH_OBJ_PTR_AT( L1 ),
                           nb_alg,
                           FLA_Cntl_sub_lu( cntl ) );
    }
    else
    {
      // Execute leaf
      FLA_SA_LU_task( *FLASH_OBJ_PTR_AT( B ),
                      *FLASH_OBJ_PTR_AT( D1 ),
                      *FLASH_OBJ_PTR_AT( p1 ),
                      *FLASH_OBJ_PTR_AT( L1 ),
                      nb_alg,
                      FLA_Cntl_sub_lu( cntl ) );
    }

    // Stage 2: L1, D1 and p1 are applied to C and the matching row
    // block E1.
    FLASH_SA_FS( L1,
                 D1, p1, C,
                         E1, nb_alg, FLA_Cntl_sub_gemm1( cntl ) );

    /*------------------------------------------------------------*/

    // Fold the processed row block into the top part.
    FLA_Cont_with_3x1_to_2x1( &DT,                D0,
                                                  D1,
                            /* ** */           /* ** */
                              &DB,                D2,     FLA_TOP );

    FLA_Cont_with_3x1_to_2x1( &ET,                E0,
                                                  E1,
                            /* ** */           /* ** */
                              &EB,                E2,     FLA_TOP );

    FLA_Cont_with_3x1_to_2x1( &pT,                p0,
                                                  p1,
                            /* ** */           /* ** */
                              &pB,                p2,     FLA_TOP );

    FLA_Cont_with_3x1_to_2x1( &LT,                L0,
                                                  L1,
                            /* ** */           /* ** */
                              &LB,                L2,     FLA_TOP );
  }

  return FLA_SUCCESS;
}
FLA_Error FLASH_SA_FS(FLA_Obj L, FLA_Obj D, FLA_Obj p, FLA_Obj C, FLA_Obj E, dim_t nb_alg, fla_gemm_t *cntl)
Definition: FLASH_SA_FS.c:13
FLA_Error FLA_SA_LU_task(FLA_Obj U, FLA_Obj D, FLA_Obj p, FLA_Obj L, dim_t nb_alg, fla_lu_t *cntl)
Definition: FLA_SA_LU_task.c:13

References FLA_Cont_with_3x1_to_2x1(), FLA_Obj_length(), FLA_Part_2x1(), FLA_Repart_2x1_to_3x1(), FLA_SA_LU_task(), FLASH_Queue_get_enabled(), and FLASH_SA_FS().

Referenced by FLASH_LU_incpiv_var1(), and FLASH_LU_incpiv_var2().

◆ FLASH_Trsm_piv()

/*
 * FLASH_Trsm_piv
 *
 * Walks B from left to right, one partition step per iteration, and runs
 * the Trsm_piv operation on every exposed column block B1 together with
 * the fixed operands A and p.  The operation is enqueued when the FLASH
 * queue is enabled and executed directly as a task otherwise.
 *
 * Parameters:
 *   A, p   Passed to every task unchanged (after FLASH_OBJ_PTR_AT
 *          dereferencing).
 *   B      Partitioned by columns; each column block is processed once.
 *   cntl   Control tree.  Taken by pointer, as in the prototype of this
 *          function; its sub-tree is selected with FLA_Cntl_sub_trsm.
 *
 * Returns:
 *   FLA_SUCCESS, unconditionally.
 */
FLA_Error FLASH_Trsm_piv( FLA_Obj A, FLA_Obj B, FLA_Obj p, fla_trsm_t *cntl )
{
  FLA_Obj BL,    BR,       B0,  B1,  B2;

  // Start with an empty left part.
  FLA_Part_1x2( B,    &BL,  &BR,      0, FLA_LEFT );

  while ( FLA_Obj_width( BL ) < FLA_Obj_width( B ) )
  {
    // Expose the next column block of B.
    FLA_Repart_1x2_to_1x3( BL,  /**/ BR,        &B0, /**/ &B1, &B2,
                           1, FLA_RIGHT );

    /*------------------------------------------------------------*/

    if ( FLASH_Queue_get_enabled( ) )
    {
      // Enqueue
      ENQUEUE_FLASH_Trsm_piv( *FLASH_OBJ_PTR_AT( A ),
                              *FLASH_OBJ_PTR_AT( B1 ),
                              *FLASH_OBJ_PTR_AT( p ),
                              FLA_Cntl_sub_trsm( cntl ) );
    }
    else
    {
      // Execute leaf
      FLA_Trsm_piv_task( *FLASH_OBJ_PTR_AT( A ),
                         *FLASH_OBJ_PTR_AT( B1 ),
                         *FLASH_OBJ_PTR_AT( p ),
                         FLA_Cntl_sub_trsm( cntl ) );
    }

    /*------------------------------------------------------------*/

    // Fold the processed column block into the left part.
    FLA_Cont_with_1x3_to_1x2( &BL,  /**/ &BR,        B0, B1, /**/ B2,
                              FLA_LEFT );
  }

  return FLA_SUCCESS;
}
FLA_Error FLA_Trsm_piv_task(FLA_Obj A, FLA_Obj B, FLA_Obj p, fla_trsm_t *cntl)
Definition: FLA_Trsm_piv_task.c:13

References FLA_Cont_with_1x3_to_1x2(), FLA_Obj_width(), FLA_Part_1x2(), FLA_Repart_1x2_to_1x3(), FLA_Trsm_piv_task(), and FLASH_Queue_get_enabled().

Referenced by FLASH_LU_incpiv_var1(), and FLASH_LU_incpiv_var2().