libflame  revision_anchor
Functions
bl1_hemm.c File Reference

(r)

Functions

void bl1_shemm (side1_t side, uplo1_t uplo, int m, int n, float *alpha, float *a, int a_rs, int a_cs, float *b, int b_rs, int b_cs, float *beta, float *c, int c_rs, int c_cs)
 
void bl1_dhemm (side1_t side, uplo1_t uplo, int m, int n, double *alpha, double *a, int a_rs, int a_cs, double *b, int b_rs, int b_cs, double *beta, double *c, int c_rs, int c_cs)
 
void bl1_chemm (side1_t side, uplo1_t uplo, int m, int n, scomplex *alpha, scomplex *a, int a_rs, int a_cs, scomplex *b, int b_rs, int b_cs, scomplex *beta, scomplex *c, int c_rs, int c_cs)
 
void bl1_zhemm (side1_t side, uplo1_t uplo, int m, int n, dcomplex *alpha, dcomplex *a, int a_rs, int a_cs, dcomplex *b, int b_rs, int b_cs, dcomplex *beta, dcomplex *c, int c_rs, int c_cs)
 
void bl1_chemm_blas (side1_t side, uplo1_t uplo, int m, int n, scomplex *alpha, scomplex *a, int lda, scomplex *b, int ldb, scomplex *beta, scomplex *c, int ldc)
 
void bl1_zhemm_blas (side1_t side, uplo1_t uplo, int m, int n, dcomplex *alpha, dcomplex *a, int lda, dcomplex *b, int ldb, dcomplex *beta, dcomplex *c, int ldc)
 

Function Documentation

◆ bl1_chemm()

void bl1_chemm ( side1_t  side,
uplo1_t  uplo,
int  m,
int  n,
scomplex *  alpha,
scomplex *  a,
int  a_rs,
int  a_cs,
scomplex *  b,
int  b_rs,
int  b_cs,
scomplex *  beta,
scomplex *  c,
int  c_rs,
int  c_cs 
)
39 {
40  int m_save = m;
41  int n_save = n;
42  scomplex* a_save = a;
43  scomplex* b_save = b;
44  scomplex* c_save = c;
45  int a_rs_save = a_rs;
46  int a_cs_save = a_cs;
47  int b_rs_save = b_rs;
48  int b_cs_save = b_cs;
49  int c_rs_save = c_rs;
50  int c_cs_save = c_cs;
51  scomplex zero = bl1_c0();
52  scomplex one = bl1_c1();
53  scomplex* a_conj;
54  scomplex* b_copy;
55  scomplex* c_trans;
56  int dim_a;
57  int lda, inca;
58  int ldb, incb;
59  int ldc, incc;
60  int lda_conj, inca_conj;
61  int ldb_copy, incb_copy;
62  int ldc_trans, incc_trans;
63  int hemm_needs_conja = FALSE;
64  int hemm_needs_copyb = FALSE;
65  int hemm_needs_transb = FALSE;
66  int hemm_needs_axpyt = FALSE;
67  int a_was_copied;
68 
69  // Return early if possible.
70  if ( bl1_zero_dim2( m, n ) ) return;
71 
72  // If necessary, allocate, initialize, and use a temporary contiguous
73  // copy of each matrix rather than the original matrices.
74  bl1_set_dim_with_side( side, m, n, &dim_a );
75  bl1_ccreate_contigmr( uplo,
76  dim_a,
77  dim_a,
78  a_save, a_rs_save, a_cs_save,
79  &a, &a_rs, &a_cs );
80 
81  bl1_ccreate_contigm( m,
82  n,
83  b_save, b_rs_save, b_cs_save,
84  &b, &b_rs, &b_cs );
85 
86  bl1_ccreate_contigm( m,
87  n,
88  c_save, c_rs_save, c_cs_save,
89  &c, &c_rs, &c_cs );
90 
91  // Figure out whether A was copied to contiguous memory. This is used to
92  // prevent redundant copying.
93  a_was_copied = ( a != a_save );
94 
95  // Initialize with values assuming column-major storage.
96  lda = a_cs;
97  inca = a_rs;
98  ldb = b_cs;
99  incb = b_rs;
100  ldc = c_cs;
101  incc = c_rs;
102 
103  // Adjust the parameters based on the storage of each matrix.
104  if ( bl1_is_col_storage( c_rs, c_cs ) )
105  {
106  if ( bl1_is_col_storage( a_rs, a_cs ) )
107  {
108  if ( bl1_is_col_storage( b_rs, b_cs ) )
109  {
110  // requested operation: C_c += uplo( A_c ) * B_c
111  // effective operation: C_c += uplo( A_c ) * B_c
112  }
113  else // if ( bl1_is_row_storage( b_rs, b_cs ) )
114  {
115  // requested operation: C_c += uplo( A_c ) * B_r
116  // effective operation: C_c += uplo( A_c ) * B_c
117  hemm_needs_copyb = TRUE;
118  }
119  }
120  else // if ( bl1_is_row_storage( a_rs, a_cs ) )
121  {
122  if ( bl1_is_col_storage( b_rs, b_cs ) )
123  {
124  // requested operation: C_c += uplo( A_r ) * B_c
125  // effective operation: C_c += ~uplo( conj( A_c ) ) * B_c
126  bl1_swap_ints( lda, inca );
127 
128  bl1_toggle_uplo( uplo );
129 
130  hemm_needs_conja = TRUE;
131  }
132  else // if ( bl1_is_row_storage( b_rs, b_cs ) )
133  {
134  // requested operation: C_c += uplo( A_r ) * B_r
135  // effective operation: C_c += ( B_c * ~uplo( conj( A_c ) ) )^T
136  bl1_swap_ints( lda, inca );
137  bl1_swap_ints( ldb, incb );
138 
139  bl1_toggle_side( side );
140  bl1_toggle_uplo( uplo );
141 
142  hemm_needs_axpyt = TRUE;
143  }
144  }
145  }
146  else // if ( bl1_is_row_storage( c_rs, c_cs ) )
147  {
148  if ( bl1_is_col_storage( a_rs, a_cs ) )
149  {
150  if ( bl1_is_col_storage( b_rs, b_cs ) )
151  {
152  // requested operation: C_r += uplo( A_c ) * B_c
153  // effective operation: C_c += ( uplo( A_c ) * B_c )^T
154  bl1_swap_ints( ldc, incc );
155 
156  bl1_swap_ints( m, n );
157 
158  hemm_needs_axpyt = TRUE;
159  }
160  else // if ( bl1_is_row_storage( b_rs, b_cs ) )
161  {
162  // requested operation: C_r += uplo( A_c ) * B_r
163  // effective operation: C_c += B_c * ~uplo( conj( A_c ) )
164  bl1_swap_ints( ldc, incc );
165  bl1_swap_ints( ldb, incb );
166 
167  bl1_swap_ints( m, n );
168 
169  bl1_toggle_side( side );
170 
171  hemm_needs_conja = TRUE;
172  }
173  }
174  else // if ( bl1_is_row_storage( a_rs, a_cs ) )
175  {
176  if ( bl1_is_col_storage( b_rs, b_cs ) )
177  {
178  // requested operation: C_r += uplo( A_r ) * B_c
179  // effective operation: C_c += B_c^T * ~uplo( A_c )
180  bl1_swap_ints( ldc, incc );
181  bl1_swap_ints( lda, inca );
182 
183  bl1_swap_ints( m, n );
184 
185  bl1_toggle_side( side );
186  bl1_toggle_uplo( uplo );
187 
188  hemm_needs_copyb = TRUE;
189  hemm_needs_transb = TRUE;
190  }
191  else // if ( bl1_is_row_storage( b_rs, b_cs ) )
192  {
193  // requested operation: C_r += uplo( A_r ) * B_r
194  // effective operation: C_c += B_c * conj( ~uplo( A_c ) )
195  bl1_swap_ints( ldc, incc );
196  bl1_swap_ints( lda, inca );
197  bl1_swap_ints( ldb, incb );
198 
199  bl1_swap_ints( m, n );
200 
201  bl1_toggle_uplo( uplo );
202  bl1_toggle_side( side );
203  }
204  }
205  }
206 
207  // We need a temporary matrix for the cases where A is conjugated.
208  a_conj = a;
209  lda_conj = lda;
210  inca_conj = inca;
211 
212  if ( hemm_needs_conja && !a_was_copied )
213  {
214  int dim_a;
215 
216  bl1_set_dim_with_side( side, m, n, &dim_a );
217 
218  a_conj = bl1_callocm( dim_a, dim_a );
219  lda_conj = dim_a;
220  inca_conj = 1;
221 
222  bl1_ccopymrt( uplo,
223  BLIS1_CONJ_NO_TRANSPOSE,
224  dim_a,
225  dim_a,
226  a, inca, lda,
227  a_conj, inca_conj, lda_conj );
228  }
229  else if ( hemm_needs_conja && a_was_copied )
230  {
231  int dim_a;
232 
233  bl1_set_dim_with_side( side, m, n, &dim_a );
234 
235  bl1_cconjmr( uplo,
236  dim_a,
237  dim_a,
238  a_conj, inca_conj, lda_conj );
239  }
240 
241  // We need a temporary matrix for the cases where B needs to be copied.
242  b_copy = b;
243  ldb_copy = ldb;
244  incb_copy = incb;
245 
246  // There are two cases where we need to make a copy of B: one where the
247  // copy's dimensions are transposed from the original B, and one where
248  // the dimensions are not swapped.
249  if ( hemm_needs_copyb )
250  {
251  trans1_t transb;
252 
253  // Set transb, which determines whether or not we need to copy from B
254  // as if it needs a transposition. If a transposition is needed, then
255  // m and n have already been swapped. So in either case m
256  // represents the leading dimension of the copy.
257  if ( hemm_needs_transb ) transb = BLIS1_TRANSPOSE;
258  else transb = BLIS1_NO_TRANSPOSE;
259 
260  b_copy = bl1_callocm( m, n );
261  ldb_copy = m;
262  incb_copy = 1;
263 
264  bl1_ccopymt( transb,
265  m,
266  n,
267  b, incb, ldb,
268  b_copy, incb_copy, ldb_copy );
269  }
270 
271  // There are two cases where we need to perform the hemm and then axpy
272  // the result into C with a transposition. We handle those cases here.
273  if ( hemm_needs_axpyt )
274  {
275  // We need a temporary matrix for holding C^T. Notice that m and n
276  // represent the dimensions of C, and thus C_trans is n-by-m
277  // (interpreting both as column-major matrices). So the leading
278  // dimension of the temporary matrix holding C^T is n.
279  c_trans = bl1_callocm( n, m );
280  ldc_trans = n;
281  incc_trans = 1;
282 
283  // Compute A * B (or B * A) and store the result in C_trans.
284  // Note that there is no overlap between the axpyt cases and
285  // the conja/copyb cases, hence the use of a, b, lda, and ldb.
286  bl1_chemm_blas( side,
287  uplo,
288  n,
289  m,
290  alpha,
291  a, lda,
292  b, ldb,
293  &zero,
294  c_trans, ldc_trans );
295 
296  // Scale C by beta.
297  bl1_cscalm( BLIS1_NO_CONJUGATE,
298  m,
299  n,
300  beta,
301  c, incc, ldc );
302 
303  // And finally, accumulate the matrix product in C_trans into C
304  // with a transpose.
305  bl1_caxpymt( BLIS1_TRANSPOSE,
306  m,
307  n,
308  &one,
309  c_trans, incc_trans, ldc_trans,
310  c, incc, ldc );
311 
312  // Free the temporary matrix for C.
313  bl1_cfree( c_trans );
314  }
315  else // no extra axpyt step needed
316  {
317  bl1_chemm_blas( side,
318  uplo,
319  m,
320  n,
321  alpha,
322  a_conj, lda_conj,
323  b_copy, ldb_copy,
324  beta,
325  c, ldc );
326  }
327 
328  if ( hemm_needs_conja && !a_was_copied )
329  bl1_cfree( a_conj );
330 
331  if ( hemm_needs_copyb )
332  bl1_cfree( b_copy );
333 
334  // Free any temporary contiguous matrices, copying the result back to
335  // the original matrix.
336  bl1_cfree_contigm( a_save, a_rs_save, a_cs_save,
337  &a, &a_rs, &a_cs );
338 
339  bl1_cfree_contigm( b_save, b_rs_save, b_cs_save,
340  &b, &b_rs, &b_cs );
341 
342  bl1_cfree_saved_contigm( m_save,
343  n_save,
344  c_save, c_rs_save, c_cs_save,
345  &c, &c_rs, &c_cs );
346 }
void bl1_caxpymt(trans1_t trans, int m, int n, scomplex *alpha, scomplex *a, int a_rs, int a_cs, scomplex *b, int b_rs, int b_cs)
Definition: bl1_axpymt.c:149
void bl1_cconjmr(uplo1_t uplo, int m, int n, scomplex *a, int a_rs, int a_cs)
Definition: bl1_conjmr.c:23
void bl1_ccopymrt(uplo1_t uplo, trans1_t trans, int m, int n, scomplex *a, int a_rs, int a_cs, scomplex *b, int b_rs, int b_cs)
Definition: bl1_copymrt.c:223
void bl1_ccopymt(trans1_t trans, int m, int n, scomplex *a, int a_rs, int a_cs, scomplex *b, int b_rs, int b_cs)
Definition: bl1_copymt.c:215
void bl1_chemm_blas(side1_t side, uplo1_t uplo, int m, int n, scomplex *alpha, scomplex *a, int lda, scomplex *b, int ldb, scomplex *beta, scomplex *c, int ldc)
Definition: bl1_hemm.c:660
void bl1_cscalm(conj1_t conj, int m, int n, scomplex *alpha, scomplex *a, int a_rs, int a_cs)
Definition: bl1_scalm.c:169
int bl1_is_col_storage(int rs, int cs)
Definition: bl1_is.c:90
int bl1_zero_dim2(int m, int n)
Definition: bl1_is.c:118
scomplex * bl1_callocm(unsigned int m, unsigned int n)
Definition: bl1_allocm.c:40
void bl1_cfree_contigm(scomplex *a_save, int a_rs_save, int a_cs_save, scomplex **a, int *a_rs, int *a_cs)
Definition: bl1_free_contigm.c:45
scomplex bl1_c1(void)
Definition: bl1_constants.c:61
void bl1_cfree(scomplex *p)
Definition: bl1_free.c:40
void bl1_ccreate_contigm(int m, int n, scomplex *a_save, int a_rs_save, int a_cs_save, scomplex **a, int *a_rs, int *a_cs)
Definition: bl1_create_contigm.c:81
scomplex bl1_c0(void)
Definition: bl1_constants.c:125
void bl1_ccreate_contigmr(uplo1_t uplo, int m, int n, scomplex *a_save, int a_rs_save, int a_cs_save, scomplex **a, int *a_rs, int *a_cs)
Definition: bl1_create_contigmr.c:77
void bl1_cfree_saved_contigm(int m, int n, scomplex *a_save, int a_rs_save, int a_cs_save, scomplex **a, int *a_rs, int *a_cs)
Definition: bl1_free_saved_contigm.c:59
void bl1_set_dim_with_side(side1_t side, int m, int n, int *dim_new)
Definition: bl1_set_dims.c:27
trans1_t
Definition: blis_type_defs.h:53
@ BLIS1_NO_TRANSPOSE
Definition: blis_type_defs.h:54
@ BLIS1_TRANSPOSE
Definition: blis_type_defs.h:55
@ BLIS1_CONJ_NO_TRANSPOSE
Definition: blis_type_defs.h:56
@ BLIS1_NO_CONJUGATE
Definition: blis_type_defs.h:81
scomplex
Definition: blis_type_defs.h:133

References bl1_c0(), bl1_c1(), bl1_callocm(), bl1_caxpymt(), bl1_cconjmr(), bl1_ccopymrt(), bl1_ccopymt(), bl1_ccreate_contigm(), bl1_ccreate_contigmr(), bl1_cfree(), bl1_cfree_contigm(), bl1_cfree_saved_contigm(), bl1_chemm_blas(), bl1_cscalm(), bl1_is_col_storage(), bl1_set_dim_with_side(), bl1_zero_dim2(), BLIS1_CONJ_NO_TRANSPOSE, BLIS1_NO_CONJUGATE, BLIS1_NO_TRANSPOSE, and BLIS1_TRANSPOSE.

Referenced by FLA_Hemm_external().

◆ bl1_chemm_blas()

void bl1_chemm_blas ( side1_t  side,
uplo1_t  uplo,
int  m,
int  n,
scomplex *  alpha,
scomplex *  a,
int  lda,
scomplex *  b,
int  ldb,
scomplex *  beta,
scomplex *  c,
int  ldc 
)
661 {
662 #ifdef BLIS1_ENABLE_CBLAS_INTERFACES
663  enum CBLAS_ORDER cblas_order = CblasColMajor;
664  enum CBLAS_SIDE cblas_side;
665  enum CBLAS_UPLO cblas_uplo;
666 
667  bl1_param_map_to_netlib_side( side, &cblas_side );
668  bl1_param_map_to_netlib_uplo( uplo, &cblas_uplo );
669 
670  cblas_chemm( cblas_order,
671  cblas_side,
672  cblas_uplo,
673  m,
674  n,
675  alpha,
676  a, lda,
677  b, ldb,
678  beta,
679  c, ldc );
680 #else
681  char blas_side;
682  char blas_uplo;
683 
684  bl1_param_map_to_netlib_side( side, &blas_side );
685  bl1_param_map_to_netlib_uplo( uplo, &blas_uplo );
686 
687  F77_chemm( &blas_side,
688  &blas_uplo,
689  &m,
690  &n,
691  alpha,
692  a, &lda,
693  b, &ldb,
694  beta,
695  c, &ldc );
696 #endif
697 }
void F77_chemm(char *side, char *uplo, int *m, int *n, scomplex *alpha, scomplex *a, int *lda, scomplex *b, int *ldb, scomplex *beta, scomplex *c, int *ldc)
CBLAS_ORDER
Definition: blis_prototypes_cblas.h:17
@ CblasColMajor
Definition: blis_prototypes_cblas.h:17
CBLAS_UPLO
Definition: blis_prototypes_cblas.h:19
CBLAS_SIDE
Definition: blis_prototypes_cblas.h:21
void cblas_chemm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const int M, const int N, const void *alpha, const void *A, const int lda, const void *B, const int ldb, const void *beta, void *C, const int ldc)
void bl1_param_map_to_netlib_side(side1_t blis_side, void *blas_side)
Definition: bl1_param_map.c:71
void bl1_param_map_to_netlib_uplo(uplo1_t blis_uplo, void *blas_uplo)
Definition: bl1_param_map.c:47

References bl1_param_map_to_netlib_side(), bl1_param_map_to_netlib_uplo(), cblas_chemm(), CblasColMajor, and F77_chemm().

Referenced by bl1_chemm().

◆ bl1_dhemm()

void bl1_dhemm ( side1_t  side,
uplo1_t  uplo,
int  m,
int  n,
double *  alpha,
double *  a,
int  a_rs,
int  a_cs,
double *  b,
int  b_rs,
int  b_cs,
double *  beta,
double *  c,
int  c_rs,
int  c_cs 
)
26 {
27  bl1_dsymm( side,
28  uplo,
29  m,
30  n,
31  alpha,
32  a, a_rs, a_cs,
33  b, b_rs, b_cs,
34  beta,
35  c, c_rs, c_cs );
36 }
void bl1_dsymm(side1_t side, uplo1_t uplo, int m, int n, double *alpha, double *a, int a_rs, int a_cs, double *b, int b_rs, int b_cs, double *beta, double *c, int c_rs, int c_cs)
Definition: bl1_symm.c:274

References bl1_dsymm().

◆ bl1_shemm()

void bl1_shemm ( side1_t  side,
uplo1_t  uplo,
int  m,
int  n,
float *  alpha,
float *  a,
int  a_rs,
int  a_cs,
float *  b,
int  b_rs,
int  b_cs,
float *  beta,
float *  c,
int  c_rs,
int  c_cs 
)
14 {
15  bl1_ssymm( side,
16  uplo,
17  m,
18  n,
19  alpha,
20  a, a_rs, a_cs,
21  b, b_rs, b_cs,
22  beta,
23  c, c_rs, c_cs );
24 }
void bl1_ssymm(side1_t side, uplo1_t uplo, int m, int n, float *alpha, float *a, int a_rs, int a_cs, float *b, int b_rs, int b_cs, float *beta, float *c, int c_rs, int c_cs)
Definition: bl1_symm.c:13

References bl1_ssymm().

◆ bl1_zhemm()

void bl1_zhemm ( side1_t  side,
uplo1_t  uplo,
int  m,
int  n,
dcomplex *  alpha,
dcomplex *  a,
int  a_rs,
int  a_cs,
dcomplex *  b,
int  b_rs,
int  b_cs,
dcomplex *  beta,
dcomplex *  c,
int  c_rs,
int  c_cs 
)
349 {
350  int m_save = m;
351  int n_save = n;
352  dcomplex* a_save = a;
353  dcomplex* b_save = b;
354  dcomplex* c_save = c;
355  int a_rs_save = a_rs;
356  int a_cs_save = a_cs;
357  int b_rs_save = b_rs;
358  int b_cs_save = b_cs;
359  int c_rs_save = c_rs;
360  int c_cs_save = c_cs;
361  dcomplex zero = bl1_z0();
362  dcomplex one = bl1_z1();
363  dcomplex* a_conj;
364  dcomplex* b_copy;
365  dcomplex* c_trans;
366  int dim_a;
367  int lda, inca;
368  int ldb, incb;
369  int ldc, incc;
370  int lda_conj, inca_conj;
371  int ldb_copy, incb_copy;
372  int ldc_trans, incc_trans;
373  int hemm_needs_conja = FALSE;
374  int hemm_needs_copyb = FALSE;
375  int hemm_needs_transb = FALSE;
376  int hemm_needs_axpyt = FALSE;
377  int a_was_copied;
378 
379  // Return early if possible.
380  if ( bl1_zero_dim2( m, n ) ) return;
381 
382  // If necessary, allocate, initialize, and use a temporary contiguous
383  // copy of each matrix rather than the original matrices.
384  bl1_set_dim_with_side( side, m, n, &dim_a );
385  bl1_zcreate_contigmr( uplo,
386  dim_a,
387  dim_a,
388  a_save, a_rs_save, a_cs_save,
389  &a, &a_rs, &a_cs );
390 
391  bl1_zcreate_contigm( m,
392  n,
393  b_save, b_rs_save, b_cs_save,
394  &b, &b_rs, &b_cs );
395 
396  bl1_zcreate_contigm( m,
397  n,
398  c_save, c_rs_save, c_cs_save,
399  &c, &c_rs, &c_cs );
400 
401  // Figure out whether A was copied to contiguous memory. This is used to
402  // prevent redundant copying.
403  a_was_copied = ( a != a_save );
404 
405  // Initialize with values assuming column-major storage.
406  lda = a_cs;
407  inca = a_rs;
408  ldb = b_cs;
409  incb = b_rs;
410  ldc = c_cs;
411  incc = c_rs;
412 
413  // Adjust the parameters based on the storage of each matrix.
414  if ( bl1_is_col_storage( c_rs, c_cs ) )
415  {
416  if ( bl1_is_col_storage( a_rs, a_cs ) )
417  {
418  if ( bl1_is_col_storage( b_rs, b_cs ) )
419  {
420  // requested operation: C_c += uplo( A_c ) * B_c
421  // effective operation: C_c += uplo( A_c ) * B_c
422  }
423  else // if ( bl1_is_row_storage( b_rs, b_cs ) )
424  {
425  // requested operation: C_c += uplo( A_c ) * B_r
426  // effective operation: C_c += uplo( A_c ) * B_c
427  hemm_needs_copyb = TRUE;
428  }
429  }
430  else // if ( bl1_is_row_storage( a_rs, a_cs ) )
431  {
432  if ( bl1_is_col_storage( b_rs, b_cs ) )
433  {
434  // requested operation: C_c += uplo( A_r ) * B_c
435  // effective operation: C_c += ~uplo( conj( A_c ) ) * B_c
436  bl1_swap_ints( lda, inca );
437 
438  bl1_toggle_uplo( uplo );
439 
440  hemm_needs_conja = TRUE;
441  }
442  else // if ( bl1_is_row_storage( b_rs, b_cs ) )
443  {
444  // requested operation: C_c += uplo( A_r ) * B_r
445  // effective operation: C_c += ( B_c * ~uplo( conj( A_c ) ) )^T
446  bl1_swap_ints( lda, inca );
447  bl1_swap_ints( ldb, incb );
448 
449  bl1_toggle_side( side );
450  bl1_toggle_uplo( uplo );
451 
452  hemm_needs_axpyt = TRUE;
453  }
454  }
455  }
456  else // if ( bl1_is_row_storage( c_rs, c_cs ) )
457  {
458  if ( bl1_is_col_storage( a_rs, a_cs ) )
459  {
460  if ( bl1_is_col_storage( b_rs, b_cs ) )
461  {
462  // requested operation: C_r += uplo( A_c ) * B_c
463  // effective operation: C_c += ( uplo( A_c ) * B_c )^T
464  bl1_swap_ints( ldc, incc );
465 
466  bl1_swap_ints( m, n );
467 
468  hemm_needs_axpyt = TRUE;
469  }
470  else // if ( bl1_is_row_storage( b_rs, b_cs ) )
471  {
472  // requested operation: C_r += uplo( A_c ) * B_r
473  // effective operation: C_c += B_c * ~uplo( conj( A_c ) )
474  bl1_swap_ints( ldc, incc );
475  bl1_swap_ints( ldb, incb );
476 
477  bl1_swap_ints( m, n );
478 
479  bl1_toggle_side( side );
480 
481  hemm_needs_conja = TRUE;
482  }
483  }
484  else // if ( bl1_is_row_storage( a_rs, a_cs ) )
485  {
486  if ( bl1_is_col_storage( b_rs, b_cs ) )
487  {
488  // requested operation: C_r += uplo( A_r ) * B_c
489  // effective operation: C_c += B_c^T * ~uplo( A_c )
490  bl1_swap_ints( ldc, incc );
491  bl1_swap_ints( lda, inca );
492 
493  bl1_swap_ints( m, n );
494 
495  bl1_toggle_side( side );
496  bl1_toggle_uplo( uplo );
497 
498  hemm_needs_copyb = TRUE;
499  hemm_needs_transb = TRUE;
500  }
501  else // if ( bl1_is_row_storage( b_rs, b_cs ) )
502  {
503  // requested operation: C_r += uplo( A_r ) * B_r
504  // effective operation: C_c += B_c * conj( ~uplo( A_c ) )
505  bl1_swap_ints( ldc, incc );
506  bl1_swap_ints( lda, inca );
507  bl1_swap_ints( ldb, incb );
508 
509  bl1_swap_ints( m, n );
510 
511  bl1_toggle_uplo( uplo );
512  bl1_toggle_side( side );
513  }
514  }
515  }
516 
517  // We need a temporary matrix for the cases where A is conjugated.
518  a_conj = a;
519  lda_conj = lda;
520  inca_conj = inca;
521 
522  if ( hemm_needs_conja && !a_was_copied )
523  {
524  int dim_a;
525 
526  bl1_set_dim_with_side( side, m, n, &dim_a );
527 
528  a_conj = bl1_zallocm( dim_a, dim_a );
529  lda_conj = dim_a;
530  inca_conj = 1;
531 
532  bl1_zcopymrt( uplo,
533  BLIS1_CONJ_NO_TRANSPOSE,
534  dim_a,
535  dim_a,
536  a, inca, lda,
537  a_conj, inca_conj, lda_conj );
538  }
539  else if ( hemm_needs_conja && a_was_copied )
540  {
541  int dim_a;
542 
543  bl1_set_dim_with_side( side, m, n, &dim_a );
544 
545  bl1_zconjmr( uplo,
546  dim_a,
547  dim_a,
548  a_conj, inca_conj, lda_conj );
549  }
550 
551  // We need a temporary matrix for the cases where B needs to be copied.
552  b_copy = b;
553  ldb_copy = ldb;
554  incb_copy = incb;
555 
556  // There are two cases where we need to make a copy of B: one where the
557  // copy's dimensions are transposed from the original B, and one where
558  // the dimensions are not swapped.
559  if ( hemm_needs_copyb )
560  {
561  trans1_t transb;
562 
563  // Set transb, which determines whether or not we need to copy from B
564  // as if it needs a transposition. If a transposition is needed, then
565  // m and n have already been swapped. So in either case m
566  // represents the leading dimension of the copy.
567  if ( hemm_needs_transb ) transb = BLIS1_TRANSPOSE;
568  else transb = BLIS1_NO_TRANSPOSE;
569 
570  b_copy = bl1_zallocm( m, n );
571  ldb_copy = m;
572  incb_copy = 1;
573 
574  bl1_zcopymt( transb,
575  m,
576  n,
577  b, incb, ldb,
578  b_copy, incb_copy, ldb_copy );
579  }
580 
581  // There are two cases where we need to perform the hemm and then axpy
582  // the result into C with a transposition. We handle those cases here.
583  if ( hemm_needs_axpyt )
584  {
585  // We need a temporary matrix for holding C^T. Notice that m and n
586  // represent the dimensions of C, and thus C_trans is n-by-m
587  // (interpreting both as column-major matrices). So the leading
588  // dimension of the temporary matrix holding C^T is n.
589  c_trans = bl1_zallocm( n, m );
590  ldc_trans = n;
591  incc_trans = 1;
592 
593  // Compute A * B (or B * A) and store the result in C_trans.
594  // Note that there is no overlap between the axpyt cases and
595  // the conja/copyb cases, hence the use of a, b, lda, and ldb.
596  bl1_zhemm_blas( side,
597  uplo,
598  n,
599  m,
600  alpha,
601  a, lda,
602  b, ldb,
603  &zero,
604  c_trans, ldc_trans );
605 
606  // Scale C by beta.
607  bl1_zscalm( BLIS1_NO_CONJUGATE,
608  m,
609  n,
610  beta,
611  c, incc, ldc );
612 
613  // And finally, accumulate the matrix product in C_trans into C
614  // with a transpose.
615  bl1_zaxpymt( BLIS1_TRANSPOSE,
616  m,
617  n,
618  &one,
619  c_trans, incc_trans, ldc_trans,
620  c, incc, ldc );
621 
622  // Free the temporary matrix for C.
623  bl1_zfree( c_trans );
624  }
625  else // no extra axpyt step needed
626  {
627  bl1_zhemm_blas( side,
628  uplo,
629  m,
630  n,
631  alpha,
632  a_conj, lda_conj,
633  b_copy, ldb_copy,
634  beta,
635  c, ldc );
636  }
637 
638  if ( hemm_needs_conja && !a_was_copied )
639  bl1_zfree( a_conj );
640 
641  if ( hemm_needs_copyb )
642  bl1_zfree( b_copy );
643 
644  // Free any temporary contiguous matrices, copying the result back to
645  // the original matrix.
646  bl1_zfree_contigm( a_save, a_rs_save, a_cs_save,
647  &a, &a_rs, &a_cs );
648 
649  bl1_zfree_contigm( b_save, b_rs_save, b_cs_save,
650  &b, &b_rs, &b_cs );
651 
652  bl1_zfree_saved_contigm( m_save,
653  n_save,
654  c_save, c_rs_save, c_cs_save,
655  &c, &c_rs, &c_cs );
656 }
void bl1_zaxpymt(trans1_t trans, int m, int n, dcomplex *alpha, dcomplex *a, int a_rs, int a_cs, dcomplex *b, int b_rs, int b_cs)
Definition: bl1_axpymt.c:248
void bl1_zconjmr(uplo1_t uplo, int m, int n, dcomplex *a, int a_rs, int a_cs)
Definition: bl1_conjmr.c:79
void bl1_zcopymrt(uplo1_t uplo, trans1_t trans, int m, int n, dcomplex *a, int a_rs, int a_cs, dcomplex *b, int b_rs, int b_cs)
Definition: bl1_copymrt.c:328
void bl1_zcopymt(trans1_t trans, int m, int n, dcomplex *a, int a_rs, int a_cs, dcomplex *b, int b_rs, int b_cs)
Definition: bl1_copymt.c:286
void bl1_zhemm_blas(side1_t side, uplo1_t uplo, int m, int n, dcomplex *alpha, dcomplex *a, int lda, dcomplex *b, int ldb, dcomplex *beta, dcomplex *c, int ldc)
Definition: bl1_hemm.c:699
void bl1_zscalm(conj1_t conj, int m, int n, dcomplex *alpha, dcomplex *a, int a_rs, int a_cs)
Definition: bl1_scalm.c:273
dcomplex bl1_z0(void)
Definition: bl1_constants.c:133
dcomplex * bl1_zallocm(unsigned int m, unsigned int n)
Definition: bl1_allocm.c:45
dcomplex bl1_z1(void)
Definition: bl1_constants.c:69
void bl1_zcreate_contigm(int m, int n, dcomplex *a_save, int a_rs_save, int a_cs_save, dcomplex **a, int *a_rs, int *a_cs)
Definition: bl1_create_contigm.c:115
void bl1_zcreate_contigmr(uplo1_t uplo, int m, int n, dcomplex *a_save, int a_rs_save, int a_cs_save, dcomplex **a, int *a_rs, int *a_cs)
Definition: bl1_create_contigmr.c:109
void bl1_zfree(dcomplex *p)
Definition: bl1_free.c:45
void bl1_zfree_contigm(dcomplex *a_save, int a_rs_save, int a_cs_save, dcomplex **a, int *a_rs, int *a_cs)
Definition: bl1_free_contigm.c:61
void bl1_zfree_saved_contigm(int m, int n, dcomplex *a_save, int a_rs_save, int a_cs_save, dcomplex **a, int *a_rs, int *a_cs)
Definition: bl1_free_saved_contigm.c:82
dcomplex
Definition: blis_type_defs.h:138

References bl1_is_col_storage(), bl1_set_dim_with_side(), bl1_z0(), bl1_z1(), bl1_zallocm(), bl1_zaxpymt(), bl1_zconjmr(), bl1_zcopymrt(), bl1_zcopymt(), bl1_zcreate_contigm(), bl1_zcreate_contigmr(), bl1_zero_dim2(), bl1_zfree(), bl1_zfree_contigm(), bl1_zfree_saved_contigm(), bl1_zhemm_blas(), bl1_zscalm(), BLIS1_CONJ_NO_TRANSPOSE, BLIS1_NO_CONJUGATE, BLIS1_NO_TRANSPOSE, and BLIS1_TRANSPOSE.

Referenced by FLA_Hemm_external().

◆ bl1_zhemm_blas()

void bl1_zhemm_blas ( side1_t  side,
uplo1_t  uplo,
int  m,
int  n,
dcomplex *  alpha,
dcomplex *  a,
int  lda,
dcomplex *  b,
int  ldb,
dcomplex *  beta,
dcomplex *  c,
int  ldc 
)
700 {
701 #ifdef BLIS1_ENABLE_CBLAS_INTERFACES
702  enum CBLAS_ORDER cblas_order = CblasColMajor;
703  enum CBLAS_SIDE cblas_side;
704  enum CBLAS_UPLO cblas_uplo;
705 
706  bl1_param_map_to_netlib_side( side, &cblas_side );
707  bl1_param_map_to_netlib_uplo( uplo, &cblas_uplo );
708 
709  cblas_zhemm( cblas_order,
710  cblas_side,
711  cblas_uplo,
712  m,
713  n,
714  alpha,
715  a, lda,
716  b, ldb,
717  beta,
718  c, ldc );
719 #else
720  char blas_side;
721  char blas_uplo;
722 
723  bl1_param_map_to_netlib_side( side, &blas_side );
724  bl1_param_map_to_netlib_uplo( uplo, &blas_uplo );
725 
726  F77_zhemm( &blas_side,
727  &blas_uplo,
728  &m,
729  &n,
730  alpha,
731  a, &lda,
732  b, &ldb,
733  beta,
734  c, &ldc );
735 #endif
736 }
void F77_zhemm(char *side, char *uplo, int *m, int *n, dcomplex *alpha, dcomplex *a, int *lda, dcomplex *b, int *ldb, dcomplex *beta, dcomplex *c, int *ldc)
void cblas_zhemm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const int M, const int N, const void *alpha, const void *A, const int lda, const void *B, const int ldb, const void *beta, void *C, const int ldc)

References bl1_param_map_to_netlib_side(), bl1_param_map_to_netlib_uplo(), cblas_zhemm(), CblasColMajor, and F77_zhemm().

Referenced by bl1_zhemm().