/* Reference implementation of a compact (interleaved) batched STRSM.
 *
 * The batch is processed in groups of *VLEN matrices. For every lane of every
 * group, one A and one B matrix are unpacked into dense temporaries with
 * scopy_to_temp(), the reference strsm_() is applied to the temporaries, and
 * the updated B is written back in place with scopy_from_temp().
 *
 * layout           forwarded to the pack/unpack helpers (101 == row major)
 * side, uplo,
 * transa, diag,
 * m, n, alpha      forwarded unchanged to strsm_()
 * A, lda           packed A matrices; each is unpacked as asize x asize, where
 *                  asize is *m when side is 'L'/'l' and *n otherwise
 * B, ldb           packed B matrices (unpacked as *m x *n), overwritten
 * nmat             number of matrices in the batch
 * VLEN             number of interleaved matrices per group
 *
 * When *nmat is not a multiple of *VLEN the last group is still processed for
 * all *VLEN lanes, so the packed buffers presumably have to be padded to a
 * full group - confirm against the caller.
 *
 * Nothing is done when *nmat or *VLEN is zero.
 *
 * Note: if layout==101 (row major), then this code is known to only work when
 *       nmat == VLEN. To check for accuracy otherwise, transpose everything */
LIBXSMM_INLINE
void compact_strsm_ ( unsigned int *layout, char *side, char *uplo,
                      char *transa, char *diag, unsigned int *m,
                      unsigned int *n, float *alpha, float *A,
                      unsigned int *lda, float *B, unsigned int *ldb,
                      unsigned int *nmat, unsigned int *VLEN )
{
    unsigned int vlen, a_dim, ngroups, group, lane;
    size_t a_stride, b_stride;
    int asize;
    float *Ap, *Bp, Atemp[BUFSIZE], Btemp[BUFSIZE];

    /* A zero vector length would never advance through the batch */
    if ( (*nmat == 0) || (*VLEN == 0) ) return;
    vlen = *VLEN;

    /* The triangular matrix is m x m for a left-side solve, n x n otherwise */
    if ( (*side == 'L') || (*side == 'l') ) a_dim = *m;
    else a_dim = *n;
    /* strsm_() receives the leading dimension of Atemp through this int */
    asize = (int)a_dim;

    /* Element distance between two consecutive groups; computed in size_t so
     * that large batches do not wrap around in 32-bit arithmetic */
    a_stride = (size_t)(*lda) * (size_t)a_dim * (size_t)vlen;
    b_stride = (size_t)(*ldb) * (size_t)(*n) * (size_t)vlen;

    /* Number of groups, rounded up */
    ngroups = (*nmat) / vlen + ( ((*nmat) % vlen) != 0 ? 1 : 0 );

    for ( group = 0 ; group < ngroups ; group++ )
    {
       for ( lane = 0 ; lane < vlen ; lane++ )
       {
           /* Unpack the data, call a reference STRSM, repack the data */
           Ap = A + (size_t)group * a_stride + lane;
           Bp = B + (size_t)group * b_stride + lane;
           scopy_to_temp ( *layout, Ap, *lda, asize, asize, Atemp, vlen );
           scopy_to_temp ( *layout, Bp, *ldb, *m, *n, Btemp, vlen );
           /* The dense temporaries use asize and *m as leading dimensions */
           strsm_ ( side, uplo, transa, diag, m, n, alpha, Atemp, &asize, Btemp, m);
           scopy_from_temp ( *layout, Bp, *ldb, *m, *n, Btemp, vlen );
       }
    }
}
/* Reference implementation of a compact (interleaved) batched SGEMM.
 *
 * The batch is processed in groups of *VLEN matrices. For every lane of every
 * group, one A, B and C matrix are unpacked into dense temporaries with
 * scopy_to_temp(), the reference sgemm_() is applied to the temporaries, and
 * the updated C is written back in place with scopy_from_temp().
 *
 * transa, transb,
 * m, n, k,
 * alpha, beta      forwarded unchanged to sgemm_()
 * layout           forwarded to the pack/unpack helpers (101 == row major)
 * A, lda           packed A matrices, unpacked as *m x *k
 * B, ldb           packed B matrices, unpacked as *k x *n
 * C, ldc           packed C matrices (unpacked as *m x *n), overwritten
 * nmat             number of matrices in the batch
 * VLEN             number of interleaved matrices per group
 *
 * NOTE(review): A and B are always unpacked as m x k and k x n, and sgemm_()
 * is given leading dimensions m, k and m, independent of transa/transb -
 * confirm that callers only rely on the non-transposed case.
 *
 * When *nmat is not a multiple of *VLEN the last group is still processed for
 * all *VLEN lanes, so the packed buffers presumably have to be padded to a
 * full group - confirm against the caller.
 *
 * On the very first call a banner and the problem parameters are printed to
 * stdout. Nothing is computed when *nmat or *VLEN is zero.
 *
 * Note: if layout==101 (row major), then this code is known to only work when
 *       nmat == VLEN. To check for accuracy otherwise, transpose everything */
LIBXSMM_INLINE
void compact_sgemm_ ( char *transa, char *transb, unsigned int *layout,
                      unsigned int *m, unsigned int *n, unsigned int *k,
                      float *alpha, float *A, unsigned int *lda,
                      float *B, unsigned int *ldb, float *beta,
                      float *C, unsigned int *ldc,
                      unsigned int *nmat, unsigned int *VLEN )
{
    /* One-shot flag: the diagnostics below are emitted on the first call only */
    static int announced = 0;
    unsigned int vlen, ngroups, group, lane;
    size_t a_stride, b_stride, c_stride;
    float *Ap, Atemp[BUFSIZE];
    float *Bp, Btemp[BUFSIZE];
    float *Cp, Ctemp[BUFSIZE];

    if ( !announced )
    {
       announced = 1;
       printf("Inside reference compact_sgemm_()\n");
       /* All arguments are unsigned int, hence %u */
       printf("layout=%u m/n/k=%u %u %u lda/b/c=%u %u %u nmat=%u VLEN=%u\n",*layout,*m,*n,*k,*lda,*ldb,*ldc,*nmat,*VLEN);
    }

    /* A zero vector length would never advance through the batch */
    if ( (*nmat == 0) || (*VLEN == 0) ) return;
    vlen = *VLEN;

    /* Element distance between two consecutive groups; computed in size_t so
     * that large batches do not wrap around in 32-bit arithmetic */
    a_stride = (size_t)(*lda) * (size_t)(*k) * (size_t)vlen;
    b_stride = (size_t)(*ldb) * (size_t)(*n) * (size_t)vlen;
    c_stride = (size_t)(*ldc) * (size_t)(*n) * (size_t)vlen;

    /* Number of groups, rounded up */
    ngroups = (*nmat) / vlen + ( ((*nmat) % vlen) != 0 ? 1 : 0 );

    for ( group = 0 ; group < ngroups ; group++ )
    {
       for ( lane = 0 ; lane < vlen ; lane++ )
       {
           /* Unpack the data, call a reference SGEMM, repack the data */
           Ap = A + (size_t)group * a_stride + lane;
           Bp = B + (size_t)group * b_stride + lane;
           Cp = C + (size_t)group * c_stride + lane;
           scopy_to_temp ( *layout, Ap, *lda, *m, *k, Atemp, vlen );
           scopy_to_temp ( *layout, Bp, *ldb, *k, *n, Btemp, vlen );
           scopy_to_temp ( *layout, Cp, *ldc, *m, *n, Ctemp, vlen );
           /* The dense temporaries use *m, *k and *m as leading dimensions */
           sgemm_ ( transa, transb, m, n, k, alpha, Atemp, m, Btemp, k, beta, Ctemp, m );
           scopy_from_temp ( *layout, Cp, *ldc, *m, *n, Ctemp, vlen );
       }
    }
}