/** @return true if either real(x) or imag(x) is INF. */
inline bool magma_s_isinf( float x )
{
#ifdef COMPLEX
    return isinf( MAGMA_S_REAL( x )) ||
           isinf( MAGMA_S_IMAG( x ));
#else
    return isinf( x );
#endif
}
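// The magma_smake_spd helper below indexes its host matrix through an
// A(i,j) macro whose definition was not part of this excerpt. This is a
// minimal column-major reconstruction, consistent with how it is used:
#define A(i,j)  A[(i) + (j)*lda]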
// --------------------
// Make a matrix symmetric positive definite: add N to each diagonal entry
// and mirror the lower triangle into the upper triangle. For entries
// bounded by 1 in magnitude (e.g., uniform (0,1) random data), the result
// is strictly diagonally dominant with positive diagonal, hence SPD.
void magma_smake_spd( magma_int_t N, float* A, magma_int_t lda )
{
    magma_int_t i, j;
    for( i=0; i < N; ++i ) {
        A(i,i) = MAGMA_S_MAKE( MAGMA_S_REAL( A(i,i) ) + N, MAGMA_S_IMAG( A(i,i) ) );
        for( j=0; j < i; ++j ) {
            A(j,i) = A(i,j);
        }
    }
}

#undef A  // magma_sprint below redefines A(i,j) with a different body
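/* Usage sketch (hypothetical test fragment, not from the original source;
   assumes <stdlib.h> and the MAGMA LAPACK wrappers are available): fill a
   matrix with uniform random entries via LAPACK's slarnv, then make it
   symmetric positive definite. Sizes and the helper name are ours. */
static void example_smake_spd( magma_int_t N )
{
    magma_int_t lda = N, n2 = lda*N;
    magma_int_t ione = 1, ISEED[4] = {0, 0, 0, 1};
    float *hA = (float*) malloc( n2 * sizeof(float) );
    lapackf77_slarnv( &ione, ISEED, &n2, hA );  // uniform (0,1) entries
    magma_smake_spd( N, hA, lda );              // symmetrize, boost diagonal
    free( hA );
}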
// --------------------
// Print a matrix in MATLAB-style "[ ... ];" format.
void magma_sprint( magma_int_t m, magma_int_t n, const float *A, magma_int_t lda )
{
#define A(i,j) (A + (i) + (j)*lda)

    magma_int_t info = 0;
    if ( m < 0 )
        info = -1;
    else if ( n < 0 )
        info = -2;
    else if ( lda < max(1,m) )
        info = -4;

    if (info != 0) {
        magma_xerbla( __func__, -(info) );
        return;
    }

    float c_zero = MAGMA_S_ZERO;

    if ( m == 1 ) {
        printf( "[ " );
    }
    else {
        printf( "[\n" );
    }
    for( int i = 0; i < m; ++i ) {
        for( int j = 0; j < n; ++j ) {
            if ( MAGMA_S_EQUAL( *A(i,j), c_zero )) {
                // pad the "0." to match the width of the numeric field
#ifdef COMPLEX
                printf( "   0.              " );
#else
                printf( "   0.    " );
#endif
            }
            else {
#ifdef COMPLEX
                printf( " %8.4f+%8.4fi", MAGMA_S_REAL( *A(i,j) ), MAGMA_S_IMAG( *A(i,j) ));
#else
                printf( " %8.4f", MAGMA_S_REAL( *A(i,j) ));
#endif
            }
        }
        if ( m > 1 ) {
            printf( "\n" );
        }
        else {
            printf( " " );
        }
    }
    printf( "];\n" );
}

#undef A
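/* Usage sketch (hypothetical): print a small 2 x 3 column-major matrix.
   The commented output is indicative of the MATLAB-style format produced. */
static void example_sprint( void )
{
    float M[6] = { 1, 2, 3, 4, 5, 0 };   // 2 x 3, column-major, lda = 2
    magma_sprint( 2, 3, M, 2 );
    // prints something like:
    // [
    //    1.0000   3.0000   5.0000
    //    2.0000   4.0000   0.
    // ];
}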
extern "C" magma_int_t magma_strsm_m (magma_int_t nrgpu, char side, char uplo, char transa, char diag, magma_int_t m, magma_int_t n, float alpha, float *a, magma_int_t lda, float *b, magma_int_t ldb) { /* -- MAGMA (version 1.4.1) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver December 2013 Purpose ======= STRSM solves one of the matrix equations op( A )*X = alpha*B, or X*op( A ) = alpha*B, where alpha is a scalar, X and B are m by n matrices, A is a unit, or non-unit, upper or lower triangular matrix and op( A ) is one of op( A ) = A or op( A ) = A' or op( A ) = ( A' ). The matrix X is overwritten on B. Parameters ========== SIDE CHARACTER*1. On entry, SIDE specifies whether op( A ) appears on the left or right of X as follows: SIDE = 'L' or 'l' op( A )*X = alpha*B. SIDE = 'R' or 'r' X*op( A ) = alpha*B. Unchanged on exit. UPLO CHARACTER*1. On entry, UPLO specifies whether the matrix A is an upper or lower triangular matrix as follows: UPLO = 'U' or 'u' A is an upper triangular matrix. UPLO = 'L' or 'l' A is a lower triangular matrix. Unchanged on exit. TRANSA CHARACTER*1. On entry, TRANSA specifies the form of op( A ) to be used in the matrix multiplication as follows: TRANSA = 'N' or 'n' op( A ) = A. TRANSA = 'T' or 't' op( A ) = A'. TRANSA = 'C' or 'c' op( A ) = ( A' ). Unchanged on exit. DIAG CHARACTER*1. On entry, DIAG specifies whether or not A is unit triangular as follows: DIAG = 'U' or 'u' A is assumed to be unit triangular. DIAG = 'N' or 'n' A is not assumed to be unit triangular. Unchanged on exit. M INTEGER. On entry, M specifies the number of rows of B. M must be at least zero. Unchanged on exit. N INTEGER. On entry, N specifies the number of columns of B. N must be at least zero. Unchanged on exit. ALPHA REAL . On entry, ALPHA specifies the scalar alpha. When alpha is zero then A is not referenced and B need not be set before entry. Unchanged on exit. A REAL array of DIMENSION ( LDA, k ), where k is m when SIDE = 'L' or 'l' and is n when SIDE = 'R' or 'r'. Before entry with UPLO = 'U' or 'u', the leading k by k upper triangular part of the array A must contain the upper triangular matrix and the strictly lower triangular part of A is not referenced. Before entry with UPLO = 'L' or 'l', the leading k by k lower triangular part of the array A must contain the lower triangular matrix and the strictly upper triangular part of A is not referenced. Note that when DIAG = 'U' or 'u', the diagonal elements of A are not referenced either, but are assumed to be unity. Unchanged on exit. LDA INTEGER. On entry, LDA specifies the first dimension of A as declared in the calling (sub) program. When SIDE = 'L' or 'l' then LDA >= max( 1, m ), when SIDE = 'R' or 'r' then LDA >= max( 1, n ). Unchanged on exit. B REAL array of DIMENSION ( LDB, n ). Before entry, the leading m by n part of the array B must contain the right-hand side matrix B, and on exit is overwritten by the solution matrix X. LDB INTEGER. On entry, LDB specifies the first dimension of B as declared in the calling (sub) program. LDB must be at least max( 1, m ). Unchanged on exit. 
    =====================================================================    */

    char side_[2]   = {side, 0};
    char uplo_[2]   = {uplo, 0};
    char transa_[2] = {transa, 0};
    char diag_[2]   = {diag, 0};
    float c_one     = MAGMA_S_ONE;
    float c_neg_one = MAGMA_S_NEG_ONE;
    float alpha_;
    float* dw[MagmaMaxGPUs];
    magma_queue_t stream[MagmaMaxGPUs][3];
    magma_int_t lside;
    magma_int_t upper;
    magma_int_t notransp;
    magma_int_t nrowa;
    magma_int_t nb = magma_get_strsm_m_nb();
    magma_int_t igpu = 0;
    magma_int_t info;
    magma_int_t k, j, kb, jb;
    magma_int_t ldda, dima, lddb, dimb;
    int gpu_b;
    magma_getdevice(&gpu_b);

    lside = lapackf77_lsame(side_, "L");
    if (lside) {
        nrowa = m;
    } else {
        nrowa = n;
    }
    upper    = lapackf77_lsame(uplo_, "U");
    notransp = lapackf77_lsame(transa_, "N");

    // Test the input parameters.
    info = 0;
    if (! lside && ! lapackf77_lsame(side_, "R")) {
        info = 1;
    } else if (! upper && ! lapackf77_lsame(uplo_, "L")) {
        info = 2;
    } else if (! notransp && ! lapackf77_lsame(transa_, "T")
               && ! lapackf77_lsame(transa_, "C")) {
        info = 3;
    } else if (! lapackf77_lsame(diag_, "U") && ! lapackf77_lsame(diag_, "N")) {
        info = 4;
    } else if (m < 0) {
        info = 5;
    } else if (n < 0) {
        info = 6;
    } else if (lda < max(1,nrowa)) {
        info = 9;
    } else if (ldb < max(1,m)) {
        info = 11;
    }

    if (info != 0) {
        magma_xerbla( __func__, -info );
        return info;
    }

    // Quick return if possible.
    if (n == 0) {
        return info;
    }

    magma_int_t nbl = (n-1)/nb+1; // number of blocks in a row
    magma_int_t mbl = (m-1)/nb+1; // number of blocks in a column

    // Per-GPU workspace layout: a dimb x lddb piece holding this GPU's share
    // of B, followed by a dima x ldda region used as a double buffer (two
    // nb-wide panels, selected by the %2 indices below) for streaming A.
    if (lside) {
        lddb = m;
        dimb = ((nbl-1)/nrgpu+1)*nb;
        if ( notransp ) {
            ldda = m;
            dima = 2 * nb;
        } else {
            ldda = 2 * nb;
            dima = m;
        }
    }
    else {
        lddb = ((mbl-1)/nrgpu+1)*nb;
        dimb = n;
        if ( !notransp ) {
            ldda = n;
            dima = 2 * nb;
        } else {
            ldda = 2 * nb;
            dima = n;
        }
    }

    for (igpu = 0; igpu < nrgpu; ++igpu) {
        magma_setdevice(igpu);
        if (MAGMA_SUCCESS != magma_smalloc( &dw[igpu], (dimb*lddb + dima*ldda) )) {
            info = MAGMA_ERR_DEVICE_ALLOC;
            return info;
        }
        magma_queue_create( &stream[igpu][0] );
        magma_queue_create( &stream[igpu][1] );
        magma_queue_create( &stream[igpu][2] );
    }

    // alpha = 0 case
    if (MAGMA_S_REAL(alpha) == 0. && MAGMA_S_IMAG(alpha) == 0.)
{ printf("strsm_m: alpha = 0 not implemented\n"); exit(-1); return info; } if (lside) { if (notransp) { //Form B := alpha*inv( A )*B if (upper) { //left upper notranspose magma_int_t nloc[MagmaMaxGPUs]; for(igpu = 0; igpu < nrgpu; ++igpu) nloc[igpu] = 0; //copy B to mgpus for (k = 0; k < nbl; ++k){ igpu = k%nrgpu; magma_setdevice(igpu); kb = min(nb, n-k*nb); nloc[igpu] += kb; magma_ssetmatrix_async( m, kb, B(0, k), ldb, dB(igpu, 0, k/nrgpu), lddb, stream[igpu][(mbl+1)%2] ); } jb = min(nb, m-(mbl-1)*nb); for (igpu = 0; igpu < nrgpu; ++igpu){ magma_setdevice(igpu); magma_ssetmatrix_async( m, jb, A(0, mbl-1), lda, dA(igpu, 0, (mbl-1)%2), ldda, stream[igpu][(mbl+1)%2] ); } for (j = mbl-1; j >= 0; --j){ if (j > 0){ jb = nb; for (igpu = 0; igpu < nrgpu; ++igpu){ magma_setdevice(igpu); magma_ssetmatrix_async( j*nb, jb, A(0, j-1), lda, dA(igpu, 0, (j+1)%2), ldda, stream[igpu][(j+1)%2] ); } } if (j==mbl-1) alpha_=alpha; else alpha_= c_one; jb = min(nb, m-j*nb); for (igpu = 0; igpu < nrgpu; ++igpu){ magma_setdevice(igpu); magmablasSetKernelStream(stream[igpu][j%2]); magma_strsm(side, uplo, transa, diag, jb, nloc[igpu], alpha_, dA(igpu, j, j%2), ldda, dB(igpu, j, 0), lddb ); } if (j>0){ for (igpu = 0; igpu < nrgpu; ++igpu){ magma_setdevice(igpu); magmablasSetKernelStream(stream[igpu][j%2]); magma_sgemm(transa, MagmaNoTrans, j*nb, nloc[igpu], jb, c_neg_one, dA(igpu, 0, j%2), ldda, dB(igpu, j, 0), lddb, alpha_, dB(igpu, 0, 0), lddb ); } } for (igpu = 0; igpu < nrgpu; ++igpu){ magma_queue_sync( stream[igpu][j%2] ); } for (k = 0; k < nbl; ++k){ igpu = k%nrgpu; magma_setdevice(igpu); kb = min(nb, n-k*nb); magma_sgetmatrix_async( jb, kb, dB(igpu, j, k/nrgpu), lddb, B(j, k), ldb, stream[igpu][2] ); } } } else { //left lower notranspose magma_int_t nloc[MagmaMaxGPUs]; for(igpu = 0; igpu < nrgpu; ++igpu) nloc[igpu] = 0; //copy B to mgpus for (k = 0; k < nbl; ++k){ igpu = k%nrgpu; magma_setdevice(igpu); kb = min(nb, n-k*nb); nloc[igpu] += kb; magma_ssetmatrix_async( m, kb, B(0, k), ldb, dB(igpu, 0, k/nrgpu), lddb, stream[igpu][0] ); } jb = min(nb, m); for (igpu = 0; igpu < nrgpu; ++igpu){ magma_setdevice(igpu); magma_ssetmatrix_async( m, jb, A(0, 0), lda, dA(igpu, 0, 0), ldda, stream[igpu][0] ); } for (j = 0; j < mbl; ++j){ if ((j+1)*nb < m){ jb = min(nb, m-(j+1)*nb); for (igpu = 0; igpu < nrgpu; ++igpu){ magma_setdevice(igpu); magma_ssetmatrix_async( (m-(j+1)*nb), jb, A(j+1, j+1), lda, dA(igpu, j+1, (j+1)%2), ldda, stream[igpu][(j+1)%2] ); } } jb = min(nb, m-j*nb); if (j==0) alpha_=alpha; else alpha_= c_one; for (igpu = 0; igpu < nrgpu; ++igpu){ magma_setdevice(igpu); magmablasSetKernelStream(stream[igpu][j%2]); magma_strsm(side, uplo, transa, diag, jb, nloc[igpu], alpha_, dA(igpu, j, j%2), ldda, dB(igpu, j, 0), lddb ); } if ( j < mbl-1 ){ for (igpu = 0; igpu < nrgpu; ++igpu){ magma_setdevice(igpu); magmablasSetKernelStream(stream[igpu][j%2]); magma_sgemm(transa, MagmaNoTrans, m-(j+1)*nb, nloc[igpu], nb, c_neg_one, dA(igpu, j+1, j%2), ldda, dB(igpu, j, 0), lddb, alpha_, dB(igpu, j+1, 0), lddb ); } } for (igpu = 0; igpu < nrgpu; ++igpu){ magma_queue_sync( stream[igpu][j%2] ); } for (k = 0; k < nbl; ++k){ igpu = k%nrgpu; magma_setdevice(igpu); kb = min(nb, n-k*nb); magma_sgetmatrix_async( jb, kb, dB(igpu, j, k/nrgpu), lddb, B(j, k), ldb, stream[igpu][2] ); } } } } else { //Form B := alpha*inv( A' )*B if (upper) { //left upper transpose or transpose magma_int_t nloc[MagmaMaxGPUs]; for(igpu = 0; igpu < nrgpu; ++igpu) nloc[igpu] = 0; //copy B to mgpus for (k = 0; k < nbl; ++k){ igpu = k%nrgpu; 
                    magma_setdevice(igpu);
                    kb = min(nb, n-k*nb);
                    nloc[igpu] += kb;
                    magma_ssetmatrix_async( m, kb, B(0, k), ldb,
                                            dB(igpu, 0, k/nrgpu), lddb,
                                            stream[igpu][0] );
                }
                jb = min(nb, m);
                for (igpu = 0; igpu < nrgpu; ++igpu) {
                    magma_setdevice(igpu);
                    magma_ssetmatrix_async( jb, m, A(0, 0), lda,
                                            dA(igpu, 0, 0), ldda,
                                            stream[igpu][0] );
                }
                for (j = 0; j < mbl; ++j) {
                    if ((j+1)*nb < m) {
                        jb = min(nb, m-(j+1)*nb);
                        for (igpu = 0; igpu < nrgpu; ++igpu) {
                            magma_setdevice(igpu);
                            magma_ssetmatrix_async( jb, m-(j+1)*nb, A(j+1, j+1), lda,
                                                    dA(igpu, (j+1)%2, j+1), ldda,
                                                    stream[igpu][(j+1)%2] );
                        }
                    }
                    jb = min(nb, m-j*nb);
                    if (j == 0)
                        alpha_ = alpha;
                    else
                        alpha_ = c_one;
                    for (igpu = 0; igpu < nrgpu; ++igpu) {
                        magma_setdevice(igpu);
                        magmablasSetKernelStream(stream[igpu][j%2]);
                        magma_strsm(side, uplo, transa, diag, jb, nloc[igpu], alpha_,
                                    dA(igpu, j%2, j), ldda, dB(igpu, j, 0), lddb );
                    }
                    if ( j < mbl-1 ) {
                        for (igpu = 0; igpu < nrgpu; ++igpu) {
                            magma_setdevice(igpu);
                            magmablasSetKernelStream(stream[igpu][j%2]);
                            magma_sgemm(transa, MagmaNoTrans, m-(j+1)*nb, nloc[igpu], nb,
                                        c_neg_one, dA(igpu, j%2, j+1), ldda,
                                                   dB(igpu, j, 0), lddb,
                                        alpha_,    dB(igpu, j+1, 0), lddb );
                        }
                    }
                    for (igpu = 0; igpu < nrgpu; ++igpu) {
                        magma_queue_sync( stream[igpu][j%2] );
                    }
                    for (k = 0; k < nbl; ++k) {
                        igpu = k%nrgpu;
                        magma_setdevice(igpu);
                        kb = min(nb, n-k*nb);
                        magma_sgetmatrix_async( jb, kb, dB(igpu, j, k/nrgpu), lddb,
                                                B(j, k), ldb, stream[igpu][2] );
                    }
                }
            }
            else {
                // left lower transpose
                magma_int_t nloc[MagmaMaxGPUs];
                for (igpu = 0; igpu < nrgpu; ++igpu)
                    nloc[igpu] = 0;

                // copy B to mgpus
                for (k = 0; k < nbl; ++k) {
                    igpu = k%nrgpu;
                    magma_setdevice(igpu);
                    kb = min(nb, n-k*nb);
                    nloc[igpu] += kb;
                    magma_ssetmatrix_async( m, kb, B(0, k), ldb,
                                            dB(igpu, 0, k/nrgpu), lddb,
                                            stream[igpu][(mbl+1)%2] );
                }
                jb = min(nb, m-(mbl-1)*nb);
                for (igpu = 0; igpu < nrgpu; ++igpu) {
                    magma_setdevice(igpu);
                    magma_ssetmatrix_async( jb, m, A(mbl-1, 0), lda,
                                            dA(igpu, (mbl-1)%2, 0), ldda,
                                            stream[igpu][(mbl+1)%2] );
                }
                for (j = mbl-1; j >= 0; --j) {
                    if (j > 0) {
                        jb = nb;
                        for (igpu = 0; igpu < nrgpu; ++igpu) {
                            magma_setdevice(igpu);
                            magma_ssetmatrix_async( jb, j*nb, A(j-1, 0), lda,
                                                    dA(igpu, (j+1)%2, 0), ldda,
                                                    stream[igpu][(j+1)%2] );
                        }
                    }
                    if (j == mbl-1)
                        alpha_ = alpha;
                    else
                        alpha_ = c_one;
                    jb = min(nb, m-j*nb);
                    for (igpu = 0; igpu < nrgpu; ++igpu) {
                        magma_setdevice(igpu);
                        magmablasSetKernelStream(stream[igpu][j%2]);
                        magma_strsm(side, uplo, transa, diag, jb, nloc[igpu], alpha_,
                                    dA(igpu, j%2, j), ldda, dB(igpu, j, 0), lddb );
                    }
                    if (j > 0) {
                        for (igpu = 0; igpu < nrgpu; ++igpu) {
                            magma_setdevice(igpu);
                            magmablasSetKernelStream(stream[igpu][j%2]);
                            magma_sgemm(transa, MagmaNoTrans, j*nb, nloc[igpu], jb,
                                        c_neg_one, dA(igpu, j%2, 0), ldda,
                                                   dB(igpu, j, 0), lddb,
                                        alpha_,    dB(igpu, 0, 0), lddb );
                        }
                    }
                    for (igpu = 0; igpu < nrgpu; ++igpu) {
                        magma_queue_sync( stream[igpu][j%2] );
                    }
                    for (k = 0; k < nbl; ++k) {
                        igpu = k%nrgpu;
                        magma_setdevice(igpu);
                        kb = min(nb, n-k*nb);
                        magma_sgetmatrix_async( jb, kb, dB(igpu, j, k/nrgpu), lddb,
                                                B(j, k), ldb, stream[igpu][2] );
                    }
                }
            }
        }
    }
    else {
        if (notransp) {
            // Form B := alpha*B*inv( A ).
            if (upper) {
                // right upper notranspose
                magma_int_t mloc[MagmaMaxGPUs];
                for (igpu = 0; igpu < nrgpu; ++igpu)
                    mloc[igpu] = 0;

                // copy B to mgpus
                for (j = 0; j < mbl; ++j) {
                    igpu = j%nrgpu;
                    magma_setdevice(igpu);
                    jb = min(nb, m-j*nb);
                    mloc[igpu] += jb;
                    magma_ssetmatrix_async( jb, n, B(j, 0), ldb,
                                            dB(igpu, j/nrgpu, 0), lddb,
                                            stream[igpu][0] );
                }
                kb = min(nb, n);
                for (igpu = 0; igpu < nrgpu; ++igpu) {
                    magma_setdevice(igpu);
                    magma_ssetmatrix_async( kb, n, A(0, 0), lda,
                                            dA(igpu, 0, 0), ldda,
                                            stream[igpu][0] );
                }
                for (k = 0; k < nbl; ++k) {
                    if ((k+1)*nb < n) {
                        kb = min(nb, n-(k+1)*nb);
                        for (igpu = 0; igpu < nrgpu; ++igpu) {
                            magma_setdevice(igpu);
                            magma_ssetmatrix_async( kb, n-(k+1)*nb, A(k+1, k+1), lda,
                                                    dA(igpu, (k+1)%2, k+1), ldda,
                                                    stream[igpu][(k+1)%2] );
                        }
                    }
                    kb = min(nb, n-k*nb);
                    if (k == 0)
                        alpha_ = alpha;
                    else
                        alpha_ = c_one;
                    for (igpu = 0; igpu < nrgpu; ++igpu) {
                        magma_setdevice(igpu);
                        magmablasSetKernelStream(stream[igpu][k%2]);
                        magma_strsm(side, uplo, transa, diag, mloc[igpu], kb, alpha_,
                                    dA(igpu, k%2, k), ldda, dB(igpu, 0, k), lddb );
                    }
                    if ( k < nbl-1 ) {
                        for (igpu = 0; igpu < nrgpu; ++igpu) {
                            magma_setdevice(igpu);
                            magmablasSetKernelStream(stream[igpu][k%2]);
                            magma_sgemm(MagmaNoTrans, transa, mloc[igpu], n-(k+1)*nb, nb,
                                        c_neg_one, dB(igpu, 0, k), lddb,
                                                   dA(igpu, k%2, k+1), ldda,
                                        alpha_,    dB(igpu, 0, k+1), lddb );
                        }
                    }
                    for (igpu = 0; igpu < nrgpu; ++igpu) {
                        magma_queue_sync( stream[igpu][k%2] );
                    }
                    for (j = 0; j < mbl; ++j) {
                        igpu = j%nrgpu;
                        magma_setdevice(igpu);
                        jb = min(nb, m-j*nb);
                        magma_sgetmatrix_async( jb, kb, dB(igpu, j/nrgpu, k), lddb,
                                                B(j, k), ldb, stream[igpu][2] );
                    }
                }
            }
            else {
                // right lower notranspose
                magma_int_t mloc[MagmaMaxGPUs];
                for (igpu = 0; igpu < nrgpu; ++igpu)
                    mloc[igpu] = 0;

                // copy B to mgpus
                for (j = 0; j < mbl; ++j) {
                    igpu = j%nrgpu;
                    magma_setdevice(igpu);
                    jb = min(nb, m-j*nb);
                    mloc[igpu] += jb;
                    magma_ssetmatrix_async( jb, n, B(j, 0), ldb,
                                            dB(igpu, j/nrgpu, 0), lddb,
                                            stream[igpu][(nbl+1)%2] );
                }
                kb = min(nb, n-(nbl-1)*nb);
                for (igpu = 0; igpu < nrgpu; ++igpu) {
                    magma_setdevice(igpu);
                    magma_ssetmatrix_async( kb, n, A(nbl-1, 0), lda,
                                            dA(igpu, (nbl-1)%2, 0), ldda,
                                            stream[igpu][(nbl+1)%2] );
                }
                for (k = nbl-1; k >= 0; --k) {
                    if (k > 0) {
                        kb = nb;
                        for (igpu = 0; igpu < nrgpu; ++igpu) {
                            magma_setdevice(igpu);
                            magma_ssetmatrix_async( kb, k*nb, A(k-1, 0), lda,
                                                    dA(igpu, (k+1)%2, 0), ldda,
                                                    stream[igpu][(k+1)%2] );
                        }
                    }
                    if (k == nbl-1)
                        alpha_ = alpha;
                    else
                        alpha_ = c_one;
                    kb = min(nb, n-k*nb);
                    for (igpu = 0; igpu < nrgpu; ++igpu) {
                        magma_setdevice(igpu);
                        magmablasSetKernelStream(stream[igpu][k%2]);
                        magma_strsm(side, uplo, transa, diag, mloc[igpu], kb, alpha_,
                                    dA(igpu, k%2, k), ldda, dB(igpu, 0, k), lddb );
                    }
                    if (k > 0) {
                        for (igpu = 0; igpu < nrgpu; ++igpu) {
                            magma_setdevice(igpu);
                            magmablasSetKernelStream(stream[igpu][k%2]);
                            magma_sgemm(MagmaNoTrans, transa, mloc[igpu], k*nb, kb,
                                        c_neg_one, dB(igpu, 0, k), lddb,
                                                   dA(igpu, k%2, 0), ldda,
                                        alpha_,    dB(igpu, 0, 0), lddb );
                        }
                    }
                    for (igpu = 0; igpu < nrgpu; ++igpu) {
                        magma_queue_sync( stream[igpu][k%2] );
                    }
                    for (j = 0; j < mbl; ++j) {
                        igpu = j%nrgpu;
                        magma_setdevice(igpu);
                        jb = min(nb, m-j*nb);
                        magma_sgetmatrix_async( jb, kb, dB(igpu, j/nrgpu, k), lddb,
                                                B(j, k), ldb, stream[igpu][2] );
                    }
                }
            }
        }
        else {
            // Form B := alpha*B*inv( A' ).
            if (upper) {
                // right upper transpose
                magma_int_t mloc[MagmaMaxGPUs];
                for (igpu = 0; igpu < nrgpu; ++igpu)
                    mloc[igpu] = 0;

                // copy B to mgpus
                for (j = 0; j < mbl; ++j) {
                    igpu = j%nrgpu;
                    magma_setdevice(igpu);
                    jb = min(nb, m-j*nb);
                    mloc[igpu] += jb;
                    magma_ssetmatrix_async( jb, n, B(j, 0), ldb,
                                            dB(igpu, j/nrgpu, 0), lddb,
                                            stream[igpu][(nbl+1)%2] );
                }
                kb = min(nb, n-(nbl-1)*nb);
                for (igpu = 0; igpu < nrgpu; ++igpu) {
                    magma_setdevice(igpu);
                    magma_ssetmatrix_async( n, kb, A(0, nbl-1), lda,
                                            dA(igpu, 0, (nbl-1)%2), ldda,
                                            stream[igpu][(nbl+1)%2] );
                }
                for (k = nbl-1; k >= 0; --k) {
                    if (k > 0) {
                        kb = nb;
                        for (igpu = 0; igpu < nrgpu; ++igpu) {
                            magma_setdevice(igpu);
                            magma_ssetmatrix_async( k*nb, kb, A(0, k-1), lda,
                                                    dA(igpu, 0, (k+1)%2), ldda,
                                                    stream[igpu][(k+1)%2] );
                        }
                    }
                    if (k == nbl-1)
                        alpha_ = alpha;
                    else
                        alpha_ = c_one;
                    kb = min(nb, n-k*nb);
                    for (igpu = 0; igpu < nrgpu; ++igpu) {
                        magma_setdevice(igpu);
                        magmablasSetKernelStream(stream[igpu][k%2]);
                        magma_strsm(side, uplo, transa, diag, mloc[igpu], kb, alpha_,
                                    dA(igpu, k, k%2), ldda, dB(igpu, 0, k), lddb );
                    }
                    if (k > 0) {
                        for (igpu = 0; igpu < nrgpu; ++igpu) {
                            magma_setdevice(igpu);
                            magmablasSetKernelStream(stream[igpu][k%2]);
                            magma_sgemm(MagmaNoTrans, transa, mloc[igpu], k*nb, kb,
                                        c_neg_one, dB(igpu, 0, k), lddb,
                                                   dA(igpu, 0, k%2), ldda,
                                        alpha_,    dB(igpu, 0, 0), lddb );
                        }
                    }
                    for (igpu = 0; igpu < nrgpu; ++igpu) {
                        magma_queue_sync( stream[igpu][k%2] );
                    }
                    for (j = 0; j < mbl; ++j) {
                        igpu = j%nrgpu;
                        magma_setdevice(igpu);
                        jb = min(nb, m-j*nb);
                        magma_sgetmatrix_async( jb, kb, dB(igpu, j/nrgpu, k), lddb,
                                                B(j, k), ldb, stream[igpu][2] );
                    }
                }
            }
            else {
                // right lower transpose
                magma_int_t mloc[MagmaMaxGPUs];
                for (igpu = 0; igpu < nrgpu; ++igpu)
                    mloc[igpu] = 0;

                // copy B to mgpus
                for (j = 0; j < mbl; ++j) {
                    igpu = j%nrgpu;
                    magma_setdevice(igpu);
                    jb = min(nb, m-j*nb);
                    mloc[igpu] += jb;
                    magma_ssetmatrix_async( jb, n, B(j, 0), ldb,
                                            dB(igpu, j/nrgpu, 0), lddb,
                                            stream[igpu][0] );
                }
                kb = min(nb, n);
                for (igpu = 0; igpu < nrgpu; ++igpu) {
                    magma_setdevice(igpu);
                    magma_ssetmatrix_async( n, kb, A(0, 0), lda,
                                            dA(igpu, 0, 0), ldda,
                                            stream[igpu][0] );
                }
                for (k = 0; k < nbl; ++k) {
                    if ((k+1)*nb < n) {
                        kb = min(nb, n-(k+1)*nb);
                        for (igpu = 0; igpu < nrgpu; ++igpu) {
                            magma_setdevice(igpu);
                            magma_ssetmatrix_async( (n-(k+1)*nb), kb, A(k+1, k+1), lda,
                                                    dA(igpu, k+1, (k+1)%2), ldda,
                                                    stream[igpu][(k+1)%2] );
                        }
                    }
                    kb = min(nb, n-k*nb);
                    if (k == 0)
                        alpha_ = alpha;
                    else
                        alpha_ = c_one;
                    for (igpu = 0; igpu < nrgpu; ++igpu) {
                        magma_setdevice(igpu);
                        magmablasSetKernelStream(stream[igpu][k%2]);
                        magma_strsm(side, uplo, transa, diag, mloc[igpu], kb, alpha_,
                                    dA(igpu, k, k%2), ldda, dB(igpu, 0, k), lddb );
                    }
                    if ( k < nbl-1 ) {
                        for (igpu = 0; igpu < nrgpu; ++igpu) {
                            magma_setdevice(igpu);
                            magmablasSetKernelStream(stream[igpu][k%2]);
                            magma_sgemm(MagmaNoTrans, transa, mloc[igpu], n-(k+1)*nb, nb,
                                        c_neg_one, dB(igpu, 0, k), lddb,
                                                   dA(igpu, k+1, k%2), ldda,
                                        alpha_,    dB(igpu, 0, k+1), lddb );
                        }
                    }
                    for (igpu = 0; igpu < nrgpu; ++igpu) {
                        magma_queue_sync( stream[igpu][k%2] );
                    }
                    for (j = 0; j < mbl; ++j) {
                        igpu = j%nrgpu;
                        magma_setdevice(igpu);
                        jb = min(nb, m-j*nb);
                        magma_sgetmatrix_async( jb, kb, dB(igpu, j/nrgpu, k), lddb,
                                                B(j, k), ldb, stream[igpu][2] );
                    }
                }
            }
        }
    }

    // wait for the final copies back to the host, then release resources
    for (igpu = 0; igpu < nrgpu; ++igpu) {
        magma_setdevice(igpu);
        magmablasSetKernelStream(NULL);
        magma_queue_sync( stream[igpu][2] );
        magma_queue_destroy( stream[igpu][0] );
        magma_queue_destroy( stream[igpu][1] );
        magma_queue_destroy( stream[igpu][2] );
        magma_free( dw[igpu] );
    }
    magma_setdevice(gpu_b);

    return info;
} /* magma_strsm_m */
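/* Usage sketch (hypothetical wrapper, names ours): solve A*X = B across
   nrgpu GPUs for a lower-triangular, non-unit-diagonal A held on the host;
   on return hB is overwritten with the solution X. */
static magma_int_t
example_strsm_m( magma_int_t nrgpu, magma_int_t m, magma_int_t n,
                 float *hA, magma_int_t lda, float *hB, magma_int_t ldb )
{
    // side='L', uplo='L', transa='N', diag='N': op(A)*X = alpha*B with op(A) = A
    return magma_strsm_m( nrgpu, 'L', 'L', 'N', 'N',
                          m, n, MAGMA_S_ONE, hA, lda, hB, ldb );
}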