/* //////////////////////////////////////////////////////////////////////////// -- Testing sswap, sswapblk, spermute, slaswp, slaswpx */ int main( int argc, char** argv) { TESTING_INIT(); float *h_A1, *h_A2; float *d_A1, *d_A2; float *h_R1, *h_R2; // row-major and column-major performance real_Double_t row_perf0, col_perf0; real_Double_t row_perf1, col_perf1; real_Double_t row_perf2, col_perf2; real_Double_t row_perf3; real_Double_t row_perf4; real_Double_t row_perf5, col_perf5; real_Double_t row_perf6, col_perf6; real_Double_t row_perf7; real_Double_t cpu_perf; real_Double_t time, gbytes; magma_int_t N, lda, ldda, nb, j; magma_int_t ione = 1; magma_int_t *ipiv, *ipiv2; magma_int_t *d_ipiv; magma_opts opts; parse_opts( argc, argv, &opts ); magma_queue_t queue = 0; printf(" cublasSswap sswap sswapblk slaswp spermute slaswp2 slaswpx scopymatrix CPU (all in )\n"); printf(" N nb row-maj/col-maj row-maj/col-maj row-maj/col-maj row-maj row-maj row-maj row-maj/col-maj row-blk/col-blk slaswp (GByte/s)\n"); printf("==================================================================================================================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { // each test is assigned one bit in the check bitmask, bit=1 is failure. // shift keeps track of which bit is for current test int shift = 1; int check = 0; N = opts.nsize[i]; lda = N; ldda = ((N+31)/32)*32; nb = (opts.nb > 0 ? opts.nb : magma_get_sgetrf_nb( N )); // for each swap, does 2N loads and 2N stores gbytes = sizeof(float) * 4.*N*nb / 1e9; TESTING_MALLOC_PIN( h_A1, float, lda*N ); TESTING_MALLOC_PIN( h_A2, float, lda*N ); TESTING_MALLOC_PIN( h_R1, float, lda*N ); TESTING_MALLOC_PIN( h_R2, float, lda*N ); TESTING_MALLOC_CPU( ipiv, magma_int_t, nb ); TESTING_MALLOC_CPU( ipiv2, magma_int_t, nb ); TESTING_MALLOC_DEV( d_ipiv, magma_int_t, nb ); TESTING_MALLOC_DEV( d_A1, float, ldda*N ); TESTING_MALLOC_DEV( d_A2, float, ldda*N ); for( j=0; j < nb; j++ ) { ipiv[j] = (magma_int_t) ((rand()*1.*N) / (RAND_MAX * 1.)) + 1; } /* ===================================================================== * cublasSswap, row-by-row (2 matrices) */ /* Row Major */ init_matrix( N, N, h_A1, lda, 0 ); init_matrix( N, N, h_A2, lda, 100 ); magma_ssetmatrix( N, N, h_A1, lda, d_A1, ldda ); magma_ssetmatrix( N, N, h_A2, lda, d_A2, ldda ); time = magma_sync_wtime( queue ); for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { cublasSswap( N, d_A1+ldda*j, 1, d_A2+ldda*(ipiv[j]-1), 1); } } time = magma_sync_wtime( queue ) - time; row_perf0 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_sswap( &N, h_A1+lda*j, &ione, h_A2+lda*(ipiv[j]-1), &ione); } } magma_sgetmatrix( N, N, d_A1, ldda, h_R1, lda ); magma_sgetmatrix( N, N, d_A2, ldda, h_R2, lda ); check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) || diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift; shift *= 2; /* Column Major */ init_matrix( N, N, h_A1, lda, 0 ); init_matrix( N, N, h_A2, lda, 100 ); magma_ssetmatrix( N, N, h_A1, lda, d_A1, ldda ); magma_ssetmatrix( N, N, h_A2, lda, d_A2, ldda ); time = magma_sync_wtime( queue ); for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { cublasSswap( N, d_A1+j, ldda, d_A2+ipiv[j]-1, ldda); } } time = magma_sync_wtime( queue ) - time; col_perf0 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_sswap( &N, h_A1+j, &lda, h_A2+(ipiv[j]-1), &lda); } } magma_sgetmatrix( N, N, d_A1, ldda, h_R1, lda ); magma_sgetmatrix( N, N, d_A2, ldda, h_R2, lda ); check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) || diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift; shift *= 2; /* ===================================================================== * sswap, row-by-row (2 matrices) */ /* Row Major */ init_matrix( N, N, h_A1, lda, 0 ); init_matrix( N, N, h_A2, lda, 100 ); magma_ssetmatrix( N, N, h_A1, lda, d_A1, ldda ); magma_ssetmatrix( N, N, h_A2, lda, d_A2, ldda ); time = magma_sync_wtime( queue ); for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { magmablas_sswap( N, d_A1+ldda*j, 1, d_A2+ldda*(ipiv[j]-1), 1); } } time = magma_sync_wtime( queue ) - time; row_perf1 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_sswap( &N, h_A1+lda*j, &ione, h_A2+lda*(ipiv[j]-1), &ione); } } magma_sgetmatrix( N, N, d_A1, ldda, h_R1, lda ); magma_sgetmatrix( N, N, d_A2, ldda, h_R2, lda ); check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) || diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift; shift *= 2; /* Column Major */ init_matrix( N, N, h_A1, lda, 0 ); init_matrix( N, N, h_A2, lda, 100 ); magma_ssetmatrix( N, N, h_A1, lda, d_A1, ldda ); magma_ssetmatrix( N, N, h_A2, lda, d_A2, ldda ); time = magma_sync_wtime( queue ); for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { magmablas_sswap( N, d_A1+j, ldda, d_A2+ipiv[j]-1, ldda ); } } time = magma_sync_wtime( queue ) - time; col_perf1 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_sswap( &N, h_A1+j, &lda, h_A2+(ipiv[j]-1), &lda); } } magma_sgetmatrix( N, N, d_A1, ldda, h_R1, lda ); magma_sgetmatrix( N, N, d_A2, ldda, h_R2, lda ); check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) || diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift; shift *= 2; /* ===================================================================== * sswapblk, blocked version (2 matrices) */ /* Row Major */ init_matrix( N, N, h_A1, lda, 0 ); init_matrix( N, N, h_A2, lda, 100 ); magma_ssetmatrix( N, N, h_A1, lda, d_A1, ldda ); magma_ssetmatrix( N, N, h_A2, lda, d_A2, ldda ); time = magma_sync_wtime( queue ); magmablas_sswapblk( 'R', N, d_A1, ldda, d_A2, ldda, 1, nb, ipiv, 1, 0); time = magma_sync_wtime( queue ) - time; row_perf2 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_sswap( &N, h_A1+lda*j, &ione, h_A2+lda*(ipiv[j]-1), &ione); } } magma_sgetmatrix( N, N, d_A1, ldda, h_R1, lda ); magma_sgetmatrix( N, N, d_A2, ldda, h_R2, lda ); check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) || diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift; shift *= 2; /* Column Major */ init_matrix( N, N, h_A1, lda, 0 ); init_matrix( N, N, h_A2, lda, 100 ); magma_ssetmatrix( N, N, h_A1, lda, d_A1, ldda ); magma_ssetmatrix( N, N, h_A2, lda, d_A2, ldda ); time = magma_sync_wtime( queue ); magmablas_sswapblk( 'C', N, d_A1, ldda, d_A2, ldda, 1, nb, ipiv, 1, 0); time = magma_sync_wtime( queue ) - time; col_perf2 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_sswap( &N, h_A1+j, &lda, h_A2+(ipiv[j]-1), &lda); } } magma_sgetmatrix( N, N, d_A1, ldda, h_R1, lda ); magma_sgetmatrix( N, N, d_A2, ldda, h_R2, lda ); check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) || diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift; shift *= 2; /* ===================================================================== * spermute_long (1 matrix) */ /* Row Major */ memcpy( ipiv2, ipiv, nb*sizeof(magma_int_t) ); // spermute updates ipiv2 init_matrix( N, N, h_A1, lda, 0 ); magma_ssetmatrix( N, N, h_A1, lda, d_A1, ldda ); time = magma_sync_wtime( queue ); magmablas_spermute_long2( N, d_A1, ldda, ipiv2, nb, 0 ); time = magma_sync_wtime( queue ) - time; row_perf3 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_sswap( &N, h_A1+lda*j, &ione, h_A1+lda*(ipiv[j]-1), &ione); } } magma_sgetmatrix( N, N, d_A1, ldda, h_R1, lda ); check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift; shift *= 2; /* ===================================================================== * LAPACK-style slaswp (1 matrix) */ /* Row Major */ init_matrix( N, N, h_A1, lda, 0 ); magma_ssetmatrix( N, N, h_A1, lda, d_A1, ldda ); time = magma_sync_wtime( queue ); magmablas_slaswp( N, d_A1, ldda, 1, nb, ipiv, 1); time = magma_sync_wtime( queue ) - time; row_perf4 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_sswap( &N, h_A1+lda*j, &ione, h_A1+lda*(ipiv[j]-1), &ione); } } magma_sgetmatrix( N, N, d_A1, ldda, h_R1, lda ); check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift; shift *= 2; /* ===================================================================== * LAPACK-style slaswp (1 matrix) - d_ipiv on GPU */ /* Row Major */ init_matrix( N, N, h_A1, lda, 0 ); magma_ssetmatrix( N, N, h_A1, lda, d_A1, ldda ); time = magma_sync_wtime( queue ); magma_setvector( nb, sizeof(magma_int_t), ipiv, 1, d_ipiv, 1 ); magmablas_slaswp2( N, d_A1, ldda, 1, nb, d_ipiv ); time = magma_sync_wtime( queue ) - time; row_perf7 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_sswap( &N, h_A1+lda*j, &ione, h_A1+lda*(ipiv[j]-1), &ione); } } magma_sgetmatrix( N, N, d_A1, ldda, h_R1, lda ); check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift; shift *= 2; /* ===================================================================== * LAPACK-style slaswpx (extended for row- and col-major) (1 matrix) */ /* Row Major */ init_matrix( N, N, h_A1, lda, 0 ); magma_ssetmatrix( N, N, h_A1, lda, d_A1, ldda ); time = magma_sync_wtime( queue ); magmablas_slaswpx( N, d_A1, ldda, 1, 1, nb, ipiv, 1); time = magma_sync_wtime( queue ) - time; row_perf5 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_sswap( &N, h_A1+lda*j, &ione, h_A1+lda*(ipiv[j]-1), &ione); } } magma_sgetmatrix( N, N, d_A1, ldda, h_R1, lda ); check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift; shift *= 2; /* Col Major */ init_matrix( N, N, h_A1, lda, 0 ); magma_ssetmatrix( N, N, h_A1, lda, d_A1, ldda ); time = magma_sync_wtime( queue ); magmablas_slaswpx( N, d_A1, 1, ldda, 1, nb, ipiv, 1); time = magma_sync_wtime( queue ) - time; col_perf5 = gbytes / time; time = magma_wtime(); lapackf77_slaswp( &N, h_A1, &lda, &ione, &nb, ipiv, &ione); time = magma_wtime() - time; cpu_perf = gbytes / time; magma_sgetmatrix( N, N, d_A1, ldda, h_R1, lda ); check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift; shift *= 2; /* ===================================================================== * Copy matrix. */ time = magma_sync_wtime( queue ); magma_scopymatrix( N, nb, d_A1, ldda, d_A2, ldda ); time = magma_sync_wtime( queue ) - time; // copy reads 1 matrix and writes 1 matrix, so has half gbytes of swap col_perf6 = 0.5 * gbytes / time; time = magma_sync_wtime( queue ); magma_scopymatrix( nb, N, d_A1, ldda, d_A2, ldda ); time = magma_sync_wtime( queue ) - time; // copy reads 1 matrix and writes 1 matrix, so has half gbytes of swap row_perf6 = 0.5 * gbytes / time; printf("%5d %3d %6.2f%c/ %6.2f%c %6.2f%c/ %6.2f%c %6.2f%c/ %6.2f%c %6.2f%c %6.2f%c %6.2f%c %6.2f%c/ %6.2f%c %6.2f / %6.2f %6.2f %10s\n", (int) N, (int) nb, row_perf0, ((check & 0x001) != 0 ? '*' : ' '), col_perf0, ((check & 0x002) != 0 ? '*' : ' '), row_perf1, ((check & 0x004) != 0 ? '*' : ' '), col_perf1, ((check & 0x008) != 0 ? '*' : ' '), row_perf2, ((check & 0x010) != 0 ? '*' : ' '), col_perf2, ((check & 0x020) != 0 ? '*' : ' '), row_perf3, ((check & 0x040) != 0 ? '*' : ' '), row_perf4, ((check & 0x080) != 0 ? '*' : ' '), row_perf7, ((check & 0x100) != 0 ? '*' : ' '), row_perf5, ((check & 0x200) != 0 ? '*' : ' '), col_perf5, ((check & 0x400) != 0 ? '*' : ' '), row_perf6, col_perf6, cpu_perf, (check == 0 ? "ok" : "* failures") ); TESTING_FREE_PIN( h_A1 ); TESTING_FREE_PIN( h_A2 ); TESTING_FREE_PIN( h_R1 ); TESTING_FREE_PIN( h_R2 ); TESTING_FREE_CPU( ipiv ); TESTING_FREE_CPU( ipiv2 ); TESTING_FREE_DEV( d_ipiv ); TESTING_FREE_DEV( d_A1 ); TESTING_FREE_DEV( d_A2 ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return 0; }
/** Purpose ------- SGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm. Use two buffer to send panels. Arguments --------- @param[in] num_gpus INTEGER The number of GPUs to be used for the factorization. @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] A REAL array on the GPU, dimension (LDDA,N). On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. @param[in] ldda INTEGER The leading dimension of the array A. LDDA >= max(1,M). @param[out] ipiv INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. - > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. @ingroup magma_sgesv_comp ********************************************************************/ extern "C" magma_int_t magma_sgetrf2_mgpu(magma_int_t num_gpus, magma_int_t m, magma_int_t n, magma_int_t nb, magma_int_t offset, float *d_lAT[], magma_int_t lddat, magma_int_t *ipiv, float *d_lAP[], float *w, magma_int_t ldw, magma_queue_t streaml[][2], magma_int_t *info) { #define dAT(id,i,j) (d_lAT[(id)] + ((offset)+(i)*nb)*lddat + (j)*nb) #define W(j) (w+((j)%num_gpus)*nb*ldw) float c_one = MAGMA_S_ONE; float c_neg_one = MAGMA_S_NEG_ONE; magma_int_t block_size = 32; magma_int_t iinfo, n_local[MagmaMaxGPUs]; magma_int_t maxm, mindim; magma_int_t i, d, dd, rows, cols, s, ldpan[MagmaMaxGPUs]; magma_int_t id, i_local, i_local2, nb0, nb1, h = 2+num_gpus; float *d_panel[MagmaMaxGPUs], *panel_local[MagmaMaxGPUs]; /* Check arguments */ *info = 0; if (m < 0) *info = -2; else if (n < 0) *info = -3; else if (num_gpus*lddat < max(1,n)) *info = -5; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; /* Function Body */ mindim = min(m, n); if ( num_gpus > ceil((float)n/nb) ) { *info = -1; return *info; } /* Use hybrid blocked code. */ maxm = ((m + block_size-1)/block_size)*block_size; /* some initializations */ for (i=0; i < num_gpus; i++) { magma_setdevice(i); n_local[i] = ((n/nb)/num_gpus)*nb; if (i < (n/nb)%num_gpus) n_local[i] += nb; else if (i == (n/nb)%num_gpus) n_local[i] += n%nb; /* workspaces */ d_panel[i] = &(d_lAP[i][h*nb*maxm]); /* temporary panel storage */ } trace_init( 1, num_gpus, 2, (CUstream_st**)streaml ); /* start sending the panel to cpu */ nb0 = min(mindim, nb); magma_setdevice(0); magmablasSetKernelStream(streaml[0][1]); trace_gpu_start( 0, 1, "comm", "get" ); magmablas_stranspose( nb0, m, dAT(0,0,0), lddat, d_lAP[0], maxm ); magma_sgetmatrix_async( m, nb0, d_lAP[0], maxm, W(0), ldw, streaml[0][1] ); trace_gpu_end( 0, 1 ); /* ------------------------------------------------------------------------------------- */ magma_timer_t time=0; timer_start( time ); s = mindim / nb; for( i=0; i < s; i++ ) { /* Set the GPU number that holds the current panel */ id = i%num_gpus; magma_setdevice(id); /* Set the local index where the current panel is */ i_local = i/num_gpus; cols = maxm - i*nb; rows = m - i*nb; /* synchrnoize i-th panel from id-th gpu into work */ magma_queue_sync( streaml[id][1] ); /* i-th panel factorization */ trace_cpu_start( 0, "getrf", "getrf" ); lapackf77_sgetrf( &rows, &nb, W(i), &ldw, ipiv+i*nb, &iinfo); if ( (*info == 0) && (iinfo > 0) ) { *info = iinfo + i*nb; } trace_cpu_end( 0 ); /* start sending the panel to all the gpus */ d = (i+1)%num_gpus; for( dd=0; dd < num_gpus; dd++ ) { magma_setdevice(d); trace_gpu_start( 0, 1, "comm", "set" ); magma_ssetmatrix_async( rows, nb, W(i), ldw, &d_lAP[d][(i%h)*nb*maxm], cols, streaml[d][1] ); trace_gpu_end( 0, 1 ); d = (d+1)%num_gpus; } /* apply the pivoting */ d = (i+1)%num_gpus; for( dd=0; dd < num_gpus; dd++ ) { magma_setdevice(d); magmablasSetKernelStream(streaml[d][0]); trace_gpu_start( d, 1, "pivot", "pivot" ); if ( dd == 0 ) magmablas_spermute_long2( lddat, dAT(d,0,0), lddat, ipiv, nb, i*nb ); else magmablas_spermute_long3( dAT(d,0,0), lddat, ipiv, nb, i*nb ); trace_gpu_end( d, 1 ); d = (d+1)%num_gpus; } /* update the trailing-matrix/look-ahead */ d = (i+1)%num_gpus; for( dd=0; dd < num_gpus; dd++ ) { magma_setdevice(d); /* storage for panel */ if ( d == id ) { /* the panel belond to this gpu */ panel_local[d] = dAT(d,i,i_local); ldpan[d] = lddat; /* next column */ i_local2 = i_local+1; } else { /* the panel belong to another gpu */ panel_local[d] = d_panel[d]; ldpan[d] = nb; /* next column */ i_local2 = i_local; if ( d < id ) i_local2 ++; } /* the size of the next column */ if ( s > (i+1) ) { nb0 = nb; } else { nb0 = n_local[d]-nb*(s/num_gpus); if ( d < s%num_gpus ) nb0 -= nb; } if ( d == (i+1)%num_gpus) { /* owns the next column, look-ahead the column */ nb1 = nb0; magmablasSetKernelStream(streaml[d][1]); /* make sure all the pivoting has been applied */ magma_queue_sync(streaml[d][0]); trace_gpu_start( d, 1, "gemm", "gemm" ); /* transpose panel on GPU */ magmablas_stranspose( rows, nb, &d_lAP[d][(i%h)*nb*maxm], cols, panel_local[d], ldpan[d] ); /* synch for remaining update */ magma_queue_sync(streaml[d][1]); } else { /* update the entire trailing matrix */ nb1 = n_local[d] - i_local2*nb; magmablasSetKernelStream(streaml[d][0]); /* synchronization to make sure panel arrived on gpu */ magma_queue_sync(streaml[d][1]); trace_gpu_start( d, 0, "gemm", "gemm" ); /* transpose panel on GPU */ magmablas_stranspose( rows, nb, &d_lAP[d][(i%h)*nb*maxm], cols, panel_local[d], ldpan[d] ); } /* gpu updating the trailing matrix */ magma_strsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, nb1, nb, c_one, panel_local[d], ldpan[d], dAT(d, i, i_local2), lddat); magma_sgemm( MagmaNoTrans, MagmaNoTrans, nb1, m-(i+1)*nb, nb, c_neg_one, dAT(d, i, i_local2), lddat, &(panel_local[d][nb*ldpan[d]]), ldpan[d], c_one, dAT(d, i+1, i_local2), lddat ); if ( d == (i+1)%num_gpus ) { /* Set the local index where the current panel is */ int loff = i+1; int i_local = (i+1)/num_gpus; int ldda = maxm - (i+1)*nb; int cols = m - (i+1)*nb; nb0 = min(nb, mindim - (i+1)*nb); /* size of the diagonal block */ trace_gpu_end( d, 1 ); if ( nb0 > 0 ) { /* transpose the panel for sending it to cpu */ trace_gpu_start( d, 1, "comm", "get" ); magmablas_stranspose( nb0, m-(i+1)*nb, dAT(d,loff,i_local), lddat, &d_lAP[d][((i+1)%h)*nb*maxm], ldda ); /* send the panel to cpu */ magma_sgetmatrix_async( cols, nb0, &d_lAP[d][((i+1)%h)*nb*maxm], ldda, W(i+1), ldw, streaml[d][1] ); trace_gpu_end( d, 1 ); } } else { trace_gpu_end( d, 0 ); } d = (d+1)%num_gpus; } /* update the remaining matrix by gpu owning the next panel */ if ( (i+1) < s ) { int i_local = (i+1)/num_gpus; int rows = m - (i+1)*nb; d = (i+1)%num_gpus; magma_setdevice(d); magmablasSetKernelStream(streaml[d][0]); trace_gpu_start( d, 0, "gemm", "gemm" ); magma_strsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n_local[d] - (i_local+1)*nb, nb, c_one, panel_local[d], ldpan[d], dAT(d,i,i_local+1), lddat ); magma_sgemm( MagmaNoTrans, MagmaNoTrans, n_local[d]-(i_local+1)*nb, rows, nb, c_neg_one, dAT(d,i,i_local+1), lddat, &(panel_local[d][nb*ldpan[d]]), ldpan[d], c_one, dAT(d,i+1, i_local+1), lddat ); trace_gpu_end( d, 0 ); } } /* end of for i=1..s */ /* ------------------------------------------------------------------------------ */ /* Set the GPU number that holds the last panel */ id = s%num_gpus; /* Set the local index where the last panel is */ i_local = s/num_gpus; /* size of the last diagonal-block */ nb0 = min(m - s*nb, n - s*nb); rows = m - s*nb; cols = maxm - s*nb; if ( nb0 > 0 ) { magma_setdevice(id); /* wait for the last panel on cpu */ magma_queue_sync( streaml[id][1] ); /* factor on cpu */ lapackf77_sgetrf( &rows, &nb0, W(s), &ldw, ipiv+s*nb, &iinfo); if ( (*info == 0) && (iinfo > 0) ) *info = iinfo + s*nb; /* send the factor to gpus */ for( d=0; d < num_gpus; d++ ) { magma_setdevice(d); i_local2 = i_local; if ( d < id ) i_local2 ++; if ( d == id || n_local[d] > i_local2*nb ) { magma_ssetmatrix_async( rows, nb0, W(s), ldw, &d_lAP[d][(s%h)*nb*maxm], cols, streaml[d][1] ); } } for( d=0; d < num_gpus; d++ ) { magma_setdevice(d); magmablasSetKernelStream(streaml[d][0]); if ( d == 0 ) magmablas_spermute_long2( lddat, dAT(d,0,0), lddat, ipiv, nb0, s*nb ); else magmablas_spermute_long3( dAT(d,0,0), lddat, ipiv, nb0, s*nb ); } for( d=0; d < num_gpus; d++ ) { magma_setdevice(d); magmablasSetKernelStream(streaml[d][1]); /* wait for the pivoting to be done */ magma_queue_sync( streaml[d][0] ); i_local2 = i_local; if ( d < id ) i_local2++; if ( d == id ) { /* the panel belond to this gpu */ panel_local[d] = dAT(d,s,i_local); /* next column */ nb1 = n_local[d] - i_local*nb-nb0; magmablas_stranspose( rows, nb0, &d_lAP[d][(s%h)*nb*maxm], cols, panel_local[d], lddat ); if ( nb1 > 0 ) { magma_strsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, nb1, nb0, c_one, panel_local[d], lddat, dAT(d,s,i_local)+nb0, lddat); } } else if ( n_local[d] > i_local2*nb ) { /* the panel belong to another gpu */ panel_local[d] = d_panel[d]; /* next column */ nb1 = n_local[d] - i_local2*nb; magmablas_stranspose( rows, nb0, &d_lAP[d][(s%h)*nb*maxm], cols, panel_local[d], nb ); magma_strsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, nb1, nb0, c_one, panel_local[d], nb, dAT(d,s,i_local2), lddat); } } } /* if ( nb0 > 0 ) */ /* clean up */ trace_finalize( "sgetrf_mgpu.svg","trace.css" ); for( d=0; d < num_gpus; d++ ) { magma_setdevice(d); magma_queue_sync( streaml[d][0] ); magma_queue_sync( streaml[d][1] ); magmablasSetKernelStream(NULL); } magma_setdevice(0); timer_start( time ); timer_printf("\n Performance %f GFlop/s\n", FLOPS_SGETRF(m,n) / 1e9 / time ); return *info; } /* magma_sgetrf2_mgpu */