/* //////////////////////////////////////////////////////////////////////////// -- Testing sgetrf */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0; float error; float *h_A, *h_R; magmaFloat_ptr d_A; magma_int_t *ipiv; magma_int_t M, N, n2, lda, ldda, info, min_mn; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); float tol = opts.tolerance * lapackf77_slamch("E"); printf(" M N CPU GFlop/s (ms) GPU GFlop/s (ms) Copy time (ms) ||PA-LU||/(||A||*N)\n"); printf("=======================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; min_mn = min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = FLOPS_SGETRF( M, N ) / 1e9; if ( N > 512 ) { printf( "%5d %5d skipping because sgetf2 does not support N > 512\n", (int) M, (int) N ); continue; } TESTING_MALLOC_CPU( ipiv, magma_int_t, min_mn ); TESTING_MALLOC_CPU( h_A, float, n2 ); TESTING_MALLOC_PIN( h_R, float, n2 ); TESTING_MALLOC_DEV( d_A, float, ldda*N ); /* Initialize the matrix */ lapackf77_slarnv( &ione, ISEED, &n2, h_A ); lapackf77_slacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); real_Double_t set_time = magma_wtime(); magma_ssetmatrix( M, N, h_R, lda, d_A, ldda ); set_time = magma_wtime() - set_time; /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_sgetrf(&M, &N, h_A, &lda, ipiv, &info); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_sgetrf returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* 
==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_sgetf2_gpu( M, N, d_A, ldda, ipiv, &info); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_sgetf2_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); real_Double_t get_time = magma_wtime(); magma_sgetmatrix( M, N, d_A, ldda, h_A, lda ); get_time = magma_wtime() - get_time; /* ===================================================================== Check the factorization =================================================================== */ if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %7.2f", (int) M, (int) N, cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000., set_time*1000.+get_time*1000.); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) %7.2f", (int) M, (int) N, gpu_perf, gpu_time*1000., set_time*1000.+get_time*1000. ); } if ( opts.check ) { magma_sgetmatrix( M, N, d_A, ldda, h_A, lda ); error = get_LU_error( M, N, h_R, lda, h_A, ipiv ); printf(" %8.2e %s\n", error, (error < tol ? "ok" : "failed") ); status += ! (error < tol); } else { printf(" --- \n"); } TESTING_FREE_CPU( ipiv ); TESTING_FREE_CPU( h_A ); TESTING_FREE_PIN( h_R ); TESTING_FREE_DEV( d_A ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/** Purpose ------- SGETRF_m computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. The matrix may exceed the GPU memory. The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm. Note: The factorization of big panel is done calling multiple-gpu-interface. Pivots are applied on GPU within the big panel. Arguments --------- @param[in] num_gpus INTEGER The number of GPUs. num_gpus > 0. @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] A REAL array, dimension (LDA,N) On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. \n Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,M). @param[out] ipiv INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. - > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. 
@ingroup magma_sgesv_comp ********************************************************************/ extern "C" magma_int_t magma_sgetrf_m(magma_int_t num_gpus, magma_int_t m, magma_int_t n, float *A, magma_int_t lda, magma_int_t *ipiv, magma_int_t *info) { #define A(i,j) (A + (j)*lda + (i)) #define dAT(d,i,j) (dAT[d] + (i)*nb*ldn_local + (j)*nb) #define dPT(d,i,j) (dPT[d] + (i)*nb*nb + (j)*nb*maxm) magma_timer_t time=0, time_total=0, time_alloc=0, time_set=0, time_get=0, time_comp=0; timer_start( time_total ); real_Double_t flops; float c_one = MAGMA_S_ONE; float c_neg_one = MAGMA_S_NEG_ONE; float *dAT[MagmaMaxGPUs], *dA[MagmaMaxGPUs], *dPT[MagmaMaxGPUs]; magma_int_t iinfo = 0, nb, nbi, maxm, n_local[MagmaMaxGPUs], ldn_local; magma_int_t N, M, NB, NBk, I, d, num_gpus0 = num_gpus; magma_int_t ii, jj, h, offset, ib, rows, s; magma_queue_t stream[MagmaMaxGPUs][2]; magma_event_t event[MagmaMaxGPUs][2]; *info = 0; if (m < 0) *info = -1; else if (n < 0) *info = -2; else if (lda < max(1,m)) *info = -4; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; magma_device_t orig_dev; magma_getdevice( &orig_dev ); magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); /* initialize nb */ nb = magma_get_sgetrf_nb(m); maxm = ((m + 31)/32)*32; /* figure out NB */ size_t freeMem, totalMem; cudaMemGetInfo( &freeMem, &totalMem ); freeMem /= sizeof(float); /* number of columns in the big panel */ h = 1+(2+num_gpus0); NB = (magma_int_t)(0.8*freeMem/maxm-h*nb); const char* ngr_nb_char = getenv("MAGMA_NGR_NB"); if ( ngr_nb_char != NULL ) NB = max( nb, min( NB, atoi(ngr_nb_char) ) ); //NB = 5*max(nb,32); if ( num_gpus0 > ceil((float)NB/nb) ) { num_gpus = (int)ceil((float)NB/nb); h = 1+(2+num_gpus); NB = (magma_int_t)(0.8*freeMem/maxm-h*nb); } else { num_gpus = num_gpus0; } if ( num_gpus*NB >= n ) { #ifdef CHECK_SGETRF_OOC printf( " * still fit in GPU memory.\n" ); #endif NB = n; } else { #ifdef 
CHECK_SGETRF_OOC printf( " * don't fit in GPU memory.\n" ); #endif NB = num_gpus*NB; NB = max( nb, (NB / nb) * nb); /* making sure it's devisable by nb (x64) */ } #ifdef CHECK_SGETRF_OOC if ( NB != n ) printf( " * running in out-core mode (n=%d, NB=%d, nb=%d, freeMem=%.2e).\n", n, NB, nb, (float)freeMem ); else printf( " * running in in-core mode (n=%d, NB=%d, nb=%d, freeMem=%.2e).\n", n, NB, nb, (float)freeMem ); #endif if ( (nb <= 1) || (nb >= min(m,n)) ) { /* Use CPU code for scalar of one tile. */ lapackf77_sgetrf(&m, &n, A, &lda, ipiv, info); } else { /* Use hybrid blocked code. */ /* allocate memory on GPU to store the big panel */ timer_start( time_alloc ); n_local[0] = (NB/nb)/num_gpus; if ( NB%(nb*num_gpus) != 0 ) n_local[0]++; n_local[0] *= nb; ldn_local = ((n_local[0]+31)/32)*32; for( d=0; d < num_gpus; d++ ) { magma_setdevice(d); if (MAGMA_SUCCESS != magma_smalloc( &dA[d], (ldn_local+h*nb)*maxm )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dPT[d] = dA[d] + nb*maxm; /* for storing the previous panel from CPU */ dAT[d] = dA[d] + h*nb*maxm; /* for storing the big panel */ magma_queue_create( &stream[d][0] ); magma_queue_create( &stream[d][1] ); magma_event_create( &event[d][0] ); magma_event_create( &event[d][1] ); } //magma_setdevice(0); timer_stop( time_alloc ); for( I=0; I < n; I += NB ) { M = m; N = min( NB, n-I ); /* number of columns in this big panel */ s = min( max(m-I,0), N )/nb; /* number of small block-columns in this big panel */ maxm = ((M + 31)/32)*32; if ( num_gpus0 > ceil((float)N/nb) ) { num_gpus = (int)ceil((float)N/nb); } else { num_gpus = num_gpus0; } for( d=0; d < num_gpus; d++ ) { n_local[d] = ((N/nb)/num_gpus)*nb; if (d < (N/nb)%num_gpus) n_local[d] += nb; else if (d == (N/nb)%num_gpus) n_local[d] += N%nb; } ldn_local = ((n_local[0]+31)/32)*32; /* upload the next big panel into GPU, transpose (A->A'), and pivot it */ timer_start( time ); magmablas_ssetmatrix_transpose_mgpu(num_gpus, stream, A(0,I), lda, dAT, ldn_local, dA, 
maxm, M, N, nb); for( d=0; d < num_gpus; d++ ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); magma_queue_sync( stream[d][1] ); magmablasSetKernelStream(NULL); } time_set += timer_stop( time ); timer_start( time ); /* == --------------------------------------------------------------- == */ /* == loop around the previous big-panels to update the new big-panel == */ for( offset = 0; offset < min(m,I); offset += NB ) { NBk = min( m-offset, NB ); /* start sending the first tile from the previous big-panels to gpus */ for( d=0; d < num_gpus; d++ ) { magma_setdevice(d); nbi = min( nb, NBk ); magma_ssetmatrix_async( (M-offset), nbi, A(offset,offset), lda, dA[d], (maxm-offset), stream[d][0] ); /* make sure the previous update finished */ magmablasSetKernelStream(stream[d][0]); //magma_queue_sync( stream[d][1] ); magma_queue_wait_event( stream[d][0], event[d][0] ); /* transpose */ magmablas_stranspose( M-offset, nbi, dA[d], maxm-offset, dPT(d,0,0), nb ); } /* applying the pivot from the previous big-panel */ for( d=0; d < num_gpus; d++ ) { magma_setdevice(d); magmablasSetKernelStream(stream[d][1]); magmablas_spermute_long3( dAT(d,0,0), ldn_local, ipiv, NBk, offset ); } /* == going through each block-column of previous big-panels == */ for( jj=0, ib=offset/nb; jj < NBk; jj += nb, ib++ ) { ii = offset+jj; rows = maxm - ii; nbi = min( nb, NBk-jj ); for( d=0; d < num_gpus; d++ ) { magma_setdevice(d); /* wait for a block-column on GPU */ magma_queue_sync( stream[d][0] ); /* start sending next column */ if ( jj+nb < NBk ) { magma_ssetmatrix_async( (M-ii-nb), min(nb,NBk-jj-nb), A(ii+nb,ii+nb), lda, dA[d], (rows-nb), stream[d][0] ); /* make sure the previous update finished */ magmablasSetKernelStream(stream[d][0]); //magma_queue_sync( stream[d][1] ); magma_queue_wait_event( stream[d][0], event[d][(1+jj/nb)%2] ); /* transpose next column */ magmablas_stranspose( M-ii-nb, nb, dA[d], rows-nb, dPT(d,0,(1+jj/nb)%2), nb ); } /* update with the block column */ 
magmablasSetKernelStream(stream[d][1]); magma_strsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n_local[d], nbi, c_one, dPT(d,0,(jj/nb)%2), nb, dAT(d,ib,0), ldn_local ); if ( M > ii+nb ) { magma_sgemm( MagmaNoTrans, MagmaNoTrans, n_local[d], M-(ii+nb), nbi, c_neg_one, dAT(d,ib,0), ldn_local, dPT(d,1,(jj/nb)%2), nb, c_one, dAT(d,ib+1,0), ldn_local ); } magma_event_record( event[d][(jj/nb)%2], stream[d][1] ); } /* end of for each block-columns in a big-panel */ } } /* end of for each previous big-panels */ for( d=0; d < num_gpus; d++ ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); magma_queue_sync( stream[d][1] ); magmablasSetKernelStream(NULL); } /* calling magma-gpu interface to panel-factorize the big panel */ if ( M > I ) { //magma_sgetrf1_mgpu(num_gpus, M-I, N, nb, I, dAT, ldn_local, ipiv+I, dA, A(0,I), lda, // (magma_queue_t **)stream, &iinfo); magma_sgetrf2_mgpu(num_gpus, M-I, N, nb, I, dAT, ldn_local, ipiv+I, dA, A(0,I), lda, stream, &iinfo); if ( iinfo < 0 ) { *info = iinfo; break; } else if ( iinfo != 0 ) { *info = iinfo + I * NB; //break; } /* adjust pivots */ for( ii=I; ii < min(I+N,m); ii++ ) ipiv[ii] += I; } time_comp += timer_stop( time ); /* download the current big panel to CPU */ timer_start( time ); magmablas_sgetmatrix_transpose_mgpu(num_gpus, stream, dAT, ldn_local, A(0,I), lda, dA, maxm, M, N, nb); for( d=0; d < num_gpus; d++ ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); magma_queue_sync( stream[d][1] ); magmablasSetKernelStream(NULL); } time_get += timer_stop( time ); } /* end of for */ timer_stop( time_total ); flops = FLOPS_SGETRF( m, n ) / 1e9; timer_printf(" memory-allocation time: %e\n", time_alloc ); timer_printf(" NB=%d nb=%d\n", (int) NB, (int) nb ); timer_printf(" memcopy and transpose %e seconds\n", time_set ); timer_printf(" total time %e seconds\n", time_total ); timer_printf(" Performance %f GFlop/s, %f seconds without htod and dtoh\n", flops / (time_comp), time_comp ); timer_printf(" Performance %f 
GFlop/s, %f seconds with htod\n", flops / (time_comp + time_set), time_comp + time_set ); timer_printf(" Performance %f GFlop/s, %f seconds with dtoh\n", flops / (time_comp + time_get), time_comp + time_get ); timer_printf(" Performance %f GFlop/s, %f seconds without memory-allocation\n", flops / (time_total - time_alloc), time_total - time_alloc ); for( d=0; d < num_gpus0; d++ ) { magma_setdevice(d); magma_free( dA[d] ); magma_event_destroy( event[d][0] ); magma_event_destroy( event[d][1] ); magma_queue_destroy( stream[d][0] ); magma_queue_destroy( stream[d][1] ); } magma_setdevice( orig_dev ); magmablasSetKernelStream( orig_stream ); } if ( *info >= 0 ) magma_sgetrf_piv(m, n, NB, A, lda, ipiv, info); return *info; } /* magma_sgetrf_m */
/* //////////////////////////////////////////////////////////////////////////// -- Testing sgetrf */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0; float error; float *h_A; magmaFloat_ptr d_A; magma_int_t *ipiv; magma_int_t M, N, n2, lda, ldda, info, min_mn; magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); float tol = opts.tolerance * lapackf77_slamch("E"); printf("%% version %d\n", (int) opts.version ); if ( opts.check == 2 ) { printf("%% M N CPU Gflop/s (sec) GPU Gflop/s (sec) |Ax-b|/(N*|A|*|x|)\n"); } else { printf("%% M N CPU Gflop/s (sec) GPU Gflop/s (sec) |PA-LU|/(N*|A|)\n"); } printf("%%========================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; min_mn = min(M, N); lda = M; n2 = lda*N; ldda = magma_roundup( M, opts.align ); // multiple of 32 by default gflops = FLOPS_SGETRF( M, N ) / 1e9; TESTING_MALLOC_CPU( ipiv, magma_int_t, min_mn ); TESTING_MALLOC_CPU( h_A, float, n2 ); TESTING_MALLOC_DEV( d_A, float, ldda*N ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { init_matrix( opts, M, N, h_A, lda ); cpu_time = magma_wtime(); lapackf77_sgetrf( &M, &N, h_A, &lda, ipiv, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) { printf("lapackf77_sgetrf returned error %d: %s.\n", (int) info, magma_strerror( info )); } } /* ==================================================================== Performs operation using MAGMA =================================================================== */ init_matrix( opts, M, N, h_A, lda ); if ( opts.version == 2 ) { // no pivoting versions, so set ipiv to identity for (magma_int_t i=0; i < 
min_mn; ++i ) { ipiv[i] = i+1; } } magma_ssetmatrix( M, N, h_A, lda, d_A, ldda ); gpu_time = magma_wtime(); if ( opts.version == 1 ) { magma_sgetrf_gpu( M, N, d_A, ldda, ipiv, &info); } else if ( opts.version == 2 ) { magma_sgetrf_nopiv_gpu( M, N, d_A, ldda, &info); } gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) { printf("magma_sgetrf_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ===================================================================== Check the factorization =================================================================== */ if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f)", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f)", (int) M, (int) N, gpu_perf, gpu_time ); } if ( opts.check == 2 ) { magma_sgetmatrix( M, N, d_A, ldda, h_A, lda ); error = get_residual( opts, M, N, h_A, lda, ipiv ); printf(" %8.2e %s\n", error, (error < tol ? "ok" : "failed")); status += ! (error < tol); } else if ( opts.check ) { magma_sgetmatrix( M, N, d_A, ldda, h_A, lda ); error = get_LU_error( opts, M, N, h_A, lda, ipiv ); printf(" %8.2e %s\n", error, (error < tol ? "ok" : "failed")); status += ! (error < tol); } else { printf(" --- \n"); } TESTING_FREE_CPU( ipiv ); TESTING_FREE_CPU( h_A ); TESTING_FREE_DEV( d_A ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing sgetrf */ int main( int argc, char** argv) { real_Double_t gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0; float error; float *h_A; magma_int_t *ipiv; magma_int_t M, N, n2, lda, ldda, info, min_mn; magma_int_t status = 0; /* Initialize */ magma_queue_t queue[2]; magma_device_t devices[MagmaMaxGPUs]; int num = 0; magma_err_t err; magma_init(); magma_opts opts; parse_opts( argc, argv, &opts ); float tol = opts.tolerance * lapackf77_slamch("E"); err = magma_get_devices( devices, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } // Create two queues on device opts.device err = magma_queue_create( devices[opts.device], &queue[0] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } err = magma_queue_create( devices[opts.device], &queue[1] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } printf("ngpu %d\n", (int) opts.ngpu ); if ( opts.check == 2 ) { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) |Ax-b|/(N*|A|*|x|)\n"); } else { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) |PA-LU|/(N*|A|)\n"); } printf("=========================================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[i]; N = opts.nsize[i]; min_mn = min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = FLOPS_SGETRF( M, N ) / 1e9; TESTING_MALLOC_CPU( ipiv, magma_int_t, min_mn ); TESTING_MALLOC_PIN( h_A, float, n2 ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { init_matrix( M, N, h_A, lda ); cpu_time = magma_wtime(); lapackf77_sgetrf(&M, &N, h_A, &lda, ipiv, &info); cpu_time = magma_wtime() - 
cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_sgetrf returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ init_matrix( M, N, h_A, lda ); gpu_time = magma_wtime(); magma_sgetrf( M, N, h_A, lda, ipiv, &info, queue); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_sgetrf returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the factorization =================================================================== */ if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f)", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f)", (int) M, (int) N, gpu_perf, gpu_time ); } if ( opts.check == 2 ) { error = get_residual( M, N, h_A, lda, ipiv ); printf(" %8.2e%s\n", error, (error < tol ? "" : " failed")); status |= ! (error < tol); } else if ( opts.check ) { error = get_LU_error( M, N, h_A, lda, ipiv ); printf(" %8.2e%s\n", error, (error < tol ? "" : " failed")); status |= ! (error < tol); } else { printf(" --- \n"); } TESTING_FREE_CPU( ipiv ); TESTING_FREE_PIN( h_A ); } if ( opts.niter > 1 ) { printf( "\n" ); } } magma_queue_destroy( queue[0] ); magma_queue_destroy( queue[1] ); magma_finalize(); return status; }
/**
    Purpose
    -------
    SGETRF computes an LU factorization of a general M-by-N matrix A
    using partial pivoting with row interchanges.

    The factorization has the form
        A = P * L * U
    where P is a permutation matrix, L is lower triangular with unit
    diagonal elements (lower trapezoidal if m > n), and U is upper
    triangular (upper trapezoidal if m < n).

    This is the right-looking Level 3 BLAS version of the algorithm.
    Use two buffer to send panels.

    Each panel is copied to the host, factored there with lapackf77_sgetrf,
    and sent back to every GPU, which then applies the pivots and updates
    its part of the trailing matrix.

    Arguments
    ---------
    @param[in]
    num_gpus INTEGER
             The number of GPUs to be used for the factorization.
             Must not exceed ceil(n/nb); otherwise info = -1 is returned.

    @param[in]
    m       INTEGER
            The number of rows of the matrix A.  M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A.  N >= 0.

    @param[in]
    nb      INTEGER
            The panel width (block size) used by the algorithm.

    @param[in]
    offset  INTEGER
            Added to the block-row index when addressing d_lAT, see the
            dAT macro below.
            NOTE(review): presumably the first row of A inside a larger
            matrix stored in d_lAT -- confirm against the caller.

    @param[in,out]
    d_lAT   REAL array of pointers, one per GPU.
            The matrix to be factored, addressed through the dAT macro
            with leading dimension LDDAT.
            NOTE(review): appears to hold the matrix transposed and
            distributed over the GPUs in block-cyclic columns of width nb
            -- confirm against the caller.
            On exit, the factors L and U from the factorization
            A = P*L*U; the unit diagonal elements of L are not stored.

    @param[in]
    lddat   INTEGER
            The leading dimension of each d_lAT[d].
            num_gpus*LDDAT >= max(1,N).

    @param[out]
    ipiv    INTEGER array, dimension (min(M,N))
            The pivot indices; for 1 <= i <= min(M,N), row i of the
            matrix was interchanged with row IPIV(i).

    @param
    d_lAP   REAL array of pointers, one per GPU.
            Device workspace.  Panel slots of nb*maxm elements are used at
            indices (i % h)*nb*maxm with h = 2+num_gpus, and the temporary
            panel storage starts at h*nb*maxm, where maxm is m rounded up
            to a multiple of 32.

    @param
    w       REAL array on the host.
            Workspace that receives the panels for the CPU factorization;
            panel j lives at w + (j % num_gpus)*nb*ldw.

    @param[in]
    ldw     INTEGER
            The leading dimension of w.

    @param[in]
    streaml Array of two queues per GPU, used for the transfers and for
            the computations.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occurred, such as memory allocation failed.
      -     > 0:  if INFO = i, U(i,i) is exactly zero. The factorization
                  has been completed, but the factor U is exactly
                  singular, and division by zero will occur if it is used
                  to solve a system of equations.

    @ingroup magma_sgesv_comp
    ********************************************************************/
extern "C" magma_int_t
magma_sgetrf2_mgpu(magma_int_t num_gpus,
                   magma_int_t m, magma_int_t n, magma_int_t nb, magma_int_t offset,
                   float *d_lAT[], magma_int_t lddat, magma_int_t *ipiv,
                   float *d_lAP[], float *w, magma_int_t ldw,
                   magma_queue_t streaml[][2], magma_int_t *info)
{
    #define dAT(id,i,j)  (d_lAT[(id)] + ((offset)+(i)*nb)*lddat + (j)*nb)
    #define W(j)         (w+((j)%num_gpus)*nb*ldw)

    float c_one     = MAGMA_S_ONE;
    float c_neg_one = MAGMA_S_NEG_ONE;

    magma_int_t block_size = 32;
    magma_int_t iinfo, n_local[MagmaMaxGPUs];
    magma_int_t maxm, mindim;
    magma_int_t i, d, dd, rows, cols, s, ldpan[MagmaMaxGPUs];
    magma_int_t id, i_local, i_local2, nb0, nb1, h = 2+num_gpus;
    float *d_panel[MagmaMaxGPUs], *panel_local[MagmaMaxGPUs];

    /* Check arguments */
    *info = 0;
    if (m < 0)
        *info = -2;
    else if (n < 0)
        *info = -3;
    else if (num_gpus*lddat < max(1,n))
        *info = -5;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0)
        return *info;

    /* Function Body */
    mindim = min(m, n);
    if ( num_gpus > ceil((float)n/nb) ) {
        *info = -1;
        return *info;
    }

    /* Use hybrid blocked code. */
    maxm = ((m + block_size-1)/block_size)*block_size;

    /* some initializations */
    for (i=0; i < num_gpus; i++) {
        magma_setdevice(i);

        /* block-cyclic split of the n columns over the GPUs */
        n_local[i] = ((n/nb)/num_gpus)*nb;
        if (i < (n/nb)%num_gpus)
            n_local[i] += nb;
        else if (i == (n/nb)%num_gpus)
            n_local[i] += n%nb;

        /* workspaces */
        d_panel[i] = &(d_lAP[i][h*nb*maxm]);   /* temporary panel storage */
    }
    trace_init( 1, num_gpus, 2, (CUstream_st**)streaml );

    /* start sending the panel to cpu */
    nb0 = min(mindim, nb);
    magma_setdevice(0);
    magmablasSetKernelStream(streaml[0][1]);
    trace_gpu_start( 0, 1, "comm", "get" );
    magmablas_stranspose( nb0, m, dAT(0,0,0), lddat, d_lAP[0], maxm );
    magma_sgetmatrix_async( m, nb0,
                            d_lAP[0], maxm,
                            W(0),     ldw, streaml[0][1] );
    trace_gpu_end( 0, 1 );

    /* ------------------------------------------------------------------------------------- */
    magma_timer_t time=0;
    timer_start( time );

    s = mindim / nb;   /* number of full nb-wide panels */
    for( i=0; i < s; i++ ) {
        /* Set the GPU number that holds the current panel */
        id = i%num_gpus;
        magma_setdevice(id);

        /* Set the local index where the current panel is */
        i_local = i/num_gpus;
        cols  = maxm - i*nb;
        rows  = m - i*nb;

        /* synchronize i-th panel from id-th gpu into work */
        magma_queue_sync( streaml[id][1] );

        /* i-th panel factorization */
        trace_cpu_start( 0, "getrf", "getrf" );
        lapackf77_sgetrf( &rows, &nb, W(i), &ldw, ipiv+i*nb, &iinfo);
        if ( (*info == 0) && (iinfo > 0) ) {
            *info = iinfo + i*nb;   /* keep the first zero pivot only */
        }
        trace_cpu_end( 0 );

        /* start sending the panel to all the gpus */
        d = (i+1)%num_gpus;
        for( dd=0; dd < num_gpus; dd++ ) {
            magma_setdevice(d);
            trace_gpu_start( 0, 1, "comm", "set" );
            magma_ssetmatrix_async( rows, nb,
                                    W(i), ldw,
                                    &d_lAP[d][(i%h)*nb*maxm], cols, streaml[d][1] );
            trace_gpu_end( 0, 1 );
            d = (d+1)%num_gpus;
        }

        /* apply the pivoting */
        d = (i+1)%num_gpus;
        for( dd=0; dd < num_gpus; dd++ ) {
            magma_setdevice(d);
            magmablasSetKernelStream(streaml[d][0]);

            trace_gpu_start( d, 1, "pivot", "pivot" );
            /* the first GPU visited uses the long2 variant, all others long3 */
            if ( dd == 0 )
                magmablas_spermute_long2( lddat, dAT(d,0,0), lddat, ipiv, nb, i*nb );
            else
                magmablas_spermute_long3(        dAT(d,0,0), lddat, ipiv, nb, i*nb );
            trace_gpu_end( d, 1 );
            d = (d+1)%num_gpus;
        }

        /* update the trailing-matrix/look-ahead */
        d = (i+1)%num_gpus;
        for( dd=0; dd < num_gpus; dd++ ) {
            magma_setdevice(d);

            /* storage for panel */
            if ( d == id ) {
                /* the panel belongs to this gpu */
                panel_local[d] = dAT(d,i,i_local);
                ldpan[d] = lddat;
                /* next column */
                i_local2 = i_local+1;
            } else {
                /* the panel belongs to another gpu */
                panel_local[d] = d_panel[d];
                ldpan[d] = nb;
                /* next column */
                i_local2 = i_local;
                if ( d < id ) i_local2 ++;
            }
            /* the size of the next column */
            if ( s > (i+1) ) {
                nb0 = nb;
            } else {
                nb0 = n_local[d]-nb*(s/num_gpus);
                if ( d < s%num_gpus ) nb0 -= nb;
            }
            if ( d == (i+1)%num_gpus) {
                /* owns the next column, look-ahead the column */
                nb1 = nb0;
                magmablasSetKernelStream(streaml[d][1]);

                /* make sure all the pivoting has been applied */
                magma_queue_sync(streaml[d][0]);
                trace_gpu_start( d, 1, "gemm", "gemm" );

                /* transpose panel on GPU */
                magmablas_stranspose( rows, nb, &d_lAP[d][(i%h)*nb*maxm], cols, panel_local[d], ldpan[d] );
                /* synch for remaining update */
                magma_queue_sync(streaml[d][1]);
            } else {
                /* update the entire trailing matrix */
                nb1 = n_local[d] - i_local2*nb;
                magmablasSetKernelStream(streaml[d][0]);

                /* synchronization to make sure panel arrived on gpu */
                magma_queue_sync(streaml[d][1]);
                trace_gpu_start( d, 0, "gemm", "gemm" );

                /* transpose panel on GPU */
                magmablas_stranspose( rows, nb, &d_lAP[d][(i%h)*nb*maxm], cols, panel_local[d], ldpan[d] );
            }

            /* gpu updating the trailing matrix */
            magma_strsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                         nb1, nb, c_one,
                         panel_local[d],       ldpan[d],
                         dAT(d, i, i_local2), lddat);

            magma_sgemm( MagmaNoTrans, MagmaNoTrans,
                         nb1, m-(i+1)*nb, nb,
                         c_neg_one, dAT(d, i,   i_local2),         lddat,
                                    &(panel_local[d][nb*ldpan[d]]), ldpan[d],
                         c_one,     dAT(d, i+1, i_local2),         lddat );

            if ( d == (i+1)%num_gpus ) {
                /* Set the local index where the next panel is */
                magma_int_t next_blk   = i+1;
                magma_int_t next_local = (i+1)/num_gpus;
                magma_int_t next_ld    = maxm - (i+1)*nb;
                magma_int_t next_rows  = m    - (i+1)*nb;
                nb0 = min(nb, mindim - (i+1)*nb); /* size of the diagonal block */
                trace_gpu_end( d, 1 );

                if ( nb0 > 0 ) {
                    /* transpose the panel for sending it to cpu */
                    trace_gpu_start( d, 1, "comm", "get" );
                    magmablas_stranspose( nb0, m-(i+1)*nb, dAT(d,next_blk,next_local), lddat, &d_lAP[d][((i+1)%h)*nb*maxm], next_ld );

                    /* send the panel to cpu */
                    magma_sgetmatrix_async( next_rows, nb0,
                                            &d_lAP[d][((i+1)%h)*nb*maxm], next_ld,
                                            W(i+1), ldw, streaml[d][1] );
                    trace_gpu_end( d, 1 );
                }
            } else {
                trace_gpu_end( d, 0 );
            }

            d = (d+1)%num_gpus;
        }

        /* update the remaining matrix by gpu owning the next panel */
        if ( (i+1) < s ) {
            magma_int_t next_local = (i+1)/num_gpus;
            magma_int_t next_rows  = m - (i+1)*nb;

            d = (i+1)%num_gpus;
            magma_setdevice(d);
            magmablasSetKernelStream(streaml[d][0]);
            trace_gpu_start( d, 0, "gemm", "gemm" );

            magma_strsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                         n_local[d] - (next_local+1)*nb, nb,
                         c_one, panel_local[d],         ldpan[d],
                                dAT(d,i,next_local+1), lddat );

            magma_sgemm( MagmaNoTrans, MagmaNoTrans,
                         n_local[d]-(next_local+1)*nb, next_rows, nb,
                         c_neg_one, dAT(d,i,next_local+1),          lddat,
                                    &(panel_local[d][nb*ldpan[d]]), ldpan[d],
                         c_one,     dAT(d,i+1, next_local+1),       lddat );
            trace_gpu_end( d, 0 );
        }
    } /* end of for i=1..s */
    /* ------------------------------------------------------------------------------ */

    /* Set the GPU number that holds the last panel */
    id = s%num_gpus;

    /* Set the local index where the last panel is */
    i_local = s/num_gpus;

    /* size of the last diagonal-block */
    nb0 = min(m - s*nb, n - s*nb);
    rows = m    - s*nb;
    cols = maxm - s*nb;

    if ( nb0 > 0 ) {
        magma_setdevice(id);

        /* wait for the last panel on cpu */
        magma_queue_sync( streaml[id][1] );

        /* factor on cpu */
        lapackf77_sgetrf( &rows, &nb0, W(s), &ldw, ipiv+s*nb, &iinfo);
        if ( (*info == 0) && (iinfo > 0) )
            *info = iinfo + s*nb;

        /* send the factor to gpus (only those that still own columns to update) */
        for( d=0; d < num_gpus; d++ ) {
            magma_setdevice(d);
            i_local2 = i_local;
            if ( d < id ) i_local2 ++;

            if ( d == id || n_local[d] > i_local2*nb ) {
                magma_ssetmatrix_async( rows, nb0,
                                        W(s), ldw,
                                        &d_lAP[d][(s%h)*nb*maxm], cols, streaml[d][1] );
            }
        }

        for( d=0; d < num_gpus; d++ ) {
            magma_setdevice(d);
            magmablasSetKernelStream(streaml[d][0]);
            if ( d == 0 )
                magmablas_spermute_long2( lddat, dAT(d,0,0), lddat, ipiv, nb0, s*nb );
            else
                magmablas_spermute_long3(        dAT(d,0,0), lddat, ipiv, nb0, s*nb );
        }

        for( d=0; d < num_gpus; d++ ) {
            magma_setdevice(d);
            magmablasSetKernelStream(streaml[d][1]);

            /* wait for the pivoting to be done */
            magma_queue_sync( streaml[d][0] );

            i_local2 = i_local;
            if ( d < id ) i_local2++;
            if ( d == id ) {
                /* the panel belongs to this gpu */
                panel_local[d] = dAT(d,s,i_local);

                /* next column */
                nb1 = n_local[d] - i_local*nb-nb0;

                magmablas_stranspose( rows, nb0, &d_lAP[d][(s%h)*nb*maxm], cols, panel_local[d], lddat );

                if ( nb1 > 0 ) {
                    magma_strsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                                 nb1, nb0, c_one,
                                 panel_local[d],        lddat,
                                 dAT(d,s,i_local)+nb0, lddat);
                }
            } else if ( n_local[d] > i_local2*nb ) {
                /* the panel belongs to another gpu */
                panel_local[d] = d_panel[d];

                /* next column */
                nb1 = n_local[d] - i_local2*nb;

                magmablas_stranspose( rows, nb0, &d_lAP[d][(s%h)*nb*maxm], cols, panel_local[d], nb );
                magma_strsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                             nb1, nb0, c_one,
                             panel_local[d],     nb,
                             dAT(d,s,i_local2), lddat);
            }
        }
    } /* if ( nb0 > 0 ) */

    /* clean up */
    trace_finalize( "sgetrf_mgpu.svg","trace.css" );
    for( d=0; d < num_gpus; d++ ) {
        magma_setdevice(d);
        magma_queue_sync( streaml[d][0] );
        magma_queue_sync( streaml[d][1] );
        magmablasSetKernelStream(NULL);
    }
    magma_setdevice(0);

    /* stop (not restart) the timer so that `time` holds the elapsed time
       that the performance figure below divides by */
    timer_stop( time );
    timer_printf("\n Performance %f GFlop/s\n", FLOPS_SGETRF(m,n) / 1e9 / time );

    return *info;
} /* magma_sgetrf2_mgpu */
/* //////////////////////////////////////////////////////////////////////////// -- Testing sgetrf_mgpu */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0; float error; float *h_A, *h_P; magmaFloat_ptr d_lA[ MagmaMaxSubs * MagmaMaxGPUs ]; magma_int_t *ipiv; magma_int_t M, N, n2, lda, ldda, info, min_mn; magma_int_t dev, j, k, ngpu, nsub, n_local, nb, nk, ldn_local, maxm; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); float tol = opts.tolerance * lapackf77_slamch("E"); /* Initialize queues */ magma_queue_t queues[MagmaMaxGPUs * 2]; magma_device_t devices[MagmaMaxGPUs]; magma_int_t num = 0; magma_int_t err; err = magma_getdevices( devices, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_getdevices failed: %d\n", (int) err ); exit(-1); } for( dev=0; dev < opts.ngpu; dev++ ) { err = magma_queue_create( devices[dev], &queues[2*dev] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d (device %d)\n", (int) err, dev ); exit(-1); } err = magma_queue_create( devices[dev], &queues[2*dev+1] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d (device %d)\n", (int) err, dev ); exit(-1); } } printf("trans %s, ngpu %d, nsub %d\n", lapack_trans_const(opts.transA), (int) opts.ngpu, (int) opts.nsub ); if ( opts.check == 2 ) { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) |Ax-b|/(N*|A|*|x|)\n"); } else { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) |PA-LU|/(N*|A|)\n"); } printf("=========================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; min_mn = min(M, N); maxm = 32*((M+31)/32); lda = M; n2 = lda*N; nb = magma_get_sgetrf_nb(M); gflops = FLOPS_SGETRF( M, N ) / 1e9; // nsubs * ngpu must be at least the number of blocks ngpu = opts.ngpu; nsub = opts.nsub; if ( 
nsub*ngpu > N/nb ) { nsub = 1; ngpu = 1; printf( " * too many GPUs for the matrix size, using %d GPUs and %d submatrices\n", (int) ngpu, (int) nsub ); } /* Allocate host memory for the matrix */ TESTING_MALLOC_CPU( ipiv, magma_int_t, min_mn ); TESTING_MALLOC_CPU( h_A, float, n2 ); TESTING_MALLOC_CPU( h_P, float, lda*nb ); /* Allocate device memory */ if ( opts.transA == MagmaNoTrans ) { ldda = N/nb; /* number of block columns */ ldda = ldda/(ngpu*nsub); /* number of block columns per GPU */ ldda = nb*ldda; /* number of columns per GPU */ if ( ldda * ngpu*nsub < N ) { /* left over */ if ( N-ldda*ngpu*nsub >= nb ) { ldda += nb; } else { ldda += (N-ldda*ngpu*nsub)%nb; } } ldda = ((ldda+31)/32)*32; /* make it a multiple of 32 */ for( j=0; j < nsub * ngpu; j++ ) { TESTING_MALLOC_DEV( d_lA[j], float, ldda*maxm ); } } else { ldda = ((M+31)/32)*32; for( j=0; j < nsub * ngpu; j++ ) { n_local = ((N/nb)/(nsub*ngpu))*nb; if ( j < (N/nb)%(nsub*ngpu) ) { n_local += nb; } else if ( j == (N/nb)%(nsub*ngpu) ) { n_local += N%nb; } TESTING_MALLOC_DEV( d_lA[j], float, ldda*n_local ); } } /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { init_matrix( M, N, h_A, lda ); cpu_time = magma_wtime(); lapackf77_sgetrf( &M, &N, h_A, &lda, ipiv, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if ( info != 0 ) printf("lapackf77_sgetrf returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ init_matrix( M, N, h_A, lda ); if ( opts.transA == MagmaNoTrans ) { for( j=0; j < N; j += nb ) { k = (j/nb)%(nsub*ngpu); nk = min(nb, N-j); /* transpose on CPU, then copy to GPU */ int ii,jj; for( ii=0; ii < M; ii++ ) { for( jj=0; jj < nk; jj++ ) { 
h_P[jj+ii*nk] = h_A[j*lda + ii+jj*lda]; } } magma_ssetmatrix( nk, M, h_P, nk, d_lA[k], j/(nb*nsub*ngpu)*nb, ldda, queues[2*(k%ngpu)] ); } } else { ldda = ((M+31)/32)*32; for( j=0; j < N; j += nb ) { k = (j/nb)%(nsub*ngpu); nk = min(nb, N-j); magma_ssetmatrix( M, nk, h_A + j*lda, lda, d_lA[k], j/(nb*nsub*ngpu)*nb*ldda, ldda, queues[2*(k%ngpu)] ); } } gpu_time = magma_wtime(); magma_sgetrf_msub( opts.transA, nsub, ngpu, M, N, d_lA, 0, ldda, ipiv, queues, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_sgetrf_mgpu returned error %d: %s.\n", (int) info, magma_strerror( info )); /* get the matrix from GPUs */ if ( opts.transA == MagmaNoTrans ) { for (j=0; j < N; j+=nb) { k = (j/nb)%(nsub*ngpu); nk = min(nb, N-j); /* copy to CPU and then transpose */ magma_sgetmatrix( nk, M, d_lA[k], j/(nb*nsub*ngpu)*nb, ldda, h_P, nk, queues[2*(k%ngpu)] ); int ii, jj; for( ii=0; ii < M; ii++ ) { for( jj=0; jj < nk; jj++ ) { h_A[j*lda + ii+jj*lda] = h_P[jj+ii*nk]; } } } } else { for (j=0; j < N; j+=nb) { k = (j/nb)%(nsub*ngpu); nk = min(nb, N-j); magma_sgetmatrix( M, nk, d_lA[k], j/(nb*nsub*ngpu)*nb*ldda, ldda, h_A + j*lda, lda, queues[2*(k%ngpu)] ); } } /* ===================================================================== Check the factorization =================================================================== */ if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f)", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f)", (int) M, (int) N, gpu_perf, gpu_time ); } if ( opts.check == 2 ) { error = get_residual( M, N, h_A, lda, ipiv ); printf(" %8.2e %s\n", error, (error < tol ? "ok" : "failed")); status += ! (error < tol); } else if ( opts.check ) { error = get_LU_error( M, N, h_A, lda, ipiv ); printf(" %8.2e %s\n", error, (error < tol ? "ok" : "failed")); status += ! 
(error < tol); } else { printf(" --- \n"); } TESTING_FREE_CPU( ipiv ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_P ); for( dev=0; dev < ngpu; dev++ ) { for( k=0; k < nsub; k++ ) { TESTING_FREE_DEV( d_lA[dev*nsub + k] ); } } fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } /* Free queues */ for( dev=0; dev < opts.ngpu; dev++ ) { magma_queue_destroy( queues[2*dev] ); magma_queue_destroy( queues[2*dev+1] ); } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing sgesv_gpu */ int main(int argc, char **argv) { TESTING_INIT(); real_Double_t gflops, cpu_perf, cpu_time, gpu_perf, gpu_time; float error, Rnorm, Anorm, Xnorm, *work; float c_one = MAGMA_S_ONE; float c_neg_one = MAGMA_S_NEG_ONE; float *h_A, *h_B, *h_X; magmaFloat_ptr d_A, d_B; magma_int_t *ipiv; magma_int_t N, nrhs, lda, ldb, ldda, lddb, info, sizeA, sizeB; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); float tol = opts.tolerance * lapackf77_slamch("E"); nrhs = opts.nrhs; printf(" N NRHS CPU GFlop/s (sec) GPU GFlop/s (sec) ||B - AX|| / N*||A||*||X||\n"); printf("================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; ldb = lda; ldda = ((N+31)/32)*32; lddb = ldda; gflops = ( FLOPS_SGETRF( N, N ) + FLOPS_SGETRS( N, nrhs ) ) / 1e9; TESTING_MALLOC_CPU( h_A, float, lda*N ); TESTING_MALLOC_CPU( h_B, float, ldb*nrhs ); TESTING_MALLOC_CPU( h_X, float, ldb*nrhs ); TESTING_MALLOC_CPU( work, float, N ); TESTING_MALLOC_CPU( ipiv, magma_int_t, N ); TESTING_MALLOC_DEV( d_A, float, ldda*N ); TESTING_MALLOC_DEV( d_B, float, lddb*nrhs ); /* Initialize the matrices */ sizeA = lda*N; sizeB = ldb*nrhs; lapackf77_slarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_slarnv( &ione, ISEED, &sizeB, h_B ); magma_ssetmatrix( N, N, h_A, lda, d_A, ldda ); magma_ssetmatrix( N, nrhs, h_B, ldb, d_B, lddb ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_sgesv_gpu( N, nrhs, d_A, ldda, ipiv, d_B, lddb, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_sgesv_gpu 
returned error %d: %s.\n", (int) info, magma_strerror( info )); //===================================================================== // Residual //===================================================================== magma_sgetmatrix( N, nrhs, d_B, lddb, h_X, ldb ); Anorm = lapackf77_slange("I", &N, &N, h_A, &lda, work); Xnorm = lapackf77_slange("I", &N, &nrhs, h_X, &ldb, work); blasf77_sgemm( MagmaNoTransStr, MagmaNoTransStr, &N, &nrhs, &N, &c_one, h_A, &lda, h_X, &ldb, &c_neg_one, h_B, &ldb); Rnorm = lapackf77_slange("I", &N, &nrhs, h_B, &ldb, work); error = Rnorm/(N*Anorm*Xnorm); status += ! (error < tol); /* ==================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_sgesv( &N, &nrhs, h_A, &lda, ipiv, h_B, &ldb, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_sgesv returned error %d: %s.\n", (int) info, magma_strerror( info )); printf( "%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) N, (int) nrhs, cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed")); } else { printf( "%5d %5d --- ( --- ) %7.2f (%7.2f) %8.2e %s\n", (int) N, (int) nrhs, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed")); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_X ); TESTING_FREE_CPU( work ); TESTING_FREE_CPU( ipiv ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing sgetrf_mgpu */ int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0; float error; float *h_A; float *d_lA[ MagmaMaxGPUs ]; magma_int_t *ipiv; magma_int_t M, N, n2, lda, ldda, n_local, ngpu; magma_int_t info, min_mn, nb, ldn_local; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); float tol = opts.tolerance * lapackf77_slamch("E"); printf("ngpu %d\n", (int) opts.ngpu ); if ( opts.check == 2 ) { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) |Ax-b|/(N*|A|*|x|)\n"); } else { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) |PA-LU|/(N*|A|)\n"); } printf("=========================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; min_mn = min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; nb = magma_get_sgetrf_nb( M ); gflops = FLOPS_SGETRF( M, N ) / 1e9; // ngpu must be at least the number of blocks ngpu = min( opts.ngpu, int((N+nb-1)/nb) ); if ( ngpu < opts.ngpu ) { printf( " * too many GPUs for the matrix size, using %d GPUs\n", (int) ngpu ); } // Allocate host memory for the matrix TESTING_MALLOC_CPU( ipiv, magma_int_t, min_mn ); TESTING_MALLOC_CPU( h_A, float, n2 ); // Allocate device memory for( int dev=0; dev < ngpu; dev++){ n_local = ((N/nb)/ngpu)*nb; if (dev < (N/nb) % ngpu) n_local += nb; else if (dev == (N/nb) % ngpu) n_local += N % nb; ldn_local = ((n_local+31)/32)*32; // TODO why? 
magma_setdevice( dev ); TESTING_MALLOC_DEV( d_lA[dev], float, ldda*ldn_local ); } /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { init_matrix( M, N, h_A, lda ); cpu_time = magma_wtime(); lapackf77_sgetrf( &M, &N, h_A, &lda, ipiv, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_sgetrf returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ init_matrix( M, N, h_A, lda ); magma_ssetmatrix_1D_col_bcyclic( M, N, h_A, lda, d_lA, ldda, ngpu, nb ); gpu_time = magma_wtime(); magma_sgetrf_mgpu( ngpu, M, N, d_lA, ldda, ipiv, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_sgetrf_mgpu returned error %d: %s.\n", (int) info, magma_strerror( info )); magma_sgetmatrix_1D_col_bcyclic( M, N, d_lA, ldda, h_A, lda, ngpu, nb ); /* ===================================================================== Check the factorization =================================================================== */ if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f)", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f)", (int) M, (int) N, gpu_perf, gpu_time ); } if ( opts.check == 2 ) { error = get_residual( M, N, h_A, lda, ipiv ); printf(" %8.2e %s\n", error, (error < tol ? "ok" : "failed")); status += ! (error < tol); } else if ( opts.check ) { error = get_LU_error( M, N, h_A, lda, ipiv ); printf(" %8.2e %s\n", error, (error < tol ? "ok" : "failed")); status += ! 
(error < tol); } else { printf( " ---\n" ); } TESTING_FREE_CPU( ipiv ); TESTING_FREE_CPU( h_A ); for( int dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); TESTING_FREE_DEV( d_lA[dev] ); } fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing sgesv */ int main(int argc, char **argv) { TESTING_INIT(); real_Double_t gflops, cpu_perf, cpu_time, gpu_perf, gpu_time; float error, lerror, Rnorm, Anorm, Xnorm, *work; float c_one = MAGMA_S_ONE; float c_neg_one = MAGMA_S_NEG_ONE; float *h_A, *h_LU, *h_B, *h_B0, *h_X; magma_int_t *ipiv; magma_int_t N, nrhs, lda, ldb, info, sizeA, sizeB; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); float tol = opts.tolerance * lapackf77_slamch("E"); nrhs = opts.nrhs; printf("%% ngpu %d\n", (int) opts.ngpu ); if (opts.lapack) { printf("%% N NRHS CPU Gflop/s (sec) GPU Gflop/s (sec) ||B - AX|| / N*||A||*||X|| ||B - AX|| / N*||A||*||X||_CPU\n"); printf("%%================================================================================================================\n"); } else { printf("%% N NRHS CPU Gflop/s (sec) GPU Gflop/s (sec) ||B - AX|| / N*||A||*||X||\n"); printf("%%===============================================================================\n"); } for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; ldb = lda; gflops = ( FLOPS_SGETRF( N, N ) + FLOPS_SGETRS( N, nrhs ) ) / 1e9; TESTING_MALLOC_CPU( h_A, float, lda*N ); TESTING_MALLOC_CPU( h_LU, float, lda*N ); TESTING_MALLOC_CPU( h_B0, float, ldb*nrhs ); TESTING_MALLOC_CPU( h_B, float, ldb*nrhs ); TESTING_MALLOC_CPU( h_X, float, ldb*nrhs ); TESTING_MALLOC_CPU( work, float, N ); TESTING_MALLOC_CPU( ipiv, magma_int_t, N ); /* Initialize the matrices */ sizeA = lda*N; sizeB = ldb*nrhs; lapackf77_slarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_slarnv( &ione, ISEED, &sizeB, h_B ); // copy A to LU and B to X; save A and B for residual lapackf77_slacpy( "F", &N, &N, h_A, &lda, h_LU, &lda ); lapackf77_slacpy( "F", &N, &nrhs, h_B, &ldb, h_X, &ldb ); lapackf77_slacpy( "F", &N, &nrhs, h_B, 
&ldb, h_B0, &ldb ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_sgesv( N, nrhs, h_LU, lda, ipiv, h_X, ldb, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) { printf("magma_sgesv returned error %d: %s.\n", (int) info, magma_strerror( info )); } //===================================================================== // Residual //===================================================================== Anorm = lapackf77_slange("I", &N, &N, h_A, &lda, work); Xnorm = lapackf77_slange("I", &N, &nrhs, h_X, &ldb, work); blasf77_sgemm( MagmaNoTransStr, MagmaNoTransStr, &N, &nrhs, &N, &c_one, h_A, &lda, h_X, &ldb, &c_neg_one, h_B, &ldb); Rnorm = lapackf77_slange("I", &N, &nrhs, h_B, &ldb, work); error = Rnorm/(N*Anorm*Xnorm); bool okay = (error < tol); status += ! okay; /* ==================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { lapackf77_slacpy( "F", &N, &N, h_A, &lda, h_LU, &lda ); lapackf77_slacpy( "F", &N, &nrhs, h_B0, &ldb, h_X, &ldb ); cpu_time = magma_wtime(); lapackf77_sgesv( &N, &nrhs, h_LU, &lda, ipiv, h_X, &ldb, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) { printf("lapackf77_sgesv returned error %d: %s.\n", (int) info, magma_strerror( info )); } //Anorm = lapackf77_slange("I", &N, &N, h_A, &lda, work); Xnorm = lapackf77_slange("I", &N, &nrhs, h_X, &ldb, work); blasf77_sgemm( MagmaNoTransStr, MagmaNoTransStr, &N, &nrhs, &N, &c_one, h_A, &lda, h_X, &ldb, &c_neg_one, h_B0, &ldb); Rnorm = lapackf77_slange("I", &N, &nrhs, h_B0, &ldb, work); lerror = Rnorm/(N*Anorm*Xnorm); bool lokay = (lerror < tol); printf( "%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %-6s %8.2e %s\n", (int) N, (int) nrhs, 
cpu_perf, cpu_time, gpu_perf, gpu_time, error, (okay ? "ok" : "failed"), lerror, (lokay ? "ok" : "failed")); } else { printf( "%5d %5d --- ( --- ) %7.2f (%7.2f) %8.2e %s\n", (int) N, (int) nrhs, gpu_perf, gpu_time, error, (okay ? "ok" : "failed")); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_LU ); TESTING_FREE_CPU( h_B0 ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_X ); TESTING_FREE_CPU( work ); TESTING_FREE_CPU( ipiv ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing sgetrf */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0; float error; float *h_A, *h_R; float *d_A; magma_int_t *ipiv; magma_int_t M, N, n2, lda, ldda, info, min_mn; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_opts opts; parse_opts( argc, argv, &opts ); printf(" M N CPU GFlop/s (ms) GPU GFlop/s (ms) ||PA-LU||/(||A||*N)\n"); printf("=========================================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[i]; N = opts.nsize[i]; min_mn = min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = FLOPS_SGETRF( M, N ) / 1e9; TESTING_MALLOC( ipiv, magma_int_t, min_mn ); TESTING_MALLOC( h_A, float, n2 ); TESTING_HOSTALLOC( h_R, float, n2 ); TESTING_DEVALLOC( d_A, float, ldda*N ); /* Initialize the matrix */ lapackf77_slarnv( &ione, ISEED, &n2, h_A ); lapackf77_slacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); magma_ssetmatrix( M, N, h_R, lda, d_A, ldda ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_sgetrf(&M, &N, h_A, &lda, ipiv, &info); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_sgetrf returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_sgetf2_gpu( M, N, d_A, ldda, ipiv, &info); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_sgetf2_gpu returned error %d: %s.\n", (int) 
info, magma_strerror( info )); /* ===================================================================== Check the factorization =================================================================== */ if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f)", (int) M, (int) N, cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000. ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f)", (int) M, (int) N, gpu_perf, gpu_time*1000. ); } if ( opts.check ) { magma_sgetmatrix( M, N, d_A, ldda, h_A, lda ); error = get_LU_error( M, N, h_R, lda, h_A, ipiv ); printf(" %8.2e\n", error ); } else { printf(" --- \n"); } TESTING_FREE( ipiv ); TESTING_FREE( h_A ); TESTING_HOSTFREE( h_R ); TESTING_DEVFREE( d_A ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return 0; }