/* //////////////////////////////////////////////////////////////////////////// -- Testing sswap, sswapblk, spermute, slaswp, slaswpx */ int main( int argc, char** argv) { TESTING_INIT(); float *h_A1, *h_A2; float *d_A1, *d_A2; float *h_R1, *h_R2; // row-major and column-major performance real_Double_t row_perf0, col_perf0; real_Double_t row_perf1, col_perf1; real_Double_t row_perf2, col_perf2; real_Double_t row_perf3; real_Double_t row_perf4; real_Double_t row_perf5, col_perf5; real_Double_t row_perf6, col_perf6; real_Double_t row_perf7; real_Double_t cpu_perf; real_Double_t time, gbytes; magma_int_t N, lda, ldda, nb, j; magma_int_t ione = 1; magma_int_t *ipiv, *ipiv2; magma_int_t *d_ipiv; magma_opts opts; parse_opts( argc, argv, &opts ); magma_queue_t queue = 0; printf(" cublasSswap sswap sswapblk slaswp spermute slaswp2 slaswpx scopymatrix CPU (all in )\n"); printf(" N nb row-maj/col-maj row-maj/col-maj row-maj/col-maj row-maj row-maj row-maj row-maj/col-maj row-blk/col-blk slaswp (GByte/s)\n"); printf("==================================================================================================================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { // each test is assigned one bit in the check bitmask, bit=1 is failure. // shift keeps track of which bit is for current test int shift = 1; int check = 0; N = opts.nsize[i]; lda = N; ldda = ((N+31)/32)*32; nb = (opts.nb > 0 ? opts.nb : magma_get_sgetrf_nb( N )); // for each swap, does 2N loads and 2N stores gbytes = sizeof(float) * 4.*N*nb / 1e9; TESTING_MALLOC_PIN( h_A1, float, lda*N ); TESTING_MALLOC_PIN( h_A2, float, lda*N ); TESTING_MALLOC_PIN( h_R1, float, lda*N ); TESTING_MALLOC_PIN( h_R2, float, lda*N ); TESTING_MALLOC_CPU( ipiv, magma_int_t, nb ); TESTING_MALLOC_CPU( ipiv2, magma_int_t, nb ); TESTING_MALLOC_DEV( d_ipiv, magma_int_t, nb ); TESTING_MALLOC_DEV( d_A1, float, ldda*N ); TESTING_MALLOC_DEV( d_A2, float, ldda*N ); for( j=0; j < nb; j++ ) { ipiv[j] = (magma_int_t) ((rand()*1.*N) / (RAND_MAX * 1.)) + 1; } /* ===================================================================== * cublasSswap, row-by-row (2 matrices) */ /* Row Major */ init_matrix( N, N, h_A1, lda, 0 ); init_matrix( N, N, h_A2, lda, 100 ); magma_ssetmatrix( N, N, h_A1, lda, d_A1, ldda ); magma_ssetmatrix( N, N, h_A2, lda, d_A2, ldda ); time = magma_sync_wtime( queue ); for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { cublasSswap( N, d_A1+ldda*j, 1, d_A2+ldda*(ipiv[j]-1), 1); } } time = magma_sync_wtime( queue ) - time; row_perf0 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_sswap( &N, h_A1+lda*j, &ione, h_A2+lda*(ipiv[j]-1), &ione); } } magma_sgetmatrix( N, N, d_A1, ldda, h_R1, lda ); magma_sgetmatrix( N, N, d_A2, ldda, h_R2, lda ); check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) || diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift; shift *= 2; /* Column Major */ init_matrix( N, N, h_A1, lda, 0 ); init_matrix( N, N, h_A2, lda, 100 ); magma_ssetmatrix( N, N, h_A1, lda, d_A1, ldda ); magma_ssetmatrix( N, N, h_A2, lda, d_A2, ldda ); time = magma_sync_wtime( queue ); for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { cublasSswap( N, d_A1+j, ldda, d_A2+ipiv[j]-1, ldda); } } time = magma_sync_wtime( queue ) - time; col_perf0 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_sswap( &N, h_A1+j, &lda, h_A2+(ipiv[j]-1), &lda); } } magma_sgetmatrix( N, N, d_A1, ldda, h_R1, lda ); magma_sgetmatrix( N, N, d_A2, ldda, h_R2, lda ); check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) || diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift; shift *= 2; /* ===================================================================== * sswap, row-by-row (2 matrices) */ /* Row Major */ init_matrix( N, N, h_A1, lda, 0 ); init_matrix( N, N, h_A2, lda, 100 ); magma_ssetmatrix( N, N, h_A1, lda, d_A1, ldda ); magma_ssetmatrix( N, N, h_A2, lda, d_A2, ldda ); time = magma_sync_wtime( queue ); for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { magmablas_sswap( N, d_A1+ldda*j, 1, d_A2+ldda*(ipiv[j]-1), 1); } } time = magma_sync_wtime( queue ) - time; row_perf1 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_sswap( &N, h_A1+lda*j, &ione, h_A2+lda*(ipiv[j]-1), &ione); } } magma_sgetmatrix( N, N, d_A1, ldda, h_R1, lda ); magma_sgetmatrix( N, N, d_A2, ldda, h_R2, lda ); check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) || diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift; shift *= 2; /* Column Major */ init_matrix( N, N, h_A1, lda, 0 ); init_matrix( N, N, h_A2, lda, 100 ); magma_ssetmatrix( N, N, h_A1, lda, d_A1, ldda ); magma_ssetmatrix( N, N, h_A2, lda, d_A2, ldda ); time = magma_sync_wtime( queue ); for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { magmablas_sswap( N, d_A1+j, ldda, d_A2+ipiv[j]-1, ldda ); } } time = magma_sync_wtime( queue ) - time; col_perf1 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_sswap( &N, h_A1+j, &lda, h_A2+(ipiv[j]-1), &lda); } } magma_sgetmatrix( N, N, d_A1, ldda, h_R1, lda ); magma_sgetmatrix( N, N, d_A2, ldda, h_R2, lda ); check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) || diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift; shift *= 2; /* ===================================================================== * sswapblk, blocked version (2 matrices) */ /* Row Major */ init_matrix( N, N, h_A1, lda, 0 ); init_matrix( N, N, h_A2, lda, 100 ); magma_ssetmatrix( N, N, h_A1, lda, d_A1, ldda ); magma_ssetmatrix( N, N, h_A2, lda, d_A2, ldda ); time = magma_sync_wtime( queue ); magmablas_sswapblk( 'R', N, d_A1, ldda, d_A2, ldda, 1, nb, ipiv, 1, 0); time = magma_sync_wtime( queue ) - time; row_perf2 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_sswap( &N, h_A1+lda*j, &ione, h_A2+lda*(ipiv[j]-1), &ione); } } magma_sgetmatrix( N, N, d_A1, ldda, h_R1, lda ); magma_sgetmatrix( N, N, d_A2, ldda, h_R2, lda ); check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) || diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift; shift *= 2; /* Column Major */ init_matrix( N, N, h_A1, lda, 0 ); init_matrix( N, N, h_A2, lda, 100 ); magma_ssetmatrix( N, N, h_A1, lda, d_A1, ldda ); magma_ssetmatrix( N, N, h_A2, lda, d_A2, ldda ); time = magma_sync_wtime( queue ); magmablas_sswapblk( 'C', N, d_A1, ldda, d_A2, ldda, 1, nb, ipiv, 1, 0); time = magma_sync_wtime( queue ) - time; col_perf2 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_sswap( &N, h_A1+j, &lda, h_A2+(ipiv[j]-1), &lda); } } magma_sgetmatrix( N, N, d_A1, ldda, h_R1, lda ); magma_sgetmatrix( N, N, d_A2, ldda, h_R2, lda ); check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) || diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift; shift *= 2; /* ===================================================================== * spermute_long (1 matrix) */ /* Row Major */ memcpy( ipiv2, ipiv, nb*sizeof(magma_int_t) ); // spermute updates ipiv2 init_matrix( N, N, h_A1, lda, 0 ); magma_ssetmatrix( N, N, h_A1, lda, d_A1, ldda ); time = magma_sync_wtime( queue ); magmablas_spermute_long2( N, d_A1, ldda, ipiv2, nb, 0 ); time = magma_sync_wtime( queue ) - time; row_perf3 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_sswap( &N, h_A1+lda*j, &ione, h_A1+lda*(ipiv[j]-1), &ione); } } magma_sgetmatrix( N, N, d_A1, ldda, h_R1, lda ); check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift; shift *= 2; /* ===================================================================== * LAPACK-style slaswp (1 matrix) */ /* Row Major */ init_matrix( N, N, h_A1, lda, 0 ); magma_ssetmatrix( N, N, h_A1, lda, d_A1, ldda ); time = magma_sync_wtime( queue ); magmablas_slaswp( N, d_A1, ldda, 1, nb, ipiv, 1); time = magma_sync_wtime( queue ) - time; row_perf4 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_sswap( &N, h_A1+lda*j, &ione, h_A1+lda*(ipiv[j]-1), &ione); } } magma_sgetmatrix( N, N, d_A1, ldda, h_R1, lda ); check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift; shift *= 2; /* ===================================================================== * LAPACK-style slaswp (1 matrix) - d_ipiv on GPU */ /* Row Major */ init_matrix( N, N, h_A1, lda, 0 ); magma_ssetmatrix( N, N, h_A1, lda, d_A1, ldda ); time = magma_sync_wtime( queue ); magma_setvector( nb, sizeof(magma_int_t), ipiv, 1, d_ipiv, 1 ); magmablas_slaswp2( N, d_A1, ldda, 1, nb, d_ipiv ); time = magma_sync_wtime( queue ) - time; row_perf7 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_sswap( &N, h_A1+lda*j, &ione, h_A1+lda*(ipiv[j]-1), &ione); } } magma_sgetmatrix( N, N, d_A1, ldda, h_R1, lda ); check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift; shift *= 2; /* ===================================================================== * LAPACK-style slaswpx (extended for row- and col-major) (1 matrix) */ /* Row Major */ init_matrix( N, N, h_A1, lda, 0 ); magma_ssetmatrix( N, N, h_A1, lda, d_A1, ldda ); time = magma_sync_wtime( queue ); magmablas_slaswpx( N, d_A1, ldda, 1, 1, nb, ipiv, 1); time = magma_sync_wtime( queue ) - time; row_perf5 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_sswap( &N, h_A1+lda*j, &ione, h_A1+lda*(ipiv[j]-1), &ione); } } magma_sgetmatrix( N, N, d_A1, ldda, h_R1, lda ); check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift; shift *= 2; /* Col Major */ init_matrix( N, N, h_A1, lda, 0 ); magma_ssetmatrix( N, N, h_A1, lda, d_A1, ldda ); time = magma_sync_wtime( queue ); magmablas_slaswpx( N, d_A1, 1, ldda, 1, nb, ipiv, 1); time = magma_sync_wtime( queue ) - time; col_perf5 = gbytes / time; time = magma_wtime(); lapackf77_slaswp( &N, h_A1, &lda, &ione, &nb, ipiv, &ione); time = magma_wtime() - time; cpu_perf = gbytes / time; magma_sgetmatrix( N, N, d_A1, ldda, h_R1, lda ); check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift; shift *= 2; /* ===================================================================== * Copy matrix. */ time = magma_sync_wtime( queue ); magma_scopymatrix( N, nb, d_A1, ldda, d_A2, ldda ); time = magma_sync_wtime( queue ) - time; // copy reads 1 matrix and writes 1 matrix, so has half gbytes of swap col_perf6 = 0.5 * gbytes / time; time = magma_sync_wtime( queue ); magma_scopymatrix( nb, N, d_A1, ldda, d_A2, ldda ); time = magma_sync_wtime( queue ) - time; // copy reads 1 matrix and writes 1 matrix, so has half gbytes of swap row_perf6 = 0.5 * gbytes / time; printf("%5d %3d %6.2f%c/ %6.2f%c %6.2f%c/ %6.2f%c %6.2f%c/ %6.2f%c %6.2f%c %6.2f%c %6.2f%c %6.2f%c/ %6.2f%c %6.2f / %6.2f %6.2f %10s\n", (int) N, (int) nb, row_perf0, ((check & 0x001) != 0 ? '*' : ' '), col_perf0, ((check & 0x002) != 0 ? '*' : ' '), row_perf1, ((check & 0x004) != 0 ? '*' : ' '), col_perf1, ((check & 0x008) != 0 ? '*' : ' '), row_perf2, ((check & 0x010) != 0 ? '*' : ' '), col_perf2, ((check & 0x020) != 0 ? '*' : ' '), row_perf3, ((check & 0x040) != 0 ? '*' : ' '), row_perf4, ((check & 0x080) != 0 ? '*' : ' '), row_perf7, ((check & 0x100) != 0 ? '*' : ' '), row_perf5, ((check & 0x200) != 0 ? '*' : ' '), col_perf5, ((check & 0x400) != 0 ? '*' : ' '), row_perf6, col_perf6, cpu_perf, (check == 0 ? "ok" : "* failures") ); TESTING_FREE_PIN( h_A1 ); TESTING_FREE_PIN( h_A2 ); TESTING_FREE_PIN( h_R1 ); TESTING_FREE_PIN( h_R2 ); TESTING_FREE_CPU( ipiv ); TESTING_FREE_CPU( ipiv2 ); TESTING_FREE_DEV( d_ipiv ); TESTING_FREE_DEV( d_A1 ); TESTING_FREE_DEV( d_A2 ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return 0; }
/** @deprecated Purpose ------- SLAQPS computes a step of QR factorization with column pivoting of a real M-by-N matrix A by using Blas-3. It tries to factorize NB columns from A starting from the row OFFSET+1, and updates all of the matrix with Blas-3 xGEMM. In some cases, due to catastrophic cancellations, it cannot factorize NB columns. Hence, the actual number of factorized columns is returned in KB. Block A(1:OFFSET,1:N) is accordingly pivoted, but not factorized. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0 @param[in] offset INTEGER The number of rows of A that have been factorized in previous steps. @param[in] nb INTEGER The number of columns to factorize. @param[out] kb INTEGER The number of columns actually factorized. @param[in,out] dA REAL array, dimension (LDDA,N), on the GPU. On entry, the M-by-N matrix A. On exit, block A(OFFSET+1:M,1:KB) is the triangular factor obtained and block A(1:OFFSET,1:N) has been accordingly pivoted, but no factorized. The rest of the matrix, block A(OFFSET+1:M,KB+1:N) has been updated. @param[in] ldda INTEGER The leading dimension of the array A. LDDA >= max(1,M). @param[in,out] jpvt INTEGER array, dimension (N) JPVT(I) = K <==> Column K of the full matrix A has been permuted into position I in AP. @param[out] tau REAL array, dimension (KB) The scalar factors of the elementary reflectors. @param[in,out] vn1 REAL array, dimension (N) The vector with the partial column norms. @param[in,out] vn2 REAL array, dimension (N) The vector with the exact column norms. @param[in,out] dauxv REAL array, dimension (NB), on the GPU Auxiliary vector. @param[in,out] dF REAL array, dimension (LDDF,NB), on the GPU Matrix F' = L*Y'*A. @param[in] lddf INTEGER The leading dimension of the array F. LDDF >= max(1,N). @ingroup magma_sgeqp3_aux ********************************************************************/ extern "C" magma_int_t magma_slaqps_gpu( magma_int_t m, magma_int_t n, magma_int_t offset, magma_int_t nb, magma_int_t *kb, magmaFloat_ptr dA, magma_int_t ldda, magma_int_t *jpvt, float *tau, float *vn1, float *vn2, magmaFloat_ptr dauxv, magmaFloat_ptr dF, magma_int_t lddf) { #define dA(i, j) (dA + (i) + (j)*(ldda)) #define dF(i, j) (dF + (i) + (j)*(lddf)) float c_zero = MAGMA_S_MAKE( 0.,0.); float c_one = MAGMA_S_MAKE( 1.,0.); float c_neg_one = MAGMA_S_MAKE(-1.,0.); magma_int_t ione = 1; magma_int_t i__1, i__2; float z__1; magma_int_t k, rk; magmaFloat_ptr dAks; float tauk = MAGMA_S_ZERO; magma_int_t pvt; float tol3z; magma_int_t itemp; float lsticc; magmaFloat_ptr dlsticcs; magma_smalloc( &dlsticcs, 1+256*(n+255)/256 ); tol3z = magma_ssqrt( lapackf77_slamch("Epsilon")); lsticc = 0; k = 0; magma_smalloc( &dAks, nb ); magma_queue_t queue; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queue ); while( k < nb && lsticc == 0 ) { rk = offset + k; /* Determine ith pivot column and swap if necessary */ // subtract 1 from Fortran/CUBLAS isamax; pvt, k are 0-based. pvt = k + magma_isamax( n-k, &vn1[k], ione, queue ) - 1; if (pvt != k) { /* F gets swapped so F must be sent at the end to GPU */ i__1 = k; magmablas_sswap( m, dA(0, pvt), ione, dA(0, k), ione, queue ); magmablas_sswap( i__1, dF(pvt, 0), lddf, dF(k, 0), lddf, queue ); itemp = jpvt[pvt]; jpvt[pvt] = jpvt[k]; jpvt[k] = itemp; magma_sswap( 2, &vn1[pvt], n+offset, &vn1[k], n+offset, queue ); } /* Apply previous Householder reflectors to column K: A(RK:M,K) := A(RK:M,K) - A(RK:M,1:K-1)*F(K,1:K-1)'. Optimization: multiply with beta=0; wait for vector and subtract */ if (k > 0) { //#define RIGHT_UPDATE #ifdef RIGHT_UPDATE i__1 = m - offset - nb; i__2 = k; magma_sgemv( MagmaNoTrans, i__1, i__2, c_neg_one, A(offset+nb, 0), lda, F(k, 0), ldf, c_one, A(offset+nb, k), ione, queue ); #else i__1 = m - rk; i__2 = k; magma_sgemv( MagmaNoTrans, i__1, i__2, c_neg_one, dA(rk, 0), ldda, dF(k, 0), lddf, c_one, dA(rk, k), ione, queue ); #endif } /* Generate elementary reflector H(k). */ magma_slarfg_gpu( m-rk, dA(rk, k), dA(rk + 1, k), &tau[k], &vn1[k], &dAks[k], queue ); /* needed to avoid the race condition */ if (k == 0) magma_ssetvector( 1, &c_one, 1, dA(rk, k), 1, queue ); else magma_scopymatrix( 1, 1, dA(offset, 0), 1, dA(rk, k), 1, queue ); /* Compute Kth column of F: Compute F(K+1:N,K) := tau(K)*A(RK:M,K+1:N)'*A(RK:M,K) on the GPU */ if (k < n-1 || k > 0) magma_sgetvector( 1, &tau[k], 1, &tauk, 1, queue ); if (k < n-1) { i__1 = m - rk; i__2 = n - k - 1; /* Multiply on GPU */ magma_sgemv( MagmaConjTrans, m-rk, n-k-1, tauk, dA( rk, k+1 ), ldda, dA( rk, k ), 1, c_zero, dF( k+1, k ), 1, queue ); } /* Incremental updating of F: F(1:N,K) := F(1:N,K) - tau(K)*F(1:N,1:K-1)*A(RK:M,1:K-1)'*A(RK:M,K). F(1:N,K) := tau(K)*A(RK:M,K+1:N)'*A(RK:M,K) - tau(K)*F(1:N,1:K-1)*A(RK:M,1:K-1)'*A(RK:M,K) := tau(K)(A(RK:M,K+1:N)' - F(1:N,1:K-1)*A(RK:M,1:K-1)') A(RK:M,K) so, F is (updated A)*V */ if (k > 0) { z__1 = MAGMA_S_NEGATE( tauk ); #ifdef RIGHT_UPDATE i__1 = m - offset - nb; i__2 = k; magma_sgemv( MagmaConjTrans, i__1, i__2, z__1, dA(offset+nb, 0), lda, dA(offset+nb, k), ione, c_zero, dauxv, ione, queue ); i__1 = k; magma_sgemv( MagmaNoTrans, n-k-1, i__1, c_one, F(k+1,0), ldf, dauxv, ione, c_one, F(k+1,k), ione, queue ); #else i__1 = m - rk; i__2 = k; magma_sgemv( MagmaConjTrans, i__1, i__2, z__1, dA(rk, 0), ldda, dA(rk, k), ione, c_zero, dauxv, ione, queue ); /* I think we only need stricly lower-triangular part :) */ magma_sgemv( MagmaNoTrans, n-k-1, i__2, c_one, dF(k+1,0), lddf, dauxv, ione, c_one, dF(k+1,k), ione, queue ); #endif } /* Optimization: On the last iteration start sending F back to the GPU */ /* Update the current row of A: A(RK,K+1:N) := A(RK,K+1:N) - A(RK,1:K)*F(K+1:N,1:K)'. */ if (k < n-1) { i__1 = n - k - 1; i__2 = k + 1; #ifdef RIGHT_UPDATE /* right-looking update of rows, */ magma_sgemm( MagmaNoTrans, MagmaConjTrans, nb-k, i__1, ione, c_neg_one, dA(rk, k ), ldda, dF(k+1, k ), lddf, c_one, dA(rk, k+1), ldda, queue ); #else /* left-looking update of rows, * * since F=A'v with original A, so no right-looking */ magma_sgemm( MagmaNoTrans, MagmaConjTrans, ione, i__1, i__2, c_neg_one, dA(rk, 0 ), ldda, dF(k+1,0 ), lddf, c_one, dA(rk, k+1), ldda, queue ); #endif } /* Update partial column norms. */ if (rk < min(m, n+offset)-1 ) { magmablas_snrm2_row_check_adjust( n-k-1, tol3z, &vn1[k+1], &vn2[k+1], dA(rk,k+1), ldda, dlsticcs, queue ); //magma_device_sync(); magma_sgetvector( 1, &dlsticcs[0], 1, &lsticc, 1, queue ); } ++k; } magma_scopymatrix( 1, k, dAks, 1, dA(offset, 0), ldda+1, queue ); // leave k as the last column done --k; *kb = k + 1; rk = offset + *kb - 1; /* Apply the block reflector to the rest of the matrix: A(OFFSET+KB+1:M,KB+1:N) := A(OFFSET+KB+1:M,KB+1:N) - A(OFFSET+KB+1:M,1:KB)*F(KB+1:N,1:KB)' */ if (*kb < min(n, m - offset)) { i__1 = m - rk - 1; i__2 = n - *kb; magma_sgemm( MagmaNoTrans, MagmaConjTrans, i__1, i__2, *kb, c_neg_one, dA(rk+1, 0 ), ldda, dF(*kb, 0 ), lddf, c_one, dA(rk+1, *kb), ldda, queue ); } /* Recomputation of difficult columns. */ if ( lsticc > 0 ) { // printf( " -- recompute dnorms --\n" ); magmablas_snrm2_check( m-rk-1, n-*kb, dA(rk+1,*kb), ldda, &vn1[*kb], dlsticcs, queue ); magma_scopymatrix( n-*kb, 1, &vn1[*kb], *kb, &vn2[*kb], *kb, queue ); } magma_free( dAks ); magma_free( dlsticcs ); magma_queue_destroy( queue ); return MAGMA_SUCCESS; } /* magma_slaqps */
/* //////////////////////////////////////////////////////////////////////////// -- Testing sswap, sswapblk, slaswp, slaswpx */ int main( int argc, char** argv) { TESTING_INIT(); float *h_A1, *h_A2; float *h_R1, *h_R2; magmaFloat_ptr d_A1, d_A2; // row-major and column-major performance real_Double_t row_perf0 = MAGMA_D_NAN, col_perf0 = MAGMA_D_NAN; real_Double_t row_perf1 = MAGMA_D_NAN, col_perf1 = MAGMA_D_NAN; real_Double_t row_perf2 = MAGMA_D_NAN, col_perf2 = MAGMA_D_NAN; real_Double_t row_perf4 = MAGMA_D_NAN; real_Double_t row_perf5 = MAGMA_D_NAN, col_perf5 = MAGMA_D_NAN; real_Double_t row_perf6 = MAGMA_D_NAN, col_perf6 = MAGMA_D_NAN; real_Double_t row_perf7 = MAGMA_D_NAN; real_Double_t cpu_perf = MAGMA_D_NAN; real_Double_t time, gbytes; magma_int_t N, lda, ldda, nb, j; magma_int_t ione = 1; magma_int_t *ipiv, *ipiv2; magmaInt_ptr d_ipiv; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); magma_queue_t queue = 0; printf(" %8s sswap sswap sswapblk slaswp slaswp2 slaswpx scopymatrix CPU (all in )\n", g_platform_str ); printf(" N nb row-maj/col-maj row-maj/col-maj row-maj/col-maj row-maj row-maj row-maj/col-maj row-blk/col-blk slaswp (GByte/s)\n"); printf("=========================================================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { // For an N x N matrix, swap nb rows or nb columns using various methods. // Each test is assigned one bit in the 'check' bitmask; bit=1 indicates failure. // The variable 'shift' keeps track of which bit is for current test int shift = 1; int check = 0; N = opts.nsize[itest]; lda = N; ldda = ((N+31)/32)*32; nb = (opts.nb > 0 ? opts.nb : magma_get_sgetrf_nb( N )); nb = min( N, nb ); // each swap does 2N loads and 2N stores, for nb swaps gbytes = sizeof(float) * 4.*N*nb / 1e9; TESTING_MALLOC_PIN( h_A1, float, lda*N ); TESTING_MALLOC_PIN( h_A2, float, lda*N ); TESTING_MALLOC_PIN( h_R1, float, lda*N ); TESTING_MALLOC_PIN( h_R2, float, lda*N ); TESTING_MALLOC_CPU( ipiv, magma_int_t, nb ); TESTING_MALLOC_CPU( ipiv2, magma_int_t, nb ); TESTING_MALLOC_DEV( d_ipiv, magma_int_t, nb ); TESTING_MALLOC_DEV( d_A1, float, ldda*N ); TESTING_MALLOC_DEV( d_A2, float, ldda*N ); // getrf always makes ipiv[j] >= j+1, where ipiv is one based and j is zero based // some implementations (e.g., MacOS dlaswp) assume this for( j=0; j < nb; j++ ) { ipiv[j] = (rand() % (N-j)) + j + 1; assert( ipiv[j] >= j+1 ); assert( ipiv[j] <= N ); } /* ===================================================================== * cublas / clBLAS / Xeon Phi sswap, row-by-row (2 matrices) */ /* Row Major */ init_matrix( N, N, h_A1, lda, 0 ); init_matrix( N, N, h_A2, lda, 100 ); magma_ssetmatrix( N, N, h_A1, lda, d_A1, ldda ); magma_ssetmatrix( N, N, h_A2, lda, d_A2, ldda ); time = magma_sync_wtime( queue ); for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { #ifdef HAVE_CUBLAS cublasSswap( opts.handle, N, d_A1+ldda*j, 1, d_A2+ldda*(ipiv[j]-1), 1 ); #else magma_sswap( N, d_A1, ldda*j, 1, d_A2, ldda*(ipiv[j]-1), 1, opts.queue ); #endif } } time = magma_sync_wtime( queue ) - time; row_perf0 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_sswap( &N, h_A1+lda*j, &ione, h_A2+lda*(ipiv[j]-1), &ione); } } magma_sgetmatrix( N, N, d_A1, ldda, h_R1, lda ); magma_sgetmatrix( N, N, d_A2, ldda, h_R2, lda ); check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) || diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift; shift *= 2; /* Column Major */ init_matrix( N, N, h_A1, lda, 0 ); init_matrix( N, N, h_A2, lda, 100 ); magma_ssetmatrix( N, N, h_A1, lda, d_A1, ldda ); magma_ssetmatrix( N, N, h_A2, lda, d_A2, ldda ); time = magma_sync_wtime( queue ); for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { #ifdef HAVE_CUBLAS cublasSswap( opts.handle, N, d_A1+j, ldda, d_A2+ipiv[j]-1, ldda ); #else magma_sswap( N, d_A1, j, ldda, d_A2, ipiv[j]-1, ldda, opts.queue ); #endif } } time = magma_sync_wtime( queue ) - time; col_perf0 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_sswap( &N, h_A1+j, &lda, h_A2+(ipiv[j]-1), &lda); } } magma_sgetmatrix( N, N, d_A1, ldda, h_R1, lda ); magma_sgetmatrix( N, N, d_A2, ldda, h_R2, lda ); check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) || diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift; shift *= 2; /* ===================================================================== * sswap, row-by-row (2 matrices) */ /* Row Major */ init_matrix( N, N, h_A1, lda, 0 ); init_matrix( N, N, h_A2, lda, 100 ); magma_ssetmatrix( N, N, h_A1, lda, d_A1, ldda ); magma_ssetmatrix( N, N, h_A2, lda, d_A2, ldda ); time = magma_sync_wtime( queue ); for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { magmablas_sswap( N, d_A1+ldda*j, 1, d_A2+ldda*(ipiv[j]-1), 1); } } time = magma_sync_wtime( queue ) - time; row_perf1 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_sswap( &N, h_A1+lda*j, &ione, h_A2+lda*(ipiv[j]-1), &ione); } } magma_sgetmatrix( N, N, d_A1, ldda, h_R1, lda ); magma_sgetmatrix( N, N, d_A2, ldda, h_R2, lda ); check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) || diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift; shift *= 2; /* Column Major */ init_matrix( N, N, h_A1, lda, 0 ); init_matrix( N, N, h_A2, lda, 100 ); magma_ssetmatrix( N, N, h_A1, lda, d_A1, ldda ); magma_ssetmatrix( N, N, h_A2, lda, d_A2, ldda ); time = magma_sync_wtime( queue ); for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { magmablas_sswap( N, d_A1+j, ldda, d_A2+ipiv[j]-1, ldda ); } } time = magma_sync_wtime( queue ) - time; col_perf1 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_sswap( &N, h_A1+j, &lda, h_A2+(ipiv[j]-1), &lda); } } magma_sgetmatrix( N, N, d_A1, ldda, h_R1, lda ); magma_sgetmatrix( N, N, d_A2, ldda, h_R2, lda ); check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) || diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift; shift *= 2; /* ===================================================================== * sswapblk, blocked version (2 matrices) */ #ifdef HAVE_CUBLAS /* Row Major */ init_matrix( N, N, h_A1, lda, 0 ); init_matrix( N, N, h_A2, lda, 100 ); magma_ssetmatrix( N, N, h_A1, lda, d_A1, ldda ); magma_ssetmatrix( N, N, h_A2, lda, d_A2, ldda ); time = magma_sync_wtime( queue ); magmablas_sswapblk( MagmaRowMajor, N, d_A1, ldda, d_A2, ldda, 1, nb, ipiv, 1, 0); time = magma_sync_wtime( queue ) - time; row_perf2 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_sswap( &N, h_A1+lda*j, &ione, h_A2+lda*(ipiv[j]-1), &ione); } } magma_sgetmatrix( N, N, d_A1, ldda, h_R1, lda ); magma_sgetmatrix( N, N, d_A2, ldda, h_R2, lda ); check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) || diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift; shift *= 2; /* Column Major */ init_matrix( N, N, h_A1, lda, 0 ); init_matrix( N, N, h_A2, lda, 100 ); magma_ssetmatrix( N, N, h_A1, lda, d_A1, ldda ); magma_ssetmatrix( N, N, h_A2, lda, d_A2, ldda ); time = magma_sync_wtime( queue ); magmablas_sswapblk( MagmaColMajor, N, d_A1, ldda, d_A2, ldda, 1, nb, ipiv, 1, 0); time = magma_sync_wtime( queue ) - time; col_perf2 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_sswap( &N, h_A1+j, &lda, h_A2+(ipiv[j]-1), &lda); } } magma_sgetmatrix( N, N, d_A1, ldda, h_R1, lda ); magma_sgetmatrix( N, N, d_A2, ldda, h_R2, lda ); check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) || diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift; shift *= 2; #endif /* ===================================================================== * LAPACK-style slaswp (1 matrix) */ #ifdef HAVE_CUBLAS /* Row Major */ init_matrix( N, N, h_A1, lda, 0 ); magma_ssetmatrix( N, N, h_A1, lda, d_A1, ldda ); time = magma_sync_wtime( queue ); magmablas_slaswp( N, d_A1, ldda, 1, nb, ipiv, 1); time = magma_sync_wtime( queue ) - time; row_perf4 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_sswap( &N, h_A1+lda*j, &ione, h_A1+lda*(ipiv[j]-1), &ione); } } magma_sgetmatrix( N, N, d_A1, ldda, h_R1, lda ); check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift; shift *= 2; #endif /* ===================================================================== * LAPACK-style slaswp (1 matrix) - d_ipiv on GPU */ #ifdef HAVE_CUBLAS /* Row Major */ init_matrix( N, N, h_A1, lda, 0 ); magma_ssetmatrix( N, N, h_A1, lda, d_A1, ldda ); time = magma_sync_wtime( queue ); magma_setvector( nb, sizeof(magma_int_t), ipiv, 1, d_ipiv, 1 ); magmablas_slaswp2( N, d_A1, ldda, 1, nb, d_ipiv, 1 ); time = magma_sync_wtime( queue ) - time; row_perf7 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_sswap( &N, h_A1+lda*j, &ione, h_A1+lda*(ipiv[j]-1), &ione); } } magma_sgetmatrix( N, N, d_A1, ldda, h_R1, lda ); check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift; shift *= 2; #endif /* ===================================================================== * LAPACK-style slaswpx (extended for row- and col-major) (1 matrix) */ #ifdef HAVE_CUBLAS /* Row Major */ init_matrix( N, N, h_A1, lda, 0 ); magma_ssetmatrix( N, N, h_A1, lda, d_A1, ldda ); time = magma_sync_wtime( queue ); magmablas_slaswpx( N, d_A1, ldda, 1, 1, nb, ipiv, 1); time = magma_sync_wtime( queue ) - time; row_perf5 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_sswap( &N, h_A1+lda*j, &ione, h_A1+lda*(ipiv[j]-1), &ione); } } magma_sgetmatrix( N, N, d_A1, ldda, h_R1, lda ); check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift; shift *= 2; /* Col Major */ init_matrix( N, N, h_A1, lda, 0 ); magma_ssetmatrix( N, N, h_A1, lda, d_A1, ldda ); time = magma_sync_wtime( queue ); magmablas_slaswpx( N, d_A1, 1, ldda, 1, nb, ipiv, 1); time = magma_sync_wtime( queue ) - time; col_perf5 = gbytes / time; #endif /* LAPACK swap on CPU for comparison */ time = magma_wtime(); lapackf77_slaswp( &N, h_A1, &lda, &ione, &nb, ipiv, &ione); time = magma_wtime() - time; cpu_perf = gbytes / time; #ifdef HAVE_CUBLAS magma_sgetmatrix( N, N, d_A1, ldda, h_R1, lda ); check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift; shift *= 2; #endif /* ===================================================================== * Copy matrix. */ time = magma_sync_wtime( queue ); magma_scopymatrix( N, nb, d_A1, ldda, d_A2, ldda ); time = magma_sync_wtime( queue ) - time; // copy reads 1 matrix and writes 1 matrix, so has half gbytes of swap col_perf6 = 0.5 * gbytes / time; time = magma_sync_wtime( queue ); magma_scopymatrix( nb, N, d_A1, ldda, d_A2, ldda ); time = magma_sync_wtime( queue ) - time; // copy reads 1 matrix and writes 1 matrix, so has half gbytes of swap row_perf6 = 0.5 * gbytes / time; printf("%5d %3d %6.2f%c/ %6.2f%c %6.2f%c/ %6.2f%c %6.2f%c/ %6.2f%c %6.2f%c %6.2f%c %6.2f%c/ %6.2f%c %6.2f / %6.2f %6.2f %10s\n", (int) N, (int) nb, row_perf0, ((check & 0x001) != 0 ? '*' : ' '), col_perf0, ((check & 0x002) != 0 ? '*' : ' '), row_perf1, ((check & 0x004) != 0 ? '*' : ' '), col_perf1, ((check & 0x008) != 0 ? '*' : ' '), row_perf2, ((check & 0x010) != 0 ? '*' : ' '), col_perf2, ((check & 0x020) != 0 ? '*' : ' '), row_perf4, ((check & 0x040) != 0 ? '*' : ' '), row_perf7, ((check & 0x080) != 0 ? '*' : ' '), row_perf5, ((check & 0x100) != 0 ? '*' : ' '), col_perf5, ((check & 0x200) != 0 ? '*' : ' '), row_perf6, col_perf6, cpu_perf, (check == 0 ? "ok" : "* failed") ); status += ! (check == 0); TESTING_FREE_PIN( h_A1 ); TESTING_FREE_PIN( h_A2 ); TESTING_FREE_PIN( h_R1 ); TESTING_FREE_PIN( h_R2 ); TESTING_FREE_CPU( ipiv ); TESTING_FREE_CPU( ipiv2 ); TESTING_FREE_DEV( d_ipiv ); TESTING_FREE_DEV( d_A1 ); TESTING_FREE_DEV( d_A2 ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }