/** Purpose ------- ZSSSSM applies the LU factorization update from a complex matrix formed by a lower triangular IB-by-K tile L1 on top of a M2-by-K tile L2 to a second complex matrix formed by a M1-by-N1 tile A1 on top of a M2-by-N2 tile A2 (N1 == N2). This is the right-looking Level 2.5 BLAS version of the algorithm. Arguments --------- @param[in] m1 INTEGER The number of rows of the matrix A1. M1 >= 0. @param[in] n1 INTEGER The number of columns of the matrix A1. N1 >= 0. @param[in] m2 INTEGER The number of rows of the matrix A2. M2 >= 0. @param[in] n2 INTEGER The number of columns of the matrix A2. N2 >= 0. @param[in] k INTEGER The number of columns of the matrix L1 and L2. K >= 0. @param[in] ib INTEGER The inner-blocking size. IB >= 0. @param[in,out] dA1 COMPLEX_16 array, dimension(LDDA1, N), on gpu. On entry, the M1-by-N1 tile dA1. On exit, dA1 is updated by the application of dL (dL1 dL2). @param[in] ldda1 INTEGER The leading dimension of the array dA1. LDDA1 >= max(1,M1). @param[in,out] dA2 COMPLEX_16 array, dimension(LDDA2, N), on gpu. On entry, the M2-by-N2 tile dA2. On exit, dA2 is updated by the application of dL (dL1 dL2). @param[in] ldda2 INTEGER The leading dimension of the array dA2. LDDA2 >= max(1,M2). @param[in] dL1 COMPLEX_16 array, dimension(LDDL1, K), on gpu. The inverse of the IB-by-K lower triangular tile as returned by ZTSTRF. @param[in] lddl1 INTEGER The leading dimension of the array L1. LDDL1 >= max(1,2*IB). @param[in] dL2 COMPLEX_16 array, dimension(LDDL2, K) The M2-by-K tile as returned by ZTSTRF. @param[in] lddl2 INTEGER The leading dimension of the array L2. LDDL2 >= max(1,M2). @param[in] ipiv INTEGER array on the cpu. The pivot indices array of size K as returned by ZTSTRF @ingroup magma_zgesv_tile ********************************************************************/ extern "C" magma_int_t magma_zssssm_gpu( magma_order_t order, magma_int_t m1, magma_int_t n1, magma_int_t m2, magma_int_t n2, magma_int_t k, magma_int_t ib, magmaDoubleComplex_ptr dA1, magma_int_t ldda1, magmaDoubleComplex_ptr dA2, magma_int_t ldda2, magmaDoubleComplex_ptr dL1, magma_int_t lddl1, magmaDoubleComplex_ptr dL2, magma_int_t lddl2, magma_int_t *ipiv, magma_int_t *info) { #define A1T(i,j) (dA1T + (i)*ldda1 + (j)) #define A2T(i,j) (dA2T + (i)*ldda2 + (j)) #define L1(i) (dL1 + (i)*lddl1 ) #define L2(i,j) (dL2 + (i)*lddl2i + (j)*lddl2j) magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; int ip, ii, sb; magmaDoubleComplex_ptr dA1T, dA2T; magma_trans_t transL; int lddl2i, lddl2j; MAGMA_UNUSED( ip ); // used only if NOSWAPBLK /* Check input arguments */ *info = 0; if (m1 < 0) { *info = -1; } else if (n1 < 0) { *info = -2; } else if (m2 < 0) { *info = -3; } else if (n2 < 0) { *info = -4; } else if (k < 0) { *info = -5; } else if (ib < 0) { *info = -6; } else if (ldda1 < max(1,m1)) { *info = -8; } else if (ldda2 < max(1,m2)) { *info = -10; } else if (lddl1 < max(1,ib)) { *info = -12; } else if (lddl2 < max(1,m2)) { *info = -14; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return */ if ((m1 == 0) || (n1 == 0) || (m2 == 0) || (n2 == 0) || (k == 0) || (ib == 0)) return *info; if ( order == MagmaColMajor ) { magmablas_zgetmo_in( dA1, dA1T, ldda1, m1, n1 ); magmablas_zgetmo_in( dA2, dA2T, ldda2, m2, n2 ); transL = MagmaTrans; lddl2i = 1; lddl2j = lddl2; } else { dA1T = dA1; dA2T = dA2; transL = MagmaNoTrans; lddl2i = lddl2; lddl2j = 1; } ip = 0; for( ii=0; ii < k; ii += ib ) { sb = min( k-ii, ib); #ifndef NOSWAPBLK magmablas_zswapblk( MagmaRowMajor, n1, A1T(0, 0), ldda1, A2T(0, 0), ldda2, ii+1, ii+ib, ipiv, 1, m1 ); #else { int im; for (i=0; i < ib; i++) { im = ipiv[ip]-1; if (im != (ii+i)) { im = im - m1; assert( (im >= 0) && (im < m1) && (im < m2) ); magmablas_zswap( n1, A1T(ii+i, 0), 1, A2T(im, 0), 1 ); } ip++; } } #endif #ifndef WITHOUTTRTRI /* Lower, Trans, because L1 is not transposed */ magma_ztrmm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit, n1, sb, c_one, L1( ii), lddl1, A1T(ii, 0), ldda1); #else /* Lower, Trans, because L1 is not transposed */ magma_ztrsm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit, n1, sb, c_one, L1( ii), lddl1, A1T(ii, 0), ldda1); #endif /* Second parameter is trans because L2 is not transposed */ magma_zgemm( MagmaNoTrans, transL, n2, m2, sb, c_neg_one, A1T(ii, 0), ldda1, L2( 0, ii), lddl2, c_one, A2T(0, 0 ), ldda2 ); } if ( order == MagmaColMajor ) { magmablas_zgetmo_out( dA1, dA1T, ldda1, m1, n1 ); magmablas_zgetmo_out( dA2, dA2T, ldda2, m2, n2 ); } return *info; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zswap, zswapblk, zlaswp, zlaswpx */ int main( int argc, char** argv) { TESTING_INIT(); // OpenCL use: cl_mem , offset (two arguments); // else use: pointer + offset (one argument). #ifdef HAVE_clBLAS #define d_A1(i_, j_) d_A1 , (i_) + (j_)*ldda #define d_A2(i_, j_) d_A2 , (i_) + (j_)*ldda #define d_ipiv(i_) d_ipiv , (i_) #else #define d_A1(i_, j_) (d_A1 + (i_) + (j_)*ldda) #define d_A2(i_, j_) (d_A2 + (i_) + (j_)*ldda) #define d_ipiv(i_) (d_ipiv + (i_)) #endif #define h_A1(i_, j_) (h_A1 + (i_) + (j_)*lda) #define h_A2(i_, j_) (h_A2 + (i_) + (j_)*lda) magmaDoubleComplex *h_A1, *h_A2; magmaDoubleComplex *h_R1, *h_R2; magmaDoubleComplex_ptr d_A1, d_A2; // row-major and column-major performance real_Double_t row_perf0 = MAGMA_D_NAN, col_perf0 = MAGMA_D_NAN; real_Double_t row_perf1 = MAGMA_D_NAN, col_perf1 = MAGMA_D_NAN; real_Double_t row_perf2 = MAGMA_D_NAN, col_perf2 = MAGMA_D_NAN; real_Double_t row_perf4 = MAGMA_D_NAN; real_Double_t row_perf5 = MAGMA_D_NAN, col_perf5 = MAGMA_D_NAN; real_Double_t row_perf6 = MAGMA_D_NAN, col_perf6 = MAGMA_D_NAN; real_Double_t row_perf7 = MAGMA_D_NAN; real_Double_t cpu_perf = MAGMA_D_NAN; real_Double_t time, gbytes; magma_int_t N, lda, ldda, nb, j; magma_int_t ione = 1; magma_int_t *ipiv, *ipiv2; magmaInt_ptr d_ipiv; magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); printf("%% %8s zswap zswap zswapblk zlaswp zlaswp2 zlaswpx zcopymatrix CPU (all in )\n", g_platform_str ); printf("%% N nb row-maj/col-maj row-maj/col-maj row-maj/col-maj row-maj row-maj row-maj/col-maj row-blk/col-blk zlaswp (GByte/s)\n"); printf("%%========================================================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { // For an N x N matrix, swap nb rows or nb columns using various methods. // Each test is assigned one bit in the 'check' bitmask; bit=1 indicates failure. // The variable 'shift' keeps track of which bit is for current test magma_int_t shift = 1; magma_int_t check = 0; N = opts.nsize[itest]; lda = N; ldda = magma_roundup( N, opts.align ); // multiple of 32 by default nb = (opts.nb > 0 ? opts.nb : magma_get_zgetrf_nb( N, N )); nb = min( N, nb ); // each swap does 2N loads and 2N stores, for nb swaps gbytes = sizeof(magmaDoubleComplex) * 4.*N*nb / 1e9; TESTING_MALLOC_PIN( h_A1, magmaDoubleComplex, lda*N ); TESTING_MALLOC_PIN( h_A2, magmaDoubleComplex, lda*N ); TESTING_MALLOC_PIN( h_R1, magmaDoubleComplex, lda*N ); TESTING_MALLOC_PIN( h_R2, magmaDoubleComplex, lda*N ); TESTING_MALLOC_CPU( ipiv, magma_int_t, nb ); TESTING_MALLOC_CPU( ipiv2, magma_int_t, nb ); TESTING_MALLOC_DEV( d_ipiv, magma_int_t, nb ); TESTING_MALLOC_DEV( d_A1, magmaDoubleComplex, ldda*N ); TESTING_MALLOC_DEV( d_A2, magmaDoubleComplex, ldda*N ); // getrf always makes ipiv[j] >= j+1, where ipiv is one based and j is zero based // some implementations (e.g., MacOS dlaswp) assume this for( j=0; j < nb; j++ ) { ipiv[j] = (rand() % (N-j)) + j + 1; assert( ipiv[j] >= j+1 ); assert( ipiv[j] <= N ); } /* ===================================================================== * cublas / clBLAS / Xeon Phi zswap, row-by-row (2 matrices) */ /* Row Major */ init_matrix( N, N, h_A1, lda, 0 ); init_matrix( N, N, h_A2, lda, 100 ); magma_zsetmatrix( N, N, h_A1, lda, d_A1(0,0), ldda ); magma_zsetmatrix( N, N, h_A2, lda, d_A2(0,0), ldda ); magmablasSetKernelStream( opts.queue ); // opts.handle also uses opts.queue time = magma_sync_wtime( opts.queue ); for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { #ifdef HAVE_CUBLAS cublasZswap( opts.handle, N, d_A1(0,j), 1, d_A2(0,ipiv[j]-1), 1 ); #else magma_zswap( N, d_A1(0,j), 1, d_A2(0,ipiv[j]-1), 1, opts.queue ); #endif } } time = magma_sync_wtime( opts.queue ) - time; row_perf0 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_zswap( &N, h_A1(0,j), &ione, h_A2(0,ipiv[j]-1), &ione); } } magma_zgetmatrix( N, N, d_A1(0,0), ldda, h_R1, lda ); magma_zgetmatrix( N, N, d_A2(0,0), ldda, h_R2, lda ); check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) || diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift; shift *= 2; /* Column Major */ init_matrix( N, N, h_A1, lda, 0 ); init_matrix( N, N, h_A2, lda, 100 ); magma_zsetmatrix( N, N, h_A1, lda, d_A1(0,0), ldda ); magma_zsetmatrix( N, N, h_A2, lda, d_A2(0,0), ldda ); time = magma_sync_wtime( opts.queue ); for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { #ifdef HAVE_CUBLAS cublasZswap( opts.handle, N, d_A1(j,0), ldda, d_A2(ipiv[j]-1,0), ldda ); #else magma_zswap( N, d_A1(j,0), ldda, d_A2(ipiv[j]-1,0), ldda, opts.queue ); #endif } } time = magma_sync_wtime( opts.queue ) - time; col_perf0 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_zswap( &N, h_A1+j, &lda, h_A2+(ipiv[j]-1), &lda); } } magma_zgetmatrix( N, N, d_A1(0,0), ldda, h_R1, lda ); magma_zgetmatrix( N, N, d_A2(0,0), ldda, h_R2, lda ); check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) || diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift; shift *= 2; /* ===================================================================== * zswap, row-by-row (2 matrices) */ /* Row Major */ init_matrix( N, N, h_A1, lda, 0 ); init_matrix( N, N, h_A2, lda, 100 ); magma_zsetmatrix( N, N, h_A1, lda, d_A1(0,0), ldda ); magma_zsetmatrix( N, N, h_A2, lda, d_A2(0,0), ldda ); time = magma_sync_wtime( opts.queue ); for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { magmablas_zswap( N, d_A1(0,j), 1, d_A2(0,ipiv[j]-1), 1); } } time = magma_sync_wtime( opts.queue ) - time; row_perf1 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_zswap( &N, h_A1(0,j), &ione, h_A2(0,ipiv[j]-1), &ione); } } magma_zgetmatrix( N, N, d_A1(0,0), ldda, h_R1, lda ); magma_zgetmatrix( N, N, d_A2(0,0), ldda, h_R2, lda ); check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) || diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift; shift *= 2; /* Column Major */ init_matrix( N, N, h_A1, lda, 0 ); init_matrix( N, N, h_A2, lda, 100 ); magma_zsetmatrix( N, N, h_A1, lda, d_A1(0,0), ldda ); magma_zsetmatrix( N, N, h_A2, lda, d_A2(0,0), ldda ); time = magma_sync_wtime( opts.queue ); for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { magmablas_zswap( N, d_A1(j,0), ldda, d_A2(ipiv[j]-1,0), ldda ); } } time = magma_sync_wtime( opts.queue ) - time; col_perf1 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_zswap( &N, h_A1+j, &lda, h_A2+(ipiv[j]-1), &lda); } } magma_zgetmatrix( N, N, d_A1(0,0), ldda, h_R1, lda ); magma_zgetmatrix( N, N, d_A2(0,0), ldda, h_R2, lda ); check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) || diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift; shift *= 2; /* ===================================================================== * zswapblk, blocked version (2 matrices) */ #ifdef HAVE_CUBLAS /* Row Major */ init_matrix( N, N, h_A1, lda, 0 ); init_matrix( N, N, h_A2, lda, 100 ); magma_zsetmatrix( N, N, h_A1, lda, d_A1(0,0), ldda ); magma_zsetmatrix( N, N, h_A2, lda, d_A2(0,0), ldda ); time = magma_sync_wtime( opts.queue ); magmablas_zswapblk( MagmaRowMajor, N, d_A1(0,0), ldda, d_A2(0,0), ldda, 1, nb, ipiv, 1, 0); time = magma_sync_wtime( opts.queue ) - time; row_perf2 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_zswap( &N, h_A1(0,j), &ione, h_A2(0,ipiv[j]-1), &ione); } } magma_zgetmatrix( N, N, d_A1(0,0), ldda, h_R1, lda ); magma_zgetmatrix( N, N, d_A2(0,0), ldda, h_R2, lda ); check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) || diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift; shift *= 2; /* Column Major */ init_matrix( N, N, h_A1, lda, 0 ); init_matrix( N, N, h_A2, lda, 100 ); magma_zsetmatrix( N, N, h_A1, lda, d_A1(0,0), ldda ); magma_zsetmatrix( N, N, h_A2, lda, d_A2(0,0), ldda ); time = magma_sync_wtime( opts.queue ); magmablas_zswapblk( MagmaColMajor, N, d_A1(0,0), ldda, d_A2(0,0), ldda, 1, nb, ipiv, 1, 0); time = magma_sync_wtime( opts.queue ) - time; col_perf2 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_zswap( &N, h_A1(j,0), &lda, h_A2(ipiv[j]-1,0), &lda); } } magma_zgetmatrix( N, N, d_A1(0,0), ldda, h_R1, lda ); magma_zgetmatrix( N, N, d_A2(0,0), ldda, h_R2, lda ); check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) || diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift; shift *= 2; #endif /* ===================================================================== * LAPACK-style zlaswp (1 matrix) */ /* Row Major */ init_matrix( N, N, h_A1, lda, 0 ); magma_zsetmatrix( N, N, h_A1, lda, d_A1(0,0), ldda ); time = magma_sync_wtime( opts.queue ); magmablas_zlaswp( N, d_A1(0,0), ldda, 1, nb, ipiv, 1); time = magma_sync_wtime( opts.queue ) - time; row_perf4 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_zswap( &N, h_A1(0,j), &ione, h_A1(0,ipiv[j]-1), &ione); } } magma_zgetmatrix( N, N, d_A1(0,0), ldda, h_R1, lda ); check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift; shift *= 2; /* ===================================================================== * LAPACK-style zlaswp (1 matrix) - d_ipiv on GPU */ /* Row Major */ init_matrix( N, N, h_A1, lda, 0 ); magma_zsetmatrix( N, N, h_A1, lda, d_A1(0,0), ldda ); time = magma_sync_wtime( opts.queue ); magma_setvector( nb, sizeof(magma_int_t), ipiv, 1, d_ipiv(0), 1 ); magmablas_zlaswp2( N, d_A1(0,0), ldda, 1, nb, d_ipiv(0), 1 ); time = magma_sync_wtime( opts.queue ) - time; row_perf7 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_zswap( &N, h_A1(0,j), &ione, h_A1(0,ipiv[j]-1), &ione); } } magma_zgetmatrix( N, N, d_A1(0,0), ldda, h_R1, lda ); check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift; shift *= 2; /* ===================================================================== * LAPACK-style zlaswpx (extended for row- and col-major) (1 matrix) */ /* Row Major */ init_matrix( N, N, h_A1, lda, 0 ); magma_zsetmatrix( N, N, h_A1, lda, d_A1(0,0), ldda ); time = magma_sync_wtime( opts.queue ); magmablas_zlaswpx( N, d_A1(0,0), ldda, 1, 1, nb, ipiv, 1); time = magma_sync_wtime( opts.queue ) - time; row_perf5 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_zswap( &N, h_A1(0,j), &ione, h_A1(0,ipiv[j]-1), &ione); } } magma_zgetmatrix( N, N, d_A1(0,0), ldda, h_R1, lda ); check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift; shift *= 2; /* Col Major */ init_matrix( N, N, h_A1, lda, 0 ); magma_zsetmatrix( N, N, h_A1, lda, d_A1(0,0), ldda ); time = magma_sync_wtime( opts.queue ); magmablas_zlaswpx( N, d_A1(0,0), 1, ldda, 1, nb, ipiv, 1); time = magma_sync_wtime( opts.queue ) - time; col_perf5 = gbytes / time; /* LAPACK swap on CPU for comparison */ time = magma_wtime(); lapackf77_zlaswp( &N, h_A1, &lda, &ione, &nb, ipiv, &ione); time = magma_wtime() - time; cpu_perf = gbytes / time; magma_zgetmatrix( N, N, d_A1(0,0), ldda, h_R1, lda ); check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift; shift *= 2; /* ===================================================================== * Copy matrix. */ time = magma_sync_wtime( opts.queue ); magma_zcopymatrix( N, nb, d_A1(0,0), ldda, d_A2(0,0), ldda ); time = magma_sync_wtime( opts.queue ) - time; // copy reads 1 matrix and writes 1 matrix, so has half gbytes of swap col_perf6 = 0.5 * gbytes / time; time = magma_sync_wtime( opts.queue ); magma_zcopymatrix( nb, N, d_A1(0,0), ldda, d_A2(0,0), ldda ); time = magma_sync_wtime( opts.queue ) - time; // copy reads 1 matrix and writes 1 matrix, so has half gbytes of swap row_perf6 = 0.5 * gbytes / time; printf("%5d %3d %6.2f%c/ %6.2f%c %6.2f%c/ %6.2f%c %6.2f%c/ %6.2f%c %6.2f%c %6.2f%c %6.2f%c/ %6.2f%c %6.2f / %6.2f %6.2f %10s\n", (int) N, (int) nb, row_perf0, ((check & 0x001) != 0 ? '*' : ' '), col_perf0, ((check & 0x002) != 0 ? '*' : ' '), row_perf1, ((check & 0x004) != 0 ? '*' : ' '), col_perf1, ((check & 0x008) != 0 ? '*' : ' '), row_perf2, ((check & 0x010) != 0 ? '*' : ' '), col_perf2, ((check & 0x020) != 0 ? '*' : ' '), row_perf4, ((check & 0x040) != 0 ? '*' : ' '), row_perf7, ((check & 0x080) != 0 ? '*' : ' '), row_perf5, ((check & 0x100) != 0 ? '*' : ' '), col_perf5, ((check & 0x200) != 0 ? '*' : ' '), row_perf6, col_perf6, cpu_perf, (check == 0 ? "ok" : "* failed") ); status += ! (check == 0); TESTING_FREE_PIN( h_A1 ); TESTING_FREE_PIN( h_A2 ); TESTING_FREE_PIN( h_R1 ); TESTING_FREE_PIN( h_R2 ); TESTING_FREE_CPU( ipiv ); TESTING_FREE_CPU( ipiv2 ); TESTING_FREE_DEV( d_ipiv ); TESTING_FREE_DEV( d_A1 ); TESTING_FREE_DEV( d_A2 ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
/** Purpose ------- ZSSSSM applies the LU factorization update from a complex matrix formed by a lower triangular IB-by-K tile L1 on top of a M2-by-K tile L2 to a second complex matrix formed by a M1-by-N1 tile A1 on top of a M2-by-N2 tile A2 (N1 == N2). This is the right-looking Level 2.5 BLAS version of the algorithm. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in] ib INTEGER The inner-blocking size. IB >= 0. @param[in] NB INTEGER The blocking size. NB >= 0. @param[in,out] hU COMPLEX_16 array, dimension(LDHU, N), on cpu. On entry, the NB-by-N upper triangular tile hU. On exit, the content is incomplete. Shouldn't be used. @param[in] ldhu INTEGER The leading dimension of the array hU. LDHU >= max(1,NB). @param[in,out] dU COMPLEX_16 array, dimension(LDDU, N), on gpu. On entry, the NB-by-N upper triangular tile dU identical to hU. On exit, the new factor U from the factorization. @param[in] lddu INTEGER The leading dimension of the array dU. LDDU >= max(1,NB). @param[in,out] hA COMPLEX_16 array, dimension(LDHA, N), on cpu. On entry, only the M-by-IB first panel needs to be identical to dA(1..M, 1..IB). On exit, the content is incomplete. Shouldn't be used. @param[in] ldha INTEGER The leading dimension of the array hA. LDHA >= max(1,M). @param[in,out] dA COMPLEX_16 array, dimension(LDDA, N), on gpu. On entry, the M-by-N tile to be factored. On exit, the factor L from the factorization @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,M). @param[out] hL COMPLEX_16 array, dimension(LDHL, K), on vpu. On exit, contains in the upper part the IB-by-K lower triangular tile, and in the lower part IB-by-K the inverse of the top part. @param[in] ldhl INTEGER The leading dimension of the array hL. LDHL >= max(1,2*IB). @param[out] dL COMPLEX_16 array, dimension(LDDL, K), on gpu. On exit, contains in the upper part the IB-by-K lower triangular tile, and in the lower part IB-by-K the inverse of the top part. @param[in] lddl INTEGER The leading dimension of the array dL. LDDL >= max(1,2*IB). @param[out] hWORK COMPLEX_16 array, dimension(LDHWORK, 2*IB), on cpu. Workspace. @param[in] ldhwork INTEGER The leading dimension of the array hWORK. LDHWORK >= max(NB, 1). @param[out] dWORK COMPLEX_16 array, dimension(LDDWORK, 2*IB), on gpu. Workspace. @param[in] lddwork INTEGER The leading dimension of the array dWORK. LDDWORK >= max(NB, 1). @param[out] ipiv INTEGER array on the cpu. The pivot indices array of size K as returned by ZTSTRF @param[out] info INTEGER - PLASMA_SUCCESS successful exit - < 0 if INFO = -k, the k-th argument had an illegal value - > 0 if INFO = k, U(k,k) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. @ingroup magma_zgesv_tile ********************************************************************/ extern "C" magma_int_t magma_ztstrf_gpu( magma_order_t order, magma_int_t m, magma_int_t n, magma_int_t ib, magma_int_t nb, magmaDoubleComplex *hU, magma_int_t ldhu, magmaDoubleComplex_ptr dU, magma_int_t lddu, magmaDoubleComplex *hA, magma_int_t ldha, magmaDoubleComplex_ptr dA, magma_int_t ldda, magmaDoubleComplex *hL, magma_int_t ldhl, magmaDoubleComplex_ptr dL, magma_int_t lddl, magma_int_t *ipiv, magmaDoubleComplex *hwork, magma_int_t ldhwork, magmaDoubleComplex_ptr dwork, magma_int_t lddwork, magma_int_t *info) { #define UT(i,j) (dUT + (i)*ib*lddu + (j)*ib ) #define AT(i,j) (dAT + (i)*ib*ldda + (j)*ib ) #define L(i) (dL + (i)*ib*lddl ) #define L2(i) (dL2 + (i)*ib*lddl ) #define hU(i,j) (hU + (j)*ib*ldhu + (i)*ib ) #define hA(i,j) (hA + (j)*ib*ldha + (i)*ib ) #define hL(i) (hL + (i)*ib*ldhl ) #define hL2(i) (hL2 + (i)*ib*ldhl ) magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; int iinfo = 0; int maxm, mindim; int i, j, im, s, ip, ii, sb, p = 1; magmaDoubleComplex_ptr dAT, dUT; magmaDoubleComplex_ptr dAp, dUp; #ifndef WITHOUTTRTRI magmaDoubleComplex_ptr dL2 = dL + ib; magmaDoubleComplex *hL2 = hL + ib; p = 2; #endif /* Check input arguments */ *info = 0; if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (ib < 0) { *info = -3; } else if ((lddu < max(1,m)) && (m > 0)) { *info = -6; } else if ((ldda < max(1,m)) && (m > 0)) { *info = -8; } else if ((lddl < max(1,ib)) && (ib > 0)) { *info = -10; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* quick return */ if ((m == 0) || (n == 0) || (ib == 0)) return *info; ip = 0; /* Function Body */ mindim = min(m, n); s = mindim / ib; if ( ib >= mindim ) { /* Use CPU code. */ CORE_ztstrf(m, n, ib, nb, (PLASMA_Complex64_t*)hU, ldhu, (PLASMA_Complex64_t*)hA, ldha, (PLASMA_Complex64_t*)hL, ldhl, ipiv, (PLASMA_Complex64_t*)hwork, ldhwork, info); #ifndef WITHOUTTRTRI CORE_zlacpy( PlasmaUpperLower, mindim, mindim, (PLASMA_Complex64_t*)hL, ldhl, (PLASMA_Complex64_t*)hL2, ldhl ); CORE_ztrtri( PlasmaLower, PlasmaUnit, mindim, (PLASMA_Complex64_t*)hL2, ldhl, info ); if (*info != 0 ) { fprintf(stderr, "ERROR, trtri returned with info = %d\n", *info); } #endif if ( order == MagmaRowMajor ) { magma_zsetmatrix( m, n, hU, ldhu, dwork, lddwork ); magmablas_ztranspose( m, n, dwork, lddwork, dU, lddu ); magma_zsetmatrix( m, n, hA, ldha, dwork, lddwork ); magmablas_ztranspose( m, n, dwork, lddwork, dA, ldda ); } else { magma_zsetmatrix( m, n, hU, ldhu, dU, lddu ); magma_zsetmatrix( m, n, hA, ldha, dA, ldda ); } magma_zsetmatrix( p*ib, n, hL, ldhl, dL, lddl ); } else { /* Use hybrid blocked code. */ maxm = magma_roundup( m, 32 ); if ( order == MagmaColMajor ) { magmablas_zgetmo_in( dU, dUT, lddu, m, n ); magmablas_zgetmo_in( dA, dAT, ldda, m, n ); } else { dUT = dU; dAT = dA; } dAp = dwork; dUp = dAp + ib*lddwork; ip = 0; for( i=0; i < s; i++ ) { ii = i * ib; sb = min(mindim-ii, ib); if ( i > 0 ) { // download i-th panel magmablas_ztranspose( sb, ii, UT(0,i), lddu, dUp, lddu ); magmablas_ztranspose( sb, m, AT(0,i), ldda, dAp, ldda ); magma_zgetmatrix( ii, sb, dUp, lddu, hU(0, i), ldhu ); magma_zgetmatrix( m, sb, dAp, ldda, hA(0, i), ldha ); // make sure that gpu queue is empty //magma_device_sync(); #ifndef WITHOUTTRTRI magma_ztrmm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit, n-(ii+sb), ib, c_one, L2(i-1), lddl, UT(i-1, i+1), lddu); #else magma_ztrsm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit, n-(ii+sb), ib, c_one, L(i-1), lddl, UT(i-1, i+1), lddu); #endif magma_zgemm( MagmaNoTrans, MagmaNoTrans, n-(ii+sb), m, ib, c_neg_one, UT(i-1, i+1), lddu, AT(0, i-1), ldda, c_one, AT(0, i+1), ldda ); } // do the cpu part CORE_ztstrf(m, sb, ib, nb, (PLASMA_Complex64_t*)hU(i, i), ldhu, (PLASMA_Complex64_t*)hA(0, i), ldha, (PLASMA_Complex64_t*)hL(i), ldhl, ipiv+ii, (PLASMA_Complex64_t*)hwork, ldhwork, info); if ( (*info == 0) && (iinfo > 0) ) *info = iinfo + ii; // Need to swap betw U and A #ifndef NOSWAPBLK magmablas_zswapblk( MagmaRowMajor, n-(ii+sb), UT(i, i+1), lddu, AT(0, i+1), ldda, 1, sb, ipiv+ii, 1, nb ); for (j=0; j < ib; j++) { im = ipiv[ip]-1; if ( im == j ) { ipiv[ip] += ii; } ip++; } #else for (j=0; j < ib; j++) { im = ipiv[ip]-1; if ( im != (j) ) { im = im - nb; assert( (im >= 0) && (im < m) ); magmablas_zswap( n-(ii+sb), UT(i, i+1)+j*lddu, 1, AT(0, i+1)+im*ldda, 1 ); } else { ipiv[ip] += ii; } ip++; } #endif #ifndef WITHOUTTRTRI CORE_zlacpy( PlasmaUpperLower, sb, sb, (PLASMA_Complex64_t*)hL(i), ldhl, (PLASMA_Complex64_t*)hL2(i), ldhl ); CORE_ztrtri( PlasmaLower, PlasmaUnit, sb, (PLASMA_Complex64_t*)hL2(i), ldhl, info ); if (*info != 0 ) { fprintf(stderr, "ERROR, trtri returned with info = %d\n", *info); } #endif // upload i-th panel magma_zsetmatrix( sb, sb, hU(i, i), ldhu, dUp, lddu ); magma_zsetmatrix( m, sb, hA(0, i), ldha, dAp, ldda ); magma_zsetmatrix( p*ib, sb, hL(i), ldhl, L(i), lddl ); magmablas_ztranspose( sb, sb, dUp, lddu, UT(i,i), lddu ); magmablas_ztranspose( m, sb, dAp, ldda, AT(0,i), ldda ); // make sure that gpu queue is empty //magma_device_sync(); // do the small non-parallel computations if ( s > (i+1) ) { #ifndef WITHOUTTRTRI magma_ztrmm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit, sb, sb, c_one, L2(i), lddl, UT(i, i+1), lddu); #else magma_ztrsm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit, sb, sb, c_one, L(i), lddl, UT(i, i+1), lddu); #endif magma_zgemm( MagmaNoTrans, MagmaNoTrans, sb, m, sb, c_neg_one, UT(i, i+1), lddu, AT(0, i ), ldda, c_one, AT(0, i+1), ldda ); } else { #ifndef WITHOUTTRTRI magma_ztrmm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit, n-mindim, sb, c_one, L2(i), lddl, UT(i, i+1), lddu); #else magma_ztrsm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit, n-mindim, sb, c_one, L(i), lddl, UT(i, i+1), lddu); #endif magma_zgemm( MagmaNoTrans, MagmaNoTrans, n-mindim, m, sb, c_neg_one, UT(i, i+1), lddu, AT(0, i ), ldda, c_one, AT(0, i+1), ldda ); } } if ( order == MagmaColMajor ) { magmablas_zgetmo_out( dU, dUT, lddu, m, n ); magmablas_zgetmo_out( dA, dAT, ldda, m, n ); } } return *info; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zswap, zswapblk, zpermute, zlaswp, zlaswpx */ int main( int argc, char** argv) { TESTING_INIT(); magmaDoubleComplex *h_A1, *h_A2; magmaDoubleComplex *d_A1, *d_A2; magmaDoubleComplex *h_R1, *h_R2; // row-major and column-major performance real_Double_t row_perf0, col_perf0; real_Double_t row_perf1, col_perf1; real_Double_t row_perf2, col_perf2; real_Double_t row_perf3; real_Double_t row_perf4; real_Double_t row_perf5, col_perf5; real_Double_t row_perf6, col_perf6; real_Double_t row_perf7; real_Double_t cpu_perf; real_Double_t time, gbytes; magma_int_t N, lda, ldda, nb, j; magma_int_t ione = 1; magma_int_t *ipiv, *ipiv2; magma_int_t *d_ipiv; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); magma_queue_t queue = 0; printf(" cublasZswap zswap zswapblk zlaswp zpermute zlaswp2 zlaswpx zcopymatrix CPU (all in )\n"); printf(" N nb row-maj/col-maj row-maj/col-maj row-maj/col-maj row-maj row-maj row-maj row-maj/col-maj row-blk/col-blk zlaswp (GByte/s)\n"); printf("==================================================================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { // For an N x N matrix, swap nb rows or nb columns using various methods. // Each test is assigned one bit in the 'check' bitmask; bit=1 indicates failure. // The variable 'shift' keeps track of which bit is for current test int shift = 1; int check = 0; N = opts.nsize[itest]; lda = N; ldda = ((N+31)/32)*32; nb = (opts.nb > 0 ? opts.nb : magma_get_zgetrf_nb( N )); nb = min( N, nb ); // each swap does 2N loads and 2N stores, for nb swaps gbytes = sizeof(magmaDoubleComplex) * 4.*N*nb / 1e9; TESTING_MALLOC_PIN( h_A1, magmaDoubleComplex, lda*N ); TESTING_MALLOC_PIN( h_A2, magmaDoubleComplex, lda*N ); TESTING_MALLOC_PIN( h_R1, magmaDoubleComplex, lda*N ); TESTING_MALLOC_PIN( h_R2, magmaDoubleComplex, lda*N ); TESTING_MALLOC_CPU( ipiv, magma_int_t, nb ); TESTING_MALLOC_CPU( ipiv2, magma_int_t, nb ); TESTING_MALLOC_DEV( d_ipiv, magma_int_t, nb ); TESTING_MALLOC_DEV( d_A1, magmaDoubleComplex, ldda*N ); TESTING_MALLOC_DEV( d_A2, magmaDoubleComplex, ldda*N ); for( j=0; j < nb; j++ ) { ipiv[j] = (magma_int_t) ((rand()*1.*N) / (RAND_MAX * 1.)) + 1; } /* ===================================================================== * cublasZswap, row-by-row (2 matrices) */ /* Row Major */ init_matrix( N, N, h_A1, lda, 0 ); init_matrix( N, N, h_A2, lda, 100 ); magma_zsetmatrix( N, N, h_A1, lda, d_A1, ldda ); magma_zsetmatrix( N, N, h_A2, lda, d_A2, ldda ); time = magma_sync_wtime( queue ); for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { cublasZswap( N, d_A1+ldda*j, 1, d_A2+ldda*(ipiv[j]-1), 1); } } time = magma_sync_wtime( queue ) - time; row_perf0 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_zswap( &N, h_A1+lda*j, &ione, h_A2+lda*(ipiv[j]-1), &ione); } } magma_zgetmatrix( N, N, d_A1, ldda, h_R1, lda ); magma_zgetmatrix( N, N, d_A2, ldda, h_R2, lda ); check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) || diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift; shift *= 2; /* Column Major */ init_matrix( N, N, h_A1, lda, 0 ); init_matrix( N, N, h_A2, lda, 100 ); magma_zsetmatrix( N, N, h_A1, lda, d_A1, ldda ); magma_zsetmatrix( N, N, h_A2, lda, d_A2, ldda ); time = magma_sync_wtime( queue ); for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { cublasZswap( N, d_A1+j, ldda, d_A2+ipiv[j]-1, ldda); } } time = magma_sync_wtime( queue ) - time; col_perf0 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_zswap( &N, h_A1+j, &lda, h_A2+(ipiv[j]-1), &lda); } } magma_zgetmatrix( N, N, d_A1, ldda, h_R1, lda ); magma_zgetmatrix( N, N, d_A2, ldda, h_R2, lda ); check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) || diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift; shift *= 2; /* ===================================================================== * zswap, row-by-row (2 matrices) */ /* Row Major */ init_matrix( N, N, h_A1, lda, 0 ); init_matrix( N, N, h_A2, lda, 100 ); magma_zsetmatrix( N, N, h_A1, lda, d_A1, ldda ); magma_zsetmatrix( N, N, h_A2, lda, d_A2, ldda ); time = magma_sync_wtime( queue ); for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { magmablas_zswap( N, d_A1+ldda*j, 1, d_A2+ldda*(ipiv[j]-1), 1); } } time = magma_sync_wtime( queue ) - time; row_perf1 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_zswap( &N, h_A1+lda*j, &ione, h_A2+lda*(ipiv[j]-1), &ione); } } magma_zgetmatrix( N, N, d_A1, ldda, h_R1, lda ); magma_zgetmatrix( N, N, d_A2, ldda, h_R2, lda ); check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) || diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift; shift *= 2; /* Column Major */ init_matrix( N, N, h_A1, lda, 0 ); init_matrix( N, N, h_A2, lda, 100 ); magma_zsetmatrix( N, N, h_A1, lda, d_A1, ldda ); magma_zsetmatrix( N, N, h_A2, lda, d_A2, ldda ); time = magma_sync_wtime( queue ); for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { magmablas_zswap( N, d_A1+j, ldda, d_A2+ipiv[j]-1, ldda ); } } time = magma_sync_wtime( queue ) - time; col_perf1 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_zswap( &N, h_A1+j, &lda, h_A2+(ipiv[j]-1), &lda); } } magma_zgetmatrix( N, N, d_A1, ldda, h_R1, lda ); magma_zgetmatrix( N, N, d_A2, ldda, h_R2, lda ); check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) || diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift; shift *= 2; /* ===================================================================== * zswapblk, blocked version (2 matrices) */ /* Row Major */ init_matrix( N, N, h_A1, lda, 0 ); init_matrix( N, N, h_A2, lda, 100 ); magma_zsetmatrix( N, N, h_A1, lda, d_A1, ldda ); magma_zsetmatrix( N, N, h_A2, lda, d_A2, ldda ); time = magma_sync_wtime( queue ); magmablas_zswapblk( MagmaRowMajor, N, d_A1, ldda, d_A2, ldda, 1, nb, ipiv, 1, 0); time = magma_sync_wtime( queue ) - time; row_perf2 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_zswap( &N, h_A1+lda*j, &ione, h_A2+lda*(ipiv[j]-1), &ione); } } magma_zgetmatrix( N, N, d_A1, ldda, h_R1, lda ); magma_zgetmatrix( N, N, d_A2, ldda, h_R2, lda ); check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) || diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift; shift *= 2; /* Column Major */ init_matrix( N, N, h_A1, lda, 0 ); init_matrix( N, N, h_A2, lda, 100 ); magma_zsetmatrix( N, N, h_A1, lda, d_A1, ldda ); magma_zsetmatrix( N, N, h_A2, lda, d_A2, ldda ); time = magma_sync_wtime( queue ); magmablas_zswapblk( MagmaColMajor, N, d_A1, ldda, d_A2, ldda, 1, nb, ipiv, 1, 0); time = magma_sync_wtime( queue ) - time; col_perf2 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_zswap( &N, h_A1+j, &lda, h_A2+(ipiv[j]-1), &lda); } } magma_zgetmatrix( N, N, d_A1, ldda, h_R1, lda ); magma_zgetmatrix( N, N, d_A2, ldda, h_R2, lda ); check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) || diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift; shift *= 2; /* ===================================================================== * zpermute_long (1 matrix) */ /* Row Major */ memcpy( ipiv2, ipiv, nb*sizeof(magma_int_t) ); // zpermute updates ipiv2 init_matrix( N, N, h_A1, lda, 0 ); magma_zsetmatrix( N, N, h_A1, lda, d_A1, ldda ); time = magma_sync_wtime( queue ); magmablas_zpermute_long2( N, d_A1, ldda, ipiv2, nb, 0 ); time = magma_sync_wtime( queue ) - time; row_perf3 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_zswap( &N, h_A1+lda*j, &ione, h_A1+lda*(ipiv[j]-1), &ione); } } magma_zgetmatrix( N, N, d_A1, ldda, h_R1, lda ); check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift; shift *= 2; /* ===================================================================== * LAPACK-style zlaswp (1 matrix) */ /* Row Major */ init_matrix( N, N, h_A1, lda, 0 ); magma_zsetmatrix( N, N, h_A1, lda, d_A1, ldda ); time = magma_sync_wtime( queue ); magmablas_zlaswp( N, d_A1, ldda, 1, nb, ipiv, 1); time = magma_sync_wtime( queue ) - time; row_perf4 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_zswap( &N, h_A1+lda*j, &ione, h_A1+lda*(ipiv[j]-1), &ione); } } magma_zgetmatrix( N, N, d_A1, ldda, h_R1, lda ); check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift; shift *= 2; /* ===================================================================== * LAPACK-style zlaswp (1 matrix) - d_ipiv on GPU */ /* Row Major */ init_matrix( N, N, h_A1, lda, 0 ); magma_zsetmatrix( N, N, h_A1, lda, d_A1, ldda ); time = magma_sync_wtime( queue ); magma_setvector( nb, sizeof(magma_int_t), ipiv, 1, d_ipiv, 1 ); magmablas_zlaswp2( N, d_A1, ldda, 1, nb, d_ipiv ); time = magma_sync_wtime( queue ) - time; row_perf7 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_zswap( &N, h_A1+lda*j, &ione, h_A1+lda*(ipiv[j]-1), &ione); } } magma_zgetmatrix( N, N, d_A1, ldda, h_R1, lda ); check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift; shift *= 2; /* ===================================================================== * LAPACK-style zlaswpx (extended for row- and col-major) (1 matrix) */ /* Row Major */ init_matrix( N, N, h_A1, lda, 0 ); magma_zsetmatrix( N, N, h_A1, lda, d_A1, ldda ); time = magma_sync_wtime( queue ); magmablas_zlaswpx( N, d_A1, ldda, 1, 1, nb, ipiv, 1); time = magma_sync_wtime( queue ) - time; row_perf5 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_zswap( &N, h_A1+lda*j, &ione, h_A1+lda*(ipiv[j]-1), &ione); } } magma_zgetmatrix( N, N, d_A1, ldda, h_R1, lda ); check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift; shift *= 2; /* Col Major */ init_matrix( N, N, h_A1, lda, 0 ); magma_zsetmatrix( N, N, h_A1, lda, d_A1, ldda ); time = magma_sync_wtime( queue ); magmablas_zlaswpx( N, d_A1, 1, ldda, 1, nb, ipiv, 1); time = magma_sync_wtime( queue ) - time; col_perf5 = gbytes / time; time = magma_wtime(); lapackf77_zlaswp( &N, h_A1, &lda, &ione, &nb, ipiv, &ione); time = magma_wtime() - time; cpu_perf = gbytes / time; magma_zgetmatrix( N, N, d_A1, ldda, h_R1, lda ); check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift; shift *= 2; /* ===================================================================== * Copy matrix. */ time = magma_sync_wtime( queue ); magma_zcopymatrix( N, nb, d_A1, ldda, d_A2, ldda ); time = magma_sync_wtime( queue ) - time; // copy reads 1 matrix and writes 1 matrix, so has half gbytes of swap col_perf6 = 0.5 * gbytes / time; time = magma_sync_wtime( queue ); magma_zcopymatrix( nb, N, d_A1, ldda, d_A2, ldda ); time = magma_sync_wtime( queue ) - time; // copy reads 1 matrix and writes 1 matrix, so has half gbytes of swap row_perf6 = 0.5 * gbytes / time; printf("%5d %3d %6.2f%c/ %6.2f%c %6.2f%c/ %6.2f%c %6.2f%c/ %6.2f%c %6.2f%c %6.2f%c %6.2f%c %6.2f%c/ %6.2f%c %6.2f / %6.2f %6.2f %10s\n", (int) N, (int) nb, row_perf0, ((check & 0x001) != 0 ? '*' : ' '), col_perf0, ((check & 0x002) != 0 ? '*' : ' '), row_perf1, ((check & 0x004) != 0 ? '*' : ' '), col_perf1, ((check & 0x008) != 0 ? '*' : ' '), row_perf2, ((check & 0x010) != 0 ? '*' : ' '), col_perf2, ((check & 0x020) != 0 ? '*' : ' '), row_perf3, ((check & 0x040) != 0 ? '*' : ' '), row_perf4, ((check & 0x080) != 0 ? '*' : ' '), row_perf7, ((check & 0x100) != 0 ? '*' : ' '), row_perf5, ((check & 0x200) != 0 ? '*' : ' '), col_perf5, ((check & 0x400) != 0 ? '*' : ' '), row_perf6, col_perf6, cpu_perf, (check == 0 ? "ok" : "* failed") ); status += ! (check == 0); TESTING_FREE_PIN( h_A1 ); TESTING_FREE_PIN( h_A2 ); TESTING_FREE_PIN( h_R1 ); TESTING_FREE_PIN( h_R2 ); TESTING_FREE_CPU( ipiv ); TESTING_FREE_CPU( ipiv2 ); TESTING_FREE_DEV( d_ipiv ); TESTING_FREE_DEV( d_A1 ); TESTING_FREE_DEV( d_A2 ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }