/** @deprecated
    Purpose
    -------
    CLAQPS computes a step of QR factorization with column pivoting
    of a complex M-by-N matrix A by using Blas-3. It tries to factorize
    NB columns from A starting from the row OFFSET+1, and updates all
    of the matrix with Blas-3 xGEMM.

    In some cases, due to catastrophic cancellations, it cannot
    factorize NB columns. Hence, the actual number of factorized
    columns is returned in KB.

    Block A(1:OFFSET,1:N) is accordingly pivoted, but not factorized.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows of the matrix A. M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A. N >= 0.

    @param[in]
    offset  INTEGER
            The number of rows of A that have been factorized in
            previous steps.

    @param[in]
    nb      INTEGER
            The number of columns to factorize.

    @param[out]
    kb      INTEGER
            The number of columns actually factorized.

    @param[in,out]
    A       COMPLEX array, dimension (LDA,N)
            On entry, the M-by-N matrix A.
            On exit, block A(OFFSET+1:M,1:KB) is the triangular factor obtained,
            and block A(1:OFFSET,1:N) has been accordingly pivoted, but not
            factorized. The rest of the matrix, block A(OFFSET+1:M,KB+1:N),
            has been updated.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A. LDA >= max(1,M).

    @param[in,out]
    jpvt    INTEGER array, dimension (N)
            JPVT(I) = K <==> Column K of the full matrix A has been
            permuted into position I in AP.

    @param[out]
    tau     COMPLEX array, dimension (KB)
            The scalar factors of the elementary reflectors.

    @param[in,out]
    vn1     REAL array, dimension (N)
            The vector with the partial column norms.

    @param[in,out]
    vn2     REAL array, dimension (N)
            The vector with the exact column norms.

    @param[in,out]
    auxv    COMPLEX array, dimension (NB)
            Auxiliary vector.

    @param[in,out]
    F       COMPLEX array, dimension (LDF,NB)
            Matrix F' = L*Y'*A.

    @param[in]
    ldf     INTEGER
            The leading dimension of the array F. LDF >= max(1,N).

    @ingroup magma_cgeqp3_aux
    ********************************************************************/
extern "C" magma_int_t
magma_claqps_gpu(magma_int_t m, magma_int_t n, magma_int_t offset,
                 magma_int_t nb, magma_int_t *kb,
                 magmaFloatComplex *A,  magma_int_t lda,
                 magma_int_t *jpvt, magmaFloatComplex *tau,
                 float *vn1, float *vn2,
                 magmaFloatComplex *auxv,
                 magmaFloatComplex *F,  magma_int_t ldf)
{
#define  A(i, j) (A + (i) + (j)*(lda ))
#define  F(i, j) (F + (i) + (j)*(ldf ))

    magmaFloatComplex c_zero    = MAGMA_C_MAKE( 0.,0.);
    magmaFloatComplex c_one     = MAGMA_C_MAKE( 1.,0.);
    magmaFloatComplex c_neg_one = MAGMA_C_MAKE(-1.,0.);
    magma_int_t ione = 1;

    magma_int_t i__1, i__2;
    //float d__1;
    magmaFloatComplex z__1;

    //magma_int_t j;
    magma_int_t k, rk;
    //magmaFloatComplex Akk;
    magmaFloatComplex *Aks;
    magmaFloatComplex tauk = MAGMA_C_ZERO;
    magma_int_t pvt;
    //float temp, temp2;
    float tol3z;
    magma_int_t itemp;

    float lsticc, *lsticcs;
    magma_int_t lastrk;
    magma_smalloc( &lsticcs, 1+256*(n+255)/256 );

    lastrk = min( m, n + offset );
    tol3z = magma_ssqrt( lapackf77_slamch("Epsilon"));

    lsticc = 0;
    k = 0;
    magma_cmalloc( &Aks, nb );

    while( k < nb && lsticc == 0 ) {
        rk = offset + k;

        /* Determine ith pivot column and swap if necessary */
        // subtract 1 from Fortran/CUBLAS isamax; pvt, k are 0-based.
        pvt = k + magma_isamax( n-k, &vn1[k], ione ) - 1;

        if (pvt != k) {
            /*if (pvt >= nb) {
                // 1. Start copy from GPU
                magma_cgetmatrix_async( m - offset - nb, 1,
                                        dA(offset + nb, pvt), ldda,
                                        A (offset + nb, pvt), lda, stream );
            }*/

            /* F gets swapped so F must be sent at the end to GPU */
            i__1 = k;
            /*if (pvt < nb) {
                // no need of transfer if pivot is within the panel
                blasf77_cswap( &m, A(0, pvt), &ione, A(0, k), &ione );
            }
            else {
                // 1.
Finish copy from GPU magma_queue_sync( stream ); // 2. Swap as usual on CPU blasf77_cswap(&m, A(0, pvt), &ione, A(0, k), &ione); // 3. Restore the GPU magma_csetmatrix_async( m - offset - nb, 1, A (offset + nb, pvt), lda, dA(offset + nb, pvt), ldda, stream); }*/ magmablas_cswap( m, A(0, pvt), ione, A(0, k), ione ); //blasf77_cswap( &i__1, F(pvt,0), &ldf, F(k,0), &ldf ); magmablas_cswap( i__1, F(pvt, 0), ldf, F(k, 0), ldf); itemp = jpvt[pvt]; jpvt[pvt] = jpvt[k]; jpvt[k] = itemp; //vn1[pvt] = vn1[k]; //vn2[pvt] = vn2[k]; #if defined(PRECISION_d) || defined(PRECISION_z) //magma_dswap( 1, &vn1[pvt], 1, &vn1[k], 1 ); //magma_dswap( 1, &vn2[pvt], 1, &vn2[k], 1 ); magma_dswap( 2, &vn1[pvt], n+offset, &vn1[k], n+offset ); #else //magma_sswap( 1, &vn1[pvt], 1, &vn1[k], 1 ); //magma_sswap( 1, &vn2[pvt], 1, &vn2[k], 1 ); magma_sswap(2, &vn1[pvt], n+offset, &vn1[k], n+offset); #endif } /* Apply previous Householder reflectors to column K: A(RK:M,K) := A(RK:M,K) - A(RK:M,1:K-1)*F(K,1:K-1)'. Optimization: multiply with beta=0; wait for vector and subtract */ if (k > 0) { /*#if (defined(PRECISION_c) || defined(PRECISION_z)) for (j = 0; j < k; ++j) { *F(k,j) = MAGMA_C_CNJG( *F(k,j) ); } #endif*/ //#define RIGHT_UPDATE #ifdef RIGHT_UPDATE i__1 = m - offset - nb; i__2 = k; magma_cgemv( MagmaNoTrans, i__1, i__2, c_neg_one, A(offset+nb, 0), lda, F(k, 0), ldf, c_one, A(offset+nb, k), ione ); #else i__1 = m - rk; i__2 = k; /*blasf77_cgemv( MagmaNoTransStr, &i__1, &i__2, &c_neg_one, A(rk, 0), &lda, F(k, 0), &ldf, &c_one, A(rk, k), &ione );*/ magma_cgemv( MagmaNoTrans, i__1, i__2, c_neg_one, A(rk, 0), lda, F(k, 0), ldf, c_one, A(rk, k), ione ); #endif /*#if (defined(PRECISION_c) || defined(PRECISION_z)) for (j = 0; j < k; ++j) { *F(k,j) = MAGMA_C_CNJG( *F(k,j) ); } #endif*/ } /* Generate elementary reflector H(k). */ magma_clarfg_gpu(m-rk, A(rk, k), A(rk + 1, k), &tau[k], &vn1[k], &Aks[k]); //Akk = *A(rk, k); //*A(rk, k) = c_one; //magma_cgetvector( 1, &Aks[k], 1, &Akk, 1 ); /* needed to avoid the race condition */ if (k == 0) magma_csetvector( 1, &c_one, 1, A(rk, k), 1 ); else magma_ccopymatrix( 1, 1, A(offset, 0), 1, A(rk, k), 1 ); /* Compute Kth column of F: Compute F(K+1:N,K) := tau(K)*A(RK:M,K+1:N)'*A(RK:M,K) on the GPU */ if (k < n-1 || k > 0) magma_cgetvector( 1, &tau[k], 1, &tauk, 1 ); if (k < n-1) { i__1 = m - rk; i__2 = n - k - 1; /* Send the vector to the GPU */ //magma_csetmatrix( i__1, 1, A(rk, k), lda, dA(rk,k), ldda ); /* Multiply on GPU */ // was CALL CGEMV( 'Conjugate transpose', M-RK+1, N-K, // TAU( K ), A( RK, K+1 ), LDA, // A( RK, K ), 1, // CZERO, F( K+1, K ), 1 ) //magma_cgetvector( 1, &tau[k], 1, &tauk, 1 ); magma_cgemv( MagmaConjTrans, m-rk, n-k-1, tauk, A( rk, k+1 ), lda, A( rk, k ), 1, c_zero, F( k+1, k ), 1 ); //magma_cscal( m-rk, tau[k], F( k+1, k), 1 ); //magma_int_t i__3 = nb-k-1; //magma_int_t i__4 = i__2 - i__3; //magma_int_t i__5 = nb-k; //magma_cgemv( MagmaConjTrans, i__1 - i__5, i__2 - i__3, // tau[k], dA(rk +i__5, k+1+i__3), ldda, // dA(rk +i__5, k ), ione, // c_zero, dF(k+1+i__3, k ), ione ); //magma_cgetmatrix_async( i__2-i__3, 1, // dF(k + 1 +i__3, k), i__2, // F (k + 1 +i__3, k), i__2, stream ); //blasf77_cgemv( MagmaConjTransStr, &i__1, &i__3, // &tau[k], A(rk, k+1), &lda, // A(rk, k ), &ione, // &c_zero, F(k+1, k ), &ione ); //magma_queue_sync( stream ); //blasf77_cgemv( MagmaConjTransStr, &i__5, &i__4, // &tau[k], A(rk, k+1+i__3), &lda, // A(rk, k ), &ione, // &c_one, F(k+1+i__3, k ), &ione ); } /* Padding F(1:K,K) with zeros. 
for (j = 0; j <= k; ++j) { magma_csetvector( 1, &c_zero, 1, F(j, k), 1 ); }*/ /* Incremental updating of F: F(1:N,K) := F(1:N,K) - tau(K)*F(1:N,1:K-1)*A(RK:M,1:K-1)'*A(RK:M,K). F(1:N,K) := tau(K)*A(RK:M,K+1:N)'*A(RK:M,K) - tau(K)*F(1:N,1:K-1)*A(RK:M,1:K-1)'*A(RK:M,K) := tau(K)(A(RK:M,K+1:N)' - F(1:N,1:K-1)*A(RK:M,1:K-1)') A(RK:M,K) so, F is (updated A)*V */ //if (k > 0 && k < n-1) { if (k > 0) { //magma_cgetvector( 1, &tau[k], 1, &tauk, 1 ); z__1 = MAGMA_C_NEGATE( tauk ); #ifdef RIGHT_UPDATE i__1 = m - offset - nb; i__2 = k; magma_cgemv( MagmaConjTrans, i__1, i__2, z__1, A(offset+nb, 0), lda, A(offset+nb, k), ione, c_zero, auxv, ione ); i__1 = k; magma_cgemv( MagmaNoTrans, n-k-1, i__1, c_one, F(k+1,0), ldf, auxv, ione, c_one, F(k+1,k), ione ); #else i__1 = m - rk; i__2 = k; //blasf77_cgemv( MagmaConjTransStr, &i__1, &i__2, // &z__1, A(rk, 0), &lda, // A(rk, k), &ione, // &c_zero, auxv, &ione ); magma_cgemv( MagmaConjTrans, i__1, i__2, z__1, A(rk, 0), lda, A(rk, k), ione, c_zero, auxv, ione ); //i__1 = k; //blasf77_cgemv( MagmaNoTransStr, &n, &i__1, // &c_one, F(0,0), &ldf, // auxv, &ione, // &c_one, F(0,k), &ione ); /*magma_cgemv( MagmaNoTrans, n, i__1, c_one, F(0,0), ldf, auxv, ione, c_one, F(0,k), ione );*/ /* I think we only need stricly lower-triangular part :) */ magma_cgemv( MagmaNoTrans, n-k-1, i__2, c_one, F(k+1,0), ldf, auxv, ione, c_one, F(k+1,k), ione ); #endif } /* Optimization: On the last iteration start sending F back to the GPU */ /* Update the current row of A: A(RK,K+1:N) := A(RK,K+1:N) - A(RK,1:K)*F(K+1:N,1:K)'. */ if (k < n-1) { i__1 = n - k - 1; i__2 = k + 1; //blasf77_cgemm( MagmaNoTransStr, MagmaConjTransStr, &ione, &i__1, &i__2, // &c_neg_one, A(rk, 0 ), &lda, // F(k+1,0 ), &ldf, // &c_one, A(rk, k+1), &lda ); #ifdef RIGHT_UPDATE /* right-looking update of rows, */ magma_cgemm( MagmaNoTrans, MagmaConjTrans, nb-k, i__1, ione, c_neg_one, A(rk, k ), lda, F(k+1, k ), ldf, c_one, A(rk, k+1), lda ); #else /* left-looking update of rows, * * since F=A'v with original A, so no right-looking */ magma_cgemm( MagmaNoTrans, MagmaConjTrans, ione, i__1, i__2, c_neg_one, A(rk, 0 ), lda, F(k+1,0 ), ldf, c_one, A(rk, k+1), lda ); #endif } /* Update partial column norms. */ if (rk < min(m, n+offset)-1 ) { magmablas_scnrm2_row_check_adjust(n-k-1, tol3z, &vn1[k+1], &vn2[k+1], A(rk,k+1), lda, lsticcs); magma_device_sync(); #if defined(PRECISION_d) || defined(PRECISION_z) magma_sgetvector( 1, &lsticcs[0], 1, &lsticc, 1 ); #else magma_sgetvector( 1, &lsticcs[0], 1, &lsticc, 1 ); #endif } /*if (rk < lastrk) { for (j = k + 1; j < n; ++j) { if (vn1[j] != 0.) { // NOTE: The following 4 lines follow from the analysis in // Lapack Working Note 176. temp = MAGMA_C_ABS( *A(rk,j) ) / vn1[j]; temp = max( 0., ((1. + temp) * (1. 
- temp)) ); d__1 = vn1[j] / vn2[j]; temp2 = temp * (d__1 * d__1); if (temp2 <= tol3z) { vn2[j] = (float) lsticc; lsticc = j; } else { vn1[j] *= magma_ssqrt(temp); } } } }*/ //*A(rk, k) = Akk; //magma_csetvector( 1, &Akk, 1, A(rk, k), 1 ); //magma_cswap( 1, &Aks[k], 1, A(rk, k), 1 ); ++k; } magma_ccopymatrix( 1, k, Aks, 1, A(offset, 0), lda+1 ); // leave k as the last column done --k; *kb = k + 1; rk = offset + *kb - 1; /* Apply the block reflector to the rest of the matrix: A(OFFSET+KB+1:M,KB+1:N) := A(OFFSET+KB+1:M,KB+1:N) - A(OFFSET+KB+1:M,1:KB)*F(KB+1:N,1:KB)' */ if (*kb < min(n, m - offset)) { i__1 = m - rk - 1; i__2 = n - *kb; /* Send F to the GPU magma_csetmatrix( i__2, *kb, F (*kb, 0), ldf, dF(*kb, 0), i__2 );*/ magma_cgemm( MagmaNoTrans, MagmaConjTrans, i__1, i__2, *kb, c_neg_one, A(rk+1, 0 ), lda, F(*kb, 0 ), ldf, c_one, A(rk+1, *kb), lda ); } /* Recomputation of difficult columns. */ if ( lsticc > 0 ) { // printf( " -- recompute dnorms --\n" ); magmablas_scnrm2_check(m-rk-1, n-*kb, A(rk+1,*kb), lda, &vn1[*kb], lsticcs); magma_scopymatrix( n-*kb, 1, &vn1[*kb], *kb, &vn2[*kb], *kb); /*while( lsticc > 0 ) { itemp = (magma_int_t)(vn2[lsticc] >= 0. ? floor(vn2[lsticc] + .5) : -floor(.5 - vn2[lsticc])); i__1 = m - rk - 1; if (lsticc <= nb) vn1[lsticc] = magma_cblas_scnrm2( i__1, A(rk+1,lsticc), ione ); else { // Where is the data, CPU or GPU ? float r1, r2; r1 = magma_cblas_scnrm2( nb-k, A(rk+1,lsticc), ione ); r2 = magma_scnrm2(m-offset-nb, dA(offset + nb + 1, lsticc), ione); vn1[lsticc] = magma_ssqrt(r1*r1+r2*r2); } // NOTE: The computation of VN1( LSTICC ) relies on the fact that // SNRM2 does not fail on vectors with norm below the value of SQRT(SLAMCH('S')) vn2[lsticc] = vn1[lsticc]; lsticc = itemp;*/ } magma_free(Aks); magma_free(lsticcs); return MAGMA_SUCCESS; } /* magma_claqps */
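/*
   Editor's example (a sketch, not MAGMA's cgeqp3 implementation): how a
   CLAQPS-style panel routine is driven from a xGEQP3-style outer loop,
   illustrating the OFFSET/NB/KB contract documented above. Each call attempts
   jb columns of the trailing matrix and reports in kb how many it actually
   factorized (at least one, possibly fewer than jb when column norms must be
   recomputed); the caller advances by kb. The argument shifting follows
   LAPACK's cgeqp3 convention; workspace setup and error handling are omitted.
*/
static void
example_cgeqp3_panel_loop(
    magma_int_t m, magma_int_t n, magma_int_t nb,
    magmaFloatComplex *A, magma_int_t lda,
    magma_int_t *jpvt, magmaFloatComplex *tau,
    float *vn1, float *vn2,
    magmaFloatComplex *auxv,
    magmaFloatComplex *F, magma_int_t ldf )
{
    magma_int_t j = 0;
    magma_int_t minmn = min( m, n );
    while ( j < minmn ) {
        magma_int_t jb = min( nb, minmn - j );  // columns attempted this step
        magma_int_t kb = 0;                     // columns actually factorized
        // offset = j rows are already factorized; pass the trailing columns.
        magma_claqps_gpu( m, n-j, j, jb, &kb,
                          A + j*lda, lda,
                          &jpvt[j], &tau[j], &vn1[j], &vn2[j],
                          auxv, F, ldf );
        j += kb;  // kb >= 1, so the loop always makes progress
    }
}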
/* //////////////////////////////////////////////////////////////////////////// -- Testing cswap, cswapblk, cpermute, claswp, claswpx */ int main( int argc, char** argv) { TESTING_INIT(); magmaFloatComplex *h_A1, *h_A2; magmaFloatComplex *d_A1, *d_A2; magmaFloatComplex *h_R1, *h_R2; // row-major and column-major performance real_Double_t row_perf0, col_perf0; real_Double_t row_perf1, col_perf1; real_Double_t row_perf2, col_perf2; real_Double_t row_perf3; real_Double_t row_perf4; real_Double_t row_perf5, col_perf5; real_Double_t row_perf6, col_perf6; real_Double_t row_perf7; real_Double_t cpu_perf; real_Double_t time, gbytes; magma_int_t N, lda, ldda, nb, j; magma_int_t ione = 1; magma_int_t *ipiv, *ipiv2; magma_int_t *d_ipiv; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); magma_queue_t queue = 0; printf(" cublasCswap cswap cswapblk claswp cpermute claswp2 claswpx ccopymatrix CPU (all in )\n"); printf(" N nb row-maj/col-maj row-maj/col-maj row-maj/col-maj row-maj row-maj row-maj row-maj/col-maj row-blk/col-blk claswp (GByte/s)\n"); printf("==================================================================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { // For an N x N matrix, swap nb rows or nb columns using various methods. // Each test is assigned one bit in the 'check' bitmask; bit=1 indicates failure. // The variable 'shift' keeps track of which bit is for current test int shift = 1; int check = 0; N = opts.nsize[itest]; lda = N; ldda = ((N+31)/32)*32; nb = (opts.nb > 0 ? opts.nb : magma_get_cgetrf_nb( N )); nb = min( N, nb ); // each swap does 2N loads and 2N stores, for nb swaps gbytes = sizeof(magmaFloatComplex) * 4.*N*nb / 1e9; TESTING_MALLOC_PIN( h_A1, magmaFloatComplex, lda*N ); TESTING_MALLOC_PIN( h_A2, magmaFloatComplex, lda*N ); TESTING_MALLOC_PIN( h_R1, magmaFloatComplex, lda*N ); TESTING_MALLOC_PIN( h_R2, magmaFloatComplex, lda*N ); TESTING_MALLOC_CPU( ipiv, magma_int_t, nb ); TESTING_MALLOC_CPU( ipiv2, magma_int_t, nb ); TESTING_MALLOC_DEV( d_ipiv, magma_int_t, nb ); TESTING_MALLOC_DEV( d_A1, magmaFloatComplex, ldda*N ); TESTING_MALLOC_DEV( d_A2, magmaFloatComplex, ldda*N ); for( j=0; j < nb; j++ ) { ipiv[j] = (magma_int_t) ((rand()*1.*N) / (RAND_MAX * 1.)) + 1; } /* ===================================================================== * cublasCswap, row-by-row (2 matrices) */ /* Row Major */ init_matrix( N, N, h_A1, lda, 0 ); init_matrix( N, N, h_A2, lda, 100 ); magma_csetmatrix( N, N, h_A1, lda, d_A1, ldda ); magma_csetmatrix( N, N, h_A2, lda, d_A2, ldda ); time = magma_sync_wtime( queue ); for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { cublasCswap( N, d_A1+ldda*j, 1, d_A2+ldda*(ipiv[j]-1), 1); } } time = magma_sync_wtime( queue ) - time; row_perf0 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_cswap( &N, h_A1+lda*j, &ione, h_A2+lda*(ipiv[j]-1), &ione); } } magma_cgetmatrix( N, N, d_A1, ldda, h_R1, lda ); magma_cgetmatrix( N, N, d_A2, ldda, h_R2, lda ); check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) || diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift; shift *= 2; /* Column Major */ init_matrix( N, N, h_A1, lda, 0 ); init_matrix( N, N, h_A2, lda, 100 ); magma_csetmatrix( N, N, h_A1, lda, d_A1, ldda ); magma_csetmatrix( N, N, h_A2, lda, d_A2, ldda ); time = magma_sync_wtime( queue ); for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { cublasCswap( N, d_A1+j, ldda, d_A2+ipiv[j]-1, 
ldda); } } time = magma_sync_wtime( queue ) - time; col_perf0 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_cswap( &N, h_A1+j, &lda, h_A2+(ipiv[j]-1), &lda); } } magma_cgetmatrix( N, N, d_A1, ldda, h_R1, lda ); magma_cgetmatrix( N, N, d_A2, ldda, h_R2, lda ); check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) || diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift; shift *= 2; /* ===================================================================== * cswap, row-by-row (2 matrices) */ /* Row Major */ init_matrix( N, N, h_A1, lda, 0 ); init_matrix( N, N, h_A2, lda, 100 ); magma_csetmatrix( N, N, h_A1, lda, d_A1, ldda ); magma_csetmatrix( N, N, h_A2, lda, d_A2, ldda ); time = magma_sync_wtime( queue ); for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { magmablas_cswap( N, d_A1+ldda*j, 1, d_A2+ldda*(ipiv[j]-1), 1); } } time = magma_sync_wtime( queue ) - time; row_perf1 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_cswap( &N, h_A1+lda*j, &ione, h_A2+lda*(ipiv[j]-1), &ione); } } magma_cgetmatrix( N, N, d_A1, ldda, h_R1, lda ); magma_cgetmatrix( N, N, d_A2, ldda, h_R2, lda ); check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) || diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift; shift *= 2; /* Column Major */ init_matrix( N, N, h_A1, lda, 0 ); init_matrix( N, N, h_A2, lda, 100 ); magma_csetmatrix( N, N, h_A1, lda, d_A1, ldda ); magma_csetmatrix( N, N, h_A2, lda, d_A2, ldda ); time = magma_sync_wtime( queue ); for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { magmablas_cswap( N, d_A1+j, ldda, d_A2+ipiv[j]-1, ldda ); } } time = magma_sync_wtime( queue ) - time; col_perf1 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_cswap( &N, h_A1+j, &lda, h_A2+(ipiv[j]-1), &lda); } } magma_cgetmatrix( N, N, d_A1, ldda, h_R1, lda ); magma_cgetmatrix( N, N, d_A2, ldda, h_R2, lda ); check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) || diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift; shift *= 2; /* ===================================================================== * cswapblk, blocked version (2 matrices) */ /* Row Major */ init_matrix( N, N, h_A1, lda, 0 ); init_matrix( N, N, h_A2, lda, 100 ); magma_csetmatrix( N, N, h_A1, lda, d_A1, ldda ); magma_csetmatrix( N, N, h_A2, lda, d_A2, ldda ); time = magma_sync_wtime( queue ); magmablas_cswapblk( MagmaRowMajor, N, d_A1, ldda, d_A2, ldda, 1, nb, ipiv, 1, 0); time = magma_sync_wtime( queue ) - time; row_perf2 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_cswap( &N, h_A1+lda*j, &ione, h_A2+lda*(ipiv[j]-1), &ione); } } magma_cgetmatrix( N, N, d_A1, ldda, h_R1, lda ); magma_cgetmatrix( N, N, d_A2, ldda, h_R2, lda ); check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) || diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift; shift *= 2; /* Column Major */ init_matrix( N, N, h_A1, lda, 0 ); init_matrix( N, N, h_A2, lda, 100 ); magma_csetmatrix( N, N, h_A1, lda, d_A1, ldda ); magma_csetmatrix( N, N, h_A2, lda, d_A2, ldda ); time = magma_sync_wtime( queue ); magmablas_cswapblk( MagmaColMajor, N, d_A1, ldda, d_A2, ldda, 1, nb, ipiv, 1, 0); time = magma_sync_wtime( queue ) - time; col_perf2 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_cswap( &N, h_A1+j, &lda, h_A2+(ipiv[j]-1), &lda); } } magma_cgetmatrix( N, N, d_A1, ldda, h_R1, lda ); magma_cgetmatrix( N, N, d_A2, ldda, h_R2, lda ); check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) || diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift; shift *= 2; /* 
===================================================================== * cpermute_long (1 matrix) */ /* Row Major */ memcpy( ipiv2, ipiv, nb*sizeof(magma_int_t) ); // cpermute updates ipiv2 init_matrix( N, N, h_A1, lda, 0 ); magma_csetmatrix( N, N, h_A1, lda, d_A1, ldda ); time = magma_sync_wtime( queue ); magmablas_cpermute_long2( N, d_A1, ldda, ipiv2, nb, 0 ); time = magma_sync_wtime( queue ) - time; row_perf3 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_cswap( &N, h_A1+lda*j, &ione, h_A1+lda*(ipiv[j]-1), &ione); } } magma_cgetmatrix( N, N, d_A1, ldda, h_R1, lda ); check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift; shift *= 2; /* ===================================================================== * LAPACK-style claswp (1 matrix) */ /* Row Major */ init_matrix( N, N, h_A1, lda, 0 ); magma_csetmatrix( N, N, h_A1, lda, d_A1, ldda ); time = magma_sync_wtime( queue ); magmablas_claswp( N, d_A1, ldda, 1, nb, ipiv, 1); time = magma_sync_wtime( queue ) - time; row_perf4 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_cswap( &N, h_A1+lda*j, &ione, h_A1+lda*(ipiv[j]-1), &ione); } } magma_cgetmatrix( N, N, d_A1, ldda, h_R1, lda ); check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift; shift *= 2; /* ===================================================================== * LAPACK-style claswp (1 matrix) - d_ipiv on GPU */ /* Row Major */ init_matrix( N, N, h_A1, lda, 0 ); magma_csetmatrix( N, N, h_A1, lda, d_A1, ldda ); time = magma_sync_wtime( queue ); magma_setvector( nb, sizeof(magma_int_t), ipiv, 1, d_ipiv, 1 ); magmablas_claswp2( N, d_A1, ldda, 1, nb, d_ipiv, 1 ); time = magma_sync_wtime( queue ) - time; row_perf7 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_cswap( &N, h_A1+lda*j, &ione, h_A1+lda*(ipiv[j]-1), &ione); } } magma_cgetmatrix( N, N, d_A1, ldda, h_R1, lda ); check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift; shift *= 2; /* ===================================================================== * LAPACK-style claswpx (extended for row- and col-major) (1 matrix) */ /* Row Major */ init_matrix( N, N, h_A1, lda, 0 ); magma_csetmatrix( N, N, h_A1, lda, d_A1, ldda ); time = magma_sync_wtime( queue ); magmablas_claswpx( N, d_A1, ldda, 1, 1, nb, ipiv, 1); time = magma_sync_wtime( queue ) - time; row_perf5 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_cswap( &N, h_A1+lda*j, &ione, h_A1+lda*(ipiv[j]-1), &ione); } } magma_cgetmatrix( N, N, d_A1, ldda, h_R1, lda ); check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift; shift *= 2; /* Col Major */ init_matrix( N, N, h_A1, lda, 0 ); magma_csetmatrix( N, N, h_A1, lda, d_A1, ldda ); time = magma_sync_wtime( queue ); magmablas_claswpx( N, d_A1, 1, ldda, 1, nb, ipiv, 1); time = magma_sync_wtime( queue ) - time; col_perf5 = gbytes / time; time = magma_wtime(); lapackf77_claswp( &N, h_A1, &lda, &ione, &nb, ipiv, &ione); time = magma_wtime() - time; cpu_perf = gbytes / time; magma_cgetmatrix( N, N, d_A1, ldda, h_R1, lda ); check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift; shift *= 2; /* ===================================================================== * Copy matrix. 
*/ time = magma_sync_wtime( queue ); magma_ccopymatrix( N, nb, d_A1, ldda, d_A2, ldda ); time = magma_sync_wtime( queue ) - time; // copy reads 1 matrix and writes 1 matrix, so has half gbytes of swap col_perf6 = 0.5 * gbytes / time; time = magma_sync_wtime( queue ); magma_ccopymatrix( nb, N, d_A1, ldda, d_A2, ldda ); time = magma_sync_wtime( queue ) - time; // copy reads 1 matrix and writes 1 matrix, so has half gbytes of swap row_perf6 = 0.5 * gbytes / time; printf("%5d %3d %6.2f%c/ %6.2f%c %6.2f%c/ %6.2f%c %6.2f%c/ %6.2f%c %6.2f%c %6.2f%c %6.2f%c %6.2f%c/ %6.2f%c %6.2f / %6.2f %6.2f %10s\n", (int) N, (int) nb, row_perf0, ((check & 0x001) != 0 ? '*' : ' '), col_perf0, ((check & 0x002) != 0 ? '*' : ' '), row_perf1, ((check & 0x004) != 0 ? '*' : ' '), col_perf1, ((check & 0x008) != 0 ? '*' : ' '), row_perf2, ((check & 0x010) != 0 ? '*' : ' '), col_perf2, ((check & 0x020) != 0 ? '*' : ' '), row_perf3, ((check & 0x040) != 0 ? '*' : ' '), row_perf4, ((check & 0x080) != 0 ? '*' : ' '), row_perf7, ((check & 0x100) != 0 ? '*' : ' '), row_perf5, ((check & 0x200) != 0 ? '*' : ' '), col_perf5, ((check & 0x400) != 0 ? '*' : ' '), row_perf6, col_perf6, cpu_perf, (check == 0 ? "ok" : "* failed") ); status += ! (check == 0); TESTING_FREE_PIN( h_A1 ); TESTING_FREE_PIN( h_A2 ); TESTING_FREE_PIN( h_R1 ); TESTING_FREE_PIN( h_R2 ); TESTING_FREE_CPU( ipiv ); TESTING_FREE_CPU( ipiv2 ); TESTING_FREE_DEV( d_ipiv ); TESTING_FREE_DEV( d_A1 ); TESTING_FREE_DEV( d_A2 ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
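/*
   Editor's note (a sketch): init_matrix and diff_matrix are testing helpers
   used throughout this tester but not shown in this excerpt. The minimal
   versions below capture what the checks above assume -- a deterministic fill
   that makes entries distinct per seed, and a nonzero return iff two matrices
   differ. The shipped MAGMA testers may implement them differently.
*/
static void
init_matrix( magma_int_t m, magma_int_t n,
             magmaFloatComplex *A, magma_int_t lda, magma_int_t seed )
{
    for( magma_int_t jj = 0; jj < n; ++jj ) {
        for( magma_int_t ii = 0; ii < m; ++ii ) {
            // distinct, reproducible values; seed separates the two matrices
            A[ii + jj*lda] = MAGMA_C_MAKE( (float)(seed + ii + jj*m), 0.f );
        }
    }
}

// Returns nonzero iff the two m-by-n matrices differ in any entry.
static magma_int_t
diff_matrix( magma_int_t m, magma_int_t n,
             magmaFloatComplex *A, magma_int_t lda,
             magmaFloatComplex *B, magma_int_t ldb )
{
    for( magma_int_t jj = 0; jj < n; ++jj ) {
        for( magma_int_t ii = 0; ii < m; ++ii ) {
            if ( MAGMA_C_REAL( A[ii + jj*lda] ) != MAGMA_C_REAL( B[ii + jj*ldb] ) ||
                 MAGMA_C_IMAG( A[ii + jj*lda] ) != MAGMA_C_IMAG( B[ii + jj*ldb] ) )
                return 1;
        }
    }
    return 0;
}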
/* //////////////////////////////////////////////////////////////////////////// -- Testing cswap, cswapblk, claswp, claswpx */ int main( int argc, char** argv) { TESTING_INIT(); magmaFloatComplex *h_A1, *h_A2; magmaFloatComplex *h_R1, *h_R2; magmaFloatComplex_ptr d_A1, d_A2; // row-major and column-major performance real_Double_t row_perf0 = MAGMA_D_NAN, col_perf0 = MAGMA_D_NAN; real_Double_t row_perf1 = MAGMA_D_NAN, col_perf1 = MAGMA_D_NAN; real_Double_t row_perf2 = MAGMA_D_NAN, col_perf2 = MAGMA_D_NAN; real_Double_t row_perf4 = MAGMA_D_NAN; real_Double_t row_perf5 = MAGMA_D_NAN, col_perf5 = MAGMA_D_NAN; real_Double_t row_perf6 = MAGMA_D_NAN, col_perf6 = MAGMA_D_NAN; real_Double_t row_perf7 = MAGMA_D_NAN; real_Double_t cpu_perf = MAGMA_D_NAN; real_Double_t time, gbytes; magma_int_t N, lda, ldda, nb, j; magma_int_t ione = 1; magma_int_t *ipiv, *ipiv2; magmaInt_ptr d_ipiv; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); magma_queue_t queue = 0; printf(" %8s cswap cswap cswapblk claswp claswp2 claswpx ccopymatrix CPU (all in )\n", g_platform_str ); printf(" N nb row-maj/col-maj row-maj/col-maj row-maj/col-maj row-maj row-maj row-maj/col-maj row-blk/col-blk claswp (GByte/s)\n"); printf("=========================================================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { // For an N x N matrix, swap nb rows or nb columns using various methods. // Each test is assigned one bit in the 'check' bitmask; bit=1 indicates failure. // The variable 'shift' keeps track of which bit is for current test int shift = 1; int check = 0; N = opts.nsize[itest]; lda = N; ldda = ((N+31)/32)*32; nb = (opts.nb > 0 ? 
opts.nb : magma_get_cgetrf_nb( N )); nb = min( N, nb ); // each swap does 2N loads and 2N stores, for nb swaps gbytes = sizeof(magmaFloatComplex) * 4.*N*nb / 1e9; TESTING_MALLOC_PIN( h_A1, magmaFloatComplex, lda*N ); TESTING_MALLOC_PIN( h_A2, magmaFloatComplex, lda*N ); TESTING_MALLOC_PIN( h_R1, magmaFloatComplex, lda*N ); TESTING_MALLOC_PIN( h_R2, magmaFloatComplex, lda*N ); TESTING_MALLOC_CPU( ipiv, magma_int_t, nb ); TESTING_MALLOC_CPU( ipiv2, magma_int_t, nb ); TESTING_MALLOC_DEV( d_ipiv, magma_int_t, nb ); TESTING_MALLOC_DEV( d_A1, magmaFloatComplex, ldda*N ); TESTING_MALLOC_DEV( d_A2, magmaFloatComplex, ldda*N ); // getrf always makes ipiv[j] >= j+1, where ipiv is one based and j is zero based // some implementations (e.g., MacOS dlaswp) assume this for( j=0; j < nb; j++ ) { ipiv[j] = (rand() % (N-j)) + j + 1; assert( ipiv[j] >= j+1 ); assert( ipiv[j] <= N ); } /* ===================================================================== * cublas / clBLAS / Xeon Phi cswap, row-by-row (2 matrices) */ /* Row Major */ init_matrix( N, N, h_A1, lda, 0 ); init_matrix( N, N, h_A2, lda, 100 ); magma_csetmatrix( N, N, h_A1, lda, d_A1, ldda ); magma_csetmatrix( N, N, h_A2, lda, d_A2, ldda ); time = magma_sync_wtime( queue ); for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { #ifdef HAVE_CUBLAS cublasCswap( opts.handle, N, d_A1+ldda*j, 1, d_A2+ldda*(ipiv[j]-1), 1 ); #else magma_cswap( N, d_A1, ldda*j, 1, d_A2, ldda*(ipiv[j]-1), 1, opts.queue ); #endif } } time = magma_sync_wtime( queue ) - time; row_perf0 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_cswap( &N, h_A1+lda*j, &ione, h_A2+lda*(ipiv[j]-1), &ione); } } magma_cgetmatrix( N, N, d_A1, ldda, h_R1, lda ); magma_cgetmatrix( N, N, d_A2, ldda, h_R2, lda ); check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) || diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift; shift *= 2; /* Column Major */ init_matrix( N, N, h_A1, lda, 0 ); init_matrix( N, N, h_A2, lda, 100 ); magma_csetmatrix( N, N, h_A1, lda, d_A1, ldda ); magma_csetmatrix( N, N, h_A2, lda, d_A2, ldda ); time = magma_sync_wtime( queue ); for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { #ifdef HAVE_CUBLAS cublasCswap( opts.handle, N, d_A1+j, ldda, d_A2+ipiv[j]-1, ldda ); #else magma_cswap( N, d_A1, j, ldda, d_A2, ipiv[j]-1, ldda, opts.queue ); #endif } } time = magma_sync_wtime( queue ) - time; col_perf0 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_cswap( &N, h_A1+j, &lda, h_A2+(ipiv[j]-1), &lda); } } magma_cgetmatrix( N, N, d_A1, ldda, h_R1, lda ); magma_cgetmatrix( N, N, d_A2, ldda, h_R2, lda ); check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) || diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift; shift *= 2; /* ===================================================================== * cswap, row-by-row (2 matrices) */ /* Row Major */ init_matrix( N, N, h_A1, lda, 0 ); init_matrix( N, N, h_A2, lda, 100 ); magma_csetmatrix( N, N, h_A1, lda, d_A1, ldda ); magma_csetmatrix( N, N, h_A2, lda, d_A2, ldda ); time = magma_sync_wtime( queue ); for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { magmablas_cswap( N, d_A1+ldda*j, 1, d_A2+ldda*(ipiv[j]-1), 1); } } time = magma_sync_wtime( queue ) - time; row_perf1 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_cswap( &N, h_A1+lda*j, &ione, h_A2+lda*(ipiv[j]-1), &ione); } } magma_cgetmatrix( N, N, d_A1, ldda, h_R1, lda ); magma_cgetmatrix( N, N, d_A2, ldda, h_R2, lda ); check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) || diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift; 
shift *= 2; /* Column Major */ init_matrix( N, N, h_A1, lda, 0 ); init_matrix( N, N, h_A2, lda, 100 ); magma_csetmatrix( N, N, h_A1, lda, d_A1, ldda ); magma_csetmatrix( N, N, h_A2, lda, d_A2, ldda ); time = magma_sync_wtime( queue ); for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { magmablas_cswap( N, d_A1+j, ldda, d_A2+ipiv[j]-1, ldda ); } } time = magma_sync_wtime( queue ) - time; col_perf1 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_cswap( &N, h_A1+j, &lda, h_A2+(ipiv[j]-1), &lda); } } magma_cgetmatrix( N, N, d_A1, ldda, h_R1, lda ); magma_cgetmatrix( N, N, d_A2, ldda, h_R2, lda ); check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) || diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift; shift *= 2; /* ===================================================================== * cswapblk, blocked version (2 matrices) */ #ifdef HAVE_CUBLAS /* Row Major */ init_matrix( N, N, h_A1, lda, 0 ); init_matrix( N, N, h_A2, lda, 100 ); magma_csetmatrix( N, N, h_A1, lda, d_A1, ldda ); magma_csetmatrix( N, N, h_A2, lda, d_A2, ldda ); time = magma_sync_wtime( queue ); magmablas_cswapblk( MagmaRowMajor, N, d_A1, ldda, d_A2, ldda, 1, nb, ipiv, 1, 0); time = magma_sync_wtime( queue ) - time; row_perf2 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_cswap( &N, h_A1+lda*j, &ione, h_A2+lda*(ipiv[j]-1), &ione); } } magma_cgetmatrix( N, N, d_A1, ldda, h_R1, lda ); magma_cgetmatrix( N, N, d_A2, ldda, h_R2, lda ); check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) || diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift; shift *= 2; /* Column Major */ init_matrix( N, N, h_A1, lda, 0 ); init_matrix( N, N, h_A2, lda, 100 ); magma_csetmatrix( N, N, h_A1, lda, d_A1, ldda ); magma_csetmatrix( N, N, h_A2, lda, d_A2, ldda ); time = magma_sync_wtime( queue ); magmablas_cswapblk( MagmaColMajor, N, d_A1, ldda, d_A2, ldda, 1, nb, ipiv, 1, 0); time = magma_sync_wtime( queue ) - time; col_perf2 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_cswap( &N, h_A1+j, &lda, h_A2+(ipiv[j]-1), &lda); } } magma_cgetmatrix( N, N, d_A1, ldda, h_R1, lda ); magma_cgetmatrix( N, N, d_A2, ldda, h_R2, lda ); check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) || diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift; shift *= 2; #endif /* ===================================================================== * LAPACK-style claswp (1 matrix) */ #ifdef HAVE_CUBLAS /* Row Major */ init_matrix( N, N, h_A1, lda, 0 ); magma_csetmatrix( N, N, h_A1, lda, d_A1, ldda ); time = magma_sync_wtime( queue ); magmablas_claswp( N, d_A1, ldda, 1, nb, ipiv, 1); time = magma_sync_wtime( queue ) - time; row_perf4 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_cswap( &N, h_A1+lda*j, &ione, h_A1+lda*(ipiv[j]-1), &ione); } } magma_cgetmatrix( N, N, d_A1, ldda, h_R1, lda ); check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift; shift *= 2; #endif /* ===================================================================== * LAPACK-style claswp (1 matrix) - d_ipiv on GPU */ #ifdef HAVE_CUBLAS /* Row Major */ init_matrix( N, N, h_A1, lda, 0 ); magma_csetmatrix( N, N, h_A1, lda, d_A1, ldda ); time = magma_sync_wtime( queue ); magma_setvector( nb, sizeof(magma_int_t), ipiv, 1, d_ipiv, 1 ); magmablas_claswp2( N, d_A1, ldda, 1, nb, d_ipiv, 1 ); time = magma_sync_wtime( queue ) - time; row_perf7 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_cswap( &N, h_A1+lda*j, &ione, h_A1+lda*(ipiv[j]-1), &ione); } } magma_cgetmatrix( N, N, d_A1, ldda, h_R1, lda ); 
check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift; shift *= 2; #endif /* ===================================================================== * LAPACK-style claswpx (extended for row- and col-major) (1 matrix) */ #ifdef HAVE_CUBLAS /* Row Major */ init_matrix( N, N, h_A1, lda, 0 ); magma_csetmatrix( N, N, h_A1, lda, d_A1, ldda ); time = magma_sync_wtime( queue ); magmablas_claswpx( N, d_A1, ldda, 1, 1, nb, ipiv, 1); time = magma_sync_wtime( queue ) - time; row_perf5 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_cswap( &N, h_A1+lda*j, &ione, h_A1+lda*(ipiv[j]-1), &ione); } } magma_cgetmatrix( N, N, d_A1, ldda, h_R1, lda ); check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift; shift *= 2; /* Col Major */ init_matrix( N, N, h_A1, lda, 0 ); magma_csetmatrix( N, N, h_A1, lda, d_A1, ldda ); time = magma_sync_wtime( queue ); magmablas_claswpx( N, d_A1, 1, ldda, 1, nb, ipiv, 1); time = magma_sync_wtime( queue ) - time; col_perf5 = gbytes / time; #endif /* LAPACK swap on CPU for comparison */ time = magma_wtime(); lapackf77_claswp( &N, h_A1, &lda, &ione, &nb, ipiv, &ione); time = magma_wtime() - time; cpu_perf = gbytes / time; #ifdef HAVE_CUBLAS magma_cgetmatrix( N, N, d_A1, ldda, h_R1, lda ); check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift; shift *= 2; #endif /* ===================================================================== * Copy matrix. */ time = magma_sync_wtime( queue ); magma_ccopymatrix( N, nb, d_A1, ldda, d_A2, ldda ); time = magma_sync_wtime( queue ) - time; // copy reads 1 matrix and writes 1 matrix, so has half gbytes of swap col_perf6 = 0.5 * gbytes / time; time = magma_sync_wtime( queue ); magma_ccopymatrix( nb, N, d_A1, ldda, d_A2, ldda ); time = magma_sync_wtime( queue ) - time; // copy reads 1 matrix and writes 1 matrix, so has half gbytes of swap row_perf6 = 0.5 * gbytes / time; printf("%5d %3d %6.2f%c/ %6.2f%c %6.2f%c/ %6.2f%c %6.2f%c/ %6.2f%c %6.2f%c %6.2f%c %6.2f%c/ %6.2f%c %6.2f / %6.2f %6.2f %10s\n", (int) N, (int) nb, row_perf0, ((check & 0x001) != 0 ? '*' : ' '), col_perf0, ((check & 0x002) != 0 ? '*' : ' '), row_perf1, ((check & 0x004) != 0 ? '*' : ' '), col_perf1, ((check & 0x008) != 0 ? '*' : ' '), row_perf2, ((check & 0x010) != 0 ? '*' : ' '), col_perf2, ((check & 0x020) != 0 ? '*' : ' '), row_perf4, ((check & 0x040) != 0 ? '*' : ' '), row_perf7, ((check & 0x080) != 0 ? '*' : ' '), row_perf5, ((check & 0x100) != 0 ? '*' : ' '), col_perf5, ((check & 0x200) != 0 ? '*' : ' '), row_perf6, col_perf6, cpu_perf, (check == 0 ? "ok" : "* failed") ); status += ! (check == 0); TESTING_FREE_PIN( h_A1 ); TESTING_FREE_PIN( h_A2 ); TESTING_FREE_PIN( h_R1 ); TESTING_FREE_PIN( h_R2 ); TESTING_FREE_CPU( ipiv ); TESTING_FREE_CPU( ipiv2 ); TESTING_FREE_DEV( d_ipiv ); TESTING_FREE_DEV( d_A1 ); TESTING_FREE_DEV( d_A2 ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
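/*
   Editor's example (a hypothetical helper, not part of MAGMA): the pivot
   vectors above follow LAPACK getrf semantics -- ipiv is one-based and is
   applied sequentially, swapping row j with row ipiv[j]-1, which is not the
   same as a one-shot permutation. The sketch below expands such an ipiv into
   an explicit permutation vector, handy when cross-checking the swap kernels
   tested above.
*/
static void
apply_ipiv_to_perm( magma_int_t nb, const magma_int_t *ipiv,
                    magma_int_t n, magma_int_t *perm )
{
    for( magma_int_t i = 0; i < n; ++i )
        perm[i] = i;                      // start from the identity
    for( magma_int_t j = 0; j < nb; ++j ) {
        magma_int_t p = ipiv[j] - 1;      // ipiv is one-based
        magma_int_t t = perm[j];          // apply the swaps sequentially
        perm[j] = perm[p];
        perm[p] = t;
    }
}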
/** @deprecated
    Purpose
    -------
    CLAQPS computes a step of QR factorization with column pivoting
    of a complex M-by-N matrix A by using Blas-3. It tries to factorize
    NB columns from A starting from the row OFFSET+1, and updates all
    of the matrix with Blas-3 xGEMM.

    In some cases, due to catastrophic cancellations, it cannot
    factorize NB columns. Hence, the actual number of factorized
    columns is returned in KB.

    Block A(1:OFFSET,1:N) is accordingly pivoted, but not factorized.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows of the matrix A. M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A. N >= 0.

    @param[in]
    offset  INTEGER
            The number of rows of A that have been factorized in
            previous steps.

    @param[in]
    nb      INTEGER
            The number of columns to factorize.

    @param[out]
    kb      INTEGER
            The number of columns actually factorized.

    @param[in,out]
    dA      COMPLEX array, dimension (LDDA,N), on the GPU.
            On entry, the M-by-N matrix A.
            On exit, block A(OFFSET+1:M,1:KB) is the triangular factor obtained,
            and block A(1:OFFSET,1:N) has been accordingly pivoted, but not
            factorized. The rest of the matrix, block A(OFFSET+1:M,KB+1:N),
            has been updated.

    @param[in]
    ldda    INTEGER
            The leading dimension of the array A. LDDA >= max(1,M).

    @param[in,out]
    jpvt    INTEGER array, dimension (N)
            JPVT(I) = K <==> Column K of the full matrix A has been
            permuted into position I in AP.

    @param[out]
    tau     COMPLEX array, dimension (KB)
            The scalar factors of the elementary reflectors.

    @param[in,out]
    vn1     REAL array, dimension (N)
            The vector with the partial column norms.

    @param[in,out]
    vn2     REAL array, dimension (N)
            The vector with the exact column norms.

    @param[in,out]
    dauxv   COMPLEX array, dimension (NB), on the GPU
            Auxiliary vector.

    @param[in,out]
    dF      COMPLEX array, dimension (LDDF,NB), on the GPU
            Matrix F' = L*Y'*A.

    @param[in]
    lddf    INTEGER
            The leading dimension of the array F. LDDF >= max(1,N).

    @ingroup magma_cgeqp3_aux
    ********************************************************************/
extern "C" magma_int_t
magma_claqps_gpu(
    magma_int_t m, magma_int_t n, magma_int_t offset,
    magma_int_t nb, magma_int_t *kb,
    magmaFloatComplex_ptr dA,  magma_int_t ldda,
    magma_int_t *jpvt, magmaFloatComplex *tau,
    float *vn1, float *vn2,
    magmaFloatComplex_ptr dauxv,
    magmaFloatComplex_ptr dF,  magma_int_t lddf)
{
#define dA(i, j) (dA + (i) + (j)*(ldda))
#define dF(i, j) (dF + (i) + (j)*(lddf))

    magmaFloatComplex c_zero    = MAGMA_C_MAKE( 0.,0.);
    magmaFloatComplex c_one     = MAGMA_C_MAKE( 1.,0.);
    magmaFloatComplex c_neg_one = MAGMA_C_MAKE(-1.,0.);
    magma_int_t ione = 1;

    magma_int_t i__1, i__2;
    magmaFloatComplex z__1;

    magma_int_t k, rk;
    magmaFloatComplex_ptr dAks;
    magmaFloatComplex tauk = MAGMA_C_ZERO;
    magma_int_t pvt;
    float tol3z;
    magma_int_t itemp;

    float lsticc;
    magmaFloat_ptr dlsticcs;
    magma_smalloc( &dlsticcs, 1+256*(n+255)/256 );

    tol3z = magma_ssqrt( lapackf77_slamch("Epsilon"));

    lsticc = 0;
    k = 0;
    magma_cmalloc( &dAks, nb );

    magma_queue_t queue;
    magma_device_t cdev;
    magma_getdevice( &cdev );
    magma_queue_create( cdev, &queue );

    while( k < nb && lsticc == 0 ) {
        rk = offset + k;

        /* Determine ith pivot column and swap if necessary */
        // subtract 1 from Fortran/CUBLAS isamax; pvt, k are 0-based.
pvt = k + magma_isamax( n-k, &vn1[k], ione, queue ) - 1; if (pvt != k) { /* F gets swapped so F must be sent at the end to GPU */ i__1 = k; magmablas_cswap( m, dA(0, pvt), ione, dA(0, k), ione, queue ); magmablas_cswap( i__1, dF(pvt, 0), lddf, dF(k, 0), lddf, queue ); itemp = jpvt[pvt]; jpvt[pvt] = jpvt[k]; jpvt[k] = itemp; magma_sswap( 2, &vn1[pvt], n+offset, &vn1[k], n+offset, queue ); } /* Apply previous Householder reflectors to column K: A(RK:M,K) := A(RK:M,K) - A(RK:M,1:K-1)*F(K,1:K-1)'. Optimization: multiply with beta=0; wait for vector and subtract */ if (k > 0) { //#define RIGHT_UPDATE #ifdef RIGHT_UPDATE i__1 = m - offset - nb; i__2 = k; magma_cgemv( MagmaNoTrans, i__1, i__2, c_neg_one, A(offset+nb, 0), lda, F(k, 0), ldf, c_one, A(offset+nb, k), ione, queue ); #else i__1 = m - rk; i__2 = k; magma_cgemv( MagmaNoTrans, i__1, i__2, c_neg_one, dA(rk, 0), ldda, dF(k, 0), lddf, c_one, dA(rk, k), ione, queue ); #endif } /* Generate elementary reflector H(k). */ magma_clarfg_gpu( m-rk, dA(rk, k), dA(rk + 1, k), &tau[k], &vn1[k], &dAks[k], queue ); /* needed to avoid the race condition */ if (k == 0) magma_csetvector( 1, &c_one, 1, dA(rk, k), 1, queue ); else magma_ccopymatrix( 1, 1, dA(offset, 0), 1, dA(rk, k), 1, queue ); /* Compute Kth column of F: Compute F(K+1:N,K) := tau(K)*A(RK:M,K+1:N)'*A(RK:M,K) on the GPU */ if (k < n-1 || k > 0) magma_cgetvector( 1, &tau[k], 1, &tauk, 1, queue ); if (k < n-1) { i__1 = m - rk; i__2 = n - k - 1; /* Multiply on GPU */ magma_cgemv( MagmaConjTrans, m-rk, n-k-1, tauk, dA( rk, k+1 ), ldda, dA( rk, k ), 1, c_zero, dF( k+1, k ), 1, queue ); } /* Incremental updating of F: F(1:N,K) := F(1:N,K) - tau(K)*F(1:N,1:K-1)*A(RK:M,1:K-1)'*A(RK:M,K). F(1:N,K) := tau(K)*A(RK:M,K+1:N)'*A(RK:M,K) - tau(K)*F(1:N,1:K-1)*A(RK:M,1:K-1)'*A(RK:M,K) := tau(K)(A(RK:M,K+1:N)' - F(1:N,1:K-1)*A(RK:M,1:K-1)') A(RK:M,K) so, F is (updated A)*V */ if (k > 0) { z__1 = MAGMA_C_NEGATE( tauk ); #ifdef RIGHT_UPDATE i__1 = m - offset - nb; i__2 = k; magma_cgemv( MagmaConjTrans, i__1, i__2, z__1, dA(offset+nb, 0), lda, dA(offset+nb, k), ione, c_zero, dauxv, ione, queue ); i__1 = k; magma_cgemv( MagmaNoTrans, n-k-1, i__1, c_one, F(k+1,0), ldf, dauxv, ione, c_one, F(k+1,k), ione, queue ); #else i__1 = m - rk; i__2 = k; magma_cgemv( MagmaConjTrans, i__1, i__2, z__1, dA(rk, 0), ldda, dA(rk, k), ione, c_zero, dauxv, ione, queue ); /* I think we only need stricly lower-triangular part :) */ magma_cgemv( MagmaNoTrans, n-k-1, i__2, c_one, dF(k+1,0), lddf, dauxv, ione, c_one, dF(k+1,k), ione, queue ); #endif } /* Optimization: On the last iteration start sending F back to the GPU */ /* Update the current row of A: A(RK,K+1:N) := A(RK,K+1:N) - A(RK,1:K)*F(K+1:N,1:K)'. */ if (k < n-1) { i__1 = n - k - 1; i__2 = k + 1; #ifdef RIGHT_UPDATE /* right-looking update of rows, */ magma_cgemm( MagmaNoTrans, MagmaConjTrans, nb-k, i__1, ione, c_neg_one, dA(rk, k ), ldda, dF(k+1, k ), lddf, c_one, dA(rk, k+1), ldda, queue ); #else /* left-looking update of rows, * * since F=A'v with original A, so no right-looking */ magma_cgemm( MagmaNoTrans, MagmaConjTrans, ione, i__1, i__2, c_neg_one, dA(rk, 0 ), ldda, dF(k+1,0 ), lddf, c_one, dA(rk, k+1), ldda, queue ); #endif } /* Update partial column norms. 
*/ if (rk < min(m, n+offset)-1 ) { magmablas_scnrm2_row_check_adjust( n-k-1, tol3z, &vn1[k+1], &vn2[k+1], dA(rk,k+1), ldda, dlsticcs, queue ); //magma_device_sync(); magma_sgetvector( 1, &dlsticcs[0], 1, &lsticc, 1, queue ); } ++k; } magma_ccopymatrix( 1, k, dAks, 1, dA(offset, 0), ldda+1, queue ); // leave k as the last column done --k; *kb = k + 1; rk = offset + *kb - 1; /* Apply the block reflector to the rest of the matrix: A(OFFSET+KB+1:M,KB+1:N) := A(OFFSET+KB+1:M,KB+1:N) - A(OFFSET+KB+1:M,1:KB)*F(KB+1:N,1:KB)' */ if (*kb < min(n, m - offset)) { i__1 = m - rk - 1; i__2 = n - *kb; magma_cgemm( MagmaNoTrans, MagmaConjTrans, i__1, i__2, *kb, c_neg_one, dA(rk+1, 0 ), ldda, dF(*kb, 0 ), lddf, c_one, dA(rk+1, *kb), ldda, queue ); } /* Recomputation of difficult columns. */ if ( lsticc > 0 ) { // printf( " -- recompute dnorms --\n" ); magmablas_scnrm2_check( m-rk-1, n-*kb, dA(rk+1,*kb), ldda, &vn1[*kb], dlsticcs, queue ); magma_scopymatrix( n-*kb, 1, &vn1[*kb], *kb, &vn2[*kb], *kb, queue ); } magma_free( dAks ); magma_free( dlsticcs ); magma_queue_destroy( queue ); return MAGMA_SUCCESS; } /* magma_claqps */
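/*
   Editor's reference (illustration only): the partial column-norm update that
   magmablas_scnrm2_row_check_adjust performs on the GPU, written out as the
   scalar loop from LAPACK Working Note 176 -- the same logic appears commented
   out in the CPU path of the deprecated routine above. After eliminating pivot
   row rk, each trailing norm vn1[j] is downdated; when cancellation makes the
   downdate untrustworthy (temp2 <= tol3z), the column is chained through vn2
   for exact recomputation later. This sketch assumes the row A(rk, k+1:n-1)
   is accessible on the CPU.
*/
static void
downdate_column_norms( magma_int_t rk, magma_int_t k, magma_int_t n,
                       const magmaFloatComplex *A, magma_int_t lda,
                       float *vn1, float *vn2, float tol3z,
                       magma_int_t *lsticc )
{
    for( magma_int_t j = k+1; j < n; ++j ) {
        if ( vn1[j] != 0.f ) {
            float temp = MAGMA_C_ABS( A[rk + j*lda] ) / vn1[j];
            temp = max( 0.f, (1.f + temp)*(1.f - temp) );
            float ratio = vn1[j] / vn2[j];
            float temp2 = temp * (ratio*ratio);
            if ( temp2 <= tol3z ) {
                vn2[j]  = (float) *lsticc;   // chain the flagged columns
                *lsticc = j;
            }
            else {
                vn1[j] *= magma_ssqrt( temp );  // safe downdate
            }
        }
    }
}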
/**
    Purpose
    -------
    CGETRI computes the inverse of a matrix using the LU factorization
    computed by CGETRF. This method inverts U and then computes inv(A) by
    solving the system inv(A)*L = inv(U) for inv(A).

    Note that it is generally both faster and more accurate to use CGESV,
    or CGETRF and CGETRS, to solve the system AX = B, rather than inverting
    the matrix and multiplying to form X = inv(A)*B. Only in special
    instances should an explicit inverse be computed with this routine.

    Arguments
    ---------
    @param[in]
    n       INTEGER
            The order of the matrix A. N >= 0.

    @param[in,out]
    dA      COMPLEX array on the GPU, dimension (LDDA,N)
            On entry, the factors L and U from the factorization
            A = P*L*U as computed by CGETRF_GPU.
            On exit, if INFO = 0, the inverse of the original matrix A.

    @param[in]
    ldda    INTEGER
            The leading dimension of the array A. LDDA >= max(1,N).

    @param[in]
    ipiv    INTEGER array, dimension (N)
            The pivot indices from CGETRF; for 1 <= i <= N, row i of the
            matrix was interchanged with row IPIV(i).

    @param[out]
    dwork   (workspace) COMPLEX array on the GPU, dimension (MAX(1,LWORK))

    @param[in]
    lwork   INTEGER
            The dimension of the array DWORK. LWORK >= N*NB, where NB is
            the optimal blocksize returned by magma_get_cgetri_nb(n).
    \n
            Unlike LAPACK, this version does not currently support a
            workspace query, because the workspace is on the GPU.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
      -     > 0:  if INFO = i, U(i,i) is exactly zero; the matrix is
                  singular and its inverse cannot be computed.

    @ingroup magma_cgesv_comp
    ********************************************************************/
extern "C" magma_int_t
magma_cgetri_gpu( magma_int_t n, magmaFloatComplex *dA, magma_int_t ldda,
                  magma_int_t *ipiv, magmaFloatComplex *dwork,
                  magma_int_t lwork, magma_int_t *info )
{
#define dA(i, j)  (dA + (i) + (j)*ldda)
#define dL(i, j)  (dL + (i) + (j)*lddl)

    /* Local variables */
    magmaFloatComplex c_zero    = MAGMA_C_ZERO;
    magmaFloatComplex c_one     = MAGMA_C_ONE;
    magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE;
    magmaFloatComplex *dL = dwork;
    magma_int_t lddl = n;
    magma_int_t nb   = magma_get_cgetri_nb(n);
    magma_int_t j, jmax, jb, jp;

    *info = 0;
    if (n < 0)
        *info = -1;
    else if (ldda < max(1,n))
        *info = -3;
    else if ( lwork < n*nb )
        *info = -6;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if ( n == 0 )
        return *info;

    /* Invert the triangular factor U */
    magma_ctrtri_gpu( MagmaUpper, MagmaNonUnit, n, dA, ldda, info );
    if ( *info != 0 )
        return *info;

    jmax = ((n-1) / nb)*nb;
    for( j = jmax; j >= 0; j -= nb ) {
        jb = min( nb, n-j );

        // copy current block column of A to work space dL
        // (only needs lower trapezoid, but we also copy upper triangle),
        // then zero the strictly lower trapezoid block column of A.
        magmablas_clacpy( MagmaFull, n-j, jb,
                          dA(j,j), ldda,
                          dL(j,0), lddl );
        magmablas_claset( MagmaLower, n-j-1, jb, c_zero, c_zero,
                          dA(j+1,j), ldda );

        // compute current block column of Ainv
        // Ainv(:, j:j+jb-1)
        //   = ( U(:, j:j+jb-1) - Ainv(:, j+jb:n) L(j+jb:n, j:j+jb-1) )
        //   * L(j:j+jb-1, j:j+jb-1)^{-1}
        // where L(:, j:j+jb-1) is stored in dL.
if ( j+jb < n ) { magma_cgemm( MagmaNoTrans, MagmaNoTrans, n, jb, n-j-jb, c_neg_one, dA(0,j+jb), ldda, dL(j+jb,0), lddl, c_one, dA(0,j), ldda ); } // TODO use magmablas work interface magma_ctrsm( MagmaRight, MagmaLower, MagmaNoTrans, MagmaUnit, n, jb, c_one, dL(j,0), lddl, dA(0,j), ldda ); } // Apply column interchanges for( j = n-2; j >= 0; --j ) { jp = ipiv[j] - 1; if ( jp != j ) { magmablas_cswap( n, dA(0,j), 1, dA(0,jp), 1 ); } } return *info; }
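/*
   Editor's example (a minimal sketch, allocation-failure checks omitted):
   typical use of magma_cgetri_gpu -- factor A = P*L*U on the GPU with
   magma_cgetrf_gpu, then invert in place. The workspace size lwork = n*nb
   with nb = magma_get_cgetri_nb(n) follows the documentation above.
*/
static magma_int_t
example_invert_matrix( magma_int_t n, magmaFloatComplex *dA, magma_int_t ldda )
{
    magma_int_t info = 0;
    magma_int_t *ipiv;
    magma_int_t nb    = magma_get_cgetri_nb( n );
    magma_int_t lwork = n*nb;
    magmaFloatComplex *dwork;

    magma_imalloc_cpu( &ipiv, n );   // pivot indices live on the CPU
    magma_cmalloc( &dwork, lwork );  // workspace lives on the GPU

    magma_cgetrf_gpu( n, n, dA, ldda, ipiv, &info );
    if ( info == 0 ) {
        magma_cgetri_gpu( n, dA, ldda, ipiv, dwork, lwork, &info );
    }

    magma_free( dwork );
    magma_free_cpu( ipiv );
    return info;  // 0 on success; > 0 indicates a singular U(i,i)
}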
extern "C" magma_int_t magma_claqps_gpu(magma_int_t m, magma_int_t n, magma_int_t offset, magma_int_t nb, magma_int_t *kb, magmaFloatComplex *A, magma_int_t lda, magma_int_t *jpvt, magmaFloatComplex *tau, float *vn1, float *vn2, magmaFloatComplex *auxv, magmaFloatComplex *F, magma_int_t ldf) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= CLAQPS computes a step of QR factorization with column pivoting of a complex M-by-N matrix A by using Blas-3. It tries to factorize NB columns from A starting from the row OFFSET+1, and updates all of the matrix with Blas-3 xGEMM. In some cases, due to catastrophic cancellations, it cannot factorize NB columns. Hence, the actual number of factorized columns is returned in KB. Block A(1:OFFSET,1:N) is accordingly pivoted, but not factorized. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0 OFFSET (input) INTEGER The number of rows of A that have been factorized in previous steps. NB (input) INTEGER The number of columns to factorize. KB (output) INTEGER The number of columns actually factorized. A (input/output) COMPLEX*16 array, dimension (LDA,N) On entry, the M-by-N matrix A. On exit, block A(OFFSET+1:M,1:KB) is the triangular factor obtained and block A(1:OFFSET,1:N) has been accordingly pivoted, but no factorized. The rest of the matrix, block A(OFFSET+1:M,KB+1:N) has been updated. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,M). JPVT (input/output) INTEGER array, dimension (N) JPVT(I) = K <==> Column K of the full matrix A has been permuted into position I in AP. TAU (output) COMPLEX*16 array, dimension (KB) The scalar factors of the elementary reflectors. VN1 (input/output) DOUBLE PRECISION array, dimension (N) The vector with the partial column norms. VN2 (input/output) DOUBLE PRECISION array, dimension (N) The vector with the exact column norms. AUXV (input/output) COMPLEX*16 array, dimension (NB) Auxiliar vector. F (input/output) COMPLEX*16 array, dimension (LDF,NB) Matrix F' = L*Y'*A. LDF (input) INTEGER The leading dimension of the array F. LDF >= max(1,N). ===================================================================== */ #define A(i, j) (A + (i) + (j)*(lda )) #define F(i, j) (F + (i) + (j)*(ldf )) magmaFloatComplex c_zero = MAGMA_C_MAKE( 0.,0.); magmaFloatComplex c_one = MAGMA_C_MAKE( 1.,0.); magmaFloatComplex c_neg_one = MAGMA_C_MAKE(-1.,0.); magma_int_t ione = 1; magma_int_t i__1, i__2; //float d__1; magmaFloatComplex z__1; //magma_int_t j; magma_int_t k, rk; //magmaFloatComplex Akk; magmaFloatComplex *Aks; magmaFloatComplex tauk; magma_int_t pvt; //float temp, temp2; float tol3z; magma_int_t itemp; float lsticc, *lsticcs; magma_int_t lastrk; magma_smalloc( &lsticcs, 1+256*(n+255)/256 ); lastrk = min( m, n + offset ); tol3z = magma_ssqrt( lapackf77_slamch("Epsilon")); lsticc = 0; k = 0; magma_cmalloc( &Aks, nb ); while( k < nb && lsticc == 0 ) { rk = offset + k; /* Determine ith pivot column and swap if necessary */ // Fortran: pvt, k, isamax are all 1-based; subtract 1 from k. // C: pvt, k, isamax are all 0-based; don't subtract 1. pvt = k - 1 + magma_isamax( n-k, &vn1[k], ione ); if (pvt != k) { /*if (pvt >= nb) { // 1. 
Start copy from GPU magma_cgetmatrix_async( m - offset - nb, 1, dA(offset + nb, pvt), ldda, A (offset + nb, pvt), lda, stream ); }*/ /* F gets swapped so F must be sent at the end to GPU */ i__1 = k; /*if (pvt < nb){ // no need of transfer if pivot is within the panel blasf77_cswap( &m, A(0, pvt), &ione, A(0, k), &ione ); } else { // 1. Finish copy from GPU magma_queue_sync( stream ); // 2. Swap as usual on CPU blasf77_cswap(&m, A(0, pvt), &ione, A(0, k), &ione); // 3. Restore the GPU magma_csetmatrix_async( m - offset - nb, 1, A (offset + nb, pvt), lda, dA(offset + nb, pvt), ldda, stream); }*/ magmablas_cswap( m, A(0, pvt), ione, A(0, k), ione ); //blasf77_cswap( &i__1, F(pvt,0), &ldf, F(k,0), &ldf ); magmablas_cswap( i__1, F(pvt, 0), ldf, F(k, 0), ldf); itemp = jpvt[pvt]; jpvt[pvt] = jpvt[k]; jpvt[k] = itemp; //vn1[pvt] = vn1[k]; //vn2[pvt] = vn2[k]; #if defined(PRECISION_d) || defined(PRECISION_z) //magma_dswap( 1, &vn1[pvt], 1, &vn1[k], 1 ); //magma_dswap( 1, &vn2[pvt], 1, &vn2[k], 1 ); magma_dswap( 2, &vn1[pvt], n+offset, &vn1[k], n+offset ); #else //magma_sswap( 1, &vn1[pvt], 1, &vn1[k], 1 ); //magma_sswap( 1, &vn2[pvt], 1, &vn2[k], 1 ); magma_sswap(2, &vn1[pvt], n+offset, &vn1[k], n+offset); #endif } /* Apply previous Householder reflectors to column K: A(RK:M,K) := A(RK:M,K) - A(RK:M,1:K-1)*F(K,1:K-1)'. Optimization: multiply with beta=0; wait for vector and subtract */ if (k > 0) { /*#if (defined(PRECISION_c) || defined(PRECISION_z)) for (j = 0; j < k; ++j){ *F(k,j) = MAGMA_C_CNJG( *F(k,j) ); } #endif*/ //#define RIGHT_UPDATE #ifdef RIGHT_UPDATE i__1 = m - offset - nb; i__2 = k; magma_cgemv( MagmaNoTrans, i__1, i__2, c_neg_one, A(offset+nb, 0), lda, F(k, 0), ldf, c_one, A(offset+nb, k), ione ); #else i__1 = m - rk; i__2 = k; /*blasf77_cgemv( MagmaNoTransStr, &i__1, &i__2, &c_neg_one, A(rk, 0), &lda, F(k, 0), &ldf, &c_one, A(rk, k), &ione );*/ magma_cgemv( MagmaNoTrans, i__1, i__2, c_neg_one, A(rk, 0), lda, F(k, 0), ldf, c_one, A(rk, k), ione ); #endif /*#if (defined(PRECISION_c) || defined(PRECISION_z)) for (j = 0; j < k; ++j) { *F(k,j) = MAGMA_C_CNJG( *F(k,j) ); } #endif*/ } /* Generate elementary reflector H(k). 
*/ magma_clarfg_gpu(m-rk, A(rk, k), A(rk + 1, k), &tau[k], &vn1[k], &Aks[k]); //Akk = *A(rk, k); //*A(rk, k) = c_one; //magma_cgetvector( 1, &Aks[k], 1, &Akk, 1 ); /* needed to avoid the race condition */ if (k == 0) magma_csetvector( 1, &c_one, 1, A(rk, k), 1 ); else magma_ccopymatrix( 1, 1, A(offset, 0), 1, A(rk, k), 1 ); /* Compute Kth column of F: Compute F(K+1:N,K) := tau(K)*A(RK:M,K+1:N)'*A(RK:M,K) on the GPU */ if (k < n-1 || k > 0) magma_cgetvector( 1, &tau[k], 1, &tauk, 1 ); if (k < n-1) { i__1 = m - rk; i__2 = n - k - 1; /* Send the vector to the GPU */ //magma_csetmatrix( i__1, 1, A(rk, k), lda, dA(rk,k), ldda ); /* Multiply on GPU */ // was CALL CGEMV( 'Conjugate transpose', M-RK+1, N-K, // TAU( K ), A( RK, K+1 ), LDA, // A( RK, K ), 1, // CZERO, F( K+1, K ), 1 ) //magma_cgetvector( 1, &tau[k], 1, &tauk, 1 ); magma_cgemv( MagmaConjTrans, m-rk, n-k-1, tauk, A( rk, k+1 ), lda, A( rk, k ), 1, c_zero, F( k+1, k ), 1 ); //magma_cscal( m-rk, tau[k], F( k+1, k), 1 ); //magma_int_t i__3 = nb-k-1; //magma_int_t i__4 = i__2 - i__3; //magma_int_t i__5 = nb-k; //magma_cgemv( MagmaConjTrans, i__1 - i__5, i__2 - i__3, // tau[k], dA(rk +i__5, k+1+i__3), ldda, // dA(rk +i__5, k ), ione, // c_zero, dF(k+1+i__3, k ), ione ); //magma_cgetmatrix_async( i__2-i__3, 1, // dF(k + 1 +i__3, k), i__2, // F (k + 1 +i__3, k), i__2, stream ); //blasf77_cgemv( MagmaConjTransStr, &i__1, &i__3, // &tau[k], A(rk, k+1), &lda, // A(rk, k ), &ione, // &c_zero, F(k+1, k ), &ione ); //magma_queue_sync( stream ); //blasf77_cgemv( MagmaConjTransStr, &i__5, &i__4, // &tau[k], A(rk, k+1+i__3), &lda, // A(rk, k ), &ione, // &c_one, F(k+1+i__3, k ), &ione ); } /* Padding F(1:K,K) with zeros. for (j = 0; j <= k; ++j) { magma_csetvector( 1, &c_zero, 1, F(j, k), 1 ); }*/ /* Incremental updating of F: F(1:N,K) := F(1:N,K) - tau(K)*F(1:N,1:K-1)*A(RK:M,1:K-1)'*A(RK:M,K). F(1:N,K) := tau(K)*A(RK:M,K+1:N)'*A(RK:M,K) - tau(K)*F(1:N,1:K-1)*A(RK:M,1:K-1)'*A(RK:M,K) := tau(K)(A(RK:M,K+1:N)' - F(1:N,1:K-1)*A(RK:M,1:K-1)') A(RK:M,K) so, F is (updated A)*V */ //if (k > 0 && k<n-1) { if (k > 0) { //magma_cgetvector( 1, &tau[k], 1, &tauk, 1 ); z__1 = MAGMA_C_NEGATE( tauk ); #ifdef RIGHT_UPDATE i__1 = m - offset - nb; i__2 = k; magma_cgemv( MagmaConjTrans, i__1, i__2, z__1, A(offset+nb, 0), lda, A(offset+nb, k), ione, c_zero, auxv, ione ); i__1 = k; magma_cgemv( MagmaNoTrans, n-k-1, i__1, c_one, F(k+1,0), ldf, auxv, ione, c_one, F(k+1,k), ione ); #else i__1 = m - rk; i__2 = k; //blasf77_cgemv( MagmaConjTransStr, &i__1, &i__2, // &z__1, A(rk, 0), &lda, // A(rk, k), &ione, // &c_zero, auxv, &ione ); magma_cgemv( MagmaConjTrans, i__1, i__2, z__1, A(rk, 0), lda, A(rk, k), ione, c_zero, auxv, ione ); //i__1 = k; //blasf77_cgemv( MagmaNoTransStr, &n, &i__1, // &c_one, F(0,0), &ldf, // auxv, &ione, // &c_one, F(0,k), &ione ); /*magma_cgemv( MagmaNoTrans, n, i__1, c_one, F(0,0), ldf, auxv, ione, c_one, F(0,k), ione );*/ /* I think we only need stricly lower-triangular part :) */ magma_cgemv( MagmaNoTrans, n-k-1, i__2, c_one, F(k+1,0), ldf, auxv, ione, c_one, F(k+1,k), ione ); #endif } /* Optimization: On the last iteration start sending F back to the GPU */ /* Update the current row of A: A(RK,K+1:N) := A(RK,K+1:N) - A(RK,1:K)*F(K+1:N,1:K)'. 
*/ if (k < n-1) { i__1 = n - k - 1; i__2 = k + 1; //blasf77_cgemm( MagmaNoTransStr, MagmaConjTransStr, &ione, &i__1, &i__2, // &c_neg_one, A(rk, 0 ), &lda, // F(k+1,0 ), &ldf, // &c_one, A(rk, k+1), &lda ); #ifdef RIGHT_UPDATE /* right-looking update of rows, */ magma_cgemm( MagmaNoTrans, MagmaConjTrans, nb-k, i__1, ione, c_neg_one, A(rk, k ), lda, F(k+1, k ), ldf, c_one, A(rk, k+1), lda ); #else /* left-looking update of rows, * * since F=A'v with original A, so no right-looking */ magma_cgemm( MagmaNoTrans, MagmaConjTrans, ione, i__1, i__2, c_neg_one, A(rk, 0 ), lda, F(k+1,0 ), ldf, c_one, A(rk, k+1), lda ); #endif } /* Update partial column norms. */ if (rk < min(m, n+offset)-1 ){ magmablas_scnrm2_row_check_adjust(n-k-1, tol3z, &vn1[k+1], &vn2[k+1], A(rk,k+1), lda, lsticcs); magma_device_sync(); #if defined(PRECISION_d) || defined(PRECISION_z) magma_dgetvector( 1, &lsticcs[0], 1, &lsticc, 1 ); #else magma_sgetvector( 1, &lsticcs[0], 1, &lsticc, 1 ); #endif } /*if (rk < lastrk) { for (j = k + 1; j < n; ++j) { if (vn1[j] != 0.) { // NOTE: The following 4 lines follow from the analysis in // Lapack Working Note 176. temp = MAGMA_C_ABS( *A(rk,j) ) / vn1[j]; temp = max( 0., ((1. + temp) * (1. - temp)) ); d__1 = vn1[j] / vn2[j]; temp2 = temp * (d__1 * d__1); if (temp2 <= tol3z) { vn2[j] = (float) lsticc; lsticc = j; } else { vn1[j] *= magma_ssqrt(temp); } } } }*/ //*A(rk, k) = Akk; //magma_csetvector( 1, &Akk, 1, A(rk, k), 1 ); //magma_cswap( 1, &Aks[k], 1, A(rk, k), 1 ); ++k; } magma_ccopymatrix( 1, k, Aks, 1, A(offset, 0), lda+1 ); // leave k as the last column done --k; *kb = k + 1; rk = offset + *kb - 1; /* Apply the block reflector to the rest of the matrix: A(OFFSET+KB+1:M,KB+1:N) := A(OFFSET+KB+1:M,KB+1:N) - A(OFFSET+KB+1:M,1:KB)*F(KB+1:N,1:KB)' */ if (*kb < min(n, m - offset)) { i__1 = m - rk - 1; i__2 = n - *kb; /* Send F to the GPU magma_csetmatrix( i__2, *kb, F (*kb, 0), ldf, dF(*kb, 0), i__2 );*/ magma_cgemm( MagmaNoTrans, MagmaConjTrans, i__1, i__2, *kb, c_neg_one, A(rk+1, 0 ), lda, F(*kb, 0 ), ldf, c_one, A(rk+1, *kb), lda ); } /* Recomputation of difficult columns. */ if( lsticc > 0 ) { printf( " -- recompute dnorms --\n" ); magmablas_scnrm2_check(m-rk-1, n-*kb, A(rk+1,*kb), lda, &vn1[*kb], lsticcs); #if defined(PRECISION_d) || defined(PRECISION_z) magma_dcopymatrix( n-*kb, 1, &vn1[*kb], *kb, &vn2[*kb], *kb); #else magma_scopymatrix( n-*kb, 1, &vn1[*kb], *kb, &vn2[*kb], *kb); #endif /*while( lsticc > 0 ) { itemp = (magma_int_t)(vn2[lsticc] >= 0. ? floor(vn2[lsticc] + .5) : -floor(.5 - vn2[lsticc])); i__1 = m - rk - 1; if (lsticc <= nb) vn1[lsticc] = cblas_scnrm2(i__1, A(rk + 1, lsticc), ione); else { // Where is the data, CPU or GPU ? float r1, r2; r1 = cblas_scnrm2(nb-k, A(rk + 1, lsticc), ione); r2 = magma_scnrm2(m-offset-nb, dA(offset + nb + 1, lsticc), ione); vn1[lsticc] = magma_ssqrt(r1*r1+r2*r2); } // NOTE: The computation of VN1( LSTICC ) relies on the fact that // SNRM2 does not fail on vectors with norm below the value of SQRT(SLAMCH('S')) vn2[lsticc] = vn1[lsticc]; lsticc = itemp;*/ } magma_free(Aks); magma_free(lsticcs); return MAGMA_SUCCESS; } /* magma_claqps */
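/*
   Editor's sketch of intent only: the "recomputation of difficult columns"
   fallback that is commented out above, cleaned up into a standalone loop.
   Flagged columns are chained through vn2 (vn2[lsticc] stores, as a float,
   the previously flagged index); each gets its norm recomputed exactly and
   the chain is walked until empty. The shipped code instead recomputes all
   trailing norms on the GPU via magmablas_scnrm2_check; this version assumes
   the relevant columns are accessible on the CPU.
*/
static void
recompute_difficult_columns( magma_int_t m, magma_int_t rk,
                             magma_int_t lsticc,
                             const magmaFloatComplex *A, magma_int_t lda,
                             float *vn1, float *vn2 )
{
    while ( lsticc > 0 ) {
        // round the stashed chain link back to an integer index
        magma_int_t next = (magma_int_t)( vn2[lsticc] >= 0.f
                                          ? floorf( vn2[lsticc] + 0.5f )
                                          : -floorf( 0.5f - vn2[lsticc] ) );
        vn1[lsticc] = magma_cblas_scnrm2( m - rk - 1,
                                          A + (rk+1) + lsticc*lda, 1 );
        // SNRM2 stays accurate even for norms below sqrt(SLAMCH('S')),
        // so vn2 can simply mirror vn1 after the exact recomputation.
        vn2[lsticc] = vn1[lsticc];
        lsticc = next;
    }
}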
extern "C" magma_int_t magma_ctstrf_gpu( char storev, magma_int_t m, magma_int_t n, magma_int_t ib, magma_int_t nb, magmaFloatComplex *hU, magma_int_t ldhu, magmaFloatComplex *dU, magma_int_t lddu, magmaFloatComplex *hA, magma_int_t ldha, magmaFloatComplex *dA, magma_int_t ldda, magmaFloatComplex *hL, magma_int_t ldhl, magmaFloatComplex *dL, magma_int_t lddl, magma_int_t *ipiv, magmaFloatComplex *hwork, magma_int_t ldhwork, magmaFloatComplex *dwork, magma_int_t lddwork, magma_int_t *info) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= CSSSSM applies the LU factorization update from a complex matrix formed by a lower triangular IB-by-K tile L1 on top of a M2-by-K tile L2 to a second complex matrix formed by a M1-by-N1 tile A1 on top of a M2-by-N2 tile A2 (N1 == N2). This is the right-looking Level 2.5 BLAS version of the algorithm. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. IB (input) INTEGER The inner-blocking size. IB >= 0. NB (input) INTEGER The blocking size. NB >= 0. hU (input,output) COMPLEX array, dimension(LDHU, N), on cpu. On entry, the NB-by-N upper triangular tile hU. On exit, the content is incomplete. Shouldn't be used. LDHU (input) INTEGER The leading dimension of the array hU. LDHU >= max(1,NB). dU (input,output) COMPLEX array, dimension(LDDU, N), on gpu. On entry, the NB-by-N upper triangular tile dU identical to hU. On exit, the new factor U from the factorization. LDDU (input) INTEGER The leading dimension of the array dU. LDDU >= max(1,NB). hA (input,output) COMPLEX array, dimension(LDHA, N), on cpu. On entry, only the M-by-IB first panel needs to be identical to dA(1..M, 1..IB). On exit, the content is incomplete. Shouldn't be used. LDHA (input) INTEGER The leading dimension of the array hA. LDHA >= max(1,M). dA (input,output) COMPLEX array, dimension(LDDA, N) , on gpu. On entry, the M-by-N tile to be factored. On exit, the factor L from the factorization LDDA (input) INTEGER The leading dimension of the array dA. LDDA >= max(1,M). hL (output) COMPLEX array, dimension(LDHL, K), on vpu. On exit, contains in the upper part the IB-by-K lower triangular tile, and in the lower part IB-by-K the inverse of the top part. LDHL (input) INTEGER The leading dimension of the array hL. LDHL >= max(1,2*IB). dL (output) COMPLEX array, dimension(LDDL, K), on gpu. On exit, contains in the upper part the IB-by-K lower triangular tile, and in the lower part IB-by-K the inverse of the top part. LDDL (input) INTEGER The leading dimension of the array dL. LDDL >= max(1,2*IB). hWORK (output) COMPLEX array, dimension(LDHWORK, 2*IB), on cpu. Workspace. LDHWORK (input) INTEGER The leading dimension of the array hWORK. LDHWORK >= max(NB, 1). dWORK (output) COMPLEX array, dimension(LDDWORK, 2*IB), on gpu. Workspace. LDDWORK (input) INTEGER The leading dimension of the array dWORK. LDDWORK >= max(NB, 1). IPIV (output) INTEGER array on the cpu. The pivot indices array of size K as returned by CTSTRF INFO (output) INTEGER - PLASMA_SUCCESS successful exit - < 0 if INFO = -k, the k-th argument had an illegal value - > 0 if INFO = k, U(k,k) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. 
===================================================================== */ #define UT(i,j) (dUT + (i)*ib*lddu + (j)*ib ) #define AT(i,j) (dAT + (i)*ib*ldda + (j)*ib ) #define L(i) (dL + (i)*ib*lddl ) #define L2(i) (dL2 + (i)*ib*lddl ) #define hU(i,j) (hU + (j)*ib*ldhu + (i)*ib ) #define hA(i,j) (hA + (j)*ib*ldha + (i)*ib ) #define hL(i) (hL + (i)*ib*ldhl ) #define hL2(i) (hL2 + (i)*ib*ldhl ) magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; int iinfo = 0; int maxm, mindim; int i, j, im, s, ip, ii, sb, p = 1; magmaFloatComplex *dAT, *dUT; magmaFloatComplex *dAp, *dUp; #ifndef WITHOUTTRTRI magmaFloatComplex *dL2 = dL + ib; magmaFloatComplex *hL2 = hL + ib; p = 2; #endif /* Check input arguments */ *info = 0; if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (ib < 0) { *info = -3; } else if ((lddu < max(1,m)) && (m > 0)) { *info = -6; } else if ((ldda < max(1,m)) && (m > 0)) { *info = -8; } else if ((lddl < max(1,ib)) && (ib > 0)) { *info = -10; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* quick return */ if ((m == 0) || (n == 0) || (ib == 0)) return *info; ip = 0; /* Function Body */ mindim = min(m, n); s = mindim / ib; if ( ib >= mindim ) { /* Use CPU code. */ CORE_ctstrf(m, n, ib, nb, (PLASMA_Complex32_t*)hU, ldhu, (PLASMA_Complex32_t*)hA, ldha, (PLASMA_Complex32_t*)hL, ldhl, ipiv, (PLASMA_Complex32_t*)hwork, ldhwork, info); #ifndef WITHOUTTRTRI CORE_clacpy( PlasmaUpperLower, mindim, mindim, (PLASMA_Complex32_t*)hL, ldhl, (PLASMA_Complex32_t*)hL2, ldhl ); CORE_ctrtri( PlasmaLower, PlasmaUnit, mindim, (PLASMA_Complex32_t*)hL2, ldhl, info ); if (*info != 0 ) { fprintf(stderr, "ERROR, trtri returned with info = %d\n", *info); } #endif if ( (storev == 'R') || (storev == 'r') ) { magma_csetmatrix( m, n, hU, ldhu, dwork, lddwork ); magmablas_ctranspose( dU, lddu, dwork, lddwork, m, n ); magma_csetmatrix( m, n, hA, ldha, dwork, lddwork ); magmablas_ctranspose( dA, ldda, dwork, lddwork, m, n ); } else { magma_csetmatrix( m, n, hU, ldhu, dU, lddu ); magma_csetmatrix( m, n, hA, ldha, dA, ldda ); } magma_csetmatrix( p*ib, n, hL, ldhl, dL, lddl ); } else { /* Use hybrid blocked code. 
*/ maxm = ((m + 31)/32)*32; if ( (storev == 'C') || (storev == 'c') ) { magmablas_cgetmo_in( dU, dUT, lddu, m, n ); magmablas_cgetmo_in( dA, dAT, ldda, m, n ); } else { dUT = dU; dAT = dA; } dAp = dwork; dUp = dAp + ib*lddwork; ip = 0; for( i=0; i<s; i++ ) { ii = i * ib; sb = min(mindim-ii, ib); if ( i>0 ){ // download i-th panel magmablas_ctranspose( dUp, lddu, UT(0, i), lddu, sb, ii ); magmablas_ctranspose( dAp, ldda, AT(0, i), ldda, sb, m ); magma_cgetmatrix( ii, sb, dUp, lddu, hU(0, i), ldhu ); magma_cgetmatrix( m, sb, dAp, ldda, hA(0, i), ldha ); // make sure that gpu queue is empty //magma_device_sync(); #ifndef WITHOUTTRTRI magma_ctrmm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit, n-(ii+sb), ib, c_one, L2(i-1), lddl, UT(i-1, i+1), lddu); #else magma_ctrsm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit, n-(ii+sb), ib, c_one, L(i-1), lddl, UT(i-1, i+1), lddu); #endif magma_cgemm( MagmaNoTrans, MagmaNoTrans, n-(ii+sb), m, ib, c_neg_one, UT(i-1, i+1), lddu, AT(0, i-1), ldda, c_one, AT(0, i+1), ldda ); } // do the cpu part CORE_ctstrf(m, sb, ib, nb, (PLASMA_Complex32_t*)hU(i, i), ldhu, (PLASMA_Complex32_t*)hA(0, i), ldha, (PLASMA_Complex32_t*)hL(i), ldhl, ipiv+ii, (PLASMA_Complex32_t*)hwork, ldhwork, info); if ( (*info == 0) && (iinfo > 0) ) *info = iinfo + ii; // Need to swap betw U and A #ifndef NOSWAPBLK magmablas_cswapblk( 'R', n-(ii+sb), UT(i, i+1), lddu, AT(0, i+1), ldda, 1, sb, ipiv+ii, 1, nb ); for(j=0; j<ib; j++) { im = ipiv[ip]-1; if ( im == j ) { ipiv[ip] += ii; } ip++; } #else for(j=0; j<ib; j++) { im = ipiv[ip]-1; if ( im != (j) ) { im = im - nb; assert( (im>=0) && (im<m) ); magmablas_cswap( n-(ii+sb), UT(i, i+1)+j*lddu, 1, AT(0, i+1)+im*ldda, 1 ); } else { ipiv[ip] += ii; } ip++; } #endif #ifndef WITHOUTTRTRI CORE_clacpy( PlasmaUpperLower, sb, sb, (PLASMA_Complex32_t*)hL(i), ldhl, (PLASMA_Complex32_t*)hL2(i), ldhl ); CORE_ctrtri( PlasmaLower, PlasmaUnit, sb, (PLASMA_Complex32_t*)hL2(i), ldhl, info ); if (*info != 0 ) { fprintf(stderr, "ERROR, trtri returned with info = %d\n", *info); } #endif // upload i-th panel magma_csetmatrix( sb, sb, hU(i, i), ldhu, dUp, lddu ); magma_csetmatrix( m, sb, hA(0, i), ldha, dAp, ldda ); magma_csetmatrix( p*ib, sb, hL(i), ldhl, L(i), lddl ); magmablas_ctranspose( UT(i, i), lddu, dUp, lddu, sb, sb); magmablas_ctranspose( AT(0, i), ldda, dAp, ldda, m, sb); // make sure that gpu queue is empty //magma_device_sync(); // do the small non-parallel computations if ( s > (i+1) ) { #ifndef WITHOUTTRTRI magma_ctrmm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit, sb, sb, c_one, L2(i), lddl, UT(i, i+1), lddu); #else magma_ctrsm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit, sb, sb, c_one, L(i), lddl, UT(i, i+1), lddu); #endif magma_cgemm( MagmaNoTrans, MagmaNoTrans, sb, m, sb, c_neg_one, UT(i, i+1), lddu, AT(0, i ), ldda, c_one, AT(0, i+1), ldda ); } else { #ifndef WITHOUTTRTRI magma_ctrmm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit, n-mindim, sb, c_one, L2(i), lddl, UT(i, i+1), lddu); #else magma_ctrsm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit, n-mindim, sb, c_one, L(i), lddl, UT(i, i+1), lddu); #endif magma_cgemm( MagmaNoTrans, MagmaNoTrans, n-mindim, m, sb, c_neg_one, UT(i, i+1), lddu, AT(0, i ), ldda, c_one, AT(0, i+1), ldda ); } } if ( (storev == 'C') || (storev == 'c') ) { magmablas_cgetmo_out( dU, dUT, lddu, m, n ); magmablas_cgetmo_out( dA, dAT, ldda, m, n ); } } return *info; }
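/*
   Minimal usage sketch for magma_ctstrf_gpu on a single tile pair, with
   workspace sizes taken from the leading-dimension requirements documented
   above.  The helper name, the choice storev = 'C', and the tile setup are
   assumptions for illustration; tile contents and error handling are
   omitted.
*/
#if 0
static magma_int_t example_tstrf_tile( magma_int_t m, magma_int_t n,
                                       magma_int_t ib, magma_int_t nb )
{
    magma_int_t info;
    magma_int_t ldhu = nb, lddu = nb, ldha = m, ldda = m;
    magma_int_t ldhl = 2*ib, lddl = 2*ib, ldhwork = nb, lddwork = nb;
    magmaFloatComplex *hU, *hA, *hL, *hwork, *dU, *dA, *dL, *dwork;
    magma_int_t *ipiv;

    magma_cmalloc_cpu( &hU,    ldhu*n );
    magma_cmalloc_cpu( &hA,    ldha*n );
    magma_cmalloc_cpu( &hL,    ldhl*n );
    magma_cmalloc_cpu( &hwork, ldhwork*2*ib );
    magma_imalloc_cpu( &ipiv,  min(m, n) );
    magma_cmalloc( &dU,    lddu*n );
    magma_cmalloc( &dA,    ldda*n );
    magma_cmalloc( &dL,    lddl*n );
    magma_cmalloc( &dwork, lddwork*2*ib );

    // ... fill hU/dU with the NB-by-N upper triangular tile and
    //     hA/dA with the M-by-N tile to be factored ...

    magma_ctstrf_gpu( 'C', m, n, ib, nb,
                      hU, ldhu, dU, lddu,
                      hA, ldha, dA, ldda,
                      hL, ldhl, dL, lddl,
                      ipiv,
                      hwork, ldhwork, dwork, lddwork, &info );

    // On success: dU holds the new factor U, dA the factor L, dL the L
    // tile together with its inverse (see above), ipiv the pivot indices.

    magma_free_cpu( hU );    magma_free_cpu( hA );
    magma_free_cpu( hL );    magma_free_cpu( hwork );
    magma_free_cpu( ipiv );
    magma_free( dU );  magma_free( dA );  magma_free( dL );  magma_free( dwork );
    return info;
}
#endif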
/**
    Purpose
    =======

    CLAHEF computes a partial factorization of a complex Hermitian matrix A
    using the Bunch-Kaufman diagonal pivoting method.  The partial
    factorization has the form:

    A  =  ( I  U12 ) ( A11  0  ) ( I    0   )   if UPLO = 'U', or:
          ( 0  U22 ) ( 0    D  ) ( U12' U22' )

    A  =  ( L11  0 ) (  D   0  ) ( L11' L21' )  if UPLO = 'L'
          ( L21  I ) (  0  A22 ) (  0    I   )

    where the order of D is at most NB.  The actual order is returned in
    the argument KB, and is either NB or NB-1, or N if N <= NB.
    Note that U' denotes the conjugate transpose of U.

    CLAHEF is an auxiliary routine called by CHETRF.  It uses blocked code
    (calling Level 3 BLAS) to update the submatrix A11 (if UPLO = 'U') or
    A22 (if UPLO = 'L').

    Arguments
    ---------
    @param[in]
    UPLO    CHARACTER
            Specifies whether the upper or lower triangular part of the
            Hermitian matrix A is stored:
      -     = 'U':  Upper triangular
      -     = 'L':  Lower triangular

    @param[in]
    N       INTEGER
            The order of the matrix A.  N >= 0.

    @param[in]
    NB      INTEGER
            The maximum number of columns of the matrix A that should be
            factored.  NB should be at least 2 to allow for 2-by-2 pivot
            blocks.

    @param[out]
    KB      INTEGER
            The number of columns of A that were actually factored.
            KB is either NB-1 or NB, or N if N <= NB.

    @param[in,out]
    A       COMPLEX array, dimension (LDA,N)
            On entry, the Hermitian matrix A.  If UPLO = 'U', the leading
            n-by-n upper triangular part of A contains the upper triangular
            part of the matrix A, and the strictly lower triangular part of
            A is not referenced.  If UPLO = 'L', the leading n-by-n lower
            triangular part of A contains the lower triangular part of the
            matrix A, and the strictly upper triangular part of A is not
            referenced.
            On exit, A contains details of the partial factorization.

    @param[in]
    LDA     INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).

    @param[out]
    ipiv    INTEGER array, dimension (N)
            Details of the interchanges and the block structure of D.
            If UPLO = 'U', only the last KB elements of ipiv are set;
            if UPLO = 'L', only the first KB elements are set.
    \n
            If ipiv(k) > 0, then rows and columns k and ipiv(k) were
            interchanged and D(k,k) is a 1-by-1 diagonal block.
            If UPLO = 'U' and ipiv(k) = ipiv(k-1) < 0, then rows and
            columns k-1 and -ipiv(k) were interchanged and D(k-1:k,k-1:k)
            is a 2-by-2 diagonal block.  If UPLO = 'L' and ipiv(k) =
            ipiv(k+1) < 0, then rows and columns k+1 and -ipiv(k) were
            interchanged and D(k:k+1,k:k+1) is a 2-by-2 diagonal block.

    @param[out]
    W       (workspace) COMPLEX array, dimension (LDW,NB)

    @param[in]
    LDW     INTEGER
            The leading dimension of the array W.  LDW >= max(1,N).

    @param[out]
    INFO    INTEGER
      -     = 0: successful exit
      -     > 0: if INFO = k, D(k,k) is exactly zero.  The factorization
                 has been completed, but the block diagonal matrix D is
                 exactly singular.

    @ingroup magma_chetrf_comp
    ********************************************************************/
extern "C" magma_int_t
magma_clahef_gpu( magma_uplo_t uplo, magma_int_t n, magma_int_t nb, magma_int_t *kb,
                  magmaFloatComplex *hA, magma_int_t lda,
                  magmaFloatComplex_ptr dA, size_t dA_offset, magma_int_t ldda,
                  magma_int_t *ipiv,
                  magmaFloatComplex_ptr dW, size_t dW_offset, magma_int_t lddw,
                  magma_queue_t queue, magma_int_t *info)
{
    /* .. Parameters .. */
    float d_one   = 1.0;
    float d_zero  = 0.0;
    float d_eight = 8.0;
    float d_seven = 7.0;
#if defined(PRECISION_c)
    float f_zero  = 0.0;
#endif
    magmaFloatComplex c_one  = MAGMA_C_ONE;
    magmaFloatComplex c_mone = -MAGMA_C_ONE;
    magma_int_t upper = (uplo == MagmaUpper);
    magma_int_t ione  = 1;

    /* .. Local Scalars ..
*/ magma_int_t imax = 0, jmax = 0, kk, kkW, kp, kstep, iinfo; float abs_akk, alpha, colmax, R1, rowmax; magmaFloatComplex Zimax, Z; #define dA(i, j) dA, dA_offset + (j)*ldda + (i) #define dW(i, j) dW, dW_offset + (j)*lddw + (i) #define A(i, j) (hA + (j)*lda + (i)) /* .. Executable Statements .. */ *info = 0; /* Initialize alpha for use in choosing pivot block size. */ alpha = ( d_one+sqrt( d_seven ) ) / d_eight; magma_event_t event = NULL; if( upper ) { /* Factorize the trailing columns of A using the upper triangle of A and working backwards, and compute the matrix W = U12*D for use in updating A11 (note that conjg(W) is actually stored) K is the main loop index, decreasing from N in steps of 1 or 2 KW is the column of W which corresponds to column K of A */ int k, kw = 0; for (k = n-1; k+1 > max(n-nb+1, nb); k -= kstep) { kw = nb - (n-k); /* Copy column K of A to column KW of W and update it */ magma_ccopy( k+1, dA( 0, k ), 1, dW( 0, kw ), 1, queue ); // set imaginary part of diagonal to be zero #if defined(PRECISION_z) magma_dsetvector_async( 1, &d_zero, 1, dW, 2*(k+ kw*lddw+dW_offset)+1, 1, queue, &event); magma_queue_sync( queue ); #elif defined(PRECISION_c) magma_ssetvector_async( 1, &f_zero, 1, dW, 2*(k+ kw*lddw+dW_offset)+1, 1, queue, &event); magma_queue_sync( queue ); #endif if (k+1 < n) { magma_cgemv( MagmaNoTrans, k+1, n-(k+1), c_mone, dA( 0, k+1 ), ldda, dW( k, kw+1 ), lddw, c_one, dW( 0, kw ), ione, queue ); // set imaginary part of diagonal to be zero #if defined(PRECISION_z) magma_dsetvector_async( 1, &d_zero, 1, dW, 2*(k+ kw*lddw+dW_offset)+1, 1, queue, &event ); magma_queue_sync( queue ); #elif defined(PRECISION_c) magma_ssetvector_async( 1, &f_zero, 1, dW, 2*(k+ kw*lddw+dW_offset)+1, 1, queue, &event ); magma_queue_sync( queue ); #endif } kstep = 1; /* Determine rows and columns to be interchanged and whether a 1-by-1 or 2-by-2 pivot block will be used */ magma_cgetvector_async( 1, dW( k, kw ), 1, &Z, 1, queue, &event ); magma_queue_sync( queue ); abs_akk = fabs( MAGMA_C_REAL( Z ) ); /* imax is the row-index of the largest off-diagonal element in column K, and colmax is its absolute value */ if( k > 0 ) { // magma is one-base imax = magma_icamax( k, dW( 0, kw ), 1, queue ) - 1; magma_cgetvector( 1, dW( imax, kw ), 1, &Z, 1, queue ); colmax = MAGMA_C_ABS1( Z ); } else { colmax = d_zero; } if( max( abs_akk, colmax ) == 0.0 ) { /* Column K is zero: set INFO and continue */ if ( *info == 0 ) *info = k; kp = k; #if defined(PRECISION_z) magma_dsetvector_async( 1, &d_zero, 1, dA, 2*(k+ k*ldda+dA_offset)+1, 1, queue, &event ); magma_queue_sync( queue ); #elif defined(PRECISION_c) magma_ssetvector_async( 1, &f_zero, 1, dA, 2*(k+ k*ldda+dA_offset)+1, 1, queue, &event ); magma_queue_sync( queue ); #endif } else { if( abs_akk >= alpha*colmax ) { /* no interchange, use 1-by-1 pivot block */ kp = k; } else { /* Copy column imax to column KW-1 of W and update it */ magma_ccopy( imax+1, dA( 0, imax ), 1, dW( 0, kw-1 ), 1, queue ); #if defined(PRECISION_z) magma_dsetvector_async( 1, &d_zero, 1, dW, 2*(imax+ (kw-1)*lddw+dW_offset)+1, 1, queue, &event ); #elif defined(PRECISION_c) magma_ssetvector_async( 1, &f_zero, 1, dW, 2*(imax+ (kw-1)*lddw+dW_offset)+1, 1, queue, &event ); #endif #if defined(PRECISION_z) || defined(PRECISION_c) magmablas_clacpy_cnjg( k-imax, dA(imax,imax+1), ldda, dW(imax+1,kw-1), 1, queue ); #else magma_ccopy( k-imax, dA(imax,imax+1), ldda, dW(imax+1,kw-1), 1, queue ); #endif if( k+1 < n ) { magma_cgemv( MagmaNoTrans, k+1, n-(k+1), c_mone, dA( 0, k+1 ), ldda, dW( 
imax, kw+1 ), lddw, c_one, dW( 0, kw-1 ), ione, queue ); #if defined(PRECISION_z) magma_dsetvector_async( 1, &d_zero, 1, dW, 2*(imax+ (kw-1)*lddw+dW_offset)+1, 1, queue, &event ); #elif defined(PRECISION_c) magma_ssetvector_async( 1, &f_zero, 1, dW, 2*(imax+ (kw-1)*lddw+dW_offset)+1, 1, queue, &event ); #endif } magma_cgetvector_async( 1, dW( imax, kw-1 ), 1, &Zimax, 1, queue, &event ); magma_queue_sync( queue ); /* jmax is the column-index of the largest off-diagonal element in row imax, and rowmax is its absolute value */ jmax = imax + magma_icamax( k-imax, dW( imax+1, kw-1 ), 1, queue ); magma_cgetvector( 1, dW( jmax, kw-1 ), 1, &Z, 1, queue ); rowmax = MAGMA_C_ABS1( Z ); if ( imax > 0 ) { // magma is one-base jmax = magma_icamax( imax, dW( 0, kw-1 ), 1, queue ) - 1; magma_cgetvector( 1, dW( jmax, kw-1 ), 1, &Z, 1, queue ); rowmax = max( rowmax, MAGMA_C_ABS1( Z ) ); } if( abs_akk >= alpha*colmax*( colmax / rowmax ) ) { /* no interchange, use 1-by-1 pivot block */ kp = k; } else if ( fabs( MAGMA_C_REAL( Zimax ) ) >= alpha*rowmax ) { /* interchange rows and columns K and imax, use 1-by-1 pivot block */ kp = imax; /* copy column KW-1 of W to column KW */ magma_ccopy( k+1, dW( 0, kw-1 ), 1, dW( 0, kw ), 1, queue ); } else { /* interchange rows and columns K-1 and imax, use 2-by-2 pivot block */ kp = imax; kstep = 2; } } kk = k - kstep + 1; kkW = nb - (n - kk); /* Updated column kp is already stored in column kkW of W */ if( kp != kk ) { /* Interchange rows kk and kp in last kk columns of A and W */ // note: row-swap A(:,kk) magmablas_cswap( n-kk, dA( kk, kk ), ldda, dA( kp, kk ), ldda, queue ); magmablas_cswap( n-kk, dW( kk, kkW), lddw, dW( kp, kkW), lddw, queue ); /* Copy non-updated column kk to column kp */ #if defined(PRECISION_z) || defined(PRECISION_c) magmablas_clacpy_cnjg( kk-kp-1, dA( kp+1, kk ), 1, dA( kp, kp+1 ), ldda, queue ); #else magma_ccopy( kk-kp-1, dA( kp+1, kk ), 1, dA( kp, kp+1 ), ldda, queue ); #endif // now A(kp,kk) should be A(kk,kk), and copy to A(kp,kp) magma_ccopy( kp+1, dA( 0, kk ), 1, dA( 0, kp ), 1, queue ); #if defined(PRECISION_z) magma_dsetvector_async( 1, &d_zero, 1, dA, 2*(kp+ kp*ldda+dA_offset)+1, 1, queue, &event ); magma_queue_sync( queue ); #elif defined(PRECISION_c) magma_ssetvector_async( 1, &f_zero, 1, dA, 2*(kp+ kp*ldda+dA_offset)+1, 1, queue, &event ); #endif } if( kstep == 1 ) { /* 1-by-1 pivot block D(k): column KW of W now holds W(k) = U(k)*D(k) where U(k) is the k-th column of U Store U(k) in column k of A */ magma_ccopy( k+1, dW( 0, kw ), 1, dA( 0, k ), 1, queue ); if ( k > 0 ) { magma_cgetvector_async( 1, dA( k, k ), 1, &Z, 1, queue, &event ); magma_queue_sync( queue ); R1 = d_one / MAGMA_C_REAL( Z ); magma_csscal( k, R1, dA( 0, k ), 1, queue ); /* Conjugate W(k) */ #if defined(PRECISION_z) || defined(PRECISION_c) magmablas_clacpy_cnjg( k, dW( 0, kw ), 1, dW( 0, kw ), 1, queue ); #endif } } else { /* 2-by-2 pivot block D(k): columns KW and KW-1 of W now hold ( W(k-1) W(k) ) = ( U(k-1) U(k) )*D(k) where U(k) and U(k-1) are the k-th and (k-1)-th columns of U */ if( k > 1 ) { /* Store U(k) and U(k-1) in columns k and k-1 of A */ magmablas_clascl_2x2( MagmaUpper, k-1, dW(0, kw-1), lddw, dA(0,k-1), ldda, &iinfo, queue ); } /* Copy D(k) to A */ magma_ccopymatrix( 2, 2, dW( k-1, kw-1 ), lddw, dA( k-1, k-1 ), ldda, queue ); /* Conjugate W(k) and W(k-1) */ #if defined(PRECISION_z) || defined(PRECISION_c) magmablas_clacpy_cnjg( k, dW( 0, kw ), 1, dW( 0, kw ), 1, queue ); magmablas_clacpy_cnjg( k-1, dW( 0, kw-1 ), 1, dW( 0, kw-1 ), 1, queue ); #endif } 
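/* The interchange bookkeeping recorded just below follows the LAPACK
   convention documented above: ipiv entries are 1-based, and a 2-by-2
   pivot block stores the same negated index in two consecutive entries. */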
} /* Store details of the interchanges in ipiv */ if( kstep == 1 ) { ipiv[ k ] = 1+kp; } else { ipiv[ k ] = -(1+kp); ipiv[ k-1 ] = -(1+kp); } } /* Update the upper triangle of A11 (= A(1:k,1:k)) as A11 := A11 - U12*D*U12' = A11 - U12*W' computing blocks of NB columns at a time (note that conjg(W) is actually stored) */ kw = nb - (n-k); for (int j = ( k / nb )*nb; j >= 0; j -= nb ) { int jb = min( nb, k-j+1 ); #ifdef SYMMETRIC_UPDATE /* Update the upper triangle of the diagonal block */ for (int jj = j; jj < j + jb; jj++) { #if defined(PRECISION_z) magma_dsetvector_async( 1, &d_zero, 1, dA, 2*(jj+ jj*ldda+dA_offset)+1, 1, queue, &event ); #elif defined(PRECISION_c) magma_ssetvector_async( 1, &f_zero, 1, dA, 2*(jj+ jj*ldda+dA_offset)+1, 1, queue, &event ); #endif magma_cgemv( MagmaNoTrans, jj-j+1, n-(k+1), c_mone, dA( j, k+1 ), ldda, dW( jj, kw+1 ), lddw, c_one, dA( j, jj ), 1, queue ); #if defined(PRECISION_z) magma_dsetvector_async( 1, &d_zero, 1, dA, 2*(jj+ jj*ldda+dA_offset)+1, 1, queue, &event ); #elif defined(PRECISION_c) magma_ssetvector_async( 1, &f_zero, 1, dA, 2*(jj+ jj*ldda+dA_offset)+1, 1, queue, &event ); #endif } /* Update the rectangular superdiagonal block */ magma_cgemm( MagmaNoTrans, MagmaTrans, j, jb, n-(k+1), c_mone, dA( 0, k+1 ), ldda, dW( j, kw+1 ), lddw, c_one, dA( 0, j ), ldda, queue ); #else #if defined(PRECISION_z) magmablas_dlaset(MagmaUpperLower, 1, jb, 0, 0, dA, 2*(j+ j*ldda+dA_offset)+1, 2*(1+ldda), queue ); #elif defined(PRECISION_c) magmablas_slaset(MagmaUpperLower, 1, jb, 0, 0, dA, 2*(j+ j*ldda+dA_offset)+1, 2*(1+ldda), queue ); #endif magma_cgemm( MagmaNoTrans, MagmaTrans, j+jb, jb, n-(k+1), c_mone, dA( 0, k+1 ), ldda, dW( j, kw+1 ), lddw, c_one, dA( 0, j ), ldda, queue ); #if defined(PRECISION_z) magmablas_dlaset(MagmaUpperLower, 1, jb, 0, 0, dA, 2*(j+ j*ldda+dA_offset)+1, 2*(1+ldda), queue ); #elif defined(PRECISION_c) magmablas_slaset(MagmaUpperLower, 1, jb, 0, 0, dA, 2*(j+ j*ldda+dA_offset)+1, 2*(1+ldda), queue ); #endif #endif } /* Put U12 in standard form by partially undoing the interchanges in columns k+1:n */ for (int j = k+1; j < n;) { int jj = j; int jp = ipiv[ j ]; if( jp < 0 ) { jp = -jp; j = j + 1; } j = j + 1; jp = jp - 1; if( jp != jj && j < n ) magmablas_cswap( n-j, dA( jp, j ), ldda, dA( jj, j ), ldda, queue ); } // copying the panel back to CPU magma_cgetmatrix_async( n, n-(k+1), dA(0,k+1), ldda, A(0,k+1), lda, queue, &event ); magma_queue_sync( queue ); /* Set KB to the number of columns factorized */ *kb = n - (k+1); } else { /* Factorize the leading columns of A using the lower triangle of A and working forwards, and compute the matrix W = L21*D for use in updating A22 (note that conjg(W) is actually stored) K is the main loop index, increasing from 1 in steps of 1 or 2 */ int k; for (k = 0; k < min(nb-1,n); k += kstep) { /* Copy column K of A to column K of W and update it */ /* -------------------------------------------------------------- */ magma_ccopy( n-k, dA( k, k ), 1, dW( k, k ), 1, queue ); // set imaginary part of diagonal to be zero #if defined(PRECISION_z) magma_dsetvector_async( 1, &d_zero, 1, dW, 2*(k*lddw+k+dW_offset)+1, 1, queue, &event); magma_queue_sync( queue ); #elif defined(PRECISION_c) magma_ssetvector_async( 1, &f_zero, 1, dW, 2*(k*lddw+k+dW_offset)+1, 1, queue, &event); magma_queue_sync( queue ); #endif /* -------------------------------------------------------------- */ magma_cgemv( MagmaNoTrans, n-k, k, c_mone, dA( k, 0 ), ldda, dW( k, 0 ), lddw, c_one, dW( k, k ), ione, queue ); // re-set imaginary part of 
diagonal to be zero #if defined(PRECISION_z) magma_dsetvector_async( 1, &d_zero, 1, dW, 2*(k*lddw+k+dW_offset)+1, 1, queue, &event ); magma_queue_sync( queue ); #elif defined(PRECISION_c) magma_ssetvector_async( 1, &f_zero, 1, dW, 2*(k*lddw+k+dW_offset)+1, 1, queue, &event ); magma_queue_sync( queue ); #endif kstep = 1; /* Determine rows and columns to be interchanged and whether a 1-by-1 or 2-by-2 pivot block will be used */ magma_cgetvector_async( 1, dW( k, k ), 1, &Z, 1, queue, &event ); magma_queue_sync( queue ); abs_akk = fabs( MAGMA_C_REAL( Z ) ); /* imax is the row-index of the largest off-diagonal element in column K, and colmax is its absolute value */ if( k < n-1 ) { // magmablas is one-base imax = k + magma_icamax( n-k-1, dW(k+1,k), 1, queue ); magma_cgetvector( 1, dW( imax,k ), 1, &Z, 1, queue ); colmax = MAGMA_C_ABS1( Z ); } else { colmax = d_zero; } if ( max( abs_akk, colmax ) == 0.0 ) { /* Column K is zero: set INFO and continue */ if( *info == 0 ) *info = k; kp = k; // make sure the imaginary part of diagonal is zero #if defined(PRECISION_z) magma_dsetvector_async( 1, &d_zero, 1, dA, 2*(k*ldda+k+dA_offset)+1, 1, queue, &event ); magma_queue_sync( queue ); #elif defined(PRECISION_c) magma_ssetvector_async( 1, &f_zero, 1, dA, 2*(k*ldda+k+dA_offset)+1, 1, queue, &event ); magma_queue_sync( queue ); #endif } else { if ( abs_akk >= alpha*colmax ) { /* no interchange, use 1-by-1 pivot block */ kp = k; } else { /* Copy column imax to column K+1 of W and update it */ #if defined(PRECISION_z) || defined(PRECISION_c) magmablas_clacpy_cnjg( imax-k, dA(imax,k), ldda, dW(k,k+1), 1, queue ); #else magma_ccopy( imax-k, dA( imax, k ), ldda, dW( k, k+1 ), 1, queue ); #endif magma_ccopy( n-imax, dA( imax, imax ), 1, dW( imax, k+1 ), 1, queue ); #if defined(PRECISION_z) magma_dsetvector_async( 1, &d_zero, 1, dW, 2*((k+1)*lddw+imax+dW_offset)+1, 1, queue, &event); magma_queue_sync( queue ); #elif defined(PRECISION_c) magma_ssetvector_async( 1, &f_zero, 1, dW, 2*((k+1)*lddw+imax+dW_offset)+1, 1, queue, &event); magma_queue_sync( queue ); #endif magma_cgemv( MagmaNoTrans, n-k, k, c_mone, dA( k, 0 ), ldda, dW( imax, 0 ), lddw, c_one, dW( k, k+1 ), ione, queue ); #if defined(PRECISION_z) magma_dsetvector_async( 1, &d_zero, 1, dW, 2*((k+1)*lddw+imax+dW_offset)+1, 1, queue, &event); magma_queue_sync( queue ); #elif defined(PRECISION_c) magma_ssetvector_async( 1, &f_zero, 1, dW, 2*((k+1)*lddw+imax+dW_offset)+1, 1, queue, &event); magma_queue_sync( queue ); #endif magma_cgetvector_async( 1, dW(imax,k+1), 1, &Zimax, 1, queue, &event); magma_queue_sync( queue ); /* jmax is the column-index of the largest off-diagonal element in row imax, and rowmax is its absolute value */ // magmablas is one-base jmax = k-1 + magma_icamax( imax-k, dW(k, k+1), 1, queue ); magma_cgetvector( 1, dW(jmax,k+1), 1, &Z, 1, queue ); rowmax = MAGMA_C_ABS1( Z ); if( imax < n-1 ) { // magmablas is one-base jmax = imax + magma_icamax( (n-1)-imax, dW(imax+1,k+1), 1, queue); magma_cgetvector( 1, dW(jmax,k+1), 1, &Z, 1, queue ); rowmax = max( rowmax, MAGMA_C_ABS1( Z ) ); } if( abs_akk >= alpha*colmax*( colmax / rowmax ) ) { /* no interchange, use 1-by-1 pivot block */ kp = k; } else if( fabs( MAGMA_C_REAL( Zimax ) ) >= alpha*rowmax ) { /* interchange rows and columns K and imax, use 1-by-1 pivot block */ kp = imax; /* copy column K+1 of W to column K */ magma_ccopy( n-k, dW( k, k+1 ), 1, dW( k, k ), 1, queue ); } else { /* interchange rows and columns K+1 and imax, use 2-by-2 pivot block */ kp = imax; kstep = 2; } } kk = k + kstep - 
1; /* Updated column kp is already stored in column kk of W */ if( kp != kk ) { /* Copy non-updated column kk to column kp */ /* ------------------------------------------------------------------ */ #if defined(PRECISION_z) || defined(PRECISION_c) magmablas_clacpy_cnjg( kp-kk, dA( kk, kk ), 1, dA( kp, kk ), ldda, queue ); #else magma_ccopy( kp-kk, dA( kk, kk ), 1, dA( kp, kk ), ldda, queue ); #endif if ( kp < n ) { magma_ccopy( n-kp, dA( kp, kk), 1, dA( kp, kp ), 1, queue ); } /* ------------------------------------------------------------------ */ /* Interchange rows kk and kp in first kk columns of A and W */ magmablas_cswap( kk+1, dA( kk, 0 ), ldda, dA( kp, 0 ), ldda, queue ); magmablas_cswap( kk+1, dW( kk, 0 ), lddw, dW( kp, 0 ), lddw, queue ); } if ( kstep == 1 ) { /* 1-by-1 pivot block D(k): column k of W now holds W(k) = L(k)*D(k) where L(k) is the k-th column of L Store L(k) in column k of A */ magma_ccopy( n-k, dW( k, k ), 1, dA( k, k ), 1, queue ); if ( k < n-1 ) { magma_cgetvector_async( 1, dA(k,k), 1, &Z, 1, queue, &event ); magma_queue_sync( queue ); R1 = d_one / MAGMA_C_REAL( Z ); magma_csscal((n-1)-k, R1, dA( k+1,k ), 1, queue); /* Conjugate W(k) */ #if defined(PRECISION_z) || defined(PRECISION_c) magmablas_clacpy_cnjg( (n-1)-k, dW( k+1,k ), 1, dW( k+1,k ), 1, queue ); #endif } } else { /* 2-by-2 pivot block D(k): columns k and k+1 of W now hold ( W(k) W(k+1) ) = ( L(k) L(k+1) )*D(k) where L(k) and L(k+1) are the k-th and (k+1)-th columns of L */ magmablas_clascl_2x2( MagmaLower, n-(k+2), dW(k,k), lddw, dA(k+2,k), ldda, &iinfo, queue ); /* Copy D(k) to A */ magma_ccopymatrix( 2, 2, dW( k, k ), lddw, dA( k, k ), ldda, queue ); /* Conjugate W(k) and W(k+1) */ #if defined(PRECISION_z) || defined(PRECISION_c) magmablas_clacpy_cnjg( (n-1)-k, dW( k+1,k ), 1, dW( k+1,k ), 1, queue ); magmablas_clacpy_cnjg( (n-1)-k-1, dW( k+2,k+1), 1, dW( k+2,k+1 ), 1, queue ); #endif } } /* Store details of the interchanges in ipiv */ if ( kstep == 1 ) { ipiv[k] = kp+1; } else { ipiv[k] = -kp-1; ipiv[k+1] = -kp-1; } } /* Update the lower triangle of A22 (= A(k:n,k:n)) as A22 := A22 - L21*D*L21' = A22 - L21*W' computing blocks of NB columns at a time (note that conjg(W) is actually stored) */ for( int j = k; j < n; j += nb ) { int jb = min( nb, n-j ); /* Update the lower triangle of the diagonal block */ #ifdef SYMMETRIC_UPDATE for (int jj = j; jj < j + jb; jj++) { int jnb = j + jb - jj; /* -------------------------------------------------------- */ magma_cgemv( MagmaNoTrans, jnb, k, c_mone, dA( jj, 0 ), ldda, dW( jj, 0 ), lddw, c_one, dA( jj, jj ), ione, queue ); /* -------------------------------------------------------- */ } /* Update the rectangular subdiagonal block */ if( j+jb < n ) { int nk = n - (j+jb); /* -------------------------------------------- */ magma_cgemm( MagmaNoTrans, MagmaTrans, nk, jb, k, c_mone, dA( j+jb, 0 ), ldda, dW( j, 0 ), lddw, c_one, dA( j+jb, j ), ldda, queue ); /* ------------------------------------------- */ } #else #if defined(PRECISION_z) magmablas_dlaset(MagmaUpperLower, 1, jb, 0, 0, dA, 2*(j*ldda+j+dA_offset)+1, 2*(1+ldda), queue ); #elif defined(PRECISION_c) magmablas_slaset(MagmaUpperLower, 1, jb, 0, 0, dA, 2*(j*ldda+j+dA_offset)+1, 2*(1+ldda), queue ); #endif magma_cgemm( MagmaNoTrans, MagmaTrans, n-j, jb, k, c_mone, dA( j, 0 ), ldda, dW( j, 0 ), lddw, c_one, dA( j, j ), ldda, queue ); #if defined(PRECISION_z) magmablas_dlaset(MagmaUpperLower, 1, jb, 0, 0, dA, 2*(j*ldda+j+dA_offset)+1, 2*(1+ldda), queue ); #elif defined(PRECISION_c) 
magmablas_slaset(MagmaUpperLower, 1, jb, 0, 0, dA, 2*(j*ldda+j+dA_offset)+1, 2*(1+ldda), queue ); #endif #endif } /* Put L21 in standard form by partially undoing the interchanges in columns 1:k-1 */ for (int j = k; j > 0;) { int jj = j; int jp = ipiv[j-1]; if( jp < 0 ) { jp = -jp; j--; } j--; if ( jp != jj && j >= 1 ) { magmablas_cswap( j, dA( jp-1,0 ), ldda, dA( jj-1,0 ), ldda, queue ); } } // copying the panel back to CPU magma_cgetmatrix_async( n, k, dA(0,0), ldda, A(0,0), lda, queue, &event ); magma_queue_sync( queue ); /* Set KB to the number of columns factorized */ *kb = k; } return *info; /* End of CLAHEF */ }
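/*
   Minimal usage sketch: how a blocked CHETRF-type driver can consume
   magma_clahef_gpu for the UPLO = MagmaLower case, advancing by the KB
   columns factored per call.  Queue creation, host/device transfers, the
   unblocked factorization of the final block, and the ipiv adjustment are
   only indicated in comments; all names below are assumptions for
   illustration.
*/
#if 0
static void example_chetrf_lower( magma_int_t n, magma_int_t nb,
                                  magmaFloatComplex *hA, magma_int_t lda,
                                  magmaFloatComplex_ptr dA, magma_int_t ldda,
                                  magma_int_t *ipiv, magma_queue_t queue )
{
    magma_int_t k = 0, kb = 0, info = 0;
    magmaFloatComplex_ptr dW;
    magma_cmalloc( &dW, ldda*nb );   // workspace W; lddw = ldda >= n assumed

    while ( n - k > nb && info == 0 ) {
        // Partially factor the trailing (n-k)-by-(n-k) matrix A(k:n-1, k:n-1).
        magma_clahef_gpu( MagmaLower, n - k, nb, &kb,
                          hA + k + k*lda, lda,
                          dA, (size_t)(k + k*ldda), ldda,
                          ipiv + k,
                          dW, 0, ldda, queue, &info );
        // NOTE: the entries written to ipiv+k are relative to the trailing
        // submatrix; a real driver shifts them by k afterwards (cf. LAPACK
        // chetrf), and would likewise offset a nonzero info.
        k += kb;
    }
    // ... factor the remaining block with unblocked code (e.g. chetf2) ...

    magma_free( dW );
}
#endif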