// // m, n - dimensions in the source (input) matrix. // This routine copies the hA matrix from the CPU // to dAT on the GPU. In addition, the output matrix // is transposed. The routine uses a buffer of size // 2*lddwork*nb pointed to by dwork (lddwork > m) on the GPU. // Note that lda >= m and lddat >= n. // extern "C" void magmablas_zsetmatrix_transpose_q( magma_int_t m, magma_int_t n, const magmaDoubleComplex *hA, magma_int_t lda, magmaDoubleComplex_ptr dAT, magma_int_t ldda, magmaDoubleComplex_ptr dwork, magma_int_t lddwork, magma_int_t nb, magma_queue_t queues[2] ) { #define hA(i_, j_) (hA + (i_) + (j_)*lda) #define dAT(i_, j_) (dAT + (i_) + (j_)*ldda) #define dwork(i_, j_) (dwork + (i_) + (j_)*lddwork) magma_int_t i = 0, j = 0, ib; /* Quick return */ if ( (m == 0) || (n == 0) ) return; // TODO standard check arguments if (lda < m || ldda < n || lddwork < m) { fprintf( stderr, "%s: wrong arguments.\n", __func__ ); return; } /* Move data from CPU to GPU in the first panel in the dwork buffer */ ib = min(n-i, nb); magma_zsetmatrix_async( m, ib, hA(0,i), lda, dwork(0,(j%2)*nb), lddwork, queues[j%2] ); j++; for (i=nb; i < n; i += nb) { /* Move data from CPU to GPU in the second panel in the dwork buffer */ ib = min(n-i, nb); magma_zsetmatrix_async( m, ib, hA(0,i), lda, dwork(0,(j%2)*nb), lddwork, queues[j%2] ); j++; /* Note that the previous panel (i.e., j%2) comes through the queue for the kernel so there is no need to synchronize. */ // TODO should this be ib not nb? magmablas_ztranspose_q( m, nb, dwork(0,(j%2)*nb), lddwork, dAT(i-nb,0), ldda, queues[j%2] ); } /* Transpose the last part of the matrix. */ j++; magmablas_ztranspose_q( m, ib, dwork(0,(j%2)*nb), lddwork, dAT(i-nb,0), ldda, queues[j%2] ); }
// static void GCrypto::test() { size_t testLen = DIGEST_BYTES * 5 / 2; char* pA = new char[3 * testLen]; ArrayHolder<char> hA(pA); char* pB = pA + testLen; char* pC = pB + testLen; GRand r(1234); for(size_t i = 0; i < testLen; i++) pA[i] = (unsigned char)r.next(); memcpy(pB, pA, testLen); { GCrypto c("password", 8); c.doChunk(pB, testLen); if(memcmp(pA, pB, testLen) == 0) ThrowError("Didn't do anything"); } memcpy(pC, pB, testLen); { GCrypto c("passworx", 8); c.doChunk(pB, testLen); if(memcmp(pA, pB, testLen) == 0) ThrowError("This should have failed"); } { GCrypto c("password", 8); c.doChunk(pC, testLen); if(memcmp(pA, pC, testLen) != 0) ThrowError("This should have worked"); } }
// Returns true only when both hA() and hB() hold for the given trajectory.
// hB() is not evaluated when hA() is false.
bool TpsOrderParameter::isTransitionPath(TpsTrajectory& trajectory)
{
	return hA(trajectory) && hB(trajectory);
}
void testDeviceVector() { const int aSize = 64; std::vector<int> hA(aSize), hB(aSize); bolt::cl::device_vector<int> dA(aSize), dB(aSize); for(int i=0; i<aSize; i++) { hA[i] = hB[i] = dB[i] = dA[i] = i; }; int hSum = std::inner_product(hA.begin(), hA.end(), hB.begin(), 1); int sum = bolt::cl::inner_product( dA.begin(), dA.end(), dB.begin(), 1, bolt::cl::plus<int>(), bolt::cl::multiplies<int>() ); };
void testDeviceVector() { const int aSize = 1000; std::vector<int> hA(aSize); bolt::cl::device_vector<int> dA(aSize); for(int i=0; i<aSize; i++) { hA[i] = i; dA[i] = i; }; std::vector<int>::iterator smaxdex = std::max_element(hA.begin(), hA.end()); bolt::cl::device_vector<int>::iterator bmaxdex = bolt::cl::max_element(dA.begin(), dA.end(),bolt::cl::greater<int>()); };
// // m, n - dimensions in the source (input) matrix. // This routine copies the hA matrix from the CPU // to dAT on the GPU. In addition, the output matrix // is transposed. The routine uses a buffer of size // 2*lddw*nb pointed to by dwork (lddw > m) on the GPU. // Note that lda >= m and lddat >= n. // extern "C" void magmablas_ssetmatrix_transpose_mgpu( magma_int_t ngpu, magma_queue_t queues[][2], const float *hA, magma_int_t lda, magmaFloat_ptr dAT[], magma_int_t ldda, magmaFloat_ptr dwork[], magma_int_t lddw, magma_int_t m, magma_int_t n, magma_int_t nb) { #define hA(j) (hA + (j)*lda) #define dwork(d, j) (dwork[(d)] + (j)*nb*lddw) #define dAT(d, j) (dAT[(d)] + (j)*nb) magma_int_t nqueues = 2, d, j, j_local, id, ib; /* Quick return */ if ( (m == 0) || (n == 0) ) return; if (lda < m || ngpu*ldda < n || lddw < m) { fprintf( stderr, "%s: wrong arguments (%d<%d), (%d*%d<%d), or (%d<%d).\n", __func__, (int) lda, (int) m, (int) ngpu, (int) ldda, (int) n, (int) lddw, (int) m ); return; } /* Move data from CPU to GPU by block columns and transpose it */ for (j=0; j < n; j += nb) { d = (j/nb)%ngpu; j_local = (j/nb)/ngpu; id = j_local%nqueues; magma_setdevice(d); ib = min(n-j, nb); magma_ssetmatrix_async( m, ib, hA(j), lda, dwork(d, id), lddw, queues[d][id] ); magmablas_stranspose_q( m, ib, dwork(d,id), lddw, dAT(d,j_local), ldda, queues[d][id] ); } }
// Test driver that prints the results of magmablas_zsymmetrize and
// magmablas_zsymmetrize_tiles for several matrix sizes and tile sizes.
// For each size n in nsize[] it builds an n-by-n host matrix, and for every
// variant uploads it to dA, applies the routine, downloads the result into hR
// and prints it with magma_zprint. Nothing is checked programmatically; the
// printed matrices are meant to be inspected. argc and argv are not used.
int main( int argc, char** argv)
{
    // hA(i,j) is the address of entry (i,j) of the column-major host matrix.
    #define hA(i,j) (hA + (i) + (j)*lda)

    TESTING_CUDA_INIT();

    // Only referenced by the commented-out zlaset call below.
    cuDoubleComplex c_zero = MAGMA_Z_ZERO;
    cuDoubleComplex c_one  = MAGMA_Z_ONE;

    cuDoubleComplex *hA, *hR, *dA;
    //real_Double_t   gpu_time, gpu_perf;

    //int ione     = 1;
    //int ISEED[4] = {0, 0, 0, 1};

    int nsize[] = { 32, 64, 96, 256, 100, 200, 512 };
    int ntest = sizeof(nsize) / sizeof(int);
    // The buffers are sized from the last entry of nsize[] (512), which is
    // also the largest entry, so they fit every test size.
    int n   = nsize[ntest-1];
    int lda = ((n + 31)/32)*32;   // n rounded up to a multiple of 32
    int ntile, nb;

    TESTING_MALLOC   ( hA, cuDoubleComplex, lda*n );
    TESTING_MALLOC   ( hR, cuDoubleComplex, lda*n );
    TESTING_DEVALLOC ( dA, cuDoubleComplex, lda*n );

    for( int t = 0; t < ntest; ++t ) {
        n = nsize[t];
        lda = ((n + 31)/32)*32;

        // initialize matrices; entries are (i.j) for A
        // Strictly-upper entries are scaled by an extra 1/nf so they differ
        // from the lower entries, which makes the copy direction visible.
        double nf = 100.;
        for( int j = 0; j < n; ++j ) {
            // upper
            for( int i = 0; i < j; ++i ) {
                *hA(i,j) = MAGMA_Z_MAKE( (i + j/nf)/nf, 0. );
            }
            // lower
            for( int i = j; i < n; ++i ) {
                *hA(i,j) = MAGMA_Z_MAKE( i + j/nf, 0. );
            }
        }
        printf( "A%d = ", n );
        magma_zprint( n, n, hA, lda );

        // Whole-matrix symmetrize with MagmaLower.
        magma_zsetmatrix( n, n, hA, lda, dA, lda );
        magmablas_zsymmetrize( MagmaLower, n, dA, lda );
        magma_zgetmatrix( n, n, dA, lda, hR, lda );
        printf( "L%d = ", n );
        magma_zprint( n, n, hR, lda );

        // Whole-matrix symmetrize with MagmaUpper.
        magma_zsetmatrix( n, n, hA, lda, dA, lda );
        magmablas_zsymmetrize( MagmaUpper, n, dA, lda );
        magma_zgetmatrix( n, n, dA, lda, hR, lda );
        printf( "U%d = ", n );
        magma_zprint( n, n, hR, lda );

        // -----
        //lapackf77_zlaset( "u", &n, &n, &c_zero, &c_one, hA, &lda );

        // Tiled variants. NOTE(review): the last two arguments of
        // magmablas_zsymmetrize_tiles look like the row and column offsets
        // between consecutive tiles - confirm against its documentation.
        // nb = 64, ntile = number of whole nb-sized tiles that fit in n.
        nb = 64;
        ntile = n / nb;
        magma_zsetmatrix( n, n, hA, lda, dA, lda );
        magmablas_zsymmetrize_tiles( MagmaLower, nb, dA, lda, ntile, nb, nb );
        magma_zgetmatrix( n, n, dA, lda, hR, lda );
        printf( "L%d_%d = ", n, nb );
        magma_zprint( n, n, hR, lda );

        // nb = 32, same pattern.
        nb = 32;
        ntile = n / nb;
        magma_zsetmatrix( n, n, hA, lda, dA, lda );
        magmablas_zsymmetrize_tiles( MagmaLower, nb, dA, lda, ntile, nb, nb );
        magma_zgetmatrix( n, n, dA, lda, hR, lda );
        printf( "L%d_%d = ", n, nb );
        magma_zprint( n, n, hR, lda );

        // nb = 32 still, but with 2*nb passed as the first offset argument;
        // ntile is recomputed for that spacing (0 when n < nb).
        ntile = (n - nb < 0 ? 0 : (n - nb) / (2*nb) + 1);
        magma_zsetmatrix( n, n, hA, lda, dA, lda );
        magmablas_zsymmetrize_tiles( MagmaLower, nb, dA, lda, ntile, 2*nb, nb );
        magma_zgetmatrix( n, n, dA, lda, hR, lda );
        printf( "L%d_%d_2m = ", n, nb );
        magma_zprint( n, n, hR, lda );

        // nb = 25: a tile size that is not a multiple of 32.
        nb = 25;
        ntile = n / nb;
        magma_zsetmatrix( n, n, hA, lda, dA, lda );
        magmablas_zsymmetrize_tiles( MagmaLower, nb, dA, lda, ntile, nb, nb );
        magma_zgetmatrix( n, n, dA, lda, hR, lda );
        printf( "L%d_%d = ", n, nb );
        magma_zprint( n, n, hR, lda );

        // nb = 25 with 3*nb passed as the second offset argument.
        nb = 25;
        ntile = (n - nb < 0 ? 0 : (n - nb) / (3*nb) + 1);
        magma_zsetmatrix( n, n, hA, lda, dA, lda );
        magmablas_zsymmetrize_tiles( MagmaLower, nb, dA, lda, ntile, nb, 3*nb );
        magma_zgetmatrix( n, n, dA, lda, hR, lda );
        printf( "L%d_%d_3n = ", n, nb );
        magma_zprint( n, n, hR, lda );

        // nb = 100: the whole tiles are handled by the tiled routine, and the
        // remaining (n % nb)-sized diagonal block, which starts at entry
        // (ntile*nb, ntile*nb), by the plain routine.
        nb = 100;
        ntile = n / nb;
        magma_zsetmatrix( n, n, hA, lda, dA, lda );
        magmablas_zsymmetrize_tiles( MagmaLower, nb, dA, lda, ntile, nb, nb );
        magmablas_zsymmetrize( MagmaLower, n%nb, &dA[ ntile*nb*(1+lda) ], lda );  // last partial block
        magma_zgetmatrix( n, n, dA, lda, hR, lda );
        printf( "L%d_%d = ", n, nb );
        magma_zprint( n, n, hR, lda );

        // -----
        // Tiled variant with MagmaUpper, nb = 64.
        nb = 64;
        ntile = n / nb;
        magma_zsetmatrix( n, n, hA, lda, dA, lda );
        magmablas_zsymmetrize_tiles( MagmaUpper, nb, dA, lda, ntile, nb, nb );
        magma_zgetmatrix( n, n, dA, lda, hR, lda );
        printf( "U%d_%d = ", n, nb );
        magma_zprint( n, n, hR, lda );
    }

    TESTING_FREE( hA );
    TESTING_FREE( hR );
    TESTING_DEVFREE( dA );

    /* Shutdown */
    TESTING_CUDA_FINALIZE();
    return 0;
}
/* //////////////////////////////////////////////////////////////////////////// -- Testing znan_inf */ int main( int argc, char** argv) { TESTING_INIT(); #define hA(i,j) (hA + (i) + (j)*lda) magmaFloatComplex *hA, *dA; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t M, N, lda, ldda, size; magma_int_t *ii, *jj; magma_int_t i, j, cnt, tmp; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); magma_uplo_t uplo[] = { MagmaLower, MagmaUpper, MagmaFull }; printf("uplo M N CPU nan + inf GPU nan + inf actual nan + inf \n"); printf("===============================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iuplo = 0; iuplo < 3; ++iuplo ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; lda = M; ldda = ((M + 31)/32)*32; size = lda*N; /* Allocate memory for the matrix */ TESTING_MALLOC_CPU( hA, magmaFloatComplex, lda *N ); TESTING_MALLOC_DEV( dA, magmaFloatComplex, ldda*N ); /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &size, hA ); // up to half of matrix is NAN, and // up to half of matrix is INF. 
magma_int_t cnt_nan = (magma_int_t)( (rand() / ((float)RAND_MAX)) * 0.5 * M*N ); magma_int_t cnt_inf = (magma_int_t)( (rand() / ((float)RAND_MAX)) * 0.5 * M*N ); magma_int_t total = cnt_nan + cnt_inf; assert( cnt_nan >= 0 ); assert( cnt_inf >= 0 ); assert( total <= M*N ); // fill in indices TESTING_MALLOC_CPU( ii, magma_int_t, size ); TESTING_MALLOC_CPU( jj, magma_int_t, size ); for( cnt=0; cnt < size; ++cnt ) { ii[cnt] = cnt % M; jj[cnt] = cnt / M; } // shuffle indices for( cnt=0; cnt < total; ++cnt ) { i = int( rand() / ((float)RAND_MAX) * size ); tmp=ii[cnt]; ii[cnt]=ii[i]; ii[i]=tmp; tmp=jj[cnt]; jj[cnt]=jj[i]; jj[i]=tmp; } // fill in NAN and INF // for uplo, count NAN and INF in triangular portion of A int c_nan=0; int c_inf=0; for( cnt=0; cnt < cnt_nan; ++cnt ) { i = ii[cnt]; j = jj[cnt]; *hA(i,j) = MAGMA_C_NAN; if ( uplo[iuplo] == MagmaLower && i >= j ) { c_nan++; } if ( uplo[iuplo] == MagmaUpper && i <= j ) { c_nan++; } } for( cnt=cnt_nan; cnt < cnt_nan + cnt_inf; ++cnt ) { i = ii[cnt]; j = jj[cnt]; *hA(i,j) = MAGMA_C_INF; if ( uplo[iuplo] == MagmaLower && i >= j ) { c_inf++; } if ( uplo[iuplo] == MagmaUpper && i <= j ) { c_inf++; } } if ( uplo[iuplo] == MagmaLower || uplo[iuplo] == MagmaUpper ) { cnt_nan = c_nan; cnt_inf = c_inf; total = cnt_nan + cnt_inf; } //printf( "nan %g + %gi\n", MAGMA_C_REAL( MAGMA_C_NAN ), MAGMA_C_REAL( MAGMA_C_NAN ) ); //printf( "inf %g + %gi\n", MAGMA_C_REAL( MAGMA_C_INF ), MAGMA_C_REAL( MAGMA_C_INF ) ); //magma_cprint( M, N, hA, lda ); magma_csetmatrix( M, N, hA, lda, dA, ldda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_int_t c_cpu_nan=-1, c_cpu_inf=-1; magma_int_t c_gpu_nan=-1, c_gpu_inf=-1; magma_int_t c_cpu = magma_cnan_inf ( uplo[iuplo], M, N, hA, lda, &c_cpu_nan, &c_cpu_inf ); magma_int_t c_gpu = magma_cnan_inf_gpu( uplo[iuplo], M, N, dA, ldda, &c_gpu_nan, &c_gpu_inf ); magma_int_t 
c_cpu2 = magma_cnan_inf ( uplo[iuplo], M, N, hA, lda, NULL, NULL ); magma_int_t c_gpu2 = magma_cnan_inf_gpu( uplo[iuplo], M, N, dA, ldda, NULL, NULL ); /* ===================================================================== Check the result =================================================================== */ bool ok = ( c_cpu == c_gpu ) && ( c_cpu == c_cpu2 ) && ( c_gpu == c_gpu2 ) && ( c_cpu == c_cpu_nan + c_cpu_inf ) && ( c_gpu == c_gpu_nan + c_gpu_inf ) && ( c_cpu_nan == cnt_nan ) && ( c_cpu_inf == cnt_inf ) && ( c_gpu_nan == cnt_nan ) && ( c_gpu_inf == cnt_inf ); printf( "%4c %5d %5d %10d + %-10d %10d + %-10d %10d + %-10d %s\n", lapacke_uplo_const( uplo[iuplo] ), (int) M, (int) N, (int) c_cpu_nan, (int) c_cpu_inf, (int) c_gpu_nan, (int) c_gpu_inf, (int) cnt_nan, (int) cnt_inf, (ok ? "ok" : "failed")); status += ! ok; TESTING_FREE_CPU( hA ); TESTING_FREE_DEV( dA ); TESTING_FREE_CPU( ii ); TESTING_FREE_CPU( jj ); } } printf( "\n" ); } TESTING_FINALIZE(); return status; }
/**
    Purpose
    -------
    CGETRF_INCPIV computes an LU factorization of a general M-by-N tile A
    using partial pivoting with row interchanges.

    The factorization has the form

      A = P * L * U

    where P is a permutation matrix, L is lower triangular with unit
    diagonal elements (lower trapezoidal if m > n), and U is upper
    triangular (upper trapezoidal if m < n).

    This is the right-looking Level 2.5 BLAS version of the algorithm.

    Arguments
    ---------
    @param[in]
    order   magma_order_t
            MagmaRowMajor or MagmaColMajor. With MagmaColMajor the hybrid
            path brackets its work with magmablas_cgetmo_in / magmablas_cgetmo_out
            on dA; otherwise dA is used directly as the working array dAT.
            With MagmaRowMajor the CPU-only path uploads hA into dwork and
            writes its transpose into dA.

    @param[in]
    m       INTEGER
            The number of rows of the matrix A.  M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A.  N >= 0.

    @param[in]
    ib      INTEGER
            The inner-blocking size.  IB >= 0.
            NOTE(review): ib is used as a divisor (mindim / ib) and is not
            validated by the argument checks; callers presumably pass IB > 0.

    @param[in,out]
    hA      COMPLEX array, dimension(LDHA, N), on cpu.
            On entry, only the M-by-IB first panel needs to be identical to dA(1..M, 1..IB).
            On exit, the content is incomplete. Shouldn't be used.

    @param[in]
    ldha    INTEGER
            The leading dimension of the array hA.  LDHA >= max(1,M).

    @param[in,out]
    dA      COMPLEX array, dimension(LDDA, N), on gpu.
            On entry, the M-by-N tile to be factored.
            On exit, the factors L and U from the factorization
            A = P*L*U; the unit diagonal elements of L are not stored.

    @param[in]
    ldda    INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,M).

    @param[out]
    hL      COMPLEX array, dimension(LDHL, min(M,N)), on cpu.
            On exit, contains in the upper part the IB-by-K lower triangular tile,
            and in the lower part IB-by-min(M,N) the inverse of the top part.

    @param[in]
    ldhl    INTEGER
            The leading dimension of the array hL.  LDHL >= max(1,2*IB).

    @param[out]
    dL      COMPLEX array, dimension(LDDL, K), on gpu.
            On exit, contains in the upper part the IB-by-min(M,N) lower triangular tile,
            and in the lower part IB-by-min(M,N) the inverse of the top part.

    @param[in]
    lddl    INTEGER
            The leading dimension of the array dL.  LDDL >= max(1,2*IB).

    @param[out]
    ipiv    INTEGER array, dimension min(M,N), on the cpu.
            The pivot indices array.

    @param[out]
    dWORK   COMPLEX array, dimension(LDDWORK, 2*IB), on gpu.
            Workspace.

    @param[in]
    lddwork INTEGER
            The leading dimension of the array dWORK.  LDDWORK >= max(NB, 1).

    @param[out]
    info    INTEGER
            - PLASMA_SUCCESS successful exit
            - < 0 if INFO = -k, the k-th argument had an illegal value
            - > 0 if INFO = k, U(k,k) is exactly zero. The factorization
                has been completed, but the factor U is exactly
                singular, and division by zero will occur if it is used
                to solve a system of equations.
            NOTE(review): the negative codes set below (-1 for m, -2 for n,
            -4 for ldda) do not match the positions of those arguments in
            this signature, which starts with `order`. Unless WITHOUTTRTRI
            is defined, info is also overwritten by every CORE_ctrtri call,
            so a positive value from the LU step can be lost - confirm
            whether this is intended.

    @ingroup magma_cgesv_comp
    ********************************************************************/
extern "C" magma_int_t
magma_cgetrf_incpiv_gpu(
    magma_order_t order, magma_int_t m, magma_int_t n, magma_int_t ib,
    magmaFloatComplex *hA, magma_int_t ldha,
    magmaFloatComplex *dA, magma_int_t ldda,
    magmaFloatComplex *hL, magma_int_t ldhl,
    magmaFloatComplex *dL, magma_int_t lddl,
    magma_int_t *ipiv,
    magmaFloatComplex *dwork, magma_int_t lddwork,
    magma_int_t *info)
{
// Block addressing helpers; i and j are block indices in units of ib.
// AT addresses the working GPU array dAT, with ldda elements between rows.
#define AT(i,j) (dAT + (i)*ib*ldda + (j)*ib)
#define hA(i,j) (hA + (i)*ib + (j)*ib*ldha)
#define hL(j) (hL + (j)*ib*ldhl )
#define hL2(j) (hL2 + (j)*ib*ldhl )
#define dL(j) (dL + (j)*ib*lddl )
#define dL2(j) (dL2 + (j)*ib*lddl )

    magmaFloatComplex c_one     = MAGMA_C_ONE;
    magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE;

    magma_int_t iinfo;
    magma_int_t maxm, mindim;
    magma_int_t i, rows, cols, s, ii, sb;
    magmaFloatComplex *dAT;
#ifndef WITHOUTTRTRI
    // The inverted triangles are stored ib rows below the start of hL / dL.
    magmaFloatComplex *dL2 = dL + ib;
    magmaFloatComplex *hL2 = hL + ib;
#endif

    /* Check arguments */
    *info = 0;
    if (m < 0)
        *info = -1;
    else if (n < 0)
        *info = -2;
    else if (ldda < max(1,m))
        *info = -4;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0)
        return *info;

    /* Function Body */
    mindim = min(m, n);
    s      = mindim / ib;   // number of whole ib-wide panels

    if ( ib >= mindim ) {
        /* Use CPU code. */
        lapackf77_cgetrf(&m, &n, hA, &ldha, ipiv, info);

#ifndef WITHOUTTRTRI
        // Copy the factored block into hL2, invert its unit lower triangle
        // there, and send the result to the GPU.
        CORE_clacpy(PlasmaUpperLower, mindim, mindim,
                    (PLASMA_Complex32_t*)hA, ldha,
                    (PLASMA_Complex32_t*)hL2, ldhl );

        CORE_ctrtri( PlasmaLower, PlasmaUnit, mindim,
                     (PLASMA_Complex32_t*)hL2, ldhl, info );
        if (*info != 0 ) {
            fprintf(stderr, "ERROR, trtri returned with info = %d\n", *info);
        }

        magma_csetmatrix( mindim, mindim, hL2, ldhl, dL2, lddl );
#endif

        // Upload the factored tile; for row-major it goes through dwork and
        // is transposed into dA.
        if ( order == MagmaRowMajor ) {
            magma_csetmatrix( m, n, hA, ldha, dwork, lddwork );
            magmablas_ctranspose( m, n, dwork, lddwork, dA, ldda );
        } else {
            magma_csetmatrix( m, n, hA, ldha, dA, ldda );
        }
    }
    else {
        /* Use hybrid blocked code. */
        maxm = ((m + 31)/32)*32;   // m rounded up to a multiple of 32

        if ( order == MagmaColMajor ) {
            magmablas_cgetmo_in( dA, dAT, ldda, m, n );
        } else {
            dAT = dA;
        }

        for( i=0; i < s; i++ ) {
            ii = i * ib;                  // first row/column of panel i
            sb = min(ib, mindim-ii);      // width of panel i
            cols = maxm - ii;

            if ( i > 0 ) {
                // download i-th panel
                magmablas_ctranspose( sb, m, AT(0,i), ldda, dwork, maxm );
                magma_cgetmatrix( m, sb, dwork, maxm, hA(0, i), ldha );

                // make sure that gpu queue is empty
                //magma_device_sync();
                // Update the part to the right of panel i using panel i-1:
                // triangular multiply/solve on block row i-1, then a GEMM.
#ifndef WITHOUTTRTRI
                magma_ctrmm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit,
                             n - (ii+sb), ib,
                             c_one, dL2(i-1), lddl,
                             AT(i-1,i+1), ldda );
#else
                magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                             n - (ii+sb), ib,
                             c_one, AT(i-1,i-1), ldda,
                             AT(i-1,i+1), ldda );
#endif
                magma_cgemm( MagmaNoTrans, MagmaNoTrans,
                             n-(ii+sb), m-ii, ib,
                             c_neg_one, AT(i-1,i+1), ldda,
                             AT(i, i-1), ldda,
                             c_one, AT(i, i+1), ldda );
            }

            // do the cpu part
            rows = m - ii;
            lapackf77_cgetrf( &rows, &sb, hA(i, i), &ldha, ipiv+ii, &iinfo);
            // Keep the first positive info, shifted by the panel offset.
            if ( (*info == 0) && (iinfo > 0) )
                *info = iinfo + ii;

            // Shift this panel's pivots by the panel offset ii.
            {
                int j;
                int fin = ii + sb;
                for (j=ii; j < fin; j++) {
                    ipiv[j] = ii + ipiv[j];
                }
            }
            magmablas_claswp( n-ii, AT(0, i), ldda, ii+1, ii+sb, ipiv, 1 );

#ifndef WITHOUTTRTRI
            // Invert the unit lower triangle of the diagonal block into hL2(i)
            // and upload it to dL2(i).
            CORE_clacpy(PlasmaLower, sb, sb,
                        (PLASMA_Complex32_t*)hA(i, i), ldha,
                        (PLASMA_Complex32_t*)hL2(i), ldhl );

            CORE_ctrtri( PlasmaLower, PlasmaUnit, sb,
                         (PLASMA_Complex32_t*)hL2(i), ldhl, info );
            if (*info != 0 ) {
                fprintf(stderr, "ERROR, trtri returned with info = %d\n", *info);
            }
            magma_csetmatrix( sb, sb, hL2(i), ldhl, dL2(i), lddl );
#endif
            // upload i-th panel
            magma_csetmatrix( rows, sb, hA(i, i), ldha, dwork, cols );
            magmablas_ctranspose( rows, sb, dwork, cols, AT(i,i), ldda );

            // do the small non-parallel computations
            if ( s > (i+1) ) {
                // Not the last panel: update only the next sb-wide block.
#ifndef WITHOUTTRTRI
                magma_ctrmm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit,
                             sb, sb,
                             c_one, dL2(i), lddl,
                             AT(i, i+1), ldda);
#else
                magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                             sb, sb,
                             c_one, AT(i, i ), ldda,
                             AT(i, i+1), ldda);
#endif
                magma_cgemm( MagmaNoTrans, MagmaNoTrans,
                             sb, m-(ii+sb), sb,
                             c_neg_one, AT(i, i+1), ldda,
                             AT(i+1, i ), ldda,
                             c_one, AT(i+1, i+1), ldda );
            }
            else {
                /* Update of the last panel */
#ifndef WITHOUTTRTRI
                magma_ctrmm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit,
                             n-mindim, sb,
                             c_one, dL2(i), lddl,
                             AT(i, i+1), ldda);
#else
                magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                             n-mindim, sb,
                             c_one, AT(i, i ), ldda,
                             AT(i, i+1), ldda);
#endif
                /* m-(ii+sb) should be always 0 */
                magma_cgemm( MagmaNoTrans, MagmaNoTrans,
                             n-mindim, m-(ii+sb), sb,
                             c_neg_one, AT(i, i+1), ldda,
                             AT(i+1, i ), ldda,
                             c_one, AT(i+1, i+1), ldda );
            }
        }

        if ( order == MagmaColMajor ) {
            magmablas_cgetmo_out( dA, dAT, ldda, m, n );
        }
    }
    return *info;
}
int main( int argc, char** argv) { real_Double_t gflops, gpu_perf, cpu_perf, gpu_time, cpu_time; magmaDoubleComplex *hA, *hR; magmaDoubleComplex_ptr dA; magma_int_t N = 0, n2, lda, ldda; magma_int_t size[10] = { 1024, 2048, 3072, 4032, 5184, 6048, 7200, 8064, 8928, 10560 }; magma_int_t i, info; magmaDoubleComplex mz_one = MAGMA_Z_NEG_ONE; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; double work[1], matnorm, diffnorm; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) N = atoi(argv[++i]); } if (N>0) size[0] = size[9] = N; else exit(1); } else { printf("\nUsage: \n"); printf(" testing_zpotrf_gpu -N %d\n\n", 1024); } /* Initialize */ magma_queue_t queue; magma_device_t device; int num = 0; magma_err_t err; magma_init(); err = magma_get_devices( &device, 1, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } err = magma_queue_create( device, &queue ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } /* Allocate memory for the largest matrix */ N = size[9]; n2 = N * N; ldda = ((N+31)/32) * 32; TESTING_MALLOC( hA, magmaDoubleComplex, n2 ); TESTING_MALLOC_HOST( hR, magmaDoubleComplex, n2 ); TESTING_MALLOC_DEV( dA, magmaDoubleComplex, ldda*N ); printf("\n\n"); printf(" N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R_magma-R_lapack||_F / ||R_lapack||_F\n"); printf("========================================================================================\n"); for(i=0; i<10; i++){ N = size[i]; lda = N; n2 = lda*N; ldda = ((N+31)/32)*32; gflops = FLOPS( (double)N ) * 1e-9; /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &n2, hA ); /* Symmetrize and increase the diagonal */ for( int i = 0; i < N; ++i ) { MAGMA_Z_SET2REAL( hA(i,i), MAGMA_Z_REAL(hA(i,i)) + N ); for( int j = 0; j < i; ++j ) { hA(i, j) = MAGMA_Z_CNJG( hA(j,i) ); } } lapackf77_zlacpy( MagmaFullStr, &N, &N, hA, &lda, hR, &lda ); /* Warm up to measure the performance */ magma_zsetmatrix( 
N, N, hA, 0, lda, dA, 0, ldda, queue ); magma_zpotrf_gpu( MagmaUpper, N, dA, 0, ldda, &info, queue ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_zsetmatrix( N, N, hA, 0, lda, dA, 0, ldda, queue ); gpu_time = get_time(); magma_zpotrf_gpu( MagmaUpper, N, dA, 0, ldda, &info, queue ); gpu_time = get_time() - gpu_time; if (info != 0) printf( "magma_zpotrf had error %d.\n", info ); gpu_perf = gflops / gpu_time; /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = get_time(); lapackf77_zpotrf( MagmaUpperStr, &N, hA, &lda, &info ); cpu_time = get_time() - cpu_time; if (info != 0) printf( "lapackf77_zpotrf had error %d.\n", info ); cpu_perf = gflops / cpu_time; /* ===================================================================== Check the result compared to LAPACK |R_magma - R_lapack| / |R_lapack| =================================================================== */ magma_zgetmatrix( N, N, dA, 0, ldda, hR, 0, lda, queue ); matnorm = lapackf77_zlange("f", &N, &N, hA, &lda, work); blasf77_zaxpy(&n2, &mz_one, hA, &ione, hR, &ione); diffnorm = lapackf77_zlange("f", &N, &N, hR, &lda, work); printf( "%5d %6.2f (%6.2f) %6.2f (%6.2f) %e\n", N, cpu_perf, cpu_time, gpu_perf, gpu_time, diffnorm / matnorm ); if (argc != 1) break; } /* clean up */ TESTING_FREE( hA ); TESTING_FREE_HOST( hR ); TESTING_FREE_DEV( dA ); magma_queue_destroy( queue ); magma_finalize(); }
/**
    Purpose
    -------
    ZSSSSM applies the LU factorization update from a complex
    matrix formed by a lower triangular IB-by-K tile L1 on top of a
    M2-by-K tile L2 to a second complex matrix formed by a M1-by-N1
    tile A1 on top of a M2-by-N2 tile A2 (N1 == N2).

    This is the right-looking Level 2.5 BLAS version of the algorithm.

    NOTE(review): the paragraph above names ZSSSSM, but the routine defined
    here is magma_ztstrf_gpu, whose CPU work is done by CORE_ztstrf. The
    purpose text looks copied from another routine - confirm and replace.

    Arguments
    ---------
    @param[in]
    order   magma_order_t
            MagmaRowMajor or MagmaColMajor. With MagmaColMajor the hybrid
            path brackets its work with magmablas_zgetmo_in / magmablas_zgetmo_out
            on dU and dA; otherwise dU and dA are used directly as the
            working arrays. With MagmaRowMajor the CPU-only path uploads hU
            and hA through dwork and writes their transposes into dU and dA.

    @param[in]
    m       INTEGER
            The number of rows of the matrix A.  M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A.  N >= 0.

    @param[in]
    ib      INTEGER
            The inner-blocking size.  IB >= 0.

    @param[in]
    NB      INTEGER
            The blocking size.  NB >= 0.

    @param[in,out]
    hU      COMPLEX_16 array, dimension(LDHU, N), on cpu.
            On entry, the NB-by-N upper triangular tile hU.
            On exit, the content is incomplete. Shouldn't be used.

    @param[in]
    ldhu    INTEGER
            The leading dimension of the array hU.  LDHU >= max(1,NB).

    @param[in,out]
    dU      COMPLEX_16 array, dimension(LDDU, N), on gpu.
            On entry, the NB-by-N upper triangular tile dU identical to hU.
            On exit, the new factor U from the factorization.

    @param[in]
    lddu    INTEGER
            The leading dimension of the array dU.  LDDU >= max(1,NB).

    @param[in,out]
    hA      COMPLEX_16 array, dimension(LDHA, N), on cpu.
            On entry, only the M-by-IB first panel needs to be identical to dA(1..M, 1..IB).
            On exit, the content is incomplete. Shouldn't be used.

    @param[in]
    ldha    INTEGER
            The leading dimension of the array hA.  LDHA >= max(1,M).

    @param[in,out]
    dA      COMPLEX_16 array, dimension(LDDA, N), on gpu.
            On entry, the M-by-N tile to be factored.
            On exit, the factor L from the factorization

    @param[in]
    ldda    INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,M).

    @param[out]
    hL      COMPLEX_16 array, dimension(LDHL, K), on cpu.
            On exit, contains in the upper part the IB-by-K lower triangular tile,
            and in the lower part IB-by-K the inverse of the top part.

    @param[in]
    ldhl    INTEGER
            The leading dimension of the array hL.  LDHL >= max(1,2*IB).

    @param[out]
    dL      COMPLEX_16 array, dimension(LDDL, K), on gpu.
            On exit, contains in the upper part the IB-by-K lower triangular tile,
            and in the lower part IB-by-K the inverse of the top part.

    @param[in]
    lddl    INTEGER
            The leading dimension of the array dL.  LDDL >= max(1,2*IB).

    @param[out]
    hWORK   COMPLEX_16 array, dimension(LDHWORK, 2*IB), on cpu.
            Workspace.

    @param[in]
    ldhwork INTEGER
            The leading dimension of the array hWORK.  LDHWORK >= max(NB, 1).

    @param[out]
    dWORK   COMPLEX_16 array, dimension(LDDWORK, 2*IB), on gpu.
            Workspace.

    @param[in]
    lddwork INTEGER
            The leading dimension of the array dWORK.  LDDWORK >= max(NB, 1).

    @param[out]
    ipiv    INTEGER array on the cpu.
            The pivot indices array of size K as returned by ZTSTRF

    @param[out]
    info    INTEGER
            - PLASMA_SUCCESS successful exit
            - < 0 if INFO = -k, the k-th argument had an illegal value
            - > 0 if INFO = k, U(k,k) is exactly zero. The factorization
                has been completed, but the factor U is exactly
                singular, and division by zero will occur if it is used
                to solve a system of equations.
            NOTE(review): unless WITHOUTTRTRI is defined, info is also
            overwritten by every CORE_ztrtri call - confirm whether a value
            set by CORE_ztstrf is meant to survive.

    @ingroup magma_zgesv_tile
    ********************************************************************/
extern "C" magma_int_t
magma_ztstrf_gpu(
    magma_order_t order, magma_int_t m, magma_int_t n,
    magma_int_t ib, magma_int_t nb,
    magmaDoubleComplex *hU, magma_int_t ldhu,
    magmaDoubleComplex_ptr dU, magma_int_t lddu,
    magmaDoubleComplex *hA, magma_int_t ldha,
    magmaDoubleComplex_ptr dA, magma_int_t ldda,
    magmaDoubleComplex *hL, magma_int_t ldhl,
    magmaDoubleComplex_ptr dL, magma_int_t lddl,
    magma_int_t *ipiv,
    magmaDoubleComplex *hwork, magma_int_t ldhwork,
    magmaDoubleComplex_ptr dwork, magma_int_t lddwork,
    magma_int_t *info)
{
// Block addressing helpers; i and j are block indices in units of ib.
// UT / AT address the working GPU arrays dUT / dAT.
#define UT(i,j) (dUT + (i)*ib*lddu + (j)*ib )
#define AT(i,j) (dAT + (i)*ib*ldda + (j)*ib )
#define L(i) (dL + (i)*ib*lddl )
#define L2(i) (dL2 + (i)*ib*lddl )
#define hU(i,j) (hU + (j)*ib*ldhu + (i)*ib )
#define hA(i,j) (hA + (j)*ib*ldha + (i)*ib )
#define hL(i) (hL + (i)*ib*ldhl )
#define hL2(i) (hL2 + (i)*ib*ldhl )

    magmaDoubleComplex c_one     = MAGMA_Z_ONE;
    magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;

    // NOTE(review): iinfo is initialized here and never written afterwards
    // (CORE_ztstrf receives `info`, not `&iinfo`), so the `iinfo > 0` test in
    // the panel loop can never be true as written.
    int iinfo = 0;
    // NOTE(review): maxm is assigned below but not read in this function.
    int maxm, mindim;
    int i, j, im, s, ip, ii, sb, p = 1;
    magmaDoubleComplex_ptr dAT, dUT;
    magmaDoubleComplex_ptr dAp, dUp;
#ifndef WITHOUTTRTRI
    // The inverted triangles are stored ib rows below the start of hL / dL,
    // so p*ib = 2*ib rows of L are transferred instead of ib.
    magmaDoubleComplex_ptr dL2 = dL + ib;
    magmaDoubleComplex *hL2 = hL + ib;
    p = 2;
#endif

    /* Check input arguments */
    *info = 0;
    if (m < 0) {
        *info = -1;
    }
    else if (n < 0) {
        *info = -2;
    }
    else if (ib < 0) {
        *info = -3;
    }
    else if ((lddu < max(1,m)) && (m > 0)) {
        *info = -6;
    }
    else if ((ldda < max(1,m)) && (m > 0)) {
        *info = -8;
    }
    else if ((lddl < max(1,ib)) && (ib > 0)) {
        *info = -10;
    }

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* quick return */
    if ((m == 0) || (n == 0) || (ib == 0))
        return *info;

    ip = 0;

    /* Function Body */
    mindim = min(m, n);
    s      = mindim / ib;   // number of whole ib-wide panels

    if ( ib >= mindim ) {
        /* Use CPU code. */
        CORE_ztstrf(m, n, ib, nb,
                    (PLASMA_Complex64_t*)hU, ldhu,
                    (PLASMA_Complex64_t*)hA, ldha,
                    (PLASMA_Complex64_t*)hL, ldhl,
                    ipiv,
                    (PLASMA_Complex64_t*)hwork, ldhwork,
                    info);

#ifndef WITHOUTTRTRI
        // Copy L into hL2 and invert its unit lower triangle there.
        CORE_zlacpy( PlasmaUpperLower, mindim, mindim,
                     (PLASMA_Complex64_t*)hL, ldhl,
                     (PLASMA_Complex64_t*)hL2, ldhl );
        CORE_ztrtri( PlasmaLower, PlasmaUnit, mindim,
                     (PLASMA_Complex64_t*)hL2, ldhl, info );
        if (*info != 0 ) {
            fprintf(stderr, "ERROR, trtri returned with info = %d\n", *info);
        }
#endif

        // Upload U and A; for row-major each goes through dwork and is
        // transposed into its destination.
        if ( order == MagmaRowMajor ) {
            magma_zsetmatrix( m, n, hU, ldhu, dwork, lddwork );
            magmablas_ztranspose( m, n, dwork, lddwork, dU, lddu );

            magma_zsetmatrix( m, n, hA, ldha, dwork, lddwork );
            magmablas_ztranspose( m, n, dwork, lddwork, dA, ldda );
        } else {
            magma_zsetmatrix( m, n, hU, ldhu, dU, lddu );
            magma_zsetmatrix( m, n, hA, ldha, dA, ldda );
        }
        magma_zsetmatrix( p*ib, n, hL, ldhl, dL, lddl );
    }
    else {
        /* Use hybrid blocked code. */
        maxm = magma_roundup( m, 32 );

        if ( order == MagmaColMajor ) {
            magmablas_zgetmo_in( dU, dUT, lddu, m, n );
            magmablas_zgetmo_in( dA, dAT, ldda, m, n );
        } else {
            dUT = dU; dAT = dA;
        }
        // dwork is split in two: the A panel first, the U panel after it.
        dAp = dwork;
        dUp = dAp + ib*lddwork;

        ip = 0;   // running index into ipiv across all panels
        for( i=0; i < s; i++ ) {
            ii = i * ib;                  // first column of panel i
            sb = min(mindim-ii, ib);      // width of panel i

            if ( i > 0 ) {
                // download i-th panel
                magmablas_ztranspose( sb, ii, UT(0,i), lddu, dUp, lddu );
                magmablas_ztranspose( sb, m, AT(0,i), ldda, dAp, ldda );

                magma_zgetmatrix( ii, sb, dUp, lddu, hU(0, i), ldhu );
                magma_zgetmatrix( m, sb, dAp, ldda, hA(0, i), ldha );

                // make sure that gpu queue is empty
                //magma_device_sync();

                // Update the part to the right of panel i using panel i-1:
                // triangular multiply/solve on block row i-1 of U, then a GEMM.
#ifndef WITHOUTTRTRI
                magma_ztrmm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit,
                             n-(ii+sb), ib,
                             c_one, L2(i-1), lddl,
                             UT(i-1, i+1), lddu);
#else
                magma_ztrsm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit,
                             n-(ii+sb), ib,
                             c_one, L(i-1), lddl,
                             UT(i-1, i+1), lddu);
#endif
                magma_zgemm( MagmaNoTrans, MagmaNoTrans,
                             n-(ii+sb), m, ib,
                             c_neg_one, UT(i-1, i+1), lddu,
                             AT(0, i-1), ldda,
                             c_one, AT(0, i+1), ldda );
            }

            // do the cpu part
            CORE_ztstrf(m, sb, ib, nb,
                        (PLASMA_Complex64_t*)hU(i, i), ldhu,
                        (PLASMA_Complex64_t*)hA(0, i), ldha,
                        (PLASMA_Complex64_t*)hL(i), ldhl,
                        ipiv+ii,
                        (PLASMA_Complex64_t*)hwork, ldhwork,
                        info);

            if ( (*info == 0) && (iinfo > 0) )
                *info = iinfo + ii;

            // Need to swap betw U and A
#ifndef NOSWAPBLK
            magmablas_zswapblk( MagmaRowMajor, n-(ii+sb),
                                UT(i, i+1), lddu,
                                AT(0, i+1), ldda,
                                1, sb, ipiv+ii, 1, nb );

            // Pivots equal to their own position within the panel are
            // shifted by the panel offset ii.
            for (j=0; j < ib; j++) {
                im = ipiv[ip]-1;
                if ( im == j ) {
                    ipiv[ip] += ii;
                }
                ip++;
            }
#else
            // Row-by-row alternative to zswapblk: a pivot that differs from
            // its own position selects row im-nb of A to swap with row j of U.
            for (j=0; j < ib; j++) {
                im = ipiv[ip]-1;
                if ( im != (j) ) {
                    im = im - nb;
                    assert( (im >= 0) && (im < m) );
                    magmablas_zswap( n-(ii+sb), UT(i, i+1)+j*lddu, 1, AT(0, i+1)+im*ldda, 1 );
                } else {
                    ipiv[ip] += ii;
                }
                ip++;
            }
#endif

#ifndef WITHOUTTRTRI
            // Copy L(i) into hL2(i) and invert its unit lower triangle there.
            CORE_zlacpy( PlasmaUpperLower, sb, sb,
                         (PLASMA_Complex64_t*)hL(i), ldhl,
                         (PLASMA_Complex64_t*)hL2(i), ldhl );
            CORE_ztrtri( PlasmaLower, PlasmaUnit, sb,
                         (PLASMA_Complex64_t*)hL2(i), ldhl, info );
            if (*info != 0 ) {
                fprintf(stderr, "ERROR, trtri returned with info = %d\n", *info);
            }
#endif
            // upload i-th panel
            magma_zsetmatrix( sb, sb, hU(i, i), ldhu, dUp, lddu );
            magma_zsetmatrix( m, sb, hA(0, i), ldha, dAp, ldda );
            magma_zsetmatrix( p*ib, sb, hL(i), ldhl, L(i), lddl );
            magmablas_ztranspose( sb, sb, dUp, lddu, UT(i,i), lddu );
            magmablas_ztranspose( m, sb, dAp, ldda, AT(0,i), ldda );

            // make sure that gpu queue is empty
            //magma_device_sync();

            // do the small non-parallel computations
            if ( s > (i+1) ) {
                // Not the last panel: update only the next sb-wide block.
#ifndef WITHOUTTRTRI
                magma_ztrmm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit,
                             sb, sb,
                             c_one, L2(i), lddl,
                             UT(i, i+1), lddu);
#else
                magma_ztrsm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit,
                             sb, sb,
                             c_one, L(i), lddl,
                             UT(i, i+1), lddu);
#endif
                magma_zgemm( MagmaNoTrans, MagmaNoTrans,
                             sb, m, sb,
                             c_neg_one, UT(i, i+1), lddu,
                             AT(0, i ), ldda,
                             c_one, AT(0, i+1), ldda );
            }
            else {
                // Last panel: update the remaining n-mindim columns.
#ifndef WITHOUTTRTRI
                magma_ztrmm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit,
                             n-mindim, sb,
                             c_one, L2(i), lddl,
                             UT(i, i+1), lddu);
#else
                magma_ztrsm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit,
                             n-mindim, sb,
                             c_one, L(i), lddl,
                             UT(i, i+1), lddu);
#endif
                magma_zgemm( MagmaNoTrans, MagmaNoTrans,
                             n-mindim, m, sb,
                             c_neg_one, UT(i, i+1), lddu,
                             AT(0, i ), ldda,
                             c_one, AT(0, i+1), ldda );
            }
        }

        if ( order == MagmaColMajor ) {
            magmablas_zgetmo_out( dU, dUT, lddu, m, n );
            magmablas_zgetmo_out( dA, dAT, ldda, m, n );
        }
    }
    return *info;
}
extern "C" magma_int_t magma_ctstrf_gpu( char storev, magma_int_t m, magma_int_t n, magma_int_t ib, magma_int_t nb, magmaFloatComplex *hU, magma_int_t ldhu, magmaFloatComplex *dU, magma_int_t lddu, magmaFloatComplex *hA, magma_int_t ldha, magmaFloatComplex *dA, magma_int_t ldda, magmaFloatComplex *hL, magma_int_t ldhl, magmaFloatComplex *dL, magma_int_t lddl, magma_int_t *ipiv, magmaFloatComplex *hwork, magma_int_t ldhwork, magmaFloatComplex *dwork, magma_int_t lddwork, magma_int_t *info) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= CSSSSM applies the LU factorization update from a complex matrix formed by a lower triangular IB-by-K tile L1 on top of a M2-by-K tile L2 to a second complex matrix formed by a M1-by-N1 tile A1 on top of a M2-by-N2 tile A2 (N1 == N2). This is the right-looking Level 2.5 BLAS version of the algorithm. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. IB (input) INTEGER The inner-blocking size. IB >= 0. NB (input) INTEGER The blocking size. NB >= 0. hU (input,output) COMPLEX array, dimension(LDHU, N), on cpu. On entry, the NB-by-N upper triangular tile hU. On exit, the content is incomplete. Shouldn't be used. LDHU (input) INTEGER The leading dimension of the array hU. LDHU >= max(1,NB). dU (input,output) COMPLEX array, dimension(LDDU, N), on gpu. On entry, the NB-by-N upper triangular tile dU identical to hU. On exit, the new factor U from the factorization. LDDU (input) INTEGER The leading dimension of the array dU. LDDU >= max(1,NB). hA (input,output) COMPLEX array, dimension(LDHA, N), on cpu. On entry, only the M-by-IB first panel needs to be identical to dA(1..M, 1..IB). On exit, the content is incomplete. Shouldn't be used. LDHA (input) INTEGER The leading dimension of the array hA. LDHA >= max(1,M). 
dA (input,output) COMPLEX array, dimension(LDDA, N) , on gpu. On entry, the M-by-N tile to be factored. On exit, the factor L from the factorization LDDA (input) INTEGER The leading dimension of the array dA. LDDA >= max(1,M). hL (output) COMPLEX array, dimension(LDHL, K), on vpu. On exit, contains in the upper part the IB-by-K lower triangular tile, and in the lower part IB-by-K the inverse of the top part. LDHL (input) INTEGER The leading dimension of the array hL. LDHL >= max(1,2*IB). dL (output) COMPLEX array, dimension(LDDL, K), on gpu. On exit, contains in the upper part the IB-by-K lower triangular tile, and in the lower part IB-by-K the inverse of the top part. LDDL (input) INTEGER The leading dimension of the array dL. LDDL >= max(1,2*IB). hWORK (output) COMPLEX array, dimension(LDHWORK, 2*IB), on cpu. Workspace. LDHWORK (input) INTEGER The leading dimension of the array hWORK. LDHWORK >= max(NB, 1). dWORK (output) COMPLEX array, dimension(LDDWORK, 2*IB), on gpu. Workspace. LDDWORK (input) INTEGER The leading dimension of the array dWORK. LDDWORK >= max(NB, 1). IPIV (output) INTEGER array on the cpu. The pivot indices array of size K as returned by CTSTRF INFO (output) INTEGER - PLASMA_SUCCESS successful exit - < 0 if INFO = -k, the k-th argument had an illegal value - > 0 if INFO = k, U(k,k) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. 
===================================================================== */ #define UT(i,j) (dUT + (i)*ib*lddu + (j)*ib ) #define AT(i,j) (dAT + (i)*ib*ldda + (j)*ib ) #define L(i) (dL + (i)*ib*lddl ) #define L2(i) (dL2 + (i)*ib*lddl ) #define hU(i,j) (hU + (j)*ib*ldhu + (i)*ib ) #define hA(i,j) (hA + (j)*ib*ldha + (i)*ib ) #define hL(i) (hL + (i)*ib*ldhl ) #define hL2(i) (hL2 + (i)*ib*ldhl ) magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; int iinfo = 0; int maxm, mindim; int i, j, im, s, ip, ii, sb, p = 1; magmaFloatComplex *dAT, *dUT; magmaFloatComplex *dAp, *dUp; #ifndef WITHOUTTRTRI magmaFloatComplex *dL2 = dL + ib; magmaFloatComplex *hL2 = hL + ib; p = 2; #endif /* Check input arguments */ *info = 0; if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (ib < 0) { *info = -3; } else if ((lddu < max(1,m)) && (m > 0)) { *info = -6; } else if ((ldda < max(1,m)) && (m > 0)) { *info = -8; } else if ((lddl < max(1,ib)) && (ib > 0)) { *info = -10; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* quick return */ if ((m == 0) || (n == 0) || (ib == 0)) return *info; ip = 0; /* Function Body */ mindim = min(m, n); s = mindim / ib; if ( ib >= mindim ) { /* Use CPU code. 
*/ CORE_ctstrf(m, n, ib, nb, (PLASMA_Complex32_t*)hU, ldhu, (PLASMA_Complex32_t*)hA, ldha, (PLASMA_Complex32_t*)hL, ldhl, ipiv, (PLASMA_Complex32_t*)hwork, ldhwork, info); #ifndef WITHOUTTRTRI CORE_clacpy( PlasmaUpperLower, mindim, mindim, (PLASMA_Complex32_t*)hL, ldhl, (PLASMA_Complex32_t*)hL2, ldhl ); CORE_ctrtri( PlasmaLower, PlasmaUnit, mindim, (PLASMA_Complex32_t*)hL2, ldhl, info ); if (*info != 0 ) { fprintf(stderr, "ERROR, trtri returned with info = %d\n", *info); } #endif if ( (storev == 'R') || (storev == 'r') ) { magma_csetmatrix( m, n, hU, ldhu, dwork, lddwork ); magmablas_ctranspose( dU, lddu, dwork, lddwork, m, n ); magma_csetmatrix( m, n, hA, ldha, dwork, lddwork ); magmablas_ctranspose( dA, ldda, dwork, lddwork, m, n ); } else { magma_csetmatrix( m, n, hU, ldhu, dU, lddu ); magma_csetmatrix( m, n, hA, ldha, dA, ldda ); } magma_csetmatrix( p*ib, n, hL, ldhl, dL, lddl ); } else { /* Use hybrid blocked code. */ maxm = ((m + 31)/32)*32; if ( (storev == 'C') || (storev == 'c') ) { magmablas_cgetmo_in( dU, dUT, lddu, m, n ); magmablas_cgetmo_in( dA, dAT, ldda, m, n ); } else { dUT = dU; dAT = dA; } dAp = dwork; dUp = dAp + ib*lddwork; ip = 0; for( i=0; i<s; i++ ) { ii = i * ib; sb = min(mindim-ii, ib); if ( i>0 ){ // download i-th panel magmablas_ctranspose( dUp, lddu, UT(0, i), lddu, sb, ii ); magmablas_ctranspose( dAp, ldda, AT(0, i), ldda, sb, m ); magma_cgetmatrix( ii, sb, dUp, lddu, hU(0, i), ldhu ); magma_cgetmatrix( m, sb, dAp, ldda, hA(0, i), ldha ); // make sure that gpu queue is empty //magma_device_sync(); #ifndef WITHOUTTRTRI magma_ctrmm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit, n-(ii+sb), ib, c_one, L2(i-1), lddl, UT(i-1, i+1), lddu); #else magma_ctrsm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit, n-(ii+sb), ib, c_one, L(i-1), lddl, UT(i-1, i+1), lddu); #endif magma_cgemm( MagmaNoTrans, MagmaNoTrans, n-(ii+sb), m, ib, c_neg_one, UT(i-1, i+1), lddu, AT(0, i-1), ldda, c_one, AT(0, i+1), ldda ); } // do the cpu part CORE_ctstrf(m, sb, ib, 
nb, (PLASMA_Complex32_t*)hU(i, i), ldhu, (PLASMA_Complex32_t*)hA(0, i), ldha, (PLASMA_Complex32_t*)hL(i), ldhl, ipiv+ii, (PLASMA_Complex32_t*)hwork, ldhwork, info); if ( (*info == 0) && (iinfo > 0) ) *info = iinfo + ii; // Need to swap betw U and A #ifndef NOSWAPBLK magmablas_cswapblk( 'R', n-(ii+sb), UT(i, i+1), lddu, AT(0, i+1), ldda, 1, sb, ipiv+ii, 1, nb ); for(j=0; j<ib; j++) { im = ipiv[ip]-1; if ( im == j ) { ipiv[ip] += ii; } ip++; } #else for(j=0; j<ib; j++) { im = ipiv[ip]-1; if ( im != (j) ) { im = im - nb; assert( (im>=0) && (im<m) ); magmablas_cswap( n-(ii+sb), UT(i, i+1)+j*lddu, 1, AT(0, i+1)+im*ldda, 1 ); } else { ipiv[ip] += ii; } ip++; } #endif #ifndef WITHOUTTRTRI CORE_clacpy( PlasmaUpperLower, sb, sb, (PLASMA_Complex32_t*)hL(i), ldhl, (PLASMA_Complex32_t*)hL2(i), ldhl ); CORE_ctrtri( PlasmaLower, PlasmaUnit, sb, (PLASMA_Complex32_t*)hL2(i), ldhl, info ); if (*info != 0 ) { fprintf(stderr, "ERROR, trtri returned with info = %d\n", *info); } #endif // upload i-th panel magma_csetmatrix( sb, sb, hU(i, i), ldhu, dUp, lddu ); magma_csetmatrix( m, sb, hA(0, i), ldha, dAp, ldda ); magma_csetmatrix( p*ib, sb, hL(i), ldhl, L(i), lddl ); magmablas_ctranspose( UT(i, i), lddu, dUp, lddu, sb, sb); magmablas_ctranspose( AT(0, i), ldda, dAp, ldda, m, sb); // make sure that gpu queue is empty //magma_device_sync(); // do the small non-parallel computations if ( s > (i+1) ) { #ifndef WITHOUTTRTRI magma_ctrmm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit, sb, sb, c_one, L2(i), lddl, UT(i, i+1), lddu); #else magma_ctrsm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit, sb, sb, c_one, L(i), lddl, UT(i, i+1), lddu); #endif magma_cgemm( MagmaNoTrans, MagmaNoTrans, sb, m, sb, c_neg_one, UT(i, i+1), lddu, AT(0, i ), ldda, c_one, AT(0, i+1), ldda ); } else { #ifndef WITHOUTTRTRI magma_ctrmm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit, n-mindim, sb, c_one, L2(i), lddl, UT(i, i+1), lddu); #else magma_ctrsm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit, n-mindim, sb, 
c_one, L(i), lddl, UT(i, i+1), lddu); #endif magma_cgemm( MagmaNoTrans, MagmaNoTrans, n-mindim, m, sb, c_neg_one, UT(i, i+1), lddu, AT(0, i ), ldda, c_one, AT(0, i+1), ldda ); } } if ( (storev == 'C') || (storev == 'c') ) { magmablas_cgetmo_out( dU, dUT, lddu, m, n ); magmablas_cgetmo_out( dA, dAT, ldda, m, n ); } } return *info; }
extern "C" magma_int_t magma_sgetrf_incpiv_gpu( char storev, magma_int_t m, magma_int_t n, magma_int_t ib, float *hA, magma_int_t ldha, float *dA, magma_int_t ldda, float *hL, magma_int_t ldhl, float *dL, magma_int_t lddl, magma_int_t *ipiv, float *dwork, magma_int_t lddwork, magma_int_t *info) { /* -- MAGMA (version 1.3.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver November 2012 Purpose ======= SGETRF_INCPIV computes an LU factorization of a general M-by-N tile A using partial pivoting with row interchanges. The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 2.5 BLAS version of the algorithm. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. IB (input) INTEGER The inner-blocking size. IB >= 0. hA (input,output) DOUBLE COMPLEX array, dimension(LDHA, N), on cpu. On entry, only the M-by-IB first panel needs to be identical to dA(1..M, 1..IB). On exit, the content is incomplete. Shouldn't be used. LDHA (input) INTEGER The leading dimension of the array hA. LDHA >= max(1,M). dA (input,output) DOUBLE COMPLEX array, dimension(LDDA, N) , on gpu. On entry, the M-by-N tile to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. LDDA (input) INTEGER The leading dimension of the array dA. LDDA >= max(1,M). hL (output) DOUBLE COMPLEX array, dimension(LDHL, min(M,N)), on vpu. On exit, contains in the upper part the IB-by-K lower triangular tile, and in the lower part IB-by-min(M,N) the inverse of the top part. LDHL (input) INTEGER The leading dimension of the array hL. LDHL >= max(1,2*IB). dL (output) DOUBLE COMPLEX array, dimension(LDDL, K), on gpu. 
On exit, contains in the upper part the IB-by-min(M,N) lower triangular tile, and in the lower part IB-by-min(M,N) the inverse of the top part. LDDL (input) INTEGER The leading dimension of the array dL. LDDL >= max(1,2*IB). IPIV (output) INTEGER array, dimension min(M,N), on the cpu. The pivot indices array. dWORK (output) DOUBLE COMPLEX array, dimension(LDDWORK, 2*IB), on gpu. Workspace. LDDWORK (input) INTEGER The leading dimension of the array dWORK. LDDWORK >= max(NB, 1). INFO (output) INTEGER - PLASMA_SUCCESS successful exit - < 0 if INFO = -k, the k-th argument had an illegal value - > 0 if INFO = k, U(k,k) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. ===================================================================== */ #define AT(i,j) (dAT + (i)*ib*ldda + (j)*ib) #define hA(i,j) (hA + (i)*ib + (j)*ib*ldha) #define hL(j) (hL + (j)*ib*ldhl ) #define hL2(j) (hL2 + (j)*ib*ldhl ) #define dL(j) (dL + (j)*ib*lddl ) #define dL2(j) (dL2 + (j)*ib*lddl ) float c_one = MAGMA_S_ONE; float c_neg_one = MAGMA_S_NEG_ONE; magma_int_t iinfo; magma_int_t maxm, mindim; magma_int_t i, rows, cols, s, ii, sb; float *dAT; #ifndef WITHOUTTRTRI float *dL2 = dL + ib; float *hL2 = hL + ib; #endif /* Check arguments */ *info = 0; if (m < 0) *info = -1; else if (n < 0) *info = -2; else if (ldda < max(1,m)) *info = -4; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; /* Function Body */ mindim = min(m, n); s = mindim / ib; if ( ib >= mindim ) { /* Use CPU code. 
*/ lapackf77_sgetrf(&m, &n, hA, &ldha, ipiv, info); #ifndef WITHOUTTRTRI CORE_slacpy(PlasmaUpperLower, mindim, mindim, (float*)hA, ldha, (float*)hL2, ldhl ); CORE_strtri( PlasmaLower, PlasmaUnit, mindim, (float*)hL2, ldhl, info ); if (*info != 0 ) { fprintf(stderr, "ERROR, trtri returned with info = %d\n", *info); } magma_ssetmatrix( mindim, mindim, hL2, ldhl, dL2, lddl ); #endif if ( (storev == 'R') || (storev == 'r') ) { magma_ssetmatrix( m, n, hA, ldha, dwork, lddwork ); magmablas_stranspose( dA, ldda, dwork, lddwork, m, n ); } else { magma_ssetmatrix( m, n, hA, ldha, dA, ldda ); } } else { /* Use hybrid blocked code. */ maxm = ((m + 31)/32)*32; if ( (storev == 'C') || (storev == 'c') ) { magmablas_sgetmo_in( dA, dAT, ldda, m, n ); } else { dAT = dA; } for( i=0; i<s; i++ ) { ii = i * ib; sb = min(ib, mindim-ii); cols = maxm - ii; if ( i>0 ){ // download i-th panel magmablas_stranspose( dwork, maxm, AT(0, i), ldda, sb, m ); magma_sgetmatrix( m, sb, dwork, maxm, hA(0, i), ldha ); // make sure that gpu queue is empty //magma_device_sync(); #ifndef WITHOUTTRTRI magma_strmm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit, n - (ii+sb), ib, c_one, dL2(i-1), lddl, AT(i-1,i+1), ldda ); #else magma_strsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n - (ii+sb), ib, c_one, AT(i-1,i-1), ldda, AT(i-1,i+1), ldda ); #endif magma_sgemm( MagmaNoTrans, MagmaNoTrans, n-(ii+sb), m-ii, ib, c_neg_one, AT(i-1,i+1), ldda, AT(i, i-1), ldda, c_one, AT(i, i+1), ldda ); } // do the cpu part rows = m - ii; lapackf77_sgetrf( &rows, &sb, hA(i, i), &ldha, ipiv+ii, &iinfo); if ( (*info == 0) && (iinfo > 0) ) *info = iinfo + ii; { int j; int fin = ii + sb; for(j=ii ; j <fin; j++) { ipiv[j] = ii + ipiv[j]; } } magmablas_slaswp( n-ii, AT(0, i), ldda, ii+1, ii+sb, ipiv, 1 ); #ifndef WITHOUTTRTRI CORE_slacpy(PlasmaLower, sb, sb, (float*)hA(i, i), ldha, (float*)hL2(i), ldhl ); CORE_strtri( PlasmaLower, PlasmaUnit, sb, (float*)hL2(i), ldhl, info ); if (*info != 0 ) { fprintf(stderr, "ERROR, trtri 
returned with info = %d\n", *info); } magma_ssetmatrix( sb, sb, hL2(i), ldhl, dL2(i), lddl ); #endif // upload i-th panel magma_ssetmatrix( rows, sb, hA(i, i), ldha, dwork, cols ); magmablas_stranspose( AT(i,i), ldda, dwork, cols, rows, sb); // do the small non-parallel computations if ( s > (i+1) ) { #ifndef WITHOUTTRTRI magma_strmm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit, sb, sb, c_one, dL2(i), lddl, AT(i, i+1), ldda); #else magma_strsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, sb, sb, c_one, AT(i, i ), ldda, AT(i, i+1), ldda); #endif magma_sgemm( MagmaNoTrans, MagmaNoTrans, sb, m-(ii+sb), sb, c_neg_one, AT(i, i+1), ldda, AT(i+1, i ), ldda, c_one, AT(i+1, i+1), ldda ); } else { /* Update of the last panel */ #ifndef WITHOUTTRTRI magma_strmm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit, n-mindim, sb, c_one, dL2(i), lddl, AT(i, i+1), ldda); #else magma_strsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n-mindim, sb, c_one, AT(i, i ), ldda, AT(i, i+1), ldda); #endif /* m-(ii+sb) should be always 0 */ magma_sgemm( MagmaNoTrans, MagmaNoTrans, n-mindim, m-(ii+sb), sb, c_neg_one, AT(i, i+1), ldda, AT(i+1, i ), ldda, c_one, AT(i+1, i+1), ldda ); } } if ( (storev == 'C') || (storev == 'c') ) { magmablas_sgetmo_out( dA, dAT, ldda, m, n ); } } return *info; }