/*
 * Turns the N-by-N matrix A into a symmetric, diagonally boosted matrix,
 * working in place.
 *
 * For every index d (in increasing order):
 *   - the diagonal entry A(d,d) has N added to it;
 *   - every entry A(d,c) with c < d is mirrored into A(c,d).
 *
 * Element access goes through the file-level A(i,j) macro; lda is the leading
 * dimension that macro is expected to use.
 */
void magma_smake_hpd( magma_int_t N, float* A, magma_int_t lda )
{
    for (magma_int_t d = 0; d < N; ++d) {
        /* shift the diagonal by the matrix order */
        A(d,d) = MAGMA_S_MAKE( MAGMA_S_REAL( A(d,d) ) + N, 0. );

        /* copy row d (left of the diagonal) into column d (above the diagonal) */
        for (magma_int_t c = 0; c < d; ++c) {
            A(c,d) = MAGMA_S_CNJG( A(d,c) );
        }
    }
}
/*
 * Fills h_A with N*lda random values from lapackf77_slarnv (distribution
 * selector 1, fixed seed {0,0,0,1}), then makes the leading N-by-N part
 * symmetric with a boosted diagonal:
 *   - h_A(r,r) has N added to it;
 *   - h_A(r,c) for c < r is overwritten with h_A(c,r).
 *
 * Element access goes through the file-level h_A(i,j) macro.
 */
void init_matrix( magma_int_t N, float *h_A, magma_int_t lda )
{
    magma_int_t dist = 1;
    magma_int_t seed[4] = {0,0,0,1};
    magma_int_t count = N*lda;

    lapackf77_slarnv( &dist, seed, &count, h_A );

    /* Symmetrize and increase the diagonal */
    for (magma_int_t r = 0; r < N; ++r) {
        h_A(r,r) = MAGMA_S_MAKE( MAGMA_S_REAL(h_A(r,r)) + N, 0 );
        for (magma_int_t c = 0; c < r; ++c) {
            h_A(r, c) = MAGMA_S_CNJG( h_A(c, r) );
        }
    }
}
/*
 * Bulge-chasing kernel operating on the block below rows/columns st..ed of
 * the band matrix A.
 *
 * Steps, for the block of rows ed+1 .. min(ed+nb, n):
 *   1. If the block has at least one row, apply from the right the reflector
 *      located by magma_bulge_findVTAUpos(..., sweep-1, st-1, ...).
 *   2. If the block has more than one row, build a new reflector from its
 *      first column (stored at the position returned for row index ed),
 *      zero that column below its first entry, and apply the new reflector
 *      from the left to the remaining columns st+1..ed.
 *
 * work is the workspace handed to lapackf77_slarfx (dimension NB according to
 * the original note). A(), V() and TAU() are file-level accessor macros.
 */
extern "C" void magma_strdtype2cbHLsym_withQ_v2(
    magma_int_t n, magma_int_t nb,
    float *A, magma_int_t lda,
    float *V, magma_int_t ldv,
    float *TAU,
    magma_int_t st, magma_int_t ed,
    magma_int_t sweep, magma_int_t Vblksiz,
    float *work)
{
    magma_int_t unit_inc = 1;
    magma_int_t v_idx, tau_idx;
    magma_int_t band_ld = lda-1;
    magma_int_t ncols   = ed - st + 1;
    magma_int_t nrows   = min(ed+nb, n) - ed;

    /* nothing lies below the current block */
    if (nrows <= 0) {
        return;
    }

    /* apply remaining right coming from the top block */
    magma_bulge_findVTAUpos(n, nb, Vblksiz, sweep-1, st-1, ldv, &v_idx, &tau_idx);
    lapackf77_slarfx("R", &nrows, &ncols, V(v_idx), TAU(tau_idx), A(ed+1, st), &band_ld, work);

    /* a single row creates no bulge to remove */
    if (nrows == 1) {
        return;
    }

    magma_bulge_findVTAUpos(n, nb, Vblksiz, sweep-1, ed, ldv, &v_idx, &tau_idx);

    /* remove the first column of the created bulge:
       save its sub-diagonal part as the reflector tail, then clear it in A */
    magma_int_t tail_len = nrows-1;
    *V(v_idx) = MAGMA_S_ONE;
    blasf77_scopy( &tail_len, A(ed+2, st), &unit_inc, V(v_idx+1), &unit_inc );
    memset(A(ed+2, st), 0, (nrows-1)*sizeof(float));

    /* Eliminate the col at st */
    lapackf77_slarfg( &nrows, A(ed+1, st), V(v_idx+1), &unit_inc, TAU(tau_idx) );

    /* apply left on A(J1:J2,st+1:ed); one column fewer because column st
       is the one that has just been eliminated */
    ncols = ncols-1;
    float tau_conj = MAGMA_S_CNJG(*TAU(tau_idx));
    lapackf77_slarfx("L", &nrows, &ncols, V(v_idx), &tau_conj, A(ed+1, st+1), &band_ld, work);
}
/*
 * Bulge-chasing kernel operating on the block below rows/columns st..ed of
 * the band matrix A (variant that allocates its own workspace).
 *
 * Steps, for the block of rows J1 = ed+1 .. J2 = min(ed+NB, N):
 *   1. If the block has at least one row, apply from the right the reflector
 *      located by findVTpos(..., sweep-1, st-1, ...).
 *   2. If the block has more than one row, build a new reflector from its
 *      first column (stored at the position returned for row index J1-1),
 *      zero that column below its first entry, and apply the new reflector
 *      from the left to columns st+1..ed.
 *
 * A workspace of NB floats is allocated for lapackf77_slarfx and released
 * before returning. If the allocation fails the error is reported through
 * magma_xerbla and the function returns without touching A, V or TAU.
 * No workspace is allocated when the block is empty.
 *
 * A(), V() and TAU() are file-level accessor macros.
 */
extern "C" void magma_strdtype2cbHLsym_withQ(
    magma_int_t N, magma_int_t NB,
    float *A, magma_int_t LDA,
    float *V, float *TAU,
    magma_int_t st, magma_int_t ed,
    magma_int_t sweep, magma_int_t Vblksiz)
{
    magma_int_t J1, J2, len, lem, LDX;
    magma_int_t IONE = 1;
    magma_int_t blkid, vpos, taupos, tpos;
    float conjtmp;
    float Z_ONE = MAGMA_S_ONE;
    float *WORK = NULL;

    findVTpos(N, NB, Vblksiz, sweep-1, st-1, &vpos, &taupos, &tpos, &blkid);
    LDX = LDA-1;
    J1  = ed+1;
    J2  = min(ed+NB, N);
    len = ed-st+1;
    lem = J2-J1+1;

    /* nothing below the current block: no workspace is needed */
    if (lem <= 0) {
        return;
    }

    if (MAGMA_SUCCESS != magma_smalloc_cpu( &WORK, NB )) {
        /* void function: report the failure instead of passing a bad
           workspace pointer to LAPACK */
        magma_xerbla( __func__, -(MAGMA_ERR_HOST_ALLOC) );
        return;
    }

    /* apply remaining right coming from the top block */
    lapackf77_slarfx("R", &lem, &len, V(vpos), TAU(taupos), A(J1, st), &LDX, WORK);

    if (lem > 1) {
        findVTpos(N, NB, Vblksiz, sweep-1, J1-1, &vpos, &taupos, &tpos, &blkid);

        /* remove the first column of the created bulge:
           save its sub-diagonal part as the reflector tail, then clear it in A */
        *V(vpos) = Z_ONE;
        memcpy(V(vpos+1), A(J1+1, st), (lem-1)*sizeof(float));
        memset(A(J1+1, st), 0, (lem-1)*sizeof(float));

        /* Eliminate the col at st */
        lapackf77_slarfg( &lem, A(J1, st), V(vpos+1), &IONE, TAU(taupos) );

        /* apply left on A(J1:J2,st+1:ed); one column fewer because column st
           is the one that has just been removed */
        len = len-1;
        conjtmp = MAGMA_S_CNJG(*TAU(taupos));
        lapackf77_slarfx("L", &lem, &len, V(vpos), &conjtmp, A(J1, st+1), &LDX, WORK);
    }

    magma_free_cpu(WORK);
}
/*
 * Applies a reflector, given by the vector V and the scalar *TAU, to the
 * symmetric n-by-n matrix A from both sides. Only the lower triangle ("L") of
 * A is referenced by the BLAS calls.
 *
 * Sequence of operations:
 *   work = tau * A * V                        (ssymv)
 *   s    = -(work' * V) * 1/2 * tau
 *   work = work + s * V                       (saxpy)
 *   A    = A - work*V' - V*work'              (ssyr2)
 *
 * work is a caller-supplied workspace of dimension n.
 */
inline static void magma_slarfxsym_v2(magma_int_t n, float *A, magma_int_t lda, float *V, float *TAU, float *work)
{
    magma_int_t inc = 1;
    float zero    = MAGMA_S_ZERO;
    float minus1  = MAGMA_S_NEG_ONE;
    float half    = MAGMA_S_HALF;
    float scale;

    /* X = AVtau */
    blasf77_ssymv("L", &n, TAU, A, &lda, V, &inc, &zero, work, &inc);

    /* scale = X'*V */
#if defined(PRECISION_z) || defined(PRECISION_c)
    scale = zero;
    for (magma_int_t idx = 0; idx < n; idx++) {
        scale = scale + MAGMA_S_CNJG(work[idx]) * V[idx];
    }
#else
    scale = cblas_sdot(n, work, inc, V, inc);
#endif

    /* scale = -1/2 * X'*V * tau */
    scale = -scale * half * (*TAU);

    /* W = X - 1/2 V X'V tau, stored over X */
    blasf77_saxpy(&n, &scale, V, &inc, work, &inc);

    /* symmetric rank-2 update: A := A - W*V' - V*W' */
    blasf77_ssyr2("L", &n, &minus1, work, &inc, V, &inc, A, &lda);
}
/**
    Purpose
    -------
    SLAQPS computes a step of QR factorization with column pivoting
    of a real M-by-N matrix A by using Blas-3.  It tries to factorize
    NB columns from A starting from the row OFFSET+1, and updates all
    of the matrix with Blas-3 xGEMM.

    In some cases, due to catastrophic cancellations, it cannot
    factorize NB columns.  Hence, the actual number of factorized
    columns is returned in KB.

    Block A(1:OFFSET,1:N) is accordingly pivoted, but not factorized.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows of the matrix A. M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A. N >= 0

    @param[in]
    offset  INTEGER
            The number of rows of A that have been factorized in
            previous steps.

    @param[in]
    nb      INTEGER
            The number of columns to factorize.

    @param[out]
    kb      INTEGER
            The number of columns actually factorized.

    @param[in,out]
    A       REAL array, dimension (LDA,N)
            On entry, the M-by-N matrix A.
            On exit, block A(OFFSET+1:M,1:KB) is the triangular
            factor obtained and block A(1:OFFSET,1:N) has been
            accordingly pivoted, but not factorized.
            The rest of the matrix, block A(OFFSET+1:M,KB+1:N) has
            been updated.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A. LDA >= max(1,M).

    @param[in,out]
    dA      REAL array, indexed with leading dimension LDDA.
            Columns are copied between dA and A with
            magma_sgetmatrix_async / magma_ssetmatrix(_async), and dA is
            the operand of magma_sgemv / magma_sgemm / magma_snrm2.
            NOTE(review): presumably the GPU copy of A -- confirm against
            the caller.

    @param[in]
    ldda    INTEGER
            The leading dimension used to index dA.

    @param[in,out]
    jpvt    INTEGER array, dimension (N)
            JPVT(I) = K <==> Column K of the full matrix A has been
            permuted into position I in AP.

    @param[out]
    tau     REAL array, dimension (KB)
            The scalar factors of the elementary reflectors.

    @param[in,out]
    vn1     REAL array, dimension (N)
            The vector with the partial column norms.

    @param[in,out]
    vn2     REAL array, dimension (N)
            The vector with the exact column norms.

    @param[in,out]
    auxv    REAL array, dimension (NB)
            Auxiliary vector.

    @param[in,out]
    F       REAL array, dimension (LDF,NB)
            Matrix F' = L*Y'*A.

    @param[in]
    ldf     INTEGER
            The leading dimension of the array F. LDF >= max(1,N).

    @param[in,out]
    dF      REAL array, indexed with leading dimension LDDF.
            Receives the result of magma_sgemv and, before the final
            magma_sgemm, rows KB.. of F via magma_ssetmatrix.
            NOTE(review): presumably the GPU copy of F -- confirm against
            the caller.

    @param[in]
    lddf    INTEGER
            The leading dimension used to index dF.

    @return MAGMA_SUCCESS (the only value returned by this routine).

    @ingroup magma_sgeqp3_aux
    ********************************************************************/
extern "C" magma_int_t
magma_slaqps(magma_int_t m, magma_int_t n, magma_int_t offset,
             magma_int_t nb, magma_int_t *kb,
             float *A,  magma_int_t lda,
             float *dA, magma_int_t ldda,
             magma_int_t *jpvt, float *tau, float *vn1, float *vn2,
             float *auxv,
             float *F,  magma_int_t ldf,
             float *dF, magma_int_t lddf)
{
    /* 0-based element accessors; each yields a pointer to entry (i,j) */
#define  A(i, j) (A  + (i) + (j)*(lda ))
#define dA(i, j) (dA + (i) + (j)*(ldda))
#define  F(i, j) (F  + (i) + (j)*(ldf ))
#define dF(i, j) (dF + (i) + (j)*(lddf))

    float c_zero    = MAGMA_S_MAKE( 0.,0.);
    float c_one     = MAGMA_S_MAKE( 1.,0.);
    float c_neg_one = MAGMA_S_MAKE(-1.,0.);
    magma_int_t ione = 1;

    magma_int_t i__1, i__2;
    float d__1;
    float z__1;

    magma_int_t j, k, rk;
    float Akk;
    magma_int_t pvt;
    float temp, temp2, tol3z;
    magma_int_t itemp;

    /* lsticc is the head of a linked list of "difficult" columns whose norm
       must be recomputed; 0 means the list is empty (see the norm update) */
    magma_int_t lsticc;
    magma_int_t lastrk;

    lastrk = min( m, n + offset );
    tol3z = magma_ssqrt( lapackf77_slamch("Epsilon"));

    /* queue used for the asynchronous transfers below */
    magma_queue_t stream;
    magma_queue_create( &stream );

    lsticc = 0;
    k = 0;
    /* factorize one column per iteration; stop after nb columns or as soon
       as a column has been flagged for norm recomputation */
    while( k < nb && lsticc == 0 ) {
        rk = offset + k;

        /* Determine ith pivot column and swap if necessary */
        // subtract 1 from Fortran isamax; pvt, k are 0-based.
        i__1 = n-k;
        pvt = k + blasf77_isamax( &i__1, &vn1[k], &ione ) - 1;

        if (pvt != k) {
            if (pvt >= nb) {
                /* 1. Start copy from GPU */
                magma_sgetmatrix_async( m - offset - nb, 1,
                                        dA(offset + nb, pvt), ldda,
                                        A (offset + nb, pvt), lda, stream );
            }

            /* F gets swapped so F must be sent at the end to GPU */
            i__1 = k;
            blasf77_sswap( &i__1, F(pvt,0), &ldf, F(k,0), &ldf );
            itemp     = jpvt[pvt];
            jpvt[pvt] = jpvt[k];
            jpvt[k]   = itemp;
            vn1[pvt] = vn1[k];
            vn2[pvt] = vn2[k];

            if (pvt < nb) {
                /* no need of transfer if pivot is within the panel */
                blasf77_sswap( &m, A(0, pvt), &ione, A(0, k), &ione );
            }
            else {
                /* 1. Finish copy from GPU */
                magma_queue_sync( stream );

                /* 2. Swap as usual on CPU */
                blasf77_sswap(&m, A(0, pvt), &ione, A(0, k), &ione);

                /* 3. Restore the GPU */
                magma_ssetmatrix_async( m - offset - nb, 1,
                                        A (offset + nb, pvt), lda,
                                        dA(offset + nb, pvt), ldda, stream);
            }
        }

        /* Apply previous Householder reflectors to column K:
           A(RK:M,K) := A(RK:M,K) - A(RK:M,1:K-1)*F(K,1:K-1)'.
           Optimization: multiply with beta=0; wait for vector and subtract */
        if (k > 0) {
            /* row k of F is conjugated around the gemv only in the complex
               precisions; this branch is compiled out otherwise */
            #if defined(PRECISION_c) || defined(PRECISION_z)
            for (j = 0; j < k; ++j) {
                *F(k,j) = MAGMA_S_CNJG( *F(k,j) );
            }
            #endif

            i__1 = m - rk;
            i__2 = k;
            blasf77_sgemv( MagmaNoTransStr, &i__1, &i__2,
                           &c_neg_one, A(rk, 0), &lda,
                                       F(k,  0), &ldf,
                           &c_one,     A(rk, k), &ione );

            #if defined(PRECISION_c) || defined(PRECISION_z)
            for (j = 0; j < k; ++j) {
                *F(k,j) = MAGMA_S_CNJG( *F(k,j) );
            }
            #endif
        }

        /*  Generate elementary reflector H(k). */
        if (rk < m-1) {
            i__1 = m - rk;
            lapackf77_slarfg( &i__1, A(rk, k), A(rk + 1, k), &ione, &tau[k] );
        } else {
            lapackf77_slarfg( &ione, A(rk, k), A(rk, k), &ione, &tau[k] );
        }

        /* temporarily put 1 on the diagonal so column k holds the full
           reflector vector; Akk is restored at the end of the iteration */
        Akk = *A(rk, k);
        *A(rk, k) = c_one;

        /* Compute Kth column of F:
           Compute  F(K+1:N,K) := tau(K)*A(RK:M,K+1:N)'*A(RK:M,K) on the GPU */
        if (k < n-1) {
            i__1 = m - rk;
            i__2 = n - k - 1;

            /* Send the vector to the GPU */
            magma_ssetmatrix( i__1, 1, A(rk, k), lda, dA(rk,k), ldda );

            /* Multiply on GPU */
            // was CALL SGEMV( 'Conjugate transpose', M-RK+1, N-K,
            //                 TAU( K ), A( RK,  K+1 ), LDA,
            //                           A( RK,  K   ), 1,
            //                 CZERO,    F( K+1, K   ), 1 )
            /* i__3: columns left inside the panel after column k;
               i__4: columns beyond the panel;
               i__5: rows rk .. rk+i__5-1 handled on the CPU for those columns */
            magma_int_t i__3 = nb-k-1;
            magma_int_t i__4 = i__2 - i__3;
            magma_int_t i__5 = nb-k;
            /* GPU part: rows from rk+i__5 on, columns beyond the panel */
            magma_sgemv( MagmaConjTrans, i__1 - i__5, i__2 - i__3,
                         tau[k], dA(rk +i__5, k+1+i__3), ldda,
                                 dA(rk +i__5, k       ), ione,
                         c_zero, dF(k+1+i__3, k       ), ione );

            magma_sgetmatrix_async( i__2-i__3, 1,
                                    dF(k + 1 +i__3, k), i__2,
                                    F (k + 1 +i__3, k), i__2, stream );

            /* CPU part 1: columns inside the panel, all rows from rk */
            blasf77_sgemv( MagmaConjTransStr, &i__1, &i__3,
                           &tau[k], A(rk,  k+1), &lda,
                                    A(rk,  k  ), &ione,
                           &c_zero, F(k+1, k  ), &ione );

            /* wait for the GPU result before accumulating into it */
            magma_queue_sync( stream );
            /* CPU part 2: first i__5 rows of the columns beyond the panel,
               added (beta = 1) to the part received from the GPU */
            blasf77_sgemv( MagmaConjTransStr, &i__5, &i__4,
                           &tau[k], A(rk, k+1+i__3), &lda,
                                    A(rk, k       ), &ione,
                           &c_one,  F(k+1+i__3, k ), &ione );
        }

        /* Padding F(1:K,K) with zeros. */
        for (j = 0; j < k; ++j) {
            *F(j, k) = c_zero;
        }

        /* Incremental updating of F:
           F(1:N,K) := F(1:N,K) - tau(K)*F(1:N,1:K-1)*A(RK:M,1:K-1)'*A(RK:M,K). */
        if (k > 0) {
            i__1 = m - rk;
            i__2 = k;
            z__1 = MAGMA_S_NEGATE( tau[k] );
            blasf77_sgemv( MagmaConjTransStr, &i__1, &i__2,
                           &z__1,   A(rk, 0), &lda,
                                    A(rk, k), &ione,
                           &c_zero, auxv, &ione );

            i__1 = k;
            blasf77_sgemv( MagmaNoTransStr, &n, &i__1,
                           &c_one, F(0,0), &ldf,
                                   auxv,   &ione,
                           &c_one, F(0,k), &ione );
        }

        /* Optimization: On the last iteration start sending F back to the GPU */

        /* Update the current row of A:
           A(RK,K+1:N) := A(RK,K+1:N) - A(RK,1:K)*F(K+1:N,1:K)'.               */
        if (k < n-1) {
            i__1 = n - k - 1;
            i__2 = k + 1;
            blasf77_sgemm( MagmaNoTransStr, MagmaConjTransStr, &ione, &i__1, &i__2,
                           &c_neg_one, A(rk, 0  ), &lda,
                                       F(k+1,0  ), &ldf,
                           &c_one,     A(rk, k+1), &lda );
        }

        /* Update partial column norms. */
        if (rk < lastrk) {
            for (j = k + 1; j < n; ++j) {
                if (vn1[j] != 0.) {
                    /* NOTE: The following 4 lines follow from the analysis in
                       Lapack Working Note 176. */
                    temp = MAGMA_S_ABS( *A(rk,j) ) / vn1[j];
                    temp = max( 0., ((1. + temp) * (1. - temp)) );

                    d__1 = vn1[j] / vn2[j];
                    temp2 = temp * (d__1 * d__1);

                    if (temp2 <= tol3z) {
                        /* push column j on the recomputation list: vn2[j]
                           stores the previous list head, lsticc the new one */
                        vn2[j] = (float) lsticc;
                        lsticc = j;
                    } else {
                        vn1[j] *= magma_ssqrt(temp);
                    }
                }
            }
        }

        /* restore the diagonal entry saved before it was set to 1 */
        *A(rk, k) = Akk;

        ++k;
    }
    // leave k as the last column done
    --k;
    *kb = k + 1;
    rk = offset + *kb - 1;

    /* Apply the block reflector to the rest of the matrix:
       A(OFFSET+KB+1:M,KB+1:N) := A(OFFSET+KB+1:M,KB+1:N) -
                                  A(OFFSET+KB+1:M,1:KB)*F(KB+1:N,1:KB)'  */
    if (*kb < min(n, m - offset)) {
        i__1 = m - rk - 1;
        i__2 = n - *kb;

        /* Send F to the GPU */
        /* note: the copy and the gemm below both use i__2 (not lddf) as the
           leading dimension of the destination */
        magma_ssetmatrix( i__2, *kb,
                          F (*kb, 0), ldf,
                          dF(*kb, 0), i__2 );

        magma_sgemm( MagmaNoTrans, MagmaConjTrans, i__1, i__2, *kb,
                     c_neg_one, dA(rk+1, 0  ), ldda,
                                dF(*kb,  0  ), i__2,
                     c_one,     dA(rk+1, *kb), ldda );
    }

    /* Recomputation of difficult columns. */
    /* walk the list built in the norm update: vn2[lsticc] holds the next
       column index, rounded to the nearest integer */
    while( lsticc > 0 ) {
        itemp = (magma_int_t)(vn2[lsticc] >= 0. ? floor(vn2[lsticc] + .5) : -floor(.5 - vn2[lsticc]));
        i__1 = m - rk - 1;
        if (lsticc <= nb)
            vn1[lsticc] = magma_cblas_snrm2( i__1, A(rk+1,lsticc), ione );
        else {
            /* Where is the data, CPU or GPU ? */
            /* the norm is assembled from a part read from A (r1) and a part
               read from dA (r2) */
            float r1, r2;

            r1 = magma_cblas_snrm2( nb-k, A(rk+1,lsticc), ione );
            r2 = magma_snrm2(m-offset-nb, dA(offset + nb + 1, lsticc), ione);

            //vn1[lsticc] = magma_snrm2(i__1, dA(rk + 1, lsticc), ione);
            vn1[lsticc] = magma_ssqrt(r1*r1 + r2*r2);
        }

        /* NOTE: The computation of VN1( LSTICC ) relies on the fact that
           SNRM2 does not fail on vectors with norm below the value of SQRT(SLAMCH('S')) */
        vn2[lsticc] = vn1[lsticc];
        lsticc = itemp;
    }

    magma_queue_destroy( stream );

    return MAGMA_SUCCESS;
} /* magma_slaqps */
int main( int argc, char** argv) { real_Double_t gflops, gpu_perf, cpu_perf, gpu_time, cpu_time; float *h_A, *h_R; magmaFloat_ptr d_lA[MagmaMaxGPUs]; magma_int_t N = 0, n2, lda, ldda; magma_int_t size[10] = { 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000 }; magma_int_t i, j, k, info; float mz_one = MAGMA_S_NEG_ONE; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; float work[1], matnorm, diffnorm; magma_int_t num_gpus0 = 1, num_gpus, flag = 0; int nb, mb, n_local, nk; magma_uplo_t uplo = MagmaLower; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0){ N = atoi(argv[++i]); if (N>0) { size[0] = size[9] = N; flag = 1; }else exit(1); } if(strcmp("-NGPU", argv[i])==0) num_gpus0 = atoi(argv[++i]); if(strcmp("-UPLO", argv[i])==0){ if(strcmp("L", argv[++i])==0){ uplo = MagmaLower; }else{ uplo = MagmaUpper; } } } } else { printf("\nUsage: \n"); printf(" testing_spotrf_mgpu -N %d -NGPU %d -UPLO -L\n\n", 1024, num_gpus0); } /* looking for max. ldda */ ldda = 0; n2 = 0; for(i=0;i<10;i++){ N = size[i]; nb = magma_get_spotrf_nb(N); mb = nb; if(num_gpus0 > N/nb){ num_gpus = N/nb; if(N%nb != 0) num_gpus ++; }else{ num_gpus = num_gpus0; } n_local = nb*(1+N/(nb*num_gpus))*mb*((N+mb-1)/mb); if(n_local > ldda) ldda = n_local; if(n2 < N*N) n2 = N*N; if(flag != 0) break; } /* Allocate host memory for the matrix */ TESTING_MALLOC_PIN( h_A, float, n2 ); TESTING_MALLOC_PIN( h_R, float, n2 ); /* Initialize */ magma_queue_t queues[MagmaMaxGPUs * 2]; //magma_queue_t queues[MagmaMaxGPUs]; magma_device_t devices[ MagmaMaxGPUs ]; int num = 0; magma_err_t err; magma_init(); err = magma_get_devices( devices, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } for(i=0;i<num_gpus;i++){ err = magma_queue_create( devices[i], &queues[2*i] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } err = magma_queue_create( devices[i], &queues[2*i+1] ); if ( err != 
0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } } printf("each buffer size: %d\n", ldda); /* allocate local matrix on Buffers */ for(i=0; i<num_gpus0; i++){ TESTING_MALLOC_DEV( d_lA[i], float, ldda ); } printf("\n\n"); printf("Using GPUs: %d\n", num_gpus0); if(uplo == MagmaUpper){ printf("\n testing_spotrf_mgpu -N %d -NGPU %d -UPLO U\n\n", N, num_gpus0); }else{ printf("\n testing_spotrf_mgpu -N %d -NGPU %d -UPLO L\n\n", N, num_gpus0); } printf(" N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R_magma-R_lapack||_F / ||R_lapack||_F\n"); printf("========================================================================================\n"); for(i=0; i<10; i++){ N = size[i]; lda = N; n2 = lda*N; ldda = ((N+31)/32)*32; gflops = FLOPS( (float)N ) * 1e-9; /* Initialize the matrix */ lapackf77_slarnv( &ione, ISEED, &n2, h_A ); /* Symmetrize and increase the diagonal */ for( int i = 0; i < N; ++i ) { MAGMA_S_SET2REAL( h_A(i,i), MAGMA_S_REAL(h_A(i,i)) + N ); for( int j = 0; j < i; ++j ) { h_A(i, j) = MAGMA_S_CNJG( h_A(j,i) ); } } lapackf77_slacpy( MagmaFullStr, &N, &N, h_A, &lda, h_R, &lda ); /* Warm up to measure the performance */ nb = magma_get_spotrf_nb(N); if(num_gpus0 > N/nb){ num_gpus = N/nb; if(N%nb != 0) num_gpus ++; printf("too many GPUs for the matrix size, using %d GPUs\n", (int)num_gpus); }else{ num_gpus = num_gpus0; } /* distribute matrix to gpus */ if(uplo == MagmaUpper){ // Upper ldda = ((N+mb-1)/mb)*mb; for(j=0;j<N;j+=nb){ k = (j/nb)%num_gpus; nk = min(nb, N-j); magma_ssetmatrix(N, nk, &h_A[j*lda], 0, lda, d_lA[k], j/(nb*num_gpus)*nb*ldda, ldda, queues[2*k]); } }else{ // Lower ldda = (1+N/(nb*num_gpus))*nb; for(j=0;j<N;j+=nb){ k = (j/nb)%num_gpus; nk = min(nb, N-j); magma_ssetmatrix(nk, N, &h_A[j], 0, lda, d_lA[k], (j/(nb*num_gpus)*nb), ldda, queues[2*k]); } } magma_spotrf_mgpu( num_gpus, uplo, N, d_lA, 0, ldda, &info, queues ); /* ==================================================================== Performs operation using MAGMA 
=================================================================== */ /* distribute matrix to gpus */ if(uplo == MagmaUpper){ // Upper ldda = ((N+mb-1)/mb)*mb; for(j=0;j<N;j+=nb){ k = (j/nb)%num_gpus; nk = min(nb, N-j); magma_ssetmatrix(N, nk, &h_A[j*lda], 0, lda, d_lA[k], j/(nb*num_gpus)*nb*ldda, ldda, queues[2*k]); } }else{ // Lower ldda = (1+N/(nb*num_gpus))*nb; for(j=0;j<N;j+=nb){ k = (j/nb)%num_gpus; nk = min(nb, N-j); magma_ssetmatrix(nk, N, &h_A[j], 0, lda, d_lA[k], (j/(nb*num_gpus)*nb), ldda, queues[2*k]); } } gpu_time = magma_wtime(); magma_spotrf_mgpu( num_gpus, uplo, N, d_lA, 0, ldda, &info, queues ); gpu_time = magma_wtime() - gpu_time; if (info != 0) printf( "magma_spotrf had error %d.\n", info ); gpu_perf = gflops / gpu_time; /* gather matrix from gpus */ if(uplo==MagmaUpper){ // Upper for(j=0;j<N;j+=nb){ k = (j/nb)%num_gpus; nk = min(nb, N-j); magma_sgetmatrix(N, nk, d_lA[k], j/(nb*num_gpus)*nb*ldda, ldda, &h_R[j*lda], 0, lda, queues[2*k]); } }else{ // Lower for(j=0; j<N; j+=nb){ k = (j/nb)%num_gpus; nk = min(nb, N-j); magma_sgetmatrix( nk, N, d_lA[k], (j/(nb*num_gpus)*nb), ldda, &h_R[j], 0, lda, queues[2*k] ); } } /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); if(uplo == MagmaLower){ lapackf77_spotrf( MagmaLowerStr, &N, h_A, &lda, &info ); }else{ lapackf77_spotrf( MagmaUpperStr, &N, h_A, &lda, &info ); } cpu_time = magma_wtime() - cpu_time; if (info != 0) printf( "lapackf77_spotrf had error %d.\n", info ); cpu_perf = gflops / cpu_time; /* ===================================================================== Check the result compared to LAPACK |R_magma - R_lapack| / |R_lapack| =================================================================== */ matnorm = lapackf77_slange("f", &N, &N, h_A, &lda, work); blasf77_saxpy(&n2, &mz_one, h_A, &ione, h_R, &ione); diffnorm = lapackf77_slange("f", &N, &N, 
h_R, &lda, work); printf( "%5d %6.2f (%6.2f) %6.2f (%6.2f) %e\n", N, cpu_perf, cpu_time, gpu_perf, gpu_time, diffnorm / matnorm ); if (flag != 0) break; } /* clean up */ TESTING_FREE_PIN( h_A ); TESTING_FREE_PIN( h_R ); for(i=0;i<num_gpus;i++){ TESTING_FREE_DEV( d_lA[i] ); magma_queue_destroy( queues[2*i] ); magma_queue_destroy( queues[2*i+1] ); } magma_finalize(); }
void magmablas_ssyr2k_mgpu_spec( magma_uplo_t uplo, magma_trans_t trans, magma_int_t n, magma_int_t k, float alpha, magmaFloat_ptr dA[], magma_int_t ldda, magma_int_t a_offset, magmaFloat_ptr dB[], magma_int_t lddb, magma_int_t b_offset, float beta, magmaFloat_ptr dC[], magma_int_t lddc, magma_int_t c_offset, magma_int_t ngpu, magma_int_t nb, magma_queue_t queues[][20], magma_int_t nqueue ) { #define dA(dev, i, j) (dA[dev] + (i) + (j)*ldda + (a_offset) ) #define dB(dev, i, j) (dB[dev] + (i) + (j)*lddb + (b_offset) ) #define dC(dev, i, j) (dC[dev] + (i) + (j)*lddc) /* Check arguments */ magma_int_t info = 0; if ( uplo != MagmaLower ) { info = -1; // 'u' not yet handled } else if ( trans != MagmaNoTrans ) { info = -2; // 'c' not yet handled } else if ( n < 0 ) { info = -3; } else if ( k < 0 ) { info = -4; } else if ( ((trans == MagmaNoTrans) && ldda < max(1,n)) || ((trans == MagmaTrans) && ldda < max(1,k)) ) { info = -7; } else if ( a_offset < 0 || a_offset > ldda ) { info = -8; } else if ( ((trans == MagmaNoTrans) && lddb < max(1,n)) || ((trans == MagmaTrans) && lddb < max(1,k)) ) { info = -10; } else if ( b_offset < 0 || b_offset > lddb ) { info = -11; } else if ( lddc < max(1,n) ) { info = -13; } else if ( c_offset < 0 || c_offset > lddc ) { info = -14; } else if ( ngpu <= 0 ) { info = -15; } else if ( nb <= 0 ) { info = -16; } else if ( nqueue <= 0 ) { info = -18; } if ( info != 0 ) { magma_xerbla( __func__, -(info) ); return; } const float c_one = MAGMA_S_ONE; float cbeta = MAGMA_S_MAKE( beta, 0. 
); magma_int_t ib, ioff, iblock, idev, di, s; magma_device_t cdev; magma_queue_t cqueue; magma_getdevice( &cdev ); magmablasGetKernelStream( &cqueue ); // loop over all blocks // Faster to have two loops: first loop does C_hat = alpha*A*B**H + beta*C // blockoffset is offset within first block; for subsequent blocks it is 0 magma_int_t blockoffset = c_offset % nb; for( magma_int_t i = 0; i < n; i += ib ) { ib = min( nb-blockoffset, n-i ); // block size ioff = i + c_offset; // global index in parent matrix iblock = (ioff / nb) / ngpu; // local block id idev = (ioff / nb) % ngpu; // device with this block di = iblock*nb + blockoffset; // local index in parent matrix magma_setdevice( idev ); s = iblock % nqueue; magmablasSetKernelStream( queues[ idev ][ s ] ); // C[i:n,i] = alpha * A[i:n,0] * B[i,0]' + beta*C[i:n,i] //printf( "sgemm n=%4d, ib=%4d, k=%4d, i=%4d\n", n-i, ib, k, i ); magma_sgemm( MagmaNoTrans, MagmaTrans, n, ib, k, alpha, dA(idev,0,0), ldda, dB(idev,i,0), lddb, cbeta, dC(idev,c_offset,di), lddc ); blockoffset = 0; } // second loop does C = conj(alpha)*B*A**H + C_hat alpha = MAGMA_S_CNJG( alpha ); blockoffset = c_offset % nb; for( magma_int_t i = 0; i < n; i += ib ) { ib = min( nb-blockoffset, n-i ); // block size ioff = i + c_offset; // global index in parent matrix iblock = (ioff / nb) / ngpu; // local block id idev = (ioff / nb) % ngpu; // device with this block di = iblock*nb + blockoffset; // local index in parent matrix magma_setdevice( idev ); s = iblock % nqueue; magmablasSetKernelStream( queues[ idev ][ s ] ); // C[i:n,i] += conj(alpha) * B[i:n,0] * A[i,0]' //printf( "sgemm n=%4d, ib=%4d, k=%4d, i=%4d\n", n-i, ib, k, i ); magma_sgemm( MagmaNoTrans, MagmaTrans, n, ib, k, alpha, dB(idev,0,0), lddb, dA(idev,i,0), ldda, c_one, dC(idev,c_offset,di), lddc ); blockoffset = 0; } magma_setdevice( cdev ); magmablasSetKernelStream( cqueue ); }
/* //////////////////////////////////////////////////////////////////////////// -- Testing ssymmetrize Code is very similar to testing_stranspose.cpp */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gbytes, gpu_perf, gpu_time, cpu_perf, cpu_time; float error, work[1]; float c_neg_one = MAGMA_S_NEG_ONE; float *h_A, *h_R; float *d_A; magma_int_t N, size, lda, ldda; magma_int_t ione = 1; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); printf("uplo = %s\n", lapack_uplo_const(opts.uplo) ); printf(" N CPU GByte/s (ms) GPU GByte/s (ms) check\n"); printf("=====================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; ldda = ((N+31)/32)*32; size = lda*N; // load strictly lower triangle, save strictly upper triangle gbytes = sizeof(float) * 1.*N*(N-1) / 1e9; TESTING_MALLOC_CPU( h_A, float, size ); TESTING_MALLOC_CPU( h_R, float, size ); TESTING_MALLOC_DEV( d_A, float, ldda*N ); /* Initialize the matrix */ for( int j = 0; j < N; ++j ) { for( int i = 0; i < N; ++i ) { h_A[i + j*lda] = MAGMA_S_MAKE( i + j/10000., j ); } } /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_ssetmatrix( N, N, h_A, lda, d_A, ldda ); gpu_time = magma_sync_wtime( 0 ); //magmablas_ssymmetrize( opts.uplo, N-2, d_A+1+ldda, ldda ); // inset by 1 row & col magmablas_ssymmetrize( opts.uplo, N, d_A, ldda ); gpu_time = magma_sync_wtime( 0 ) - gpu_time; gpu_perf = gbytes / gpu_time; /* ===================================================================== Performs operation using naive in-place algorithm (LAPACK doesn't implement symmetrize) =================================================================== */ cpu_time = magma_wtime(); //for( int j = 1; j < N-1; ++j ) { // inset by 1 row & col // for( int i = 1; 
i < j; ++i ) { for( int j = 0; j < N; ++j ) { for( int i = 0; i < j; ++i ) { if ( opts.uplo == MagmaLower ) { h_A[i + j*lda] = MAGMA_S_CNJG( h_A[j + i*lda] ); } else { h_A[j + i*lda] = MAGMA_S_CNJG( h_A[i + j*lda] ); } } } cpu_time = magma_wtime() - cpu_time; cpu_perf = gbytes / cpu_time; /* ===================================================================== Check the result =================================================================== */ magma_sgetmatrix( N, N, d_A, ldda, h_R, lda ); blasf77_saxpy(&size, &c_neg_one, h_A, &ione, h_R, &ione); error = lapackf77_slange("f", &N, &N, h_R, &lda, work); printf("%5d %7.2f (%7.2f) %7.2f (%7.2f) %s\n", (int) N, cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000., (error == 0. ? "ok" : "failed") ); status += ! (error == 0.); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_R ); TESTING_FREE_DEV( d_A ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
extern "C" magma_int_t
magma_slaqps(magma_int_t m, magma_int_t n, magma_int_t offset,
             magma_int_t nb, magma_int_t *kb,
             float *A,  magma_int_t lda,
             float *dA, magma_int_t ldda,
             magma_int_t *jpvt, float *tau, float *vn1, float *vn2,
             float *auxv,
             float *F,  magma_int_t ldf,
             float *dF, magma_int_t lddf)
{
/*
    -- MAGMA (version 1.4.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       August 2013

    Purpose
    =======
    SLAQPS computes a step of QR factorization with column pivoting
    of a real M-by-N matrix A by using Blas-3.  It tries to factorize
    NB columns from A starting from the row OFFSET+1, and updates all
    of the matrix with Blas-3 xGEMM.

    In some cases, due to catastrophic cancellations, it cannot
    factorize NB columns.  Hence, the actual number of factorized
    columns is returned in KB.

    Block A(1:OFFSET,1:N) is accordingly pivoted, but not factorized.

    Arguments
    =========
    M       (input) INTEGER
            The number of rows of the matrix A. M >= 0.

    N       (input) INTEGER
            The number of columns of the matrix A. N >= 0

    OFFSET  (input) INTEGER
            The number of rows of A that have been factorized in
            previous steps.

    NB      (input) INTEGER
            The number of columns to factorize.

    KB      (output) INTEGER
            The number of columns actually factorized.

    A       (input/output) REAL array, dimension (LDA,N)
            On entry, the M-by-N matrix A.
            On exit, block A(OFFSET+1:M,1:KB) is the triangular
            factor obtained and block A(1:OFFSET,1:N) has been
            accordingly pivoted, but not factorized.
            The rest of the matrix, block A(OFFSET+1:M,KB+1:N) has
            been updated.

    LDA     (input) INTEGER
            The leading dimension of the array A. LDA >= max(1,M).

    DA      (input/output) REAL array, indexed with leading dimension LDDA.
            Columns are copied between DA and A with
            magma_sgetmatrix_async / magma_ssetmatrix(_async), and DA is
            the operand of magma_sgemv / magma_sgemm / magma_snrm2.
            NOTE(review): presumably the GPU copy of A -- confirm against
            the caller.

    LDDA    (input) INTEGER
            The leading dimension used to index DA.

    JPVT    (input/output) INTEGER array, dimension (N)
            JPVT(I) = K <==> Column K of the full matrix A has been
            permuted into position I in AP.

    TAU     (output) REAL array, dimension (KB)
            The scalar factors of the elementary reflectors.

    VN1     (input/output) REAL array, dimension (N)
            The vector with the partial column norms.

    VN2     (input/output) REAL array, dimension (N)
            The vector with the exact column norms.

    AUXV    (input/output) REAL array, dimension (NB)
            Auxiliary vector.

    F       (input/output) REAL array, dimension (LDF,NB)
            Matrix F' = L*Y'*A.

    LDF     (input) INTEGER
            The leading dimension of the array F. LDF >= max(1,N).

    DF      (input/output) REAL array, indexed with leading dimension LDDF.
            Receives the result of magma_sgemv and, before the final
            magma_sgemm, rows KB.. of F via magma_ssetmatrix.
            NOTE(review): presumably the GPU copy of F -- confirm against
            the caller.

    LDDF    (input) INTEGER
            The leading dimension used to index DF.

    Returns MAGMA_SUCCESS (the only value returned by this routine).

    =====================================================================    */

    /* 0-based element accessors; each yields a pointer to entry (i,j) */
#define  A(i, j) (A  + (i) + (j)*(lda ))
#define dA(i, j) (dA + (i) + (j)*(ldda))
#define  F(i, j) (F  + (i) + (j)*(ldf ))
#define dF(i, j) (dF + (i) + (j)*(lddf))

    float c_zero    = MAGMA_S_MAKE( 0.,0.);
    float c_one     = MAGMA_S_MAKE( 1.,0.);
    float c_neg_one = MAGMA_S_MAKE(-1.,0.);
    magma_int_t ione = 1;

    magma_int_t i__1, i__2;
    float d__1;
    float z__1;

    magma_int_t j, k, rk;
    float Akk;
    magma_int_t pvt;
    float temp, temp2, tol3z;
    magma_int_t itemp;

    /* lsticc is the head of a linked list of "difficult" columns whose norm
       must be recomputed; 0 means the list is empty (see the norm update) */
    magma_int_t lsticc;
    magma_int_t lastrk;

    lastrk = min( m, n + offset );
    tol3z = magma_ssqrt( lapackf77_slamch("Epsilon"));

    /* queue used for the asynchronous transfers below */
    magma_queue_t stream;
    magma_queue_create( &stream );

    lsticc = 0;
    k = 0;
    /* factorize one column per iteration; stop after nb columns or as soon
       as a column has been flagged for norm recomputation */
    while( k < nb && lsticc == 0 ) {
        rk = offset + k;

        /* Determine ith pivot column and swap if necessary */
        // Fortran: pvt, k, isamax are all 1-based; subtract 1 from k.
        // C:       pvt, k, isamax are all 0-based; don't subtract 1.
        pvt = k + cblas_isamax( n-k, &vn1[k], ione );

        if (pvt != k) {
            if (pvt >= nb) {
                /* 1. Start copy from GPU */
                magma_sgetmatrix_async( m - offset - nb, 1,
                                        dA(offset + nb, pvt), ldda,
                                        A (offset + nb, pvt), lda, stream );
            }

            /* F gets swapped so F must be sent at the end to GPU */
            i__1 = k;
            blasf77_sswap( &i__1, F(pvt,0), &ldf, F(k,0), &ldf );
            itemp     = jpvt[pvt];
            jpvt[pvt] = jpvt[k];
            jpvt[k]   = itemp;
            vn1[pvt] = vn1[k];
            vn2[pvt] = vn2[k];

            if (pvt < nb){
                /* no need of transfer if pivot is within the panel */
                blasf77_sswap( &m, A(0, pvt), &ione, A(0, k), &ione );
            }
            else {
                /* 1. Finish copy from GPU */
                magma_queue_sync( stream );

                /* 2. Swap as usual on CPU */
                blasf77_sswap(&m, A(0, pvt), &ione, A(0, k), &ione);

                /* 3. Restore the GPU */
                magma_ssetmatrix_async( m - offset - nb, 1,
                                        A (offset + nb, pvt), lda,
                                        dA(offset + nb, pvt), ldda, stream);
            }
        }

        /* Apply previous Householder reflectors to column K:
           A(RK:M,K) := A(RK:M,K) - A(RK:M,1:K-1)*F(K,1:K-1)'.
           Optimization: multiply with beta=0; wait for vector and subtract */
        if (k > 0) {
            /* row k of F is conjugated around the gemv only in the complex
               precisions; this branch is compiled out otherwise */
            #if defined(PRECISION_c) || defined(PRECISION_z)
            for (j = 0; j < k; ++j){
                *F(k,j) = MAGMA_S_CNJG( *F(k,j) );
            }
            #endif

            i__1 = m - rk;
            i__2 = k;
            blasf77_sgemv( MagmaNoTransStr, &i__1, &i__2,
                           &c_neg_one, A(rk, 0), &lda,
                                       F(k,  0), &ldf,
                           &c_one,     A(rk, k), &ione );

            #if defined(PRECISION_c) || defined(PRECISION_z)
            for (j = 0; j < k; ++j) {
                *F(k,j) = MAGMA_S_CNJG( *F(k,j) );
            }
            #endif
        }

        /*  Generate elementary reflector H(k). */
        if (rk < m-1) {
            i__1 = m - rk;
            lapackf77_slarfg( &i__1, A(rk, k), A(rk + 1, k), &ione, &tau[k] );
        } else {
            lapackf77_slarfg( &ione, A(rk, k), A(rk, k), &ione, &tau[k] );
        }

        /* temporarily put 1 on the diagonal so column k holds the full
           reflector vector; Akk is restored at the end of the iteration */
        Akk = *A(rk, k);
        *A(rk, k) = c_one;

        /* Compute Kth column of F:
           Compute  F(K+1:N,K) := tau(K)*A(RK:M,K+1:N)'*A(RK:M,K) on the GPU */
        if (k < n-1) {
            i__1 = m - rk;
            i__2 = n - k - 1;

            /* Send the vector to the GPU */
            magma_ssetmatrix( i__1, 1, A(rk, k), lda, dA(rk,k), ldda );

            /* Multiply on GPU */
            // was CALL SGEMV( 'Conjugate transpose', M-RK+1, N-K,
            //                 TAU( K ), A( RK,  K+1 ), LDA,
            //                           A( RK,  K   ), 1,
            //                 CZERO,    F( K+1, K   ), 1 )
            /* i__3: columns left inside the panel after column k;
               i__4: columns beyond the panel;
               i__5: rows rk .. rk+i__5-1 handled on the CPU for those columns */
            magma_int_t i__3 = nb-k-1;
            magma_int_t i__4 = i__2 - i__3;
            magma_int_t i__5 = nb-k;
            /* GPU part: rows from rk+i__5 on, columns beyond the panel */
            magma_sgemv( MagmaTrans, i__1 - i__5, i__2 - i__3,
                         tau[k], dA(rk +i__5, k+1+i__3), ldda,
                                 dA(rk +i__5, k       ), ione,
                         c_zero, dF(k+1+i__3, k       ), ione );

            magma_sgetmatrix_async( i__2-i__3, 1,
                                    dF(k + 1 +i__3, k), i__2,
                                    F (k + 1 +i__3, k), i__2, stream );

            /* CPU part 1: columns inside the panel, all rows from rk */
            blasf77_sgemv( MagmaTransStr, &i__1, &i__3,
                           &tau[k], A(rk,  k+1), &lda,
                                    A(rk,  k  ), &ione,
                           &c_zero, F(k+1, k  ), &ione );

            /* wait for the GPU result before accumulating into it */
            magma_queue_sync( stream );
            /* CPU part 2: first i__5 rows of the columns beyond the panel,
               added (beta = 1) to the part received from the GPU */
            blasf77_sgemv( MagmaTransStr, &i__5, &i__4,
                           &tau[k], A(rk, k+1+i__3), &lda,
                                    A(rk, k       ), &ione,
                           &c_one,  F(k+1+i__3, k ), &ione );
        }

        /* Padding F(1:K,K) with zeros. */
        for (j = 0; j < k; ++j) {
            *F(j, k) = c_zero;
        }

        /* Incremental updating of F:
           F(1:N,K) := F(1:N,K) - tau(K)*F(1:N,1:K-1)*A(RK:M,1:K-1)'*A(RK:M,K). */
        if (k > 0) {
            i__1 = m - rk;
            i__2 = k;
            z__1 = MAGMA_S_NEGATE( tau[k] );
            blasf77_sgemv( MagmaTransStr, &i__1, &i__2,
                           &z__1,   A(rk, 0), &lda,
                                    A(rk, k), &ione,
                           &c_zero, auxv, &ione );

            i__1 = k;
            blasf77_sgemv( MagmaNoTransStr, &n, &i__1,
                           &c_one, F(0,0), &ldf,
                                   auxv,   &ione,
                           &c_one, F(0,k), &ione );
        }

        /* Optimization: On the last iteration start sending F back to the GPU */

        /* Update the current row of A:
           A(RK,K+1:N) := A(RK,K+1:N) - A(RK,1:K)*F(K+1:N,1:K)'.               */
        if (k < n-1) {
            i__1 = n - k - 1;
            i__2 = k + 1;
            blasf77_sgemm( MagmaNoTransStr, MagmaTransStr, &ione, &i__1, &i__2,
                           &c_neg_one, A(rk, 0  ), &lda,
                                       F(k+1,0  ), &ldf,
                           &c_one,     A(rk, k+1), &lda );
        }

        /* Update partial column norms. */
        if (rk < lastrk) {
            for (j = k + 1; j < n; ++j) {
                if (vn1[j] != 0.) {
                    /* NOTE: The following 4 lines follow from the analysis in
                       Lapack Working Note 176. */
                    temp = MAGMA_S_ABS( *A(rk,j) ) / vn1[j];
                    temp = max( 0., ((1. + temp) * (1. - temp)) );

                    d__1 = vn1[j] / vn2[j];
                    temp2 = temp * (d__1 * d__1);

                    if (temp2 <= tol3z) {
                        /* push column j on the recomputation list: vn2[j]
                           stores the previous list head, lsticc the new one */
                        vn2[j] = (float) lsticc;
                        lsticc = j;
                    } else {
                        vn1[j] *= magma_ssqrt(temp);
                    }
                }
            }
        }

        /* restore the diagonal entry saved before it was set to 1 */
        *A(rk, k) = Akk;

        ++k;
    }
    // leave k as the last column done
    --k;
    *kb = k + 1;
    rk = offset + *kb - 1;

    /* Apply the block reflector to the rest of the matrix:
       A(OFFSET+KB+1:M,KB+1:N) := A(OFFSET+KB+1:M,KB+1:N) -
                                  A(OFFSET+KB+1:M,1:KB)*F(KB+1:N,1:KB)'  */
    if (*kb < min(n, m - offset)) {
        i__1 = m - rk - 1;
        i__2 = n - *kb;

        /* Send F to the GPU */
        /* note: the copy and the gemm below both use i__2 (not lddf) as the
           leading dimension of the destination */
        magma_ssetmatrix( i__2, *kb,
                          F (*kb, 0), ldf,
                          dF(*kb, 0), i__2 );

        magma_sgemm( MagmaNoTrans, MagmaTrans, i__1, i__2, *kb,
                     c_neg_one, dA(rk+1, 0  ), ldda,
                                dF(*kb,  0  ), i__2,
                     c_one,     dA(rk+1, *kb), ldda );
    }

    /* Recomputation of difficult columns. */
    /* walk the list built in the norm update: vn2[lsticc] holds the next
       column index, rounded to the nearest integer */
    while( lsticc > 0 ) {
        itemp = (magma_int_t)(vn2[lsticc] >= 0. ? floor(vn2[lsticc] + .5) : -floor(.5 - vn2[lsticc]));
        i__1 = m - rk - 1;
        if (lsticc <= nb)
            vn1[lsticc] = cblas_snrm2(i__1, A(rk + 1, lsticc), ione);
        else {
            /* Where is the data, CPU or GPU ? */
            /* the norm is assembled from a part read from A (r1) and a part
               read from dA (r2) */
            float r1, r2;

            r1 = cblas_snrm2(nb-k, A(rk + 1, lsticc), ione);
            r2 = magma_snrm2(m-offset-nb, dA(offset + nb + 1, lsticc), ione);

            //vn1[lsticc] = magma_snrm2(i__1, dA(rk + 1, lsticc), ione);
            vn1[lsticc] = magma_ssqrt(r1*r1+r2*r2);
        }

        /* NOTE: The computation of VN1( LSTICC ) relies on the fact that
           SNRM2 does not fail on vectors with norm below the value of SQRT(SLAMCH('S')) */
        vn2[lsticc] = vn1[lsticc];
        lsticc = itemp;
    }

    magma_queue_destroy( stream );

    return MAGMA_SUCCESS;
} /* magma_slaqps */
/* //////////////////////////////////////////////////////////////////////////// -- Testing ssymmetrize Code is very similar to testing_stranspose.cpp */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gbytes, gpu_perf, gpu_time, cpu_perf, cpu_time; float error, work[1]; float c_neg_one = MAGMA_S_NEG_ONE; float *h_A, *h_R; float *d_A; magma_int_t N, nb, size, lda, ldda, mstride, nstride, ntile; magma_int_t ione = 1; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); nb = (opts.nb == 0 ? 64 : opts.nb); mstride = 2*nb; nstride = 3*nb; printf("uplo = %s, nb = %d, mstride = %d, nstride = %d\n", lapack_uplo_const(opts.uplo), (int) nb, (int) mstride, (int) nstride ); printf(" N ntile CPU GByte/s (ms) GPU GByte/s (ms) check\n"); printf("===========================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; ldda = ((N+31)/32)*32; size = lda*N; if ( N < nb ) { ntile = 0; } else { ntile = min( (N - nb)/mstride + 1, (N - nb)/nstride + 1 ); } // load each tile, save each tile gbytes = sizeof(float) * 2.*nb*nb*ntile / 1e9; TESTING_MALLOC_CPU( h_A, float, size ); TESTING_MALLOC_CPU( h_R, float, size ); TESTING_MALLOC_DEV( d_A, float, ldda*N ); /* Initialize the matrix */ for( int j = 0; j < N; ++j ) { for( int i = 0; i < N; ++i ) { h_A[i + j*lda] = MAGMA_S_MAKE( i + j/10000., j ); } } /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_ssetmatrix( N, N, h_A, lda, d_A, ldda ); gpu_time = magma_sync_wtime( 0 ); magmablas_ssymmetrize_tiles( opts.uplo, nb, d_A, ldda, ntile, mstride, nstride ); gpu_time = magma_sync_wtime( 0 ) - gpu_time; gpu_perf = gbytes / gpu_time; /* ===================================================================== Performs operation using naive in-place algorithm 
(LAPACK doesn't implement symmetrize) =================================================================== */ cpu_time = magma_wtime(); for( int tile = 0; tile < ntile; ++tile ) { int offset = tile*mstride + tile*nstride*lda; for( int j = 0; j < nb; ++j ) { for( int i = 0; i < j; ++i ) { if ( opts.uplo == MagmaLower ) { h_A[offset + i + j*lda] = MAGMA_S_CNJG( h_A[offset + j + i*lda] ); } else { h_A[offset + j + i*lda] = MAGMA_S_CNJG( h_A[offset + i + j*lda] ); } } } } cpu_time = magma_wtime() - cpu_time; cpu_perf = gbytes / cpu_time; /* ===================================================================== Check the result =================================================================== */ magma_sgetmatrix( N, N, d_A, ldda, h_R, lda ); blasf77_saxpy(&size, &c_neg_one, h_A, &ione, h_R, &ione); error = lapackf77_slange("f", &N, &N, h_R, &lda, work); printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %s\n", (int) N, (int) ntile, cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000., (error == 0. ? "ok" : "failed") ); status += ! (error == 0.); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_R ); TESTING_FREE_DEV( d_A ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
int main( int argc, char** argv) { real_Double_t gflops, gpu_perf, cpu_perf, gpu_time, cpu_time; float *hA, *hR; magmaFloat_ptr dA; magma_int_t N = 0, n2, lda, ldda; magma_int_t size[10] = { 1024, 2048, 3072, 4032, 5184, 6048, 7200, 8064, 8928, 10560 }; magma_int_t i, info; float mz_one = MAGMA_S_NEG_ONE; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; float work[1], matnorm, diffnorm; if (argc != 1) { for(i = 1; i<argc; i++) { if (strcmp("-N", argv[i])==0) N = atoi(argv[++i]); } if (N>0) size[0] = size[9] = N; else exit(1); } else { printf("\nUsage: \n"); printf(" testing_spotrf_gpu -N %d\n\n", 1024); } /* Initialize */ magma_queue_t queue; magma_device_t device[ MagmaMaxGPUs ]; int num = 0; magma_err_t err; magma_init(); err = magma_get_devices( device, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } err = magma_queue_create( device[0], &queue ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } /* Allocate memory for the largest matrix */ N = size[9]; n2 = N * N; ldda = ((N+31)/32) * 32; TESTING_MALLOC_CPU( hA, float, n2 ); TESTING_MALLOC_PIN( hR, float, n2 ); TESTING_MALLOC_DEV( dA, float, ldda*N ); printf("\n\n"); printf(" N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R_magma-R_lapack||_F / ||R_lapack||_F\n"); printf("========================================================================================\n"); for(i=0; i<10; i++) { N = size[i]; lda = N; n2 = lda*N; ldda = ((N+31)/32)*32; gflops = FLOPS( (float)N ) * 1e-9; /* Initialize the matrix */ lapackf77_slarnv( &ione, ISEED, &n2, hA ); /* Symmetrize and increase the diagonal */ for( int i = 0; i < N; ++i ) { MAGMA_S_SET2REAL( hA(i,i), MAGMA_S_REAL(hA(i,i)) + N ); for( int j = 0; j < i; ++j ) { hA(i, j) = MAGMA_S_CNJG( hA(j,i) ); } } lapackf77_slacpy( MagmaFullStr, &N, &N, hA, &lda, hR, &lda ); /* Warm up to measure the performance */ magma_ssetmatrix( N, N, hA, 0, lda, dA, 0, ldda, queue ); 
magma_spotrf_gpu( MagmaUpper, N, dA, 0, ldda, &info, queue ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_ssetmatrix( N, N, hA, 0, lda, dA, 0, ldda, queue ); gpu_time = magma_wtime(); magma_spotrf_gpu( MagmaUpper, N, dA, 0, ldda, &info, queue ); gpu_time = magma_wtime() - gpu_time; if (info != 0) printf( "magma_spotrf had error %d.\n", info ); gpu_perf = gflops / gpu_time; /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_spotrf( MagmaUpperStr, &N, hA, &lda, &info ); cpu_time = magma_wtime() - cpu_time; if (info != 0) printf( "lapackf77_spotrf had error %d.\n", info ); cpu_perf = gflops / cpu_time; /* ===================================================================== Check the result compared to LAPACK |R_magma - R_lapack| / |R_lapack| =================================================================== */ magma_sgetmatrix( N, N, dA, 0, ldda, hR, 0, lda, queue ); matnorm = lapackf77_slange("f", &N, &N, hA, &lda, work); blasf77_saxpy(&n2, &mz_one, hA, &ione, hR, &ione); diffnorm = lapackf77_slange("f", &N, &N, hR, &lda, work); printf( "%5d %6.2f (%6.2f) %6.2f (%6.2f) %e\n", N, cpu_perf, cpu_time, gpu_perf, gpu_time, diffnorm / matnorm ); if (argc != 1) break; } /* clean up */ TESTING_FREE_CPU( hA ); TESTING_FREE_PIN( hR ); TESTING_FREE_DEV( dA ); magma_queue_destroy( queue ); magma_finalize(); }