void magma_dmake_hpd( magma_int_t N, double* A, magma_int_t lda ) { magma_int_t i, j; for( i=0; i<N; ++i ) { A(i,i) = MAGMA_D_MAKE( MAGMA_D_REAL( A(i,i) ) + N, 0. ); for( j=0; j<i; ++j ) { A(j,i) = MAGMA_D_CNJG( A(i,j) ); } } }
void init_matrix( magma_int_t N, double *h_A, magma_int_t lda ) { magma_int_t ione = 1, n2 = N*lda; magma_int_t ISEED[4] = {0,0,0,1}; lapackf77_dlarnv( &ione, ISEED, &n2, h_A ); /* Symmetrize and increase the diagonal */ for (magma_int_t i = 0; i < N; ++i) { h_A(i,i) = MAGMA_D_MAKE( MAGMA_D_REAL(h_A(i,i)) + N, 0 ); for (magma_int_t j = 0; j < i; ++j) h_A(i, j) = MAGMA_D_CNJG( h_A(j, i) ); } }
extern "C" void magma_dtrdtype2cbHLsym_withQ_v2(magma_int_t n, magma_int_t nb, double *A, magma_int_t lda, double *V, magma_int_t ldv, double *TAU, magma_int_t st, magma_int_t ed, magma_int_t sweep, magma_int_t Vblksiz, double *work) { /* WORK (workspace) double real array, dimension NB */ magma_int_t ione = 1; magma_int_t vpos, taupos; double conjtmp; double c_one = MAGMA_D_ONE; magma_int_t ldx = lda-1; magma_int_t len = ed - st + 1; magma_int_t lem = min(ed+nb, n) - ed; if (lem > 0) { magma_bulge_findVTAUpos(n, nb, Vblksiz, sweep-1, st-1, ldv, &vpos, &taupos); /* apply remaining right coming from the top block */ lapackf77_dlarfx("R", &lem, &len, V(vpos), TAU(taupos), A(ed+1, st), &ldx, work); } if (lem > 1) { magma_bulge_findVTAUpos(n, nb, Vblksiz, sweep-1, ed, ldv, &vpos, &taupos); /* remove the first column of the created bulge */ *V(vpos) = c_one; //memcpy(V(vpos+1), A(ed+2, st), (lem-1)*sizeof(double)); cblas_dcopy(lem-1, A(ed+2, st), ione, V(vpos+1), ione); memset(A(ed+2, st),0,(lem-1)*sizeof(double)); /* Eliminate the col at st */ lapackf77_dlarfg( &lem, A(ed+1, st), V(vpos+1), &ione, TAU(taupos) ); /* apply left on A(J1:J2,st+1:ed) */ len = len-1; /* because we start at col st+1 instead of st. col st is the col that has been removed; */ conjtmp = MAGMA_D_CNJG(*TAU(taupos)); lapackf77_dlarfx("L", &lem, &len, V(vpos), &conjtmp, A(ed+1, st+1), &ldx, work); } }
extern "C" void magma_dtrdtype2cbHLsym_withQ( magma_int_t N, magma_int_t NB, double *A, magma_int_t LDA, double *V, double *TAU, magma_int_t st, magma_int_t ed, magma_int_t sweep, magma_int_t Vblksiz) { magma_int_t J1, J2, len, lem, LDX; //magma_int_t i, j; magma_int_t IONE=1; magma_int_t blkid, vpos, taupos, tpos; double conjtmp; double Z_ONE = MAGMA_D_ONE; //double WORK[NB]; double *WORK; magma_dmalloc_cpu( &WORK, NB ); findVTpos(N,NB,Vblksiz,sweep-1,st-1, &vpos, &taupos, &tpos, &blkid); LDX = LDA-1; J1 = ed+1; J2 = min(ed+NB,N); len = ed-st+1; lem = J2-J1+1; if (lem > 0) { /* apply remaining right commming from the top block */ lapackf77_dlarfx("R", &lem, &len, V(vpos), TAU(taupos), A(J1, st), &LDX, WORK); } if (lem > 1) { findVTpos(N,NB,Vblksiz,sweep-1,J1-1, &vpos, &taupos, &tpos, &blkid); /* remove the first column of the created bulge */ *V(vpos) = Z_ONE; memcpy(V(vpos+1), A(J1+1, st), (lem-1)*sizeof(double)); memset(A(J1+1, st),0,(lem-1)*sizeof(double)); /* Eliminate the col at st */ lapackf77_dlarfg( &lem, A(J1, st), V(vpos+1), &IONE, TAU(taupos) ); /* apply left on A(J1:J2,st+1:ed) */ len = len-1; /* because we start at col st+1 instead of st. col st is the col that has been revomved; */ conjtmp = MAGMA_D_CNJG(*TAU(taupos)); lapackf77_dlarfx("L", &lem, &len, V(vpos), &conjtmp, A(J1, st+1), &LDX, WORK); } magma_free_cpu(WORK); }
inline static void magma_dlarfxsym_v2(magma_int_t n, double *A, magma_int_t lda, double *V, double *TAU, double *work) { /* WORK (workspace) double real array, dimension N */ magma_int_t ione = 1; double dtmp; double c_zero = MAGMA_D_ZERO; double c_neg_one= MAGMA_D_NEG_ONE; double c_half = MAGMA_D_HALF; /* X = AVtau */ blasf77_dsymv("L",&n, TAU, A, &lda, V, &ione, &c_zero, work, &ione); /* compute dtmp= X'*V */ #if defined(PRECISION_z) || defined(PRECISION_c) dtmp = c_zero; for (magma_int_t j = 0; j < n; j++) dtmp = dtmp + MAGMA_D_CNJG(work[j]) * V[j]; //cblas_ddot_sub(n, work, ione, V, ione, &dtmp); #else dtmp = cblas_ddot(n, work, ione, V, ione); #endif /* compute 1/2 X'*V*t = 1/2*dtmp*tau */ dtmp = -dtmp * c_half * (*TAU); /* compute W=X-1/2VX'Vt = X - dtmp*V */ blasf77_daxpy(&n, &dtmp, V, &ione, work, &ione); /* performs the symmetric rank 2 operation A := alpha*x*y' + alpha*y*x' + A */ blasf77_dsyr2("L", &n, &c_neg_one, work, &ione, V, &ione, A, &lda); }
extern "C" void magma_dlarfxsym(magma_int_t N, double *A, magma_int_t LDA, double *V, double *TAU) { magma_int_t IONE=1; double dtmp; double Z_ZERO = MAGMA_D_ZERO; //double Z_ONE = MAGMA_D_ONE; double Z_MONE = MAGMA_D_NEG_ONE; double Z_HALF = MAGMA_D_HALF; //double WORK[N]; double *WORK; magma_dmalloc_cpu( &WORK, N ); /* apply left and right on A(st:ed,st:ed)*/ //magma_dlarfxsym(len,A(st,st),LDX,V(st),TAU(st)); /* X = AVtau */ blasf77_dsymv("L",&N, TAU, A, &LDA, V, &IONE, &Z_ZERO, WORK, &IONE); /* je calcul dtmp= X'*V */ #if defined(PRECISION_z) || defined(PRECISION_c) dtmp = Z_ZERO; for (magma_int_t j = 0; j < N; j++) dtmp = dtmp + MAGMA_D_CNJG(WORK[j]) * V[j]; //cblas_ddot_sub(N, WORK, IONE, V, IONE, &dtmp); #else dtmp = cblas_ddot(N, WORK, IONE, V, IONE); #endif /* je calcul 1/2 X'*V*t = 1/2*dtmp*tau */ dtmp = -dtmp * Z_HALF * (*TAU); /* je calcul W=X-1/2VX'Vt = X - dtmp*V */ /* for (j = 0; j < N; j++) WORK[j] = WORK[j] + (dtmp*V[j]); */ blasf77_daxpy(&N, &dtmp, V, &IONE, WORK, &IONE); /* performs the symmetric rank 2 operation A := alpha*x*y' + alpha*y*x' + A */ blasf77_dsyr2("L",&N,&Z_MONE,WORK,&IONE,V,&IONE,A,&LDA); magma_free_cpu(WORK); }
void magmablas_dsyr2k_mgpu_spec( magma_uplo_t uplo, magma_trans_t trans, magma_int_t n, magma_int_t k, double alpha, double *dA[], magma_int_t lda, magma_int_t aoffset, double *dB[], magma_int_t ldb, magma_int_t boffset, double beta, double *dC[], magma_int_t ldc, magma_int_t coffset, magma_int_t ngpu, magma_int_t nb, magma_queue_t streams[][20], magma_int_t nstream ) { #define dA(dev, i, j) (dA[dev] + (i) + (j)*lda + (aoffset) ) #define dB(dev, i, j) (dB[dev] + (i) + (j)*ldb + (boffset) ) #define dC(dev, i, j) (dC[dev] + (i) + (j)*ldc) /* Check arguments */ magma_int_t info = 0; if ( uplo != MagmaLower ) { info = -1; // 'u' not yet handled } else if ( trans != MagmaNoTrans ) { info = -2; // 'c' not yet handled } else if ( n < 0 ) { info = -3; } else if ( k < 0 ) { info = -4; } else if ( ((trans == MagmaNoTrans) && lda < max(1,n)) || ((trans == MagmaTrans) && lda < max(1,k)) ) { info = -7; } else if ( aoffset < 0 || aoffset > lda ) { info = -8; } else if ( ((trans == MagmaNoTrans) && ldb < max(1,n)) || ((trans == MagmaTrans) && ldb < max(1,k)) ) { info = -10; } else if ( boffset < 0 || boffset > ldb ) { info = -11; } else if ( ldc < max(1,n) ) { info = -13; } else if ( coffset < 0 || coffset > ldc ) { info = -14; } else if ( ngpu <= 0 ) { info = -15; } else if ( nb <= 0 ) { info = -16; } else if ( nstream <= 0 ) { info = -18; } if ( info != 0 ) { magma_xerbla( __func__, -(info) ); return; } const double c_one = MAGMA_D_ONE; double cbeta = MAGMA_D_MAKE( beta, 0. ); magma_int_t ib, ioff, iblock, idev, di, s; magma_device_t cdev; magma_queue_t cqueue; magma_getdevice( &cdev ); magmablasGetKernelStream( &cqueue ); // loop over all blocks // Faster to have two loops: first loop does C_hat = alpha*A*B' + beta*C // blockoffset is offset within first block; for subsequent blocks it is 0 magma_int_t blockoffset = coffset % nb; for( magma_int_t i = 0; i < n; i += ib ) { ib = min( nb-blockoffset, n-i ); // block size ioff = i + coffset; // global index in parent matrix iblock = (ioff / nb) / ngpu; // local block id idev = (ioff / nb) % ngpu; // device with this block di = iblock*nb + blockoffset; // local index in parent matrix magma_setdevice( idev ); s = iblock % nstream; magmablasSetKernelStream( streams[ idev ][ s ] ); // C[i:n,i] = alpha * A[i:n,0] * B[i,0]' + beta*C[i:n,i] //printf( "dgemm n=%4d, ib=%4d, k=%4d, i=%4d\n", n-i, ib, k, i ); magma_dgemm( MagmaNoTrans, MagmaTrans, n, ib, k, alpha, dA(idev,0,0), lda, dB(idev,i,0), ldb, cbeta, dC(idev,coffset,di), ldc ); blockoffset = 0; } // second loop does C = (alpha)*B*A' + C_hat alpha = MAGMA_D_CNJG( alpha ); blockoffset = coffset % nb; for( magma_int_t i = 0; i < n; i += ib ) { ib = min( nb-blockoffset, n-i ); // block size ioff = i + coffset; // global index in parent matrix iblock = (ioff / nb) / ngpu; // local block id idev = (ioff / nb) % ngpu; // device with this block di = iblock*nb + blockoffset; // local index in parent matrix magma_setdevice( idev ); s = iblock % nstream; magmablasSetKernelStream( streams[ idev ][ s ] ); // C[i:n,i] += (alpha) * B[i:n,0] * A[i,0]' //printf( "dgemm n=%4d, ib=%4d, k=%4d, i=%4d\n", n-i, ib, k, i ); magma_dgemm( MagmaNoTrans, MagmaTrans, n, ib, k, alpha, dB(idev,0,0), ldb, dA(idev,i,0), lda, c_one, dC(idev,coffset,di), ldc ); blockoffset = 0; } magma_setdevice( cdev ); magmablasSetKernelStream( cqueue ); }
int main( int argc, char** argv) { real_Double_t gflops, gpu_perf, cpu_perf, gpu_time, cpu_time; double *hA, *hR; magmaDouble_ptr dA; magma_int_t N = 0, n2, lda, ldda; magma_int_t size[10] = { 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 8160, 8192 }; magma_int_t i, info; double mz_one = MAGMA_D_NEG_ONE; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; double work[1], matnorm, diffnorm; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) N = atoi(argv[++i]); } if (N>0) size[0] = size[9] = N; else exit(1); } else { printf("\nUsage: \n"); printf(" testing_dpotrf2_gpu -N %d\n\n", 1024); } /* Initialize */ magma_queue_t queue1, queue2; magma_device_t device; int num = 0; magma_err_t err; magma_init(); err = magma_get_devices( &device, 2, &num ); if ( err != 0 or num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } err = magma_queue_create( device, &queue1 ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } err = magma_queue_create( device, &queue2 ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } magma_queue_t queues[2] = {queue1, queue2}; /* Allocate memory for the largest matrix */ N = size[9]; n2 = N * N; ldda = ((N+31)/32) * 32; TESTING_MALLOC_CPU( hA, double, n2 ); TESTING_MALLOC_PIN( hR, double, n2 ); TESTING_MALLOC_DEV( dA, double, ldda*N ); printf("\n\n"); printf(" N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R_magma-R_lapack||_F / ||R_lapack||_F\n"); printf("========================================================================================\n"); for(i=0; i<10; i++){ N = size[i]; lda = N; n2 = lda*N; ldda = ((N+31)/32)*32; gflops = FLOPS( (double)N ) * 1e-9; /* Initialize the matrix */ lapackf77_dlarnv( &ione, ISEED, &n2, hA ); /* Symmetrize and increase the diagonal */ for( int i = 0; i < N; ++i ) { MAGMA_D_SET2REAL( hA(i,i), MAGMA_D_REAL(hA(i,i)) + N ); for( int j = 0; j < i; ++j ) { hA(i, j) = MAGMA_D_CNJG( hA(j,i) ); } } lapackf77_dlacpy( MagmaFullStr, &N, &N, hA, &lda, hR, &lda ); /* Warm up to measure the performance */ magma_dsetmatrix( N, N, hA, 0, lda, dA, 0, ldda, queue1); clFinish(queue1); magma_dpotrf2_gpu( MagmaLower, N, dA, 0, ldda, &info, queues ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_dsetmatrix( N, N, hA, 0, lda, dA, 0, ldda, queue1 ); clFinish(queue1); gpu_time = magma_wtime(); magma_dpotrf2_gpu( MagmaLower, N, dA, 0, ldda, &info, queues ); gpu_time = magma_wtime() - gpu_time; if (info != 0) printf( "magma_dpotrf2 had error %d.\n", info ); gpu_perf = gflops / gpu_time; /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_dpotrf( MagmaLowerStr, &N, hA, &lda, &info ); cpu_time = magma_wtime() - cpu_time; if (info != 0) printf( "lapackf77_dpotrf had error %d.\n", info ); cpu_perf = gflops / cpu_time; /* ===================================================================== Check the result compared to LAPACK |R_magma - R_lapack| / |R_lapack| =================================================================== */ magma_dgetmatrix( N, N, dA, 0, ldda, hR, 0, lda, queue1 ); matnorm = lapackf77_dlange("f", &N, &N, hA, &lda, work); blasf77_daxpy(&n2, &mz_one, hA, &ione, hR, &ione); diffnorm = lapackf77_dlange("f", &N, &N, hR, &lda, work); printf( "%5d %6.2f (%6.2f) %6.2f (%6.2f) %e\n", N, cpu_perf, cpu_time, gpu_perf, gpu_time, diffnorm / matnorm ); if (argc != 1) break; } /* clean up */ TESTING_FREE_CPU( hA ); TESTING_FREE_PIN( hR ); TESTING_FREE_DEV( dA ); magma_queue_destroy( queue1 ); magma_queue_destroy( queue2 ); magma_finalize(); }
/* //////////////////////////////////////////////////////////////////////////// -- Testing dsymmetrize Code is very similar to testing_dtranspose.cpp */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gbytes, gpu_perf, gpu_time, cpu_perf, cpu_time; double error, work[1]; double c_neg_one = MAGMA_D_NEG_ONE; double *h_A, *h_R; magmaDouble_ptr d_A; magma_int_t N, nb, size, lda, ldda, mstride, nstride, ntile; magma_int_t ione = 1; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); nb = (opts.nb == 0 ? 64 : opts.nb); mstride = 2*nb; nstride = 3*nb; printf("uplo = %s, nb = %d, mstride = %d, nstride = %d\n", lapack_uplo_const(opts.uplo), (int) nb, (int) mstride, (int) nstride ); printf(" N ntile CPU GByte/s (ms) GPU GByte/s (ms) check\n"); printf("===========================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; ldda = ((N+31)/32)*32; size = lda*N; if ( N < nb ) { ntile = 0; } else { ntile = min( (N - nb)/mstride + 1, (N - nb)/nstride + 1 ); } // load each tile, save each tile gbytes = sizeof(double) * 2.*nb*nb*ntile / 1e9; TESTING_MALLOC_CPU( h_A, double, size ); TESTING_MALLOC_CPU( h_R, double, size ); TESTING_MALLOC_DEV( d_A, double, ldda*N ); /* Initialize the matrix */ for( int j = 0; j < N; ++j ) { for( int i = 0; i < N; ++i ) { h_A[i + j*lda] = MAGMA_D_MAKE( i + j/10000., j ); } } /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_dsetmatrix( N, N, h_A, lda, d_A, ldda ); gpu_time = magma_sync_wtime( 0 ); magmablas_dsymmetrize_tiles( opts.uplo, nb, d_A, ldda, ntile, mstride, nstride ); gpu_time = magma_sync_wtime( 0 ) - gpu_time; gpu_perf = gbytes / gpu_time; /* ===================================================================== Performs operation using naive in-place algorithm (LAPACK doesn't implement symmetrize) =================================================================== */ cpu_time = magma_wtime(); for( int tile = 0; tile < ntile; ++tile ) { int offset = tile*mstride + tile*nstride*lda; for( int j = 0; j < nb; ++j ) { for( int i = 0; i < j; ++i ) { if ( opts.uplo == MagmaLower ) { h_A[offset + i + j*lda] = MAGMA_D_CNJG( h_A[offset + j + i*lda] ); } else { h_A[offset + j + i*lda] = MAGMA_D_CNJG( h_A[offset + i + j*lda] ); } } } } cpu_time = magma_wtime() - cpu_time; cpu_perf = gbytes / cpu_time; /* ===================================================================== Check the result =================================================================== */ magma_dgetmatrix( N, N, d_A, ldda, h_R, lda ); blasf77_daxpy(&size, &c_neg_one, h_A, &ione, h_R, &ione); error = lapackf77_dlange("f", &N, &N, h_R, &lda, work); printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %s\n", (int) N, (int) ntile, cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000., (error == 0. ? "ok" : "failed") ); status += ! (error == 0.); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_R ); TESTING_FREE_DEV( d_A ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
int main( int argc, char** argv) { real_Double_t gflops, gpu_perf, cpu_perf, gpu_time, cpu_time; double *h_A, *h_R; magmaDouble_ptr d_lA[MagmaMaxGPUs]; magma_int_t N = 0, n2, lda, ldda; magma_int_t size[10] = { 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000 }; magma_int_t i, j, k, info; double mz_one = MAGMA_D_NEG_ONE; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; double work[1], matnorm, diffnorm; magma_int_t num_gpus0 = 1, num_gpus, flag = 0; int nb, mb, n_local, nk; magma_uplo_t uplo = MagmaLower; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0){ N = atoi(argv[++i]); if (N>0) { size[0] = size[9] = N; flag = 1; }else exit(1); } if(strcmp("-NGPU", argv[i])==0) num_gpus0 = atoi(argv[++i]); if(strcmp("-UPLO", argv[i])==0){ if(strcmp("L", argv[++i])==0){ uplo = MagmaLower; }else{ uplo = MagmaUpper; } } } } else { printf("\nUsage: \n"); printf(" testing_dpotrf_mgpu -N %d -NGPU %d -UPLO -L\n\n", 1024, num_gpus0); } /* looking for max. ldda */ ldda = 0; n2 = 0; for(i=0;i<10;i++){ N = size[i]; nb = magma_get_dpotrf_nb(N); mb = nb; if(num_gpus0 > N/nb){ num_gpus = N/nb; if(N%nb != 0) num_gpus ++; }else{ num_gpus = num_gpus0; } n_local = nb*(1+N/(nb*num_gpus))*mb*((N+mb-1)/mb); if(n_local > ldda) ldda = n_local; if(n2 < N*N) n2 = N*N; if(flag != 0) break; } /* Allocate host memory for the matrix */ TESTING_MALLOC_PIN( h_A, double, n2 ); TESTING_MALLOC_PIN( h_R, double, n2 ); /* Initialize */ magma_queue_t queues[MagmaMaxGPUs * 2]; //magma_queue_t queues[MagmaMaxGPUs]; magma_device_t devices[ MagmaMaxGPUs ]; magma_int_t num = 0; magma_int_t err; magma_init(); err = magma_getdevices( devices, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_getdevices failed: %d\n", (int) err ); exit(-1); } for(i=0;i<num_gpus;i++){ err = magma_queue_create( devices[i], &queues[2*i] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", (int) err ); exit(-1); } err = magma_queue_create( devices[i], &queues[2*i+1] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", (int) err ); exit(-1); } } printf("each buffer size: %d\n", ldda); /* allocate local matrix on Buffers */ for(i=0; i<num_gpus0; i++){ TESTING_MALLOC_DEV( d_lA[i], double, ldda ); } printf("\n\n"); printf("Using GPUs: %d\n", num_gpus0); if(uplo == MagmaUpper){ printf("\n testing_dpotrf_mgpu -N %d -NGPU %d -UPLO U\n\n", N, num_gpus0); }else{ printf("\n testing_dpotrf_mgpu -N %d -NGPU %d -UPLO L\n\n", N, num_gpus0); } printf(" N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R_magma-R_lapack||_F / ||R_lapack||_F\n"); printf("========================================================================================\n"); for(i=0; i<10; i++){ N = size[i]; lda = N; n2 = lda*N; ldda = ((N+31)/32)*32; gflops = FLOPS( (double)N ) * 1e-9; /* Initialize the matrix */ lapackf77_dlarnv( &ione, ISEED, &n2, h_A ); /* Symmetrize and increase the diagonal */ for( int i = 0; i < N; ++i ) { h_A(i,i) = MAGMA_D_MAKE( MAGMA_D_REAL(h_A(i,i)) + N, 0 ); for( int j = 0; j < i; ++j ) { h_A(i, j) = MAGMA_D_CNJG( h_A(j,i) ); } } lapackf77_dlacpy( MagmaFullStr, &N, &N, h_A, &lda, h_R, &lda ); /* Warm up to measure the performance */ nb = magma_get_dpotrf_nb(N); if(num_gpus0 > N/nb){ num_gpus = N/nb; if(N%nb != 0) num_gpus ++; printf("too many GPUs for the matrix size, using %d GPUs\n", (int)num_gpus); }else{ num_gpus = num_gpus0; } /* distribute matrix to gpus */ if(uplo == MagmaUpper){ // Upper ldda = ((N+mb-1)/mb)*mb; for(j=0;j<N;j+=nb){ k = (j/nb)%num_gpus; nk = min(nb, N-j); magma_dsetmatrix( N, nk, &h_A[j*lda], lda, d_lA[k], j/(nb*num_gpus)*nb*ldda, ldda, queues[2*k]); } }else{ // Lower ldda = (1+N/(nb*num_gpus))*nb; for(j=0;j<N;j+=nb){ k = (j/nb)%num_gpus; nk = min(nb, N-j); magma_dsetmatrix( nk, N, &h_A[j], lda, d_lA[k], (j/(nb*num_gpus)*nb), ldda, queues[2*k]); } } magma_dpotrf_mgpu( num_gpus, uplo, N, d_lA, 0, ldda, queues, &info ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ /* distribute matrix to gpus */ if(uplo == MagmaUpper){ // Upper ldda = ((N+mb-1)/mb)*mb; for(j=0;j<N;j+=nb){ k = (j/nb)%num_gpus; nk = min(nb, N-j); magma_dsetmatrix( N, nk, &h_A[j*lda], lda, d_lA[k], j/(nb*num_gpus)*nb*ldda, ldda, queues[2*k]); } }else{ // Lower ldda = (1+N/(nb*num_gpus))*nb; for(j=0;j<N;j+=nb){ k = (j/nb)%num_gpus; nk = min(nb, N-j); magma_dsetmatrix( nk, N, &h_A[j], lda, d_lA[k], (j/(nb*num_gpus)*nb), ldda, queues[2*k]); } } gpu_time = magma_wtime(); magma_dpotrf_mgpu( num_gpus, uplo, N, d_lA, 0, ldda, queues, &info ); gpu_time = magma_wtime() - gpu_time; if (info != 0) printf( "magma_dpotrf had error %d.\n", info ); gpu_perf = gflops / gpu_time; /* gather matrix from gpus */ if(uplo==MagmaUpper){ // Upper for(j=0;j<N;j+=nb){ k = (j/nb)%num_gpus; nk = min(nb, N-j); magma_dgetmatrix( N, nk, d_lA[k], j/(nb*num_gpus)*nb*ldda, ldda, &h_R[j*lda], lda, queues[2*k]); } }else{ // Lower for(j=0; j<N; j+=nb){ k = (j/nb)%num_gpus; nk = min(nb, N-j); magma_dgetmatrix( nk, N, d_lA[k], (j/(nb*num_gpus)*nb), ldda, &h_R[j], lda, queues[2*k] ); } } /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); if(uplo == MagmaLower){ lapackf77_dpotrf( MagmaLowerStr, &N, h_A, &lda, &info ); }else{ lapackf77_dpotrf( MagmaUpperStr, &N, h_A, &lda, &info ); } cpu_time = magma_wtime() - cpu_time; if (info != 0) printf( "lapackf77_dpotrf had error %d.\n", info ); cpu_perf = gflops / cpu_time; /* ===================================================================== Check the result compared to LAPACK |R_magma - R_lapack| / |R_lapack| =================================================================== */ matnorm = lapackf77_dlange("f", &N, &N, h_A, &lda, work); blasf77_daxpy(&n2, &mz_one, h_A, &ione, h_R, &ione); diffnorm = lapackf77_dlange("f", &N, &N, h_R, &lda, work); printf( "%5d %6.2f (%6.2f) %6.2f (%6.2f) %e\n", N, cpu_perf, cpu_time, gpu_perf, gpu_time, diffnorm / matnorm ); if (flag != 0) break; } /* clean up */ TESTING_FREE_PIN( h_A ); TESTING_FREE_PIN( h_R ); for(i=0;i<num_gpus;i++){ TESTING_FREE_DEV( d_lA[i] ); magma_queue_destroy( queues[2*i] ); magma_queue_destroy( queues[2*i+1] ); } magma_finalize(); }