void equation_of_state(int imin, int imax, const int Hnxyt, const int Hnvar, const real_t Hsmallc, const real_t Hgamma, const int slices, const int Hstep, real_t eint[Hstep][Hnxyt], real_t q[Hnvar][Hstep][Hnxyt], real_t c[Hstep][Hnxyt]) { int k, s = slices; int inpar = 0; real_t smallp; WHERE("equation_of_state"); smallp = Square(Hsmallc) / Hgamma; FLOPS(1, 1, 0, 0); // printf("EOS: %d %d %d %d %g %g %d %d\n", imin, imax, Hnxyt, Hnvar, Hsmallc, Hgamma, slices, Hstep); #pragma simd for (k = imin; k < imax; k++) { real_t rhok = q[ID][s][k]; real_t base = (Hgamma - one) * rhok * eint[s][k]; base = MAX(base, (real_t) (rhok * smallp)); q[IP][s][k] = base; c[s][k] = sqrt(Hgamma * base / rhok); } { int nops = slices * (imax - imin); FLOPS(5 * nops, 2 * nops, 1 * nops, 0 * nops); } } // equation_of_state
void compute_deltat(real_t *dt, const hydroparam_t H, hydrowork_t * Hw, hydrovar_t * Hv, hydrovarwork_t * Hvw) { real_t cournox, cournoy; int j, jend, slices, Hstep, Hmin, Hmax; real_t (*e)[H.nxyt]; real_t (*c)[H.nxystep]; real_t (*q)[H.nxystep][H.nxyt]; WHERE("compute_deltat"); // compute time step on grid interior cournox = zero; cournoy = zero; c = (real_t (*)[H.nxystep]) Hw->c; e = (real_t (*)[H.nxystep]) Hw->e; q = (real_t (*)[H.nxystep][H.nxyt]) Hvw->q; Hstep = H.nxystep; Hmin = H.jmin + ExtraLayer; Hmax = H.jmax - ExtraLayer; for (j = Hmin; j < Hmax; j += Hstep) { jend = j + Hstep; if (jend >= Hmax) jend = Hmax; slices = jend - j; // numbre of slices to compute ComputeQEforRow(j, H.smallr, H.nx, H.nxt, H.nyt, H.nxyt, H.nvar, slices, Hstep, Hv->uold, q, e); equation_of_state(0, H.nx, H.nxyt, H.nvar, H.smallc, H.gamma, slices, Hstep, e, q, c); courantOnXY(&cournox, &cournoy, H.nx, H.nxyt, H.nvar, slices, Hstep, c, q, Hw->tmpm1, Hw->tmpm2); // fprintf(stdout, "[%2d]\t%g %g %g %g\n", H.mype, cournox, cournoy, H.smallc, H.courant_factor); } *dt = H.courant_factor * H.dx / MAX(cournox, MAX(cournoy, H.smallc)); FLOPS(1, 1, 2, 0); // fprintf(stdout, "[%2d]\t%g %g %g %g %g %g\n", H.mype, cournox, cournoy, H.smallc, H.courant_factor, H.dx, *dt); } // compute_deltat
inline void courantOnXY(real_t *cournox, real_t *cournoy, const int Hnx, const int Hnxyt, const int Hnvar, const int slices, const int Hstep, real_t c[Hstep][Hnxyt], real_t q[Hnvar][Hstep][Hnxyt], real_t *tmpm1, real_t *tmpm2 ) { #ifdef WOMP int s, i; // real_t maxValC = zero; real_t tmp1 = *cournox, tmp2 = *cournoy; #pragma omp parallel for shared(tmpm1, tmpm2) private(s,i) reduction(max:tmp1) reduction(max:tmp2) for (s = 0; s < slices; s++) { for (i = 0; i < Hnx; i++) { tmp1 = MAX(tmp1, c[s][i] + DABS(q[IU][s][i])); tmp2 = MAX(tmp2, c[s][i] + DABS(q[IV][s][i])); } } *cournox = tmp1; *cournoy = tmp2; { int nops = (slices) * Hnx; FLOPS(2 * nops, 0 * nops, 2 * nops, 0 * nops); } #else int i, s; real_t tmp1, tmp2; for (s = 0; s < slices; s++) { for (i = 0; i < Hnx; i++) { tmp1 = c[s][i] + DABS(q[IU][s][i]); tmp2 = c[s][i] + DABS(q[IV][s][i]); *cournox = MAX(*cournox, tmp1); *cournoy = MAX(*cournoy, tmp2); } } { int nops = (slices) * Hnx; FLOPS(2 * nops, 0 * nops, 5 * nops, 0 * nops); } #endif #undef IHVW }
void slope(const int n, const int Hnvar, const int Hnxyt, const real_t Hslope_type, const int slices, const int Hstep, real_t q[Hnvar][Hstep][Hnxyt], real_t dq[Hnvar][Hstep][Hnxyt]) { int nbv, i, ijmin, ijmax, s; // long ihvwin, ihvwimn, ihvwipn; // #define IHVW(i, v) ((i) + (v) * Hnxyt) WHERE("slope"); ijmin = 0; ijmax = n; // #define OLDSTYLE #pragma omp parallel for private(nbv, s, i) shared(dq) COLLAPSE for (s = 0; s < slices; s++) { for (nbv = 0; nbv < Hnvar; nbv++) { #pragma ivdep for (i = ijmin + 1; i < ijmax - 1; i++) { real_t dlft, drgt, dcen, dsgn, slop, dlim; int llftrgt = 0; real_t t1; dlft = Hslope_type * (q[nbv][s][i] - q[nbv][s][i - 1]); drgt = Hslope_type * (q[nbv][s][i + 1] - q[nbv][s][i]); dcen = half * (dlft + drgt) / Hslope_type; dsgn = (dcen > 0) ? (real_t) 1.0 : (real_t) -1.0; // sign(one, dcen); #ifdef OLDSTYLE slop = fmin(fabs(dlft), fabs(drgt)); dlim = slop; if ((dlft * drgt) <= zero) { dlim = zero; } dq[nbv][s][i] = dsgn * fmin(dlim, fabs(dcen)); #else llftrgt = ((dlft * drgt) <= zero); t1 = fmin(fabs(dlft), fabs(drgt)); dq[nbv][s][i] = dsgn * fmin((1 - llftrgt) * t1, fabs(dcen)); #endif } } } { int nops = Hnvar * slices * ((ijmax - 1) - (ijmin + 1)); FLOPS(8 * nops, 1 * nops, 6 * nops, 0 * nops); } } // slope
void constoprim(const int n, const int Hnxyt, const int Hnvar, const real_t Hsmallr, const int slices, const int Hstep, real_t u[Hnvar][Hstep][Hnxyt], real_t q[Hnvar][Hstep][Hnxyt], real_t e[Hstep][Hnxyt]) { int ijmin, ijmax, IN, i, s; real_t eken; // const int nxyt = Hnxyt; WHERE("constoprim"); ijmin = 0; ijmax = n; #pragma omp parallel for private(i, s, eken), shared(q,e) COLLAPSE for (s = 0; s < slices; s++) { for (i = ijmin; i < ijmax; i++) { real_t qid = MAX(u[ID][s][i], Hsmallr); q[ID][s][i] = qid; real_t qiu = u[IU][s][i] / qid; real_t qiv = u[IV][s][i] / qid; q[IU][s][i] = qiu; q[IV][s][i] = qiv; eken = half * (Square(qiu) + Square(qiv)); real_t qip = u[IP][s][i] / qid - eken; q[IP][s][i] = qip; e[s][i] = qip; } } { int nops = slices * ((ijmax) - (ijmin)); FLOPS(5 * nops, 3 * nops, 1 * nops, 0 * nops); } if (Hnvar > IP) { for (IN = IP + 1; IN < Hnvar; IN++) { for (s = 0; s < slices; s++) { for (i = ijmin; i < ijmax; i++) { q[IN][s][i] = u[IN][s][i] / q[IN][s][i]; } } } } } // constoprim
inline void ComputeQEforRow(const int j, const real_t Hsmallr, const int Hnx, const int Hnxt, const int Hnyt, const int Hnxyt, const int Hnvar, const int slices, const int Hstep, real_t * uold, real_t q[Hnvar][Hstep][Hnxyt], real_t e[Hstep][Hnxyt] ) { int i, s; #define IHV(i, j, v) ((i) + Hnxt * ((j) + Hnyt * (v))) #pragma omp parallel for shared(q, e) private(s, i) COLLAPSE for (s = 0; s < slices; s++) { for (i = 0; i < Hnx; i++) { real_t eken; real_t tmp; int idxuID = IHV(i + ExtraLayer, j + s, ID); int idxuIU = IHV(i + ExtraLayer, j + s, IU); int idxuIV = IHV(i + ExtraLayer, j + s, IV); int idxuIP = IHV(i + ExtraLayer, j + s, IP); q[ID][s][i] = MAX(uold[idxuID], Hsmallr); q[IU][s][i] = uold[idxuIU] / q[ID][s][i]; q[IV][s][i] = uold[idxuIV] / q[ID][s][i]; eken = half * (Square(q[IU][s][i]) + Square(q[IV][s][i])); tmp = uold[idxuIP] / q[ID][s][i] - eken; q[IP][s][i] = tmp; e[s][i] = tmp; } } { int nops = slices * Hnx; FLOPS(5 * nops, 3 * nops, 1 * nops, 0 * nops); } #undef IHV #undef IHVW }
int main( int argc, char** argv) { real_Double_t gflops, gpu_perf, cpu_perf, gpu_time, cpu_time; magmaDoubleComplex *hA, *hR; magmaDoubleComplex_ptr dA; magma_int_t N = 0, n2, lda, ldda; magma_int_t size[10] = { 1024, 2048, 3072, 4032, 5184, 6048, 7200, 8064, 8928, 10560 }; magma_int_t i, info; magmaDoubleComplex mz_one = MAGMA_Z_NEG_ONE; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; double work[1], matnorm, diffnorm; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) N = atoi(argv[++i]); } if (N>0) size[0] = size[9] = N; else exit(1); } else { printf("\nUsage: \n"); printf(" testing_zpotrf_gpu -N %d\n\n", 1024); } /* Initialize */ magma_queue_t queue; magma_device_t device; int num = 0; magma_err_t err; magma_init(); err = magma_get_devices( &device, 1, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } err = magma_queue_create( device, &queue ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } /* Allocate memory for the largest matrix */ N = size[9]; n2 = N * N; ldda = ((N+31)/32) * 32; TESTING_MALLOC( hA, magmaDoubleComplex, n2 ); TESTING_MALLOC_HOST( hR, magmaDoubleComplex, n2 ); TESTING_MALLOC_DEV( dA, magmaDoubleComplex, ldda*N ); printf("\n\n"); printf(" N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R_magma-R_lapack||_F / ||R_lapack||_F\n"); printf("========================================================================================\n"); for(i=0; i<10; i++){ N = size[i]; lda = N; n2 = lda*N; ldda = ((N+31)/32)*32; gflops = FLOPS( (double)N ) * 1e-9; /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &n2, hA ); /* Symmetrize and increase the diagonal */ for( int i = 0; i < N; ++i ) { MAGMA_Z_SET2REAL( hA(i,i), MAGMA_Z_REAL(hA(i,i)) + N ); for( int j = 0; j < i; ++j ) { hA(i, j) = MAGMA_Z_CNJG( hA(j,i) ); } } lapackf77_zlacpy( MagmaFullStr, &N, &N, hA, &lda, hR, &lda ); /* Warm up to measure the performance */ magma_zsetmatrix( N, N, hA, 0, lda, dA, 0, ldda, queue ); magma_zpotrf_gpu( MagmaUpper, N, dA, 0, ldda, &info, queue ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_zsetmatrix( N, N, hA, 0, lda, dA, 0, ldda, queue ); gpu_time = get_time(); magma_zpotrf_gpu( MagmaUpper, N, dA, 0, ldda, &info, queue ); gpu_time = get_time() - gpu_time; if (info != 0) printf( "magma_zpotrf had error %d.\n", info ); gpu_perf = gflops / gpu_time; /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = get_time(); lapackf77_zpotrf( MagmaUpperStr, &N, hA, &lda, &info ); cpu_time = get_time() - cpu_time; if (info != 0) printf( "lapackf77_zpotrf had error %d.\n", info ); cpu_perf = gflops / cpu_time; /* ===================================================================== Check the result compared to LAPACK |R_magma - R_lapack| / |R_lapack| =================================================================== */ magma_zgetmatrix( N, N, dA, 0, ldda, hR, 0, lda, queue ); matnorm = lapackf77_zlange("f", &N, &N, hA, &lda, work); blasf77_zaxpy(&n2, &mz_one, hA, &ione, hR, &ione); diffnorm = lapackf77_zlange("f", &N, &N, hR, &lda, work); printf( "%5d %6.2f (%6.2f) %6.2f (%6.2f) %e\n", N, cpu_perf, cpu_time, gpu_perf, gpu_time, diffnorm / matnorm ); if (argc != 1) break; } /* clean up */ TESTING_FREE( hA ); TESTING_FREE_HOST( hR ); TESTING_FREE_DEV( dA ); magma_queue_destroy( queue ); magma_finalize(); }
int main( int argc, char** argv) { real_Double_t gflops, gpu_perf, cpu_perf, gpu_time, cpu_time; float matnorm, work[1]; float mzone = MAGMA_S_NEG_ONE; float *h_A, *h_R, *tau, *hwork, tmp[1]; magmaFloat_ptr d_A; /* Matrix size */ magma_int_t M = 0, N = 0, n2, lda, ldda, lhwork; magma_int_t size[10] = {1024,2048,3072,4032,5184,6016,7040,8064,9088,10176}; magma_int_t i, info, min_mn; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) N = atoi(argv[++i]); else if (strcmp("-M", argv[i])==0) M = atoi(argv[++i]); } if ( M == 0 ) { M = N; } if ( N == 0 ) { N = M; } if (M>0 && N>0) printf(" testing_sgeqrf_gpu -M %d -N %d\n\n", M, N); else { printf("\nUsage: \n"); printf(" testing_sgeqrf_gpu -M %d -N %d\n\n", 1024, 1024); exit(1); } } else { printf("\nUsage: \n"); printf(" testing_sgeqrf_gpu -M %d -N %d\n\n", 1024, 1024); M = N = size[7]; } /* Initialize */ magma_queue_t queue1, queue2; magma_device_t device[ MagmaMaxGPUs ]; int num = 0; magma_err_t err; magma_init(); err = magma_get_devices( device, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } err = magma_queue_create( device[0], &queue1 ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } err = magma_queue_create( device[0], &queue2 ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } magma_queue_t queues[2] = {queue1, queue2}; ldda = ((M+31)/32)*32; n2 = M * N; min_mn = min(M, N); /* Allocate host memory for the matrix */ TESTING_MALLOC_CPU( tau, float, min_mn ); TESTING_MALLOC_CPU( h_A, float, n2 ); TESTING_MALLOC_PIN( h_R, float, n2 ); TESTING_MALLOC_DEV( d_A, float, ldda*N ); lhwork = -1; lapackf77_sgeqrf(&M, &N, h_A, &M, tau, tmp, &lhwork, &info); lhwork = (magma_int_t)MAGMA_S_REAL( tmp[0] ); TESTING_MALLOC_CPU( hwork, float, lhwork ); printf("\n\n"); printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R||_F / ||A||_F\n"); printf("======================================================================\n"); for(i=0; i<8; i++){ if (argc == 1){ M = N = size[i]; } min_mn= min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = FLOPS( (float)M, (float)N ) * 1e-9; /* Initialize the matrix */ lapackf77_slarnv( &ione, ISEED, &n2, h_A ); lapackf77_slacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_sgeqrf(&M, &N, h_A, &M, tau, hwork, &lhwork, &info); cpu_time = magma_wtime() - cpu_time; if (info < 0) printf("Argument %d of lapack_sgeqrf had an illegal value.\n", -info); cpu_perf = gflops / cpu_time; /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_ssetmatrix( M, N, h_R, 0, lda, d_A, 0, ldda, queue1 ); magma_sgeqrf2_gpu( M, N, d_A, 0, ldda, tau, &info, queues); magma_ssetmatrix( M, N, h_R, 0, lda, d_A, 0, ldda, queue1 ); clFinish(queue1); clFinish(queue2); gpu_time = magma_wtime(); magma_sgeqrf2_gpu( M, N, d_A, 0, ldda, tau, &info, queues); gpu_time = magma_wtime() - gpu_time; if (info < 0) printf("Argument %d of magma_sgeqrf2 had an illegal value.\n", -info); gpu_perf = gflops / gpu_time; /* ===================================================================== Check the result compared to LAPACK =================================================================== */ magma_sgetmatrix( M, N, d_A, 0, ldda, h_R, 0, M, queue1 ); matnorm = lapackf77_slange("f", &M, &N, h_A, &M, work); blasf77_saxpy(&n2, &mzone, h_A, &ione, h_R, &ione); printf("%5d %5d %6.2f (%6.2f) %6.2f (%6.2f) %e\n", M, N, cpu_perf, cpu_time, gpu_perf, gpu_time, lapackf77_slange("f", &M, &N, h_R, &M, work) / matnorm); if (argc != 1) break; } /* clean up */ TESTING_FREE_CPU( tau ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( hwork ); TESTING_FREE_PIN( h_R ); TESTING_FREE_DEV( d_A ); magma_queue_destroy( queue1 ); magma_queue_destroy( queue2 ); magma_finalize(); }
int main( int argc, char** argv) { real_Double_t gflops, gpu_perf, cpu_perf, gpu_time, cpu_time, error; float matnorm, work[1]; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex *h_A, *h_R, *tau, *h_work, tmp[1]; magmaFloatComplex_ptr d_lA[MagmaMaxGPUs]; /* Matrix size */ magma_int_t M = 0, N = 0, n2, n_local[4], lda, ldda, lhwork; magma_int_t size[10] = {1000,2000,3000,4000,5000,6000,7000,8000,9000,10000}; magma_int_t i, k, nk, info, min_mn; int max_num_gpus = 2, num_gpus = 2; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) N = atoi(argv[++i]); else if (strcmp("-M", argv[i])==0) M = atoi(argv[++i]); else if (strcmp("-NGPU", argv[i])==0) num_gpus = atoi(argv[++i]); } if ( M == 0 ) { M = N; } if ( N == 0 ) { N = M; } if (M>0 && N>0) printf(" testing_cgeqrf_gpu -M %d -N %d -NGPU %d\n\n", (int) M, (int) N, (int) num_gpus); else { printf("\nUsage: \n"); printf(" testing_cgeqrf_gpu -M %d -N %d -NGPU %d\n\n", 1024, 1024, 1); exit(1); } } else { printf("\nUsage: \n"); printf(" testing_cgeqrf_gpu -M %d -N %d -NGPU %d\n\n", 1024, 1024, 1); M = N = size[9]; } ldda = ((M+31)/32)*32; n2 = M * N; min_mn = min(M, N); magma_int_t nb = magma_get_cgeqrf_nb(M); if (num_gpus > max_num_gpus){ printf("More GPUs requested than available. Have to change it.\n"); num_gpus = max_num_gpus; } printf("Number of GPUs to be used = %d\n", (int) num_gpus); /* Initialize */ magma_queue_t queues[MagmaMaxGPUs * 2]; magma_device_t devices[ MagmaMaxGPUs ]; magma_int_t num = 0; magma_int_t err; magma_init(); err = magma_getdevices( devices, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_getdevices failed: %d\n", (int) err ); exit(-1); } for(i=0;i<num_gpus;i++){ err = magma_queue_create( devices[i], &queues[2*i] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", (int) err ); exit(-1); } err = magma_queue_create( devices[i], &queues[2*i+1] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", (int) err ); exit(-1); } } /* Allocate host memory for the matrix */ TESTING_MALLOC_CPU( tau, magmaFloatComplex, min_mn ); TESTING_MALLOC_CPU( h_A, magmaFloatComplex, n2 ); TESTING_MALLOC_CPU( h_R, magmaFloatComplex, n2 ); for(i=0; i<num_gpus; i++){ n_local[i] = ((N/nb)/num_gpus)*nb; if (i < (N/nb)%num_gpus) n_local[i] += nb; else if (i == (N/nb)%num_gpus) n_local[i] += N%nb; TESTING_MALLOC_DEV( d_lA[i], magmaFloatComplex, ldda*n_local[i] ); printf("device %2d n_local = %4d\n", (int) i, (int) n_local[i]); } lhwork = -1; lapackf77_cgeqrf(&M, &N, h_A, &M, tau, tmp, &lhwork, &info); lhwork = (magma_int_t)MAGMA_C_REAL( tmp[0] ); TESTING_MALLOC_CPU( h_work, magmaFloatComplex, lhwork ); printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R||_F / ||A||_F\n"); printf("======================================================================\n"); for(i=0; i<10; i++){ if (argc == 1){ M = N = size[i]; } min_mn= min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = FLOPS( (float)M, (float)N ) * 1e-9; /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &n2, h_A ); lapackf77_clacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_cgeqrf(&M, &N, h_A, &M, tau, h_work, &lhwork, &info); cpu_time = magma_wtime() - cpu_time; if (info < 0) printf("Argument %d of lapack_cgeqrf had an illegal value.\n", (int) -info); cpu_perf = gflops / cpu_time; /* ==================================================================== Performs operation using MAGMA =================================================================== */ int j; magma_queue_t *trans_queues = (magma_queue_t*)malloc(num_gpus*sizeof(magma_queue_t)); for(j=0;j<num_gpus;j++){ trans_queues[j] = queues[2*j]; } // warm-up magma_csetmatrix_1D_col_bcyclic(M, N, h_R, lda, d_lA, ldda, num_gpus, nb, trans_queues); magma_cgeqrf2_mgpu( num_gpus, M, N, d_lA, ldda, tau, queues, &info); magma_csetmatrix_1D_col_bcyclic(M, N, h_R, lda, d_lA, ldda, num_gpus, nb, trans_queues); gpu_time = magma_wtime(); magma_cgeqrf2_mgpu( num_gpus, M, N, d_lA, ldda, tau, queues, &info); gpu_time = magma_wtime() - gpu_time; if (info < 0) printf("Argument %d of magma_cgeqrf2 had an illegal value.\n", (int) -info); gpu_perf = gflops / gpu_time; /* ===================================================================== Check the result compared to LAPACK =================================================================== */ magma_cgetmatrix_1D_col_bcyclic(M, N, d_lA, ldda, h_R, lda, num_gpus, nb, trans_queues); matnorm = lapackf77_clange("f", &M, &N, h_A, &M, work); blasf77_caxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione); printf("%5d %5d %6.2f (%6.2f) %6.2f (%6.2f) %e\n", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, lapackf77_clange("f", &M, &N, h_R, &M, work) / matnorm); if (argc != 1) break; } /* Memory clean up */ TESTING_FREE_PIN( tau ); TESTING_FREE_PIN( h_A ); TESTING_FREE_PIN( h_work ); TESTING_FREE_PIN( h_R ); for(i=0; i<num_gpus; i++){ TESTING_FREE_DEV( d_lA[i] ); magma_queue_destroy(queues[2*i]); magma_queue_destroy(queues[2*i+1]); } /* Shutdown */ magma_finalize(); }
int main(int argc, char **argv) { TESTING_INIT(); magma_setdevice(0); magma_timestr_t start, end; float flops, magma_perf, cuda_perf, error, work[1]; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_int_t n_local[4]; FILE *fp ; magma_int_t N, m, i, j, lda, LDA, M; magma_int_t matsize; magma_int_t vecsize; magma_int_t istart = 64; magma_int_t incx = 1; char uplo = MagmaLower; magmaFloatComplex alpha = MAGMA_C_MAKE(1., 0.); // MAGMA_C_MAKE( 1.5, -2.3 ); magmaFloatComplex beta = MAGMA_C_MAKE(0., 0.); // MAGMA_C_MAKE( -0.6, 0.8 ); magmaFloatComplex *A, *X, *Y[4], *Ycublas, *Ymagma; magmaFloatComplex *dA, *dX[4], *dY[4], *d_lA[4], *dYcublas ; magma_queue_t stream[4][10]; magmaFloatComplex *C_work; magmaFloatComplex *dC_work[4]; int max_num_gpus; magma_int_t num_gpus = 1, nb; magma_int_t blocks, lwork; magma_int_t offset = 0; M = 0; N = 0; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) { N = atoi(argv[++i]); istart = N; } else if (strcmp("-M", argv[i])==0) M = atoi(argv[++i]); else if (strcmp("-NGPU", argv[i])==0) num_gpus = atoi(argv[++i]); else if (strcmp("-offset", argv[i])==0) offset = atoi(argv[++i]); } if ( M == 0 ) { M = N; } if ( N == 0 ) { N = M; } if (M>0 && N>0) { printf(" testing_chemv_mgpu -M %d -N %d -NGPU %d\n\n", (int) M, (int) N, (int) num_gpus); printf(" in %c side \n", uplo); } else { printf("\nUsage: \n"); printf(" testing_chemv_mgpu -M %d -N %d -NGPU %d\n\n", 1024, 1024, 1); exit(1); } } else { #if defined(PRECISION_z) M = N = 8000; #else M = N = 12480; #endif num_gpus = 2; offset = 0; printf("\nUsage: \n"); printf(" testing_chemv_mgpu -M %d -N %d -NGPU %d\n\n", (int) M, (int) N, (int) num_gpus); } ////////////////////////////////////////////////////////////////////////// cudaGetDeviceCount(&max_num_gpus); if (num_gpus > max_num_gpus){ printf("More GPUs requested than available. Have to change it.\n"); num_gpus = max_num_gpus; } printf("Number of GPUs to be used = %d\n", (int) num_gpus); for(int i=0; i< num_gpus; i++) { magma_queue_create(&stream[i][0]); } LDA = ((N+31)/32)*32; matsize = N*LDA; vecsize = N*incx; nb = 32; //nb = 64; printf("block size = %d\n", (int) nb); TESTING_MALLOC_CPU( A, magmaFloatComplex, matsize ); TESTING_MALLOC_CPU( X, magmaFloatComplex, vecsize ); TESTING_MALLOC_CPU( Ycublas, magmaFloatComplex, vecsize ); TESTING_MALLOC_CPU( Ymagma, magmaFloatComplex, vecsize ); for(i=0; i<num_gpus; i++) { TESTING_MALLOC_CPU( Y[i], magmaFloatComplex, vecsize ); } magma_setdevice(0); TESTING_MALLOC_DEV( dA, magmaFloatComplex, matsize ); TESTING_MALLOC_DEV( dYcublas, magmaFloatComplex, vecsize ); for(i=0; i<num_gpus; i++) { n_local[i] = ((N/nb)/num_gpus)*nb; if (i < (N/nb)%num_gpus) n_local[i] += nb; else if (i == (N/nb)%num_gpus) n_local[i] += N%nb; magma_setdevice(i); TESTING_MALLOC_DEV( d_lA[i], magmaFloatComplex, LDA*n_local[i] );// potentially bugged TESTING_MALLOC_DEV( dX[i], magmaFloatComplex, vecsize ); TESTING_MALLOC_DEV( dY[i], magmaFloatComplex, vecsize ); printf("device %2d n_local = %4d\n", (int) i, (int) n_local[i]); } magma_setdevice(0); ////////////////////////////////////////////////////////////////////////// /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &matsize, A ); magma_cmake_hermitian( N, A, LDA ); blocks = N / nb + (N % nb != 0); lwork = LDA * (blocks + 1); TESTING_MALLOC_CPU( C_work, magmaFloatComplex, lwork ); for(i=0; i<num_gpus; i++){ magma_setdevice(i); TESTING_MALLOC_DEV( dC_work[i], magmaFloatComplex, lwork ); //fillZero(dC_work[i], lwork); } magma_setdevice(0); ////////////////////////////////////////////////////////////////////////// fp = fopen ("results_chemv_mgpu.csv", "w") ; if( fp == NULL ){ printf("Couldn't open output file\n"); exit(1);} printf("CHEMV magmaFloatComplex precision\n\n"); printf( " n CUBLAS,Gflop/s MAGMABLAS,Gflop/s \"error\"\n" "==============================================================\n"); fprintf(fp, " n CUBLAS,Gflop/s MAGMABLAS,Gflop/s \"error\"\n" "==============================================================\n"); // for( offset = 0; offset< N; offset ++ ) for(int size = istart ; size <= N ; size += 128) { // printf("offset = %d ", offset); m = size ; // m = N; // lda = ((m+31)/32)*32;// lda = LDA; flops = FLOPS( (float)m ) / 1e6; printf( "N %5d ", (int) m ); fprintf( fp, "%5d, ", (int) m ); vecsize = m * incx; lapackf77_clarnv( &ione, ISEED, &vecsize, X ); lapackf77_clarnv( &ione, ISEED, &vecsize, Y[0] ); /* ===================================================================== Performs operation using CUDA-BLAS =================================================================== */ magma_setdevice(0); magma_csetmatrix_1D_col_bcyclic(m, m, A, LDA, d_lA, lda, num_gpus, nb); magma_setdevice(0); magma_csetmatrix( m, m, A, LDA, dA, lda ); magma_csetvector( m, Y[0], incx, dYcublas, incx ); for(i=0; i<num_gpus; i++){ magma_setdevice(i); magma_csetvector( m, X, incx, dX[i], incx ); magma_csetvector( m, Y[0], incx, dY[i], incx ); blocks = m / nb + (m % nb != 0); magma_csetmatrix( lda, blocks, C_work, LDA, dC_work[i], lda ); } magma_setdevice(0); start = get_current_time(); cublasChemv( uplo, m-offset, alpha, dA + offset + offset * lda, lda, dX[0] + offset, incx, beta, dYcublas + offset, incx ); end = get_current_time(); magma_cgetvector( m, dYcublas, incx, Ycublas, incx ); cuda_perf = flops / GetTimerValue(start,end); printf( "%11.2f", cuda_perf ); fprintf(fp, "%11.2f,", cuda_perf ); magma_setdevice(0); start = get_current_time(); if(nb == 32) { magmablas_chemv2_mgpu_32_offset( uplo, m, alpha, d_lA, lda, dX, incx, beta, dY, incx, dC_work, lwork, num_gpus, nb, offset); } else // nb = 64 { magmablas_chemv2_mgpu_offset( uplo, m, alpha, d_lA, lda, dX, incx, beta, dY, incx, dC_work, lwork, num_gpus, nb, offset); } for(i=1; i<num_gpus; i++) { magma_setdevice(i); cudaDeviceSynchronize(); } end = get_current_time(); magma_perf = flops / GetTimerValue(start,end); printf( "%11.2f", magma_perf ); fprintf(fp, "%11.2f,", magma_perf ); for(i=0; i<num_gpus; i++) { magma_setdevice(i); magma_cgetvector( m, dY[i], incx, Y[i], incx ); } magma_setdevice(0); #ifdef validate for( j= offset;j<m;j++) { for(i=1; i<num_gpus; i++) { // printf("Y[%d][%d] = %15.14f\n", i, j, Y[i][j].x); #if defined(PRECISION_z) || defined(PRECISION_c) Y[0][j].x = Y[0][j].x + Y[i][j].x; Y[0][j].y = Y[0][j].y + Y[i][j].y; #else Y[0][j] = Y[0][j] + Y[i][j]; #endif } } /* #if defined(PRECISION_z) || defined(PRECISION_c) for( j=offset;j<m;j++) { if(Y[0][j].x != Ycublas[j].x) { printf("Y-multi[%d] = %f, %f\n", j, Y[0][j].x, Y[0][j].y ); printf("Ycublas[%d] = %f, %f\n", j, Ycublas[j].x, Ycublas[j].y); } } #else for( j=offset;j<m;j++) { if(Y[0][j] != Ycublas[j]) { printf("Y-multi[%d] = %f\n", j, Y[0][j] ); printf("Ycublas[%d] = %f\n", j, Ycublas[j]); } } #endif */ /* ===================================================================== Computing the Difference Cublas VS Magma =================================================================== */ magma_int_t nw = m - offset ; blasf77_caxpy( &nw, &c_neg_one, Y[0] + offset, &incx, Ycublas + offset, &incx); error = lapackf77_clange( "M", &nw, &ione, Ycublas + offset, &nw, work ); #if 0 printf( "\t\t %8.6e", error / m ); fprintf( fp, "\t\t %8.6e", error / m ); /* * Extra check with cblas vs magma */ cblas_ccopy( m, Y, incx, Ycublas, incx ); cblas_chemv( CblasColMajor, CblasLower, m, CBLAS_SADDR(alpha), A, LDA, X, incx, CBLAS_SADDR(beta), Ycublas, incx ); blasf77_caxpy( &m, &c_neg_one, Ymagma, &incx, Ycublas, &incx); error = lapackf77_clange( "M", &m, &ione, Ycublas, &m, work ); #endif printf( "\t\t %8.6e", error / m ); fprintf( fp, "\t\t %8.6e", error / m ); #endif printf("\n"); fprintf(fp, "\n"); } fclose( fp ) ; /* Free Memory */ TESTING_FREE_CPU( A ); TESTING_FREE_CPU( X ); TESTING_FREE_CPU( Ycublas ); TESTING_FREE_CPU( Ymagma ); TESTING_FREE_CPU( C_work ); magma_setdevice(0); TESTING_FREE_DEV( dA ); TESTING_FREE_DEV( dYcublas ); for(i=0; i<num_gpus; i++) { TESTING_FREE_CPU( Y[i] ); magma_setdevice(i); TESTING_FREE_DEV( d_lA[i] ); TESTING_FREE_DEV( dX[i] ); TESTING_FREE_DEV( dY[i] ); TESTING_FREE_DEV( dC_work[i] ); } magma_setdevice(0); /////////////////////////////////////////////////////////// /* Free device */ TESTING_FINALIZE(); return 0; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zgeqrf */ int main( magma_int_t argc, char** argv) { cuDoubleComplex *h_A, *h_R, *h_A2, *h_A3, *h_work, *h_work2, *tau, *d_work2; cuDoubleComplex *d_A, *d_work; float gpu_perf, cpu_perf, cpu2_perf; double flops; magma_timestr_t start, end; /* Matrix size */ magma_int_t N=0, n2, lda, M=0; magma_int_t size[10] = {1024,2048,3072,4032,5184,6016,7040,8064,9088,10112}; magma_int_t i, j, info[1]; magma_int_t loop = argc; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t num_cores = 4; magma_int_t num_gpus = 0; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) N = atoi(argv[++i]); else if (strcmp("-M", argv[i])==0) M = atoi(argv[++i]); else if (strcmp("-C", argv[i])==0) num_cores = atoi(argv[++i]); } if ((M>0 && N>0) || (M==0 && N==0)) { printf(" testing_zgeqrf_mc -M %d -N %d \n\n", M, N); if (M==0 && N==0) { M = N = size[9]; loop = 1; } } else { printf("\nUsage: \n"); printf(" testing_zgeqrf_mc -M %d -N %d -B 128 -T 1\n\n", 1024, 1024); exit(1); } } else { printf("\nUsage: \n"); printf(" testing_zgeqrf_mc -M %d -N %d -B 128 -T 1\n\n", 1024, 1024); M = N = size[9]; } n2 = M * N; magma_int_t min_mn = min(M,N); /* Allocate host memory for the matrix */ h_A2 = (cuDoubleComplex*)malloc(n2 * sizeof(h_A2[0])); if (h_A2 == 0) { fprintf (stderr, "!!!! host memory allocation error (A2)\n"); } magma_int_t lwork = n2; h_work2 = (cuDoubleComplex*)malloc(lwork * sizeof(cuDoubleComplex)); if (h_work2 == 0) { fprintf (stderr, "!!!! host memory allocation error (h_work2)\n"); } h_A3 = (cuDoubleComplex*)malloc(n2 * sizeof(h_A3[0])); if (h_A3 == 0) { fprintf (stderr, "!!!! host memory allocation error (A3)\n"); } tau = (cuDoubleComplex*)malloc(min_mn * sizeof(cuDoubleComplex)); if (tau == 0) { fprintf (stderr, "!!!! host memory allocation error (tau)\n"); } /* Initialize MAGMA hardware context, seeting how many CPU cores and how many GPUs to be used in the consequent computations */ magma_context *context; context = magma_init(NULL, NULL, 0, num_cores, num_gpus, argc, argv); printf("\n\n"); printf(" M N LAPACK Gflop/s Multi-core Gflop/s ||R||_F / ||A||_F\n"); printf("===========================================================================\n"); for(i=0; i<10; i++){ if (loop == 1) { M = N = size[i]; n2 = M*N; } flops = FLOPS( (double)M, (double)N ) / 1000000; /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A2 ); lapackf77_zlacpy( MagmaUpperLowerStr, &M, &N, h_A2, &M, h_A3, &M ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ start = get_current_time(); lapackf77_zgeqrf(&M, &N, h_A3, &M, tau, h_work2, &lwork, info); end = get_current_time(); if (info[0] < 0) printf("Argument %d of sgeqrf had an illegal value.\n", -info[0]); cpu2_perf = flops / GetTimerValue(start, end); /* ===================================================================== Performs operation using multicore =================================================================== */ start = get_current_time(); magma_zgeqrf_mc(context, &M, &N, h_A2, &M, tau, h_work2, &lwork, info); end = get_current_time(); if (info[0] < 0) printf("Argument %d of sgeqrf had an illegal value.\n", -info[0]); cpu_perf = flops / GetTimerValue(start, end); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ double work[1], matnorm = 1.; cuDoubleComplex mone = MAGMA_Z_NEG_ONE; magma_int_t one = 1; matnorm = lapackf77_zlange("f", &M, &N, h_A2, &M, work); blasf77_zaxpy(&n2, &mone, h_A2, &one, h_A3, &one); printf("%5d %5d %6.2f %6.2f %e\n", M, N, cpu2_perf, cpu_perf, lapackf77_zlange("f", &M, &N, h_A3, &M, work) / matnorm); if (loop != 1) break; } /* Memory clean up */ free(h_A2); free(tau); free(h_A3); free(h_work2); /* Shut down the MAGMA context */ magma_finalize(context); }
int main(int argc, char **argv) { TESTING_CUDA_INIT(); magma_timestr_t start, end; double flops, magma_perf, cuda_perf, error, work[1]; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; cuDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; FILE *fp ; magma_int_t i, lda, Xm, Ym; magma_int_t M, M0 = 0; magma_int_t N, N0 = 0; magma_int_t szeA, szeX, szeY; magma_int_t istart = 64; magma_int_t iend = 10240; magma_int_t incx = 1; magma_int_t incy = 1; char trans = MagmaNoTrans; cuDoubleComplex alpha = MAGMA_Z_MAKE(1., 0.); // MAGMA_Z_MAKE( 1.5, -2.3 ); cuDoubleComplex beta = MAGMA_Z_MAKE(0., 0.); // MAGMA_Z_MAKE( -0.6, 0.8 ); cuDoubleComplex *A, *X, *Y, *Ycublas, *Ymagma; cuDoubleComplex *dA, *dX, *dY; if (argc != 1){ for(i=1; i<argc; i++){ if ( strcmp("-n", argv[i]) == 0 ){ N0 = atoi(argv[++i]); } else if ( strcmp("-m", argv[i]) == 0 ){ M0 = atoi(argv[++i]); } else if (strcmp("-N", argv[i])==0){ trans = MagmaNoTrans; } else if (strcmp("-T", argv[i])==0){ trans = MagmaTrans; } #if defined(PRECISION_z) || defined(PRECISION_c) else if (strcmp("-C", argv[i])==0){ trans = MagmaConjTrans; } #endif } } if ( (M0 != 0) && (N0 != 0) ) iend = istart + 1; M = N = iend; if ( M0 != 0 ) M = M0; if ( N0 != 0 ) N = N0; if( trans == MagmaNoTrans ) { Xm = N; Ym = M; } else { Xm = M; Ym = N; } lda = ((M+31)/32)*32; szeA = lda*N; szeX = incx*Xm; szeY = incy*Ym; TESTING_MALLOC( A, cuDoubleComplex, szeA ); TESTING_MALLOC( X, cuDoubleComplex, szeX ); TESTING_MALLOC( Y, cuDoubleComplex, szeY ); TESTING_MALLOC( Ycublas, cuDoubleComplex, szeY ); TESTING_MALLOC( Ymagma, cuDoubleComplex, szeY ); TESTING_DEVALLOC( dA, cuDoubleComplex, szeA ); TESTING_DEVALLOC( dX, cuDoubleComplex, szeX ); TESTING_DEVALLOC( dY, cuDoubleComplex, szeY ); /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &szeA, A ); lapackf77_zlarnv( &ione, ISEED, &szeX, X ); lapackf77_zlarnv( &ione, ISEED, &szeY, Y ); fp = fopen ("results_zgemv.txt", "w") ; if( fp == NULL ){ printf("Couldn't open output file\n"); exit(1);} printf("\nUsage: \n"); printf(" testing_zgemv [-N|T|C] [-m %d] [-n %d]\n\n", 1024, 1024); printf( " m n CUBLAS,Gflop/s MAGMABLAS Gflop/s \"error\"\n" "==============================================================\n"); fprintf(fp, " m n CUBLAS,Gflop/s MAGMABLAS Gflop/s \"error\"\n" "==============================================================\n"); for( i=istart; i < iend; i = (int)((i+1)*1.1) ) { M = N = i; if ( M0 != 0 ) M = M0; if ( N0 != 0 ) N = N0; if( trans == MagmaNoTrans ) { Xm = N; Ym = M; } else { Xm = M; Ym = N; } lda = ((M+31)/32)*32; flops = FLOPS( (double)M, (double)N ) / 1000000; printf( "%5d %5d ", (int) M, (int) N ); fprintf( fp, "%5d %5d ", (int) M, (int) N ); /* ===================================================================== Performs operation using CUDA-BLAS =================================================================== */ magma_zsetmatrix( M, N, A, lda, dA, lda ); magma_zsetvector( Xm, X, incx, dX, incx ); magma_zsetvector( Ym, Y, incy, dY, incy ); /* * Cublas Version */ start = get_current_time(); cublasZgemv( trans, M, N, alpha, dA, lda, dX, incx, beta, dY, incy ); end = get_current_time(); magma_zgetvector( Ym, dY, incy, Ycublas, incy ); cuda_perf = flops / GetTimerValue(start, end); printf( "%11.2f", cuda_perf ); fprintf(fp, "%11.2f", cuda_perf ); /* * Magma Version */ magma_zsetvector( Ym, Y, incy, dY, incy ); start = get_current_time(); magmablas_zgemv( trans, M, N, alpha, dA, lda, dX, incx, beta, dY, incy ); end = get_current_time(); magma_zgetvector( Ym, dY, incx, Ymagma, incx ); magma_perf = flops / GetTimerValue(start, end); printf( "%11.2f", magma_perf ); fprintf(fp, "%11.2f", magma_perf ); /* ===================================================================== Computing the Difference Cublas VS Magma =================================================================== */ blasf77_zaxpy( &Ym, &c_neg_one, Ymagma, &incy, Ycublas, &incy); error = lapackf77_zlange( "M", &Ym, &ione, Ycublas, &Ym, work ); #if 0 printf( "\t\t %8.6e", error / (double)Ym ); fprintf( fp, "\t\t %8.6e", error / (double)Ym ); /* * Blas comparaison */ { char *blastrans = MagmaNoTransStr; if ( trans == MagmaConjTrans ) blastrans = MagmaConjTransStr; else if ( trans == MagmaTrans ) blastrans = MagmaTransStr; blasf77_zcopy( &Ym, Y, &incy, Ycublas, &incy ); blasf77_zgemv( blastrans, &M, &N, &alpha, A, &lda, X, &incx, &beta, Ycublas, &incy ); blasf77_zaxpy( &Ym, &c_neg_one, Ymagma, &incy, Ycublas, &incy); error = lapackf77_zlange( "M", &Ym, &ione, Ycublas, &Ym, work ); } #endif printf( "\t\t %8.6e\n", error / (double)Ym ); fprintf( fp, "\t\t %8.6e\n", error / (double)Ym ); } /* Free Memory */ TESTING_FREE( A ); TESTING_FREE( X ); TESTING_FREE( Y ); TESTING_FREE( Ycublas ); TESTING_FREE( Ymagma ); TESTING_DEVFREE( dA ); TESTING_DEVFREE( dX ); TESTING_DEVFREE( dY ); /* Free device */ TESTING_CUDA_FINALIZE(); return EXIT_SUCCESS; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zpotrf_mc */ int main( magma_int_t argc, char** argv) { cuDoubleComplex *h_A, *h_R, *h_work, *h_A2; cuDoubleComplex *d_A; float gpu_perf, cpu_perf, cpu_perf2; magma_timestr_t start, end; /* Matrix size */ magma_int_t N=0, n2, lda; magma_int_t size[10] = {1024,2048,3072,4032,5184,6048,7200,8064,8928,10080}; magma_int_t i, j, info[1]; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t num_cores = 4; int num_gpus = 0; magma_int_t loop = argc; if (argc != 1) { for(i = 1; i<argc; i++) { if (strcmp("-N", argv[i])==0) N = atoi(argv[++i]); else if (strcmp("-C", argv[i])==0) num_cores = atoi(argv[++i]); } if (N==0) { N = size[9]; loop = 1; } else { size[0] = size[9] = N; } } else { printf("\nUsage: \n"); printf(" testing_zpotrf_mc -N %d -B 128 \n\n", 1024); N = size[9]; } lda = N; n2 = size[9] * size[9]; /* Allocate host memory for the matrix */ h_A = (cuDoubleComplex*)malloc(n2 * sizeof(h_A[0])); if (h_A == 0) { fprintf (stderr, "!!!! host memory allocation error (A)\n"); } /* Allocate host memory for the matrix */ h_A2 = (cuDoubleComplex*)malloc(n2 * sizeof(h_A2[0])); if (h_A2 == 0) { fprintf (stderr, "!!!! host memory allocation error (A2)\n"); } /* Initialize MAGMA hardware context, seeting how many CPU cores and how many GPUs to be used in the consequent computations */ magma_context *context; context = magma_init(NULL, NULL, 0, num_cores, num_gpus, argc, argv); printf("\n\n"); printf(" N Multicore GFlop/s ||R||_F / ||A||_F\n"); printf("=============================================\n"); for(i=0; i<10; i++) { N = lda = size[i]; n2 = N*N; lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); for(j=0; j<N; j++) MAGMA_Z_SET2REAL( h_A[j*lda+j], ( MAGMA_Z_GET_X(h_A[j*lda+j]) + 2000. ) ); for(j=0; j<n2; j++) h_A2[j] = h_A[j]; /* ===================================================================== Performs operation using LAPACK =================================================================== */ //lapackf77_zpotrf("L", &N, h_A, &lda, info); lapackf77_zpotrf("U", &N, h_A, &lda, info); if (info[0] < 0) printf("Argument %d of zpotrf had an illegal value.\n", -info[0]); /* ===================================================================== Performs operation using multi-core =================================================================== */ start = get_current_time(); //magma_zpotrf_mc(context, "L", &N, h_A2, &lda, info); magma_zpotrf_mc(context, "U", &N, h_A2, &lda, info); end = get_current_time(); if (info[0] < 0) printf("Argument %d of magma_zpotrf_mc had an illegal value.\n", -info[0]); cpu_perf2 = FLOPS( (double)N ) / (1000000.*GetTimerValue(start,end)); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ double work[1], matnorm = 1.; cuDoubleComplex mone = MAGMA_Z_NEG_ONE; int one = 1; matnorm = lapackf77_zlange("f", &N, &N, h_A, &N, work); blasf77_zaxpy(&n2, &mone, h_A, &one, h_A2, &one); printf("%5d %6.2f %e\n", size[i], cpu_perf2, lapackf77_zlange("f", &N, &N, h_A2, &N, work) / matnorm); if (loop != 1) break; } /* Memory clean up */ free(h_A); free(h_A2); /* Shut down the MAGMA context */ magma_finalize(context); }
int main( int argc, char** argv) { real_Double_t gflops, gpu_perf, cpu_perf, gpu_time, cpu_time; float *h_A, *h_R; magmaFloat_ptr d_lA[MagmaMaxGPUs]; magma_int_t N = 0, n2, lda, ldda; magma_int_t size[10] = { 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000 }; magma_int_t i, j, k, info; float mz_one = MAGMA_S_NEG_ONE; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; float work[1], matnorm, diffnorm; magma_int_t num_gpus0 = 1, num_gpus, flag = 0; int nb, mb, n_local, nk; magma_uplo_t uplo = MagmaLower; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0){ N = atoi(argv[++i]); if (N>0) { size[0] = size[9] = N; flag = 1; }else exit(1); } if(strcmp("-NGPU", argv[i])==0) num_gpus0 = atoi(argv[++i]); if(strcmp("-UPLO", argv[i])==0){ if(strcmp("L", argv[++i])==0){ uplo = MagmaLower; }else{ uplo = MagmaUpper; } } } } else { printf("\nUsage: \n"); printf(" testing_spotrf_mgpu -N %d -NGPU %d -UPLO -L\n\n", 1024, num_gpus0); } /* looking for max. ldda */ ldda = 0; n2 = 0; for(i=0;i<10;i++){ N = size[i]; nb = magma_get_spotrf_nb(N); mb = nb; if(num_gpus0 > N/nb){ num_gpus = N/nb; if(N%nb != 0) num_gpus ++; }else{ num_gpus = num_gpus0; } n_local = nb*(1+N/(nb*num_gpus))*mb*((N+mb-1)/mb); if(n_local > ldda) ldda = n_local; if(n2 < N*N) n2 = N*N; if(flag != 0) break; } /* Allocate host memory for the matrix */ TESTING_MALLOC_PIN( h_A, float, n2 ); TESTING_MALLOC_PIN( h_R, float, n2 ); /* Initialize */ magma_queue_t queues[MagmaMaxGPUs * 2]; //magma_queue_t queues[MagmaMaxGPUs]; magma_device_t devices[ MagmaMaxGPUs ]; int num = 0; magma_err_t err; magma_init(); err = magma_get_devices( devices, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } for(i=0;i<num_gpus;i++){ err = magma_queue_create( devices[i], &queues[2*i] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } err = magma_queue_create( devices[i], &queues[2*i+1] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } } printf("each buffer size: %d\n", ldda); /* allocate local matrix on Buffers */ for(i=0; i<num_gpus0; i++){ TESTING_MALLOC_DEV( d_lA[i], float, ldda ); } printf("\n\n"); printf("Using GPUs: %d\n", num_gpus0); if(uplo == MagmaUpper){ printf("\n testing_spotrf_mgpu -N %d -NGPU %d -UPLO U\n\n", N, num_gpus0); }else{ printf("\n testing_spotrf_mgpu -N %d -NGPU %d -UPLO L\n\n", N, num_gpus0); } printf(" N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R_magma-R_lapack||_F / ||R_lapack||_F\n"); printf("========================================================================================\n"); for(i=0; i<10; i++){ N = size[i]; lda = N; n2 = lda*N; ldda = ((N+31)/32)*32; gflops = FLOPS( (float)N ) * 1e-9; /* Initialize the matrix */ lapackf77_slarnv( &ione, ISEED, &n2, h_A ); /* Symmetrize and increase the diagonal */ for( int i = 0; i < N; ++i ) { MAGMA_S_SET2REAL( h_A(i,i), MAGMA_S_REAL(h_A(i,i)) + N ); for( int j = 0; j < i; ++j ) { h_A(i, j) = MAGMA_S_CNJG( h_A(j,i) ); } } lapackf77_slacpy( MagmaFullStr, &N, &N, h_A, &lda, h_R, &lda ); /* Warm up to measure the performance */ nb = magma_get_spotrf_nb(N); if(num_gpus0 > N/nb){ num_gpus = N/nb; if(N%nb != 0) num_gpus ++; printf("too many GPUs for the matrix size, using %d GPUs\n", (int)num_gpus); }else{ num_gpus = num_gpus0; } /* distribute matrix to gpus */ if(uplo == MagmaUpper){ // Upper ldda = ((N+mb-1)/mb)*mb; for(j=0;j<N;j+=nb){ k = (j/nb)%num_gpus; nk = min(nb, N-j); magma_ssetmatrix(N, nk, &h_A[j*lda], 0, lda, d_lA[k], j/(nb*num_gpus)*nb*ldda, ldda, queues[2*k]); } }else{ // Lower ldda = (1+N/(nb*num_gpus))*nb; for(j=0;j<N;j+=nb){ k = (j/nb)%num_gpus; nk = min(nb, N-j); magma_ssetmatrix(nk, N, &h_A[j], 0, lda, d_lA[k], (j/(nb*num_gpus)*nb), ldda, queues[2*k]); } } magma_spotrf_mgpu( num_gpus, uplo, N, d_lA, 0, ldda, &info, queues ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ /* distribute matrix to gpus */ if(uplo == MagmaUpper){ // Upper ldda = ((N+mb-1)/mb)*mb; for(j=0;j<N;j+=nb){ k = (j/nb)%num_gpus; nk = min(nb, N-j); magma_ssetmatrix(N, nk, &h_A[j*lda], 0, lda, d_lA[k], j/(nb*num_gpus)*nb*ldda, ldda, queues[2*k]); } }else{ // Lower ldda = (1+N/(nb*num_gpus))*nb; for(j=0;j<N;j+=nb){ k = (j/nb)%num_gpus; nk = min(nb, N-j); magma_ssetmatrix(nk, N, &h_A[j], 0, lda, d_lA[k], (j/(nb*num_gpus)*nb), ldda, queues[2*k]); } } gpu_time = magma_wtime(); magma_spotrf_mgpu( num_gpus, uplo, N, d_lA, 0, ldda, &info, queues ); gpu_time = magma_wtime() - gpu_time; if (info != 0) printf( "magma_spotrf had error %d.\n", info ); gpu_perf = gflops / gpu_time; /* gather matrix from gpus */ if(uplo==MagmaUpper){ // Upper for(j=0;j<N;j+=nb){ k = (j/nb)%num_gpus; nk = min(nb, N-j); magma_sgetmatrix(N, nk, d_lA[k], j/(nb*num_gpus)*nb*ldda, ldda, &h_R[j*lda], 0, lda, queues[2*k]); } }else{ // Lower for(j=0; j<N; j+=nb){ k = (j/nb)%num_gpus; nk = min(nb, N-j); magma_sgetmatrix( nk, N, d_lA[k], (j/(nb*num_gpus)*nb), ldda, &h_R[j], 0, lda, queues[2*k] ); } } /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); if(uplo == MagmaLower){ lapackf77_spotrf( MagmaLowerStr, &N, h_A, &lda, &info ); }else{ lapackf77_spotrf( MagmaUpperStr, &N, h_A, &lda, &info ); } cpu_time = magma_wtime() - cpu_time; if (info != 0) printf( "lapackf77_spotrf had error %d.\n", info ); cpu_perf = gflops / cpu_time; /* ===================================================================== Check the result compared to LAPACK |R_magma - R_lapack| / |R_lapack| =================================================================== */ matnorm = lapackf77_slange("f", &N, &N, h_A, &lda, work); blasf77_saxpy(&n2, &mz_one, h_A, &ione, h_R, &ione); diffnorm = lapackf77_slange("f", &N, &N, h_R, &lda, work); printf( "%5d %6.2f (%6.2f) %6.2f (%6.2f) %e\n", N, cpu_perf, cpu_time, gpu_perf, gpu_time, diffnorm / matnorm ); if (flag != 0) break; } /* clean up */ TESTING_FREE_PIN( h_A ); TESTING_FREE_PIN( h_R ); for(i=0;i<num_gpus;i++){ TESTING_FREE_DEV( d_lA[i] ); magma_queue_destroy( queues[2*i] ); magma_queue_destroy( queues[2*i+1] ); } magma_finalize(); }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zgehrd2 */ int main( int argc, char** argv) { TESTING_CUDA_INIT(); magma_timestr_t start, end; double eps, flops, gpu_perf, cpu_perf; cuDoubleComplex *h_A, *h_R, *h_Q, *h_work, *tau, *twork, *dT; double *rwork; double result[2] = {0., 0.}; /* Matrix size */ magma_int_t N=0, n2, lda, nb, lwork, ltwork, once = 0; magma_int_t size[10] = {1024,2048,3072,4032,5184,6016,7040,8064,9088,10112}; magma_int_t i, info, checkres; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) { N = atoi(argv[++i]); once = true; } } if ( N > 0 ) printf(" testing_zgehrd -N %d\n\n", (int) N); else { printf("\nUsage: \n"); printf(" testing_zgehrd -N %d\n\n", 1024); exit(1); } } else { printf("\nUsage: \n"); printf(" testing_zgehrd -N %d\n\n", 1024); N = size[9]; } checkres = getenv("MAGMA_TESTINGS_CHECK") != NULL; eps = lapackf77_dlamch( "E" ); lda = N; n2 = N*lda; nb = magma_get_zgehrd_nb(N); /* We suppose the magma nb is bigger than lapack nb */ lwork = N*nb; TESTING_MALLOC ( h_A , cuDoubleComplex, n2 ); TESTING_MALLOC ( tau , cuDoubleComplex, N ); TESTING_HOSTALLOC( h_R , cuDoubleComplex, n2 ); TESTING_HOSTALLOC( h_work, cuDoubleComplex, lwork ); TESTING_DEVALLOC ( dT , cuDoubleComplex, nb*N ); /* To avoid uninitialized variable warning */ h_Q = NULL; twork = NULL; rwork = NULL; if ( checkres ) { ltwork = 2*(N*N); TESTING_HOSTALLOC( h_Q, cuDoubleComplex, lda*N ); TESTING_MALLOC( twork, cuDoubleComplex, ltwork ); #if defined(PRECISION_z) || defined(PRECISION_c) TESTING_MALLOC( rwork, double, N ); #endif } printf(" N CPU GFlop/s GPU GFlop/s |A-QHQ'|/N|A| |I-QQ'|/N \n"); printf("=============================================================\n"); for(i=0; i<10; i++){ if ( !once ) { N = size[i]; } lda = N; n2 = lda*N; flops = FLOPS( (double)N ) / 1e6; /* Initialize the matrices */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ start = get_current_time(); magma_zgehrd ( N, ione, N, h_R, lda, tau, h_work, lwork, dT, &info); end = get_current_time(); if ( info < 0 ) printf("Argument %d of magma_zgehrd had an illegal value\n", (int) -info); gpu_perf = flops / GetTimerValue(start,end); /* ===================================================================== Check the factorization =================================================================== */ if ( checkres ) { lapackf77_zlacpy(MagmaUpperLowerStr, &N, &N, h_R, &lda, h_Q, &lda); { int i, j; for(j=0; j<N-1; j++) for(i=j+2; i<lda; i++) h_R[i+j*lda] = MAGMA_Z_ZERO; } nb = magma_get_zgehrd_nb(N); magma_zunghr(N, ione, N, h_Q, lda, tau, dT, nb, &info); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_zhst01(&N, &ione, &N, h_A, &lda, h_R, &lda, h_Q, &lda, twork, <work, rwork, result); #else lapackf77_zhst01(&N, &ione, &N, h_A, &lda, h_R, &lda, h_Q, &lda, twork, <work, result); #endif } /* ===================================================================== Performs operation using LAPACK =================================================================== */ start = get_current_time(); lapackf77_zgehrd(&N, &ione, &N, h_R, &lda, tau, h_work, &lwork, &info); end = get_current_time(); if (info < 0) printf("Argument %d of lapack_zgehrd had an illegal value.\n", (int) -info); cpu_perf = flops / GetTimerValue(start,end); /* ===================================================================== Print performance and error. =================================================================== */ if ( checkres ) { printf("%5d %6.2f %6.2f %e %e\n", (int) N, cpu_perf, gpu_perf, result[0]*eps, result[1]*eps ); } else { printf("%5d %6.2f %6.2f\n", (int) N, cpu_perf, gpu_perf ); } if ( once ) break; } /* Memory clean up */ TESTING_FREE ( h_A ); TESTING_FREE ( tau ); TESTING_HOSTFREE( h_work); TESTING_HOSTFREE( h_R ); TESTING_DEVFREE ( dT ); if ( checkres ) { TESTING_HOSTFREE( h_Q ); TESTING_FREE( twork ); #if defined(PRECISION_z) || defined(PRECISION_c) TESTING_FREE( rwork ); #endif } /* Shutdown */ TESTING_CUDA_FINALIZE(); return EXIT_SUCCESS; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zpotrf_mgpu */ int main( int argc, char** argv) { TESTING_CUDA_INIT(); magma_setdevice(0); magma_timestr_t start, end; double flops, gpu_perf, cpu_perf; cuDoubleComplex *h_A, *h_R; cuDoubleComplex *d_lA[4]; magma_int_t N = 0, n2, mb, nb, nk, lda, ldda, n_local, ldn_local; //magma_int_t size[10] = {1000,2000,3000,4000,5000,6000,7000,8000,9000,10000}; magma_int_t size[10] = {1024,2048,3072,4032,5184,6016,7040,8064,9088,10112}; magma_int_t n_sizes = 10, flag = 0; magma_int_t i, j, k, info, num_gpus0 = 1, num_gpus; const char *uplo = MagmaLowerStr; cuDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; double work[1], matnorm; N = size[n_sizes-1]; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) { flag = 1; N = atoi(argv[++i]); size[0] = size[n_sizes-1] = N; } if (strcmp("-NGPU", argv[i])==0) num_gpus0 = atoi(argv[++i]); if (strcmp("-UPLO",argv[i])==0) { if (strcmp("L",argv[++i])==0) uplo = MagmaLowerStr; else uplo = MagmaUpperStr; } } if (strcmp(uplo,MagmaLowerStr)==0) printf("\n testing_zpotrf_mgpu -N %d -NGPU %d -UPLO L\n\n", (int) N, (int) num_gpus0 ); else printf("\n testing_zpotrf_mgpu -N %d -NGPU %d -UPLO U\n\n", (int) N, (int) num_gpus0 ); } else { printf("\nDefault: \n"); printf(" testing_zpotrf_mgpu -N %d:%d -NGPU %d -UPLO L\n\n", (int) size[0], (int) size[n_sizes-1], (int) num_gpus0 ); } if( N <= 0 || num_gpus0 <= 0 ) { printf( " invalid input N=%d NGPU=%d\n", (int) N, (int) num_gpus0 ); exit(1); } /* looking for max. ldda */ ldda = 0; n2 = 0; for(i=0; i<n_sizes; i++){ N = size[i]; nb = magma_get_zpotrf_nb(N); mb = nb; if( num_gpus0 > N/nb ) { num_gpus = N/nb; if( N%nb != 0 ) num_gpus ++; } else { num_gpus = num_gpus0; } n_local = nb*(1+N/(nb*num_gpus)) * mb*((N+mb-1)/mb); if( n_local > ldda ) ldda = n_local; if( n2 < N*N ) n2 = N*N; if (flag != 0) break; } /* Allocate host memory for the matrix */ TESTING_HOSTALLOC( h_A, cuDoubleComplex, n2); TESTING_HOSTALLOC( h_R, cuDoubleComplex, n2); /* allocate local matrix on GPU */ for(i=0; i<num_gpus0; i++){ magma_setdevice(i); TESTING_DEVALLOC( d_lA[i], cuDoubleComplex, ldda ); } magma_setdevice(0); printf(" N CPU GFlop/s GPU GFlop/s ||R||_F / ||A||_F\n"); printf("========================================================\n"); for(i=0; i<n_sizes; i++){ N = size[i]; lda = N; n2 = lda*N; flops = FLOPS( (double)N ) / 1000000; /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); /* Symmetrize and increase the diagonal */ { magma_int_t i, j; for(i=0; i<N; i++) { MAGMA_Z_SET2REAL( h_A[i*lda+i], ( MAGMA_Z_REAL(h_A[i*lda+i]) + 1.*N ) ); for(j=0; j<i; j++) h_A[i*lda+j] = cuConj(h_A[j*lda+i]); } } lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ nb = magma_get_zpotrf_nb(N); if( num_gpus0 > N/nb ) { num_gpus = N/nb; if( N%nb != 0 ) num_gpus ++; printf( " * too many GPUs for the matrix size, using %d GPUs\n", (int) num_gpus ); } else { num_gpus = num_gpus0; } /* distribute matrix to gpus */ if( lapackf77_lsame(uplo, "U") ) { /* going through each block-column */ ldda = ((N+mb-1)/mb)*mb; for(j=0; j<N; j+=nb){ k = (j/nb)%num_gpus; magma_setdevice(k); nk = min(nb, N-j); magma_zsetmatrix( N, nk, h_A+j*lda, lda, d_lA[k]+j/(nb*num_gpus)*nb*ldda, ldda ); } } else { /* going through each block-row */ ldda = (1+N/(nb*num_gpus))*nb; for(j=0; j<N; j+=nb){ k = (j/nb)%num_gpus; magma_setdevice(k); nk = min(nb, N-j); magma_zsetmatrix( nk, N, h_A+j, lda, d_lA[k]+j/(nb*num_gpus)*nb, ldda ); } } magma_setdevice(0); /* call magma_zpotrf_mgpu */ start = get_current_time(); magma_zpotrf_mgpu(num_gpus, uplo[0], N, d_lA, ldda, &info); end = get_current_time(); if (info < 0) { printf("Argument %d of magma_zpotrf_mgpu had an illegal value.\n", (int) -info); break; } else if (info != 0) { printf("magma_zpotrf_mgpu returned info=%d\n", (int) info ); break; } gpu_perf = flops / GetTimerValue(start, end); /* gather matrix from gpus */ if( lapackf77_lsame(uplo, "U") ) { for(j=0; j<N; j+=nb){ k = (j/nb)%num_gpus; magma_setdevice(k); nk = min(nb, N-j); magma_zgetmatrix( N, nk, d_lA[k]+j/(nb*num_gpus)*nb*ldda, ldda, h_R+j*lda, lda ); } } else { for(j=0; j<N; j+=nb){ k = (j/nb)%num_gpus; magma_setdevice(k); nk = min(nb, N-j); magma_zgetmatrix( nk, N, d_lA[k]+j/(nb*num_gpus)*nb, ldda, h_R+j, lda ); } } magma_setdevice(0); /* ===================================================================== Performs operation using LAPACK =================================================================== */ start = get_current_time(); lapackf77_zpotrf(uplo, &N, h_A, &lda, &info); end = get_current_time(); if (info < 0) { printf("Argument %d of zpotrf had an illegal value.\n", (int) -info); break; } else if (info != 0) { printf("lapackf77_zpotrf returned info=%d\n", (int) info ); break; } cpu_perf = flops / GetTimerValue(start, end); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ matnorm = lapackf77_zlange("f", &N, &N, h_A, &lda, work); blasf77_zaxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione); printf("%5d %6.2f %6.2f %e\n", (int) size[i], cpu_perf, gpu_perf, lapackf77_zlange("f", &N, &N, h_R, &lda, work) / matnorm); if (flag != 0) break; } /* Memory clean up */ TESTING_HOSTFREE( h_A ); TESTING_HOSTFREE( h_R ); for(i=0; i<num_gpus; i++){ magma_setdevice(i); TESTING_DEVFREE( d_lA[i] ); } /* Shutdown */ TESTING_CUDA_FINALIZE(); }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zgeqrf */ int main( magma_int_t argc, char** argv) { magma_int_t nquarkthreads=2; magma_int_t nthreads=2; magma_int_t num_gpus = 1; TRACE = 0; //magma_qr_params mp; cuDoubleComplex *h_A, *h_R, *h_work, *tau; double gpu_perf, cpu_perf, flops; magma_timestr_t start, end; magma_qr_params *mp = (magma_qr_params*)malloc(sizeof(magma_qr_params)); /* Matrix size */ magma_int_t M=0, N=0, n2; magma_int_t size[10] = {1024,2048,3072,4032,5184,6016,7040,8064,9088,10112}; cublasStatus status; magma_int_t i, j, info; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; mp->nb=-1; mp->ob=-1; mp->fb=-1; mp->ib=32; magma_int_t loop = argc; magma_int_t accuracyflag = 1; char precision; magma_int_t nc = -1; magma_int_t ncps = -1; if (argc != 1) { for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) N = atoi(argv[++i]); else if (strcmp("-M", argv[i])==0) M = atoi(argv[++i]); else if (strcmp("-F", argv[i])==0) mp->fb = atoi(argv[++i]); else if (strcmp("-O", argv[i])==0) mp->ob = atoi(argv[++i]); else if (strcmp("-B", argv[i])==0) mp->nb = atoi(argv[++i]); else if (strcmp("-b", argv[i])==0) mp->ib = atoi(argv[++i]); else if (strcmp("-A", argv[i])==0) accuracyflag = atoi(argv[++i]); else if (strcmp("-P", argv[i])==0) nthreads = atoi(argv[++i]); else if (strcmp("-Q", argv[i])==0) nquarkthreads = atoi(argv[++i]); else if (strcmp("-nc", argv[i])==0) nc = atoi(argv[++i]); else if (strcmp("-ncps", argv[i])==0) ncps = atoi(argv[++i]); } if ((M>0 && N>0) || (M==0 && N==0)) { printf(" testing_zgeqrf-v2 -M %d -N %d\n\n", M, N); if (M==0 && N==0) { M = N = size[9]; loop = 1; } } else { printf("\nUsage: \n"); printf(" Make sure you set the number of BLAS threads to 1, e.g.,\n"); printf(" > setenv MKL_NUM_THREADS 1\n"); printf(" > testing_zgeqrf-v2 -M %d -N %d -B 128 -T 1\n\n", 1024, 1024); exit(1); } } else { printf("\nUsage: \n"); printf(" Make sure you set the number of BLAS threads to 1, e.g.,\n"); printf(" > setenv MKL_NUM_THREADS 1\n"); printf(" Set number of cores per socket and number of cores.\n"); printf(" > testing_zgeqrf-v2 -M %d -N %d -ncps 6 -nc 12\n\n", 1024, 1024); printf(" Alternatively, set:\n"); printf(" Q: Number of threads for panel factorization.\n"); printf(" P: Number of threads for trailing matrix update (CPU).\n"); printf(" B: Block size.\n"); printf(" b: Inner block size.\n"); printf(" O: Block size for trailing matrix update (CPU).\n"); printf(" > testing_zgeqrf-v2 -M %d -N %d -Q 4 -P 4 -B 128 -b 32 -O 200\n\n", 10112, 10112); M = N = size[9]; } /* Auto tune based on number of cores and number of cores per socket if provided */ if ((nc > 0) && (ncps > 0)) { precision = 's'; #if (defined(PRECISION_d)) precision = 'd'; #endif #if (defined(PRECISION_c)) precision = 'c'; #endif #if (defined(PRECISION_z)) precision = 'z'; #endif auto_tune('q', precision, nc, ncps, M, N, &(mp->nb), &(mp->ob), &(mp->ib), &nthreads, &nquarkthreads); fprintf(stderr,"%d %d %d %d %d\n",mp->nb,mp->ob,mp->ib,nquarkthreads,nthreads); } /* Initialize MAGMA hardware context, seeting how many CPU cores and how many GPUs to be used in the consequent computations */ mp->sync0 = 0; magma_context *context; context = magma_init((void*)(mp),cpu_thread, nthreads, nquarkthreads, num_gpus, argc, argv); context->params = (void *)(mp); mp->sync1 = (volatile magma_int_t *) malloc (sizeof(int)*nthreads); for (i = 0; i < nthreads; i++) mp->sync1[i] = 0; n2 = M * N; magma_int_t min_mn = min(M, N); magma_int_t nb = magma_get_zgeqrf_nb(min_mn); magma_int_t lwork = N*nb; /* Allocate host memory for the matrix */ TESTING_MALLOC ( h_A , cuDoubleComplex, n2 ); TESTING_MALLOC ( tau , cuDoubleComplex, min_mn); TESTING_HOSTALLOC( h_R , cuDoubleComplex, n2 ); TESTING_HOSTALLOC(h_work, cuDoubleComplex, lwork ); printf("\n\n"); printf(" M N CPU GFlop/s GPU GFlop/s ||R||_F / ||A||_F\n"); printf("==========================================================\n"); for(i=0; i<10; i++){ if (loop==1){ M = N = min_mn = size[i]; n2 = M*N; } flops = FLOPS( (double)M, (double)N ) / 1000000; /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); lapackf77_zlacpy( MagmaUpperLowerStr, &M, &N, h_A, &M, h_R, &M ); //magma_zgeqrf(M, N, h_R, M, tau, h_work, lwork, &info); for(j=0; j<n2; j++) h_R[j] = h_A[j]; /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_qr_init(mp, M, N, h_R, nthreads); start = get_current_time(); magma_zgeqrf3(context, M, N, h_R, M, tau, h_work, lwork, &info); end = get_current_time(); gpu_perf = flops / GetTimerValue(start, end); /* ===================================================================== Performs operation using LAPACK =================================================================== */ start = get_current_time(); if (accuracyflag == 1) lapackf77_zgeqrf(&M, &N, h_A, &M, tau, h_work, &lwork, &info); end = get_current_time(); if (info < 0) printf("Argument %d of zgeqrf had an illegal value.\n", -info); cpu_perf = 4.*M*N*min_mn/(3.*1000000*GetTimerValue(start,end)); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ double work[1], matnorm = 1.; cuDoubleComplex mone = MAGMA_Z_NEG_ONE; magma_int_t one = 1; if (accuracyflag == 1){ matnorm = lapackf77_zlange("f", &M, &N, h_A, &M, work); blasf77_zaxpy(&n2, &mone, h_A, &one, h_R, &one); } if (accuracyflag == 1){ printf("%5d %5d %6.2f %6.2f %e\n", M, N, cpu_perf, gpu_perf, lapackf77_zlange("f", &M, &N, h_R, &M, work) / matnorm); } else { printf("%5d %5d %6.2f \n", M, N, gpu_perf); } if (loop != 1) break; } /* Memory clean up */ TESTING_FREE ( h_A ); TESTING_FREE ( tau ); TESTING_HOSTFREE(h_work); TESTING_HOSTFREE( h_R ); /* Shut down the MAGMA context */ magma_finalize(context); }
int main( int argc, char** argv) { real_Double_t gflops, gpu_perf, cpu_perf, gpu_time, cpu_time; float error, work[1]; int transA = MagmaNoTrans; int transB = MagmaNoTrans; float Cnorm; magma_int_t istart = 1024; magma_int_t iend = 8194; magma_int_t M, M0 = 0; magma_int_t N, N0 = 0; magma_int_t K, K0 = 0; magma_int_t i; magma_int_t Am, An, Bm, Bn; magma_int_t szeA, szeB, szeC; magma_int_t lda, ldb, ldc, ldda, lddb, lddc; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magmaFloatComplex *h_A, *h_B, *h_C, *h_C2; magmaFloatComplex_ptr d_A, d_B, d_C; magmaFloatComplex mzone = MAGMA_C_NEG_ONE; magmaFloatComplex alpha = MAGMA_C_MAKE( 0.29, -0.86 ); magmaFloatComplex beta = MAGMA_C_MAKE( -0.48, 0.38 ); if (argc != 1){ for(i=1; i<argc; i++){ if ( strcmp("-N", argv[i]) == 0 ){ N0 = atoi(argv[++i]); } else if ( strcmp("-M", argv[i]) == 0 ){ M0 = atoi(argv[++i]); } else if ( strcmp("-K", argv[i]) == 0 ){ K0 = atoi(argv[++i]); } else if (strcmp("-NN", argv[i])==0){ transA = transB = MagmaNoTrans; } else if (strcmp("-TT", argv[i])==0){ transA = transB = MagmaTrans; } else if (strcmp("-NT", argv[i])==0){ transA = MagmaNoTrans; transB = MagmaTrans; } else if (strcmp("-TN", argv[i])==0){ transA = MagmaTrans; transB = MagmaNoTrans; } #if defined(PRECISION_z) || defined(PRECISION_c) else if (strcmp("-NC", argv[i])==0){ transA = MagmaNoTrans; transB = MagmaConjTrans; } else if (strcmp("-TC", argv[i])==0){ transA = MagmaTrans; transB = MagmaConjTrans; } else if (strcmp("-CN", argv[i])==0){ transA = MagmaConjTrans; transB = MagmaNoTrans; } else if (strcmp("-CT", argv[i])==0){ transA = MagmaConjTrans; transB = MagmaTrans; } else if (strcmp("-CC", argv[i])==0){ transA = transB = MagmaConjTrans; } #endif } } if ( (M0 != 0) && (N0 != 0) && (K0 != 0) ) iend = istart + 1; M = N = K = iend; if ( M0 != 0 ) M = M0; if ( N0 != 0 ) N = N0; if ( K0 != 0 ) K = K0; if( transA == MagmaNoTrans ) { Am = M; An = K; } else { Am = K; An = M; } if( transB == MagmaNoTrans ) { Bm = K; Bn = N; } else { Bm = N; Bn = K; } /* Initialize */ magma_queue_t queue; magma_device_t device[ MagmaMaxGPUs ]; int num = 0; magma_err_t err; magma_init(); err = magma_get_devices( device, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } err = magma_queue_create( device[0], &queue ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } lda = ldc = M; ldb = Bm; ldda = lddc = ((M+31)/32)*32; lddb = ((ldb+31)/32)*32; K+=32; M+=32; N +=32; TESTING_MALLOC_CPU( h_A, magmaFloatComplex, lda*K ); TESTING_MALLOC_CPU( h_B, magmaFloatComplex, ldb*Bn ); TESTING_MALLOC_CPU( h_C, magmaFloatComplex, ldc*N ); TESTING_MALLOC_CPU( h_C2, magmaFloatComplex, ldc*N ); TESTING_MALLOC_DEV( d_A, magmaFloatComplex, ldda*K ); TESTING_MALLOC_DEV( d_B, magmaFloatComplex, lddb*Bn ); TESTING_MALLOC_DEV( d_C, magmaFloatComplex, lddc*N ); printf("\nUsage: \n"); printf(" testing_cgemm [-NN|NT|TN|TT] [-N %d] \n\n", 1024); printf("\n"); printf("Testing transA = %c transB = %c\n", transA, transB); printf(" M N K clAmdBlas GFLop/s (sec) CPU GFlop/s (sec) error\n"); printf("===========================================================================\n"); for(i=istart; i<iend; i = (int)(i*1.25) ) { M = N = K = i; if ( M0 != 0 ) M = M0; if ( N0 != 0 ) N = N0; if ( K0 != 0 ) K = K0; if( transA == MagmaNoTrans ) { lda = Am = M; An = K; } else { lda = Am = K; An = M; } if( transB == MagmaNoTrans ) { ldb = Bm = K; Bn = N; } else { ldb = Bm = N; Bn = K; } gflops = FLOPS( (float)M, (float)N, (float)K ) * 1e-9; ldc = M; ldda = ((lda+31)/32)*32; lddb = ((ldb+31)/32)*32; lddc = ((ldc+31)/32)*32; szeA = lda * An; szeB = ldb * Bn; szeC = ldc * N; /* Initialize the matrices */ lapackf77_clarnv( &ione, ISEED, &szeA, h_A ); lapackf77_clarnv( &ione, ISEED, &szeB, h_B ); lapackf77_clarnv( &ione, ISEED, &szeC, h_C ); /* ===================================================================== Performs operation using MAGMA-BLAS =================================================================== */ magma_csetmatrix( Am, An, h_A, 0, lda, d_A, 0, ldda, queue ); magma_csetmatrix( Bm, Bn, h_B, 0, ldb, d_B, 0, lddb, queue ); magma_csetmatrix( M, N, h_C, 0, ldc, d_C, 0, lddc, queue ); magma_cgemm( transA, transB, M, N, K, alpha, d_A, 0, ldda, d_B, 0, lddb, beta, d_C, 0, lddc, queue ); magma_csetmatrix( M, N, h_C, 0, ldc, d_C, 0, lddc, queue ); magma_queue_sync( queue ); gpu_time = magma_wtime(); magma_cgemm( transA, transB, M, N, K, alpha, d_A, 0, ldda, d_B, 0, lddb, beta, d_C, 0, lddc, queue ); magma_queue_sync( queue); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; magma_cgetmatrix( M, N, d_C, 0, lddc, h_C2, 0, ldc, queue ); /* ===================================================================== Performs operation using CPU-BLAS =================================================================== */ cpu_time = magma_wtime(); blasf77_cgemm( lapack_const(transA), lapack_const(transB), &M, &N, &K, &alpha, h_A, &lda, h_B, &ldb, &beta, h_C, &ldc ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; // |C_magma - C_lapack| / |C_lapack| Cnorm = lapackf77_clange( "M", &M, &N, h_C, &ldc, work ); /* ===================================================================== Error Computation and Performance Compariosn =================================================================== */ blasf77_caxpy(&szeC, &mzone, h_C, &ione, h_C2, &ione); error = lapackf77_clange("M", &M, &N, h_C2, &ldc, work)/Cnorm; printf("%5d %5d %5d %8.2f (%6.2f) %6.2f (%6.2f) %e\n", M, N, K, gpu_perf, gpu_time, cpu_perf, cpu_time, error); } /* Memory clean up */ TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_C ); TESTING_FREE_CPU( h_C2 ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); TESTING_FREE_DEV( d_C ); magma_queue_destroy( queue ); magma_finalize(); }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zgetrf */ int main( int argc, char** argv) { real_Double_t gflops, gpu_perf, cpu_perf, gpu_time, cpu_time, error; magmaDoubleComplex *h_A, *h_R; magmaDoubleComplex_ptr d_A; magma_int_t *ipiv; /* Matrix size */ magma_int_t M = 0, N = 0, n2, lda, ldda; #if defined (PRECISION_z) magma_int_t size[10] = {1024,2048,3072,4032,4992,5952,7000,7000,7000,7000}; #else magma_int_t size[10] = {1024,2048,3072,4032,4992,5952,7104,8064,9000,10000}; #endif magma_int_t i, info, min_mn; //magma_int_t nb, maxn, ret; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) N = atoi(argv[++i]); else if (strcmp("-M", argv[i])==0) M = atoi(argv[++i]); } if (M>0 && N>0) printf(" testing_zgetrf -M %d -N %d\n\n", M, N); else { printf("\nUsage: \n"); printf(" testing_zgetrf -M %d -N %d\n\n", 1024, 1024); exit(1); } } else { printf("\nUsage: \n"); printf(" testing_zgetrf_gpu -M %d -N %d\n\n", 1024, 1024); M = N = size[9]; } /* Initialize */ magma_queue_t queue; magma_device_t device[ MagmaMaxGPUs ]; int num = 0; magma_err_t err; magma_init(); err = magma_get_devices( device, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } err = magma_queue_create( device[0], &queue ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } ldda = ((M+31)/32)*32; //maxn = ((N+31)/32)*32; n2 = M * N; min_mn = min(M, N); //nb = magma_get_zgetrf_nb(min_mn); /* Allocate host memory for the matrix */ TESTING_MALLOC_CPU( ipiv, magma_int_t, min_mn ); TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, n2 ); TESTING_MALLOC_PIN( h_R, magmaDoubleComplex, n2 ); TESTING_MALLOC_DEV( d_A, magmaDoubleComplex, ldda*N ); printf("\n\n"); printf(" M N CPU GFlop/ (sec)s GPU GFlop/s (sec) ||PA-LU||/(||A||*N)\n"); printf("========================================================================\n"); for(i=0; i<10; i++){ if (argc == 1){ M = N = size[i]; } min_mn= min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = FLOPS( (double)M, (double)N ) *1e-9; /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); lapackf77_zlacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_zgetrf(&M, &N, h_A, &lda, ipiv, &info); cpu_time = magma_wtime() - cpu_time; if (info < 0) printf("Argument %d of zgetrf had an illegal value.\n", -info); cpu_perf = gflops / cpu_time; /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_zsetmatrix( M, N, h_R, 0, lda, d_A, 0, ldda, queue ); magma_zgetrf_gpu( M, N, d_A, 0, ldda, ipiv, &info, queue ); magma_zsetmatrix( M, N, h_R, 0, lda, d_A, 0, ldda, queue ); gpu_time = magma_wtime(); magma_zgetrf_gpu( M, N, d_A, 0, ldda, ipiv, &info, queue ); gpu_time = magma_wtime() - gpu_time; if (info < 0) printf("Argument %d of zgetrf had an illegal value.\n", -info); gpu_perf = gflops / gpu_time; /* ===================================================================== Check the factorization =================================================================== */ magma_zgetmatrix( M, N, d_A, 0, ldda, h_A, 0, lda, queue ); error = get_LU_error(M, N, h_R, lda, h_A, ipiv); printf("%5d %5d %6.2f (%6.2f) %6.2f (%6.2f) %e\n", M, N, cpu_perf, cpu_time, gpu_perf, gpu_time, error); if (argc != 1) break; } /* clean up */ TESTING_FREE_CPU( ipiv ); TESTING_FREE_CPU( h_A ); TESTING_FREE_PIN( h_R ); TESTING_FREE_DEV( d_A ); magma_queue_destroy( queue ); magma_finalize(); }
/* //////////////////////////////////////////////////////////////////////////// -- Testing cgelqf */ int main( int argc, char** argv) { TESTING_CUDA_INIT(); magma_timestr_t start, end; float flops, gpu_perf, cpu_perf; float matnorm, work[1]; cuFloatComplex c_neg_one = MAGMA_C_NEG_ONE; cuFloatComplex *h_A, *h_R, *tau, *h_work, tmp[1]; /* Matrix size */ magma_int_t M = 0, N = 0, n2, lda, lwork; magma_int_t size[10] = {1024,2048,3072,4032,5184,6016,7040,8064,9088,10112}; magma_int_t i, info, min_mn, nb; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) N = atoi(argv[++i]); else if (strcmp("-M", argv[i])==0) M = atoi(argv[++i]); } if ( M == 0 ) { M = N; } if ( N == 0 ) { N = M; } if (N>0 && M>0) printf(" testing_cgelqf -M %d -N %d\n\n", (int) M, (int) N); else { printf("\nUsage: \n"); printf(" testing_cgelqf -M %d -N %d\n\n", (int) M, (int) N); exit(1); } } else { printf("\nUsage: \n"); printf(" testing_cgelqf -M %d -N %d\n\n", 1024, 1024); M = N = size[9]; } n2 = M * N; min_mn = min(M, N); nb = magma_get_cgeqrf_nb(M); TESTING_MALLOC( tau, cuFloatComplex, min_mn ); TESTING_MALLOC( h_A, cuFloatComplex, n2 ); TESTING_HOSTALLOC( h_R, cuFloatComplex, n2 ); lwork = -1; lapackf77_cgelqf(&M, &N, h_A, &M, tau, tmp, &lwork, &info); lwork = (magma_int_t)MAGMA_C_REAL( tmp[0] ); lwork = max( lwork, M*nb ); TESTING_HOSTALLOC( h_work, cuFloatComplex, lwork ); printf(" M N CPU GFlop/s GPU GFlop/s ||R||_F / ||A||_F\n"); printf("==========================================================\n"); for(i=0; i<10; i++){ if (argc == 1){ M = N = size[i]; } min_mn= min(M, N); lda = M; n2 = lda*N; flops = FLOPS( (float)M, (float)N ) / 1000000; /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &n2, h_A ); lapackf77_clacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ start = get_current_time(); magma_cgelqf( M, N, h_R, lda, tau, h_work, lwork, &info); end = get_current_time(); if (info < 0) printf("Argument %d of magma_cgelqf had an illegal value.\n", (int) -info); gpu_perf = flops / GetTimerValue(start, end); /* ===================================================================== Performs operation using LAPACK =================================================================== */ start = get_current_time(); lapackf77_cgelqf(&M, &N, h_A, &lda, tau, h_work, &lwork, &info); end = get_current_time(); if (info < 0) printf("Argument %d of lapack_cgelqf had an illegal value.\n", (int) -info); cpu_perf = flops / GetTimerValue(start, end); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ matnorm = lapackf77_clange("f", &M, &N, h_A, &lda, work); blasf77_caxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione); printf("%5d %5d %6.2f %6.2f %e\n", (int) M, (int) N, cpu_perf, gpu_perf, lapackf77_clange("f", &M, &N, h_R, &lda, work) / matnorm); if (argc != 1) break; } /* Memory clean up */ TESTING_FREE( tau ); TESTING_FREE( h_A ); TESTING_HOSTFREE( h_R ); TESTING_HOSTFREE( h_work ); /* Shutdown */ TESTING_CUDA_FINALIZE(); }
void trace(const real_t dtdx, const int n, const int Hscheme, const int Hnvar, const int Hnxyt, const int slices, const int Hstep, real_t q[Hnvar][Hstep][Hnxyt], real_t dq[Hnvar][Hstep][Hnxyt], real_t c[Hstep][Hnxyt], real_t qxm[Hnvar][Hstep][Hnxyt], real_t qxp[Hnvar][Hstep][Hnxyt]) { int ijmin, ijmax; int i, IN, s = slices; real_t zerol = 0.0, zeror = 0.0, project = 0.; WHERE("trace"); ijmin = 0; ijmax = n; // if (strcmp(Hscheme, "muscl") == 0) { // MUSCL-Hancock method if (Hscheme == HSCHEME_MUSCL) { // MUSCL-Hancock method zerol = -hundred / dtdx; zeror = hundred / dtdx; project = one; } // if (strcmp(Hscheme, "plmde") == 0) { // standard PLMDE if (Hscheme == HSCHEME_PLMDE) { // standard PLMDE zerol = zero; zeror = zero; project = one; } // if (strcmp(Hscheme, "collela") == 0) { // Collela's method if (Hscheme == HSCHEME_COLLELA) { // Collela's method zerol = zero; zeror = zero; project = zero; } #pragma ivdep for (i = ijmin + 1; i < ijmax - 1; i++) { real_t cc, csq, r, u, v, p; real_t dr, du, dv, dp; real_t alpham, alphap, alpha0r, alpha0v; real_t spminus, spplus, spzero; real_t apright, amright, azrright, azv1right; real_t apleft, amleft, azrleft, azv1left; cc = c[s][i]; csq = Square(cc); r = q[ID][s][i]; u = q[IU][s][i]; v = q[IV][s][i]; p = q[IP][s][i]; dr = dq[ID][s][i]; du = dq[IU][s][i]; dv = dq[IV][s][i]; dp = dq[IP][s][i]; alpham = half * (dp / (r * cc) - du) * r / cc; alphap = half * (dp / (r * cc) + du) * r / cc; alpha0r = dr - dp / csq; alpha0v = dv; // Right state spminus = (u - cc) * dtdx + one; spplus = (u + cc) * dtdx + one; spzero = u * dtdx + one; if ((u - cc) >= zeror) { spminus = project; } if ((u + cc) >= zeror) { spplus = project; } if (u >= zeror) { spzero = project; } apright = -half * spplus * alphap; amright = -half * spminus * alpham; azrright = -half * spzero * alpha0r; azv1right = -half * spzero * alpha0v; qxp[ID][s][i] = r + (apright + amright + azrright); qxp[IU][s][i] = u + (apright - amright) * cc / r; qxp[IV][s][i] = v + (azv1right); qxp[IP][s][i] = p + (apright + amright) * csq; // Left state spminus = (u - cc) * dtdx - one; spplus = (u + cc) * dtdx - one; spzero = u * dtdx - one; if ((u - cc) <= zerol) { spminus = -project; } if ((u + cc) <= zerol) { spplus = -project; } if (u <= zerol) { spzero = -project; } apleft = -half * spplus * alphap; amleft = -half * spminus * alpham; azrleft = -half * spzero * alpha0r; azv1left = -half * spzero * alpha0v; qxm[ID][s][i] = r + (apleft + amleft + azrleft); qxm[IU][s][i] = u + (apleft - amleft) * cc / r; qxm[IV][s][i] = v + (azv1left); qxm[IP][s][i] = p + (apleft + amleft) * csq; } { int nops = slices * ((ijmax - 1) - (ijmin + 1)); FLOPS(77 * nops, 7 * nops, 0 * nops, 0 * nops); } if (Hnvar > IP) { for (IN = IP + 1; IN < Hnvar; IN++) { #pragma ivdep for (i = ijmin + 1; i < ijmax - 1; i++) { real_t u, a; real_t da; real_t spzero; real_t acmpright; real_t acmpleft; u = q[IU][s][i]; a = q[IN][s][i]; da = dq[IN][s][i]; // Right state spzero = u * dtdx + one; if (u >= zeror) { spzero = project; } acmpright = -half * spzero * da; qxp[IN][s][i] = a + acmpright; // Left state spzero = u * dtdx - one; if (u <= zerol) { spzero = -project; } acmpleft = -half * spzero * da; qxm[IN][s][i] = a + acmpleft; } } } } // trace
/* //////////////////////////////////////////////////////////////////////////// -- Testing zgehrd2 */ int main( int argc, char** argv) { real_Double_t gflops, gpu_perf, cpu_perf, gpu_time, cpu_time; //*h_R1 is used for warm-up magmaDoubleComplex *h_A, *h_R, *h_Q, *h_work, *tau, *twork, *h_R1; magmaDoubleComplex_ptr dT; double *rwork; double result[2] = {0., 0.}; double eps; int checkres; checkres = getenv("MAGMA_TESTINGS_CHECK") != NULL; /* Matrix size */ int N=0, n2, lda, nb, lwork, ltwork, once = 0; #if defined (PRECISION_z) magma_int_t size[10] = {1024,2048,3072,4032,5184,6016,7000,7000,7000,7000}; #else magma_int_t size[10] = {1024,2048,3072,4032,5184,6016,7040,8064,9088,9900}; #endif int i, info; int ione = 1; int ISEED[4] = {0,0,0,1}; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) N = atoi(argv[++i]); } if ( N > 0 ){ printf(" testing_zgehrd -N %d\n\n", N); once = 1; } else { printf("\nUsage: \n"); printf(" testing_zgehrd -N %d\n\n", 1024); exit(1); } } else { printf("\nUsage: \n"); printf(" testing_zgehrd -N %d\n\n", 1024); N = size[9]; } /* Initialize */ magma_queue_t queue; magma_device_t device; int num = 0; magma_err_t err; magma_init(); err = magma_get_devices( &device, 1, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } err = magma_queue_create( device, &queue ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } eps = lapackf77_dlamch( "E" ); lda = N; n2 = N*lda; nb = magma_get_zgehrd_nb(N); /* We suppose the magma nb is bigger than lapack nb */ lwork = N*nb; TESTING_MALLOC_HOST( h_A , magmaDoubleComplex, n2 ); TESTING_MALLOC_HOST( tau , magmaDoubleComplex, N ); TESTING_MALLOC_HOST( h_R , magmaDoubleComplex, n2 ); TESTING_MALLOC_HOST( h_R1 , magmaDoubleComplex, n2 ); TESTING_MALLOC_HOST( h_work, magmaDoubleComplex, lwork ); TESTING_MALLOC_DEV ( dT , magmaDoubleComplex, nb*N ); /* To avoid uninitialized variable warning */ h_Q = NULL; twork = NULL; rwork = NULL; if ( checkres ) { ltwork = 2*(N*N); TESTING_MALLOC_HOST( h_Q, magmaDoubleComplex, lda*N ); TESTING_MALLOC_HOST( twork, magmaDoubleComplex, ltwork ); #if defined(PRECISION_z) || defined(PRECISION_c) TESTING_MALLOC_HOST( rwork, double, N ); #endif } printf("\n\n"); printf(" N CPU GFlop/s GPU GFlop/s |A-QHQ'|/N|A| |I-QQ'|/N \n"); printf("=============================================================\n"); for(i=0; i<10; i++){ if ( !once ) { N = size[i]; } lda = N; n2 = lda*N; gflops = FLOPS( (double)N ) / 1e9; /* Initialize the matrices */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R1, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_zgehrd ( N, ione, N, h_R1, lda, tau, h_work, lwork, dT, 0, &info, queue); if ( info < 0 ) printf("Argument %d of magma_zgehrd had an illegal value\n", -info); clFinish(queue); gpu_time = get_time(); magma_zgehrd ( N, ione, N, h_R, lda, tau, h_work, lwork, dT, 0, &info, queue); gpu_time = get_time() - gpu_time; if ( info < 0 ) printf("Argument %d of magma_zgehrd had an illegal value\n", -info); gpu_perf = gflops / gpu_time; /* ===================================================================== Check the factorization =================================================================== */ if ( checkres ) { lapackf77_zlacpy(MagmaUpperLowerStr, &N, &N, h_R, &lda, h_Q, &lda); { int i, j; for(j=0; j<N-1; j++) for(i=j+2; i<lda; i++) h_R[i+j*lda] = MAGMA_Z_ZERO; } nb = magma_get_zgehrd_nb(N); magma_zunghr(N, ione, N, h_Q, lda, tau, dT, 0, nb, &info, queue); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_zhst01(&N, &ione, &N, h_A, &lda, h_R, &lda, h_Q, &lda, twork, <work, rwork, result); #else lapackf77_zhst01(&N, &ione, &N, h_A, &lda, h_R, &lda, h_Q, &lda, twork, <work, result); #endif } /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = get_time(); lapackf77_zgehrd(&N, &ione, &N, h_A, &lda, tau, h_work, &lwork, &info); cpu_time = get_time() - cpu_time; if (info < 0) printf("Argument %d of lapack_zgehrd had an illegal value.\n", -info); cpu_perf = gflops / cpu_time; /* ===================================================================== Print performance and error. =================================================================== */ if ( checkres ) { printf("%5d %6.2f %6.2f %e %e\n", N, cpu_perf, gpu_perf, result[0]*eps, result[1]*eps ); } else { printf("%5d %6.2f %6.2f\n", N, cpu_perf, gpu_perf ); } if ( once ) break; } /* Memory clean up */ TESTING_FREE ( h_A ); TESTING_FREE ( tau ); TESTING_FREE_HOST( h_work); TESTING_FREE_HOST( h_R ); TESTING_FREE_HOST( h_R1 ); TESTING_FREE_DEV ( dT ); if ( checkres ) { TESTING_FREE_HOST( h_Q ); TESTING_FREE( twork ); #if defined(PRECISION_z) || defined(PRECISION_c) TESTING_FREE( rwork ); #endif } /* Shutdown */ magma_queue_destroy( queue ); magma_finalize(); return EXIT_SUCCESS; }
extern "C" magma_int_t magma_dgetrf_m(magma_int_t num_gpus0, magma_int_t m, magma_int_t n, double *a, magma_int_t lda, magma_int_t *ipiv, magma_int_t *info) { /* -- MAGMA (version 1.3.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver November 2010 Purpose ======= DGETRF_m computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. The matrix may not fit entirely in the GPU memory. The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm. Note: The factorization of big panel is done calling multiple-gpu-interface. Pivots are applied on GPU within the big panel. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. A (input/output) DOUBLE_PRECISION array, dimension (LDA,N) On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,M). IPIV (output) INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. ===================================================================== */ #define A(i,j) (a + (j)*lda + (i)) #define inAT(d,i,j) (dAT[d] + (i)*nb*ldn_local + (j)*nb) #define inPT(d,i,j) (dPT[d] + (i)*nb*nb + (j)*nb*maxm) /* Flops formula */ //#define PROFILE #ifdef PROFILE double flops, time_rmajor = 0, time_rmajor2 = 0, time_rmajor3 = 0, time_mem = 0; magma_timestr_t start, start1, start2, end1, end, start0 = get_current_time(); #define FMULS_GETRF(__m, __n) ( ((__m) < (__n)) ? (0.5 * (__m) * ((__m) * ((__n) - (1./3.) * (__m) - 1. ) + (__n)) + (2. / 3.) * (__m)) \ : (0.5 * (__n) * ((__n) * ((__m) - (1./3.) * (__n) - 1. ) + (__m)) + (2. / 3.) * (__n)) ) #define FADDS_GETRF(__m, __n) ( ((__m) < (__n)) ? (0.5 * (__m) * ((__m) * ((__n) - (1./3.) * (__m) ) - (__n)) + (1. / 6.) * (__m)) \ : (0.5 * (__n) * ((__n) * ((__m) - (1./3.) * (__n) ) - (__m)) + (1. / 6.) * (__n)) ) #define PRECISION_d #if defined(PRECISION_z) || defined(PRECISION_c) #define FLOPS(m, n) ( 6. * FMULS_GETRF(m, n) + 2. * FADDS_GETRF(m, n) ) #else #define FLOPS(m, n) ( FMULS_GETRF(m, n) + FADDS_GETRF(m, n) ) #endif #endif double *dAT[4], *dA[4], *dPT[4]; double c_one = MAGMA_D_ONE; double c_neg_one = MAGMA_D_NEG_ONE; magma_int_t iinfo = 0, nb, nbi, maxm, n_local[4], ldn_local; magma_int_t N, M, NB, NBk, I, d, num_gpus; magma_int_t i, ii, jj, h = 3, offset, ib, rows, s; cudaStream_t stream[4][2]; cudaEvent_t event[4][2]; *info = 0; if (m < 0) *info = -1; else if (n < 0) *info = -2; else if (lda < max(1,m)) *info = -4; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; /* initialize nb */ nb = magma_get_dgetrf_nb(m); maxm = ((m + 31)/32)*32; /* figure out NB */ size_t freeMem, totalMem; cudaMemGetInfo( &freeMem, &totalMem ); freeMem /= sizeof(double); /* number of columns in the big panel */ NB = (magma_int_t)(0.8*freeMem/maxm-h*nb); char * ngr_nb_char = getenv("MAGMA_NGR_NB"); if( ngr_nb_char != NULL ) NB = max( nb, min( NB, atoi(ngr_nb_char) ) ); //NB = 5*max(nb,32); if( num_gpus0 > ceil((double)NB/nb) ) { num_gpus = (int)ceil((double)NB/nb); } else { num_gpus = num_gpus0; } if( num_gpus*NB >= n ) { #ifdef CHECK_DGETRF_OOC printf( " * still fit in GPU memory.\n" ); #endif NB = n; } else { #ifdef CHECK_DGETRF_OOC printf( " * don't fit in GPU memory.\n" ); #endif NB = num_gpus*NB; NB = max(nb,(NB / nb) * nb); /* making sure it's devisable by nb (x64) */ } #ifdef CHECK_DGETRF_OOC if( NB != n ) printf( " * running in out-core mode (n=%d, NB=%d, nb=%d).\n",n,NB,nb ); else printf( " * running in in-core mode (n=%d, NB=%d, nb=%d).\n",n,NB,nb ); fflush(stdout); #endif if ( (nb <= 1) || (nb >= min(m,n)) ) { /* Use CPU code for scalar of one tile. */ lapackf77_dgetrf(&m, &n, a, &lda, ipiv, info); } else { /* Use hybrid blocked code. */ /* allocate memory on GPU to store the big panel */ #ifdef PROFILE start = get_current_time(); #endif n_local[0] = (NB/nb)/num_gpus; if( NB%(nb*num_gpus) != 0 ) n_local[0] ++; n_local[0] *= nb; ldn_local = ((n_local[0]+31)/32)*32; for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); if (MAGMA_SUCCESS != magma_dmalloc( &dA[d], (h*nb + ldn_local)*maxm )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dPT[d] = dA[d] + nb*maxm; /* for storing the previous panel from CPU */ dAT[d] = dA[d] + h*nb*maxm; /* for storing the big panel */ magma_queue_create( &stream[d][0] ); magma_queue_create( &stream[d][1] ); magma_event_create( &event[d][0] ); magma_event_create( &event[d][1] ); } //magma_setdevice(0); #ifdef PROFILE end = get_current_time(); printf( " memory-allocation time: %e\n",GetTimerValue(start, end)/1000.0 ); start = get_current_time(); #endif for( I=0; I<n; I+=NB ) { M = m; N = min( NB, n-I ); /* number of columns in this big panel */ s = min(max(m-I,0),N)/nb; /* number of small block-columns in this big panel */ maxm = ((M + 31)/32)*32; if( num_gpus0 > ceil((double)N/nb) ) { num_gpus = (int)ceil((double)N/nb); } else { num_gpus = num_gpus0; } for( d=0; d<num_gpus; d++ ) { n_local[d] = ((N/nb)/num_gpus)*nb; if (d < (N/nb)%num_gpus) n_local[d] += nb; else if (d == (N/nb)%num_gpus) n_local[d] += N%nb; } ldn_local = ((n_local[0]+31)/32)*32; #ifdef PROFILE start2 = get_current_time(); #endif /* upload the next big panel into GPU, transpose (A->A'), and pivot it */ magmablas_dsetmatrix_transpose_mgpu(num_gpus, stream, A(0,I), lda, dAT, ldn_local, dA, maxm, M, N, nb); for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); magma_queue_sync( stream[d][1] ); magmablasSetKernelStream(NULL); } #ifdef PROFILE start1 = get_current_time(); #endif /* == --------------------------------------------------------------- == */ /* == loop around the previous big-panels to update the new big-panel == */ for( offset = 0; offset<min(m,I); offset+=NB ) { NBk = min( m-offset, NB ); /* start sending the first tile from the previous big-panels to gpus */ for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); nbi = min( nb, NBk ); magma_dsetmatrix_async( (M-offset), nbi, A(offset,offset), lda, dA[d], (maxm-offset), stream[d][0] ); /* make sure the previous update finished */ magmablasSetKernelStream(stream[d][0]); //magma_queue_sync( stream[d][1] ); magma_queue_wait_event( stream[d][0], event[d][0] ); /* transpose */ magmablas_dtranspose2( inPT(d,0,0), nb, dA[d], maxm-offset, M-offset, nbi); } /* applying the pivot from the previous big-panel */ for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); magmablasSetKernelStream(stream[d][1]); magmablas_dpermute_long3( inAT(d,0,0), ldn_local, ipiv, NBk, offset ); } /* == going through each block-column of previous big-panels == */ for( jj=0, ib=offset/nb; jj<NBk; jj+=nb, ib++ ) { ii = offset+jj; rows = maxm - ii; nbi = min( nb, NBk-jj ); for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); /* wait for a block-column on GPU */ magma_queue_sync( stream[d][0] ); /* start sending next column */ if( jj+nb < NBk ) { magma_dsetmatrix_async( (M-ii-nb), min(nb,NBk-jj-nb), A(ii+nb,ii+nb), lda, dA[d], (rows-nb), stream[d][0] ); /* make sure the previous update finished */ magmablasSetKernelStream(stream[d][0]); //magma_queue_sync( stream[d][1] ); magma_queue_wait_event( stream[d][0], event[d][(1+jj/nb)%2] ); /* transpose next column */ magmablas_dtranspose2( inPT(d,0,(1+jj/nb)%2), nb, dA[d], rows-nb, M-ii-nb, nb); } /* update with the block column */ magmablasSetKernelStream(stream[d][1]); magma_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n_local[d], nbi, c_one, inPT(d,0,(jj/nb)%2), nb, inAT(d,ib,0), ldn_local ); if( M > ii+nb ) { magma_dgemm( MagmaNoTrans, MagmaNoTrans, n_local[d], M-(ii+nb), nbi, c_neg_one, inAT(d,ib,0), ldn_local, inPT(d,1,(jj/nb)%2), nb, c_one, inAT(d,ib+1,0), ldn_local ); } magma_event_record( event[d][(jj/nb)%2], stream[d][1] ); } /* end of for each block-columns in a big-panel */ } } /* end of for each previous big-panels */ for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); magma_queue_sync( stream[d][1] ); magmablasSetKernelStream(NULL); } /* calling magma-gpu interface to panel-factorize the big panel */ if( M > I ) { //magma_dgetrf1_mgpu(num_gpus, M-I, N, nb, I, dAT, ldn_local, ipiv+I, dA, &a[I*lda], lda, // (cudaStream_t **)stream, &iinfo); magma_dgetrf2_mgpu(num_gpus, M-I, N, nb, I, dAT, ldn_local, ipiv+I, dA, &a[I*lda], lda, stream, &iinfo); if( iinfo < 0 ) { *info = iinfo; break; } else if( iinfo != 0 ) { *info = iinfo + I * NB; //break; } /* adjust pivots */ for( ii=I; ii<min(I+N,m); ii++ ) ipiv[ii] += I; } #ifdef PROFILE end1 = get_current_time(); time_rmajor += GetTimerValue(start1, end1); time_rmajor3 += GetTimerValue(start2, end1); time_mem += (GetTimerValue(start2, end1)-GetTimerValue(start1, end1))/1000.0; #endif /* download the current big panel to CPU */ magmablas_dgetmatrix_transpose_mgpu(num_gpus, stream, dAT, ldn_local, A(0,I), lda, dA, maxm, M, N, nb); for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); magma_queue_sync( stream[d][1] ); magmablasSetKernelStream(NULL); } #ifdef PROFILE end1 = get_current_time(); time_rmajor2 += GetTimerValue(start1, end1); #endif } /* end of for */ #ifdef PROFILE end = get_current_time(); flops = FLOPS( (double)m, (double)n ) / 1000000; printf(" NB=%d nb=%d\n",NB,nb); printf(" memcopy and transpose %e seconds\n",time_mem ); printf(" total time %e seconds\n",GetTimerValue(start0,end)/1000.0); printf(" Performance %f GFlop/s, %f seconds without htod and dtoh\n", flops / time_rmajor, time_rmajor /1000.0); printf(" Performance %f GFlop/s, %f seconds with htod\n", flops / time_rmajor3, time_rmajor3/1000.0); printf(" Performance %f GFlop/s, %f seconds with dtoh\n", flops / time_rmajor2, time_rmajor2/1000.0); printf(" Performance %f GFlop/s, %f seconds without memory-allocation\n", flops / GetTimerValue(start, end), GetTimerValue(start,end)/1000.0); #endif for( d=0; d<num_gpus0; d++ ) { magma_setdevice(d); magma_free( dA[d] ); magma_event_destroy( event[d][0] ); magma_event_destroy( event[d][1] ); magma_queue_destroy( stream[d][0] ); magma_queue_destroy( stream[d][1] ); magmablasSetKernelStream(NULL); } magma_setdevice(0); } return *info; } /* magma_dgetrf_m */