int init_accel (void)
{
#if defined(_ENABLE_OPENACC_) || defined(_ENABLE_CUDA_)
    char * str;
    int local_rank, dev_count;
    int dev_id = 0;
#endif
#ifdef _ENABLE_CUDA_
    CUresult curesult = CUDA_SUCCESS;
    CUdevice cuDevice;
#endif

    switch (options.accel) {
#ifdef _ENABLE_CUDA_
        case managed:
        case cuda:
            if ((str = getenv("LOCAL_RANK")) != NULL) {
                cudaGetDeviceCount(&dev_count);
                local_rank = atoi(str);
                dev_id = local_rank % dev_count;
            }

            curesult = cuInit(0);
            if (curesult != CUDA_SUCCESS) {
                return 1;
            }

            curesult = cuDeviceGet(&cuDevice, dev_id);
            if (curesult != CUDA_SUCCESS) {
                return 1;
            }

            curesult = cuCtxCreate(&cuContext, 0, cuDevice);
            if (curesult != CUDA_SUCCESS) {
                return 1;
            }
            break;
#endif
#ifdef _ENABLE_OPENACC_
        case openacc:
            if ((str = getenv("LOCAL_RANK")) != NULL) {
                dev_count = acc_get_num_devices(acc_device_not_host);
                local_rank = atoi(str);
                dev_id = local_rank % dev_count;
            }

            acc_set_device_num(dev_id, acc_device_not_host);
            break;
#endif
        default:
            fprintf(stderr, "Invalid device type, should be cuda or openacc\n");
            return 1;
    }

    return 0;
}
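A minimal standalone sketch of the same LOCAL_RANK device-selection pattern, using the CUDA runtime API instead of the driver API. The helper name and the fallback behaviour are illustrative assumptions, not part of the benchmark above.

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

/* Map this process onto a device by node-local rank, as init_accel() does.
 * Returns the selected device id, or -1 on failure. */
static int select_device_by_local_rank(void)
{
    int dev_count = 0, dev_id = 0;
    const char *str = getenv("LOCAL_RANK");  /* exported by MPI launchers */

    if (cudaGetDeviceCount(&dev_count) != cudaSuccess || dev_count == 0)
        return -1;
    if (str != NULL)
        dev_id = atoi(str) % dev_count;      /* round-robin ranks over devices */
    if (cudaSetDevice(dev_id) != cudaSuccess)
        return -1;
    return dev_id;
}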
static void initGPU(int argc, char** argv)
{
    // gets the device id (if specified) to run on
    int devId = -1;
    if (argc > 1) {
        devId = atoi(argv[1]);
        int devCount = acc_get_num_devices(acc_device_nvidia);
        if (devId < 0 || devId >= devCount) {
            printf("The specified device ID is not supported.\n");
            exit(1);
        }
    }
    if (devId != -1) {
        acc_set_device_num(devId, acc_device_nvidia);
    }

    // creates a context on the GPU just to
    // exclude initialization time from computations
    acc_init(acc_device_nvidia);

    // print device id
    devId = acc_get_device_num(acc_device_nvidia);
    printf("Running on GPU with ID %d.\n\n", devId);
}
int main(int argc, char* argv[])
{
    acc_set_device_num(0, acc_device_nvidia);

    // read command line arguments
    readcmdline(&options, argc, argv);
    int nx = options.nx;
    int ny = options.ny;
    int N  = options.N;
    int nt = options.nt;

    printf("========================================================================\n");
    printf("                      Welcome to mini-stencil!\n");
    printf("mesh :: %d * %d, dx = %f\n", nx, ny, options.dx);
    printf("time :: %d, time steps from 0 .. %f\n", nt, options.nt * options.dt);
    printf("========================================================================\n");

    // allocate global fields
    x_new = (double*) malloc(sizeof(double)*nx*ny);
    x_old = (double*) malloc(sizeof(double)*nx*ny);
    bndN  = (double*) malloc(sizeof(double)*nx);
    bndS  = (double*) malloc(sizeof(double)*nx);
    bndE  = (double*) malloc(sizeof(double)*ny);
    bndW  = (double*) malloc(sizeof(double)*ny);
    double* b      = (double*) malloc(N*sizeof(double));
    double* deltax = (double*) malloc(N*sizeof(double));

    // set dirichlet boundary conditions to 0 all around
    memset(x_old, 0, sizeof(double) * nx * ny);
    memset(bndN, 0, sizeof(double) * nx);
    memset(bndS, 0, sizeof(double) * nx);
    memset(bndE, 0, sizeof(double) * ny);
    memset(bndW, 0, sizeof(double) * ny);
    memset(deltax, 0, sizeof(double) * N);

    // set the initial condition
    // a circle of concentration 0.1 centred at (xdim/4, ydim/4) with radius
    // no larger than 1/8 of both xdim and ydim
    memset(x_new, 0, sizeof(double) * nx * ny);
    double xc = 1.0 / 4.0;
    double yc = (ny - 1) * options.dx / 4;
    double radius = fmin(xc, yc) / 2.0;
    int i, j;
    for (j = 0; j < ny; j++) {
        double y = (j - 1) * options.dx;
        for (i = 0; i < nx; i++) {
            double x = (i - 1) * options.dx;
            if ((x - xc) * (x - xc) + (y - yc) * (y - yc) < radius * radius)
                //((double(*)[nx])x_new)[j][i] = 0.1;
                x_new[i+j*nx] = 0.1;
        }
    }

    flops_bc = 0;
    flops_diff = 0;
    flops_blas1 = 0;
    verbose_output = 0;
    iters_cg = 0;
    iters_newton = 0;

    // initialize temporary storage fields used by the cg solver
    // I do this here so that the fields are persistent between calls
    // to the CG solver. This is useful if we want to avoid malloc/free calls
    // on the device for the OpenACC implementation (feel free to suggest a
    // better method for doing this)
    printf("INITIALIZING CG STATE\n");
    Ap    = (double*) malloc(N*sizeof(double));
    r     = (double*) malloc(N*sizeof(double));
    p     = (double*) malloc(N*sizeof(double));
    Fx    = (double*) malloc(N*sizeof(double));
    Fxold = (double*) malloc(N*sizeof(double));
    v     = (double*) malloc(N*sizeof(double));
    xold  = (double*) malloc(N*sizeof(double));

    int cg_converged = 1;

    double timespent;

    // start timer
    timespent = -omp_get_wtime();

    // main timeloop
    double tolerance = 1.e-6;
    int timestep;
    for (timestep = 1; timestep <= nt; timestep++) {
        // set x_new and x_old to be the solution
        ss_copy(x_old, x_new, N);

        double residual;
        int converged = 0;
        int it = 1;
        for ( ; it <= 50; it++) {
            // compute residual : requires both x_new and x_old
            diffusion(x_new, b);
            residual = ss_norm2(b, N);

            // check for convergence
            if (residual < tolerance) {
                converged = 1;
                break;
            }

            // solve linear system to get -deltax
            ss_cg(deltax, b, 200, tolerance, &cg_converged);

            // check that the CG solver converged
            if (!cg_converged) break;

            // update solution
            ss_axpy(x_new, -1.0, deltax, N);
        }
        iters_newton += it;

        // output some statistics
        if (converged && verbose_output)
            printf("step %d required %d iterations for residual %E\n",
                   timestep, it, residual);
        if (!converged) {
            fprintf(stderr, "step %d ERROR : nonlinear iterations failed to converge\n",
                    timestep);
            break;
        }
    }

    // get times
    timespent += omp_get_wtime();
    unsigned long long flops_total = flops_diff + flops_blas1;

    ////////////////////////////////////////////////////////////////////
    // write final solution to BOV file for visualization
    ////////////////////////////////////////////////////////////////////

    // binary data
    {
        FILE* output = fopen("output.bin", "wb"); // binary payload: binary mode
        fwrite(x_new, sizeof(double), nx * ny, output);
        fclose(output);
    }
    // metadata
    {
        FILE* output = fopen("output.bov", "w");  // plain-text header: text mode
        fprintf(output, "TIME: 0.0\n");
        fprintf(output, "DATA_FILE: output.bin\n");
        fprintf(output, "DATA_SIZE: %d, %d, 1\n", nx, ny);
        fprintf(output, "DATA_FORMAT: DOUBLE\n");
        fprintf(output, "VARIABLE: phi\n");
        fprintf(output, "DATA_ENDIAN: LITTLE\n");
        fprintf(output, "CENTERING: nodal\n");
        //fprintf(output, "BYTE_OFFSET: 4\n");
        fprintf(output, "BRICK_SIZE: 1.0 %f 1.0\n", (ny - 1) * options.dx);
        fclose(output);
    }

    // print table summarizing results
    printf("--------------------------------------------------------------------------------\n");
    printf("simulation took %f seconds (%f GFLOP/s)\n",
           timespent, flops_total / 1e9 / timespent);
    printf("%u conjugate gradient iterations\n", iters_cg);
    printf("%u newton iterations\n", iters_newton);
    printf("--------------------------------------------------------------------------------\n");

    // deallocate global fields
    free(x_new);
    free(x_old);
    free(bndN);
    free(bndS);
    free(bndE);
    free(bndW);

    printf("Goodbye!\n");

    return 0;
}
int main(int argc, char** argv)
{
    int iter_max = 1000;

    const float pi  = 2.0 * asinf(1.0f);
    const float tol = 1.0e-5f;

    int rank = 0;
    int size = 1;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    memset(A, 0, N * M * sizeof(float));
    memset(Aref, 0, N * M * sizeof(float));

    // set boundary conditions
    for (int j = 0; j < N; j++)
    {
        float y0 = sinf( 2.0 * pi * j / (N-1));
        A[j][0] = y0;
        A[j][M-1] = y0;
        Aref[j][0] = y0;
        Aref[j][M-1] = y0;
    }

#if _OPENACC
    int ngpus = acc_get_num_devices(acc_device_nvidia);
    int devicenum = rank % ngpus;
    acc_set_device_num(devicenum, acc_device_nvidia);
    // Call acc_init after acc_set_device_num to avoid multiple contexts
    // on device 0 in multi GPU systems
    acc_init(acc_device_nvidia);
#endif /*_OPENACC*/

    // Ensure correctness if N%size != 0
    int chunk_size = ceil( (1.0*N)/size );

    int jstart = rank * chunk_size;
    int jend   = jstart + chunk_size;

    // Do not process boundaries
    jstart = max( jstart, 1 );
    jend   = min( jend, N - 1 );

    if ( rank == 0) printf("Jacobi relaxation Calculation: %d x %d mesh\n", N, M);

    if ( rank == 0) printf("Calculate reference solution and time serial execution.\n");
    StartTimer();
    laplace2d_serial( rank, iter_max, tol );
    double runtime_serial = GetTimer();

    // Wait for all processes to ensure correct timing of the parallel version
    MPI_Barrier( MPI_COMM_WORLD );

    if ( rank == 0) printf("Parallel execution.\n");
    StartTimer();
    int iter = 0;
    float error = 1.0f;

    #pragma acc data copy(A) create(Anew)
    while ( error > tol && iter < iter_max )
    {
        error = 0.f;

        #pragma acc kernels
        for (int j = jstart; j < jend; j++)
        {
            for( int i = 1; i < M-1; i++ )
            {
                Anew[j][i] = 0.25f * ( A[j][i+1] + A[j][i-1]
                                     + A[j-1][i] + A[j+1][i]);
                error = fmaxf( error, fabsf(Anew[j][i]-A[j][i]));
            }
        }

        float globalerror = 0.0f;
        MPI_Allreduce( &error, &globalerror, 1, MPI_FLOAT, MPI_MAX, MPI_COMM_WORLD );
        error = globalerror;

        //TODO: Split into halo and bulk part
        #pragma acc kernels
        for (int j = jstart; j < jend; j++)
        {
            for( int i = 1; i < M-1; i++ )
            {
                A[j][i] = Anew[j][i];
            }
        }

        //TODO: Start bulk part asynchronously

        // Periodic boundary conditions
        int top    = (rank == 0) ? (size-1) : rank-1;
        int bottom = (rank == (size-1)) ? 0 : rank+1;
        #pragma acc host_data use_device( A )
        {
            // 1. Send row jstart (first modified row) to top,
            //    receive lower boundary (jend) from bottom
            MPI_Sendrecv( A[jstart], M, MPI_FLOAT, top   , 0,
                          A[jend],   M, MPI_FLOAT, bottom, 0,
                          MPI_COMM_WORLD, MPI_STATUS_IGNORE );

            // 2. Send row (jend-1) (last modified row) to bottom,
            //    receive upper boundary (jstart-1) from top
            MPI_Sendrecv( A[(jend-1)],   M, MPI_FLOAT, bottom, 0,
                          A[(jstart-1)], M, MPI_FLOAT, top   , 0,
                          MPI_COMM_WORLD, MPI_STATUS_IGNORE );
        }

        //TODO: wait for bulk part

        if(rank == 0 && (iter % 100) == 0) printf("%5d, %0.6f\n", iter, error);

        iter++;
    }
    MPI_Barrier( MPI_COMM_WORLD );
    double runtime = GetTimer();

    if (check_results( rank, jstart, jend, tol ) && rank == 0)
    {
        printf( "Num GPUs: %d\n", size );
        printf( "%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, efficiency: %8.2f%%\n",
                N, M, runtime_serial/1000.f, size, runtime/1000.f,
                runtime_serial/runtime, runtime_serial/(size*runtime)*100 );
    }

    MPI_Finalize();
    return 0;
}
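One plausible way to fill in the three TODOs above (a hedged sketch, not the shipped solution): copy the two halo rows on the default queue so they are ready for the exchange, start the bulk copy asynchronously on queue 1 so it overlaps with the MPI_Sendrecv calls, and wait on that queue before the next sweep.

        // Halo rows first, synchronously, so they are ready for MPI_Sendrecv
        #pragma acc kernels
        for( int i = 1; i < M-1; i++ )
        {
            A[jstart][i] = Anew[jstart][i];
            A[jend-1][i] = Anew[jend-1][i];
        }

        // Bulk rows on queue 1; this overlaps with the halo exchange below
        #pragma acc kernels async(1)
        for (int j = jstart+1; j < jend-1; j++)
        {
            for( int i = 1; i < M-1; i++ )
            {
                A[j][i] = Anew[j][i];
            }
        }

        /* ... MPI_Sendrecv halo exchange exactly as in the loop above ... */

        // The bulk copy must be complete before the next Jacobi sweep reads A
        #pragma acc wait(1)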
void blur5_pipelined_multi(unsigned char *restrict imgData, unsigned char *restrict out,
                           long w, long h, long ch, long step)
{
    const int filtersize = 5, nblocks = 32;
    double filter[5][5] =
    {
        1, 1, 1, 1, 1,
        1, 2, 2, 2, 1,
        1, 2, 3, 2, 1,
        1, 2, 2, 2, 1,
        1, 1, 1, 1, 1
    };
    // The denominator for scale should be the sum
    // of non-zero elements in the filter.
    float scale = 1.0 / 35.0;
    long blocksize = h / nblocks;

#pragma omp parallel num_threads(acc_get_num_devices(acc_device_nvidia))
    {
        printf("Thread %d of %d\n", omp_get_thread_num(), omp_get_num_threads());
        int myid = omp_get_thread_num();
        acc_set_device_num(myid, acc_device_nvidia);
        int queue = 1;

        // TODO: create a data region
        {
#pragma omp for schedule(static)
            for ( long blocky = 0; blocky < nblocks; blocky++)
            {
                // For data copies we need to include the ghost zones for the filter
                long starty = MAX(0, blocky * blocksize - filtersize/2);
                long endy   = MIN(h, starty + blocksize + filtersize/2);
                // TODO: move data
                starty = blocky * blocksize;
                endy   = starty + blocksize;
                // TODO: parallelize this loop
                for ( long y = starty; y < endy; y++ )
                {
                    for ( long x = 0; x < w; x++ )
                    {
                        float blue = 0.0, green = 0.0, red = 0.0;
                        for ( int fy = 0; fy < filtersize; fy++ )
                        {
                            long iy = y - (filtersize/2) + fy;
                            for ( int fx = 0; fx < filtersize; fx++ )
                            {
                                long ix = x - (filtersize/2) + fx;
                                if ( (iy < 0) || (ix < 0) ||
                                     (iy >= h) || (ix >= w) ) continue;
                                blue  += filter[fy][fx] * (float)imgData[iy * step + ix * ch];
                                green += filter[fy][fx] * (float)imgData[iy * step + ix * ch + 1];
                                red   += filter[fy][fx] * (float)imgData[iy * step + ix * ch + 2];
                            }
                        }
                        out[y * step + x * ch]      = 255 - (scale * blue);
                        out[y * step + x * ch + 1 ] = 255 - (scale * green);
                        out[y * step + x * ch + 2 ] = 255 - (scale * red);
                    }
                }
                // TODO: move data
                queue = (queue % 3) + 1;
            }
            // TODO: create synchronization point
        }
    }
}
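A hedged sketch of the directives the TODOs above ask for (one plausible completion under the stated block decomposition, not necessarily the course's reference solution): a per-thread device data region, per-block update/compute/update work pipelined on the rotating async queues, and a wait before the data region closes.

#pragma acc data create(imgData[0:h*step], out[0:h*step])
        {
#pragma omp for schedule(static)
            for ( long blocky = 0; blocky < nblocks; blocky++)
            {
                // Copy in the block plus its ghost rows for the filter
                long starty = MAX(0, blocky * blocksize - filtersize/2);
                long endy   = MIN(h, starty + blocksize + filtersize/2);
                #pragma acc update device(imgData[starty*step:(endy-starty)*step]) async(queue)

                starty = blocky * blocksize;
                endy   = starty + blocksize;
                #pragma acc parallel loop collapse(2) async(queue)
                for ( long y = starty; y < endy; y++ )
                    for ( long x = 0; x < w; x++ )
                    { /* ... 5x5 stencil exactly as above ... */ }

                // Copy out only the rows this block produced
                #pragma acc update self(out[starty*step:(endy-starty)*step]) async(queue)

                queue = (queue % 3) + 1;   // rotate over three queues to pipeline blocks
            }
            #pragma acc wait               // all queues must drain before the region ends
        }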
int main(int argc, char** argv)
{
    int iter_max = 1000;

    const float pi  = 2.0 * asinf(1.0f);
    const float tol = 1.0e-5f;

    int rank = 0;
    int size = 1;

    //TODO: Initialize MPI and determine rank and size
    //int MPI_Init(int *argc, char ***argv);
    //int MPI_Comm_rank(MPI_Comm comm, int *rank);
    //int MPI_Comm_size(MPI_Comm comm, int *size);

    memset(A, 0, N * M * sizeof(float));
    memset(Aref, 0, N * M * sizeof(float));

    // set boundary conditions
    for (int j = 0; j < N; j++)
    {
        float y0 = sinf( 2.0 * pi * j / (N-1));
        A[j][0] = y0;
        A[j][M-1] = y0;
        Aref[j][0] = y0;
        Aref[j][M-1] = y0;
    }

#if _OPENACC
    int ngpus = acc_get_num_devices(acc_device_nvidia);
    //TODO: choose device to use by this rank
    int devicenum = 0;
    acc_set_device_num(devicenum, acc_device_nvidia);
    // Call acc_init after acc_set_device_num to avoid multiple contexts
    // on device 0 in multi GPU systems
    acc_init(acc_device_nvidia);
#endif /*_OPENACC*/

    int jstart = 1;
    int jend   = N-1;

    if ( rank == 0) printf("Jacobi relaxation Calculation: %d x %d mesh\n", N, M);

    if ( rank == 0) printf("Calculate reference solution and time serial execution.\n");
    StartTimer();
    laplace2d_serial( rank, iter_max, tol );
    double runtime_serial = GetTimer();

    //TODO: Wait for all processes to ensure correct timing of the parallel version
    //int MPI_Barrier( MPI_Comm comm );

    if ( rank == 0) printf("Parallel execution.\n");
    StartTimer();
    int iter = 0;
    float error = 1.0f;

    #pragma acc data copy(A) create(Anew)
    while ( error > tol && iter < iter_max )
    {
        error = 0.f;

        #pragma acc kernels
        for (int j = jstart; j < jend; j++)
        {
            for( int i = 1; i < M-1; i++ )
            {
                Anew[j][i] = 0.25f * ( A[j][i+1] + A[j][i-1]
                                     + A[j-1][i] + A[j+1][i]);
                error = fmaxf( error, fabsf(Anew[j][i]-A[j][i]));
            }
        }

        #pragma acc kernels
        for (int j = jstart; j < jend; j++)
        {
            for( int i = 1; i < M-1; i++ )
            {
                A[j][i] = Anew[j][i];
            }
        }

        // Periodic boundary conditions
        #pragma acc kernels
        for( int i = 1; i < M-1; i++ )
        {
            A[0][i]     = A[(N-2)][i];
            A[(N-1)][i] = A[1][i];
        }

        if(rank == 0 && (iter % 100) == 0) printf("%5d, %0.6f\n", iter, error);

        iter++;
    }

    //TODO: Wait for all processes to ensure correct timing of the parallel version
    //int MPI_Barrier( MPI_Comm comm );
    double runtime = GetTimer();

    if (check_results( rank, jstart, jend, tol ) && rank == 0)
    {
        printf( "Num GPUs: %d\n", size );
        printf( "%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, efficiency: %8.2f%%\n",
                N, M, runtime_serial/1000.f, size, runtime/1000.f,
                runtime_serial/runtime, runtime_serial/(size*runtime)*100 );
    }

    //TODO: Finalize MPI
    //int MPI_Finalize(void);

    return 0;
}
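A hedged sketch of the calls the TODOs above ask for, following the hint prototypes in the comments (one plausible completion; the placement comments mark where each fragment belongs):

    // At the top of main(), before any rank-dependent work
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    // Inside the #if _OPENACC block: round-robin ranks over the visible GPUs
    devicenum = rank % ngpus;
    acc_set_device_num(devicenum, acc_device_nvidia);

    // Around the timed parallel section, so all ranks start and stop together
    MPI_Barrier(MPI_COMM_WORLD);

    // At the end of main(), before return
    MPI_Finalize();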
int main (int argc, char **argv)
{
    cublasStatus_t s;
    cublasHandle_t h;
    CUcontext pctx;
    CUresult r;
    int i;
    const int N = 256;
    float *h_X, *h_Y1, *h_Y2;
    float *d_X, *d_Y;
    float alpha = 2.0f;
    float error_norm;
    float ref_norm;

    /* Test 4 - OpenACC creates, cuBLAS shares.  */

    acc_set_device_num (0, acc_device_nvidia);

    r = cuCtxGetCurrent (&pctx);
    if (r != CUDA_SUCCESS)
    {
        fprintf (stderr, "cuCtxGetCurrent failed: %d\n", r);
        exit (EXIT_FAILURE);
    }

    h_X = (float *) malloc (N * sizeof (float));
    if (h_X == 0)
    {
        fprintf (stderr, "malloc failed: for h_X\n");
        exit (EXIT_FAILURE);
    }

    h_Y1 = (float *) malloc (N * sizeof (float));
    if (h_Y1 == 0)
    {
        fprintf (stderr, "malloc failed: for h_Y1\n");
        exit (EXIT_FAILURE);
    }

    h_Y2 = (float *) malloc (N * sizeof (float));
    if (h_Y2 == 0)
    {
        fprintf (stderr, "malloc failed: for h_Y2\n");
        exit (EXIT_FAILURE);
    }

    for (i = 0; i < N; i++)
    {
        h_X[i] = rand () / (float) RAND_MAX;
        h_Y2[i] = h_Y1[i] = rand () / (float) RAND_MAX;
    }

#pragma acc parallel copyin (h_X[0:N]), copy (h_Y2[0:N]) copy (alpha)
    {
        int i;

        for (i = 0; i < N; i++)
        {
            h_Y2[i] = alpha * h_X[i] + h_Y2[i];
        }
    }

    r = cuCtxGetCurrent (&pctx);
    if (r != CUDA_SUCCESS)
    {
        fprintf (stderr, "cuCtxGetCurrent failed: %d\n", r);
        exit (EXIT_FAILURE);
    }

    d_X = (float *) acc_copyin (&h_X[0], N * sizeof (float));
    if (d_X == NULL)
    {
        fprintf (stderr, "copyin error h_X\n");
        exit (EXIT_FAILURE);
    }

    d_Y = (float *) acc_copyin (&h_Y1[0], N * sizeof (float));
    if (d_Y == NULL)
    {
        fprintf (stderr, "copyin error h_Y1\n");
        exit (EXIT_FAILURE);
    }

    s = cublasCreate (&h);
    if (s != CUBLAS_STATUS_SUCCESS)
    {
        fprintf (stderr, "cublasCreate failed: %d\n", s);
        exit (EXIT_FAILURE);
    }
    context_check (pctx);

    s = cublasSaxpy (h, N, &alpha, d_X, 1, d_Y, 1);
    if (s != CUBLAS_STATUS_SUCCESS)
    {
        fprintf (stderr, "cublasSaxpy failed: %d\n", s);
        exit (EXIT_FAILURE);
    }
    context_check (pctx);

    acc_memcpy_from_device (&h_Y1[0], d_Y, N * sizeof (float));
    context_check (pctx);

    error_norm = 0;
    ref_norm = 0;

    for (i = 0; i < N; ++i)
    {
        float diff;

        diff = h_Y1[i] - h_Y2[i];
        error_norm += diff * diff;
        ref_norm += h_Y2[i] * h_Y2[i];
    }

    error_norm = (float) sqrt ((double) error_norm);
    ref_norm = (float) sqrt ((double) ref_norm);

    if ((fabs (ref_norm) < 1e-7) || ((error_norm / ref_norm) >= 1e-6f))
    {
        fprintf (stderr, "math error\n");
        exit (EXIT_FAILURE);
    }

    free (h_X);
    free (h_Y1);
    free (h_Y2);

    acc_free (d_X);
    acc_free (d_Y);

    context_check (pctx);

    s = cublasDestroy (h);
    if (s != CUBLAS_STATUS_SUCCESS)
    {
        fprintf (stderr, "cublasDestroy failed: %d\n", s);
        exit (EXIT_FAILURE);
    }
    context_check (pctx);

    acc_shutdown (acc_device_nvidia);

    r = cuCtxGetCurrent (&pctx);
    if (r != CUDA_SUCCESS)
    {
        fprintf (stderr, "cuCtxGetCurrent failed: %d\n", r);
        exit (EXIT_FAILURE);
    }

    if (pctx)
    {
        fprintf (stderr, "Unexpected context\n");
        exit (EXIT_FAILURE);
    }

    return EXIT_SUCCESS;
}
void ops_init_backend()
{
    acc_set_device_num(ops_get_proc() % acc_get_num_devices(acc_device_nvidia),
                       acc_device_nvidia);
    ops_device_initialised_externally = 1;
}
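If only a global MPI rank is available (rather than an OPS-provided process id), a node-local rank for the modulo mapping above can be derived with an MPI-3 shared-memory communicator split. A hedged sketch; the helper name is an assumption, not part of OPS.

#include <mpi.h>
#include <openacc.h>

/* Rank of the calling process among the ranks sharing its node. */
static int node_local_rank(MPI_Comm comm)
{
    MPI_Comm node_comm;
    int local_rank;

    /* group ranks that can share memory (i.e. one node), rank within that group */
    MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &node_comm);
    MPI_Comm_rank(node_comm, &local_rank);
    MPI_Comm_free(&node_comm);
    return local_rank;
}

/* usage, mirroring ops_init_backend():
 *   acc_set_device_num(node_local_rank(MPI_COMM_WORLD) %
 *                      acc_get_num_devices(acc_device_nvidia),
 *                      acc_device_nvidia);
 */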
int main(int argc, char** argv)
{
    int iter_max = 1000;

    const real tol = 1.0e-5;

    int rank = 0;
    int size = 1;

    // Initialize MPI and determine rank and size
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    if ( size > MAX_MPI_SIZE )
    {
        if ( 0 == rank )
        {
            fprintf(stderr, "ERROR: Only up to %d MPI ranks are supported.\n", MAX_MPI_SIZE);
        }
        return -1;
    }

    dim2 size2d = size_to_2Dsize(size);
    int sizex = size2d.x;
    int sizey = size2d.y;
    assert(sizex*sizey == size);

    int rankx = rank % sizex;
    int ranky = rank / sizex;

    memset(A, 0, NY * NX * sizeof(real));
    memset(Aref, 0, NY * NX * sizeof(real));

    // set rhs
    for (int iy = 1; iy < NY-1; iy++)
    {
        for( int ix = 1; ix < NX-1; ix++ )
        {
            const real x = -1.0 + (2.0*ix/(NX-1));
            const real y = -1.0 + (2.0*iy/(NY-1));
            // expr, fmaxr, fabsr: precision-generic math wrappers for the real typedef
            rhs[iy][ix] = expr(-10.0*(x*x + y*y));
        }
    }

#if _OPENACC
    acc_device_t device_type = acc_get_device_type();
    if ( acc_device_nvidia == device_type )
    {
        int ngpus = acc_get_num_devices(acc_device_nvidia);
        int devicenum = rank % ngpus;
        acc_set_device_num(devicenum, acc_device_nvidia);
    }
    // Call acc_init after acc_set_device_num to avoid multiple contexts
    // on device 0 in multi GPU systems
    acc_init(device_type);
#endif /*_OPENACC*/

    // Ensure correctness if NX%sizex != 0
    int chunk_sizex = ceil( (1.0*NX)/sizex );

    int ix_start = rankx * chunk_sizex;
    int ix_end   = ix_start + chunk_sizex;

    // Do not process boundaries
    ix_start = max( ix_start, 1 );
    ix_end   = min( ix_end, NX - 1 );

    // Ensure correctness if NY%sizey != 0
    int chunk_sizey = ceil( (1.0*NY)/sizey );

    int iy_start = ranky * chunk_sizey;
    int iy_end   = iy_start + chunk_sizey;

    // Do not process boundaries
    iy_start = max( iy_start, 1 );
    iy_end   = min( iy_end, NY - 1 );

    if ( rank == 0) printf("Jacobi relaxation Calculation: %d x %d mesh\n", NY, NX);

    if ( rank == 0) printf("Calculate reference solution and time serial execution.\n");
    StartTimer();
    poisson2d_serial( rank, iter_max, tol );
    double runtime_serial = GetTimer();

    // Wait for all processes to ensure correct timing of the parallel version
    MPI_Barrier( MPI_COMM_WORLD );

    if ( rank == 0) printf("Parallel execution.\n");
    StartTimer();
    int iter = 0;
    real error = 1.0;

    #pragma acc data copy(A) copyin(rhs) create(Anew,to_left,from_left,to_right,from_right)
    while ( error > tol && iter < iter_max )
    {
        error = 0.0;

        #pragma acc kernels
        for (int iy = iy_start; iy < iy_end; iy++)
        {
            for( int ix = ix_start; ix < ix_end; ix++ )
            {
                Anew[iy][ix] = -0.25 * (rhs[iy][ix] - ( A[iy][ix+1] + A[iy][ix-1]
                                                      + A[iy-1][ix] + A[iy+1][ix] ));
                error = fmaxr( error, fabsr(Anew[iy][ix]-A[iy][ix]));
            }
        }

        real globalerror = 0.0;
        MPI_Allreduce( &error, &globalerror, 1, MPI_REAL_TYPE, MPI_MAX, MPI_COMM_WORLD );
        error = globalerror;

        #pragma acc kernels
        for (int iy = iy_start; iy < iy_end; iy++)
        {
            for( int ix = ix_start; ix < ix_end; ix++ )
            {
                A[iy][ix] = Anew[iy][ix];
            }
        }

        // Periodic boundary conditions
        int topy    = (ranky == 0) ? (sizey-1) : ranky-1;
        int bottomy = (ranky == (sizey-1)) ? 0 : ranky+1;
        int top     = topy    * sizex + rankx;
        int bottom  = bottomy * sizex + rankx;
        #pragma acc host_data use_device( A )
        {
            // 1. Send row iy_start (first modified row) to top,
            //    receive lower boundary (iy_end) from bottom
            MPI_Sendrecv( &A[iy_start][ix_start], (ix_end-ix_start), MPI_REAL_TYPE, top   , 0,
                          &A[iy_end][ix_start],   (ix_end-ix_start), MPI_REAL_TYPE, bottom, 0,
                          MPI_COMM_WORLD, MPI_STATUS_IGNORE );

            // 2. Send row (iy_end-1) (last modified row) to bottom,
            //    receive upper boundary (iy_start-1) from top
            MPI_Sendrecv( &A[(iy_end-1)][ix_start],   (ix_end-ix_start), MPI_REAL_TYPE, bottom, 0,
                          &A[(iy_start-1)][ix_start], (ix_end-ix_start), MPI_REAL_TYPE, top   , 0,
                          MPI_COMM_WORLD, MPI_STATUS_IGNORE );
        }

        int leftx  = (rankx == 0) ? (sizex-1) : rankx-1;
        int rightx = (rankx == (sizex-1)) ? 0 : rankx+1;
        int left   = ranky * sizex + leftx;
        int right  = ranky * sizex + rightx;

        #pragma acc kernels
        for( int iy = iy_start; iy < iy_end; iy++ )
        {
            to_left[iy]  = A[iy][ix_start];
            to_right[iy] = A[iy][ix_end-1];
        }

        #pragma acc host_data use_device( to_left, from_left, to_right, from_right )
        {
            // 1. Send to_left (rows iy_start..iy_end-1) to left,
            //    receive the same rows into from_right from right
            MPI_Sendrecv( to_left+iy_start,    (iy_end-iy_start), MPI_REAL_TYPE, left , 0,
                          from_right+iy_start, (iy_end-iy_start), MPI_REAL_TYPE, right, 0,
                          MPI_COMM_WORLD, MPI_STATUS_IGNORE );

            // 2. Send to_right (rows iy_start..iy_end-1) to right,
            //    receive the same rows into from_left from left
            MPI_Sendrecv( to_right+iy_start,  (iy_end-iy_start), MPI_REAL_TYPE, right, 0,
                          from_left+iy_start, (iy_end-iy_start), MPI_REAL_TYPE, left , 0,
                          MPI_COMM_WORLD, MPI_STATUS_IGNORE );
        }

        #pragma acc kernels
        for( int iy = iy_start; iy < iy_end; iy++ )
        {
            A[iy][ix_start-1] = from_left[iy];
            A[iy][ix_end]     = from_right[iy];
        }

        if(rank == 0 && (iter % 100) == 0) printf("%5d, %0.6f\n", iter, error);

        iter++;
    }
    MPI_Barrier( MPI_COMM_WORLD );
    double runtime = GetTimer();

    if (check_results( rank, ix_start, ix_end, iy_start, iy_end, tol ) && rank == 0)
    {
        printf( "Num GPUs: %d with a (%d,%d) layout.\n", size, sizey, sizex );
        printf( "%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, efficiency: %8.2f%%\n",
                NY, NX, runtime_serial/1000.0, size, runtime/1000.0,
                runtime_serial/runtime, runtime_serial/(size*runtime)*100 );
    }

    MPI_Finalize();
    return 0;
}
int main(int argc, char ** argv)
{
    acc_init(acc_device_cuda);
    acc_set_device_num(0, acc_device_cuda);

    // Keep track of the start time of the program
    long long program_start_time = get_time();

    // Let the user specify the number of frames to process
    int num_frames = 1;

    if (argc != 3)
    {
        fprintf(stderr, "usage: %s <input file> <num of frames>", argv[0]);
        exit(1);
    }
    num_frames = atoi(argv[2]);

    // Open video file
    char *video_file_name;
    video_file_name = argv[1];

    avi_t *cell_file = AVI_open_input_file(video_file_name, 1);
    if (cell_file == NULL)
    {
        AVI_print_error("Error with AVI_open_input_file");
        return -1;
    }

    int i, j, *crow, *ccol, pair_counter = 0, x_result_len = 0,
        Iter = 20, ns = 4, k_count = 0, n;
    MAT *cellx, *celly, *A;
    double *GICOV_spots, *t, *G, *x_result, *y_result, *V, *QAX_CENTERS, *QAY_CENTERS;
    double threshold = 1.8, radius = 10.0, delta = 3.0, dt = 0.01, b = 5.0;

    // Extract a cropped version of the first frame from the video file
    MAT *image_chopped = get_frame(cell_file, 0, 1, 0);
    printf("Detecting cells in frame 0\n");

    // Get gradient matrices in x and y directions
    MAT *grad_x = gradient_x(image_chopped);
    MAT *grad_y = gradient_y(image_chopped);

    m_free(image_chopped);

    // Get GICOV matrix corresponding to image gradients
    long long GICOV_start_time = get_time();
    /*
    MAT *gicov = ellipsematching(grad_x, grad_y);

    // Square GICOV values
    MAT *max_gicov = m_get(gicov->m, gicov->n);
    for (i = 0; i < gicov->m; i++)
    {
        for (j = 0; j < gicov->n; j++)
        {
            double val = m_get_val(gicov, i, j);
            m_set_val(max_gicov, i, j, val * val);
        }
    }
    */
    MAT *gicov = GICOV(grad_x, grad_y);
    long long GICOV_end_time = get_time();

    // Dilate the GICOV matrix
    long long dilate_start_time = get_time();
    //MAT *strel = structuring_element_f(12);
    //MAT *img_dilated = dilate_f(gicov, strel);
    MAT *img_dilated = dilate(gicov);
    long long dilate_end_time = get_time();

    // Find possible matches for cell centers based on GICOV
    // and record the rows/columns in which they are found
    pair_counter = 0;
    crow = (int *) malloc(gicov->m * gicov->n * sizeof(int));
    ccol = (int *) malloc(gicov->m * gicov->n * sizeof(int));
    for (i = 0; i < gicov->m; i++)
    {
        for (j = 0; j < gicov->n; j++)
        {
            if (!(m_get_val(gicov,i,j) == 0.0) &&
                (m_get_val(img_dilated,i,j) == m_get_val(gicov,i,j)))
            {
                crow[pair_counter] = i;
                ccol[pair_counter] = j;
                pair_counter++;
            }
        }
    }

    GICOV_spots = (double *) malloc(sizeof(double) * pair_counter);
    for (i = 0; i < pair_counter; i++)
        GICOV_spots[i] = sqrt(m_get_val(gicov, crow[i], ccol[i]));

    G = (double *) calloc(pair_counter, sizeof(double));
    x_result = (double *) calloc(pair_counter, sizeof(double));
    y_result = (double *) calloc(pair_counter, sizeof(double));

    x_result_len = 0;
    for (i = 0; i < pair_counter; i++)
    {
        if ((crow[i] > 29) && (crow[i] < BOTTOM - TOP + 39))
        {
            x_result[x_result_len] = ccol[i];
            y_result[x_result_len] = crow[i] - 40;
            G[x_result_len] = GICOV_spots[i];
            x_result_len++;
        }
    }

    // Make an array t which holds each "time step" for the possible cells
    t = (double *) malloc(sizeof(double) * 36);
    for (i = 0; i < 36; i++)
    {
        t[i] = (double)i * 2.0 * PI / 36.0;
    }

    // Store cell boundaries (as simple circles) for all cells
    cellx = m_get(x_result_len, 36);
    celly = m_get(x_result_len, 36);
    for (i = 0; i < x_result_len; i++)
    {
        for (j = 0; j < 36; j++)
        {
            m_set_val(cellx, i, j, x_result[i] + radius * cos(t[j]));
            m_set_val(celly, i, j, y_result[i] + radius * sin(t[j]));
        }
    }

    A = TMatrix(9, 4);
    V = (double *) malloc(sizeof(double) * pair_counter);
    QAX_CENTERS = (double *) malloc(sizeof(double) * pair_counter);
    QAY_CENTERS = (double *) malloc(sizeof(double) * pair_counter);
    memset(V, 0, sizeof(double) * pair_counter);
    memset(QAX_CENTERS, 0, sizeof(double) * pair_counter);
    memset(QAY_CENTERS, 0, sizeof(double) * pair_counter);

    // For all possible results, find the ones that are feasibly
    // leukocytes and store their centers
    k_count = 0;
    for (n = 0; n < x_result_len; n++)
    {
        if ((G[n] < -1 * threshold) || G[n] > threshold)
        {
            MAT * x, *y;
            VEC * x_row, * y_row;
            x = m_get(1, 36);
            y = m_get(1, 36);

            x_row = v_get(36);
            y_row = v_get(36);

            // Get current values of possible cells from cellx/celly matrices
            x_row = get_row(cellx, n, x_row);
            y_row = get_row(celly, n, y_row);
            uniformseg(x_row, y_row, x, y);

            // Make sure that the possible leukocytes are not too close
            // to the edge of the frame
            if ((m_min(x) > b) && (m_min(y) > b) &&
                (m_max(x) < cell_file->width - b) &&
                (m_max(y) < cell_file->height - b))
            {
                MAT * Cx, * Cy, *Cy_temp, * Ix1, * Iy1;
                VEC *Xs, *Ys, *W, *Nx, *Ny, *X, *Y;
                Cx = m_get(1, 36);
                Cy = m_get(1, 36);
                Cx = mmtr_mlt(A, x, Cx);
                Cy = mmtr_mlt(A, y, Cy);

                Cy_temp = m_get(Cy->m, Cy->n);

                for (i = 0; i < 9; i++)
                    m_set_val(Cy, i, 0, m_get_val(Cy, i, 0) + 40.0);

                // Iteratively refine the snake/spline
                for (i = 0; i < Iter; i++)
                {
                    int typeofcell;
                    if (G[n] > 0.0) typeofcell = 0;
                    else typeofcell = 1;

                    splineenergyform01(Cx, Cy, grad_x, grad_y, ns, delta, 2.0 * dt, typeofcell);
                }

                X = getsampling(Cx, ns);
                for (i = 0; i < Cy->m; i++)
                    m_set_val(Cy_temp, i, 0, m_get_val(Cy, i, 0) - 40.0);
                Y = getsampling(Cy_temp, ns);

                Ix1 = linear_interp2(grad_x, X, Y);
                Iy1 = linear_interp2(grad_y, X, Y); // y-gradient (the flattened source read grad_x twice)
                Xs = getfdriv(Cx, ns);
                Ys = getfdriv(Cy, ns);

                Nx = v_get(Ys->dim);
                for (i = 0; i < Ys->dim; i++)
                    v_set_val(Nx, i, v_get_val(Ys, i) /
                              sqrt(v_get_val(Xs, i)*v_get_val(Xs, i) +
                                   v_get_val(Ys, i)*v_get_val(Ys, i)));

                Ny = v_get(Xs->dim);
                for (i = 0; i < Xs->dim; i++)
                    v_set_val(Ny, i, -1.0 * v_get_val(Xs, i) /
                              sqrt(v_get_val(Xs, i)*v_get_val(Xs, i) +
                                   v_get_val(Ys, i)*v_get_val(Ys, i)));

                W = v_get(Nx->dim);
                for (i = 0; i < Nx->dim; i++)
                    v_set_val(W, i, m_get_val(Ix1, 0, i) * v_get_val(Nx, i) +
                                    m_get_val(Iy1, 0, i) * v_get_val(Ny, i));

                V[n] = mean(W) / std_dev(W);

                // Get means of X and Y values for all "snaxels" of the
                // spline contour, thus finding the cell centers
                QAX_CENTERS[k_count] = mean(X);
                QAY_CENTERS[k_count] = mean(Y) + TOP;

                k_count++;

                // Free memory
                v_free(W);
                v_free(Ny);
                v_free(Nx);
                v_free(Ys);
                v_free(Xs);
                m_free(Iy1);
                m_free(Ix1);
                v_free(Y);
                v_free(X);
                m_free(Cy_temp);
                m_free(Cy);
                m_free(Cx);
            }

            // Free memory
            v_free(y_row);
            v_free(x_row);
            m_free(y);
            m_free(x);
        }
    }

    // Free memory
    free(V);
    free(ccol);
    free(crow);
    free(GICOV_spots);
    free(t);
    free(G);
    free(x_result);
    free(y_result);
    m_free(A);
    m_free(celly);
    m_free(cellx);
    m_free(img_dilated);
    m_free(gicov);
    m_free(grad_y);
    m_free(grad_x);

    // Report the total number of cells detected
    printf("Cells detected: %d\n\n", k_count);

    // Report the breakdown of the detection runtime
    printf("Detection runtime\n");
    printf("-----------------\n");
    printf("GICOV computation: %.5f seconds\n",
           ((float) (GICOV_end_time - GICOV_start_time)) / (1000*1000));
    printf("   GICOV dilation: %.5f seconds\n",
           ((float) (dilate_end_time - dilate_start_time)) / (1000*1000));
    printf("            Total: %.5f seconds\n",
           ((float) (get_time() - program_start_time)) / (1000*1000));

    // Now that the cells have been detected in the first frame,
    // track the ellipses through subsequent frames
    if (num_frames > 1)
        printf("\nTracking cells across %d frames\n", num_frames);
    else
        printf("\nTracking cells across 1 frame\n");
    long long tracking_start_time = get_time();
    int num_snaxels = 20;
    ellipsetrack(cell_file, QAX_CENTERS, QAY_CENTERS, k_count, radius, num_snaxels, num_frames);
    printf("           Total: %.5f seconds\n",
           ((float) (get_time() - tracking_start_time)) / (float) (1000*1000*num_frames));

    // Report total program execution time
    printf("\nTotal application run time: %.5f seconds\n",
           ((float) (get_time() - program_start_time)) / (1000*1000));

    return 0;
}
/**
 * \brief Creates and initializes the working data for the plan
 * \param plan The Plan struct that holds the plan's data values.
 * \return Error flag value
 */
int initDOPENACCGEMMPlan(void *plan){   // <- Replace YOUR_NAME with the name of your module.
    if(!plan){
        return make_error(ALLOC, generic_err);   // <- This is the error code for one of the malloc fails.
    }
    Plan *p;
    DOPENACCGEMM_DATA *d;
    p = (Plan *)plan;

    #ifdef HAVE_PAPI
    int temp_event, i;
    int PAPI_Events [NUM_PAPI_EVENTS] = PAPI_COUNTERS;
    char *PAPI_units [NUM_PAPI_EVENTS] = PAPI_UNITS;
    #endif //HAVE_PAPI

    if(p){
        d = (DOPENACCGEMM_DATA *)p->vptr;
        p->exec_count = 0;                        // Initialize the plan execution count to zero.
        perftimer_init(&p->timers, NUM_TIMERS);   // Initialize all performance timers to zero.

        #ifdef HAVE_PAPI
        /* Initialize plan's PAPI data */
        p->PAPI_EventSet = PAPI_NULL;
        p->PAPI_Num_Events = 0;

        TEST_PAPI(PAPI_create_eventset(&p->PAPI_EventSet), PAPI_OK, MyRank, 9999, PRINT_SOME);

        // Add the desired events to the Event Set; ensure the desired counters
        // are on the system, then add; ignore otherwise
        for(i = 0; i < TOTAL_PAPI_EVENTS && i < NUM_PAPI_EVENTS; i++){
            temp_event = PAPI_Events[i];
            if(PAPI_query_event(temp_event) == PAPI_OK){
                p->PAPI_Num_Events++;
                TEST_PAPI(PAPI_add_event(p->PAPI_EventSet, temp_event), PAPI_OK, MyRank, 9999, PRINT_SOME);
            }
        }

        PAPIRes_init(p->PAPI_Results, p->PAPI_Times);
        PAPI_set_units(p->name, PAPI_units, NUM_PAPI_EVENTS);

        TEST_PAPI(PAPI_start(p->PAPI_EventSet), PAPI_OK, MyRank, 9999, PRINT_SOME);
        #endif //HAVE_PAPI
    }
    if(d){
        int error;

        acc_device_t my_device = acc_get_device_type();
        acc_set_device_num(d->device_id, my_device);

        // When OpenACC can report back on accelerator size, these two lines should be enabled
        //d->device_memory = system_burn_accelerator_memory(d->device_id);
        //d->device_memory -= SUB_FACTOR;

        // Size M so that the three M x M double matrices fit (conservatively)
        // in the reported device memory.
        d->M = ((int)sqrt(d->device_memory/sizeof(double))) / 3;

        size_t page_size = sysconf(_SC_PAGESIZE);
        error = posix_memalign((void **)&(d->A_buffer), page_size, d->M*d->M*sizeof(double));
        assert(error == 0);
        error = posix_memalign((void **)&(d->B_buffer), page_size, d->M*d->M*sizeof(double));
        assert(error == 0);
        error = posix_memalign((void **)&(d->C_buffer), page_size, d->M*d->M*sizeof(double));
        assert(error == 0);

        for(size_t idx = 0; idx < d->M*d->M; idx++) {
            d->A_buffer[idx] = (double)4.5;
            d->B_buffer[idx] = (double)2.0;
            d->C_buffer[idx] = (double)0.0;
        }
    }
    return ERR_CLEAN;   // <- This indicates a clean run with no errors. Does not need to be changed.
} /* initDOPENACCGEMMPlan */