int init_accel (void)
{
#if defined(_ENABLE_OPENACC_) || defined(_ENABLE_CUDA_)
    char * str;
    int local_rank, dev_count;
    int dev_id = 0;
#endif
#ifdef _ENABLE_CUDA_
    CUresult curesult = CUDA_SUCCESS;
    CUdevice cuDevice;
#endif

    switch (options.accel) {
#ifdef _ENABLE_CUDA_
        case managed:
        case cuda:
            if ((str = getenv("LOCAL_RANK")) != NULL) {
                cudaGetDeviceCount(&dev_count);
                local_rank = atoi(str);
                dev_id = local_rank % dev_count;
            }

            curesult = cuInit(0);
            if (curesult != CUDA_SUCCESS) {
                return 1;
            }

            curesult = cuDeviceGet(&cuDevice, dev_id);
            if (curesult != CUDA_SUCCESS) {
                return 1;
            }

            curesult = cuCtxCreate(&cuContext, 0, cuDevice);
            if (curesult != CUDA_SUCCESS) {
                return 1;
            }
            break;
#endif
#ifdef _ENABLE_OPENACC_
        case openacc:
            if ((str = getenv("LOCAL_RANK")) != NULL) {
                dev_count = acc_get_num_devices(acc_device_not_host);
                local_rank = atoi(str);
                dev_id = local_rank % dev_count;
            }

            acc_set_device_num (dev_id, acc_device_not_host);
            break;
#endif
        default:
            fprintf(stderr, "Invalid device type, should be cuda or openacc\n");
            return 1;
    }

    return 0;
}
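A minimal caller sketch for init_accel(), not part of the original source: process_options() and the global options it is assumed to fill are hypothetical placeholders; only init_accel() comes from the snippet above.

/* Hypothetical usage sketch: process_options() is an assumed helper that
 * parses argv into the global 'options' used by init_accel(). */
int main (int argc, char **argv)
{
    process_options(argc, argv);   /* assumed to select cuda/openacc in options.accel */

    if (init_accel() != 0) {
        fprintf(stderr, "Error initializing accelerator\n");
        return EXIT_FAILURE;
    }

    /* ... allocate device buffers and run the benchmark ... */
    return EXIT_SUCCESS;
}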
void acc_shutdown(acc_device_t dev)
{
  acc_init_once();

  size_t num_devices = acc_get_num_devices(dev);
  assert(num_devices > 0);

  unsigned i;
  for (i = 0; i < num_devices; i++)
    acc_shutdown_(dev, i);
}
int main (int argc, char **argv)
{
  void *d;
  acc_device_t devtype = acc_device_host;

#if ACC_DEVICE_TYPE_nvidia
  devtype = acc_device_nvidia;

  if (acc_get_num_devices (acc_device_nvidia) == 0)
    return 0;
#endif

  acc_init (devtype);

  d = acc_malloc (0);
  if (d != NULL)
    abort ();

  acc_free (0);

  acc_shutdown (devtype);

  acc_set_device_type (devtype);

  d = acc_malloc (0);
  if (d != NULL)
    abort ();

  acc_shutdown (devtype);

  acc_init (devtype);

  d = acc_malloc (1024);
  if (d == NULL)
    abort ();

  acc_free (d);

  acc_shutdown (devtype);

  acc_set_device_type (devtype);

  d = acc_malloc (1024);
  if (d == NULL)
    abort ();

  acc_free (d);

  acc_shutdown (devtype);

  return 0;
}
int main (int argc, char **argv)
{
  acc_device_t devtype = acc_device_host;

#if ACC_DEVICE_TYPE_nvidia
  devtype = acc_device_nvidia;

  if (acc_get_num_devices (devtype) == 0)
    return 0;
#endif

  acc_init (devtype);
  acc_init (devtype);

  return 0;
}
int main (int argc, char **argv)
{
  acc_device_t devtype = acc_device_host;

#if ACC_DEVICE_TYPE_nvidia
  devtype = acc_device_nvidia;

  if (acc_get_num_devices (acc_device_nvidia) == 0)
    return 0;
#endif

  acc_init (devtype);
  acc_shutdown (devtype);

  fprintf (stderr, "CheCKpOInT\n");

  acc_shutdown (devtype);

  return 0;
}
static void initGPU(int argc, char** argv) {
  // gets the device id (if specified) to run on
  int devId = -1;
  if (argc > 1) {
    devId = atoi(argv[1]);
    int devCount = acc_get_num_devices(acc_device_nvidia);
    if (devId < 0 || devId >= devCount) {
      printf("The specified device ID is not supported.\n");
      exit(1);
    }
  }
  if (devId != -1) {
    acc_set_device_num(devId, acc_device_nvidia);
  }

  // creates a context on the GPU just to
  // exclude initialization time from computations
  acc_init(acc_device_nvidia);

  // print device id
  devId = acc_get_device_num(acc_device_nvidia);
  printf("Running on GPU with ID %d.\n\n", devId);
}
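A minimal usage sketch for initGPU(), not from the original source: main() and compute() are hypothetical, and the device ID is assumed to arrive as the first command-line argument, as the function expects.

/* Hypothetical usage sketch for initGPU(); compute() is a placeholder for the offloaded work. */
int main(int argc, char** argv) {
  initGPU(argc, argv);   // selects the device from argv[1] (if given) and creates the context
  compute();             // assumed offloaded work, now timed without device start-up cost
  return 0;
}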
int main(int argc, char** argv)
{
    int iter_max = 1000;

    const float pi  = 2.0 * asinf(1.0f);
    const float tol = 1.0e-5f;

    int rank = 0;
    int size = 1;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    memset(A,    0, N * M * sizeof(float));
    memset(Aref, 0, N * M * sizeof(float));

    // set boundary conditions
    for (int j = 0; j < N; j++)
    {
        float y0 = sinf( 2.0 * pi * j / (N-1));
        A[j][0]      = y0;
        A[j][M-1]    = y0;
        Aref[j][0]   = y0;
        Aref[j][M-1] = y0;
    }

#if _OPENACC
    int ngpus = acc_get_num_devices(acc_device_nvidia);
    int devicenum = rank % ngpus;
    acc_set_device_num(devicenum, acc_device_nvidia);
    // Call acc_init after acc_set_device_num to avoid multiple contexts on device 0 in multi GPU systems
    acc_init(acc_device_nvidia);
#endif /*_OPENACC*/

    // Ensure correctness if N%size != 0
    int chunk_size = ceil( (1.0*N)/size );

    int jstart = rank * chunk_size;
    int jend   = jstart + chunk_size;

    // Do not process boundaries
    jstart = max( jstart, 1 );
    jend   = min( jend, N - 1 );

    if ( rank == 0) printf("Jacobi relaxation Calculation: %d x %d mesh\n", N, M);

    if ( rank == 0) printf("Calculate reference solution and time serial execution.\n");
    StartTimer();
    laplace2d_serial( rank, iter_max, tol );
    double runtime_serial = GetTimer();

    //Wait for all processes to ensure correct timing of the parallel version
    MPI_Barrier( MPI_COMM_WORLD );
    if ( rank == 0) printf("Parallel execution.\n");
    StartTimer();
    int iter    = 0;
    float error = 1.0f;

    #pragma acc data copy(A) create(Anew)
    while ( error > tol && iter < iter_max )
    {
        error = 0.f;

        #pragma acc kernels
        for (int j = jstart; j < jend; j++)
        {
            for( int i = 1; i < M-1; i++ )
            {
                Anew[j][i] = 0.25f * ( A[j][i+1] + A[j][i-1]
                                     + A[j-1][i] + A[j+1][i]);
                error = fmaxf( error, fabsf(Anew[j][i]-A[j][i]));
            }
        }

        float globalerror = 0.0f;
        MPI_Allreduce( &error, &globalerror, 1, MPI_FLOAT, MPI_MAX, MPI_COMM_WORLD );
        error = globalerror;

        //TODO: Split into halo and bulk part
        #pragma acc kernels
        for (int j = jstart; j < jend; j++)
        {
            for( int i = 1; i < M-1; i++ )
            {
                A[j][i] = Anew[j][i];
            }
        }
        //TODO: Start bulk part asynchronously

        //Periodic boundary conditions
        int top    = (rank == 0) ? (size-1) : rank-1;
        int bottom = (rank == (size-1)) ? 0 : rank+1;

        #pragma acc host_data use_device( A )
        {
            //1. Send row jstart (first modified row) to top, receive lower boundary (jend) from bottom
            MPI_Sendrecv( A[jstart], M, MPI_FLOAT, top   , 0,
                          A[jend],   M, MPI_FLOAT, bottom, 0,
                          MPI_COMM_WORLD, MPI_STATUS_IGNORE );

            //2. Send row (jend-1) (last modified row) to bottom, receive upper boundary (jstart-1) from top
            MPI_Sendrecv( A[(jend-1)],   M, MPI_FLOAT, bottom, 0,
                          A[(jstart-1)], M, MPI_FLOAT, top   , 0,
                          MPI_COMM_WORLD, MPI_STATUS_IGNORE );
        }
        //TODO: wait for bulk part

        if(rank == 0 && (iter % 100) == 0) printf("%5d, %0.6f\n", iter, error);

        iter++;
    }
    MPI_Barrier( MPI_COMM_WORLD );
    double runtime = GetTimer();

    if (check_results( rank, jstart, jend, tol ) && rank == 0)
    {
        printf( "Num GPUs: %d\n", size );
        printf( "%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, efficiency: %8.2f%%\n",
                N, M, runtime_serial/1000.f, size, runtime/1000.f,
                runtime_serial/runtime, runtime_serial/(size*runtime)*100 );
    }

    MPI_Finalize();
    return 0;
}
int main(int argc, char** argv)
{
    int iter_max = 1000;

    const float pi  = 2.0 * asinf(1.0f);
    const float tol = 1.0e-5f;

    int rank = 0;
    int size = 1;

    //TODO: Initialize MPI and determine rank and size
    //int MPI_Init(int *argc, char ***argv);
    //int MPI_Comm_rank(MPI_COMM_WORLD, int *rank);
    //int MPI_Comm_size(MPI_COMM_WORLD, int *size)

    memset(A,    0, N * M * sizeof(float));
    memset(Aref, 0, N * M * sizeof(float));

    // set boundary conditions
    for (int j = 0; j < N; j++)
    {
        float y0 = sinf( 2.0 * pi * j / (N-1));
        A[j][0]      = y0;
        A[j][M-1]    = y0;
        Aref[j][0]   = y0;
        Aref[j][M-1] = y0;
    }

#if _OPENACC
    int ngpus = acc_get_num_devices(acc_device_nvidia);
    //TODO: choose device to use by this rank
    int devicenum = 0;
    acc_set_device_num(devicenum, acc_device_nvidia);
    // Call acc_init after acc_set_device_num to avoid multiple contexts on device 0 in multi GPU systems
    acc_init(acc_device_nvidia);
#endif /*_OPENACC*/

    int jstart = 1;
    int jend   = N-1;

    if ( rank == 0) printf("Jacobi relaxation Calculation: %d x %d mesh\n", N, M);

    if ( rank == 0) printf("Calculate reference solution and time serial execution.\n");
    StartTimer();
    laplace2d_serial( rank, iter_max, tol );
    double runtime_serial = GetTimer();

    //TODO: Wait for all processes to ensure correct timing of the parallel version
    //int MPI_Barrier( MPI_COMM_WORLD );
    if ( rank == 0) printf("Parallel execution.\n");
    StartTimer();
    int iter    = 0;
    float error = 1.0f;

    #pragma acc data copy(A) create(Anew)
    while ( error > tol && iter < iter_max )
    {
        error = 0.f;

        #pragma acc kernels
        for (int j = jstart; j < jend; j++)
        {
            for( int i = 1; i < M-1; i++ )
            {
                Anew[j][i] = 0.25f * ( A[j][i+1] + A[j][i-1]
                                     + A[j-1][i] + A[j+1][i]);
                error = fmaxf( error, fabsf(Anew[j][i]-A[j][i]));
            }
        }

        #pragma acc kernels
        for (int j = jstart; j < jend; j++)
        {
            for( int i = 1; i < M-1; i++ )
            {
                A[j][i] = Anew[j][i];
            }
        }

        //Periodic boundary conditions
        #pragma acc kernels
        for( int i = 1; i < M-1; i++ )
        {
            A[0][i]     = A[(N-2)][i];
            A[(N-1)][i] = A[1][i];
        }

        if(rank == 0 && (iter % 100) == 0) printf("%5d, %0.6f\n", iter, error);

        iter++;
    }
    //TODO: Wait for all processes to ensure correct timing of the parallel version
    //int MPI_Barrier( MPI_COMM_WORLD );
    double runtime = GetTimer();

    if (check_results( rank, jstart, jend, tol ) && rank == 0)
    {
        printf( "Num GPUs: %d\n", size );
        printf( "%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, efficiency: %8.2f%%\n",
                N, M, runtime_serial/1000.f, size, runtime/1000.f,
                runtime_serial/runtime, runtime_serial/(size*runtime)*100 );
    }

    //TODO: Finalize MPI
    //int MPI_Finalize();
    return 0;
}
int metrosim::run(int argc, char** argv) {
  SimulationArgs args = SimulationArgs();
  if (!getCommands(argc, argv, &args)) {
    exit(EXIT_FAILURE);
  }

#ifdef _OPENACC
  int numDevices = acc_get_num_devices(acc_device_nvidia);

  // Ensure args.simulationMode is either Serial or Parallel
  if (args.simulationMode == SimulationMode::Default) {
    if (numDevices < 1) {
      fprintf(stdout, "No GPU devices found; defaulting to CPU "
              "execution.\n");
      args.simulationMode = SimulationMode::Serial;
    } else {
      fprintf(stdout, "%d GPU device(s) found; running on GPU.\n",
              numDevices);
      // FIXME (blm)
      fprintf(stdout, "WARNING: Not all features support GPU offloading at "
              "this time. Results may be inaccurate.\n");
      args.simulationMode = SimulationMode::Parallel;
    }
  }

  if (args.simulationMode == SimulationMode::Parallel) {
    if (numDevices == 0) {
      fprintf(stdout, "ERROR: Cannot find suitable GPU!\n");
      exit(EXIT_FAILURE);
    }
    // Implicitly sets the device
    acc_init(acc_device_nvidia);
  }
#else
  // Without OpenACC, only serial calculations are supported
  if (args.simulationMode == SimulationMode::Parallel) {
    fprintf(stdout, "Must compile with OpenACC capability to run in "
            "parallel mode.\n");
    exit(EXIT_FAILURE);
  } else {
    args.simulationMode = SimulationMode::Serial;
    fprintf(stdout, "Beginning simulation using CPU...\n");
  }
#endif

  Simulation sim = Simulation(args);
  sim.run();
  fprintf(stdout, "Finishing simulation...\n\n");

  // Shutdown the device if we were using the GPU
#ifdef _OPENACC
  if (args.simulationMode == SimulationMode::Parallel) {
    acc_shutdown(acc_device_nvidia);
  }
#endif

  exit(EXIT_SUCCESS);
}
int main (int argc, char **argv)
{
  const int nthreads = 1;
  int i;
  pthread_attr_t attr;
  pthread_t *tid;

  if (acc_get_num_devices (acc_device_nvidia) == 0)
    return 0;

  acc_init (acc_device_nvidia);

  x = (unsigned char *) malloc (N);
  for (i = 0; i < N; i++)
    {
      x[i] = i;
    }

  d_x = acc_copyin (x, N);

  if (acc_is_present (x, N) != 1)
    abort ();

  if (pthread_attr_init (&attr) != 0)
    perror ("pthread_attr_init failed");

  tid = (pthread_t *) malloc (nthreads * sizeof (pthread_t));

  for (i = 0; i < nthreads; i++)
    {
      if (pthread_create (&tid[i], &attr, &test, (void *) (unsigned long) (i)) != 0)
        perror ("pthread_create failed");
    }

  if (pthread_attr_destroy (&attr) != 0)
    perror ("pthread_attr_destroy failed");

  for (i = 0; i < nthreads; i++)
    {
      void *res;

      if (pthread_join (tid[i], &res) != 0)
        perror ("pthread join failed");
    }

  if (acc_is_present (x, N) != 1)
    abort ();

  memset (x, 0, N);

  acc_copyout (x, N);

  for (i = 0; i < N; i++)
    {
      if (x[i] != N - i - 1)
        abort ();
    }

  if (acc_is_present (x, N) != 0)
    abort ();

  acc_shutdown (acc_device_nvidia);

  return 0;
}
void ops_init_backend() {
  acc_set_device_num(ops_get_proc() % acc_get_num_devices(acc_device_nvidia),
                     acc_device_nvidia);
  ops_device_initialised_externally = 1;
}
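The call above maps processes to GPUs round-robin. A stand-alone sketch of the same pattern, not from the original source, assuming local_rank holds the process index supplied by the host runtime (ops_get_proc() here, an MPI local rank elsewhere):

/* Round-robin rank-to-GPU mapping sketch; local_rank is an assumed input. */
int ndev = acc_get_num_devices(acc_device_nvidia);
if (ndev > 0)
    acc_set_device_num(local_rank % ndev, acc_device_nvidia);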
int main(int argc, char** argv)
{
    int iter_max = 1000;
    const real tol = 1.0e-5;

    int rank = 0;
    int size = 1;

    //Initialize MPI and determine rank and size
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    if ( size > MAX_MPI_SIZE )
    {
        if ( 0 == rank )
        {
            fprintf(stderr, "ERROR: Only up to %d MPI ranks are supported.\n", MAX_MPI_SIZE);
        }
        return -1;
    }

    dim2 size2d = size_to_2Dsize(size);
    int sizex = size2d.x;
    int sizey = size2d.y;
    assert(sizex*sizey == size);

    int rankx = rank%sizex;
    int ranky = rank/sizex;

    memset(A,    0, NY * NX * sizeof(real));
    memset(Aref, 0, NY * NX * sizeof(real));

    // set rhs
    for (int iy = 1; iy < NY-1; iy++)
    {
        for( int ix = 1; ix < NX-1; ix++ )
        {
            const real x = -1.0 + (2.0*ix/(NX-1));
            const real y = -1.0 + (2.0*iy/(NY-1));
            rhs[iy][ix] = expr(-10.0*(x*x + y*y));
        }
    }

#if _OPENACC
    acc_device_t device_type = acc_get_device_type();
    if ( acc_device_nvidia == device_type )
    {
        int ngpus = acc_get_num_devices(acc_device_nvidia);
        int devicenum = rank%ngpus;
        acc_set_device_num(devicenum, acc_device_nvidia);
    }
    // Call acc_init after acc_set_device_num to avoid multiple contexts on device 0 in multi GPU systems
    acc_init(device_type);
#endif /*_OPENACC*/

    // Ensure correctness if NX%sizex != 0
    int chunk_sizex = ceil( (1.0*NX)/sizex );

    int ix_start = rankx * chunk_sizex;
    int ix_end   = ix_start + chunk_sizex;

    // Do not process boundaries
    ix_start = max( ix_start, 1 );
    ix_end   = min( ix_end, NX - 1 );

    // Ensure correctness if NY%sizey != 0
    int chunk_sizey = ceil( (1.0*NY)/sizey );

    int iy_start = ranky * chunk_sizey;
    int iy_end   = iy_start + chunk_sizey;

    // Do not process boundaries
    iy_start = max( iy_start, 1 );
    iy_end   = min( iy_end, NY - 1 );

    if ( rank == 0) printf("Jacobi relaxation Calculation: %d x %d mesh\n", NY, NX);

    if ( rank == 0) printf("Calculate reference solution and time serial execution.\n");
    StartTimer();
    poisson2d_serial( rank, iter_max, tol );
    double runtime_serial = GetTimer();

    //Wait for all processes to ensure correct timing of the parallel version
    MPI_Barrier( MPI_COMM_WORLD );
    if ( rank == 0) printf("Parallel execution.\n");
    StartTimer();
    int iter   = 0;
    real error = 1.0;

    #pragma acc data copy(A) copyin(rhs) create(Anew,to_left,from_left,to_right,from_right)
    while ( error > tol && iter < iter_max )
    {
        error = 0.0;

        #pragma acc kernels
        for (int iy = iy_start; iy < iy_end; iy++)
        {
            for( int ix = ix_start; ix < ix_end; ix++ )
            {
                Anew[iy][ix] = -0.25 * (rhs[iy][ix] - ( A[iy][ix+1] + A[iy][ix-1]
                                                      + A[iy-1][ix] + A[iy+1][ix] ));
                error = fmaxr( error, fabsr(Anew[iy][ix]-A[iy][ix]));
            }
        }

        real globalerror = 0.0;
        MPI_Allreduce( &error, &globalerror, 1, MPI_REAL_TYPE, MPI_MAX, MPI_COMM_WORLD );
        error = globalerror;

        #pragma acc kernels
        for (int iy = iy_start; iy < iy_end; iy++)
        {
            for( int ix = ix_start; ix < ix_end; ix++ )
            {
                A[iy][ix] = Anew[iy][ix];
            }
        }

        //Periodic boundary conditions
        int topy    = (ranky == 0) ? (sizey-1) : ranky-1;
        int bottomy = (ranky == (sizey-1)) ? 0 : ranky+1;
        int top    = topy    * sizex + rankx;
        int bottom = bottomy * sizex + rankx;

        #pragma acc host_data use_device( A )
        {
            //1. Send row iy_start (first modified row) to top, receive lower boundary (iy_end) from bottom
            MPI_Sendrecv( &A[iy_start][ix_start], (ix_end-ix_start), MPI_REAL_TYPE, top   , 0,
                          &A[iy_end][ix_start],   (ix_end-ix_start), MPI_REAL_TYPE, bottom, 0,
                          MPI_COMM_WORLD, MPI_STATUS_IGNORE );

            //2. Send row (iy_end-1) (last modified row) to bottom, receive upper boundary (iy_start-1) from top
            MPI_Sendrecv( &A[(iy_end-1)][ix_start],   (ix_end-ix_start), MPI_REAL_TYPE, bottom, 0,
                          &A[(iy_start-1)][ix_start], (ix_end-ix_start), MPI_REAL_TYPE, top   , 0,
                          MPI_COMM_WORLD, MPI_STATUS_IGNORE );
        }

        int leftx  = (rankx == 0) ? (sizex-1) : rankx-1;
        int rightx = (rankx == (sizex-1)) ? 0 : rankx+1;
        int left  = ranky * sizex + leftx;
        int right = ranky * sizex + rightx;

        #pragma acc kernels
        for( int iy = iy_start; iy < iy_end; iy++ )
        {
            to_left[iy]  = A[iy][ix_start];
            to_right[iy] = A[iy][ix_end-1];
        }

        #pragma acc host_data use_device( to_left, from_left, to_right, from_right )
        {
            //1. Send to_left (rows iy_start to iy_end-1) to the left neighbor and receive the same rows into from_right from the right neighbor
            MPI_Sendrecv( to_left+iy_start,    (iy_end-iy_start), MPI_REAL_TYPE, left , 0,
                          from_right+iy_start, (iy_end-iy_start), MPI_REAL_TYPE, right, 0,
                          MPI_COMM_WORLD, MPI_STATUS_IGNORE );

            //2. Send to_right (rows iy_start to iy_end-1) to the right neighbor and receive the same rows into from_left from the left neighbor
            MPI_Sendrecv( to_right+iy_start,  (iy_end-iy_start), MPI_REAL_TYPE, right, 0,
                          from_left+iy_start, (iy_end-iy_start), MPI_REAL_TYPE, left , 0,
                          MPI_COMM_WORLD, MPI_STATUS_IGNORE );
        }

        #pragma acc kernels
        for( int iy = iy_start; iy < iy_end; iy++ )
        {
            A[iy][ix_start-1] = from_left[iy];
            A[iy][ix_end]     = from_right[iy];
        }

        if(rank == 0 && (iter % 100) == 0) printf("%5d, %0.6f\n", iter, error);

        iter++;
    }
    MPI_Barrier( MPI_COMM_WORLD );
    double runtime = GetTimer();

    if (check_results( rank, ix_start, ix_end, iy_start, iy_end, tol ) && rank == 0)
    {
        printf( "Num GPUs: %d with a (%d,%d) layout.\n", size, sizey, sizex );
        printf( "%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, efficiency: %8.2f%%\n",
                NY, NX, runtime_serial/1000.0, size, runtime/1000.0,
                runtime_serial/runtime, runtime_serial/(size*runtime)*100 );
    }

    MPI_Finalize();
    return 0;
}