int main(int argc, char** argv)
{
    int iter_max = 1000;

    const float pi  = 2.0 * asinf(1.0f);
    const float tol = 1.0e-5f;

    int rank = 0;
    int size = 1;

    //TODO: Initialize MPI and determine rank and size
    //int MPI_Init(int *argc, char ***argv);
    //int MPI_Comm_rank(MPI_COMM_WORLD, int *rank);
    //int MPI_Comm_size(MPI_COMM_WORLD, int *size);

    memset(A, 0, N * M * sizeof(float));
    memset(Aref, 0, N * M * sizeof(float));

    // set boundary conditions
    for (int j = 0; j < N; j++)
    {
        float y0     = sinf( 2.0 * pi * j / (N-1));
        A[j][0]      = y0;
        A[j][M-1]    = y0;
        Aref[j][0]   = y0;
        Aref[j][M-1] = y0;
    }

#if _OPENACC
    int ngpus = acc_get_num_devices(acc_device_nvidia);
    //TODO: choose device to use by this rank
    int devicenum = 0;
    acc_set_device_num(devicenum, acc_device_nvidia);

    // Call acc_init after acc_set_device_num to avoid multiple contexts on device 0 in multi GPU systems
    acc_init(acc_device_nvidia);
#endif /*_OPENACC*/

    int jstart = 1;
    int jend   = N-1;

    if ( rank == 0 )
        printf("Jacobi relaxation Calculation: %d x %d mesh\n", N, M);

    if ( rank == 0 )
        printf("Calculate reference solution and time serial execution.\n");
    StartTimer();
    laplace2d_serial( rank, iter_max, tol );
    double runtime_serial = GetTimer();

    //TODO: Wait for all processes to ensure correct timing of the parallel version
    //int MPI_Barrier( MPI_COMM_WORLD );
    if ( rank == 0 )
        printf("Parallel execution.\n");
    StartTimer();
    int iter    = 0;
    float error = 1.0f;
    #pragma acc data copy(A) create(Anew)
    while ( error > tol && iter < iter_max )
    {
        error = 0.f;

        #pragma acc kernels
        for (int j = jstart; j < jend; j++)
        {
            for( int i = 1; i < M-1; i++ )
            {
                Anew[j][i] = 0.25f * ( A[j][i+1] + A[j][i-1]
                                     + A[j-1][i] + A[j+1][i]);
                error = fmaxf( error, fabsf(Anew[j][i]-A[j][i]));
            }
        }

        #pragma acc kernels
        for (int j = jstart; j < jend; j++)
        {
            for( int i = 1; i < M-1; i++ )
            {
                A[j][i] = Anew[j][i];
            }
        }

        //Periodic boundary conditions
        #pragma acc kernels
        for( int i = 1; i < M-1; i++ )
        {
            A[0][i]     = A[(N-2)][i];
            A[(N-1)][i] = A[1][i];
        }

        if(rank == 0 && (iter % 100) == 0)
            printf("%5d, %0.6f\n", iter, error);

        iter++;
    }

    //TODO: Wait for all processes to ensure correct timing of the parallel version
    //int MPI_Barrier( MPI_COMM_WORLD );
    double runtime = GetTimer();

    if (check_results( rank, jstart, jend, tol ) && rank == 0)
    {
        printf( "Num GPUs: %d\n", size );
        printf( "%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, efficiency: %8.2f%%\n",
                N, M, runtime_serial/1000.f, size, runtime/1000.f,
                runtime_serial/runtime, runtime_serial/(size*runtime)*100 );
    }

    //TODO: Finalize MPI
    //int MPI_Finalize();
    return 0;
}
int main(int argc, char** argv)
{
    int iter_max = 1000;

    const float pi  = 2.0 * asinf(1.0f);
    const float tol = 1.0e-5f;

    int rank = 0;
    int size = 1;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    memset(A, 0, N * M * sizeof(float));
    memset(Aref, 0, N * M * sizeof(float));

    // set boundary conditions
    for (int j = 0; j < N; j++)
    {
        float y0     = sinf( 2.0 * pi * j / (N-1));
        A[j][0]      = y0;
        A[j][M-1]    = y0;
        Aref[j][0]   = y0;
        Aref[j][M-1] = y0;
    }

#if _OPENACC
    int ngpus = acc_get_num_devices(acc_device_nvidia);
    int devicenum = rank % ngpus;
    acc_set_device_num(devicenum, acc_device_nvidia);

    // Call acc_init after acc_set_device_num to avoid multiple contexts on device 0 in multi GPU systems
    acc_init(acc_device_nvidia);
#endif /*_OPENACC*/

    // Ensure correctness if N%size != 0
    int chunk_size = ceil( (1.0*N)/size );

    int jstart = rank * chunk_size;
    int jend   = jstart + chunk_size;

    // Do not process boundaries
    jstart = max( jstart, 1 );
    jend   = min( jend, N - 1 );

    if ( rank == 0 )
        printf("Jacobi relaxation Calculation: %d x %d mesh\n", N, M);

    if ( rank == 0 )
        printf("Calculate reference solution and time serial execution.\n");
    StartTimer();
    laplace2d_serial( rank, iter_max, tol );
    double runtime_serial = GetTimer();

    //Wait for all processes to ensure correct timing of the parallel version
    MPI_Barrier( MPI_COMM_WORLD );
    if ( rank == 0 )
        printf("Parallel execution.\n");
    StartTimer();
    int iter    = 0;
    float error = 1.0f;
    #pragma acc data copy(A) create(Anew)
    while ( error > tol && iter < iter_max )
    {
        error = 0.f;

        #pragma acc kernels
        for (int j = jstart; j < jend; j++)
        {
            for( int i = 1; i < M-1; i++ )
            {
                Anew[j][i] = 0.25f * ( A[j][i+1] + A[j][i-1]
                                     + A[j-1][i] + A[j+1][i]);
                error = fmaxf( error, fabsf(Anew[j][i]-A[j][i]));
            }
        }

        float globalerror = 0.0f;
        MPI_Allreduce( &error, &globalerror, 1, MPI_FLOAT, MPI_MAX, MPI_COMM_WORLD );
        error = globalerror;

        //TODO: Split into halo and bulk part
        #pragma acc kernels
        for (int j = jstart; j < jend; j++)
        {
            for( int i = 1; i < M-1; i++ )
            {
                A[j][i] = Anew[j][i];
            }
        }

        //TODO: Start bulk part asynchronously

        //Periodic boundary conditions
        int top    = (rank == 0) ? (size-1) : rank-1;
        int bottom = (rank == (size-1)) ? 0 : rank+1;
        #pragma acc host_data use_device( A )
        {
            //1. Send row jstart (first modified row) to top, receive lower boundary (jend) from bottom
            MPI_Sendrecv( A[jstart], M, MPI_FLOAT, top   , 0,
                          A[jend],   M, MPI_FLOAT, bottom, 0,
                          MPI_COMM_WORLD, MPI_STATUS_IGNORE );

            //2. Send row (jend-1) (last modified row) to bottom, receive upper boundary (jstart-1) from top
            MPI_Sendrecv( A[(jend-1)],   M, MPI_FLOAT, bottom, 0,
                          A[(jstart-1)], M, MPI_FLOAT, top   , 0,
                          MPI_COMM_WORLD, MPI_STATUS_IGNORE );
        }

        //TODO: wait for bulk part

        if(rank == 0 && (iter % 100) == 0)
            printf("%5d, %0.6f\n", iter, error);

        iter++;
    }
    MPI_Barrier( MPI_COMM_WORLD );
    double runtime = GetTimer();

    if (check_results( rank, jstart, jend, tol ) && rank == 0)
    {
        printf( "Num GPUs: %d\n", size );
        printf( "%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, efficiency: %8.2f%%\n",
                N, M, runtime_serial/1000.f, size, runtime/1000.f,
                runtime_serial/runtime, runtime_serial/(size*runtime)*100 );
    }

    MPI_Finalize();
    return 0;
}
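The remaining TODOs in the listing above ask for overlapping the halo exchange with the bulk of the copy. One possible sketch, not the exercise's reference solution, using the same A, Anew, jstart, and jend as above: copy the two halo rows first, launch the interior copy asynchronously, run the MPI exchange while it proceeds, then wait before the next iteration.

        // Sketch only: would replace the single copy loop inside the while loop above.
        // Copy the two halo rows synchronously so they are ready for the MPI exchange.
        #pragma acc kernels
        for( int i = 1; i < M-1; i++ )
        {
            A[jstart][i] = Anew[jstart][i];
            A[jend-1][i] = Anew[jend-1][i];
        }

        // Copy the interior (bulk) rows asynchronously so this work overlaps with MPI.
        #pragma acc kernels async
        for (int j = jstart+1; j < jend-1; j++)
        {
            for( int i = 1; i < M-1; i++ )
            {
                A[j][i] = Anew[j][i];
            }
        }

        // ... MPI_Sendrecv halo exchange exactly as in the listing above ...

        // Wait for the asynchronous bulk copy before starting the next iteration.
        #pragma acc wait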
int main(int argc, char** argv)
{
    int iter_max = 1000;

    const float pi  = 2.0 * asinf(1.0f);
    const float tol = 1.0e-5f;

    int rank = 0;
    int size = 1;

    memset(A, 0, N * M * sizeof(float));
    memset(Aref, 0, N * M * sizeof(float));

    // set boundary conditions
    for (int j = 0; j < N; j++)
    {
        float y0     = sinf( 2.0 * pi * j / (N-1));
        A[j][0]      = y0;
        A[j][M-1]    = y0;
        Aref[j][0]   = y0;
        Aref[j][M-1] = y0;
    }

    int jstart = 1;
    int jend   = N-1;

    if ( rank == 0 )
        printf("Jacobi relaxation Calculation: %d x %d mesh\n", N, M);

    if ( rank == 0 )
        printf("Calculate reference solution and time serial execution.\n");
    StartTimer();
    laplace2d_serial( rank, iter_max, tol );
    double runtime_serial = GetTimer();

    if ( rank == 0 )
        printf("Parallel execution.\n");
    StartTimer();
    int iter    = 0;
    float error = 1.0f;
    //TODO: Insert OpenACC `data` region
    while ( error > tol && iter < iter_max )
    {
        error = 0.f;

        //TODO: Accelerate the next 3 loop nests with OpenACC `kernels` or `parallel`
        for (int j = jstart; j < jend; j++)
        {
            for( int i = 1; i < M-1; i++ )
            {
                Anew[j][i] = 0.25f * ( A[j][i+1] + A[j][i-1]
                                     + A[j-1][i] + A[j+1][i]);
                error = fmaxf( error, fabsf(Anew[j][i]-A[j][i]));
            }
        }

        for (int j = jstart; j < jend; j++)
        {
            for( int i = 1; i < M-1; i++ )
            {
                A[j][i] = Anew[j][i];
            }
        }

        //Periodic boundary conditions
        for( int i = 1; i < M-1; i++ )
        {
            A[0][i]     = A[(N-2)][i];
            A[(N-1)][i] = A[1][i];
        }

        if(rank == 0 && (iter % 100) == 0)
            printf("%5d, %0.6f\n", iter, error);

        iter++;
    }
    double runtime = GetTimer();

    if (check_results( rank, jstart, jend, tol ) && rank == 0)
    {
        printf( "Num GPUs: %d\n", size );
        printf( "%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, efficiency: %8.2f%%\n",
                N, M, runtime_serial/1000.f, size, runtime/1000.f,
                runtime_serial/runtime, runtime_serial/(size*runtime)*100 );
    }

    return 0;
}