int acc_on_device (acc_device_t dev) { if (acc_get_device_type () == acc_device_host_nonshm) return dev == acc_device_host_nonshm || dev == acc_device_not_host; /* Just rely on the compiler builtin. */ return __builtin_acc_on_device (dev); }
int acc_on_device (acc_device_t dev) { struct goacc_thread *thr = goacc_thread (); /* We only want to appear to be the "host_nonshm" plugin from "offloaded" code -- i.e. within a parallel region. Test a flag set by the openacc_parallel hook of the host_nonshm plugin to determine that. */ if (acc_get_device_type () == acc_device_host_nonshm && thr && thr->target_tls && ((struct nonshm_thread *)thr->target_tls)->nonshm_exec) return dev == acc_device_host_nonshm || dev == acc_device_not_host; /* For OpenACC, libgomp is only built for the host, so this is sufficient. */ return dev == acc_device_host || dev == acc_device_none; }
void t3 () { int a, b[N], c, d, i; int n = acc_get_device_type () == acc_device_nvidia ? N : 1; a = 5; for (i = 0; i < n; i++) b[i] = -1; #pragma acc parallel num_gangs (n) firstprivate (a) #pragma acc loop gang for (i = 0; i < n; i++) { a = a + i; b[i] = a; } for (i = 0; i < n; i++) if (a + i != b[i]) __builtin_abort (); #pragma acc data copy (a) { #pragma acc parallel firstprivate (a) copyout (c) { a = 10; c = a; } /* This version of 'a' should still be 5. */ #pragma acc parallel copyout (d) present (a) { d = a; } } if (c != 10) __builtin_abort (); if (d != 5) __builtin_abort (); }
int main () { int s1 = 0, s2 = 0; int i; int dummy = 0; #pragma acc data copy (dummy) { #pragma acc parallel num_gangs (N) reduction (+:s1) copy(s1) { s1++; } } if (acc_get_device_type () == acc_device_host) { if (s1 != 1) abort (); } else { if (s1 != N) abort (); } s1 = 0; s2 = 0; #pragma acc parallel num_gangs (10) reduction (+:s1, s2) copy(s1, s2) { s1++; s2 += N; } if (acc_get_device_type () == acc_device_host) { if (s1 != 1) abort (); if (s2 != N) abort (); } else { if (s1 != N) abort (); if (s2 != N*N) abort (); } s1 = 0; #pragma acc parallel num_gangs (10) reduction (+:s1) copy(s1) { #pragma acc loop gang reduction (+:s1) for (i = 0; i < 10; i++) s1++; } if (s1 != N) abort (); return 0; }
int main(int argc, char** argv) { int iter_max = 1000; const real tol = 1.0e-5; int rank = 0; int size = 1; //Initialize MPI and determine rank and size MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &size); if ( size > MAX_MPI_SIZE ) { if ( 0 == rank ) { fprintf(stderr,"ERROR: Only up to %d MPI ranks are supported.\n",MAX_MPI_SIZE); } return -1; } dim2 size2d = size_to_2Dsize(size); int sizex = size2d.x; int sizey = size2d.y; assert(sizex*sizey == size); int rankx = rank%sizex; int ranky = rank/sizex; memset(A, 0, NY * NX * sizeof(real)); memset(Aref, 0, NY * NX * sizeof(real)); // set rhs for (int iy = 1; iy < NY-1; iy++) { for( int ix = 1; ix < NX-1; ix++ ) { const real x = -1.0 + (2.0*ix/(NX-1)); const real y = -1.0 + (2.0*iy/(NY-1)); rhs[iy][ix] = expr(-10.0*(x*x + y*y)); } } #if _OPENACC acc_device_t device_type = acc_get_device_type(); if ( acc_device_nvidia == device_type ) { int ngpus=acc_get_num_devices(acc_device_nvidia); int devicenum=rank%ngpus; acc_set_device_num(devicenum,acc_device_nvidia); } // Call acc_init after acc_set_device_num to avoid multiple contexts on device 0 in multi GPU systems acc_init(device_type); #endif /*_OPENACC*/ // Ensure correctness if NX%sizex != 0 int chunk_sizex = ceil( (1.0*NX)/sizex ); int ix_start = rankx * chunk_sizex; int ix_end = ix_start + chunk_sizex; // Do not process boundaries ix_start = max( ix_start, 1 ); ix_end = min( ix_end, NX - 1 ); // Ensure correctness if NY%sizey != 0 int chunk_sizey = ceil( (1.0*NY)/sizey ); int iy_start = ranky * chunk_sizey; int iy_end = iy_start + chunk_sizey; // Do not process boundaries iy_start = max( iy_start, 1 ); iy_end = min( iy_end, NY - 1 ); if ( rank == 0) printf("Jacobi relaxation Calculation: %d x %d mesh\n", NY, NX); if ( rank == 0) printf("Calculate reference solution and time serial execution.\n"); StartTimer(); poisson2d_serial( rank, iter_max, tol ); double runtime_serial = GetTimer(); //Wait for all processes to ensure correct timing of the parallel version MPI_Barrier( MPI_COMM_WORLD ); if ( rank == 0) printf("Parallel execution.\n"); StartTimer(); int iter = 0; real error = 1.0; #pragma acc data copy(A) copyin(rhs) create(Anew,to_left,from_left,to_right,from_right) while ( error > tol && iter < iter_max ) { error = 0.0; #pragma acc kernels for (int iy = iy_start; iy < iy_end; iy++) { for( int ix = ix_start; ix < ix_end; ix++ ) { Anew[iy][ix] = -0.25 * (rhs[iy][ix] - ( A[iy][ix+1] + A[iy][ix-1] + A[iy-1][ix] + A[iy+1][ix] )); error = fmaxr( error, fabsr(Anew[iy][ix]-A[iy][ix])); } } real globalerror = 0.0; MPI_Allreduce( &error, &globalerror, 1, MPI_REAL_TYPE, MPI_MAX, MPI_COMM_WORLD ); error = globalerror; #pragma acc kernels for (int iy = iy_start; iy < iy_end; iy++) { for( int ix = ix_start; ix < ix_end; ix++ ) { A[iy][ix] = Anew[iy][ix]; } } //Periodic boundary conditions int topy = (ranky == 0) ? (sizey-1) : ranky-1; int bottomy = (ranky == (sizey-1)) ? 0 : ranky+1; int top = topy * sizex + rankx; int bottom = bottomy * sizex + rankx; #pragma acc host_data use_device( A ) { //1. Sent row iy_start (first modified row) to top receive lower boundary (iy_end) from bottom MPI_Sendrecv( &A[iy_start][ix_start], (ix_end-ix_start), MPI_REAL_TYPE, top , 0, &A[iy_end][ix_start], (ix_end-ix_start), MPI_REAL_TYPE, bottom, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE ); //2. Sent row (iy_end-1) (last modified row) to bottom receive upper boundary (iy_start-1) from top MPI_Sendrecv( &A[(iy_end-1)][ix_start], (ix_end-ix_start), MPI_REAL_TYPE, bottom, 0, &A[(iy_start-1)][ix_start], (ix_end-ix_start), MPI_REAL_TYPE, top , 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE ); } int leftx = (rankx == 0) ? (sizex-1) : rankx-1; int rightx = (rankx == (sizex-1)) ? 0 : rankx+1; int left = ranky * sizex + leftx; int right = ranky * sizex + rightx; #pragma acc kernels for( int iy = iy_start; iy < iy_end; iy++ ) { to_left[iy] = A[iy][ix_start]; to_right[iy] = A[iy][ix_end-1]; } #pragma acc host_data use_device( to_left, from_left, to_right, from_right ) { //1. Sent to_left starting from first modified row (iy_start) to last modified row to left and receive the same rows into from_right from right MPI_Sendrecv( to_left+iy_start, (iy_end-iy_start), MPI_REAL_TYPE, left , 0, from_right+iy_start, (iy_end-iy_start), MPI_REAL_TYPE, right, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE ); //2. Sent to_right starting from first modified row (iy_start) to last modified row to left and receive the same rows into from_left from left MPI_Sendrecv( to_right+iy_start, (iy_end-iy_start), MPI_REAL_TYPE, right , 0, from_left+iy_start, (iy_end-iy_start), MPI_REAL_TYPE, left , 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE ); } #pragma acc kernels for( int iy = iy_start; iy < iy_end; iy++ ) { A[iy][ix_start-1] = from_left[iy]; A[iy][ix_end] = from_right[iy]; } if(rank == 0 && (iter % 100) == 0) printf("%5d, %0.6f\n", iter, error); iter++; } MPI_Barrier( MPI_COMM_WORLD ); double runtime = GetTimer(); if (check_results( rank, ix_start, ix_end, iy_start, iy_end, tol ) && rank == 0) { printf( "Num GPUs: %d with a (%d,%d) layout.\n", size, sizey,sizex ); printf( "%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, efficiency: %8.2f%\n", NY,NX, runtime_serial/ 1000.0, size, runtime/ 1000.0, runtime_serial/runtime, runtime_serial/(size*runtime)*100 ); } MPI_Finalize(); return 0; }
int main () { acc_init (acc_device_default); /* Non-positive value. */ /* GR, WS, VS. */ { #define GANGS 0 /* { dg-warning "'num_gangs' value must be positive" "" { target c } } */ int gangs_actual = GANGS; int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max; gangs_min = workers_min = vectors_min = INT_MAX; gangs_max = workers_max = vectors_max = INT_MIN; #pragma acc parallel copy (gangs_actual) \ reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max) \ num_gangs (GANGS) /* { dg-warning "'num_gangs' value must be positive" "" { target c++ } } */ { /* We're actually executing with num_gangs (1). */ gangs_actual = 1; for (int i = 100 * gangs_actual; i > -100 * gangs_actual; --i) { /* <https://gcc.gnu.org/PR80547>. */ #if 0 gangs_min = gangs_max = acc_gang (); workers_min = workers_max = acc_worker (); vectors_min = vectors_max = acc_vector (); #else int gangs = acc_gang (); gangs_min = (gangs_min < gangs) ? gangs_min : gangs; gangs_max = (gangs_max > gangs) ? gangs_max : gangs; int workers = acc_worker (); workers_min = (workers_min < workers) ? workers_min : workers; workers_max = (workers_max > workers) ? workers_max : workers; int vectors = acc_vector (); vectors_min = (vectors_min < vectors) ? vectors_min : vectors; vectors_max = (vectors_max > vectors) ? vectors_max : vectors; #endif } } if (gangs_actual != 1) __builtin_abort (); if (gangs_min != 0 || gangs_max != gangs_actual - 1 || workers_min != 0 || workers_max != 0 || vectors_min != 0 || vectors_max != 0) __builtin_abort (); #undef GANGS } /* GP, WS, VS. */ { #define GANGS 0 /* { dg-warning "'num_gangs' value must be positive" "" { target c } } */ int gangs_actual = GANGS; int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max; gangs_min = workers_min = vectors_min = INT_MAX; gangs_max = workers_max = vectors_max = INT_MIN; #pragma acc parallel copy (gangs_actual) \ num_gangs (GANGS) /* { dg-warning "'num_gangs' value must be positive" "" { target c++ } } */ { /* We're actually executing with num_gangs (1). */ gangs_actual = 1; #pragma acc loop gang reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max) for (int i = 100 * gangs_actual; i > -100 * gangs_actual; --i) { gangs_min = gangs_max = acc_gang (); workers_min = workers_max = acc_worker (); vectors_min = vectors_max = acc_vector (); } } if (gangs_actual != 1) __builtin_abort (); if (gangs_min != 0 || gangs_max != gangs_actual - 1 || workers_min != 0 || workers_max != 0 || vectors_min != 0 || vectors_max != 0) __builtin_abort (); #undef GANGS } /* GR, WP, VS. */ { #define WORKERS 0 /* { dg-warning "'num_workers' value must be positive" "" { target c } } */ int workers_actual = WORKERS; int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max; gangs_min = workers_min = vectors_min = INT_MAX; gangs_max = workers_max = vectors_max = INT_MIN; #pragma acc parallel copy (workers_actual) \ num_workers (WORKERS) /* { dg-warning "'num_workers' value must be positive" "" { target c++ } } */ { /* We're actually executing with num_workers (1). */ workers_actual = 1; #pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max) for (int i = 100 * workers_actual; i > -100 * workers_actual; --i) { gangs_min = gangs_max = acc_gang (); workers_min = workers_max = acc_worker (); vectors_min = vectors_max = acc_vector (); } } if (workers_actual != 1) __builtin_abort (); if (gangs_min != 0 || gangs_max != 0 || workers_min != 0 || workers_max != workers_actual - 1 || vectors_min != 0 || vectors_max != 0) __builtin_abort (); #undef WORKERS } /* GR, WS, VP. */ { #define VECTORS 0 /* { dg-warning "'vector_length' value must be positive" "" { target c } } */ int vectors_actual = VECTORS; int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max; gangs_min = workers_min = vectors_min = INT_MAX; gangs_max = workers_max = vectors_max = INT_MIN; #pragma acc parallel copy (vectors_actual) /* { dg-warning "using vector_length \\(32\\), ignoring 1" "" { target openacc_nvidia_accel_configured } } */ \ vector_length (VECTORS) /* { dg-warning "'vector_length' value must be positive" "" { target c++ } } */ { /* We're actually executing with vector_length (1), just the GCC nvptx back end enforces vector_length (32). */ if (acc_on_device (acc_device_nvidia)) vectors_actual = 32; else vectors_actual = 1; #pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max) for (int i = 100 * vectors_actual; i > -100 * vectors_actual; --i) { gangs_min = gangs_max = acc_gang (); workers_min = workers_max = acc_worker (); vectors_min = vectors_max = acc_vector (); } } if (acc_get_device_type () == acc_device_nvidia) { if (vectors_actual != 32) __builtin_abort (); } else if (vectors_actual != 1) __builtin_abort (); if (gangs_min != 0 || gangs_max != 0 || workers_min != 0 || workers_max != 0 || vectors_min != 0 || vectors_max != vectors_actual - 1) __builtin_abort (); #undef VECTORS } /* High value. */ /* GR, WS, VS. */ { /* There is no actual limit for the number of gangs, so we try with a rather high value. */ int gangs = 12345; int gangs_actual = gangs; int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max; gangs_min = workers_min = vectors_min = INT_MAX; gangs_max = workers_max = vectors_max = INT_MIN; #pragma acc parallel copy (gangs_actual) \ reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max) \ num_gangs (gangs) { if (acc_on_device (acc_device_host)) { /* We're actually executing with num_gangs (1). */ gangs_actual = 1; } /* As we're executing GR not GP, don't multiply with a "gangs_actual" factor. */ for (int i = 100 /* * gangs_actual */; i > -100 /* * gangs_actual */; --i) { gangs_min = gangs_max = acc_gang (); workers_min = workers_max = acc_worker (); vectors_min = vectors_max = acc_vector (); } } if (gangs_actual < 1) __builtin_abort (); if (gangs_min != 0 || gangs_max != gangs_actual - 1 || workers_min != 0 || workers_max != 0 || vectors_min != 0 || vectors_max != 0) __builtin_abort (); } /* GP, WS, VS. */ { /* There is no actual limit for the number of gangs, so we try with a rather high value. */ int gangs = 12345; int gangs_actual = gangs; int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max; gangs_min = workers_min = vectors_min = INT_MAX; gangs_max = workers_max = vectors_max = INT_MIN; #pragma acc parallel copy (gangs_actual) \ num_gangs (gangs) { if (acc_on_device (acc_device_host)) { /* We're actually executing with num_gangs (1). */ gangs_actual = 1; } #pragma acc loop gang reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max) for (int i = 100 * gangs_actual; i > -100 * gangs_actual; --i) { gangs_min = gangs_max = acc_gang (); workers_min = workers_max = acc_worker (); vectors_min = vectors_max = acc_vector (); } } if (gangs_actual < 1) __builtin_abort (); if (gangs_min != 0 || gangs_max != gangs_actual - 1 || workers_min != 0 || workers_max != 0 || vectors_min != 0 || vectors_max != 0) __builtin_abort (); } /* GR, WP, VS. */ { /* We try with an outrageously large value. */ #define WORKERS 2 << 20 int workers_actual = WORKERS; int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max; gangs_min = workers_min = vectors_min = INT_MAX; gangs_max = workers_max = vectors_max = INT_MIN; #pragma acc parallel copy (workers_actual) /* { dg-warning "using num_workers \\(32\\), ignoring 2097152" "" { target openacc_nvidia_accel_configured } } */ \ num_workers (WORKERS) { if (acc_on_device (acc_device_host)) { /* We're actually executing with num_workers (1). */ workers_actual = 1; } else if (acc_on_device (acc_device_nvidia)) { /* The GCC nvptx back end enforces num_workers (32). */ workers_actual = 32; } else __builtin_abort (); #pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max) for (int i = 100 * workers_actual; i > -100 * workers_actual; --i) { gangs_min = gangs_max = acc_gang (); workers_min = workers_max = acc_worker (); vectors_min = vectors_max = acc_vector (); } } if (workers_actual < 1) __builtin_abort (); if (gangs_min != 0 || gangs_max != 0 || workers_min != 0 || workers_max != workers_actual - 1 || vectors_min != 0 || vectors_max != 0) __builtin_abort (); #undef WORKERS } /* GR, WP, VS. */ { /* We try with an outrageously large value. */ int workers = 2 << 20; /* For nvptx offloading, this one will not result in "using num_workers (32), ignoring runtime setting", and will in fact try to launch with "num_workers (workers)", which will run into "libgomp: cuLaunchKernel error: invalid argument". So, limit ourselves here. */ if (acc_get_device_type () == acc_device_nvidia) workers = 32; int workers_actual = workers; int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max; gangs_min = workers_min = vectors_min = INT_MAX; gangs_max = workers_max = vectors_max = INT_MIN; #pragma acc parallel copy (workers_actual) \ num_workers (workers) { if (acc_on_device (acc_device_host)) { /* We're actually executing with num_workers (1). */ workers_actual = 1; } else if (acc_on_device (acc_device_nvidia)) { /* We're actually executing with num_workers (32). */ /* workers_actual = 32; */ } else __builtin_abort (); #pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max) for (int i = 100 * workers_actual; i > -100 * workers_actual; --i) { gangs_min = gangs_max = acc_gang (); workers_min = workers_max = acc_worker (); vectors_min = vectors_max = acc_vector (); } } if (workers_actual < 1) __builtin_abort (); if (gangs_min != 0 || gangs_max != 0 || workers_min != 0 || workers_max != workers_actual - 1 || vectors_min != 0 || vectors_max != 0) __builtin_abort (); } /* GR, WS, VP. */ { /* We try with an outrageously large value. */ #define VECTORS 2 << 20 int vectors_actual = VECTORS; int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max; gangs_min = workers_min = vectors_min = INT_MAX; gangs_max = workers_max = vectors_max = INT_MIN; #pragma acc parallel copy (vectors_actual) /* { dg-warning "using vector_length \\(32\\), ignoring 2097152" "" { target openacc_nvidia_accel_configured } } */ \ vector_length (VECTORS) { if (acc_on_device (acc_device_host)) { /* We're actually executing with vector_length (1). */ vectors_actual = 1; } else if (acc_on_device (acc_device_nvidia)) { /* The GCC nvptx back end enforces vector_length (32). */ vectors_actual = 32; } else __builtin_abort (); #pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max) for (int i = 100 * vectors_actual; i > -100 * vectors_actual; --i) { gangs_min = gangs_max = acc_gang (); workers_min = workers_max = acc_worker (); vectors_min = vectors_max = acc_vector (); } } if (vectors_actual < 1) __builtin_abort (); if (gangs_min != 0 || gangs_max != 0 || workers_min != 0 || workers_max != 0 || vectors_min != 0 || vectors_max != vectors_actual - 1) __builtin_abort (); #undef VECTORS } /* GR, WS, VP. */ { /* We try with an outrageously large value. */ int vectors = 2 << 20; int vectors_actual = vectors; int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max; gangs_min = workers_min = vectors_min = INT_MAX; gangs_max = workers_max = vectors_max = INT_MIN; #pragma acc parallel copy (vectors_actual) /* { dg-warning "using vector_length \\(32\\), ignoring runtime setting" "" { target openacc_nvidia_accel_configured } } */ \ vector_length (vectors) { if (acc_on_device (acc_device_host)) { /* We're actually executing with vector_length (1). */ vectors_actual = 1; } else if (acc_on_device (acc_device_nvidia)) { /* The GCC nvptx back end enforces vector_length (32). */ vectors_actual = 32; } else __builtin_abort (); #pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max) for (int i = 100 * vectors_actual; i > -100 * vectors_actual; --i) { gangs_min = gangs_max = acc_gang (); workers_min = workers_max = acc_worker (); vectors_min = vectors_max = acc_vector (); } } if (vectors_actual < 1) __builtin_abort (); if (gangs_min != 0 || gangs_max != 0 || workers_min != 0 || workers_max != 0 || vectors_min != 0 || vectors_max != vectors_actual - 1) __builtin_abort (); } /* Composition of GP, WP, VP. */ { int gangs = 12345; /* With nvptx offloading, multi-level reductions apparently are very slow in the following case. So, limit ourselves here. */ if (acc_get_device_type () == acc_device_nvidia) gangs = 3; int gangs_actual = gangs; #define WORKERS 3 int workers_actual = WORKERS; #define VECTORS 11 int vectors_actual = VECTORS; int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max; gangs_min = workers_min = vectors_min = INT_MAX; gangs_max = workers_max = vectors_max = INT_MIN; #pragma acc parallel copy (gangs_actual, workers_actual, vectors_actual) /* { dg-warning "using vector_length \\(32\\), ignoring 11" "" { target openacc_nvidia_accel_configured } } */ \ num_gangs (gangs) \ num_workers (WORKERS) \ vector_length (VECTORS) { if (acc_on_device (acc_device_host)) { /* We're actually executing with num_gangs (1), num_workers (1), vector_length (1). */ gangs_actual = 1; workers_actual = 1; vectors_actual = 1; } else if (acc_on_device (acc_device_nvidia)) { /* The GCC nvptx back end enforces vector_length (32). */ vectors_actual = 32; } else __builtin_abort (); #pragma acc loop gang reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max) for (int i = 100 * gangs_actual; i > -100 * gangs_actual; --i) #pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max) for (int j = 100 * workers_actual; j > -100 * workers_actual; --j) #pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max) for (int k = 100 * vectors_actual; k > -100 * vectors_actual; --k) { gangs_min = gangs_max = acc_gang (); workers_min = workers_max = acc_worker (); vectors_min = vectors_max = acc_vector (); } } if (gangs_min != 0 || gangs_max != gangs_actual - 1 || workers_min != 0 || workers_max != workers_actual - 1 || vectors_min != 0 || vectors_max != vectors_actual - 1) __builtin_abort (); #undef VECTORS #undef WORKERS } /* We can't test parallelized OpenACC kernels constructs in this way: use of the acc_gang, acc_worker, acc_vector functions will make the construct unparallelizable. */ /* Unparallelized OpenACC kernels constructs must get launched as 1 x 1 x 1 kernels. */ { int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max; gangs_min = workers_min = vectors_min = INT_MAX; gangs_max = workers_max = vectors_max = INT_MIN; #pragma acc kernels { /* This is to make the OpenACC kernels construct unparallelizable. */ asm volatile ("" : : : "memory"); #pragma acc loop reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max) for (int i = 100; i > -100; --i) { gangs_min = gangs_max = acc_gang (); workers_min = workers_max = acc_worker (); vectors_min = vectors_max = acc_vector (); } } if (gangs_min != 0 || gangs_max != 1 - 1 || workers_min != 0 || workers_max != 1 - 1 || vectors_min != 0 || vectors_max != 1 - 1) __builtin_abort (); } /* Unparallelized OpenACC kernels constructs must get launched as 1 x 1 x 1 kernels even when there are explicit num_gangs, num_workers, or vector_length clauses. */ { int gangs = 5; #define WORKERS 5 #define VECTORS 13 int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max; gangs_min = workers_min = vectors_min = INT_MAX; gangs_max = workers_max = vectors_max = INT_MIN; #pragma acc kernels \ num_gangs (gangs) \ num_workers (WORKERS) \ vector_length (VECTORS) { /* This is to make the OpenACC kernels construct unparallelizable. */ asm volatile ("" : : : "memory"); #pragma acc loop reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max) for (int i = 100; i > -100; --i) { gangs_min = gangs_max = acc_gang (); workers_min = workers_max = acc_worker (); vectors_min = vectors_max = acc_vector (); } } if (gangs_min != 0 || gangs_max != 1 - 1 || workers_min != 0 || workers_max != 1 - 1 || vectors_min != 0 || vectors_max != 1 - 1) __builtin_abort (); #undef VECTORS #undef WORKERS } return 0; }
/** * \brief Creates and initializes the working data for the plan * \param plan The Plan struct that holds the plan's data values. * \return Error flag value */ int initDOPENACCGEMMPlan(void *plan){ // <- Replace YOUR_NAME with the name of your module. if(!plan){ return make_error(ALLOC, generic_err); // <- This is the error code for one of the malloc fails. } Plan *p; DOPENACCGEMM_DATA *d; p = (Plan *)plan; #ifdef HAVE_PAPI int temp_event, i; int PAPI_Events [NUM_PAPI_EVENTS] = PAPI_COUNTERS; char *PAPI_units [NUM_PAPI_EVENTS] = PAPI_UNITS; #endif //HAVE_PAPI if(p){ d = (DOPENACCGEMM_DATA *)p->vptr; p->exec_count = 0; // Initialize the plan execution count to zero. perftimer_init(&p->timers, NUM_TIMERS); // Initialize all performance timers to zero. #ifdef HAVE_PAPI /* Initialize plan's PAPI data */ p->PAPI_EventSet = PAPI_NULL; p->PAPI_Num_Events = 0; TEST_PAPI(PAPI_create_eventset(&p->PAPI_EventSet), PAPI_OK, MyRank, 9999, PRINT_SOME); //Add the desired events to the Event Set; ensure the dsired counters // are on the system then add, ignore otherwise for(i = 0; i < TOTAL_PAPI_EVENTS && i < NUM_PAPI_EVENTS; i++){ temp_event = PAPI_Events[i]; if(PAPI_query_event(temp_event) == PAPI_OK){ p->PAPI_Num_Events++; TEST_PAPI(PAPI_add_event(p->PAPI_EventSet, temp_event), PAPI_OK, MyRank, 9999, PRINT_SOME); } } PAPIRes_init(p->PAPI_Results, p->PAPI_Times); PAPI_set_units(p->name, PAPI_units, NUM_PAPI_EVENTS); TEST_PAPI(PAPI_start(p->PAPI_EventSet), PAPI_OK, MyRank, 9999, PRINT_SOME); #endif //HAVE_PAPI } if(d){ int error; acc_device_t my_device = acc_get_device_type(); acc_set_device_num(d->device_id, my_device); //When OpenACC can report back on accelerator size, these two lines should be enabled //d->device_memory = system_burn_accelerator_memory(d->device_id); //d->device_memory -= SUB_FACTOR; d->M = ((int)sqrt(d->device_memory/sizeof(double))) / 3; size_t page_size = sysconf(_SC_PAGESIZE); error = posix_memalign((void **)&(d->A_buffer),page_size,d->M*d->M*sizeof(double)); assert(error==0); error = posix_memalign((void **)&(d->B_buffer),page_size,d->M*d->M*sizeof(double)); assert(error==0); error = posix_memalign((void **)&(d->C_buffer),page_size,d->M*d->M*sizeof(double)); assert(error==0); for(size_t idx=0; idx < d->M*d->M; idx++) { d->A_buffer[idx] = (double)4.5; d->B_buffer[idx] = (double)2.0; d->C_buffer[idx] = (double)0.0; } } return ERR_CLEAN; // <- This indicates a clean run with no errors. Does not need to be changed. } /* initDOPENACCGEMMPlan */