static unsigned int __attribute__ ((optimize ("O2"))) acc_vector () { if (acc_on_device ((int) acc_device_host)) return 0; else if (acc_on_device ((int) acc_device_nvidia)) return __builtin_goacc_parlevel_id (GOMP_DIM_VECTOR); else __builtin_abort (); }
int f (void) { int r = 0; r |= acc_on_device (); r |= acc_on_device (1, 2); r |= acc_on_device (3.14); r |= acc_on_device ("hello"); return r; }
void t1 () { int ok = 1; int val = 2; int ary[32]; int ondev = 0; for (int i = 0; i < 32; i++) ary[i] = ~0; #pragma acc parallel num_gangs (32) copy (ok) firstprivate (val) copy(ary, ondev) { ondev = acc_on_device (acc_device_not_host); #pragma acc loop gang(static:1) for (unsigned i = 0; i < 32; i++) { if (val != 2) ok = 0; val += i; ary[i] = val; } } if (ondev) { if (!ok) __builtin_abort (); if (val != 2) __builtin_abort (); for (int i = 0; i < 32; i++) if (ary[i] != 2 + i) __builtin_abort (); } }
int main () { int ix; int ondev = 0; int q = 0, h = 0; #pragma acc parallel vector_length(32) copy(q) copy(ondev) { int t = q; #pragma acc loop vector reduction (+:t) for (unsigned ix = 0; ix < N; ix++) { int val = ix; if (acc_on_device (acc_device_not_host)) { int g, w, v; g = __builtin_goacc_parlevel_id (GOMP_DIM_GANG); w = __builtin_goacc_parlevel_id (GOMP_DIM_WORKER); v = __builtin_goacc_parlevel_id (GOMP_DIM_VECTOR); val = (g << 16) | (w << 8) | v; ondev = 1; } t += val; } q = t; } for (ix = 0; ix < N; ix++) { int val = ix; if (ondev) { int g = 0; int w = 0; int v = ix % 32; val = (g << 16) | (w << 8) | v; } h += val; } if (q != h) { printf ("t=%x expected %x\n", q, h); return 1; } return 0; }
int main () { int dev; #pragma acc parallel copyout (dev) { dev = acc_on_device (acc_device_not_host); } int expect = 1; #if ACC_DEVICE_TYPE_host expect = 0; #endif return dev != expect; }
void __attribute__ ((noinline)) gang (int ary[N]) { #pragma acc loop gang for (unsigned ix = 0; ix < N; ix++) { if (acc_on_device (acc_device_not_host)) { int g, w, v; g = __builtin_goacc_parlevel_id (GOMP_DIM_GANG); w = __builtin_goacc_parlevel_id (GOMP_DIM_WORKER); v = __builtin_goacc_parlevel_id (GOMP_DIM_VECTOR); ary[ix] = (g << 16) | (w << 8) | v; } else ary[ix] = ix; } }
int main () { int ary[N]; int ix; int exit = 0; int ondev = 0; for (ix = 0; ix < N;ix++) ary[ix] = -1; #pragma acc parallel num_gangs(32) copy(ary) copy(ondev) { ondev = acc_on_device (acc_device_not_host); gang (ary); } for (ix = 0; ix < N; ix++) { int expected = ix; if(ondev) { int g = ix / ((N + 31) / 32); int w = 0; int v = 0; expected = (g << 16) | (w << 8) | v; } if (ary[ix] != expected) { exit = 1; printf ("ary[%d]=%x expected %x\n", ix, ary[ix], expected); } } return exit; }
int main () { int ary[N]; int ix; int exit = 0; int ondev = 0; for (ix = 0; ix < N;ix++) ary[ix] = -1; #pragma acc parallel num_workers(32) vector_length(32) copy(ary) copy(ondev) { ondev = acc_on_device (acc_device_not_host); worker (ary); } for (ix = 0; ix < N; ix++) { int expected = ix; if(ondev) { int g = 0; int w = ix % 32; int v = 0; expected = (g << 16) | (w << 8) | v; } if (ary[ix] != expected) { exit = 1; printf ("ary[%d]=%x expected %x\n", ix, ary[ix], expected); } } return exit; }
int f (void) { const acc_device_t dev = acc_device_X; return acc_on_device (dev); }
int main () { acc_init (acc_device_default); /* Non-positive value. */ /* GR, WS, VS. */ { #define GANGS 0 /* { dg-warning "'num_gangs' value must be positive" "" { target c } } */ int gangs_actual = GANGS; int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max; gangs_min = workers_min = vectors_min = INT_MAX; gangs_max = workers_max = vectors_max = INT_MIN; #pragma acc parallel copy (gangs_actual) \ reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max) \ num_gangs (GANGS) /* { dg-warning "'num_gangs' value must be positive" "" { target c++ } } */ { /* We're actually executing with num_gangs (1). */ gangs_actual = 1; for (int i = 100 * gangs_actual; i > -100 * gangs_actual; --i) { /* <https://gcc.gnu.org/PR80547>. */ #if 0 gangs_min = gangs_max = acc_gang (); workers_min = workers_max = acc_worker (); vectors_min = vectors_max = acc_vector (); #else int gangs = acc_gang (); gangs_min = (gangs_min < gangs) ? gangs_min : gangs; gangs_max = (gangs_max > gangs) ? gangs_max : gangs; int workers = acc_worker (); workers_min = (workers_min < workers) ? workers_min : workers; workers_max = (workers_max > workers) ? workers_max : workers; int vectors = acc_vector (); vectors_min = (vectors_min < vectors) ? vectors_min : vectors; vectors_max = (vectors_max > vectors) ? vectors_max : vectors; #endif } } if (gangs_actual != 1) __builtin_abort (); if (gangs_min != 0 || gangs_max != gangs_actual - 1 || workers_min != 0 || workers_max != 0 || vectors_min != 0 || vectors_max != 0) __builtin_abort (); #undef GANGS } /* GP, WS, VS. */ { #define GANGS 0 /* { dg-warning "'num_gangs' value must be positive" "" { target c } } */ int gangs_actual = GANGS; int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max; gangs_min = workers_min = vectors_min = INT_MAX; gangs_max = workers_max = vectors_max = INT_MIN; #pragma acc parallel copy (gangs_actual) \ num_gangs (GANGS) /* { dg-warning "'num_gangs' value must be positive" "" { target c++ } } */ { /* We're actually executing with num_gangs (1). */ gangs_actual = 1; #pragma acc loop gang reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max) for (int i = 100 * gangs_actual; i > -100 * gangs_actual; --i) { gangs_min = gangs_max = acc_gang (); workers_min = workers_max = acc_worker (); vectors_min = vectors_max = acc_vector (); } } if (gangs_actual != 1) __builtin_abort (); if (gangs_min != 0 || gangs_max != gangs_actual - 1 || workers_min != 0 || workers_max != 0 || vectors_min != 0 || vectors_max != 0) __builtin_abort (); #undef GANGS } /* GR, WP, VS. */ { #define WORKERS 0 /* { dg-warning "'num_workers' value must be positive" "" { target c } } */ int workers_actual = WORKERS; int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max; gangs_min = workers_min = vectors_min = INT_MAX; gangs_max = workers_max = vectors_max = INT_MIN; #pragma acc parallel copy (workers_actual) \ num_workers (WORKERS) /* { dg-warning "'num_workers' value must be positive" "" { target c++ } } */ { /* We're actually executing with num_workers (1). */ workers_actual = 1; #pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max) for (int i = 100 * workers_actual; i > -100 * workers_actual; --i) { gangs_min = gangs_max = acc_gang (); workers_min = workers_max = acc_worker (); vectors_min = vectors_max = acc_vector (); } } if (workers_actual != 1) __builtin_abort (); if (gangs_min != 0 || gangs_max != 0 || workers_min != 0 || workers_max != workers_actual - 1 || vectors_min != 0 || vectors_max != 0) __builtin_abort (); #undef WORKERS } /* GR, WS, VP. */ { #define VECTORS 0 /* { dg-warning "'vector_length' value must be positive" "" { target c } } */ int vectors_actual = VECTORS; int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max; gangs_min = workers_min = vectors_min = INT_MAX; gangs_max = workers_max = vectors_max = INT_MIN; #pragma acc parallel copy (vectors_actual) /* { dg-warning "using vector_length \\(32\\), ignoring 1" "" { target openacc_nvidia_accel_configured } } */ \ vector_length (VECTORS) /* { dg-warning "'vector_length' value must be positive" "" { target c++ } } */ { /* We're actually executing with vector_length (1), just the GCC nvptx back end enforces vector_length (32). */ if (acc_on_device (acc_device_nvidia)) vectors_actual = 32; else vectors_actual = 1; #pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max) for (int i = 100 * vectors_actual; i > -100 * vectors_actual; --i) { gangs_min = gangs_max = acc_gang (); workers_min = workers_max = acc_worker (); vectors_min = vectors_max = acc_vector (); } } if (acc_get_device_type () == acc_device_nvidia) { if (vectors_actual != 32) __builtin_abort (); } else if (vectors_actual != 1) __builtin_abort (); if (gangs_min != 0 || gangs_max != 0 || workers_min != 0 || workers_max != 0 || vectors_min != 0 || vectors_max != vectors_actual - 1) __builtin_abort (); #undef VECTORS } /* High value. */ /* GR, WS, VS. */ { /* There is no actual limit for the number of gangs, so we try with a rather high value. */ int gangs = 12345; int gangs_actual = gangs; int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max; gangs_min = workers_min = vectors_min = INT_MAX; gangs_max = workers_max = vectors_max = INT_MIN; #pragma acc parallel copy (gangs_actual) \ reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max) \ num_gangs (gangs) { if (acc_on_device (acc_device_host)) { /* We're actually executing with num_gangs (1). */ gangs_actual = 1; } /* As we're executing GR not GP, don't multiply with a "gangs_actual" factor. */ for (int i = 100 /* * gangs_actual */; i > -100 /* * gangs_actual */; --i) { gangs_min = gangs_max = acc_gang (); workers_min = workers_max = acc_worker (); vectors_min = vectors_max = acc_vector (); } } if (gangs_actual < 1) __builtin_abort (); if (gangs_min != 0 || gangs_max != gangs_actual - 1 || workers_min != 0 || workers_max != 0 || vectors_min != 0 || vectors_max != 0) __builtin_abort (); } /* GP, WS, VS. */ { /* There is no actual limit for the number of gangs, so we try with a rather high value. */ int gangs = 12345; int gangs_actual = gangs; int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max; gangs_min = workers_min = vectors_min = INT_MAX; gangs_max = workers_max = vectors_max = INT_MIN; #pragma acc parallel copy (gangs_actual) \ num_gangs (gangs) { if (acc_on_device (acc_device_host)) { /* We're actually executing with num_gangs (1). */ gangs_actual = 1; } #pragma acc loop gang reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max) for (int i = 100 * gangs_actual; i > -100 * gangs_actual; --i) { gangs_min = gangs_max = acc_gang (); workers_min = workers_max = acc_worker (); vectors_min = vectors_max = acc_vector (); } } if (gangs_actual < 1) __builtin_abort (); if (gangs_min != 0 || gangs_max != gangs_actual - 1 || workers_min != 0 || workers_max != 0 || vectors_min != 0 || vectors_max != 0) __builtin_abort (); } /* GR, WP, VS. */ { /* We try with an outrageously large value. */ #define WORKERS 2 << 20 int workers_actual = WORKERS; int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max; gangs_min = workers_min = vectors_min = INT_MAX; gangs_max = workers_max = vectors_max = INT_MIN; #pragma acc parallel copy (workers_actual) /* { dg-warning "using num_workers \\(32\\), ignoring 2097152" "" { target openacc_nvidia_accel_configured } } */ \ num_workers (WORKERS) { if (acc_on_device (acc_device_host)) { /* We're actually executing with num_workers (1). */ workers_actual = 1; } else if (acc_on_device (acc_device_nvidia)) { /* The GCC nvptx back end enforces num_workers (32). */ workers_actual = 32; } else __builtin_abort (); #pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max) for (int i = 100 * workers_actual; i > -100 * workers_actual; --i) { gangs_min = gangs_max = acc_gang (); workers_min = workers_max = acc_worker (); vectors_min = vectors_max = acc_vector (); } } if (workers_actual < 1) __builtin_abort (); if (gangs_min != 0 || gangs_max != 0 || workers_min != 0 || workers_max != workers_actual - 1 || vectors_min != 0 || vectors_max != 0) __builtin_abort (); #undef WORKERS } /* GR, WP, VS. */ { /* We try with an outrageously large value. */ int workers = 2 << 20; /* For nvptx offloading, this one will not result in "using num_workers (32), ignoring runtime setting", and will in fact try to launch with "num_workers (workers)", which will run into "libgomp: cuLaunchKernel error: invalid argument". So, limit ourselves here. */ if (acc_get_device_type () == acc_device_nvidia) workers = 32; int workers_actual = workers; int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max; gangs_min = workers_min = vectors_min = INT_MAX; gangs_max = workers_max = vectors_max = INT_MIN; #pragma acc parallel copy (workers_actual) \ num_workers (workers) { if (acc_on_device (acc_device_host)) { /* We're actually executing with num_workers (1). */ workers_actual = 1; } else if (acc_on_device (acc_device_nvidia)) { /* We're actually executing with num_workers (32). */ /* workers_actual = 32; */ } else __builtin_abort (); #pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max) for (int i = 100 * workers_actual; i > -100 * workers_actual; --i) { gangs_min = gangs_max = acc_gang (); workers_min = workers_max = acc_worker (); vectors_min = vectors_max = acc_vector (); } } if (workers_actual < 1) __builtin_abort (); if (gangs_min != 0 || gangs_max != 0 || workers_min != 0 || workers_max != workers_actual - 1 || vectors_min != 0 || vectors_max != 0) __builtin_abort (); } /* GR, WS, VP. */ { /* We try with an outrageously large value. */ #define VECTORS 2 << 20 int vectors_actual = VECTORS; int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max; gangs_min = workers_min = vectors_min = INT_MAX; gangs_max = workers_max = vectors_max = INT_MIN; #pragma acc parallel copy (vectors_actual) /* { dg-warning "using vector_length \\(32\\), ignoring 2097152" "" { target openacc_nvidia_accel_configured } } */ \ vector_length (VECTORS) { if (acc_on_device (acc_device_host)) { /* We're actually executing with vector_length (1). */ vectors_actual = 1; } else if (acc_on_device (acc_device_nvidia)) { /* The GCC nvptx back end enforces vector_length (32). */ vectors_actual = 32; } else __builtin_abort (); #pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max) for (int i = 100 * vectors_actual; i > -100 * vectors_actual; --i) { gangs_min = gangs_max = acc_gang (); workers_min = workers_max = acc_worker (); vectors_min = vectors_max = acc_vector (); } } if (vectors_actual < 1) __builtin_abort (); if (gangs_min != 0 || gangs_max != 0 || workers_min != 0 || workers_max != 0 || vectors_min != 0 || vectors_max != vectors_actual - 1) __builtin_abort (); #undef VECTORS } /* GR, WS, VP. */ { /* We try with an outrageously large value. */ int vectors = 2 << 20; int vectors_actual = vectors; int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max; gangs_min = workers_min = vectors_min = INT_MAX; gangs_max = workers_max = vectors_max = INT_MIN; #pragma acc parallel copy (vectors_actual) /* { dg-warning "using vector_length \\(32\\), ignoring runtime setting" "" { target openacc_nvidia_accel_configured } } */ \ vector_length (vectors) { if (acc_on_device (acc_device_host)) { /* We're actually executing with vector_length (1). */ vectors_actual = 1; } else if (acc_on_device (acc_device_nvidia)) { /* The GCC nvptx back end enforces vector_length (32). */ vectors_actual = 32; } else __builtin_abort (); #pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max) for (int i = 100 * vectors_actual; i > -100 * vectors_actual; --i) { gangs_min = gangs_max = acc_gang (); workers_min = workers_max = acc_worker (); vectors_min = vectors_max = acc_vector (); } } if (vectors_actual < 1) __builtin_abort (); if (gangs_min != 0 || gangs_max != 0 || workers_min != 0 || workers_max != 0 || vectors_min != 0 || vectors_max != vectors_actual - 1) __builtin_abort (); } /* Composition of GP, WP, VP. */ { int gangs = 12345; /* With nvptx offloading, multi-level reductions apparently are very slow in the following case. So, limit ourselves here. */ if (acc_get_device_type () == acc_device_nvidia) gangs = 3; int gangs_actual = gangs; #define WORKERS 3 int workers_actual = WORKERS; #define VECTORS 11 int vectors_actual = VECTORS; int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max; gangs_min = workers_min = vectors_min = INT_MAX; gangs_max = workers_max = vectors_max = INT_MIN; #pragma acc parallel copy (gangs_actual, workers_actual, vectors_actual) /* { dg-warning "using vector_length \\(32\\), ignoring 11" "" { target openacc_nvidia_accel_configured } } */ \ num_gangs (gangs) \ num_workers (WORKERS) \ vector_length (VECTORS) { if (acc_on_device (acc_device_host)) { /* We're actually executing with num_gangs (1), num_workers (1), vector_length (1). */ gangs_actual = 1; workers_actual = 1; vectors_actual = 1; } else if (acc_on_device (acc_device_nvidia)) { /* The GCC nvptx back end enforces vector_length (32). */ vectors_actual = 32; } else __builtin_abort (); #pragma acc loop gang reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max) for (int i = 100 * gangs_actual; i > -100 * gangs_actual; --i) #pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max) for (int j = 100 * workers_actual; j > -100 * workers_actual; --j) #pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max) for (int k = 100 * vectors_actual; k > -100 * vectors_actual; --k) { gangs_min = gangs_max = acc_gang (); workers_min = workers_max = acc_worker (); vectors_min = vectors_max = acc_vector (); } } if (gangs_min != 0 || gangs_max != gangs_actual - 1 || workers_min != 0 || workers_max != workers_actual - 1 || vectors_min != 0 || vectors_max != vectors_actual - 1) __builtin_abort (); #undef VECTORS #undef WORKERS } /* We can't test parallelized OpenACC kernels constructs in this way: use of the acc_gang, acc_worker, acc_vector functions will make the construct unparallelizable. */ /* Unparallelized OpenACC kernels constructs must get launched as 1 x 1 x 1 kernels. */ { int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max; gangs_min = workers_min = vectors_min = INT_MAX; gangs_max = workers_max = vectors_max = INT_MIN; #pragma acc kernels { /* This is to make the OpenACC kernels construct unparallelizable. */ asm volatile ("" : : : "memory"); #pragma acc loop reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max) for (int i = 100; i > -100; --i) { gangs_min = gangs_max = acc_gang (); workers_min = workers_max = acc_worker (); vectors_min = vectors_max = acc_vector (); } } if (gangs_min != 0 || gangs_max != 1 - 1 || workers_min != 0 || workers_max != 1 - 1 || vectors_min != 0 || vectors_max != 1 - 1) __builtin_abort (); } /* Unparallelized OpenACC kernels constructs must get launched as 1 x 1 x 1 kernels even when there are explicit num_gangs, num_workers, or vector_length clauses. */ { int gangs = 5; #define WORKERS 5 #define VECTORS 13 int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max; gangs_min = workers_min = vectors_min = INT_MAX; gangs_max = workers_max = vectors_max = INT_MIN; #pragma acc kernels \ num_gangs (gangs) \ num_workers (WORKERS) \ vector_length (VECTORS) { /* This is to make the OpenACC kernels construct unparallelizable. */ asm volatile ("" : : : "memory"); #pragma acc loop reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max) for (int i = 100; i > -100; --i) { gangs_min = gangs_max = acc_gang (); workers_min = workers_max = acc_worker (); vectors_min = vectors_max = acc_vector (); } } if (gangs_min != 0 || gangs_max != 1 - 1 || workers_min != 0 || workers_max != 1 - 1 || vectors_min != 0 || vectors_max != 1 - 1) __builtin_abort (); #undef VECTORS #undef WORKERS } return 0; }