static unsigned int __attribute__ ((optimize ("O2"))) acc_vector ()
{
  if (acc_on_device ((int) acc_device_host))
    return 0;
  else if (acc_on_device ((int) acc_device_nvidia))
    return __builtin_goacc_parlevel_id (GOMP_DIM_VECTOR);
  else
    __builtin_abort ();
}
Esempio n. 2
0
int
f (void)
{
  int r = 0;

  r |= acc_on_device ();
  r |= acc_on_device (1, 2);
  r |= acc_on_device (3.14);
  r |= acc_on_device ("hello");

  return r;
}
Esempio n. 3
0
void t1 ()
{
  int ok = 1;
  int val = 2;
  int ary[32];
  int ondev = 0;

  for (int i = 0; i < 32; i++)
    ary[i] = ~0;
  
#pragma acc parallel num_gangs (32) copy (ok) firstprivate (val) copy(ary, ondev)
  {
    ondev = acc_on_device (acc_device_not_host);
#pragma acc loop gang(static:1)
    for (unsigned i = 0; i < 32; i++)
      {
	if (val != 2)
	  ok = 0;
	val += i;
	ary[i] = val;
      }
  }

  if (ondev)
    {
      if (!ok)
	__builtin_abort ();
      if (val != 2)
	__builtin_abort ();

      for (int i = 0; i < 32; i++)
	if (ary[i] != 2 + i)
	  __builtin_abort ();
    }
}
int main ()
{
  int ix;
  int ondev = 0;
  int q = 0,  h = 0;

#pragma acc parallel vector_length(32) copy(q) copy(ondev)
  {
    int t = q;
    
#pragma acc loop vector reduction (+:t)
    for (unsigned ix = 0; ix < N; ix++)
      {
	int val = ix;
	
	if (acc_on_device (acc_device_not_host))
	  {
	    int g, w, v;

	    g = __builtin_goacc_parlevel_id (GOMP_DIM_GANG);
	    w = __builtin_goacc_parlevel_id (GOMP_DIM_WORKER);
	    v = __builtin_goacc_parlevel_id (GOMP_DIM_VECTOR);
	    val = (g << 16) | (w << 8) | v;
	    ondev = 1;
	  }
	t += val;
      }
    q = t;
  }

  for (ix = 0; ix < N; ix++)
    {
      int val = ix;
      if (ondev)
	{
	  int g = 0;
	  int w = 0;
	  int v = ix % 32;

	  val = (g << 16) | (w << 8) | v;
	}
      h += val;
    }

  if (q != h)
    {
      printf ("t=%x expected %x\n", q, h);
      return 1;
    }
  
  return 0;
}
Esempio n. 5
0
int main ()
{
  int dev;
  
#pragma acc parallel copyout (dev)
  {
    dev = acc_on_device (acc_device_not_host);
  }

  int expect = 1;
  
#if  ACC_DEVICE_TYPE_host
  expect = 0;
#endif
  
  return dev != expect;
}
void __attribute__ ((noinline)) gang (int ary[N])
{
#pragma acc loop gang
    for (unsigned ix = 0; ix < N; ix++)
      {
	if (acc_on_device (acc_device_not_host))
	  {
	    int g, w, v;

	    g = __builtin_goacc_parlevel_id (GOMP_DIM_GANG);
	    w = __builtin_goacc_parlevel_id (GOMP_DIM_WORKER);
	    v = __builtin_goacc_parlevel_id (GOMP_DIM_VECTOR);
	    ary[ix] = (g << 16) | (w << 8) | v;
	  }
	else
	  ary[ix] = ix;
      }
}
int main ()
{
  int ary[N];
  int ix;
  int exit = 0;
  int ondev = 0;

  for (ix = 0; ix < N;ix++)
    ary[ix] = -1;
  
#pragma acc parallel num_gangs(32) copy(ary) copy(ondev)
  {
    ondev = acc_on_device (acc_device_not_host);
    gang (ary);
  }

  for (ix = 0; ix < N; ix++)
    {
      int expected = ix;
      if(ondev)
	{
	  int g = ix / ((N + 31) / 32);
	  int w = 0;
	  int v = 0;

	  expected = (g << 16) | (w << 8) | v;
	}
      
      if (ary[ix] != expected)
	{
	  exit = 1;
	  printf ("ary[%d]=%x expected %x\n", ix, ary[ix], expected);
	}
    }
  
  return exit;
}
int main ()
{
  int ary[N];
  int ix;
  int exit = 0;
  int ondev = 0;

  for (ix = 0; ix < N;ix++)
    ary[ix] = -1;
  
#pragma acc parallel num_workers(32) vector_length(32) copy(ary) copy(ondev)
  {
    ondev = acc_on_device (acc_device_not_host);
    worker (ary);
  }

  for (ix = 0; ix < N; ix++)
    {
      int expected = ix;
      if(ondev)
	{
	  int g = 0;
	  int w = ix % 32;
	  int v = 0;

	  expected = (g << 16) | (w << 8) | v;
	}
      
      if (ary[ix] != expected)
	{
	  exit = 1;
	  printf ("ary[%d]=%x expected %x\n", ix, ary[ix], expected);
	}
    }
  
  return exit;
}
Esempio n. 9
0
int
f (void)
{
  const acc_device_t dev = acc_device_X;
  return acc_on_device (dev);
}
int main ()
{
  acc_init (acc_device_default);

  /* Non-positive value.  */

  /* GR, WS, VS.  */
  {
#define GANGS 0 /* { dg-warning "'num_gangs' value must be positive" "" { target c } } */
    int gangs_actual = GANGS;
    int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
    gangs_min = workers_min = vectors_min = INT_MAX;
    gangs_max = workers_max = vectors_max = INT_MIN;
#pragma acc parallel copy (gangs_actual) \
  reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max) \
  num_gangs (GANGS) /* { dg-warning "'num_gangs' value must be positive" "" { target c++ } } */
    {
      /* We're actually executing with num_gangs (1).  */
      gangs_actual = 1;
      for (int i = 100 * gangs_actual; i > -100 * gangs_actual; --i)
	{
	  /* <https://gcc.gnu.org/PR80547>.  */
#if 0
	  gangs_min = gangs_max = acc_gang ();
	  workers_min = workers_max = acc_worker ();
	  vectors_min = vectors_max = acc_vector ();
#else
	  int gangs = acc_gang ();
	  gangs_min = (gangs_min < gangs) ? gangs_min : gangs;
	  gangs_max = (gangs_max > gangs) ? gangs_max : gangs;
	  int workers = acc_worker ();
	  workers_min = (workers_min < workers) ? workers_min : workers;
	  workers_max = (workers_max > workers) ? workers_max : workers;
	  int vectors = acc_vector ();
	  vectors_min = (vectors_min < vectors) ? vectors_min : vectors;
	  vectors_max = (vectors_max > vectors) ? vectors_max : vectors;
#endif
	}
    }
    if (gangs_actual != 1)
      __builtin_abort ();
    if (gangs_min != 0 || gangs_max != gangs_actual - 1
	|| workers_min != 0 || workers_max != 0
	|| vectors_min != 0 || vectors_max != 0)
      __builtin_abort ();
#undef GANGS
  }

  /* GP, WS, VS.  */
  {
#define GANGS 0 /* { dg-warning "'num_gangs' value must be positive" "" { target c } } */
    int gangs_actual = GANGS;
    int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
    gangs_min = workers_min = vectors_min = INT_MAX;
    gangs_max = workers_max = vectors_max = INT_MIN;
#pragma acc parallel copy (gangs_actual) \
  num_gangs (GANGS) /* { dg-warning "'num_gangs' value must be positive" "" { target c++ } } */
    {
      /* We're actually executing with num_gangs (1).  */
      gangs_actual = 1;
#pragma acc loop gang reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
      for (int i = 100 * gangs_actual; i > -100 * gangs_actual; --i)
	{
	  gangs_min = gangs_max = acc_gang ();
	  workers_min = workers_max = acc_worker ();
	  vectors_min = vectors_max = acc_vector ();
	}
    }
    if (gangs_actual != 1)
      __builtin_abort ();
    if (gangs_min != 0 || gangs_max != gangs_actual - 1
	|| workers_min != 0 || workers_max != 0
	|| vectors_min != 0 || vectors_max != 0)
      __builtin_abort ();
#undef GANGS
  }

  /* GR, WP, VS.  */
  {
#define WORKERS 0 /* { dg-warning "'num_workers' value must be positive" "" { target c } } */
    int workers_actual = WORKERS;
    int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
    gangs_min = workers_min = vectors_min = INT_MAX;
    gangs_max = workers_max = vectors_max = INT_MIN;
#pragma acc parallel copy (workers_actual) \
  num_workers (WORKERS) /* { dg-warning "'num_workers' value must be positive" "" { target c++ } } */
    {
      /* We're actually executing with num_workers (1).  */
      workers_actual = 1;
#pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
      for (int i = 100 * workers_actual; i > -100 * workers_actual; --i)
	{
	  gangs_min = gangs_max = acc_gang ();
	  workers_min = workers_max = acc_worker ();
	  vectors_min = vectors_max = acc_vector ();
	}
    }
    if (workers_actual != 1)
      __builtin_abort ();
    if (gangs_min != 0 || gangs_max != 0
	|| workers_min != 0 || workers_max != workers_actual - 1
	|| vectors_min != 0 || vectors_max != 0)
      __builtin_abort ();
#undef WORKERS
  }

  /* GR, WS, VP.  */
  {
#define VECTORS 0 /* { dg-warning "'vector_length' value must be positive" "" { target c } } */
    int vectors_actual = VECTORS;
    int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
    gangs_min = workers_min = vectors_min = INT_MAX;
    gangs_max = workers_max = vectors_max = INT_MIN;
#pragma acc parallel copy (vectors_actual) /* { dg-warning "using vector_length \\(32\\), ignoring 1" "" { target openacc_nvidia_accel_configured } } */ \
  vector_length (VECTORS) /* { dg-warning "'vector_length' value must be positive" "" { target c++ } } */
    {
      /* We're actually executing with vector_length (1), just the GCC nvptx
	 back end enforces vector_length (32).  */
      if (acc_on_device (acc_device_nvidia))
	vectors_actual = 32;
      else
	vectors_actual = 1;
#pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
      for (int i = 100 * vectors_actual; i > -100 * vectors_actual; --i)
	{
	  gangs_min = gangs_max = acc_gang ();
	  workers_min = workers_max = acc_worker ();
	  vectors_min = vectors_max = acc_vector ();
	}
    }
    if (acc_get_device_type () == acc_device_nvidia)
      {
	if (vectors_actual != 32)
	  __builtin_abort ();
      }
    else
      if (vectors_actual != 1)
	__builtin_abort ();
    if (gangs_min != 0 || gangs_max != 0
	|| workers_min != 0 || workers_max != 0
	|| vectors_min != 0 || vectors_max != vectors_actual - 1)
      __builtin_abort ();
#undef VECTORS
  }


  /* High value.  */
  
  /* GR, WS, VS.  */
  {
    /* There is no actual limit for the number of gangs, so we try with a
       rather high value.  */
    int gangs = 12345;
    int gangs_actual = gangs;
    int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
    gangs_min = workers_min = vectors_min = INT_MAX;
    gangs_max = workers_max = vectors_max = INT_MIN;
#pragma acc parallel copy (gangs_actual) \
  reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max) \
  num_gangs (gangs)
    {
      if (acc_on_device (acc_device_host))
	{
	  /* We're actually executing with num_gangs (1).  */
	  gangs_actual = 1;
	}
      /* As we're executing GR not GP, don't multiply with a "gangs_actual"
	 factor.  */
      for (int i = 100 /* * gangs_actual */; i > -100 /* * gangs_actual */; --i)
	{
	  gangs_min = gangs_max = acc_gang ();
	  workers_min = workers_max = acc_worker ();
	  vectors_min = vectors_max = acc_vector ();
	}
    }
    if (gangs_actual < 1)
      __builtin_abort ();
    if (gangs_min != 0 || gangs_max != gangs_actual - 1
	|| workers_min != 0 || workers_max != 0
	|| vectors_min != 0 || vectors_max != 0)
      __builtin_abort ();
  }

  /* GP, WS, VS.  */
  {
    /* There is no actual limit for the number of gangs, so we try with a
       rather high value.  */
    int gangs = 12345;
    int gangs_actual = gangs;
    int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
    gangs_min = workers_min = vectors_min = INT_MAX;
    gangs_max = workers_max = vectors_max = INT_MIN;
#pragma acc parallel copy (gangs_actual) \
  num_gangs (gangs)
    {
      if (acc_on_device (acc_device_host))
	{
	  /* We're actually executing with num_gangs (1).  */
	  gangs_actual = 1;
	}
#pragma acc loop gang reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
      for (int i = 100 * gangs_actual; i > -100 * gangs_actual; --i)
	{
	  gangs_min = gangs_max = acc_gang ();
	  workers_min = workers_max = acc_worker ();
	  vectors_min = vectors_max = acc_vector ();
	}
    }
    if (gangs_actual < 1)
      __builtin_abort ();
    if (gangs_min != 0 || gangs_max != gangs_actual - 1
	|| workers_min != 0 || workers_max != 0
	|| vectors_min != 0 || vectors_max != 0)
      __builtin_abort ();
  }

  /* GR, WP, VS.  */
  {
    /* We try with an outrageously large value. */
#define WORKERS 2 << 20
    int workers_actual = WORKERS;
    int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
    gangs_min = workers_min = vectors_min = INT_MAX;
    gangs_max = workers_max = vectors_max = INT_MIN;
#pragma acc parallel copy (workers_actual) /* { dg-warning "using num_workers \\(32\\), ignoring 2097152" "" { target openacc_nvidia_accel_configured } } */ \
  num_workers (WORKERS)
    {
      if (acc_on_device (acc_device_host))
	{
	  /* We're actually executing with num_workers (1).  */
	  workers_actual = 1;
	}
      else if (acc_on_device (acc_device_nvidia))
	{
	  /* The GCC nvptx back end enforces num_workers (32).  */
	  workers_actual = 32;
	}
      else
	__builtin_abort ();
#pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
      for (int i = 100 * workers_actual; i > -100 * workers_actual; --i)
	{
	  gangs_min = gangs_max = acc_gang ();
	  workers_min = workers_max = acc_worker ();
	  vectors_min = vectors_max = acc_vector ();
	}
    }
    if (workers_actual < 1)
      __builtin_abort ();
    if (gangs_min != 0 || gangs_max != 0
	|| workers_min != 0 || workers_max != workers_actual - 1
	|| vectors_min != 0 || vectors_max != 0)
      __builtin_abort ();
#undef WORKERS
  }

  /* GR, WP, VS.  */
  {
    /* We try with an outrageously large value. */
    int workers = 2 << 20;
    /* For nvptx offloading, this one will not result in "using num_workers
       (32), ignoring runtime setting", and will in fact try to launch with
       "num_workers (workers)", which will run into "libgomp: cuLaunchKernel
       error: invalid argument".  So, limit ourselves here.  */
    if (acc_get_device_type () == acc_device_nvidia)
      workers = 32;
    int workers_actual = workers;
    int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
    gangs_min = workers_min = vectors_min = INT_MAX;
    gangs_max = workers_max = vectors_max = INT_MIN;
#pragma acc parallel copy (workers_actual) \
  num_workers (workers)
    {
      if (acc_on_device (acc_device_host))
	{
	  /* We're actually executing with num_workers (1).  */
	  workers_actual = 1;
	}
      else if (acc_on_device (acc_device_nvidia))
	{
	  /* We're actually executing with num_workers (32).  */
	  /* workers_actual = 32; */
	}
      else
	__builtin_abort ();
#pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
      for (int i = 100 * workers_actual; i > -100 * workers_actual; --i)
	{
	  gangs_min = gangs_max = acc_gang ();
	  workers_min = workers_max = acc_worker ();
	  vectors_min = vectors_max = acc_vector ();
	}
    }
    if (workers_actual < 1)
      __builtin_abort ();
    if (gangs_min != 0 || gangs_max != 0
	|| workers_min != 0 || workers_max != workers_actual - 1
	|| vectors_min != 0 || vectors_max != 0)
      __builtin_abort ();
  }

  /* GR, WS, VP.  */
  {
    /* We try with an outrageously large value. */
#define VECTORS 2 << 20
    int vectors_actual = VECTORS;
    int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
    gangs_min = workers_min = vectors_min = INT_MAX;
    gangs_max = workers_max = vectors_max = INT_MIN;
#pragma acc parallel copy (vectors_actual) /* { dg-warning "using vector_length \\(32\\), ignoring 2097152" "" { target openacc_nvidia_accel_configured } } */ \
  vector_length (VECTORS)
    {
      if (acc_on_device (acc_device_host))
	{
	  /* We're actually executing with vector_length (1).  */
	  vectors_actual = 1;
	}
      else if (acc_on_device (acc_device_nvidia))
	{
	  /* The GCC nvptx back end enforces vector_length (32).  */
	  vectors_actual = 32;
	}
      else
	__builtin_abort ();
#pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
      for (int i = 100 * vectors_actual; i > -100 * vectors_actual; --i)
	{
	  gangs_min = gangs_max = acc_gang ();
	  workers_min = workers_max = acc_worker ();
	  vectors_min = vectors_max = acc_vector ();
	}
    }
    if (vectors_actual < 1)
      __builtin_abort ();
    if (gangs_min != 0 || gangs_max != 0
	|| workers_min != 0 || workers_max != 0
	|| vectors_min != 0 || vectors_max != vectors_actual - 1)
      __builtin_abort ();
#undef VECTORS
  }

  /* GR, WS, VP.  */
  {
    /* We try with an outrageously large value. */
    int vectors = 2 << 20;
    int vectors_actual = vectors;
    int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
    gangs_min = workers_min = vectors_min = INT_MAX;
    gangs_max = workers_max = vectors_max = INT_MIN;
#pragma acc parallel copy (vectors_actual) /* { dg-warning "using vector_length \\(32\\), ignoring runtime setting" "" { target openacc_nvidia_accel_configured } } */ \
  vector_length (vectors)
    {
      if (acc_on_device (acc_device_host))
	{
	  /* We're actually executing with vector_length (1).  */
	  vectors_actual = 1;
	}
      else if (acc_on_device (acc_device_nvidia))
	{
	  /* The GCC nvptx back end enforces vector_length (32).  */
	  vectors_actual = 32;
	}
      else
	__builtin_abort ();
#pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
      for (int i = 100 * vectors_actual; i > -100 * vectors_actual; --i)
	{
	  gangs_min = gangs_max = acc_gang ();
	  workers_min = workers_max = acc_worker ();
	  vectors_min = vectors_max = acc_vector ();
	}
    }
    if (vectors_actual < 1)
      __builtin_abort ();
    if (gangs_min != 0 || gangs_max != 0
	|| workers_min != 0 || workers_max != 0
	|| vectors_min != 0 || vectors_max != vectors_actual - 1)
      __builtin_abort ();
  }


  /* Composition of GP, WP, VP.  */
  {
    int gangs = 12345;
    /* With nvptx offloading, multi-level reductions apparently are very slow
       in the following case.  So, limit ourselves here.  */
    if (acc_get_device_type () == acc_device_nvidia)
      gangs = 3;
    int gangs_actual = gangs;
#define WORKERS 3
    int workers_actual = WORKERS;
#define VECTORS 11
    int vectors_actual = VECTORS;
    int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
    gangs_min = workers_min = vectors_min = INT_MAX;
    gangs_max = workers_max = vectors_max = INT_MIN;
#pragma acc parallel copy (gangs_actual, workers_actual, vectors_actual) /* { dg-warning "using vector_length \\(32\\), ignoring 11" "" { target openacc_nvidia_accel_configured } } */ \
  num_gangs (gangs) \
  num_workers (WORKERS) \
  vector_length (VECTORS)
    {
      if (acc_on_device (acc_device_host))
	{
	  /* We're actually executing with num_gangs (1), num_workers (1),
	     vector_length (1).  */
	  gangs_actual = 1;
	  workers_actual = 1;
	  vectors_actual = 1;
	}
      else if (acc_on_device (acc_device_nvidia))
	{
	  /* The GCC nvptx back end enforces vector_length (32).  */
	  vectors_actual = 32;
	}
      else
	__builtin_abort ();
#pragma acc loop gang reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
      for (int i = 100 * gangs_actual; i > -100 * gangs_actual; --i)
#pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
	for (int j = 100 * workers_actual; j > -100 * workers_actual; --j)
#pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
	  for (int k = 100 * vectors_actual; k > -100 * vectors_actual; --k)
	    {
	      gangs_min = gangs_max = acc_gang ();
	      workers_min = workers_max = acc_worker ();
	      vectors_min = vectors_max = acc_vector ();
	    }
    }
    if (gangs_min != 0 || gangs_max != gangs_actual - 1
	|| workers_min != 0 || workers_max != workers_actual - 1
	|| vectors_min != 0 || vectors_max != vectors_actual - 1)
      __builtin_abort ();
#undef VECTORS
#undef WORKERS
  }


  /* We can't test parallelized OpenACC kernels constructs in this way: use of
     the acc_gang, acc_worker, acc_vector functions will make the construct
     unparallelizable.  */


  /* Unparallelized OpenACC kernels constructs must get launched as 1 x 1 x 1
     kernels.  */
  {
    int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
    gangs_min = workers_min = vectors_min = INT_MAX;
    gangs_max = workers_max = vectors_max = INT_MIN;
#pragma acc kernels
    {
      /* This is to make the OpenACC kernels construct unparallelizable.  */
      asm volatile ("" : : : "memory");

#pragma acc loop reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
      for (int i = 100; i > -100; --i)
	{
	  gangs_min = gangs_max = acc_gang ();
	  workers_min = workers_max = acc_worker ();
	  vectors_min = vectors_max = acc_vector ();
	}
    }
    if (gangs_min != 0 || gangs_max != 1 - 1
	|| workers_min != 0 || workers_max != 1 - 1
	|| vectors_min != 0 || vectors_max != 1 - 1)
      __builtin_abort ();
  }


  /* Unparallelized OpenACC kernels constructs must get launched as 1 x 1 x 1
     kernels even when there are explicit num_gangs, num_workers, or
     vector_length clauses.  */
  {
    int gangs = 5;
#define WORKERS 5
#define VECTORS 13
    int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
    gangs_min = workers_min = vectors_min = INT_MAX;
    gangs_max = workers_max = vectors_max = INT_MIN;
#pragma acc kernels \
  num_gangs (gangs) \
  num_workers (WORKERS) \
  vector_length (VECTORS)
    {
      /* This is to make the OpenACC kernels construct unparallelizable.  */
      asm volatile ("" : : : "memory");

#pragma acc loop reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
      for (int i = 100; i > -100; --i)
	{
	  gangs_min = gangs_max = acc_gang ();
	  workers_min = workers_max = acc_worker ();
	  vectors_min = vectors_max = acc_vector ();
	}
    }
    if (gangs_min != 0 || gangs_max != 1 - 1
	|| workers_min != 0 || workers_max != 1 - 1
	|| vectors_min != 0 || vectors_max != 1 - 1)
      __builtin_abort ();
#undef VECTORS
#undef WORKERS
  }


  return 0;
}