Example No. 1
int
acc_on_device (acc_device_t dev)
{
  if (acc_get_device_type () == acc_device_host_nonshm)
    return dev == acc_device_host_nonshm || dev == acc_device_not_host;

  /* Just rely on the compiler builtin.  */
  return __builtin_acc_on_device (dev);
}
Example No. 2
int
acc_on_device (acc_device_t dev)
{
  struct goacc_thread *thr = goacc_thread ();

  /* We only want to appear to be the "host_nonshm" plugin from "offloaded"
     code -- i.e. within a parallel region.  Test a flag set by the
     openacc_parallel hook of the host_nonshm plugin to determine that.  */
  if (acc_get_device_type () == acc_device_host_nonshm
      && thr && thr->target_tls
      && ((struct nonshm_thread *)thr->target_tls)->nonshm_exec)
    return dev == acc_device_host_nonshm || dev == acc_device_not_host;

  /* For OpenACC, libgomp is only built for the host, so this is sufficient.  */
  return dev == acc_device_host || dev == acc_device_none;
}
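
Both implementations above answer the same question from inside offloaded code. For context, a minimal caller sketch, assuming only the standard OpenACC API from <openacc.h>:

#include <openacc.h>
#include <stdio.h>

int
main (void)
{
  int on_host = -1;
#pragma acc parallel copyout (on_host)
  {
    /* Evaluated where the region actually runs: nonzero on the host device
       (or host fallback), zero on an accelerator.  */
    on_host = acc_on_device (acc_device_host);
  }
  printf ("region ran on the %s\n", on_host ? "host" : "accelerator");
  return 0;
}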
Example No. 3
void t3 ()
{
  int a, b[N], c, d, i;
  int n = acc_get_device_type () == acc_device_nvidia ? N : 1;

  a = 5;
  for (i = 0; i < n; i++)
    b[i] = -1;

  #pragma acc parallel num_gangs (n) firstprivate (a)
  #pragma acc loop gang
  for (i = 0; i < n; i++)
    {
      a = a + i;
      b[i] = a;
    }

  for (i = 0; i < n; i++)
    if (a + i != b[i])
      __builtin_abort ();

  #pragma acc data copy (a)
  {
    #pragma acc parallel firstprivate (a) copyout (c)
    {
      a = 10;
      c = a;
    }

    /* This version of 'a' should still be 5.  */
    #pragma acc parallel copyout (d) present (a)
    {
      d = a;
    }
  }

  if (c != 10)
    __builtin_abort ();
  if (d != 5)
    __builtin_abort ();
}
Example No. 4
int
main ()
{
  int s1 = 0, s2 = 0;
  int i;
  int dummy = 0;

#pragma acc data copy (dummy)
  {
#pragma acc parallel num_gangs (N) reduction (+:s1) copy(s1)
    {
      s1++;
    }
  }

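  /* In gang-redundant mode each gang increments its private copy of s1 once
     and the reduction sums them: 1 gang on the host device, N gangs on an
     accelerator.  */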
  if (acc_get_device_type () == acc_device_host)
    {
      if (s1 != 1)
	abort ();
    }
  else
    {
      if (s1 != N)
	abort ();
    }

  s1 = 0;
  s2 = 0;

#pragma acc parallel num_gangs (10) reduction (+:s1, s2) copy(s1, s2)
  {
    s1++;
    s2 += N;
  }

  if (acc_get_device_type () == acc_device_host)
    {
      if (s1 != 1)
	abort ();
      if (s2 != N)
	abort ();
    }
  else
    {
      if (s1 != N)
	abort ();
      if (s2 != N*N)
	abort ();
    }

  s1 = 0;

#pragma acc parallel num_gangs (10) reduction (+:s1) copy(s1)
  {
#pragma acc loop gang reduction (+:s1)
    for (i = 0; i < 10; i++)
      s1++;
  }

  if (s1 != N)
    abort ();

  return 0;
}
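
Both of the preceding tests rely on a shared preamble that this excerpt omits; a minimal version consistent with their checks is sketched below (N = 10 is inferred from the "s1 != N" tests that follow num_gangs (10), not taken from the original files):

/* Assumed test preamble (inferred).  */
#include <openacc.h>
#include <stdlib.h>

#define N 10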
Example No. 5
int main(int argc, char** argv)
{
    int iter_max = 1000;
    
    const real tol = 1.0e-5;

    int rank = 0;
    int size = 1;

    //Initialize MPI and determine rank and size
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    
    if ( size > MAX_MPI_SIZE )
    {
        if ( 0 == rank )
        {
            fprintf(stderr,"ERROR: Only up to %d MPI ranks are supported.\n",MAX_MPI_SIZE);
        }
        return -1;
    }
    
    dim2 size2d = size_to_2Dsize(size);
    int sizex = size2d.x;
    int sizey = size2d.y;
    assert(sizex*sizey == size);
    
    int rankx = rank%sizex;
    int ranky = rank/sizex;

    memset(A, 0, NY * NX * sizeof(real));
    memset(Aref, 0, NY * NX * sizeof(real));
    
    // set rhs
    for (int iy = 1; iy < NY-1; iy++)
    {
        for( int ix = 1; ix < NX-1; ix++ )
        {
            const real x = -1.0 + (2.0*ix/(NX-1));
            const real y = -1.0 + (2.0*iy/(NY-1));
            rhs[iy][ix] = expr(-10.0*(x*x + y*y));
        }
    }
    
#if _OPENACC
    acc_device_t device_type = acc_get_device_type();
    if ( acc_device_nvidia == device_type )
    {
        int ngpus=acc_get_num_devices(acc_device_nvidia);
        
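        // Assign devices round-robin so ranks sharing a node spread across its GPUs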
        int devicenum=rank%ngpus;
        acc_set_device_num(devicenum,acc_device_nvidia);
    }
    // Call acc_init after acc_set_device_num to avoid multiple contexts on device 0 in multi-GPU systems
    acc_init(device_type);
#endif /*_OPENACC*/

    // Ensure correctness if NX%sizex != 0
    int chunk_sizex = ceil( (1.0*NX)/sizex );

    int ix_start = rankx * chunk_sizex;
    int ix_end   = ix_start + chunk_sizex;

    // Do not process boundaries
    ix_start = max( ix_start, 1 );
    ix_end = min( ix_end, NX - 1 );

    // Ensure correctness if NY%sizey != 0
    int chunk_sizey = ceil( (1.0*NY)/sizey );

    int iy_start = ranky * chunk_sizey;
    int iy_end   = iy_start + chunk_sizey;

    // Do not process boundaries
    iy_start = max( iy_start, 1 );
    iy_end = min( iy_end, NY - 1 );

    if ( rank == 0) printf("Jacobi relaxation Calculation: %d x %d mesh\n", NY, NX);

    if ( rank == 0) printf("Calculate reference solution and time serial execution.\n");
    StartTimer();
    poisson2d_serial( rank, iter_max, tol );
    double runtime_serial = GetTimer();

    //Wait for all processes to ensure correct timing of the parallel version
    MPI_Barrier( MPI_COMM_WORLD );
    if ( rank == 0) printf("Parallel execution.\n");
    StartTimer();
    int iter  = 0;
    real error = 1.0;
    
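    // Keep A, rhs, Anew and the halo buffers resident on the device across all iterations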
    #pragma acc data copy(A) copyin(rhs) create(Anew,to_left,from_left,to_right,from_right)
    while ( error > tol && iter < iter_max )
    {
        error = 0.0;

        #pragma acc kernels
        for (int iy = iy_start; iy < iy_end; iy++)
        {
            for( int ix = ix_start; ix < ix_end; ix++ )
            {
                Anew[iy][ix] = -0.25 * (rhs[iy][ix] - ( A[iy][ix+1] + A[iy][ix-1]
                                                       + A[iy-1][ix] + A[iy+1][ix] ));
                error = fmaxr( error, fabsr(Anew[iy][ix]-A[iy][ix]));
            }
        }
        
        real globalerror = 0.0;
        MPI_Allreduce( &error, &globalerror, 1, MPI_REAL_TYPE, MPI_MAX, MPI_COMM_WORLD );
        error = globalerror;
        
        #pragma acc kernels
        for (int iy = iy_start; iy < iy_end; iy++)
        {
            for( int ix = ix_start; ix < ix_end; ix++ )
            {
                A[iy][ix] = Anew[iy][ix];
            }
        }

        //Periodic boundary conditions
        int topy    = (ranky == 0) ? (sizey-1) : ranky-1;
        int bottomy = (ranky == (sizey-1)) ? 0 : ranky+1;
        int top    = topy    * sizex + rankx;
        int bottom = bottomy * sizex + rankx;
        #pragma acc host_data use_device( A )
        {
            //1. Send row iy_start (first modified row) to top; receive the lower boundary row (iy_end) from bottom
            MPI_Sendrecv( &A[iy_start][ix_start], (ix_end-ix_start), MPI_REAL_TYPE, top   , 0, &A[iy_end][ix_start], (ix_end-ix_start), MPI_REAL_TYPE, bottom, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE );

            //2. Send row (iy_end-1) (last modified row) to bottom; receive the upper boundary row (iy_start-1) from top
            MPI_Sendrecv( &A[(iy_end-1)][ix_start], (ix_end-ix_start), MPI_REAL_TYPE, bottom, 0, &A[(iy_start-1)][ix_start], (ix_end-ix_start), MPI_REAL_TYPE, top   , 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE );
        }
        
        int leftx  = (rankx == 0) ? (sizex-1) : rankx-1;
        int rightx = (rankx == (sizex-1)) ? 0 : rankx+1;
        int left   = ranky * sizex + leftx;
        int right  = ranky * sizex + rightx;
        #pragma acc kernels
        for( int iy = iy_start; iy < iy_end; iy++ )
        {
                to_left[iy]  = A[iy][ix_start];
                to_right[iy] = A[iy][ix_end-1];
        }
        #pragma acc host_data use_device( to_left, from_left, to_right, from_right )
        {
            //1. Send to_left (rows iy_start .. iy_end-1) to the left neighbor; receive the same rows into from_right from the right neighbor
            MPI_Sendrecv( to_left+iy_start, (iy_end-iy_start), MPI_REAL_TYPE, left   , 0, from_right+iy_start, (iy_end-iy_start), MPI_REAL_TYPE, right, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE );

            //2. Send to_right (rows iy_start .. iy_end-1) to the right neighbor; receive the same rows into from_left from the left neighbor
            MPI_Sendrecv( to_right+iy_start, (iy_end-iy_start), MPI_REAL_TYPE, right , 0, from_left+iy_start, (iy_end-iy_start), MPI_REAL_TYPE, left  , 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE );
        }
        #pragma acc kernels
        for( int iy = iy_start; iy < iy_end; iy++ )
        {
                A[iy][ix_start-1] = from_left[iy];
                A[iy][ix_end]     = from_right[iy];
        }
        
        if(rank == 0 && (iter % 100) == 0) printf("%5d, %0.6f\n", iter, error);
        
        iter++;
    }

    MPI_Barrier( MPI_COMM_WORLD );
    double runtime = GetTimer();

    if (check_results( rank, ix_start, ix_end, iy_start, iy_end, tol ) && rank == 0)
    {
        printf( "Num GPUs: %d with a (%d,%d) layout.\n", size, sizey,sizex );
        printf( "%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, efficiency: %8.2f%\n", NY,NX, runtime_serial/ 1000.0, size, runtime/ 1000.0, runtime_serial/runtime, runtime_serial/(size*runtime)*100 );
    }

    MPI_Finalize();
    return 0;
}
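
The listing above is not self-contained: it references globals and helpers defined elsewhere in the project. The following sketch shows definitions consistent with the usage above; all names, types, and values here are inferred rather than taken from the original sources.

#include <math.h>
#include <mpi.h>

#define NX 2048             /* mesh width (inferred; any size works) */
#define NY 2048             /* mesh height */
#define MAX_MPI_SIZE 16

/* Precision selection: a `real` typedef plus matching math/MPI aliases,
   so the solver reads identically for float and double builds.  */
typedef double real;
#define MPI_REAL_TYPE MPI_DOUBLE
#define expr  exp           /* exponential for the chosen precision */
#define fmaxr fmax
#define fabsr fabs

#define min(a,b) ((a) < (b) ? (a) : (b))
#define max(a,b) ((a) > (b) ? (a) : (b))

/* Global working arrays, matching the memset() calls and indexing in main.  */
real A[NY][NX], Aref[NY][NX], Anew[NY][NX], rhs[NY][NX];
real to_left[NY], from_left[NY], to_right[NY], from_right[NY];

/* Factor `size` MPI ranks into a near-square (x, y) process grid.  */
typedef struct { int x, y; } dim2;
static dim2 size_to_2Dsize(int size)
{
    dim2 grid = { size, 1 };
    for (int y = 1; y * y <= size; y++)
        if (size % y == 0)
        {
            grid.x = size / y;
            grid.y = y;
        }
    return grid;
}

/* Provided elsewhere in the project.  */
void StartTimer(void);
double GetTimer(void);      /* elapsed milliseconds since StartTimer() */
void poisson2d_serial(int rank, int iter_max, real tol);
int check_results(int rank, int ix_start, int ix_end,
                  int iy_start, int iy_end, real tol);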
Example No. 6
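Note: acc_gang, acc_worker, and acc_vector used below are not OpenACC API routines; the original GCC test defines them earlier in the file, which this excerpt omits. A sketch of that missing preamble, assuming the GCC builtin __builtin_goacc_parlevel_id and the GOMP_DIM_* constants from <gomp-constants.h>:

#include <limits.h>
#include <openacc.h>
#include <gomp-constants.h>

/* Report this thread's position on each parallelism axis; each helper must
   be callable from offloaded code.  */
#pragma acc routine seq
static int acc_gang (void)
{
  return __builtin_goacc_parlevel_id (GOMP_DIM_GANG);
}

#pragma acc routine seq
static int acc_worker (void)
{
  return __builtin_goacc_parlevel_id (GOMP_DIM_WORKER);
}

#pragma acc routine seq
static int acc_vector (void)
{
  return __builtin_goacc_parlevel_id (GOMP_DIM_VECTOR);
}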
int main ()
{
  acc_init (acc_device_default);

  /* Non-positive value.  */

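  /* Key for the mode labels used below: G/W/V = gang/worker/vector;
     R = redundant, P = partitioned, S = single.  */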
  /* GR, WS, VS.  */
  {
#define GANGS 0 /* { dg-warning "'num_gangs' value must be positive" "" { target c } } */
    int gangs_actual = GANGS;
    int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
    gangs_min = workers_min = vectors_min = INT_MAX;
    gangs_max = workers_max = vectors_max = INT_MIN;
#pragma acc parallel copy (gangs_actual) \
  reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max) \
  num_gangs (GANGS) /* { dg-warning "'num_gangs' value must be positive" "" { target c++ } } */
    {
      /* We're actually executing with num_gangs (1).  */
      gangs_actual = 1;
      for (int i = 100 * gangs_actual; i > -100 * gangs_actual; --i)
	{
	  /* <https://gcc.gnu.org/PR80547>.  */
#if 0
	  gangs_min = gangs_max = acc_gang ();
	  workers_min = workers_max = acc_worker ();
	  vectors_min = vectors_max = acc_vector ();
#else
	  int gangs = acc_gang ();
	  gangs_min = (gangs_min < gangs) ? gangs_min : gangs;
	  gangs_max = (gangs_max > gangs) ? gangs_max : gangs;
	  int workers = acc_worker ();
	  workers_min = (workers_min < workers) ? workers_min : workers;
	  workers_max = (workers_max > workers) ? workers_max : workers;
	  int vectors = acc_vector ();
	  vectors_min = (vectors_min < vectors) ? vectors_min : vectors;
	  vectors_max = (vectors_max > vectors) ? vectors_max : vectors;
#endif
	}
    }
    if (gangs_actual != 1)
      __builtin_abort ();
    if (gangs_min != 0 || gangs_max != gangs_actual - 1
	|| workers_min != 0 || workers_max != 0
	|| vectors_min != 0 || vectors_max != 0)
      __builtin_abort ();
#undef GANGS
  }

  /* GP, WS, VS.  */
  {
#define GANGS 0 /* { dg-warning "'num_gangs' value must be positive" "" { target c } } */
    int gangs_actual = GANGS;
    int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
    gangs_min = workers_min = vectors_min = INT_MAX;
    gangs_max = workers_max = vectors_max = INT_MIN;
#pragma acc parallel copy (gangs_actual) \
  num_gangs (GANGS) /* { dg-warning "'num_gangs' value must be positive" "" { target c++ } } */
    {
      /* We're actually executing with num_gangs (1).  */
      gangs_actual = 1;
#pragma acc loop gang reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
      for (int i = 100 * gangs_actual; i > -100 * gangs_actual; --i)
	{
	  gangs_min = gangs_max = acc_gang ();
	  workers_min = workers_max = acc_worker ();
	  vectors_min = vectors_max = acc_vector ();
	}
    }
    if (gangs_actual != 1)
      __builtin_abort ();
    if (gangs_min != 0 || gangs_max != gangs_actual - 1
	|| workers_min != 0 || workers_max != 0
	|| vectors_min != 0 || vectors_max != 0)
      __builtin_abort ();
#undef GANGS
  }

  /* GR, WP, VS.  */
  {
#define WORKERS 0 /* { dg-warning "'num_workers' value must be positive" "" { target c } } */
    int workers_actual = WORKERS;
    int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
    gangs_min = workers_min = vectors_min = INT_MAX;
    gangs_max = workers_max = vectors_max = INT_MIN;
#pragma acc parallel copy (workers_actual) \
  num_workers (WORKERS) /* { dg-warning "'num_workers' value must be positive" "" { target c++ } } */
    {
      /* We're actually executing with num_workers (1).  */
      workers_actual = 1;
#pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
      for (int i = 100 * workers_actual; i > -100 * workers_actual; --i)
	{
	  gangs_min = gangs_max = acc_gang ();
	  workers_min = workers_max = acc_worker ();
	  vectors_min = vectors_max = acc_vector ();
	}
    }
    if (workers_actual != 1)
      __builtin_abort ();
    if (gangs_min != 0 || gangs_max != 0
	|| workers_min != 0 || workers_max != workers_actual - 1
	|| vectors_min != 0 || vectors_max != 0)
      __builtin_abort ();
#undef WORKERS
  }

  /* GR, WS, VP.  */
  {
#define VECTORS 0 /* { dg-warning "'vector_length' value must be positive" "" { target c } } */
    int vectors_actual = VECTORS;
    int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
    gangs_min = workers_min = vectors_min = INT_MAX;
    gangs_max = workers_max = vectors_max = INT_MIN;
#pragma acc parallel copy (vectors_actual) /* { dg-warning "using vector_length \\(32\\), ignoring 1" "" { target openacc_nvidia_accel_configured } } */ \
  vector_length (VECTORS) /* { dg-warning "'vector_length' value must be positive" "" { target c++ } } */
    {
      /* We're actually executing with vector_length (1), but the GCC nvptx
	 back end enforces vector_length (32).  */
      if (acc_on_device (acc_device_nvidia))
	vectors_actual = 32;
      else
	vectors_actual = 1;
#pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
      for (int i = 100 * vectors_actual; i > -100 * vectors_actual; --i)
	{
	  gangs_min = gangs_max = acc_gang ();
	  workers_min = workers_max = acc_worker ();
	  vectors_min = vectors_max = acc_vector ();
	}
    }
    if (acc_get_device_type () == acc_device_nvidia)
      {
	if (vectors_actual != 32)
	  __builtin_abort ();
      }
    else
      if (vectors_actual != 1)
	__builtin_abort ();
    if (gangs_min != 0 || gangs_max != 0
	|| workers_min != 0 || workers_max != 0
	|| vectors_min != 0 || vectors_max != vectors_actual - 1)
      __builtin_abort ();
#undef VECTORS
  }


  /* High value.  */
  
  /* GR, WS, VS.  */
  {
    /* There is no actual limit for the number of gangs, so we try with a
       rather high value.  */
    int gangs = 12345;
    int gangs_actual = gangs;
    int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
    gangs_min = workers_min = vectors_min = INT_MAX;
    gangs_max = workers_max = vectors_max = INT_MIN;
#pragma acc parallel copy (gangs_actual) \
  reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max) \
  num_gangs (gangs)
    {
      if (acc_on_device (acc_device_host))
	{
	  /* We're actually executing with num_gangs (1).  */
	  gangs_actual = 1;
	}
      /* As we're executing GR not GP, don't multiply with a "gangs_actual"
	 factor.  */
      for (int i = 100 /* * gangs_actual */; i > -100 /* * gangs_actual */; --i)
	{
	  gangs_min = gangs_max = acc_gang ();
	  workers_min = workers_max = acc_worker ();
	  vectors_min = vectors_max = acc_vector ();
	}
    }
    if (gangs_actual < 1)
      __builtin_abort ();
    if (gangs_min != 0 || gangs_max != gangs_actual - 1
	|| workers_min != 0 || workers_max != 0
	|| vectors_min != 0 || vectors_max != 0)
      __builtin_abort ();
  }

  /* GP, WS, VS.  */
  {
    /* There is no actual limit for the number of gangs, so we try with a
       rather high value.  */
    int gangs = 12345;
    int gangs_actual = gangs;
    int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
    gangs_min = workers_min = vectors_min = INT_MAX;
    gangs_max = workers_max = vectors_max = INT_MIN;
#pragma acc parallel copy (gangs_actual) \
  num_gangs (gangs)
    {
      if (acc_on_device (acc_device_host))
	{
	  /* We're actually executing with num_gangs (1).  */
	  gangs_actual = 1;
	}
#pragma acc loop gang reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
      for (int i = 100 * gangs_actual; i > -100 * gangs_actual; --i)
	{
	  gangs_min = gangs_max = acc_gang ();
	  workers_min = workers_max = acc_worker ();
	  vectors_min = vectors_max = acc_vector ();
	}
    }
    if (gangs_actual < 1)
      __builtin_abort ();
    if (gangs_min != 0 || gangs_max != gangs_actual - 1
	|| workers_min != 0 || workers_max != 0
	|| vectors_min != 0 || vectors_max != 0)
      __builtin_abort ();
  }

  /* GR, WP, VS.  */
  {
    /* We try with an outrageously large value. */
#define WORKERS 2 << 20
    int workers_actual = WORKERS;
    int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
    gangs_min = workers_min = vectors_min = INT_MAX;
    gangs_max = workers_max = vectors_max = INT_MIN;
#pragma acc parallel copy (workers_actual) /* { dg-warning "using num_workers \\(32\\), ignoring 2097152" "" { target openacc_nvidia_accel_configured } } */ \
  num_workers (WORKERS)
    {
      if (acc_on_device (acc_device_host))
	{
	  /* We're actually executing with num_workers (1).  */
	  workers_actual = 1;
	}
      else if (acc_on_device (acc_device_nvidia))
	{
	  /* The GCC nvptx back end enforces num_workers (32).  */
	  workers_actual = 32;
	}
      else
	__builtin_abort ();
#pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
      for (int i = 100 * workers_actual; i > -100 * workers_actual; --i)
	{
	  gangs_min = gangs_max = acc_gang ();
	  workers_min = workers_max = acc_worker ();
	  vectors_min = vectors_max = acc_vector ();
	}
    }
    if (workers_actual < 1)
      __builtin_abort ();
    if (gangs_min != 0 || gangs_max != 0
	|| workers_min != 0 || workers_max != workers_actual - 1
	|| vectors_min != 0 || vectors_max != 0)
      __builtin_abort ();
#undef WORKERS
  }

  /* GR, WP, VS.  */
  {
    /* We try with an outrageously large value. */
    int workers = 2 << 20;
    /* For nvptx offloading, this one will not result in "using num_workers
       (32), ignoring runtime setting", and will in fact try to launch with
       "num_workers (workers)", which will run into "libgomp: cuLaunchKernel
       error: invalid argument".  So, limit ourselves here.  */
    if (acc_get_device_type () == acc_device_nvidia)
      workers = 32;
    int workers_actual = workers;
    int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
    gangs_min = workers_min = vectors_min = INT_MAX;
    gangs_max = workers_max = vectors_max = INT_MIN;
#pragma acc parallel copy (workers_actual) \
  num_workers (workers)
    {
      if (acc_on_device (acc_device_host))
	{
	  /* We're actually executing with num_workers (1).  */
	  workers_actual = 1;
	}
      else if (acc_on_device (acc_device_nvidia))
	{
	  /* We're actually executing with num_workers (32).  */
	  /* workers_actual = 32; */
	}
      else
	__builtin_abort ();
#pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
      for (int i = 100 * workers_actual; i > -100 * workers_actual; --i)
	{
	  gangs_min = gangs_max = acc_gang ();
	  workers_min = workers_max = acc_worker ();
	  vectors_min = vectors_max = acc_vector ();
	}
    }
    if (workers_actual < 1)
      __builtin_abort ();
    if (gangs_min != 0 || gangs_max != 0
	|| workers_min != 0 || workers_max != workers_actual - 1
	|| vectors_min != 0 || vectors_max != 0)
      __builtin_abort ();
  }

  /* GR, WS, VP.  */
  {
    /* We try with an outrageously large value. */
#define VECTORS 2 << 20
    int vectors_actual = VECTORS;
    int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
    gangs_min = workers_min = vectors_min = INT_MAX;
    gangs_max = workers_max = vectors_max = INT_MIN;
#pragma acc parallel copy (vectors_actual) /* { dg-warning "using vector_length \\(32\\), ignoring 2097152" "" { target openacc_nvidia_accel_configured } } */ \
  vector_length (VECTORS)
    {
      if (acc_on_device (acc_device_host))
	{
	  /* We're actually executing with vector_length (1).  */
	  vectors_actual = 1;
	}
      else if (acc_on_device (acc_device_nvidia))
	{
	  /* The GCC nvptx back end enforces vector_length (32).  */
	  vectors_actual = 32;
	}
      else
	__builtin_abort ();
#pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
      for (int i = 100 * vectors_actual; i > -100 * vectors_actual; --i)
	{
	  gangs_min = gangs_max = acc_gang ();
	  workers_min = workers_max = acc_worker ();
	  vectors_min = vectors_max = acc_vector ();
	}
    }
    if (vectors_actual < 1)
      __builtin_abort ();
    if (gangs_min != 0 || gangs_max != 0
	|| workers_min != 0 || workers_max != 0
	|| vectors_min != 0 || vectors_max != vectors_actual - 1)
      __builtin_abort ();
#undef VECTORS
  }

  /* GR, WS, VP.  */
  {
    /* We try with an outrageously large value. */
    int vectors = 2 << 20;
    int vectors_actual = vectors;
    int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
    gangs_min = workers_min = vectors_min = INT_MAX;
    gangs_max = workers_max = vectors_max = INT_MIN;
#pragma acc parallel copy (vectors_actual) /* { dg-warning "using vector_length \\(32\\), ignoring runtime setting" "" { target openacc_nvidia_accel_configured } } */ \
  vector_length (vectors)
    {
      if (acc_on_device (acc_device_host))
	{
	  /* We're actually executing with vector_length (1).  */
	  vectors_actual = 1;
	}
      else if (acc_on_device (acc_device_nvidia))
	{
	  /* The GCC nvptx back end enforces vector_length (32).  */
	  vectors_actual = 32;
	}
      else
	__builtin_abort ();
#pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
      for (int i = 100 * vectors_actual; i > -100 * vectors_actual; --i)
	{
	  gangs_min = gangs_max = acc_gang ();
	  workers_min = workers_max = acc_worker ();
	  vectors_min = vectors_max = acc_vector ();
	}
    }
    if (vectors_actual < 1)
      __builtin_abort ();
    if (gangs_min != 0 || gangs_max != 0
	|| workers_min != 0 || workers_max != 0
	|| vectors_min != 0 || vectors_max != vectors_actual - 1)
      __builtin_abort ();
  }


  /* Composition of GP, WP, VP.  */
  {
    int gangs = 12345;
    /* With nvptx offloading, multi-level reductions apparently are very slow
       in the following case.  So, limit ourselves here.  */
    if (acc_get_device_type () == acc_device_nvidia)
      gangs = 3;
    int gangs_actual = gangs;
#define WORKERS 3
    int workers_actual = WORKERS;
#define VECTORS 11
    int vectors_actual = VECTORS;
    int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
    gangs_min = workers_min = vectors_min = INT_MAX;
    gangs_max = workers_max = vectors_max = INT_MIN;
#pragma acc parallel copy (gangs_actual, workers_actual, vectors_actual) /* { dg-warning "using vector_length \\(32\\), ignoring 11" "" { target openacc_nvidia_accel_configured } } */ \
  num_gangs (gangs) \
  num_workers (WORKERS) \
  vector_length (VECTORS)
    {
      if (acc_on_device (acc_device_host))
	{
	  /* We're actually executing with num_gangs (1), num_workers (1),
	     vector_length (1).  */
	  gangs_actual = 1;
	  workers_actual = 1;
	  vectors_actual = 1;
	}
      else if (acc_on_device (acc_device_nvidia))
	{
	  /* The GCC nvptx back end enforces vector_length (32).  */
	  vectors_actual = 32;
	}
      else
	__builtin_abort ();
#pragma acc loop gang reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
      for (int i = 100 * gangs_actual; i > -100 * gangs_actual; --i)
#pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
	for (int j = 100 * workers_actual; j > -100 * workers_actual; --j)
#pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
	  for (int k = 100 * vectors_actual; k > -100 * vectors_actual; --k)
	    {
	      gangs_min = gangs_max = acc_gang ();
	      workers_min = workers_max = acc_worker ();
	      vectors_min = vectors_max = acc_vector ();
	    }
    }
    if (gangs_min != 0 || gangs_max != gangs_actual - 1
	|| workers_min != 0 || workers_max != workers_actual - 1
	|| vectors_min != 0 || vectors_max != vectors_actual - 1)
      __builtin_abort ();
#undef VECTORS
#undef WORKERS
  }


  /* We can't test parallelized OpenACC kernels constructs in this way: use of
     the acc_gang, acc_worker, acc_vector functions will make the construct
     unparallelizable.  */


  /* Unparallelized OpenACC kernels constructs must get launched as 1 x 1 x 1
     kernels.  */
  {
    int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
    gangs_min = workers_min = vectors_min = INT_MAX;
    gangs_max = workers_max = vectors_max = INT_MIN;
#pragma acc kernels
    {
      /* This is to make the OpenACC kernels construct unparallelizable.  */
      asm volatile ("" : : : "memory");

#pragma acc loop reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
      for (int i = 100; i > -100; --i)
	{
	  gangs_min = gangs_max = acc_gang ();
	  workers_min = workers_max = acc_worker ();
	  vectors_min = vectors_max = acc_vector ();
	}
    }
    if (gangs_min != 0 || gangs_max != 1 - 1
	|| workers_min != 0 || workers_max != 1 - 1
	|| vectors_min != 0 || vectors_max != 1 - 1)
      __builtin_abort ();
  }


  /* Unparallelized OpenACC kernels constructs must get launched as 1 x 1 x 1
     kernels even when there are explicit num_gangs, num_workers, or
     vector_length clauses.  */
  {
    int gangs = 5;
#define WORKERS 5
#define VECTORS 13
    int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
    gangs_min = workers_min = vectors_min = INT_MAX;
    gangs_max = workers_max = vectors_max = INT_MIN;
#pragma acc kernels \
  num_gangs (gangs) \
  num_workers (WORKERS) \
  vector_length (VECTORS)
    {
      /* This is to make the OpenACC kernels construct unparallelizable.  */
      asm volatile ("" : : : "memory");

#pragma acc loop reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
      for (int i = 100; i > -100; --i)
	{
	  gangs_min = gangs_max = acc_gang ();
	  workers_min = workers_max = acc_worker ();
	  vectors_min = vectors_max = acc_vector ();
	}
    }
    if (gangs_min != 0 || gangs_max != 1 - 1
	|| workers_min != 0 || workers_max != 1 - 1
	|| vectors_min != 0 || vectors_max != 1 - 1)
      __builtin_abort ();
#undef VECTORS
#undef WORKERS
  }


  return 0;
}
Example No. 7
/**
 * \brief Creates and initializes the working data for the plan
 * \param plan The Plan struct that holds the plan's data values.
 * \return Error flag value
 */
int initDOPENACCGEMMPlan(void *plan){   // <- "DOPENACCGEMM" substituted for the template's YOUR_NAME.
    if(!plan){
        return make_error(ALLOC, generic_err);           // <- Error code reported when the plan allocation failed (plan is NULL).
    }
    Plan *p;
    DOPENACCGEMM_DATA *d = NULL;   // stays NULL unless p->vptr is read below
    p = (Plan *)plan;

    #ifdef HAVE_PAPI
    int temp_event, i;
    int PAPI_Events [NUM_PAPI_EVENTS] = PAPI_COUNTERS;
    char *PAPI_units [NUM_PAPI_EVENTS] = PAPI_UNITS;
    #endif //HAVE_PAPI

    if(p){
        d = (DOPENACCGEMM_DATA *)p->vptr;
        p->exec_count = 0;           // Initialize the plan execution count to zero.
        perftimer_init(&p->timers, NUM_TIMERS);         // Initialize all performance timers to zero.

        #ifdef HAVE_PAPI
        /* Initialize plan's PAPI data */
        p->PAPI_EventSet = PAPI_NULL;
        p->PAPI_Num_Events = 0;

        TEST_PAPI(PAPI_create_eventset(&p->PAPI_EventSet), PAPI_OK, MyRank, 9999, PRINT_SOME);

        //Add the desired events to the Event Set; ensure each desired counter
        //  exists on the system before adding it, and ignore it otherwise
        for(i = 0; i < TOTAL_PAPI_EVENTS && i < NUM_PAPI_EVENTS; i++){
            temp_event = PAPI_Events[i];
            if(PAPI_query_event(temp_event) == PAPI_OK){
                p->PAPI_Num_Events++;
                TEST_PAPI(PAPI_add_event(p->PAPI_EventSet, temp_event), PAPI_OK, MyRank, 9999, PRINT_SOME);
            }
        }

        PAPIRes_init(p->PAPI_Results, p->PAPI_Times);
        PAPI_set_units(p->name, PAPI_units, NUM_PAPI_EVENTS);

        TEST_PAPI(PAPI_start(p->PAPI_EventSet), PAPI_OK, MyRank, 9999, PRINT_SOME);
        #endif     //HAVE_PAPI
    }
    if(d){
      int error;

      acc_device_t my_device = acc_get_device_type();
      acc_set_device_num(d->device_id, my_device);

      //When OpenACC can report back on accelerator size, these two lines should be enabled
      //d->device_memory = system_burn_accelerator_memory(d->device_id);
      //d->device_memory -= SUB_FACTOR;

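      // Size M so the three M x M double matrices fit easily: with
      // M = sqrt(device_memory / 8) / 3, they occupy about a third of it.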
      d->M = ((int)sqrt(d->device_memory/sizeof(double))) / 3;

      size_t page_size = sysconf(_SC_PAGESIZE);
      error = posix_memalign((void **)&(d->A_buffer),page_size,d->M*d->M*sizeof(double));
      assert(error==0);

      error = posix_memalign((void **)&(d->B_buffer),page_size,d->M*d->M*sizeof(double));
      assert(error==0);

      error = posix_memalign((void **)&(d->C_buffer),page_size,d->M*d->M*sizeof(double));
      assert(error==0);

      for(size_t idx=0; idx < d->M*d->M; idx++) {
        d->A_buffer[idx] = (double)4.5;
        d->B_buffer[idx] = (double)2.0;
        d->C_buffer[idx] = (double)0.0;
      }
    }
    return ERR_CLEAN;     // <- This indicates a clean run with no errors. Does not need to be changed.
} /* initDOPENACCGEMMPlan */
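
For reference, a plausible shape for the plan's working data, inferred from the field accesses above (the authoritative definition lives in the module's header, along with Plan, make_error, ERR_CLEAN, and the PAPI helpers):

typedef struct {
    int     device_id;      /* accelerator this plan binds to */
    size_t  device_memory;  /* usable accelerator memory, in bytes */
    int     M;              /* operands are M x M matrices */
    double *A_buffer;       /* input operand A */
    double *B_buffer;       /* input operand B */
    double *C_buffer;       /* result matrix C */
} DOPENACCGEMM_DATA;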