예제 #1
0
파일: gaussblur.c 프로젝트: akiml/ampehre
static void exec_gaussblur_app(DATA *image_out, DATA *image_in, ARGUMENTS *settings) {
    size_t size_in_bytes = settings->size*settings->size*(PIXEL_CHANNELS*sizeof(DATA));

    DATA *image_in_gpu	= (DATA *)acc_malloc(size_in_bytes);
    DATA *image_out_gpu	= (DATA *)acc_malloc(size_in_bytes);

    uint32_t e;
    uint32_t i;

    for (e=0; e<settings->energy_loops; ++e) {
        acc_memcpy_to_device(image_in_gpu , image_in , size_in_bytes);
        acc_memcpy_to_device(image_out_gpu, image_out, size_in_bytes);

        for (i=0; i<settings->checkpoints; ++i) {
            gaussblur_calc_gpu_compute((uint16_t *)image_in_gpu , (uint16_t *)image_out_gpu, settings->size);
            gaussblur_calc_gpu_compute((uint16_t *)image_out_gpu, (uint16_t *)image_in_gpu , settings->size);
        }

        acc_memcpy_from_device(image_in , image_in_gpu , size_in_bytes);
        acc_memcpy_from_device(image_out, image_out_gpu, size_in_bytes);
    }

    acc_free(image_in_gpu);
    acc_free(image_out_gpu);

    image_in_gpu	= NULL;
    image_out_gpu	= NULL;
}
예제 #2
0
파일: GPUCopy.cpp 프로젝트: mcprice30/MCGPU
void GPUCopy::copyIn(SimBox *sb) {
    h_moleculeData = sb->moleculeData;
    h_atomData = sb->atomData;
    h_atomCoordinates = sb->atomCoordinates;
    h_rollBackCoordinates = sb->rollBackCoordinates;
    h_size = sb-> size;
    h_primaryIndexes = sb->primaryIndexes;
    if (!parallel) {
        return;
    }

#ifdef _OPENACC
    d_moleculeData = (int**)acc_malloc(MOL_DATA_SIZE * sizeof(int *));
    assert(d_moleculeData != NULL);
    for (int row = 0; row < MOL_DATA_SIZE; row++) {
        int *h_moleculeData_row = sb->moleculeData[row];
        int *d_moleculeData_row = (int *)acc_copyin(h_moleculeData_row,
                                  sb->numMolecules * sizeof(int));
        assert(d_moleculeData_row != NULL);
#pragma acc parallel deviceptr(d_moleculeData)
        d_moleculeData[row] = d_moleculeData_row;
    }

    d_atomData = (Real**)acc_malloc(ATOM_DATA_SIZE * sizeof(Real *));
    assert(d_atomData != NULL);
    for (int row = 0; row < ATOM_DATA_SIZE; row++) {
        Real *h_atomData_row = sb->atomData[row];
        Real *d_atomData_row = (Real *)acc_copyin(h_atomData_row, sb->numAtoms * sizeof(Real));
        assert(d_atomData_row != NULL);
#pragma acc parallel deviceptr(d_atomData)
        d_atomData[row] = d_atomData_row;
    }

    d_atomCoordinates = (Real**)acc_malloc(NUM_DIMENSIONS * sizeof(Real *));
    assert(d_atomCoordinates != NULL);
    for (int row = 0; row < NUM_DIMENSIONS; row++) {
        Real *h_atomCoordinates_row = sb->atomCoordinates[row];
        Real *d_atomCoordinates_row = (Real *)acc_copyin(h_atomCoordinates_row, sb->numAtoms * sizeof(Real));
        assert(d_atomCoordinates_row != NULL);
#pragma acc parallel deviceptr(d_atomCoordinates)
        d_atomCoordinates[row] = d_atomCoordinates_row;
    }

    d_rollBackCoordinates = (Real**)acc_malloc(NUM_DIMENSIONS * sizeof(Real *));
    assert(d_rollBackCoordinates != NULL);
    for (int row = 0; row < NUM_DIMENSIONS; row++) {
        Real *h_rollBackCoordinates_row = sb->rollBackCoordinates[row];
        Real *d_rollBackCoordinates_row = (Real *)acc_copyin(h_rollBackCoordinates_row, sb->largestMol * sizeof(Real));
        assert(d_rollBackCoordinates_row != NULL);
#pragma acc parallel deviceptr(d_rollBackCoordinates)
        d_rollBackCoordinates[row] = d_rollBackCoordinates_row;
    }

    d_primaryIndexes = (int *)acc_copyin(sb->primaryIndexes, sb->numPIdxes * sizeof(int));

    d_size = (Real *)acc_copyin(sb->size, NUM_DIMENSIONS * sizeof(Real));
#endif
}
예제 #3
0
int
main (int argc, char **argv)
{
    void *d;
    acc_device_t devtype = acc_device_host;

#if ACC_DEVICE_TYPE_nvidia
    devtype = acc_device_nvidia;

    if (acc_get_num_devices (acc_device_nvidia) == 0)
        return 0;
#endif

    acc_init (devtype);

    d = acc_malloc (0);
    if (d != NULL)
        abort ();

    acc_free (0);

    acc_shutdown (devtype);

    acc_set_device_type (devtype);

    d = acc_malloc (0);
    if (d != NULL)
        abort ();

    acc_shutdown (devtype);

    acc_init (devtype);

    d = acc_malloc (1024);
    if (d == NULL)
        abort ();

    acc_free (d);

    acc_shutdown (devtype);

    acc_set_device_type (devtype);

    d = acc_malloc (1024);
    if (d == NULL)
        abort ();

    acc_free (d);

    acc_shutdown (devtype);

    return 0;
}
예제 #4
0
파일: osu_bw.c 프로젝트: LLNL/mpi-tools
int
allocate_device_buffer (char ** buffer)
{
#ifdef _ENABLE_CUDA_
    cudaError_t cuerr = cudaSuccess;
#endif

    switch (options.accel) {
#ifdef _ENABLE_CUDA_
        case cuda:
            cuerr = cudaMalloc((void **)buffer, MYBUFSIZE);

            if (cudaSuccess != cuerr) {
                fprintf(stderr, "Could not allocate device memory\n");
                return 1;
            }
            break;
#endif
#ifdef _ENABLE_OPENACC_
        case openacc:
            *buffer = acc_malloc(MYBUFSIZE);
            if (NULL == *buffer) {
                fprintf(stderr, "Could not allocate device memory\n");
                return 1;
            }
            break;
#endif
        default:
            fprintf(stderr, "Could not allocate device memory\n");
            return 1;
    }

    return 0;
}
예제 #5
0
int
allocate_buffer (void ** buffer, size_t size, enum accel_type type)
{
    if (options.target == cpu || options.target == both) {
        allocate_host_arrays();
    }

    size_t alignment = sysconf(_SC_PAGESIZE);
#ifdef _ENABLE_CUDA_
    cudaError_t cuerr = cudaSuccess;
#endif

    switch (type) {
        case none:
            return posix_memalign(buffer, alignment, size);
#ifdef _ENABLE_CUDA_
        case cuda:
            cuerr = cudaMalloc(buffer, size);
            if (cudaSuccess != cuerr) {
                return 1;
            }

            else {
                return 0;
            }
        case managed:
            cuerr = cudaMallocManaged(buffer, size, cudaMemAttachGlobal);
            if (cudaSuccess != cuerr) {
                return 1;
            }

            else {
                return 0;
            }
#endif
#ifdef _ENABLE_OPENACC_
        case openacc:
            *buffer = acc_malloc(size);
            if (NULL == *buffer) {
                return 1;
            }

            else {
                return 0;
            }
#endif
        default:
            return 1;
    }
}
예제 #6
0
파일: lib-62.c 프로젝트: ivop/gcc-6502
int
main (int argc, char **argv)
{
    const int N = 256;
    int i;
    unsigned char *h;
    void *d;

    acc_init (acc_device_nvidia);

    h = (unsigned char *) malloc (N);

    for (i = 0; i < N; i++)
    {
        h[i] = i;
    }

    d = acc_malloc (N);

    acc_memcpy_to_device (d, h, N);

    memset (&h[0], 0, N);

    acc_memcpy_to_device (d, h, N << 1);

    acc_memcpy_from_device (h, d, N);

    for (i = 0; i < N; i++)
    {
        if (h[i] != i)
            abort ();
    }

    acc_free (d);

    free (h);

    acc_shutdown (acc_device_nvidia);

    return 0;
}
예제 #7
0
파일: lib-54.c 프로젝트: SuperLu/gcc
int
main (int argc, char **argv)
{
  const int N = 256;
  unsigned char *h;
  void *d;

  h = (unsigned char *) malloc (N);

  d = acc_malloc (N);

  fprintf (stderr, "CheCKpOInT\n");
  acc_map_data (h, d, 0);

  acc_unmap_data (h);

  acc_free (d);

  free (h);

  return 0;
}
예제 #8
0
파일: lib-63.c 프로젝트: 0day-ci/gcc
int
main (int argc, char **argv)
{
  const int N = 256;
  int i;
  unsigned char *h;
  void *d;

  h = (unsigned char *) malloc (N);

  for (i = 0; i < N; i++)
    {
      h[i] = i;
    }

  d = acc_malloc (N);

  fprintf (stderr, "CheCKpOInT\n");
  acc_memcpy_to_device (0, h, N);

  memset (&h[0], 0, N);

  acc_memcpy_from_device (h, d, N);

  for (i = 0; i < N; i++)
    {
      if (h[i] != i)
	abort ();
    }

  acc_free (d);

  free (h);

  return 0;
}
예제 #9
0
파일: lib-50.c 프로젝트: 0day-ci/gcc
int
main (int argc, char **argv)
{
  const int N = 256;
  unsigned char *h;
  void *d;

  h = (unsigned char *) malloc (N);

  d = acc_malloc (N);

  acc_map_data (h, d, N);

  if (acc_is_present (h, N) != 1)
    abort ();

  acc_unmap_data (h);

  acc_free (d);

  free (h);

  return 0;
}
예제 #10
0
파일: lib-77.c 프로젝트: abumaryam/gcc
int
main (int argc, char **argv)
{
  CUdevice dev;
  CUfunction delay;
  CUmodule module;
  CUresult r;
  CUstream stream;
  unsigned long *a, *d_a, dticks;
  int nbytes;
  float atime, dtime;
  void *kargs[2];
  int clkrate;
  int devnum, nprocs;

  acc_init (acc_device_nvidia);

  devnum = acc_get_device_num (acc_device_nvidia);

  r = cuDeviceGet (&dev, devnum);
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
      abort ();
    }

  r =
    cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
			  dev);
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
      abort ();
    }

  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
      abort ();
    }

  r = cuModuleLoad (&module, "subr.ptx");
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuModuleLoad failed: %d\n", r);
      abort ();
    }

  r = cuModuleGetFunction (&delay, module, "delay");
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
      abort ();
    }

  nbytes = nprocs * sizeof (unsigned long);

  dtime = 200.0;

  dticks = (unsigned long) (dtime * clkrate);

  a = (unsigned long *) malloc (nbytes);
  d_a = (unsigned long *) acc_malloc (nbytes);

  acc_map_data (a, d_a, nbytes);

  kargs[0] = (void *) &d_a;
  kargs[1] = (void *) &dticks;

  r = cuStreamCreate (&stream, CU_STREAM_DEFAULT);
  if (r != CUDA_SUCCESS)
	{
	  fprintf (stderr, "cuStreamCreate failed: %d\n", r);
	  abort ();
	}

  acc_set_cuda_stream (0, stream);

  init_timers (1);

  start_timer (0);

  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, kargs, 0);
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
      abort ();
    }

  acc_wait (1);

  atime = stop_timer (0);

  if (atime < dtime)
    {
      fprintf (stderr, "actual time < delay time\n");
      abort ();
    }

  start_timer (0);

  acc_wait (1);

  atime = stop_timer (0);

  if (0.010 < atime)
    {
      fprintf (stderr, "actual time < delay time\n");
      abort ();
    }

  acc_unmap_data (a);

  fini_timers ();

  free (a);
  acc_free (d_a);

  acc_shutdown (acc_device_nvidia);

  return 0;
}
예제 #11
0
파일: lib-72.c 프로젝트: 0day-ci/gcc
int
main (int argc, char **argv)
{
  CUdevice dev;
  CUfunction delay;
  CUmodule module;
  CUresult r;
  CUstream stream;
  unsigned long *a, *d_a, dticks;
  int nbytes;
  float dtime;
  void *kargs[2];
  int clkrate;
  int devnum, nprocs;

  acc_init (acc_device_nvidia);

  devnum = acc_get_device_num (acc_device_nvidia);

  r = cuDeviceGet (&dev, devnum);
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
      abort ();
    }

  r =
    cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
			  dev);
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
      abort ();
    }

  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
      abort ();
    }

  r = cuModuleLoad (&module, "subr.ptx");
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuModuleLoad failed: %d\n", r);
      abort ();
    }

  r = cuModuleGetFunction (&delay, module, "delay");
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
      abort ();
    }

  nbytes = nprocs * sizeof (unsigned long);

  dtime = 200.0;

  dticks = (unsigned long) (dtime * clkrate);

  a = (unsigned long *) malloc (nbytes);
  d_a = (unsigned long *) acc_malloc (nbytes);

  acc_map_data (a, d_a, nbytes);

  kargs[0] = (void *) &d_a;
  kargs[1] = (void *) &dticks;

  r = cuStreamCreate (&stream, CU_STREAM_DEFAULT);
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuStreamCreate failed: %d\n", r);
      abort ();
    }

  if (!acc_set_cuda_stream (0, stream))
    abort ();
    
  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, kargs, 0);
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
      abort ();
    }

  if (acc_async_test_all () != 0)
    {
      fprintf (stderr, "asynchronous operation not running\n");
      abort ();
    }

  sleep ((int) (dtime / 1000.f) + 1);

  if (acc_async_test_all () != 1)
    {
      fprintf (stderr, "found asynchronous operation still running\n");
      abort ();
    }

  acc_unmap_data (a);

  free (a);
  acc_free (d_a);

  acc_shutdown (acc_device_nvidia);

  exit (0);
}
예제 #12
0
파일: lib-82.c 프로젝트: 0day-ci/gcc
int
main (int argc, char **argv)
{
  CUdevice dev;
  CUfunction delay2;
  CUmodule module;
  CUresult r;
  int N;
  int i;
  CUstream *streams;
  unsigned long **a, **d_a, *tid, ticks;
  int nbytes;
  void *kargs[3];
  int clkrate;
  int devnum, nprocs;

  acc_init (acc_device_nvidia);

  devnum = acc_get_device_num (acc_device_nvidia);

  r = cuDeviceGet (&dev, devnum);
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
      abort ();
    }

  r =
    cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
			  dev);
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
      abort ();
    }

  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
      abort ();
    }

  r = cuModuleLoad (&module, "subr.ptx");
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuModuleLoad failed: %d\n", r);
      abort ();
    }

  r = cuModuleGetFunction (&delay2, module, "delay2");
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
      abort ();
    }

  nbytes = sizeof (int);

  ticks = (unsigned long) (200.0 * clkrate);

  N = nprocs;

  streams = (CUstream *) malloc (N * sizeof (void *));

  a = (unsigned long **) malloc (N * sizeof (unsigned long *));
  d_a = (unsigned long **) malloc (N * sizeof (unsigned long *));
  tid = (unsigned long *) malloc (N * sizeof (unsigned long));

  for (i = 0; i < N; i++)
    {
      a[i] = (unsigned long *) malloc (sizeof (unsigned long));
      *a[i] = N;
      d_a[i] = (unsigned long *) acc_malloc (nbytes);
      tid[i] = i;

      acc_map_data (a[i], d_a[i], nbytes);

      streams[i] = (CUstream) acc_get_cuda_stream (i);
      if (streams[i] != NULL)
        abort ();

      r = cuStreamCreate (&streams[i], CU_STREAM_DEFAULT);
      if (r != CUDA_SUCCESS)
        {
          fprintf (stderr, "cuStreamCreate failed: %d\n", r);
          abort ();
        }

       if (!acc_set_cuda_stream (i, streams[i]))
        abort ();
    }

  for (i = 0; i < N; i++)
    {
      kargs[0] = (void *) &d_a[i];
      kargs[1] = (void *) &ticks;
      kargs[2] = (void *) &tid[i];

      r = cuLaunchKernel (delay2, 1, 1, 1, 1, 1, 1, 0, streams[i], kargs, 0);
      if (r != CUDA_SUCCESS)
	{
	  fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
	  abort ();
	}

      ticks = (unsigned long) (50.0 * clkrate);
    }

  acc_wait_all_async (0);

  for (i = 0; i < N; i++)
    {
      acc_copyout (a[i], nbytes);
      if (*a[i] != i)
	abort ();
    }

  free (streams);

  for (i = 0; i < N; i++)
    {
      free (a[i]);
    }

  free (a);
  free (d_a);
  free (tid);

  acc_shutdown (acc_device_nvidia);

  exit (0);
}