Example #1
File: cudamain.cpp Project: sergiy8/fsha
int main(int argc, char ** argv){
	int i;
        if( (argc>=2) && (atoi(argv[1])!=RANK)) error("rank %d mandatory",RANK);
        printf("CUDA RANK=%d\n",RANK);

	kernel.print();
// build busylist
        busylist = (uint32_t*)malloc_file(CNK*sizeof(uint32_t),FMODE_RO,BLIST_FORMAT,RANK);

// put busylist
        SafeCall(cuMemHostRegister(busylist,CNK*sizeof*busylist,CU_MEMHOSTREGISTER_DEVICEMAP));
        SafeCall(cuMemHostGetDevicePointer(&host_busylist,busylist,0));

        SafeCall(cuModuleGetGlobal(&dev_busylist,&bytes,kernel.module[0].module,"busylist"));
        if(bytes!=sizeof(host_busylist)) error("busylist!");
	SafeCall(cuMemcpyHtoD(dev_busylist,&host_busylist,bytes));
// put array
#ifdef IN_mk_data
        mkdir(DATADIR,0755); errno=0;
        array = (unsigned char *)malloc_file(abytes(RANK,CNK),1,DATADIR"%d",RANK);
#else
        array = (unsigned char *)malloc_file(abytes(RANK,CNK),0,DATADIR"%d",RANK);
#endif
        SafeCall(cuMemHostRegister(array,abytes(RANK,CNK),CU_MEMHOSTREGISTER_DEVICEMAP));
        SafeCall(cuMemHostGetDevicePointer(&host_array,array,0));

        SafeCall(cuModuleGetGlobal(&dev_array,&bytes,kernel.module[0].module,"array"));
        if(bytes!=sizeof(host_array)) error("array!");
	SafeCall(cuMemcpyHtoD(dev_array,&host_array,bytes));

#define THREADS 512
#define MAXG    65535
	uint64_t nado = (cnk[RANK]+(THREADS-1))/THREADS;
	uint32_t gridx = nado>MAXG?MAXG:nado;
	uint32_t gridy = (nado+(MAXG-1))/MAXG;
	printf("gridy=%u gridx=%u THREADS=%d\n",gridy, gridx, THREADS);

	kernel.launch(params,THREADS,gridx,gridy);
	kernel.wait();

	SafeCall(cuMemHostUnregister(busylist));
	SafeCall(cuMemHostUnregister(array));

        SafeCall(cuModuleGetGlobal(&dev_changed,&bytes,kernel.module[0].module,"changed"));
        if(bytes!=sizeof(changed)) error("changed!");
	SafeCall(cuMemcpyDtoH(changed,dev_changed,bytes));

	for(i=0;i<CACHESIZE;i++)
		total += changed[i];
	printf("changed=%ju\n",total);

	return 0;
}
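Example #1 registers ordinary heap- and file-backed allocations and feeds the resulting device pointers into module globals. Below is a minimal, self-contained sketch of that register-and-map pattern, reduced to the driver-API calls involved; the CHECK macro, device 0, and buffer size are illustrative, not part of the original project. Note that mapped (zero-copy) host memory requires a context created with CU_CTX_MAP_HOST on a device that reports CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY.

#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>

#define CHECK(call) do {                                          \
    CUresult e_ = (call);                                         \
    if (e_ != CUDA_SUCCESS) {                                     \
        const char *name = NULL;                                  \
        cuGetErrorName(e_, &name);                                \
        fprintf(stderr, "%s failed: %s\n", #call,                 \
                name ? name : "?");                               \
        exit(1);                                                  \
    }                                                             \
} while (0)

int main(void) {
    CUdevice dev;
    CUcontext ctx;
    CHECK(cuInit(0));
    CHECK(cuDeviceGet(&dev, 0));

    /* Mapped host memory needs a device that can map it and a
       context created with CU_CTX_MAP_HOST. */
    int can_map = 0;
    CHECK(cuDeviceGetAttribute(&can_map,
          CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev));
    if (!can_map) {
        fprintf(stderr, "device cannot map host memory\n");
        return 1;
    }
    CHECK(cuCtxCreate(&ctx, CU_CTX_MAP_HOST, dev));

    /* Pin an ordinary heap allocation and expose it to the device. */
    size_t bytes = 1 << 20;
    void *host = malloc(bytes);
    CHECK(cuMemHostRegister(host, bytes, CU_MEMHOSTREGISTER_DEVICEMAP));

    /* Device-side alias of the same physical pages; kernels can use it
       directly, as Example #1 does with host_busylist/host_array. */
    CUdeviceptr dptr;
    CHECK(cuMemHostGetDevicePointer(&dptr, host, 0));
    printf("host %p maps to device 0x%llx\n", host,
           (unsigned long long)dptr);

    CHECK(cuMemHostUnregister(host));
    free(host);
    CHECK(cuCtxDestroy(ctx));
    return 0;
}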
Example #2
static void
map_init (struct ptx_stream *s)
{
  CUresult r;

  int size = getpagesize ();

  assert (s);
  assert (!s->d);
  assert (!s->h);

  r = cuMemAllocHost (&s->h, size);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemAllocHost error: %s", cuda_error (r));

  r = cuMemHostGetDevicePointer (&s->d, s->h, 0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemHostGetDevicePointer error: %s", cuda_error (r));

  assert (s->h);

  s->h_begin = s->h;
  s->h_end = s->h_begin + size;
  s->h_next = s->h_prev = s->h_tail = s->h_begin;

  assert (s->h_next);
  assert (s->h_end);
}
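map_init depends on cuMemAllocHost allocations being device-mappable, which holds when the context was created with CU_CTX_MAP_HOST (or under unified addressing). A variant that requests the mapping explicitly at allocation time is cuMemHostAlloc with CU_MEMHOSTALLOC_DEVICEMAP; the sketch below (hypothetical name map_init_explicit, same struct and helpers as above) shows the difference:

static void
map_init_explicit (struct ptx_stream *s)
{
  CUresult r;
  int size = getpagesize ();

  /* Ask for a device-mapped pinned allocation up front instead of
     relying on the context having been created with CU_CTX_MAP_HOST. */
  r = cuMemHostAlloc (&s->h, size, CU_MEMHOSTALLOC_DEVICEMAP);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemHostAlloc error: %s", cuda_error (r));

  r = cuMemHostGetDevicePointer (&s->d, s->h, 0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemHostGetDevicePointer error: %s", cuda_error (r));

  s->h_begin = s->h;
  s->h_end = s->h_begin + size;
  s->h_next = s->h_prev = s->h_tail = s->h_begin;
}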
Example #3
void *swanDevicePtrForHostPtr( void *ptr ) {
	CUdeviceptr ptrd;
	/* Second parameter is the host pointer itself (void *); casting it
	   to CUdeviceptr, as the original did, is incorrect. */
	CUresult err = cuMemHostGetDevicePointer( &ptrd, ptr, 0 );
	if ( err != CUDA_SUCCESS ) {
		error("swanDevicePtrForHostPtr failed\n" );
	}
	return (void*) ptrd;
}
Example #4
SEXP
R_auto_cuMemHostGetDevicePointer(SEXP r_p, SEXP r_Flags)
{
    SEXP r_ans = R_NilValue;
    CUdeviceptr pdptr;
    void * p = GET_REF(r_p, void );
    unsigned int Flags = REAL(r_Flags)[0];
    CUresult ans;
    ans = cuMemHostGetDevicePointer(& pdptr,  p,  Flags);
    if(ans)
       return(R_cudaErrorInfo(ans));
    r_ans = R_createRef((void*) pdptr, "CUdeviceptr") ;
    return(r_ans);
}
Example #5
GPUPtr GPUInterface::GetDevicePointer(void* hPtr) {
#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\t\t\tEntering GPUInterface::GetDevicePointer\n");
#endif

    GPUPtr dPtr;

    SAFE_CUPP(cuMemHostGetDevicePointer(&dPtr, hPtr, 0));

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr,"\t\t\tLeaving  GPUInterface::GetDevicePointer\n");
#endif

    return dPtr;
}
Example #6
File: pocl-cuda.c Project: jrprice/pocl
cl_int
pocl_cuda_alloc_mem_obj (cl_device_id device, cl_mem mem_obj, void *host_ptr)
{
  cuCtxSetCurrent (((pocl_cuda_device_data_t *)device->data)->context);

  CUresult result;
  void *b = NULL;

  /* if memory for this global memory is not yet allocated -> do it */
  if (mem_obj->device_ptrs[device->global_mem_id].mem_ptr == NULL)
    {
      cl_mem_flags flags = mem_obj->flags;

      if (flags & CL_MEM_USE_HOST_PTR)
        {
#if defined __arm__
          // cuMemHostRegister is not supported on ARM
          // Allocate device memory and perform explicit copies
          // before and after running a kernel
          result = cuMemAlloc ((CUdeviceptr *)&b, mem_obj->size);
          CUDA_CHECK (result, "cuMemAlloc");
#else
          result = cuMemHostRegister (host_ptr, mem_obj->size,
                                      CU_MEMHOSTREGISTER_DEVICEMAP);
          if (result != CUDA_SUCCESS
              && result != CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED)
            CUDA_CHECK (result, "cuMemHostRegister");
          result = cuMemHostGetDevicePointer ((CUdeviceptr *)&b, host_ptr, 0);
          CUDA_CHECK (result, "cuMemHostGetDevicePointer");
#endif
        }
      else if (flags & CL_MEM_ALLOC_HOST_PTR)
        {
          result = cuMemHostAlloc (&mem_obj->mem_host_ptr, mem_obj->size,
                                   CU_MEMHOSTALLOC_DEVICEMAP);
          CUDA_CHECK (result, "cuMemHostAlloc");
          result = cuMemHostGetDevicePointer ((CUdeviceptr *)&b,
                                              mem_obj->mem_host_ptr, 0);
          CUDA_CHECK (result, "cuMemHostGetDevicePointer");
        }
      else
        {
          result = cuMemAlloc ((CUdeviceptr *)&b, mem_obj->size);
          if (result != CUDA_SUCCESS)
            {
              const char *err;
              cuGetErrorName (result, &err);
              POCL_MSG_PRINT2 (__FUNCTION__, __LINE__,
                               "-> Failed to allocate memory: %s\n", err);
              return CL_MEM_OBJECT_ALLOCATION_FAILURE;
            }
        }

      if (flags & CL_MEM_COPY_HOST_PTR)
        {
          result = cuMemcpyHtoD ((CUdeviceptr)b, host_ptr, mem_obj->size);
          CUDA_CHECK (result, "cuMemcpyHtoD");
        }

      mem_obj->device_ptrs[device->global_mem_id].mem_ptr = b;
      mem_obj->device_ptrs[device->global_mem_id].global_mem_id
          = device->global_mem_id;
    }

  /* copy already allocated global mem info to devices own slot */
  mem_obj->device_ptrs[device->dev_id]
      = mem_obj->device_ptrs[device->global_mem_id];

  return CL_SUCCESS;
}
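pocl_cuda_alloc_mem_obj picks one of three strategies: pin and map the caller's buffer (CL_MEM_USE_HOST_PTR), allocate pinned host memory that is already mapped (CL_MEM_ALLOC_HOST_PTR), or fall back to plain device memory. The helper below is a condensed sketch of that decision with a hypothetical name and simplified error handling; it is not pocl's API.

#include <cuda.h>
#include <stddef.h>

typedef enum { USE_HOST_PTR, ALLOC_HOST_PTR, DEVICE_ONLY } alloc_kind;

static CUresult
alloc_for_device (alloc_kind kind, void **host_io, size_t size,
                  CUdeviceptr *dev_out)
{
  CUresult r;
  switch (kind)
    {
    case USE_HOST_PTR:
      /* Pin the caller's buffer in place; tolerate double registration. */
      r = cuMemHostRegister (*host_io, size, CU_MEMHOSTREGISTER_DEVICEMAP);
      if (r != CUDA_SUCCESS && r != CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED)
        return r;
      return cuMemHostGetDevicePointer (dev_out, *host_io, 0);
    case ALLOC_HOST_PTR:
      /* Driver-allocated pinned memory, mapped at allocation time. */
      r = cuMemHostAlloc (host_io, size, CU_MEMHOSTALLOC_DEVICEMAP);
      if (r != CUDA_SUCCESS)
        return r;
      return cuMemHostGetDevicePointer (dev_out, *host_io, 0);
    default:
      /* Plain device memory; host<->device copies stay explicit. */
      return cuMemAlloc (dev_out, size);
    }
}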
Example #7
int gib_recover ( void *buffers, int buf_size, int *buf_ids, int recover_last,
		  gib_context c ) {
  ERROR_CHECK_FAIL(cuCtxPushCurrent(((gpu_context)(c->acc_context))->pCtx));
#if !GIB_USE_MMAP
  if (buf_size > gib_buf_size) {
    int rc = gib_cpu_recover(buffers, buf_size, buf_ids, recover_last, c);
    ERROR_CHECK_FAIL(cuCtxPopCurrent(&((gpu_context)(c->acc_context))->pCtx));
    return rc;
  }
#endif

  int i, j;
  int n = c->n;
  int m = c->m;
  unsigned char A[128*128], inv[128*128], modA[128*128];
  for (i = n; i < n+recover_last; i++)
    if (buf_ids[i] >= n) {
      fprintf(stderr, "Attempting to recover a parity buffer, not allowed\n");
      return GIB_ERR;
    }

  gib_galois_gen_A(A, m+n, n);

  /* Modify the matrix to have the failed drives reflected */
  for (i = 0; i < n; i++) 
    for (j = 0; j < n; j++) 
      modA[i*n+j] = A[buf_ids[i]*n+j];

  gib_galois_gaussian_elim(modA, inv, n, n);

  /* Copy row buf_ids[i] into row i */
  for (i = n; i < n+recover_last; i++)
    for (j = 0; j < n; j++)
      modA[i*n+j] = inv[buf_ids[i]*n+j];

  int nthreads_per_block = 128;
  int fetch_size = sizeof(int)*nthreads_per_block;
  int nblocks = (buf_size + fetch_size - 1)/fetch_size;
  gpu_context gpu_c = (gpu_context) c->acc_context;

  CUdeviceptr F_d;
  ERROR_CHECK_FAIL(cuModuleGetGlobal(&F_d, NULL, gpu_c->module, "F_d"));
  ERROR_CHECK_FAIL(cuMemcpyHtoD(F_d, modA+n*n, (c->m)*(c->n)));

#if !GIB_USE_MMAP
  ERROR_CHECK_FAIL(cuMemcpyHtoD(gpu_c->buffers, buffers, (c->n)*buf_size));
#endif
  ERROR_CHECK_FAIL(cuFuncSetBlockShape(gpu_c->recover, nthreads_per_block, 
				       1, 1));
  int offset = 0;
  void *ptr;
#if GIB_USE_MMAP
  CUdeviceptr cpu_buffers;
  ERROR_CHECK_FAIL(cuMemHostGetDevicePointer(&cpu_buffers, buffers, 0));
  ptr = (void *)cpu_buffers;
#else
  ptr = (void *)gpu_c->buffers;
#endif
  ERROR_CHECK_FAIL(cuParamSetv(gpu_c->recover, offset, &ptr, sizeof(ptr)));
  offset += sizeof(ptr);
  ERROR_CHECK_FAIL(cuParamSetv(gpu_c->recover, offset, &buf_size, 
			       sizeof(buf_size)));
  offset += sizeof(buf_size);
  ERROR_CHECK_FAIL(cuParamSetv(gpu_c->recover, offset, &recover_last, 
			       sizeof(recover_last)));
  offset += sizeof(recover_last);
  ERROR_CHECK_FAIL(cuParamSetSize(gpu_c->recover, offset));
  ERROR_CHECK_FAIL(cuLaunchGrid(gpu_c->recover, nblocks, 1));
#if !GIB_USE_MMAP
  CUdeviceptr tmp_d = gpu_c->buffers + c->n*buf_size;
  void *tmp_h = (void *)((unsigned char *)(buffers) + c->n*buf_size);
  ERROR_CHECK_FAIL(cuMemcpyDtoH(tmp_h, tmp_d, recover_last*buf_size));
#else
  ERROR_CHECK_FAIL(cuCtxSynchronize());
#endif
  ERROR_CHECK_FAIL(cuCtxPopCurrent(&((gpu_context)(c->acc_context))->pCtx));
  return GIB_SUC;
}
Example #8
int gib_generate ( void *buffers, int buf_size, gib_context c ) {
  ERROR_CHECK_FAIL(cuCtxPushCurrent(((gpu_context)(c->acc_context))->pCtx));
  /* Do it all at once if the buffers are small enough */
#if !GIB_USE_MMAP
  /* This is too large to do at once in the GPU memory we have allocated.
   * Split it into several noncontiguous jobs. 
   */
  if (buf_size > gib_buf_size) {
    int rc = gib_generate_nc(buffers, buf_size, buf_size, c);
    ERROR_CHECK_FAIL(cuCtxPopCurrent(&((gpu_context)(c->acc_context))->pCtx));
    return rc;
  }
#endif

  int nthreads_per_block = 128;
  int fetch_size = sizeof(int)*nthreads_per_block;
  int nblocks = (buf_size + fetch_size - 1)/fetch_size;
  gpu_context gpu_c = (gpu_context) c->acc_context;
  
  unsigned char F[256*256];
  gib_galois_gen_F(F, c->m, c->n);
  CUdeviceptr F_d;
  ERROR_CHECK_FAIL(cuModuleGetGlobal(&F_d, NULL, gpu_c->module, "F_d"));
  ERROR_CHECK_FAIL(cuMemcpyHtoD(F_d, F, (c->m)*(c->n)));
  
#if !GIB_USE_MMAP
  /* Copy the buffers to memory */
  ERROR_CHECK_FAIL(cuMemcpyHtoD(gpu_c->buffers, buffers, 
				(c->n)*buf_size));
#endif
  /* Configure and launch */
  ERROR_CHECK_FAIL(cuFuncSetBlockShape(gpu_c->checksum, nthreads_per_block,
				       1, 1));
  int offset = 0;
  void *ptr;
#if GIB_USE_MMAP
  CUdeviceptr cpu_buffers;
  ERROR_CHECK_FAIL(cuMemHostGetDevicePointer(&cpu_buffers, buffers, 0));
  ptr = (void *)cpu_buffers;
#else
  ptr = (void *)(gpu_c->buffers);
#endif
  ERROR_CHECK_FAIL(cuParamSetv(gpu_c->checksum, offset, &ptr, sizeof(ptr)));
  offset += sizeof(ptr);
  ERROR_CHECK_FAIL(cuParamSetv(gpu_c->checksum, offset, &buf_size,
			       sizeof(buf_size)));
  offset += sizeof(buf_size);
  ERROR_CHECK_FAIL(cuParamSetSize(gpu_c->checksum, offset));
  ERROR_CHECK_FAIL(cuLaunchGrid(gpu_c->checksum, nblocks, 1));

  /* Get the results back */
#if !GIB_USE_MMAP
  CUdeviceptr tmp_d = gpu_c->buffers + c->n*buf_size;
  void *tmp_h = (void *)((unsigned char *)(buffers) + c->n*buf_size);
  ERROR_CHECK_FAIL(cuMemcpyDtoH(tmp_h, tmp_d, (c->m)*buf_size));
#else
  ERROR_CHECK_FAIL(cuCtxSynchronize());
#endif
  ERROR_CHECK_FAIL(cuCtxPopCurrent(&((gpu_context)(c->acc_context))->pCtx));
  return GIB_SUC; 
}
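Both Gibraltar functions drive their kernels through the pre-CUDA-4.0 execution-control API (cuFuncSetBlockShape, cuParamSetv, cuParamSetSize, cuLaunchGrid), which has been deprecated since CUDA 4.0 in favor of cuLaunchKernel. A sketch of the equivalent launch for gib_generate, reusing ptr, buf_size, nblocks, nthreads_per_block, and gpu_c from the function above:

/* Kernel arguments are passed as an array of pointers rather than being
 * packed byte-by-byte into the parameter space. */
void *args[] = { &ptr, &buf_size };
ERROR_CHECK_FAIL(cuLaunchKernel(gpu_c->checksum,
                                nblocks, 1, 1,            /* grid dims    */
                                nthreads_per_block, 1, 1, /* block dims   */
                                0, NULL,                  /* smem, stream */
                                args, NULL));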