Пример #1
0
/*
 * Program entry point.
 *
 * Maps the busy list and the data array (both obtained from malloc_file) into
 * the device address space, stores their device pointers in the module
 * globals "busylist" and "array", launches the kernel, and finally prints the
 * sum of the "changed" counters read back from the device.
 *
 * argv[1], when given, must be equal to the compile-time RANK; otherwise
 * error() is called.
 */
int main(int argc, char ** argv){
	int i;
        if( (argc>=2) && (atoi(argv[1])!=RANK)) error("rank %d mandatory",RANK);
        printf("CUDA RANK=%d\n",RANK);

	kernel.print();
// build busylist
        busylist = (uint32_t*)malloc_file(CNK*sizeof(uint32_t),FMODE_RO,BLIST_FORMAT,RANK);

// put busylist: register the host buffer as device-mapped and fetch the
// device-side address of that mapping
        SafeCall(cuMemHostRegister(busylist,CNK*sizeof*busylist,CU_MEMHOSTREGISTER_DEVICEMAP));
        SafeCall(cuMemHostGetDevicePointer(&host_busylist,busylist,0));

// the module global "busylist" must have exactly the size of that pointer;
// the mapped address is then copied into it
        SafeCall(cuModuleGetGlobal(&dev_busylist,&bytes,kernel.module[0].module,"busylist"));
        if(bytes!=sizeof(host_busylist)) error("busylist!");
	SafeCall(cuMemcpyHtoD(dev_busylist,&host_busylist,bytes));
// put array: same register / get-pointer / patch-global sequence as above
#ifdef IN_mk_data
        // mkdir failure (e.g. directory already exists) is deliberately ignored
        mkdir(DATADIR,0755); errno=0;
        array = (unsigned char *)malloc_file(abytes(RANK,CNK),1,DATADIR"%d",RANK);
#else
        array = (unsigned char *)malloc_file(abytes(RANK,CNK),0,DATADIR"%d",RANK);
#endif
        SafeCall(cuMemHostRegister(array,abytes(RANK,CNK),CU_MEMHOSTREGISTER_DEVICEMAP));
        SafeCall(cuMemHostGetDevicePointer(&host_array,array,0));

        SafeCall(cuModuleGetGlobal(&dev_array,&bytes,kernel.module[0].module,"array"));
        if(bytes!=sizeof(host_array)) error("array!");
	SafeCall(cuMemcpyHtoD(dev_array,&host_array,bytes));

// Launch geometry: nado is the number of THREADS-sized blocks needed to cover
// cnk[RANK] items. gridx is capped at MAXG and gridy is the number of
// MAXG-wide rows needed, so gridx*gridy can be larger than nado.
// NOTE(review): the kernel presumably bounds-checks its index - confirm.
#define THREADS 512
#define MAXG    65535
uint64_t nado = (cnk[RANK] +(THREADS-1))/THREADS;
uint32_t gridx = nado>MAXG?MAXG:nado;
uint32_t gridy = (nado+(MAXG-1))/MAXG;
// gridx/gridy are unsigned, so they are printed with %u
printf("gridy=%u gridx=%u THREAD=%d\n",gridy, gridx, THREADS);

	kernel.launch(params,THREADS,gridx,gridy);
	kernel.wait();

	// the kernel has finished (kernel.wait above), so the mappings can go
	SafeCall(cuMemHostUnregister(busylist));
	SafeCall(cuMemHostUnregister(array));

// read the module global "changed" back into the host-side changed buffer
        SafeCall(cuModuleGetGlobal(&dev_changed,&bytes,kernel.module[0].module,"changed"));
        if(bytes!=sizeof(changed)) error("changed!");
	SafeCall(cuMemcpyDtoH(changed,dev_changed,bytes));

	// accumulate the first CACHESIZE counters into total
	for(i=0;i<CACHESIZE;i++)
		total += changed[i];
	printf("changed=%ju\n",total);

	return 0;
}
Пример #2
0
/*
 * R-callable wrapper around cuMemHostRegister().
 *
 * r_p        - R reference object; the host pointer is extracted with GET_REF
 * r_bytesize - R numeric vector; its first element is used as the byte count
 * r_Flags    - R numeric vector; its first element is used as the flags value
 *
 * Returns the CUresult of the call converted by Renum_convert_CUresult().
 */
SEXP R_auto_cuMemHostRegister(SEXP r_p, SEXP r_bytesize, SEXP r_Flags)
{
    /* Unpack the R arguments into the C types the driver API expects. */
    void *host_ptr = GET_REF(r_p, void );
    size_t n_bytes = REAL(r_bytesize)[0];
    unsigned int reg_flags = REAL(r_Flags)[0];

    CUresult status = cuMemHostRegister(host_ptr, n_bytes, reg_flags);

    return Renum_convert_CUresult(status);
}
Пример #3
0
int * transpose(int *matrix, int height, int width) {
    if (cuda.get() == NULL) {
        cuda.reset(new Cuda());
    }
    int *result;
    result = new int[height*width];
    if (width >= BLOCK_DIM_X && height >= BLOCK_DIM_X) {
        // gpu_transpose_naive(result, matrix, height, width);
        cuMemHostRegister(result, sizeof(int)*height*width, 0);  // TODO check result value
        gpu_transpose_with_shared_mem(result, matrix, height, width);
        cuMemHostUnregister(result);  // TODO check result value
    } else {
        cpu_transpose(result, matrix, height, width);
    }
    return result;
}
Пример #4
0
/**
 * Call the CUDA register function so we pin the memory in the CUDA
 * space.
 *
 * Initializes the common CUDA support on first use. The registration is
 * attempted only when both mca_common_cuda_enabled and
 * mca_common_cuda_register_memory are set; a failure is reported through
 * orte_show_help() and is not fatal.
 *
 * @param ptr     start of the host buffer to register
 * @param amount  size of the buffer in bytes
 * @param msg     label included in the help/verbose output
 */
void mca_common_cuda_register(void *ptr, size_t amount, char *msg) {
    int res;

    if (!initialized) {
        mca_common_cuda_init();
    }

    if (mca_common_cuda_enabled && mca_common_cuda_register_memory) {
        res = cuMemHostRegister(ptr, amount, 0);
        if (res != CUDA_SUCCESS) {
            /* If registering the memory fails, print a message and continue.
             * This is not a fatal error. */
            orte_show_help("help-mpi-common-cuda.txt", "cuMemHostRegister failed",
                           true, ptr, amount, res, msg);
        } else {
            /* Print the size as unsigned long: casting to int would report a
             * wrong (possibly negative) value for buffers of 2 GiB or more. */
            opal_output_verbose(20, mca_common_cuda_output,
                                "CUDA: cuMemHostRegister OK on mpool %s: "
                                "address=%p, bufsize=%lu",
                                msg, ptr, (unsigned long)amount);
        }
    }
}
Пример #5
0
/* Provides the CUDA-side storage for an OpenCL memory object.
 *
 * If the object has no pointer yet for this device's global memory, one is
 * obtained according to the cl_mem flags:
 *   - CL_MEM_USE_HOST_PTR:   host_ptr is registered as device-mapped memory
 *                            (on ARM a separate device allocation is made);
 *   - CL_MEM_ALLOC_HOST_PTR: device-mapped host memory is allocated and kept
 *                            in mem_obj->mem_host_ptr;
 *   - otherwise:             plain device memory is allocated.
 * With CL_MEM_COPY_HOST_PTR the contents of host_ptr are then copied into
 * the new storage. Finally the entry is copied into the device's own slot.
 *
 * Returns CL_SUCCESS, or CL_MEM_OBJECT_ALLOCATION_FAILURE when the plain
 * device allocation fails. Other CUDA failures are handed to CUDA_CHECK.
 */
cl_int
pocl_cuda_alloc_mem_obj (cl_device_id device, cl_mem mem_obj, void *host_ptr)
{
  cuCtxSetCurrent (((pocl_cuda_device_data_t *)device->data)->context);

  CUresult result;
  void *b = NULL;

  /* if memory for this global memory is not yet allocated -> do it */
  if (mem_obj->device_ptrs[device->global_mem_id].mem_ptr == NULL)
    {
      cl_mem_flags flags = mem_obj->flags;

      if (flags & CL_MEM_USE_HOST_PTR)
        {
#if defined __arm__
          // cuMemHostRegister is not supported on ARM
          // Allocate device memory and perform explicit copies
          // before and after running a kernel
          result = cuMemAlloc ((CUdeviceptr *)&b, mem_obj->size);
          CUDA_CHECK (result, "cuMemAlloc");
#else
          result = cuMemHostRegister (host_ptr, mem_obj->size,
                                      CU_MEMHOSTREGISTER_DEVICEMAP);
          /* A buffer that is already registered is fine: its device pointer
             can still be queried below. Braces keep the macro expansion
             safely inside the condition. */
          if (result != CUDA_SUCCESS
              && result != CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED)
            {
              CUDA_CHECK (result, "cuMemHostRegister");
            }
          result = cuMemHostGetDevicePointer ((CUdeviceptr *)&b, host_ptr, 0);
          CUDA_CHECK (result, "cuMemHostGetDevicePointer");
#endif
        }
      else if (flags & CL_MEM_ALLOC_HOST_PTR)
        {
          /* cuMemHostAlloc takes the CU_MEMHOSTALLOC_* flags, not the
             CU_MEMHOSTREGISTER_* ones. */
          result = cuMemHostAlloc (&mem_obj->mem_host_ptr, mem_obj->size,
                                   CU_MEMHOSTALLOC_DEVICEMAP);
          CUDA_CHECK (result, "cuMemHostAlloc");
          result = cuMemHostGetDevicePointer ((CUdeviceptr *)&b,
                                              mem_obj->mem_host_ptr, 0);
          CUDA_CHECK (result, "cuMemHostGetDevicePointer");
        }
      else
        {
          result = cuMemAlloc ((CUdeviceptr *)&b, mem_obj->size);
          if (result != CUDA_SUCCESS)
            {
              const char *err;
              cuGetErrorName (result, &err);
              POCL_MSG_PRINT2 (__FUNCTION__, __LINE__,
                               "-> Failed to allocate memory: %s\n", err);
              return CL_MEM_OBJECT_ALLOCATION_FAILURE;
            }
        }

      /* initialise the new storage from the caller's buffer if requested */
      if (flags & CL_MEM_COPY_HOST_PTR)
        {
          result = cuMemcpyHtoD ((CUdeviceptr)b, host_ptr, mem_obj->size);
          CUDA_CHECK (result, "cuMemcpyHtoD");
        }

      mem_obj->device_ptrs[device->global_mem_id].mem_ptr = b;
      mem_obj->device_ptrs[device->global_mem_id].global_mem_id
          = device->global_mem_id;
    }

  /* copy already allocated global mem info to devices own slot */
  mem_obj->device_ptrs[device->dev_id]
      = mem_obj->device_ptrs[device->global_mem_id];

  return CL_SUCCESS;
}