Example #1
File: ov.c Project: CPFL/gtraffic
/*
 * download data from device memory to host memory
 */
void
download(double x[], double v[], int *error, double *s_time){

  res = cuMemcpyDtoH(x, x_dev, N * sizeof(double));
  if(res != CUDA_SUCCESS){
    printf("cuMemcpyDtoH(x) failed: res = %s\n", conv(res));
    exit(1);
  }
  
  res = cuMemcpyDtoH(v, v_dev, N * sizeof(double));
  if(res != CUDA_SUCCESS){
    printf("cuMemcpyDtoH(v) failed: res = %s\n", conv(res));
    exit(1);
  }
  
  res = cuMemcpyDtoH(error, error_dev, sizeof(int));
  if(res != CUDA_SUCCESS){
    printf("cuMemcpyDtoH(error) failed: res = %s\n", conv(res));
    exit(1);
  }

  res = cuMemcpyDtoH(s_time, s_time_dev, sizeof(double));
  if(res != CUDA_SUCCESS){
    printf("cuMemcpyDtoH(s_time) failed: res = %s\n", conv(res));
    exit(1);
  }

}
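The snippet relies on file-scope state from ov.c that is not shown. A minimal sketch of the assumed context (names taken from the snippet, definitions guessed) might be:

/* Hypothetical file-scope context for the download() snippet above. */
#define N 1024                       /* element count; the real value is not shown */
static CUresult res;                 /* last CUDA driver-API result */
static CUdeviceptr x_dev, v_dev;     /* device buffers mirrored into x[] and v[] */
static CUdeviceptr error_dev, s_time_dev;
const char *conv(CUresult res);      /* maps a CUresult to a printable string */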
Example #2
/*
 * Class:     edu_syr_pcpratts_rootbeer_runtime2_cuda_CudaRuntime2
 * Method:    runBlocks
 * Signature: (III)I
 */
JNIEXPORT jint JNICALL Java_edu_syr_pcpratts_rootbeer_runtime2_cuda_CudaRuntime2_runBlocks
  (JNIEnv *env, jobject this_obj, jint num_blocks, jint block_shape, jint grid_shape){

  CUresult status;
  jlong * infoSpace = (jlong *) malloc(gc_space_size);
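  // publish the current heap end pointer in slot 1 of the GC info block before uploading it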
  infoSpace[1] = heapEndPtr;
  cuMemcpyHtoD(gcInfoSpace, infoSpace, gc_space_size);
  cuMemcpyHtoD(gpuToSpace, toSpace, heapEndPtr);
  //cuMemcpyHtoD(gpuTexture, textureMemory, textureMemSize);
  cuMemcpyHtoD(gpuHandlesMemory, handlesMemory, num_blocks * sizeof(jlong));
  cuMemcpyHtoD(gpuHeapEndPtr, &heapEndPtr, sizeof(jlong));
  cuMemcpyHtoD(gpuBufferSize, &bufferSize, sizeof(jlong));
  
/*
  status = cuModuleGetTexRef(&cache, cuModule, "m_Cache");  
  if (CUDA_SUCCESS != status) 
  {
    printf("error in cuModuleGetTexRef %d\n", status);
  }

  status = cuTexRefSetAddress(0, cache, gpuTexture, textureMemSize);
  if (CUDA_SUCCESS != status) 
  {
    printf("error in cuTextRefSetAddress %d\n", status);
  }
*/

  status = cuFuncSetBlockShape(cuFunction, block_shape, 1, 1);
  if (CUDA_SUCCESS != status) 
  {
    printf("error in cuFuncSetBlockShape %d\n", status);
    return (jint) status;
  }

  status = cuLaunchGrid(cuFunction, grid_shape, 1);
  if (CUDA_SUCCESS != status) 
  {
    printf("error in cuLaunchGrid %d\n", status);
    fflush(stdout);
    return (jint) status;
  }

  status = cuCtxSynchronize();  
  if (CUDA_SUCCESS != status) 
  {
    printf("error in cuCtxSynchronize %d\n", status);
    return (jint) status;
  }

  cuMemcpyDtoH(infoSpace, gcInfoSpace, gc_space_size);
  heapEndPtr = infoSpace[1];
  cuMemcpyDtoH(toSpace, gpuToSpace, heapEndPtr);
  cuMemcpyDtoH(exceptionsMemory, gpuExceptionsMemory, num_blocks * sizeof(jlong));
  free(infoSpace);
  
  return 0;
}
Example #3
CUresult loadAndRunDualTestFunction(CUmodule *phModule, std::string name, CUdeviceptr &d_data0, 
				CUdeviceptr &d_data1, 
				DataStruct *h_data0, 
				DataStruct *h_data1, 
				unsigned int memSize, 
                                int thread_x=1,int thread_y=1,int thread_z=1,
                                int block_x=1, int block_y=1, int block_z=1)
{
  //  std::cout << "  Start Loading" << std::endl;

  // load the data to the device
  cuMemcpyHtoD(d_data0, h_data0, memSize);         
  cuMemcpyHtoD(d_data1, h_data1, memSize);         

  // Locate the kernel entry point
  CUfunction phKernel = 0;
  CUresult status = cuModuleGetFunction(&phKernel, *phModule, name.data());
   if (status != CUDA_SUCCESS)
     {printf("ERROR: could not load function\n");}
    
  // Set the kernel parameters
  status = cuFuncSetBlockShape(phKernel, thread_x, thread_y, thread_z);
   if (status != CUDA_SUCCESS)
     {printf("ERROR: during setBlockShape\n");}

  int paramOffset = 0, size=0;

  size = sizeof(CUdeviceptr);
  status = cuParamSetv(phKernel, paramOffset, &d_data0, size);
  paramOffset += size;
  status = cuParamSetv(phKernel, paramOffset, &d_data1, size);
  paramOffset += size;

  status = cuParamSetSize(phKernel, paramOffset);
   if (status != CUDA_SUCCESS)
     {printf("ERROR: during cuParamSetSize\n");}
    
  // Launch the kernel
  status = cuLaunchGrid(phKernel, block_x, block_y);
  if (status != CUDA_SUCCESS)
    {printf("ERROR: during grid launch\n");}

  //  std::cout << "  launched CUDA kernel!!" << std::endl;
  
  // Copy the result back to the host
  status = cuMemcpyDtoH(h_data0, d_data0, memSize);
  if (status != CUDA_SUCCESS)
    {printf("ERROR: during MemcpyDtoH\n");}
  status = cuMemcpyDtoH(h_data1, d_data1, memSize);
  if (status != CUDA_SUCCESS)
    {printf("ERROR: during MemcpyDtoH\n");}

  return status;
}
Example #4
int main(){
	init_test();
	const std::string source = 
	".version 4.2\n"
	".target sm_20\n"
	".address_size 64\n"
	".visible .entry kernel(.param .u64 kernel_param_0) {\n"
	".reg .s32 	%r<2>;\n"
	".reg .s64 	%rd<3>;\n"
	"bra 	BB1_2;\n"
	"ld.param.u64 	%rd1, [kernel_param_0];\n"
	"cvta.to.global.u64 	%rd2, %rd1;\n"
	"mov.u32 	%r1, 5;\n"
	"st.global.u32 	[%rd2], %r1;\n"
	"BB1_2: ret;\n"
	"}\n";
	CUmodule modId = 0;
	CUfunction funcHandle = 0;
	cu_assert(cuModuleLoadData(&modId, source.c_str()));
	cu_assert(cuModuleGetFunction(&funcHandle, modId, "kernel"));
	CUdeviceptr devValue;
	int hostValue = 10;
	cu_assert(cuMemAlloc(&devValue, sizeof(int)));
	cu_assert(cuMemcpyHtoD(devValue, &hostValue, sizeof(hostValue)));
	void * params[] = {&devValue};
	cu_assert(cuLaunchKernel(funcHandle, 1,1,1, 1,1,1, 0,0, params, nullptr));
	cu_assert(cuMemcpyDtoH(&hostValue, devValue, sizeof(hostValue)));
	assert(hostValue == 10);
	std::cout << hostValue << "\n";
	cu_assert(cuMemFree(devValue));
	cu_assert(cuModuleUnload(modId));
	return 0;
}
Example #5
    void *get_read_ptr_host(ComputeEnv *env, size_t read_byte_size) {
        if (host_valid) {
            return host_ptr;
        }

        if (host_ptr == nullptr) {
            host_ptr = _mm_malloc(byte_size, 64);
        }

        if (last_write.type == Processor::OpenCL) {
            OpenCLDev *dev = &env->cl_dev_list[last_write.devid];
            clEnqueueReadBuffer(dev->queue, cl_ptr_list[last_write.devid],
                                CL_TRUE, 0, read_byte_size, host_ptr, 0, nullptr, nullptr);
        } else if (last_write.type == Processor::CUDA) {
            CUDADev *dev = &env->cuda_dev_list[last_write.devid];
            cuCtxPushCurrent(dev->context);
            //double t0 = getsec();
            cuMemcpyDtoH(host_ptr, cuda_ptr_list[last_write.devid], read_byte_size);
            //double t1 = getsec();
            //env->transfer_wait = t1-t0;

            CUcontext old;
            cuCtxPopCurrent(&old);
        } else {
            abort();
        }

        host_valid = true;
        return host_ptr;
    }
Example #6
void sararfftnd_one_complex_to_real(
  sararfftnd_plan plan, sarafft_complex *h_data
) {
  CUdeviceptr d_data;
  size_t planSize = getPlanSize( plan );
  if ( CUDA_SUCCESS != cuMemAlloc( &d_data, planSize ) ) {
    printf( "cuMemAlloc failed for plansize %li!\n", planSize );
    fflush ( stdout );
    exit( 90 );
  }
  if ( CUDA_SUCCESS != cuMemcpyHtoD( d_data, h_data, planSize ) ) {
    printf( "cuMemcpyHtoD failed!\n" );
    fflush ( stdout );
    exit( 91 );
  }
  if ( CUFFT_SUCCESS != cufftExecC2R( plan, ( cufftComplex* )d_data, ( cufftReal* )d_data ) ) {
    printf( "cufftExecR2C failed!\n" );
    fflush ( stdout );
    exit( 92 );
  }
  if ( CUDA_SUCCESS != cuMemcpyDtoH( h_data, d_data, planSize ) ) {
    printf( "cuMemcpyDtoH failed!\n" );
    fflush ( stdout );
    exit( 93 );
  }
  if ( CUDA_SUCCESS != cuMemFree( d_data ) ) {
    printf( "cuMemFree failed!\n" );
    fflush ( stdout );
    exit( 94 );
  }
}
Example #7
void gpu_transpose_with_shared_mem(int *dest, const int *src, int height, int width) {
    assert((width & (width - 1)) == 0);  // TODO
    assert((height & (height - 1)) == 0);

    cuda->set_default_module(CUDA_PTX_PREFIX"transpose.cu.ptx");
    CUfunction transpose_kernel = cuda->get_kernel("transpose_with_shared_mem");

    int grid_dim_x = width / TILE_DIM;
    int grid_dim_y = height / TILE_DIM;

    CUdeviceptr device_src;
    CUdeviceptr device_dest;
    cuMemAlloc(&device_src, width*height*sizeof(int));
    cuMemAlloc(&device_dest, width*height*sizeof(int));
    cuMemcpyHtoD(device_src, src, width*height*sizeof(int));

    void *args[] = {&device_dest, &device_src};
    cuda->launch_kernel_2d_sync(transpose_kernel,
            grid_dim_x, grid_dim_y,
            TILE_DIM, 2,
            args);

    cuMemcpyDtoH(dest, device_dest, width*height*sizeof(int));
    cuMemFree(device_src);
    cuMemFree(device_dest);
    cuda->ctx_synchronize();
}
Example #8
void gpu_transpose_naive(int *dest, const int *src, int height, int width) {
    assert((width & (width - 1)) == 0);  // TODO
    assert((height & (height - 1)) == 0);

    cuda->set_default_module("transpose.ptx");
    CUfunction transpose_kernel = cuda->get_kernel("transpose_naive");

    int grid_dim_x = width / BLOCK_DIM_X;
    int grid_dim_y = height / BLOCK_DIM_Y;

    CUdeviceptr device_src;
    CUdeviceptr device_dest;
    cuMemAlloc(&device_src, width*height*sizeof(int));
    cuMemAlloc(&device_dest, width*height*sizeof(int));
    cuMemcpyHtoD(device_src, src, width*height*sizeof(int));

    void *args[] = {&device_dest, &device_src, &height, &width};
    cuda->launch_kernel_2d_sync(transpose_kernel,
            grid_dim_x, grid_dim_y,
            BLOCK_DIM_X, BLOCK_DIM_Y,
            args);

    cuMemcpyDtoH(dest, device_dest, width*height*sizeof(int));
    cuMemFree(device_src);
    cuMemFree(device_dest);
    cuda->ctx_synchronize();
}
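Both transpose examples go through a project-specific cuda wrapper object whose launch_kernel_2d_sync helper is not shown. A minimal sketch of what such a helper could look like on the raw driver API, with the signature assumed from the call sites, is:

// Hypothetical reconstruction of the launch helper used above (not the project's code).
// The z-dimensions are fixed to 1 and the launch is followed by a synchronize.
void launch_kernel_2d_sync(CUfunction kernel,
                           int grid_x, int grid_y,
                           int block_x, int block_y,
                           void **args) {
    cuLaunchKernel(kernel,
                   grid_x, grid_y, 1,    // grid dimensions
                   block_x, block_y, 1,  // block dimensions
                   0, NULL,              // no dynamic shared memory, default stream
                   args, NULL);
    cuCtxSynchronize();                  // block until the kernel finishes
}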
Example #9
Object cuda_over_map(Object self, int nparts, int *argcv,
        Object *argv, int flags) {
    CUresult error;
    cuInit(0);
    int deviceCount = 0;
    error = cuDeviceGetCount(&deviceCount);
    if (deviceCount == 0) {
        raiseError("No CUDA devices found");
    }
    CUdevice cuDevice;
    CUcontext cuContext;
    CUmodule cuModule;
    CUfunction cuFunc;
    error = cuDeviceGet(&cuDevice, 0);
    error = cuCtxCreate(&cuContext, 0, cuDevice);
    CUdeviceptr d_A;
    CUdeviceptr d_B;
    CUdeviceptr d_res;
    errcheck(cuModuleLoad(&cuModule, grcstring(argv[argcv[0]])));
    CUdeviceptr dps[argcv[0]];
    void *args[argcv[0]+2];
    int size = INT_MAX;
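    /* args layout: args[0] = result buffer, args[1..argcv[0]] = input arrays, args[argcv[0]+1] = element count */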
    for (int i=0; i<argcv[0]; i++) {
        struct CudaFloatArray *a = (struct CudaFloatArray *)argv[i];
        if (a->size < size)
            size = a->size;
        errcheck(cuMemAlloc(&dps[i], size * sizeof(float)));
        errcheck(cuMemcpyHtoD(dps[i], &a->data, size * sizeof(float)));
        args[i+1] = &dps[i];
    }
    struct CudaFloatArray *r =
        (struct CudaFloatArray *)(alloc_CudaFloatArray(size));
    int fsize = sizeof(float) * size;
    errcheck(cuMemAlloc(&d_res, fsize));
    errcheck(cuMemcpyHtoD(d_res, &r->data, fsize));
    args[0] = &d_res;
    args[argcv[0]+1] = &size;

    int threadsPerBlock = 256;
    int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock;
    char name[256];
    strcpy(name, "block");
    strcat(name, grcstring(argv[argcv[0]]) + strlen("_cuda/"));
    for (int i=0; name[i] != 0; i++)
        if (name[i] == '.') {
            name[i] = 0;
            break;
        }
    errcheck(cuModuleGetFunction(&cuFunc, cuModule, name));
    errcheck(cuLaunchKernel(cuFunc, blocksPerGrid, 1, 1,
        threadsPerBlock, 1, 1,
        0,
        NULL, args, NULL));
    errcheck(cuMemcpyDtoH(&r->data, d_res, fsize));
    cuMemFree(d_res);
    for (int i=0; i<argcv[0]; i++)
        cuMemFree(dps[i]);
    return (Object)r;
}
Example #10
void
pocl_cuda_read (void *data, void *host_ptr, const void *device_ptr,
                size_t offset, size_t cb)
{
  CUresult result
      = cuMemcpyDtoH (host_ptr, (CUdeviceptr) (device_ptr + offset), cb);
  CUDA_CHECK (result, "cuMemcpyDtoH");
}
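CUDA_CHECK here is pocl's internal error-reporting helper. A minimal stand-in with the same shape (an assumption, not pocl's actual definition) could be:

/* Hypothetical stand-in for pocl's CUDA_CHECK: report the failing call and its
   numeric result. Requires <stdio.h>. */
#define CUDA_CHECK(result, api)                                              \
  do {                                                                       \
    if ((result) != CUDA_SUCCESS)                                            \
      fprintf(stderr, "pocl-cuda: %s failed: %d\n", (api), (int)(result));   \
  } while (0)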
Example #11
const unsigned long CUDARunner::RunStep()
{
	//unsigned int best=0;
	//unsigned int bestg=~0;
	int offset=0;

	if(m_in==0 || m_out==0 || m_devin==0 || m_devout==0)
	{
		AllocateResources(m_numb,m_numt);
	}
	m_out[0].m_bestnonce=0;
	cuMemcpyHtoD(m_devout,m_out,/*m_numb*m_numt*/sizeof(cuda_out));

	cuMemcpyHtoD(m_devin,m_in,sizeof(cuda_in));

	int loops=GetStepIterations();
	int bits=GetStepBitShift()-1;

	void *ptr=(void *)(size_t)m_devin;
	ALIGN_UP(offset, __alignof(ptr));
	cuParamSetv(m_function,offset,&ptr,sizeof(ptr));
	offset+=sizeof(ptr);

	ptr=(void *)(size_t)m_devout;
	ALIGN_UP(offset, __alignof(ptr));
	cuParamSetv(m_function,offset,&ptr,sizeof(ptr));
	offset+=sizeof(ptr);

	ALIGN_UP(offset, __alignof(loops));
	cuParamSeti(m_function,offset,loops);
	offset+=sizeof(loops);

	ALIGN_UP(offset, __alignof(bits));
	cuParamSeti(m_function,offset,bits);
	offset+=sizeof(bits);

	cuParamSetSize(m_function,offset);

	cuFuncSetBlockShape(m_function,m_numt,1,1);
	cuLaunchGrid(m_function,m_numb,1);

	cuMemcpyDtoH(m_out,m_devout,/*m_numb*m_numt*/sizeof(cuda_out));

	// very unlikely that we will find more than 1 hash with H=0
	// so we'll just return the first one and not even worry about G
	for(int i=0; i<1/*m_numb*m_numt*/; i++)
	{
		if(m_out[i].m_bestnonce!=0)// && m_out[i].m_bestg<bestg)
		{
			return CryptoPP::ByteReverse(m_out[i].m_bestnonce);
			//best=m_out[i].m_bestnonce;
			//bestg=m_out[i].m_bestg;
		}
	}

	return 0;

}
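ALIGN_UP here rounds the running parameter offset up to each argument's alignment, in the style of the legacy CUDA SDK samples:

/* Round offset up to the next multiple of alignment (a power of two), as in
   the old CUDA SDK parameter-passing idiom for cuParamSetv/cuParamSeti. */
#define ALIGN_UP(offset, alignment) \
    (offset) = ((offset) + (alignment) - 1) & ~((alignment) - 1)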
Example #12
WEAK int halide_copy_to_host(void *user_context, buffer_t* buf) {
    if (!buf->dev_dirty) {
        return 0;
    }

    DEBUG_PRINTF( user_context, "CUDA: halide_copy_to_host (user_context: %p, buf: %p)\n", user_context, buf );

    CudaContext ctx(user_context);
    if (ctx.error != CUDA_SUCCESS) {
        return ctx.error;
    }

    // Need to check dev_dirty again, in case another thread did the
    // copy_to_host before the serialization point above.
    if (buf->dev_dirty) {
        #ifdef DEBUG
        uint64_t t_before = halide_current_time_ns(user_context);
        #endif

        halide_assert(user_context, buf->dev && buf->host);
        halide_assert(user_context, halide_validate_dev_pointer(user_context, buf));

        _dev_copy c = _make_dev_to_host_copy(buf);

        for (int w = 0; w < c.extent[3]; w++) {
            for (int z = 0; z < c.extent[2]; z++) {
                for (int y = 0; y < c.extent[1]; y++) {
                    for (int x = 0; x < c.extent[0]; x++) {
                        uint64_t off = (x * c.stride_bytes[0] +
                                        y * c.stride_bytes[1] +
                                        z * c.stride_bytes[2] +
                                        w * c.stride_bytes[3]);
                        CUdeviceptr src = (CUdeviceptr)(c.src + off);
                        void *dst = (void *)(c.dst + off);
                        uint64_t size = c.chunk_size;
                        DEBUG_PRINTF( user_context, "    cuMemcpyDtoH (%d, %d, %d, %d), %p -> %p, %lld bytes\n",
                                      x, y, z, w,
                                      (void *)src, dst, (long long)size );
                        CUresult err = cuMemcpyDtoH(dst, src, size);
                        if (err != CUDA_SUCCESS) {
                            halide_error_varargs(user_context, "CUDA: cuMemcpyDtoH failed (%s)",
                                                 _get_error_name(err));
                            return err;
                        }
                    }
                }
            }
        }

        #ifdef DEBUG
        uint64_t t_after = halide_current_time_ns(user_context);
        halide_printf(user_context, "    Time: %f ms\n", (t_after - t_before) / 1.0e6);
        #endif
    }
    buf->dev_dirty = false;
    return 0;
}
Example #13
	void mem_copy_from(device_memory& mem, int y, int w, int h, int elem)
	{
		size_t offset = elem*y*w;
		size_t size = elem*w*h;

		cuda_push_context();
		cuda_assert(cuMemcpyDtoH((uchar*)mem.data_pointer + offset,
			(CUdeviceptr)((uchar*)mem.device_pointer + offset), size))
		cuda_pop_context();
	}
Example #14
        /// Copies data from device to host memory.
        void read(const command_queue &q, size_t offset, size_t size, T *host,
                bool blocking = false) const
        {
            (void)blocking;

            if (size) {
                q.context().set_current();
                cuda_check( cuMemcpyDtoH(host, raw() + offset * sizeof(T), size * sizeof(T)) );
            }
        }
Example #15
void swanMemcpyDtoH( void *p_d, void *p_h, size_t len ) {
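	/* the cuCtxSynchronize() call is commented out, so err starts as CUDA_SUCCESS and the first check below can never fire */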
	CUresult	err=CUDA_SUCCESS; //cuCtxSynchronize();
	if ( err != CUDA_SUCCESS ) {
		error("swanMemcpyDtoH sync failed\n" );
	}

	err = cuMemcpyDtoH(  p_h, PTR_TO_CUDEVPTR(p_d), len );
	if ( err != CUDA_SUCCESS ) {
		error("swanMemcpyDtoH failed\n" );
	}
}
Example #16
int main(int argc, char ** argv){
	int i;
        if( (argc>=2) && (atoi(argv[1])!=RANK)) error("rank %d mandatory",RANK);
        printf("CUDA RANK=%d\n",RANK);

	kernel.print();
// build busylist
        busylist = (uint32_t*)malloc_file(CNK*sizeof(uint32_t),FMODE_RO,BLIST_FORMAT,RANK);

// put busylist
        SafeCall(cuMemHostRegister(busylist,CNK*sizeof*busylist,CU_MEMHOSTREGISTER_DEVICEMAP));
        SafeCall(cuMemHostGetDevicePointer(&host_busylist,busylist,0));

        SafeCall(cuModuleGetGlobal(&dev_busylist,&bytes,kernel.module[0].module,"busylist"));
        if(bytes!=sizeof(host_busylist)) error("busylist!");
	SafeCall(cuMemcpyHtoD(dev_busylist,&host_busylist,bytes));
// put array
#ifdef IN_mk_data
        mkdir(DATADIR,0755); errno=0;
        array = (unsigned char *)malloc_file(abytes(RANK,CNK),1,DATADIR"%d",RANK);
#else
        array = (unsigned char *)malloc_file(abytes(RANK,CNK),0,DATADIR"%d",RANK);
#endif
        SafeCall(cuMemHostRegister(array,abytes(RANK,CNK),CU_MEMHOSTREGISTER_DEVICEMAP));
        SafeCall(cuMemHostGetDevicePointer(&host_array,array,0));

        SafeCall(cuModuleGetGlobal(&dev_array,&bytes,kernel.module[0].module,"array"));
        if(bytes!=sizeof(host_array)) error("array!");
	SafeCall(cuMemcpyHtoD(dev_array,&host_array,bytes));

#define THREADS 512
#define MAXG    65535
	uint64_t nado = (cnk[RANK] +(THREADS-1))/THREADS;
	uint32_t gridx = nado>MAXG?MAXG:nado;
	uint32_t gridy = (nado+(MAXG-1))/MAXG;
	printf("gridy=%d gridx=%d THREAD=%d\n",gridy, gridx, THREADS);

	kernel.launch(params,THREADS,gridx,gridy);
	kernel.wait();

	SafeCall(cuMemHostUnregister(busylist));
	SafeCall(cuMemHostUnregister(array));

        SafeCall(cuModuleGetGlobal(&dev_changed,&bytes,kernel.module[0].module,"changed"));
        if(bytes!=sizeof(changed)) error("changed!");
	SafeCall(cuMemcpyDtoH(changed,dev_changed,bytes));

	for(i=0;i<CACHESIZE;i++)
		total += changed[i];
	printf("changed=%ju\n",total);

	return 0;
}
Example #17
void GPUInterface::MemcpyDeviceToHost(void* dest,
                                      const GPUPtr src,
                                      size_t memSize) {
#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\t\t\tEntering GPUInterface::MemcpyDeviceToHost\n");
#endif

    SAFE_CUPP(cuMemcpyDtoH(dest, src, memSize));

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\t\t\tLeaving  GPUInterface::MemcpyDeviceToHost\n");
#endif

}
Example #18
WEAK void halide_copy_to_host(buffer_t* buf) {
    if (buf->dev_dirty) {
        halide_assert(buf->dev);
        halide_assert(buf->host);
        size_t size = buf_size(buf);
        #ifdef DEBUG
        char msg[256];
        snprintf(msg, 256, "copy_to_host (%zu bytes) %p -> %p", size, (void*)buf->dev, buf->host );
        halide_assert(halide_validate_dev_pointer(buf));
        #endif
        TIME_CALL( cuMemcpyDtoH(buf->host, buf->dev, size), msg );
    }
    buf->dev_dirty = false;
}
Example #19
sortStatus_t SORTAPI sortHost(sortEngine_t engine, uint* keys, uint* values,
	int numElements, int numBits) {

	MgpuSortData data;
	sortStatus_t status = data.Alloc(engine, numElements, values ? 1 : 0);
	if(SORT_STATUS_SUCCESS != status) return status;

	data.endBit = numBits;

	// Copy keys and values into device memory.
	CUresult result = cuMemcpyHtoD(data.keys[0], keys,
		sizeof(uint) * numElements);
	if(CUDA_SUCCESS != result) return SORT_STATUS_DEVICE_ERROR;

	if(values) {
		result = cuMemcpyHtoD(data.values1[0], values,
			sizeof(uint) * numElements);
		if(CUDA_SUCCESS != result) return SORT_STATUS_DEVICE_ERROR;
	}

	// Sort
	status = sortArray(engine, &data);
	if(SORT_STATUS_SUCCESS != status) return status;

	// Copy sorted keys and values into host memory.
	result = cuMemcpyDtoH(keys, data.keys[0], sizeof(uint) * numElements);
	if(CUDA_SUCCESS != result) return SORT_STATUS_DEVICE_ERROR;

	if(values) {
		result = cuMemcpyDtoH(values, data.values1[0],
			sizeof(uint) * numElements);
		if(CUDA_SUCCESS != result) return SORT_STATUS_DEVICE_ERROR;
	}

	// MgpuSortData wrapper will automatically clean up the device memory.
	return SORT_STATUS_SUCCESS;
}
Example #20
int main(){
	init_test();
	const std::string test_source =
	".version 4.2\n"
	".target sm_20\n"
	".address_size 64\n"
	".visible .entry _Z6kernelPfi(\n"
	".param .u64 _Z6kernelPfi_param_0,\n"
	".param .u32 _Z6kernelPfi_param_1){\n"
	".reg .pred 	%p<2>;\n"
	".reg .f32 	%f<3>;\n"
	".reg .s32 	%r<3>;\n"
	".reg .s64 	%rd<5>;\n"
	"ld.param.u64 	%rd1, [_Z6kernelPfi_param_0];\n"
	"ld.param.u32 	%r2, [_Z6kernelPfi_param_1];\n"
	"mov.u32 	%r1, %tid.x;\n"
	"setp.ge.u32	%p1, %r1, %r2;\n"
	"@%p1 bra 	BB0_2;\n"
	"cvta.to.global.u64 	%rd2, %rd1;\n"
	"cvt.rn.f32.u32	%f1, %r1;\n"
	"mul.f32 	%f2, %f1, 0f3FC00000;\n"
	"mul.wide.u32 	%rd3, %r1, 4;\n"
	"add.s64 	%rd4, %rd2, %rd3;\n"
	"st.global.f32 	[%rd4], %f2;\n"
	"BB0_2:\n"
	"ret;\n"
	"}";
	CUmodule modId = 0;
	CUfunction funcHandle = 0;
	cu_assert(cuModuleLoadData(&modId, test_source.c_str()));
	cu_assert(cuModuleGetFunction(&funcHandle, modId, "_Z6kernelPfi"));
	CUdeviceptr devArray;
	int size = 10;
	float hostArray[size];
	memset(hostArray, 0, size * sizeof(hostArray[0]));
	cu_assert(cuMemAlloc(&devArray, sizeof(float) * size));
	void * params[] = {&devArray, &size};
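	// launch twice as many threads as elements; the kernel's setp/bra guard masks the out-of-range half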
	auto result = cuLaunchKernel(funcHandle, 1,1,1, size*2,1,1, 0,0, params, nullptr);
	cu_assert(result);
	cu_assert(cuMemcpyDtoH(&hostArray, devArray, sizeof(hostArray[0])*size));
	cu_assert(cuMemFree(devArray));
	cu_assert(cuModuleUnload(modId));
	for (int i=0 ; i<size ; ++i)
		std::cout << hostArray[i] << '\n';
	return 0;
}
Example #21
void sararfftnd_one_real_to_complex(
  sararfftnd_plan plan, sarafft_real *h_data
) {
  CUdeviceptr d_data;
  size_t planSize = getPlanSize( plan );
//   printf( "planSize = %li!\n", planSize );
//   fflush ( stdout );
  cufftResult fftResult;
  CUresult cudaResult;
  if ( CUDA_SUCCESS != cuMemAlloc( &d_data, planSize ) ) {
    printf( "cuMemAlloc failed for plansize %li!\n", planSize );
    fflush ( stdout );
    exit( 85 );
  }
  if ( CUDA_SUCCESS != cuMemcpyHtoD( d_data, h_data, planSize ) ) {
    printf( "cuMemcpyHtoD failed!\n" );
    fflush ( stdout );
    exit( 86 );
  }
//   cudaError_t cudaError = cudaGetLastError();
//   if( cudaError != cudaSuccess ) {
//     printf( "CUDA Runtime API Error reported : %s\n", cudaGetErrorString(cudaError));
//     fflush ( stdout );
//     exit( 87 );
//   } else {
//     printf( "CUDA is in good shape.\n");
//     fflush ( stdout );
//   }
  fftResult = cufftExecR2C( plan, ( cufftReal* )d_data, ( cufftComplex* )d_data );
  if ( CUFFT_SUCCESS != fftResult ) {
    printf( "cufftExecR2C failed with code %d\n", fftResult );
    fflush ( stdout );
    exit( 87 );
  }
  if ( CUDA_SUCCESS != cuMemcpyDtoH( h_data, d_data, planSize ) ) {
    printf( "cuMemcpyDtoH failed!\n" );
    fflush ( stdout );
    exit( 88 );
  }
  if ( CUDA_SUCCESS != cuMemFree( d_data ) ) {
    printf( "cuMemFree failed!\n" );
    fflush ( stdout );
    exit( 89 );
  }
}
Example #22
int shmem_device_copy(int key, int size, unsigned int *matrix, int toDevice)
{
        int shmid;
        CUresult res;
        CUdeviceptr addr;
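        /* cuShmGet/cuShmAt/cuShmDt are not part of the stock CUDA driver API;
           they are System V-style shared-memory extensions (provided by Gdev,
           for example) */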

        if ((res = cuShmGet(&shmid, key, size, 0)) != CUDA_SUCCESS) {
                printf("cuShmGet failed: res = %u\n", res);

                return -1;
        }

        /* attach a local pointer to shared memory */
        if ((res = cuShmAt(&addr, shmid, 0)) != CUDA_SUCCESS) {
                printf("cuShmAt failed: res = %u\n", res);

                return -1;
        }

        /* copy current matrix from host memory to shared memory */
        if (toDevice) {
                if ((res = cuMemcpyHtoD(addr, matrix, size)) != CUDA_SUCCESS) {
                        printf("cuMemcpyHtoD failed: res = %u\n", res);

                        return -1;
                }
        }
        else {
                if ((res = cuMemcpyDtoH(matrix, addr, size)) != CUDA_SUCCESS) {
                        printf("cuMemcpyDtoH failed: res = %u\n", res);

                        return -1;
                }
        }

        /* detach local pointer */
        if ((res = cuShmDt(addr)) != CUDA_SUCCESS) {
                printf("cuShmDt failed: res = %u\n", res);

                return -1;
        }

        return 0;
}
Example #23
int main(int argc, char ** argv)
{
	int dev_count = 0;

	CUdevice   device;
	CUcontext  context;
	CUmodule   module;
	CUfunction function;

	cuInit(0);

	cuDeviceGetCount(&dev_count);

	if (dev_count < 1) return -1;

	cuDeviceGet( &device, 0 );
	cuCtxCreate( &context, 0, device );
	
	cuModuleLoad( &module, "hello.cuda_runtime.ptx" );
	cuModuleGetFunction( &function, module, "_Z6kernelPf" );

	int N = 512;
	CUdeviceptr pData;
	cuMemAlloc( &pData, N * sizeof(float) );
	cuFuncSetBlockShape( function, N, 1, 1 );
	/* pData is a CUdeviceptr, which is wider than an int on 64-bit builds,
	   so pass it with cuParamSetv and size the parameter list to match */
	cuParamSetv( function, 0, &pData, sizeof(pData) );
	cuParamSetSize( function, sizeof(pData) );

	cuLaunchGrid( function, 1, 1 );

	float * pHostData = new float[N];

	cuMemcpyDtoH( pHostData, pData, N * sizeof( float) );

	cuMemFree( pData );

	delete [] pHostData;

	return 0;
}
Example #24
uint transport(CUdeviceptr d_Input , uint loc, uint *res){

  CUdeviceptr d_Output;
  
  checkCudaErrors(cudaMalloc((void **)&d_Output, sizeof(int)));
  checkCudaErrors(cudaDeviceSynchronize());

  transport_gpu((uint *)d_Output, (uint *)d_Input, loc);
  //szWorkgroup = scanExclusiveLarge((uint *)d_Output, (uint *)d_Input, pnum, N);
  checkCudaErrors(cudaDeviceSynchronize());

  // pass or fail (cumulative... all tests in the loop)

  if(cuMemcpyDtoH(res,d_Output,sizeof(uint)) != CUDA_SUCCESS){
    printf("cuMemcpyDtoH(d_Output) error.\n");
    exit(1);
  }

  return SUCCESS;
}
Example #25
int main(){
	init_test();
	const std::string source = 
	".version 4.2\n"
	".target sm_20\n"
	".address_size 64\n"
	".visible .entry kernel_4(\n"
	".param .u32 kernel_4_param_0,\n"
	".param .u64 kernel_4_param_1\n"
	")\n"
	"{\n"
	".reg .s32 	%r<3>;\n"
	".reg .s64 	%rd<3>;\n"
	"ld.param.u32 	%r1, [kernel_4_param_0];\n"
	"ld.param.u64 	%rd1, [kernel_4_param_1];\n"
	"cvta.to.global.u64 	%rd2, %rd1;\n"
	"add.s32 	%r2, %r1, 7;\n"
	"st.global.u32 	[%rd2], %r2;\n"
	"ret;\n"
	"}";
	CUmodule modId = 0;
	CUfunction funcHandle = 0;
	cu_assert(cuModuleLoadData(&modId, source.c_str()));
	cu_assert(cuModuleGetFunction(&funcHandle, modId, "kernel_4"));
	CUdeviceptr devValue;
	int hostValue = 10;
	cu_assert(cuMemAlloc(&devValue, sizeof(int)));
	void * params[] = {&hostValue, &devValue};
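	// params[0] feeds the by-value .u32 kernel_4_param_0; params[1] supplies the .u64 device pointer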
	cu_assert(cuLaunchKernel(funcHandle, 1,1,1, 1,1,1, 0,0, params, nullptr));
	int result = 0;
	cu_assert(cuMemcpyDtoH(&result, devValue, sizeof(result)));
	assert(result == hostValue + 7);
	std::cout << result << "\n";
	cu_assert(cuMemFree(devValue));
	cu_assert(cuModuleUnload(modId));
	return 0;
}
Example #26
  void memory_t<CUDA>::copyTo(void *dest,
                              const uintptr_t bytes,
                              const uintptr_t offset){
    const uintptr_t bytes_ = (bytes == 0) ? size : bytes;

    OCCA_CHECK((bytes_ + offset) <= size);

    if(!isTexture)
      OCCA_CUDA_CHECK("Memory: Copy To",
                      cuMemcpyDtoH(dest, *((CUdeviceptr*) handle) + offset, bytes_) );
    else{
      if(textureInfo.dim == 1)
        OCCA_CUDA_CHECK("Texture Memory: Copy To",
                        cuMemcpyAtoH(dest, ((CUDATextureData_t*) handle)->array, offset, bytes_) );
      else{
        CUDA_MEMCPY2D info;

        info.srcXInBytes   = offset;
        info.srcY          = 0;
        info.srcMemoryType = CU_MEMORYTYPE_ARRAY;
        info.srcArray      = ((CUDATextureData_t*) handle)->array;

        info.dstXInBytes   = 0;
        info.dstY          = 0;
        info.dstMemoryType = CU_MEMORYTYPE_HOST;
        info.dstHost       = dest;
        info.dstPitch      = 0;

        info.WidthInBytes = textureInfo.w * textureInfo.bytesInEntry;
        info.Height       = (bytes_ / info.WidthInBytes);

        cuMemcpy2D(&info);
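        // note: unlike the copies above, this cuMemcpy2D result is not routed through OCCA_CUDA_CHECK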

        dev->finish();
      }
    }
  }
Example #27
void* InteropResource::mapToHost(const VideoFormat &format, void *handle, int picIndex, const CUVIDPROCPARAMS &param, int width, int height, int coded_height)
{
    AutoCtxLock locker((cuda_api*)this, lock);
    Q_UNUSED(locker);
    CUdeviceptr devptr;
    unsigned int pitch;

    CUDA_ENSURE(cuvidMapVideoFrame(dec, picIndex, &devptr, &pitch, const_cast<CUVIDPROCPARAMS*>(&param)), NULL);
    CUVIDAutoUnmapper unmapper(this, dec, devptr);
    Q_UNUSED(unmapper);
    uchar* host_data = NULL;
    const size_t host_size = pitch*coded_height*3/2;
    CUDA_ENSURE(cuMemAllocHost((void**)&host_data, host_size), NULL);
    // copy to the memory not allocated by cuda is possible but much slower
    CUDA_ENSURE(cuMemcpyDtoH(host_data, devptr, host_size), NULL);

    VideoFrame frame(width, height, VideoFormat::Format_NV12);
    uchar *planes[] = {
        host_data,
        host_data + pitch * coded_height
    };
    frame.setBits(planes);
    int pitches[] = { (int)pitch, (int)pitch };
    frame.setBytesPerLine(pitches);

    VideoFrame *f = reinterpret_cast<VideoFrame*>(handle);
    frame.setTimestamp(f->timestamp());
    frame.setDisplayAspectRatio(f->displayAspectRatio());
    if (format == frame.format())
        *f = frame.clone();
    else
        *f = frame.to(format);

    cuMemFreeHost(host_data);
    return f;
}
Example #28
/**
 * PPU program entry point.
 */
int main(int argc, char** argv)
{
    /* Get global memory pointer */
    fixedgrid_t* const G = &G_GLOBAL;
    
    /* Iterators */
    uint32_t k, iter;
    
    /* Start wall clock timer */
    timer_start(&G->metrics.wallclock);

    /* Check dimensions */
    if(NX % BLOCK_X != 0)
    {
        fprintf(stderr, "NX must be a multiple of %d\n", BLOCK_X);
        exit(1);
    }    
    if(NY % BLOCK_Y != 0)
    {
        fprintf(stderr, "NY must be a multiple of %d\n", BLOCK_Y);
        exit(1);
    }
    if(NZ % BLOCK_Z != 0)
    {
        fprintf(stderr, "NZ must be a multiple of %d\n", BLOCK_Z);
        exit(1);
    }
    
    /* Initialize the model parameters */
    init_model(G);
    
    /* Add emissions */
    process_emissions(G);
    
    /* Print startup banner */
    print_start_banner(G);
    
    /* Store initial concentration */
    printf("Writing initial concentration data... ");
    write_conc(G, 0, 0);
    printf("done.\n");    
        
    printf("\n!!!!FIXME: Report # FPEs\n");
        
    /* BEGIN CALCULATIONS */
    for(iter=1, G->time = G->tstart; G->time < G->tend; G->time += G->dt, ++iter)
    {
        start_saprc99(G);
        
        for(k=0; k<NLOOKAT; k++)
        {
            // Copy concentration data to device
            CU_SAFE_CALL(cuMemcpyHtoD(G->dev_conc, &G->conc(0, 0, 0, MONITOR[k]), NX*NY*NZ*sizeof(real_t)));
            
            discretize_all_x(G, G->dt*0.5);
            
            discretize_all_y(G, G->dt*0.5);
            
            discretize_all_z(G, G->dt);
            
            discretize_all_y(G, G->dt*0.5);
            
            discretize_all_x(G, G->dt*0.5);
            
            // Copy updated concentrations back to host
            CU_SAFE_CALL(cuMemcpyDtoH((void*)&G->conc(0, 0, 0, MONITOR[k]), G->dev_conc_out, NX*NY*NZ*sizeof(real_t)));            
        }

        update_model(G);
        
        #if WRITE_EACH_ITER == 1
        write_conc(G, iter, 0);
        #endif

        printf("  After iteration %02d: Model time = %07.2f sec.\n", iter, iter*G->dt);
    }
    /* END CALCULATIONS */
    
    /* Store concentration */
    #if WRITE_EACH_ITER != 1
    write_conc(G, iter-1, 0);
    #endif
    
    /* Show final time */
    printf("\nFinal time: %f seconds.\n", (iter-1)*G->dt);
    
    timer_stop(&G->metrics.wallclock);
    
    /* Write metrics to CSV file */
    write_metrics_as_csv(G, "NVidia CUDA");
    
    /* Cleanup and exit */

    CU_SAFE_CALL(cuMemFree(G->dev_conc));
    CU_SAFE_CALL(cuMemFree(G->dev_wind));
    CU_SAFE_CALL(cuMemFree(G->dev_diff));
    CU_SAFE_CALL(cuMemFree(G->dev_buff));
    CU_SAFE_CALL(cuMemFree(G->dev_conc_out));
    CU_SAFE_CALL_NO_SYNC(cuCtxDetach(cu_context_global));
    
    return 0;
}
Example #29
static CUT_THREADPROC dt_thread_func(void *p)
{
	dt_partition *pt = (dt_partition *)p;
	struct timeval tv;
	CUresult res;
	int thread_num_x=0, thread_num_y=0;
	int block_num_x=0, block_num_y=0;

	res = cuCtxSetCurrent(ctx[pt->pid]);
	if(res != CUDA_SUCCESS) {
		printf("cuCtxSetCurrent(ctx[%d]) failed: res = %s\n", pt->pid, cuda_response_to_string(res));
		exit(1);
	}

	/* allocate GPU memory */

	//printf("part_error_array_num = %d\n",part_error_array_num);


	if(pt->pid == 0){
		gettimeofday(&tv_memcpy_start, NULL);
	}

	res = cuMemcpyHtoD(part_C_dev[pt->pid], dst_C, SUM_SIZE_C);
	if(res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoD(part_C_dev) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemcpyHtoD(part_error_array_dev[pt->pid], part_error_array, part_error_array_num*sizeof(int));
	if(res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoD(part_error_array_dev) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemcpyHtoD(pm_size_array_dev[pt->pid], &pt->size_array[0][0], pt->NoP*2*pt->L_MAX*sizeof(int));
	if(res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoD(pm_size_array_dev) falied: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemcpyHtoD(def_array_dev[pt->pid], pt->def, sum_size_def_array);
	if(res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoD(def_array_dev) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemcpyHtoD(numpart_dev[pt->pid], pt->numpart, pt->NoC*sizeof(int));
	if(res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoD(cuMemcpyHtoD(numpart_dev) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemcpyHtoD(PIDX_array_dev[pt->pid], pt->dst_PIDX, pt->tmp_array_size);
	if(res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoD(PIDX_array) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemcpyHtoD(DID_4_array_dev[pt->pid], pt->dst_DID_4, pt->tmp_array_size);
	if(res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoD(DID_4__array) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}


	if(pt->pid == 0){
		gettimeofday(&tv_memcpy_end, NULL);
		tvsub(&tv_memcpy_end, &tv_memcpy_start, &tv);
		time_memcpy += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;
	}

	int sharedMemBytes = 0;

	/* get max thread num per block */
	int max_threads_num = 0;
	res = cuDeviceGetAttribute(&max_threads_num, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev[pt->pid]);
	if(res != CUDA_SUCCESS){
		printf("\ncuDeviceGetAttribute() failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	/* prepare for launch inverse_Q */
	void* kernel_args_inverse[] = {
		&part_C_dev[pt->pid],
		&pm_size_array_dev[pt->pid],
		&part_error_array_dev[pt->pid],
		&part_error_array_num,
		(void*)&(pt->NoP),
		&PIDX_array_dev[pt->pid],
		&numpart_dev[pt->pid],
		(void*)&(pt->NoC),
		(void*)&(pt->max_numpart),
		(void*)&(pt->interval),
		(void*)&(pt->L_MAX),
		(void*)&(pt->pid),
		(void*)&(device_num)
	};

	/* define CUDA block shape */
	int upper_limit_th_num_x = max_threads_num/(pt->max_numpart*pt->NoC);
	int upper_limit_th_num_y = max_threads_num/upper_limit_th_num_x;
	if(upper_limit_th_num_x < 1) upper_limit_th_num_x++;
	if(upper_limit_th_num_y < 1) upper_limit_th_num_y++;

	thread_num_x = (pt->max_dim0*pt->max_dim1 < upper_limit_th_num_x) ? (pt->max_dim0*pt->max_dim1) : upper_limit_th_num_x;
	thread_num_y = (pt->max_numpart < upper_limit_th_num_y) ? pt->max_numpart : upper_limit_th_num_y;

	block_num_x = (pt->max_dim0*pt->max_dim1) / thread_num_x;
	block_num_y = (pt->max_numpart) / thread_num_y;
	if((pt->max_dim0*pt->max_dim1) % thread_num_x != 0) block_num_x++;
	if(pt->max_numpart % thread_num_y != 0) block_num_y++;


	int blockDimY = thread_num_y / device_num;
	if(thread_num_y%device_num != 0){
		blockDimY++;
	}

	/* launch inverse_Q */
	if(pt->pid == 0){
		gettimeofday(&tv_kernel_start, NULL);
	}
	res = cuLaunchKernel(
		func_inverse_Q[pt->pid],      // call function
		block_num_x,         // gridDimX
		block_num_y,         // gridDimY
		pt->L_MAX-pt->interval,      // gridDimZ
		thread_num_x,        // blockDimX
		blockDimY,        // blockDimY
		pt->NoC,                 // blockDimZ
		sharedMemBytes,      // sharedMemBytes
		NULL,                // hStream
		kernel_args_inverse, // kernelParams
		NULL                 // extra
		);
	if(res != CUDA_SUCCESS) {
		printf("block_num_x %d, block_num_y %d, thread_num_x %d, thread_num_y %d\n", block_num_x, block_num_y, thread_num_x, thread_num_y);
		printf("cuLaunchKernel(inverse_Q) failed : res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuCtxSynchronize();
	if(res != CUDA_SUCCESS) {
		printf("cuCtxSynchronize(inverse_Q) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	if(pt->pid == 0){
		gettimeofday(&tv_kernel_end, NULL);
		tvsub(&tv_kernel_end, &tv_kernel_start, &tv);
		time_kernel += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;
	}


	/* prepare for launch dt1d_x */
	void* kernel_args_x[] = {
		&part_C_dev[pt->pid],                  // FLOAT *src_start
		&tmpM_dev[pt->pid],                    // FLOAT *dst
		&tmpIy_dev[pt->pid],                   // int *ptr
		&DID_4_array_dev[pt->pid],             // int *DID_4_array,
		&def_array_dev[pt->pid],               // FLOAT *def_array,
		&pm_size_array_dev[pt->pid],           // int *size_array
		(void*)&(pt->NoP),                  // int NoP
		&PIDX_array_dev[pt->pid],              // int *PIDX_array
		&part_error_array_dev[pt->pid],        // int *error_array
		(void*)&(part_error_array_num), // int error_array_num
		&numpart_dev[pt->pid],                 // int *numpart
		(void*)&(pt->NoC),                  // int NoC
		(void*)&(pt->max_numpart),          // int max_numpart
		(void*)&(pt->interval),             // int interval
		(void*)&(pt->L_MAX),                 // int L_MAX
		(void*)&(pt->pid),                   // int pid
		(void*)&(device_num)                 // int device_num
	};


	max_threads_num = 64/pt->NoC;
	if(max_threads_num < 1) max_threads_num++;

	thread_num_x = (pt->max_dim1 < max_threads_num) ? pt->max_dim1 : max_threads_num;
	thread_num_y = (pt->max_numpart < max_threads_num) ? pt->max_numpart : max_threads_num;

	block_num_x = pt->max_dim1 / thread_num_x;
	block_num_y = pt->max_numpart / thread_num_y;
	if(pt->max_dim1 % thread_num_x != 0) block_num_x++;
	if(pt->max_numpart % thread_num_y != 0) block_num_y++;

	blockDimY = thread_num_y / device_num;
	if(thread_num_y%device_num != 0){
		blockDimY++;
	}

	/* launch dt1d_x */
	if(pt->pid == 0){
		gettimeofday(&tv_kernel_start, NULL);
	}

	res = cuLaunchKernel(
		func_dt1d_x[pt->pid],    // call function
		block_num_x,    // gridDimX
		block_num_y,    // gridDimY
		pt->L_MAX-pt->interval, // gridDimZ
		thread_num_x,   // blockDimX
		blockDimY,   // blockDimY
		pt->NoC,            // blockDimZ
		sharedMemBytes, // sharedMemBytes
		NULL,           // hStream
		kernel_args_x,  // kernelParams
		NULL            // extra
		);
	if(res != CUDA_SUCCESS) {

		printf("block_num_x %d, block_num_y %d, thread_num_x %d, thread_num_y %d\n", block_num_x, block_num_y, thread_num_x, thread_num_y);

		printf("cuLaunchKernel(dt1d_x) failed : res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuCtxSynchronize();
	if(res != CUDA_SUCCESS) {
		printf("cuCtxSynchronize(dt1d_x) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	if(pt->pid == 0){
		gettimeofday(&tv_kernel_end, NULL);
		tvsub(&tv_kernel_end, &tv_kernel_start, &tv);
		time_kernel += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;
	}


	/* prepare for launch dt1d_y */
	void* kernel_args_y[] = {
		&tmpM_dev[pt->pid],                    // FLOAT *src_start
		&M_dev[pt->pid],                       // FLOAT *dst_start
		&tmpIx_dev[pt->pid],                   // int *ptr_start
		&DID_4_array_dev[pt->pid],             // int *DID_4_array,
		&def_array_dev[pt->pid],               // FLOAT *def_array,
		(void*)&(pt->NoP),                  // int NoP
		&pm_size_array_dev[pt->pid],           // int *size_array
		&numpart_dev[pt->pid],                 // int *numpart,
		&PIDX_array_dev[pt->pid],              // int *PIDX_array,
		(void*)&(pt->NoC),                  // int NoC
		(void*)&(pt->max_numpart),          // int max_numpart
		(void*)&(pt->interval),             // int interval
		(void*)&(pt->L_MAX),                // int L_MAX
		&part_error_array_dev[pt->pid],        // int *error_array
		(void*)&(part_error_array_num), // int error_array_num
		(void*)&(pt->pid),                   // int pid
		(void*)&(device_num)                 // int device_num
	};


	thread_num_x = (pt->max_dim0 < max_threads_num) ? pt->max_dim0 : max_threads_num;
	thread_num_y = (pt->max_numpart < max_threads_num) ? pt->max_numpart : max_threads_num;

	block_num_x = pt->max_dim0 / thread_num_x;
	block_num_y = pt->max_numpart / thread_num_y;
	if(pt->max_dim0 % thread_num_x != 0) block_num_x++;
	if(pt->max_numpart % thread_num_y != 0) block_num_y++;

	blockDimY = thread_num_y / device_num;
	if(thread_num_y%device_num != 0){
		blockDimY++;
	}

	/* prepare for launch dt1d_y */
	if(pt->pid == 0){
		gettimeofday(&tv_kernel_start, NULL);
	}

	res = cuLaunchKernel(
		func_dt1d_y[pt->pid],    // call functions
		block_num_x,    // gridDimX
		block_num_y,    // gridDimY
		pt->L_MAX-pt->interval, // gridDimZ
		thread_num_x,   // blockDimX
		blockDimY,   // blockDimY
		pt->NoC,            // blockDimZ
		sharedMemBytes, // sharedMemBytes
		NULL,           // hStream
		kernel_args_y,  // kernelParams
		NULL            // extra
		);
	if(res != CUDA_SUCCESS) {
		printf("cuLaunchKernel(dt1d_y failed : res = %s\n", cuda_response_to_string(res));
		exit(1);
	}


	res = cuCtxSynchronize();
	if(res != CUDA_SUCCESS) {
		printf("cuCtxSynchronize(dt1d_y) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	if(pt->pid == 0){
		gettimeofday(&tv_kernel_end, NULL);
		tvsub(&tv_kernel_end, &tv_kernel_start, &tv);
		time_kernel += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;
	}



	/* download data from GPU */

	/* download M from GPU */

	int sum_part_size = 0;
	int sum_pointer_size = 0;
	int sum_move_size = 0;
	int part_size = 0;
	int pointer_size = 0;
	int part_y = 0;
	int move_size = 0;
	int start_kk = 0;
	int end_kk = 0;
	int part_end_kk = 0;
	unsigned long long int pointer_dst_M = (unsigned long long int)pt->dst_M;
	unsigned long long int pointer_M_dev = (unsigned long long int)M_dev[pt->pid];

	for(int L=0; L<(pt->L_MAX-pt->interval); L++) {

		/**************************************************************************/
		/* loop condition */
		if( (pt->FSIZE[(L+pt->interval)*2]+2*pt->pady < pt->max_Y) || (pt->FSIZE[(L+pt->interval)*2+1]+2*pt->padx < pt->max_X) )
		{
			continue;
		}
		/* loop condition */
		/**************************************************************************/


		for(int jj=0; jj<pt->NoC; jj++) {

			part_y = pt->numpart[jj] / device_num;
			if(pt->numpart[jj]%device_num != 0){
				part_y++;
			}


			start_kk = part_y * pt->pid;
			end_kk = part_y * (pt->pid + 1);

			if(end_kk > pt->numpart[jj]){
				end_kk = pt->numpart[jj];
			}

			if(pt->pid > 0){
				part_end_kk = part_y * pt->pid;
			}

			for(int kk=0; kk<pt->numpart[jj]; kk++) {

				int PIDX = pt->PIDX_array[L][jj][kk];
				int dims0 = pt->size_array[L][PIDX*2];
				int dims1 = pt->size_array[L][PIDX*2+1];
				if(start_kk <= kk && kk < end_kk){
					part_size += dims0 * dims1;
				}
				//if(pt->pid > 0 && part_start_kk <= kk && kk < part_end_kk){
				if(pt->pid > 0 && 0 <= kk && kk < part_end_kk){
					pointer_size += dims0 * dims1;
				}
				move_size += dims0 * dims1;
			}

			sum_part_size += part_size;
			sum_pointer_size += pointer_size;
			sum_move_size += move_size;

			// error pt->pid == 2 && L == 24 && jj == 1

			if(pt->pid*part_y < pt->numpart[jj]){

				if(pt->pid == 0){
					gettimeofday(&tv_memcpy_start, NULL);
				}


				res = cuMemcpyDtoH((void *)(pointer_dst_M+(unsigned long long int)(pointer_size*sizeof(FLOAT))), (CUdeviceptr)(pointer_M_dev+(unsigned long long int)(pointer_size*sizeof(FLOAT))), part_size*sizeof(FLOAT));
				if(res != CUDA_SUCCESS) {
					printf("error pid = %d\n",pt->pid);
					printf("cuMemcpyDtoH(dst_M) failed: res = %s\n", cuda_response_to_string(res));
					exit(1);
				}

				if(pt->pid == 0){
					gettimeofday(&tv_memcpy_end, NULL);
					tvsub(&tv_memcpy_end, &tv_memcpy_start, &tv);
					time_memcpy += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;
				}

			}

			pointer_dst_M += (unsigned long long int)(move_size * sizeof(FLOAT));
			pointer_M_dev += (unsigned long long int)(move_size * sizeof(FLOAT));

			part_size = 0;
			pointer_size = 0;
			move_size = 0;

		}

	}


	/* download tmpIx from GPU */

	sum_part_size = 0;
	sum_pointer_size = 0;
	part_size = 0;
	pointer_size = 0;
	part_y = 0;
	move_size = 0;
	start_kk = 0;
	end_kk = 0;
	part_end_kk = 0;
	unsigned long long int pointer_dst_tmpIx = (unsigned long long int)pt->dst_tmpIx;
	unsigned long long int pointer_tmpIx_dev = (unsigned long long int)tmpIx_dev[pt->pid];


	for(int L=0; L<(pt->L_MAX-pt->interval); L++) {

		/**************************************************************************/
		/* loop condition */
		if( (pt->FSIZE[(L+pt->interval)*2]+2*pt->pady < pt->max_Y) || (pt->FSIZE[(L+pt->interval)*2+1]+2*pt->padx < pt->max_X) )
		{
			continue;
		}
		/* loop condition */
		/**************************************************************************/


		for(int jj=0; jj<pt->NoC; jj++) {

			part_y = pt->numpart[jj] / device_num;
			if(pt->numpart[jj]%device_num != 0){
				part_y++;
			}

			start_kk = part_y * pt->pid;
			end_kk = part_y * (pt->pid + 1);

			if(end_kk > pt->numpart[jj]){
				end_kk = pt->numpart[jj];
			}

			if(pt->pid > 0){
				part_end_kk = part_y * pt->pid;
			}

			for(int kk=0; kk<pt->numpart[jj]; kk++) {

				int PIDX = pt->PIDX_array[L][jj][kk];
				int dims0 = pt->size_array[L][PIDX*2];
				int dims1 = pt->size_array[L][PIDX*2+1];
				if(start_kk <= kk && kk < end_kk){
					part_size += dims0 * dims1;
				}
				if(pt->pid > 0){
					if(0 <= kk && kk < part_end_kk){
						pointer_size += dims0 * dims1;
					}
				}
				move_size += dims0 * dims1;
			}

			sum_part_size += part_size;
			sum_pointer_size += pointer_size;


			if(pt->pid*part_y < pt->numpart[jj]){

				if(pt->pid == 0){
					gettimeofday(&tv_memcpy_start, NULL);
				}


				res = cuMemcpyDtoH((void *)(pointer_dst_tmpIx+(unsigned long long int)(pointer_size*sizeof(int))), (CUdeviceptr)(pointer_tmpIx_dev+(unsigned long long int)(pointer_size*sizeof(int))), part_size*sizeof(int));
				if(res != CUDA_SUCCESS) {
					printf("error pid = %d\n",pt->pid);
					printf("cuMemcpyDtoH(tmpIx) failed: res = %s\n", cuda_response_to_string(res));
					exit(1);
				}

				if(pt->pid == 0){
					gettimeofday(&tv_memcpy_end, NULL);
					tvsub(&tv_memcpy_end, &tv_memcpy_start, &tv);
					time_memcpy += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;
				}

			}

			pointer_dst_tmpIx += (unsigned long long int)(move_size * sizeof(int));
			pointer_tmpIx_dev += (unsigned long long int)(move_size * sizeof(int));

			part_size = 0;
			pointer_size = 0;
			move_size = 0;

		}

	}


	/* download tmpIy from GPU */

	sum_part_size = 0;
	sum_pointer_size = 0;
	part_size = 0;
	pointer_size = 0;
	part_y = 0;
	move_size = 0;
	start_kk = 0;
	end_kk = 0;
	part_end_kk = 0;
	unsigned long long int pointer_dst_tmpIy = (unsigned long long int)pt->dst_tmpIy;
	unsigned long long int pointer_tmpIy_dev = (unsigned long long int)tmpIy_dev[pt->pid];


	for(int L=0; L<(pt->L_MAX-pt->interval); L++) {

		/**************************************************************************/
		/* loop condition */
		if( (pt->FSIZE[(L+pt->interval)*2]+2*pt->pady < pt->max_Y) || (pt->FSIZE[(L+pt->interval)*2+1]+2*pt->padx < pt->max_X) )
		{
			continue;
		}
		/* loop condition */
		/**************************************************************************/


		for(int jj=0; jj<pt->NoC; jj++) {

			part_y = pt->numpart[jj] / device_num;
			if(pt->numpart[jj]%device_num != 0){
				part_y++;
			}

			start_kk = part_y * pt->pid;
			end_kk = part_y * (pt->pid + 1);

			if(end_kk > pt->numpart[jj]){
				end_kk = pt->numpart[jj];
			}

			if(pt->pid > 0){
				part_end_kk = part_y * pt->pid;
			}

			for(int kk=0; kk<pt->numpart[jj]; kk++) {

				int PIDX = pt->PIDX_array[L][jj][kk];
				int dims0 = pt->size_array[L][PIDX*2];
				int dims1 = pt->size_array[L][PIDX*2+1];
				if(start_kk <= kk && kk < end_kk){
					part_size += dims0 * dims1;
				}
				if(pt->pid > 0){
					if(0 <= kk && kk < part_end_kk){
						pointer_size += dims0 * dims1;
					}
				}
				move_size += dims0 * dims1;
			}

			sum_part_size += part_size;
			sum_pointer_size += pointer_size;

			if(pt->pid*part_y < pt->numpart[jj]){

				if(pt->pid == 0){
					gettimeofday(&tv_memcpy_start, NULL);
				}

				res = cuMemcpyDtoH((void *)(pointer_dst_tmpIy+(unsigned long long int)(pointer_size*sizeof(int))), (CUdeviceptr)(pointer_tmpIy_dev+(unsigned long long int)(pointer_size*sizeof(int))), part_size*sizeof(int));
				if(res != CUDA_SUCCESS) {
					printf("error pid = %d\n",pt->pid);
					printf("cuMemcpyDtoH(tmpIy) failed: res = %s\n", cuda_response_to_string(res));
					exit(1);
				}

				if(pt->pid == 0){
					gettimeofday(&tv_memcpy_end, NULL);
					tvsub(&tv_memcpy_end, &tv_memcpy_start, &tv);
					time_memcpy += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;
				}

			}

			pointer_dst_tmpIy += (unsigned long long int)(move_size * sizeof(int));
			pointer_tmpIy_dev += (unsigned long long int)(move_size * sizeof(int));

			part_size = 0;
			pointer_size = 0;
			move_size = 0;

		}

	}


	/* end of thread */
	CUT_THREADEND;
}
Example #30
static void calc_a_score_GPU(FLOAT *ac_score,  FLOAT **score,
			     int *ssize_start,  Model_info *MI,
			     FLOAT scale, int *size_score_array,
			     int NoC)
{
	CUresult res;

	const int IHEI = MI->IM_HEIGHT;
	const int IWID = MI->IM_WIDTH;
	int pady_n = MI->pady;
	int padx_n = MI->padx;
	int block_pad = (int)(scale/2.0);

	struct timeval tv;

	int *RY_array, *RX_array;
	res = cuMemHostAlloc((void**)&RY_array, NoC*sizeof(int), CU_MEMHOSTALLOC_DEVICEMAP);
	if(res != CUDA_SUCCESS) {
		printf("cuMemHostAlloc(RY_array) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemHostAlloc((void**)&RX_array, NoC*sizeof(int), CU_MEMHOSTALLOC_DEVICEMAP);
	if(res != CUDA_SUCCESS) {
		printf("cuMemHostAlloc(RX_array) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	for(int i = 0; i < NoC; i++) {
		int rsize[2] = {MI->rsize[i*2], MI->rsize[i*2+1]};

		RY_array[i] = (int)((FLOAT)rsize[0]*scale/2.0-1.0+block_pad);
		RX_array[i] = (int)((FLOAT)rsize[1]*scale/2.0-1.0+block_pad);
	}

	CUdeviceptr ac_score_dev, score_dev;
	CUdeviceptr ssize_dev, size_score_dev;
	CUdeviceptr RY_dev, RX_dev;

	int size_score=0;
	for(int i = 0; i < NoC; i++) {
		size_score += size_score_array[i];
	}

	/* allocate GPU memory */
	res = cuMemAlloc(&ac_score_dev, gpu_size_A_SCORE);
	if(res != CUDA_SUCCESS) {
		printf("cuMemAlloc(ac_score) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemAlloc(&score_dev, size_score);
	if(res != CUDA_SUCCESS) {
		printf("cuMemAlloc(score) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemAlloc(&ssize_dev, NoC*sizeof(int));
	if(res != CUDA_SUCCESS) {
		printf("cuMemAlloc(ssize) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemAlloc(&size_score_dev, NoC*sizeof(int));
	if(res != CUDA_SUCCESS) {
		printf("cuMemAlloc(size_score) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemAlloc(&RY_dev, NoC*sizeof(int));
	if(res != CUDA_SUCCESS) {
		printf("cuMemAlloc(RY) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemAlloc(&RX_dev, NoC*sizeof(int));
	if(res != CUDA_SUCCESS) {
		printf("cuMemAlloc(RX) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	gettimeofday(&tv_memcpy_start, nullptr);
	/* upload data to GPU */
	res = cuMemcpyHtoD(ac_score_dev, &ac_score[0], gpu_size_A_SCORE);
	if(res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoD(ac_score) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemcpyHtoD(score_dev, &score[0][0], size_score);
	if(res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoD(score) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemcpyHtoD(ssize_dev, &ssize_start[0], NoC*sizeof(int));
	if(res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoD(ssize) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemcpyHtoD(size_score_dev, &size_score_array[0], NoC*sizeof(int));
	if(res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoD(size_score) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemcpyHtoD(RY_dev, &RY_array[0], NoC*sizeof(int));
	if(res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoD(RY) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemcpyHtoD(RX_dev, &RX_array[0], NoC*sizeof(int));
	if(res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoD(RX) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	gettimeofday(&tv_memcpy_end, nullptr);
	tvsub(&tv_memcpy_end, &tv_memcpy_start, &tv);
	time_memcpy += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;

	void* kernel_args[] = {
		(void*)&IWID,
		(void*)&IHEI,
		(void*)&scale,
		(void*)&padx_n,
		(void*)&pady_n,
		&RX_dev,
		&RY_dev,
		&ac_score_dev,
		&score_dev,
		&ssize_dev,
		(void*)&NoC,
		&size_score_dev
	};

	int sharedMemBytes = 0;

	/* define CUDA block shape */
	int max_threads_num = 0;
	int thread_num_x, thread_num_y;
	int block_num_x, block_num_y;

	res = cuDeviceGetAttribute(&max_threads_num, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev[0]);
	if(res != CUDA_SUCCESS){
		printf("\ncuDeviceGetAttribute() failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	NR_MAXTHREADS_X[0] = (int)sqrt((double)max_threads_num/NoC);
	NR_MAXTHREADS_Y[0] = (int)sqrt((double)max_threads_num/NoC);

	thread_num_x = (IWID < NR_MAXTHREADS_X[0]) ? IWID : NR_MAXTHREADS_X[0];
	thread_num_y = (IHEI < NR_MAXTHREADS_Y[0]) ? IHEI : NR_MAXTHREADS_Y[0];

	block_num_x = IWID / thread_num_x;
	block_num_y = IHEI / thread_num_y;
	if(IWID % thread_num_x != 0) block_num_x++;
	if(IHEI % thread_num_y != 0) block_num_y++;

	gettimeofday(&tv_kernel_start, nullptr);
	/* launch GPU kernel */
	res = cuLaunchKernel(
		func_calc_a_score[0], // call function
		block_num_x,       // gridDimX
		block_num_y,       // gridDimY
		1,                 // gridDimZ
		thread_num_x,      // blockDimX
		thread_num_y,      // blockDimY
		NoC,               // blockDimZ
		sharedMemBytes,    // sharedMemBytes
		nullptr,              // hStream
		kernel_args,       // kernelParams
		nullptr               // extra
		);
	if(res != CUDA_SUCCESS) {
		printf("cuLaunchKernel(calc_a_score) failed : res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuCtxSynchronize();
	if(res != CUDA_SUCCESS) {
		printf("cuCtxSynchronize(calc_a_score) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}
	gettimeofday(&tv_kernel_end, nullptr);
	tvsub(&tv_kernel_end, &tv_kernel_start, &tv);
	time_kernel += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;

	gettimeofday(&tv_memcpy_start, nullptr);
	/* download data from GPU */
	res = cuMemcpyDtoH(ac_score, ac_score_dev, gpu_size_A_SCORE);
	if(res != CUDA_SUCCESS) {
		printf("cuMemcpyDtoH(ac_score) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	gettimeofday(&tv_memcpy_end, nullptr);
	tvsub(&tv_memcpy_end, &tv_memcpy_start, &tv);
	time_memcpy += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;

	/* free GPU memory */
	res = cuMemFree(ac_score_dev);
	if(res != CUDA_SUCCESS) {
		printf("cuMemFree(ac_score_dev) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemFree(score_dev);
	if(res != CUDA_SUCCESS) {
		printf("cuMemFree(score_dev) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemFree(ssize_dev);
	if(res != CUDA_SUCCESS) {
		printf("cuMemFree(ssize_dev) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemFree(size_score_dev);
	if(res != CUDA_SUCCESS) {
		printf("cuMemFree(size_score_dev) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemFree(RY_dev);
	if(res != CUDA_SUCCESS) {
		printf("cuMemFree(RY_dev) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemFree(RX_dev);
	if(res != CUDA_SUCCESS) {
		printf("cuMemFree(RX_dev) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	/* free CPU memory */
	res = cuMemFreeHost(RY_array);
	if(res != CUDA_SUCCESS) {
		printf("cuMemFreeHost(RY_array) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemFreeHost(RX_array);
	if(res != CUDA_SUCCESS) {
		printf("cuMemFreeHost(RX_array) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}
}