Пример #1
0
//pteracuda_nifs:gemv(Ctx, _m, _n, _alpha, A, X, _betha, Y),
// NIF: gemv(Ctx, Transpose, M, N, Alpha, A, X, Beta, Y).
// Runs Y <- alpha * op(A) * X + beta * Y on the context's GPU.
// Returns ok, a badarg on malformed arguments, or {error, Reason}
// when A's stored shape disagrees with the m,n parameters.
ERL_NIF_TERM pteracuda_nifs_gemv(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
    PCudaContextRef *ctxRef;
    PCudaBufferRef *refA, *refX, *refY;
    unsigned long transpose;
    unsigned long m, n;
    double alpha, beta;

    const int parsed = (argc == 9)
        && enif_get_resource(env, argv[0], pteracuda_context_resource, (void **) &ctxRef)
        && enif_get_ulong(env, argv[1], &transpose)
        && enif_get_ulong(env, argv[2], &m)
        && enif_get_ulong(env, argv[3], &n)
        && enif_get_double(env, argv[4], &alpha)
        && enif_get_resource(env, argv[5], pteracuda_buffer_resource, (void **) &refA)
        && enif_get_resource(env, argv[6], pteracuda_buffer_resource, (void **) &refX)
        && enif_get_double(env, argv[7], &beta)
        && enif_get_resource(env, argv[8], pteracuda_buffer_resource, (void **) &refY);
    if (!parsed) {
        return enif_make_badarg(env);
    }

    PCudaMatrixFloatBuffer *matA = (PCudaMatrixFloatBuffer *) refA->buffer;

    // A is always stored m x n, regardless of the transpose flag.
    if (matA->rows() != m || matA->cols() != n) {
        return enif_make_tuple2(env, ATOM_ERROR, enif_make_atom(env, "Matrix A dimensions do not match m,n parameters"));
    }

    cuCtxSetCurrent(ctxRef->ctx);
    pcuda_gemv(transpose, m, n, alpha,
               matA->get_data(),
               ((PCudaFloatBuffer *) refX->buffer)->get_data(),
               beta,
               ((PCudaFloatBuffer *) refY->buffer)->get_data());

    return ATOM_OK;
}
Пример #2
0
// NIF: transpose(Ctx, A, B). Writes the transpose of matrix A into B.
// B must already be allocated with the transposed shape of A.
ERL_NIF_TERM pteracuda_nifs_transpose(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
    PCudaContextRef *ctxRef;
    PCudaBufferRef *refA, *refB;

    const int parsed = (argc == 3)
        && enif_get_resource(env, argv[0], pteracuda_context_resource, (void **) &ctxRef)
        && enif_get_resource(env, argv[1], pteracuda_buffer_resource, (void **) &refA)
        && enif_get_resource(env, argv[2], pteracuda_buffer_resource, (void **) &refB);
    if (!parsed) {
        return enif_make_badarg(env);
    }

    PCudaMatrixFloatBuffer *matA = (PCudaMatrixFloatBuffer *) refA->buffer;
    PCudaMatrixFloatBuffer *matB = (PCudaMatrixFloatBuffer *) refB->buffer;

    // B must be shaped exactly as A transposed.
    if (matA->rows() != matB->cols() || matA->cols() != matB->rows()) {
        return enif_make_tuple2(env, ATOM_ERROR, enif_make_atom(env, "Size A does not match the transpose size B."));
    }

    unsigned long m = matA->rows();
    unsigned long n = matA->cols();

    cuCtxSetCurrent(ctxRef->ctx);

    // The matrix buffer stores data column major, so the effective
    // rows-by-columns layout handed to the kernel is n x m.
    pcuda_transpose(n, m, matA->get_data(), matB->get_data());

    return ATOM_OK;
}
Пример #3
0
// NIF: gd_learn(Ctx, Theta, X, Y, NumFeatures, NumSamples, LearningRate, Iterations).
// Runs gradient-descent training on the GPU, updating Theta in place.
ERL_NIF_TERM pteracuda_ml_gd_learn(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
    PCudaContextRef *ctxRef;
    PCudaBufferRef *refTheta, *refX, *refY;
    unsigned long numFeatures;
    unsigned long numSamples;
    unsigned long iterations;
    double learningRate;

    const int parsed = (argc == 8)
        && enif_get_resource(env, argv[0], pteracuda_context_resource, (void **) &ctxRef)
        && enif_get_resource(env, argv[1], pteracuda_buffer_resource, (void **) &refTheta)
        && enif_get_resource(env, argv[2], pteracuda_buffer_resource, (void **) &refX)
        && enif_get_resource(env, argv[3], pteracuda_buffer_resource, (void **) &refY)
        && enif_get_ulong(env, argv[4], &numFeatures)
        && enif_get_ulong(env, argv[5], &numSamples)
        && enif_get_double(env, argv[6], &learningRate)
        && enif_get_ulong(env, argv[7], &iterations);
    if (!parsed) {
        return enif_make_badarg(env);
    }

    cuCtxSetCurrent(ctxRef->ctx);

    // The learning rate is narrowed to float to match the device kernel.
    pcuda_gd_learn(((PCudaFloatBuffer *) refTheta->buffer)->get_data(),
                   ((PCudaFloatBuffer *) refX->buffer)->get_data(),
                   ((PCudaFloatBuffer *) refY->buffer)->get_data(),
                   numFeatures, numSamples, (float) learningRate, iterations);

    return ATOM_OK;
}
Пример #4
0
// NIF: saxpy(Ctx, A, X, Y). Computes Y <- A*X + Y element-wise on the GPU.
// Both vectors must have the same length.
ERL_NIF_TERM pteracuda_nifs_saxpy(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
    PCudaContextRef *ctxRef;
    PCudaBufferRef *refX, *refY;
    double a;

    const int parsed = (argc == 4)
        && enif_get_resource(env, argv[0], pteracuda_context_resource, (void **) &ctxRef)
        && enif_get_double(env, argv[1], &a)
        && enif_get_resource(env, argv[2], pteracuda_buffer_resource, (void **) &refX)
        && enif_get_resource(env, argv[3], pteracuda_buffer_resource, (void **) &refY);
    if (!parsed) {
        return enif_make_badarg(env);
    }

    PCudaFloatBuffer *x = (PCudaFloatBuffer *) refX->buffer;
    PCudaFloatBuffer *y = (PCudaFloatBuffer *) refY->buffer;

    if (x->get_data()->size() != y->get_data()->size()) {
        return enif_make_tuple2(env, ATOM_ERROR, enif_make_atom(env, "Size X does not match size Y."));
    }

    cuCtxSetCurrent(ctxRef->ctx);
    pcuda_saxpy(a, x->get_data(), y->get_data());

    return ATOM_OK;
}
Пример #5
0
/* Bind ctx as the calling thread's current CUDA context, remembering
 * whatever context was current so cuda_exit() can restore it later.
 * The CUresult of the switch is stashed in ctx->err. */
void cuda_enter(cuda_context *ctx) {
  ASSERT_CTX(ctx);
  /* Save the previously-current context before switching. */
  cuCtxGetCurrent(&ctx->old);
  if (ctx->old != ctx->ctx)
    ctx->err = cuCtxSetCurrent(ctx->ctx);
  /* If no context was there in the first place, then we take over
     to avoid the set dance on the thread */
  if (ctx->old == NULL) ctx->old = ctx->ctx;
}
Пример #6
0
// NIF: intersection(Ctx, First, Second).
// Returns {ok, Intersection} of the two buffers, computed on the GPU.
ERL_NIF_TERM pteracuda_nifs_buffer_intersection(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
    PCudaContextRef *ctxRef;
    PCudaBufferRef *first, *second;

    const int parsed = (argc == 3)
        && enif_get_resource(env, argv[0], pteracuda_context_resource, (void **) &ctxRef)
        && enif_get_resource(env, argv[1], pteracuda_buffer_resource, (void **) &first)
        && enif_get_resource(env, argv[2], pteracuda_buffer_resource, (void **) &second);
    if (!parsed) {
        return enif_make_badarg(env);
    }

    cuCtxSetCurrent(ctxRef->ctx);
    return enif_make_tuple2(env, ATOM_OK, first->buffer->intersect(env, second->buffer));
}
Пример #7
0
/* R wrapper: make the CUDA context held in r_ctx current on this thread.
 * Returns the CUresult converted to the corresponding R enum value. */
SEXP R_cuCtxSetCurrent(SEXP r_ctx)
{
    /* Unwrap the external pointer into a driver-API context handle. */
    CUcontext ctx = (CUcontext) getRReference(r_ctx);

    CUresult status = cuCtxSetCurrent(ctx);

    return Renum_convert_CUresult(status);
}
Пример #8
0
///////////////////Matrix operations
// C(m,n) = A(m,k) * B(k,n)
//gemm(_Ctx, _transpose_op_A, _transpose_op_B, _m, _n, _k, _alpha, _A, _B, _beta, _C ) 
// NIF: gemm(Ctx, TransposeOpA, TransposeOpB, M, N, K, Alpha, A, B, Beta, C).
// Computes C(m,n) = alpha * op(A)(m,k) * op(B)(k,n) + beta * C(m,n).
// Each stored matrix is validated against the shape implied by its
// transpose flag before the kernel is invoked.
ERL_NIF_TERM pteracuda_nifs_gemm(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
    PCudaContextRef *ctxRef;
    PCudaBufferRef *ref_A, *ref_B, *ref_C;
    unsigned long transpose_a, transpose_b;
    unsigned long  m, n, k;
    double alpha, beta;

    if (argc != 11 || !enif_get_resource(env, argv[0], pteracuda_context_resource, (void **) &ctxRef) ||
        !enif_get_ulong(env, argv[1], &transpose_a)||
        !enif_get_ulong(env, argv[2], &transpose_b)||
        !enif_get_ulong(env, argv[3], &m)||
        !enif_get_ulong(env, argv[4], &n)||
        !enif_get_ulong(env, argv[5], &k)||
        !enif_get_double(env, argv[6], &alpha)||
        !enif_get_resource(env, argv[7], pteracuda_buffer_resource, (void **) &ref_A) ||
        !enif_get_resource(env, argv[8], pteracuda_buffer_resource, (void **) &ref_B)||
        !enif_get_double(env, argv[9], &beta)||
        !enif_get_resource(env, argv[10], pteracuda_buffer_resource, (void **) &ref_C)
        ) {
        return enif_make_badarg(env);
    }

    PCudaMatrixFloatBuffer *matA = (PCudaMatrixFloatBuffer*)ref_A->buffer;
    PCudaMatrixFloatBuffer *matB = (PCudaMatrixFloatBuffer*)ref_B->buffer;
    PCudaMatrixFloatBuffer *matC = (PCudaMatrixFloatBuffer*)ref_C->buffer;

    if(transpose_a == CUBLAS_OP_N){
        // op(A) = A: the stored A must be m x k.
        if(matA->rows() != m || matA->cols() != k){
            return enif_make_tuple2(env, ATOM_ERROR, enif_make_atom(env, "Matrix A dimensions do not match m,k parameters")); 
        }
    }else{
        // op(A) = A^T: the stored A must be k x m.
        // BUGFIX: cols() was previously compared against n, so the wrong
        // dimension was validated whenever m != n.
        if(matA->rows() != k || matA->cols() != m){
            return enif_make_tuple2(env, ATOM_ERROR, enif_make_atom(env, "Matrix A dimensions do not match m,k parameters")); 
        }
    }

    if(transpose_b == CUBLAS_OP_N){
        // op(B) = B: the stored B must be k x n.
        if(matB->rows() != k || matB->cols() != n){
            return enif_make_tuple2(env, ATOM_ERROR, enif_make_atom(env, "Matrix B dimensions do not match k,n parameters")); 
        }
    }else{
        // op(B) = B^T: the stored B must be n x k.
        if(matB->rows() != n || matB->cols() != k){
            return enif_make_tuple2(env, ATOM_ERROR, enif_make_atom(env, "Matrix B dimensions do not match k,n parameters")); 
        }
    }    

    // The result C always has the un-transposed shape m x n.
    if(matC->rows() != m || matC->cols() != n){
        return enif_make_tuple2(env, ATOM_ERROR, enif_make_atom(env, "Matrix C dimensions do not match m,n parameters")); 
    }

    cuCtxSetCurrent(ctxRef->ctx);
    pcuda_gemm(transpose_a, transpose_b, m, n, k, alpha,
               matA->get_data(), matB->get_data(), beta, matC->get_data());
    
    return ATOM_OK;
}
Пример #9
0
// NIF: minmax(Ctx, Buffer). Returns {ok, {Min, Max}}.
// An empty buffer yields {ok, {0, 0}} without touching the GPU.
ERL_NIF_TERM pteracuda_nifs_buffer_minmax(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
    PCudaContextRef *ctxRef;
    PCudaBufferRef *bufRef;

    const int parsed = (argc == 2)
        && enif_get_resource(env, argv[0], pteracuda_context_resource, (void **) &ctxRef)
        && enif_get_resource(env, argv[1], pteracuda_buffer_resource, (void **) &bufRef);
    if (!parsed) {
        return enif_make_badarg(env);
    }

    if (bufRef->buffer->size() == 0) {
        ERL_NIF_TERM zero = enif_make_int(env, 0);
        return enif_make_tuple2(env, ATOM_OK, enif_make_tuple2(env, zero, zero));
    }

    cuCtxSetCurrent(ctxRef->ctx);
    return enif_make_tuple2(env, ATOM_OK, bufRef->buffer->minmax(env));
}
Пример #10
0
// NIF: sort(Ctx, Buffer). Sorts the buffer in place on the GPU.
// Returns ok on success, error if the underlying sort fails.
ERL_NIF_TERM pteracuda_nifs_sort_buffer(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
    PCudaContextRef *ctxRef;
    PCudaBufferRef *ref;

    const int parsed = (argc == 2)
        && enif_get_resource(env, argv[0], pteracuda_context_resource, (void **) &ctxRef)
        && enif_get_resource(env, argv[1], pteracuda_buffer_resource, (void **) &ref);
    if (!parsed) {
        return enif_make_badarg(env);
    }

    cuCtxSetCurrent(ctxRef->ctx);
    return ref->buffer->sort() ? ATOM_OK : ATOM_ERROR;
}
Пример #11
0
/* Release the device-side storage backing mem_obj on the given device.
 * Host-pinned allocations (CL_MEM_ALLOC_HOST_PTR) are freed through the
 * host allocator; everything else is a plain device allocation. */
void
pocl_cuda_free (cl_device_id device, cl_mem mem_obj)
{
  pocl_cuda_device_data_t *data = (pocl_cuda_device_data_t *)device->data;

  cuCtxSetCurrent (data->context);

  if (mem_obj->flags & CL_MEM_ALLOC_HOST_PTR)
    {
      /* Pinned host memory: free via the host allocator and clear the
         stale host pointer. */
      cuMemFreeHost (mem_obj->mem_host_ptr);
      mem_obj->mem_host_ptr = NULL;
    }
  else
    {
      /* Ordinary device allocation. */
      cuMemFree ((CUdeviceptr)mem_obj->device_ptrs[device->dev_id].mem_ptr);
    }
}
Пример #12
0
// NIF: contains(Ctx, Buffer, Value). Returns true/false.
// Empty buffers trivially contain nothing, so the GPU is skipped.
ERL_NIF_TERM pteracuda_nifs_buffer_contains(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
    PCudaContextRef *ctxRef;
    PCudaBufferRef *ref;

    const int parsed = (argc == 3)
        && enif_get_resource(env, argv[0], pteracuda_context_resource, (void **) &ctxRef)
        && enif_get_resource(env, argv[1], pteracuda_buffer_resource, (void **) &ref);
    if (!parsed) {
        return enif_make_badarg(env);
    }

    if (ref->buffer->size() == 0) {
        return ATOM_FALSE;
    }

    cuCtxSetCurrent(ctxRef->ctx);
    return ref->buffer->contains(env, argv[2]) ? ATOM_TRUE : ATOM_FALSE;
}
Пример #13
0
	// Prime the NVENC encoder by pushing one forced-IDR dummy frame through
	// the full encode/lock/unlock cycle; some encoder initialization only
	// happens on the first nvEncEncodePicture call.
	// NOTE(review): assumes m_PictureParameters was fully populated by the
	// caller before this runs — confirm against the enclosing class.
	void Initialize()
	{
		cuCtxSetCurrent(m_Context);
		// Encode a dummy frame since it seems that some initialization is done on the first encoding
		m_PictureParameters.encodePicFlags = NV_ENC_PIC_FLAG_FORCEIDR;
#ifdef ASYNCHRONOUS
		// Sanity check
		_ASSERT(WaitForSingleObject(m_PictureParameters.completionEvent, 0) == WAIT_TIMEOUT);
#endif
		CHECK_NVENC_STATUS(m_FunctionList.nvEncEncodePicture(m_pEncoder, &m_PictureParameters));
#ifdef ASYNCHRONOUS
		// Block until the encoder signals completion of the dummy frame.
		DWORD nWaitResult = WaitForSingleObject(m_PictureParameters.completionEvent, INFINITE);
		// Sanity check
		_ASSERT(nWaitResult == WAIT_OBJECT_0);
#endif
		// Lock and immediately unlock the output bitstream to drain the
		// dummy frame's data.
		NV_ENC_LOCK_BITSTREAM LockBitstream = { NV_ENC_LOCK_BITSTREAM_VER, 0 };
		LockBitstream.sliceOffsets = NULL;
		LockBitstream.outputBitstream = m_PictureParameters.outputBitstream;
		CHECK_NVENC_STATUS(m_FunctionList.nvEncLockBitstream(m_pEncoder, &LockBitstream));
		CHECK_NVENC_STATUS(m_FunctionList.nvEncUnlockBitstream(m_pEncoder, LockBitstream.outputBitstream));
	}
Пример #14
0
// NIF: log(Ctx, A, B). Computes the element-wise natural log of A into B.
// The output buffer B must match A's size exactly.
ERL_NIF_TERM pteracuda_nifs_log(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
    PCudaContextRef *ctxRef;
    PCudaBufferRef *refA, *refB;

    const int parsed = (argc == 3)
        && enif_get_resource(env, argv[0], pteracuda_context_resource, (void **) &ctxRef)
        && enif_get_resource(env, argv[1], pteracuda_buffer_resource, (void **) &refA)
        && enif_get_resource(env, argv[2], pteracuda_buffer_resource, (void **) &refB);
    if (!parsed) {
        return enif_make_badarg(env);
    }

    PCudaFloatBuffer *bufA = (PCudaFloatBuffer *) refA->buffer;
    PCudaFloatBuffer *bufB = (PCudaFloatBuffer *) refB->buffer;

    if (bufA->size() != bufB->size()) {
        return enif_make_tuple2(env, ATOM_ERROR, enif_make_atom(env, "Buffer A size does not match buffer B size"));
    }

    cuCtxSetCurrent(ctxRef->ctx);
    pcuda_log(bufA->get_data(), bufB->get_data());

    return ATOM_OK;
}
Пример #15
0
/* Allocate or map device-visible memory backing mem_obj on this device.
 *
 * Three allocation strategies, selected by the cl_mem flags:
 *   CL_MEM_USE_HOST_PTR   - register the caller's host_ptr and map it to a
 *                           device pointer (plain cuMemAlloc on ARM, where
 *                           registration is unavailable);
 *   CL_MEM_ALLOC_HOST_PTR - allocate pinned, device-mapped host memory;
 *   otherwise             - a plain device allocation via cuMemAlloc.
 * CL_MEM_COPY_HOST_PTR additionally copies host_ptr's contents to the device.
 *
 * The allocation is recorded once per global memory (global_mem_id) and
 * shared into this device's own device_ptrs slot.
 * Returns CL_SUCCESS, or CL_MEM_OBJECT_ALLOCATION_FAILURE if cuMemAlloc
 * fails on the plain-device path. */
cl_int
pocl_cuda_alloc_mem_obj (cl_device_id device, cl_mem mem_obj, void *host_ptr)
{
  cuCtxSetCurrent (((pocl_cuda_device_data_t *)device->data)->context);

  CUresult result;
  void *b = NULL;

  /* if memory for this global memory is not yet allocated -> do it */
  if (mem_obj->device_ptrs[device->global_mem_id].mem_ptr == NULL)
    {
      cl_mem_flags flags = mem_obj->flags;

      if (flags & CL_MEM_USE_HOST_PTR)
        {
#if defined __arm__
          // cuMemHostRegister is not supported on ARM
          // Allocate device memory and perform explicit copies
          // before and after running a kernel
          result = cuMemAlloc ((CUdeviceptr *)&b, mem_obj->size);
          CUDA_CHECK (result, "cuMemAlloc");
#else
          result = cuMemHostRegister (host_ptr, mem_obj->size,
                                      CU_MEMHOSTREGISTER_DEVICEMAP);
          /* Already-registered memory is fine; any other failure is fatal. */
          if (result != CUDA_SUCCESS
              && result != CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED)
            CUDA_CHECK (result, "cuMemHostRegister");
          result = cuMemHostGetDevicePointer ((CUdeviceptr *)&b, host_ptr, 0);
          CUDA_CHECK (result, "cuMemHostGetDevicePointer");
#endif
        }
      else if (flags & CL_MEM_ALLOC_HOST_PTR)
        {
          /* Pinned host allocation, mapped into the device address space. */
          result = cuMemHostAlloc (&mem_obj->mem_host_ptr, mem_obj->size,
                                   CU_MEMHOSTREGISTER_DEVICEMAP);
          CUDA_CHECK (result, "cuMemHostAlloc");
          result = cuMemHostGetDevicePointer ((CUdeviceptr *)&b,
                                              mem_obj->mem_host_ptr, 0);
          CUDA_CHECK (result, "cuMemHostGetDevicePointer");
        }
      else
        {
          /* Plain device allocation; report OOM to the caller instead of
             aborting, since allocation failure is a recoverable CL error. */
          result = cuMemAlloc ((CUdeviceptr *)&b, mem_obj->size);
          if (result != CUDA_SUCCESS)
            {
              const char *err;
              cuGetErrorName (result, &err);
              POCL_MSG_PRINT2 (__FUNCTION__, __LINE__,
                               "-> Failed to allocate memory: %s\n", err);
              return CL_MEM_OBJECT_ALLOCATION_FAILURE;
            }
        }

      if (flags & CL_MEM_COPY_HOST_PTR)
        {
          result = cuMemcpyHtoD ((CUdeviceptr)b, host_ptr, mem_obj->size);
          CUDA_CHECK (result, "cuMemcpyHtoD");
        }

      mem_obj->device_ptrs[device->global_mem_id].mem_ptr = b;
      mem_obj->device_ptrs[device->global_mem_id].global_mem_id
          = device->global_mem_id;
    }

  /* copy already allocated global mem info to devices own slot */
  mem_obj->device_ptrs[device->dev_id]
      = mem_obj->device_ptrs[device->global_mem_id];

  return CL_SUCCESS;
}
Пример #16
0
	// Unbind any CUDA context from the calling thread.
	void cuda_pop_context()
	{
		cuda_assert(cuCtxSetCurrent(NULL));
	}
Пример #17
0
		return true;
	}

#define cuda_error(stmt) cuda_error_(stmt, #stmt)

	void cuda_error_message(const string& message)
	{
		if(error_msg == "")
			error_msg = message;
		fprintf(stderr, "%s\n", message.c_str());
		cuda_error_documentation();
	}

	// Make this device's CUDA context current on the calling thread.
	void cuda_push_context()
	{
		cuda_assert(cuCtxSetCurrent(cuContext))
	}

	// Unbind any CUDA context from the calling thread.
	void cuda_pop_context()
	{
		cuda_assert(cuCtxSetCurrent(NULL));
	}

	CUDADevice(DeviceInfo& info, Stats &stats, bool background_) : Device(stats)
	{
		first_error = true;
		background = background_;

		cuDevId = info.num;
		cuDevice = 0;
		cuContext = 0;
Пример #18
0
 /* Tear down a CUDA context: bind it to this thread, wait for all
  * outstanding work to finish, then destroy it. */
 static void dispose(CUcontext context) {
     cuda_check( cuCtxSetCurrent(context) );
     cuda_check( cuCtxSynchronize()       );
     cuda_check( cuCtxDestroy(context)    );
 }
Пример #19
0
/* Per-GPU worker thread for the part-filter distance transform.
 *
 * Each thread owns one device (selected by pt->pid): it binds that
 * device's context, uploads the shared inputs, launches the inverse_Q,
 * dt1d_x and dt1d_y kernels in sequence, and copies this device's slice
 * of the results (M, tmpIx, tmpIy) back to host memory. Thread 0 also
 * accumulates memcpy/kernel wall-clock times into the globals
 * time_memcpy / time_kernel. Any CUDA failure terminates the process.
 *
 * NOTE(review): relies on many file-scope arrays (ctx, dev, *_dev,
 * func_*) being fully initialized before the threads start — confirm
 * against the caller. */
static CUT_THREADPROC dt_thread_func(void *p)
{
	dt_partition *pt = (dt_partition *)p;
	struct timeval tv;
	CUresult res;
	int thread_num_x=0, thread_num_y=0;
	int block_num_x=0, block_num_y=0;

	/* Bind this thread to its device's context. */
	res = cuCtxSetCurrent(ctx[pt->pid]);
	if(res != CUDA_SUCCESS) {
		printf("cuCtxSetCurrent(ctx[%d]) failed: res = %s\n", pt->pid, cuda_response_to_string(res));
		exit(1);
	}

	/* upload input data to GPU memory */

	//printf("part_error_array_num = %d\n",part_error_array_num);


	if(pt->pid == 0){
		gettimeofday(&tv_memcpy_start, NULL);
	}

	res = cuMemcpyHtoD(part_C_dev[pt->pid], dst_C, SUM_SIZE_C);
	if(res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoD(part_C_dev) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemcpyHtoD(part_error_array_dev[pt->pid], part_error_array, part_error_array_num*sizeof(int));
	if(res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoD(part_error_array_dev) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemcpyHtoD(pm_size_array_dev[pt->pid], &pt->size_array[0][0], pt->NoP*2*pt->L_MAX*sizeof(int));
	if(res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoD(pm_size_array_dev) falied: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemcpyHtoD(def_array_dev[pt->pid], pt->def, sum_size_def_array);
	if(res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoD(def_array_dev) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemcpyHtoD(numpart_dev[pt->pid], pt->numpart, pt->NoC*sizeof(int));
	if(res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoD(cuMemcpyHtoD(numpart_dev) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemcpyHtoD(PIDX_array_dev[pt->pid], pt->dst_PIDX, pt->tmp_array_size);
	if(res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoD(PIDX_array) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemcpyHtoD(DID_4_array_dev[pt->pid], pt->dst_DID_4, pt->tmp_array_size);
	if(res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoD(DID_4__array) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}


	if(pt->pid == 0){
		gettimeofday(&tv_memcpy_end, NULL);
		tvsub(&tv_memcpy_end, &tv_memcpy_start, &tv);
		time_memcpy += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;
	}

	int sharedMemBytes = 0;

	/* get max thread num per block */
	int max_threads_num = 0;
	res = cuDeviceGetAttribute(&max_threads_num, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev[pt->pid]);
	if(res != CUDA_SUCCESS){
		printf("\ncuDeviceGetAttribute() failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	/* prepare for launch inverse_Q */
	void* kernel_args_inverse[] = {
		&part_C_dev[pt->pid],
		&pm_size_array_dev[pt->pid],
		&part_error_array_dev[pt->pid],
		&part_error_array_num,
		(void*)&(pt->NoP),
		&PIDX_array_dev[pt->pid],
		&numpart_dev[pt->pid],
		(void*)&(pt->NoC),
		(void*)&(pt->max_numpart),
		(void*)&(pt->interval),
		(void*)&(pt->L_MAX),
		(void*)&(pt->pid),
		(void*)&(device_num)
	};

	/* define CUDA block shape */
	int upper_limit_th_num_x = max_threads_num/(pt->max_numpart*pt->NoC);
	int upper_limit_th_num_y = max_threads_num/upper_limit_th_num_x;
	if(upper_limit_th_num_x < 1) upper_limit_th_num_x++;
	if(upper_limit_th_num_y < 1) upper_limit_th_num_y++;

	thread_num_x = (pt->max_dim0*pt->max_dim1 < upper_limit_th_num_x) ? (pt->max_dim0*pt->max_dim1) : upper_limit_th_num_x;
	thread_num_y = (pt->max_numpart < upper_limit_th_num_y) ? pt->max_numpart : upper_limit_th_num_y;

	/* ceil-div so partial tiles at the grid edge are still covered */
	block_num_x = (pt->max_dim0*pt->max_dim1) / thread_num_x;
	block_num_y = (pt->max_numpart) / thread_num_y;
	if((pt->max_dim0*pt->max_dim1) % thread_num_x != 0) block_num_x++;
	if(pt->max_numpart % thread_num_y != 0) block_num_y++;


	int blockDimY = thread_num_y / device_num;
	if(thread_num_y%device_num != 0){
		blockDimY++;
	}

	/* launch inverse_Q */
	if(pt->pid == 0){
		gettimeofday(&tv_kernel_start, NULL);
	}
	res = cuLaunchKernel(
		func_inverse_Q[pt->pid],      // call function
		block_num_x,         // gridDimX
		block_num_y,         // gridDimY
		pt->L_MAX-pt->interval,      // gridDimZ
		thread_num_x,        // blockDimX
		blockDimY,        // blockDimY
		pt->NoC,                 // blockDimZ
		sharedMemBytes,      // sharedMemBytes
		NULL,                // hStream
		kernel_args_inverse, // kernelParams
		NULL                 // extra
		);
	if(res != CUDA_SUCCESS) {
		printf("block_num_x %d, block_num_y %d, thread_num_x %d, thread_num_y %d\n", block_num_x, block_num_y, thread_num_x, thread_num_y);
		printf("cuLaunchKernel(inverse_Q) failed : res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuCtxSynchronize();
	if(res != CUDA_SUCCESS) {
		printf("cuCtxSynchronize(inverse_Q) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	if(pt->pid == 0){
		gettimeofday(&tv_kernel_end, NULL);
		tvsub(&tv_kernel_end, &tv_kernel_start, &tv);
		time_kernel += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;
	}


	/* prepare for launch dt1d_x */
	void* kernel_args_x[] = {
		&part_C_dev[pt->pid],                  // FLOAT *src_start
		&tmpM_dev[pt->pid],                    // FLOTA *dst
		&tmpIy_dev[pt->pid],                   // int *ptr
		&DID_4_array_dev[pt->pid],             // int *DID_4_array,
		&def_array_dev[pt->pid],               // FLOAT *def_array,
		&pm_size_array_dev[pt->pid],           // int *size_array
		(void*)&(pt->NoP),                  // int NoP
		&PIDX_array_dev[pt->pid],              // int *PIDX_array
		&part_error_array_dev[pt->pid],        // int *error_array
		(void*)&(part_error_array_num), // int error_array_num
		&numpart_dev[pt->pid],                 // int *numpart
		(void*)&(pt->NoC),                  // int NoC
		(void*)&(pt->max_numpart),          // int max_numpart
		(void*)&(pt->interval),             // int interval
		(void*)&(pt->L_MAX),                 // int L_MAX
		(void*)&(pt->pid),                   // int pid
		(void*)&(device_num)                 // int device_num
	};


	max_threads_num = 64/pt->NoC;
	if(max_threads_num < 1) max_threads_num++;

	thread_num_x = (pt->max_dim1 < max_threads_num) ? pt->max_dim1 : max_threads_num;
	thread_num_y = (pt->max_numpart < max_threads_num) ? pt->max_numpart : max_threads_num;

	block_num_x = pt->max_dim1 / thread_num_x;
	block_num_y = pt->max_numpart / thread_num_y;
	if(pt->max_dim1 % thread_num_x != 0) block_num_x++;
	if(pt->max_numpart % thread_num_y != 0) block_num_y++;

	blockDimY = thread_num_y / device_num;
	if(thread_num_y%device_num != 0){
		blockDimY++;
	}

	/* launch dt1d_x */
	if(pt->pid == 0){
		gettimeofday(&tv_kernel_start, NULL);
	}

	res = cuLaunchKernel(
		func_dt1d_x[pt->pid],    // call function
		block_num_x,    // gridDimX
		block_num_y,    // gridDimY
		pt->L_MAX-pt->interval, // gridDimZ
		thread_num_x,   // blockDimX
		blockDimY,   // blockDimY
		pt->NoC,            // blockDimZ
		sharedMemBytes, // sharedMemBytes
		NULL,           // hStream
		kernel_args_x,  // kernelParams
		NULL            // extra
		);
	if(res != CUDA_SUCCESS) {

		printf("block_num_x %d, block_num_y %d, thread_num_x %d, thread_num_y %d\n", block_num_x, block_num_y, thread_num_x, thread_num_y);

		printf("cuLaunchKernel(dt1d_x) failed : res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuCtxSynchronize();
	if(res != CUDA_SUCCESS) {
		printf("cuCtxSynchronize(dt1d_x) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	if(pt->pid == 0){
		gettimeofday(&tv_kernel_end, NULL);
		tvsub(&tv_kernel_end, &tv_kernel_start, &tv);
		time_kernel += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;
	}


	/* prepare for launch dt1d_y */
	void* kernel_args_y[] = {
		&tmpM_dev[pt->pid],                    // FLOAT *src_start
		&M_dev[pt->pid],                       // FLOAT *dst_start
		&tmpIx_dev[pt->pid],                   // int *ptr_start
		&DID_4_array_dev[pt->pid],             // int *DID_4_array,
		&def_array_dev[pt->pid],               // FLOAT *def_array,
		(void*)&(pt->NoP),                  // int NoP
		&pm_size_array_dev[pt->pid],           // int *size_array
		&numpart_dev[pt->pid],                 // int *numpart,
		&PIDX_array_dev[pt->pid],              // int *PIDX_array,
		(void*)&(pt->NoC),                  // int NoC
		(void*)&(pt->max_numpart),          // int max_numpart
		(void*)&(pt->interval),             // int interval
		(void*)&(pt->L_MAX),                // int L_MAX
		&part_error_array_dev[pt->pid],        // int *error_array
		(void*)&(part_error_array_num), // int error_array_num
		(void*)&(pt->pid),                   // int pid
		(void*)&(device_num)                 // int device_num
	};


	thread_num_x = (pt->max_dim0 < max_threads_num) ? pt->max_dim0 : max_threads_num;
	thread_num_y = (pt->max_numpart < max_threads_num) ? pt->max_numpart : max_threads_num;

	block_num_x = pt->max_dim0 / thread_num_x;
	block_num_y = pt->max_numpart / thread_num_y;
	if(pt->max_dim0 % thread_num_x != 0) block_num_x++;
	if(pt->max_numpart % thread_num_y != 0) block_num_y++;

	blockDimY = thread_num_y / device_num;
	if(thread_num_y%device_num != 0){
		blockDimY++;
	}

	/* launch dt1d_y */
	if(pt->pid == 0){
		gettimeofday(&tv_kernel_start, NULL);
	}

	res = cuLaunchKernel(
		func_dt1d_y[pt->pid],    // call functions
		block_num_x,    // gridDimX
		block_num_y,    // gridDimY
		pt->L_MAX-pt->interval, // gridDimZ
		thread_num_x,   // blockDimX
		blockDimY,   // blockDimY
		pt->NoC,            // blockDimZ
		sharedMemBytes, // sharedMemBytes
		NULL,           // hStream
		kernel_args_y,  // kernelParams
		NULL            // extra
		);
	if(res != CUDA_SUCCESS) {
		printf("cuLaunchKernel(dt1d_y failed : res = %s\n", cuda_response_to_string(res));
		exit(1);
	}


	res = cuCtxSynchronize();
	if(res != CUDA_SUCCESS) {
		printf("cuCtxSynchronize(dt1d_y) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	if(pt->pid == 0){
		gettimeofday(&tv_kernel_end, NULL);
		tvsub(&tv_kernel_end, &tv_kernel_start, &tv);
		time_kernel += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;
	}



	/* download results from GPU */

	/* download M from GPU.
	   For each (level, component) the parts are split across devices by
	   pid: part_size is this device's slice, pointer_size is the offset
	   of that slice within the region, move_size is the whole region. */

	int sum_part_size = 0;
	int sum_pointer_size = 0;
	int sum_move_size = 0;
	int part_size = 0;
	int pointer_size = 0;
	int part_y = 0;
	int move_size = 0;
	int start_kk = 0;
	int end_kk = 0;
	int part_end_kk = 0;
	unsigned long long int pointer_dst_M = (unsigned long long int)pt->dst_M;
	unsigned long long int pointer_M_dev = (unsigned long long int)M_dev[pt->pid];

	for(int L=0; L<(pt->L_MAX-pt->interval); L++) {

		/**************************************************************************/
		/* loop condition: skip levels whose feature map is too small */
		if( (pt->FSIZE[(L+pt->interval)*2]+2*pt->pady < pt->max_Y) || (pt->FSIZE[(L+pt->interval)*2+1]+2*pt->padx < pt->max_X) )
		{
			continue;
		}
		/* loop condition */
		/**************************************************************************/


		for(int jj=0; jj<pt->NoC; jj++) {

			/* parts per device for this component (ceil-div) */
			part_y = pt->numpart[jj] / device_num;
			if(pt->numpart[jj]%device_num != 0){
				part_y++;
			}


			start_kk = part_y * pt->pid;
			end_kk = part_y * (pt->pid + 1);

			if(end_kk > pt->numpart[jj]){
				end_kk = pt->numpart[jj];
			}

			if(pt->pid > 0){
				part_end_kk = part_y * pt->pid;
			}

			for(int kk=0; kk<pt->numpart[jj]; kk++) {

				int PIDX = pt->PIDX_array[L][jj][kk];
				int dims0 = pt->size_array[L][PIDX*2];
				int dims1 = pt->size_array[L][PIDX*2+1];
				if(start_kk <= kk && kk < end_kk){
					part_size += dims0 * dims1;
				}
				//if(pt->pid > 0 && part_start_kk <= kk && kk < part_end_kk){
				if(pt->pid > 0 && 0 <= kk && kk < part_end_kk){
					pointer_size += dims0 * dims1;
				}
				move_size += dims0 * dims1;
			}

			sum_part_size += part_size;
			sum_pointer_size += pointer_size;
			sum_move_size += move_size;

			// error pt->pid == 2 && L == 24 && jj == 1

			if(pt->pid*part_y < pt->numpart[jj]){

				if(pt->pid == 0){
					gettimeofday(&tv_memcpy_start, NULL);
				}

				/* NOTE(review): src and dst are offset by the same
				   pointer_size, so host and device layouts must match. */
				res = cuMemcpyDtoH((void *)(pointer_dst_M+(unsigned long long int)(pointer_size*sizeof(FLOAT))), (CUdeviceptr)(pointer_M_dev+(unsigned long long int)(pointer_size*sizeof(FLOAT))), part_size*sizeof(FLOAT));
				if(res != CUDA_SUCCESS) {
					printf("error pid = %d\n",pt->pid);
					printf("cuMemcpyDtoH(dst_M) failed: res = %s\n", cuda_response_to_string(res));
					exit(1);
				}

				if(pt->pid == 0){
					gettimeofday(&tv_memcpy_end, NULL);
					tvsub(&tv_memcpy_end, &tv_memcpy_start, &tv);
					time_memcpy += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;
				}

			}

			pointer_dst_M += (unsigned long long int)(move_size * sizeof(FLOAT));
			pointer_M_dev += (unsigned long long int)(move_size * sizeof(FLOAT));

			part_size = 0;
			pointer_size = 0;
			move_size = 0;

		}

	}


	/* download tmpIx from GPU (same slicing scheme as M above) */

	sum_part_size = 0;
	sum_pointer_size = 0;
	part_size = 0;
	pointer_size = 0;
	part_y = 0;
	move_size = 0;
	start_kk = 0;
	end_kk = 0;
	part_end_kk = 0;
	unsigned long long int pointer_dst_tmpIx = (unsigned long long int)pt->dst_tmpIx;
	unsigned long long int pointer_tmpIx_dev = (unsigned long long int)tmpIx_dev[pt->pid];


	for(int L=0; L<(pt->L_MAX-pt->interval); L++) {

		/**************************************************************************/
		/* loop condition: skip levels whose feature map is too small */
		if( (pt->FSIZE[(L+pt->interval)*2]+2*pt->pady < pt->max_Y) || (pt->FSIZE[(L+pt->interval)*2+1]+2*pt->padx < pt->max_X) )
		{
			continue;
		}
		/* loop condition */
		/**************************************************************************/


		for(int jj=0; jj<pt->NoC; jj++) {

			part_y = pt->numpart[jj] / device_num;
			if(pt->numpart[jj]%device_num != 0){
				part_y++;
			}

			start_kk = part_y * pt->pid;
			end_kk = part_y * (pt->pid + 1);

			if(end_kk > pt->numpart[jj]){
				end_kk = pt->numpart[jj];
			}

			if(pt->pid > 0){
				part_end_kk = part_y * pt->pid;
			}

			for(int kk=0; kk<pt->numpart[jj]; kk++) {

				int PIDX = pt->PIDX_array[L][jj][kk];
				int dims0 = pt->size_array[L][PIDX*2];
				int dims1 = pt->size_array[L][PIDX*2+1];
				if(start_kk <= kk && kk < end_kk){
					part_size += dims0 * dims1;
				}
				if(pt->pid > 0){
					if(0 <= kk && kk < part_end_kk){
						pointer_size += dims0 * dims1;
					}
				}
				move_size += dims0 * dims1;
			}

			sum_part_size += part_size;
			sum_pointer_size += pointer_size;


			if(pt->pid*part_y < pt->numpart[jj]){

				if(pt->pid == 0){
					gettimeofday(&tv_memcpy_start, NULL);
				}


				res = cuMemcpyDtoH((void *)(pointer_dst_tmpIx+(unsigned long long int)(pointer_size*sizeof(int))), (CUdeviceptr)(pointer_tmpIx_dev+(unsigned long long int)(pointer_size*sizeof(int))), part_size*sizeof(int));
				if(res != CUDA_SUCCESS) {
					printf("error pid = %d\n",pt->pid);
					printf("cuMemcpyDtoH(tmpIx) failed: res = %s\n", cuda_response_to_string(res));
					exit(1);
				}

				if(pt->pid == 0){
					gettimeofday(&tv_memcpy_end, NULL);
					tvsub(&tv_memcpy_end, &tv_memcpy_start, &tv);
					time_memcpy += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;
				}

			}

			pointer_dst_tmpIx += (unsigned long long int)(move_size * sizeof(int));
			pointer_tmpIx_dev += (unsigned long long int)(move_size * sizeof(int));

			part_size = 0;
			pointer_size = 0;
			move_size = 0;

		}

	}


	/* download tmpIy from GPU (same slicing scheme as M above) */

	sum_part_size = 0;
	sum_pointer_size = 0;
	part_size = 0;
	pointer_size = 0;
	part_y = 0;
	move_size = 0;
	start_kk = 0;
	end_kk = 0;
	part_end_kk = 0;
	unsigned long long int pointer_dst_tmpIy = (unsigned long long int)pt->dst_tmpIy;
	unsigned long long int pointer_tmpIy_dev = (unsigned long long int)tmpIy_dev[pt->pid];


	for(int L=0; L<(pt->L_MAX-pt->interval); L++) {

		/**************************************************************************/
		/* loop condition: skip levels whose feature map is too small */
		if( (pt->FSIZE[(L+pt->interval)*2]+2*pt->pady < pt->max_Y) || (pt->FSIZE[(L+pt->interval)*2+1]+2*pt->padx < pt->max_X) )
		{
			continue;
		}
		/* loop condition */
		/**************************************************************************/


		for(int jj=0; jj<pt->NoC; jj++) {

			part_y = pt->numpart[jj] / device_num;
			if(pt->numpart[jj]%device_num != 0){
				part_y++;
			}

			start_kk = part_y * pt->pid;
			end_kk = part_y * (pt->pid + 1);

			if(end_kk > pt->numpart[jj]){
				end_kk = pt->numpart[jj];
			}

			if(pt->pid > 0){
				part_end_kk = part_y * pt->pid;
			}

			for(int kk=0; kk<pt->numpart[jj]; kk++) {

				int PIDX = pt->PIDX_array[L][jj][kk];
				int dims0 = pt->size_array[L][PIDX*2];
				int dims1 = pt->size_array[L][PIDX*2+1];
				if(start_kk <= kk && kk < end_kk){
					part_size += dims0 * dims1;
				}
				if(pt->pid > 0){
					if(0 <= kk && kk < part_end_kk){
						pointer_size += dims0 * dims1;
					}
				}
				move_size += dims0 * dims1;
			}

			sum_part_size += part_size;
			sum_pointer_size += pointer_size;

			if(pt->pid*part_y < pt->numpart[jj]){

				if(pt->pid == 0){
					gettimeofday(&tv_memcpy_start, NULL);
				}

				res = cuMemcpyDtoH((void *)(pointer_dst_tmpIy+(unsigned long long int)(pointer_size*sizeof(int))), (CUdeviceptr)(pointer_tmpIy_dev+(unsigned long long int)(pointer_size*sizeof(int))), part_size*sizeof(int));
				if(res != CUDA_SUCCESS) {
					printf("error pid = %d\n",pt->pid);
					printf("cuMemcpyDtoH(tmpIy) failed: res = %s\n", cuda_response_to_string(res));
					exit(1);
				}

				if(pt->pid == 0){
					gettimeofday(&tv_memcpy_end, NULL);
					tvsub(&tv_memcpy_end, &tv_memcpy_start, &tv);
					time_memcpy += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;
				}

			}

			pointer_dst_tmpIy += (unsigned long long int)(move_size * sizeof(int));
			pointer_tmpIy_dev += (unsigned long long int)(move_size * sizeof(int));

			part_size = 0;
			pointer_size = 0;
			move_size = 0;

		}

	}


	/* end of thread */
	CUT_THREADEND;
}
Пример #20
0
    /* Deleter functor: rebind the owning context (when one is set) so the
     * handle is disposed under the context that created it. */
    void operator()(Handle handle) const {
	if (ctx) cuda_check( cuCtxSetCurrent(ctx) );
        deleter_impl<Handle>::dispose(handle);
    }
Пример #21
0
 /// Binds the context to the calling CPU thread.
 /// NOTE(review): cuda_check presumably validates the CUresult — confirm
 /// its failure behavior against its definition.
 void set_current() const {
     cuda_check( cuCtxSetCurrent( c.get() ) );
 }
Пример #22
0
/* Restore the previously active context (ctx->old); the driver call is
 * skipped when it is already the current one. */
void cuda_exit(cuda_context *ctx) {
  if (ctx->old == ctx->ctx)
    return;
  cuCtxSetCurrent(ctx->old);
}
Пример #23
0
/*
 * One-time CUDA driver-API initialization for this process:
 *   - initializes the driver and (nominally) enumerates devices,
 *   - creates one context per device and loads the precompiled .cubin,
 *   - resolves every kernel entry point used by the detector,
 *   - records per-device block-size limits in NR_MAXTHREADS_X/Y.
 *
 * Fills the file-scope globals: device_num, dev, ctx, module, the *_func
 * kernel handles, NR_MAXTHREADS_X/Y and max_threads_num.  Any driver-API
 * failure terminates via CUDA_CHECK.
 * NOTE(review): malloc() results are not checked here — assumed to succeed.
 */
void init_cuda(void)
{
    CUresult res;
    // path of the precompiled kernel module, baked in at compile time
    std::string cubin_path(STR(CUBIN_PATH));

    // initialize the CUDA driver API (must precede every other cu* call)
    res = cuInit(0);
    CUDA_CHECK(res, "cuInit()");

    // count the number of usable GPUs
    res = cuDeviceGetCount(&device_num);
    CUDA_CHECK(res, "cuDeviceGetCount()");

    // multi-GPU is unsupported: force a single device regardless of the count
    device_num = 1;

    // get a handle for each device
    dev = (CUdevice*) malloc(device_num * sizeof(CUdevice));

    for (int i = 0; i < device_num; i++)
    {
        res = cuDeviceGet(&dev[i], i);
        CUDA_CHECK(res, "cuDeviceGet()");
    }

    ctx = (CUcontext*) malloc(device_num * sizeof(CUcontext));

    module = (CUmodule*) malloc(device_num * sizeof(CUmodule));

    // per-device kernel handles, one array slot per device
    ConvolutionKernel_func = (CUfunction*) malloc(
            device_num * sizeof(CUfunction));
    DistanceTransformTwoDimensionalProblem_func = (CUfunction*) malloc(
            device_num * sizeof(CUfunction));
    BilinearKernelTex32F_func = (CUfunction*) malloc(
            device_num * sizeof(CUfunction));
    calculateHistogram_func = (CUfunction*) malloc(
            device_num * sizeof(CUfunction));
    getFeatureMaps_func = (CUfunction*) malloc(device_num * sizeof(CUfunction));
    calculateNorm_func = (CUfunction*) malloc(device_num * sizeof(CUfunction));
    normalizeAndTruncate_func = (CUfunction*) malloc(
            device_num * sizeof(CUfunction));
    PCAFeatureMapsAddNullableBorder_func = (CUfunction*) malloc(
            device_num * sizeof(CUfunction));

    // one context per device
    for (int i = 0; i < device_num; i++)
    {
        res = cuCtxCreate(&ctx[i], 0, dev[i]);
        CUDA_CHECK(res, "cuCtxCreate()");
    }

    // load the module and resolve kernels inside each device's context
    for (int i = 0; i < device_num; i++)
    {

        res = cuCtxSetCurrent(ctx[i]);
        CUDA_CHECK(res, "cuCtxSetCurrent()");

        // load .cubin file
        res = cuModuleLoad(&module[i], cubin_path.c_str());
        CUDA_CHECK(res, "cuModuleLoad()");

        res = cuModuleGetFunction(&ConvolutionKernel_func[i], module[i],
                "ConvolutionKernel");
        CUDA_CHECK(res, "cuModuleGetFunction(ConvolutionKernel)");

        res = cuModuleGetFunction(
                &DistanceTransformTwoDimensionalProblem_func[i], module[i],
                "DistanceTransformTwoDimensionalProblemKernel");
        CUDA_CHECK(res, "cuModuleGetFunction(DistanceTransformTwoDimensionalProblemKernel)");

        res = cuModuleGetFunction(&BilinearKernelTex32F_func[i], module[i],
                "BilinearKernelTex32F");
        CUDA_CHECK(res, "cuModuleGetFunction(BilinearKernelTex32F)");

        res = cuModuleGetFunction(&calculateHistogram_func[i], module[i],
                "calculateHistogram");
        CUDA_CHECK(res, "cuModuleGetFunction(calculateHistogram)");

        res = cuModuleGetFunction(&getFeatureMaps_func[i], module[i],
                "getFeatureMaps");
        CUDA_CHECK(res, "cuModuleGetFunction(getFeatureMaps)");

        res = cuModuleGetFunction(&calculateNorm_func[i], module[i],
                "calculateNorm");
        CUDA_CHECK(res, "cuModuleGetFunction(calculateNorm)");

        res = cuModuleGetFunction(&normalizeAndTruncate_func[i], module[i],
                "normalizeAndTruncate");
        CUDA_CHECK(res, "cuModuleGetFunction(normalizeAndTruncate)");

        res = cuModuleGetFunction(&PCAFeatureMapsAddNullableBorder_func[i],
                module[i], "PCAFeatureMapsAddNullableBorder");
        CUDA_CHECK(res, "cuModuleGetFunction(PCAFeatureMapsAddNullableBorder)");
    }

    NR_MAXTHREADS_X = (int*) malloc(device_num * sizeof(int));
    NR_MAXTHREADS_Y = (int*) malloc(device_num * sizeof(int));

    for (int i = 0; i < device_num; i++)
    {
        // get max thread num per block
        max_threads_num = 0;
        res = cuDeviceGetAttribute(&max_threads_num,
                CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev[i]);
        CUDA_CHECK(res, "cuDeviceGetAttribute()");

        // square block shape: sqrt(max) per dimension
        NR_MAXTHREADS_X[i] = (int) sqrt((double) max_threads_num);
        NR_MAXTHREADS_Y[i] = (int) sqrt((double) max_threads_num);
    }

}
Пример #24
0
/*
 * Detect object bounding boxes with a deformable part model (DPM) on the GPU.
 *
 * features       : feature pyramid, one FLOAT array per level
 * scales         : per-level scale factors of the pyramid
 * feature_size   : per-level [rows, cols] pairs (feature_size[level*2 + {0,1}])
 * MO             : trained model (root filters, part filters, model info)
 * detected_count : out-parameter, receives the number of detected boxes
 * acc_score      : accumulated score image (IM_HEIGHT x IM_WIDTH), updated and
 *                  normalized to [0,1] in place
 * thresh         : detection score threshold
 *
 * Returns a calloc'ed array of detected_count boxes; each box occupies
 * (numpart[0]+1)*4+3 FLOATs (root box, part boxes, component id, score,
 * scale).  The caller owns the returned buffer.
 *
 * Fixes relative to the previous revision:
 *  - pm_size_array was malloc'ed twice, leaking the first allocation;
 *  - Ix_array[0][0][0] / Iy_array[0][0][0] were freed even though those
 *    elements are only assigned when NoP > 0 (and level 0 is not skipped),
 *    which freed an uninitialized pointer; the backing stores dst_Ix_kk /
 *    dst_Iy_kk are now hoisted and freed directly;
 *  - new_PADsize leaked when no part matching was performed;
 *  - abs(thresh) truncated the threshold to an integer; fabs() is used.
 */
FLOAT *dpm_ttic_gpu_get_boxes(FLOAT **features,FLOAT *scales,int *feature_size, GPUModel *MO,
			      int *detected_count, FLOAT *acc_score, FLOAT thresh)
{
	//constant parameters
	const int max_scale = MO->MI->max_scale;
	const int interval = MO->MI->interval;
	const int sbin = MO->MI->sbin;
	const int padx = MO->MI->padx;
	const int pady = MO->MI->pady;
	const int NoR = MO->RF->NoR;
	const int NoP = MO->PF->NoP;
	const int NoC = MO->MI->numcomponent;
	const int *numpart = MO->MI->numpart;
	const int LofFeat=(max_scale+interval)*NoC;
	const int L_MAX = max_scale+interval;

	/* for measurement */
	struct timeval tv;
	struct timeval tv_make_c_start, tv_make_c_end;
	struct timeval tv_nucom_start, tv_nucom_end;
	struct timeval tv_box_start, tv_box_end;
	float time_box=0;
	struct timeval tv_root_score_start, tv_root_score_end;
	float time_root_score = 0;
	struct timeval tv_part_score_start, tv_part_score_end;
	float time_part_score = 0;
	struct timeval tv_dt_start, tv_dt_end;
	float time_dt = 0;
	struct timeval tv_calc_a_score_start, tv_calc_a_score_end;
	float time_calc_a_score = 0;

	gettimeofday(&tv_make_c_start, nullptr);

	int **RF_size = MO->RF->root_size;
	int *rootsym = MO->RF->rootsym;
	int *part_sym = MO->PF->part_sym;
	int **part_size = MO->PF->part_size;
	FLOAT **rootfilter = MO->RF->rootfilter;
	FLOAT **partfilter=MO->PF->partfilter;
	int **psize = MO->MI->psize;

	/* NOTE: pm_size_array was previously malloc'ed twice here (leak) */
	int **rm_size_array = (int **)malloc(sizeof(int *)*L_MAX);
	int **pm_size_array = (int **)malloc(sizeof(int *)*L_MAX);

	FLOAT **Tboxes=(FLOAT**)calloc(LofFeat,sizeof(FLOAT*)); //box coordinate information(Temp)
	int  *b_nums =(int*)calloc(LofFeat,sizeof(int)); //length of Tboxes
	int count = 0;
	int detected_boxes=0;
	CUresult res;

	/* matched score (root and part) */
	FLOAT ***rootmatch,***partmatch = nullptr;

	int *new_PADsize;  // need new_PADsize[L_MAX*3]
	size_t SUM_SIZE_feat = 0;

	FLOAT **featp2 = (FLOAT **)malloc(L_MAX*sizeof(FLOAT *));


	if(featp2 == nullptr) {  // error semantics
		printf("allocate featp2 failed\n");
		exit(1);
	}


	/* allocate required memory for new_PADsize */
	new_PADsize = (int *)malloc(L_MAX*3*sizeof(int));
	if(new_PADsize == nullptr) {     // error semantics
		printf("allocate new_PADsize failed\n");
		exit(1);
	}

	/* do padarray once and reuse it at calculating root and part time */

	/* calculate sum of size of padded feature */
	for(int tmpL=0; tmpL<L_MAX; tmpL++) {
		int PADsize[3] = { feature_size[tmpL*2], feature_size[tmpL*2+1], 31 };
		int NEW_Y = PADsize[0] + pady*2;
		int NEW_X = PADsize[1] + padx*2;
		SUM_SIZE_feat += (NEW_X*NEW_Y*PADsize[2])*sizeof(FLOAT);
	}

	/* allocate region for padded feat in a lump (pinned, device-mapped) */
	FLOAT *dst_feat;
	res = cuMemHostAlloc((void **)&dst_feat, SUM_SIZE_feat, CU_MEMHOSTALLOC_DEVICEMAP);
	if(res != CUDA_SUCCESS) {
		printf("cuMemHostAlloc(dst_feat) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	memset(dst_feat, 0, SUM_SIZE_feat);  // zero clear

	/* distribute allocated region */
	uintptr_t pointer_feat = (uintptr_t)dst_feat;
	for(int tmpL=0; tmpL<L_MAX; tmpL++) {

		featp2[tmpL] = (FLOAT *)pointer_feat;
		int PADsize[3] = { feature_size[tmpL*2], feature_size[tmpL*2+1], 31 };
		int NEW_Y = PADsize[0] + pady*2;
		int NEW_X = PADsize[1] + padx*2;
		pointer_feat += (uintptr_t)(NEW_X*NEW_Y*PADsize[2]*sizeof(FLOAT));

	}

	/* copy feat to feat2 (column-major copy into the padded frame) */
	for(int tmpL=0; tmpL<L_MAX; tmpL++) {

		int PADsize[3] = { feature_size[tmpL*2], feature_size[tmpL*2+1], 31 };
		int NEW_Y = PADsize[0] + pady*2;
		int NEW_X = PADsize[1] + padx*2;
		int L = NEW_Y*padx;
		int SPL = PADsize[0] + pady;
		int M_S = sizeof(FLOAT)*PADsize[0];
		FLOAT *P = featp2[tmpL];
		FLOAT *S = features[tmpL];

		for(int i=0; i<PADsize[2]; i++)
		{
			P += L;
			for(int j=0; j<PADsize[1]; j++)
			{
				P += pady;
				memcpy(P, S, M_S);
				S += PADsize[0];
				P += SPL;
			}
			P += L;
		}

		new_PADsize[tmpL*3] = NEW_Y;
		new_PADsize[tmpL*3 + 1] = NEW_X;
		new_PADsize[tmpL*3 + 2] = PADsize[2];

	}

	/* do padarray once and reuse it at calculating root and part time */

	/* allocation in a lump */
	int *dst_rm_size = (int *)malloc(sizeof(int)*NoC*2*L_MAX);
	if(dst_rm_size == nullptr) {
		printf("allocate dst_rm_size failed\n");
		exit(1);
	}

	/* distribution to rm_size_array[L_MAX] */
	uintptr_t ptr = (uintptr_t)dst_rm_size;
	for(int i=0; i<L_MAX; i++) {
		rm_size_array[i] = (int *)ptr;
		ptr += (uintptr_t)(NoC*2*sizeof(int));
	}

	/* allocation in a lump */
	int *dst_pm_size = (int *)malloc(sizeof(int)*NoP*2*L_MAX);
	if(dst_pm_size == nullptr) {
		printf("allocate dst_pm_size failed\n");
		exit(1);
	}

	/* distribution to pm_size_array[L_MAX] */
	ptr = (uintptr_t)dst_pm_size;
	for(int i=0; i<L_MAX; i++) {
		pm_size_array[i] = (int *)ptr;
		ptr += (uintptr_t)(NoP*2*sizeof(int));
	}


	///////level
	for (int level=interval; level<L_MAX; level++)  // feature's loop(A's loop) 1level 1picture
	{
		if(feature_size[level*2]+2*pady<MO->MI->max_Y ||(feature_size[level*2+1]+2*padx<MO->MI->max_X))
		{
			Tboxes[count]=nullptr;
			count++;
			continue;
		}
	}  //for (level)  // feature's loop(A's loop) 1level 1picture

	///////root calculation/////////
	/* calculate model score (only root) */

	gettimeofday(&tv_root_score_start, nullptr);
	rootmatch = fconvsMT_GPU(
		featp2,
		SUM_SIZE_feat,
		rootfilter,
		rootsym,
		1,
		NoR,
		new_PADsize,
		RF_size, rm_size_array,
		L_MAX,
		interval,
		feature_size,
		padx,
		pady,
		MO->MI->max_X,
		MO->MI->max_Y,
		ROOT
		);
	gettimeofday(&tv_root_score_end, nullptr);
	tvsub(&tv_root_score_end, &tv_root_score_start, &tv);
	time_root_score += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;

	///////part calculation/////////
	if(NoP>0)
	{
		/* calculate model score (only part) */
		gettimeofday(&tv_part_score_start, nullptr);
		partmatch = fconvsMT_GPU(
			featp2,
			SUM_SIZE_feat,
			partfilter,
			part_sym,
			1,
			NoP,
			new_PADsize,
			part_size,
			pm_size_array,
			L_MAX,
			interval,
			feature_size,
			padx,
			pady,
			MO->MI->max_X,
			MO->MI->max_Y,
			PART
			);
		gettimeofday(&tv_part_score_end, nullptr);
		tvsub(&tv_part_score_end, &tv_part_score_start, &tv);
		time_part_score += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;

	}

	res = cuCtxSetCurrent(ctx[0]);
	if(res != CUDA_SUCCESS) {
		printf("cuCtxSetCurrent(ctx[0]) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	gettimeofday(&tv_make_c_end, nullptr);
	gettimeofday(&tv_nucom_start, nullptr);

	count = 0;
	detected_boxes = 0;

	/* per-level, per-component scratch tables (consolidated allocations) */
	int **RL_array = (int **)malloc((L_MAX-interval)*sizeof(int*));
	int *dst_RL = (int *) malloc(NoC*(L_MAX-interval)*sizeof(int));

	int **RI_array = (int **)malloc((L_MAX-interval)*sizeof(int*));
	int *dst_RI = (int *)malloc(NoC*(L_MAX-interval)*sizeof(int));

	int **OI_array = (int **)malloc((L_MAX-interval)*sizeof(int*));
	int *dst_OI = (int *)malloc((NoC)*(L_MAX-interval)*sizeof(int));

	int **RL_S_array = (int **)malloc((L_MAX-interval)*sizeof(int*));
	int *dst_RL_S = (int *)malloc(NoC*(L_MAX-interval)*sizeof(int));


	FLOAT **OFF_array = (FLOAT **)malloc((L_MAX-interval)*sizeof(FLOAT*));
	FLOAT *dst_OFF = (FLOAT *)malloc(NoC*(L_MAX-interval)*sizeof(FLOAT));

	FLOAT ***SCORE_array = (FLOAT ***)malloc((L_MAX-interval)*sizeof(FLOAT **));
	FLOAT **sub_dst_SCORE = (FLOAT **)malloc(NoC*(L_MAX-interval)*sizeof(FLOAT*));

	uintptr_t pointer_RL = (uintptr_t)dst_RL;
	uintptr_t pointer_RI = (uintptr_t)dst_RI;
	uintptr_t pointer_OI = (uintptr_t)dst_OI;
	uintptr_t pointer_RL_S = (uintptr_t)dst_RL_S;
	uintptr_t pointer_OFF = (uintptr_t)dst_OFF;
	uintptr_t pointer_SCORE = (uintptr_t)sub_dst_SCORE;
	for (int level=interval; level<L_MAX; level++) {

		int L=level-interval;

		RL_array[L] = (int *)pointer_RL;
		pointer_RL += (uintptr_t)NoC*sizeof(int);

		RI_array[L] = (int *)pointer_RI;
		pointer_RI += (uintptr_t)NoC*sizeof(int);

		OI_array[L] = (int *)pointer_OI;
		pointer_OI += (uintptr_t)NoC*sizeof(int);

		RL_S_array[L] = (int *)pointer_RL_S;
		pointer_RL_S += (uintptr_t)NoC*sizeof(int);

		OFF_array[L] = (FLOAT *)pointer_OFF;
		pointer_OFF += (uintptr_t)NoC*sizeof(FLOAT);

		SCORE_array[L] = (FLOAT **)pointer_SCORE;
		pointer_SCORE += (uintptr_t)NoC*sizeof(FLOAT*);
	}

	int sum_RL_S = 0;
	int sum_SNJ = 0;
	/* prepare for parallel execution */
	for(int level=interval; level<L_MAX; level++) {
		int L = level - interval;

		if(feature_size[level*2]+2*pady<MO->MI->max_Y ||(feature_size[level*2+1]+2*padx<MO->MI->max_X))
		{
			continue;
		}

		for(int j=0; j<NoC; j++) {

			/* root score + offset */
			RL_array[L][j] = rm_size_array[level][j*2]*rm_size_array[level][j*2+1];  //length of root-matching
			RI_array[L][j] = MO->MI->ridx[j];  //root-index
			OI_array[L][j] =  MO->MI->oidx[j];  //offset-index
			RL_S_array[L][j] =sizeof(FLOAT)*RL_array[L][j];


			OFF_array[L][j] = MO->MI->offw[RI_array[L][j]];  //offset information


			/* search max values (file-scope accumulators) */
			max_RL_S = (max_RL_S < RL_S_array[L][j]) ? RL_S_array[L][j] : max_RL_S;
			max_numpart = (max_numpart < numpart[j]) ? numpart[j] : max_numpart;
		}
	}

	sum_RL_S = max_RL_S*NoC*(L_MAX-interval);

	/* root matching size */
	sum_SNJ = sizeof(int*)*max_numpart*NoC*(L_MAX-interval);

	/* consolidated allocation for SCORE_array and distribute region */
	FLOAT *dst_SCORE = (FLOAT *)malloc(sum_RL_S);
	pointer_SCORE = (uintptr_t)dst_SCORE;
	for(int level=interval; level<L_MAX; level++) {
		int L = level - interval;

		if(feature_size[level*2]+2*pady<MO->MI->max_Y ||(feature_size[level*2+1]+2*padx<MO->MI->max_X))
		{
			continue;
		}

		for(int j=0; j<NoC; j++) {
			SCORE_array[L][j] = (FLOAT *)pointer_SCORE;
			pointer_SCORE += (uintptr_t)max_RL_S;
		}
	}

	/* add offset */
	for(int level=interval; level<L_MAX; level++) {
		int L = level - interval;

		if(feature_size[level*2]+2*pady<MO->MI->max_Y ||(feature_size[level*2+1]+2*padx<MO->MI->max_X))
		{
			continue;
		}

		for(int j=0; j<NoC; j++) {
			memcpy(SCORE_array[L][j], rootmatch[level][j], RL_S_array[L][j]);
			FLOAT *SC_S = SCORE_array[L][j];
			FLOAT *SC_E = SCORE_array[L][j]+RL_array[L][j];
			while(SC_S<SC_E) *(SC_S++)+=OFF_array[L][j];
		}
	}

	/* anchor matrix */  // consolidated allocation
	int ***ax_array = (int ***)malloc((L_MAX-interval)*sizeof(int **));
	int **sub_dst_ax = (int **)malloc(NoC*(L_MAX-interval)*sizeof(int *));
	int *dst_ax = (int *)malloc(sum_SNJ);

	int ***ay_array = (int ***)malloc((L_MAX-interval)*sizeof(int **));
	int **sub_dst_ay = (int **)malloc(NoC*(L_MAX-interval)*sizeof(int *));
	int *dst_ay = (int *)malloc(sum_SNJ);

	/* boudary index */  // consolidated allocation
	int ****Ix_array =(int ****)malloc((L_MAX-interval)*sizeof(int ***));
	int ***sub_dst_Ix = (int ***)malloc(NoC*(L_MAX-interval)*sizeof(int **));
	int **dst_Ix = (int **)malloc(sum_SNJ);

	int ****Iy_array = (int ****)malloc((L_MAX-interval)*sizeof(int ***));
	int ***sub_dst_Iy = (int ***)malloc(NoC*(L_MAX-interval)*sizeof(int **));
	int **dst_Iy = (int **)malloc(sum_SNJ);

	/* distribute region */
	uintptr_t pointer_ax = (uintptr_t)sub_dst_ax;
	uintptr_t pointer_ay = (uintptr_t)sub_dst_ay;
	uintptr_t pointer_Ix = (uintptr_t)sub_dst_Ix;
	uintptr_t pointer_Iy = (uintptr_t)sub_dst_Iy;
	for(int level=interval; level<L_MAX; level++) {
		int L = level - interval;

		if(feature_size[level*2]+2*pady<MO->MI->max_Y ||(feature_size[level*2+1]+2*padx<MO->MI->max_X))
		{
			continue;
		}

		ax_array[L] = (int **)pointer_ax;
		pointer_ax += (uintptr_t)(NoC*sizeof(int*));

		ay_array[L] = (int **)pointer_ay;
		pointer_ay += (uintptr_t)(NoC*sizeof(int*));

		Ix_array[L] = (int ***)pointer_Ix;
		pointer_Ix += (uintptr_t)(NoC*sizeof(int**));

		Iy_array[L] = (int ***)pointer_Iy;
		pointer_Iy += (uintptr_t)(NoC*sizeof(int**));
	}

	pointer_ax = (uintptr_t)dst_ax;
	pointer_ay = (uintptr_t)dst_ay;
	pointer_Ix = (uintptr_t)dst_Ix;
	pointer_Iy = (uintptr_t)dst_Iy;
	for(int level=interval; level<L_MAX; level++) {
		int L = level - interval;

		if(feature_size[level*2]+2*pady<MO->MI->max_Y ||(feature_size[level*2+1]+2*padx<MO->MI->max_X))
		{
			continue;
		}

		for(int j=0; j<NoC; j++) {
			uintptr_t pointer_offset = sizeof(int*)*max_numpart;

			ax_array[L][j] = (int *)pointer_ax;
			pointer_ax += pointer_offset;

			ay_array[L][j] = (int *)pointer_ay;
			pointer_ay += pointer_offset;

			Ix_array[L][j] = (int **)pointer_Ix;
			pointer_Ix += pointer_offset;

			Iy_array[L][j] = (int **)pointer_Iy;
			pointer_Iy += pointer_offset;
		}
	}

	/* index-matrix backing stores; only allocated when the model has parts.
	   Hoisted out of the if(NoP>0) block so the cleanup below can free them
	   unconditionally (free(nullptr) is a no-op). */
	int *dst_Ix_kk = nullptr;
	int *dst_Iy_kk = nullptr;

	/* add parts */
	if(NoP>0)
        {
		/* arrays to store temporary loop variables */
		int tmp_array_size = 0;
		for(int level=interval; level<L_MAX; level++) {
			if(feature_size[level*2]+2*pady<MO->MI->max_Y ||(feature_size[level*2+1]+2*padx<MO->MI->max_X))
			{
				continue;
			}

			for(int j=0; j<NoC; j++) {
				tmp_array_size += max_numpart*sizeof(int);
			}
		}

		int ***DIDX_array = (int ***)malloc((L_MAX-interval)*sizeof(int**));
		int **sub_dst_DIDX = (int **)malloc(NoC*(L_MAX-interval)*sizeof(int*));
		int *dst_DIDX = (int *)malloc(tmp_array_size);


		int ***DID_4_array = (int ***)malloc((L_MAX-interval)*sizeof(int **));
		int **sub_dst_DID_4 = (int **)malloc(NoC*(L_MAX-interval)*sizeof(int*));
		int *dst_DID_4;
		res = cuMemHostAlloc((void **)&dst_DID_4, tmp_array_size, CU_MEMHOSTALLOC_DEVICEMAP);
		if(res != CUDA_SUCCESS) {
			printf("cuMemHostAlloc(dst_DID_4) failed: res = %s\n", cuda_response_to_string(res));
			exit(1);
		}


		int ***PIDX_array = (int ***)malloc((L_MAX-interval)*sizeof(int **));
		int **sub_dst_PIDX = (int **)malloc(NoC*(L_MAX-interval)*sizeof(int*));
		int *dst_PIDX;
		res = cuMemHostAlloc((void **)&dst_PIDX, tmp_array_size, CU_MEMHOSTALLOC_DEVICEMAP);
		if(res != CUDA_SUCCESS) {
			printf("cuMemHostAlloc(dst_PIDX) failed: res = %s\n", cuda_response_to_string(res));
			exit(1);
		}

		/* distribute consolidated region */
		uintptr_t pointer_DIDX = (uintptr_t)sub_dst_DIDX;
		uintptr_t pointer_DID_4 = (uintptr_t)sub_dst_DID_4;
		uintptr_t pointer_PIDX = (uintptr_t)sub_dst_PIDX;
		for(int level=interval; level<L_MAX; level++) {
			int L = level - interval;

			if(feature_size[level*2]+2*pady<MO->MI->max_Y ||(feature_size[level*2+1]+2*padx<MO->MI->max_X))
			{
				continue;
			}

			DIDX_array[L] = (int **)pointer_DIDX;
			pointer_DIDX += (uintptr_t)(NoC*sizeof(int*));

			DID_4_array[L] = (int **)pointer_DID_4;
			pointer_DID_4 += (uintptr_t)(NoC*sizeof(int*));

			PIDX_array[L] = (int **)pointer_PIDX;
			pointer_PIDX += (uintptr_t)(NoC*sizeof(int*));
		}

		pointer_DIDX = (uintptr_t)dst_DIDX;
		pointer_DID_4 = (uintptr_t)dst_DID_4;
		pointer_PIDX = (uintptr_t)dst_PIDX;
		for(int level=interval; level<L_MAX; level++) {
			int L = level - interval;

			if(feature_size[level*2]+2*pady<MO->MI->max_Y ||(feature_size[level*2+1]+2*padx<MO->MI->max_X)) {
				continue;
			}

			for(int j=0; j<NoC; j++) {
				uintptr_t pointer_offset = (uintptr_t)(max_numpart*sizeof(int));

				DIDX_array[L][j] = (int *)pointer_DIDX;
				pointer_DIDX += pointer_offset;

				DID_4_array[L][j] = (int *)pointer_DID_4;
				pointer_DID_4 += pointer_offset;

				PIDX_array[L][j] = (int *)pointer_PIDX;
				pointer_PIDX += pointer_offset;
			}
		}

		/* prepare for parallel execution */
		int sum_size_index_matrix = 0;
		for(int level=interval; level<L_MAX; level++) {
			int L = level - interval;

			if(feature_size[level*2]+2*pady<MO->MI->max_Y ||(feature_size[level*2+1]+2*padx<MO->MI->max_X)) {
				continue;
			}

			for(int j=0; j<NoC; j++) {
				for (int k=0;k<numpart[j];k++) {
					/* assign values to each element */
					DIDX_array[L][j][k] = MO->MI->didx[j][k];
					DID_4_array[L][j][k] = DIDX_array[L][j][k]*4;
					PIDX_array[L][j][k] = MO->MI->pidx[j][k];

					/* anchor */
					ax_array[L][j][k] = MO->MI->anchor[DIDX_array[L][j][k]*2]+1;
					ay_array[L][j][k] = MO->MI->anchor[DIDX_array[L][j][k]*2+1]+1;

					int PSSIZE[2] ={pm_size_array[L][PIDX_array[L][j][k]*2], pm_size_array[L][PIDX_array[L][j][k]*2+1]}; // size of C

					/* index matrix */
					sum_size_index_matrix += sizeof(int)*PSSIZE[0]*PSSIZE[1];
				}
			}
		}

		dst_Ix_kk = (int *)malloc(sum_size_index_matrix);
		dst_Iy_kk = (int *)malloc(sum_size_index_matrix);
		uintptr_t pointer_Ix_kk = (uintptr_t)dst_Ix_kk;
		uintptr_t pointer_Iy_kk = (uintptr_t)dst_Iy_kk;
		for(int level=interval; level<L_MAX; level++) {
			int L = level - interval;

			if(feature_size[level*2]+2*pady<MO->MI->max_Y ||(feature_size[level*2+1]+2*padx<MO->MI->max_X))
			{
				continue;
			}

			for(int j=0; j<NoC; j++) {
				for (int k=0;k<numpart[j];k++) {
					int PSSIZE[2] ={pm_size_array[L][PIDX_array[L][j][k]*2], pm_size_array[L][PIDX_array[L][j][k]*2+1]}; // size of C

					Ix_array[L][j][k] = (int *)pointer_Ix_kk;
					Iy_array[L][j][k] = (int *)pointer_Iy_kk;

					pointer_Ix_kk += (uintptr_t)(sizeof(int)*PSSIZE[0]*PSSIZE[1]);
					pointer_Iy_kk += (uintptr_t)(sizeof(int)*PSSIZE[0]*PSSIZE[1]);
				}
			}
		}

		gettimeofday(&tv_dt_start, nullptr);
		FLOAT ****M_array = dt_GPU(
			Ix_array,      // int ****Ix_array
			Iy_array,      // int ****Iy_array
			PIDX_array,    // int ***PIDX_array
			pm_size_array, // int **size_array
			NoP,           // int NoP
			numpart,       // int *numpart
			NoC,           // int NoC
			interval,      // int interval
			L_MAX,         // int L_MAX
			feature_size,         // int *feature_size,
			padx,          // int padx,
			pady,          // int pady,
			MO->MI->max_X, // int max_X
			MO->MI->max_Y, // int max_Y
			MO->MI->def, // FLOAT *def
			tmp_array_size, // int tmp_array_size
			dst_PIDX, // int *dst_PIDX
			dst_DID_4 // int *DID_4
			);
		gettimeofday(&tv_dt_end, nullptr);
		tvsub(&tv_dt_end, &tv_dt_start, &tv);
		time_dt += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;

		/* add part score */
		for(int level=interval; level<L_MAX; level++){
			int L = level - interval;

			if(feature_size[level*2]+2*pady<MO->MI->max_Y ||(feature_size[level*2+1]+2*padx<MO->MI->max_X))
			{
				continue;
			}

			for(int j=0; j<NoC; j++) {
				for(int k=0; k<numpart[j]; k++) {
					int PSSIZE[2] ={pm_size_array[L][PIDX_array[L][j][k]*2],
							pm_size_array[L][PIDX_array[L][j][k]*2+1]}; // Size of C
					int R_S[2]={rm_size_array[level][j*2], rm_size_array[level][j*2+1]};

					dpm_ttic_add_part_calculation(SCORE_array[L][j], M_array[L][j][k], R_S,
								      PSSIZE, ax_array[L][j][k], ay_array[L][j][k]);
				}
			}
		}

		s_free(M_array[0][0][0]);
		s_free(M_array[0][0]);
		s_free(M_array[0]);
		s_free(M_array);

		/* free temporary arrays */
		free(dst_DIDX);
		free(sub_dst_DIDX);
		free(DIDX_array);

		res = cuMemFreeHost(dst_DID_4);
		if(res != CUDA_SUCCESS) {
			printf("cuMemFreeHost(dst_DID_4) failed: res = %s\n", cuda_response_to_string(res));
			exit(1);
		}
		free(sub_dst_DID_4);
		free(DID_4_array);

		res = cuMemFreeHost(dst_PIDX);
		if(res != CUDA_SUCCESS) {
			printf("cuMemFreeHost(dst_PIDX) failed: res = %s\n", cuda_response_to_string(res));
			exit(1);
		}

		free(sub_dst_PIDX);
		free(PIDX_array);

		res = cuCtxSetCurrent(ctx[0]);
		if(res != CUDA_SUCCESS) {
			printf("cuCtxSetCurrent(ctx[0]) failed: res = %s\n", cuda_response_to_string(res));
			exit(1);
		}
        } // start from if(NoP>0)

	/* combine root and part score and detect boundary box for each-component */

	FLOAT *scale_array = (FLOAT *)malloc((L_MAX-interval)*sizeof(FLOAT));
	for(int level=interval; level<L_MAX; level++) {
		int L = level - interval;

		if(feature_size[level*2]+2*pady<MO->MI->max_Y ||(feature_size[level*2+1]+2*padx<MO->MI->max_X))
		{
			Tboxes[count]=nullptr;
			count++;
			continue;
		}

		scale_array[L] = (FLOAT)sbin/scales[level];
	}

	for (int level=interval; level<L_MAX; level++)  // feature's loop(A's loop) 1level 1picture
        {
		/* parameters (related for level) */
		int L=level-interval;
		/* matched score size matrix */
		FLOAT scale=(FLOAT)sbin/scales[level];

		/* loop conditon */
		if(feature_size[level*2]+2*pady<MO->MI->max_Y ||(feature_size[level*2+1]+2*padx<MO->MI->max_X)) {
			Tboxes[count]=nullptr;
			count++;
			continue;
		}

		/* calculate accumulated score */
		gettimeofday(&tv_calc_a_score_start, nullptr);

		calc_a_score_GPU(
			acc_score,              // FLOAT *ac_score
			SCORE_array[L],       // FLOAT **score
			rm_size_array[level], // int *ssize_start
			MO->MI,               // Model_info *MI
			scale,                // FLOAT scale
			RL_S_array[L],        // int *size_score_array
			NoC                   // int NoC
			);

		gettimeofday(&tv_calc_a_score_end, nullptr);
		tvsub(&tv_calc_a_score_end, &tv_calc_a_score_start, &tv);
		time_calc_a_score += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;

		for(int j = 0; j <NoC; j++) {
			int R_S[2]={rm_size_array[level][j*2], rm_size_array[level][j*2+1]};

			/* get all good matches */
			int GMN;
			int *GMPC = get_gmpc(SCORE_array[L][j],thresh,R_S,&GMN);
			int RSIZE[2]={MO->MI->rsize[j*2], MO->MI->rsize[j*2+1]};

			int GL = (numpart[j]+1)*4+3;  //31

			/* detected box coordinate(current level) */
			FLOAT *t_boxes = (FLOAT*)calloc(GMN*GL,sizeof(FLOAT));

			gettimeofday(&tv_box_start, nullptr);

			// NO NEED TO USE GPU 
			for(int k = 0;k < GMN;k++) {
				FLOAT *P_temp = t_boxes+GL*k;
				int y = GMPC[2*k];
				int x = GMPC[2*k+1];

				/* calculate root box coordinate */
				FLOAT *RB =rootbox(x,y,scale,padx,pady,RSIZE);
				memcpy(P_temp, RB,sizeof(FLOAT)*4);
				s_free(RB);
				P_temp+=4;

				for(int pp=0;pp<numpart[j];pp++) {
					int PBSIZE[2]={psize[j][pp*2], psize[j][pp*2+1]};
					int Isize[2]={pm_size_array[L][MO->MI->pidx[j][pp]*2], pm_size_array[L][MO->MI->pidx[j][pp]*2+1]};

					/* calculate part box coordinate */
					FLOAT *PB = partbox(x,y,ax_array[L][j][pp],ay_array[L][j][pp],scale,padx,pady,PBSIZE,Ix_array[L][j][pp],Iy_array[L][j][pp],Isize);
					memcpy(P_temp, PB,sizeof(FLOAT)*4);
					P_temp+=4;
					s_free(PB);
				}
				/* component number and score */
				*(P_temp++)=(FLOAT)j; //component number
				*(P_temp++)=SCORE_array[L][j][x*R_S[0]+y]; //score of good match
				*P_temp = scale;
			}

			//  NO NEED TO USE GPU
			gettimeofday(&tv_box_end, nullptr);
			tvsub(&tv_box_end, &tv_box_start, &tv);
			time_box += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;

			/* save box information */
			if (GMN > 0)
				Tboxes[count] = t_boxes;
			else
				Tboxes[count] = nullptr;

			b_nums[count]=GMN;
			count++;
			detected_boxes+=GMN;			//number of detected box

			/* release */
			s_free(GMPC);
		}
		////numcom
        }
	////level

	/* free temporary arrays */
	free(dst_RL);
	free(RL_array);

	free(dst_RI);
	free(RI_array);

	free(dst_OI);
	free(OI_array);

	free(dst_RL_S);
	free(RL_S_array);

	free(dst_OFF);
	free(OFF_array);

	free(dst_SCORE);
	free(sub_dst_SCORE);
	free(SCORE_array);

	free(dst_ax);
	free(sub_dst_ax);
	free(ax_array);

	free(dst_ay);
	free(sub_dst_ay);
	free(ay_array);

	/* was free(Ix_array[0][0][0]) / free(Iy_array[0][0][0]): those elements
	   are uninitialized when NoP==0 or when level 0 was skipped; free the
	   hoisted backing stores instead (no-op when still nullptr) */
	free(dst_Ix_kk);
	free(dst_Ix);
	free(sub_dst_Ix);
	free(Ix_array);

	free(dst_Iy_kk);
	free(dst_Iy);
	free(sub_dst_Iy);
	free(Iy_array);

	free(scale_array);

	gettimeofday(&tv_nucom_end, nullptr);

#ifdef PRINT_INFO
	printf("root SCORE : %f\n", time_root_score);
	printf("part SCORE : %f\n", time_part_score);
	printf("dt  : %f\n", time_dt);
	printf("calc_a_score : %f\n", time_calc_a_score);
#endif
	res = cuCtxSetCurrent(ctx[0]);
	if(res != CUDA_SUCCESS) {
		printf("cuCtxSetCurrent(ctx[0]) failed: res = %s\n",cuda_response_to_string(res));
		exit(1);
	}

	/* free memory regions */
	res = cuMemFreeHost((void *)featp2[0]);
	if(res != CUDA_SUCCESS) {
		printf("cuMemFreeHost(featp2[0]) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	s_free(featp2);

	res = cuMemFreeHost((void *)rootmatch[interval][0]);
	if(res != CUDA_SUCCESS) {
		printf("cuMemFreeHost(rootmatch[0][0]) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}
	s_free(rootmatch[0]);
	s_free(rootmatch);

	if (partmatch != nullptr) {
		res = cuMemFreeHost((void *)partmatch[0][0]);
		if(res != CUDA_SUCCESS) {
			printf("cuMemFreeHost(partmatch[0][0]) failed: res = %s\n", cuda_response_to_string(res));
			exit(1);
		}

		s_free(partmatch[0]);
		s_free(partmatch);
	}

	/* moved out of the partmatch guard: new_PADsize is always allocated,
	   so it leaked whenever no part matching was performed */
	s_free(new_PADsize);

	/* release */
	s_free(rm_size_array[0]);
	s_free(rm_size_array);
	s_free(pm_size_array[0]);
	s_free(pm_size_array);

	/* Output boundary-box coorinate information */
	int GL=(numpart[0]+1)*4+3;
	FLOAT *boxes=(FLOAT*)calloc(detected_boxes*GL,sizeof(FLOAT));		//box coordinate information(Temp)

	FLOAT *T1 = boxes;
	for(int i = 0; i < LofFeat; i++) {
		int num_t = b_nums[i]*GL;
		if(num_t > 0) {
			FLOAT *T2 = Tboxes[i];
			//memcpy_s(T1,sizeof(FLOAT)*num_t,T2,sizeof(FLOAT)*num_t);
			memcpy(T1, T2,sizeof(FLOAT)*num_t);
			T1 += num_t;
		}
	}

	/* fabs, not abs: abs() is the integer overload in C and would truncate */
	FLOAT abs_threshold = fabs(thresh);

	/* accumulated score calculation */
	FLOAT max_score = 0.0;

	/* add offset to accumulated score */
	for(int i = 0; i < MO->MI->IM_HEIGHT*MO->MI->IM_WIDTH; i++) {
		if (acc_score[i] < thresh) {
			acc_score[i] = 0.0;
		} else {
			acc_score[i] += abs_threshold;

			if (acc_score[i] > max_score)
				max_score = acc_score[i];
		}
	}

	/* normalization */
	if (max_score > 0.0) {
		FLOAT ac_ratio = 1.0 / max_score;

		for (int i = 0; i < MO->MI->IM_HEIGHT*MO->MI->IM_WIDTH; i++) {
			acc_score[i] *= ac_ratio;
		}
	}

	/* release */
	free_boxes(Tboxes,LofFeat);
	s_free(b_nums);

	/* output result */
	*detected_count = detected_boxes;
	return boxes;
}
Пример #25
0
/*
 * OCaml FFI stub: build the SPOC device descriptor for CUDA device i.
 *
 * Returns an OCaml record of (general_info, cuda_info, gc_info, event_list);
 * also creates a context and two streams for the device and stashes the raw
 * pointers inside OCaml values (caller-side code treats them as opaque).
 *
 * Fixes: the bounds check used '>' (accepting the out-of-range index
 * nb_devices), and the context struct was allocated with the size of the
 * OpenCL variant (sizeof(spoc_cl_context)) instead of its own type.
 */
value spoc_getCudaDevice(value i)
{
	CAMLparam1(i);
	CAMLlocal4(general_info, cuda_info, specific_info, gc_info);
	CAMLlocal3(device,  maxT, maxG);
	int nb_devices;
	CUdevprop dev_infos;
	CUdevice dev;
	CUcontext ctx;
	CUstream queue[2];
	spoc_cu_context *spoc_ctx;
	//CUcontext gl_ctx;
	char infoStr[1024];
	int infoInt;
	size_t infoUInt;
	int major, minor;


	cuDeviceGetCount (&nb_devices);

	/* valid device indices are 0 .. nb_devices-1 */
	if ((Int_val(i)) >= nb_devices)
		raise_constant(*caml_named_value("no_cuda_device")) ;


	CUDA_CHECK_CALL(cuDeviceGet(&dev, Int_val(i)));
	CUDA_CHECK_CALL(cuDeviceGetProperties(&dev_infos, dev));

	general_info = caml_alloc (9, 0);
	CUDA_CHECK_CALL(cuDeviceGetName(infoStr, sizeof(infoStr), dev));

	Store_field(general_info,0, copy_string(infoStr));//
	CUDA_CHECK_CALL(cuDeviceTotalMem(&infoUInt, dev));

	Store_field(general_info,1, Val_int(infoUInt));//
	Store_field(general_info,2, Val_int(dev_infos.sharedMemPerBlock));//
	Store_field(general_info,3, Val_int(dev_infos.clockRate));//
	Store_field(general_info,4, Val_int(dev_infos.totalConstantMemory));//
	CUDA_CHECK_CALL(cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev));
	Store_field(general_info,5, Val_int(infoInt));//
	CUDA_CHECK_CALL(cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, dev));
	Store_field(general_info,6, Val_bool(infoInt));//
	Store_field(general_info,7, i);
	CUDA_CHECK_CALL(cuCtxCreate	(&ctx,
			CU_CTX_SCHED_BLOCKING_SYNC | CU_CTX_MAP_HOST,
			dev));
	/* was sizeof(spoc_cl_context): wrong (OpenCL) struct size for this type */
	spoc_ctx = malloc(sizeof(spoc_cu_context));
	spoc_ctx->ctx = ctx;
	CUDA_CHECK_CALL(cuStreamCreate(&queue[0], 0));
	CUDA_CHECK_CALL(cuStreamCreate(&queue[1], 0));
	spoc_ctx->queue[0] = queue[0];
	spoc_ctx->queue[1] = queue[1];
	Store_field(general_info,8, (value)spoc_ctx);
	CUDA_CHECK_CALL(cuCtxSetCurrent(ctx));


	cuda_info = caml_alloc(1, 0); //0 -> Cuda
	specific_info = caml_alloc(18, 0);

	cuDeviceComputeCapability(&major, &minor, dev);
	Store_field(specific_info,0, Val_int(major));//
	Store_field(specific_info,1, Val_int(minor));//
	Store_field(specific_info,2, Val_int(dev_infos.regsPerBlock));//
	Store_field(specific_info,3, Val_int(dev_infos.SIMDWidth));//
	Store_field(specific_info,4, Val_int(dev_infos.memPitch));//
	Store_field(specific_info,5, Val_int(dev_infos.maxThreadsPerBlock));//

	maxT = caml_alloc(3, 0);
	Store_field(maxT,0, Val_int(dev_infos.maxThreadsDim[0]));//
	Store_field(maxT,1, Val_int(dev_infos.maxThreadsDim[1]));//
	Store_field(maxT,2, Val_int(dev_infos.maxThreadsDim[2]));//
	Store_field(specific_info,6, maxT);

	maxG = caml_alloc(3, 0);
	Store_field(maxG,0, Val_int(dev_infos.maxGridSize[0]));//
	Store_field(maxG,1, Val_int(dev_infos.maxGridSize[1]));//
	Store_field(maxG,2, Val_int(dev_infos.maxGridSize[2]));//
	Store_field(specific_info,7, maxG);

	Store_field(specific_info,8, Val_int(dev_infos.textureAlign));//
	cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
	Store_field(specific_info,9, Val_bool(infoInt));//
	cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, dev);
	Store_field(specific_info,10, Val_bool(infoInt));//
	cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
	Store_field(specific_info,11, Val_bool(infoInt));//
	cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
	Store_field(specific_info,12, Val_bool(infoInt));//
	cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
	Store_field(specific_info,13, Val_int(infoInt));//
	cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
	Store_field(specific_info,14, Val_bool(infoInt));//
	cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, dev);
	Store_field(specific_info,15, Val_int(infoInt));
	cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, dev);
	Store_field(specific_info,16, Val_int(infoInt));
	cuDriverGetVersion(&infoInt);
	Store_field(specific_info, 17, Val_int(infoInt));

	Store_field(cuda_info, 0, specific_info);
	device = caml_alloc(4, 0);
	Store_field(device, 0, general_info);
	Store_field(device, 1, cuda_info);

	/* NOTE(review): infoUInt is computed below but never stored into gcInfo;
	   presumably gcInfo is initialized lazily elsewhere — confirm */
	{spoc_cuda_gc_info* gcInfo = (spoc_cuda_gc_info*)malloc(sizeof(spoc_cuda_gc_info));
	CUDA_CHECK_CALL(cuMemGetInfo(&infoUInt, NULL));
	infoUInt -= (32*1024*1024);

	Store_field(device, 2, (value)gcInfo);


	{cuda_event_list* events = NULL;
	Store_field(device, 3, (value)events);



	CAMLreturn(device);}}
}
Пример #26
0
/*
 * Create a VampirTrace CUPTI context. If the CUDA context is not given, the 
 * current context will be requested and used.
 * 
 * @param cuCtx CUDA context
 * @param cuDev CUDA device
 * @param ctxID ID of the CUDA context
 * @param devID ID of the CUDA device
 * 
 * @return pointer to created VampirTrace CUPTI context
 */
vt_cupti_ctx_t* vt_cupti_createCtx(CUcontext cuCtx, CUdevice cuDev,
                                   uint32_t cuCtxID, uint32_t cuDevID)
{
  vt_cupti_ctx_t* vtCtx = NULL;
  
  /* create new context */
  vtCtx = (vt_cupti_ctx_t *)malloc(sizeof(vt_cupti_ctx_t));
  if(vtCtx == NULL) 
    vt_error_msg("[CUPTI] Could not allocate memory for VT CUPTI context!");
    /* NOTE(review): assumes vt_error_msg does not return; if it does, the
       dereferences below hit a NULL pointer — confirm its semantics */
  vtCtx->ctxID = cuCtxID;
#if (defined(VT_CUPTI_ACTIVITY) || defined(VT_CUPTI_CALLBACKS))
  vtCtx->gpuMemAllocated = 0;
  vtCtx->gpuMemList = NULL;
  vtCtx->strmList = NULL;
#endif
  vtCtx->next = NULL;
  
  /* record the VampirTrace thread that created this context */
  VT_CHECK_THREAD;
  vtCtx->ptid = VT_MY_THREAD;
  
  /* try to get CUDA device (ID), if they are not given */
  if(cuDevID == VT_CUPTI_NO_DEVICE_ID){
    if(cuDev == VT_CUPTI_NO_CUDA_DEVICE){
      CUcontext cuCurrCtx;
      
      if(cuCtx != NULL){
        cuCtxGetCurrent(&cuCurrCtx);
      
        /* if given context does not match the current one, get the device for 
           the given one */
        if(cuCtx != cuCurrCtx)
          VT_CUDRV_CALL(cuCtxSetCurrent(cuCtx), NULL);
      }
      
      /* cuCtxGetDevice queries the device of whichever context is now
         current (the given one if we switched above) */
      if(CUDA_SUCCESS == cuCtxGetDevice(&cuDev))
        cuDevID = (uint32_t)cuDev;
      
      /* reset the active context (cuCurrCtx is only set when cuCtx != NULL,
         and that is also the only case this branch runs) */
      if(cuCtx != NULL && cuCtx != cuCurrCtx)
        VT_CUDRV_CALL(cuCtxSetCurrent(cuCurrCtx), NULL);
      
    }else{
      /* no device ID, but CUDA device is given */
      cuDevID = (uint32_t)cuDev;
    }
  }
  
  vtCtx->devID = cuDevID;
  vtCtx->cuDev = cuDev;
  
  /* get the current CUDA context, if it is not given */
  if(cuCtx == NULL) 
    VT_CUDRV_CALL(cuCtxGetCurrent(&cuCtx), NULL);
  
  /* set the CUDA context */
  vtCtx->cuCtx = cuCtx;
  
#if defined(VT_CUPTI_ACTIVITY)
  vtCtx->activity = NULL;
#endif

#if defined(VT_CUPTI_CALLBACKS)
  vtCtx->callbacks = NULL;
#endif
  
#if defined(VT_CUPTI_EVENTS)
  vtCtx->events = NULL;
#endif

  vt_cntl_msg(2, "[CUPTI] Created context for CUcontext %d, CUdevice %d", 
              cuCtx, cuDev);
  
  return vtCtx;
}
Пример #27
0
/*
 * Virtqueue handler for the CUDA paravirtualization backend.
 *
 * The guest frontend marshals CUDA driver-API calls into a `struct param`
 * placed in out_sg[0]; extra scatter-gather slots carry string arguments
 * and data pages.  Each popped element is dispatched on p->syscall_type,
 * the real driver call is executed on the host, its status is stored in
 * p->result, and the element is pushed back exactly once.  A virtual
 * interrupt notifies the frontend after the queue is drained.
 *
 * NOTE(review): the sg-slot layout and the meaning of p->val1/size1/...
 * are a private contract with the guest driver — verify against the
 * frontend before changing any case.
 */
static void vq_handle_output(VirtIODevice *vdev, VirtQueue *vq)
{
	VirtQueueElement elem;

	while (virtqueue_pop(vq, &elem)) {
		struct param *p = elem.out_sg[0].iov_base;

		/* for all library routines: get required arguments from the
		 * buffer, execute, and leave the result for the single
		 * virtqueue_push() at the bottom of the loop */
		switch (p->syscall_type) {
		case CUINIT: {
			p->result = cuInit(p->flags);
			break;
		}
		case CUDRIVERGETVERSION: {
			p->result = cuDriverGetVersion(&p->val1);
			break;
		}
		case CUDEVICEGETCOUNT: {
			p->result = cuDeviceGetCount(&p->val1);
			break;
		}
		case CUDEVICEGET: {
			p->result = cuDeviceGet(&p->device, p->val1);
			break;
		}
		case CUDEVICECOMPUTECAPABILITY: {
			p->result = cuDeviceComputeCapability(&p->val1, &p->val2, p->device);
			break;
		}
		case CUDEVICEGETNAME: {
			p->result = cuDeviceGetName(elem.in_sg[0].iov_base, p->val1, p->device);
			break;
		}
		case CUDEVICEGETATTRIBUTE: {
			p->result = cuDeviceGetAttribute(&p->val1, p->attrib, p->device);
			break;
		}
		case CUCTXCREATE: {
			p->result = cuCtxCreate(&p->ctx, p->flags, p->device);
			break;
		}
		case CUCTXDESTROY: {
			p->result = cuCtxDestroy(p->ctx);
			break;
		}
		case CUCTXGETCURRENT: {
			p->result = cuCtxGetCurrent(&p->ctx);
			break;
		}
		case CUCTXGETDEVICE: {
			p->result = cuCtxGetDevice(&p->device);
			break;
		}
		case CUCTXPOPCURRENT: {
			p->result = cuCtxPopCurrent(&p->ctx);
			break;
		}
		case CUCTXSETCURRENT: {
			p->result = cuCtxSetCurrent(p->ctx);
			break;
		}
		case CUCTXSYNCHRONIZE: {
			p->result = cuCtxSynchronize();
			break;
		}
		case CUMODULELOAD: {
			/* hardcoded path - needs improvement;
			 * all .cubin files are stored under $QEMU_NFS_PATH,
			 * shared between host and guest via NFS */
			char *nfs_path = getenv("QEMU_NFS_PATH");
			char *guest_name = (char *)elem.out_sg[1].iov_base;
			char *binname;
			if (!nfs_path) {
				/* BUGFIX: getenv() was not NULL-checked */
				p->result = 999;
				break;
			}
			/* BUGFIX: the original allocated
			 *   strlen(guest_name) + strlen(nfs_path + 1)
			 * (the +1 was inside strlen's pointer argument),
			 * under-allocating by two bytes and overflowing the
			 * heap in the strcpy/strcat below.  +1 is for NUL. */
			binname = malloc(strlen(guest_name) + strlen(nfs_path) + 1);
			if (!binname) {
				/* BUGFIX: was p->result = 0 (CUDA_SUCCESS) and an
				 * extra virtqueue_push() here, which double-pushed
				 * the element (the loop pushes again below). */
				p->result = 999;
				break;
			}
			strcpy(binname, nfs_path);
			strcat(binname, guest_name);
			/* change current CUDA context: each context has its own
			 * virtual memory space - isolation is ensured by
			 * switching contexts */
			if (cuCtxSetCurrent(p->ctx) != 0) {
				free(binname); /* BUGFIX: was leaked on this path */
				p->result = 999;
				break;
			}
			p->result = cuModuleLoad(&p->module, binname);
			free(binname);
			break;
		}
		case CUMODULEGETGLOBAL: {
			/* BUGFIX: the original did an unbounded strcpy() into a
			 * 100-byte heap buffer (possible overflow) and never
			 * freed it; use a bounded copy into a stack buffer. */
			char name[100];
			strncpy(name, (char *)elem.out_sg[1].iov_base, sizeof(name) - 1);
			name[sizeof(name) - 1] = '\0';
			p->result = cuModuleGetGlobal(&p->dptr, &p->size1, p->module, (const char *)name);
			break;
		}
		case CUMODULEUNLOAD: {
			p->result = cuModuleUnload(p->module);
			break;
		}
		case CUMEMALLOC: {
			if (cuCtxSetCurrent(p->ctx) != 0) {
				p->result = 999;
				break;
			}
			p->result = cuMemAlloc(&p->dptr, p->bytesize);
			break;
		}
		case CUMEMALLOCPITCH: {
			if (cuCtxSetCurrent(p->ctx) != 0) {
				p->result = 999;
				break;
			}
			p->result = cuMemAllocPitch(&p->dptr, &p->size3, p->size1, p->size2, p->bytesize);
			break;
		}
		/* large buffers are allocated in smaller chunks in guest
		 * kernel space; get each chunk separately and copy it to
		 * device memory.  sg layout per chunk i: out_sg[1+2i] holds
		 * the data, out_sg[1+2i+1] holds its length. */
		case CUMEMCPYHTOD: {
			int i;
			size_t offset;
			unsigned long s, nr_pages = p->nr_pages;
			if (cuCtxSetCurrent(p->ctx) != 0) {
				p->result = 999;
				break;
			}
			offset = 0;
			for (i = 0; i < nr_pages; i++) {
				s = *(long *)elem.out_sg[1+2*i+1].iov_base;
				p->result = cuMemcpyHtoD(p->dptr + offset, elem.out_sg[1+2*i].iov_base, s);
				if (p->result != 0) break;
				offset += s;
			}
			break;
		}
		case CUMEMCPYHTODASYNC: {
			int i;
			size_t offset;
			unsigned long s, nr_pages = p->nr_pages;
			if (cuCtxSetCurrent(p->ctx) != 0) {
				p->result = 999;
				break;
			}
			offset = 0;
			for (i = 0; i < nr_pages; i++) {
				s = *(long *)elem.out_sg[1+2*i+1].iov_base;
				p->result = cuMemcpyHtoDAsync(p->dptr + offset, elem.out_sg[1+2*i].iov_base, s, p->stream);
				if (p->result != 0) break;
				offset += s;
			}
			break;
		}
		case CUMEMCPYDTODASYNC: {
			p->result = cuMemcpyDtoDAsync(p->dptr, p->dptr1, p->size1, p->stream);
			break;
		}
		case CUMEMCPYDTOH: {
			int i;
			unsigned long s, nr_pages = p->nr_pages;
			if (cuCtxSetCurrent(p->ctx) != 0) {
				p->result = 999;
				break;
			}
			size_t offset = 0;
			for (i = 0; i < nr_pages; i++) {
				s = *(long *)elem.in_sg[0+2*i+1].iov_base;
				p->result = cuMemcpyDtoH(elem.in_sg[0+2*i].iov_base, p->dptr + offset, s);
				if (p->result != 0) break;
				offset += s;
			}
			break;
		}
		case CUMEMCPYDTOHASYNC: {
			int i;
			unsigned long s, nr_pages = p->nr_pages;
			if (cuCtxSetCurrent(p->ctx) != 0) {
				p->result = 999;
				break;
			}
			size_t offset = 0;
			for (i = 0; i < nr_pages; i++) {
				s = *(long *)elem.in_sg[0+2*i+1].iov_base;
				p->result = cuMemcpyDtoHAsync(elem.in_sg[0+2*i].iov_base, p->dptr + offset, s, p->stream);
				if (p->result != 0) break;
				offset += s;
			}
			break;
		}
		case CUMEMSETD32: {
			p->result = cuMemsetD32(p->dptr, p->bytecount, p->bytesize);
			break;
		}
		case CUMEMFREE: {
			p->result = cuMemFree(p->dptr);
			break;
		}
		case CUMODULEGETFUNCTION: {
			char *name = (char *)elem.out_sg[1].iov_base;
			name[p->length] = '\0'; /* guest supplies length; terminate in place */
			if (cuCtxSetCurrent(p->ctx) != 0) {
				p->result = 999;
				break;
			}
			p->result = cuModuleGetFunction(&p->function, p->module, name);
			break;
		}
		case CULAUNCHKERNEL: {
			/* p->val1 = number of kernel arguments, one per sg slot */
			void **args = malloc(p->val1 * sizeof(void *));
			if (!args) {
				p->result = 9999;
				break;
			}
			int i;
			for (i = 0; i < p->val1; i++) {
				args[i] = elem.out_sg[1+i].iov_base;
			}
			if (cuCtxSetCurrent(p->ctx) != 0) {
				free(args); /* BUGFIX: was leaked on this path */
				p->result = 999;
				break;
			}
			p->result = cuLaunchKernel(p->function,
					p->gridDimX, p->gridDimY, p->gridDimZ,
					p->blockDimX, p->blockDimY, p->blockDimZ,
					p->bytecount, 0, args, 0);
			free(args);
			break;
		}
		case CUEVENTCREATE: {
			p->result = cuEventCreate(&p->event1, p->flags);
			break;
		}
		case CUEVENTDESTROY: {
			p->result = cuEventDestroy(p->event1);
			break;
		}
		case CUEVENTRECORD: {
			p->result = cuEventRecord(p->event1, p->stream);
			break;
		}
		case CUEVENTSYNCHRONIZE: {
			p->result = cuEventSynchronize(p->event1);
			break;
		}
		case CUEVENTELAPSEDTIME: {
			p->result = cuEventElapsedTime(&p->pMilliseconds, p->event1, p->event2);
			break;
		}
		case CUSTREAMCREATE: {
			p->result = cuStreamCreate(&p->stream, 0);
			break;
		}
		case CUSTREAMSYNCHRONIZE: {
			p->result = cuStreamSynchronize(p->stream);
			break;
		}
		case CUSTREAMQUERY: {
			p->result = cuStreamQuery(p->stream);
			break;
		}
		case CUSTREAMDESTROY: {
			p->result = cuStreamDestroy(p->stream);
			break;
		}

		default:
			printf("Unknown syscall_type\n");
		}
		virtqueue_push(vq, &elem, 0);
	}
	/* notify frontend - trigger virtual interrupt */
	virtio_notify(vdev, vq);
	return;
}