// pteracuda_nifs:gemv(Ctx, Transpose, M, N, Alpha, A, X, Beta, Y)
ERL_NIF_TERM pteracuda_nifs_gemv(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
    PCudaContextRef *ctxRef;
    PCudaBufferRef *ref_A, *ref_X, *ref_Y;
    unsigned long transpose;
    unsigned long m, n;
    double alpha, beta;

    if (argc != 9 ||
        !enif_get_resource(env, argv[0], pteracuda_context_resource, (void **) &ctxRef) ||
        !enif_get_ulong(env, argv[1], &transpose) ||
        !enif_get_ulong(env, argv[2], &m) ||
        !enif_get_ulong(env, argv[3], &n) ||
        !enif_get_double(env, argv[4], &alpha) ||
        !enif_get_resource(env, argv[5], pteracuda_buffer_resource, (void **) &ref_A) ||
        !enif_get_resource(env, argv[6], pteracuda_buffer_resource, (void **) &ref_X) ||
        !enif_get_double(env, argv[7], &beta) ||
        !enif_get_resource(env, argv[8], pteracuda_buffer_resource, (void **) &ref_Y)) {
        return enif_make_badarg(env);
    }

    if (((PCudaMatrixFloatBuffer*)ref_A->buffer)->rows() != m ||
        ((PCudaMatrixFloatBuffer*)ref_A->buffer)->cols() != n) {
        return enif_make_tuple2(env, ATOM_ERROR,
                                enif_make_atom(env, "Matrix A dimensions do not match m,n parameters"));
    }

    cuCtxSetCurrent(ctxRef->ctx);
    pcuda_gemv(transpose, m, n, alpha,
               ((PCudaMatrixFloatBuffer *)ref_A->buffer)->get_data(),
               ((PCudaFloatBuffer *)ref_X->buffer)->get_data(),
               beta,
               ((PCudaFloatBuffer *)ref_Y->buffer)->get_data());
    return ATOM_OK;
}
ERL_NIF_TERM pteracuda_nifs_transpose(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
    PCudaContextRef *ctxRef;
    PCudaBufferRef *ref_A, *ref_B;
    unsigned long m, n;

    if (argc != 3 ||
        !enif_get_resource(env, argv[0], pteracuda_context_resource, (void **) &ctxRef) ||
        !enif_get_resource(env, argv[1], pteracuda_buffer_resource, (void **) &ref_A) ||
        !enif_get_resource(env, argv[2], pteracuda_buffer_resource, (void **) &ref_B)) {
        return enif_make_badarg(env);
    }

    if (((PCudaMatrixFloatBuffer *)ref_A->buffer)->rows() != ((PCudaMatrixFloatBuffer *)ref_B->buffer)->cols() ||
        ((PCudaMatrixFloatBuffer *)ref_A->buffer)->cols() != ((PCudaMatrixFloatBuffer *)ref_B->buffer)->rows()) {
        return enif_make_tuple2(env, ATOM_ERROR,
                                enif_make_atom(env, "Size A does not match the transpose size B."));
    }

    m = ((PCudaMatrixFloatBuffer *)ref_A->buffer)->rows();
    n = ((PCudaMatrixFloatBuffer *)ref_A->buffer)->cols();

    cuCtxSetCurrent(ctxRef->ctx);
    // As the internal representation of a matrix buffer is column major,
    // the actual Rows x Columns layout is N x M.
    pcuda_transpose(n, m,
                    ((PCudaMatrixFloatBuffer *)ref_A->buffer)->get_data(),
                    ((PCudaMatrixFloatBuffer *)ref_B->buffer)->get_data());
    return ATOM_OK;
}
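// Illustrative only: why the row/column counts are swapped in the call above.
// For a column-major m x n matrix, element (i,j) lives at A[j*m + i], so the
// same buffer read with rows and columns exchanged is exactly the transpose's
// layout. `idx_colmajor` is a hypothetical helper, not part of pteracuda.
static inline size_t idx_colmajor(size_t i, size_t j, size_t m) {
    return j * m + i;  // column j starts at offset j*m
}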
ERL_NIF_TERM pteracuda_ml_gd_learn(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
    PCudaContextRef *ctxRef;
    PCudaBufferRef *ref_Theta, *ref_X, *ref_Y;
    unsigned long num_features;
    unsigned long num_samples;
    unsigned long iterations;
    double learning_rate;

    if (argc != 8 ||
        !enif_get_resource(env, argv[0], pteracuda_context_resource, (void **) &ctxRef) ||
        !enif_get_resource(env, argv[1], pteracuda_buffer_resource, (void **) &ref_Theta) ||
        !enif_get_resource(env, argv[2], pteracuda_buffer_resource, (void **) &ref_X) ||
        !enif_get_resource(env, argv[3], pteracuda_buffer_resource, (void **) &ref_Y) ||
        !enif_get_ulong(env, argv[4], &num_features) ||
        !enif_get_ulong(env, argv[5], &num_samples) ||
        !enif_get_double(env, argv[6], &learning_rate) ||
        !enif_get_ulong(env, argv[7], &iterations)) {
        return enif_make_badarg(env);
    }

    cuCtxSetCurrent(ctxRef->ctx);
    pcuda_gd_learn(((PCudaFloatBuffer*)ref_Theta->buffer)->get_data(),
                   ((PCudaFloatBuffer*)ref_X->buffer)->get_data(),
                   ((PCudaFloatBuffer*)ref_Y->buffer)->get_data(),
                   num_features, num_samples, (float)learning_rate, iterations);
    return ATOM_OK;
}
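// Reference sketch of the update pcuda_gd_learn presumably performs on the
// GPU, written as plain host C for clarity: batch gradient descent for linear
// regression, X stored row-major as [num_samples][num_features]. This is an
// assumption about the kernel, not its actual source. Requires <stdlib.h>
// and <string.h>.
void gd_learn_host(float *theta, const float *X, const float *Y,
                   unsigned long nf, unsigned long ns,
                   float lr, unsigned long iters) {
    float *grad = (float *)calloc(nf, sizeof(float));
    for (unsigned long it = 0; it < iters; it++) {
        memset(grad, 0, nf * sizeof(float));
        for (unsigned long s = 0; s < ns; s++) {
            float h = 0.0f;                          /* hypothesis h(x) = theta . x */
            for (unsigned long k = 0; k < nf; k++) h += theta[k] * X[s*nf + k];
            float err = h - Y[s];
            for (unsigned long j = 0; j < nf; j++) grad[j] += err * X[s*nf + j];
        }
        for (unsigned long j = 0; j < nf; j++)
            theta[j] -= lr * grad[j] / (float)ns;    /* simultaneous update */
    }
    free(grad);
}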
ERL_NIF_TERM pteracuda_nifs_saxpy(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
    PCudaContextRef *ctxRef;
    PCudaBufferRef *ref_X, *ref_Y;
    double a;

    if (argc != 4 ||
        !enif_get_resource(env, argv[0], pteracuda_context_resource, (void **) &ctxRef) ||
        !enif_get_double(env, argv[1], &a) ||
        !enif_get_resource(env, argv[2], pteracuda_buffer_resource, (void **) &ref_X) ||
        !enif_get_resource(env, argv[3], pteracuda_buffer_resource, (void **) &ref_Y)) {
        return enif_make_badarg(env);
    }

    if (((PCudaFloatBuffer *)ref_X->buffer)->get_data()->size() != ((PCudaFloatBuffer *)ref_Y->buffer)->get_data()->size()) {
        return enif_make_tuple2(env, ATOM_ERROR,
                                enif_make_atom(env, "Size X does not match size Y."));
    }

    cuCtxSetCurrent(ctxRef->ctx);
    pcuda_saxpy(a,
                ((PCudaFloatBuffer *)ref_X->buffer)->get_data(),
                ((PCudaFloatBuffer *)ref_Y->buffer)->get_data());
    return ATOM_OK;
}
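// For reference, SAXPY computes y := a*x + y elementwise. The host-side
// equivalent of the pcuda_saxpy call above (a sketch, not pteracuda source):
void saxpy_host(float a, const float *x, float *y, size_t n) {
    for (size_t i = 0; i < n; i++)
        y[i] = a * x[i] + y[i];
}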
void cuda_enter(cuda_context *ctx) {
    ASSERT_CTX(ctx);
    cuCtxGetCurrent(&ctx->old);
    if (ctx->old != ctx->ctx)
        ctx->err = cuCtxSetCurrent(ctx->ctx);
    /* If no context was current in the first place, take over, so we can
       avoid the set/restore dance on this thread from now on. */
    if (ctx->old == NULL)
        ctx->old = ctx->ctx;
}
ERL_NIF_TERM pteracuda_nifs_buffer_intersection(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
    PCudaContextRef *ctxRef;
    PCudaBufferRef *first, *second;

    if (argc != 3 ||
        !enif_get_resource(env, argv[0], pteracuda_context_resource, (void **) &ctxRef) ||
        !enif_get_resource(env, argv[1], pteracuda_buffer_resource, (void **) &first) ||
        !enif_get_resource(env, argv[2], pteracuda_buffer_resource, (void **) &second)) {
        return enif_make_badarg(env);
    }

    cuCtxSetCurrent(ctxRef->ctx);
    return enif_make_tuple2(env, ATOM_OK, first->buffer->intersect(env, second->buffer));
}
SEXP R_cuCtxSetCurrent(SEXP r_ctx) {
    SEXP r_ans = R_NilValue;
    CUcontext ctx = (CUcontext) getRReference(r_ctx);
    CUresult ans;

    ans = cuCtxSetCurrent(ctx);
    r_ans = Renum_convert_CUresult(ans);
    return(r_ans);
}
/////////////////// Matrix operations ///////////////////
// C(m,n) = alpha * op(A)(m,k) * op(B)(k,n) + beta * C(m,n)
// gemm(Ctx, Transpose_op_A, Transpose_op_B, M, N, K, Alpha, A, B, Beta, C)
ERL_NIF_TERM pteracuda_nifs_gemm(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
    PCudaContextRef *ctxRef;
    PCudaBufferRef *ref_A, *ref_B, *ref_C;
    unsigned long transpose_a, transpose_b;
    unsigned long m, n, k;
    double alpha, beta;

    if (argc != 11 ||
        !enif_get_resource(env, argv[0], pteracuda_context_resource, (void **) &ctxRef) ||
        !enif_get_ulong(env, argv[1], &transpose_a) ||
        !enif_get_ulong(env, argv[2], &transpose_b) ||
        !enif_get_ulong(env, argv[3], &m) ||
        !enif_get_ulong(env, argv[4], &n) ||
        !enif_get_ulong(env, argv[5], &k) ||
        !enif_get_double(env, argv[6], &alpha) ||
        !enif_get_resource(env, argv[7], pteracuda_buffer_resource, (void **) &ref_A) ||
        !enif_get_resource(env, argv[8], pteracuda_buffer_resource, (void **) &ref_B) ||
        !enif_get_double(env, argv[9], &beta) ||
        !enif_get_resource(env, argv[10], pteracuda_buffer_resource, (void **) &ref_C)) {
        return enif_make_badarg(env);
    }

    if (transpose_a == CUBLAS_OP_N) {
        if (((PCudaMatrixFloatBuffer*)ref_A->buffer)->rows() != m ||
            ((PCudaMatrixFloatBuffer*)ref_A->buffer)->cols() != k) {
            return enif_make_tuple2(env, ATOM_ERROR,
                                    enif_make_atom(env, "Matrix A dimensions do not match m,k parameters"));
        }
    } else {
        /* op(A) = A^T, so the stored matrix must be k x m */
        if (((PCudaMatrixFloatBuffer*)ref_A->buffer)->rows() != k ||
            ((PCudaMatrixFloatBuffer*)ref_A->buffer)->cols() != m) {
            return enif_make_tuple2(env, ATOM_ERROR,
                                    enif_make_atom(env, "Matrix A dimensions do not match m,k parameters"));
        }
    }

    if (transpose_b == CUBLAS_OP_N) {
        if (((PCudaMatrixFloatBuffer*)ref_B->buffer)->rows() != k ||
            ((PCudaMatrixFloatBuffer*)ref_B->buffer)->cols() != n) {
            return enif_make_tuple2(env, ATOM_ERROR,
                                    enif_make_atom(env, "Matrix B dimensions do not match k,n parameters"));
        }
    } else {
        /* op(B) = B^T, so the stored matrix must be n x k */
        if (((PCudaMatrixFloatBuffer*)ref_B->buffer)->rows() != n ||
            ((PCudaMatrixFloatBuffer*)ref_B->buffer)->cols() != k) {
            return enif_make_tuple2(env, ATOM_ERROR,
                                    enif_make_atom(env, "Matrix B dimensions do not match k,n parameters"));
        }
    }

    if (((PCudaMatrixFloatBuffer*)ref_C->buffer)->rows() != m ||
        ((PCudaMatrixFloatBuffer*)ref_C->buffer)->cols() != n) {
        return enif_make_tuple2(env, ATOM_ERROR,
                                enif_make_atom(env, "Matrix C dimensions do not match m,n parameters"));
    }

    cuCtxSetCurrent(ctxRef->ctx);
    //pcuda_mmul(((PCudaMatrixFloatBuffer*)ref_A->buffer)->get_data(), ((PCudaMatrixFloatBuffer*)ref_B->buffer)->get_data(), ((PCudaMatrixFloatBuffer*)ref_C->buffer)->get_data(), m, k, n);
    pcuda_gemm(transpose_a, transpose_b, m, n, k, alpha,
               ((PCudaMatrixFloatBuffer*)ref_A->buffer)->get_data(),
               ((PCudaMatrixFloatBuffer*)ref_B->buffer)->get_data(),
               beta,
               ((PCudaMatrixFloatBuffer*)ref_C->buffer)->get_data());
    return ATOM_OK;
}
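// A plausible shape for the pcuda_gemm wrapper invoked above, assuming it is
// backed by cuBLAS v2 and that get_data() ultimately yields device-resident
// float arrays. This is a sketch under those assumptions, not the actual
// pteracuda source; a real implementation would also cache the handle rather
// than create and destroy it per call.
#include <cublas_v2.h>

void pcuda_gemm_sketch(cublasOperation_t ta, cublasOperation_t tb,
                       int m, int n, int k, float alpha,
                       const float *dA, const float *dB,
                       float beta, float *dC) {
    cublasHandle_t handle;
    cublasCreate(&handle);
    /* Leading dimensions for column-major storage: rows of the stored matrix. */
    int lda = (ta == CUBLAS_OP_N) ? m : k;
    int ldb = (tb == CUBLAS_OP_N) ? k : n;
    cublasSgemm(handle, ta, tb, m, n, k, &alpha, dA, lda, dB, ldb, &beta, dC, m);
    cublasDestroy(handle);
}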
ERL_NIF_TERM pteracuda_nifs_buffer_minmax(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
    PCudaContextRef *ctxRef;
    PCudaBufferRef *bufRef;

    if (argc != 2 ||
        !enif_get_resource(env, argv[0], pteracuda_context_resource, (void **) &ctxRef) ||
        !enif_get_resource(env, argv[1], pteracuda_buffer_resource, (void **) &bufRef)) {
        return enif_make_badarg(env);
    }

    if (bufRef->buffer->size() == 0) {
        return enif_make_tuple2(env, ATOM_OK,
                                enif_make_tuple2(env, enif_make_int(env, 0), enif_make_int(env, 0)));
    }

    cuCtxSetCurrent(ctxRef->ctx);
    return enif_make_tuple2(env, ATOM_OK, bufRef->buffer->minmax(env));
}
ERL_NIF_TERM pteracuda_nifs_sort_buffer(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
    PCudaContextRef *ctxRef;
    PCudaBufferRef *ref;

    if (argc != 2 ||
        !enif_get_resource(env, argv[0], pteracuda_context_resource, (void **) &ctxRef) ||
        !enif_get_resource(env, argv[1], pteracuda_buffer_resource, (void **) &ref)) {
        return enif_make_badarg(env);
    }

    cuCtxSetCurrent(ctxRef->ctx);
    if (ref->buffer->sort()) {
        return ATOM_OK;
    } else {
        return ATOM_ERROR;
    }
}
void pocl_cuda_free (cl_device_id device, cl_mem mem_obj)
{
  cuCtxSetCurrent (((pocl_cuda_device_data_t *)device->data)->context);

  if (mem_obj->flags & CL_MEM_ALLOC_HOST_PTR)
    {
      cuMemFreeHost (mem_obj->mem_host_ptr);
      mem_obj->mem_host_ptr = NULL;
    }
  else
    {
      void *ptr = mem_obj->device_ptrs[device->dev_id].mem_ptr;
      cuMemFree ((CUdeviceptr)ptr);
    }
}
ERL_NIF_TERM pteracuda_nifs_buffer_contains(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
    PCudaContextRef *ctxRef;
    PCudaBufferRef *ref;

    if (argc != 3 ||
        !enif_get_resource(env, argv[0], pteracuda_context_resource, (void **) &ctxRef) ||
        !enif_get_resource(env, argv[1], pteracuda_buffer_resource, (void **) &ref)) {
        return enif_make_badarg(env);
    }

    if (ref->buffer->size() > 0) {
        cuCtxSetCurrent(ctxRef->ctx);
        if (ref->buffer->contains(env, argv[2])) {
            return ATOM_TRUE;
        } else {
            return ATOM_FALSE;
        }
    } else {
        return ATOM_FALSE;
    }
}
void Initialize()
{
    cuCtxSetCurrent(m_Context);

    // Encode a dummy frame, since it seems that some initialization is done
    // on the first encode
    m_PictureParameters.encodePicFlags = NV_ENC_PIC_FLAG_FORCEIDR;
#ifdef ASYNCHRONOUS
    // Sanity check
    _ASSERT(WaitForSingleObject(m_PictureParameters.completionEvent, 0) == WAIT_TIMEOUT);
#endif
    CHECK_NVENC_STATUS(m_FunctionList.nvEncEncodePicture(m_pEncoder, &m_PictureParameters));
#ifdef ASYNCHRONOUS
    DWORD nWaitResult = WaitForSingleObject(m_PictureParameters.completionEvent, INFINITE);
    // Sanity check
    _ASSERT(nWaitResult == WAIT_OBJECT_0);
#endif

    NV_ENC_LOCK_BITSTREAM LockBitstream = { NV_ENC_LOCK_BITSTREAM_VER, 0 };
    LockBitstream.sliceOffsets = NULL;
    LockBitstream.outputBitstream = m_PictureParameters.outputBitstream;
    CHECK_NVENC_STATUS(m_FunctionList.nvEncLockBitstream(m_pEncoder, &LockBitstream));
    CHECK_NVENC_STATUS(m_FunctionList.nvEncUnlockBitstream(m_pEncoder, LockBitstream.outputBitstream));
}
ERL_NIF_TERM pteracuda_nifs_log(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
    PCudaContextRef *ctxRef;
    PCudaBufferRef *ref_A, *ref_B;

    if (argc != 3 ||
        !enif_get_resource(env, argv[0], pteracuda_context_resource, (void **) &ctxRef) ||
        !enif_get_resource(env, argv[1], pteracuda_buffer_resource, (void **) &ref_A) ||
        !enif_get_resource(env, argv[2], pteracuda_buffer_resource, (void **) &ref_B)) {
        return enif_make_badarg(env);
    }

    if (((PCudaFloatBuffer*)ref_A->buffer)->size() != ((PCudaFloatBuffer*)ref_B->buffer)->size()) {
        return enif_make_tuple2(env, ATOM_ERROR,
                                enif_make_atom(env, "Buffer A size does not match buffer B size"));
    }

    cuCtxSetCurrent(ctxRef->ctx);
    pcuda_log(((PCudaFloatBuffer*)ref_A->buffer)->get_data(),
              ((PCudaFloatBuffer*)ref_B->buffer)->get_data());
    return ATOM_OK;
}
cl_int pocl_cuda_alloc_mem_obj (cl_device_id device, cl_mem mem_obj, void *host_ptr)
{
  cuCtxSetCurrent (((pocl_cuda_device_data_t *)device->data)->context);

  CUresult result;
  void *b = NULL;

  /* if memory for this global memory is not yet allocated -> do it */
  if (mem_obj->device_ptrs[device->global_mem_id].mem_ptr == NULL)
    {
      cl_mem_flags flags = mem_obj->flags;

      if (flags & CL_MEM_USE_HOST_PTR)
        {
#if defined __arm__
          // cuMemHostRegister is not supported on ARM.
          // Allocate device memory and perform explicit copies
          // before and after running a kernel.
          result = cuMemAlloc ((CUdeviceptr *)&b, mem_obj->size);
          CUDA_CHECK (result, "cuMemAlloc");
#else
          result = cuMemHostRegister (host_ptr, mem_obj->size,
                                      CU_MEMHOSTREGISTER_DEVICEMAP);
          if (result != CUDA_SUCCESS
              && result != CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED)
            CUDA_CHECK (result, "cuMemHostRegister");
          result = cuMemHostGetDevicePointer ((CUdeviceptr *)&b, host_ptr, 0);
          CUDA_CHECK (result, "cuMemHostGetDevicePointer");
#endif
        }
      else if (flags & CL_MEM_ALLOC_HOST_PTR)
        {
          result = cuMemHostAlloc (&mem_obj->mem_host_ptr, mem_obj->size,
                                   CU_MEMHOSTREGISTER_DEVICEMAP);
          CUDA_CHECK (result, "cuMemHostAlloc");
          result = cuMemHostGetDevicePointer ((CUdeviceptr *)&b,
                                              mem_obj->mem_host_ptr, 0);
          CUDA_CHECK (result, "cuMemHostGetDevicePointer");
        }
      else
        {
          result = cuMemAlloc ((CUdeviceptr *)&b, mem_obj->size);
          if (result != CUDA_SUCCESS)
            {
              const char *err;
              cuGetErrorName (result, &err);
              POCL_MSG_PRINT2 (__FUNCTION__, __LINE__,
                               "-> Failed to allocate memory: %s\n", err);
              return CL_MEM_OBJECT_ALLOCATION_FAILURE;
            }
        }

      if (flags & CL_MEM_COPY_HOST_PTR)
        {
          result = cuMemcpyHtoD ((CUdeviceptr)b, host_ptr, mem_obj->size);
          CUDA_CHECK (result, "cuMemcpyHtoD");
        }

      mem_obj->device_ptrs[device->global_mem_id].mem_ptr = b;
      mem_obj->device_ptrs[device->global_mem_id].global_mem_id
          = device->global_mem_id;
    }

  /* copy already allocated global mem info to device's own slot */
  mem_obj->device_ptrs[device->dev_id]
      = mem_obj->device_ptrs[device->global_mem_id];

  return CL_SUCCESS;
}
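// Minimal standalone illustration of the zero-copy path used above: pin an
// existing host buffer with DEVICEMAP and obtain a device pointer aliasing
// it. Assumes a context is already current; `map_host_buffer` is a
// hypothetical helper for illustration, not pocl code.
#include <cuda.h>

static int map_host_buffer(void *host, size_t size, CUdeviceptr *dev)
{
  CUresult r = cuMemHostRegister(host, size, CU_MEMHOSTREGISTER_DEVICEMAP);
  if (r != CUDA_SUCCESS && r != CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED)
    return -1;
  /* The flags argument must currently be 0. */
  return cuMemHostGetDevicePointer(dev, host, 0) == CUDA_SUCCESS ? 0 : -1;
}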
void cuda_pop_context() { cuda_assert(cuCtxSetCurrent(NULL)); }
	return true;
}

#define cuda_error(stmt) cuda_error_(stmt, #stmt)

void cuda_error_message(const string& message)
{
	if(error_msg == "")
		error_msg = message;
	fprintf(stderr, "%s\n", message.c_str());
	cuda_error_documentation();
}

void cuda_push_context()
{
	cuda_assert(cuCtxSetCurrent(cuContext));
}

void cuda_pop_context()
{
	cuda_assert(cuCtxSetCurrent(NULL));
}

CUDADevice(DeviceInfo& info, Stats &stats, bool background_)
: Device(stats)
{
	first_error = true;
	background = background_;

	cuDevId = info.num;
	cuDevice = 0;
	cuContext = 0;
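// Typical use of the push/pop pair above: make the device's context current
// for the duration of a call sequence, then detach so another thread can bind
// it. `do_work_on_device` is a hypothetical caller, not Cycles code.
void do_work_on_device()
{
	cuda_push_context();
	/* ... driver API calls against cuContext ... */
	cuda_pop_context();
}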
static void dispose(CUcontext context) {
    cuda_check( cuCtxSetCurrent(context) );
    cuda_check( cuCtxSynchronize() );
    cuda_check( cuCtxDestroy(context) );
}
static CUT_THREADPROC dt_thread_func(void *p)
{
  dt_partition *pt = (dt_partition *)p;
  struct timeval tv;
  CUresult res;
  int thread_num_x = 0, thread_num_y = 0;
  int block_num_x = 0, block_num_y = 0;

  res = cuCtxSetCurrent(ctx[pt->pid]);
  if(res != CUDA_SUCCESS) { printf("cuCtxSetCurrent(ctx[%d]) failed: res = %s\n", pt->pid, cuda_response_to_string(res)); exit(1); }

  /* allocate GPU memory */
  //printf("part_error_array_num = %d\n", part_error_array_num);
  if(pt->pid == 0) { gettimeofday(&tv_memcpy_start, NULL); }

  res = cuMemcpyHtoD(part_C_dev[pt->pid], dst_C, SUM_SIZE_C);
  if(res != CUDA_SUCCESS) { printf("cuMemcpyHtoD(part_C_dev) failed: res = %s\n", cuda_response_to_string(res)); exit(1); }

  res = cuMemcpyHtoD(part_error_array_dev[pt->pid], part_error_array, part_error_array_num*sizeof(int));
  if(res != CUDA_SUCCESS) { printf("cuMemcpyHtoD(part_error_array_dev) failed: res = %s\n", cuda_response_to_string(res)); exit(1); }

  res = cuMemcpyHtoD(pm_size_array_dev[pt->pid], &pt->size_array[0][0], pt->NoP*2*pt->L_MAX*sizeof(int));
  if(res != CUDA_SUCCESS) { printf("cuMemcpyHtoD(pm_size_array_dev) failed: res = %s\n", cuda_response_to_string(res)); exit(1); }

  res = cuMemcpyHtoD(def_array_dev[pt->pid], pt->def, sum_size_def_array);
  if(res != CUDA_SUCCESS) { printf("cuMemcpyHtoD(def_array_dev) failed: res = %s\n", cuda_response_to_string(res)); exit(1); }

  res = cuMemcpyHtoD(numpart_dev[pt->pid], pt->numpart, pt->NoC*sizeof(int));
  if(res != CUDA_SUCCESS) { printf("cuMemcpyHtoD(numpart_dev) failed: res = %s\n", cuda_response_to_string(res)); exit(1); }

  res = cuMemcpyHtoD(PIDX_array_dev[pt->pid], pt->dst_PIDX, pt->tmp_array_size);
  if(res != CUDA_SUCCESS) { printf("cuMemcpyHtoD(PIDX_array) failed: res = %s\n", cuda_response_to_string(res)); exit(1); }

  res = cuMemcpyHtoD(DID_4_array_dev[pt->pid], pt->dst_DID_4, pt->tmp_array_size);
  if(res != CUDA_SUCCESS) { printf("cuMemcpyHtoD(DID_4_array) failed: res = %s\n", cuda_response_to_string(res)); exit(1); }

  if(pt->pid == 0) {
    gettimeofday(&tv_memcpy_end, NULL);
    tvsub(&tv_memcpy_end, &tv_memcpy_start, &tv);
    time_memcpy += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;
  }

  int sharedMemBytes = 0;

  /* get max thread num per block */
  int max_threads_num = 0;
  res = cuDeviceGetAttribute(&max_threads_num, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev[pt->pid]);
  if(res != CUDA_SUCCESS) { printf("\ncuDeviceGetAttribute() failed: res = %s\n", cuda_response_to_string(res)); exit(1); }

  /* prepare for launch inverse_Q */
  void* kernel_args_inverse[] = {
    &part_C_dev[pt->pid], &pm_size_array_dev[pt->pid], &part_error_array_dev[pt->pid],
    &part_error_array_num, (void*)&(pt->NoP), &PIDX_array_dev[pt->pid],
    &numpart_dev[pt->pid], (void*)&(pt->NoC), (void*)&(pt->max_numpart),
    (void*)&(pt->interval), (void*)&(pt->L_MAX), (void*)&(pt->pid),
    (void*)&(device_num)
  };

  /* define CUDA block shape */
  int upper_limit_th_num_x = max_threads_num/(pt->max_numpart*pt->NoC);
  int upper_limit_th_num_y = max_threads_num/upper_limit_th_num_x;
  if(upper_limit_th_num_x < 1) upper_limit_th_num_x++;
  if(upper_limit_th_num_y < 1) upper_limit_th_num_y++;

  thread_num_x = (pt->max_dim0*pt->max_dim1 < upper_limit_th_num_x) ? (pt->max_dim0*pt->max_dim1) : upper_limit_th_num_x;
  thread_num_y = (pt->max_numpart < upper_limit_th_num_y) ? pt->max_numpart : upper_limit_th_num_y;

  block_num_x = (pt->max_dim0*pt->max_dim1) / thread_num_x;
  block_num_y = (pt->max_numpart) / thread_num_y;
  if((pt->max_dim0*pt->max_dim1) % thread_num_x != 0) block_num_x++;
  if(pt->max_numpart % thread_num_y != 0) block_num_y++;

  int blockDimY = thread_num_y / device_num;
  if(thread_num_y % device_num != 0) blockDimY++;

  /* launch inverse_Q */
  if(pt->pid == 0) { gettimeofday(&tv_kernel_start, NULL); }
  res = cuLaunchKernel(
    func_inverse_Q[pt->pid], // call function
    block_num_x,             // gridDimX
    block_num_y,             // gridDimY
    pt->L_MAX-pt->interval,  // gridDimZ
    thread_num_x,            // blockDimX
    blockDimY,               // blockDimY
    pt->NoC,                 // blockDimZ
    sharedMemBytes,          // sharedMemBytes
    NULL,                    // hStream
    kernel_args_inverse,     // kernelParams
    NULL                     // extra
  );
  if(res != CUDA_SUCCESS) {
    printf("block_num_x %d, block_num_y %d, thread_num_x %d, thread_num_y %d\n", block_num_x, block_num_y, thread_num_x, thread_num_y);
    printf("cuLaunchKernel(inverse_Q) failed : res = %s\n", cuda_response_to_string(res));
    exit(1);
  }

  res = cuCtxSynchronize();
  if(res != CUDA_SUCCESS) { printf("cuCtxSynchronize(inverse_Q) failed: res = %s\n", cuda_response_to_string(res)); exit(1); }

  if(pt->pid == 0) {
    gettimeofday(&tv_kernel_end, NULL);
    tvsub(&tv_kernel_end, &tv_kernel_start, &tv);
    time_kernel += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;
  }

  /* prepare for launch dt1d_x */
  void* kernel_args_x[] = {
    &part_C_dev[pt->pid],           // FLOAT *src_start
    &tmpM_dev[pt->pid],             // FLOAT *dst
    &tmpIy_dev[pt->pid],            // int *ptr
    &DID_4_array_dev[pt->pid],      // int *DID_4_array
    &def_array_dev[pt->pid],        // FLOAT *def_array
    &pm_size_array_dev[pt->pid],    // int *size_array
    (void*)&(pt->NoP),              // int NoP
    &PIDX_array_dev[pt->pid],       // int *PIDX_array
    &part_error_array_dev[pt->pid], // int *error_array
    (void*)&(part_error_array_num), // int error_array_num
    &numpart_dev[pt->pid],          // int *numpart
    (void*)&(pt->NoC),              // int NoC
    (void*)&(pt->max_numpart),      // int max_numpart
    (void*)&(pt->interval),         // int interval
    (void*)&(pt->L_MAX),            // int L_MAX
    (void*)&(pt->pid),              // int pid
    (void*)&(device_num)            // int device_num
  };

  max_threads_num = 64/pt->NoC;
  if(max_threads_num < 1) max_threads_num++;

  thread_num_x = (pt->max_dim1 < max_threads_num) ? pt->max_dim1 : max_threads_num;
  thread_num_y = (pt->max_numpart < max_threads_num) ? pt->max_numpart : max_threads_num;

  block_num_x = pt->max_dim1 / thread_num_x;
  block_num_y = pt->max_numpart / thread_num_y;
  if(pt->max_dim1 % thread_num_x != 0) block_num_x++;
  if(pt->max_numpart % thread_num_y != 0) block_num_y++;

  blockDimY = thread_num_y / device_num;
  if(thread_num_y % device_num != 0) blockDimY++;

  /* launch dt1d_x */
  if(pt->pid == 0) { gettimeofday(&tv_kernel_start, NULL); }
  res = cuLaunchKernel(
    func_dt1d_x[pt->pid],    // call function
    block_num_x,             // gridDimX
    block_num_y,             // gridDimY
    pt->L_MAX-pt->interval,  // gridDimZ
    thread_num_x,            // blockDimX
    blockDimY,               // blockDimY
    pt->NoC,                 // blockDimZ
    sharedMemBytes,          // sharedMemBytes
    NULL,                    // hStream
    kernel_args_x,           // kernelParams
    NULL                     // extra
  );
  if(res != CUDA_SUCCESS) {
    printf("block_num_x %d, block_num_y %d, thread_num_x %d, thread_num_y %d\n", block_num_x, block_num_y, thread_num_x, thread_num_y);
    printf("cuLaunchKernel(dt1d_x) failed : res = %s\n", cuda_response_to_string(res));
    exit(1);
  }

  res = cuCtxSynchronize();
  if(res != CUDA_SUCCESS) { printf("cuCtxSynchronize(dt1d_x) failed: res = %s\n", cuda_response_to_string(res)); exit(1); }

  if(pt->pid == 0) {
    gettimeofday(&tv_kernel_end, NULL);
    tvsub(&tv_kernel_end, &tv_kernel_start, &tv);
    time_kernel += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;
  }

  /* prepare for launch dt1d_y */
  void* kernel_args_y[] = {
    &tmpM_dev[pt->pid],             // FLOAT *src_start
    &M_dev[pt->pid],                // FLOAT *dst_start
    &tmpIx_dev[pt->pid],            // int *ptr_start
    &DID_4_array_dev[pt->pid],      // int *DID_4_array
    &def_array_dev[pt->pid],        // FLOAT *def_array
    (void*)&(pt->NoP),              // int NoP
    &pm_size_array_dev[pt->pid],    // int *size_array
    &numpart_dev[pt->pid],          // int *numpart
    &PIDX_array_dev[pt->pid],       // int *PIDX_array
    (void*)&(pt->NoC),              // int NoC
    (void*)&(pt->max_numpart),      // int max_numpart
    (void*)&(pt->interval),         // int interval
    (void*)&(pt->L_MAX),            // int L_MAX
    &part_error_array_dev[pt->pid], // int *error_array
    (void*)&(part_error_array_num), // int error_array_num
    (void*)&(pt->pid),              // int pid
    (void*)&(device_num)            // int device_num
  };

  thread_num_x = (pt->max_dim0 < max_threads_num) ? pt->max_dim0 : max_threads_num;
  thread_num_y = (pt->max_numpart < max_threads_num) ? pt->max_numpart : max_threads_num;

  block_num_x = pt->max_dim0 / thread_num_x;
  block_num_y = pt->max_numpart / thread_num_y;
  if(pt->max_dim0 % thread_num_x != 0) block_num_x++;
  if(pt->max_numpart % thread_num_y != 0) block_num_y++;

  blockDimY = thread_num_y / device_num;
  if(thread_num_y % device_num != 0) blockDimY++;

  /* launch dt1d_y */
  if(pt->pid == 0) { gettimeofday(&tv_kernel_start, NULL); }
  res = cuLaunchKernel(
    func_dt1d_y[pt->pid],    // call function
    block_num_x,             // gridDimX
    block_num_y,             // gridDimY
    pt->L_MAX-pt->interval,  // gridDimZ
    thread_num_x,            // blockDimX
    blockDimY,               // blockDimY
    pt->NoC,                 // blockDimZ
    sharedMemBytes,          // sharedMemBytes
    NULL,                    // hStream
    kernel_args_y,           // kernelParams
    NULL                     // extra
  );
  if(res != CUDA_SUCCESS) { printf("cuLaunchKernel(dt1d_y) failed : res = %s\n", cuda_response_to_string(res)); exit(1); }

  res = cuCtxSynchronize();
  if(res != CUDA_SUCCESS) { printf("cuCtxSynchronize(dt1d_y) failed: res = %s\n", cuda_response_to_string(res)); exit(1); }

  if(pt->pid == 0) {
    gettimeofday(&tv_kernel_end, NULL);
    tvsub(&tv_kernel_end, &tv_kernel_start, &tv);
    time_kernel += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;
  }

  /* download data from GPU */
  /* download M from GPU */
  int sum_part_size = 0;
  int sum_pointer_size = 0;
  int sum_move_size = 0;
  int part_size = 0;
  int pointer_size = 0;
  int part_y = 0;
  int move_size = 0;
  int start_kk = 0;
  int end_kk = 0;
  int part_end_kk = 0;
  unsigned long long int pointer_dst_M = (unsigned long long int)pt->dst_M;
  unsigned long long int pointer_M_dev = (unsigned long long int)M_dev[pt->pid];

  for(int L=0; L<(pt->L_MAX-pt->interval); L++) {
    /* loop condition */
    if( (pt->FSIZE[(L+pt->interval)*2]+2*pt->pady < pt->max_Y) ||
        (pt->FSIZE[(L+pt->interval)*2+1]+2*pt->padx < pt->max_X) ) {
      continue;
    }

    for(int jj=0; jj<pt->NoC; jj++) {
      part_y = pt->numpart[jj] / device_num;
      if(pt->numpart[jj] % device_num != 0) part_y++;

      start_kk = part_y * pt->pid;
      end_kk = part_y * (pt->pid + 1);
      if(end_kk > pt->numpart[jj]) end_kk = pt->numpart[jj];
      if(pt->pid > 0) part_end_kk = part_y * pt->pid;

      for(int kk=0; kk<pt->numpart[jj]; kk++) {
        int PIDX = pt->PIDX_array[L][jj][kk];
        int dims0 = pt->size_array[L][PIDX*2];
        int dims1 = pt->size_array[L][PIDX*2+1];
        if(start_kk <= kk && kk < end_kk) part_size += dims0 * dims1;
        //if(pt->pid > 0 && part_start_kk <= kk && kk < part_end_kk)
        if(pt->pid > 0 && 0 <= kk && kk < part_end_kk) pointer_size += dims0 * dims1;
        move_size += dims0 * dims1;
      }
      sum_part_size += part_size;
      sum_pointer_size += pointer_size;
      sum_move_size += move_size;

      // error pt->pid == 2 && L == 24 && jj == 1
      if(pt->pid*part_y < pt->numpart[jj]) {
        if(pt->pid == 0) { gettimeofday(&tv_memcpy_start, NULL); }
        res = cuMemcpyDtoH((void *)(pointer_dst_M+(unsigned long long int)(pointer_size*sizeof(FLOAT))),
                           (CUdeviceptr)(pointer_M_dev+(unsigned long long int)(pointer_size*sizeof(FLOAT))),
                           part_size*sizeof(FLOAT));
        if(res != CUDA_SUCCESS) {
          printf("error pid = %d\n", pt->pid);
          printf("cuMemcpyDtoH(dst_M) failed: res = %s\n", cuda_response_to_string(res));
          exit(1);
        }
        if(pt->pid == 0) {
          gettimeofday(&tv_memcpy_end, NULL);
          tvsub(&tv_memcpy_end, &tv_memcpy_start, &tv);
          time_memcpy += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;
        }
      }
      pointer_dst_M += (unsigned long long int)(move_size * sizeof(FLOAT));
      pointer_M_dev += (unsigned long long int)(move_size * sizeof(FLOAT));
      part_size = 0;
      pointer_size = 0;
      move_size = 0;
    }
  }

  /* download tmpIx from GPU */
  sum_part_size = 0;
  sum_pointer_size = 0;
  part_size = 0;
  pointer_size = 0;
  part_y = 0;
  move_size = 0;
  start_kk = 0;
  end_kk = 0;
  part_end_kk = 0;
  unsigned long long int pointer_dst_tmpIx = (unsigned long long int)pt->dst_tmpIx;
  unsigned long long int pointer_tmpIx_dev = (unsigned long long int)tmpIx_dev[pt->pid];

  for(int L=0; L<(pt->L_MAX-pt->interval); L++) {
    /* loop condition */
    if( (pt->FSIZE[(L+pt->interval)*2]+2*pt->pady < pt->max_Y) ||
        (pt->FSIZE[(L+pt->interval)*2+1]+2*pt->padx < pt->max_X) ) {
      continue;
    }

    for(int jj=0; jj<pt->NoC; jj++) {
      part_y = pt->numpart[jj] / device_num;
      if(pt->numpart[jj] % device_num != 0) part_y++;

      start_kk = part_y * pt->pid;
      end_kk = part_y * (pt->pid + 1);
      if(end_kk > pt->numpart[jj]) end_kk = pt->numpart[jj];
      if(pt->pid > 0) part_end_kk = part_y * pt->pid;

      for(int kk=0; kk<pt->numpart[jj]; kk++) {
        int PIDX = pt->PIDX_array[L][jj][kk];
        int dims0 = pt->size_array[L][PIDX*2];
        int dims1 = pt->size_array[L][PIDX*2+1];
        if(start_kk <= kk && kk < end_kk) part_size += dims0 * dims1;
        if(pt->pid > 0) {
          if(0 <= kk && kk < part_end_kk) pointer_size += dims0 * dims1;
        }
        move_size += dims0 * dims1;
      }
      sum_part_size += part_size;
      sum_pointer_size += pointer_size;

      if(pt->pid*part_y < pt->numpart[jj]) {
        if(pt->pid == 0) { gettimeofday(&tv_memcpy_start, NULL); }
        res = cuMemcpyDtoH((void *)(pointer_dst_tmpIx+(unsigned long long int)(pointer_size*sizeof(int))),
                           (CUdeviceptr)(pointer_tmpIx_dev+(unsigned long long int)(pointer_size*sizeof(int))),
                           part_size*sizeof(int));
        if(res != CUDA_SUCCESS) {
          printf("error pid = %d\n", pt->pid);
          printf("cuMemcpyDtoH(tmpIx) failed: res = %s\n", cuda_response_to_string(res));
          exit(1);
        }
        if(pt->pid == 0) {
          gettimeofday(&tv_memcpy_end, NULL);
          tvsub(&tv_memcpy_end, &tv_memcpy_start, &tv);
          time_memcpy += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;
        }
      }
      pointer_dst_tmpIx += (unsigned long long int)(move_size * sizeof(int));
      pointer_tmpIx_dev += (unsigned long long int)(move_size * sizeof(int));
      part_size = 0;
      pointer_size = 0;
      move_size = 0;
    }
  }

  /* download tmpIy from GPU */
  sum_part_size = 0;
  sum_pointer_size = 0;
  part_size = 0;
  pointer_size = 0;
  part_y = 0;
  move_size = 0;
  start_kk = 0;
  end_kk = 0;
  part_end_kk = 0;
  unsigned long long int pointer_dst_tmpIy = (unsigned long long int)pt->dst_tmpIy;
  unsigned long long int pointer_tmpIy_dev = (unsigned long long int)tmpIy_dev[pt->pid];

  for(int L=0; L<(pt->L_MAX-pt->interval); L++) {
    /* loop condition */
    if( (pt->FSIZE[(L+pt->interval)*2]+2*pt->pady < pt->max_Y) ||
        (pt->FSIZE[(L+pt->interval)*2+1]+2*pt->padx < pt->max_X) ) {
      continue;
    }

    for(int jj=0; jj<pt->NoC; jj++) {
      part_y = pt->numpart[jj] / device_num;
      if(pt->numpart[jj] % device_num != 0) part_y++;

      start_kk = part_y * pt->pid;
      end_kk = part_y * (pt->pid + 1);
      if(end_kk > pt->numpart[jj]) end_kk = pt->numpart[jj];
      if(pt->pid > 0) part_end_kk = part_y * pt->pid;

      for(int kk=0; kk<pt->numpart[jj]; kk++) {
        int PIDX = pt->PIDX_array[L][jj][kk];
        int dims0 = pt->size_array[L][PIDX*2];
        int dims1 = pt->size_array[L][PIDX*2+1];
        if(start_kk <= kk && kk < end_kk) part_size += dims0 * dims1;
        if(pt->pid > 0) {
          if(0 <= kk && kk < part_end_kk) pointer_size += dims0 * dims1;
        }
        move_size += dims0 * dims1;
      }
      sum_part_size += part_size;
      sum_pointer_size += pointer_size;

      if(pt->pid*part_y < pt->numpart[jj]) {
        if(pt->pid == 0) { gettimeofday(&tv_memcpy_start, NULL); }
        res = cuMemcpyDtoH((void *)(pointer_dst_tmpIy+(unsigned long long int)(pointer_size*sizeof(int))),
                           (CUdeviceptr)(pointer_tmpIy_dev+(unsigned long long int)(pointer_size*sizeof(int))),
                           part_size*sizeof(int));
        if(res != CUDA_SUCCESS) {
          printf("error pid = %d\n", pt->pid);
          printf("cuMemcpyDtoH(tmpIy) failed: res = %s\n", cuda_response_to_string(res));
          exit(1);
        }
        if(pt->pid == 0) {
          gettimeofday(&tv_memcpy_end, NULL);
          tvsub(&tv_memcpy_end, &tv_memcpy_start, &tv);
          time_memcpy += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;
        }
      }
      pointer_dst_tmpIy += (unsigned long long int)(move_size * sizeof(int));
      pointer_tmpIy_dev += (unsigned long long int)(move_size * sizeof(int));
      part_size = 0;
      pointer_size = 0;
      move_size = 0;
    }
  }

  /* end of thread */
  CUT_THREADEND;
}
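// The block-count computation in dt_thread_func is the usual ceiling
// division, written as divide-then-conditionally-increment. An equivalent,
// more compact idiom (hypothetical helper, not part of this code base):
static inline int ceil_div(int total, int per_block) {
  return (total + per_block - 1) / per_block;  // e.g. ceil_div(10, 3) == 4
}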
void operator()(Handle handle) const {
    if (ctx)
        cuda_check( cuCtxSetCurrent(ctx) );
    deleter_impl<Handle>::dispose(handle);
}
/// Binds the context to the calling CPU thread.
void set_current() const {
    cuda_check( cuCtxSetCurrent( c.get() ) );
}
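// How a deleter like the one above typically plugs into an RAII handle: a
// unique_ptr-style wrapper rebinds the owning context before destroying the
// handle, on whatever thread the wrapper dies on. A sketch under assumed
// names (stream_deleter, unique_stream); not this library's actual code.
#include <cuda.h>
#include <memory>
#include <type_traits>

struct stream_deleter {
    CUcontext ctx;
    void operator()(CUstream s) const {
        if (ctx) cuCtxSetCurrent(ctx);  // make the owning context current first
        cuStreamDestroy(s);
    }
};
using unique_stream =
    std::unique_ptr<std::remove_pointer<CUstream>::type, stream_deleter>;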
void cuda_exit(cuda_context *ctx) {
    if (ctx->old != ctx->ctx)
        cuCtxSetCurrent(ctx->old);
}
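// Usage pattern for the cuda_enter/cuda_exit pair defined above: bracket each
// driver-API call sequence so the thread's previously current context is
// restored on the way out. `launch_something` is a hypothetical caller for
// illustration.
void launch_something(cuda_context *ctx) {
    cuda_enter(ctx);
    /* ... cuMemAlloc / cuLaunchKernel / etc. against ctx->ctx ... */
    cuda_exit(ctx);
}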
void init_cuda(void)
{
	CUresult res;
	std::string cubin_path(STR(CUBIN_PATH));

	// initialize the GPU
	res = cuInit(0);
	CUDA_CHECK(res, "cuInit()");

	// count the number of usable GPUs
	res = cuDeviceGetCount(&device_num);
	CUDA_CHECK(res, "cuDeviceGetCount()");

	// multi-GPU is not supported yet
	device_num = 1;

	// get devices
	dev = (CUdevice*) malloc(device_num * sizeof(CUdevice));
	for (int i = 0; i < device_num; i++) {
		res = cuDeviceGet(&dev[i], i);
		CUDA_CHECK(res, "cuDeviceGet()");
	}

	ctx = (CUcontext*) malloc(device_num * sizeof(CUcontext));
	module = (CUmodule*) malloc(device_num * sizeof(CUmodule));
	ConvolutionKernel_func = (CUfunction*) malloc(device_num * sizeof(CUfunction));
	DistanceTransformTwoDimensionalProblem_func = (CUfunction*) malloc(device_num * sizeof(CUfunction));
	BilinearKernelTex32F_func = (CUfunction*) malloc(device_num * sizeof(CUfunction));
	calculateHistogram_func = (CUfunction*) malloc(device_num * sizeof(CUfunction));
	getFeatureMaps_func = (CUfunction*) malloc(device_num * sizeof(CUfunction));
	calculateNorm_func = (CUfunction*) malloc(device_num * sizeof(CUfunction));
	normalizeAndTruncate_func = (CUfunction*) malloc(device_num * sizeof(CUfunction));
	PCAFeatureMapsAddNullableBorder_func = (CUfunction*) malloc(device_num * sizeof(CUfunction));

	for (int i = 0; i < device_num; i++) {
		res = cuCtxCreate(&ctx[i], 0, dev[i]);
		CUDA_CHECK(res, "cuCtxCreate()");
	}

	for (int i = 0; i < device_num; i++) {
		res = cuCtxSetCurrent(ctx[i]);
		CUDA_CHECK(res, "cuCtxSetCurrent()");

		// load the .cubin file
		res = cuModuleLoad(&module[i], cubin_path.c_str());
		CUDA_CHECK(res, "cuModuleLoad()");

		res = cuModuleGetFunction(&ConvolutionKernel_func[i], module[i], "ConvolutionKernel");
		CUDA_CHECK(res, "cuModuleGetFunction(ConvolutionKernel)");

		res = cuModuleGetFunction(&DistanceTransformTwoDimensionalProblem_func[i], module[i],
		                          "DistanceTransformTwoDimensionalProblemKernel");
		CUDA_CHECK(res, "cuModuleGetFunction(DistanceTransformTwoDimensionalProblemKernel)");

		res = cuModuleGetFunction(&BilinearKernelTex32F_func[i], module[i], "BilinearKernelTex32F");
		CUDA_CHECK(res, "cuModuleGetFunction(BilinearKernelTex32F)");

		res = cuModuleGetFunction(&calculateHistogram_func[i], module[i], "calculateHistogram");
		CUDA_CHECK(res, "cuModuleGetFunction(calculateHistogram)");

		res = cuModuleGetFunction(&getFeatureMaps_func[i], module[i], "getFeatureMaps");
		CUDA_CHECK(res, "cuModuleGetFunction(getFeatureMaps)");

		res = cuModuleGetFunction(&calculateNorm_func[i], module[i], "calculateNorm");
		CUDA_CHECK(res, "cuModuleGetFunction(calculateNorm)");

		res = cuModuleGetFunction(&normalizeAndTruncate_func[i], module[i], "normalizeAndTruncate");
		CUDA_CHECK(res, "cuModuleGetFunction(normalizeAndTruncate)");

		res = cuModuleGetFunction(&PCAFeatureMapsAddNullableBorder_func[i], module[i],
		                          "PCAFeatureMapsAddNullableBorder");
		CUDA_CHECK(res, "cuModuleGetFunction(PCAFeatureMapsAddNullableBorder)");
	}

	NR_MAXTHREADS_X = (int*) malloc(device_num * sizeof(int));
	NR_MAXTHREADS_Y = (int*) malloc(device_num * sizeof(int));

	for (int i = 0; i < device_num; i++) {
		// get max thread num per block
		max_threads_num = 0;
		res = cuDeviceGetAttribute(&max_threads_num, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev[i]);
		CUDA_CHECK(res, "cuDeviceGetAttribute()");

		NR_MAXTHREADS_X[i] = (int) sqrt((double) max_threads_num);
		NR_MAXTHREADS_Y[i] = (int) sqrt((double) max_threads_num);
	}
}
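// The CUDA_CHECK macro used throughout init_cuda() is not shown in this
// excerpt; a minimal definition consistent with how it is called might look
// like the following (an assumption, not the project's actual macro;
// requires <stdio.h> and <stdlib.h>):
#define CUDA_CHECK(res, text)                                        \
	do {                                                             \
		if ((res) != CUDA_SUCCESS) {                                 \
			fprintf(stderr, "%s failed: res = %d\n", (text), (res)); \
			exit(1);                                                 \
		}                                                            \
	} while (0)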
// detect boundary boxes
FLOAT *dpm_ttic_gpu_get_boxes(FLOAT **features, FLOAT *scales, int *feature_size,
                              GPUModel *MO, int *detected_count, FLOAT *acc_score,
                              FLOAT thresh)
{
  // constant parameters
  const int max_scale = MO->MI->max_scale;
  const int interval = MO->MI->interval;
  const int sbin = MO->MI->sbin;
  const int padx = MO->MI->padx;
  const int pady = MO->MI->pady;
  const int NoR = MO->RF->NoR;
  const int NoP = MO->PF->NoP;
  const int NoC = MO->MI->numcomponent;
  const int *numpart = MO->MI->numpart;
  const int LofFeat = (max_scale+interval)*NoC;
  const int L_MAX = max_scale+interval;

  /* for measurement */
  struct timeval tv;
  struct timeval tv_make_c_start, tv_make_c_end;
  struct timeval tv_nucom_start, tv_nucom_end;
  struct timeval tv_box_start, tv_box_end;
  float time_box = 0;
  struct timeval tv_root_score_start, tv_root_score_end;
  float time_root_score = 0;
  struct timeval tv_part_score_start, tv_part_score_end;
  float time_part_score = 0;
  struct timeval tv_dt_start, tv_dt_end;
  float time_dt = 0;
  struct timeval tv_calc_a_score_start, tv_calc_a_score_end;
  float time_calc_a_score = 0;

  gettimeofday(&tv_make_c_start, nullptr);

  int **RF_size = MO->RF->root_size;
  int *rootsym = MO->RF->rootsym;
  int *part_sym = MO->PF->part_sym;
  int **part_size = MO->PF->part_size;
  FLOAT **rootfilter = MO->RF->rootfilter;
  FLOAT **partfilter = MO->PF->partfilter;
  int **psize = MO->MI->psize;

  int **rm_size_array = (int **)malloc(sizeof(int *)*L_MAX);
  int **pm_size_array = (int **)malloc(sizeof(int *)*L_MAX);

  FLOAT **Tboxes = (FLOAT**)calloc(LofFeat, sizeof(FLOAT*)); // box coordinate information (temp)
  int *b_nums = (int*)calloc(LofFeat, sizeof(int));          // lengths of Tboxes
  int count = 0;
  int detected_boxes = 0;
  CUresult res;

  /* matched score (root and part) */
  FLOAT ***rootmatch, ***partmatch = nullptr;

  int *new_PADsize; // needs new_PADsize[L_MAX*3]
  size_t SUM_SIZE_feat = 0;

  FLOAT **featp2 = (FLOAT **)malloc(L_MAX*sizeof(FLOAT *));
  if(featp2 == nullptr) { printf("allocate featp2 failed\n"); exit(1); }

  /* allocate required memory for new_PADsize */
  new_PADsize = (int *)malloc(L_MAX*3*sizeof(int));
  if(new_PADsize == nullptr) { printf("allocate new_PADsize failed\n"); exit(1); }

  /* do padarray once and reuse it when calculating root and part scores */
  /* calculate the total size of the padded features */
  for(int tmpL=0; tmpL<L_MAX; tmpL++) {
    int PADsize[3] = { feature_size[tmpL*2], feature_size[tmpL*2+1], 31 };
    int NEW_Y = PADsize[0] + pady*2;
    int NEW_X = PADsize[1] + padx*2;
    SUM_SIZE_feat += (NEW_X*NEW_Y*PADsize[2])*sizeof(FLOAT);
  }

  /* allocate region for padded feat in a lump */
  FLOAT *dst_feat;
  res = cuMemHostAlloc((void **)&dst_feat, SUM_SIZE_feat, CU_MEMHOSTALLOC_DEVICEMAP);
  if(res != CUDA_SUCCESS) { printf("cuMemHostAlloc(dst_feat) failed: res = %s\n", cuda_response_to_string(res)); exit(1); }
  memset(dst_feat, 0, SUM_SIZE_feat); // zero clear

  /* distribute allocated region */
  uintptr_t pointer_feat = (uintptr_t)dst_feat;
  for(int tmpL=0; tmpL<L_MAX; tmpL++) {
    featp2[tmpL] = (FLOAT *)pointer_feat;
    int PADsize[3] = { feature_size[tmpL*2], feature_size[tmpL*2+1], 31 };
    int NEW_Y = PADsize[0] + pady*2;
    int NEW_X = PADsize[1] + padx*2;
    pointer_feat += (uintptr_t)(NEW_X*NEW_Y*PADsize[2]*sizeof(FLOAT));
  }

  /* copy feat to featp2 */
  for(int tmpL=0; tmpL<L_MAX; tmpL++) {
    int PADsize[3] = { feature_size[tmpL*2], feature_size[tmpL*2+1], 31 };
    int NEW_Y = PADsize[0] + pady*2;
    int NEW_X = PADsize[1] + padx*2;
    int L = NEW_Y*padx;
    int SPL = PADsize[0] + pady;
    int M_S = sizeof(FLOAT)*PADsize[0];
    FLOAT *P = featp2[tmpL];
    FLOAT *S = features[tmpL];

    for(int i=0; i<PADsize[2]; i++) {
      P += L;
      for(int j=0; j<PADsize[1]; j++) {
        P += pady;
        memcpy(P, S, M_S);
        S += PADsize[0];
        P += SPL;
      }
      P += L;
    }

    new_PADsize[tmpL*3]     = NEW_Y;
    new_PADsize[tmpL*3 + 1] = NEW_X;
    new_PADsize[tmpL*3 + 2] = PADsize[2];
  }

  /* allocation in a lump */
  int *dst_rm_size = (int *)malloc(sizeof(int)*NoC*2*L_MAX);
  if(dst_rm_size == nullptr) { printf("allocate dst_rm_size failed\n"); exit(1); }

  /* distribution to rm_size_array[L_MAX] */
  uintptr_t ptr = (uintptr_t)dst_rm_size;
  for(int i=0; i<L_MAX; i++) {
    rm_size_array[i] = (int *)ptr;
    ptr += (uintptr_t)(NoC*2*sizeof(int));
  }

  /* allocation in a lump */
  int *dst_pm_size = (int *)malloc(sizeof(int)*NoP*2*L_MAX);
  if(dst_pm_size == nullptr) { printf("allocate dst_pm_size failed\n"); exit(1); }

  /* distribution to pm_size_array[L_MAX] */
  ptr = (uintptr_t)dst_pm_size;
  for(int i=0; i<L_MAX; i++) {
    pm_size_array[i] = (int *)ptr;
    ptr += (uintptr_t)(NoP*2*sizeof(int));
  }

  /////// level
  /* pre-mark levels that are too small to process */
  for (int level=interval; level<L_MAX; level++) {
    if(feature_size[level*2]+2*pady < MO->MI->max_Y || (feature_size[level*2+1]+2*padx < MO->MI->max_X)) {
      Tboxes[count] = nullptr;
      count++;
      continue;
    }
  } // for (level): feature's loop (A's loop), 1 level, 1 picture

  /////// root calculation ///////
  /* calculate model score (only root) */
  gettimeofday(&tv_root_score_start, nullptr);
  rootmatch = fconvsMT_GPU(featp2, SUM_SIZE_feat, rootfilter, rootsym, 1, NoR,
                           new_PADsize, RF_size, rm_size_array, L_MAX, interval,
                           feature_size, padx, pady, MO->MI->max_X, MO->MI->max_Y,
                           ROOT);
  gettimeofday(&tv_root_score_end, nullptr);
  tvsub(&tv_root_score_end, &tv_root_score_start, &tv);
  time_root_score += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;

  /////// part calculation ///////
  if(NoP > 0) {
    /* calculate model score (only part) */
    gettimeofday(&tv_part_score_start, nullptr);
    partmatch = fconvsMT_GPU(featp2, SUM_SIZE_feat, partfilter, part_sym, 1, NoP,
                             new_PADsize, part_size, pm_size_array, L_MAX, interval,
                             feature_size, padx, pady, MO->MI->max_X, MO->MI->max_Y,
                             PART);
    gettimeofday(&tv_part_score_end, nullptr);
    tvsub(&tv_part_score_end, &tv_part_score_start, &tv);
    time_part_score += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;
  }

  res = cuCtxSetCurrent(ctx[0]);
  if(res != CUDA_SUCCESS) { printf("cuCtxSetCurrent(ctx[0]) failed: res = %s\n", cuda_response_to_string(res)); exit(1); }

  gettimeofday(&tv_make_c_end, nullptr);
  gettimeofday(&tv_nucom_start, nullptr);

  count = 0;
  detected_boxes = 0;

  int **RL_array = (int **)malloc((L_MAX-interval)*sizeof(int*));
  int *dst_RL = (int *)malloc(NoC*(L_MAX-interval)*sizeof(int));
  int **RI_array = (int **)malloc((L_MAX-interval)*sizeof(int*));
  int *dst_RI = (int *)malloc(NoC*(L_MAX-interval)*sizeof(int));
  int **OI_array = (int **)malloc((L_MAX-interval)*sizeof(int*));
  int *dst_OI = (int *)malloc(NoC*(L_MAX-interval)*sizeof(int));
  int **RL_S_array = (int **)malloc((L_MAX-interval)*sizeof(int*));
  int *dst_RL_S = (int *)malloc(NoC*(L_MAX-interval)*sizeof(int));
  FLOAT **OFF_array = (FLOAT **)malloc((L_MAX-interval)*sizeof(FLOAT*));
  FLOAT *dst_OFF = (FLOAT *)malloc(NoC*(L_MAX-interval)*sizeof(FLOAT));
  FLOAT ***SCORE_array = (FLOAT ***)malloc((L_MAX-interval)*sizeof(FLOAT **));
  FLOAT **sub_dst_SCORE = (FLOAT **)malloc(NoC*(L_MAX-interval)*sizeof(FLOAT*));

  uintptr_t pointer_RL = (uintptr_t)dst_RL;
  uintptr_t pointer_RI = (uintptr_t)dst_RI;
  uintptr_t pointer_OI = (uintptr_t)dst_OI;
  uintptr_t pointer_RL_S = (uintptr_t)dst_RL_S;
  uintptr_t pointer_OFF = (uintptr_t)dst_OFF;
  uintptr_t pointer_SCORE = (uintptr_t)sub_dst_SCORE;

  for (int level=interval; level<L_MAX; level++) {
    int L = level-interval;
    RL_array[L] = (int *)pointer_RL;          pointer_RL += (uintptr_t)NoC*sizeof(int);
    RI_array[L] = (int *)pointer_RI;          pointer_RI += (uintptr_t)NoC*sizeof(int);
    OI_array[L] = (int *)pointer_OI;          pointer_OI += (uintptr_t)NoC*sizeof(int);
    RL_S_array[L] = (int *)pointer_RL_S;      pointer_RL_S += (uintptr_t)NoC*sizeof(int);
    OFF_array[L] = (FLOAT *)pointer_OFF;      pointer_OFF += (uintptr_t)NoC*sizeof(FLOAT);
    SCORE_array[L] = (FLOAT **)pointer_SCORE; pointer_SCORE += (uintptr_t)NoC*sizeof(FLOAT*);
  }

  int sum_RL_S = 0;
  int sum_SNJ = 0;

  /* prepare for parallel execution */
  for(int level=interval; level<L_MAX; level++) {
    int L = level - interval;
    if(feature_size[level*2]+2*pady < MO->MI->max_Y || (feature_size[level*2+1]+2*padx < MO->MI->max_X)) continue;

    for(int j=0; j<NoC; j++) {
      /* root score + offset */
      RL_array[L][j] = rm_size_array[level][j*2]*rm_size_array[level][j*2+1]; // length of root-matching
      RI_array[L][j] = MO->MI->ridx[j];                // root-index
      OI_array[L][j] = MO->MI->oidx[j];                // offset-index
      RL_S_array[L][j] = sizeof(FLOAT)*RL_array[L][j];
      OFF_array[L][j] = MO->MI->offw[RI_array[L][j]];  // offset information

      /* search max values */
      max_RL_S = (max_RL_S < RL_S_array[L][j]) ? RL_S_array[L][j] : max_RL_S;
      max_numpart = (max_numpart < numpart[j]) ? numpart[j] : max_numpart;
    }
  }

  sum_RL_S = max_RL_S*NoC*(L_MAX-interval); /* root matching size */
  sum_SNJ = sizeof(int*)*max_numpart*NoC*(L_MAX-interval);

  /* consolidated allocation for SCORE_array; distribute the region */
  FLOAT *dst_SCORE = (FLOAT *)malloc(sum_RL_S);
  pointer_SCORE = (uintptr_t)dst_SCORE;
  for(int level=interval; level<L_MAX; level++) {
    int L = level - interval;
    if(feature_size[level*2]+2*pady < MO->MI->max_Y || (feature_size[level*2+1]+2*padx < MO->MI->max_X)) continue;
    for(int j=0; j<NoC; j++) {
      SCORE_array[L][j] = (FLOAT *)pointer_SCORE;
      pointer_SCORE += (uintptr_t)max_RL_S;
    }
  }

  /* add offset */
  for(int level=interval; level<L_MAX; level++) {
    int L = level - interval;
    if(feature_size[level*2]+2*pady < MO->MI->max_Y || (feature_size[level*2+1]+2*padx < MO->MI->max_X)) continue;
    for(int j=0; j<NoC; j++) {
      memcpy(SCORE_array[L][j], rootmatch[level][j], RL_S_array[L][j]);
      FLOAT *SC_S = SCORE_array[L][j];
      FLOAT *SC_E = SCORE_array[L][j]+RL_array[L][j];
      while(SC_S < SC_E) *(SC_S++) += OFF_array[L][j];
    }
  }

  /* anchor matrix */ // consolidated allocation
  int ***ax_array = (int ***)malloc((L_MAX-interval)*sizeof(int **));
  int **sub_dst_ax = (int **)malloc(NoC*(L_MAX-interval)*sizeof(int *));
  int *dst_ax = (int *)malloc(sum_SNJ);
  int ***ay_array = (int ***)malloc((L_MAX-interval)*sizeof(int **));
  int **sub_dst_ay = (int **)malloc(NoC*(L_MAX-interval)*sizeof(int *));
  int *dst_ay = (int *)malloc(sum_SNJ);

  /* boundary index */ // consolidated allocation
  int ****Ix_array = (int ****)malloc((L_MAX-interval)*sizeof(int ***));
  int ***sub_dst_Ix = (int ***)malloc(NoC*(L_MAX-interval)*sizeof(int **));
  int **dst_Ix = (int **)malloc(sum_SNJ);
  int ****Iy_array = (int ****)malloc((L_MAX-interval)*sizeof(int ***));
  int ***sub_dst_Iy = (int ***)malloc(NoC*(L_MAX-interval)*sizeof(int **));
  int **dst_Iy = (int **)malloc(sum_SNJ);

  /* distribute region */
  uintptr_t pointer_ax = (uintptr_t)sub_dst_ax;
  uintptr_t pointer_ay = (uintptr_t)sub_dst_ay;
  uintptr_t pointer_Ix = (uintptr_t)sub_dst_Ix;
  uintptr_t pointer_Iy = (uintptr_t)sub_dst_Iy;
  for(int level=interval; level<L_MAX; level++) {
    int L = level - interval;
    if(feature_size[level*2]+2*pady < MO->MI->max_Y || (feature_size[level*2+1]+2*padx < MO->MI->max_X)) continue;
    ax_array[L] = (int **)pointer_ax;  pointer_ax += (uintptr_t)(NoC*sizeof(int*));
    ay_array[L] = (int **)pointer_ay;  pointer_ay += (uintptr_t)(NoC*sizeof(int*));
    Ix_array[L] = (int ***)pointer_Ix; pointer_Ix += (uintptr_t)(NoC*sizeof(int**));
    Iy_array[L] = (int ***)pointer_Iy; pointer_Iy += (uintptr_t)(NoC*sizeof(int**));
  }

  pointer_ax = (uintptr_t)dst_ax;
  pointer_ay = (uintptr_t)dst_ay;
  pointer_Ix = (uintptr_t)dst_Ix;
  pointer_Iy = (uintptr_t)dst_Iy;
  for(int level=interval; level<L_MAX; level++) {
    int L = level - interval;
    if(feature_size[level*2]+2*pady < MO->MI->max_Y || (feature_size[level*2+1]+2*padx < MO->MI->max_X)) continue;
    for(int j=0; j<NoC; j++) {
      uintptr_t pointer_offset = sizeof(int*)*max_numpart;
      ax_array[L][j] = (int *)pointer_ax;  pointer_ax += pointer_offset;
      ay_array[L][j] = (int *)pointer_ay;  pointer_ay += pointer_offset;
      Ix_array[L][j] = (int **)pointer_Ix; pointer_Ix += pointer_offset;
      Iy_array[L][j] = (int **)pointer_Iy; pointer_Iy += pointer_offset;
    }
  }

  /* add parts */
  if(NoP > 0) {
    /* arrays to store temporary loop variables */
    int tmp_array_size = 0;
    for(int level=interval; level<L_MAX; level++) {
      if(feature_size[level*2]+2*pady < MO->MI->max_Y || (feature_size[level*2+1]+2*padx < MO->MI->max_X)) continue;
      for(int j=0; j<NoC; j++) tmp_array_size += max_numpart*sizeof(int);
    }

    int ***DIDX_array = (int ***)malloc((L_MAX-interval)*sizeof(int**));
    int **sub_dst_DIDX = (int **)malloc(NoC*(L_MAX-interval)*sizeof(int*));
    int *dst_DIDX = (int *)malloc(tmp_array_size);

    int ***DID_4_array = (int ***)malloc((L_MAX-interval)*sizeof(int **));
    int **sub_dst_DID_4 = (int **)malloc(NoC*(L_MAX-interval)*sizeof(int*));
    int *dst_DID_4;
    res = cuMemHostAlloc((void **)&dst_DID_4, tmp_array_size, CU_MEMHOSTALLOC_DEVICEMAP);
    if(res != CUDA_SUCCESS) { printf("cuMemHostAlloc(dst_DID_4) failed: res = %s\n", cuda_response_to_string(res)); exit(1); }

    int ***PIDX_array = (int ***)malloc((L_MAX-interval)*sizeof(int **));
    int **sub_dst_PIDX = (int **)malloc(NoC*(L_MAX-interval)*sizeof(int*));
    int *dst_PIDX;
    res = cuMemHostAlloc((void **)&dst_PIDX, tmp_array_size, CU_MEMHOSTALLOC_DEVICEMAP);
    if(res != CUDA_SUCCESS) { printf("cuMemHostAlloc(dst_PIDX) failed: res = %s\n", cuda_response_to_string(res)); exit(1); }

    /* distribute consolidated region */
    uintptr_t pointer_DIDX = (uintptr_t)sub_dst_DIDX;
    uintptr_t pointer_DID_4 = (uintptr_t)sub_dst_DID_4;
    uintptr_t pointer_PIDX = (uintptr_t)sub_dst_PIDX;
    for(int level=interval; level<L_MAX; level++) {
      int L = level - interval;
      if(feature_size[level*2]+2*pady < MO->MI->max_Y || (feature_size[level*2+1]+2*padx < MO->MI->max_X)) continue;
      DIDX_array[L] = (int **)pointer_DIDX;   pointer_DIDX += (uintptr_t)(NoC*sizeof(int*));
      DID_4_array[L] = (int **)pointer_DID_4; pointer_DID_4 += (uintptr_t)(NoC*sizeof(int*));
      PIDX_array[L] = (int **)pointer_PIDX;   pointer_PIDX += (uintptr_t)(NoC*sizeof(int*));
    }

    pointer_DIDX = (uintptr_t)dst_DIDX;
    pointer_DID_4 = (uintptr_t)dst_DID_4;
    pointer_PIDX = (uintptr_t)dst_PIDX;
    for(int level=interval; level<L_MAX; level++) {
      int L = level - interval;
      if(feature_size[level*2]+2*pady < MO->MI->max_Y || (feature_size[level*2+1]+2*padx < MO->MI->max_X)) continue;
      for(int j=0; j<NoC; j++) {
        uintptr_t pointer_offset = (uintptr_t)(max_numpart*sizeof(int));
        DIDX_array[L][j] = (int *)pointer_DIDX;   pointer_DIDX += pointer_offset;
        DID_4_array[L][j] = (int *)pointer_DID_4; pointer_DID_4 += pointer_offset;
        PIDX_array[L][j] = (int *)pointer_PIDX;   pointer_PIDX += pointer_offset;
      }
    }

    /* prepare for parallel execution */
    int sum_size_index_matrix = 0;
    for(int level=interval; level<L_MAX; level++) {
      int L = level - interval;
      if(feature_size[level*2]+2*pady < MO->MI->max_Y || (feature_size[level*2+1]+2*padx < MO->MI->max_X)) continue;
      for(int j=0; j<NoC; j++) {
        for (int k=0; k<numpart[j]; k++) {
          /* assign values to each element */
          DIDX_array[L][j][k] = MO->MI->didx[j][k];
          DID_4_array[L][j][k] = DIDX_array[L][j][k]*4;
          PIDX_array[L][j][k] = MO->MI->pidx[j][k];
          /* anchor */
          ax_array[L][j][k] = MO->MI->anchor[DIDX_array[L][j][k]*2]+1;
          ay_array[L][j][k] = MO->MI->anchor[DIDX_array[L][j][k]*2+1]+1;

          int PSSIZE[2] = { pm_size_array[L][PIDX_array[L][j][k]*2],
                            pm_size_array[L][PIDX_array[L][j][k]*2+1] }; // size of C
          /* index matrix */
          sum_size_index_matrix += sizeof(int)*PSSIZE[0]*PSSIZE[1];
        }
      }
    }

    int *dst_Ix_kk = (int *)malloc(sum_size_index_matrix);
    int *dst_Iy_kk = (int *)malloc(sum_size_index_matrix);
    uintptr_t pointer_Ix_kk = (uintptr_t)dst_Ix_kk;
    uintptr_t pointer_Iy_kk = (uintptr_t)dst_Iy_kk;
    for(int level=interval; level<L_MAX; level++) {
      int L = level - interval;
      if(feature_size[level*2]+2*pady < MO->MI->max_Y || (feature_size[level*2+1]+2*padx < MO->MI->max_X)) continue;
      for(int j=0; j<NoC; j++) {
        for (int k=0; k<numpart[j]; k++) {
          int PSSIZE[2] = { pm_size_array[L][PIDX_array[L][j][k]*2],
                            pm_size_array[L][PIDX_array[L][j][k]*2+1] }; // size of C
          Ix_array[L][j][k] = (int *)pointer_Ix_kk;
          Iy_array[L][j][k] = (int *)pointer_Iy_kk;
          pointer_Ix_kk += (uintptr_t)(sizeof(int)*PSSIZE[0]*PSSIZE[1]);
          pointer_Iy_kk += (uintptr_t)(sizeof(int)*PSSIZE[0]*PSSIZE[1]);
        }
      }
    }

    gettimeofday(&tv_dt_start, nullptr);
    FLOAT ****M_array = dt_GPU(
      Ix_array,       // int ****Ix_array
      Iy_array,       // int ****Iy_array
      PIDX_array,     // int ***PIDX_array
      pm_size_array,  // int **size_array
      NoP,            // int NoP
      numpart,        // int *numpart
      NoC,            // int NoC
      interval,       // int interval
      L_MAX,          // int L_MAX
      feature_size,   // int *feature_size
      padx,           // int padx
      pady,           // int pady
      MO->MI->max_X,  // int max_X
      MO->MI->max_Y,  // int max_Y
      MO->MI->def,    // FLOAT *def
      tmp_array_size, // int tmp_array_size
      dst_PIDX,       // int *dst_PIDX
      dst_DID_4       // int *DID_4
    );
    gettimeofday(&tv_dt_end, nullptr);
    tvsub(&tv_dt_end, &tv_dt_start, &tv);
    time_dt += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;

    /* add part score */
    for(int level=interval; level<L_MAX; level++) {
      int L = level - interval;
      if(feature_size[level*2]+2*pady < MO->MI->max_Y || (feature_size[level*2+1]+2*padx < MO->MI->max_X)) continue;
      for(int j=0; j<NoC; j++) {
        for(int k=0; k<numpart[j]; k++) {
          int PSSIZE[2] = { pm_size_array[L][PIDX_array[L][j][k]*2],
                            pm_size_array[L][PIDX_array[L][j][k]*2+1] }; // size of C
          int R_S[2] = { rm_size_array[level][j*2], rm_size_array[level][j*2+1] };
          dpm_ttic_add_part_calculation(SCORE_array[L][j], M_array[L][j][k],
                                        R_S, PSSIZE,
                                        ax_array[L][j][k], ay_array[L][j][k]);
        }
      }
    }

    s_free(M_array[0][0][0]);
    s_free(M_array[0][0]);
    s_free(M_array[0]);
    s_free(M_array);

    /* free temporary arrays */
    free(dst_DIDX);
    free(sub_dst_DIDX);
    free(DIDX_array);

    res = cuMemFreeHost(dst_DID_4);
    if(res != CUDA_SUCCESS) { printf("cuMemFreeHost(dst_DID_4) failed: res = %s\n", cuda_response_to_string(res)); exit(1); }
    free(sub_dst_DID_4);
    free(DID_4_array);

    res = cuMemFreeHost(dst_PIDX);
    if(res != CUDA_SUCCESS) { printf("cuMemFreeHost(dst_PIDX) failed: res = %s\n", cuda_response_to_string(res)); exit(1); }
    free(sub_dst_PIDX);
    free(PIDX_array);

    res = cuCtxSetCurrent(ctx[0]);
    if(res != CUDA_SUCCESS) { printf("cuCtxSetCurrent(ctx[0]) failed: res = %s\n", cuda_response_to_string(res)); exit(1); }
  } // end of if(NoP>0)

  /* combine root and part scores and detect boundary boxes for each component */
  FLOAT *scale_array = (FLOAT *)malloc((L_MAX-interval)*sizeof(FLOAT));
  for(int level=interval; level<L_MAX; level++) {
    int L = level - interval;
    if(feature_size[level*2]+2*pady < MO->MI->max_Y || (feature_size[level*2+1]+2*padx < MO->MI->max_X)) {
      Tboxes[count] = nullptr;
      count++;
      continue;
    }
    scale_array[L] = (FLOAT)sbin/scales[level];
  }

  for (int level=interval; level<L_MAX; level++) { // feature's loop (A's loop), 1 level, 1 picture
    /* parameters (related to level) */
    int L = level-interval;

    /* matched score size matrix */
    FLOAT scale = (FLOAT)sbin/scales[level];

    /* loop condition */
    if(feature_size[level*2]+2*pady < MO->MI->max_Y || (feature_size[level*2+1]+2*padx < MO->MI->max_X)) {
      Tboxes[count] = nullptr;
      count++;
      continue;
    }

    /* calculate accumulated score */
    gettimeofday(&tv_calc_a_score_start, nullptr);
    calc_a_score_GPU(
      acc_score,            // FLOAT *ac_score
      SCORE_array[L],       // FLOAT **score
      rm_size_array[level], // int *ssize_start
      MO->MI,               // Model_info *MI
      scale,                // FLOAT scale
      RL_S_array[L],        // int *size_score_array
      NoC                   // int NoC
    );
    gettimeofday(&tv_calc_a_score_end, nullptr);
    tvsub(&tv_calc_a_score_end, &tv_calc_a_score_start, &tv);
    time_calc_a_score += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;

    for(int j = 0; j < NoC; j++) {
      int R_S[2] = { rm_size_array[level][j*2], rm_size_array[level][j*2+1] };

      /* get all good matches */
      int GMN;
      int *GMPC = get_gmpc(SCORE_array[L][j], thresh, R_S, &GMN);
      int RSIZE[2] = { MO->MI->rsize[j*2], MO->MI->rsize[j*2+1] };
      int GL = (numpart[j]+1)*4+3; // 31

      /* detected box coordinates (current level) */
      FLOAT *t_boxes = (FLOAT*)calloc(GMN*GL, sizeof(FLOAT));

      gettimeofday(&tv_box_start, nullptr);
      // NO NEED TO USE GPU
      for(int k = 0; k < GMN; k++) {
        FLOAT *P_temp = t_boxes+GL*k;
        int y = GMPC[2*k];
        int x = GMPC[2*k+1];

        /* calculate root box coordinates */
        FLOAT *RB = rootbox(x, y, scale, padx, pady, RSIZE);
        memcpy(P_temp, RB, sizeof(FLOAT)*4);
        s_free(RB);
        P_temp += 4;

        for(int pp=0; pp<numpart[j]; pp++) {
          int PBSIZE[2] = { psize[j][pp*2], psize[j][pp*2+1] };
          int Isize[2] = { pm_size_array[L][MO->MI->pidx[j][pp]*2],
                           pm_size_array[L][MO->MI->pidx[j][pp]*2+1] };

          /* calculate part box coordinates */
          FLOAT *PB = partbox(x, y, ax_array[L][j][pp], ay_array[L][j][pp],
                              scale, padx, pady, PBSIZE,
                              Ix_array[L][j][pp], Iy_array[L][j][pp], Isize);
          memcpy(P_temp, PB, sizeof(FLOAT)*4);
          P_temp += 4;
          s_free(PB);
        }

        /* component number and score */
        *(P_temp++) = (FLOAT)j;                      // component number
        *(P_temp++) = SCORE_array[L][j][x*R_S[0]+y]; // score of the good match
        *P_temp = scale;
      }
      // NO NEED TO USE GPU
      gettimeofday(&tv_box_end, nullptr);
      tvsub(&tv_box_end, &tv_box_start, &tv);
      time_box += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;

      /* save box information */
      if (GMN > 0) Tboxes[count] = t_boxes;
      else Tboxes[count] = nullptr;
      b_nums[count] = GMN;
      count++;
      detected_boxes += GMN; // number of detected boxes

      /* release */
      s_free(GMPC);
    } //// numcom
  } //// level

  /* free temporary arrays */
  free(dst_RL); free(RL_array);
  free(dst_RI); free(RI_array);
  free(dst_OI); free(OI_array);
  free(dst_RL_S); free(RL_S_array);
  free(dst_OFF); free(OFF_array);
  free(dst_SCORE); free(sub_dst_SCORE); free(SCORE_array);
  free(dst_ax); free(sub_dst_ax); free(ax_array);
  free(dst_ay); free(sub_dst_ay); free(ay_array);
  free(Ix_array[0][0][0]); free(dst_Ix); free(sub_dst_Ix); free(Ix_array);
  free(Iy_array[0][0][0]); free(dst_Iy); free(sub_dst_Iy); free(Iy_array);
  free(scale_array);

  gettimeofday(&tv_nucom_end, nullptr);

#ifdef PRINT_INFO
  printf("root SCORE : %f\n", time_root_score);
  printf("part SCORE : %f\n", time_part_score);
  printf("dt : %f\n", time_dt);
  printf("calc_a_score : %f\n", time_calc_a_score);
#endif

  res = cuCtxSetCurrent(ctx[0]);
  if(res != CUDA_SUCCESS) { printf("cuCtxSetCurrent(ctx[0]) failed: res = %s\n", cuda_response_to_string(res)); exit(1); }

  /* free memory regions */
  res = cuMemFreeHost((void *)featp2[0]);
  if(res != CUDA_SUCCESS) { printf("cuMemFreeHost(featp2[0]) failed: res = %s\n", cuda_response_to_string(res)); exit(1); }
  s_free(featp2);

  res = cuMemFreeHost((void *)rootmatch[interval][0]);
  if(res != CUDA_SUCCESS) { printf("cuMemFreeHost(rootmatch[0][0]) failed: res = %s\n", cuda_response_to_string(res)); exit(1); }
  s_free(rootmatch[0]);
  s_free(rootmatch);

  if (partmatch != nullptr) {
    res = cuMemFreeHost((void *)partmatch[0][0]);
    if(res != CUDA_SUCCESS) { printf("cuMemFreeHost(partmatch[0][0]) failed: res = %s\n", cuda_response_to_string(res)); exit(1); }
    s_free(partmatch[0]);
    s_free(partmatch);
    s_free(new_PADsize);
  }

  /* release */
  s_free(rm_size_array[0]);
  s_free(rm_size_array);
  s_free(pm_size_array[0]);
  s_free(pm_size_array);

  /* output boundary-box coordinate information */
  int GL = (numpart[0]+1)*4+3;
  FLOAT *boxes = (FLOAT*)calloc(detected_boxes*GL, sizeof(FLOAT)); // box coordinate information (temp)
  FLOAT *T1 = boxes;

  for(int i = 0; i < LofFeat; i++) {
    int num_t = b_nums[i]*GL;
    if(num_t > 0) {
      FLOAT *T2 = Tboxes[i];
      //memcpy_s(T1, sizeof(FLOAT)*num_t, T2, sizeof(FLOAT)*num_t);
      memcpy(T1, T2, sizeof(FLOAT)*num_t);
      T1 += num_t;
    }
  }

  FLOAT abs_threshold = abs(thresh);

  /* accumulated score calculation */
  FLOAT max_score = 0.0;

  /* add offset to accumulated score */
  for(int i = 0; i < MO->MI->IM_HEIGHT*MO->MI->IM_WIDTH; i++) {
    if (acc_score[i] < thresh) {
      acc_score[i] = 0.0;
    } else {
      acc_score[i] += abs_threshold;
      if (acc_score[i] > max_score) max_score = acc_score[i];
    }
  }

  /* normalization */
  if (max_score > 0.0) {
    FLOAT ac_ratio = 1.0 / max_score;
    for (int i = 0; i < MO->MI->IM_HEIGHT*MO->MI->IM_WIDTH; i++) {
      acc_score[i] *= ac_ratio;
    }
  }

  /* release */
  free_boxes(Tboxes, LofFeat);
  s_free(b_nums);

  /* output result */
  *detected_count = detected_boxes;
  return boxes;
}
value spoc_getCudaDevice(value i) { CAMLparam1(i); CAMLlocal4(general_info, cuda_info, specific_info, gc_info); CAMLlocal3(device, maxT, maxG); int nb_devices; CUdevprop dev_infos; CUdevice dev; CUcontext ctx; CUstream queue[2]; spoc_cu_context *spoc_ctx; //CUcontext gl_ctx; char infoStr[1024]; int infoInt; size_t infoUInt; int major, minor; enum cudaError_enum cuda_error; cuDeviceGetCount (&nb_devices); if ((Int_val(i)) >= nb_devices) raise_constant(*caml_named_value("no_cuda_device")); CUDA_CHECK_CALL(cuDeviceGet(&dev, Int_val(i))); CUDA_CHECK_CALL(cuDeviceGetProperties(&dev_infos, dev)); general_info = caml_alloc (9, 0); CUDA_CHECK_CALL(cuDeviceGetName(infoStr, sizeof(infoStr), dev)); Store_field(general_info,0, copy_string(infoStr));// CUDA_CHECK_CALL(cuDeviceTotalMem(&infoUInt, dev)); Store_field(general_info,1, Val_int(infoUInt));// Store_field(general_info,2, Val_int(dev_infos.sharedMemPerBlock));// Store_field(general_info,3, Val_int(dev_infos.clockRate));// Store_field(general_info,4, Val_int(dev_infos.totalConstantMemory));// CUDA_CHECK_CALL(cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev)); Store_field(general_info,5, Val_int(infoInt));// CUDA_CHECK_CALL(cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, dev)); Store_field(general_info,6, Val_bool(infoInt));// Store_field(general_info,7, i); CUDA_CHECK_CALL(cuCtxCreate (&ctx, CU_CTX_SCHED_BLOCKING_SYNC | CU_CTX_MAP_HOST, dev)); spoc_ctx = malloc(sizeof(spoc_cu_context)); spoc_ctx->ctx = ctx; CUDA_CHECK_CALL(cuStreamCreate(&queue[0], 0)); CUDA_CHECK_CALL(cuStreamCreate(&queue[1], 0)); spoc_ctx->queue[0] = queue[0]; spoc_ctx->queue[1] = queue[1]; Store_field(general_info,8, (value)spoc_ctx); CUDA_CHECK_CALL(cuCtxSetCurrent(ctx)); cuda_info = caml_alloc(1, 0); //0 -> Cuda specific_info = caml_alloc(18, 0); cuDeviceComputeCapability(&major, &minor, dev); Store_field(specific_info,0, Val_int(major));// Store_field(specific_info,1, Val_int(minor));// Store_field(specific_info,2, Val_int(dev_infos.regsPerBlock));// Store_field(specific_info,3, Val_int(dev_infos.SIMDWidth));// Store_field(specific_info,4, Val_int(dev_infos.memPitch));// Store_field(specific_info,5, Val_int(dev_infos.maxThreadsPerBlock));// maxT = caml_alloc(3, 0); Store_field(maxT,0, Val_int(dev_infos.maxThreadsDim[0]));// Store_field(maxT,1, Val_int(dev_infos.maxThreadsDim[1]));// Store_field(maxT,2, Val_int(dev_infos.maxThreadsDim[2]));// Store_field(specific_info,6, maxT); maxG = caml_alloc(3, 0); Store_field(maxG,0, Val_int(dev_infos.maxGridSize[0]));// Store_field(maxG,1, Val_int(dev_infos.maxGridSize[1]));// Store_field(maxG,2, Val_int(dev_infos.maxGridSize[2]));// Store_field(specific_info,7, maxG); Store_field(specific_info,8, Val_int(dev_infos.textureAlign));// cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev); Store_field(specific_info,9, Val_bool(infoInt));// cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, dev); Store_field(specific_info,10, Val_bool(infoInt));// cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev); Store_field(specific_info,11, Val_bool(infoInt));// cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev); Store_field(specific_info,12, Val_bool(infoInt));// cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev); Store_field(specific_info,13, Val_int(infoInt));// cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev); Store_field(specific_info,14, Val_bool(infoInt));//
cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, dev); Store_field(specific_info,15, Val_int(infoInt)); cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, dev); Store_field(specific_info,16, Val_int(infoInt)); cuDriverGetVersion(&infoInt); Store_field(specific_info, 17, Val_int(infoInt)); Store_field(cuda_info, 0, specific_info); device = caml_alloc(4, 0); Store_field(device, 0, general_info); Store_field(device, 1, cuda_info); {spoc_cuda_gc_info* gcInfo = (spoc_cuda_gc_info*)malloc(sizeof(spoc_cuda_gc_info)); CUDA_CHECK_CALL(cuMemGetInfo(&infoUInt, NULL)); infoUInt -= (32*1024*1024); Store_field(device, 2, (value)gcInfo); {cuda_event_list* events = NULL; Store_field(device, 3, (value)events); CAMLreturn(device);}} }
/* * Create a VampirTrace CUPTI context. If the CUDA context is not given, the * current context will be requested and used. * * @param cuCtx CUDA context * @param cuDev CUDA device * @param cuCtxID ID of the CUDA context * @param cuDevID ID of the CUDA device * * @return pointer to the created VampirTrace CUPTI context */ vt_cupti_ctx_t* vt_cupti_createCtx(CUcontext cuCtx, CUdevice cuDev, uint32_t cuCtxID, uint32_t cuDevID) { vt_cupti_ctx_t* vtCtx = NULL; /* create new context */ vtCtx = (vt_cupti_ctx_t *)malloc(sizeof(vt_cupti_ctx_t)); if(vtCtx == NULL) vt_error_msg("[CUPTI] Could not allocate memory for VT CUPTI context!"); vtCtx->ctxID = cuCtxID; #if (defined(VT_CUPTI_ACTIVITY) || defined(VT_CUPTI_CALLBACKS)) vtCtx->gpuMemAllocated = 0; vtCtx->gpuMemList = NULL; vtCtx->strmList = NULL; #endif vtCtx->next = NULL; VT_CHECK_THREAD; vtCtx->ptid = VT_MY_THREAD; /* try to get the CUDA device (ID), if they are not given */ if(cuDevID == VT_CUPTI_NO_DEVICE_ID){ if(cuDev == VT_CUPTI_NO_CUDA_DEVICE){ CUcontext cuCurrCtx; if(cuCtx != NULL){ cuCtxGetCurrent(&cuCurrCtx); /* if the given context does not match the current one, get the device for the given one */ if(cuCtx != cuCurrCtx) VT_CUDRV_CALL(cuCtxSetCurrent(cuCtx), NULL); } if(CUDA_SUCCESS == cuCtxGetDevice(&cuDev)) cuDevID = (uint32_t)cuDev; /* reset the active context */ if(cuCtx != NULL && cuCtx != cuCurrCtx) VT_CUDRV_CALL(cuCtxSetCurrent(cuCurrCtx), NULL); }else{ /* no device ID, but a CUDA device is given */ cuDevID = (uint32_t)cuDev; } } vtCtx->devID = cuDevID; vtCtx->cuDev = cuDev; /* get the current CUDA context, if it is not given */ if(cuCtx == NULL) VT_CUDRV_CALL(cuCtxGetCurrent(&cuCtx), NULL); /* set the CUDA context */ vtCtx->cuCtx = cuCtx; #if defined(VT_CUPTI_ACTIVITY) vtCtx->activity = NULL; #endif #if defined(VT_CUPTI_CALLBACKS) vtCtx->callbacks = NULL; #endif #if defined(VT_CUPTI_EVENTS) vtCtx->events = NULL; #endif vt_cntl_msg(2, "[CUPTI] Created context for CUcontext %p, CUdevice %d", cuCtx, cuDev); return vtCtx; }
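/* vt_cupti_createCtx above relies on a save/switch/restore idiom: it makes a
 * foreign context current just long enough to call cuCtxGetDevice(), then
 * restores the caller's context. The idiom in isolation (a sketch with plain
 * result checks instead of VT_CUDRV_CALL): */
#include <cuda.h>

/* Query the device behind `target` without leaving it current. */
static CUresult device_of_context(CUcontext target, CUdevice *dev)
{
    CUcontext saved = NULL;
    CUresult res;

    cuCtxGetCurrent(&saved);            /* remember the caller's context */
    if (target != saved) {
        res = cuCtxSetCurrent(target);  /* temporarily switch */
        if (res != CUDA_SUCCESS)
            return res;
    }
    res = cuCtxGetDevice(dev);          /* valid while `target` is current */
    if (target != saved)
        cuCtxSetCurrent(saved);         /* restore even if the query failed */
    return res;
}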
static void vq_handle_output(VirtIODevice *vdev, VirtQueue *vq) { VirtQueueElement elem; while(virtqueue_pop(vq, &elem)) { struct param *p = elem.out_sg[0].iov_base; //for all library routines: get the required arguments from the buffer, execute, and push the results back into the virtqueue switch (p->syscall_type) { case CUINIT: { p->result = cuInit(p->flags); break; } case CUDRIVERGETVERSION: { p->result = cuDriverGetVersion(&p->val1); break; } case CUDEVICEGETCOUNT: { p->result = cuDeviceGetCount(&p->val1); break; } case CUDEVICEGET: { p->result = cuDeviceGet(&p->device, p->val1); break; } case CUDEVICECOMPUTECAPABILITY: { p->result = cuDeviceComputeCapability(&p->val1, &p->val2, p->device); break; } case CUDEVICEGETNAME: { p->result = cuDeviceGetName(elem.in_sg[0].iov_base, p->val1, p->device); break; } case CUDEVICEGETATTRIBUTE: { p->result = cuDeviceGetAttribute(&p->val1, p->attrib, p->device); break; } case CUCTXCREATE: { p->result = cuCtxCreate(&p->ctx, p->flags, p->device); break; } case CUCTXDESTROY: { p->result = cuCtxDestroy(p->ctx); break; } case CUCTXGETCURRENT: { p->result = cuCtxGetCurrent(&p->ctx); break; } case CUCTXGETDEVICE: { p->result = cuCtxGetDevice(&p->device); break; } case CUCTXPOPCURRENT: { p->result = cuCtxPopCurrent(&p->ctx); break; } case CUCTXSETCURRENT: { p->result = cuCtxSetCurrent(p->ctx); break; } case CUCTXSYNCHRONIZE: { p->result = cuCtxSynchronize(); break; } case CUMODULELOAD: { //hardcoded path - needs improvement //all .cubin files should be stored in $QEMU_NFS_PATH - currently $QEMU_NFS_PATH is shared between host and guest with NFS char *binname = malloc(strlen((char *)elem.out_sg[1].iov_base)+strlen(getenv("QEMU_NFS_PATH"))+1); if (!binname) { p->result = 999; break; } strcpy(binname, getenv("QEMU_NFS_PATH")); strcat(binname, (char *)elem.out_sg[1].iov_base); //change current CUDA context //each CUDA context has its own virtual memory space - isolation is ensured by switching contexts if (cuCtxSetCurrent(p->ctx) != 0) { p->result = 999; free(binname); break; } p->result = cuModuleLoad(&p->module, binname); free(binname); break; } case CUMODULEGETGLOBAL: { char *name = malloc(100*sizeof(char)); if (!name) { p->result = 999; break; } strncpy(name, (char *)elem.out_sg[1].iov_base, 99); name[99] = '\0'; p->result = cuModuleGetGlobal(&p->dptr,&p->size1,p->module,(const char *)name); free(name); break; } case CUMODULEUNLOAD: { p->result = cuModuleUnload(p->module); break; } case CUMEMALLOC: { if (cuCtxSetCurrent(p->ctx) != 0) { p->result = 999; break; } p->result = cuMemAlloc(&p->dptr, p->bytesize); break; } case CUMEMALLOCPITCH: { if (cuCtxSetCurrent(p->ctx) != 0) { p->result = 999; break; } p->result = cuMemAllocPitch(&p->dptr, &p->size3, p->size1, p->size2, p->bytesize); break; } //large buffers are allocated in smaller chunks in guest kernel space //gets each chunk separately and copies it to device memory case CUMEMCPYHTOD: { int i; size_t offset; unsigned long s, nr_pages = p->nr_pages; if (cuCtxSetCurrent(p->ctx) != 0) { p->result = 999; break; } offset = 0; for (i=0; i<nr_pages; i++) { s = *(long *)elem.out_sg[1+2*i+1].iov_base; p->result = cuMemcpyHtoD(p->dptr+offset, elem.out_sg[1+2*i].iov_base, s); if (p->result != 0) break; offset += s; } break; } case CUMEMCPYHTODASYNC: { int i; size_t offset; unsigned long s, nr_pages = p->nr_pages; if (cuCtxSetCurrent(p->ctx) != 0) { p->result = 999; break; } offset = 0; for (i=0; i<nr_pages; i++) { s = *(long *)elem.out_sg[1+2*i+1].iov_base; p->result = cuMemcpyHtoDAsync(p->dptr+offset, elem.out_sg[1+2*i].iov_base, s,
p->stream); if (p->result != 0) break; offset += s; } break; } case CUMEMCPYDTODASYNC: { p->result = cuMemcpyDtoDAsync(p->dptr, p->dptr1, p->size1, p->stream); break; } case CUMEMCPYDTOH: { int i; unsigned long s, nr_pages = p->nr_pages; if (cuCtxSetCurrent(p->ctx) != 0) { p->result = 999; break; } size_t offset = 0; for (i=0; i<nr_pages; i++) { s = *(long *)elem.in_sg[0+2*i+1].iov_base; p->result = cuMemcpyDtoH(elem.in_sg[0+2*i].iov_base, p->dptr+offset, s); if (p->result != 0) break; offset += s; } break; } case CUMEMCPYDTOHASYNC: { int i; unsigned long s, nr_pages = p->nr_pages; if (cuCtxSetCurrent(p->ctx) != 0) { p->result = 999; break; } size_t offset = 0; for (i=0; i<nr_pages; i++) { s = *(long *)elem.in_sg[0+2*i+1].iov_base; p->result = cuMemcpyDtoHAsync(elem.in_sg[0+2*i].iov_base, p->dptr+offset, s, p->stream); if (p->result != 0) break; offset += s; } break; } case CUMEMSETD32: { p->result = cuMemsetD32(p->dptr, p->bytecount, p->bytesize); break; } case CUMEMFREE: { p->result = cuMemFree(p->dptr); break; } case CUMODULEGETFUNCTION: { char *name = (char *)elem.out_sg[1].iov_base; name[p->length] = '\0'; if (cuCtxSetCurrent(p->ctx) != 0) { p->result = 999; break; } p->result = cuModuleGetFunction(&p->function, p->module, name); break; } case CULAUNCHKERNEL: { void **args = malloc(p->val1*sizeof(void *)); if (!args) { p->result = 9999; break; } int i; for (i=0; i<p->val1; i++) { args[i] = elem.out_sg[1+i].iov_base; } if (cuCtxSetCurrent(p->ctx) != 0) { p->result = 999; break; } p->result = cuLaunchKernel(p->function, p->gridDimX, p->gridDimY, p->gridDimZ, p->blockDimX, p->blockDimY, p->blockDimZ, p->bytecount, 0, args, 0); free(args); break; } case CUEVENTCREATE: { p->result = cuEventCreate(&p->event1, p->flags); break; } case CUEVENTDESTROY: { p->result = cuEventDestroy(p->event1); break; } case CUEVENTRECORD: { p->result = cuEventRecord(p->event1, p->stream); break; } case CUEVENTSYNCHRONIZE: { p->result = cuEventSynchronize(p->event1); break; } case CUEVENTELAPSEDTIME: { p->result = cuEventElapsedTime(&p->pMilliseconds, p->event1, p->event2); break; } case CUSTREAMCREATE: { p->result = cuStreamCreate(&p->stream, 0); break; } case CUSTREAMSYNCHRONIZE: { p->result = cuStreamSynchronize(p->stream); break; } case CUSTREAMQUERY: { p->result = cuStreamQuery(p->stream); break; } case CUSTREAMDESTROY: { p->result = cuStreamDestroy(p->stream); break; } default: printf("Unknown syscall_type\n"); } virtqueue_push(vq, &elem, 0); } //notify frontend - trigger virtual interrupt virtio_notify(vdev, vq); return; }
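/* Almost every memory and launch case in vq_handle_output above starts by
 * switching to the requesting guest's context, since each guest application
 * gets a private CUcontext (and thus a private GPU address space) on the
 * host. A hypothetical helper that factors out that repeated prologue while
 * keeping the handler's in-band 999 error code: */
static int switch_to_guest_ctx(struct param *p)
{
    /* make the guest's context current before touching its allocations */
    if (cuCtxSetCurrent(p->ctx) != CUDA_SUCCESS) {
        p->result = 999;  /* the handler's "context switch failed" convention */
        return -1;
    }
    return 0;
}
/* e.g. in a case body:
 *   case CUMEMALLOC: {
 *       if (switch_to_guest_ctx(p)) break;
 *       p->result = cuMemAlloc(&p->dptr, p->bytesize);
 *       break;
 *   }
 */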