Example #1
CUresult
TestSAXPY( chCUDADevice *chDevice, size_t N, float alpha )
{
    CUresult status;
    CUdeviceptr dptrOut = 0;
    CUdeviceptr dptrIn = 0;
    float *hostOut = 0;
    float *hostIn = 0;

    CUDA_CHECK( cuCtxPushCurrent( chDevice->context() ) );

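    // Allocate device buffers and pinned (page-locked) host buffers, each holding N floats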
    CUDA_CHECK( cuMemAlloc( &dptrOut, N*sizeof(float) ) );
    CUDA_CHECK( cuMemsetD32( dptrOut, 0, N ) );
    CUDA_CHECK( cuMemAlloc( &dptrIn, N*sizeof(float) ) );
    CUDA_CHECK( cuMemHostAlloc( (void **) &hostOut, N*sizeof(float), 0 ) );
    CUDA_CHECK( cuMemHostAlloc( (void **) &hostIn, N*sizeof(float), 0 ) );
    for ( size_t i = 0; i < N; i++ ) {
        hostIn[i] = (float) rand() / (float) RAND_MAX;
    }
    CUDA_CHECK( cuMemcpyHtoDAsync( dptrIn, hostIn, N*sizeof(float), NULL ) );

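    // Load the saxpy.ptx module, look up the kernel, and launch it on a fixed 1500-block x 512-thread grid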
    {
        CUmodule moduleSAXPY;
        CUfunction kernelSAXPY;
        void *params[] = { &dptrOut, &dptrIn, &N, &alpha };
        
        moduleSAXPY = chDevice->module( "saxpy.ptx" );
        if ( ! moduleSAXPY ) {
            status = CUDA_ERROR_NOT_FOUND;
            goto Error;
        }
        CUDA_CHECK( cuModuleGetFunction( &kernelSAXPY, moduleSAXPY, "saxpy" ) );

        CUDA_CHECK( cuLaunchKernel( kernelSAXPY, 1500, 1, 1, 512, 1, 1, 0, NULL, params, NULL ) );

    }

    CUDA_CHECK( cuMemcpyDtoHAsync( hostOut, dptrOut, N*sizeof(float), NULL ) );
    CUDA_CHECK( cuCtxSynchronize() );
    for ( size_t i = 0; i < N; i++ ) {
        if ( fabsf( hostOut[i] - alpha*hostIn[i] ) > 1e-5f ) {
            status = CUDA_ERROR_UNKNOWN;
            goto Error;
        }
    }
    status = CUDA_SUCCESS;
    printf( "Well it worked!\n" );

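// Shared cleanup path: pop the context, then release the pinned host buffers and device allocations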
Error:
    cuCtxPopCurrent( NULL );
    cuMemFreeHost( hostOut );
    cuMemFreeHost( hostIn );
    cuMemFree( dptrOut );
    cuMemFree( dptrIn );
    return status;
}
Example #2
static void cuda_free_ctx(cuda_context *ctx) {
  gpuarray_blas_ops *blas_ops;
  gpudata *next, *curr;

  ASSERT_CTX(ctx);
  ctx->refcnt--;
  if (ctx->refcnt == 0) {
    assert(ctx->enter == 0 && "Context was active when freed!");
    if (ctx->blas_handle != NULL) {
      ctx->err = cuda_property(ctx, NULL, NULL, GA_CTX_PROP_BLAS_OPS,
                               &blas_ops);
      blas_ops->teardown(ctx);
    }
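    /* errbuf->ptr was allocated with cuMemAllocHost (see cuda_make_ctx below), so it is released with cuMemFreeHost */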
    cuMemFreeHost((void *)ctx->errbuf->ptr);
    deallocate(ctx->errbuf);

    cuStreamDestroy(ctx->s);

    /* Clear out the freelist */
    for (curr = ctx->freeblocks; curr != NULL; curr = next) {
      next = curr->next;
      cuMemFree(curr->ptr);
      deallocate(curr);
    }

    if (!(ctx->flags & DONTFREE))
      cuCtxDestroy(ctx->ctx);
    cache_destroy(ctx->extcopy_cache);
    CLEAR(ctx);
    free(ctx);
  }
}
Example #3
/*
 * Class:     edu_syr_pcpratts_rootbeer_runtime2_cuda_CudaRuntime2
 * Method:    reinit
 * Signature: ()V
 */
JNIEXPORT void JNICALL Java_edu_syr_pcpratts_rootbeer_runtime2_cuda_CudaRuntime2_reinit
  (JNIEnv * env, jobject this_ref, jint max_blocks_per_proc, jint max_threads_per_block, jlong free_space)
{
  cuMemFreeHost(toSpace);
  cuMemFree(gpuToSpace);
  cuMemFree(gpuClassMemory);
  cuMemFreeHost(handlesMemory);
  cuMemFree(gpuHandlesMemory);
  cuMemFreeHost(exceptionsMemory);
  cuMemFree(gpuExceptionsMemory);
  cuMemFree(gcInfoSpace);
  cuMemFree(gpuHeapEndPtr);
  cuMemFree(gpuBufferSize);
  cuCtxDestroy(cuContext);
  initDevice(env, this_ref, max_blocks_per_proc, max_threads_per_block, free_space);
}
Example #4
bool VideoDecoderCUDAPrivate::releaseCuda()
{
    available = false;
    if (!can_load)
        return true;
    if (dec) {
        cuvidDestroyDecoder(dec);
        dec = 0;
    }
    if (parser) {
        cuvidDestroyVideoParser(parser);
        parser = 0;
    }
    if (stream) {
        cuStreamDestroy(stream);
        stream = 0;
    }
    if (host_data) {
        cuMemFreeHost(host_data);
        host_data = 0;
        host_data_size = 0;
    }
    if (vid_ctx_lock) {
        cuvidCtxLockDestroy(vid_ctx_lock);
        vid_ctx_lock = 0;
    }
    if (cuctx) {
        checkCudaErrors(cuCtxDestroy(cuctx));
    }
    // TODO: dllapi unload
    return true;
}
Example #5
void swanFreeHost( void *ptr ) {
	//printf("FreeHost %p\n", ptr );
	CUresult err = cuMemFreeHost( ptr );
	if ( err != CUDA_SUCCESS ) {
		error("swanFreeHost failed\n" );
	}
}
Example #6
void *cuda_make_ctx(CUcontext ctx, int flags) {
  cuda_context *res;
  void *p;
  CUresult err;

  res = malloc(sizeof(*res));
  if (res == NULL)
    return NULL;
  res->ctx = ctx;
  res->err = CUDA_SUCCESS;
  res->blas_handle = NULL;
  res->refcnt = 1;
  res->flags = flags;
  res->enter = 0;
  res->freeblocks = NULL;
  if (detect_arch(ARCH_PREFIX, res->bin_id, &err)) {
    goto fail_cache;
  }
  res->extcopy_cache = cache_lru(64, 32, (cache_eq_fn)extcopy_eq,
                                 (cache_hash_fn)extcopy_hash,
                                 (cache_freek_fn)extcopy_free,
                                 (cache_freev_fn)cuda_freekernel);
  if (res->extcopy_cache == NULL) {
    goto fail_cache;
  }
  err = cuStreamCreate(&res->s, 0);
  if (err != CUDA_SUCCESS) {
    goto fail_stream;
  }
  err = cuStreamCreate(&res->mem_s, CU_STREAM_NON_BLOCKING);
  if (err != CUDA_SUCCESS) {
    goto fail_mem_stream;
  }
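  /* 16-byte pinned host buffer that backs the context's error buffer (errbuf) */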
  err = cuMemAllocHost(&p, 16);
  if (err != CUDA_SUCCESS) {
    goto fail_errbuf;
  }
  memset(p, 0, 16);
  /* Need to tag for new_gpudata */
  TAG_CTX(res);
  res->errbuf = new_gpudata(res, (CUdeviceptr)p, 16);
  if (res->errbuf == NULL) {
    err = res->err;
    goto fail_end;
  }
  res->errbuf->flags |= CUDA_MAPPED_PTR;
  return res;
 fail_end:
  cuMemFreeHost(p);
 fail_errbuf:
  cuStreamDestroy(res->mem_s);
 fail_mem_stream:
  cuStreamDestroy(res->s);
 fail_stream:
  cache_destroy(res->extcopy_cache);
 fail_cache:
  free(res);
  return NULL;
}
Example #7
static void
map_fini (struct ptx_stream *s)
{
  CUresult r;

  r = cuMemFreeHost (s->h);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemFreeHost error: %s", cuda_error (r));
}
Example #8
void GPUInterface::FreePinnedHostMemory(void* hPtr) {
#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\t\t\tEntering GPUInterface::FreePinnedHostMemory\n");
#endif

    SAFE_CUPP(cuMemFreeHost(hPtr));

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr,"\t\t\tLeaving  GPUInterface::FreePinnedHostMemory\n");
#endif
}
Example #9
SEXP R_auto_cuMemFreeHost(SEXP r_p)
{
    SEXP r_ans = R_NilValue;
    void * p = GET_REF(r_p, void );
    
    CUresult ans;
    ans = cuMemFreeHost(p);
    
    r_ans = Renum_convert_CUresult(ans) ;
    
    return(r_ans);
}
Example #10
void
pocl_cuda_free (cl_device_id device, cl_mem mem_obj)
{
  cuCtxSetCurrent (((pocl_cuda_device_data_t *)device->data)->context);

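  /* Buffers created with CL_MEM_ALLOC_HOST_PTR live in pinned host memory and are
     released with cuMemFreeHost; all other buffers are device allocations freed
     with cuMemFree. */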
  if (mem_obj->flags & CL_MEM_ALLOC_HOST_PTR)
    {
      cuMemFreeHost (mem_obj->mem_host_ptr);
      mem_obj->mem_host_ptr = NULL;
    }
  else
    {
      void *ptr = mem_obj->device_ptrs[device->dev_id].mem_ptr;
      cuMemFree ((CUdeviceptr)ptr);
    }
}
Example #11
void* InteropResource::mapToHost(const VideoFormat &format, void *handle, int picIndex, const CUVIDPROCPARAMS &param, int width, int height, int coded_height)
{
    AutoCtxLock locker((cuda_api*)this, lock);
    Q_UNUSED(locker);
    CUdeviceptr devptr;
    unsigned int pitch;

    CUDA_ENSURE(cuvidMapVideoFrame(dec, picIndex, &devptr, &pitch, const_cast<CUVIDPROCPARAMS*>(&param)), NULL);
    CUVIDAutoUnmapper unmapper(this, dec, devptr);
    Q_UNUSED(unmapper);
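    // Stage the mapped NV12 frame in pinned host memory before wrapping it in a VideoFrame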
    uchar* host_data = NULL;
    const size_t host_size = pitch*coded_height*3/2;
    CUDA_ENSURE(cuMemAllocHost((void**)&host_data, host_size), NULL);
    // copying into memory not allocated by CUDA is possible, but much slower
    CUDA_ENSURE(cuMemcpyDtoH(host_data, devptr, host_size), NULL);

    VideoFrame frame(width, height, VideoFormat::Format_NV12);
    uchar *planes[] = {
        host_data,
        host_data + pitch * coded_height
    };
    frame.setBits(planes);
    int pitches[] = { (int)pitch, (int)pitch };
    frame.setBytesPerLine(pitches);

    VideoFrame *f = reinterpret_cast<VideoFrame*>(handle);
    frame.setTimestamp(f->timestamp());
    frame.setDisplayAspectRatio(f->displayAspectRatio());
    if (format == frame.format())
        *f = frame.clone();
    else
        *f = frame.to(format);

    cuMemFreeHost(host_data);
    return f;
}
Example #12
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//release model
void free_model(MODEL *MO)
{
  
  CUresult res;
  
  //free model information
  for(int ii=0;ii<MO->MI->numcomponent;ii++)
    {
      s_free(MO->MI->didx[ii]);
      s_free(MO->MI->pidx[ii]);
      s_free(MO->MI->psize[ii]);
      s_free(MO->MI->x1[ii]);
      s_free(MO->MI->x2[ii]);
      s_free(MO->MI->y1[ii]);
      s_free(MO->MI->y2[ii]);
    }
  s_free(MO->MI->anchor);

  //  s_free(MO->MI->def);
  res = cuMemFreeHost((void *)MO->MI->def);
  if(res != CUDA_SUCCESS) {
    printf("cuMemFreeHost(MO->MI->def) failed: res = %s\n", conv(res));
    exit(1);
  }

  s_free(MO->MI->numpart);
  s_free(MO->MI->offw);
  s_free(MO->MI->oidx);
  s_free(MO->MI->ridx);
  s_free(MO->MI->rsize);
  s_free(MO->MI->x1);
  s_free(MO->MI->x2);
  s_free(MO->MI->y1);
  s_free(MO->MI->y2);
  s_free(MO->MI);
  
  //free root-filter information
  for(int ii=0;ii<MO->RF->NoR;ii++)
    {
      s_free(MO->RF->root_size[ii]);
#ifdef ORIGINAL
      s_free(MO->RF->rootfilter[ii]);
#else
#ifdef SEPARETE_MEM 
      res = cuMemFreeHost((void *)MO->RF->rootfilter[ii]);
      if(res != CUDA_SUCCESS){
        printf("cuMemFreeHost(MO->RF->rootfilter) failed: res = %s\n", conv(res));
        exit(1);
      }
#endif  
#endif
    }
  
  
#ifndef ORIGINAL
#ifndef SEPARETE_MEM
  /* free heap region in a lump */
  res = cuMemFreeHost((void *)MO->RF->rootfilter[0]);
  if(res != CUDA_SUCCESS){
    printf("cuMemFreeHost(MO->RF->rootfilter[0]) failed: res = %s\n", conv(res));
    exit(1);
  }
#endif
#endif
  
  
  s_free(MO->RF->rootsym);
  s_free(MO->RF);
  
  //free part-filter information
  for(int ii=0;ii<MO->PF->NoP;ii++)
    {
      s_free(MO->PF->part_size[ii]);
#ifdef ORIGINAL
      s_free(MO->PF->partfilter[ii]);
#else
#ifdef SEPARETE_MEM
      res = cuMemFreeHost((void *)MO->PF->partfilter[ii]);
      if(res != CUDA_SUCCESS){
        printf("cuMemFreeHost(MO->PF->partfilter) failed: res = %s\n", conv(res));
        exit(1);
      }
#endif
#endif
    }
  
#ifndef ORIGINAL
#ifndef SEPARETE_MEM
  /* free heap region in a lump */
  res = cuMemFreeHost((void *)MO->PF->partfilter[0]);
  if(res != CUDA_SUCCESS){
    printf("cuMemFreeHost(MO->PF->partfilter[0] failed: res = %s\n", conv(res));
    exit(1);
  }
#endif
#endif
  
  
  s_free(MO->PF->part_partner);
  s_free(MO->PF->part_sym);
  s_free(MO->PF);
  
  s_free(MO);
  
  
}
Example #13
bool VideoDecoderCUDAPrivate::processDecodedData(CUVIDPARSERDISPINFO *cuviddisp, VideoFrame* outFrame) {
    int num_fields = cuviddisp->progressive_frame ? 1 : 2+cuviddisp->repeat_first_field;

    for (int active_field = 0; active_field < num_fields; ++active_field) {
        CUVIDPROCPARAMS proc_params;
        memset(&proc_params, 0, sizeof(CUVIDPROCPARAMS));
        proc_params.progressive_frame = cuviddisp->progressive_frame; //check user config
        proc_params.second_field = active_field == 1; //check user config
        proc_params.top_field_first = cuviddisp->top_field_first;
        proc_params.unpaired_field = cuviddisp->progressive_frame == 1;

        CUdeviceptr devptr;
        unsigned int pitch;
        cuvidCtxLock(vid_ctx_lock, 0);
        CUresult cuStatus = cuvidMapVideoFrame(dec, cuviddisp->picture_index, &devptr, &pitch, &proc_params);
        if (cuStatus != CUDA_SUCCESS) {
            qWarning("cuvidMapVideoFrame failed on index %d (%#x, %s)", cuviddisp->picture_index, cuStatus, _cudaGetErrorEnum(cuStatus));
            cuvidUnmapVideoFrame(dec, devptr);
            cuvidCtxUnlock(vid_ctx_lock, 0);
            return false;
        }
#define PAD_ALIGN(x,mask) ( (x + mask) & ~mask )
        //uint w = dec_create_info.ulWidth;//PAD_ALIGN(dec_create_info.ulWidth, 0x3F);
        uint h = dec_create_info.ulHeight;//PAD_ALIGN(dec_create_info.ulHeight, 0x0F); //?
#undef PAD_ALIGN
        int size = pitch*h*3/2;
        if (size > host_data_size && host_data) {
            cuMemFreeHost(host_data);
            host_data = 0;
            host_data_size = 0;
        }
        if (!host_data) {
            cuStatus = cuMemAllocHost((void**)&host_data, size);
            if (cuStatus != CUDA_SUCCESS) {
                qWarning("cuMemAllocHost failed (%#x, %s)", cuStatus, _cudaGetErrorEnum(cuStatus));
                cuvidUnmapVideoFrame(dec, devptr);
                cuvidCtxUnlock(vid_ctx_lock, 0);
                return false;
            }
            host_data_size = size;
        }
        if (!host_data) {
            qWarning("No valid staging memory!");
            cuvidUnmapVideoFrame(dec, devptr);
            cuvidCtxUnlock(vid_ctx_lock, 0);
            return false;
        }
        cuStatus = cuMemcpyDtoHAsync(host_data, devptr, size, stream);
        if (cuStatus != CUDA_SUCCESS) {
            qWarning("cuMemcpyDtoHAsync failed (%#x, %s)", cuStatus, _cudaGetErrorEnum(cuStatus));
            cuvidUnmapVideoFrame(dec, devptr);
            cuvidCtxUnlock(vid_ctx_lock, 0);
            return false;
        }
        cuStatus = cuCtxSynchronize();
        if (cuStatus != CUDA_SUCCESS) {
            qWarning("cuCtxSynchronize failed (%#x, %s)", cuStatus, _cudaGetErrorEnum(cuStatus));
        }
        cuvidUnmapVideoFrame(dec, devptr);
        cuvidCtxUnlock(vid_ctx_lock, 0);
        //qDebug("mark not in use pic_index: %d", cuviddisp->picture_index);
        surface_in_use[cuviddisp->picture_index] = false;

        uchar *planes[] = {
            host_data,
            host_data + pitch * h
        };
        int pitches[] = { (int)pitch, (int)pitch };
        VideoFrame frame(codec_ctx->width, codec_ctx->height, VideoFormat::Format_NV12);
        frame.setBits(planes);
        frame.setBytesPerLine(pitches);
        //TODO: is clone required? may crash on clone, I should review clone()
        //frame = frame.clone();
        if (outFrame) {
            *outFrame = frame.clone();
        }
#if COPY_ON_DECODE
        frame_queue.put(frame.clone());
#endif
        //qDebug("frame queue size: %d", frame_queue.size());
    }
    return true;
}
Example #14
// Run the Cuda part of the computation
bool copyDecodedFrameToTexture(unsigned int &nRepeats, int bUseInterop, int *pbIsProgressive)
{
    CUVIDPARSERDISPINFO oDisplayInfo;

    if (g_pFrameQueue->dequeue(&oDisplayInfo))
    {
        CCtxAutoLock lck(g_CtxLock);
        // Push the current CUDA context (only if we are using CUDA decoding path)
        CUresult result = cuCtxPushCurrent(g_oContext);

        CUdeviceptr  pDecodedFrame[2] = { 0, 0 };
        CUdeviceptr  pInteropFrame[2] = { 0, 0 };

        int num_fields = (oDisplayInfo.progressive_frame ? (1) : (2+oDisplayInfo.repeat_first_field));
        *pbIsProgressive = oDisplayInfo.progressive_frame;
        g_bIsProgressive = oDisplayInfo.progressive_frame ? true : false;

        for (int active_field=0; active_field<num_fields; active_field++)
        {
            nRepeats = oDisplayInfo.repeat_first_field;
            CUVIDPROCPARAMS oVideoProcessingParameters;
            memset(&oVideoProcessingParameters, 0, sizeof(CUVIDPROCPARAMS));

            oVideoProcessingParameters.progressive_frame = oDisplayInfo.progressive_frame;
            oVideoProcessingParameters.second_field      = active_field;
            oVideoProcessingParameters.top_field_first   = oDisplayInfo.top_field_first;
            oVideoProcessingParameters.unpaired_field    = (num_fields == 1);

            unsigned int nDecodedPitch = 0;
            unsigned int nWidth = 0;
            unsigned int nHeight = 0;

            // map decoded video frame to CUDA surface
            g_pVideoDecoder->mapFrame(oDisplayInfo.picture_index, &pDecodedFrame[active_field], &nDecodedPitch, &oVideoProcessingParameters);
            nWidth  = g_pVideoDecoder->targetWidth();
            nHeight = g_pVideoDecoder->targetHeight();
            // map DirectX texture to CUDA surface
            size_t nTexturePitch = 0;

            // If we are Encoding and this is the 1st Frame, we make sure we allocate system memory for readbacks
            if (g_bReadback && g_bFirstFrame && g_ReadbackSID)
            {
                CUresult result;
                checkCudaErrors(result = cuMemAllocHost((void **)&g_bFrameData[0], (nDecodedPitch * nHeight * 3 / 2)));
                checkCudaErrors(result = cuMemAllocHost((void **)&g_bFrameData[1], (nDecodedPitch * nHeight * 3 / 2)));
                g_bFirstFrame = false;

                if (result != CUDA_SUCCESS)
                {
                    printf("cuMemAllocHost returned %d\n", (int)result);
                }
            }

            // If streams are enabled, we can perform the readback to the host while the kernel is executing
            if (g_bReadback && g_ReadbackSID)
            {
                CUresult result = cuMemcpyDtoHAsync(g_bFrameData[active_field], pDecodedFrame[active_field], (nDecodedPitch * nHeight * 3 / 2), g_ReadbackSID);

                if (result != CUDA_SUCCESS)
                {
                    printf("cuMemAllocHost returned %d\n", (int)result);
                }
            }

#if ENABLE_DEBUG_OUT
            printf("%s = %02d, PicIndex = %02d, OutputPTS = %08d\n",
                   (oDisplayInfo.progressive_frame ? "Frame" : "Field"),
                   g_DecodeFrameCount, oDisplayInfo.picture_index, oDisplayInfo.timestamp);
#endif

            if (g_pImageDX)
            {
                // map the texture surface
                g_pImageDX->map(&pInteropFrame[active_field], &nTexturePitch, active_field);
            }
            else
            {
                pInteropFrame[active_field] = g_pInteropFrame[active_field];
                nTexturePitch = g_pVideoDecoder->targetWidth() * 2;
            }

            // perform post processing on the CUDA surface (performs color space conversion and post processing)
            // comment this out if we include the line of code seen above
            cudaPostProcessFrame(&pDecodedFrame[active_field], nDecodedPitch, &pInteropFrame[active_field], nTexturePitch, g_pCudaModule->getModule(), gfpNV12toARGB, g_KernelSID);

            if (g_pImageDX)
            {
                // unmap the texture surface
                g_pImageDX->unmap(active_field);
            }

            // unmap video frame
            // unmapFrame() synchronizes with the VideoDecode API (ensures the frame has finished decoding)
            g_pVideoDecoder->unmapFrame(pDecodedFrame[active_field]);
            // release the frame, so it can be re-used in decoder
            g_pFrameQueue->releaseFrame(&oDisplayInfo);
            g_DecodeFrameCount++;
        }

        // Detach from the Current thread
        checkCudaErrors(cuCtxPopCurrent(NULL));
    }
    else
    {
        return false;
    }

    // check if decoding has come to an end.
    // if yes, signal the app to shut down.
    if (!g_pVideoSource->isStarted() || g_pFrameQueue->isEndOfDecode())
    {
        // Let's free the Frame Data
        if (g_ReadbackSID && g_bFrameData)
        {
            cuMemFreeHost((void *)g_bFrameData[0]);
            cuMemFreeHost((void *)g_bFrameData[1]);
            g_bFrameData[0] = NULL;
            g_bFrameData[1] = NULL;
        }

        // Let's just stop, and allow the user to quit, so they can at least see the results
        g_pVideoSource->stop();

        // If we want to loop reload the video file and restart
        if (g_bLoop && !g_bAutoQuit)
        {
            reinitCudaResources();
            g_FrameCount = 0;
            g_DecodeFrameCount = 0;
            g_pVideoSource->start();
        }

        if (g_bAutoQuit)
        {
            g_bDone = true;
        }
    }

    return true;
}
Example #15
/*
 * Class:     edu_syr_pcpratts_rootbeer_runtime2_cuda_CudaRuntime2
 * Method:    findReserveMem
 * Signature: ()I
 */
JNIEXPORT jlong JNICALL Java_edu_syr_pcpratts_rootbeer_runtime2_cuda_CudaRuntime2_findReserveMem
  (JNIEnv * env, jobject this_ref, jint max_blocks_per_proc, jint max_threads_per_block)
{
  size_t to_space_size;
  size_t temp_size;
  int status;
  int deviceCount = 0;
  jlong prev_i;
  jlong i;
  size_t f_mem;
  size_t t_mem;
  jint num_blocks;

  status = cuInit(0);
  CHECK_STATUS(env,"error in cuInit",status)

  printf("automatically determining CUDA reserve space...\n");
  
  to_space_size = initContext(env, max_blocks_per_proc, max_threads_per_block);

  //space for 100 types in the scene
  classMemSize = sizeof(jint)*100;

  num_blocks = numMultiProcessors * max_threads_per_block * max_blocks_per_proc;
  
  gc_space_size = 1024;
  to_space_size -= (num_blocks * sizeof(jlong));
  to_space_size -= (num_blocks * sizeof(jlong));
  to_space_size -= gc_space_size;
  to_space_size -= classMemSize;
  
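  //probe for a workable split: if any allocation below fails, destroy and recreate
  //the context, then retry with the to-space shrunk by another 100 MB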
  for(i = 1024L*1024L; i < to_space_size; i += 100L*1024L*1024L){
    temp_size = to_space_size - i;
  
    printf("attempting allocation with temp_size: %lu to_space_size: %lu i: %ld\n", temp_size, to_space_size, i);
 
    status = cuMemHostAlloc(&toSpace, temp_size, 0);  
    if(status != CUDA_SUCCESS){
	    cuCtxDestroy(cuContext);
	    initContext(env, max_blocks_per_proc, max_threads_per_block);
      continue;
    }
    
    status = cuMemAlloc(&gpuToSpace, temp_size);
    if(status != CUDA_SUCCESS){
	    cuCtxDestroy(cuContext);
	    initContext(env, max_blocks_per_proc, max_threads_per_block);
      continue;
    } 

    status = cuMemAlloc(&gpuClassMemory, classMemSize);
    if(status != CUDA_SUCCESS){
	    cuCtxDestroy(cuContext);
	    initContext(env, max_blocks_per_proc, max_threads_per_block);
      continue;
    } 

    status = cuMemHostAlloc(&handlesMemory, num_blocks * sizeof(jlong), CU_MEMHOSTALLOC_WRITECOMBINED); 
    if(status != CUDA_SUCCESS){
	    cuCtxDestroy(cuContext);
	    initContext(env, max_blocks_per_proc, max_threads_per_block);
      continue;
    } 

    status = cuMemAlloc(&gpuHandlesMemory, num_blocks * sizeof(jlong)); 
    if(status != CUDA_SUCCESS){
	    cuCtxDestroy(cuContext);
	    initContext(env, max_blocks_per_proc, max_threads_per_block);
      continue;
    } 

    status = cuMemHostAlloc(&exceptionsMemory, num_blocks * sizeof(jlong), 0); 
    if(status != CUDA_SUCCESS){
	    cuCtxDestroy(cuContext);
	    initContext(env, max_blocks_per_proc, max_threads_per_block);
      continue;
    } 

    status = cuMemAlloc(&gpuExceptionsMemory, num_blocks * sizeof(jlong)); 
    if(status != CUDA_SUCCESS){
	    cuCtxDestroy(cuContext);
	    initContext(env, max_blocks_per_proc, max_threads_per_block);
      continue;
    } 

    status = cuMemAlloc(&gcInfoSpace, gc_space_size);  
    if(status != CUDA_SUCCESS){
	    cuCtxDestroy(cuContext);
	    initContext(env, max_blocks_per_proc, max_threads_per_block);
      continue;
    } 

    status = cuMemAlloc(&gpuHeapEndPtr, 8);
    if(status != CUDA_SUCCESS){
	    cuCtxDestroy(cuContext);
	    initContext(env, max_blocks_per_proc, max_threads_per_block);
      continue;
    } 

    status = cuMemAlloc(&gpuBufferSize, 8);
    if(status != CUDA_SUCCESS){
	    cuCtxDestroy(cuContext);
	    initContext(env, max_blocks_per_proc, max_threads_per_block);
      continue;
    } 

    //done, free everything
    cuMemFree(gpuToSpace);
    cuMemFree(gpuClassMemory);
    cuMemFree(gpuHandlesMemory);
    cuMemFree(gpuExceptionsMemory);
    cuMemFree(gcInfoSpace);
    cuMemFree(gpuHeapEndPtr);
    cuMemFree(gpuBufferSize);

	  cuMemFreeHost(toSpace);
	  cuMemFreeHost(handlesMemory);
	  cuMemFreeHost(exceptionsMemory);

    return i;
  }
  throw_cuda_errror_exception(env, "unable to find enough space using CUDA", 0); 
  return 0;
}
Example #16
static void calc_a_score_GPU(FLOAT *ac_score,  FLOAT **score,
			     int *ssize_start,  Model_info *MI,
			     FLOAT scale, int *size_score_array,
			     int NoC)
{
	CUresult res;

	const int IHEI = MI->IM_HEIGHT;
	const int IWID = MI->IM_WIDTH;
	int pady_n = MI->pady;
	int padx_n = MI->padx;
	int block_pad = (int)(scale/2.0);

	struct timeval tv;

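	/* per-component RY/RX arrays are kept in pinned, device-mapped host memory */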
	int *RY_array, *RX_array;
	res = cuMemHostAlloc((void**)&RY_array, NoC*sizeof(int), CU_MEMHOSTALLOC_DEVICEMAP);
	if(res != CUDA_SUCCESS) {
		printf("cuMemHostAlloc(RY_array) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemHostAlloc((void**)&RX_array, NoC*sizeof(int), CU_MEMHOSTALLOC_DEVICEMAP);
	if(res != CUDA_SUCCESS) {
		printf("cuMemHostAlloc(RX_array) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	for(int i = 0; i < NoC; i++) {
		int rsize[2] = {MI->rsize[i*2], MI->rsize[i*2+1]};

		RY_array[i] = (int)((FLOAT)rsize[0]*scale/2.0-1.0+block_pad);
		RX_array[i] = (int)((FLOAT)rsize[1]*scale/2.0-1.0+block_pad);
	}

	CUdeviceptr ac_score_dev, score_dev;
	CUdeviceptr ssize_dev, size_score_dev;
	CUdeviceptr RY_dev, RX_dev;

	int size_score=0;
	for(int i = 0; i < NoC; i++) {
		size_score += size_score_array[i];
	}

	/* allocate GPU memory */
	res = cuMemAlloc(&ac_score_dev, gpu_size_A_SCORE);
	if(res != CUDA_SUCCESS) {
		printf("cuMemAlloc(ac_score) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemAlloc(&score_dev, size_score);
	if(res != CUDA_SUCCESS) {
		printf("cuMemAlloc(score) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemAlloc(&ssize_dev, NoC*sizeof(int));
	if(res != CUDA_SUCCESS) {
		printf("cuMemAlloc(ssize) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemAlloc(&size_score_dev, NoC*sizeof(int));
	if(res != CUDA_SUCCESS) {
		printf("cuMemAlloc(size_score) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemAlloc(&RY_dev, NoC*sizeof(int));
	if(res != CUDA_SUCCESS) {
		printf("cuMemAlloc(RY) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemAlloc(&RX_dev, NoC*sizeof(int));
	if(res != CUDA_SUCCESS) {
		printf("cuMemAlloc(RX) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	gettimeofday(&tv_memcpy_start, nullptr);
	/* upload data to GPU */
	res = cuMemcpyHtoD(ac_score_dev, &ac_score[0], gpu_size_A_SCORE);
	if(res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoD(ac_score) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemcpyHtoD(score_dev, &score[0][0], size_score);
	if(res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoD(score) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemcpyHtoD(ssize_dev, &ssize_start[0], NoC*sizeof(int));
	if(res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoD(ssize) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemcpyHtoD(size_score_dev, &size_score_array[0], NoC*sizeof(int));
	if(res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoD(size_score) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemcpyHtoD(RY_dev, &RY_array[0], NoC*sizeof(int));
	if(res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoD(RY) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemcpyHtoD(RX_dev, &RX_array[0], NoC*sizeof(int));
	if(res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoD(RX) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	gettimeofday(&tv_memcpy_end, nullptr);
	tvsub(&tv_memcpy_end, &tv_memcpy_start, &tv);
	time_memcpy += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;

	void* kernel_args[] = {
		(void*)&IWID,
		(void*)&IHEI,
		(void*)&scale,
		(void*)&padx_n,
		(void*)&pady_n,
		&RX_dev,
		&RY_dev,
		&ac_score_dev,
		&score_dev,
		&ssize_dev,
		(void*)&NoC,
		&size_score_dev
	};

	int sharedMemBytes = 0;

	/* define CUDA block shape */
	int max_threads_num = 0;
	int thread_num_x, thread_num_y;
	int block_num_x, block_num_y;

	res = cuDeviceGetAttribute(&max_threads_num, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev[0]);
	if(res != CUDA_SUCCESS){
		printf("\ncuDeviceGetAttribute() failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	NR_MAXTHREADS_X[0] = (int)sqrt((double)max_threads_num/NoC);
	NR_MAXTHREADS_Y[0] = (int)sqrt((double)max_threads_num/NoC);

	thread_num_x = (IWID < NR_MAXTHREADS_X[0]) ? IWID : NR_MAXTHREADS_X[0];
	thread_num_y = (IHEI < NR_MAXTHREADS_Y[0]) ? IHEI : NR_MAXTHREADS_Y[0];

	block_num_x = IWID / thread_num_x;
	block_num_y = IHEI / thread_num_y;
	if(IWID % thread_num_x != 0) block_num_x++;
	if(IHEI % thread_num_y != 0) block_num_y++;

	gettimeofday(&tv_kernel_start, nullptr);
	/* launch GPU kernel */
	res = cuLaunchKernel(
		func_calc_a_score[0], // call function
		block_num_x,       // gridDimX
		block_num_y,       // gridDimY
		1,                 // gridDimZ
		thread_num_x,      // blockDimX
		thread_num_y,      // blockDimY
		NoC,               // blockDimZ
		sharedMemBytes,    // sharedMemBytes
		nullptr,              // hStream
		kernel_args,       // kernelParams
		nullptr               // extra
		);
	if(res != CUDA_SUCCESS) {
		printf("cuLaunchKernel(calc_a_score) failed : res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuCtxSynchronize();
	if(res != CUDA_SUCCESS) {
		printf("cuCtxSynchronize(calc_a_score) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}
	gettimeofday(&tv_kernel_end, nullptr);
	tvsub(&tv_kernel_end, &tv_kernel_start, &tv);
	time_kernel += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;

	gettimeofday(&tv_memcpy_start, nullptr);
	/* download data from GPU */
	res = cuMemcpyDtoH(ac_score, ac_score_dev, gpu_size_A_SCORE);
	if(res != CUDA_SUCCESS) {
		printf("cuMemcpyDtoH(ac_score) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	gettimeofday(&tv_memcpy_end, nullptr);
	tvsub(&tv_memcpy_end, &tv_memcpy_start, &tv);
	time_memcpy += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;

	/* free GPU memory */
	res = cuMemFree(ac_score_dev);
	if(res != CUDA_SUCCESS) {
		printf("cuMemFree(ac_score_dev) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemFree(score_dev);
	if(res != CUDA_SUCCESS) {
		printf("cuMemFree(score_dev) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemFree(ssize_dev);
	if(res != CUDA_SUCCESS) {
		printf("cuMemFree(ssize_dev) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemFree(size_score_dev);
	if(res != CUDA_SUCCESS) {
		printf("cuMemFree(size_score_dev) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemFree(RY_dev);
	if(res != CUDA_SUCCESS) {
		printf("cuMemFree(RY_dev) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemFree(RX_dev);
	if(res != CUDA_SUCCESS) {
		printf("cuMemFree(RX_dev) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	/* free CPU memory */
	res = cuMemFreeHost(RY_array);
	if(res != CUDA_SUCCESS) {
		printf("cuMemFreeHost(RY_array) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemFreeHost(RX_array);
	if(res != CUDA_SUCCESS) {
		printf("cuMemFreeHost(RX_array) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}
}
Example #17
//detect boundary box
FLOAT *dpm_ttic_gpu_get_boxes(FLOAT **features,FLOAT *scales,int *feature_size, GPUModel *MO,
			      int *detected_count, FLOAT *acc_score, FLOAT thresh)
{
	//constant parameters
	const int max_scale = MO->MI->max_scale;
	const int interval = MO->MI->interval;
	const int sbin = MO->MI->sbin;
	const int padx = MO->MI->padx;
	const int pady = MO->MI->pady;
	const int NoR = MO->RF->NoR;
	const int NoP = MO->PF->NoP;
	const int NoC = MO->MI->numcomponent;
	const int *numpart = MO->MI->numpart;
	const int LofFeat=(max_scale+interval)*NoC;
	const int L_MAX = max_scale+interval;

	/* for measurement */
	struct timeval tv;
	struct timeval tv_make_c_start, tv_make_c_end;
	struct timeval tv_nucom_start, tv_nucom_end;
	struct timeval tv_box_start, tv_box_end;
	float time_box=0;
	struct timeval tv_root_score_start, tv_root_score_end;
	float time_root_score = 0;
	struct timeval tv_part_score_start, tv_part_score_end;
	float time_part_score = 0;
	struct timeval tv_dt_start, tv_dt_end;
	float time_dt = 0;
	struct timeval tv_calc_a_score_start, tv_calc_a_score_end;
	float time_calc_a_score = 0;

	gettimeofday(&tv_make_c_start, nullptr);

	int **RF_size = MO->RF->root_size;
	int *rootsym = MO->RF->rootsym;
	int *part_sym = MO->PF->part_sym;
	int **part_size = MO->PF->part_size;
	FLOAT **rootfilter = MO->RF->rootfilter;
	FLOAT **partfilter=MO->PF->partfilter;
	int **psize = MO->MI->psize;

	int **rm_size_array = (int **)malloc(sizeof(int *)*L_MAX);
	int **pm_size_array = (int **)malloc(sizeof(int *)*L_MAX);

	FLOAT **Tboxes=(FLOAT**)calloc(LofFeat,sizeof(FLOAT*)); //box coordinate information(Temp)
	int  *b_nums =(int*)calloc(LofFeat,sizeof(int)); //length of Tboxes
	int count = 0;
	int detected_boxes=0;
	CUresult res;

	/* matched score (root and part) */
	FLOAT ***rootmatch,***partmatch = nullptr;

	int *new_PADsize;  // need new_PADsize[L_MAX*3]
	size_t SUM_SIZE_feat = 0;

	FLOAT **featp2 = (FLOAT **)malloc(L_MAX*sizeof(FLOAT *));


	if(featp2 == nullptr) {  // error semantics
		printf("allocate featp2 failed\n");
		exit(1);
	}


	/* allocate required memory for new_PADsize */
	new_PADsize = (int *)malloc(L_MAX*3*sizeof(int));
	if(new_PADsize == nullptr) {     // error semantics
		printf("allocate new_PADsize failed\n");
		exit(1);
	}

	/* do padarray once and reuse it at calculating root and part time */

	/* calculate sum of size of padded feature */
	for(int tmpL=0; tmpL<L_MAX; tmpL++) {
		int PADsize[3] = { feature_size[tmpL*2], feature_size[tmpL*2+1], 31 };
		int NEW_Y = PADsize[0] + pady*2;
		int NEW_X = PADsize[1] + padx*2;
		SUM_SIZE_feat += (NEW_X*NEW_Y*PADsize[2])*sizeof(FLOAT);
	}

	/* allocate region for padded feat in a lump */
	FLOAT *dst_feat;
	res = cuMemHostAlloc((void **)&dst_feat, SUM_SIZE_feat, CU_MEMHOSTALLOC_DEVICEMAP);
	if(res != CUDA_SUCCESS) {
		printf("cuMemHostAlloc(dst_feat) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	memset(dst_feat, 0, SUM_SIZE_feat);  // zero clear

	/* distribute allocated region */
	uintptr_t pointer_feat = (uintptr_t)dst_feat;
	for(int tmpL=0; tmpL<L_MAX; tmpL++) {

		featp2[tmpL] = (FLOAT *)pointer_feat;
		int PADsize[3] = { feature_size[tmpL*2], feature_size[tmpL*2+1], 31 };
		int NEW_Y = PADsize[0] + pady*2;
		int NEW_X = PADsize[1] + padx*2;
		pointer_feat += (uintptr_t)(NEW_X*NEW_Y*PADsize[2]*sizeof(FLOAT));

	}

	/* copy feat to feat2 */
	for(int tmpL=0; tmpL<L_MAX; tmpL++) {

		int PADsize[3] = { feature_size[tmpL*2], feature_size[tmpL*2+1], 31 };
		int NEW_Y = PADsize[0] + pady*2;
		int NEW_X = PADsize[1] + padx*2;
		int L = NEW_Y*padx;
		int SPL = PADsize[0] + pady;
		int M_S = sizeof(FLOAT)*PADsize[0];
		FLOAT *P = featp2[tmpL];
		FLOAT *S = features[tmpL];

		for(int i=0; i<PADsize[2]; i++)
		{
			P += L;
			for(int j=0; j<PADsize[1]; j++)
			{
				P += pady;
				memcpy(P, S, M_S);
				S += PADsize[0];
				P += SPL;
			}
			P += L;
		}

		new_PADsize[tmpL*3] = NEW_Y;
		new_PADsize[tmpL*3 + 1] = NEW_X;
		new_PADsize[tmpL*3 + 2] = PADsize[2];

	}

	/* do padarray once and reuse it at calculating root and part time */

	/* allocation in a lump */
	int *dst_rm_size = (int *)malloc(sizeof(int)*NoC*2*L_MAX);
	if(dst_rm_size == nullptr) {
		printf("allocate dst_rm_size failed\n");
		exit(1);
	}

	/* distribution to rm_size_array[L_MAX] */
	uintptr_t ptr = (uintptr_t)dst_rm_size;
	for(int i=0; i<L_MAX; i++) {
		rm_size_array[i] = (int *)ptr;
		ptr += (uintptr_t)(NoC*2*sizeof(int));
	}

	/* allocation in a lump */
	int *dst_pm_size = (int *)malloc(sizeof(int)*NoP*2*L_MAX);
	if(dst_pm_size == nullptr) {
		printf("allocate dst_pm_size failed\n");
		exit(1);
	}

	/* distribution to pm_size_array[L_MAX] */
	ptr = (uintptr_t)dst_pm_size;
	for(int i=0; i<L_MAX; i++) {
		pm_size_array[i] = (int *)ptr;
		ptr += (uintptr_t)(NoP*2*sizeof(int));
	}


	///////level
	for (int level=interval; level<L_MAX; level++)  // feature's loop(A's loop) 1level 1picture
	{
		if(feature_size[level*2]+2*pady<MO->MI->max_Y ||(feature_size[level*2+1]+2*padx<MO->MI->max_X))
		{
			Tboxes[count]=nullptr;
			count++;
			continue;
		}
	}  //for (level)  // feature's loop(A's loop) 1level 1picture

	///////root calculation/////////
	/* calculate model score (only root) */

	gettimeofday(&tv_root_score_start, nullptr);
	rootmatch = fconvsMT_GPU(
		featp2,
		SUM_SIZE_feat,
		rootfilter,
		rootsym,
		1,
		NoR,
		new_PADsize,
		RF_size, rm_size_array,
		L_MAX,
		interval,
		feature_size,
		padx,
		pady,
		MO->MI->max_X,
		MO->MI->max_Y,
		ROOT
		);
	gettimeofday(&tv_root_score_end, nullptr);
	tvsub(&tv_root_score_end, &tv_root_score_start, &tv);
	time_root_score += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;

	///////part calculation/////////
	if(NoP>0)
	{
		/* calculate model score (only part) */
		gettimeofday(&tv_part_score_start, nullptr);
		partmatch = fconvsMT_GPU(
			featp2,
			SUM_SIZE_feat,
			partfilter,
			part_sym,
			1,
			NoP,
			new_PADsize,
			part_size,
			pm_size_array,
			L_MAX,
			interval,
			feature_size,
			padx,
			pady,
			MO->MI->max_X,
			MO->MI->max_Y,
			PART
			);
		gettimeofday(&tv_part_score_end, nullptr);
		tvsub(&tv_part_score_end, &tv_part_score_start, &tv);
		time_part_score += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;

	}

	res = cuCtxSetCurrent(ctx[0]);
	if(res != CUDA_SUCCESS) {
		printf("cuCtxSetCurrent(ctx[0]) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	gettimeofday(&tv_make_c_end, nullptr);
	gettimeofday(&tv_nucom_start, nullptr);

	count = 0;
	detected_boxes = 0;

	int **RL_array = (int **)malloc((L_MAX-interval)*sizeof(int*));
	int *dst_RL = (int *) malloc(NoC*(L_MAX-interval)*sizeof(int));

	int **RI_array = (int **)malloc((L_MAX-interval)*sizeof(int*));
	int *dst_RI = (int *)malloc(NoC*(L_MAX-interval)*sizeof(int));

	int **OI_array = (int **)malloc((L_MAX-interval)*sizeof(int*));
	int *dst_OI = (int *)malloc((NoC)*(L_MAX-interval)*sizeof(int));

	int **RL_S_array = (int **)malloc((L_MAX-interval)*sizeof(int*));
	int *dst_RL_S = (int *)malloc(NoC*(L_MAX-interval)*sizeof(int));


	FLOAT **OFF_array = (FLOAT **)malloc((L_MAX-interval)*sizeof(FLOAT*));
	FLOAT *dst_OFF = (FLOAT *)malloc(NoC*(L_MAX-interval)*sizeof(FLOAT));

	FLOAT ***SCORE_array = (FLOAT ***)malloc((L_MAX-interval)*sizeof(FLOAT **));
	FLOAT **sub_dst_SCORE = (FLOAT **)malloc(NoC*(L_MAX-interval)*sizeof(FLOAT*));

	uintptr_t pointer_RL = (uintptr_t)dst_RL;
	uintptr_t pointer_RI = (uintptr_t)dst_RI;
	uintptr_t pointer_OI = (uintptr_t)dst_OI;
	uintptr_t pointer_RL_S = (uintptr_t)dst_RL_S;
	uintptr_t pointer_OFF = (uintptr_t)dst_OFF;
	uintptr_t pointer_SCORE = (uintptr_t)sub_dst_SCORE;
	for (int level=interval; level<L_MAX; level++) {

		int L=level-interval;

		RL_array[L] = (int *)pointer_RL;
		pointer_RL += (uintptr_t)NoC*sizeof(int);

		RI_array[L] = (int *)pointer_RI;
		pointer_RI += (uintptr_t)NoC*sizeof(int);

		OI_array[L] = (int *)pointer_OI;
		pointer_OI += (uintptr_t)NoC*sizeof(int);

		RL_S_array[L] = (int *)pointer_RL_S;
		pointer_RL_S += (uintptr_t)NoC*sizeof(int);

		OFF_array[L] = (FLOAT *)pointer_OFF;
		pointer_OFF += (uintptr_t)NoC*sizeof(FLOAT);

		SCORE_array[L] = (FLOAT **)pointer_SCORE;
		pointer_SCORE += (uintptr_t)NoC*sizeof(FLOAT*);
	}

	int sum_RL_S = 0;
	int sum_SNJ = 0;
	/* prepare for parallel execution */
	for(int level=interval; level<L_MAX; level++) {
		int L = level - interval;

		if(feature_size[level*2]+2*pady<MO->MI->max_Y ||(feature_size[level*2+1]+2*padx<MO->MI->max_X))
		{
			continue;
		}

		for(int j=0; j<NoC; j++) {

			/* root score + offset */
			RL_array[L][j] = rm_size_array[level][j*2]*rm_size_array[level][j*2+1];  //length of root-matching
			RI_array[L][j] = MO->MI->ridx[j];  //root-index
			OI_array[L][j] =  MO->MI->oidx[j];  //offset-index
			RL_S_array[L][j] =sizeof(FLOAT)*RL_array[L][j];


			OFF_array[L][j] = MO->MI->offw[RI_array[L][j]];  //offset information


			/* search max values */
			max_RL_S = (max_RL_S < RL_S_array[L][j]) ? RL_S_array[L][j] : max_RL_S;
			max_numpart = (max_numpart < numpart[j]) ? numpart[j] : max_numpart;
		}
	}

	sum_RL_S = max_RL_S*NoC*(L_MAX-interval);

	/* root matching size */
	sum_SNJ = sizeof(int*)*max_numpart*NoC*(L_MAX-interval);

	/* consolidated allocation for SCORE_array and distribute region */
	FLOAT *dst_SCORE = (FLOAT *)malloc(sum_RL_S);
	pointer_SCORE = (uintptr_t)dst_SCORE;
	for(int level=interval; level<L_MAX; level++) {
		int L = level - interval;

		if(feature_size[level*2]+2*pady<MO->MI->max_Y ||(feature_size[level*2+1]+2*padx<MO->MI->max_X))
		{
			continue;
		}

		for(int j=0; j<NoC; j++) {
			SCORE_array[L][j] = (FLOAT *)pointer_SCORE;
			pointer_SCORE += (uintptr_t)max_RL_S;
		}
	}

	/* add offset */
	for(int level=interval; level<L_MAX; level++) {
		int L = level - interval;

		if(feature_size[level*2]+2*pady<MO->MI->max_Y ||(feature_size[level*2+1]+2*padx<MO->MI->max_X))
		{
			continue;
		}

		for(int j=0; j<NoC; j++) {
			memcpy(SCORE_array[L][j], rootmatch[level][j], RL_S_array[L][j]);
			FLOAT *SC_S = SCORE_array[L][j];
			FLOAT *SC_E = SCORE_array[L][j]+RL_array[L][j];
			while(SC_S<SC_E) *(SC_S++)+=OFF_array[L][j];
		}
	}

	/* anchor matrix */  // consolidated allocation
	int ***ax_array = (int ***)malloc((L_MAX-interval)*sizeof(int **));
	int **sub_dst_ax = (int **)malloc(NoC*(L_MAX-interval)*sizeof(int *));
	int *dst_ax = (int *)malloc(sum_SNJ);

	int ***ay_array = (int ***)malloc((L_MAX-interval)*sizeof(int **));
	int **sub_dst_ay = (int **)malloc(NoC*(L_MAX-interval)*sizeof(int *));
	int *dst_ay = (int *)malloc(sum_SNJ);

	/* boundary index */  // consolidated allocation
	int ****Ix_array =(int ****)malloc((L_MAX-interval)*sizeof(int ***));
	int ***sub_dst_Ix = (int ***)malloc(NoC*(L_MAX-interval)*sizeof(int **));
	int **dst_Ix = (int **)malloc(sum_SNJ);

	int ****Iy_array = (int ****)malloc((L_MAX-interval)*sizeof(int ***));
	int ***sub_dst_Iy = (int ***)malloc(NoC*(L_MAX-interval)*sizeof(int **));
	int **dst_Iy = (int **)malloc(sum_SNJ);

	/* distribute region */
	uintptr_t pointer_ax = (uintptr_t)sub_dst_ax;
	uintptr_t pointer_ay = (uintptr_t)sub_dst_ay;
	uintptr_t pointer_Ix = (uintptr_t)sub_dst_Ix;
	uintptr_t pointer_Iy = (uintptr_t)sub_dst_Iy;
	for(int level=interval; level<L_MAX; level++) {
		int L = level - interval;

		if(feature_size[level*2]+2*pady<MO->MI->max_Y ||(feature_size[level*2+1]+2*padx<MO->MI->max_X))
		{
			continue;
		}

		ax_array[L] = (int **)pointer_ax;
		pointer_ax += (uintptr_t)(NoC*sizeof(int*));

		ay_array[L] = (int **)pointer_ay;
		pointer_ay += (uintptr_t)(NoC*sizeof(int*));

		Ix_array[L] = (int ***)pointer_Ix;
		pointer_Ix += (uintptr_t)(NoC*sizeof(int**));

		Iy_array[L] = (int ***)pointer_Iy;
		pointer_Iy += (uintptr_t)(NoC*sizeof(int**));
	}

	pointer_ax = (uintptr_t)dst_ax;
	pointer_ay = (uintptr_t)dst_ay;
	pointer_Ix = (uintptr_t)dst_Ix;
	pointer_Iy = (uintptr_t)dst_Iy;
	for(int level=interval; level<L_MAX; level++) {
		int L = level - interval;

		if(feature_size[level*2]+2*pady<MO->MI->max_Y ||(feature_size[level*2+1]+2*padx<MO->MI->max_X))
		{
			continue;
		}

		for(int j=0; j<NoC; j++) {
			uintptr_t pointer_offset = sizeof(int*)*max_numpart;

			ax_array[L][j] = (int *)pointer_ax;
			pointer_ax += pointer_offset;

			ay_array[L][j] = (int *)pointer_ay;
			pointer_ay += pointer_offset;

			Ix_array[L][j] = (int **)pointer_Ix;
			pointer_Ix += pointer_offset;

			Iy_array[L][j] = (int **)pointer_Iy;
			pointer_Iy += pointer_offset;
		}
	}

	/* add parts */
	if(NoP>0)
        {
		/* arrays to store temporary loop variables */
		int tmp_array_size = 0;
		for(int level=interval; level<L_MAX; level++) {
			if(feature_size[level*2]+2*pady<MO->MI->max_Y ||(feature_size[level*2+1]+2*padx<MO->MI->max_X))
			{
				continue;
			}

			for(int j=0; j<NoC; j++) {
				tmp_array_size += max_numpart*sizeof(int);
			}
		}

		int ***DIDX_array = (int ***)malloc((L_MAX-interval)*sizeof(int**));
		int **sub_dst_DIDX = (int **)malloc(NoC*(L_MAX-interval)*sizeof(int*));
		int *dst_DIDX = (int *)malloc(tmp_array_size);


		int ***DID_4_array = (int ***)malloc((L_MAX-interval)*sizeof(int **));
		int **sub_dst_DID_4 = (int **)malloc(NoC*(L_MAX-interval)*sizeof(int*));
		int *dst_DID_4;
		res = cuMemHostAlloc((void **)&dst_DID_4, tmp_array_size, CU_MEMHOSTALLOC_DEVICEMAP);
		if(res != CUDA_SUCCESS) {
			printf("cuMemHostAlloc(dst_DID_4) failed: res = %s\n", cuda_response_to_string(res));
			exit(1);
		}


		int ***PIDX_array = (int ***)malloc((L_MAX-interval)*sizeof(int **));
		int **sub_dst_PIDX = (int **)malloc(NoC*(L_MAX-interval)*sizeof(int*));
		int *dst_PIDX;
		res = cuMemHostAlloc((void **)&dst_PIDX, tmp_array_size, CU_MEMHOSTALLOC_DEVICEMAP);
		if(res != CUDA_SUCCESS) {
			printf("cuMemHostAlloc(dst_PIDX) failed: res = %s\n", cuda_response_to_string(res));
			exit(1);
		}

		/* distribute consolidated region */
		uintptr_t pointer_DIDX = (uintptr_t)sub_dst_DIDX;
		uintptr_t pointer_DID_4 = (uintptr_t)sub_dst_DID_4;
		uintptr_t pointer_PIDX = (uintptr_t)sub_dst_PIDX;
		for(int level=interval; level<L_MAX; level++) {
			int L = level - interval;

			if(feature_size[level*2]+2*pady<MO->MI->max_Y ||(feature_size[level*2+1]+2*padx<MO->MI->max_X))
			{
				continue;
			}

			DIDX_array[L] = (int **)pointer_DIDX;
			pointer_DIDX += (uintptr_t)(NoC*sizeof(int*));

			DID_4_array[L] = (int **)pointer_DID_4;
			pointer_DID_4 += (uintptr_t)(NoC*sizeof(int*));

			PIDX_array[L] = (int **)pointer_PIDX;
			pointer_PIDX += (uintptr_t)(NoC*sizeof(int*));
		}

		pointer_DIDX = (uintptr_t)dst_DIDX;
		pointer_DID_4 = (uintptr_t)dst_DID_4;
		pointer_PIDX = (uintptr_t)dst_PIDX;
		for(int level=interval; level<L_MAX; level++) {
			int L = level - interval;

			if(feature_size[level*2]+2*pady<MO->MI->max_Y ||(feature_size[level*2+1]+2*padx<MO->MI->max_X)) {
				continue;
			}

			for(int j=0; j<NoC; j++) {
				uintptr_t pointer_offset = (uintptr_t)(max_numpart*sizeof(int));

				DIDX_array[L][j] = (int *)pointer_DIDX;
				pointer_DIDX += pointer_offset;

				DID_4_array[L][j] = (int *)pointer_DID_4;
				pointer_DID_4 += pointer_offset;

				PIDX_array[L][j] = (int *)pointer_PIDX;
				pointer_PIDX += pointer_offset;
			}
		}

		/* prepare for parallel execution */
		int sum_size_index_matrix = 0;
		for(int level=interval; level<L_MAX; level++) {
			int L = level - interval;

			if(feature_size[level*2]+2*pady<MO->MI->max_Y ||(feature_size[level*2+1]+2*padx<MO->MI->max_X)) {
				continue;
			}

			for(int j=0; j<NoC; j++) {
				for (int k=0;k<numpart[j];k++) {
					/* assign values to each element */
					DIDX_array[L][j][k] = MO->MI->didx[j][k];
					DID_4_array[L][j][k] = DIDX_array[L][j][k]*4;
					PIDX_array[L][j][k] = MO->MI->pidx[j][k];

					/* anchor */
					ax_array[L][j][k] = MO->MI->anchor[DIDX_array[L][j][k]*2]+1;
					ay_array[L][j][k] = MO->MI->anchor[DIDX_array[L][j][k]*2+1]+1;

					int PSSIZE[2] ={pm_size_array[L][PIDX_array[L][j][k]*2], pm_size_array[L][PIDX_array[L][j][k]*2+1]}; // size of C

					/* index matrix */
					sum_size_index_matrix += sizeof(int)*PSSIZE[0]*PSSIZE[1];
				}
			}
		}

		int *dst_Ix_kk = (int *)malloc(sum_size_index_matrix);
		int *dst_Iy_kk = (int *)malloc(sum_size_index_matrix);
		uintptr_t pointer_Ix_kk = (uintptr_t)dst_Ix_kk;
		uintptr_t pointer_Iy_kk = (uintptr_t)dst_Iy_kk;
		for(int level=interval; level<L_MAX; level++) {
			int L = level - interval;

			if(feature_size[level*2]+2*pady<MO->MI->max_Y ||(feature_size[level*2+1]+2*padx<MO->MI->max_X))
			{
				continue;
			}

			for(int j=0; j<NoC; j++) {
				for (int k=0;k<numpart[j];k++) {
					int PSSIZE[2] ={pm_size_array[L][PIDX_array[L][j][k]*2], pm_size_array[L][PIDX_array[L][j][k]*2+1]}; // size of C

					Ix_array[L][j][k] = (int *)pointer_Ix_kk;
					Iy_array[L][j][k] = (int *)pointer_Iy_kk;

					pointer_Ix_kk += (uintptr_t)(sizeof(int)*PSSIZE[0]*PSSIZE[1]);
					pointer_Iy_kk += (uintptr_t)(sizeof(int)*PSSIZE[0]*PSSIZE[1]);
				}
			}
		}

		gettimeofday(&tv_dt_start, nullptr);
		FLOAT ****M_array = dt_GPU(
			Ix_array,      // int ****Ix_array
			Iy_array,      // int ****Iy_array
			PIDX_array,    // int ***PIDX_array
			pm_size_array, // int **size_array
			NoP,           // int NoP
			numpart,       // int *numpart
			NoC,           // int NoC
			interval,      // int interval
			L_MAX,         // int L_MAX
			feature_size,         // int *feature_size,
			padx,          // int padx,
			pady,          // int pady,
			MO->MI->max_X, // int max_X
			MO->MI->max_Y, // int max_Y
			MO->MI->def, // FLOAT *def
			tmp_array_size, // int tmp_array_size
			dst_PIDX, // int *dst_PIDX
			dst_DID_4 // int *DID_4
			);
		gettimeofday(&tv_dt_end, nullptr);
		tvsub(&tv_dt_end, &tv_dt_start, &tv);
		time_dt += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;

		/* add part score */
		for(int level=interval; level<L_MAX; level++){
			int L = level - interval;

			if(feature_size[level*2]+2*pady<MO->MI->max_Y ||(feature_size[level*2+1]+2*padx<MO->MI->max_X))
			{
				continue;
			}

			for(int j=0; j<NoC; j++) {
				for(int k=0; k<numpart[j]; k++) {
					int PSSIZE[2] ={pm_size_array[L][PIDX_array[L][j][k]*2],
							pm_size_array[L][PIDX_array[L][j][k]*2+1]}; // Size of C
					int R_S[2]={rm_size_array[level][j*2], rm_size_array[level][j*2+1]};

					dpm_ttic_add_part_calculation(SCORE_array[L][j], M_array[L][j][k], R_S,
								      PSSIZE, ax_array[L][j][k], ay_array[L][j][k]);
				}
			}
		}

		s_free(M_array[0][0][0]);
		s_free(M_array[0][0]);
		s_free(M_array[0]);
		s_free(M_array);

		/* free temporary arrays */
		free(dst_DIDX);
		free(sub_dst_DIDX);
		free(DIDX_array);

		res = cuMemFreeHost(dst_DID_4);
		if(res != CUDA_SUCCESS) {
			printf("cuMemFreeHost(dst_DID_4) failed: res = %s\n", cuda_response_to_string(res));
			exit(1);
		}
		free(sub_dst_DID_4);
		free(DID_4_array);

		res = cuMemFreeHost(dst_PIDX);
		if(res != CUDA_SUCCESS) {
			printf("cuMemFreeHost(dst_PIDX) failed: res = %s\n", cuda_response_to_string(res));
			exit(1);
		}

		free(sub_dst_PIDX);
		free(PIDX_array);

		res = cuCtxSetCurrent(ctx[0]);
		if(res != CUDA_SUCCESS) {
			printf("cuCtxSetCurrent(ctx[0]) failed: res = %s\n", cuda_response_to_string(res));
			exit(1);
		}
        } // end of if(NoP>0)

	/* combine root and part score and detect boundary box for each-component */

	FLOAT *scale_array = (FLOAT *)malloc((L_MAX-interval)*sizeof(FLOAT));
	for(int level=interval; level<L_MAX; level++) {
		int L = level - interval;

		if(feature_size[level*2]+2*pady<MO->MI->max_Y ||(feature_size[level*2+1]+2*padx<MO->MI->max_X))
		{
			Tboxes[count]=nullptr;
			count++;
			continue;
		}

		scale_array[L] = (FLOAT)sbin/scales[level];
	}

	for (int level=interval; level<L_MAX; level++)  // feature's loop(A's loop) 1level 1picture
        {
		/* parameters (related for level) */
		int L=level-interval;
		/* matched score size matrix */
		FLOAT scale=(FLOAT)sbin/scales[level];

		/* loop condition */
		if(feature_size[level*2]+2*pady<MO->MI->max_Y ||(feature_size[level*2+1]+2*padx<MO->MI->max_X)) {
			Tboxes[count]=nullptr;
			count++;
			continue;
		}

		/* calculate accumulated score */
		gettimeofday(&tv_calc_a_score_start, nullptr);

		calc_a_score_GPU(
			acc_score,              // FLOAT *ac_score
			SCORE_array[L],       // FLOAT **score
			rm_size_array[level], // int *ssize_start
			MO->MI,               // Model_info *MI
			scale,                // FLOAT scale
			RL_S_array[L],        // int *size_score_array
			NoC                   // int NoC
			);

		gettimeofday(&tv_calc_a_score_end, nullptr);
		tvsub(&tv_calc_a_score_end, &tv_calc_a_score_start, &tv);
		time_calc_a_score += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;

		for(int j = 0; j <NoC; j++) {
			int R_S[2]={rm_size_array[level][j*2], rm_size_array[level][j*2+1]};

			/* get all good matches */
			int GMN;
			int *GMPC = get_gmpc(SCORE_array[L][j],thresh,R_S,&GMN);
			int RSIZE[2]={MO->MI->rsize[j*2], MO->MI->rsize[j*2+1]};

			int GL = (numpart[j]+1)*4+3;  //31

			/* detected box coordinate(current level) */
			FLOAT *t_boxes = (FLOAT*)calloc(GMN*GL,sizeof(FLOAT));

			gettimeofday(&tv_box_start, nullptr);

			// NO NEED TO USE GPU 
			for(int k = 0;k < GMN;k++) {
				FLOAT *P_temp = t_boxes+GL*k;
				int y = GMPC[2*k];
				int x = GMPC[2*k+1];

				/* calculate root box coordinate */
				FLOAT *RB =rootbox(x,y,scale,padx,pady,RSIZE);
				memcpy(P_temp, RB,sizeof(FLOAT)*4);
				s_free(RB);
				P_temp+=4;

				for(int pp=0;pp<numpart[j];pp++) {
					int PBSIZE[2]={psize[j][pp*2], psize[j][pp*2+1]};
					int Isize[2]={pm_size_array[L][MO->MI->pidx[j][pp]*2], pm_size_array[L][MO->MI->pidx[j][pp]*2+1]};

					/* calculate part box coordinate */
					FLOAT *PB = partbox(x,y,ax_array[L][j][pp],ay_array[L][j][pp],scale,padx,pady,PBSIZE,Ix_array[L][j][pp],Iy_array[L][j][pp],Isize);
					memcpy(P_temp, PB,sizeof(FLOAT)*4);
					P_temp+=4;
					s_free(PB);
				}
				/* component number and score */
				*(P_temp++)=(FLOAT)j; //component number
				*(P_temp++)=SCORE_array[L][j][x*R_S[0]+y]; //score of good match
				*P_temp = scale;
			}

			//  NO NEED TO USE GPU
			gettimeofday(&tv_box_end, nullptr);
			tvsub(&tv_box_end, &tv_box_start, &tv);
			time_box += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;

			/* save box information */
			if (GMN > 0)
				Tboxes[count] = t_boxes;
			else
				Tboxes[count] = nullptr;

			b_nums[count]=GMN;
			count++;
			detected_boxes+=GMN;			//number of detected box

			/* release */
			s_free(GMPC);
		}
		////numcom
        }
	////level

	/* free temporary arrays */
	free(dst_RL);
	free(RL_array);

	free(dst_RI);
	free(RI_array);

	free(dst_OI);
	free(OI_array);

	free(dst_RL_S);
	free(RL_S_array);

	free(dst_OFF);
	free(OFF_array);

	free(dst_SCORE);
	free(sub_dst_SCORE);
	free(SCORE_array);

	free(dst_ax);
	free(sub_dst_ax);
	free(ax_array);

	free(dst_ay);
	free(sub_dst_ay);
	free(ay_array);

	free(Ix_array[0][0][0]);
	free(dst_Ix);
	free(sub_dst_Ix);
	free(Ix_array);

	free(Iy_array[0][0][0]);
	free(dst_Iy);
	free(sub_dst_Iy);
	free(Iy_array);

	free(scale_array);

	gettimeofday(&tv_nucom_end, nullptr);

#ifdef PRINT_INFO
	printf("root SCORE : %f\n", time_root_score);
	printf("part SCORE : %f\n", time_part_score);
	printf("dt  : %f\n", time_dt);
	printf("calc_a_score : %f\n", time_calc_a_score);
#endif
	res = cuCtxSetCurrent(ctx[0]);
	if(res != CUDA_SUCCESS) {
		printf("cuCtxSetCurrent(ctx[0]) failed: res = %s\n",cuda_response_to_string(res));
		exit(1);
	}

	/* free memory regions */
	res = cuMemFreeHost((void *)featp2[0]);
	if(res != CUDA_SUCCESS) {
		printf("cuMemFreeHost(featp2[0]) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	s_free(featp2);

	res = cuMemFreeHost((void *)rootmatch[interval][0]);
	if(res != CUDA_SUCCESS) {
		printf("cuMemFreeHost(rootmatch[0][0]) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}
	s_free(rootmatch[0]);
	s_free(rootmatch);

	if (partmatch != nullptr) {
		res = cuMemFreeHost((void *)partmatch[0][0]);
		if(res != CUDA_SUCCESS) {
			printf("cuMemFreeHost(partmatch[0][0]) failed: res = %s\n", cuda_response_to_string(res));
			exit(1);
		}

		s_free(partmatch[0]);
		s_free(partmatch);

		s_free(new_PADsize);
	}

	/* release */
	s_free(rm_size_array[0]);
	s_free(rm_size_array);
	s_free(pm_size_array[0]);
	s_free(pm_size_array);

	/* Output boundary-box coordinate information */
	int GL=(numpart[0]+1)*4+3;
	FLOAT *boxes=(FLOAT*)calloc(detected_boxes*GL,sizeof(FLOAT));		//box coordinate information(Temp)

	FLOAT *T1 = boxes;
	for(int i = 0; i < LofFeat; i++) {
		int num_t = b_nums[i]*GL;
		if(num_t > 0) {
			FLOAT *T2 = Tboxes[i];
			//memcpy_s(T1,sizeof(FLOAT)*num_t,T2,sizeof(FLOAT)*num_t);
			memcpy(T1, T2,sizeof(FLOAT)*num_t);
			T1 += num_t;
		}
	}

	FLOAT abs_threshold = abs(thresh);

	/* accumulated score calculation */
	FLOAT max_score = 0.0;

	/* add offset to accumulated score */
	for(int i = 0; i < MO->MI->IM_HEIGHT*MO->MI->IM_WIDTH; i++) {
		if (acc_score[i] < thresh) {
			acc_score[i] = 0.0;
		} else {
			acc_score[i] += abs_threshold;

			if (acc_score[i] > max_score)
				max_score = acc_score[i];
		}
	}

	/* normalization */
	if (max_score > 0.0) {
		FLOAT ac_ratio = 1.0 / max_score;

		for (int i = 0; i < MO->MI->IM_HEIGHT*MO->MI->IM_WIDTH; i++) {
			acc_score[i] *= ac_ratio;
		}
	}

	/* release */
	free_boxes(Tboxes,LofFeat);
	s_free(b_nums);

	/* output result */
	*detected_count = detected_boxes;
	return boxes;
}
Exemplo n.º 18
0
int cuda_test_memcpy_async(unsigned int size)
{
	int i;
	CUresult res;
	CUdevice dev;
	CUcontext ctx;
	CUstream stream;
	CUdeviceptr data_addr;
	unsigned int *in, *out;
	struct timeval tv;
	struct timeval tv_total_start, tv_total_end;
	unsigned long total;
	struct timeval tv_h2d_start, tv_h2d_end;
	float h2d;
	struct timeval tv_d2h_start, tv_d2h_end;
	float d2h;

	gettimeofday(&tv_total_start, NULL);

	res = cuInit(0);
	if (res != CUDA_SUCCESS) {
		printf("cuInit failed: res = %u\n", (unsigned int)res);
		return -1;
	}

	res = cuDeviceGet(&dev, 0);
	if (res != CUDA_SUCCESS) {
		printf("cuDeviceGet failed: res = %u\n", (unsigned int)res);
		return -1;
	}

	res = cuCtxCreate(&ctx, 0, dev);
	if (res != CUDA_SUCCESS) {
		printf("cuCtxCreate failed: res = %u\n", (unsigned int)res);
		return -1;
	}

	res = cuStreamCreate(&stream, 0);
	if (res != CUDA_SUCCESS) {
		printf("cuStreamCreate failed: res = %u\n", (unsigned int)res);
		return -1;
	}

	res = cuMemAlloc(&data_addr, size);
	if (res != CUDA_SUCCESS) {
		printf("cuMemAlloc failed: res = %u\n", (unsigned int)res);
		return -1;
	}

	res = cuMemAllocHost((void **)&in, size);
	if (res != CUDA_SUCCESS) {
		printf("cuMemAllocHost(in) failed: res = %u\n", (unsigned int)res);
		return -1;
	}

	res = cuMemAllocHost((void **)&out, size);
	if (res != CUDA_SUCCESS) {
		printf("cuMemAllocHost(out) failed: res = %u\n", (unsigned int)res);
		return -1;
	}

	for (i = 0; i < size / 4; i++) {
		in[i] = i+1;
		out[i] = 0;
	}

	gettimeofday(&tv_h2d_start, NULL);
	res = cuMemcpyHtoDAsync(data_addr, in, size, stream);
	if (res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoDAsync failed: res = %u\n", (unsigned int)res);
		return -1;
	}
	res = cuStreamSynchronize(stream);
	if (res != CUDA_SUCCESS) {
		printf("cuStreamSynchronize() failed: res = %u\n", (unsigned int)res);
		return -1;
	}
	gettimeofday(&tv_h2d_end, NULL);

	gettimeofday(&tv_d2h_start, NULL);
	res = cuMemcpyDtoHAsync(out, data_addr, size, stream);
	if (res != CUDA_SUCCESS) {
		printf("cuMemcpyDtoHAsync failed: res = %u\n", (unsigned int)res);
		return -1;
	}
	res = cuStreamSynchronize(stream);
	if (res != CUDA_SUCCESS) {
		printf("cuStreamSynchronize() failed: res = %u\n", (unsigned int)res);
		return -1;
	}
	gettimeofday(&tv_d2h_end, NULL);

	for (i = 0; i < size / 4; i++) {
		if (in[i] != out[i]) {
			printf("in[%d] = %u, out[%d] = %u\n",
				   i, in[i], i, out[i]);
		}
	}

	res = cuMemFreeHost(out);
	if (res != CUDA_SUCCESS) {
		printf("cuMemFreeHost(out) failed: res = %u\n", (unsigned int)res);
		return -1;
	}

	res = cuMemFreeHost(in);
	if (res != CUDA_SUCCESS) {
		printf("cuMemFreeHost(in) failed: res = %u\n", (unsigned int)res);
		return -1;
	}

	res = cuMemFree(data_addr);
	if (res != CUDA_SUCCESS) {
		printf("cuMemFree failed: res = %u\n", (unsigned int)res);
		return -1;
	}

	res = cuStreamDestroy(stream);
	if (res != CUDA_SUCCESS) {
		printf("cuStreamDestroy failed: res = %u\n", (unsigned int)res);
		return -1;
	}

	res = cuCtxDestroy(ctx);
	if (res != CUDA_SUCCESS) {
		printf("cuCtxDestroy failed: res = %u\n", (unsigned int)res);
		return -1;
	}

	gettimeofday(&tv_total_end, NULL);

	tvsub(&tv_h2d_end, &tv_h2d_start, &tv);
	h2d = tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;
	tvsub(&tv_d2h_end, &tv_d2h_start, &tv);
	d2h = tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;
	tvsub(&tv_total_end, &tv_total_start, &tv);
	total = tv.tv_sec * 1000 + tv.tv_usec / 1000;

	printf("HtoD: %f\n", h2d);
	printf("DtoH: %f\n", d2h);

	return 0;

end:

	return -1;
}
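
The example above times each asynchronous copy with gettimeofday() around cuStreamSynchronize(). A minimal alternative sketch, assuming the same setup as above (context made current by cuCtxCreate, plus stream, data_addr, in, out and the usual <cuda.h>/<stdio.h> includes), brackets each copy with CUDA events so that cuEventElapsedTime() returns the elapsed milliseconds directly; the helper name time_copies_with_events is hypothetical and error checking is elided for brevity.

/* Hypothetical helper: time the H2D and D2H copies with CUDA events
 * instead of gettimeofday(); assumes the setup from the example above. */
static int time_copies_with_events(CUstream stream, CUdeviceptr data_addr,
                                   unsigned int *in, unsigned int *out,
                                   unsigned int size)
{
	CUevent start, stop;
	float h2d_ms, d2h_ms;

	cuEventCreate(&start, CU_EVENT_DEFAULT);
	cuEventCreate(&stop, CU_EVENT_DEFAULT);

	/* host-to-device copy, bracketed by events recorded on the same stream */
	cuEventRecord(start, stream);
	cuMemcpyHtoDAsync(data_addr, in, size, stream);
	cuEventRecord(stop, stream);
	cuEventSynchronize(stop);
	cuEventElapsedTime(&h2d_ms, start, stop);

	/* device-to-host copy, measured the same way */
	cuEventRecord(start, stream);
	cuMemcpyDtoHAsync(out, data_addr, size, stream);
	cuEventRecord(stop, stream);
	cuEventSynchronize(stop);
	cuEventElapsedTime(&d2h_ms, start, stop);

	printf("HtoD: %f\nDtoH: %f\n", h2d_ms, d2h_ms);

	cuEventDestroy(start);
	cuEventDestroy(stop);
	return 0;
}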
Exemplo n.º 19
0
int gib_free ( void *buffers, gib_context c ) {
  ERROR_CHECK_FAIL(cuCtxPushCurrent(((gpu_context)(c->acc_context))->pCtx));
  ERROR_CHECK_FAIL(cuMemFreeHost(buffers));
  ERROR_CHECK_FAIL(cuCtxPopCurrent(&((gpu_context)(c->acc_context))->pCtx));
  return GIB_SUC;
}
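
gib_free() assumes the buffer it releases was pinned with cuMemAllocHost()/cuMemHostAlloc() while the same accelerator context was current, which is why it pushes c->acc_context before calling cuMemFreeHost(). A minimal sketch of that allocate/free pairing in plain driver-API terms is below; the helper names alloc_pinned_in_ctx and free_pinned_in_ctx are hypothetical and are not part of the gib API.

/* Hypothetical helpers illustrating the pattern gib_free() relies on:
 * the pinned host buffer is allocated and released with the same
 * context pushed onto the calling thread. */
static int alloc_pinned_in_ctx(CUcontext ctx, void **buffers, size_t size)
{
  CUcontext popped;
  if (cuCtxPushCurrent(ctx) != CUDA_SUCCESS) return -1;
  if (cuMemAllocHost(buffers, size) != CUDA_SUCCESS) {
    cuCtxPopCurrent(&popped);
    return -1;
  }
  return (cuCtxPopCurrent(&popped) == CUDA_SUCCESS) ? 0 : -1;
}

static int free_pinned_in_ctx(CUcontext ctx, void *buffers)
{
  CUcontext popped;
  if (cuCtxPushCurrent(ctx) != CUDA_SUCCESS) return -1;
  if (cuMemFreeHost(buffers) != CUDA_SUCCESS) {
    cuCtxPopCurrent(&popped);
    return -1;
  }
  return (cuCtxPopCurrent(&popped) == CUDA_SUCCESS) ? 0 : -1;
}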
Exemplo n.º 20
0
int main() {
  CU_ERROR_CHECK(cuInit(0));

  int count;
  CU_ERROR_CHECK(cuDeviceGetCount(&count));

  count = (count > 2) ? 2 : count;

  CUdevice devices[count];
  for (int i = 0; i < count; i++)
    CU_ERROR_CHECK(cuDeviceGet(&devices[i], i));

  // Question 1:  Can you create multiple contexts on the same device?
  {
    fprintf(stderr, "Attempting to create multiple contexts on each device...\n");
    CUcontext contexts[count * N];
    size_t j = 0;
    for (int i = 0; i < count; i++) {
      CUresult error = CUDA_SUCCESS;
      size_t k;
      for (k = 0; k < N && error == CUDA_SUCCESS; k++) {
        error = cuCtxCreate(&contexts[j], CU_CTX_SCHED_AUTO, devices[i]);
        if (error == CUDA_SUCCESS)
          CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[j++]));
      }
      fprintf(stderr, "  created %zu contexts on device %d before cuCtxCreate returned \"%s\"\n", (k - 1), i, cuGetErrorString(error));
    }

    CUresult error = CUDA_SUCCESS;
    size_t k;
    for (k = 0; k < j && error == CUDA_SUCCESS; k++)
      error = cuCtxPushCurrent(contexts[k]);
    if (error == CUDA_SUCCESS)
      fprintf(stderr, "  successfully pushed %zu contexts with cuCtxPushCurrent\n", k);
    else
      fprintf(stderr, "  pushed %zu contexts before cuCtxPushCurrent returned \"%s\"\n", (k - 1), cuGetErrorString(error));

    for (size_t k = 0; k < j; k++)
      CU_ERROR_CHECK(cuCtxDestroy(contexts[k]));

    fprintf(stderr, "\n");
  }

  CUcontext contexts[count][2];
  for (int i = 0; i < count; i++) {
    for (size_t j = 0; j < 2; j++) {
      CU_ERROR_CHECK(cuCtxCreate(&contexts[i][j], CU_CTX_SCHED_AUTO, devices[i]));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[i][j]));
    }
  }

  // Question 2:  Can you access a host pointer in a different context from
  // which it was created?
  // Question 3:  Can you free a host pointer in a different context from which
  // it was created?
  {
    void * hPtr;
    CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0]));
    CU_ERROR_CHECK(cuMemAllocHost(&hPtr, 1024));
    CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0]));

    CUdeviceptr dPtr[count];
    CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1]));
    CU_ERROR_CHECK(cuMemAlloc(&dPtr[0], 1024)); // Different context, same device
    fprintf(stderr, "Accessing a host pointer from a different context to which it was allocated (on the same device) returns \"%s\"\n", cuGetErrorString(cuMemcpyHtoD(dPtr[0], hPtr, 1024)));
    CU_ERROR_CHECK(cuMemFree(dPtr[0]));
    CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1]));
    if (count > 1) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[1][0]));
      CU_ERROR_CHECK(cuMemAlloc(&dPtr[1], 1024)); // Different context, different device
      fprintf(stderr, "Accessing a host pointer from a different context to which it was allocated (on a different device) returns \"%s\"\n", cuGetErrorString(cuMemcpyHtoD(dPtr[1], hPtr, 1024)));
      CU_ERROR_CHECK(cuMemFree(dPtr[1]));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[1][0]));
    }

    fprintf(stderr, "\n");

    CUresult error = CUDA_ERROR_UNKNOWN;
    if (count > 1) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[1][0]));
      error = cuMemFreeHost(hPtr);
      fprintf(stderr, "Freeing a host pointer from a different context to which it was allocated (on a different device) returns \"%s\"\n", cuGetErrorString(error));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[1][0]));
    }
    if (error != CUDA_SUCCESS) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1]));
      error = cuMemFreeHost(hPtr);
      fprintf(stderr, "Freeing a host pointer from a different context to which it was allocated (on the same device) returns \"%s\"\n", cuGetErrorString(error));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1]));
    }
    if (error != CUDA_SUCCESS) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0]));
      error = cuMemFreeHost(hPtr);
      fprintf(stderr, "Freeing a host pointer from the same context to which it was allocated returns \"%s\"\n", cuGetErrorString(error));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0]));
    }

    fprintf(stderr, "\n");
  }

  // Question 4:  Can you access a device pointer in a different context from
  // which it was created?
  // Question 5:  Can you free a device pointer in a different context from which
  // it was created?
  {
    CUdeviceptr dPtr[count][2];
    CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0]));
    CU_ERROR_CHECK(cuMemAlloc(&dPtr[0][0], 1024));
    CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0]));
    CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1]));
    CU_ERROR_CHECK(cuMemAlloc(&dPtr[0][1], 1024));
    CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1]));

    CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1]));
    fprintf(stderr, "Accessing a device pointer from a different context to which it was allocated (on the same device) returns \"%s\"\n", cuGetErrorString(cuMemcpyDtoD(dPtr[0][0], dPtr[0][1], 1024)));
    CU_ERROR_CHECK(cuMemFree(dPtr[0][1]));
    CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1]));

    if (count > 1) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[1][0]));
      CU_ERROR_CHECK(cuMemAlloc(&dPtr[1][0], 1024)); // Different context, different device
      fprintf(stderr, "Accessing a device pointer from a different context to which it was allocated (on a different device) returns \"%s\"\n", cuGetErrorString(cuMemcpyDtoD(dPtr[0][0], dPtr[1][0], 1024)));
      CU_ERROR_CHECK(cuMemFree(dPtr[1][0]));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[1][0]));
    }

    fprintf(stderr, "\n");

    CUresult error = CUDA_ERROR_UNKNOWN;
    if (count > 1) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[1][0]));
      error = cuMemFree(dPtr[0][0]);
      fprintf(stderr, "Freeing a device pointer from a different context to which it was allocated (on a different device) returns \"%s\"\n", cuGetErrorString(error));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[1][0]));
    }
    if (error != CUDA_SUCCESS) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1]));
      error = cuMemFree(dPtr[0][0]);
      fprintf(stderr, "Freeing a device pointer from a different context to which it was allocated (on the same device) returns \"%s\"\n", cuGetErrorString(error));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1]));
    }
    if (error != CUDA_SUCCESS) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0]));
      error = cuMemFree(dPtr[0][0]);
      fprintf(stderr, "Freeing a device pointer from the same context to which it was allocated returns \"%s\"\n", cuGetErrorString(error));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0]));
    }

    fprintf(stderr, "\n");
  }

  // Question 6:  Can you access a module in a different context from which it
  // was loaded?
  // Question 7:  Can you unload a module in a different context from which it
  // was loaded?
  {
    CUmodule module;
    CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0]));
    CU_ERROR_CHECK(cuModuleLoad(&module, "kernel-test.ptx"));
    CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0]));

    CUfunction function = 0;
    if (count > 1) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[1][0]));
      fprintf(stderr, "Getting a function pointer from a different context to which it was loaded (on a different device) returns \"%s\"\n", cuGetErrorString(cuModuleGetFunction(&function, module, "kernel")));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[1][0]));
    }
    if (function == 0) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1]));
      fprintf(stderr, "Getting a function pointer from a different context to which it was loaded (on the same device) returns \"%s\"\n", cuGetErrorString(cuModuleGetFunction(&function, module, "kernel")));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1]));
    }
    if (function == 0) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0]));
      fprintf(stderr, "Getting a function pointer from the same context to which it was loaded returns \"%s\"\n", cuGetErrorString(cuModuleGetFunction(&function, module, "kernel")));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0]));
    }

    fprintf(stderr, "\n");

    CUdeviceptr a, b;
    CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0]));
    CU_ERROR_CHECK(cuMemAlloc(&a, sizeof(float)));
    CU_ERROR_CHECK(cuMemAlloc(&b, sizeof(float)));
    CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0]));
    void * params[] = { &a, & b };

    CUresult error = CUDA_ERROR_UNKNOWN;
    if (count > 1) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[1][0]));
      fprintf(stderr, "Launching a function from a different context to which it was loaded (on a different device) returns \"%s\"\n", cuGetErrorString(error = cuLaunchKernel(function, 1, 1, 1, 1, 1, 1, 0, 0, params, NULL)));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[1][0]));
    }
    if (error != CUDA_SUCCESS) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1]));
      fprintf(stderr, "Launching a function from a different context to which it was loaded (on the same device) returns \"%s\"\n", cuGetErrorString(error = cuLaunchKernel(function, 1, 1, 1, 1, 1, 1, 0, 0, params, NULL)));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1]));
    }
    if (error != CUDA_SUCCESS) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0]));
      fprintf(stderr, "Launching a function from the same context to which it was loaded returns \"%s\"\n", cuGetErrorString(error = cuLaunchKernel(function, 1, 1, 1, 1, 1, 1, 0, 0, params, NULL)));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0]));
    }

    fprintf(stderr, "\n");

    error = CUDA_ERROR_UNKNOWN;
    if (count > 1) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[1][0]));
      fprintf(stderr, "Unloading a module from a different context to which it was loaded (on a different device) returns \"%s\"\n", cuGetErrorString(error = cuModuleUnload(module)));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[1][0]));
    }
    if (error != CUDA_SUCCESS) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1]));
      fprintf(stderr, "Unloading a module from a different context to which it was loaded (on the same device) returns \"%s\"\n", cuGetErrorString(error = cuModuleUnload(module)));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1]));
    }
    if (error != CUDA_SUCCESS) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0]));
      fprintf(stderr, "Unloading a module from the same context to which it was loaded returns \"%s\"\n", cuGetErrorString(error = cuModuleUnload(module)));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0]));
    }

    CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0]));
    CU_ERROR_CHECK(cuMemFree(a));
    CU_ERROR_CHECK(cuMemFree(b));
    CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0]));
  }

  for (int i = 0; i < count; i++) {
    for (size_t j = 0; j < 2; j++)
      CU_ERROR_CHECK(cuCtxDestroy(contexts[i][j]));
  }

  return 0;
}