void CudaVideoRender::initCudaVideo( )
	{
		// bind the context lock to the CUDA context
		CUresult result = cuvidCtxLockCreate(&m_CtxLock, m_cuContext);
		if (result != CUDA_SUCCESS) {
			printf("cuvidCtxLockCreate failed: %d\n", result);
			assert(0);
		}

		std::auto_ptr<VideoDecoder> apVideoDecoder(new VideoDecoder(m_pVideoSource->format(), m_cuContext, m_eVideoCreateFlags, m_CtxLock));
		std::auto_ptr<VideoParser> apVideoParser(new VideoParser(apVideoDecoder.get(), m_pFrameQueue));
		m_pVideoSource->setParser(*apVideoParser.get());

		m_pVideoParser  = apVideoParser.release();
		m_pVideoDecoder = apVideoDecoder.release();

		// Create a Stream ID for handling Readback
		if (m_bReadback) {
			cutilDrvSafeCallNoSync( cuStreamCreate(&m_ReadbackSID, 0) );
			cutilDrvSafeCallNoSync( cuStreamCreate(&m_KernelSID,   0) );
			printf("> initCudaVideo()\n");
			printf("  CUDA Streams (%s) <m_ReadbackSID = %p>\n", ((m_ReadbackSID == 0) ? "Disabled" : "Enabled"), m_ReadbackSID );
			printf("  CUDA Streams (%s) <m_KernelSID   = %p>\n", ((m_KernelSID   == 0) ? "Disabled" : "Enabled"), m_KernelSID   );
		}
	}
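
The cutilDrvSafeCallNoSync and checkCudaErrors wrappers used throughout these examples come from NVIDIA's sample helpers, not from cuda.h itself. A minimal sketch of an equivalent checker (assuming CUDA 6.0+ for cuGetErrorName):

#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>

/* Print the driver API error name and abort; a stand-in for the
   SDK's checkCudaErrors/cutilDrvSafeCallNoSync helpers. */
static void checkDrv(CUresult r, const char *what)
{
    if (r != CUDA_SUCCESS) {
        const char *name = NULL;
        cuGetErrorName(r, &name);  /* available since CUDA 6.0 */
        fprintf(stderr, "%s failed: %s (%d)\n", what, name ? name : "unknown", (int)r);
        exit(EXIT_FAILURE);
    }
}

/* Usage: checkDrv(cuStreamCreate(&stream, 0), "cuStreamCreate"); */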
Example #2
void
initCudaVideo()
{
    // bind the context lock to the CUDA context
    CUresult result = cuvidCtxLockCreate(&g_CtxLock, g_oContext);

    if (result != CUDA_SUCCESS)
    {
        printf("cuvidCtxLockCreate failed: %d\n", result);
        assert(0);
    }

    std::auto_ptr<VideoDecoder> apVideoDecoder(new VideoDecoder(g_pVideoSource->format(), g_oContext, g_eVideoCreateFlags, g_CtxLock));
    std::auto_ptr<VideoParser> apVideoParser(new VideoParser(apVideoDecoder.get(), g_pFrameQueue));
    g_pVideoSource->setParser(*apVideoParser.get());

    g_pVideoParser  = apVideoParser.release();
    g_pVideoDecoder = apVideoDecoder.release();

    // Create a Stream ID for handling Readback
    if (g_bReadback)
    {
        checkCudaErrors(cuStreamCreate(&g_ReadbackSID, 0));
        checkCudaErrors(cuStreamCreate(&g_KernelSID,   0));
        printf("> initCudaVideo()\n");
        printf("  CUDA Streams (%s) <g_ReadbackSID = %p>\n", ((g_ReadbackSID == 0) ? "Disabled" : "Enabled"), g_ReadbackSID);
        printf("  CUDA Streams (%s) <g_KernelSID   = %p>\n", ((g_KernelSID   == 0) ? "Disabled" : "Enabled"), g_KernelSID);
    }
}
Example #3
bool GLInteropResource::ensureResource(int w, int h, int H, GLuint tex, int plane)
{
    Q_ASSERT(plane < 2 && "plane number must be 0 or 1 for NV12");
    TexRes &r = res[plane];
    if (r.texture == tex && r.w == w && r.h == h && r.H == H && r.cuRes)
        return true;
    if (!ctx) {
        // TODO: how to use pop/push decoder's context without the context in opengl context
        CUDA_ENSURE(cuCtxCreate(&ctx, CU_CTX_SCHED_BLOCKING_SYNC, dev), false);
        if (USE_STREAM) {
            CUDA_WARN(cuStreamCreate(&res[0].stream, CU_STREAM_DEFAULT));
            CUDA_WARN(cuStreamCreate(&res[1].stream, CU_STREAM_DEFAULT));
        }
        qDebug("cuda contex on gl thread: %p", ctx);
        CUDA_ENSURE(cuCtxPopCurrent(&ctx), false); // TODO: why cuMemcpy2D need this
    }
    if (r.cuRes) {
        CUDA_ENSURE(cuGraphicsUnregisterResource(r.cuRes), false);
        r.cuRes = NULL;
    }
    // CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD works too for opengl, but not d3d
    CUDA_ENSURE(cuGraphicsGLRegisterImage(&r.cuRes, tex, GL_TEXTURE_2D, CU_GRAPHICS_REGISTER_FLAGS_NONE), false);
    r.texture = tex;
    r.w = w;
    r.h = h;
    r.H = H;
    return true;
}
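
Registration alone does not let CUDA write into the texture; each transfer is typically bracketed by a map/unmap pair. A fragment sketching that step, reusing this sample's CUDA_WARN macro and TexRes fields (hypothetical placement, not part of the original source):

/* Map the registered resource, fetch its CUarray, copy, then unmap
   so OpenGL can sample the texture again. */
CUarray array;
CUDA_WARN(cuGraphicsMapResources(1, &r.cuRes, r.stream));
CUDA_WARN(cuGraphicsSubResourceGetMappedArray(&array, r.cuRes, 0, 0));
/* ... fill a CUDA_MEMCPY2D with dstArray = array and call cuMemcpy2D ... */
CUDA_WARN(cuGraphicsUnmapResources(1, &r.cuRes, r.stream));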
Example #4
void *cuda_make_ctx(CUcontext ctx, int flags) {
  cuda_context *res;
  CUresult err;
  void *p;

  res = malloc(sizeof(*res));
  if (res == NULL)
    return NULL;
  res->ctx = ctx;
  res->err = CUDA_SUCCESS;
  res->blas_handle = NULL;
  res->refcnt = 1;
  res->flags = flags;
  res->enter = 0;
  res->freeblocks = NULL;
  if (detect_arch(ARCH_PREFIX, res->bin_id, &err)) {
    goto fail_cache;
  }
  res->extcopy_cache = cache_lru(64, 32, (cache_eq_fn)extcopy_eq,
                                 (cache_hash_fn)extcopy_hash,
                                 (cache_freek_fn)extcopy_free,
                                 (cache_freev_fn)cuda_freekernel);
  if (res->extcopy_cache == NULL) {
    goto fail_cache;
  }
  err = cuStreamCreate(&res->s, 0);
  if (err != CUDA_SUCCESS) {
    goto fail_stream;
  }
  err = cuStreamCreate(&res->mem_s, CU_STREAM_NON_BLOCKING);
  if (err != CUDA_SUCCESS) {
    goto fail_mem_stream;
  }
  err = cuMemAllocHost(&p, 16);
  if (err != CUDA_SUCCESS) {
    goto fail_errbuf;
  }
  memset(p, 0, 16);
  /* Need to tag for new_gpudata */
  TAG_CTX(res);
  res->errbuf = new_gpudata(res, (CUdeviceptr)p, 16);
  if (res->errbuf == NULL) {
    err = res->err;
    goto fail_end;
  }
  res->errbuf->flags |= CUDA_MAPPED_PTR;
  return res;
 fail_end:
  cuMemFreeHost(p);
 fail_errbuf:
  cuStreamDestroy(res->mem_s);
 fail_mem_stream:
  cuStreamDestroy(res->s);
 fail_stream:
  cache_destroy(res->extcopy_cache);
 fail_cache:
  free(res);
  return NULL;
}
Example #5
bool EGLInteropResource::ensureD3D9CUDA(int w, int h, int W, int H)
{
    TexRes &r = res[0];// 1 NV12 texture
    if (r.w == w && r.h == h && r.W == W && r.H == H && r.cuRes)
        return true;
    if (!ctx) {
        // TODO: how to use pop/push decoder's context without the context in opengl context
        if (!ensureD3DDevice())
            return false;
        // CUdevice is different from decoder's
        CUDA_ENSURE(cuD3D9CtxCreate(&ctx, &dev, CU_CTX_SCHED_BLOCKING_SYNC, device9), false);
#if USE_STREAM
        CUDA_WARN(cuStreamCreate(&res[0].stream, CU_STREAM_DEFAULT));
        CUDA_WARN(cuStreamCreate(&res[1].stream, CU_STREAM_DEFAULT));
#endif //USE_STREAM
        qDebug("cuda contex on gl thread: %p", ctx);
        CUDA_ENSURE(cuCtxPopCurrent(&ctx), false); // TODO: why cuMemcpy2D need this
    }
    if (r.cuRes) {
        CUDA_ENSURE(cuGraphicsUnregisterResource(r.cuRes), false);
        r.cuRes = NULL;
    }

    // create d3d resource for interop
    if (!surface9_nv12) {
        // TODO: need pitch from cuvid to ensure cuMemcpy2D can copy the whole pitch
        DX_ENSURE(device9->CreateTexture(W
                                         //, H
                                         , H*3/2
                                         , 1
                                         , D3DUSAGE_DYNAMIC //D3DUSAGE_DYNAMIC is lockable // 0 is from NV example. cudaD3D9.h says The primary rendertarget may not be registered with CUDA. So can not be D3DUSAGE_RENDERTARGET?
                                         //, D3DUSAGE_RENDERTARGET
                                         , D3DFMT_L8
                                         //, (D3DFORMAT)MAKEFOURCC('N','V','1','2') // can not create nv12. use 2 textures L8+A8L8?
                                         , D3DPOOL_DEFAULT // must be D3DPOOL_DEFAULT for cuda?
                                         , &texture9_nv12
                                         , NULL) // - Resources allocated as shared may not be registered with CUDA.
                  , false);
        DX_ENSURE(device9->CreateOffscreenPlainSurface(W, H, (D3DFORMAT)MAKEFOURCC('N','V','1','2'), D3DPOOL_DEFAULT, &surface9_nv12, NULL), false);
    }

    // TODO: cudaD3D9.h says NV12 is not supported
    // CUDA_ERROR_INVALID_HANDLE if register D3D9 surface
    // TODO: why flag CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD is invalid while it's fine for opengl
    CUDA_ENSURE(cuGraphicsD3D9RegisterResource(&r.cuRes, texture9_nv12, CU_GRAPHICS_REGISTER_FLAGS_NONE), false);
    return true;
}
Example #6
void GPUInterface::ResizeStreamCount(int newStreamCount) {
#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr,"\t\t\tEntering GPUInterface::ResizeStreamCount\n");
#endif
    SAFE_CUDA(cuCtxPushCurrent(cudaContext));

    SAFE_CUDA(cuCtxSynchronize());

    if (cudaStreams != NULL) {
        for(int i=0; i<numStreams; i++) {
            if (cudaStreams[i] != NULL)
                SAFE_CUDA(cuStreamDestroy(cudaStreams[i]));
        }
        free(cudaStreams);
    }

    if (cudaEvents != NULL) {
        for(int i=0; i<numStreams; i++) {
            if (cudaEvents[i] != NULL)
                SAFE_CUDA(cuEventDestroy(cudaEvents[i]));
        }
        free(cudaEvents);
    }

    if (newStreamCount == 1) {
        numStreams = 1;
        cudaStreams = (CUstream*) malloc(sizeof(CUstream) * numStreams);
        cudaEvents = (CUevent*) malloc(sizeof(CUevent) * (numStreams + 1));
        cudaStreams[0] = NULL;
        CUevent event;
        for(int i=0; i<2; i++) {
            SAFE_CUDA(cuEventCreate(&event, CU_EVENT_DISABLE_TIMING));
            cudaEvents[i] = event;
        }
    } else {
        numStreams = newStreamCount;
        if (numStreams > BEAGLE_STREAM_COUNT) {
            numStreams = BEAGLE_STREAM_COUNT;
        }
        cudaStreams = (CUstream*) malloc(sizeof(CUstream) * numStreams);
        CUstream stream;
        cudaEvents = (CUevent*) malloc(sizeof(CUevent) * (numStreams + 1));
        CUevent event;
        for(int i=0; i<numStreams; i++) {
            SAFE_CUDA(cuStreamCreate(&stream, CU_STREAM_DEFAULT));
            cudaStreams[i] = stream;
            SAFE_CUDA(cuEventCreate(&event, CU_EVENT_DISABLE_TIMING));
            cudaEvents[i] = event;
        }
        SAFE_CUDA(cuEventCreate(&event, CU_EVENT_DISABLE_TIMING));
        cudaEvents[numStreams] = event;
    }

    SAFE_CUDA(cuCtxPopCurrent(&cudaContext));

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr,"\t\t\tLeaving  GPUInterface::ResizeStreamCount\n");
#endif
}
Example #7
        static CUstream create(const vex::backend::context &ctx, unsigned flags = 0) {
            ctx.set_current();

            CUstream s;
            cuda_check( cuStreamCreate(&s, flags) );

            return s;
        }
Example #8
  stream device_t<CUDA>::createStream(){
    OCCA_EXTRACT_DATA(CUDA, Device);

    CUstream *retStream = new CUstream;

    OCCA_CUDA_CHECK("Device: createStream",
                    cuStreamCreate(retStream, CU_STREAM_DEFAULT));

    return retStream;
  }
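
Because createStream() returns a heap-allocated CUstream, the matching teardown must release both the driver stream and the wrapper. A hypothetical counterpart (assuming OCCA's stream type is an opaque pointer, as the cast in createStream suggests):

  void device_t<CUDA>::freeStream(stream s){
    CUstream *cuStream = (CUstream*) s;

    OCCA_CUDA_CHECK("Device: freeStream",
                    cuStreamDestroy(*cuStream));

    delete cuStream;
  }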
Example #9
File: lib-83.c Project: 0day-ci/gcc
int
main (int argc, char **argv)
{
  float atime;
  CUstream stream;
  CUresult r;

  acc_init (acc_device_nvidia);

  (void) acc_get_device_num (acc_device_nvidia);

  init_timers (1);

  stream = (CUstream) acc_get_cuda_stream (0);
  if (stream != NULL)
    abort ();

  r = cuStreamCreate (&stream, CU_STREAM_DEFAULT);
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuStreamCreate failed: %d\n", r);
      abort ();
    }

  if (!acc_set_cuda_stream (0, stream))
    abort ();

  start_timer (0);

  acc_wait_all_async (0);

  acc_wait (0);

  atime = stop_timer (0);

  if (0.010 < atime)
    {
      fprintf (stderr, "actual time too long\n");
      abort ();
    }

  fini_timers ();

  acc_shutdown (acc_device_nvidia);

  exit (0);
}
Example #10
int
main (int argc, char **argv)
{
  CUstream stream;
  CUresult r;
  struct timeval tv1, tv2;
  time_t t1;

  acc_init (acc_device_nvidia);

  stream = (CUstream) acc_get_cuda_stream (0);
  if (stream != NULL)
    abort ();

  r = cuStreamCreate (&stream, CU_STREAM_DEFAULT);
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuStreamCreate failed: %d\n", r);
      abort ();
    }

  if (!acc_set_cuda_stream (0, stream))
    abort ();

  gettimeofday (&tv1, NULL);

  acc_wait_all_async (0);

  acc_wait (0);

  gettimeofday (&tv2, NULL);

  t1 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);

  if (t1 > 1000)
    {
      fprintf (stderr, "too long\n");
      abort ();
    }

  acc_shutdown (acc_device_nvidia);

  exit (0);
}
Example #11
void *cuda_make_ctx(CUcontext ctx, int flags) {
  int64_t v = 0;
  cuda_context *res;
  CUresult err;
  int e = 0;

  res = malloc(sizeof(*res));
  if (res == NULL)
    return NULL;
  res->ctx = ctx;
  res->err = CUDA_SUCCESS;
  res->blas_handle = NULL;
  res->refcnt = 1;
  res->flags = flags;
  res->enter = 0;
  if (detect_arch(ARCH_PREFIX, res->bin_id, &err)) {
    free(res);
    return NULL;
  }
  res->extcopy_cache = cache_alloc(64, 32);
  if (res->extcopy_cache == NULL) {
    free(res);
    return NULL;
  }
  err = cuStreamCreate(&res->s, 0);
  if (err != CUDA_SUCCESS) {
    cache_free(res->extcopy_cache);
    free(res);
    return NULL;
  }
  TAG_CTX(res); /* Need to tag before cuda_alloc */
  res->errbuf = cuda_alloc(res, 8, &v, GA_BUFFER_INIT, &e);
  if (e != GA_NO_ERROR) {
    err = res->err;
    cache_free(res->extcopy_cache);
    cuStreamDestroy(res->s);
    free(res);
    return NULL;
  }
  res->refcnt--; /* Don't want to create a reference loop with the errbuf */
  return res;
}
Example #12
CUresult initialize(int device, CUcontext *phContext, CUdevice *phDevice, CUmodule *phModule, CUstream *phStream)
{
  // Initialize the device and create the context
  cuInit(0);
  cuDeviceGet(phDevice, device);
  CUresult status = cuCtxCreate(phContext, 0, *phDevice);
  if (status != CUDA_SUCCESS)
    { std::cout << "ERROR: could not create context\n"; exit(1); }

  status = cuModuleLoad(phModule, "PTXTestFunctions.o.ptx");
  if (status != CUDA_SUCCESS)
    { std::cout << "ERROR: could not load .ptx module: " << status << "\n"; exit(1); }

  // Create the stream
  status = cuStreamCreate(phStream, 0);
  if (status != CUDA_SUCCESS)
    { printf("ERROR: could not create stream\n"); exit(1); }

  return status;
}
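
A hypothetical caller for this helper, showing the usual teardown order (module before context):

CUcontext ctx;
CUdevice dev;
CUmodule mod;
CUstream stream;

if (initialize(0, &ctx, &dev, &mod, &stream) == CUDA_SUCCESS) {
    /* ... look up kernels with cuModuleGetFunction and launch them
       on stream ... */
    cuStreamDestroy(stream);
    cuModuleUnload(mod);
    cuCtxDestroy(ctx);
}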
Example #13
bool VideoDecoderCUDAPrivate::initCuda()
{
    CUresult result = cuInit(0);
    if (result != CUDA_SUCCESS) {
        available = false;
        qWarning("cuInit(0) faile (%d)", result);
        return false;
    }
    cudev = GetMaxGflopsGraphicsDeviceId();

    int clockRate;
    cuDeviceGetAttribute(&clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, cudev);
    int major, minor;
    cuDeviceComputeCapability(&major, &minor, cudev);
    char devname[256];
    cuDeviceGetName(devname, 256, cudev);
    description = QString("CUDA device: %1 %2.%3 %4 MHz").arg(devname).arg(major).arg(minor).arg(clockRate/1000);

    //TODO: cuD3DCtxCreate > cuGLCtxCreate > cuCtxCreate
    checkCudaErrors(cuCtxCreate(&cuctx, CU_CTX_SCHED_BLOCKING_SYNC, cudev)); //CU_CTX_SCHED_AUTO?
    CUcontext cuCurrent = NULL;
    result = cuCtxPopCurrent(&cuCurrent);
    if (result != CUDA_SUCCESS) {
        qWarning("cuCtxPopCurrent: %d\n", result);
        return false;
    }
    checkCudaErrors(cuvidCtxLockCreate(&vid_ctx_lock, cuctx));
    {
        AutoCtxLock lock(this, vid_ctx_lock);
        Q_UNUSED(lock);
        //Flags- Parameters for stream creation (must be 0 (CU_STREAM_DEFAULT=0 in cuda5) in cuda 4.2, no CU_STREAM_NON_BLOCKING)
        checkCudaErrors(cuStreamCreate(&stream, 0));//CU_STREAM_NON_BLOCKING)); //CU_STREAM_DEFAULT
        //require compute capability >= 1.1
        //flag: Reserved for future use, must be 0
        //cuStreamAddCallback(stream, CUstreamCallback, this, 0);
    }
    return true;
}
Example #14
void *cuda_make_ctx(CUcontext ctx, int flags) {
  cuda_context *res;
  CUresult err;
  res = malloc(sizeof(*res));
  if (res == NULL)
    return NULL;
  res->ctx = ctx;
  res->err = CUDA_SUCCESS;
  res->blas_handle = NULL;
  res->refcnt = 1;
  res->flags = flags;
  res->extcopy_cache = cache_alloc(64, 32);
  if (res->extcopy_cache == NULL) {
    free(res);
    return NULL;
  }
  err = cuStreamCreate(&res->s, 0);
  if (err != CUDA_SUCCESS) {
    cache_free(res->extcopy_cache);
    free(res);
    return NULL;
  }
  TAG_CTX(res);
  return res;
}
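
The matching teardown for this variant would unwind the same resources in reverse order; a sketch under that assumption (the real destructor in libgpuarray presumably also handles the refcount and BLAS handle):

/* Hypothetical destructor for the context object built above. */
static void cuda_destroy_ctx(cuda_context *res) {
  cuStreamDestroy(res->s);
  cache_free(res->extcopy_cache);
  free(res);
}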
Example #15
static void try_init( void ) {
	int deviceCount = 0;
	int syncflag;
	char *sync;
	CUresult err = cuInit(0);
	CUresult status;
	struct cudaDeviceProp deviceProp;

	if( state.init ) { return; } // already initialised
	state.device = 0;

	if (CUDA_SUCCESS == err)
		CU_SAFE_CALL_NO_SYNC(cuDeviceGetCount(&deviceCount));
	if (deviceCount == 0) { error( "No device found" ); }

	if( state.target_device >= deviceCount ) {
		error( "Invalid device requested" );
	}

	CU_SAFE_CALL_NO_SYNC(cuDeviceGet(&(state.device), state.target_device));
#ifdef USE_BOINC
	syncflag=0x4;
#else
	syncflag=0x0;
#endif

	sync = getenv("SWAN_SYNC" );
	if( sync != NULL ) {
		syncflag = atoi( sync );
		fprintf(stderr, "SWAN: Using synchronization method %d\n", syncflag );
	}

	if( getenv("SWAN_PROFILE") || getenv("CUDA_PROFILE") ) {
		state.debug = 1;
	}
	if( state.debug ) {
		printf("SWAN: Built for CUDA version %d.%d\n", CUDA_MAJOR, CUDA_MINOR );
	}

#ifdef DEV_EXTENSIONS
	syncflag |= CU_CTX_MAP_HOST;
#endif

#ifdef USE_FIXED_DEVICE
	printf( "********************************************** OVERRIDING DEVICE ALLOCATION\n");
	status = cuCtxCreate( &(state.context), syncflag, 0 ); // state.device + state.target_device );
	if ( CUDA_SUCCESS != status ) error( "Unable to create context\n" );
#else
	status = cuCtxCreate( &(state.context), syncflag, state.target_device );

	if ( CUDA_SUCCESS != status ) {
		printf("SWAN: Failed to get requested device (compute exclusive mode). Trying any..\n");
		int count;
		cuDeviceGetCount( &count );
		int i = 0;

		while ( (i < count) && (status != CUDA_SUCCESS) ) {
			state.target_device = i;
			CU_SAFE_CALL_NO_SYNC(cuDeviceGet(&(state.device), state.target_device));
			status = cuCtxCreate( &(state.context), syncflag, state.target_device );
			i++;
		}
	}
#endif

  if ( CUDA_SUCCESS != status ) error ( "Unable to create context\n" );

	state.mods         = NULL;
	state.mod_names    = NULL;
	state.num_mods     = 0;
	state.funcs        = NULL;
	state.func_names   = NULL;
	state.num_funcs    = 0;

	state.init = 1;

  cudaGetDeviceProperties(&deviceProp, state.device);

	cuStreamCreate( &state.stream, 0 );
	state.device_version = (deviceProp.major * 100 + deviceProp.minor * 10);
	state.multiProcessorCount = deviceProp.multiProcessorCount;

}
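
The syncflag assembled above is passed straight to cuCtxCreate, so SWAN_SYNC effectively selects one of the driver's CU_CTX_SCHED_* policies. For reference (flag values from cuda.h):

/* CU_CTX_SCHED_AUTO          = 0x0  (driver decides)
   CU_CTX_SCHED_SPIN          = 0x1
   CU_CTX_SCHED_YIELD         = 0x2
   CU_CTX_SCHED_BLOCKING_SYNC = 0x4  (the USE_BOINC default above) */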
Example #16
/*
// Builds numStep levels of the feature pyramid on the GPU, using one
// CUDA stream per scale.
//
// API
// static int getPathOfFeaturePyramidGPUStream(IplImage * image, float step,
//         int numStep, int startIndex, int sideLength, int bx, int by,
//         CvLSVMFeaturePyramid **maps)
// INPUT
// image      - input image
// step       - scale step between pyramid levels
// numStep    - number of pyramid levels to compute
// startIndex - index of the first computed level in maps
// sideLength - cell side length for the feature maps
// bx         - border size in x
// by         - border size in y
// OUTPUT
// maps       - feature pyramid
// RESULT
// Error status
*/
static int getPathOfFeaturePyramidGPUStream(IplImage * image, float step,
        int numStep, int startIndex, int sideLength, int bx, int by,
        CvLSVMFeaturePyramid **maps)
{
    CvLSVMFeatureMap **feature_maps;

    int i;
    int width, height, numChannels, sizeX, sizeY, p, pp, newSizeX, newSizeY;
    float *scales;
    CvLSVMFeatureMapGPU **devs_img, **devs_map_pre_norm, **devs_map_pre_pca;
    CUstream *streams;
    CUresult res;

    scales = (float *) malloc(sizeof(float) * (numStep));
    devs_img = (CvLSVMFeatureMapGPU **) malloc(
            sizeof(CvLSVMFeatureMapGPU*) * (numStep));
    devs_map_pre_norm = (CvLSVMFeatureMapGPU **) malloc(
            sizeof(CvLSVMFeatureMapGPU*) * (numStep));
    devs_map_pre_pca = (CvLSVMFeatureMapGPU **) malloc(
            sizeof(CvLSVMFeatureMapGPU*) * (numStep));
    streams = (CUstream *) malloc(sizeof(CUstream) * (numStep));
    feature_maps = (CvLSVMFeatureMap **) malloc(
            sizeof(CvLSVMFeatureMap *) * (numStep));

    // allocate device memory
    for (i = 0; i < numStep; i++)
    {
        scales[i] = 1.0f / powf(step, (float) i);
        width  = (int) (((float) image->width ) * scales[i] + 0.5);
        height = (int) (((float) image->height) * scales[i] + 0.5);
        numChannels = image->nChannels;
        sizeX = width  / sideLength;
        sizeY = height / sideLength;
        p  = NUM_SECTOR * 3;
        pp = NUM_SECTOR * 12;
        newSizeX = sizeX - 2;
        newSizeY = sizeY - 2;

        allocFeatureMapObjectGPU<float>(&devs_img[i], width, height,
                numChannels);
        allocFeatureMapObjectGPU<float>(&devs_map_pre_norm[i], sizeX, sizeY, p);
        allocFeatureMapObjectGPU<float>(&devs_map_pre_pca[i], newSizeX,
                newSizeY, pp);
        res = cuStreamCreate(&streams[i], CU_STREAM_DEFAULT);
        CUDA_CHECK(res, "cuStreamCreate(stream)");
    }

    // execute main functions
    resizeGPUStream(numStep, image, scales, devs_img, streams);

    getFeatureMapsGPUStream(numStep, sideLength, devs_img, devs_map_pre_norm,
            streams);

    normalizeAndTruncateGPUStream(numStep, Val_Of_Truncate, devs_map_pre_norm,
            devs_map_pre_pca, streams);

    PCAFeatureMapsGPUStream(numStep, bx, by, devs_map_pre_pca, feature_maps,
            streams);

    // synchronize cuda stream
    for (i = 0; i < numStep; i++)
    {
        cuStreamSynchronize(streams[i]);
        cuStreamDestroy(streams[i]);
    }

    for (i = 0; i < numStep; i++)
    {
        (*maps)->pyramid[startIndex + i] = feature_maps[i];
    }/*for(i = 0; i < numStep; i++)*/

    // free device memory
    for (i = 0; i < numStep; i++)
    {
        freeFeatureMapObjectGPU(&devs_img[i]);
        freeFeatureMapObjectGPU(&devs_map_pre_norm[i]);
        freeFeatureMapObjectGPU(&devs_map_pre_pca[i]);
    }

    free(scales);
    free(devs_img);
    free(devs_map_pre_norm);
    free(devs_map_pre_pca);
    free(streams);
    free(feature_maps);

    return LATENT_SVM_OK;
}
Example #17
void GPUInterface::SetDevice(int deviceNumber, int paddedStateCount, int categoryCount, int paddedPatternCount, int unpaddedPatternCount, int tipCount,
                             long flags) {
#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr,"\t\t\tEntering GPUInterface::SetDevice\n");
#endif

    SAFE_CUDA(cuDeviceGet(&cudaDevice, (*resourceMap)[deviceNumber]));

    unsigned int ctxFlags = CU_CTX_SCHED_AUTO;

    if (flags & BEAGLE_FLAG_SCALING_DYNAMIC) {
        ctxFlags |= CU_CTX_MAP_HOST;
    }

    CUresult error = cuCtxCreate(&cudaContext, ctxFlags, cudaDevice);
    if(error != CUDA_SUCCESS) {
        fprintf(stderr, "CUDA error: \"%s\" (%d) from file <%s>, line %i.\n",
                GetCUDAErrorDescription(error), error, __FILE__, __LINE__);
        if (error == CUDA_ERROR_INVALID_DEVICE) {
            fprintf(stderr, "(The requested CUDA device is likely set to compute exclusive mode. This mode prevents multiple processes from running on the device.)");
        }
        exit(-1);
    }

    InitializeKernelResource(paddedStateCount, flags & BEAGLE_FLAG_PRECISION_DOUBLE);

    if (!kernelResource) {
        fprintf(stderr,"Critical error: unable to find kernel code for %d states.\n",paddedStateCount);
        exit(-1);
    }
    kernelResource->categoryCount = categoryCount;
    kernelResource->patternCount = paddedPatternCount;
    kernelResource->unpaddedPatternCount = unpaddedPatternCount;
    kernelResource->flags = flags;

    SAFE_CUDA(cuModuleLoadData(&cudaModule, kernelResource->kernelCode));

    if ((paddedPatternCount < BEAGLE_MULTI_GRID_MAX || flags & BEAGLE_FLAG_PARALLELOPS_GRID) && !(flags & BEAGLE_FLAG_PARALLELOPS_STREAMS)) {
        numStreams = 1;
        cudaStreams = (CUstream*) malloc(sizeof(CUstream) * numStreams);
        cudaEvents = (CUevent*) malloc(sizeof(CUevent) * (numStreams + 1));
        cudaStreams[0] = NULL;
        CUevent event;
        for(int i=0; i<2; i++) {
            SAFE_CUDA(cuEventCreate(&event, CU_EVENT_DISABLE_TIMING));
            cudaEvents[i] = event;
        }
    } else {
        numStreams = tipCount/2 + 1;
        if (numStreams > BEAGLE_STREAM_COUNT) {
            numStreams = BEAGLE_STREAM_COUNT;
        }
        cudaStreams = (CUstream*) malloc(sizeof(CUstream) * numStreams);
        CUstream stream;
        cudaEvents = (CUevent*) malloc(sizeof(CUevent) * (numStreams + 1));
        CUevent event;
        for(int i=0; i<numStreams; i++) {
            SAFE_CUDA(cuStreamCreate(&stream, CU_STREAM_DEFAULT));
            cudaStreams[i] = stream;
            SAFE_CUDA(cuEventCreate(&event, CU_EVENT_DISABLE_TIMING));
            cudaEvents[i] = event;
        }
        SAFE_CUDA(cuEventCreate(&event, CU_EVENT_DISABLE_TIMING));
        cudaEvents[numStreams] = event;
    }

    SAFE_CUDA(cuCtxPopCurrent(&cudaContext));

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr,"\t\t\tLeaving  GPUInterface::SetDevice\n");
#endif

}
Example #18
int cuda_test_memcpy_async(unsigned int size)
{
	int i;
	CUresult res;
	CUdevice dev;
	CUcontext ctx;
	CUstream stream;
	CUdeviceptr data_addr;
	unsigned int *in, *out;
	struct timeval tv;
	struct timeval tv_total_start, tv_total_end;
	unsigned long total;
	struct timeval tv_h2d_start, tv_h2d_end;
	float h2d;
	struct timeval tv_d2h_start, tv_d2h_end;
	float d2h;

	gettimeofday(&tv_total_start, NULL);

	res = cuInit(0);
	if (res != CUDA_SUCCESS) {
		printf("cuInit failed: res = %u\n", (unsigned int)res);
		return -1;
	}

	res = cuDeviceGet(&dev, 0);
	if (res != CUDA_SUCCESS) {
		printf("cuDeviceGet failed: res = %u\n", (unsigned int)res);
		return -1;
	}

	res = cuCtxCreate(&ctx, 0, dev);
	if (res != CUDA_SUCCESS) {
		printf("cuCtxCreate failed: res = %u\n", (unsigned int)res);
		return -1;
	}

	res = cuStreamCreate(&stream, 0);
	if (res != CUDA_SUCCESS) {
		printf("cuStreamCreate failed: res = %u\n", (unsigned int)res);
		return -1;
	}

	res = cuMemAlloc(&data_addr, size);
	if (res != CUDA_SUCCESS) {
		printf("cuMemAlloc failed: res = %u\n", (unsigned int)res);
		return -1;
	}

	res = cuMemAllocHost((void **)&in, size);
	if (res != CUDA_SUCCESS) {
		printf("cuMemAllocHost(in) failed: res = %u\n", (unsigned int)res);
		return -1;
	}

	res = cuMemAllocHost((void **)&out, size);
	if (res != CUDA_SUCCESS) {
		printf("cuMemAllocHost(out) failed: res = %u\n", (unsigned int)res);
		return -1;
	}

	for (i = 0; i < size / 4; i++) {
		in[i] = i+1;
		out[i] = 0;
	}

	gettimeofday(&tv_h2d_start, NULL);
	res = cuMemcpyHtoDAsync(data_addr, in, size, stream);
	if (res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoDAsync failed: res = %u\n", (unsigned int)res);
		return -1;
	}
	res = cuStreamSynchronize(stream);
	if (res != CUDA_SUCCESS) {
		printf("cuStreamSynchronize() failed: res = %u\n", (unsigned int)res);
		return -1;
	}
	gettimeofday(&tv_h2d_end, NULL);

	gettimeofday(&tv_d2h_start, NULL);
	res = cuMemcpyDtoHAsync(out, data_addr, size, stream);
	if (res != CUDA_SUCCESS) {
		printf("cuMemcpyDtoHAsync failed: res = %u\n", (unsigned int)res);
		return -1;
	}
	res = cuStreamSynchronize(stream);
	if (res != CUDA_SUCCESS) {
		printf("cuStreamSynchronize() failed: res = %u\n", (unsigned int)res);
		return -1;
	}
	gettimeofday(&tv_d2h_end, NULL);

	for (i = 0; i < size / 4; i++) {
		if (in[i] != out[i]) {
			printf("in[%d] = %u, out[%d] = %u\n",
				   i, in[i], i, out[i]);
		}
	}

	res = cuMemFreeHost(out);
	if (res != CUDA_SUCCESS) {
		printf("cuMemFreeHost(out) failed: res = %u\n", (unsigned int)res);
		return -1;
	}

	res = cuMemFreeHost(in);
	if (res != CUDA_SUCCESS) {
		printf("cuMemFreeHost(in) failed: res = %u\n", (unsigned int)res);
		return -1;
	}

	res = cuMemFree(data_addr);
	if (res != CUDA_SUCCESS) {
		printf("cuMemFree failed: res = %u\n", (unsigned int)res);
		return -1;
	}

	res = cuStreamDestroy(stream);
	if (res != CUDA_SUCCESS) {
		printf("cuStreamDestroy failed: res = %u\n", (unsigned int)res);
		return -1;
	}

	res = cuCtxDestroy(ctx);
	if (res != CUDA_SUCCESS) {
		printf("cuCtxDestroy failed: res = %u\n", (unsigned int)res);
		return -1;
	}

	gettimeofday(&tv_total_end, NULL);

	tvsub(&tv_h2d_end, &tv_h2d_start, &tv);
	h2d = tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;
	tvsub(&tv_d2h_end, &tv_d2h_start, &tv);
	d2h = tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;
	tvsub(&tv_total_end, &tv_total_start, &tv);
	total = tv.tv_sec * 1000 + tv.tv_usec / 1000;

	printf("HtoD: %f\n", h2d);
	printf("DtoH: %f\n", d2h);

	return 0;
}
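
tvsub() is defined elsewhere in this test suite; from its use above it is a plain timeval subtraction. A minimal sketch:

/* ret = a - b, normalising tv_usec into [0, 1000000). */
static void tvsub(struct timeval *a, struct timeval *b, struct timeval *ret)
{
	ret->tv_sec  = a->tv_sec  - b->tv_sec;
	ret->tv_usec = a->tv_usec - b->tv_usec;
	if (ret->tv_usec < 0) {
		ret->tv_sec--;
		ret->tv_usec += 1000000;
	}
}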
Example #19
static struct ptx_stream *
select_stream_for_async (int async, pthread_t thread, bool create,
			 CUstream existing)
{
  struct nvptx_thread *nvthd = nvptx_thread ();
  /* Local copy of TLS variable.  */
  struct ptx_device *ptx_dev = nvthd->ptx_dev;
  struct ptx_stream *stream = NULL;
  int orig_async = async;

  /* The special value acc_async_noval (-1) maps (for now) to an
     implicitly-created stream, which is then handled the same as any other
     numbered async stream.  Other options are available, e.g. using the null
     stream for anonymous async operations, or choosing an idle stream from an
     active set.  But, stick with this for now.  */
  if (async > acc_async_sync)
    async++;

  if (create)
    pthread_mutex_lock (&ptx_dev->stream_lock);

  /* NOTE: AFAICT there's no particular need for acc_async_sync to map to the
     null stream, and in fact better performance may be obtainable if it doesn't
     (because the null stream enforces overly-strict synchronisation with
     respect to other streams for legacy reasons, and that's probably not
     needed with OpenACC).  Maybe investigate later.  */
  if (async == acc_async_sync)
    stream = ptx_dev->null_stream;
  else if (async >= 0 && async < ptx_dev->async_streams.size
	   && ptx_dev->async_streams.arr[async] && !(create && existing))
    stream = ptx_dev->async_streams.arr[async];
  else if (async >= 0 && create)
    {
      if (async >= ptx_dev->async_streams.size)
	{
	  int i, newsize = ptx_dev->async_streams.size * 2;

	  if (async >= newsize)
	    newsize = async + 1;

	  ptx_dev->async_streams.arr
	    = GOMP_PLUGIN_realloc (ptx_dev->async_streams.arr,
				   newsize * sizeof (struct ptx_stream *));

	  for (i = ptx_dev->async_streams.size; i < newsize; i++)
	    ptx_dev->async_streams.arr[i] = NULL;

	  ptx_dev->async_streams.size = newsize;
	}

      /* Create a new stream on-demand if there isn't one already, or if we're
	 setting a particular async value to an existing (externally-provided)
	 stream.  */
      if (!ptx_dev->async_streams.arr[async] || existing)
        {
	  CUresult r;
	  struct ptx_stream *s
	    = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));

	  if (existing)
	    s->stream = existing;
	  else
	    {
	      r = cuStreamCreate (&s->stream, CU_STREAM_DEFAULT);
	      if (r != CUDA_SUCCESS)
		GOMP_PLUGIN_fatal ("cuStreamCreate error: %s", cuda_error (r));
	    }

	  /* If CREATE is true, we're going to be queueing some work on this
	     stream.  Associate it with the current host thread.  */
	  s->host_thread = thread;
	  s->multithreaded = false;

	  s->d = (CUdeviceptr) NULL;
	  s->h = NULL;
	  map_init (s);

	  s->next = ptx_dev->active_streams;
	  ptx_dev->active_streams = s;
	  ptx_dev->async_streams.arr[async] = s;
	}

      stream = ptx_dev->async_streams.arr[async];
    }
  else if (async < 0)
    GOMP_PLUGIN_fatal ("bad async %d", async);

  if (create)
    {
      assert (stream != NULL);

      /* If we're trying to use the same stream from different threads
	 simultaneously, set stream->multithreaded to true.  This affects the
	 behaviour of acc_async_test_all and acc_wait_all, which are supposed to
	 only wait for asynchronous launches from the same host thread they are
	 invoked on.  If multiple threads use the same async value, we make note
	 of that here and fall back to testing/waiting for all threads in those
	 functions.  */
      if (thread != stream->host_thread)
        stream->multithreaded = true;

      pthread_mutex_unlock (&ptx_dev->stream_lock);
    }
  else if (stream && !stream->multithreaded
	   && !pthread_equal (stream->host_thread, thread))
    GOMP_PLUGIN_fatal ("async %d used on wrong thread", orig_async);

  return stream;
}
Example #20
File: lib-72.c Project: 0day-ci/gcc
int
main (int argc, char **argv)
{
  CUdevice dev;
  CUfunction delay;
  CUmodule module;
  CUresult r;
  CUstream stream;
  unsigned long *a, *d_a, dticks;
  int nbytes;
  float dtime;
  void *kargs[2];
  int clkrate;
  int devnum, nprocs;

  acc_init (acc_device_nvidia);

  devnum = acc_get_device_num (acc_device_nvidia);

  r = cuDeviceGet (&dev, devnum);
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
      abort ();
    }

  r =
    cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
			  dev);
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
      abort ();
    }

  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
      abort ();
    }

  r = cuModuleLoad (&module, "subr.ptx");
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuModuleLoad failed: %d\n", r);
      abort ();
    }

  r = cuModuleGetFunction (&delay, module, "delay");
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
      abort ();
    }

  nbytes = nprocs * sizeof (unsigned long);

  dtime = 200.0;

  dticks = (unsigned long) (dtime * clkrate);

  a = (unsigned long *) malloc (nbytes);
  d_a = (unsigned long *) acc_malloc (nbytes);

  acc_map_data (a, d_a, nbytes);

  kargs[0] = (void *) &d_a;
  kargs[1] = (void *) &dticks;

  r = cuStreamCreate (&stream, CU_STREAM_DEFAULT);
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuStreamCreate failed: %d\n", r);
      abort ();
    }

  if (!acc_set_cuda_stream (0, stream))
    abort ();
    
  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, kargs, 0);
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
      abort ();
    }

  if (acc_async_test_all () != 0)
    {
      fprintf (stderr, "asynchronous operation not running\n");
      abort ();
    }

  sleep ((int) (dtime / 1000.f) + 1);

  if (acc_async_test_all () != 1)
    {
      fprintf (stderr, "found asynchronous operation still running\n");
      abort ();
    }

  acc_unmap_data (a);

  free (a);
  acc_free (d_a);

  acc_shutdown (acc_device_nvidia);

  exit (0);
}
Example #21
value spoc_getCudaDevice(value i)
{
	CAMLparam1(i);
	CAMLlocal4(general_info, cuda_info, specific_info, gc_info);
	CAMLlocal3(device,  maxT, maxG);
	int nb_devices;
	CUdevprop dev_infos;
	CUdevice dev;
	CUcontext ctx;
	CUstream queue[2];
	spoc_cu_context *spoc_ctx;
	//CUcontext gl_ctx;
	char infoStr[1024];
	int infoInt;
	size_t infoUInt;
	int major, minor;
	enum cudaError_enum cuda_error; 


	cuDeviceGetCount (&nb_devices);

	if ((Int_val(i)) >= nb_devices)
		raise_constant(*caml_named_value("no_cuda_device")) ;


	CUDA_CHECK_CALL(cuDeviceGet(&dev, Int_val(i)));
	CUDA_CHECK_CALL(cuDeviceGetProperties(&dev_infos, dev));

	general_info = caml_alloc (9, 0);
	CUDA_CHECK_CALL(cuDeviceGetName(infoStr, sizeof(infoStr), dev));

	Store_field(general_info,0, copy_string(infoStr));//
	CUDA_CHECK_CALL(cuDeviceTotalMem(&infoUInt, dev));

	Store_field(general_info,1, Val_int(infoUInt));//
	Store_field(general_info,2, Val_int(dev_infos.sharedMemPerBlock));//
	Store_field(general_info,3, Val_int(dev_infos.clockRate));//
	Store_field(general_info,4, Val_int(dev_infos.totalConstantMemory));//
	CUDA_CHECK_CALL(cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev));
	Store_field(general_info,5, Val_int(infoInt));//
	CUDA_CHECK_CALL(cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, dev));
	Store_field(general_info,6, Val_bool(infoInt));//
	Store_field(general_info,7, i);
	CUDA_CHECK_CALL(cuCtxCreate	(&ctx,
			CU_CTX_SCHED_BLOCKING_SYNC | CU_CTX_MAP_HOST,
			dev));
	spoc_ctx = malloc(sizeof(spoc_cu_context));
	spoc_ctx->ctx = ctx;
	CUDA_CHECK_CALL(cuStreamCreate(&queue[0], 0));
	CUDA_CHECK_CALL(cuStreamCreate(&queue[1], 0));
	spoc_ctx->queue[0] = queue[0];
	spoc_ctx->queue[1] = queue[1];
	Store_field(general_info,8, (value)spoc_ctx);
	CUDA_CHECK_CALL(cuCtxSetCurrent(ctx));


	cuda_info = caml_alloc(1, 0); //0 -> Cuda
	specific_info = caml_alloc(18, 0);

	cuDeviceComputeCapability(&major, &minor, dev);
	Store_field(specific_info,0, Val_int(major));//
	Store_field(specific_info,1, Val_int(minor));//
	Store_field(specific_info,2, Val_int(dev_infos.regsPerBlock));//
	Store_field(specific_info,3, Val_int(dev_infos.SIMDWidth));//
	Store_field(specific_info,4, Val_int(dev_infos.memPitch));//
	Store_field(specific_info,5, Val_int(dev_infos.maxThreadsPerBlock));//

	maxT = caml_alloc(3, 0);
	Store_field(maxT,0, Val_int(dev_infos.maxThreadsDim[0]));//
	Store_field(maxT,1, Val_int(dev_infos.maxThreadsDim[1]));//
	Store_field(maxT,2, Val_int(dev_infos.maxThreadsDim[2]));//
	Store_field(specific_info,6, maxT);

	maxG = caml_alloc(3, 0);
	Store_field(maxG,0, Val_int(dev_infos.maxGridSize[0]));//
	Store_field(maxG,1, Val_int(dev_infos.maxGridSize[1]));//
	Store_field(maxG,2, Val_int(dev_infos.maxGridSize[2]));//
	Store_field(specific_info,7, maxG);

	Store_field(specific_info,8, Val_int(dev_infos.textureAlign));//
	cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
	Store_field(specific_info,9, Val_bool(infoInt));//
	cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, dev);
	Store_field(specific_info,10, Val_bool(infoInt));//
	cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
	Store_field(specific_info,11, Val_bool(infoInt));//
	cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
	Store_field(specific_info,12, Val_bool(infoInt));//
	cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
	Store_field(specific_info,13, Val_int(infoInt));//
	cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
	Store_field(specific_info,14, Val_bool(infoInt));//
	cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, dev);
	Store_field(specific_info,15, Val_int(infoInt));
	cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, dev);
	Store_field(specific_info,16, Val_int(infoInt));
	cuDriverGetVersion(&infoInt);
	Store_field(specific_info, 17, Val_int(infoInt));

	Store_field(cuda_info, 0, specific_info);
	device = caml_alloc(4, 0);
	Store_field(device, 0, general_info);
	Store_field(device, 1, cuda_info);

	{
		spoc_cuda_gc_info* gcInfo = (spoc_cuda_gc_info*)malloc(sizeof(spoc_cuda_gc_info));
		CUDA_CHECK_CALL(cuMemGetInfo(&infoUInt, NULL));
		infoUInt -= (32*1024*1024);
		Store_field(device, 2, (value)gcInfo);

		{
			cuda_event_list* events = NULL;
			Store_field(device, 3, (value)events);

			CAMLreturn(device);
		}
	}
}
Example #22
	CNvidiaNvencCodec(DWORD nCodecInstanceId, const CCodecContextBase& CodecContext) : m_NvidiaNvencCodecContext(static_cast<const CNvidiaNvencCodecContext&>(CodecContext)), m_hNvEncodeAPI64(LoadLibraryA("nvEncodeAPI64.dll"))
	{
		PNVENCODEAPICREATEINSTANCE pNvEncodeAPICreateInstance = reinterpret_cast<PNVENCODEAPICREATEINSTANCE>(GetProcAddress(m_hNvEncodeAPI64, "NvEncodeAPICreateInstance"));
		memset(&m_FunctionList, 0, sizeof(m_FunctionList));
		m_FunctionList.version = NV_ENCODE_API_FUNCTION_LIST_VER;
		NVENCSTATUS nStatus = pNvEncodeAPICreateInstance(&m_FunctionList);
		CHECK_CUDA_DRV_STATUS(cuCtxCreate(&m_Context, 0, 0));
		if (m_NvidiaNvencCodecContext.GetUseSwscaleInsteadOfCuda())
		{
			CHECK_CUDA_DRV_STATUS(cuMemAlloc(&m_pNv12Buffer, m_NvidiaNvencCodecContext.GetWidth() * m_NvidiaNvencCodecContext.GetHeight() * 3 / 2));
			m_nNv12BufferPitch = m_NvidiaNvencCodecContext.GetWidth();
			CHECK_CUDA_DRV_STATUS(cuMemAllocHost(&m_pPageLockedNv12Buffer, m_NvidiaNvencCodecContext.GetWidth() * m_NvidiaNvencCodecContext.GetHeight() * 3 / 2));
			m_pNv12Planes[0] = reinterpret_cast<unsigned char*>(m_pPageLockedNv12Buffer);
			m_pNv12Planes[1] = reinterpret_cast<unsigned char*>(m_pPageLockedNv12Buffer) + m_NvidiaNvencCodecContext.GetWidth() * m_NvidiaNvencCodecContext.GetHeight();
			m_pNv12Strides[0] = m_NvidiaNvencCodecContext.GetWidth();
			m_pNv12Strides[1] = m_NvidiaNvencCodecContext.GetWidth();
			m_SwscaleContext = sws_getContext(m_NvidiaNvencCodecContext.GetWidth(), m_NvidiaNvencCodecContext.GetHeight(), AV_PIX_FMT_BGR32, m_NvidiaNvencCodecContext.GetWidth(), m_NvidiaNvencCodecContext.GetHeight(), AV_PIX_FMT_NV12, 0, 0, 0, 0);
		}
		else
		{
			CHECK_CUDA_DRV_STATUS(cuMemAllocPitch(&m_pNv12Buffer, &m_nNv12BufferPitch, m_NvidiaNvencCodecContext.GetWidth(), m_NvidiaNvencCodecContext.GetHeight() * 3 / 2, 16));
			if (m_NvidiaNvencCodecContext.GetUsePageLockedIntermediateBuffer())
			{
				CHECK_CUDA_DRV_STATUS(cuMemAllocHost(&m_pPageLockedRgb32Buffer, m_NvidiaNvencCodecContext.GetWidth() * m_NvidiaNvencCodecContext.GetHeight() * 4));
			}
			CHECK_CUDA_DRV_STATUS(cuMemAlloc(&m_pRgb32Buffer, m_NvidiaNvencCodecContext.GetWidth() * m_NvidiaNvencCodecContext.GetHeight() * 4));
		}
		CHECK_CUDA_DRV_STATUS(cuStreamCreate(&m_Stream, 0));
		NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS SessionParameters;
		memset(&SessionParameters, 0, sizeof(SessionParameters));
		SessionParameters.version = NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS_VER;
		SessionParameters.apiVersion = NVENCAPI_VERSION;
		SessionParameters.device = m_Context;
		SessionParameters.deviceType = NV_ENC_DEVICE_TYPE_CUDA;
		nStatus = m_FunctionList.nvEncOpenEncodeSessionEx(&SessionParameters, &m_pEncoder);
		m_PictureParameters.version = NV_ENC_PIC_PARAMS_VER;
		auto PresetGuid = NV_ENC_PRESET_HP_GUID;
		NV_ENC_PRESET_CONFIG PresetConfiguration = { NV_ENC_PRESET_CONFIG_VER, 0 };
		PresetConfiguration.presetCfg.version = NV_ENC_CONFIG_VER;
		CHECK_NVENC_STATUS(m_FunctionList.nvEncGetEncodePresetConfig(m_pEncoder, NV_ENC_CODEC_H264_GUID, PresetGuid, &PresetConfiguration));
		NV_ENC_CONFIG EncoderConfiguration = { NV_ENC_CONFIG_VER, 0 };
		EncoderConfiguration = PresetConfiguration.presetCfg;
		EncoderConfiguration.gopLength = NVENC_INFINITE_GOPLENGTH;
		EncoderConfiguration.profileGUID = NV_ENC_H264_PROFILE_BASELINE_GUID;
		EncoderConfiguration.frameIntervalP = 1; // No B frames
		EncoderConfiguration.frameFieldMode = NV_ENC_PARAMS_FRAME_FIELD_MODE_FRAME;
		EncoderConfiguration.encodeCodecConfig.h264Config.idrPeriod = m_NvidiaNvencCodecContext.GetFrameCount();
		EncoderConfiguration.encodeCodecConfig.h264Config.chromaFormatIDC = 1;
		EncoderConfiguration.encodeCodecConfig.h264Config.sliceMode = 0;
		EncoderConfiguration.encodeCodecConfig.h264Config.sliceModeData = 0;
		NV_ENC_INITIALIZE_PARAMS InitializationParameters = { NV_ENC_INITIALIZE_PARAMS_VER, 0 };
		InitializationParameters.encodeGUID = NV_ENC_CODEC_H264_GUID;
		InitializationParameters.presetGUID = PresetGuid;
		InitializationParameters.frameRateNum = m_NvidiaNvencCodecContext.GetFps();
		InitializationParameters.frameRateDen = 1;
#ifdef ASYNCHRONOUS
		InitializationParameters.enableEncodeAsync = 1;
#else
		InitializationParameters.enableEncodeAsync = 0;
#endif
		InitializationParameters.enablePTD = 1; // Let the encoder decide the picture type
		InitializationParameters.reportSliceOffsets = 0;
		InitializationParameters.maxEncodeWidth = m_NvidiaNvencCodecContext.GetWidth();
		InitializationParameters.maxEncodeHeight = m_NvidiaNvencCodecContext.GetHeight();
		InitializationParameters.encodeConfig = &EncoderConfiguration;
		InitializationParameters.encodeWidth = m_NvidiaNvencCodecContext.GetWidth();
		InitializationParameters.encodeHeight = m_NvidiaNvencCodecContext.GetHeight();
		InitializationParameters.darWidth = 16;
		InitializationParameters.darHeight = 9;
		CHECK_NVENC_STATUS(m_FunctionList.nvEncInitializeEncoder(m_pEncoder, &InitializationParameters));
		// Picture parameters that are known ahead of encoding
		m_PictureParameters = { NV_ENC_PIC_PARAMS_VER, 0 };
		m_PictureParameters.codecPicParams.h264PicParams.sliceMode = 0;
		m_PictureParameters.codecPicParams.h264PicParams.sliceModeData = 0;
		m_PictureParameters.inputWidth = m_NvidiaNvencCodecContext.GetWidth();
		m_PictureParameters.inputHeight = m_NvidiaNvencCodecContext.GetHeight();
		m_PictureParameters.bufferFmt = NV_ENC_BUFFER_FORMAT_NV12_PL;
		m_PictureParameters.inputPitch = static_cast<uint32_t>(m_nNv12BufferPitch);
		m_PictureParameters.pictureStruct = NV_ENC_PIC_STRUCT_FRAME;
#ifdef ASYNCHRONOUS
		m_hCompletionEvent = CreateEvent(NULL, FALSE, FALSE, NULL);
		m_EventParameters = { NV_ENC_EVENT_PARAMS_VER, 0 };
		m_EventParameters.completionEvent = m_hCompletionEvent;
		CHECK_NVENC_STATUS(m_FunctionList.nvEncRegisterAsyncEvent(m_pEncoder, &m_EventParameters));
		m_PictureParameters.completionEvent = m_hCompletionEvent;
#endif
		// Register CUDA input pointer
		NV_ENC_REGISTER_RESOURCE RegisterResource = { NV_ENC_REGISTER_RESOURCE_VER, NV_ENC_INPUT_RESOURCE_TYPE_CUDADEVICEPTR, m_NvidiaNvencCodecContext.GetWidth(), m_NvidiaNvencCodecContext.GetHeight(), static_cast<uint32_t>(m_nNv12BufferPitch), 0, reinterpret_cast<void*>(m_pNv12Buffer), NULL, NV_ENC_BUFFER_FORMAT_NV12_PL };
		CHECK_NVENC_STATUS(m_FunctionList.nvEncRegisterResource(m_pEncoder, &RegisterResource));
		NV_ENC_MAP_INPUT_RESOURCE MapInputResource = { NV_ENC_MAP_INPUT_RESOURCE_VER, 0, 0, RegisterResource.registeredResource };
		m_pRegisteredResource = RegisterResource.registeredResource;
		CHECK_NVENC_STATUS(m_FunctionList.nvEncMapInputResource(m_pEncoder, &MapInputResource));
		m_PictureParameters.inputBuffer = MapInputResource.mappedResource;
		// Create output bitstream buffer
		m_nOutputBitstreamSize = 2 * 1024 * 1024;
		NV_ENC_CREATE_BITSTREAM_BUFFER CreateBitstreamBuffer = { NV_ENC_CREATE_BITSTREAM_BUFFER_VER, m_nOutputBitstreamSize, NV_ENC_MEMORY_HEAP_AUTOSELECT, 0 };
		CHECK_NVENC_STATUS(m_FunctionList.nvEncCreateBitstreamBuffer(m_pEncoder, &CreateBitstreamBuffer));
		m_pOutputBitstream = CreateBitstreamBuffer.bitstreamBuffer;
		m_PictureParameters.outputBitstream = m_pOutputBitstream;
		if (m_NvidiaNvencCodecContext.GetSaveOutputToFile())
		{
			char pOutputFilename[MAX_PATH];
			sprintf_s(pOutputFilename, "nvenc-%d.h264", nCodecInstanceId);
			if (fopen_s(&m_pOutputFile, pOutputFilename, "wb") != 0)
			{
				throw std::runtime_error(std::string("could not open ").append(pOutputFilename).append(" for writing!"));
			}
		}
	}
Example #23
static void vq_handle_output(VirtIODevice *vdev, VirtQueue *vq)
{
	VirtQueueElement elem;
	
	while(virtqueue_pop(vq, &elem)) {
		struct param *p = elem.out_sg[0].iov_base;
	
		//for all library routines: get required arguments from buffer, execute, and push results back in virtqueue
		switch (p->syscall_type) {
		case CUINIT: {
			p->result = cuInit(p->flags);
			break;
		}
		case CUDRIVERGETVERSION: {
			p->result = cuDriverGetVersion(&p->val1);
			break;
		}
		case CUDEVICEGETCOUNT: {
			p->result = cuDeviceGetCount(&p->val1);
			break;
		}
		case CUDEVICEGET: {
			p->result = cuDeviceGet(&p->device, p->val1);
			break;
		}
		case CUDEVICECOMPUTECAPABILITY: {
			p->result = cuDeviceComputeCapability(&p->val1, &p->val2, p->device);
			break;
		}
		case CUDEVICEGETNAME: {
			p->result = cuDeviceGetName(elem.in_sg[0].iov_base, p->val1, p->device);
			break;
		}
		case CUDEVICEGETATTRIBUTE: {
			p->result = cuDeviceGetAttribute(&p->val1, p->attrib, p->device);
			break;
		}
		case CUCTXCREATE: {
                        p->result = cuCtxCreate(&p->ctx, p->flags, p->device);				
			break;
		}
		case CUCTXDESTROY: {
			p->result = cuCtxDestroy(p->ctx);
			break;
		}
		case CUCTXGETCURRENT: {
			p->result = cuCtxGetCurrent(&p->ctx);
			break;
		}
		case CUCTXGETDEVICE: {
			p->result = cuCtxGetDevice(&p->device);
			break;
		}
		case CUCTXPOPCURRENT: {
			p->result = cuCtxPopCurrent(&p->ctx);
			break;
		}
		case CUCTXSETCURRENT: {
			p->result = cuCtxSetCurrent(p->ctx);
	                break;
		}
	        case CUCTXSYNCHRONIZE: {
		        p->result = cuCtxSynchronize();
	                break;
	        }
		case CUMODULELOAD: {
			//hardcoded path - needs improvement
			//all .cubin files should be stored in $QEMU_NFS_PATH - currently $QEMU_NFS_PATH is shared between host and guest with NFS
			char *binname = malloc((strlen((char *)elem.out_sg[1].iov_base) + strlen(getenv("QEMU_NFS_PATH")) + 1) * sizeof(char));
			if (!binname) {
				p->result = 0;
		                virtqueue_push(vq, &elem, 0);
				break;
			}
		        strcpy(binname, getenv("QEMU_NFS_PATH"));
		        strcat(binname, (char *)elem.out_sg[1].iov_base);
			//change current CUDA context
			//each CUDA context has its own virtual memory space - isolation is ensured by switching contexts
                        if (cuCtxSetCurrent(p->ctx) != 0) {
				p->result = 999;
                                break;
			}
			p->result = cuModuleLoad(&p->module, binname);
			free(binname);
			break;
		}
                case CUMODULEGETGLOBAL: {
                        char *name = malloc(100*sizeof(char));
                        if (!name) {
                                p->result = 999;
                                break;
                        }
                        strcpy(name, (char *)elem.out_sg[1].iov_base);
                        p->result = cuModuleGetGlobal(&p->dptr,&p->size1,p->module,(const char *)name);
                        break;
                }
		case CUMODULEUNLOAD: {
			p->result = cuModuleUnload(p->module);
			break;			
		}
		case CUMEMALLOC: {
			if (cuCtxSetCurrent(p->ctx) != 0) {
                                p->result = 999;
                                break;
                        }
			p->result = cuMemAlloc(&p->dptr, p->bytesize);
			break;
		}
                case CUMEMALLOCPITCH: {
                        if (cuCtxSetCurrent(p->ctx) != 0) {
                                p->result = 999;
                                break;
                        }
			p->result = cuMemAllocPitch(&p->dptr, &p->size3, p->size1, p->size2, p->bytesize);
			break;
		}
		//large buffers are allocated in smaller chunks in guest kernel space
		//gets each chunk separately and copies it to device memory
	        case CUMEMCPYHTOD: {
			int i;
			size_t offset;
                        unsigned long s, nr_pages = p->nr_pages;
                        if (cuCtxSetCurrent(p->ctx) != 0) {
                                p->result = 999;
                                break;
                        }
			offset = 0;
			for (i=0; i<nr_pages; i++) {
				s = *(long *)elem.out_sg[1+2*i+1].iov_base;
				p->result = cuMemcpyHtoD(p->dptr+offset, elem.out_sg[1+2*i].iov_base, s);
				if (p->result != 0) break;
				offset += s;
			}
	                break;
		}
		case CUMEMCPYHTODASYNC: {
			int i;
                        size_t offset;
                        unsigned long s, nr_pages = p->nr_pages;
                        if (cuCtxSetCurrent(p->ctx) != 0) {
                                p->result = 999;
                                break;
                        }
                        offset = 0;
			for (i=0; i<nr_pages; i++) {
                                s = *(long *)elem.out_sg[1+2*i+1].iov_base;
                                p->result = cuMemcpyHtoDAsync(p->dptr+offset, elem.out_sg[1+2*i].iov_base, s, p->stream);
                                if (p->result != 0) break;
                                offset += s;
                        }
                        break;
		}
		case CUMEMCPYDTODASYNC: {
			p->result = cuMemcpyDtoDAsync(p->dptr, p->dptr1, p->size1, p->stream);
                        break;		
		}
	        case CUMEMCPYDTOH: {
			int i;
                        unsigned long s, nr_pages = p->nr_pages;
                        if (cuCtxSetCurrent(p->ctx) != 0) {
                                p->result = 999;
                                break;
                        }
			size_t offset = 0;
			for (i=0; i<nr_pages; i++) {
				s = *(long *)elem.in_sg[0+2*i+1].iov_base;
				p->result = cuMemcpyDtoH(elem.in_sg[0+2*i].iov_base, p->dptr+offset, s);
				if (p->result != 0) break;
				offset += s;
			}
			break;
		}
		case CUMEMCPYDTOHASYNC: {
			int i;
                        unsigned long s, nr_pages = p->nr_pages;
                        if (cuCtxSetCurrent(p->ctx) != 0) {
                                p->result = 999;
                                break;
                        }
                        size_t offset = 0;
			for (i=0; i<nr_pages; i++) {
                                s = *(long *)elem.in_sg[0+2*i+1].iov_base;
                                p->result = cuMemcpyDtoHAsync(elem.in_sg[0+2*i].iov_base, p->dptr+offset, s, p->stream);
                                if (p->result != 0) break;
                                offset += s;
                        }
			break;
		}
		case CUMEMSETD32: {
			p->result = cuMemsetD32(p->dptr, p->bytecount, p->bytesize);
			break;
		}
	        case CUMEMFREE: {
	                p->result = cuMemFree(p->dptr);
	                break;
	        }
		case CUMODULEGETFUNCTION: {
			char *name = (char *)elem.out_sg[1].iov_base;
			name[p->length] = '\0';
                        if (cuCtxSetCurrent(p->ctx) != 0) {
                                p->result = 999;
                                break;
                        }
			p->result = cuModuleGetFunction(&p->function, p->module, name);
			break;	
		}
		case CULAUNCHKERNEL: {
			void **args = malloc(p->val1*sizeof(void *));
	                if (!args) {
				p->result = 9999;
	                        break;
        	        }
			int i;
			for (i=0; i<p->val1; i++) {
				args[i] = elem.out_sg[1+i].iov_base;
			}
                        if (cuCtxSetCurrent(p->ctx) != 0) {
                                p->result = 999;
                                break;
                        }
			p->result = cuLaunchKernel(p->function,
					p->gridDimX, p->gridDimY, p->gridDimZ,
			                p->blockDimX, p->blockDimY, p->blockDimZ,
					p->bytecount, 0, args, 0);
			free(args);
			break;
		}
		case CUEVENTCREATE: {
			p->result = cuEventCreate(&p->event1, p->flags);
			break;
		}
		case CUEVENTDESTROY: {
			p->result = cuEventDestroy(p->event1);
			break;
		}
		case CUEVENTRECORD: {
			p->result = cuEventRecord(p->event1, p->stream);
			break;
		}
		case CUEVENTSYNCHRONIZE: {
			p->result = cuEventSynchronize(p->event1);
			break;
		}
		case CUEVENTELAPSEDTIME: {
			p->result = cuEventElapsedTime(&p->pMilliseconds, p->event1, p->event2);
			break;
		}
		case CUSTREAMCREATE: {
			p->result =  cuStreamCreate(&p->stream, 0);
			break;
		}		
                case CUSTREAMSYNCHRONIZE: {
                        p->result = cuStreamSynchronize(p->stream);
                        break;
                }
                case CUSTREAMQUERY: {
                        p->result = cuStreamQuery(p->stream);
                        break;
                }
		case CUSTREAMDESTROY: {
                        p->result = cuStreamDestroy(p->stream);
                        break;
                }

		default: 
			printf("Unknown syscall_type\n");
		}
		virtqueue_push(vq, &elem, 0);
	}
	//notify frontend - trigger virtual interrupt
	virtio_notify(vdev, vq);
	return;
}
Example #24
int
main (int argc, char **argv)
{
  CUdevice dev;
  CUfunction delay;
  CUmodule module;
  CUresult r;
  CUstream stream;
  unsigned long *a, *d_a, dticks;
  int nbytes;
  float atime, dtime;
  void *kargs[2];
  int clkrate;
  int devnum, nprocs;

  acc_init (acc_device_nvidia);

  devnum = acc_get_device_num (acc_device_nvidia);

  r = cuDeviceGet (&dev, devnum);
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
      abort ();
    }

  r =
    cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
			  dev);
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
      abort ();
    }

  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
      abort ();
    }

  r = cuModuleLoad (&module, "subr.ptx");
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuModuleLoad failed: %d\n", r);
      abort ();
    }

  r = cuModuleGetFunction (&delay, module, "delay");
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
      abort ();
    }

  nbytes = nprocs * sizeof (unsigned long);

  dtime = 200.0;

  dticks = (unsigned long) (dtime * clkrate);

  a = (unsigned long *) malloc (nbytes);
  d_a = (unsigned long *) acc_malloc (nbytes);

  acc_map_data (a, d_a, nbytes);

  kargs[0] = (void *) &d_a;
  kargs[1] = (void *) &dticks;

  r = cuStreamCreate (&stream, CU_STREAM_DEFAULT);
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuStreamCreate failed: %d\n", r);
      abort ();
    }

  acc_set_cuda_stream (0, stream);

  init_timers (1);

  start_timer (0);

  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, kargs, 0);
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
      abort ();
    }

  acc_wait (1);

  atime = stop_timer (0);

  if (atime < dtime)
    {
      fprintf (stderr, "actual time < delay time\n");
      abort ();
    }

  start_timer (0);

  acc_wait (1);

  atime = stop_timer (0);

  if (0.010 < atime)
    {
      fprintf (stderr, "actual time < delay time\n");
      abort ();
    }

  acc_unmap_data (a);

  fini_timers ();

  free (a);
  acc_free (d_a);

  acc_shutdown (acc_device_nvidia);

  return 0;
}
Example #25
File: lib-82.c Project: 0day-ci/gcc
int
main (int argc, char **argv)
{
  CUdevice dev;
  CUfunction delay2;
  CUmodule module;
  CUresult r;
  int N;
  int i;
  CUstream *streams;
  unsigned long **a, **d_a, *tid, ticks;
  int nbytes;
  void *kargs[3];
  int clkrate;
  int devnum, nprocs;

  acc_init (acc_device_nvidia);

  devnum = acc_get_device_num (acc_device_nvidia);

  r = cuDeviceGet (&dev, devnum);
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
      abort ();
    }

  r =
    cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
			  dev);
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
      abort ();
    }

  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
      abort ();
    }

  r = cuModuleLoad (&module, "subr.ptx");
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuModuleLoad failed: %d\n", r);
      abort ();
    }

  r = cuModuleGetFunction (&delay2, module, "delay2");
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
      abort ();
    }

  nbytes = sizeof (int);

  ticks = (unsigned long) (200.0 * clkrate);

  N = nprocs;

  streams = (CUstream *) malloc (N * sizeof (void *));

  a = (unsigned long **) malloc (N * sizeof (unsigned long *));
  d_a = (unsigned long **) malloc (N * sizeof (unsigned long *));
  tid = (unsigned long *) malloc (N * sizeof (unsigned long));

  for (i = 0; i < N; i++)
    {
      a[i] = (unsigned long *) malloc (sizeof (unsigned long));
      *a[i] = N;
      d_a[i] = (unsigned long *) acc_malloc (nbytes);
      tid[i] = i;

      acc_map_data (a[i], d_a[i], nbytes);

      streams[i] = (CUstream) acc_get_cuda_stream (i);
      if (streams[i] != NULL)
        abort ();

      r = cuStreamCreate (&streams[i], CU_STREAM_DEFAULT);
      if (r != CUDA_SUCCESS)
        {
          fprintf (stderr, "cuStreamCreate failed: %d\n", r);
          abort ();
        }

      if (!acc_set_cuda_stream (i, streams[i]))
        abort ();
    }

  for (i = 0; i < N; i++)
    {
      kargs[0] = (void *) &d_a[i];
      kargs[1] = (void *) &ticks;
      kargs[2] = (void *) &tid[i];

      r = cuLaunchKernel (delay2, 1, 1, 1, 1, 1, 1, 0, streams[i], kargs, 0);
      if (r != CUDA_SUCCESS)
	{
	  fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
	  abort ();
	}

      ticks = (unsigned long) (50.0 * clkrate);
    }

  acc_wait_all_async (0);

  for (i = 0; i < N; i++)
    {
      acc_copyout (a[i], nbytes);
      if (*a[i] != i)
	abort ();
    }

  free (streams);

  for (i = 0; i < N; i++)
    {
      free (a[i]);
    }

  free (a);
  free (d_a);
  free (tid);

  acc_shutdown (acc_device_nvidia);

  exit (0);
}