예제 #1
0
void initGLBuffers()
{
    if (pbo)
    {
        // delete old buffer
        checkCudaErrors(cudaGraphicsUnregisterResource(cuda_pbo_resource));
        glDeleteBuffersARB(1, &pbo);
    }

    // create pixel buffer object for display
    glGenBuffersARB(1, &pbo);
    glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, pbo);
    glBufferDataARB(GL_PIXEL_UNPACK_BUFFER_ARB, width*height*sizeof(uchar4), 0, GL_STREAM_DRAW_ARB);
    glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, 0);

    checkCudaErrors(cudaGraphicsGLRegisterBuffer(&cuda_pbo_resource, pbo,
                    cudaGraphicsMapFlagsWriteDiscard));

#if USE_BUFFER_TEX

    // create buffer texture, attach to pbo
    if (bufferTex)
    {
        glDeleteTextures(1, &bufferTex);
    }

    glGenTextures(1, &bufferTex);
    glBindTexture(GL_TEXTURE_BUFFER_EXT, bufferTex);
    glTexBufferEXT(GL_TEXTURE_BUFFER_EXT, GL_RGBA8, pbo);
    glBindTexture(GL_TEXTURE_BUFFER_EXT, 0);
#else

    // create texture for display
    if (displayTex)
    {
        glDeleteTextures(1, &displayTex);
    }

    glGenTextures(1, &displayTex);
    glBindTexture(GL_TEXTURE_TYPE, displayTex);
    glTexImage2D(GL_TEXTURE_TYPE, 0, GL_RGBA8, width, height, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
    glTexParameteri(GL_TEXTURE_TYPE, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
    glTexParameteri(GL_TEXTURE_TYPE, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
    glBindTexture(GL_TEXTURE_TYPE, 0);
#endif

    // calculate new grid size
    gridSize = dim3(iDivUp(width, blockSize.x), iDivUp(height, blockSize.y));
}
예제 #2
0
void CudaNarrowphase::squeezeContacts(void * overlappingPairs, unsigned numOverlappingPairs)
{
	void * srcContact = m_contact[0]->bufferOnDevice();
	
	const unsigned scanValidPairLength = iDivUp(numOverlappingPairs, 1024) * 1024;
	
	m_validCounts->create(scanValidPairLength * 4);
	
	void * counts = m_validCounts->bufferOnDevice();
	
	narrowphaseComputeValidPairs((uint *)counts, (ContactData *)srcContact, numOverlappingPairs, scanValidPairLength);
	
	m_scanValidContacts[0]->create(scanValidPairLength * 4);
	m_scanValidContacts[1]->create(scanValidPairLength * 4);
	
	void * scanResult = m_scanValidContacts[0]->bufferOnDevice();
	void * scanIntermediate = m_scanValidContacts[1]->bufferOnDevice();
	scanExclusive((uint *)scanResult, (uint *)counts, (uint *)scanIntermediate, scanValidPairLength / 1024, 1024);

	m_numContacts = ScanUtil::getScanResult(m_validCounts, m_scanValidContacts[0], scanValidPairLength);
	
	if(m_numContacts < 1) return;
	
	m_contactPairs->create(numOverlappingPairs * 8);
	void * dstPairs = m_contactPairs->bufferOnDevice();
	
	void * dstContact = m_contact[1]->bufferOnDevice();
	
	narrowphaseSqueezeContactPairs((uint2 *)dstPairs, (uint2 *)overlappingPairs, 
									(ContactData *)dstContact, (ContactData *)srcContact,
									(uint *)counts, (uint *)scanResult, 
									numOverlappingPairs);
}
예제 #3
0
void reshape(int w, int h)
{
    width = w; height = h;
    initPixelBuffer();

    // calculate new grid size
    gridSize = dim3(iDivUp(width, blockSize.x), iDivUp(height, blockSize.y));

    glViewport(0, 0, w, h);

    glMatrixMode(GL_MODELVIEW);
    glLoadIdentity();

    glMatrixMode(GL_PROJECTION);
    glLoadIdentity();
    glOrtho(0.0, 1.0, 0.0, 1.0, 0.0, 1.0);
}
extern "C" size_t histogram64(
    cl_command_queue cqCommandQueue,
    cl_mem d_Histogram,
    cl_mem d_Data,
    uint byteCount
){
    cl_int ciErrNum;
    uint histogramCount;
    size_t localWorkSize, globalWorkSize;

    if(!cqCommandQueue)
        cqCommandQueue = cqDefaultCommandQue;

    {
        histogramCount = iDivUp(byteCount, HISTOGRAM64_WORKGROUP_SIZE * iSnapDown(255, 16));
        shrCheckError( (byteCount % 16 == 0), shrTRUE );
        shrCheckError( (histogramCount <= MAX_PARTIAL_HISTOGRAM64_COUNT), shrTRUE );
        cl_uint dataCount = byteCount / 16;

        ciErrNum  = clSetKernelArg(ckHistogram64, 0, sizeof(cl_mem),  (void *)&d_PartialHistograms);
        ciErrNum |= clSetKernelArg(ckHistogram64, 1, sizeof(cl_mem),  (void *)&d_Data);
        ciErrNum |= clSetKernelArg(ckHistogram64, 2, sizeof(cl_uint), (void *)&dataCount);
        shrCheckError(ciErrNum, CL_SUCCESS);

        localWorkSize = HISTOGRAM64_WORKGROUP_SIZE;
        globalWorkSize = histogramCount * HISTOGRAM64_WORKGROUP_SIZE;

        ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckHistogram64, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);
        shrCheckError(ciErrNum, CL_SUCCESS);
    }

    {
        ciErrNum  = clSetKernelArg(ckMergeHistogram64, 0, sizeof(cl_mem),  (void *)&d_Histogram);
        ciErrNum |= clSetKernelArg(ckMergeHistogram64, 1, sizeof(cl_mem),  (void *)&d_PartialHistograms);
        ciErrNum |= clSetKernelArg(ckMergeHistogram64, 2, sizeof(cl_uint), (void *)&histogramCount);
        shrCheckError(ciErrNum, CL_SUCCESS);

        localWorkSize = MERGE_WORKGROUP_SIZE;
        globalWorkSize = HISTOGRAM64_BIN_COUNT * MERGE_WORKGROUP_SIZE;

        ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckMergeHistogram64, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);
        shrCheckError(ciErrNum, CL_SUCCESS);

        return HISTOGRAM64_WORKGROUP_SIZE;
    }
}
예제 #5
0
파일: GPUmat.cpp 프로젝트: gosha1128/NYU
/*************************************************
 * HOST DRIVERS
 *************************************************/
void hostGPUDRV(CUfunction drvfun, int N, int nrhs, hostdrv_pars_t *prhs) {


  unsigned int maxthreads = MAXTHREADS_STREAM;
  int nstreams = iDivUp(N, maxthreads*BLOCK_DIM1D);
  CUresult err = CUDA_SUCCESS;
  for (int str = 0; str < nstreams; str++) {
    int offset = str * maxthreads * BLOCK_DIM1D;
    int size = 0;
    if (str == (nstreams - 1))
      size = N - str * maxthreads * BLOCK_DIM1D;
    else
      size = maxthreads * BLOCK_DIM1D;


    int gridx = iDivUp(size, BLOCK_DIM1D); // number of x blocks

    // setup execution parameters

    if (CUDA_SUCCESS != (err = cuFuncSetBlockShape(drvfun, BLOCK_DIM1D, 1, 1))) {
      mexErrMsgTxt("Error in cuFuncSetBlockShape");
    }

    if (CUDA_SUCCESS != cuFuncSetSharedSize(drvfun, 0)) {
      mexErrMsgTxt("Error in cuFuncSetSharedSize");
    }


    // add parameters
    int poffset = 0;

    // CUDA kernels interface
    // N: number of elements
    // offset: used for streams
    ALIGN_UP(poffset, __alignof(size));
    if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, size)) {
      mexErrMsgTxt("Error in cuParamSeti");
    }
    poffset += sizeof(size);

    ALIGN_UP(poffset, __alignof(offset));
    if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, offset)) {
      mexErrMsgTxt("Error in cuParamSeti");
    }
    poffset += sizeof(offset);

    for (int p=0;p<nrhs;p++) {
      ALIGN_UP(poffset, prhs[p].align);
      if (CUDA_SUCCESS
          != cuParamSetv(drvfun, poffset, prhs[p].par, prhs[p].psize)) {
        mexErrMsgTxt("Error in cuParamSetv");
      }
      poffset += prhs[p].psize;
    }

    if (CUDA_SUCCESS != cuParamSetSize(drvfun, poffset)) {
      mexErrMsgTxt("Error in cuParamSetSize");
    }

    err = cuLaunchGridAsync(drvfun, gridx, 1, 0);
    if (CUDA_SUCCESS != err) {
      mexErrMsgTxt("Error running kernel");
    }
  }

}
예제 #6
0
파일: cuAvgPool.cpp 프로젝트: gosha1128/NYU
//host driver
//void hostDriver(CUfunction drvfun, dim3 grid, dim3 threads, int shmem, int imgSizeX, int imgSizeY, int shmemX, int nrhs, hostdrv_pars_t *prhs) {
void hostDriver(CUfunction drvfun, int N, int nrhs, hostdrv_pars_t *prhs, int imx, int imy, int outx, int outy, int poolx, int pooly){
    //mexPrintf("threads.x: %d threads.y: %d threads.z %d\n",threads.x,threads.y,threads.z);
    
    
    unsigned int maxthreads = 65000;
    // Set threads per block here.
        unsigned int blocksdim1d = 256;
    dim3 threads(blocksdim1d, 1, 1);
    int nstreams = iDivUp(N, maxthreads*blocksdim1d);
    CUresult err = CUDA_SUCCESS;
    for (int str = 0; str < nstreams; str++) {
        int offset = str * maxthreads * blocksdim1d;
        int size = 0;
        if (str == (nstreams - 1))
            size = N - str * maxthreads * blocksdim1d;
        else
            size = maxthreads * blocksdim1d;
        
        
        int gridx = iDivUp(size, blocksdim1d); // number of x blocks
        
        // setup execution parameters
        
        if (CUDA_SUCCESS != (err = cuFuncSetBlockShape(drvfun, threads.x, threads.y, threads.y))) {
            mexErrMsgTxt("Error in cuFuncSetBlockShape");
        }
        
        if (CUDA_SUCCESS != cuFuncSetSharedSize(drvfun, 0)) {
            mexErrMsgTxt("Error in cuFuncSetSharedSize");
        }
        
        //mexPrintf("block shape ok\n");
        
        // add parameters
        int poffset = 0;
        
        // CUDA kernels interface
        // N: number of elements
        for (int p=0;p<nrhs;p++) {
            ALIGN_UP(poffset, prhs[p].align);
            if (CUDA_SUCCESS
                    != cuParamSetv(drvfun, poffset, prhs[p].par, prhs[p].psize)) {
                mexErrMsgTxt("Error in cuParamSetv");
            }
            poffset += prhs[p].psize;
        }
        
        ALIGN_UP(poffset, __alignof(size));
        if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, size)) {
            mexErrMsgTxt("Error in cuParamSeti");
        }
        poffset += sizeof(size);
        
        ALIGN_UP(poffset, __alignof(offset));
        if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, offset)) {
            mexErrMsgTxt("Error in cuParamSeti");
        }
        poffset += sizeof(offset);
        
        ALIGN_UP(poffset, __alignof(imx));
        if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, imx)) {
            mexErrMsgTxt("Error in cuParamSeti");
        }
        poffset += sizeof(imx);
        
        ALIGN_UP(poffset, __alignof(imy));
        if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, imy)) {
            mexErrMsgTxt("Error in cuParamSeti");
        }
        poffset += sizeof(imy);
        
        ALIGN_UP(poffset, __alignof(outx));
        if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, outx)) {
            mexErrMsgTxt("Error in cuParamSeti");
        }
        poffset += sizeof(outx);
        
        ALIGN_UP(poffset, __alignof(outy));
        if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, outy)) {
            mexErrMsgTxt("Error in cuParamSeti");
        }
        poffset += sizeof(outy);
        
        ALIGN_UP(poffset, __alignof(poolx));
        if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, poolx)) {
            mexErrMsgTxt("Error in cuParamSeti");
        }
        poffset += sizeof(poolx);
        
        ALIGN_UP(poffset, __alignof(pooly));
        if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, pooly)) {
            mexErrMsgTxt("Error in cuParamSeti");
        }
        poffset += sizeof(pooly);
        
//   if (CUDA_SUCCESS != cuParamSeti(drvfun, poffset, shmemX)) {
//     mexErrMsgTxt("Error in cuParamSeti");
//   }
//   poffset += sizeof(shmemX);
        
        if (CUDA_SUCCESS != cuParamSetSize(drvfun, poffset)) {
            mexErrMsgTxt("Error in cuParamSetSize");
        }
        
        err = cuLaunchGridAsync(drvfun, gridx, 1, 0);
        if (CUDA_SUCCESS != err) {
            mexErrMsgTxt("Error running kernel");
        }
        
    }
}
예제 #7
0
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main( int argc, char** argv) 
{
    //start logs
    shrSetLogFileName ("volumeRender.txt");
    shrLog("%s Starting...\n\n", argv[0]); 

    if (cutCheckCmdLineFlag(argc, (const char **)argv, "qatest") ||
		cutCheckCmdLineFlag(argc, (const char **)argv, "noprompt")) 
	{
        g_bQAReadback = true;
        fpsLimit = frameCheckNumber;
    }

    if (cutCheckCmdLineFlag(argc, (const char **)argv, "glverify")) 
	{
        g_bQAGLVerify = true;
        fpsLimit = frameCheckNumber;
    }

    if (g_bQAReadback) {
	    // use command-line specified CUDA device, otherwise use device with highest Gflops/s
        if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) {
            cutilDeviceInit(argc, argv);
        } else {
            cudaSetDevice( cutGetMaxGflopsDeviceId() );
        }

    } else {
        // First initialize OpenGL context, so we can properly set the GL for CUDA.
        // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop.
        initGL( &argc, argv );

	    // use command-line specified CUDA device, otherwise use device with highest Gflops/s
        if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) {
            cutilGLDeviceInit(argc, argv);
        } else {
            cudaGLSetGLDevice( cutGetMaxGflopsDeviceId() );
        }
/*
        int device;
        struct cudaDeviceProp prop;
        cudaGetDevice( &device );
        cudaGetDeviceProperties( &prop, device );
        if( !strncmp( "Tesla", prop.name, 5 ) ) {
            shrLog("This sample needs a card capable of OpenGL and display.\n");
            shrLog("Please choose a different device with the -device=x argument.\n");
            cutilExit(argc, argv);
        }
*/
	}

    // parse arguments
    char *filename;
    if (cutGetCmdLineArgumentstr( argc, (const char**) argv, "file", &filename)) {
        volumeFilename = filename;
    }
    int n;
    if (cutGetCmdLineArgumenti( argc, (const char**) argv, "size", &n)) {
        volumeSize.width = volumeSize.height = volumeSize.depth = n;
    }
    if (cutGetCmdLineArgumenti( argc, (const char**) argv, "xsize", &n)) {
        volumeSize.width = n;
    }
    if (cutGetCmdLineArgumenti( argc, (const char**) argv, "ysize", &n)) {
        volumeSize.height = n;
    }
    if (cutGetCmdLineArgumenti( argc, (const char**) argv, "zsize", &n)) {
         volumeSize.depth = n;
    }

    // load volume data
    char* path = shrFindFilePath(volumeFilename, argv[0]);
    if (path == 0) {
        shrLog("Error finding file '%s'\n", volumeFilename);
        exit(EXIT_FAILURE);
    }

    size_t size = volumeSize.width*volumeSize.height*volumeSize.depth*sizeof(VolumeType);
    void *h_volume = loadRawFile(path, size);
    
    initCuda(h_volume, volumeSize);
    free(h_volume);

    cutilCheckError( cutCreateTimer( &timer));

    shrLog("Press '=' and '-' to change density\n"
           "      ']' and '[' to change brightness\n"
           "      ';' and ''' to modify transfer function offset\n"
           "      '.' and ',' to modify transfer function scale\n\n");

    // calculate new grid size
    gridSize = dim3(iDivUp(width, blockSize.x), iDivUp(height, blockSize.y));

    if (g_bQAReadback) {
        g_CheckRender = new CheckBackBuffer(width, height, 4, false);
        g_CheckRender->setPixelFormat(GL_RGBA);
        g_CheckRender->setExecPath(argv[0]);
        g_CheckRender->EnableQAReadback(true);

        uint *d_output;
        cutilSafeCall(cudaMalloc((void**)&d_output, width*height*sizeof(uint)));
        cutilSafeCall(cudaMemset(d_output, 0, width*height*sizeof(uint)));

        float modelView[16] = 
        {
            1.0f, 0.0f, 0.0f, 0.0f,
            0.0f, 1.0f, 0.0f, 0.0f,
            0.0f, 0.0f, 1.0f, 0.0f,
            0.0f, 0.0f, 4.0f, 1.0f
        };

        invViewMatrix[0] = modelView[0]; invViewMatrix[1] = modelView[4]; invViewMatrix[2] = modelView[8]; invViewMatrix[3] = modelView[12];
        invViewMatrix[4] = modelView[1]; invViewMatrix[5] = modelView[5]; invViewMatrix[6] = modelView[9]; invViewMatrix[7] = modelView[13];
        invViewMatrix[8] = modelView[2]; invViewMatrix[9] = modelView[6]; invViewMatrix[10] = modelView[10]; invViewMatrix[11] = modelView[14];

        // call CUDA kernel, writing results to PBO
	    copyInvViewMatrix(invViewMatrix, sizeof(float4)*3);
        
        // Start timer 0 and process n loops on the GPU 
        int nIter = 10;
        for (int i = -1; i < nIter; i++)
        {
            if( i == 0 ) {
                cudaThreadSynchronize();
                cutStartTimer(timer); 
            }
            
            render_kernel(gridSize, blockSize, d_output, width, height, density, brightness, transferOffset, transferScale);
        }
        cudaThreadSynchronize();
        cutStopTimer(timer);
        // Get elapsed time and throughput, then log to sample and master logs
        double dAvgTime = cutGetTimerValue(timer)/(nIter * 1000.0);
        shrLogEx(LOGBOTH | MASTER, 0, "volumeRender, Throughput = %.4f MTexels/s, Time = %.5f s, Size = %u Texels, NumDevsUsed = %u, Workgroup = %u\n", 
               (1.0e-6 * width * height)/dAvgTime, dAvgTime, (width * height), 1, blockSize.x * blockSize.y); 
        

        cutilCheckMsg("Error: render_kernel() execution FAILED");
        cutilSafeCall( cudaThreadSynchronize() );

        cutilSafeCall( cudaMemcpy(g_CheckRender->imageData(), d_output, width*height*4, cudaMemcpyDeviceToHost) );
        g_CheckRender->savePPM(sOriginal[g_Index], true, NULL);

        if (!g_CheckRender->PPMvsPPM(sOriginal[g_Index], sReference[g_Index], MAX_EPSILON_ERROR, THRESHOLD)) {
            shrLog("\nFAILED\n\n");
        } else {
            shrLog("\nPASSED\n\n");
        }

        cudaFree(d_output);
    	freeCudaBuffers();

        if (g_CheckRender) {
            delete g_CheckRender; g_CheckRender = NULL;
        }

    } else {
        // This is the normal rendering path for VolumeRender
        glutDisplayFunc(display);
        glutKeyboardFunc(keyboard);
        glutMouseFunc(mouse);
        glutMotionFunc(motion);
        glutReshapeFunc(reshape);
        glutIdleFunc(idle);

        initPixelBuffer();

        if (g_bQAGLVerify) {
            g_CheckRender = new CheckBackBuffer(width, height, 4);
            g_CheckRender->setPixelFormat(GL_RGBA);
            g_CheckRender->setExecPath(argv[0]);
            g_CheckRender->EnableQAReadback(true);
        }
        atexit(cleanup);

        glutMainLoop();
    }

    cudaThreadExit();
    shrEXIT(argc, (const char**)argv);
}
예제 #8
0
void initData(int argc, char **argv)
{
    // parse arguments
    char *filename;

    if (getCmdLineArgumentString(argc, (const char **) argv, "file", &filename))
    {
        volumeFilename = filename;
    }

    int n;


    if (checkCmdLineFlag(argc, (const char **) argv, "size"))
    {
        n = getCmdLineArgumentInt(argc, (const char **) argv, "size");
        volumeSize.width = volumeSize.height = volumeSize.depth = n;
    }

    if (checkCmdLineFlag(argc, (const char **) argv, "xsize"))
    {
        n = getCmdLineArgumentInt(argc, (const char **) argv, "xsize");
        volumeSize.width = n;
    }

    if (checkCmdLineFlag(argc, (const char **) argv, "ysize"))
    {
        n = getCmdLineArgumentInt(argc, (const char **) argv, "ysize");
        volumeSize.height = n;
    }

    if (checkCmdLineFlag(argc, (const char **) argv, "zsize"))
    {
        n = getCmdLineArgumentInt(argc, (const char **) argv, "zsize");
        volumeSize.depth = n;
    }

    char *path = sdkFindFilePath(volumeFilename, argv[0]);

    if (path == 0)
    {
        printf("Error finding file '%s'\n", volumeFilename);
        exit(EXIT_FAILURE);
    }

    size_t size = volumeSize.width*volumeSize.height*volumeSize.depth*sizeof(VolumeType);
    void *h_volume = loadRawFile(path, size);

    FilterKernel_init();
    Volume_init(&volumeOriginal,volumeSize, h_volume, 0);
    free(h_volume);
    Volume_init(&volumeFilter0, volumeSize, NULL, 1);
    Volume_init(&volumeFilter1, volumeSize, NULL, 1);
    VolumeRender_init();
    VolumeRender_setPreIntegrated(preIntegrated);
    VolumeRender_setVolume(&volumeOriginal);

    sdkCreateTimer(&timer);
    sdkCreateTimer(&animationTimer);
    sdkStartTimer(&animationTimer);

    // calculate new grid size
    gridSize = dim3(iDivUp(width, blockSize.x), iDivUp(height, blockSize.y));
}
예제 #9
0
int main(int argc, char *argv[])
{
    QCoreApplication a(argc, argv);


    int size = 20000000;
    int vt = 5;



    //example
 //  size =9;


    int* vector = (int *) malloc (size * sizeof(int));
    int* vectorCheck = (int *) malloc (size * sizeof(int));

/*
    //example
    vector[0] = 1;
    vector[1] = 3;
    vector[2] = 5;
    vector[3] = 2;
    vector[4] = 7;
    vector[5] = 9;
    vector[6] = 6;
    vector[7] = 2;
    vector[8] = 3;
*/


    int number = 0;
    for (int i = 0; i < size; i++){
        if (i % (vt * 128) == 0)
            number++;
        vector[i] = rand() % 10;
    }

/*
    for (int i=0; i<size; i++)
        printf(" %d ", vector[i]);
   printf("\n");
*/

    pickCudaDevice();
    checkCudaError();

    int* d_vector;
    cudaMalloc((void **) &d_vector, size * sizeof(int));
    checkCudaError();

    int* d_result;
    cudaMalloc((void **) &d_result, size * sizeof(int));
    checkCudaError();

    int* d_vectorCheck;
    cudaMalloc((void **) &d_vectorCheck, size * sizeof(int));
    checkCudaError();

    int* d_resultCheck;
    cudaMalloc((void **) &d_resultCheck, size * sizeof(int));
    checkCudaError();

    uint numThreads, numBlocks;

    cudaMemcpy(d_vector,vector,size * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_vectorCheck,vector,size * sizeof(int), cudaMemcpyHostToDevice);


    computeGridSize(iDivUp(size,VT),NTHREADS,numBlocks,numThreads);

    printf("Start kernel\n");
    //reduce_wrapper(numBlocks,numThreads,d_result,d_vector,size, vt);
    //checkCudaError();

    //gpu time measurement
    cudaEvent_t gstart_exScan,gstop_exScan;
    cudaEventCreate(&gstart_exScan);
    cudaEventCreate(&gstop_exScan);

    cudaEventRecord(gstart_exScan, 0);

    exclusiveScan_wrapper2(numBlocks,
                          numThreads,
                          d_result,
                          d_vector,
                          size,
                          VT);

    cudaEventRecord(gstop_exScan, 0);
    cudaEventSynchronize(gstop_exScan);

    float gpu_time_exScan;
    cudaEventElapsedTime(&gpu_time_exScan, gstart_exScan, gstop_exScan);
    printf("Our GPU version has finished, it took %f ms\n",gpu_time_exScan );

    cudaEventDestroy(gstart_exScan); //cleaning up a bit
    cudaEventDestroy(gstop_exScan);
    checkCudaError();


    //gpu time measurement
    cudaEvent_t gstart,gstop;
    cudaEventCreate(&gstart);
    cudaEventCreate(&gstop);

    cudaEventRecord(gstart, 0);

    exclusiveScan_thrust(d_vectorCheck,
                         d_vectorCheck + size,
                         d_resultCheck,
                         0);

    cudaEventRecord(gstop, 0);
    cudaEventSynchronize(gstop);

    float gpu_time;
    cudaEventElapsedTime(&gpu_time, gstart, gstop);
    printf("Thrust version has finished, it took %f ms\n",gpu_time );

    cudaEventDestroy(gstart); //cleaning up a bit
    cudaEventDestroy(gstop);

    checkCudaError();

    printf("End kernel\n");

    cudaMemcpy(vector,d_result,size * sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(vectorCheck,d_resultCheck,size * sizeof(int), cudaMemcpyDeviceToHost);

    /*
    for (int i=0; i<size; i++)
        printf(" %d ", vectorCheck[i]);
   printf("\n");
*/

/*
   for (int i=0; i<size; i++)
       printf(" %d ", vector[i]);
  printf("\n");
*/
    printf("Difference %d\n", vectorsDifference(vector,vectorCheck,size));

    if(areVectorsEqual(vector,vectorCheck,size) == 0)
        printf("Vectors are equal!!\n");
    else
        printf("Vectors are NOT equal :( \n");
    checkCudaError();

    cudaFree(d_vector);
    free(vector);
    cudaFree(d_result);
    cudaFree(d_resultCheck);
    cudaFree(d_vectorCheck);
    free(vectorCheck);
    return 0;
}
예제 #10
0
CUdeviceptr presum(CUdeviceptr *d_Input, uint arrayLength)
{

    uint N = 0;
    CUdeviceptr d_Output;
    struct timeval start,stop;
    gettimeofday(&start, NULL);
    initScan();
    gettimeofday(&stop, NULL);

    if(arrayLength <= MAX_SHORT_ARRAY_SIZE && arrayLength > MIN_SHORT_ARRAY_SIZE)
      {    
        for(uint i = 4; i<=MAX_SHORT_ARRAY_SIZE ; i<<=1){
          if(arrayLength <= i){
            N = i;
            break;
          }
        }
        checkCudaErrors(cudaMalloc((void **)&d_Output, N * sizeof(uint)));

        checkCudaErrors(cudaDeviceSynchronize());

        scanExclusiveShort((uint *)d_Output, (uint *)(*d_Input), N);
        //szWorkgroup = scanExclusiveShort((uint *)d_Output, (uint *)d_Input, 1, N);

        checkCudaErrors(cudaDeviceSynchronize());

    }else if(arrayLength <= MAX_LARGE_ARRAY_SIZE)
    {

      N = MAX_SHORT_ARRAY_SIZE * iDivUp(arrayLength,MAX_SHORT_ARRAY_SIZE);

      checkCudaErrors(cudaMalloc((void **)&d_Output, N * sizeof(uint)));      
      
      checkCudaErrors(cudaDeviceSynchronize());

      scanExclusiveLarge((uint *)d_Output, (uint *)(*d_Input), N);
      
      checkCudaErrors(cudaDeviceSynchronize());

    }else if(arrayLength <= MAX_LL_SIZE)
      {


        N = MAX_LARGE_ARRAY_SIZE * iDivUp(arrayLength,MAX_LARGE_ARRAY_SIZE);

        printf("N = %d\n",N);

        checkCudaErrors(cudaMalloc((void **)&d_Output, N * sizeof(uint)));      

        checkCudaErrors(cudaDeviceSynchronize());

        scanExclusiveLL((uint *)d_Output, (uint *)(*d_Input), N);
        
        checkCudaErrors(cudaDeviceSynchronize());

      }else{
      cuMemFree(d_Output);
      closeScan();

      return NULL;      
    }

    closeScan();

    cuMemFree(*d_Input);
    *d_Input = d_Output;

    
    printf("inside scan time:\n");
    printDiff(start,stop);

    return d_Output;
}