cumem(int newsz){
   sz = newsz;
   cudaMalloc(&data, sz);
   status = inuse;
   next = NULL;
 };
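The constructor above is shown without its enclosing type. A minimal sketch of the cumem pool node it implies (member names are taken from the constructor body; the second enum state and the destructor are assumptions):

struct cumem {
    enum state { inuse, avail };    // "avail" is an assumed second state
    void  *data;                    // device pointer filled in by cudaMalloc
    int    sz;                      // allocation size in bytes
    state  status;                  // whether the block is currently handed out
    cumem *next;                    // next block in the allocator's list

    cumem(int newsz);               // the constructor shown above
    ~cumem() { cudaFree(data); }    // assumed cleanup; not part of the original
};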
Example #2
ImGpu::ImGpu(const char* filename)
{
	FILE *fp = 0;
	int t1, t2, t3, t4;
	cudaError_t cudaStatus;

	/* The image geometry is encoded in the filename: WxHxBPPxDIM_... */
	sscanf_s(filename, "%dx%dx%dx%d_", &t1, &t2, &t3, &t4);

	width = t1;
	height = t2;
	bpp = t3;
	dimension = t4;

	/* One byte per sample at 8 bpp, two bytes at 16 bpp. */
	size_t bytesPerSample = (16 == bpp) ? sizeof(unsigned short) : sizeof(char);
	size_t numBytes = (size_t)width * height * dimension * bytesPerSample;

	void *pxl = 0;

	/* Allocate memory for the pixels on the GPU */
	cudaStatus = cudaMalloc((void**)&dev_pxl, numBytes);
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "cudaMalloc failed!");
		goto Error;
	}
	cudaMemset(dev_pxl, 255, numBytes);
	pxl = new char[numBytes];

	/*
	* Open the file to read the pixels
	*/
	fopen_s(&fp, filename, "rb"); /* open for reading */

	if (0 != fp) {
		/* Read the full byte count; reading one byte per sample would drop
		   half of a 16 bpp image. */
		std::fread(pxl, 1, numBytes, fp);
		fclose(fp); /* close the file */
	}

	// Copy the pixels from host memory to the GPU buffer.
	cudaStatus = cudaMemcpy(dev_pxl, pxl, numBytes, cudaMemcpyHostToDevice);
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "cudaMemcpy failed!");
		goto Error;
	}

	delete[] static_cast<char*>(pxl);
	return;
Error:
	cudaFree(dev_pxl);
	delete[] static_cast<char*>(pxl);
}
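The constructor leaves ownership of dev_pxl with the object; a matching destructor (assumed, it is not shown in the listing) would be:

ImGpu::~ImGpu()
{
	cudaFree(dev_pxl);
}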
/* MEX gateway: called from MATLAB as C = sgemm(ta, tb, alpha, beta, A, B, C) */
void mexFunction( int nlhs, mxArray *plhs[],
                  int nrhs, const mxArray *prhs[]) {

    if (nrhs != 7) {
        mexErrMsgTxt("sgemm requires 7 input arguments");
    } else if (nlhs != 1) {
        mexErrMsgTxt("sgemm requires 1 output argument");
    }

    if ( !mxIsSingle(prhs[4]) ||
            !mxIsSingle(prhs[5]) ||
            !mxIsSingle(prhs[6]))   {
        mexErrMsgTxt("Input arrays must be single precision.");
    }

    int ta = (int) mxGetScalar(prhs[0]);
    int tb = (int) mxGetScalar(prhs[1]);
    float alpha = (float) mxGetScalar(prhs[2]);
    float beta = (float) mxGetScalar(prhs[3]);
    float *h_A = (float*) mxGetData(prhs[4]);
    float *h_B = (float*) mxGetData(prhs[5]);
    float *h_C = (float*) mxGetData(prhs[6]);

    int M = mxGetM(prhs[4]);   /* gets number of rows of A */
    int K = mxGetN(prhs[4]);   /* gets number of columns of A */
    int L = mxGetM(prhs[5]);   /* gets number of rows of B */
    int N = mxGetN(prhs[5]);   /* gets number of columns of B */

    cublasOperation_t transa, transb;
    int MM, KK, NN;
    if (ta == 0) {
        transa = CUBLAS_OP_N;
        MM=M;
        KK=K;
    } else {
        transa = CUBLAS_OP_T;
        MM=K;
        KK=M;
    }

    if (tb == 0) {
        transb = CUBLAS_OP_N;
        NN=N;
    } else {
        transb = CUBLAS_OP_T;
        NN=L;
    }

    /*	printf("transa=%c\n",transa);
    	printf("transb=%c\n",transb);
    	printf("alpha=%f\n",alpha);
    	printf("beta=%f\n",beta);	*/

    /* Left hand side matrix set up */
    mwSize dims0[2];
    dims0[0]=MM;
    dims0[1]=NN;
    plhs[0] = mxCreateNumericArray(2,dims0,mxSINGLE_CLASS,mxREAL);
    float *h_C_out = (float*) mxGetData(plhs[0]);

    cublasStatus_t status;
    cublasHandle_t handle;
    status = cublasCreate(&handle);
    if (status != CUBLAS_STATUS_SUCCESS) {
        mexErrMsgTxt("!!!! CUBLAS initialization error\n");
    }

    float* d_A = 0;
    float* d_B = 0;
    float* d_C = 0;

    /* Allocate device memory for the matrices */
    if (cudaMalloc((void**)&d_A, M * K * sizeof(d_A[0])) != cudaSuccess) {
        mexErrMsgTxt("!!!! device memory allocation error (allocate A)\n");
    }
    if (cudaMalloc((void**)&d_B, L * N * sizeof(d_B[0])) != cudaSuccess) {
        mexErrMsgTxt("!!!! device memory allocation error (allocate B)\n");
    }
    if (cudaMalloc((void**)&d_C, MM * NN * sizeof(d_C[0])) != cudaSuccess) {
        mexErrMsgTxt("!!!! device memory allocation error (allocate C)\n");
    }

    /* Initialize the device matrices with the host matrices */
    status = cublasSetVector(M * K, sizeof(h_A[0]), h_A, 1, d_A, 1);
    if (status != CUBLAS_STATUS_SUCCESS) {
        mexErrMsgTxt("!!!! device access error (write A)\n");
    }
    status = cublasSetVector(L * N, sizeof(h_B[0]), h_B, 1, d_B, 1);
    if (status != CUBLAS_STATUS_SUCCESS) {
        mexErrMsgTxt("!!!! device access error (write B)\n");
    }
    status = cublasSetVector(MM * NN, sizeof(h_C[0]), h_C, 1, d_C, 1);
    if (status != CUBLAS_STATUS_SUCCESS) {
        mexErrMsgTxt("!!!! device access error (write C)\n");
    }

    /* Performs operation using cublas */
    status = cublasSgemm(handle, transa, transb, MM, NN, KK, &alpha, d_A, M, d_B, L, &beta, d_C, MM);
    if (status != CUBLAS_STATUS_SUCCESS) {
        mexErrMsgTxt("!!!! kernel execution error.\n");
    }

    /* Read the result back */
    status = cublasGetVector(MM * NN, sizeof(h_C[0]), d_C, 1, h_C_out, 1);
    if (status != CUBLAS_STATUS_SUCCESS) {
        mexErrMsgTxt("!!!! device access error (read C)\n");
    }

    if (cudaFree(d_A) != cudaSuccess) {
        mexErrMsgTxt("!!!! memory free error (A)\n");
    }
    if (cudaFree(d_B) != cudaSuccess) {
        mexErrMsgTxt("!!!! memory free error (B)\n");
    }
    if (cudaFree(d_C) != cudaSuccess) {
        mexErrMsgTxt("!!!! memory free error (C)\n");
    }

    /* Shutdown */
    status = cublasDestroy(handle);
    if (status != CUBLAS_STATUS_SUCCESS) {
        mexErrMsgTxt("!!!! shutdown error (A)\n");
    }
}
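The cuBLAS status checks above repeat the same pattern six times; a small helper could collapse them. A sketch (the helper is hypothetical, not part of the original MEX file; mexErrMsgTxt does not return, so the early-exit behavior is unchanged):

static void checkCublas(cublasStatus_t status, const char *msg)
{
    /* Note: device buffers allocated before a failure are leaked,
       exactly as in the original checks. */
    if (status != CUBLAS_STATUS_SUCCESS) {
        mexErrMsgTxt(msg);
    }
}

/* Usage:
   checkCublas(cublasSetVector(M * K, sizeof(h_A[0]), h_A, 1, d_A, 1),
               "!!!! device access error (write A)\n"); */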
Example #4
int main( int argc, char **argv )
{
    uchar *h_Data;
    uint  *h_HistogramCPU, *h_HistogramGPU;
    uchar *d_Data;
    uint  *d_Histogram;
    uint hTimer;
    int PassFailFlag = 1;
    uint byteCount = 64 * 1048576;
    uint uiSizeMult = 1;

    cudaDeviceProp deviceProp;
    deviceProp.major = 0;
    deviceProp.minor = 0;
    int dev;

	shrQAStart(argc, argv);

	// set logfile name and start logs
    shrSetLogFileName ("histogram.txt");

    //Use command-line specified CUDA device, otherwise use device with highest Gflops/s
    if( shrCheckCmdLineFlag(argc, (const char**)argv, "device") ) {
        dev = cutilDeviceInit(argc, argv);
        if (dev < 0) {
           printf("No CUDA Capable Devices found, exiting...\n");
           shrQAFinishExit(argc, (const char **)argv, QA_WAIVED);
        }
    } else {
        cudaSetDevice( dev = cutGetMaxGflopsDeviceId() );
        cutilSafeCall( cudaChooseDevice(&dev, &deviceProp) );
    }
    cutilSafeCall( cudaGetDeviceProperties(&deviceProp, dev) );

	printf("CUDA device [%s] has %d Multi-Processors, Compute %d.%d\n", 
		deviceProp.name, deviceProp.multiProcessorCount, deviceProp.major, deviceProp.minor);

	int version = deviceProp.major * 0x10 + deviceProp.minor;

	if(version < 0x11) 
    {
        printf("There is no device supporting a minimum of CUDA compute capability 1.1 for this SDK sample\n");
        cutilDeviceReset();
		shrQAFinishExit(argc, (const char **)argv, QA_WAIVED);
    }

    cutilCheckError(cutCreateTimer(&hTimer));

    // Optional Command-line multiplier to increase size of array to histogram
    if (shrGetCmdLineArgumentu(argc, (const char**)argv, "sizemult", &uiSizeMult))
    {
        uiSizeMult = CLAMP(uiSizeMult, 1, 10);
        byteCount *= uiSizeMult;
    }

		shrLog("Initializing data...\n");
        shrLog("...allocating CPU memory.\n");
            h_Data         = (uchar *)malloc(byteCount);
            h_HistogramCPU = (uint  *)malloc(HISTOGRAM256_BIN_COUNT * sizeof(uint));
            h_HistogramGPU = (uint  *)malloc(HISTOGRAM256_BIN_COUNT * sizeof(uint));

        shrLog("...generating input data\n");
            srand(2009);
            for(uint i = 0; i < byteCount; i++) 
                h_Data[i] = rand() % 256;

        shrLog("...allocating GPU memory and copying input data\n\n");
            cutilSafeCall( cudaMalloc((void **)&d_Data, byteCount  ) );
            cutilSafeCall( cudaMalloc((void **)&d_Histogram, HISTOGRAM256_BIN_COUNT * sizeof(uint)  ) );
            cutilSafeCall( cudaMemcpy(d_Data, h_Data, byteCount, cudaMemcpyHostToDevice) );

	//-----
	// 64 bin histogram
	//------
	{
        shrLog("Starting up 64-bin histogram...\n\n");
            initHistogram64();

        shrLog("Running 64-bin GPU histogram for %u bytes (%u runs)...\n\n", byteCount, numRuns);
            for(int iter = -1; iter < numRuns; iter++){
                //iter == -1 -- warmup iteration
                if(iter == 0){
                    cutilSafeCall( cutilDeviceSynchronize() );
                    cutilCheckError( cutResetTimer(hTimer) );
                    cutilCheckError( cutStartTimer(hTimer) );
                }

                histogram64(d_Histogram, d_Data, byteCount);
            }

            cutilSafeCall( cutilDeviceSynchronize() );
            cutilCheckError(  cutStopTimer(hTimer));
            double dAvgSecs = 1.0e-3 * (double)cutGetTimerValue(hTimer) / (double)numRuns;
        shrLog("histogram64() time (average) : %.5f sec, %.4f MB/sec\n\n", dAvgSecs, ((double)byteCount * 1.0e-6) / dAvgSecs);
        shrLogEx(LOGBOTH | MASTER, 0, "histogram64, Throughput = %.4f MB/s, Time = %.5f s, Size = %u Bytes, NumDevsUsed = %u, Workgroup = %u\n", 
                (1.0e-6 * (double)byteCount / dAvgSecs), dAvgSecs, byteCount, 1, HISTOGRAM64_THREADBLOCK_SIZE); 

        shrLog("\nValidating GPU results...\n");
            shrLog(" ...reading back GPU results\n");
                cutilSafeCall( cudaMemcpy(h_HistogramGPU, d_Histogram, HISTOGRAM64_BIN_COUNT * sizeof(uint), cudaMemcpyDeviceToHost) );

            shrLog(" ...histogram64CPU()\n");
               histogram64CPU(
                    h_HistogramCPU,
                    h_Data,
                    byteCount
                );

            shrLog(" ...comparing the results...\n");
                for(uint i = 0; i < HISTOGRAM64_BIN_COUNT; i++)
                    if(h_HistogramGPU[i] != h_HistogramCPU[i]) PassFailFlag = 0;
            shrLog(PassFailFlag ? " ...64-bin histograms match\n\n" : " ***64-bin histograms do not match!!!***\n\n" );

        shrLog("Shutting down 64-bin histogram...\n\n\n");
            closeHistogram64();
    }


	//-----
	// Histogram 256
	//-----
    {
        shrLog("Initializing 256-bin histogram...\n");
            initHistogram256();

        shrLog("Running 256-bin GPU histogram for %u bytes (%u runs)...\n\n", byteCount, numRuns);
            for(int iter = -1; iter < numRuns; iter++){
                //iter == -1 -- warmup iteration
                if(iter == 0){
                    cutilSafeCall( cutilDeviceSynchronize() );
                    cutilCheckError( cutResetTimer(hTimer) );
                    cutilCheckError( cutStartTimer(hTimer) );
                }

                histogram256(d_Histogram, d_Data, byteCount);
            }

            cutilSafeCall( cutilDeviceSynchronize() );
            cutilCheckError(  cutStopTimer(hTimer));
            double dAvgSecs = 1.0e-3 * (double)cutGetTimerValue(hTimer) / (double)numRuns;
        shrLog("histogram256() time (average) : %.5f sec, %.4f MB/sec\n\n", dAvgSecs, ((double)byteCount * 1.0e-6) / dAvgSecs);
        shrLogEx(LOGBOTH | MASTER, 0, "histogram256, Throughput = %.4f MB/s, Time = %.5f s, Size = %u Bytes, NumDevsUsed = %u, Workgroup = %u\n", 
                (1.0e-6 * (double)byteCount / dAvgSecs), dAvgSecs, byteCount, 1, HISTOGRAM256_THREADBLOCK_SIZE); 
                
        shrLog("\nValidating GPU results...\n");
            shrLog(" ...reading back GPU results\n");
                cutilSafeCall( cudaMemcpy(h_HistogramGPU, d_Histogram, HISTOGRAM256_BIN_COUNT * sizeof(uint), cudaMemcpyDeviceToHost) );

            shrLog(" ...histogram256CPU()\n");
                histogram256CPU(
                    h_HistogramCPU,
                    h_Data,
                    byteCount
                );

            shrLog(" ...comparing the results\n");
                for(uint i = 0; i < HISTOGRAM256_BIN_COUNT; i++)
                    if(h_HistogramGPU[i] != h_HistogramCPU[i]) PassFailFlag = 0;
            shrLog(PassFailFlag ? " ...256-bin histograms match\n\n" : " ***256-bin histograms do not match!!!***\n\n" );

        shrLog("Shutting down 256-bin histogram...\n\n\n");
            closeHistogram256();
    }


	//-----
	// Histogram Trish
	//-----
    {
        shrLog("Initializing 256-bin TRISH histogram...\n");
            initTrish256();

        shrLog("Running 256-bin GPU TRISH histogram for %u bytes (%u runs)...\n\n", byteCount, numRuns);
            for(int iter = -1; iter < numRuns; iter++)
			{
                //iter == -1 -- warmup iteration
                if (iter == 0)
				{
                    cutilSafeCall( cutilDeviceSynchronize() );
                    cutilCheckError( cutResetTimer(hTimer) );
                    cutilCheckError( cutStartTimer(hTimer) );
                }

                histogramTrish256( d_Histogram, d_Data, byteCount );
            }

            cutilSafeCall( cutilDeviceSynchronize() );
            cutilCheckError(  cutStopTimer(hTimer));
            double dAvgSecs = 1.0e-3 * (double)cutGetTimerValue(hTimer) / (double)numRuns;
        shrLog("histogramTRISH() time (average) : %.5f sec, %.4f MB/sec\n\n", dAvgSecs, ((double)byteCount * 1.0e-6) / dAvgSecs);
        shrLogEx(LOGBOTH | MASTER, 0, "histogramTRISH, Throughput = %.4f MB/s, Time = %.5f s, Size = %u Bytes, NumDevsUsed = %u, Workgroup = %u\n", 
                (1.0e-6 * (double)byteCount / dAvgSecs), dAvgSecs, byteCount, 1, HISTOGRAM256_THREADBLOCK_SIZE); 
                
        shrLog("\nValidating GPU results...\n");
            shrLog(" ...reading back GPU results\n");
                cutilSafeCall( cudaMemcpy(h_HistogramGPU, d_Histogram, HISTOGRAM256_BIN_COUNT * sizeof(uint), cudaMemcpyDeviceToHost) );

            shrLog(" ...histogram256CPU()\n");
                histogram256CPU(
                    h_HistogramCPU,
                    h_Data,
                    byteCount
                );

            shrLog(" ...comparing the results\n");
                for(uint i = 0; i < HISTOGRAM256_BIN_COUNT; i++)
                    if(h_HistogramGPU[i] != h_HistogramCPU[i]) PassFailFlag = 0;
            shrLog(PassFailFlag ? " ...256-bin histograms match\n\n" : " ***256-bin histograms do not match!!!***\n\n" );

        shrLog("Shutting down 256-bin TRISH histogram...\n\n\n");
            closeTrish256();
    }


    shrLog("Shutting down...\n");
        cutilCheckError(cutDeleteTimer(hTimer));
        cutilSafeCall( cudaFree(d_Histogram) );
        cutilSafeCall( cudaFree(d_Data) );
        free(h_HistogramGPU);
        free(h_HistogramCPU);
        free(h_Data);

    cutilDeviceReset();
	shrLog("%s - Test Summary\n", sSDKsample );
    // pass or fail (for both 64-bin and 256-bin histograms)
    shrQAFinishExit(argc, (const char **)argv, (PassFailFlag ? QA_PASSED : QA_FAILED));
}
Example #5
	void allocate() {
		this->destroy();
		check_error( cudaMalloc((void**)&_dptr, sizeof(value_type)) );
	}
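allocate() releases any previous buffer before reserving space for a single value_type on the device. A sketch of the holder this method appears to belong to (only allocate, destroy, _dptr, and check_error come from the listing; everything else is an assumption):

template <typename value_type>
class device_scalar {
    value_type *_dptr = nullptr;

public:
    void allocate();                 // the method shown above
    void destroy() {                 // assumed counterpart used by allocate()
        if (_dptr) {
            check_error( cudaFree(_dptr) );
            _dptr = nullptr;
        }
    }
    ~device_scalar() { this->destroy(); }
};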
Example #6
void sparse_matrix_t::alloc_device() 
{
    cudaMalloc((void**)&devJc, (numCols+1) * sizeof(int));
    cudaMalloc((void**)&devIr, numNonZeroElems * sizeof(int));
    cudaMalloc((void**)&devRVals, numNonZeroElems * sizeof(float));
}
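alloc_device() reserves the three arrays of a compressed-column sparse matrix (numCols+1 column pointers, plus row indices and values for each stored element) without checking return codes. A matching release routine, assuming the same member names (it is not part of the listing):

void sparse_matrix_t::free_device()
{
    cudaFree(devJc);
    cudaFree(devIr);
    cudaFree(devRVals);
}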
bool
runTestMax( int argc, char** argv, ReduceType datatype) 
{
    int size = 1<<24;    // number of elements to reduce
    int maxThreads = 256;  // number of threads per block
    int whichKernel = 6;
    int maxBlocks = 64;
    bool cpuFinalReduction = false;
    int cpuFinalThreshold = 1;

    cutGetCmdLineArgumenti( argc, (const char**) argv, "n", &size);
    cutGetCmdLineArgumenti( argc, (const char**) argv, "threads", &maxThreads);
    cutGetCmdLineArgumenti( argc, (const char**) argv, "kernel", &whichKernel);
    cutGetCmdLineArgumenti( argc, (const char**) argv, "maxblocks", &maxBlocks);
    
		shrLog("METHOD: MAX\n");
    shrLog("%d elements\n", size);
    shrLog("%d threads (max)\n", maxThreads);

    cpuFinalReduction = (cutCheckCmdLineFlag( argc, (const char**) argv, "cpufinal") == CUTTrue);
    cutGetCmdLineArgumenti( argc, (const char**) argv, "cputhresh", &cpuFinalThreshold);

    bool runShmoo = (cutCheckCmdLineFlag(argc, (const char**) argv, "shmoo") == CUTTrue);

    if (runShmoo)
    {
        shmoo<T>(1, 33554432, maxThreads, maxBlocks, datatype);
    }
    else
    {

        // create random input data on CPU
        unsigned int bytes = size * sizeof(T);

        T *h_idata = (T *) malloc(bytes);

        for(int i=0; i<size; i++) 
        {
            // Keep the numbers small so we don't get truncation error in the sum
            if (datatype == REDUCE_INT)
                h_idata[i] = (T)(rand() & 0xFF);
            else
                h_idata[i] = (rand() & 0xFF) / (T)RAND_MAX;
        }

        int numBlocks = 0;
        int numThreads = 0;
        getNumBlocksAndThreads(whichKernel, size, maxBlocks, maxThreads, numBlocks, numThreads);
        if (numBlocks == 1) cpuFinalThreshold = 1;

        // allocate mem for the result on host side
        T* h_odata = (T*) malloc(numBlocks*sizeof(T));

        shrLog("%d blocks\n\n", numBlocks);

        // allocate device memory and data
        T* d_idata = NULL;
        T* d_odata = NULL;

        cutilSafeCallNoSync( cudaMalloc((void**) &d_idata, bytes) );
        cutilSafeCallNoSync( cudaMalloc((void**) &d_odata, numBlocks*sizeof(T)) );

        // copy data directly to device memory
        cutilSafeCallNoSync( cudaMemcpy(d_idata, h_idata, bytes, cudaMemcpyHostToDevice) );
        cutilSafeCallNoSync( cudaMemcpy(d_odata, h_idata, numBlocks*sizeof(T), cudaMemcpyHostToDevice) );

        // warm-up
        maxreduce<T>(size, numThreads, numBlocks, whichKernel, d_idata, d_odata);
        
        int testIterations = 100;

        unsigned int timer = 0;
        cutilCheckError( cutCreateTimer( &timer));
        
        T gpu_result = 0;

        gpu_result = benchmarkReduceMax<T>(size, numThreads, numBlocks, maxThreads, maxBlocks,
                                        whichKernel, testIterations, cpuFinalReduction, 
                                        cpuFinalThreshold, timer,
                                        h_odata, d_idata, d_odata);

		double reduceTime = cutGetAverageTimerValue(timer) * 1e-3;
        shrLogEx(LOGBOTH | MASTER, 0, "Reduction, Throughput = %.4f GB/s, Time = %.5f s, Size = %u Elements, NumDevsUsed = %d, Workgroup = %u\n", 
               1.0e-9 * ((double)bytes)/reduceTime, reduceTime, size, 1, numThreads);

        // compute reference solution
        T cpu_result = maxreduceCPU<T>(h_idata, size);

        double threshold = 1e-12;
        double diff = 0;
		
        if (datatype == REDUCE_INT)
        {
            shrLog("\nGPU result = %d\n", gpu_result);
            shrLog("CPU result = %d\n\n", cpu_result);
        }
        else
        {
            shrLog("\nGPU result = %f\n", gpu_result);
            shrLog("CPU result = %f\n\n", cpu_result);

            if (datatype == REDUCE_FLOAT)
                threshold = 1e-8 * size;
            diff = fabs((double)gpu_result - (double)cpu_result);
        }

        // cleanup
        cutilCheckError( cutDeleteTimer(timer) );
        free(h_idata);
        free(h_odata);

        cutilSafeCallNoSync(cudaFree(d_idata));
        cutilSafeCallNoSync(cudaFree(d_odata));

		if (datatype == REDUCE_INT) {
			return (gpu_result == cpu_result);
		} else {
			return (diff < threshold);
		}
	}
	return true;
}
void iB_FFTShift::FFTShift_2D_Float(int size_X, int size_Y, Sheet* xlSheet, int nLoop)
{
	LOG();

	INFO("2D FFT Shift Float - CPU " + ITS(size_X) + "x" + ITS(size_Y));

	/**********************************************************
	 * Float Case
	 **********************************************************/

	if (xlSheet)
	{
		for (int iLoop = 0; iLoop < nLoop; iLoop++)
		{
			// Headers
			xlSheet->writeStr(1, ((iLoop * 4) + 0), "I-CPU");
			xlSheet->writeStr(1, ((iLoop * 4) + 1), "O-CPU");
			xlSheet->writeStr(1, ((iLoop * 4) + 2), "I-GPU");
			xlSheet->writeStr(1, ((iLoop * 4) + 3), "O-GPU");

			// Allocation: 2D, Flat, Device
			arr_2D_float = MEM_ALLOC_2D_FLOAT(size_X, size_Y);
			arr_2D_flat_float = MEM_ALLOC_1D_FLOAT(size_X * size_Y);
			int devMem = size_X * size_Y * sizeof(float);
			cudaMalloc((void**)(&dev_arr_2D_flat_float), devMem);

			// Filling arrays: 2D, Flat
			Array::fillArray_2D_float(arr_2D_float, size_X, size_Y, 1);
			Array::fillArray_2D_flat_float(arr_2D_flat_float, size_X, size_Y, 1);

			// Printing input
			ctr = 0;
			for (int i = 0; i < size_X; i++)
				for (int j = 0; j < size_Y; j++)
					xlSheet->writeNum((ctr++) + 2, iLoop * 4, arr_2D_float[i][j]);

			// FFT shift operation - CPU
			arr_2D_float = FFT::FFT_Shift_2D_float(arr_2D_float, size_X, size_Y);

			// Printing CPU output
			ctr = 0;
			for (int i = 0; i < size_X; i++)
				for (int j = 0; j < size_Y; j++)
					xlSheet->writeNum((ctr++) + 2, ((iLoop * 4 ) + 1), arr_2D_float[i][j]);

			// Printing GPU input
			ctr = 0;
			for (int i = 0; i < size_X; i++)
				for (int j = 0; j < size_Y; j++)
				{
					xlSheet->writeNum(ctr + 2, ((iLoop * 4 ) + 2), arr_2D_flat_float[ctr]);
					ctr++;
				}

			// Uploading array
			cuUtils::upload_2D_float(arr_2D_flat_float, dev_arr_2D_flat_float, size_X, size_Y);

			// CUDA gridding: a 512x512 block (262,144 threads) exceeds the
			// 1,024-threads-per-block limit and the kernel would never launch;
			// use a 16x16 block and scale the grid instead.
			dim3 cuBlock(16, 16, 1);
			dim3 cuGrid(size_X / cuBlock.x, size_Y / cuBlock.y, 1);

			// FFT shift
			cuFFTShift_2D( cuBlock, cuGrid, dev_arr_2D_flat_float, dev_arr_2D_flat_float, size_X);

			// Downloading array
			cuUtils::download_2D_float(arr_2D_flat_float, dev_arr_2D_flat_float, size_X, size_Y);

			// Printing output
			ctr = 0;
			for (int i = 0; i < size_X; i++)
				for (int j = 0; j < size_Y; j++)
				{
					xlSheet->writeNum((ctr) + 2, ((iLoop * 4 ) + 3), arr_2D_flat_float[ctr]);
					ctr++;
				}

			// Deallocating memory (the flat host array from MEM_ALLOC_1D_FLOAT
			// is leaked here in the original; the device buffer release below
			// was also missing)
			FREE_MEM_2D_FLOAT(arr_2D_float, size_X, size_Y);
			cudaFree(dev_arr_2D_flat_float);
		}

	}
	else
	{
		INFO("No valid xlSheet was created, EXITTING ...");
		EXIT(0);
	}
}
Example #9
static cudaError_t cudaMallocWrapper(void* ctx, void** devPtr, size_t size, cudaStream_t stream)
{
  return cudaMalloc(devPtr, size);
}
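The wrapper ignores both ctx and stream and performs a synchronizing allocation. On CUDA 11.2 and newer the stream argument could be honored with the stream-ordered allocator; a sketch (an alternative, not what the original does):

static cudaError_t cudaMallocAsyncWrapper(void* ctx, void** devPtr, size_t size, cudaStream_t stream)
{
  (void)ctx;  // still unused
  return cudaMallocAsync(devPtr, size, stream);  // requires CUDA 11.2+
}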
/** Documented at declaration */
struct gpujpeg_encoder*
gpujpeg_encoder_create(struct gpujpeg_parameters* param, struct gpujpeg_image_parameters* param_image)
{
    assert(param_image->comp_count == 1 || param_image->comp_count == 3);
    assert(param_image->comp_count <= GPUJPEG_MAX_COMPONENT_COUNT);
    assert(param->quality >= 0 && param->quality <= 100);
    assert(param->restart_interval >= 0);
    assert(param->interleaved == 0 || param->interleaved == 1);

    struct gpujpeg_encoder* encoder = (struct gpujpeg_encoder*) malloc(sizeof(struct gpujpeg_encoder));
    if ( encoder == NULL )
        return NULL;

    // Get coder
    struct gpujpeg_coder* coder = &encoder->coder;

    // Set parameters
    memset(encoder, 0, sizeof(struct gpujpeg_encoder));
    coder->param_image = *param_image;
    coder->param = *param;

    int result = 1;

    // Create writer
    encoder->writer = gpujpeg_writer_create(encoder);
    if ( encoder->writer == NULL )
        result = 0;

    // Initialize coder
    if ( gpujpeg_coder_init(coder) != 0 )
        result = 0;

    // Init preprocessor
    if ( gpujpeg_preprocessor_encoder_init(&encoder->coder) != 0 ) {
        fprintf(stderr, "Failed to init preprocessor!");
        result = 0;
    }

    // Allocate quantization tables in device memory
    for ( int comp_type = 0; comp_type < GPUJPEG_COMPONENT_TYPE_COUNT; comp_type++ ) {
        if ( cudaSuccess != cudaMalloc((void**)&encoder->table_quantization[comp_type].d_table, 64 * sizeof(uint16_t)) )
            result = 0;
        if ( cudaSuccess != cudaMalloc((void**)&encoder->table_quantization[comp_type].d_table_forward, 64 * sizeof(float)) )
            result = 0;
    }
    gpujpeg_cuda_check_error("Encoder table allocation", return NULL);

    // Init quantization tables for encoder
    for ( int comp_type = 0; comp_type < GPUJPEG_COMPONENT_TYPE_COUNT; comp_type++ ) {
        if ( gpujpeg_table_quantization_encoder_init(&encoder->table_quantization[comp_type], (enum gpujpeg_component_type)comp_type, coder->param.quality) != 0 )
            result = 0;
    }
    gpujpeg_cuda_check_error("Quantization init", return NULL);

    // Init huffman tables for encoder
    for ( int comp_type = 0; comp_type < GPUJPEG_COMPONENT_TYPE_COUNT; comp_type++ ) {
        for ( int huff_type = 0; huff_type < GPUJPEG_HUFFMAN_TYPE_COUNT; huff_type++ ) {
            if ( gpujpeg_table_huffman_encoder_init(&encoder->table_huffman[comp_type][huff_type], (enum gpujpeg_component_type)comp_type, (enum gpujpeg_huffman_type)huff_type) != 0 )
                result = 0;
        }
    }
    gpujpeg_cuda_check_error("Encoder table init", return NULL);

    // Init huffman encoder
    if ( gpujpeg_huffman_gpu_encoder_init(encoder) != 0 )
        result = 0;

    if ( result == 0 ) {
        gpujpeg_encoder_destroy(encoder);
        return NULL;
    }

    // Timers
    GPUJPEG_CUSTOM_TIMER_CREATE(encoder->def);
    GPUJPEG_CUSTOM_TIMER_CREATE(encoder->in_gpu);

    return encoder;
}
Example #11
rk4_mem *SOLVER(rk4, init, TARGET, SIMENGINE_STORAGE, solver_props *props) {
#if defined TARGET_GPU
  GPU_ENTRY(init, SIMENGINE_STORAGE);

  // Temporary CPU copies of GPU datastructures
  rk4_mem tmem;
  // GPU datastructures
  rk4_mem *dmem;

  // Computes GPU kernel geometry
  size_t shmem_per_thread, total_shmem = 1<<14;
  int warp_size = 1<<5;
  uint threads_per_block;
  uint num_gpu_threads;
  uint num_gpu_blocks;

  // shared space for model states and solver overhead
  shmem_per_thread = sizeof(CDATAFORMAT) * props->statesize * 6; // 6 = magic for rk4
  // shared space for a vector of time
  shmem_per_thread += sizeof(CDATAFORMAT);
  // shared space for a vector of `running' flags
  shmem_per_thread += sizeof(int);

  
  threads_per_block = total_shmem / shmem_per_thread;
  // Round down to a whole number of warps.
  threads_per_block = warp_size * (threads_per_block / warp_size);

  num_gpu_threads = threads_per_block < props->num_models ? threads_per_block : props->num_models;
  num_gpu_blocks = (props->num_models + threads_per_block - 1) / threads_per_block;

  props->gpu.blockx = num_gpu_threads;
  props->gpu.blocky = 1;
  props->gpu.blockz = 1;
  props->gpu.gridx = num_gpu_blocks;
  props->gpu.gridy = 1;
  props->gpu.gridz = 1;
  props->gpu.shmem_per_block = shmem_per_thread * num_gpu_threads;

  
  // Allocate GPU space for mem and pointer fields of mem (other than props)
  cutilSafeCall(cudaMalloc((void**)&dmem, sizeof(rk4_mem)));
  tmem.props = GPU_ENTRY(init_props, SIMENGINE_STORAGE, props);

  cutilSafeCall(cudaMalloc((void**)&tmem.k1, props->statesize*props->num_models*sizeof(CDATAFORMAT)));

  cutilSafeCall(cudaMalloc((void**)&tmem.k2, props->statesize*props->num_models*sizeof(CDATAFORMAT)));
  cutilSafeCall(cudaMalloc((void**)&tmem.k3, props->statesize*props->num_models*sizeof(CDATAFORMAT)));
  cutilSafeCall(cudaMalloc((void**)&tmem.k4, props->statesize*props->num_models*sizeof(CDATAFORMAT)));
  cutilSafeCall(cudaMalloc((void**)&tmem.temp, props->statesize*props->num_models*sizeof(CDATAFORMAT)));

  // Copy mem structure to GPU
  cutilSafeCall(cudaMemcpy(dmem, &tmem, sizeof(rk4_mem), cudaMemcpyHostToDevice));

  return dmem;
  
#else // Used for CPU and OPENMP targets

  rk4_mem *mem = (rk4_mem*)malloc(sizeof(rk4_mem));

  mem->props = props;
  mem->k1 = (CDATAFORMAT*)malloc(props->statesize*props->num_models*sizeof(CDATAFORMAT));
  mem->k2 = (CDATAFORMAT*)malloc(props->statesize*props->num_models*sizeof(CDATAFORMAT));
  mem->k3 = (CDATAFORMAT*)malloc(props->statesize*props->num_models*sizeof(CDATAFORMAT));
  mem->k4 = (CDATAFORMAT*)malloc(props->statesize*props->num_models*sizeof(CDATAFORMAT));
  mem->temp = (CDATAFORMAT*)malloc(props->statesize*props->num_models*sizeof(CDATAFORMAT));

  return mem;
#endif
}
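On the GPU path the solver state is staged through a host-side copy (tmem) before being copied to the device, so the embedded device pointers are only reachable by copying the struct back. A sketch of the matching cleanup (the function name is hypothetical; the CPU path would simply free each member and the struct):

#if defined TARGET_GPU
void SOLVER_free(rk4_mem *dmem) {
  rk4_mem tmem;
  // Pull the struct back so the embedded device pointers are visible on the host.
  cutilSafeCall(cudaMemcpy(&tmem, dmem, sizeof(rk4_mem), cudaMemcpyDeviceToHost));
  cutilSafeCall(cudaFree(tmem.k1));
  cutilSafeCall(cudaFree(tmem.k2));
  cutilSafeCall(cudaFree(tmem.k3));
  cutilSafeCall(cudaFree(tmem.k4));
  cutilSafeCall(cudaFree(tmem.temp));
  cutilSafeCall(cudaFree(dmem));
}
#endif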
float WFIRFilterCuda::cudaFilter( WLEMData::ScalarT* const output, const WLEMData::ScalarT* const input,
                const WLEMData::ScalarT* const previous, size_t channels, size_t samples, const WLEMData::ScalarT* const coeffs,
                size_t coeffSize )
{
    CuScalarT *dev_in = NULL;
    size_t pitchIn;

    CuScalarT *dev_prev = NULL;
    size_t pitchPrev;

    CuScalarT *dev_out = NULL;
    size_t pitchOut;

    CuScalarT *dev_co = NULL;

    try
    {
        CudaThrowsCall( cudaMallocPitch( ( void** )&dev_in, &pitchIn, samples * sizeof( CuScalarT ), channels ) );
        CudaThrowsCall(
                        cudaMemcpy2D( dev_in, pitchIn, input, samples * sizeof( CuScalarT ), samples * sizeof( CuScalarT ),
                                        channels, cudaMemcpyHostToDevice ) );

        CudaThrowsCall( cudaMallocPitch( ( void** )&dev_prev, &pitchPrev, coeffSize * sizeof( CuScalarT ), channels ) );
        CudaThrowsCall(
                        cudaMemcpy2D( dev_prev, pitchPrev, previous, coeffSize * sizeof( CuScalarT ),
                                        coeffSize * sizeof( CuScalarT ), channels, cudaMemcpyHostToDevice ) );

        CudaThrowsCall( cudaMallocPitch( ( void** )&dev_out, &pitchOut, samples * sizeof( CuScalarT ), channels ) );

        CudaThrowsCall( cudaMalloc( ( void** )&dev_co, coeffSize * sizeof( CuScalarT ) ) );
        CudaThrowsCall( cudaMemcpy( dev_co, coeffs, coeffSize * sizeof( CuScalarT ), cudaMemcpyHostToDevice ) );
    }
    catch( const WException& e )
    {
        wlog::error( CLASS ) << e.what();
        if( dev_in )
        {
            CudaSafeCall( cudaFree( ( void* )dev_in ) );
        }
        if( dev_prev )
        {
            CudaSafeCall( cudaFree( ( void* )dev_prev ) );
        }
        if( dev_out )
        {
            CudaSafeCall( cudaFree( ( void* )dev_out ) );
        }
        if( dev_co )
        {
            CudaSafeCall( cudaFree( ( void* )dev_co ) );
        }
        throw WLBadAllocException( "Could not allocate CUDA memory!" );
    }

    size_t threadsPerBlock = 32;
    size_t blocksPerGrid = ( samples + threadsPerBlock - 1 ) / threadsPerBlock;
    size_t sharedMem = coeffSize * sizeof( CuScalarT );

    cudaEvent_t start, stop;
    cudaEventCreate( &start );
    cudaEventCreate( &stop );

    cudaEventRecord( start, 0 );
    cuFirFilter( blocksPerGrid, threadsPerBlock, sharedMem, dev_out, dev_in, dev_prev, channels, samples, dev_co, coeffSize,
                    pitchOut, pitchIn, pitchPrev );
    cudaError_t kernelError = cudaGetLastError();

    cudaEventRecord( stop, 0 );
    cudaEventSynchronize( stop );

    float elapsedTime;
    cudaEventElapsedTime( &elapsedTime, start, stop );
    cudaEventDestroy( start );
    cudaEventDestroy( stop );

    try
    {
        if( kernelError != cudaSuccess )
        {
            const std::string err( cudaGetErrorString( kernelError ) );
            throw WException( "CUDA kernel failed: " + err );
        }
        CudaThrowsCall(
                        cudaMemcpy2D( output, samples * sizeof( CuScalarT ), dev_out, pitchOut, samples * sizeof( CuScalarT ),
                                        channels, cudaMemcpyDeviceToHost ) );
    }
    catch( const WException& e )
    {
        wlog::error( CLASS ) << e.what();
        elapsedTime = -1.0;
    }

    CudaSafeCall( cudaFree( ( void* )dev_in ) );
    CudaSafeCall( cudaFree( ( void* )dev_prev ) );
    CudaSafeCall( cudaFree( ( void* )dev_out ) );
    CudaSafeCall( cudaFree( ( void* )dev_co ) );

    if( elapsedTime > -1.0 )
    {
        return elapsedTime;
    }
    else
    {
        throw WException( "Error in cudaFilter()" );
    }
}
Example #13
BaseData<Dtype>::BaseData(const int length)
{
	length_ = length;
	checkCudaErrors(cudaHostAlloc(&cpu_data_, sizeof(Dtype)*length_, cudaHostAllocDefault));
	checkCudaErrors(cudaMalloc(&gpu_data_, sizeof(Dtype)*length_));
}
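The constructor pairs pinned host memory (cudaHostAlloc) with a device buffer of the same length. The matching destructor, assuming the same member names (not shown in the listing), must release the host side with cudaFreeHost rather than free():

template <typename Dtype>
BaseData<Dtype>::~BaseData()
{
	checkCudaErrors(cudaFreeHost(cpu_data_));
	checkCudaErrors(cudaFree(gpu_data_));
}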
void ckm( struct svm_problem *prob, struct svm_problem *pecm, float *gamma  )
{
	cublasStatus_t status;

	double g_val = *gamma;

	long int nfa;
	
	int len_tv;
	int ntv;
	int i_v;
	int i_el;
	int i_r, i_c;
	int trvei;

	double *tv_sq;
	double *v_f_g;

	float *tr_ar;
	float *tva, *vtm, *DP;
	float *g_tva = 0, *g_vtm = 0, *g_DotProd = 0;

	/* Coefficients for cublasSgemv (y = alpha*A*x + beta*y); assumed to be
	   file-scope constants in the original listing, declared here so the
	   snippet is self-contained. */
	const float alpha = 1.0f, beta = 0.0f;

	cudaError_t cudaStat;   
	cublasHandle_t handle;
	
	status = cublasCreate(&handle);

	len_tv = prob-> x[0].dim;
	ntv   = prob-> l;

	nfa = len_tv * ntv; 

	tva = (float*) malloc ( len_tv * ntv* sizeof(float) );
	vtm = (float*) malloc ( len_tv * sizeof(float) );
	DP  = (float*) malloc ( ntv * sizeof(float) );

	tr_ar = (float*) malloc ( len_tv * ntv* sizeof(float) );

	tv_sq = (double*) malloc ( ntv * sizeof(double) );

	v_f_g  = (double*) malloc ( ntv * sizeof(double) );

	for ( i_r = 0; i_r < ntv ; i_r++ )
	{				 
		for ( i_c = 0; i_c < len_tv; i_c++ ) 
			tva[i_r * len_tv + i_c] = (float)prob-> x[i_r].values[i_c];
	}

	cudaStat = cudaMalloc((void**)&g_tva, len_tv * ntv * sizeof(float));
	
	if (cudaStat != cudaSuccess) {
		free( tva );
		free( vtm );
		free( DP  );

		free( v_f_g );
		free( tv_sq );

		cudaFree( g_tva );
		cublasDestroy( handle );	
	
		fprintf (stderr, "!!!! Device memory allocation error (A)\n");
		getchar();
		return;
    }

	cudaStat = cudaMalloc((void**)&g_vtm, len_tv * sizeof(float));

	cudaStat = cudaMalloc((void**)&g_DotProd, ntv * sizeof(float));

	// cuBLAS expects column-major storage: transpose the row-major
	// training vectors before uploading them.
	for( i_r = 0; i_r < ntv; i_r++ )
		for( i_c = 0; i_c < len_tv; i_c++ )
			tr_ar[i_c * ntv + i_r] = tva[i_r * len_tv + i_c];

	// Copy cpu vector to gpu vector
	status = cublasSetVector( len_tv * ntv, sizeof(float), tr_ar, 1, g_tva, 1 );
    
	free( tr_ar );

	for( i_v = 0; i_v < ntv; i_v++ )
	{
		tv_sq[ i_v ] = 0;
		for( i_el = 0; i_el < len_tv; i_el++ )
			tv_sq[i_v] += pow( tva[i_v*len_tv + i_el], (float)2.0 );
	}



	for ( trvei = 0; trvei < ntv; trvei++ )
	{
		status = cublasSetVector( len_tv, sizeof(float), &tva[trvei * len_tv], 1, g_vtm, 1 );
		
		status = cublasSgemv( handle, CUBLAS_OP_N, ntv, len_tv, &alpha, g_tva, ntv , g_vtm, 1, &beta, g_DotProd, 1 );

		status = cublasGetVector( ntv, sizeof(float), g_DotProd, 1, DP, 1 );

		// RBF kernel row: K(x_t, x_i) = exp(-g * (||x_t||^2 + ||x_i||^2 - 2 * x_t . x_i))
		for ( i_c = 0; i_c < ntv; i_c++ )
			v_f_g[i_c] = exp( -g_val * (tv_sq[trvei] + tv_sq[i_c] - ((double)2.0) * (double)DP[i_c] ));
		

		pecm-> x[trvei].values[0] = trvei + 1;
		
		for ( i_c = 0; i_c < ntv; i_c++ )
			pecm-> x[trvei].values[i_c + 1] = v_f_g[i_c];				
		

	}

	free( tva );
	free( vtm );
	free( DP  );
	free( v_f_g );
	free( tv_sq );

	cudaFree( g_tva );
	cudaFree( g_vtm );
	cudaFree( g_DotProd );

	cublasDestroy( handle );
}
void runAutoTest(int argc, char *argv[])
{
    printf("[%s] (automated testing w/ readback)\n", sSDKsample);
    int devID = findCudaDevice(argc, (const char **)argv);

    // Ensure that SM 2.0 or higher device is available before running
    checkDeviceMeetComputeSpec(argc, argv);

    loadDefaultImage(argv[0]);

    Pixel *d_result;
    checkCudaErrors(cudaMalloc((void **)&d_result, imWidth*imHeight*sizeof(Pixel)));

    char *ref_file = NULL;
    char  dump_file[256];

    int mode = 0;
    mode = getCmdLineArgumentInt(argc, (const char **)argv, "mode");
    getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file);

    switch (mode)
    {
        case 0:
            g_SobelDisplayMode = SOBELDISPLAY_IMAGE;
            sprintf(dump_file, "lena_orig.pgm");
            break;

        case 1:
            g_SobelDisplayMode = SOBELDISPLAY_SOBELTEX;
            sprintf(dump_file, "lena_tex.pgm");
            break;

        case 2:
            g_SobelDisplayMode = SOBELDISPLAY_SOBELSHARED;
            sprintf(dump_file, "lena_shared.pgm");
            break;

        default:
            printf("Invalid Filter Mode File\n");
            exit(EXIT_FAILURE);
            break;
    }

    printf("AutoTest: %s <%s>\n", sSDKsample, filterMode[g_SobelDisplayMode]);
    sobelFilter(d_result, imWidth, imHeight, g_SobelDisplayMode, imageScale, blockOp, pointOp);
    checkCudaErrors(cudaDeviceSynchronize());

    unsigned char *h_result = (unsigned char *)malloc(imWidth*imHeight*sizeof(Pixel));
    checkCudaErrors(cudaMemcpy(h_result, d_result, imWidth*imHeight*sizeof(Pixel), cudaMemcpyDeviceToHost));
    sdkSavePGM(dump_file, h_result, imWidth, imHeight);

    if (!sdkComparePGM(dump_file, sdkFindFilePath(ref_file, argv[0]), MAX_EPSILON_ERROR, 0.15f, false))
    {
        g_TotalErrors++;
    }

    checkCudaErrors(cudaFree(d_result));
    free(h_result);

    if (g_TotalErrors != 0)
    {
        printf("Test failed!\n");
        exit(EXIT_FAILURE);
    }

    printf("Test passed!\n");
    exit(EXIT_SUCCESS);
}
Example #16
int main(int argc, char* argv[]) {	
	
	int disp_size = 64;
	const int bits = 8;

	if (argc >= 2) {
		disp_size = atoi(argv[1]);
	}
	
	// init zed cam
	auto cap = new sl::zed::Camera(sl::zed::ZEDResolution_mode::VGA);
	sl::zed::ERRCODE err = cap->init(sl::zed::MODE::PERFORMANCE, 0, true);
	if (err != sl::zed::ERRCODE::SUCCESS) {
		std::cout << sl::zed::errcode2str(err) << std::endl;
		exit(EXIT_FAILURE);
	}

	int width = cap->getImageSize().width;
	int height = cap->getImageSize().height;

	sgm::StereoSGM ssgm(width, height, disp_size, bits, 16, sgm::EXECUTE_INOUT_CUDA2CUDA);


	SGMDemo demo(width, height);
	if (demo.init()) {
		printf("fail to init SGM Demo\n");
		std::exit(EXIT_FAILURE);
	}

	Renderer renderer(width, height);

	uint16_t* d_output_buffer = NULL;
	uint8_t* d_input_left = NULL;
	uint8_t* d_input_right = NULL;
	cudaMalloc((void**)&d_input_left, width * height);
	cudaMalloc((void**)&d_input_right, width * height);

	const NppiSize roi = { width, height };

	cv::Mat h_input_left(height, width, CV_8UC1);

	while (!demo.should_close()) {
		cap->grab(sl::zed::SENSING_MODE::FULL, false, false);

		sl::zed::Mat left_zm = cap->retrieveImage_gpu(sl::zed::SIDE::LEFT);
		sl::zed::Mat right_zm = cap->retrieveImage_gpu(sl::zed::SIDE::RIGHT);

		nppiRGBToGray_8u_AC4C1R(left_zm.data, width * 4, d_input_left, width, roi);
		nppiRGBToGray_8u_AC4C1R(right_zm.data, width * 4, d_input_right, width, roi);

		ssgm.execute(d_input_left, d_input_right, (void**)&d_output_buffer);

		switch (demo.get_flag()) {
		case 0: 
			cudaMemcpy(h_input_left.data, d_input_left, width * height, cudaMemcpyDeviceToHost);
			renderer.render_input((uint8_t*)h_input_left.data); 
			break;
		case 1: 
			renderer.render_disparity(d_output_buffer, disp_size);
			break;
		case 2: 
			renderer.render_disparity_color(d_output_buffer, disp_size);
			break;
		}

		demo.swap_buffer();
	}

	cudaFree(d_input_left);
	cudaFree(d_input_right);
	delete cap;
}
Example #17
void matrix_t::alloc_device() 
{
    CUDA_SAFE_CALL( 
	  cudaMalloc((void**)&device, elems * sizeof(cufftComplex))
    );
}
fmat ModelWPAMGPU::ffun(fmat *current)
{
    fmat prediction(current->n_rows,current->n_cols);
    fmat pNoiseSample = pNoise.sample(current->n_cols);
    fmat u = U.sample(current->n_cols);
    float* lastState_dev;
    float* F_dev;
	float* U_dev;
	float* pNoise_dev;
    int stateDimension = current->n_rows;
    int numberOfSamples = current->n_cols;
    float* newState_dev;

	//allocate memory on gpu
    cudaMalloc( &lastState_dev, (size_t) current->n_elem * sizeof(float)) ;
	cudaMalloc( &F_dev, (size_t) F.n_elem * sizeof(float)) ;
	cudaMalloc( &U_dev, (size_t) u.n_elem * sizeof(float)) ;
	cudaMalloc( &pNoise_dev, (size_t) pNoiseSample.n_elem * sizeof(float)) ;
	cudaMalloc( &newState_dev, (size_t) prediction.n_elem * sizeof(float)) ;

	//Copy particles and weights to the gpu
    cudaMemcpy(lastState_dev,current->memptr(),(size_t) current->n_elem * sizeof(float), cudaMemcpyHostToDevice);
	cudaMemcpy(F_dev,F.memptr(),(size_t) F.n_elem * sizeof(float), cudaMemcpyHostToDevice);
    //cudaMemcpy(U_dev,u.memptr(),(size_t) u.n_elem * sizeof(float), cudaMemcpyHostToDevice);
    //cudaMemcpy(pNoise_dev,pNoiseSample.memptr(),(size_t) pNoiseSample.n_elem * sizeof(float), cudaMemcpyHostToDevice);

    //pNoise (note: curandGenerateNormal requires an even number of outputs per call)
    curandGenerateNormal(gen, pNoise_dev, numberOfSamples, 0.0f, 50.0e-6f);
    curandGenerateNormal(gen, pNoise_dev+numberOfSamples, numberOfSamples, 0.0f, 50.0e-6f);
    curandGenerateNormal(gen, pNoise_dev+2*numberOfSamples, numberOfSamples, 0.0f, 50.0e-6f);
    curandGenerateNormal(gen, pNoise_dev+3*numberOfSamples, numberOfSamples, 0.0f, 10.0e-6f);
    curandGenerateNormal(gen, pNoise_dev+4*numberOfSamples, numberOfSamples, 0.0f, 10.0e-6f);
    curandGenerateNormal(gen, pNoise_dev+5*numberOfSamples, numberOfSamples, 0.0f, 10.0e-6f);
    curandGenerateNormal(gen, pNoise_dev+6*numberOfSamples, numberOfSamples, 0.0f, 100.0e-6f);
    curandGenerateNormal(gen, pNoise_dev+7*numberOfSamples, numberOfSamples, 0.0f, 100.0e-6f);
    curandGenerateNormal(gen, pNoise_dev+8*numberOfSamples, numberOfSamples, 0.0f, 100.0e-6f);

    // U
    U.batch.at(0);
    for (unsigned int i=0; i< 9 ;++i)
    {
        curandGenerateNormal(gen, U_dev+ i*numberOfSamples, numberOfSamples, U.batch.at(i)->a, U.batch.at(i)->b);
    }
    /*curandGenerateNormal(gen, oNoise_dev, numberOfSamples, 0.0f, 50.0e-6f);
    curandGenerateNormal(gen, oNoise_dev+numberOfSamples, numberOfSamples, 0.0f, 50.0e-6f);
    curandGenerateNormal(gen, oNoise_dev+2*numberOfSamples, numberOfSamples, 0.0f, 50.0e-6f);
    curandGenerateNormal(gen, oNoise_dev+3*numberOfSamples, numberOfSamples, 0.0f, 10.0e-6f);
    curandGenerateNormal(gen, oNoise_dev+4*numberOfSamples, numberOfSamples, 0.0f, 10.0e-6f);
    curandGenerateNormal(gen, oNoise_dev+5*numberOfSamples, numberOfSamples, 0.0f, 10.0e-6f);
    curandGenerateNormal(gen, oNoise_dev+6*numberOfSamples, numberOfSamples, 0.0f, 100.0e-6f);
    curandGenerateNormal(gen, oNoise_dev+7*numberOfSamples, numberOfSamples, 0.0f, 100.0e-6f);
    curandGenerateNormal(gen, oNoise_dev+8*numberOfSamples, numberOfSamples, 0.0f, 100.0e-6f);*/

    //prediction = F * current + pNoiseSample + u ;
    callFfunKernel(lastState_dev, F_dev, U_dev, pNoise_dev, stateDimension ,numberOfSamples,newState_dev);
    //printf("%s\n",cudaGetErrorString(cudaGetLastError()));

	//get estimation from gpu
    cudaMemcpy(prediction.memptr(),newState_dev,current->n_elem * sizeof(float), cudaMemcpyDeviceToHost);

	// clean up the graphics card
    cudaFree(lastState_dev);
	cudaFree(newState_dev);
	cudaFree(F_dev);
	cudaFree(U_dev);
	cudaFree(pNoise_dev);

    return prediction;
}
void shmoo(int minN, int maxN, int maxThreads, int maxBlocks, ReduceType datatype)
{ 
		fprintf(stderr, "Shmoo wasn't implemented in this modified kernel!\n");
		exit(1);
    // create random input data on CPU
    unsigned int bytes = maxN * sizeof(T);

    T *h_idata = (T*) malloc(bytes);

    for(int i = 0; i < maxN; i++) {
        // Keep the numbers small so we don't get truncation error in the sum
        if (datatype == REDUCE_INT)
            h_idata[i] = (T)(rand() & 0xFF);
        else
            h_idata[i] = (rand() & 0xFF) / (T)RAND_MAX;
    }

    int maxNumBlocks = MIN( maxN / maxThreads, MAX_BLOCK_DIM_SIZE);

    // allocate mem for the result on host side
    T* h_odata = (T*) malloc(maxNumBlocks*sizeof(T));

    // allocate device memory and data
    T* d_idata = NULL;
    T* d_odata = NULL;

    cutilSafeCallNoSync( cudaMalloc((void**) &d_idata, bytes) );
    cutilSafeCallNoSync( cudaMalloc((void**) &d_odata, maxNumBlocks*sizeof(T)) );

    // copy data directly to device memory
    cutilSafeCallNoSync( cudaMemcpy(d_idata, h_idata, bytes, cudaMemcpyHostToDevice) );
    cutilSafeCallNoSync( cudaMemcpy(d_odata, h_idata, maxNumBlocks*sizeof(T), cudaMemcpyHostToDevice) );

    // warm-up
    for (int kernel = 0; kernel < 7; kernel++)
    {
        sumreduce<T>(maxN, maxThreads, maxNumBlocks, kernel, d_idata, d_odata);
    }
    int testIterations = 100;

    unsigned int timer = 0;
    cutilCheckError( cutCreateTimer( &timer));
    
    // print headers
    shrLog("Time in milliseconds for various numbers of elements for each kernel\n\n\n");
    shrLog("Kernel");
    for (int i = minN; i <= maxN; i *= 2)
    {
        shrLog(", %d", i);
    }
   
    for (int kernel = 0; kernel < 7; kernel++)
    {
        shrLog("\n%d", kernel);
        for (int i = minN; i <= maxN; i *= 2)
        {
            cutResetTimer(timer);
            int numBlocks = 0;
            int numThreads = 0;
            getNumBlocksAndThreads(kernel, i, maxBlocks, maxThreads, numBlocks, numThreads);
            
            float reduceTime;
            if( numBlocks <= MAX_BLOCK_DIM_SIZE ) {
                benchmarkReduceSum(i, numThreads, numBlocks, maxThreads, maxBlocks, kernel, 
                                testIterations, false, 1, timer, h_odata, d_idata, d_odata);
                reduceTime =  cutGetAverageTimerValue(timer);
            } else {                
                reduceTime = -1.0;
            }
            shrLog(", %.5f", reduceTime);
        }
    }

    // cleanup
    cutilCheckError(cutDeleteTimer(timer));
    free(h_idata);
    free(h_odata);

    cutilSafeCallNoSync(cudaFree(d_idata));
    cutilSafeCallNoSync(cudaFree(d_odata));    
}
Example #20
int CORE_dtstrf_cublas(int M, int N, int IB, int NB,
                double *U, int LDU,
                double *A, int LDA,
                double *L, int LDL,
                int *IPIV,
                double *WORK, int LDWORK,
                int *INFO)
{
  static double zzero = 0.0;
  static double mzone =-1.0;
  cublasStatus_t status;
  cudaError_t err;
  
  double alpha;
  int i, j, ii, sb;
  int im, ip;
  
#if CONFIG_VERBOSE
  fprintf(stdout, "%s: M=%d N=%d IB=%d NB=%d U=%p LDU=%d A=%p LDA=%d L=%p LDL=%d IPIV=%p WORK=%p LDWORK=%d\n",
          __FUNCTION__, M, N, IB, NB, U, LDU, A, LDA, L, LDL, IPIV, WORK, LDWORK);
  fflush(stdout);
#endif
  
  /* Check input arguments */
  *INFO = 0;
  if (M < 0) {
    coreblas_error(1, "Illegal value of M");
    return -1;
  }
  if (N < 0) {
    coreblas_error(2, "Illegal value of N");
    return -2;
  }
  if (IB < 0) {
    coreblas_error(3, "Illegal value of IB");
    return -3;
  }
  if ((LDU < max(1,NB)) && (NB > 0)) {
    coreblas_error(6, "Illegal value of LDU");
    return -6;
  }
  if ((LDA < max(1,M)) && (M > 0)) {
    coreblas_error(8, "Illegal value of LDA");
    return -8;
  }
  if ((LDL < max(1,IB)) && (IB > 0)) {
    coreblas_error(10, "Illegal value of LDL");
    return -10;
  }
  
  /* Quick return */
  if ((M == 0) || (N == 0) || (IB == 0))
    return PLASMA_SUCCESS;
  
  /* Set L to 0 */
  err = cudaMemset(L, 0, LDL*N*sizeof(double));
  PLASMA_CUDA_ASSERT(err);
  
  double* dev_ptr = 0;
  err = cudaMalloc((void**)&dev_ptr, 2*sizeof(double));
  PLASMA_CUDA_ASSERT(err);
  double* host_ptr;
  err = cudaMallocHost((void**)&host_ptr, 2*sizeof(double));
  PLASMA_CUDA_ASSERT(err);
  
  int* piv = kaapi_memory_get_host_pointer_and_validate(IPIV);
  
  ip = 0;
  for (ii = 0; ii < N; ii += IB) {
    sb = min(N-ii, IB);
    
    for (i = 0; i < sb; i++) {
      status = cublasIdamax(kaapi_cuda_cublas_handle(),
                            M, &A[LDA*(ii+i)], 1, &im
                            );
      PLASMA_CUBLAS_ASSERT(status);
      
      /* get im */
      err = cudaStreamSynchronize(kaapi_cuda_kernel_stream());
      PLASMA_CUDA_ASSERT(err);
      
      /* adjust index, CUBLAS uses 1-based indexing */
      im--;

      piv[ip] = ii+i+1;
      
      core_dtstrf_cmp(kaapi_cuda_kernel_stream(),
                      &A[LDA*(ii+i)+im], &U[LDU*(ii+i)+ii+i], dev_ptr, host_ptr);
      err = cudaStreamSynchronize(kaapi_cuda_kernel_stream());
      PLASMA_CUDA_ASSERT(err);
      
      if (host_ptr[0] == 1.0f) {
        /*
         * Swap behind.
         */
        status = cublasDswap(kaapi_cuda_cublas_handle(),
                   i, &L[LDL*ii+i], LDL, &WORK[im], LDWORK
        );
        PLASMA_CUBLAS_ASSERT(status);
        /*
         * Swap ahead.
         */
        status = cublasDswap(kaapi_cuda_cublas_handle(),
              sb-i, &U[LDU*(ii+i)+ii+i], LDU, &A[LDA*(ii+i)+im], LDA
         );
        PLASMA_CUBLAS_ASSERT(status);
        /*
         * Set IPIV.
         */
        piv[ip] = NB + im + 1;

        core_dtstrf_set_zero(kaapi_cuda_kernel_stream(),
                             A, LDA, i, ii, im, zzero
                        );
      }
      
      core_dtstrf_cmp_zzero_and_get_alpha(kaapi_cuda_kernel_stream(),
                      &A[LDA*(ii+i)+im], &U[LDU*(ii+i)+ii+i], zzero, dev_ptr, host_ptr);
      err = cudaStreamSynchronize(kaapi_cuda_kernel_stream());
      PLASMA_CUDA_ASSERT(err);
      
      if ((*INFO == 0) && (host_ptr[0] == 1.0f)) {
        *INFO = ii+i+1;
      }
      
//      alpha = ((double)1. / U[LDU*(ii+i)+ii+i]);
      alpha = host_ptr[1];
      status = cublasDscal(kaapi_cuda_cublas_handle(),
                           M, &alpha, &A[LDA*(ii+i)], 1
                           );
      PLASMA_CUBLAS_ASSERT(status);
      
      status = cublasDcopy(kaapi_cuda_cublas_handle(),
                  M, &A[LDA*(ii+i)], 1, &WORK[LDWORK*i], 1
        );
      PLASMA_CUBLAS_ASSERT(status);
      
      status = cublasDger(kaapi_cuda_cublas_handle(),
                          M, sb-i-1,
                          &mzone, &A[LDA*(ii+i)], 1,
                          &U[LDU*(ii+i+1)+ii+i], LDU,
                          &A[LDA*(ii+i+1)], LDA
      );
      PLASMA_CUBLAS_ASSERT(status);
      ip = ip+1;
    }
    /*
     * Apply the subpanel to the rest of the panel.
     */
    if(ii+i < N) {
      for(j = ii; j < ii+sb; j++) {
        if (piv[j] <= NB) {
          piv[j] = piv[j] - ii;
        }
      }
      
      CORE_dssssm_cublas_v2(
                  NB, N-(ii+sb), M, N-(ii+sb), sb, sb,
                  &U[LDU*(ii+sb)+ii], LDU,
                  &A[LDA*(ii+sb)], LDA,
                  &L[LDL*ii], LDL,
                  WORK, LDWORK, &piv[ii]
                  );
      err = cudaStreamSynchronize(kaapi_cuda_kernel_stream());
      PLASMA_CUDA_ASSERT(err);
      
      for(j = ii; j < ii+sb; j++) {
        if (piv[j] <= NB) {
          piv[j] = piv[j] + ii;
        }
      }
    }
  }
  
  cudaFreeHost(host_ptr);
  cudaFree(dev_ptr);
  return PLASMA_SUCCESS;
}
Example #21
void compute_process()
{
	int np, pid;
	MPI_Comm_rank(MPI_COMM_WORLD, &pid);
	MPI_Comm_size(MPI_COMM_WORLD, &np);
	int server_process = np - 1;
	MPI_Status status;

	int num_comp_nodes = np -1;
	unsigned int num_bytes = sizeof(sAgents);
	unsigned int num_halo_points = RADIO * world_width;
	unsigned int num_halo_bytes = num_halo_points * sizeof(int);
 
	size_t size_world = world_width * world_height * sizeof(int);
	int *h_world = (int *)malloc(size_world);
	int *d_world;

	int left_neighbor = (pid > 0) ? (pid - 1) : MPI_PROC_NULL;
	int right_neighbor = (pid < np -2) ? (pid + 1) : MPI_PROC_NULL;


	for(int j = 0; j < world_width * world_height; j++)
	{	
		h_world[j] = 0;
	}


	sAgents h_agents_in, h_agents_left_node, h_agents_right_node;
	float4 h_agents_pos[agents_total], h_agents_ids[agents_total];
	float4 *d_agents_pos, *d_agents_ids;
	unsigned int num_bytes_agents = agents_total * sizeof(float4);

	int world_height_node = world_height / num_comp_nodes;

	// Error code to check return values for CUDA calls
        cudaError_t err = cudaSuccess;

	// Allocate the device pointer
    	err = cudaMalloc((void **)&d_world, size_world);

	if (err != cudaSuccess)
	{
        	fprintf(stderr, "Failed to allocate device pointer (error code %s)!\n", cudaGetErrorString(err));
        	exit(EXIT_FAILURE);
        }

    	err = cudaMalloc((void **)&d_agents_pos, num_bytes_agents);

	if (err != cudaSuccess)
	{
        	fprintf(stderr, "Failed to allocate device pointer (error code %s)!\n", cudaGetErrorString(err));
        	exit(EXIT_FAILURE);
        }

    	err = cudaMalloc((void **)&d_agents_ids, num_bytes_agents);

	if (err != cudaSuccess)
	{
        	fprintf(stderr, "Failed to allocate device pointer (error code %s)!\n", cudaGetErrorString(err));
        	exit(EXIT_FAILURE);
        }
		
	MPI_Recv(&h_agents_in, num_bytes, MPI_BYTE, server_process, 0, MPI_COMM_WORLD, &status);

	for(int i = 0; i < agents_total; i++)
	{
		//identify the active agents according to the y coordinate and set the busy cells in the world
		if(  ( round(h_agents_in.pos[i].y) >= (pid * world_height_node) ) and ( round(h_agents_in.pos[i].y) < ( (pid + 1) * world_height_node ) )  )
		{	
			h_agents_in.ids[i].y = 1;
			h_world[(int)round( (world_width * (h_agents_in.pos[i].y - 1) ) + h_agents_in.pos[i].x )] = h_agents_in.ids[i].x;
		}
		//Copy the data to a local arrays
		h_agents_pos[i] = h_agents_in.pos[i];
		h_agents_ids[i] = h_agents_in.ids[i];
	}


	err = cudaMemcpy(d_world, h_world, size_world, cudaMemcpyHostToDevice);

	if (err != cudaSuccess)
    	{
        	fprintf(stderr, "Failed to copy pointer from host to device (error code %s)!\n", cudaGetErrorString(err));
        	exit(EXIT_FAILURE);
    	}


	//for(int it = 0; it < nreps ; it++)
	while(1)
	{
		int it=4;
		err = cudaMemcpy(d_agents_pos, h_agents_pos, num_bytes_agents, cudaMemcpyHostToDevice);

		if (err != cudaSuccess)
	    	{
	        	fprintf(stderr, "Failed to copy pointer from host to device (error code %s)!\n", cudaGetErrorString(err));
	        	exit(EXIT_FAILURE);
	    	}
	
		err = cudaMemcpy(d_agents_ids, h_agents_ids, num_bytes_agents, cudaMemcpyHostToDevice);

		if (err != cudaSuccess)
	    	{
	        	fprintf(stderr, "Failed to copy pointer from host to device (error code %s)!\n", cudaGetErrorString(err));
	        	exit(EXIT_FAILURE);
	    	}


		launch_kernel(d_agents_pos, d_agents_ids, d_world, world_width, world_height, agent_width, agent_height, world_height_node, pid );

		cudaMemcpy(h_agents_pos, d_agents_pos, num_bytes_agents, cudaMemcpyDeviceToHost);
		cudaMemcpy(h_agents_ids, d_agents_ids, num_bytes_agents, cudaMemcpyDeviceToHost);

		//copy the data to the struct
		for( int i = 0; i < agents_total; i++)
		{
			h_agents_in.pos[i] = h_agents_pos[i];
			h_agents_in.ids[i] = h_agents_ids[i];
		}


		MPI_Barrier(MPI_COMM_WORLD);
		MPI_Send(&h_agents_in, num_bytes, MPI_BYTE, server_process, DATA_COLLECT, MPI_COMM_WORLD);

		#ifdef DEBUG
		//printf("pid: %d\n", pid);
		//display_data(h_agents_in);
		#endif

		// send data to left, get data from right 
		MPI_Sendrecv(&h_agents_in, num_bytes, MPI_BYTE, left_neighbor, it, &h_agents_right_node, num_bytes, MPI_BYTE, right_neighbor, it, MPI_COMM_WORLD, &status);

		// send data to right, get data from left 
		MPI_Sendrecv(&h_agents_in, num_bytes, MPI_BYTE, right_neighbor, it, &h_agents_left_node, num_bytes, MPI_BYTE, left_neighbor, it, MPI_COMM_WORLD, &status);

		for( int i = 0; i < agents_total; i++)
		{
			if(pid != np-2)
			{
				if(h_agents_right_node.ids[i].y == 2)
				{
					h_agents_in.pos[i] = h_agents_right_node.pos[i];
					h_agents_pos[i] = h_agents_right_node.pos[i];
					h_agents_in.ids[i].y = 1;			
					h_agents_ids[i].y = 1;	
				}
			}
			if(pid != 0)				
			{
				if(h_agents_left_node.ids[i].y == 3)
				{
					h_agents_in.pos[i] = h_agents_left_node.pos[i];
					h_agents_pos[i] = h_agents_left_node.pos[i];
					h_agents_in.ids[i].y = 1;			
					h_agents_ids[i].y = 1;	
				}
			}
		}

/***
		if(pid == 1)
		{	
			printf("pid: %d\n", pid);
			display_data(h_agents_in);
			display_data(h_agents_right_node);
			display_data(h_agents_left_node);
		}
***/

	}


	/* Release resources */
//	free(h_agents_in); 
/*	
	free(h_output);
	cudaFreeHost(h_left_boundary); cudaFreeHost(h_right_boundary);
	cudaFreeHost(h_left_halo); cudaFreeHost(h_right_halo);
	cudaFree(d_input); cudaFree(d_output);
*/
}
Example #22
int main(void)
{
  //std::cout << "Generating a time series on device "<< tim.get_nsamps() << std::endl;
  //DeviceTimeSeries<float> d_tim(8388608);
  //d_tim.set_tsamp(0.000064);
  TimeSeries<float> tim;
  tim.from_file("/lustre/home/ebarr/Soft/peasoup/tmp5.tim");
  DeviceTimeSeries<float> d_tim(tim);
  
  unsigned int size = d_tim.get_nsamps();
  
  TimeSeriesFolder folder(size);
  
  //DeviceTimeSeries<float> d_tim_r(fft_size); //<----for resampled data
  //TimeDomainResampler resampler;
  

  float* folded_buffer;
  cudaError_t error;
  cufftResult result;
  error = cudaMalloc((void**)&folded_buffer, sizeof(float)*size);
  ErrorChecker::check_cuda_error(error);


  unsigned nints = 64;
  unsigned nbins = 32;

  cufftComplex* fft_out;
  error = cudaMalloc((void**)&fft_out, sizeof(cufftComplex)*nints*nbins);
  cufftHandle plan;
  result = cufftPlan1d(&plan,nbins,CUFFT_R2C, nints);
  ErrorChecker::check_cufft_error(result);
  Stopwatch timer;

  FoldedSubints<float> folded_array(nbins,nints);
  //folder.fold(d_tim,folded_array,0.007453079228);

  std::cout << "made it here" << std::endl;
  

  FoldOptimiser optimiser(nbins,nints);
  timer.start();
  for (int ii=0;ii<1;ii++){
    //FoldedSubints<float> folded_array(nbins,nints);
    folder.fold(d_tim,folded_array,0.007453099228);
    Utils::dump_device_buffer<float>(folded_array.get_data(),nints*nbins,"original_fold.bin");
    
    optimiser.optimise(folded_array);
  }
  timer.stop();
  
  /*
  float* temp = new float [nints*nbins];
  
  cudaMemcpy(temp,folded_buffer,nints*nbins*sizeof(float),cudaMemcpyDeviceToHost);
  ErrorChecker::check_cuda_error();

  for (int ii=0;ii<nints*nbins;ii++)
    std::cout << temp[ii] << std::endl;
  */

  
  std::cout << "Total execution time (s): " << timer.getTime()<<std::endl;
  std::cout << "Average execution time (s): " << timer.getTime()/1000.0 << std::endl;



  return 0;
}
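ErrorChecker::check_cuda_error and check_cufft_error above wrap the status codes returned by the CUDA runtime and cuFFT. peasoup's actual implementation is not shown here; a minimal sketch of what such a checker can look like:

#include <cuda_runtime.h>
#include <cufft.h>
#include <stdexcept>
#include <string>

struct ErrorCheckerSketch {
    // throw on any CUDA runtime failure, with the driver's message
    static void check_cuda_error(cudaError_t err) {
        if (err != cudaSuccess)
            throw std::runtime_error(std::string("CUDA error: ")
                                     + cudaGetErrorString(err));
    }
    // cuFFT exposes no message helper in older toolkits; report the enum
    static void check_cufft_error(cufftResult res) {
        if (res != CUFFT_SUCCESS)
            throw std::runtime_error("cuFFT error code "
                                     + std::to_string(static_cast<int>(res)));
    }
};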
Exemple #23
0
void allocate_memory(void)
{
    //allocate host arrays
    a0 = (float*)malloc(memsize);
    a1 = (float*)malloc(memsize);
    c1 = (float*)malloc(memsize);
    c2 = (float*)malloc(memsize);

    a2 = (float*)malloc(memsize);
    a3 = (float*)malloc(memsize);
    b0 = (float*)malloc(memsize);
    b1 = (float*)malloc(memsize);
    b2 = (float*)malloc(memsize);
    c0 = (float*)malloc(memsize);
    wrk = (float*)malloc(memsize);
    bnd = (float*)malloc(memsize);
    if (!a0 || !a1 || !a2 || !a3 || !b0 || !b1 || !b2 || 
        !c0 || !c1 || !c2 || !wrk || !bnd) 
    {
        fprintf(stderr, "Host allocation error in file '%s' in line %i\n",
                __FILE__, __LINE__);
        exit(EXIT_FAILURE);
    }
    
    //allocate pressure array page-locked
    CUDA_SAFE_CALL(cudaMallocHost((void**)&p1,memsize));
    CUDA_SAFE_CALL(cudaMallocHost((void**)&p2,memsize));

    //allocate page-locked gosa variables
    CUDA_SAFE_CALL(cudaMallocHost((void**)&gosa_btm,sizeof(float)));
    CUDA_SAFE_CALL(cudaMallocHost((void**)&gosa_top,sizeof(float)));
    
#if defined(USE_PAD)
    // This padding & offsetting is a workaround for a problem with the r3.0
    // global memory allocator
#define PAD    (1024*1024)
#define OFS    (6*1024)
#else
#define PAD    (0)
#define OFS    (0)
#endif /* defined(USE_PAD) */
    
    //allocate device arrays
    CUDA_SAFE_CALL(cudaMalloc((void**)&gosa_d_orig, 
                              GRID_X * GRID_Y * sizeof(gosa_d[0])));
    gosa_d = 0*OFS + gosa_d_orig; 
    DBGMSG(("gosa_d = %10p  size = %10lu\n", 
            gosa_d, GRID_X * GRID_Y * sizeof(float)));
    CUDA_SAFE_CALL(cudaMalloc((void**)&a0_d_orig,memsize + PAD));
    a0_d   = 1*OFS + a0_d_orig;
    DBGMSG (("a0_d   = %10p  size = %10d  pad = %10d\n", a0_d,  memsize, PAD));
    CUDA_SAFE_CALL(cudaMalloc((void**)&a1_d_orig,memsize + PAD));
    a1_d   = 2*OFS + a1_d_orig;
    DBGMSG (("a1_d   = %10p  size = %10d  pad = %10d\n", a1_d,  memsize, PAD));
    CUDA_SAFE_CALL(cudaMalloc((void**)&c1_d_orig,memsize + PAD));
    c1_d   = 9*OFS + c1_d_orig;
    DBGMSG (("c1_d   = %10p  size = %10d  pad = %10d\n", c1_d,  memsize, PAD));
    CUDA_SAFE_CALL(cudaMalloc((void**)&c2_d_orig,memsize + PAD));
    c2_d   = 10*OFS + c2_d_orig;
    DBGMSG (("c2_d   = %10p  size = %10d  pad = %10d\n", c2_d,  memsize, PAD));

    CUDA_SAFE_CALL(cudaMalloc((void**)&a2_d_orig,memsize + PAD));
    a2_d   = 3*OFS + a2_d_orig;
    DBGMSG (("a2_d   = %10p  size = %10d  pad = %10d\n", a2_d,  memsize, PAD));
    CUDA_SAFE_CALL(cudaMalloc((void**)&a3_d_orig,memsize + PAD));
    a3_d   = 4*OFS + a3_d_orig;
    DBGMSG (("a3_d   = %10p  size = %10d  pad = %10d\n", a3_d,  memsize, PAD));
    CUDA_SAFE_CALL(cudaMalloc((void**)&b0_d_orig,memsize + PAD));
    b0_d   = 5*OFS + b0_d_orig;
    DBGMSG (("b0_d   = %10p  size = %10d  pad = %10d\n", b0_d,  memsize, PAD));
    CUDA_SAFE_CALL(cudaMalloc((void**)&b1_d_orig,memsize + PAD));
    b1_d   = 6*OFS + b1_d_orig;
    DBGMSG (("b1_d   = %10p  size = %10d  pad = %10d\n", b1_d,  memsize, PAD));
    CUDA_SAFE_CALL(cudaMalloc((void**)&b2_d_orig,memsize + PAD));
    b2_d   = 7*OFS + b2_d_orig;
    DBGMSG (("b2_d   = %10p  size = %10d  pad = %10d\n", b2_d,  memsize, PAD));
    CUDA_SAFE_CALL(cudaMalloc((void**)&c0_d_orig,memsize + PAD));
    c0_d   = 8*OFS + c0_d_orig;
    DBGMSG (("c0_d   = %10p  size = %10d  pad = %10d\n", c0_d,  memsize, PAD));
    CUDA_SAFE_CALL(cudaMalloc((void**)&wrk_d_orig,memsize + PAD));
    wrk_d  = 11*OFS + wrk_d_orig;
    DBGMSG (("wrk_d  = %10p  size = %10d  pad = %10d\n", wrk_d, memsize, PAD));
    CUDA_SAFE_CALL(cudaMalloc((void**)&bnd_d_orig,memsize + PAD));
    bnd_d  = 12*OFS + bnd_d_orig;
    DBGMSG (("bnd_d  = %10p  size = %10d  pad = %10d\n", bnd_d, memsize, PAD));
    CUDA_SAFE_CALL(cudaMalloc((void**)&p1_d_orig,memsize + PAD));
    p1_d   = 13*OFS + p1_d_orig;
    DBGMSG (("p1_d   = %10p  size = %10d  pad = %10d\n", p1_d,  memsize, PAD));
    CUDA_SAFE_CALL(cudaMalloc((void**)&p2_d_orig,memsize + PAD));
    p2_d   = 14*OFS + p2_d_orig;
    DBGMSG (("p2_d   = %10p  size = %10d  pad = %10d\n", p2_d,  memsize, PAD));
}
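The PAD/OFS scheme above over-allocates every device buffer and then staggers each usable pointer by a different multiple of OFS, working around an allocator issue by ensuring no two arrays start at the same address alignment class. A stripped-down sketch of the same idea with made-up sizes (note that cudaFree must always receive the original, un-offset pointer):

#include <cuda_runtime.h>

#define PAD (1024*1024)   /* extra bytes per allocation */
#define OFS (6*1024)      /* per-array stagger, in elements */

int main(void)
{
    size_t memsize = 1 << 20;          /* element count, illustrative */
    float *a_orig, *b_orig;

    /* allocate with slack, then offset each usable pointer differently */
    cudaMalloc((void**)&a_orig, memsize * sizeof(float) + PAD);
    cudaMalloc((void**)&b_orig, memsize * sizeof(float) + PAD);
    float *a = a_orig + 0 * OFS;
    float *b = b_orig + 1 * OFS;

    /* ... use a and b as if they were the allocations ... */
    (void)a; (void)b;

    cudaFree(a_orig);   /* free the original pointers, not the offset ones */
    cudaFree(b_orig);
    return 0;
}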
Exemple #24
0
int main (int argc, char *argv[])
{
int rank, size, n, len, numbytes;
void *a_h, *a_d;
struct timeval time[2];
double bandwidth;
char name[MPI_MAX_PROCESSOR_NAME];
MPI_Status status;

MPI_Init (&argc, &argv);
MPI_Comm_rank (MPI_COMM_WORLD, &rank);
MPI_Comm_size (MPI_COMM_WORLD, &size);

MPI_Get_processor_name(name, &len);
printf("Process %d is on %s\n", rank, name);

printf("Using regular memory \n");
a_h = malloc(NBYTES);

cudaMalloc( (void **) &a_d, NBYTES);

/* Test host -> device bandwidth. */
MPI_Barrier(MPI_COMM_WORLD);

gettimeofday(&time[0], NULL);
for (n=0; n<NREPEAT; n++)
{
cudaMemcpy(a_d, a_h, NBYTES, cudaMemcpyHostToDevice);
}
gettimeofday(&time[1], NULL);

bandwidth  = time[1].tv_sec - time[0].tv_sec;
bandwidth += 1.e-6*(time[1].tv_usec - time[0].tv_usec);
bandwidth  = NBYTES*NREPEAT/1.e6/bandwidth;

printf("Host->device bandwidth for process %d: %f MB/sec\n",rank,bandwidth);

/* Test MPI send/recv bandwidth. */
MPI_Barrier(MPI_COMM_WORLD);

gettimeofday(&time[0], NULL);
for (n=0; n<NREPEAT; n++)
{
if (rank == 0)
MPI_Send(a_h, NBYTES/sizeof(int), MPI_INT, 1, 0, MPI_COMM_WORLD);
else
MPI_Recv(a_h, NBYTES/sizeof(int), MPI_INT, 0, 0, MPI_COMM_WORLD, &status);
}
gettimeofday(&time[1], NULL);

bandwidth  = time[1].tv_sec - time[0].tv_sec;
bandwidth += 1.e-6*(time[1].tv_usec - time[0].tv_usec);
bandwidth  = NBYTES*NREPEAT/1.e6/bandwidth;

if (rank == 0)
printf("MPI send/recv bandwidth: %f MB/sec\n", bandwidth);

cudaFree(a_d);
free(a_h);

MPI_Finalize();
return 0;
} 
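The bandwidth arithmetic above reduces to elapsed = Δsec + 1e-6·Δusec and MB/s = bytes × repeats / 1e6 / elapsed. The same computation factored into helpers, for reference:

#include <sys/time.h>

/* elapsed wall-clock seconds between two gettimeofday() samples */
static double elapsed_seconds(struct timeval t0, struct timeval t1)
{
    return (double)(t1.tv_sec - t0.tv_sec)
         + 1.e-6 * (double)(t1.tv_usec - t0.tv_usec);
}

/* MB/s for nbytes moved nrepeat times, matching the example's 1e6 convention */
static double bandwidth_mb_s(long nbytes, int nrepeat, double seconds)
{
    return (double)nbytes * nrepeat / 1.e6 / seconds;
}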
Exemple #25
0
int main(int argc, char **argv)
{
    int OPT_N  = 4000000;
    int OPT_SZ = OPT_N * sizeof(float);

    BlackScholes bs;
               
    printf("Initializing data...\n");

    float *callResult, *putResult, *stockPrice, *optionStrike, *optionYears;
    
    checkCuda( cudaMallocHost((void**)&callResult,     OPT_SZ) );
    checkCuda( cudaMallocHost((void**)&putResult,      OPT_SZ) );
    checkCuda( cudaMallocHost((void**)&stockPrice,     OPT_SZ) );
    checkCuda( cudaMallocHost((void**)&optionStrike,   OPT_SZ) );
    checkCuda( cudaMallocHost((void**)&optionYears,    OPT_SZ) );
    
    initOptions(OPT_N, stockPrice, optionStrike, optionYears);

    printf("Running Host Version...\n");

    StartTimer();
    
    // run BlackScholes operator on host
    bs(callResult, putResult, stockPrice, optionStrike, 
       optionYears, RISKFREE, VOLATILITY, OPT_N);

    printf("Option 0 call: %f\n", callResult[0]); 
    printf("Option 0 put:  %f\n", putResult[0]);

    double ms = GetTimer();

    // Both call and put are calculated
    printf("Options count             : %i     \n", 2 * OPT_N);
    printf("\tBlackScholes() time    : %f msec\n", ms);
    printf("\t%f GB/s, %f GOptions/s\n", 
           ((double)(5 * OPT_N * sizeof(float)) * 1E-9) / (ms * 1E-3),
           ((double)(2 * OPT_N) * 1E-9) / (ms * 1E-3));

    float *d_callResult, *d_putResult;
    float *d_stockPrice, *d_optionStrike, *d_optionYears;

    checkCuda( cudaMalloc    ((void**)&d_callResult,   OPT_SZ) );
    checkCuda( cudaMalloc    ((void**)&d_putResult,    OPT_SZ) );
    checkCuda( cudaMalloc    ((void**)&d_stockPrice,   OPT_SZ) );
    checkCuda( cudaMalloc    ((void**)&d_optionStrike, OPT_SZ) );
    checkCuda( cudaMalloc    ((void**)&d_optionYears,  OPT_SZ) );

    printf("Running Device Version...\n");

    StartTimer();
    
    // Launch Black-Scholes operator on device
#ifdef HEMI_CUDA_COMPILER
    cudaMemcpy(d_stockPrice, stockPrice, OPT_SZ, cudaMemcpyHostToDevice);
    cudaMemcpy(d_optionStrike, optionStrike, OPT_SZ, cudaMemcpyHostToDevice);
    cudaMemcpy(d_optionYears, optionYears, OPT_SZ, cudaMemcpyHostToDevice);

    hemi::launch(bs, 
                 d_callResult, d_putResult, d_stockPrice, d_optionStrike, 
                 d_optionYears, RISKFREE, VOLATILITY, OPT_N);

    cudaMemcpy(callResult, d_callResult, OPT_SZ, cudaMemcpyDeviceToHost);
    cudaMemcpy(putResult, d_putResult, OPT_SZ, cudaMemcpyDeviceToHost);
#else // demonstrates that "launch" goes to host when not compiled with NVCC
    hemi::launch(bs, 
                 callResult, putResult, stockPrice, optionStrike, 
                 optionYears, RISKFREE, VOLATILITY, OPT_N);
#endif

    printf("Option 0 call: %f\n", callResult[0]); 
    printf("Option 0 put:  %f\n", putResult[0]);

    ms = GetTimer();

    // Both call and put are calculated
    printf("Options count             : %i     \n", 2 * OPT_N);
    printf("\tBlackScholes() time    : %f msec\n", ms);
    printf("\t%f GB/s, %f GOptions/s\n", 
           ((double)(5 * OPT_N * sizeof(float)) * 1E-9) / (ms * 1E-3),
           ((double)(2 * OPT_N) * 1E-9) / (ms * 1E-3));

    checkCuda( cudaFree(d_callResult) );
    checkCuda( cudaFree(d_putResult) );
    checkCuda( cudaFree(d_stockPrice) );
    checkCuda( cudaFree(d_optionStrike) );
    checkCuda( cudaFree(d_optionYears) );
    checkCuda( cudaFreeHost(callResult) );
    checkCuda( cudaFreeHost(putResult) );
    checkCuda( cudaFreeHost(stockPrice) );
    checkCuda( cudaFreeHost(optionStrike) );
    checkCuda( cudaFreeHost(optionYears) );

    return 0;
}
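The #ifdef HEMI_CUDA_COMPILER block is the crux of the hemi pattern: one functor, compiled either into a device kernel launch or a plain host call, with only the buffer pointers changing. The following is a hedged sketch of that dispatch idea in plain C++, not hemi's real internals:

#include <cstdio>

struct Saxpy {
    // under nvcc this operator would also carry __host__ __device__
    void operator()(float *y, const float *x, float a, int n) const {
        for (int i = 0; i < n; ++i)
            y[i] = a * x[i] + y[i];
    }
};

template <typename F, typename... Args>
void launch(F f, Args... args)
{
    // hemi dispatches to a __global__ wrapper when compiled by nvcc;
    // this sketch shows only the host fallback of the #else branch above
    f(args...);
}

int main()
{
    float x[4] = {1, 2, 3, 4}, y[4] = {0, 0, 0, 0};
    launch(Saxpy{}, y, x, 2.0f, 4);
    std::printf("%f %f %f %f\n", y[0], y[1], y[2], y[3]);
    return 0;
}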
////////////////////////////////////////////////////////////////////////////////
// initialize marching cubes
////////////////////////////////////////////////////////////////////////////////
void
initMC(int argc, char** argv)
{
    // parse command line arguments
    int n;
    if (cutGetCmdLineArgumenti( argc, (const char**) argv, "grid", &n)) {
        gridSizeLog2.x = gridSizeLog2.y = gridSizeLog2.z = n;
    }
    if (cutGetCmdLineArgumenti( argc, (const char**) argv, "gridx", &n)) {
        gridSizeLog2.x = n;
    }
    if (cutGetCmdLineArgumenti( argc, (const char**) argv, "gridy", &n)) {
        gridSizeLog2.y = n;
    }
    if (cutGetCmdLineArgumenti( argc, (const char**) argv, "gridz", &n)) {
        gridSizeLog2.z = n;
    }

    char *filename;
    if (cutGetCmdLineArgumentstr( argc, (const char**) argv, "file", &filename)) {
        volumeFilename = filename;
    }

    gridSize = make_uint3(1<<gridSizeLog2.x, 1<<gridSizeLog2.y, 1<<gridSizeLog2.z);
    gridSizeMask = make_uint3(gridSize.x-1, gridSize.y-1, gridSize.z-1);
    gridSizeShift = make_uint3(0, gridSizeLog2.x, gridSizeLog2.x+gridSizeLog2.y);

    numVoxels = gridSize.x*gridSize.y*gridSize.z;
    voxelSize = make_float3(2.0f / gridSize.x, 2.0f / gridSize.y, 2.0f / gridSize.z);
    maxVerts = gridSize.x*gridSize.y*100;

    printf("grid: %d x %d x %d = %d voxels\n", gridSize.x, gridSize.y, gridSize.z, numVoxels);
    printf("max verts = %d\n", maxVerts);

#if SAMPLE_VOLUME
    // load volume data
    char* path = cutFindFilePath(volumeFilename, argv[0]);
    if (path == 0) {
        fprintf(stderr, "Error finding file '%s'\n", volumeFilename);
        cudaThreadExit();
        exit(EXIT_FAILURE);
    }

    int size = gridSize.x*gridSize.y*gridSize.z*sizeof(uchar);
    uchar *volume = loadRawFile(path, size);
    cutilSafeCall(cudaMalloc((void**) &d_volume, size));
    cutilSafeCall(cudaMemcpy(d_volume, volume, size, cudaMemcpyHostToDevice) );
    free(volume);

	bindVolumeTexture(d_volume);
#endif

    if (g_bQAReadback) {
        cudaMalloc((void **)&(d_pos),    maxVerts*sizeof(float)*4);
        cudaMalloc((void **)&(d_normal), maxVerts*sizeof(float)*4);
    } else {
        // create VBOs
        createVBO(&posVbo, maxVerts*sizeof(float)*4);
		// DEPRECATED: cutilSafeCall( cudaGLRegisterBufferObject(posVbo) );
		cutilSafeCall(cudaGraphicsGLRegisterBuffer(&cuda_posvbo_resource, posVbo, 
							   cudaGraphicsMapFlagsWriteDiscard));

        createVBO(&normalVbo, maxVerts*sizeof(float)*4);
        // DEPRECATED: cutilSafeCall(cudaGLRegisterBufferObject(normalVbo));
		cutilSafeCall(cudaGraphicsGLRegisterBuffer(&cuda_normalvbo_resource, normalVbo, 
							   cudaGraphicsMapFlagsWriteDiscard));

    }

    // allocate textures
	allocateTextures(	&d_edgeTable, &d_triTable, &d_numVertsTable );

    // allocate device memory
    unsigned int memSize = sizeof(uint) * numVoxels;
    cutilSafeCall(cudaMalloc((void**) &d_voxelVerts,            memSize));
    cutilSafeCall(cudaMalloc((void**) &d_voxelVertsScan,        memSize));
    cutilSafeCall(cudaMalloc((void**) &d_voxelOccupied,         memSize));
    cutilSafeCall(cudaMalloc((void**) &d_voxelOccupiedScan,     memSize));
    cutilSafeCall(cudaMalloc((void**) &d_compVoxelArray,   memSize));

    // initialize CUDPP scan
    CUDPPConfiguration config;
    config.algorithm    = CUDPP_SCAN;
    config.datatype     = CUDPP_UINT;
    config.op           = CUDPP_ADD;
    config.options      = CUDPP_OPTION_FORWARD | CUDPP_OPTION_EXCLUSIVE;
    cudppPlan(&scanplan, config, numVoxels, 1, 0);
}
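gridSizeLog2, gridSizeShift and gridSizeMask exist so a linear voxel index can be decoded into (x, y, z) with shifts and masks instead of divisions, which is why the grid dimensions are forced to powers of two. A sketch of the decode, written from the definitions above (the CUDA SDK sample does the same in its calcGridPos helper):

#include <cuda_runtime.h>   // uint3

// Decode a linear voxel index into 3-D grid coordinates. Valid only
// because every grid dimension is a power of two, so the index is a
// plain bit-field: z | y | x.
uint3 calc_grid_pos(unsigned int i, uint3 gridSizeShift, uint3 gridSizeMask)
{
    uint3 p;
    p.x = i & gridSizeMask.x;                        // low  log2(gx) bits
    p.y = (i >> gridSizeShift.y) & gridSizeMask.y;   // next log2(gy) bits
    p.z = (i >> gridSizeShift.z) & gridSizeMask.z;   // top  log2(gz) bits
    return p;
}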
attention_layer<dType>::attention_layer(int LSTM_size,int minibatch_size, int device_number, int D, int longest_sent,cublasHandle_t &handle,neuralMT_model<dType> *model,
		bool feed_input,bool clip_gradients,dType norm_clip) 
{
	this->handle = handle;
	this->model = model;
	this->device_number = device_number;
	this->LSTM_size = LSTM_size;
	this->minibatch_size = minibatch_size;
	this->clip_gradients = clip_gradients;
	this->norm_clip = norm_clip;
	this->feed_input = feed_input;
	this->longest_sent = longest_sent;


	cudaSetDevice(device_number);
	layer_info.init(device_number,D);
	dType *h_temp;
	full_matrix_setup(&h_temp,&d_W_a,LSTM_size,LSTM_size);
	full_matrix_setup(&h_temp,&d_W_p,LSTM_size,LSTM_size);
	full_matrix_setup(&h_temp,&d_v_p,1,LSTM_size);
	full_matrix_setup(&h_temp,&d_output_bias,LSTM_size,1);
	full_matrix_setup(&h_temp,&d_W_c_p1,LSTM_size,LSTM_size);
	full_matrix_setup(&h_temp,&d_W_c_p2,LSTM_size,LSTM_size);

	full_matrix_setup(&h_temp,&d_W_a_grad,LSTM_size,LSTM_size);
	full_matrix_setup(&h_temp,&d_W_p_grad,LSTM_size,LSTM_size);
	full_matrix_setup(&h_temp,&d_v_p_grad,1,LSTM_size);
	full_matrix_setup(&h_temp,&d_output_bias_grad,LSTM_size,1);
	full_matrix_setup(&h_temp,&d_W_c_p1_grad,LSTM_size,LSTM_size);
	full_matrix_setup(&h_temp,&d_W_c_p2_grad,LSTM_size,LSTM_size);


	full_matrix_setup(&h_temp,&d_ERRnTOt_tan_htild,LSTM_size,minibatch_size);
	full_matrix_setup(&h_temp,&d_ERRnTOt_ct,LSTM_size,minibatch_size);
	full_matrix_setup(&h_temp,&d_ERRnTOt_ht_p1,LSTM_size,minibatch_size);
	full_matrix_setup(&h_temp,&d_ERRnTOt_as,2*D+1,minibatch_size);
	full_matrix_setup(&h_temp,&d_ERRnTOt_pt,1,minibatch_size);

	full_matrix_setup(&h_temp,&d_temp_1,LSTM_size,minibatch_size);
	full_matrix_setup(&h_temp,&d_h_t_Wa_factor,2*D+1,minibatch_size);

	full_vector_setup_ones(&h_temp,&d_ones_minibatch,minibatch_size);


	thrust_d_W_a_grad = thrust::device_pointer_cast(d_W_a_grad);
	thrust_d_v_p_grad = thrust::device_pointer_cast(d_v_p_grad);
	thrust_d_W_p_grad = thrust::device_pointer_cast(d_W_p_grad);
	thrust_d_W_c_p1_grad = thrust::device_pointer_cast(d_W_c_p1_grad);
	thrust_d_W_c_p2_grad = thrust::device_pointer_cast(d_W_c_p2_grad);
	thrust_d_output_bias_grad = thrust::device_pointer_cast(d_output_bias_grad);

	CUDA_ERROR_WRAPPER(cudaMalloc((void**)&d_result, 1*sizeof(dType)),"GPU memory allocation failed\n");
	CUDA_ERROR_WRAPPER(cudaMalloc((void**)&d_temp_result, NORM_THREADS*sizeof(dType)),"GPU memory allocation failed\n");

	clear_gradients();

	CUDA_ERROR_WRAPPER(cudaMalloc((void**)&d_h_t_sum, LSTM_size*minibatch_size*sizeof(dType)),"GPU memory allocation failed\n");
	CUDA_ERROR_WRAPPER(cudaMalloc((void**)&d_h_s_sum, LSTM_size*minibatch_size*sizeof(dType)),"GPU memory allocation failed\n");


	for(int i=0; i<longest_sent; i++) {
		nodes.push_back( attention_node<dType>(LSTM_size,minibatch_size,device_number,D,feed_input,this,i) );
	}

	//now construct d_total_hs_mat
	dType **h_total_hs_mat = (dType **)malloc(longest_sent*sizeof(dType*));
	dType **h_total_hs_error = (dType **)malloc(longest_sent*sizeof(dType*));

	for(int i=0; i<longest_sent; i++) {
		if(model->source_hidden_layers.size() == 0) {
			h_total_hs_mat[i] = model->input_layer_source.nodes[i].d_h_t;
			h_total_hs_error[i] = model->input_layer_source.nodes[i].d_d_ERRt_ht;
		}
		else {
			h_total_hs_mat[i] = model->source_hidden_layers[model->source_hidden_layers.size()-1].nodes[i].d_h_t;
			h_total_hs_error[i] = model->source_hidden_layers[model->source_hidden_layers.size()-1].nodes[i].d_d_ERRt_ht;
		}
	}

	CUDA_ERROR_WRAPPER(cudaMalloc((void**)&d_total_hs_mat, longest_sent*sizeof(dType*)),"GPU memory allocation failed\n");
	CUDA_ERROR_WRAPPER(cudaMalloc((void**)&d_total_hs_error, longest_sent*sizeof(dType*)),"GPU memory allocation failed\n");
	CUDA_ERROR_WRAPPER(cudaMalloc((void**)&d_batch_info, 2*minibatch_size*sizeof(int)),"GPU memory allocation failed\n");
	cudaMemcpy(d_total_hs_mat,h_total_hs_mat,longest_sent*sizeof(dType*),cudaMemcpyHostToDevice);
	cudaMemcpy(d_total_hs_error,h_total_hs_error,longest_sent*sizeof(dType*),cudaMemcpyHostToDevice);

	free(h_total_hs_mat);
	free(h_total_hs_error);
}
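The d_total_hs_mat construction above is the standard recipe for handing a kernel a set of separately allocated device buffers: gather the device pointers in a host array, then copy that pointer table itself into device memory. A minimal sketch of the pattern in isolation:

#include <cuda_runtime.h>
#include <cstdlib>

int main()
{
    const int n = 4, len = 128;

    // one device buffer per slot, pointers gathered on the host
    float **h_ptrs = (float **)malloc(n * sizeof(float *));
    for (int i = 0; i < n; ++i)
        cudaMalloc((void **)&h_ptrs[i], len * sizeof(float));

    // copy the pointer table itself to the device; a kernel can now
    // index d_ptrs[i] to reach buffer i
    float **d_ptrs;
    cudaMalloc((void **)&d_ptrs, n * sizeof(float *));
    cudaMemcpy(d_ptrs, h_ptrs, n * sizeof(float *), cudaMemcpyHostToDevice);

    for (int i = 0; i < n; ++i) cudaFree(h_ptrs[i]);
    cudaFree(d_ptrs);
    free(h_ptrs);
    return 0;
}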
Exemple #28
0
int dfft_cuda_create_plan(dfft_plan *p,
    int ndim, int *gdim,
    int *inembed, int *oembed,
    int *pdim, int *pidx, int row_m,
    int input_cyclic, int output_cyclic,
    MPI_Comm comm,
    int *proc_map)
    {
    int res = dfft_create_plan_common(p, ndim, gdim, inembed, oembed,
        pdim, pidx, row_m, input_cyclic, output_cyclic, comm, proc_map, 1);

    #ifndef ENABLE_MPI_CUDA
    /* allocate staging bufs */
    /* we need to use posix_memalign/cudaHostRegister instead
     * of cudaHostAlloc, because cudaHostAlloc doesn't have hooks
     * in the MPI library, and using it would lead to data corruption
     */
    int size = p->scratch_size*sizeof(cuda_cpx_t);
    int page_size = getpagesize();
    size = ((size + page_size - 1) / page_size) * page_size;
    posix_memalign((void **)&(p->h_stage_in),page_size,size);
    posix_memalign((void **)&(p->h_stage_out),page_size,size);
    cudaHostRegister(p->h_stage_in, size, cudaHostAllocDefault);
    CHECK_CUDA();
    cudaHostRegister(p->h_stage_out, size, cudaHostAllocDefault);
    CHECK_CUDA();
    #endif

    /* allocate memory for passing variables */
    cudaMalloc((void **)&(p->d_pidx), sizeof(int)*ndim);
    CHECK_CUDA();
    cudaMalloc((void **)&(p->d_pdim), sizeof(int)*ndim);
    CHECK_CUDA();
    cudaMalloc((void **)&(p->d_iembed), sizeof(int)*ndim);
    CHECK_CUDA();
    cudaMalloc((void **)&(p->d_oembed), sizeof(int)*ndim);
    CHECK_CUDA();
    cudaMalloc((void **)&(p->d_length), sizeof(int)*ndim);
    CHECK_CUDA();

    /* initialize cuda buffers */
    int *h_length = (int *)malloc(sizeof(int)*ndim);
    int i;
    for (i = 0; i < ndim; ++i)
        h_length[i] = gdim[i]/pdim[i];
    cudaMemcpy(p->d_pidx, pidx, sizeof(int)*ndim, cudaMemcpyDefault); 
    CHECK_CUDA();
    cudaMemcpy(p->d_pdim, pdim, sizeof(int)*ndim, cudaMemcpyDefault); 
    CHECK_CUDA();
    cudaMemcpy(p->d_iembed, p->inembed, sizeof(int)*ndim, cudaMemcpyDefault); 
    CHECK_CUDA();
    cudaMemcpy(p->d_oembed, p->oembed, sizeof(int)*ndim, cudaMemcpyDefault); 
    CHECK_CUDA();
    cudaMemcpy(p->d_length, h_length, sizeof(int)*ndim, cudaMemcpyDefault); 
    CHECK_CUDA();
    free(h_length);

    int dmax = p->max_depth + 2;
    p->d_rev_j1 = (int **) malloc(sizeof(int *)*dmax);
    p->d_rev_global = (int **) malloc(sizeof(int *)*dmax);
    p->d_rev_partial = (int **) malloc(sizeof(int *)*dmax);
    p->d_c0 = (int **) malloc(sizeof(int *)*dmax);
    p->d_c1 = (int **) malloc(sizeof(int *)*dmax);
    if (p->max_depth)
        {
        p->h_alpha = (cuda_scalar_t **) malloc(sizeof(cuda_scalar_t *)*p->max_depth);
        p->d_alpha = (cuda_scalar_t **) malloc(sizeof(cuda_scalar_t *)*p->max_depth);
        }

    int d;
    for (d = 0; d < dmax; ++d)
        {
        cudaMalloc((void **)&(p->d_rev_j1[d]), sizeof(int)*ndim);
        CHECK_CUDA();
        cudaMalloc((void **)&(p->d_rev_partial[d]), sizeof(int)*ndim);
        CHECK_CUDA();
        cudaMalloc((void **)&(p->d_rev_global[d]), sizeof(int)*ndim);
        CHECK_CUDA();
        cudaMalloc((void **)&(p->d_c0[d]), sizeof(int)*ndim);
        CHECK_CUDA();
        cudaMalloc((void **)&(p->d_c1[d]), sizeof(int)*ndim);
        CHECK_CUDA();
        }

    for (d = 0; d < p->max_depth; ++d)
        {
        cudaMalloc((void **)&(p->d_alpha[d]), sizeof(cuda_scalar_t)*ndim); 
        CHECK_CUDA();
        p->h_alpha[d] = (cuda_scalar_t *) malloc(sizeof(cuda_scalar_t)*ndim);
        }

    /* perform initialization run */
    dfft_cuda_execute(NULL, NULL, 0, p);

    /* initialization finished */
    p->init = 0;

    return res;
    } 
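The staging-buffer comment near the top of dfft_cuda_create_plan points at a real constraint: memory from cudaHostAlloc bypasses the pinning hooks of some MPI implementations, so the plan allocates page-aligned host memory itself and registers it with CUDA afterwards. That allocation pattern in isolation, as a sketch:

#include <cuda_runtime.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
    size_t size = 1 << 20;
    size_t page = (size_t)getpagesize();
    size = ((size + page - 1) / page) * page;   /* round up to page multiple */

    void *buf = NULL;
    posix_memalign(&buf, page, size);           /* page-aligned host memory */
    cudaHostRegister(buf, size, cudaHostRegisterDefault);   /* pin it */

    /* ... use buf as a pinned staging buffer visible to both MPI and CUDA ... */

    cudaHostUnregister(buf);
    free(buf);
    return 0;
}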
int main(int argc, char **argv)
{
    // Start logs
    shrSetLogFileName ("quasirandomGenerator.txt");
    shrLog("%s Starting...\n\n", argv[0]);
    
    unsigned int useDoublePrecision;

    char *precisionChoice;
    cutGetCmdLineArgumentstr(argc, (const char **)argv, "type", &precisionChoice);
    if(precisionChoice == NULL)
        useDoublePrecision = 0;
    else{
        if(!strcasecmp(precisionChoice, "double"))
            useDoublePrecision = 1;
        else
            useDoublePrecision = 0;
    }

    unsigned int tableCPU[QRNG_DIMENSIONS][QRNG_RESOLUTION];

    float
        *h_OutputGPU;

    float
        *d_Output;

    int
        dim, pos;

    double
        delta, ref, sumDelta, sumRef, L1norm, gpuTime;

    unsigned int hTimer;

    if(sizeof(INT64) != 8){
        shrLog("sizeof(INT64) != 8\n");
        return 0;
    }

    // use command-line specified CUDA device, otherwise use device with highest Gflops/s
    if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") )
        cutilDeviceInit(argc, argv);
    else
        cudaSetDevice( cutGetMaxGflopsDeviceId() );

    cutilCheckError(cutCreateTimer(&hTimer));

    int deviceIndex;
    cutilSafeCall(cudaGetDevice(&deviceIndex));
    cudaDeviceProp deviceProp;
    cutilSafeCall(cudaGetDeviceProperties(&deviceProp, deviceIndex));
    int version = deviceProp.major * 10 + deviceProp.minor;
    if(useDoublePrecision && version < 13){
        shrLog("Double precision not supported.\n");
        cudaThreadExit();
        return 0;
    }

    shrLog("Allocating GPU memory...\n");
        cutilSafeCall( cudaMalloc((void **)&d_Output, QRNG_DIMENSIONS * N * sizeof(float)) );

    shrLog("Allocating CPU memory...\n");
        h_OutputGPU = (float *)malloc(QRNG_DIMENSIONS * N * sizeof(float));

    shrLog("Initializing QRNG tables...\n\n");
        initQuasirandomGenerator(tableCPU);
        if(useDoublePrecision)
            initTable_SM13(tableCPU);
        else
            initTable_SM10(tableCPU);

    shrLog("Testing QRNG...\n\n");
        cutilSafeCall( cudaMemset(d_Output, 0, QRNG_DIMENSIONS * N * sizeof(float)) );
		int numIterations = 20;
		for (int i = -1; i < numIterations; i++)
		{
			if (i == 0)
			{
				cutilSafeCall( cudaThreadSynchronize() );
				cutilCheckError( cutResetTimer(hTimer) );
				cutilCheckError( cutStartTimer(hTimer) );
			}
            if(useDoublePrecision)
                quasirandomGenerator_SM13(d_Output, 0, N);
            else
                quasirandomGenerator_SM10(d_Output, 0, N);
		}
        cutilSafeCall( cudaThreadSynchronize() );
        cutilCheckError(cutStopTimer(hTimer));
        gpuTime = cutGetTimerValue(hTimer)/(double)numIterations*1e-3;
	shrLogEx(LOGBOTH | MASTER, 0, "quasirandomGenerator, Throughput = %.4f GNumbers/s, Time = %.5f s, Size = %u Numbers, NumDevsUsed = %u, Workgroup = %u\n", 
                (double)QRNG_DIMENSIONS * (double)N * 1.0E-9 / gpuTime, gpuTime, QRNG_DIMENSIONS*N, 1, 128*QRNG_DIMENSIONS); 

    shrLog("\nReading GPU results...\n");
        cutilSafeCall( cudaMemcpy(h_OutputGPU, d_Output, QRNG_DIMENSIONS * N * sizeof(float), cudaMemcpyDeviceToHost) );

    shrLog("Comparing to the CPU results...\n\n");
        sumDelta = 0;
        sumRef = 0;
        for(dim = 0; dim < QRNG_DIMENSIONS; dim++)
            for(pos = 0; pos < N; pos++){
                ref       = getQuasirandomValue63(pos, dim);
                delta     = (double)h_OutputGPU[dim * N + pos] - ref;
                sumDelta += fabs(delta);
                sumRef   += fabs(ref);
            }
    shrLog("L1 norm: %E\n", sumDelta / sumRef);

    shrLog("\nTesting inverseCNDgpu()...\n\n");
        cutilSafeCall( cudaMemset(d_Output, 0, QRNG_DIMENSIONS * N * sizeof(float)) );
		for (int i = -1; i < numIterations; i++)
		{
			if (i == 0) 
			{
				cutilSafeCall( cudaThreadSynchronize() );
				cutilCheckError( cutResetTimer(hTimer) );
				cutilCheckError( cutStartTimer(hTimer) );
			}
            if(useDoublePrecision)
                inverseCND_SM13(d_Output, NULL, QRNG_DIMENSIONS * N);
            else
                inverseCND_SM10(d_Output, NULL, QRNG_DIMENSIONS * N);
		}
        cutilSafeCall( cudaThreadSynchronize() );
        cutilCheckError(cutStopTimer(hTimer));
        gpuTime = cutGetTimerValue(hTimer)/(double)numIterations*1e-3;
	shrLogEx(LOGBOTH | MASTER, 0, "quasirandomGenerator-inverse, Throughput = %.4f GNumbers/s, Time = %.5f s, Size = %u Numbers, NumDevsUsed = %u, Workgroup = %u\n", 
                (double)QRNG_DIMENSIONS * (double)N * 1E-9 / gpuTime, gpuTime, QRNG_DIMENSIONS*N, 1, 128); 

    shrLog("Reading GPU results...\n");
        cutilSafeCall( cudaMemcpy(h_OutputGPU, d_Output, QRNG_DIMENSIONS * N * sizeof(float), cudaMemcpyDeviceToHost) );

    shrLog("\nComparing to the CPU results...\n");
        sumDelta = 0;
        sumRef = 0;
        for(pos = 0; pos < QRNG_DIMENSIONS * N; pos++){
            double  p = (double)(pos + 1) / (double)(QRNG_DIMENSIONS * N + 1);
            ref       = MoroInvCNDcpu(p);
            delta     = (double)h_OutputGPU[pos] - ref;
            sumDelta += fabs(delta);
            sumRef   += fabs(ref);
        }
    shrLog("L1 norm: %E\n\n", L1norm = sumDelta / sumRef);
    shrLog((L1norm < 1E-6) ? "PASSED\n\n" : "FAILED\n\n");

    shrLog("Shutting down...\n");
        cutilCheckError(cutDeleteTimer(hTimer));
        free(h_OutputGPU);
        cutilSafeCall( cudaFree(d_Output) );

    cudaThreadExit();

    shrEXIT(argc, (const char**)argv);
}
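The loop starting at i = -1 is a warm-up idiom: the timer is reset only once i reaches 0, so one-time costs (context creation, first kernel launch) are excluded from the reported average. The skeleton of that idiom, sketched here with CUDA events in place of the deprecated cutil timers:

#include <cuda_runtime.h>
#include <cstdio>

int main()
{
    const int numIterations = 20;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    for (int i = -1; i < numIterations; ++i) {
        if (i == 0) {                       // warm-up done: start timing now
            cudaDeviceSynchronize();
            cudaEventRecord(start);
        }
        // kernel launch under test would go here
    }
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);
    std::printf("avg time: %f ms\n", ms / numIterations);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return 0;
}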
void compute_process(int agents_total, int nreps, int world_width, int world_height)
{
	int np, pid;
	MPI_Comm_rank(MPI_COMM_WORLD, &pid);
	MPI_Comm_size(MPI_COMM_WORLD, &np);
	int server_process = np - 1;
	MPI_Status status;

	/* create a type for struct agent */
	const int nitems = 5;
	int blocklengths[5] = {1,1,1,1,1};
	MPI_Datatype types[5] = {MPI_INT, MPI_INT, MPI_INT, MPI_FLOAT, MPI_FLOAT};
	MPI_Datatype mpi_agent_type;
	MPI_Aint offsets[5];

	offsets[0] = offsetof(agent, id);
	offsets[1] = offsetof(agent, x);
	offsets[2] = offsetof(agent, y);
	offsets[3] = offsetof(agent, z);
	offsets[4] = offsetof(agent, w);

	MPI_Type_create_struct(nitems, blocklengths, offsets, types, &mpi_agent_type);
	MPI_Type_commit(&mpi_agent_type);

	unsigned int num_bytes = agents_total * sizeof(float4);
	unsigned int num_halo_points = RADIO * world_width;
	unsigned int num_halo_bytes = num_halo_points * sizeof(short int);

	//unsigned int world_node_height = (world_height / (np-1)) + (RADIO * 2);
	//if(pid == 0 or pid == np - 2)
	//	world_node_height -= RADIO;
 
	size_t size_world = world_width * world_height * sizeof(short int);
	short int *h_world = (short int *)malloc(size_world);
	*h_world = 0;
	short int *d_world;

	for(int j = 0; j < world_width * world_height; j++)
	{	
		h_world[j] = 0;
	}

	/* alloc host memory */
	agent *h_agents_in = (agent *)malloc(agents_total * sizeof(agent));
	//agent *d_agents_in;
	float4 *h_agents_pos = (float4 *)malloc(num_bytes);
	float4 *d_agents_pos;
	
	
	//MPI_Recv(rcv_address, num_points, MPI_FLOAT, server_process, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
	MPI_Recv(h_agents_in, agents_total, mpi_agent_type, server_process, 0, MPI_COMM_WORLD, &status);

	//Iniatialize world
	for( int i = 0; i < agents_total; i++)
	{
		h_world[(world_width * (h_agents_in[i].y - 1) ) + h_agents_in[i].x] = (h_agents_in[i].x!=0?1:0);
		//if(h_world[(world_width * (h_agents_in[i].y - 1) ) + h_agents_in[i].x] == 1)
			//printf("world x: %d, y: %d\n", h_agents_in[i].x, h_agents_in[i].y);	
		h_agents_pos[i].x = h_agents_in[i].x;
		h_agents_pos[i].y = h_agents_in[i].y;
		h_agents_pos[i].z = h_agents_in[i].z;
		h_agents_pos[i].w = h_agents_in[i].w;
	}

/***
	if(pid ==1)
{
	int k=0;
	for(int j = 0; j < world_width * world_height; j++)
	{	
		if ( j%96 == 0 and j>0)
		{
			k++;
			printf("%d row: %d\n", h_world[j], k);
		}
		else
			printf("%d ", h_world[j]);
	}
}
***/

	// Error code to check return values for CUDA calls
        cudaError_t err = cudaSuccess;

	// Allocate the device pointer
    	err = cudaMalloc((void **)&d_world, size_world);

	if (err != cudaSuccess)
	{
        	fprintf(stderr, "Failed to allocate device pointer (error code %s)!\n", cudaGetErrorString(err));
        	exit(EXIT_FAILURE);
        }

	err = cudaMemcpy(d_world, h_world, size_world, cudaMemcpyHostToDevice);

	if (err != cudaSuccess)
    	{
        	fprintf(stderr, "Failed to copy pointer from host to device (error code %s)!\n", cudaGetErrorString(err));
        	exit(EXIT_FAILURE);
    	}


	//http://cuda-programming.blogspot.com.es/2013/02/cuda-array-in-cuda-how-to-use-cuda.html
	//http://stackoverflow.com/questions/17924705/structure-of-arrays-vs-array-of-structures-in-cuda
	// Allocate the device pointer

    	err = cudaMalloc((void **)&d_agents_pos, num_bytes);

	if (err != cudaSuccess)
	{
        	fprintf(stderr, "Failed to allocate device pointer (error code %s)!\n", cudaGetErrorString(err));
        	exit(EXIT_FAILURE);
        }

	err = cudaMemcpy(d_agents_pos, h_agents_pos, num_bytes, cudaMemcpyHostToDevice);

	if (err != cudaSuccess)
    	{
        	fprintf(stderr, "Failed to copy pointer from host to device (error code %s)!\n", cudaGetErrorString(err));
        	exit(EXIT_FAILURE);
    	}


	launch_kernel(d_agents_pos, d_world, world_width, world_height );

	MPI_Barrier( MPI_COMM_WORLD);

#ifdef DEBUG
//	printf("pid: %d\n", pid);
//	display_data(h_agents_in, agents_total );
#endif

	MPI_Send(h_agents_in, agents_total, mpi_agent_type, server_process, DATA_COLLECT, MPI_COMM_WORLD);


	/* Release resources */
	free(h_agents_in); 
/*	
	free(h_output);
	cudaFreeHost(h_left_boundary); cudaFreeHost(h_right_boundary);
	cudaFreeHost(h_left_halo); cudaFreeHost(h_right_halo);
	cudaFree(d_input); cudaFree(d_output);
*/
}
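The MPI_Type_create_struct block at the top of compute_process is the portable way to ship a C struct through MPI: describe each field's offset and type once, commit the datatype, then send whole arrays of the struct without manual packing. The same recipe compacted into a helper for a hypothetical struct:

#include <mpi.h>
#include <stddef.h>

typedef struct { int id; float x, y; } particle;

/* build an MPI datatype matching the particle layout */
static MPI_Datatype make_particle_type(void)
{
    int          blocklengths[3] = {1, 1, 1};
    MPI_Datatype types[3]   = {MPI_INT, MPI_FLOAT, MPI_FLOAT};
    MPI_Aint     offsets[3] = { offsetof(particle, id),
                                offsetof(particle, x),
                                offsetof(particle, y) };
    MPI_Datatype t;
    MPI_Type_create_struct(3, blocklengths, offsets, types, &t);
    MPI_Type_commit(&t);
    return t;   /* caller eventually releases it with MPI_Type_free(&t) */
}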