Example No. 1
void quantus_cuda_cleanup(quantus_comm<T> *comm)
{
    cudaFree((T *) comm->matrix);
}
Example No. 2
GPUParams<Dtype>::~GPUParams() {
#ifndef CPU_ONLY
  CUDA_CHECK(cudaFree(data_));
  CUDA_CHECK(cudaFree(diff_));
#endif
}
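The CUDA_CHECK macro used above is not part of this excerpt. A minimal sketch of the usual error-checking wrapper such code relies on (the message format and abort-on-error behaviour are assumptions, not the original project's definition):

#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

/* Hypothetical sketch: abort with a readable message if a CUDA runtime call fails. */
#define CUDA_CHECK(call)                                              \
  do {                                                                \
    cudaError_t err_ = (call);                                        \
    if (err_ != cudaSuccess) {                                        \
      fprintf(stderr, "CUDA error %s at %s:%d\n",                     \
              cudaGetErrorString(err_), __FILE__, __LINE__);          \
      exit(EXIT_FAILURE);                                             \
    }                                                                 \
  } while (0)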
Example No. 3
	static void free(void *data) {
	    if (data) {
		// std::cout << "free " << data << std::endl;
		throw_(cudaFree(data));
	    }
	}
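throw_ is not shown in this example. A plausible sketch of such a helper, assuming it simply converts a failing cudaError_t into a C++ exception (the exception type and message are guesses):

#include <cuda_runtime.h>
#include <stdexcept>

/* Hypothetical helper: turn a non-success CUDA status into an exception. */
static void throw_(cudaError_t status) {
	if (status != cudaSuccess)
		throw std::runtime_error(cudaGetErrorString(status));
}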
Example No. 4
TxVectorOptimizationDataCU::~TxVectorOptimizationDataCU() {
  if (devicePtr) {
    cudaFree(devicePtr);
  }
}
Example No. 5
void CloudConstructor::freeGPUPoints() {
	cudaFree(d_resultPoints);
	d_resultPoints = NULL;
}
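Freeing and then nulling the pointer, as freeGPUPoints does, makes repeated cleanup calls harmless. A small generic helper in the same spirit (a sketch, not part of the original code):

#include <cuda_runtime.h>

/* Sketch: free a device pointer and reset it so a second call is a no-op. */
template <typename T>
void safeCudaFree(T*& ptr) {
	if (ptr) {
		cudaFree(ptr);
		ptr = nullptr;
	}
}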
Example No. 6
int
main()
{
	int i;
	struct timeval start, stop;
	FILE *fd;
	char *key;

	cudaSetDevice(0);

	/* Allocate memory */
	if ((key = (char *)malloc(40 * sizeof(char))) == NULL) {
		printf("Malloc failed!\n");
		exit(EXIT_FAILURE);
	}

	cudaMallocHost((void **) &batchKeys,
	    ((BATCH_SIZE + 1) * MAX_LEN_ALIGNED) * sizeof(char));
	cudaMallocHost((void **) &nKeys, BATCH_SIZE * sizeof(size_t));
	cudaMallocHost((void **) &batchIndex, (BATCH_SIZE + 1) * sizeof(int));
	cudaMallocHost((void **) &hashedKeys, BATCH_SIZE * sizeof(uint32_t));

	cudaMalloc((void **) &d_keys,
	    ((BATCH_SIZE + 1) * MAX_LEN_ALIGNED) * sizeof(char));
	cudaMalloc((void **) &d_len, BATCH_SIZE * sizeof(size_t));
	cudaMalloc((void **) &d_index, (BATCH_SIZE + 1) * sizeof(int));
	cudaMalloc((void **) &d_res, BATCH_SIZE * sizeof(uint32_t));

	/* Create 'BATCH_SIZE' number of random keys 
	 * and add them to batch table
	 */
	batchNo = 0;
	batchIndex[0] = 0;
	for(i = 0; i < BATCH_SIZE; i++) { 
		gen_random(key, 30);
		add_to_batch(key, 30);
	}

	/* Start Time (execution + memory) */
#ifdef EXEC_MEM
	gettimeofday(&start, NULL);
#endif // EXEC_MEM
	
	/* MemCpy Host -> Device */
	cudaMemcpy(d_keys, batchKeys, (batchIndex[BATCH_SIZE-1] +
	    strlen(&batchKeys[batchIndex[BATCH_SIZE - 1]])) * sizeof(char),
	    cudaMemcpyHostToDevice);
	cudaMemcpy(d_len, nKeys, BATCH_SIZE * sizeof(size_t),
	    cudaMemcpyHostToDevice);
	cudaMemcpy(d_index, batchIndex, BATCH_SIZE * sizeof(int),
	    cudaMemcpyHostToDevice);

	/* Start Time (execution only)*/
#ifndef EXEC_MEM
	gettimeofday(&start, NULL);
#endif // EXEC_MEM

	/* Call the kernel */
	CUDAhash(d_keys, d_index, d_len, d_res);

	/* Start Time (execution only)*/
#ifndef EXEC_MEM
	cudaDeviceSynchronize();
	gettimeofday(&stop, NULL);
#endif // EXEC_MEM

	/* MemCpy Device -> Host */
	cudaMemcpy(hashedKeys, d_res, BATCH_SIZE * sizeof(uint32_t),
	    cudaMemcpyDeviceToHost);	
	
	/* Start Time (execution + memory) */
#ifdef EXEC_MEM
	gettimeofday(&stop, NULL);
#endif // EXEC_MEM

	
#ifdef DEBUG
	for(i = 0; i < BATCH_SIZE; i++) {
		printf("%s\n", &batchKeys[batchIndex[i]]);
		printf("%u\n", hashedKeys[i]);
	}
#endif // DEBUG

	/* Print Time */
	fd = fopen("log.txt", "a+");
	fprintf(fd, "%lu", ((stop.tv_sec * USECS) + stop.tv_usec ) -
	    ((start.tv_sec * USECS) + start.tv_usec));
	fprintf(fd, "\t%1.f\n", ((double)BATCH_SIZE / 
	    ((double)(((stop.tv_sec * USECS) + stop.tv_usec ) -
	    ((start.tv_sec * USECS) + start.tv_usec)) / 1000000 )) / 1000);
	fclose(fd);

#ifdef DEBUG
	printf("Time: %lu \n", ((stop.tv_sec * USECS) + stop.tv_usec ) -
	    ((start.tv_sec * USECS) + start.tv_usec));
#endif // DEBUG
	
	/* Free memory */
	cudaFreeHost(batchKeys);	/* pinned buffers from cudaMallocHost must be released with cudaFreeHost */
	cudaFreeHost(nKeys);
	cudaFreeHost(hashedKeys);
	cudaFreeHost(batchIndex);
	cudaFree(d_keys);
	cudaFree(d_len);
	cudaFree(d_res);
	cudaFree(d_index);
	free(key);

	return 0;
}
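gen_random and add_to_batch are not shown. A rough sketch of what this example seems to assume (keys stored back to back in the pinned batchKeys buffer at MAX_LEN_ALIGNED-spaced offsets recorded in batchIndex, with lengths in nKeys); the definitions below are guesses that plug into the globals used above, not the original helpers:

/* Hypothetical: fill 's' with 'len' random alphanumeric characters. */
static void gen_random(char *s, const int len)
{
	static const char alphanum[] =
	    "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
	for (int i = 0; i < len; i++)
		s[i] = alphanum[rand() % (sizeof(alphanum) - 1)];
	s[len] = '\0';
}

/* Hypothetical: append a key to the pinned batch buffers and advance batchNo. */
static void add_to_batch(const char *key, size_t len)
{
	memcpy(&batchKeys[batchIndex[batchNo]], key, len + 1);
	nKeys[batchNo] = len;
	batchIndex[batchNo + 1] = batchIndex[batchNo] + MAX_LEN_ALIGNED;
	batchNo++;
}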
Example No. 7
void CudaSpace::deallocate( void * const arg_alloc_ptr , const size_t /* arg_alloc_size */ ) const
{
  try {
    CUDA_SAFE_CALL( cudaFree( arg_alloc_ptr ) );
  } catch(...) {}
}
Example No. 8
QList<resType> calculateOnGPU(const char * seqLib, int seqLibLength, ScoreType* queryProfile, ScoreType qProfLen, int queryLength,
                                ScoreType gapOpen, ScoreType gapExtension, ScoreType maxScore, U2::SmithWatermanSettings::SWResultView resultView) {

    //TODO: calculate maximum alignment length
    const int overlapLength = calcOverlap(queryLength);

    int partsNumber = calcPartsNumber(seqLibLength, overlapLength);

    int queryDevider = 1;
    if (queryLength > sw_cuda_cpp::MAX_SHARED_VECTOR_LENGTH) {
        queryDevider = (queryLength + sw_cuda_cpp::MAX_SHARED_VECTOR_LENGTH - 1) / sw_cuda_cpp::MAX_SHARED_VECTOR_LENGTH;
    }

    int partQuerySize = (queryLength + queryDevider - 1) / queryDevider;

    int partSeqSize = calcPartSeqSize(seqLibLength, overlapLength, partsNumber);

    int sizeRow = calcSizeRow(seqLibLength, overlapLength, partsNumber, partSeqSize);

    u2log.details(QString("partsNumber: %1 queryDevider: %2").arg(partsNumber).arg(queryDevider));

    u2log.details(QString("seqLen: %1 partSeqSize: %2 overlapSize: %3").arg(seqLibLength).arg(partSeqSize).arg(overlapLength));
    u2log.details(QString("queryLen %1 partQuerySize: %2").arg(queryLength).arg(partQuerySize));

    //************************** declare some temp variables on host

    ScoreType* tempRow = new ScoreType[sizeRow];
    ScoreType* zerroArr = new ScoreType[sizeRow];
    for (int i = 0; i < sizeRow; i++) {
        zerroArr[i] = 0;
    }

    ScoreType* directionRow = new ScoreType[sizeRow];

    size_t directionMatrixSize = 0;
    size_t backtraceBeginsSize = 0;
    int * globalMatrix = NULL;
    int * backtraceBegins = NULL;
    if(U2::SmithWatermanSettings::MULTIPLE_ALIGNMENT == resultView) {
        directionMatrixSize = seqLibLength * queryLength * sizeof(int);
        backtraceBeginsSize = 2 * sizeRow * sizeof(int);

        globalMatrix = new int[directionMatrixSize / sizeof(int)];
        backtraceBegins = new int[backtraceBeginsSize / sizeof(int)];

        memset(globalMatrix, 0, directionMatrixSize);
        memset(backtraceBegins, 0, backtraceBeginsSize);
    }
    //************************** sizes of arrays

    size_t sizeQ = sizeRow * sizeof(ScoreType);
    size_t sizeQQ = (sizeRow) * sizeof(ScoreType);
    size_t sizeP = qProfLen * sizeof(ScoreType);
    size_t sizeL = (seqLibLength) * sizeof(char);

    //************************** declare arrays on device

    char * g_seqLib;
    ScoreType* g_queryProfile;
    ScoreType* g_HdataMax;
    ScoreType* g_HdataUp;
    ScoreType* g_HdataRec;
    ScoreType* g_HdataTmp;
    ScoreType* g_FdataUp;
    ScoreType* g_directionsUp;
    ScoreType* g_directionsMax;
    ScoreType* g_directionsRec;
    int * g_directionsMatrix = NULL;
    int * g_backtraceBegins = NULL;

    //************************** allocate global memory on device

    cudaMalloc((void **)& g_seqLib, sizeL);
    cudaMalloc((void **)& g_queryProfile, sizeP);
    cudaMalloc((void **)& g_HdataMax, sizeQ);
    cudaMalloc((void **)& g_HdataUp, sizeQ);
    cudaMalloc((void **)& g_FdataUp, sizeQ);
    cudaMalloc((void **)& g_directionsUp, sizeQ);
    cudaMalloc((void **)& g_directionsMax, sizeQ);
    cudaMalloc((void **)& g_HdataRec, sizeQ);
    cudaMalloc((void **)& g_directionsRec, sizeQ);

    if(U2::SmithWatermanSettings::MULTIPLE_ALIGNMENT == resultView) {
        cudaError errorMatrix = cudaMalloc(reinterpret_cast<void **>(&g_directionsMatrix), directionMatrixSize);
        cudaError errorBacktrace = cudaMalloc(reinterpret_cast<void **>(&g_backtraceBegins), backtraceBeginsSize);
    }

    u2log.details(QString("GLOBAL MEMORY USED %1 KB").arg((sizeL + sizeP + sizeQ * 7
                                                        + directionMatrixSize + backtraceBeginsSize) / 1024));

    //************************** copy from host to device

    cudaMemcpy(g_seqLib, seqLib, sizeL, cudaMemcpyHostToDevice);
    cudaMemcpy(g_queryProfile, queryProfile, sizeP, cudaMemcpyHostToDevice);
    cudaMemcpy(g_HdataMax, zerroArr, sizeQ, cudaMemcpyHostToDevice);
    cudaMemcpy(g_HdataUp, zerroArr, sizeQ, cudaMemcpyHostToDevice);
    cudaMemcpy(g_FdataUp, zerroArr, sizeQ, cudaMemcpyHostToDevice);
    cudaMemcpy(g_directionsUp, zerroArr, sizeQ, cudaMemcpyHostToDevice);
    cudaMemcpy(g_directionsMax, zerroArr, sizeQ, cudaMemcpyHostToDevice);
    cudaMemcpy(g_directionsRec, zerroArr, sizeQ, cudaMemcpyHostToDevice);
    cudaMemcpy(g_HdataRec, zerroArr, sizeQ, cudaMemcpyHostToDevice);
    if(U2::SmithWatermanSettings::MULTIPLE_ALIGNMENT == resultView) {
        cudaMemcpy(g_directionsMatrix, globalMatrix, directionMatrixSize, cudaMemcpyHostToDevice);
        cudaMemcpy(g_backtraceBegins, backtraceBegins, backtraceBeginsSize, cudaMemcpyHostToDevice);
    }

    //************************** start calculation

    int BLOCK_SIZE = partsNumber;

    dim3 dimBlock(BLOCK_SIZE);
    dim3 dimGrid(partQuerySize);

    //move constants variables to constant cuda memory
    setConstants(partSeqSize, partsNumber, overlapLength, seqLibLength,
                queryLength, gapOpen, gapExtension, maxScore, partQuerySize,
                U2::SmithWatermanAlgorithm::UP, U2::SmithWatermanAlgorithm::LEFT, U2::SmithWatermanAlgorithm::DIAG,
                U2::SmithWatermanAlgorithm::STOP);

    size_t sh_mem_size = sizeof(ScoreType) * (dimGrid.x + 1) * 3;
    u2log.details(QString("SHARED MEM SIZE USED: %1 B").arg(sh_mem_size));
    // start main loop
    for (int i = 0; i < queryDevider; i++) {

        calculateMatrix_wrap( dimBlock.x, dimGrid.x, g_seqLib,
            g_queryProfile, g_HdataUp, g_HdataRec, g_HdataMax,
            g_FdataUp, g_directionsUp, g_directionsRec,
            g_directionsMax, i * partQuerySize, g_directionsMatrix, g_backtraceBegins);

        cudaError hasErrors = cudaThreadSynchronize();

        if (hasErrors != 0) {
            u2log.trace(QString("CUDA ERROR HAPPEN, errorId: ") + QString::number(hasErrors));
        }

        //revert arrays
        g_HdataTmp = g_HdataRec;
        g_HdataRec = g_HdataUp;
        g_HdataUp = g_HdataTmp;

        g_HdataTmp = g_directionsRec;
        g_directionsRec = g_directionsUp;
        g_directionsUp = g_HdataTmp;
    }

    //Copy vectors on host and find actual results
    cudaMemcpy(tempRow, g_HdataMax, sizeQQ, cudaMemcpyDeviceToHost);
    cudaMemcpy(directionRow, g_directionsMax, sizeQQ, cudaMemcpyDeviceToHost);
    if(U2::SmithWatermanSettings::MULTIPLE_ALIGNMENT == resultView) {
        cudaMemcpy(globalMatrix, g_directionsMatrix, directionMatrixSize, cudaMemcpyDeviceToHost);
        cudaMemcpy(backtraceBegins, g_backtraceBegins, backtraceBeginsSize, cudaMemcpyDeviceToHost);
    }

    QList<resType> pas;
    resType res;
    for (int j = 0; j < (sizeRow); j++) {
        if (tempRow[j] >= maxScore) {
            res.refSubseq.startPos = directionRow[j];
            res.refSubseq.length = j - res.refSubseq.startPos + 1 - (j) / (partSeqSize + 1) * overlapLength - (j) / (partSeqSize + 1);
            res.score = tempRow[j];
            if(U2::SmithWatermanSettings::MULTIPLE_ALIGNMENT == resultView) {
                qint32 pairAlignOffset = 0;

                qint32 row = backtraceBegins[2 * j];
                qint32 column = backtraceBegins[2 * j + 1];
                while(U2::SmithWatermanAlgorithm::STOP != globalMatrix[seqLibLength * row + column]) {
                    if(U2::SmithWatermanAlgorithm::DIAG == globalMatrix[seqLibLength * row + column]) {
                        res.pairAlign[pairAlignOffset++] = U2::SmithWatermanAlgorithm::DIAG;
                        row--;
                        column--;
                    } else if(U2::SmithWatermanAlgorithm::LEFT == globalMatrix[seqLibLength * row + column]) {
                        res.pairAlign[pairAlignOffset++] = U2::SmithWatermanAlgorithm::UP;
                        column--;
                    } else if(U2::SmithWatermanAlgorithm::UP == globalMatrix[seqLibLength * row + column]) {
                        res.pairAlign[pairAlignOffset++] = U2::SmithWatermanAlgorithm::LEFT;
                        row--;
                    }
                    if(0 >= row || 0 >= column) {
                        break;
                    }
                }
                res.patternSubseq.startPos = row;
                res.patternSubseq.length = backtraceBegins[2 * j] - row + 1;
            }

            pas.append(res);
        }
    }

    //deallocation memory
    cudaFree(g_seqLib);
    cudaFree(g_queryProfile);
    cudaFree(g_HdataMax);
    cudaFree(g_HdataUp);
    cudaFree(g_HdataRec);
    cudaFree(g_FdataUp);
    cudaFree(g_directionsUp);
    cudaFree(g_directionsMax);
    cudaFree(g_directionsRec);

    if(U2::SmithWatermanSettings::MULTIPLE_ALIGNMENT == resultView) {
        cudaFree(g_directionsMatrix);
        cudaFree(g_backtraceBegins);
    }

    delete[] tempRow;
    delete[] directionRow;
    delete[] zerroArr;
    delete[] globalMatrix;
    delete[] backtraceBegins;

    return pas;
}
Example No. 9
 ~curandStateManager()
 {
     //if(_state != NULL) memFree((char*)_state);
     if(_state != NULL) CUDA_CHECK(cudaFree(_state));
 }
Example No. 10
/* ========================================================================== */
int sci_gpuLU(char *fname)
{
    CheckRhs(1,2);
    CheckLhs(2,2);
    #ifdef WITH_CUDA
        cublasStatus status;
    #endif
    SciErr sciErr;
    int*    piAddr_A    = NULL;
    double* h_A         = NULL;
    double* hi_A        = NULL;
    int     rows_A;
    int     cols_A;

    int*    piAddr_Opt  = NULL;
    double* option      = NULL;
    int     rows_Opt;
    int     cols_Opt;

    void*   d_A         = NULL;
    int     na;
    void*   pvPtr       = NULL;

    int     size_A      = sizeof(double);
    bool    bComplex_A  = FALSE;
    int     inputType_A;
    int     inputType_Opt;
    double  res;
    int     posOutput   = 1;

    try
    {
        sciErr = getVarAddressFromPosition(pvApiCtx, 1, &piAddr_A);
        if(sciErr.iErr) throw sciErr;
        if(Rhs == 2)
        {
            sciErr = getVarAddressFromPosition(pvApiCtx, 2, &piAddr_Opt);
            if(sciErr.iErr) throw sciErr;
            sciErr = getVarType(pvApiCtx, piAddr_Opt, &inputType_Opt);
            if(sciErr.iErr) throw sciErr;
            if(inputType_Opt == sci_matrix)
            {
                sciErr = getMatrixOfDouble(pvApiCtx, piAddr_Opt, &rows_Opt, &cols_Opt, &option);
                if(sciErr.iErr) throw sciErr;
            }
            else
                throw "Option syntax is [number,number].";
        }
        else
        {
            rows_Opt=1;
            cols_Opt=2;
            option = (double*)malloc(2*sizeof(double));
            option[0]=0;
            option[1]=0;
        }

        if(rows_Opt != 1 || cols_Opt != 2)
            throw "Option syntax is [number,number].";

        if((int)option[1] == 1 && !isGpuInit())
            throw "gpu is not initialised. Please launch gpuInit() before use this function.";

        sciErr = getVarType(pvApiCtx, piAddr_A, &inputType_A);
        if(sciErr.iErr) throw sciErr;

        #ifdef WITH_CUDA
        if (useCuda())
        {
            if(inputType_A == sci_pointer)
            {
                sciErr = getPointer(pvApiCtx, piAddr_A, (void**)&pvPtr);
                if(sciErr.iErr) throw sciErr;

                gpuMat_CUDA* gmat;
                gmat = static_cast<gpuMat_CUDA*>(pvPtr);
                if(!gmat->useCuda)
                    throw "Please switch to OpenCL mode before using this data.";
                rows_A=gmat->rows;
                cols_A=gmat->columns;
                if(gmat->complex)
                {
                    bComplex_A = TRUE;
                    size_A = sizeof(cuDoubleComplex);
                    d_A=(cuDoubleComplex*)gmat->ptr->get_ptr();
                }
                else
                    d_A=(double*)gmat->ptr->get_ptr();

                // Initialize CUBLAS
                status = cublasInit();
                if (status != CUBLAS_STATUS_SUCCESS) throw status;

                na = rows_A * cols_A;
            }
            else if(inputType_A == 1)
            {
                // Get size and data
                if(isVarComplex(pvApiCtx, piAddr_A))
                {
                    sciErr = getComplexMatrixOfDouble(pvApiCtx, piAddr_A, &rows_A, &cols_A, &h_A, &hi_A);
                    if(sciErr.iErr) throw sciErr;
                    size_A = sizeof(cuDoubleComplex);
                    bComplex_A = TRUE;
                }
                else
                {
                    sciErr = getMatrixOfDouble(pvApiCtx, piAddr_A, &rows_A, &cols_A, &h_A);
                    if(sciErr.iErr) throw sciErr;
                }

                na = rows_A * cols_A;

                // Initialize CUBLAS
                status = cublasInit();
                if (status != CUBLAS_STATUS_SUCCESS) throw status;

                // Allocate device memory
                status = cublasAlloc(na, size_A, (void**)&d_A);
                if (status != CUBLAS_STATUS_SUCCESS) throw status;

                // Initialize the device matrices with the host matrices
                if(!bComplex_A)
                {
                    status = cublasSetMatrix(rows_A,cols_A, sizeof(double), h_A, rows_A, (double*)d_A, rows_A);
                    if (status != CUBLAS_STATUS_SUCCESS) throw status;
                }
                else
                    writecucomplex(h_A, hi_A, rows_A, cols_A, (cuDoubleComplex *)d_A);

            }
            else
                throw "Bad argument type.";

            cuDoubleComplex resComplex;
            // Performs operation
            if(!bComplex_A)
                status = decomposeBlockedLU(rows_A, cols_A, rows_A, (double*)d_A, 1);
       //     else
       //         resComplex = cublasZtrsm(na,(cuDoubleComplex*)d_A);

            if (status != CUBLAS_STATUS_SUCCESS) throw status;

            // Put the result in scilab
            switch((int)option[0])
            {
                case 2 :
                case 1 :    sciprint("The first option must be 0 for this function. Considered as 0.\n");

                case 0 :    // Keep the result on the Host.
                {           // Put the result in scilab
                    if(!bComplex_A)
                    {
                        double* h_res = NULL;
                        sciErr=allocMatrixOfDouble(pvApiCtx, Rhs + posOutput, rows_A, cols_A, &h_res);
                        if(sciErr.iErr) throw sciErr;
                        status = cublasGetMatrix(rows_A,cols_A, sizeof(double), (double*)d_A, rows_A, h_res, rows_A);
                        if (status != CUBLAS_STATUS_SUCCESS) throw status;
                    }
                    else
                    {
                        sciErr = createComplexMatrixOfDouble(pvApiCtx, Rhs + posOutput, 1, 1, &resComplex.x,&resComplex.y);
                        if(sciErr.iErr) throw sciErr;
                    }

                    LhsVar(posOutput)=Rhs+posOutput;
                    posOutput++;
                    break;
                }

                default : throw "First option argument must be 0 or 1 or 2.";
            }

            switch((int)option[1])
            {
                case 0 :    // Don't keep the data input on Device.
                {
                    if(inputType_A == sci_matrix)
                    {
                        status = cublasFree(d_A);
                        if (status != CUBLAS_STATUS_SUCCESS) throw status;
                        d_A = NULL;
                    }
                    break;
                }
                case 1 :    // Keep data of the first argument on Device and return the Device pointer.
                {
                    if(inputType_A == sci_matrix)
                    {
                        gpuMat_CUDA* dptr;
                        gpuMat_CUDA tmp={getCudaContext()->genMatrix<double>(getCudaQueue(),rows_A*cols_A),rows_A,cols_A};
                        dptr=new gpuMat_CUDA(tmp);
                        dptr->useCuda = true;
                        dptr->ptr->set_ptr((double*)d_A);
                        if(bComplex_A)
                            dptr->complex=TRUE;
                        else
                            dptr->complex=FALSE;

                        sciErr = createPointer(pvApiCtx,Rhs+posOutput, (void*)dptr);
                        if(sciErr.iErr) throw sciErr;
                        LhsVar(posOutput)=Rhs+posOutput;
                    }
                    else
                        throw "The first input argument is already a GPU variable.";

                    posOutput++;
                    break;
                }

                default : throw "Second option argument must be 0 or 1.";
            }
            // Shutdown
            status = cublasShutdown();
            if (status != CUBLAS_STATUS_SUCCESS) throw status;
        }
        #endif

        #ifdef WITH_OPENCL
        if (!useCuda())
        {
            throw "not implemented with OpenCL.";
        }
        #endif
        if(Rhs == 1)
        {
            free(option);
            option = NULL;
        }

        if(posOutput < Lhs+1)
            throw "Too many output arguments.";

        if(posOutput > Lhs+1)
            throw "Too few output arguments.";

        PutLhsVar();
        return 0;
    }
    catch(const char* str)
    {
        Scierror(999,"%s\n",str);
    }
    catch(SciErr E)
    {
        printError(&E, 0);
    }
    #ifdef WITH_CUDA
    catch(cudaError_t cudaE)
    {
        GpuError::treat_error<CUDAmode>((CUDAmode::Status)cudaE);
    }
    catch(cublasStatus CublasE)
    {
        GpuError::treat_error<CUDAmode>((CUDAmode::Status)CublasE,1);
    }
    if (useCuda())
    {
        if(inputType_A == 1 && d_A != NULL) cudaFree(d_A);
    }
    #endif
    #ifdef WITH_OPENCL
    if (!useCuda())
    {
        Scierror(999,"not implemented with OpenCL.\n");
    }
    #endif
    if(Rhs == 1 && option != NULL) free(option);
    return EXIT_FAILURE;
}
Example No. 11
/* Main */
int main(int argc, char **argv)
{
    cublasStatus_t status;
    float *h_A;
    float *h_B;
    float *h_C;
    float *h_C_ref;
    float *d_A = 0;
    float *d_B = 0;
    float *d_C = 0;
    float alpha = 1.0f;
    float beta = 0.0f;
    int n2 = N * N;
    int i;
    float error_norm;
    float ref_norm;
    float diff;
    cublasHandle_t handle;

    int dev = findCudaDevice(argc, (const char **) argv);

    if (dev == -1)
    {
        return EXIT_FAILURE;
    }

    /* Initialize CUBLAS */
    printf("simpleCUBLAS test running..\n");

    status = cublasCreate(&handle);

    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! CUBLAS initialization error\n");
        return EXIT_FAILURE;
    }

    /* Allocate host memory for the matrices */
    h_A = (float *)malloc(n2 * sizeof(h_A[0]));

    if (h_A == 0)
    {
        fprintf(stderr, "!!!! host memory allocation error (A)\n");
        return EXIT_FAILURE;
    }

    h_B = (float *)malloc(n2 * sizeof(h_B[0]));

    if (h_B == 0)
    {
        fprintf(stderr, "!!!! host memory allocation error (B)\n");
        return EXIT_FAILURE;
    }

    h_C = (float *)malloc(n2 * sizeof(h_C[0]));

    if (h_C == 0)
    {
        fprintf(stderr, "!!!! host memory allocation error (C)\n");
        return EXIT_FAILURE;
    }

    /* Fill the matrices with test data */
    for (i = 0; i < n2; i++)
    {
        h_A[i] = rand() / (float)RAND_MAX;
        h_B[i] = rand() / (float)RAND_MAX;
        h_C[i] = rand() / (float)RAND_MAX;
    }

    /* Allocate device memory for the matrices */
    if (cudaMalloc((void **)&d_A, n2 * sizeof(d_A[0])) != cudaSuccess)
    {
        fprintf(stderr, "!!!! device memory allocation error (allocate A)\n");
        return EXIT_FAILURE;
    }

    if (cudaMalloc((void **)&d_B, n2 * sizeof(d_B[0])) != cudaSuccess)
    {
        fprintf(stderr, "!!!! device memory allocation error (allocate B)\n");
        return EXIT_FAILURE;
    }

    if (cudaMalloc((void **)&d_C, n2 * sizeof(d_C[0])) != cudaSuccess)
    {
        fprintf(stderr, "!!!! device memory allocation error (allocate C)\n");
        return EXIT_FAILURE;
    }

    /* Initialize the device matrices with the host matrices */
    status = cublasSetVector(n2, sizeof(h_A[0]), h_A, 1, d_A, 1);

    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! device access error (write A)\n");
        return EXIT_FAILURE;
    }

    status = cublasSetVector(n2, sizeof(h_B[0]), h_B, 1, d_B, 1);

    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! device access error (write B)\n");
        return EXIT_FAILURE;
    }

    status = cublasSetVector(n2, sizeof(h_C[0]), h_C, 1, d_C, 1);

    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! device access error (write C)\n");
        return EXIT_FAILURE;
    }

    /* Performs operation using plain C code */
    simple_sgemm(N, alpha, h_A, h_B, beta, h_C);
    h_C_ref = h_C;

    /* Performs operation using cublas */
    status = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, d_A, N, d_B, N, &beta, d_C, N);

    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! kernel execution error.\n");
        return EXIT_FAILURE;
    }

    /* Allocate host memory for reading back the result from device memory */
    h_C = (float *)malloc(n2 * sizeof(h_C[0]));

    if (h_C == 0)
    {
        fprintf(stderr, "!!!! host memory allocation error (C)\n");
        return EXIT_FAILURE;
    }

    /* Read the result back */
    status = cublasGetVector(n2, sizeof(h_C[0]), d_C, 1, h_C, 1);

    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! device access error (read C)\n");
        return EXIT_FAILURE;
    }

    /* Check result against reference */
    error_norm = 0;
    ref_norm = 0;

    for (i = 0; i < n2; ++i)
    {
        diff = h_C_ref[i] - h_C[i];
        error_norm += diff * diff;
        ref_norm += h_C_ref[i] * h_C_ref[i];
    }

    error_norm = (float)sqrt((double)error_norm);
    ref_norm = (float)sqrt((double)ref_norm);

    if (fabs(ref_norm) < 1e-7)
    {
        fprintf(stderr, "!!!! reference norm is 0\n");
        return EXIT_FAILURE;
    }

    /* Memory clean up */
    free(h_A);
    free(h_B);
    free(h_C);
    free(h_C_ref);

    if (cudaFree(d_A) != cudaSuccess)
    {
        fprintf(stderr, "!!!! memory free error (A)\n");
        return EXIT_FAILURE;
    }

    if (cudaFree(d_B) != cudaSuccess)
    {
        fprintf(stderr, "!!!! memory free error (B)\n");
        return EXIT_FAILURE;
    }

    if (cudaFree(d_C) != cudaSuccess)
    {
        fprintf(stderr, "!!!! memory free error (C)\n");
        return EXIT_FAILURE;
    }

    /* Shutdown */
    status = cublasDestroy(handle);

    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! shutdown error (A)\n");
        return EXIT_FAILURE;
    }

    if (error_norm / ref_norm < 1e-6f)
    {
        printf("simpleCUBLAS test passed.\n");
        exit(EXIT_SUCCESS);
    }
    else
    {
        printf("simpleCUBLAS test failed.\n");
        exit(EXIT_FAILURE);
    }
}
Example No. 12
int main(int argc, char* argv[])
{
	// Process config (to be filled completely
	// later).
	config_t config;
	config.idevice = 0;
	config.nx = nx;
	config.ny = ny;
	config.step = 0;

	// Create shared memory region.
	int fd = shm_open("/shmem_mmap_cuda_shm",
		O_CREAT | O_RDWR, S_IRUSR | S_IWUSR);
	if (fd == -1)
	{
		fprintf(stderr, "Cannot open shared region, errno = %d\n", errno);
		return errno;
	}

	// Create first semaphore (set to 0 to create it initially locked).
	sem_t* sem1 = sem_open("/shmem_mmap_cuda_sem1",
		O_CREAT, S_IRWXU | S_IRWXG | S_IRWXO, 0);
	if (sem1 == SEM_FAILED)
	{
		fprintf(stderr, "Cannot open semaphore #1, errno = %d\n", errno);
		return errno;
	}

	// Create second semaphore (set to 0 to create it initially locked).
	sem_t* sem2 = sem_open("/shmem_mmap_cuda_sem2",
		O_CREAT, S_IRWXU | S_IRWXG | S_IRWXO, 0);
	if (sem2 == SEM_FAILED)
	{
		fprintf(stderr, "Cannot open semaphore #2, errno = %d\n", errno);
		return errno;
	}

	// Call fork to create another process.
	// Standard: "Memory mappings created in the parent
	// shall be retained in the child process."
	pid_t fork_status = fork();
	
	// From this point two processes are running the same code, if no errors.
	if (fork_status == -1)
	{
		fprintf(stderr, "Cannot fork process, errno = %d\n", errno);
		return errno;
	}

	// Get the process ID.
	int pid = (int)getpid();

	// By fork return value we can determine the process role:
	// master or child (worker).
	int master = fork_status ? 1 : 0, worker = !master;

	int ndevices = 0;
	cudaError_t cuda_status = cudaGetDeviceCount(&ndevices);
	if (cuda_status != cudaSuccess)
	{
		fprintf(stderr, "Cannot get the cuda device count by process %d, status = %d\n",
			pid, cuda_status);
		return cuda_status;
	}
	
	// Return if no cuda devices present.
	if (master)
		printf("%d CUDA device(s) found\n", ndevices);
	if (!ndevices) return 0;
	ndevices = 1;

	size_t np = nx * ny;
	size_t size = np * sizeof(float);

	float* inout;
	
	if (!master)
	{
		// Lock semaphore to finish shared region configuration on master.
		int sem_status = sem_wait(sem1);
		if (sem_status == -1)
		{
			fprintf(stderr, "Cannot wait on semaphore by process %d, errno = %d\n",
				pid, errno);
			return errno;
		}

		// Map the shared region into the address space of the current process.
		inout = (float*)mmap(0, size * (ndevices + 1),
			PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
		if (inout == MAP_FAILED)
		{
			fprintf(stderr, "Cannot map shared region to memory by process %d, errno = %d\n",
				pid, errno);
			return errno;
		}
	}
	else
	{
		config.idevice = ndevices;

		// Set shared region size.
		int ftrunk_status = ftruncate(fd, size * (ndevices + 1));
		if (ftrunk_status == -1)
		{
			fprintf(stderr, "Cannot truncate shared region, errno = %d\n", errno);
			return errno;
		}

		// Map the shared region into the address space of the current process.
		inout = (float*)mmap(0, size * (ndevices + 1),
			PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
		if (inout == MAP_FAILED)
		{
			fprintf(stderr, "Cannot map shared region to memory by process %d, errno = %d\n",
				pid, errno);
			return errno;
		}

		// Create input data. Let each device have an equal piece
		// of the single shared data array.
		float invdrandmax = 1.0 / RAND_MAX;
		for (size_t i = 0; i < np; i++)
			inout[i] = rand() * invdrandmax;
		for (int i = 0; i < ndevices; i++)
			memcpy(inout + np * (i + 1), inout, np * sizeof(float));

		// Sync changed content with shared region.
		int msync_status = msync(inout, size * (ndevices + 1), MS_SYNC);
		if (msync_status == -1)
		{
			fprintf(stderr, "Cannot sync shared memory %p, errno = %d\n",
				inout, errno);
			return errno;
		}

		// Unlock semaphore to let other processes to move forward.
		int sem_status = sem_post(sem1);
		if (sem_status == -1)
		{
			fprintf(stderr, "Cannot post on semaphore by process %d, errno = %d\n",
				pid, errno);
			return errno;
		}
	}

	config.inout_cpu = inout + config.idevice * np;

	// Let workers use CUDA devices, and the master use the CPU.
	// Create device buffers.
	if (worker)
	{
		// Create device arrays for input and output data.
		cuda_status = cudaMalloc((void**)&config.in_dev, size);
		if (cuda_status != cudaSuccess)
		{
			fprintf(stderr, "Cannot allocate CUDA input buffer by process %d, status = %d\n",
				pid, cuda_status);
			return cuda_status;
		}
		cuda_status = cudaMalloc((void**)&config.out_dev, size);
		if (cuda_status != cudaSuccess)
		{
			fprintf(stderr, "Cannot allocate CUDA output buffer by process %d, status = %d\n",
				pid, cuda_status);
			return cuda_status;
		}
	}
	else
	{
		// Create device arrays for input and output data.
		config.in_dev = (float*)malloc(size);
		config.out_dev = (float*)malloc(size);
	}

	printf("Device %d initialized py process %d\n", config.idevice, pid);

	// Perform some "iterations" on data arrays, assigned to devices,
	// and shift input data array after each iteration.
	for (int i = 0; i < nticks; i++)
	{
		int status;
		if (master)
		{
			// Copy input data to device buffer.
			memcpy(config.in_dev, config.inout_cpu, size);

			status = pattern2d_cpu(1, config.nx, 1, 1, config.ny, 1,
				config.in_dev, config.out_dev, config.idevice);
			if (status)
			{
				fprintf(stderr, "Cannot execute pattern 2d by process %d, status = %d\n",
					pid, status);
				return status;
			}

			// Copy output data from device buffer.
			memcpy(config.inout_cpu, config.out_dev, size);

			// Sync with changed content in shared region.
			int msync_status = msync(inout, size * (ndevices + 1), MS_SYNC);
			if (msync_status == -1)
			{
				fprintf(stderr, "Cannot sync shared memory %p, errno = %d\n",
					inout, errno);
				return errno;
			}

			int sem_status = sem_post(sem1);
			if (sem_status == -1)
			{
				fprintf(stderr, "Cannot post on semaphore #1 by process %d, errno = %d\n",
					pid, errno);
				return errno;
			}	

			sem_status = sem_wait(sem2);
			if (sem_status == -1)
			{
				fprintf(stderr, "Cannot post on semaphore #2 by process %d, errno = %d\n",
					pid, errno);
				return errno;
			}	
		}
		else
		{
			// Copy input data to device buffer.
			cuda_status = cudaMemcpy(config.in_dev, config.inout_cpu, size,
				cudaMemcpyHostToDevice);
			if (cuda_status != cudaSuccess)
			{
				fprintf(stderr, "Cannot copy input data to CUDA buffer by process %d, status = %d\n",
					pid, cuda_status);
				return cuda_status;
			}

			status = pattern2d_gpu(1, config.nx, 1, 1, config.ny, 1,
				config.in_dev, config.out_dev, config.idevice);
			if (status)
			{
				fprintf(stderr, "Cannot execute pattern 2d by process %d, status = %d\n",
					pid, status);
				return status;
			}

			// Copy output data from device buffer.
			cuda_status = cudaMemcpy(config.inout_cpu, config.out_dev, size,
				cudaMemcpyDeviceToHost);
			if (cuda_status != cudaSuccess)
			{
				fprintf(stderr, "Cannot copy output data from CUDA buffer by process %d, status = %d\n",
					pid, cuda_status);
				return cuda_status;
			}

			// Sync with changed content in shared region.
			int msync_status = msync(inout, size * (ndevices + 1), MS_SYNC);
			if (msync_status == -1)
			{
				fprintf(stderr, "Cannot sync shared memory %p, errno = %d\n",
					inout, errno);
				return errno;
			}
			
			int sem_status = sem_wait(sem1);
			if (sem_status == -1)
			{
				fprintf(stderr, "Cannot wait on semaphore #1 by process %d, errno = %d\n",
					pid, errno);
				return errno;
			}			

			sem_status = sem_post(sem2);
			if (sem_status == -1)
			{
				fprintf(stderr, "Cannot post on semaphore #2 by process %d, errno = %d\n",
					pid, errno);
				return errno;
			}
		}

		// At this point two processes are synchronized.

		config.step++;
		
		// Reassign processes' input data segments to show some
		// possible manipulation of shared memory.
		// Here we perform cyclic shift of data pointers.
		config.idevice++;
		config.idevice %= ndevices + 1;
		config.inout_cpu = inout +  config.idevice * np;
	}

	// Release device buffers.
	if (worker)
	{
		cuda_status = cudaFree(config.in_dev);
		if (cuda_status != cudaSuccess)
		{
			fprintf(stderr, "Cannot release input buffer by process %d, status = %d\n",
				pid, cuda_status);
			return cuda_status;
		}
		cuda_status = cudaFree(config.out_dev);
		if (cuda_status != cudaSuccess)
		{
			fprintf(stderr, "Cannot release output buffer by process %d, status = %d\n",
				pid, cuda_status);
			return cuda_status;
		}
	}
	else
	{
		free(config.in_dev);
		free(config.out_dev);
	}
	
	printf("Device %d deinitialized py process %d\n", config.idevice, pid);

	// On master process perform results check:
	// compare each GPU result to CPU result.
	if (master)
	{
		float* control = inout + np * ndevices;
		for (int idevice = 0; idevice < ndevices; idevice++)
		{
			// Find the maximum abs difference.
			int maxi = 0, maxj = 0;
			float maxdiff = fabs(control[0] - (inout + idevice * np)[0]);
			for (int j = 0; j < ny; j++)
			{
				for (int i = 0; i < nx; i++)
				{
					float diff = fabs(
						control[i + j * nx] -
						(inout + idevice * np)[i + j * nx]);
					if (diff > maxdiff)
					{
						maxdiff = diff;
						maxi = i; maxj = j;
					}
				}
			}
			printf("Device %d result abs max diff = %f @ (%d,%d)\n",
				idevice, maxdiff, maxi, maxj);
		}
	}
	
	// Unlink semaphore.
	if (master)
	{
		int sem_status = sem_unlink("/shmem_mmap_cuda_sem1");
		if (sem_status == -1)
		{
			fprintf(stderr, "Cannot unlink semaphore #1 by process %d, errno = %d\n",
				pid, errno);
			return errno;
		}
	}
	
	// Close semaphore.
	int sem_status = sem_close(sem1);
	if (sem_status == -1)
	{
		fprintf(stderr, "Cannot close semaphore #1 by process %d, errno = %d\n",
			pid, errno);
		return errno;
	}

	// Unlink semaphore.
	if (master)
	{
		int sem_status = sem_unlink("/shmem_mmap_cuda_sem2");
		if (sem_status == -1)
		{
			fprintf(stderr, "Cannot unlink semaphore #2 by process %d, errno = %d\n",
				pid, errno);
			return errno;
		}
	}
	
	// Close semaphore.
	sem_status = sem_close(sem2);
	if (sem_status == -1)
	{
		fprintf(stderr, "Cannot close semaphore #2 by process %d, errno = %d\n",
			pid, errno);
		return errno;
	}

	// Unmap shared region.
	close(fd);
	int munmap_status = munmap(inout, size * (ndevices + 1));
	if (munmap_status == -1)
	{
		fprintf(stderr, "Cannot unmap shared region by process %d, errno = %d\n",
			pid, errno);
		return errno;
	}
	
	// Unlink shared region.
	if (master)
	{
		int unlink_status = shm_unlink("/shmem_mmap_cuda_shm");
		if (unlink_status == -1)
		{
			fprintf(stderr, "Cannot unlink shared region by process %d, errno = %d\n",
				pid, errno);
			return errno;
		}
	}
	
	return 0;
}
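config_t, nx, ny, nticks and the pattern2d_* functions are defined elsewhere in this project. A minimal sketch of the record layout the code above appears to assume (field names taken from their uses; everything else is a guess):

/* Hypothetical sketch of the configuration record used above. */
typedef struct
{
	int idevice;       /* device (or CPU slot) this process works on */
	int nx, ny;        /* 2D grid dimensions */
	int step;          /* current iteration number */
	float* inout_cpu;  /* this process' slice of the shared host array */
	float* in_dev;     /* input buffer: device memory on workers, host memory on master */
	float* out_dev;    /* output buffer: device memory on workers, host memory on master */
} config_t;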
Example No. 13
OsdCudaTable::~OsdCudaTable() {

    if (_devicePtr) cudaFree(_devicePtr);
}
Example No. 14
int main(int argc, char *argv[])
{
    // needed to work correctly with piped benchmarkrunner
    setlinebuf(stdout);
    setlinebuf(stdin);

    int n_indices = 1;
    int n_dimensions = 1;
    char inBuf[200]; // ridiculously large input buffer.
    
    bool isFirst = true;

  do {

    // Allocate memory for the arrays
    int *h_indices = 0;
    double        *h_outputGPU  = 0;

    try
    {
        h_indices = new int [n_indices * n_dimensions];
        h_outputGPU  = new double [n_indices * n_dimensions];
    }
    catch (std::exception& e)
    {
        std::cerr << "Caught exception: " << e.what() << std::endl;
        std::cerr << "Unable to allocate CPU memory (try running with fewer vectors/dimensions)" << std::endl;
        return -1;
    }

    int *d_indices;
    double        *d_output;

    try
    {
        cudaError_t cudaResult;
        cudaResult = cudaMalloc((void **)&d_indices, n_dimensions * n_indices * sizeof(int));

        if (cudaResult != cudaSuccess)
        {
            throw std::runtime_error(cudaGetErrorString(cudaResult));
        }
    }
    catch (std::runtime_error& e)
    {
        std::cerr << "Caught exception: " << e.what() << std::endl;
        std::cerr << "Unable to allocate GPU memory (try running with fewer vectors/dimensions)" << std::endl;
        return -1;
    }

    // Initialize the indices (done on the host)
    for(int i = 0; i < n_indices; i++) {
      h_indices[i] = i;
    }

    // Copy the indices to the device
    cudaMemcpy(d_indices, h_indices, n_dimensions * n_indices * sizeof(int), cudaMemcpyHostToDevice);
    cudaDeviceSynchronize();

    // Execute the QRNG on the device.
    // d_output is passed by address, so sobol_nikola_unsimplified presumably
    // allocates the device output buffer itself (it is cudaFree'd below).
    int n_vec;
    sobol_nikola_unsimplified(n_indices, d_indices, n_indices, &d_output, &n_vec);

    cudaDeviceSynchronize();

    cudaMemcpy(h_outputGPU, d_output, n_indices * n_dimensions * sizeof(double), cudaMemcpyDeviceToHost);

    // Cleanup and terminate
    delete[] h_indices;    // allocated with new[], so release with delete[]
    cudaFree(d_indices);
    cudaFree(d_output);

    if(!isFirst) {
      printf("RESULT ");

      for(int i = 0; i < std::min(n_indices,10); i++)
        printf("%f ", h_outputGPU[i]);

      printf("\n");
    }
    else {
      printf("OK\n");
      isFirst = false;
    }

    delete[] h_outputGPU;

      fgets(inBuf, 200, stdin);

      if (sscanf(inBuf, "%d", &n_indices) == 0)
      {
        // if input is not a number, it has to be "EXIT"
        if (strncmp("EXIT",inBuf,4)==0)
        {
          printf("OK\n");
          break;
        }
        else
        {
          printf("ERROR. Bad input: %s\n", inBuf);
          break;
        }
      }

    } while (true);

    cudaDeviceReset();
    return -1;
}
Example No. 15
PhysicsProcessor::~PhysicsProcessor(void)
{
	gpuErrchk(cudaFree(d_V));
}
Example No. 16
void ControlCubeCache::_reSizeCache()
{
	_nLevels = _nextnLevels;
	_levelCube = _nextLevelCube;
	_offset	= _nextOffset;
	_nextnLevels = 0;
	_nextLevelCube = 0;

	_dimCube = exp2(_nLevels - _levelCube) + 2 * CUBE_INC;

	_sizeElement = pow(_dimCube, 3); 

	int dimV = exp2(_nLevels);
	_minValue = coordinateToIndex(vmml::vector<3,int>(0,0,0), _levelCube, _nLevels);
	_maxValue = coordinateToIndex(vmml::vector<3,int>(dimV-1,dimV-1,dimV-1), _levelCube, _nLevels);

	int dc = exp2(_nLevels - _levelCube);
	vmml::vector<3,int> mn = _cpuCache->getMinCoord();
	vmml::vector<3,int> mx = _cpuCache->getMaxCoord();
	_maxC = mx - mn;
	if ((mx.x() - mn.x()) % dc != 0)
		_maxC[0] += dc;
	if ((mx.y() - mn.y()) % dc != 0)
		_maxC[1] += dc;
	if ((mx.z() - mn.z()) % dc != 0)
		_maxC[2] += dc;

	if (cudaSuccess != cudaSetDevice(_device))
	{
		std::cerr<<"Control Cube Cache, error setting device: "<<cudaGetErrorString(cudaGetLastError())<<std::endl;
		throw;
	}
	if (_memory != 0)
		if (cudaSuccess != cudaFree((void*)_memory))
		{                                                                                               
			std::cerr<<"Control Cube Cache, error resizing cache: "<<cudaGetErrorString(cudaGetLastError())<<std::endl;
			throw;
		}
	size_t total = 0;
	size_t free = 0;

	if (cudaSuccess != cudaMemGetInfo(&free, &total))
	{
		std::cerr<<"Control Cube Cache, error resizing cache: "<<cudaGetErrorString(cudaGetLastError())<<std::endl;
		throw;
	}

	float memorySize = (0.80f*free); // Get 80% of free memory

	_maxNumCubes = memorySize/ (_sizeElement*sizeof(float));
	if (_maxNumCubes == 0)
	{
		std::cerr<<"Control Cube Cache: Memory aviable is not enough "<<memorySize/1024/1024<<" MB"<<std::endl;
		throw;
	}

	if (cudaSuccess != cudaMalloc((void**)&_memory, _maxNumCubes*_sizeElement*sizeof(float)))
	{
		std::cerr<<"Control Cube Cache, error resizing cache: "<<cudaGetErrorString(cudaGetLastError())<<std::endl;
		throw;
	}

	_freeSlots = _maxNumCubes;

	ControlElementCache::_reSizeCache();
}
Example No. 17
File: mpla.cpp Project: zaspel/MPLA
void mpla_redistribute_vector_for_dgesv(struct mpla_vector* b_redist, struct mpla_vector* b, struct mpla_matrix* A, struct mpla_instance* instance)
{
	// attention: this code does no correctness check for the input data



//	b_redist->vec_row_count = b->vec_row_count;
//
//	// allocating memory for process-wise vector information
//	vector->proc_row_count = new int*[instance->proc_rows];
//	vector->proc_row_offset = new int*[instance->proc_rows];
//	for (int i=0; i<instance->proc_rows; i++)
//	{
//		b_redist->proc_row_count[i] = new int[instance->proc_cols];
//		b_redist->proc_row_offset[i] = new int[instance->proc_cols];
//	}
//
//	// set sizes of 
//	for (int i=0; i<instance->proc_rows; i++)
//	{
//		for (int j=0; j<instance->proc_cols; j++)
//		{
//			b_redist->proc_row_count[i][j] = A->proc_col_count[i][j];
//			b_redist->proc_row_offset[i][j] = A->proc_col_offset[i][j];
//		}
//	}
//
//	// retrieving local data for current process
//	b_redist->cur_proc_row_count = A->cur_proc_col_count;
//	b_redist->cur_proc_row_offset = A->cur_proc_col_offset;
//
//	// allocating temporary vector storage
//	cudaMalloc((void*)&(b_redist->data), sizeof(double)*b_redist->cur_proc_row_count);

	// WARNING: The following code is not efficient for a strong parallelization !!!!!


	// create sub-communicator for each process column
	int remain_dims[2];
	remain_dims[0]=1;
	remain_dims[1]=0;
	MPI_Comm column_comm;
	MPI_Cart_sub(instance->comm, remain_dims, &column_comm);
	int column_rank;
	MPI_Comm_rank(column_comm, &column_rank);
	
	// columnwise creation of the full vector
	double* full_vector;
	int* recvcounts = new int[instance->proc_rows];
	int* displs = new int[instance->proc_rows];
	for (int i=0; i<instance->proc_rows; i++)
	{
		recvcounts[i] = b->proc_row_count[i][instance->cur_proc_col];
		displs[i] = b->proc_row_offset[i][instance->cur_proc_col];
	}
	cudaMalloc((void**)&full_vector, sizeof(double)*b->vec_row_count);
	cudaThreadSynchronize();
	checkCUDAError("cudaMalloc");
	MPI_Allgatherv(b->data, b->cur_proc_row_count, MPI_DOUBLE, full_vector, recvcounts, displs, MPI_DOUBLE, column_comm);

	// extract column-wise local part of full vector
	cudaMemcpy(b_redist->data, &(full_vector[b_redist->cur_proc_row_offset]), sizeof(double)*b_redist->cur_proc_row_count, cudaMemcpyDeviceToDevice);

	// memory cleanup
	cudaFree(full_vector);

	MPI_Comm_free(&column_comm);
}
Example No. 18
RealKernel::~RealKernel()
{	delete[] data;
	#ifdef GPU_ENABLED
	cudaFree(dataGpu);
	#endif
}
Example No. 19
void run_2D_GLOBAL_MEMORY()
{
	int arrayWidth	= 4;
	int arrayHeight	= 4;

	bool SEQ = true; 

	/* Host allocation */
	float* inArr_1_H = (float*) malloc(arrayWidth * arrayHeight * sizeof(float));
	float* inArr_2_H = (float*) malloc(arrayWidth * arrayHeight * sizeof(float));
	float* outArr_H	= (float*) malloc(arrayWidth * arrayHeight * sizeof(float));

	/* Fill arrays */
	int index = 0; 
	if (SEQ)
	{
		int ctr = 0; 
		for(int j = 0; j < (arrayHeight); j++)
		{
			for(int i = 0; i < (arrayWidth); i++)
			{
				index = ((j * arrayWidth) + i);

				inArr_1_H[index] = (float) ctr++;  
				inArr_2_H[index] = (float) ctr++; 
				outArr_H[index] = (float) 0;
			}
		}
	}
	else 
	{
		for(int j = 0; j < (arrayHeight); j++)
		{
			for(int i = 0; i < (arrayWidth); i++)
			{
				index = ((j * arrayWidth) + i);

				inArr_1_H[index] =  (float)rand()/(float)RAND_MAX;
				inArr_2_H[index] = (float)rand()/(float)RAND_MAX;
				outArr_H[index] = 0; 
			}
		}
	}
	    
	/* Print host arrays */ 
	printf("inArr_1_H \n");
	print_2D_Array(inArr_1_H, arrayWidth, arrayHeight);
	printf("inArr_2_H \n");
	print_2D_Array(inArr_2_H, arrayWidth, arrayHeight);

	/* Device allocation + <__pitch> */
	float *inArr_1_D, *inArr_2_D, *outArr_D;
	size_t __pitch;
	cudaMallocPitch((void**)&inArr_1_D, &__pitch, arrayHeight * sizeof(float), arrayWidth);
	cudaMallocPitch((void**)&inArr_2_D, &__pitch, arrayHeight * sizeof(float), arrayWidth);
	cudaMallocPitch((void**)&outArr_D, &__pitch, arrayHeight * sizeof(float), arrayWidth);

	/* Print __pitch */
	printf("__pitch %d \n", (__pitch/sizeof(float))); 

	/* Uploading data */ 
	cudaMemcpy2D(inArr_1_D, __pitch, inArr_1_H, arrayHeight * sizeof(float), arrayHeight * sizeof(float), arrayWidth, cudaMemcpyHostToDevice);
	cudaMemcpy2D(inArr_2_D, __pitch, inArr_2_H, arrayHeight * sizeof(float), arrayHeight * sizeof(float), arrayWidth, cudaMemcpyHostToDevice);
 
	/* Gridding */
	dim3 __numBlocks(1,1,1);
	dim3 __numThreadsPerBlock(BLOCK_SIZE, BLOCK_SIZE, 1);
	__numBlocks.x = ((arrayWidth / BLOCK_SIZE) + (((arrayWidth) % BLOCK_SIZE) == 0 ? 0:1));
	__numBlocks.y = ((arrayHeight / BLOCK_SIZE) + (((arrayHeight) % BLOCK_SIZE) == 0 ? 0:1));

	/* Kernel invocation */
	add_2D_Array(inArr_1_D, inArr_2_D, outArr_D, arrayWidth, arrayHeight, __pitch, __numBlocks, __numThreadsPerBlock); 

	/* Synchronization */
	cudaThreadSynchronize();

	/* Download result */
	cudaMemcpy2D(outArr_H, arrayHeight * sizeof(float), outArr_D, __pitch, arrayHeight * sizeof(float), arrayWidth, cudaMemcpyDeviceToHost);

	/* Free device arrays */
	cudaFree(inArr_1_D);
	cudaFree(inArr_2_D);
	cudaFree(outArr_D);

	/* Display results */
	printf("outArr \n");
	print_2D_Array(outArr_H, arrayWidth, arrayHeight);
}
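add_2D_Array and print_2D_Array are defined elsewhere. A sketch of how a pitched, element-wise addition kernel typically indexes arrays allocated with cudaMallocPitch as above (a guess at the missing kernel, not the original implementation):

/* Hypothetical kernel: out[j][i] = in1[j][i] + in2[j][i] on pitched 2D arrays. */
__global__ void add_2D_Kernel(const float* in1, const float* in2, float* out,
                              int width, int height, size_t pitch)
{
	int i = blockIdx.x * blockDim.x + threadIdx.x;   /* column */
	int j = blockIdx.y * blockDim.y + threadIdx.y;   /* row    */
	if (i < width && j < height)
	{
		/* pitch is in bytes, so step rows through a char* before casting back. */
		const float* row1 = (const float*)((const char*)in1 + j * pitch);
		const float* row2 = (const float*)((const char*)in2 + j * pitch);
		float*       rowOut = (float*)((char*)out + j * pitch);
		rowOut[i] = row1[i] + row2[i];
	}
}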
Example No. 20
    void gpu_data::
    set_size(
        size_t new_size
    )
    {
        if (new_size == 0)
        {
            if (device_in_use)
            {
                // Wait for any possible CUDA kernels that might be using our memory block to
                // complete before we free the memory.
                synchronize_stream(0);
                device_in_use = false;
            }
            wait_for_transfer_to_finish();
            data_size = 0;
            host_current = true;
            device_current = true;
            device_in_use = false;
            data_host.reset();
            data_device.reset();
        }
        else if (new_size != data_size)
        {
            if (device_in_use)
            {
                // Wait for any possible CUDA kernels that might be using our memory block to
                // complete before we free the memory.
                synchronize_stream(0);
                device_in_use = false;
            }
            wait_for_transfer_to_finish();
            data_size = new_size;
            host_current = true;
            device_current = true;
            device_in_use = false;

            try
            {
                CHECK_CUDA(cudaGetDevice(&the_device_id));

                // free memory blocks before we allocate new ones.
                data_host.reset();
                data_device.reset();

                void* data;
                CHECK_CUDA(cudaMallocHost(&data, new_size*sizeof(float)));
                // Note that we don't throw exceptions since the free calls are invariably
                // called in destructors.  They also shouldn't fail anyway unless someone
                // is resetting the GPU card in the middle of their program.
                data_host.reset((float*)data, [](float* ptr){
                    auto err = cudaFreeHost(ptr);
                    if(err!=cudaSuccess)
                        std::cerr << "cudaFreeHost() failed. Reason: " << cudaGetErrorString(err) << std::endl;
                });

                CHECK_CUDA(cudaMalloc(&data, new_size*sizeof(float)));
                data_device.reset((float*)data, [](float* ptr){
                    auto err = cudaFree(ptr);
                    if(err!=cudaSuccess)
                        std::cerr << "cudaFree() failed. Reason: " << cudaGetErrorString(err) << std::endl;
                });

                if (!cuda_stream)
                {
                    cudaStream_t cstream;
                    CHECK_CUDA(cudaStreamCreateWithFlags(&cstream, cudaStreamNonBlocking));
                    cuda_stream.reset(cstream, [](void* ptr){
                        auto err = cudaStreamDestroy((cudaStream_t)ptr);
                        if(err!=cudaSuccess)
                            std::cerr << "cudaStreamDestroy() failed. Reason: " << cudaGetErrorString(err) << std::endl;
                    });
                }

            }
            catch(...)
            {
                set_size(0);
                throw;
            }
        }
    }
Example No. 21
void TxVectorOptimizationDataCU::freeResources() {
  if (devicePtr) {
    cudaError_t err = cudaFree(devicePtr);
    CHKCUDAERR(err);
  }
}
Example No. 22
int main(int argc, char **argv)
{
    // Start logs
    printf("%s Starting...\n\n", argv[0]);

    unsigned int useDoublePrecision;

    char *precisionChoice;
    getCmdLineArgumentString(argc, (const char **)argv, "type", &precisionChoice);

    if (precisionChoice == NULL)
    {
        useDoublePrecision = 0;
    }
    else
    {
        if (!STRCASECMP(precisionChoice, "double"))
        {
            useDoublePrecision = 1;
        }
        else
        {
            useDoublePrecision = 0;
        }
    }

    unsigned int tableCPU[QRNG_DIMENSIONS][QRNG_RESOLUTION];

    float *h_OutputGPU, *d_Output;

    int dim, pos;
    double delta, ref, sumDelta, sumRef, L1norm, gpuTime;

    StopWatchInterface *hTimer = NULL;

    if (sizeof(INT64) != 8)
    {
        printf("sizeof(INT64) != 8\n");
        return 0;
    }

    // use command-line specified CUDA device, otherwise use device with highest Gflops/s
    int dev = findCudaDevice(argc, (const char **)argv);

    sdkCreateTimer(&hTimer);

    int deviceIndex;
    checkCudaErrors(cudaGetDevice(&deviceIndex));
    cudaDeviceProp deviceProp;
    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, deviceIndex));
    int version = deviceProp.major * 10 + deviceProp.minor;

    if (useDoublePrecision && version < 13)
    {
        printf("Double precision not supported.\n");

        // cudaDeviceReset causes the driver to clean up all state. While
        // not mandatory in normal operation, it is good practice.  It is also
        // needed to ensure correct operation when the application is being
        // profiled. Calling cudaDeviceReset causes all profile data to be
        // flushed before the application exits
        cudaDeviceReset();
        return 0;
    }

    printf("Allocating GPU memory...\n");
    checkCudaErrors(cudaMalloc((void **)&d_Output, QRNG_DIMENSIONS * N * sizeof(float)));

    printf("Allocating CPU memory...\n");
    h_OutputGPU = (float *)malloc(QRNG_DIMENSIONS * N * sizeof(float));

    printf("Initializing QRNG tables...\n\n");
    initQuasirandomGenerator(tableCPU);

    if (useDoublePrecision)
    {
        initTable_SM13(tableCPU);
    }
    else
    {
        initTable_SM10(tableCPU);
    }

    printf("Testing QRNG...\n\n");
    checkCudaErrors(cudaMemset(d_Output, 0, QRNG_DIMENSIONS * N * sizeof(float)));
    int numIterations = 20;

    for (int i = -1; i < numIterations; i++)
    {
        if (i == 0)
        {
            checkCudaErrors(cudaDeviceSynchronize());
            sdkResetTimer(&hTimer);
            sdkStartTimer(&hTimer);
        }

        if (useDoublePrecision)
        {
            quasirandomGenerator_SM13(d_Output, 0, N);
        }
        else
        {
            quasirandomGenerator_SM10(d_Output, 0, N);
        }
    }

    checkCudaErrors(cudaDeviceSynchronize());
    sdkStopTimer(&hTimer);
    gpuTime = sdkGetTimerValue(&hTimer)/(double)numIterations*1e-3;
    printf("quasirandomGenerator, Throughput = %.4f GNumbers/s, Time = %.5f s, Size = %u Numbers, NumDevsUsed = %u, Workgroup = %u\n",
           (double)QRNG_DIMENSIONS * (double)N * 1.0E-9 / gpuTime, gpuTime, QRNG_DIMENSIONS*N, 1, 128*QRNG_DIMENSIONS);

    printf("\nReading GPU results...\n");
    checkCudaErrors(cudaMemcpy(h_OutputGPU, d_Output, QRNG_DIMENSIONS * N * sizeof(float), cudaMemcpyDeviceToHost));

    printf("Comparing to the CPU results...\n\n");
    sumDelta = 0;
    sumRef = 0;

    for (dim = 0; dim < QRNG_DIMENSIONS; dim++)
        for (pos = 0; pos < N; pos++)
        {
            ref       = getQuasirandomValue63(pos, dim);
            delta     = (double)h_OutputGPU[dim * N + pos] - ref;
            sumDelta += fabs(delta);
            sumRef   += fabs(ref);
        }

    printf("L1 norm: %E\n", sumDelta / sumRef);

    printf("\nTesting inverseCNDgpu()...\n\n");
    checkCudaErrors(cudaMemset(d_Output, 0, QRNG_DIMENSIONS * N * sizeof(float)));

    for (int i = -1; i < numIterations; i++)
    {
        if (i == 0)
        {
            checkCudaErrors(cudaDeviceSynchronize());
            sdkResetTimer(&hTimer);
            sdkStartTimer(&hTimer);
        }

        if (useDoublePrecision)
        {
            inverseCND_SM13(d_Output, NULL, QRNG_DIMENSIONS * N);
        }
        else
        {
            inverseCND_SM10(d_Output, NULL, QRNG_DIMENSIONS * N);
        }
    }

    checkCudaErrors(cudaDeviceSynchronize());
    sdkStopTimer(&hTimer);
    gpuTime = sdkGetTimerValue(&hTimer)/(double)numIterations*1e-3;
    printf("quasirandomGenerator-inverse, Throughput = %.4f GNumbers/s, Time = %.5f s, Size = %u Numbers, NumDevsUsed = %u, Workgroup = %u\n",
           (double)QRNG_DIMENSIONS * (double)N * 1E-9 / gpuTime, gpuTime, QRNG_DIMENSIONS*N, 1, 128);

    printf("Reading GPU results...\n");
    checkCudaErrors(cudaMemcpy(h_OutputGPU, d_Output, QRNG_DIMENSIONS * N * sizeof(float), cudaMemcpyDeviceToHost));

    printf("\nComparing to the CPU results...\n");
    sumDelta = 0;
    sumRef = 0;
    unsigned int distance = ((unsigned int)-1) / (QRNG_DIMENSIONS * N + 1);

    for (pos = 0; pos < QRNG_DIMENSIONS * N; pos++)
    {
        unsigned int d = (pos + 1) * distance;
        ref       = MoroInvCNDcpu(d);
        delta     = (double)h_OutputGPU[pos] - ref;
        sumDelta += fabs(delta);
        sumRef   += fabs(ref);
    }

    printf("L1 norm: %E\n\n", L1norm = sumDelta / sumRef);

    printf("Shutting down...\n");
    sdkDeleteTimer(&hTimer);
    free(h_OutputGPU);
    checkCudaErrors(cudaFree(d_Output));

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice.  It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
    cudaDeviceReset();
    exit(L1norm < 1e-6 ? EXIT_SUCCESS : EXIT_FAILURE);
}
Exemplo n.º 23
0
// Main ------------------------------------------------------------------------------------------
int main(int argc, char **argv) {

	const Params p(argc, argv);
	CUDASetup setcuda(p.device);
	Timer timer;
	cudaError_t cudaStatus;
	int it_cpu = 0;
	int it_gpu = 0;
	int err = 0;
#ifdef LOGS
	set_iter_interval_print(10);
	char test_info[500];
	snprintf(test_info, 500, "-i %d -g %d -t %d -f %s -l %d\n",p.n_gpu_threads, p.n_gpu_blocks,p.n_threads, p.file_name,p.switching_limit);
	start_log_file("cudaSingleSourceShortestPath", test_info);
	//printf("With LOG\n");
#endif

	// Allocate
	int n_nodes, n_edges;
//	int n_nodes_o;
	read_input_size(n_nodes, n_edges, p);
	timer.start("Allocation");
	Node * h_nodes = (Node *) malloc(sizeof(Node) * n_nodes);

	//*************************** Allocating memory for the gold *************************************
	Gold * gold = (Gold *) malloc(sizeof(Gold) * n_nodes);
	if (p.mode == 1) {
		// **********************  Reading the gold *********************************
		read_gold(gold, p);
		// **********************************************************************
	}

	//***********************************************************************************************
	Node * d_nodes;
	cudaStatus = cudaMalloc((void**) &d_nodes, sizeof(Node) * n_nodes);
	Edge * h_edges = (Edge *) malloc(sizeof(Edge) * n_edges);
	Edge * d_edges;
	cudaStatus = cudaMalloc((void**) &d_edges, sizeof(Edge) * n_edges);
	std::atomic_int *h_color = (std::atomic_int *) malloc(
			sizeof(std::atomic_int) * n_nodes);
	int * d_color;
	cudaStatus = cudaMalloc((void**) &d_color, sizeof(int) * n_nodes);
	std::atomic_int *h_cost = (std::atomic_int *) malloc(
			sizeof(std::atomic_int) * n_nodes);
	int * d_cost;
	cudaStatus = cudaMalloc((void**) &d_cost, sizeof(int) * n_nodes);
	int * h_q1 = (int *) malloc(n_nodes * sizeof(int));
	int * d_q1;
	cudaStatus = cudaMalloc((void**) &d_q1, sizeof(int) * n_nodes);
	int * h_q2 = (int *) malloc(n_nodes * sizeof(int));
	int * d_q2;
	cudaStatus = cudaMalloc((void**) &d_q2, sizeof(int) * n_nodes);
	std::atomic_int h_head[1];
	int * d_head;
	cudaStatus = cudaMalloc((void**) &d_head, sizeof(int));
	std::atomic_int h_tail[1];
	int * d_tail;
	cudaStatus = cudaMalloc((void**) &d_tail, sizeof(int));
	std::atomic_int h_threads_end[1];
	int * d_threads_end;
	cudaStatus = cudaMalloc((void**) &d_threads_end, sizeof(int));
	std::atomic_int h_threads_run[1];
	int * d_threads_run;
	cudaStatus = cudaMalloc((void**) &d_threads_run, sizeof(int));
	int h_num_t[1];
	int * d_num_t;
	cudaStatus = cudaMalloc((void**) &d_num_t, sizeof(int));
	int h_overflow[1];
	int * d_overflow;
	cudaStatus = cudaMalloc((void**) &d_overflow, sizeof(int));
	std::atomic_int h_gray_shade[1];
	int * d_gray_shade;
	cudaStatus = cudaMalloc((void**) &d_gray_shade, sizeof(int));
	std::atomic_int h_iter[1];
	int * d_iter;
	cudaStatus = cudaMalloc((void**) &d_iter, sizeof(int));
	cudaDeviceSynchronize();
	CUDA_ERR();
	ALLOC_ERR(h_nodes, h_edges, h_color, h_cost, h_q1, h_q2);
	timer.stop("Allocation");

	// Initialize
	timer.start("Initialization");
	const int max_gpu_threads = setcuda.max_gpu_threads();
	int source;
	read_input(source, h_nodes, h_edges, p);

	for (int i = 0; i < n_nodes; i++) {
		h_cost[i].store(INF);
	}
	h_cost[source].store(0);
	for (int i = 0; i < n_nodes; i++) {
		h_color[i].store(WHITE);
	}
	h_tail[0].store(0);
	h_head[0].store(0);
	h_threads_end[0].store(0);
	h_threads_run[0].store(0);
	h_q1[0] = source;
	h_iter[0].store(0);
	h_overflow[0] = 0;
	h_gray_shade[0].store(GRAY0);
	timer.stop("Initialization");
	//timer.print("Initialization", 1);

	// Copy to device
	timer.start("Copy To Device");
	cudaStatus = cudaMemcpy(d_nodes, h_nodes, sizeof(Node) * n_nodes,
			cudaMemcpyHostToDevice);
	cudaStatus = cudaMemcpy(d_edges, h_edges, sizeof(Edge) * n_edges,
			cudaMemcpyHostToDevice);
	cudaDeviceSynchronize();
	CUDA_ERR();
	timer.stop("Copy To Device");

	for (int rep = 0; rep < p.n_reps; rep++) {

		// Reset
		for (int i = 0; i < n_nodes; i++) {
			h_cost[i].store(INF);
		}
		h_cost[source].store(0);
		for (int i = 0; i < n_nodes; i++) {
			h_color[i].store(WHITE);
		}

		it_cpu = 0;
		it_gpu = 0;

		h_tail[0].store(0);
		h_head[0].store(0);
		h_threads_end[0].store(0);
		h_threads_run[0].store(0);
		h_q1[0] = source;
		h_iter[0].store(0);
		h_overflow[0] = 0;
		h_gray_shade[0].store(GRAY0);

		// if(rep >= p.n_warmup)
		timer.start("Kernel");

#ifdef LOGS
		start_iteration();
#endif

		// Run first iteration in master CPU thread
		h_num_t[0] = 1;
		int pid;
		int index_i, index_o;
		for (index_i = 0; index_i < h_num_t[0]; index_i++) {
			pid = h_q1[index_i];
			h_color[pid].store(BLACK);
			int cur_cost = h_cost[pid].load();
			for (int i = h_nodes[pid].x; i < (h_nodes[pid].y + h_nodes[pid].x);
					i++) {
				int id = h_edges[i].x;
				int cost = h_edges[i].y;
				cost += cur_cost;
				h_cost[id].store(cost);
				h_color[id].store(GRAY0);
				index_o = h_tail[0].fetch_add(1);
				h_q2[index_o] = id;
			}
		}
		h_num_t[0] = h_tail[0].load();
		h_tail[0].store(0);
		h_threads_run[0].fetch_add(1);
		h_gray_shade[0].store(GRAY1);
		h_iter[0].fetch_add(1);
		// if(rep >= p.n_warmup)
		timer.stop("Kernel");

		// Pointers to input and output queues
		int * h_qin = h_q2;
		int * h_qout = h_q1;
		int * d_qin = d_q2;
		int * d_qout = d_q1;

		const int CPU_EXEC = (p.n_threads > 0) ? 1 : 0;
		const int GPU_EXEC =
				(p.n_gpu_blocks > 0 && p.n_gpu_threads > 0) ? 1 : 0;

		// Run subsequent iterations on CPU or GPU until number of input queue elements is 0
		while (*h_num_t != 0) {

			if ((*h_num_t < p.switching_limit || GPU_EXEC == 0)
					&& CPU_EXEC == 1) { // If the number of input queue elements is lower than switching_limit
				it_cpu = it_cpu + 1;
				//   if(rep >= p.n_warmup)
				timer.start("Kernel");

				// Keep iterating on the CPU while the frontier stays below switching_limit (or the GPU is disabled)
				while ((*h_num_t != 0)
						&& (*h_num_t < p.switching_limit || GPU_EXEC == 0)
						&& CPU_EXEC == 1) {

					// Swap queues
					if (h_iter[0] % 2 == 0) {
						h_qin = h_q1;
						h_qout = h_q2;
					} else {
						h_qin = h_q2;
						h_qout = h_q1;
					}

					std::thread main_thread(run_cpu_threads, h_nodes, h_edges,
							h_cost, h_color, h_qin, h_qout, h_num_t, h_head,
							h_tail, h_threads_end, h_threads_run, h_gray_shade,
							h_iter, p.n_threads, p.switching_limit, GPU_EXEC);
					main_thread.join();

					h_num_t[0] = h_tail[0].load(); // Number of elements in output queue
					h_tail[0].store(0);
					h_head[0].store(0);
					if (h_iter[0].load() % 2 == 0)
						h_gray_shade[0].store(GRAY0);
					else
						h_gray_shade[0].store(GRAY1);
				}

				//  if(rep >= p.n_warmup)
				timer.stop("Kernel");

			} else if ((*h_num_t >= p.switching_limit || CPU_EXEC == 0)
					&& GPU_EXEC == 1) { // If the number of input queue elements is higher than or equal to switching_limit
				it_gpu = it_gpu + 1;
				//  if(rep >= p.n_warmup)
				timer.start("Copy To Device");
				cudaStatus = cudaMemcpy(d_cost, h_cost, sizeof(int) * n_nodes,
						cudaMemcpyHostToDevice);
				cudaStatus = cudaMemcpy(d_color, h_color, sizeof(int) * n_nodes,
						cudaMemcpyHostToDevice);
				cudaStatus = cudaMemcpy(d_threads_run, h_threads_run,
						sizeof(int), cudaMemcpyHostToDevice);
				cudaStatus = cudaMemcpy(d_threads_end, h_threads_end,
						sizeof(int), cudaMemcpyHostToDevice);
				cudaStatus = cudaMemcpy(d_overflow, h_overflow, sizeof(int),
						cudaMemcpyHostToDevice);
				cudaStatus = cudaMemcpy(d_q1, h_q1, sizeof(int) * n_nodes,
						cudaMemcpyHostToDevice);
				cudaStatus = cudaMemcpy(d_q2, h_q2, sizeof(int) * n_nodes,
						cudaMemcpyHostToDevice);
				cudaStatus = cudaMemcpy(d_iter, h_iter, sizeof(int),
						cudaMemcpyHostToDevice);
				cudaDeviceSynchronize();
				CUDA_ERR();
				//  if(rep >= p.n_warmup)
				timer.stop("Copy To Device");

				// Keep iterating on the GPU while the frontier stays at or above switching_limit (or the CPU is disabled)
				while ((*h_num_t != 0)
						&& (*h_num_t >= p.switching_limit || CPU_EXEC == 0)
						&& GPU_EXEC == 1) {

					// Swap queues
					if (h_iter[0] % 2 == 0) {
						d_qin = d_q1;
						d_qout = d_q2;
					} else {
						d_qin = d_q2;
						d_qout = d_q1;
					}

					//  if(rep >= p.n_warmup)
					timer.start("Copy To Device");
					cudaStatus = cudaMemcpy(d_num_t, h_num_t, sizeof(int),
							cudaMemcpyHostToDevice);
					cudaStatus = cudaMemcpy(d_tail, h_tail, sizeof(int),
							cudaMemcpyHostToDevice);
					cudaStatus = cudaMemcpy(d_head, h_head, sizeof(int),
							cudaMemcpyHostToDevice);
					cudaStatus = cudaMemcpy(d_gray_shade, h_gray_shade,
							sizeof(int), cudaMemcpyHostToDevice);
					cudaDeviceSynchronize();
					CUDA_ERR();
					//  if(rep >= p.n_warmup)
					timer.stop("Copy To Device");

					//  if(rep >= p.n_warmup)
					timer.start("Kernel");
					assert(
							p.n_gpu_threads <= max_gpu_threads
									&& "The thread block size is greater than the maximum thread block size that can be used on this device");
					cudaStatus = call_SSSP_gpu(p.n_gpu_blocks, p.n_gpu_threads,
							d_nodes, d_edges, d_cost, d_color, d_qin, d_qout,
							d_num_t, d_head, d_tail, d_threads_end,
							d_threads_run, d_overflow, d_gray_shade, d_iter,
							p.switching_limit, CPU_EXEC,
							sizeof(int) * (W_QUEUE_SIZE + 3));
					cudaDeviceSynchronize();
					CUDA_ERR();
					//  if(rep >= p.n_warmup)
					timer.stop("Kernel");

					// if(rep >= p.n_warmup)
					timer.start("Copy Back and Merge");
					cudaStatus = cudaMemcpy(h_tail, d_tail, sizeof(int),
							cudaMemcpyDeviceToHost);
					cudaStatus = cudaMemcpy(h_iter, d_iter, sizeof(int),
							cudaMemcpyDeviceToHost);
					cudaDeviceSynchronize();
					CUDA_ERR();
					//  if(rep >= p.n_warmup)
					timer.stop("Copy Back and Merge");

					h_num_t[0] = h_tail[0].load(); // Number of elements in output queue
					h_tail[0].store(0);
					h_head[0].store(0);
					if (h_iter[0].load() % 2 == 0)
						h_gray_shade[0].store(GRAY0);
					else
						h_gray_shade[0].store(GRAY1);
				}

				// if(rep >= p.n_warmup)
				timer.start("Copy Back and Merge");
				cudaStatus = cudaMemcpy(h_cost, d_cost, sizeof(int) * n_nodes,
						cudaMemcpyDeviceToHost);
				cudaStatus = cudaMemcpy(h_color, d_color, sizeof(int) * n_nodes,
						cudaMemcpyDeviceToHost);
				cudaStatus = cudaMemcpy(h_threads_run, d_threads_run,
						sizeof(int), cudaMemcpyDeviceToHost);
				cudaStatus = cudaMemcpy(h_threads_end, d_threads_end,
						sizeof(int), cudaMemcpyDeviceToHost);
				cudaStatus = cudaMemcpy(h_overflow, d_overflow, sizeof(int),
						cudaMemcpyDeviceToHost);
				cudaStatus = cudaMemcpy(h_q1, d_q1, sizeof(int) * n_nodes,
						cudaMemcpyDeviceToHost);
				cudaStatus = cudaMemcpy(h_q2, d_q2, sizeof(int) * n_nodes,
						cudaMemcpyDeviceToHost);
				cudaDeviceSynchronize();
				CUDA_ERR();
				// if(rep >= p.n_warmup)
				timer.stop("Copy Back and Merge");
			}
		}

#ifdef LOGS
		end_iteration();
#endif

		// printf("IT CPU:%d\t",it_cpu);
		//printf("IT GPU:%d\n",it_gpu);
		if (p.mode == 1) {
			err = newest_verify(h_cost, n_nodes, n_nodes, gold, it_cpu, it_gpu);
		}		//err=new_verify(h_cost, n_nodes,,it_cpu,it_gpu);

		if (err > 0) {
			printf("Errors: %d\n", err);
			read_input(source, h_nodes, h_edges, p);
			read_gold(gold, p);
		} else {
			printf(".ITERATION %d\n", rep);
		}
#ifdef LOGS
		log_error_count(err);
#endif
		// Read the input again
		//read_input(source, h_nodes, h_edges, p);
		//read_gold(gold,p);

	} // end of iteration
#ifdef LOGS
	end_log_file();
#endif
	// timer.print("Allocation", 1);
	//timer.print("Copy To Device", p.n_reps);
	// timer.print("Kernel", p.n_reps);
	// timer.print("Copy Back and Merge", p.n_reps);
	if (p.mode == 0) {
		create_output(h_cost, n_nodes, n_edges, std::string(p.comparison_file));
	}
	// Verify answer
	verify(h_cost, n_nodes, p.comparison_file);

	// Free memory
	timer.start("Deallocation");
	free(h_nodes);
	free(h_edges);
	free(h_color);
	free(h_cost);
	free(h_q1);
	free(h_q2);
	cudaStatus = cudaFree(d_nodes);
	cudaStatus = cudaFree(d_edges);
	cudaStatus = cudaFree(d_cost);
	cudaStatus = cudaFree(d_color);
	cudaStatus = cudaFree(d_q1);
	cudaStatus = cudaFree(d_q2);
	cudaStatus = cudaFree(d_num_t);
	cudaStatus = cudaFree(d_head);
	cudaStatus = cudaFree(d_tail);
	cudaStatus = cudaFree(d_threads_end);
	cudaStatus = cudaFree(d_threads_run);
	cudaStatus = cudaFree(d_overflow);
	cudaStatus = cudaFree(d_iter);
	cudaStatus = cudaFree(d_gray_shade);
	CUDA_ERR();
	cudaDeviceSynchronize();
	timer.stop("Deallocation");
	//timer.print("Deallocation", 1);

	// Release timers
	timer.release("Allocation");
	timer.release("Initialization");
	timer.release("Copy To Device");
	timer.release("Kernel");
	timer.release("Copy Back and Merge");
	timer.release("Deallocation");

	printf("Test Passed\n");
	return 0;
}
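// ---------------------------------------------------------------------------
// A minimal sketch (not part of the original benchmark) of the CPU/GPU
// switching policy used in the main loop above: small frontiers stay on the
// CPU, frontiers at or above switching_limit go to the GPU. The helper name
// pick_executor is hypothetical.
enum class Executor { CPU, GPU };

static Executor pick_executor(int frontier_size, int switching_limit,
		bool cpu_enabled, bool gpu_enabled) {
	// Small frontier (or GPU disabled): copy and kernel-launch overhead dominate,
	// so keep processing on the CPU threads.
	if (cpu_enabled && (frontier_size < switching_limit || !gpu_enabled))
		return Executor::CPU;
	// Large frontier (or CPU disabled): enough parallel work to amortize the
	// host-to-device copies and the kernel launch.
	return Executor::GPU;
}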
Exemplo n.º 24
0
void ElasticVectorAddition::freeResources() {
	CUDA_CHECK_RETURN(cudaFree(this->a));
	CUDA_CHECK_RETURN(cudaFree(this->b));

}
// points only need to be cudaMalloc'ed once for the point cloud and voxels. Move the allocation outside for efficiency (TODO; see the allocate-once sketch after this function)
void CloudConstructor::calcPoints(bool freeResultPointsOnGPU) {

	//	size_t free, tot;
	//	cudaMemGetInfo(&free, &tot);	
	//	std::cout << "ENTER calcPoints::cudaMemGetInfo " << free << " " << tot << std::endl;

	if(isInitNeeded) {
		isInitNeeded = false;
		// assume all images have same dims 
		// TODO: is this safe?
		if(camCnt >= 1) {
			imgWidth = cams[0]->imgWidth;
			imgHeight = cams[0]->imgHeight;
			imgSize = imgWidth * imgHeight;
		} else {
			imgWidth = 0;
			imgHeight = 0;
			imgSize = 0;
		}


		std::cout << "image dims " << imgWidth << " x " << imgHeight << std::endl;
		pointCnt = imgWidth*imgHeight*camCnt + 1; //zeroth point is for junk!

		//assume params don't change after first grab
		// TODO: is this safe?
		//		CamParams *ptParam = &CamParams[0];
		int paramI = 0;


		for(int i = 0; i < camCnt; i++) {

			camParams[paramI++]  = cams[i]->params.cx;
			camParams[paramI++]= cams[i]->params.cy;

			camParams[paramI++]= cams[i]->params.imageCenterU;
			camParams[paramI++]= cams[i]->params.imageCenterV;

			camParams[paramI++]= cams[i]->params.tx;
			camParams[paramI++]= cams[i]->params.ty;
			camParams[paramI++]= cams[i]->params.tz;

			camParams[paramI++]= cams[i]->params.rx;
			camParams[paramI++]= cams[i]->params.ry;
			camParams[paramI++]= cams[i]->params.rz;
		}

		paramI = 0;
		for(int i = 0; i < camCnt; i++) {
			std::cout  << i << " cx " << camParams[paramI++]  << std::endl;
			std::cout  << i << " cy " << camParams[paramI++]  << std::endl;

			std::cout  << i << " cu " << camParams[paramI++]  << std::endl;
			std::cout  << i << " cv " << camParams[paramI++]  << std::endl;

			std::cout  << i << " tx " << camParams[paramI++]  << std::endl;
			std::cout  << i << " ty " << camParams[paramI++]  << std::endl;
			std::cout  << i << " tz " << camParams[paramI++]  << std::endl;

			std::cout  << i << " rx " << camParams[paramI++]  << std::endl;
			std::cout  << i << " ry " << camParams[paramI++]  << std::endl;
			std::cout  << i << " rz " << camParams[paramI++]  << std::endl;

		}


		points = (float *)malloc(pointCnt * 3 * sizeof(float));
		//		points = new float[pointCnt * 3]; 
		if(points==NULL) {
			std::cerr << "CloudConstructor::calcPoints unable to allocate memory for " << pointCnt << " points" << std::endl;
			exit(1);
		}
		memset(points, 0, pointCnt * 3 * sizeof(float));
	}


#ifdef USE_CPU
	// extra allocation and de-allocation is done here to stay consistent with the GPU code
	size_t paramSize = sizeof(double) * camCnt * CAM_PARAM_CNT;
	double *d_params;
	d_params = (double*) malloc(paramSize); // need to look up safecall TODO
	memcpy(d_params, camParams, paramSize);

	size_t transformSize = sizeof(double) * camCnt * 12;
	double *d_tansforms;
	d_tansforms =(double*) malloc(transformSize); // need to look up safecall TODO
	double* transPrt =d_tansforms;

	size_t singleTransformSize = sizeof(double) * 12;	
	for(int i = 0; i < camCnt; i++) {
		memcpy(transPrt, cams[i]->tMatrix, singleTransformSize);
		transPrt+=12;
	}

	size_t singleZImageArrSize = imgSize * sizeof(unsigned short);
	unsigned short *d_zimg;
	d_zimg = (unsigned short*) malloc(singleZImageArrSize * camCnt); // need to look up safecall TODO

	for(int i = 0; i < camCnt;i++) {
		memcpy(&d_zimg[i*imgSize], cams[i]->getZImage(), singleZImageArrSize);
	}


	size_t resultSize = pointCnt * 3 * sizeof(float);
	d_resultPoints = (float*) malloc(resultSize);

	cpu_calcPointCloud(camCnt,imgWidth,imgHeight, d_params, d_tansforms, d_zimg, d_resultPoints);


	//	std::cout << "done calculating, doing mem copy for " << resultSize << " points";

	memcpy(points, d_resultPoints, resultSize);

	//	std::cout << "done with mem copy"; 


	free(d_params);
	free(d_tansforms);
	free(d_zimg);
	free(d_resultPoints);
	d_resultPoints = NULL; // free() does not null the pointer, so reset it here


#else // use gpu
	size_t paramSize = sizeof(double) * camCnt * CAM_PARAM_CNT;
	double *d_params;
	cutilSafeCall( cudaMalloc((void**)&d_params, paramSize) ); // need to look up safecall TODO
	cutilSafeCall( cudaMemcpy(d_params, camParams, paramSize, cudaMemcpyHostToDevice) );



	size_t transformSize = sizeof(double) * camCnt * 12;
	//	size_t transformSize = sizeof(double) * camCnt * 16;
	double *d_tansforms;
	cutilSafeCall( cudaMalloc((void**)&d_tansforms, transformSize) ); // need to look up safecall TODO

	double* transPrt =d_tansforms;
	//	size_t singleTransformSize = sizeof(double) * 16;	
	size_t singleTransformSize = sizeof(double) * 12;	
	for(int i = 0; i < camCnt; i++) {
		cutilSafeCall( cudaMemcpy(transPrt, cams[i]->tMatrix, singleTransformSize, cudaMemcpyHostToDevice) );
		transPrt+=12;
	}


	size_t singleZImageArrSize = imgSize * sizeof(unsigned short);
	unsigned short *d_zimg;
	cutilSafeCall( cudaMalloc((void**)&d_zimg, singleZImageArrSize * camCnt) ); // need to look up safecall TODO

	for(int i = 0; i < camCnt;i++) {
		cutilSafeCall( cudaMemcpy(&d_zimg[i*imgSize], cams[i]->getZImage(), singleZImageArrSize, cudaMemcpyHostToDevice) );
	}


	size_t resultSize = pointCnt * 3 * sizeof(float);
	cutilSafeCall( cudaMalloc((void**)&d_resultPoints, resultSize));

	gpu_calcPointCloud(camCnt,imgWidth,imgHeight, d_params, d_tansforms, d_zimg, d_resultPoints);


	//	std::cout << "done calculating, doing mem copy for " << resultSize << " points";

	cutilSafeCall(cudaMemcpy(points, d_resultPoints, resultSize, cudaMemcpyDeviceToHost));

	//	std::cout << "done with mem copy"; 


	cudaFree(d_params);
	cudaFree(d_tansforms);
	cudaFree(d_zimg);
	if(freeResultPointsOnGPU)
		freeGPUPoints();
#endif // USE_CPU

	//	cudaFree(d_resultPoints);


	//	cudaMemGetInfo(&free, &tot);	
	//	std::cout << "EXIT  calcPoints::cudaMemGetInfo " << free << " " << tot << std::endl;

}
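// A minimal sketch (not part of the original class) of the allocate-once idea
// from the TODO above calcPoints: create the device result buffer on first use
// and reuse it on later calls until freeGPUPoints() releases it. The helper
// name ensureGPUPoints is hypothetical; d_resultPoints and pointCnt are the
// members already used in calcPoints.
void CloudConstructor::ensureGPUPoints() {
	if (d_resultPoints == NULL) {
		size_t resultSize = pointCnt * 3 * sizeof(float);
		cutilSafeCall( cudaMalloc((void**)&d_resultPoints, resultSize) );
	}
}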
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test matrix multiply using CUBLAS
////////////////////////////////////////////////////////////////////////////////
int matrixMultiply(int argc, char **argv, int devID, sMatrixSize &matrix_size)
{
    cudaDeviceProp deviceProp;

    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));

    // use a larger block size for Fermi and above
    int block_size = (deviceProp.major < 2) ? 16 : 32;

    // set seed for rand()
    srand(2006);

    // allocate host memory for matrices A and B
    unsigned int size_A = matrix_size.uiWA * matrix_size.uiHA;
    unsigned int mem_size_A = sizeof(float) * size_A;
    float *h_A = (float *)malloc(mem_size_A);
    unsigned int size_B = matrix_size.uiWB * matrix_size.uiHB;
    unsigned int mem_size_B = sizeof(float) * size_B;
    float *h_B = (float *)malloc(mem_size_B);

    // set seed for rand()
    srand(2006);

    // initialize host memory
    randomInit(h_A, size_A);
    randomInit(h_B, size_B);

    // allocate device memory
    float *d_A, *d_B, *d_C;
    unsigned int size_C = matrix_size.uiWC * matrix_size.uiHC;
    unsigned int mem_size_C = sizeof(float) * size_C;

    // allocate host memory for the result
    float *h_C      = (float *) malloc(mem_size_C);
    float *h_CUBLAS = (float *) malloc(mem_size_C);

    checkCudaErrors(cudaMalloc((void **) &d_A, mem_size_A));
    checkCudaErrors(cudaMalloc((void **) &d_B, mem_size_B));
    checkCudaErrors(cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMalloc((void **) &d_C, mem_size_C));

    // setup execution parameters
    dim3 threads(block_size, block_size);
    dim3 grid(matrix_size.uiWC / threads.x, matrix_size.uiHC / threads.y);

    // create and start timer
    printf("Computing result using CUBLAS...");

    // execute the kernel
    int nIter = 30;

    // CUBLAS version 2.0
    {
        const float alpha = 1.0f;
        const float beta  = 0.0f;
        cublasHandle_t handle;
        cudaEvent_t start, stop;

        checkCudaErrors(cublasCreate(&handle));

        //Perform warmup operation with cublas
        checkCudaErrors(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA, &alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWA));

        // Allocate CUDA events that we'll use for timing
        checkCudaErrors(cudaEventCreate(&start));
        checkCudaErrors(cudaEventCreate(&stop));

        // Record the start event
        checkCudaErrors(cudaEventRecord(start, NULL));

        for (int j = 0; j < nIter; j++)
        {
            //note cuBLAS is column-major!
            //need to transpose the order
            checkCudaErrors(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA, &alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWA));

        }

        printf("done.\n");

        // Record the stop event
        checkCudaErrors(cudaEventRecord(stop, NULL));

        // Wait for the stop event to complete
        checkCudaErrors(cudaEventSynchronize(stop));

        float msecTotal = 0.0f;
        checkCudaErrors(cudaEventElapsedTime(&msecTotal, start, stop));

        // Compute and print the performance
        float msecPerMatrixMul = msecTotal / nIter;
        double flopsPerMatrixMul = 2.0 * (double)matrix_size.uiWA * (double)matrix_size.uiHA * (double)matrix_size.uiWB;
        double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
        printf(
            "Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops\n",
            gigaFlops,
            msecPerMatrixMul,
            flopsPerMatrixMul);

        // copy result from device to host
        checkCudaErrors(cudaMemcpy(h_CUBLAS, d_C, mem_size_C, cudaMemcpyDeviceToHost));

        // Destroy the handle
        checkCudaErrors(cublasDestroy(handle));
    }

    // compute reference solution
    printf("Computing result using host CPU...");
    float *reference = (float *)malloc(mem_size_C);
    matrixMulCPU(reference, h_A, h_B, matrix_size.uiHA, matrix_size.uiWA, matrix_size.uiWB);
    printf("done.\n");

    // check result (CUBLAS)
    bool resCUBLAS = sdkCompareL2fe(reference, h_CUBLAS, size_C, 1.0e-6f);

    if (resCUBLAS != true)
    {
        printDiff(reference, h_CUBLAS, matrix_size.uiWC, matrix_size.uiHC, 100, 1.0e-5f);
    }

    printf("Comparing CUBLAS Matrix Multiply with CPU results: %s\n", (true == resCUBLAS) ? "PASS" : "FAIL");

    printf("\nNOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.\n");

    // clean up memory
    free(h_A);
    free(h_B);
    free(h_C);
    free(reference);
    checkCudaErrors(cudaFree(d_A));
    checkCudaErrors(cudaFree(d_B));
    checkCudaErrors(cudaFree(d_C));

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice.  It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
    cudaDeviceReset();

    if (resCUBLAS == true)
    {
        return EXIT_SUCCESS;    // return value = 0
    }
    else
    {
        return EXIT_FAILURE;     // return value = 1
    }
}
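// A minimal sketch (an assumption, not part of the sample) of the row-major /
// column-major mapping noted in the loop above: a row-major matrix reinterpreted
// as column-major is its transpose, so asking cuBLAS for B^T * A^T = (A*B)^T
// yields C = A*B laid out row-major again. The helper name sgemm_row_major is
// hypothetical.
void sgemm_row_major(cublasHandle_t handle, int HA, int WA, int WB,
                     const float *d_A, const float *d_B, float *d_C)
{
    const float alpha = 1.0f;
    const float beta  = 0.0f;
    // m = WB, n = HA, k = WA; leading dimensions are the row-major widths
    checkCudaErrors(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                                WB, HA, WA,
                                &alpha, d_B, WB, d_A, WA,
                                &beta, d_C, WB));
}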
Exemplo n.º 27
0
 static void TearDownTestCase() { cudaFree(data); }
FieldContainer_Kokkos<Scalar,ScalarPointer,Kokkos::LayoutRight,Kokkos::Cuda>::~FieldContainer_Kokkos(){
  count_ = count_ - 1;
  if (count_ == 0 && intrepidManaged) { cudaFree(containerMemory); }
}
Exemplo n.º 29
0
/*
 * Device-side worker thread: allocates GPU buffers, creates streams, copies data
 * to and from the device, and synchronizes with the host thread via barriers.
 */
void* device_thread(void* passing_ptr) {
    DataArray* data_arr_ptr = (DataArray*) passing_ptr; // casting passed pointer


    cuDoubleComplex* data_r_dev;
    cuDoubleComplex* data_k_dev;


    // init device, allocate suitable variables in gpu memory ...
    //alloc_data_device(data_arr_ptr);
    cudaMalloc((void**) &data_r_dev, sizeof(cuDoubleComplex)*N); // device memory (cudaMallocHost gives pinned host memory, which could speed up transfers)
    cudaMalloc((void**) &data_k_dev, sizeof(cuDoubleComplex)*N); // device memory
    data_arr_ptr->data_r_dev = &data_r_dev; // in this way it would be easier to handle pointer to arrays
    data_arr_ptr->data_k_dev = &data_k_dev;
    printf("data allocated by host thread\n");

    // Does each thread create a new stream automatically?
    // http://devblogs.nvidia.com/parallelforall/gpu-pro-tip-cuda-7-streams-simplify-concurrency/
    cudaStreamCreateWithFlags(streams_arr, cudaStreamNonBlocking);
    cudaStreamCreateWithFlags(streams_arr+1, cudaStreamNonBlocking);
    printf("streams created\n");

    // synchronize after allocating memory - data on host should be allocated and ready for copying
    cudaDeviceSynchronize(); // CHECK THAT THIS DOES NOT CAUSE ERRORS! - should synchronize host and device irrespective of pthreads
    // cudaStreamSynchronize( <enum stream> ); // to synchronize only with stream !!!
    pthread_barrier_wait (&barrier);
    printf("1st barier device thread - allocating mem on gpu\n");




    //copying data
    cudaMemcpyAsync( *(data_arr_ptr->data_r_dev), *(data_arr_ptr->data_r), N*sizeof(cuDoubleComplex), cudaMemcpyHostToDevice, streams_arr[MEMORY_STREAM] );

    // synchronize after copying data
    cudaDeviceSynchronize(); // should be used on
    pthread_barrier_wait (&barrier);
    printf("2nd barier device thread - copying data on gpu\n");






    printf("data visible in device thread:\n");

    /*for (uint64_t ii = 0; ii < (data_arr_ptr->size <= 32) ? data_arr_ptr->size : 32 ; ii++) {
      printf("%lu.\t",ii);
      printf("%lf + %lfj\t", creal( (*(data_arr_ptr->data_r))[ii] ), cimag( (*(data_arr_ptr->data_r))[ii] ));
      printf("%lf + %lfj\n", creal( (*(data_arr_ptr->data_k))[ii] ), cimag( (*(data_arr_ptr->data_k))[ii] ));
    }*/

    // synchronize after copying
    pthread_barrier_wait (&barrier);
    printf("3rd barier device thread - \n");



    //copying data
    //cudaMemcpyAsync( *(data_arr_ptr->data_r), *(data_arr_ptr->data_r_dev), N*sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost, streams_arr[MEMORY_STREAM] );
    cudaMemcpyAsync( *(data_arr_ptr->data_r), data_r_dev, N*sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost, streams_arr[MEMORY_STREAM] );


    // synchronize after copying back data
    cudaDeviceSynchronize(); // should be used on
    pthread_barrier_wait (&barrier);
    printf("4th barier device thread - \n");


    cudaStreamDestroy(streams_arr[KERNEL_STREAM]);
    cudaStreamDestroy(streams_arr[MEMORY_STREAM]);

    cudaFree(data_r_dev);
    printf("device r space freed\n");
    cudaFree(data_k_dev);
    cudaDeviceSynchronize();
    printf("device k space freed\n");

    printf("closing device thread\n");
    pthread_exit(NULL);
}
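// A minimal sketch (an assumption, not part of the original thread function) of the
// per-stream synchronization mentioned in the comments above: cudaStreamSynchronize
// blocks only on work queued in that one stream, so a copy stream can be drained
// without waiting for kernels running in another stream. The helper name
// drain_copy_stream is hypothetical.
void drain_copy_stream(cudaStream_t copy_stream, cuDoubleComplex* dst_host,
                       const cuDoubleComplex* src_dev, size_t n) {
    // queue the device-to-host copy on the copy stream only
    cudaMemcpyAsync(dst_host, src_dev, n * sizeof(cuDoubleComplex),
                    cudaMemcpyDeviceToHost, copy_stream);
    // wait for this stream's work, not for the whole device
    cudaStreamSynchronize(copy_stream);
}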
Exemplo n.º 30
0
int main(int argc, char **argv) {
	printf("Computing Game Of Life On %d x %d Board.\n", DIM_X, DIM_Y);
	
	int *host_current, *host_future, *host_future_naive, *host_future_cached;
	int *gpu_current, *gpu_future;
	clock_t start, stop;
	
	cudaMallocHost((void**) &host_current, DIM_X * DIM_Y * sizeof(int));
	cudaMallocHost((void**) &host_future, DIM_X * DIM_Y * sizeof(int));	
	cudaMallocHost((void**) &host_future_naive, DIM_X * DIM_Y * sizeof(int));
	cudaMallocHost((void**) &host_future_cached, DIM_X * DIM_Y * sizeof(int));
	assert(cudaGetLastError() == cudaSuccess);
	
	cudaMalloc((void**) &gpu_current, DIM_X * DIM_Y * sizeof(int));
	cudaMalloc((void**) &gpu_future, DIM_X * DIM_Y * sizeof(int));
	printf("%s\n", cudaGetErrorString(cudaGetLastError()));
	assert(cudaGetLastError() == cudaSuccess);
	
	fill_board(host_current, 40); 
	add_glider(host_current);
	
	cudaMemcpy(gpu_current, host_current, DIM_X * DIM_Y * sizeof(int), cudaMemcpyHostToDevice);

//	print_board(host_current);
	
	float time_naive, time_cached, time_cpu;
	for(int i = 1; i < STEPS; i++) {
		printf("=========\n");
		
		start = clock();
		naive_game_of_life_wrapper(gpu_current, gpu_future);
		cudaMemcpy(host_future_naive, gpu_future, DIM_X * DIM_Y * sizeof(int), cudaMemcpyDeviceToHost);
		stop = clock();
		time_naive = (float)(stop - start)/CLOCKS_PER_SEC;
		printf("Time for Naive GPU To Compute Next Phase: %.5f s\n", time_naive);
		
		start = clock();
		cached_game_of_life_wrapper(gpu_current, gpu_future);
		cudaMemcpy(host_future_cached, gpu_future, DIM_X * DIM_Y * sizeof(int), cudaMemcpyDeviceToHost);
		stop = clock();
		time_cached = (float)(stop - start)/CLOCKS_PER_SEC;
		printf("Time for Cached GPU To Compute Next Phase: %.5f s\n", time_cached);
		
		start = clock();
		update_board(host_current, host_future);
		stop = clock();
		time_cpu = (float)(stop - start)/CLOCKS_PER_SEC;
		printf("Time for CPU To Compute Next Phase: %.5f s\n", time_cpu);
		
		printf("speedup for naive = %.2f; speedup for cached = %.2f; speedup for cached over naive = %.2f\n", time_cpu/time_naive, time_cpu/time_cached, time_naive/time_cached);

		check_boards(host_future, host_future_naive);
		check_boards(host_future, host_future_cached);
				
		int *temp;
		
		temp = host_current;
		host_current = host_future;
		host_future = temp;
		
		temp = gpu_current;
		gpu_current = gpu_future;
		gpu_future = temp;
	}
	
	// buffers allocated with cudaMallocHost must be released with cudaFreeHost
	cudaFreeHost(host_future);
	cudaFreeHost(host_future_naive);
	cudaFreeHost(host_future_cached);
	cudaFreeHost(host_current);
	cudaFree(gpu_current);
	cudaFree(gpu_future);
	
	return 0;
}
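// A minimal sketch (an assumption, not part of the original program) of timing a
// GPU phase with CUDA events, as done in the cuBLAS example above: events measure
// the device-side work itself rather than host wall-clock time around the launch
// and copy. The function-pointer signature for the wrapper is assumed here.
float time_phase_ms(void (*launch)(int *, int *), int *d_in, int *d_out) {
	cudaEvent_t start, stop;
	cudaEventCreate(&start);
	cudaEventCreate(&stop);
	cudaEventRecord(start, 0);
	launch(d_in, d_out);              // e.g. naive_game_of_life_wrapper
	cudaEventRecord(stop, 0);
	cudaEventSynchronize(stop);
	float ms = 0.0f;
	cudaEventElapsedTime(&ms, start, stop);
	cudaEventDestroy(start);
	cudaEventDestroy(stop);
	return ms;
}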