// Assumes the enclosing template header, which the snippet omitted.
template <typename T>
void quantus_cuda_cleanup(quantus_comm<T> *comm) {
    cudaFree((T *) comm->matrix);
}
template <typename Dtype>
GPUParams<Dtype>::~GPUParams() {
#ifndef CPU_ONLY
    CUDA_CHECK(cudaFree(data_));
    CUDA_CHECK(cudaFree(diff_));
#endif
}
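// Several snippets in this collection wrap cudaFree in macros such as
// CUDA_CHECK, CUDA_SAFE_CALL, gpuErrchk, or CHKCUDAERR whose definitions are
// not shown. A minimal sketch of what such a macro typically looks like; the
// name, message format, and exit-on-error policy here are assumptions, since
// the actual project macros may throw or log instead:
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

#define CUDA_CHECK(call)                                               \
    do {                                                               \
        cudaError_t err_ = (call);                                     \
        if (err_ != cudaSuccess) {                                     \
            fprintf(stderr, "CUDA error %s at %s:%d\n",                \
                    cudaGetErrorString(err_), __FILE__, __LINE__);     \
            exit(EXIT_FAILURE);                                        \
        }                                                              \
    } while (0)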
static void free(void *data) {
    if (data) {
        // std::cout << "free " << data << std::endl;
        throw_(cudaFree(data));
    }
}
TxVectorOptimizationDataCU::~TxVectorOptimizationDataCU() {
    if (devicePtr) {
        cudaFree(devicePtr);
    }
}
void CloudConstructor::freeGPUPoints() {
    cudaFree(d_resultPoints);
    d_resultPoints = NULL;
}
int main() {
    int i;
    struct timeval start, stop;
    FILE *fd;
    char *key;

    cudaSetDevice(0);

    /* Allocate memory */
    if ((key = (char *)malloc(40 * sizeof(char))) == NULL) {
        printf("Malloc failed!\n");
        exit(EXIT_FAILURE);
    }
    cudaMallocHost((void **) &batchKeys, ((BATCH_SIZE + 1) * MAX_LEN_ALIGNED) * sizeof(char));
    cudaMallocHost((void **) &nKeys, BATCH_SIZE * sizeof(size_t));
    cudaMallocHost((void **) &batchIndex, (BATCH_SIZE + 1) * sizeof(int));
    cudaMallocHost((void **) &hashedKeys, BATCH_SIZE * sizeof(uint32_t));
    cudaMalloc((void **) &d_keys, ((BATCH_SIZE + 1) * MAX_LEN_ALIGNED) * sizeof(char));
    cudaMalloc((void **) &d_len, BATCH_SIZE * sizeof(size_t));
    cudaMalloc((void **) &d_index, (BATCH_SIZE + 1) * sizeof(int));
    cudaMalloc((void **) &d_res, BATCH_SIZE * sizeof(uint32_t));

    /* Create BATCH_SIZE random keys and add them to the batch table */
    batchNo = 0;
    batchIndex[0] = 0;
    for (i = 0; i < BATCH_SIZE; i++) {
        gen_random(key, 30);
        add_to_batch(key, 30);
    }

    /* Start time (execution + memory) */
#ifdef EXEC_MEM
    gettimeofday(&start, NULL);
#endif // EXEC_MEM

    /* MemCpy Host -> Device */
    cudaMemcpy(d_keys, batchKeys,
               (batchIndex[BATCH_SIZE - 1] + strlen(&batchKeys[batchIndex[BATCH_SIZE - 1]])) * sizeof(char),
               cudaMemcpyHostToDevice);
    cudaMemcpy(d_len, nKeys, BATCH_SIZE * sizeof(size_t), cudaMemcpyHostToDevice);
    cudaMemcpy(d_index, batchIndex, BATCH_SIZE * sizeof(int), cudaMemcpyHostToDevice);

    /* Start time (execution only) */
#ifndef EXEC_MEM
    gettimeofday(&start, NULL);
#endif // EXEC_MEM

    /* Call the kernel */
    CUDAhash(d_keys, d_index, d_len, d_res);

    /* Stop time (execution only) */
#ifndef EXEC_MEM
    cudaDeviceSynchronize();
    gettimeofday(&stop, NULL);
#endif // EXEC_MEM

    /* MemCpy Device -> Host */
    cudaMemcpy(hashedKeys, d_res, BATCH_SIZE * sizeof(uint32_t), cudaMemcpyDeviceToHost);

    /* Stop time (execution + memory) */
#ifdef EXEC_MEM
    gettimeofday(&stop, NULL);
#endif // EXEC_MEM

#ifdef DEBUG
    for (i = 0; i < BATCH_SIZE; i++) {
        printf("%s\n", &batchKeys[batchIndex[i]]);
        printf("%u\n", hashedKeys[i]);
    }
#endif // DEBUG

    /* Print time */
    fd = fopen("log.txt", "a+");
    fprintf(fd, "%lu", ((stop.tv_sec * USECS) + stop.tv_usec) - ((start.tv_sec * USECS) + start.tv_usec));
    fprintf(fd, "\t%.1f\n",
            ((double)BATCH_SIZE /
             ((double)(((stop.tv_sec * USECS) + stop.tv_usec) - ((start.tv_sec * USECS) + start.tv_usec)) / 1000000)) / 1000);
    fclose(fd);

#ifdef DEBUG
    printf("Time: %lu \n", ((stop.tv_sec * USECS) + stop.tv_usec) - ((start.tv_sec * USECS) + start.tv_usec));
#endif // DEBUG

    /* Free memory; pinned host buffers must be released with cudaFreeHost. */
    free(key);
    cudaFreeHost(batchKeys);
    cudaFreeHost(nKeys);
    cudaFreeHost(hashedKeys);
    cudaFreeHost(batchIndex);
    cudaFree(d_keys);
    cudaFree(d_len);
    cudaFree(d_res);
    cudaFree(d_index);
    return 0;
}
void CudaSpace::deallocate(void *const arg_alloc_ptr, const size_t /* arg_alloc_size */) const {
    try {
        CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr));
    } catch (...) {}
}
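// The deallocate()/destructor patterns above all hand-roll cudaFree cleanup.
// A minimal RAII sketch of the same idea, assuming only the standard CUDA
// runtime API; this is illustrative and not taken from any project above:
#include <memory>
#include <cuda_runtime.h>

struct CudaFreeDeleter {
    void operator()(void *p) const noexcept { if (p) cudaFree(p); }
};

template <typename T>
using device_ptr = std::unique_ptr<T, CudaFreeDeleter>;

template <typename T>
device_ptr<T> make_device_array(size_t n) {
    void *raw = nullptr;
    if (cudaMalloc(&raw, n * sizeof(T)) != cudaSuccess) return nullptr;
    return device_ptr<T>(static_cast<T *>(raw));  // freed automatically on scope exit
}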
QList<resType> calculateOnGPU(const char *seqLib, int seqLibLength, ScoreType *queryProfile,
                              ScoreType qProfLen, int queryLength, ScoreType gapOpen,
                              ScoreType gapExtension, ScoreType maxScore,
                              U2::SmithWatermanSettings::SWResultView resultView) {
    // TODO: calculate maximum alignment length
    const int overlapLength = calcOverlap(queryLength);
    int partsNumber = calcPartsNumber(seqLibLength, overlapLength);

    int queryDevider = 1;
    if (queryLength > sw_cuda_cpp::MAX_SHARED_VECTOR_LENGTH) {
        queryDevider = (queryLength + sw_cuda_cpp::MAX_SHARED_VECTOR_LENGTH - 1) / sw_cuda_cpp::MAX_SHARED_VECTOR_LENGTH;
    }
    int partQuerySize = (queryLength + queryDevider - 1) / queryDevider;
    int partSeqSize = calcPartSeqSize(seqLibLength, overlapLength, partsNumber);
    int sizeRow = calcSizeRow(seqLibLength, overlapLength, partsNumber, partSeqSize);

    u2log.details(QString("partsNumber: %1 queryDevider: %2").arg(partsNumber).arg(queryDevider));
    u2log.details(QString("seqLen: %1 partSeqSize: %2 overlapSize: %3").arg(seqLibLength).arg(partSeqSize).arg(overlapLength));
    u2log.details(QString("queryLen %1 partQuerySize: %2").arg(queryLength).arg(partQuerySize));

    //************************** declare some temp variables on host
    ScoreType *tempRow = new ScoreType[sizeRow];
    ScoreType *zerroArr = new ScoreType[sizeRow];
    for (int i = 0; i < sizeRow; i++) {
        zerroArr[i] = 0;
    }
    ScoreType *directionRow = new ScoreType[sizeRow];

    size_t directionMatrixSize = 0;
    size_t backtraceBeginsSize = 0;
    int *globalMatrix = NULL;
    int *backtraceBegins = NULL;
    if (U2::SmithWatermanSettings::MULTIPLE_ALIGNMENT == resultView) {
        directionMatrixSize = seqLibLength * queryLength * sizeof(int);
        backtraceBeginsSize = 2 * sizeRow * sizeof(int);
        globalMatrix = new int[directionMatrixSize / sizeof(int)];
        backtraceBegins = new int[backtraceBeginsSize / sizeof(int)];
        memset(globalMatrix, 0, directionMatrixSize);
        memset(backtraceBegins, 0, backtraceBeginsSize);
    }

    //************************** sizes of arrays
    size_t sizeQ = sizeRow * sizeof(ScoreType);
    size_t sizeQQ = sizeRow * sizeof(ScoreType);
    size_t sizeP = qProfLen * sizeof(ScoreType);
    size_t sizeL = seqLibLength * sizeof(char);

    //************************** declare arrays on device
    char *g_seqLib;
    ScoreType *g_queryProfile;
    ScoreType *g_HdataMax;
    ScoreType *g_HdataUp;
    ScoreType *g_HdataRec;
    ScoreType *g_HdataTmp;
    ScoreType *g_FdataUp;
    ScoreType *g_directionsUp;
    ScoreType *g_directionsMax;
    ScoreType *g_directionsRec;
    int *g_directionsMatrix = NULL;
    int *g_backtraceBegins = NULL;

    //************************** allocate global memory on device
    cudaMalloc((void **)&g_seqLib, sizeL);
    cudaMalloc((void **)&g_queryProfile, sizeP);
    cudaMalloc((void **)&g_HdataMax, sizeQ);
    cudaMalloc((void **)&g_HdataUp, sizeQ);
    cudaMalloc((void **)&g_FdataUp, sizeQ);
    cudaMalloc((void **)&g_directionsUp, sizeQ);
    cudaMalloc((void **)&g_directionsMax, sizeQ);
    cudaMalloc((void **)&g_HdataRec, sizeQ);
    cudaMalloc((void **)&g_directionsRec, sizeQ);
    if (U2::SmithWatermanSettings::MULTIPLE_ALIGNMENT == resultView) {
        cudaError errorMatrix = cudaMalloc(reinterpret_cast<void **>(&g_directionsMatrix), directionMatrixSize);
        cudaError errorBacktrace = cudaMalloc(reinterpret_cast<void **>(&g_backtraceBegins), backtraceBeginsSize);
    }
    u2log.details(QString("GLOBAL MEMORY USED %1 KB").arg((sizeL + sizeP + sizeQ * 7 + directionMatrixSize + backtraceBeginsSize) / 1024));

    //************************** copy from host to device
    cudaMemcpy(g_seqLib, seqLib, sizeL, cudaMemcpyHostToDevice);
    cudaMemcpy(g_queryProfile, queryProfile, sizeP, cudaMemcpyHostToDevice);
    cudaMemcpy(g_HdataMax, zerroArr, sizeQ, cudaMemcpyHostToDevice);
    cudaMemcpy(g_HdataUp, zerroArr, sizeQ, cudaMemcpyHostToDevice);
    cudaMemcpy(g_FdataUp, zerroArr, sizeQ, cudaMemcpyHostToDevice);
    cudaMemcpy(g_directionsUp, zerroArr, sizeQ, cudaMemcpyHostToDevice);
    cudaMemcpy(g_directionsMax, zerroArr, sizeQ, cudaMemcpyHostToDevice);
    cudaMemcpy(g_directionsRec, zerroArr, sizeQ, cudaMemcpyHostToDevice);
    cudaMemcpy(g_HdataRec, zerroArr, sizeQ, cudaMemcpyHostToDevice);
    if (U2::SmithWatermanSettings::MULTIPLE_ALIGNMENT == resultView) {
        cudaMemcpy(g_directionsMatrix, globalMatrix, directionMatrixSize, cudaMemcpyHostToDevice);
        cudaMemcpy(g_backtraceBegins, backtraceBegins, backtraceBeginsSize, cudaMemcpyHostToDevice);
    }

    //************************** start calculation
    int BLOCK_SIZE = partsNumber;
    dim3 dimBlock(BLOCK_SIZE);
    dim3 dimGrid(partQuerySize);

    // move constant variables to constant cuda memory
    setConstants(partSeqSize, partsNumber, overlapLength, seqLibLength, queryLength,
                 gapOpen, gapExtension, maxScore, partQuerySize,
                 U2::SmithWatermanAlgorithm::UP, U2::SmithWatermanAlgorithm::LEFT,
                 U2::SmithWatermanAlgorithm::DIAG, U2::SmithWatermanAlgorithm::STOP);

    size_t sh_mem_size = sizeof(ScoreType) * (dimGrid.x + 1) * 3;
    u2log.details(QString("SHARED MEM SIZE USED: %1 B").arg(sh_mem_size));

    // start main loop
    for (int i = 0; i < queryDevider; i++) {
        calculateMatrix_wrap(dimBlock.x, dimGrid.x, g_seqLib, g_queryProfile,
                             g_HdataUp, g_HdataRec, g_HdataMax, g_FdataUp,
                             g_directionsUp, g_directionsRec, g_directionsMax,
                             i * partQuerySize, g_directionsMatrix, g_backtraceBegins);
        cudaError hasErrors = cudaThreadSynchronize(); // deprecated; cudaDeviceSynchronize() in current CUDA
        if (hasErrors != 0) {
            u2log.trace(QString("CUDA ERROR HAPPENED, errorId: ") + QString::number(hasErrors));
        }
        // swap the "recent" and "up" arrays for the next pass
        g_HdataTmp = g_HdataRec;
        g_HdataRec = g_HdataUp;
        g_HdataUp = g_HdataTmp;
        g_HdataTmp = g_directionsRec;
        g_directionsRec = g_directionsUp;
        g_directionsUp = g_HdataTmp;
    }

    // Copy vectors to host and find actual results
    cudaMemcpy(tempRow, g_HdataMax, sizeQQ, cudaMemcpyDeviceToHost);
    cudaMemcpy(directionRow, g_directionsMax, sizeQQ, cudaMemcpyDeviceToHost);
    if (U2::SmithWatermanSettings::MULTIPLE_ALIGNMENT == resultView) {
        cudaMemcpy(globalMatrix, g_directionsMatrix, directionMatrixSize, cudaMemcpyDeviceToHost);
        cudaMemcpy(backtraceBegins, g_backtraceBegins, backtraceBeginsSize, cudaMemcpyDeviceToHost);
    }

    QList<resType> pas;
    resType res;
    for (int j = 0; j < sizeRow; j++) {
        if (tempRow[j] >= maxScore) {
            res.refSubseq.startPos = directionRow[j];
            res.refSubseq.length = j - res.refSubseq.startPos + 1
                                   - (j) / (partSeqSize + 1) * overlapLength
                                   - (j) / (partSeqSize + 1);
            res.score = tempRow[j];
            if (U2::SmithWatermanSettings::MULTIPLE_ALIGNMENT == resultView) {
                qint32 pairAlignOffset = 0;
                qint32 row = backtraceBegins[2 * j];
                qint32 column = backtraceBegins[2 * j + 1];
                while (U2::SmithWatermanAlgorithm::STOP != globalMatrix[seqLibLength * row + column]) {
                    if (U2::SmithWatermanAlgorithm::DIAG == globalMatrix[seqLibLength * row + column]) {
                        res.pairAlign[pairAlignOffset++] = U2::SmithWatermanAlgorithm::DIAG;
                        row--;
                        column--;
                    } else if (U2::SmithWatermanAlgorithm::LEFT == globalMatrix[seqLibLength * row + column]) {
                        res.pairAlign[pairAlignOffset++] = U2::SmithWatermanAlgorithm::UP;
                        column--;
                    } else if (U2::SmithWatermanAlgorithm::UP == globalMatrix[seqLibLength * row + column]) {
                        res.pairAlign[pairAlignOffset++] = U2::SmithWatermanAlgorithm::LEFT;
                        row--;
                    }
                    if (0 >= row || 0 >= column) {
                        break;
                    }
                }
                res.patternSubseq.startPos = row;
                res.patternSubseq.length = backtraceBegins[2 * j] - row + 1;
            }
            pas.append(res);
        }
    }

    // deallocate memory
    cudaFree(g_seqLib);
    cudaFree(g_queryProfile);
    cudaFree(g_HdataMax);
    cudaFree(g_HdataUp);
    cudaFree(g_HdataRec);
    cudaFree(g_FdataUp);
    cudaFree(g_directionsUp);
    cudaFree(g_directionsMax);
    cudaFree(g_directionsRec);
    if (U2::SmithWatermanSettings::MULTIPLE_ALIGNMENT == resultView) {
        cudaFree(g_directionsMatrix);
        cudaFree(g_backtraceBegins);
    }
    delete[] tempRow;
    delete[] directionRow;
    delete[] zerroArr;
    delete[] globalMatrix;
    delete[] backtraceBegins;

    return pas;
}
~curandStateManager() {
    // if (_state != NULL) memFree((char*)_state);
    if (_state != NULL) CUDA_CHECK(cudaFree(_state));
}
/* ========================================================================== */
int sci_gpuLU(char *fname)
{
    CheckRhs(1, 2);
    CheckLhs(2, 2);
#ifdef WITH_CUDA
    cublasStatus status;
#endif
    SciErr sciErr;
    int *piAddr_A = NULL;
    double *h_A = NULL;
    double *hi_A = NULL;
    int rows_A;
    int cols_A;
    int *piAddr_Opt = NULL;
    double *option = NULL;
    int rows_Opt;
    int cols_Opt;
    void *d_A = NULL;
    int na;
    void *pvPtr = NULL;
    int size_A = sizeof(double);
    bool bComplex_A = FALSE;
    int inputType_A;
    int inputType_Opt;
    double res;
    int posOutput = 1;

    try
    {
        sciErr = getVarAddressFromPosition(pvApiCtx, 1, &piAddr_A);
        if (sciErr.iErr) throw sciErr;

        if (Rhs == 2)
        {
            sciErr = getVarAddressFromPosition(pvApiCtx, 2, &piAddr_Opt);
            if (sciErr.iErr) throw sciErr;
            sciErr = getVarType(pvApiCtx, piAddr_Opt, &inputType_Opt);
            if (sciErr.iErr) throw sciErr;
            if (inputType_Opt == sci_matrix)
            {
                sciErr = getMatrixOfDouble(pvApiCtx, piAddr_Opt, &rows_Opt, &cols_Opt, &option);
                if (sciErr.iErr) throw sciErr;
            }
            else
                throw "Option syntax is [number,number].";
        }
        else
        {
            rows_Opt = 1;
            cols_Opt = 2;
            option = (double *)malloc(2 * sizeof(double));
            option[0] = 0;
            option[1] = 0;
        }

        if (rows_Opt != 1 || cols_Opt != 2)
            throw "Option syntax is [number,number].";

        if ((int)option[1] == 1 && !isGpuInit())
            throw "gpu is not initialised. Please launch gpuInit() before using this function.";

        sciErr = getVarType(pvApiCtx, piAddr_A, &inputType_A);
        if (sciErr.iErr) throw sciErr;

#ifdef WITH_CUDA
        if (useCuda())
        {
            if (inputType_A == sci_pointer)
            {
                sciErr = getPointer(pvApiCtx, piAddr_A, (void **)&pvPtr);
                if (sciErr.iErr) throw sciErr;
                gpuMat_CUDA *gmat;
                gmat = static_cast<gpuMat_CUDA *>(pvPtr);
                if (!gmat->useCuda)
                    throw "Please switch to OpenCL mode before using this data.";
                rows_A = gmat->rows;
                cols_A = gmat->columns;
                if (gmat->complex)
                {
                    bComplex_A = TRUE;
                    size_A = sizeof(cuDoubleComplex);
                    d_A = (cuDoubleComplex *)gmat->ptr->get_ptr();
                }
                else
                    d_A = (double *)gmat->ptr->get_ptr();

                // Initialize CUBLAS
                status = cublasInit();
                if (status != CUBLAS_STATUS_SUCCESS) throw status;

                na = rows_A * cols_A;
            }
            else if (inputType_A == 1)
            {
                // Get size and data
                if (isVarComplex(pvApiCtx, piAddr_A))
                {
                    sciErr = getComplexMatrixOfDouble(pvApiCtx, piAddr_A, &rows_A, &cols_A, &h_A, &hi_A);
                    if (sciErr.iErr) throw sciErr;
                    size_A = sizeof(cuDoubleComplex);
                    bComplex_A = TRUE;
                }
                else
                {
                    sciErr = getMatrixOfDouble(pvApiCtx, piAddr_A, &rows_A, &cols_A, &h_A);
                    if (sciErr.iErr) throw sciErr;
                }
                na = rows_A * cols_A;

                // Initialize CUBLAS
                status = cublasInit();
                if (status != CUBLAS_STATUS_SUCCESS) throw status;

                // Allocate device memory
                status = cublasAlloc(na, size_A, (void **)&d_A);
                if (status != CUBLAS_STATUS_SUCCESS) throw status;

                // Initialize the device matrices with the host matrices
                if (!bComplex_A)
                {
                    status = cublasSetMatrix(rows_A, cols_A, sizeof(double), h_A, rows_A, (double *)d_A, rows_A);
                    if (status != CUBLAS_STATUS_SUCCESS) throw status;
                }
                else
                    writecucomplex(h_A, hi_A, rows_A, cols_A, (cuDoubleComplex *)d_A);
            }
            else
                throw "Bad argument type.";

            cuDoubleComplex resComplex;
            // Performs operation
            if (!bComplex_A)
                status = decomposeBlockedLU(rows_A, cols_A, rows_A, (double *)d_A, 1);
            // else
            //     resComplex = cublasZtrsm(na, (cuDoubleComplex *)d_A);

            if (status != CUBLAS_STATUS_SUCCESS) throw status;

            // Put the result in scilab
            switch ((int)option[0])
            {
                case 2:
                case 1:
                    sciprint("The first option must be 0 for this function. Considered as 0.\n");
                    // fall through: treated as 0
                case 0: // Keep the result on the Host.
                {
                    // Put the result in scilab
                    if (!bComplex_A)
                    {
                        double *h_res = NULL;
                        sciErr = allocMatrixOfDouble(pvApiCtx, Rhs + posOutput, rows_A, cols_A, &h_res);
                        if (sciErr.iErr) throw sciErr;
                        status = cublasGetMatrix(rows_A, cols_A, sizeof(double), (double *)d_A, rows_A, h_res, rows_A);
                        if (status != CUBLAS_STATUS_SUCCESS) throw status;
                    }
                    else
                    {
                        sciErr = createComplexMatrixOfDouble(pvApiCtx, Rhs + posOutput, 1, 1, &resComplex.x, &resComplex.y);
                        if (sciErr.iErr) throw sciErr;
                    }
                    LhsVar(posOutput) = Rhs + posOutput;
                    posOutput++;
                    break;
                }
                default:
                    throw "First option argument must be 0 or 1 or 2.";
            }

            switch ((int)option[1])
            {
                case 0: // Don't keep the input data on the Device.
                {
                    if (inputType_A == sci_matrix)
                    {
                        status = cublasFree(d_A);
                        if (status != CUBLAS_STATUS_SUCCESS) throw status;
                        d_A = NULL;
                    }
                    break;
                }
                case 1: // Keep data of the first argument on the Device and return the Device pointer.
                {
                    if (inputType_A == sci_matrix)
                    {
                        gpuMat_CUDA *dptr;
                        gpuMat_CUDA tmp = {getCudaContext()->genMatrix<double>(getCudaQueue(), rows_A * cols_A), rows_A, cols_A};
                        dptr = new gpuMat_CUDA(tmp);
                        dptr->useCuda = true;
                        dptr->ptr->set_ptr((double *)d_A);
                        if (bComplex_A)
                            dptr->complex = TRUE;
                        else
                            dptr->complex = FALSE;
                        sciErr = createPointer(pvApiCtx, Rhs + posOutput, (void *)dptr);
                        if (sciErr.iErr) throw sciErr;
                        LhsVar(posOutput) = Rhs + posOutput;
                    }
                    else
                        throw "The first input argument is already a GPU variable.";
                    posOutput++;
                    break;
                }
                default:
                    throw "Second option argument must be 0 or 1.";
            }

            // Shutdown
            status = cublasShutdown();
            if (status != CUBLAS_STATUS_SUCCESS) throw status;
        }
#endif

#ifdef WITH_OPENCL
        if (!useCuda())
        {
            throw "not implemented with OpenCL.";
        }
#endif
        if (Rhs == 1)
        {
            free(option);
            option = NULL;
        }

        if (posOutput < Lhs + 1)
            throw "Too many output arguments.";
        if (posOutput > Lhs + 1)
            throw "Too few output arguments.";

        PutLhsVar();
        return 0;
    }
    catch (const char *str)
    {
        Scierror(999, "%s\n", str);
    }
    catch (SciErr E)
    {
        printError(&E, 0);
    }
#ifdef WITH_CUDA
    catch (cudaError_t cudaE)
    {
        GpuError::treat_error<CUDAmode>((CUDAmode::Status)cudaE);
    }
    catch (cublasStatus CublasE)
    {
        GpuError::treat_error<CUDAmode>((CUDAmode::Status)CublasE, 1);
    }
    if (useCuda())
    {
        if (inputType_A == 1 && d_A != NULL)
            cudaFree(d_A);
    }
#endif
#ifdef WITH_OPENCL
    if (!useCuda())
    {
        Scierror(999, "not implemented with OpenCL.\n");
    }
#endif
    if (Rhs == 1 && option != NULL)
        free(option);
    return EXIT_FAILURE;
}
/* Main */
int main(int argc, char **argv)
{
    cublasStatus_t status;
    float *h_A;
    float *h_B;
    float *h_C;
    float *h_C_ref;
    float *d_A = 0;
    float *d_B = 0;
    float *d_C = 0;
    float alpha = 1.0f;
    float beta = 0.0f;
    int n2 = N * N;
    int i;
    float error_norm;
    float ref_norm;
    float diff;
    cublasHandle_t handle;

    int dev = findCudaDevice(argc, (const char **) argv);
    if (dev == -1) { return EXIT_FAILURE; }

    /* Initialize CUBLAS */
    printf("simpleCUBLAS test running..\n");
    status = cublasCreate(&handle);
    if (status != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, "!!!! CUBLAS initialization error\n"); return EXIT_FAILURE; }

    /* Allocate host memory for the matrices */
    h_A = (float *)malloc(n2 * sizeof(h_A[0]));
    if (h_A == 0) { fprintf(stderr, "!!!! host memory allocation error (A)\n"); return EXIT_FAILURE; }
    h_B = (float *)malloc(n2 * sizeof(h_B[0]));
    if (h_B == 0) { fprintf(stderr, "!!!! host memory allocation error (B)\n"); return EXIT_FAILURE; }
    h_C = (float *)malloc(n2 * sizeof(h_C[0]));
    if (h_C == 0) { fprintf(stderr, "!!!! host memory allocation error (C)\n"); return EXIT_FAILURE; }

    /* Fill the matrices with test data */
    for (i = 0; i < n2; i++)
    {
        h_A[i] = rand() / (float)RAND_MAX;
        h_B[i] = rand() / (float)RAND_MAX;
        h_C[i] = rand() / (float)RAND_MAX;
    }

    /* Allocate device memory for the matrices */
    if (cudaMalloc((void **)&d_A, n2 * sizeof(d_A[0])) != cudaSuccess) { fprintf(stderr, "!!!! device memory allocation error (allocate A)\n"); return EXIT_FAILURE; }
    if (cudaMalloc((void **)&d_B, n2 * sizeof(d_B[0])) != cudaSuccess) { fprintf(stderr, "!!!! device memory allocation error (allocate B)\n"); return EXIT_FAILURE; }
    if (cudaMalloc((void **)&d_C, n2 * sizeof(d_C[0])) != cudaSuccess) { fprintf(stderr, "!!!! device memory allocation error (allocate C)\n"); return EXIT_FAILURE; }

    /* Initialize the device matrices with the host matrices */
    status = cublasSetVector(n2, sizeof(h_A[0]), h_A, 1, d_A, 1);
    if (status != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, "!!!! device access error (write A)\n"); return EXIT_FAILURE; }
    status = cublasSetVector(n2, sizeof(h_B[0]), h_B, 1, d_B, 1);
    if (status != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, "!!!! device access error (write B)\n"); return EXIT_FAILURE; }
    status = cublasSetVector(n2, sizeof(h_C[0]), h_C, 1, d_C, 1);
    if (status != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, "!!!! device access error (write C)\n"); return EXIT_FAILURE; }

    /* Performs operation using plain C code */
    simple_sgemm(N, alpha, h_A, h_B, beta, h_C);
    h_C_ref = h_C;

    /* Performs operation using cublas */
    status = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, d_A, N, d_B, N, &beta, d_C, N);
    if (status != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, "!!!! kernel execution error.\n"); return EXIT_FAILURE; }

    /* Allocate host memory for reading back the result from device memory */
    h_C = (float *)malloc(n2 * sizeof(h_C[0]));
    if (h_C == 0) { fprintf(stderr, "!!!! host memory allocation error (C)\n"); return EXIT_FAILURE; }

    /* Read the result back */
    status = cublasGetVector(n2, sizeof(h_C[0]), d_C, 1, h_C, 1);
    if (status != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, "!!!! device access error (read C)\n"); return EXIT_FAILURE; }

    /* Check result against reference */
    error_norm = 0;
    ref_norm = 0;
    for (i = 0; i < n2; ++i)
    {
        diff = h_C_ref[i] - h_C[i];
        error_norm += diff * diff;
        ref_norm += h_C_ref[i] * h_C_ref[i];
    }
    error_norm = (float)sqrt((double)error_norm);
    ref_norm = (float)sqrt((double)ref_norm);
    if (fabs(ref_norm) < 1e-7) { fprintf(stderr, "!!!! reference norm is 0\n"); return EXIT_FAILURE; }

    /* Memory clean up */
    free(h_A);
    free(h_B);
    free(h_C);
    free(h_C_ref);
    if (cudaFree(d_A) != cudaSuccess) { fprintf(stderr, "!!!! memory free error (A)\n"); return EXIT_FAILURE; }
    if (cudaFree(d_B) != cudaSuccess) { fprintf(stderr, "!!!! memory free error (B)\n"); return EXIT_FAILURE; }
    if (cudaFree(d_C) != cudaSuccess) { fprintf(stderr, "!!!! memory free error (C)\n"); return EXIT_FAILURE; }

    /* Shutdown */
    status = cublasDestroy(handle);
    if (status != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, "!!!! shutdown error (A)\n"); return EXIT_FAILURE; }

    if (error_norm / ref_norm < 1e-6f)
    {
        printf("simpleCUBLAS test passed.\n");
        exit(EXIT_SUCCESS);
    }
    else
    {
        printf("simpleCUBLAS test failed.\n");
        exit(EXIT_FAILURE);
    }
}
int main(int argc, char* argv[])
{
    // Process config (to be filled completely later).
    config_t config;
    config.idevice = 0;
    config.nx = nx;
    config.ny = ny;
    config.step = 0;

    // Create shared memory region.
    int fd = shm_open("/shmem_mmap_cuda_shm", O_CREAT | O_RDWR, S_IRUSR | S_IWUSR);
    if (fd == -1) { fprintf(stderr, "Cannot open shared region, errno = %d\n", errno); return errno; }

    // Create first semaphore (set to 0 to create it initially locked).
    sem_t* sem1 = sem_open("/shmem_mmap_cuda_sem1", O_CREAT, S_IRWXU | S_IRWXG | S_IRWXO, 0);
    if (sem1 == SEM_FAILED) { fprintf(stderr, "Cannot open semaphore #1, errno = %d\n", errno); return errno; }

    // Create second semaphore (set to 0 to create it initially locked).
    sem_t* sem2 = sem_open("/shmem_mmap_cuda_sem2", O_CREAT, S_IRWXU | S_IRWXG | S_IRWXO, 0);
    if (sem2 == SEM_FAILED) { fprintf(stderr, "Cannot open semaphore #2, errno = %d\n", errno); return errno; }

    // Call fork to create another process.
    // Standard: "Memory mappings created in the parent
    // shall be retained in the child process."
    pid_t fork_status = fork();

    // From this point two processes are running the same code, if no errors.
    if (fork_status == -1) { fprintf(stderr, "Cannot fork process, errno = %d\n", errno); return errno; }

    // Get the process ID.
    int pid = (int)getpid();

    // By the fork return value we can determine the process role:
    // master or child (worker).
    int master = fork_status ? 1 : 0, worker = !master;

    int ndevices = 0;
    cudaError_t cuda_status = cudaGetDeviceCount(&ndevices);
    if (cuda_status != cudaSuccess) { fprintf(stderr, "Cannot get the cuda device count by process %d, status = %d\n", pid, cuda_status); return cuda_status; }

    // Return if no cuda devices present.
    if (master) printf("%d CUDA device(s) found\n", ndevices);
    if (!ndevices) return 0;
    ndevices = 1;

    size_t np = nx * ny;
    size_t size = np * sizeof(float);

    float* inout;
    if (!master)
    {
        // Lock semaphore to finish shared region configuration on master.
        int sem_status = sem_wait(sem1);
        if (sem_status == -1) { fprintf(stderr, "Cannot wait on semaphore by process %d, errno = %d\n", pid, errno); return errno; }

        // Map the shared region into the address space of the current process.
        inout = (float*)mmap(0, size * (ndevices + 1), PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (inout == MAP_FAILED) { fprintf(stderr, "Cannot map shared region to memory by process %d, errno = %d\n", pid, errno); return errno; }
    }
    else
    {
        config.idevice = ndevices;

        // Set shared region size.
        int ftrunk_status = ftruncate(fd, size * (ndevices + 1));
        if (ftrunk_status == -1) { fprintf(stderr, "Cannot truncate shared region, errno = %d\n", errno); return errno; }

        // Map the shared region into the address space of the current process.
        inout = (float*)mmap(0, size * (ndevices + 1), PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (inout == MAP_FAILED) { fprintf(stderr, "Cannot map shared region to memory by process %d, errno = %d\n", pid, errno); return errno; }

        // Create input data. Let each device have an equal piece
        // of the single shared data array.
        float invdrandmax = 1.0 / RAND_MAX;
        for (size_t i = 0; i < np; i++)
            inout[i] = rand() * invdrandmax;
        for (int i = 0; i < ndevices; i++)
            memcpy(inout + np * (i + 1), inout, np * sizeof(float));

        // Sync changed content with shared region.
        int msync_status = msync(inout, size * (ndevices + 1), MS_SYNC);
        if (msync_status == -1) { fprintf(stderr, "Cannot sync shared memory %p, errno = %d\n", inout, errno); return errno; }

        // Unlock semaphore to let other processes move forward.
        int sem_status = sem_post(sem1);
        if (sem_status == -1) { fprintf(stderr, "Cannot post on semaphore by process %d, errno = %d\n", pid, errno); return errno; }
    }

    config.inout_cpu = inout + config.idevice * np;

    // Let workers use the CUDA devices, and the master the CPU.
    // Create device buffers.
    if (worker)
    {
        // Create device arrays for input and output data.
        cuda_status = cudaMalloc((void**)&config.in_dev, size);
        if (cuda_status != cudaSuccess) { fprintf(stderr, "Cannot allocate CUDA input buffer by process %d, status = %d\n", pid, cuda_status); return cuda_status; }
        cuda_status = cudaMalloc((void**)&config.out_dev, size);
        if (cuda_status != cudaSuccess) { fprintf(stderr, "Cannot allocate CUDA output buffer by process %d, status = %d\n", pid, cuda_status); return cuda_status; }
    }
    else
    {
        // Create host arrays for input and output data.
        config.in_dev = (float*)malloc(size);
        config.out_dev = (float*)malloc(size);
    }

    printf("Device %d initialized by process %d\n", config.idevice, pid);

    // Perform some "iterations" on data arrays, assigned to devices,
    // and shift the input data array after each iteration.
    for (int i = 0; i < nticks; i++)
    {
        int status;
        if (master)
        {
            // Copy input data to device buffer.
            memcpy(config.in_dev, config.inout_cpu, size);

            status = pattern2d_cpu(1, config.nx, 1, 1, config.ny, 1, config.in_dev, config.out_dev, config.idevice);
            if (status) { fprintf(stderr, "Cannot execute pattern 2d by process %d, status = %d\n", pid, status); return status; }

            // Copy output data from device buffer.
            memcpy(config.inout_cpu, config.out_dev, size);

            // Sync with changed content in shared region.
            int msync_status = msync(inout, size * (ndevices + 1), MS_SYNC);
            if (msync_status == -1) { fprintf(stderr, "Cannot sync shared memory %p, errno = %d\n", inout, errno); return errno; }

            int sem_status = sem_post(sem1);
            if (sem_status == -1) { fprintf(stderr, "Cannot post on semaphore #1 by process %d, errno = %d\n", pid, errno); return errno; }
            sem_status = sem_wait(sem2);
            if (sem_status == -1) { fprintf(stderr, "Cannot wait on semaphore #2 by process %d, errno = %d\n", pid, errno); return errno; }
        }
        else
        {
            // Copy input data to device buffer.
            cuda_status = cudaMemcpy(config.in_dev, config.inout_cpu, size, cudaMemcpyHostToDevice);
            if (cuda_status != cudaSuccess) { fprintf(stderr, "Cannot copy input data to CUDA buffer by process %d, status = %d\n", pid, cuda_status); return cuda_status; }

            status = pattern2d_gpu(1, config.nx, 1, 1, config.ny, 1, config.in_dev, config.out_dev, config.idevice);
            if (status) { fprintf(stderr, "Cannot execute pattern 2d by process %d, status = %d\n", pid, status); return status; }

            // Copy output data from device buffer.
            cuda_status = cudaMemcpy(config.inout_cpu, config.out_dev, size, cudaMemcpyDeviceToHost);
            if (cuda_status != cudaSuccess) { fprintf(stderr, "Cannot copy output data from CUDA buffer by process %d, status = %d\n", pid, cuda_status); return cuda_status; }

            // Sync with changed content in shared region.
            int msync_status = msync(inout, size * (ndevices + 1), MS_SYNC);
            if (msync_status == -1) { fprintf(stderr, "Cannot sync shared memory %p, errno = %d\n", inout, errno); return errno; }

            int sem_status = sem_wait(sem1);
            if (sem_status == -1) { fprintf(stderr, "Cannot wait on semaphore #1 by process %d, errno = %d\n", pid, errno); return errno; }
            sem_status = sem_post(sem2);
            if (sem_status == -1) { fprintf(stderr, "Cannot post on semaphore #2 by process %d, errno = %d\n", pid, errno); return errno; }
        }

        // At this point the two processes are synchronized.
        config.step++;

        // Reassign processes' input data segments to show some
        // possible manipulation on shared memory.
        // Here we perform a cyclic shift of data pointers.
        config.idevice++;
        config.idevice %= ndevices + 1;
        config.inout_cpu = inout + config.idevice * np;
    }

    // Release device buffers.
    if (worker)
    {
        cuda_status = cudaFree(config.in_dev);
        if (cuda_status != cudaSuccess) { fprintf(stderr, "Cannot release input buffer by process %d, status = %d\n", pid, cuda_status); return cuda_status; }
        cuda_status = cudaFree(config.out_dev);
        if (cuda_status != cudaSuccess) { fprintf(stderr, "Cannot release output buffer by process %d, status = %d\n", pid, cuda_status); return cuda_status; }
    }
    else
    {
        free(config.in_dev);
        free(config.out_dev);
    }

    printf("Device %d deinitialized by process %d\n", config.idevice, pid);

    // On the master process perform a results check:
    // compare each GPU result to the CPU result.
    if (master)
    {
        float* control = inout + np * ndevices;
        for (int idevice = 0; idevice < ndevices; idevice++)
        {
            // Find the maximum absolute difference.
            int maxi = 0, maxj = 0;
            float maxdiff = fabs(control[0] - (inout + idevice * np)[0]);
            for (int j = 0; j < ny; j++)
            {
                for (int i = 0; i < nx; i++)
                {
                    float diff = fabs(control[i + j * nx] - (inout + idevice * np)[i + j * nx]);
                    if (diff > maxdiff)
                    {
                        maxdiff = diff;
                        maxi = i;
                        maxj = j;
                    }
                }
            }
            printf("Device %d result abs max diff = %f @ (%d,%d)\n", idevice, maxdiff, maxi, maxj);
        }
    }

    // Unlink semaphore #1.
    if (master)
    {
        int sem_status = sem_unlink("/shmem_mmap_cuda_sem1");
        if (sem_status == -1) { fprintf(stderr, "Cannot unlink semaphore #1 by process %d, errno = %d\n", pid, errno); return errno; }
    }

    // Close semaphore #1.
    int sem_status = sem_close(sem1);
    if (sem_status == -1) { fprintf(stderr, "Cannot close semaphore #1 by process %d, errno = %d\n", pid, errno); return errno; }

    // Unlink semaphore #2.
    if (master)
    {
        int sem_status = sem_unlink("/shmem_mmap_cuda_sem2");
        if (sem_status == -1) { fprintf(stderr, "Cannot unlink semaphore #2 by process %d, errno = %d\n", pid, errno); return errno; }
    }

    // Close semaphore #2.
    sem_status = sem_close(sem2);
    if (sem_status == -1) { fprintf(stderr, "Cannot close semaphore #2 by process %d, errno = %d\n", pid, errno); return errno; }

    // Unmap shared region.
    close(fd);
    int munmap_status = munmap(inout, size * (ndevices + 1));
    if (munmap_status == -1) { fprintf(stderr, "Cannot unmap shared region by process %d, errno = %d\n", pid, errno); return errno; }

    // Unlink shared region.
    if (master)
    {
        int unlink_status = shm_unlink("/shmem_mmap_cuda_shm");
        if (unlink_status == -1) { fprintf(stderr, "Cannot unlink shared region by process %d, errno = %d\n", pid, errno); return errno; }
    }

    return 0;
}
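// The fork/mmap sample above shares results through a POSIX shared-memory
// region on the host. For sharing *device* allocations directly between
// processes, the CUDA runtime also provides IPC handles. A minimal sketch,
// assuming both processes use the same GPU and that the handle itself is
// passed through host shared memory (function names are hypothetical,
// error checking omitted):
#include <cuda_runtime.h>

// Owning process: export a device allocation.
void export_device_buffer(float* d_buf, cudaIpcMemHandle_t* out_handle) {
    cudaIpcGetMemHandle(out_handle, d_buf);  // POD handle, copyable through shm
}

// Peer process: map the exported allocation into its own address space.
float* import_device_buffer(cudaIpcMemHandle_t handle) {
    void* d_remote = nullptr;
    cudaIpcOpenMemHandle(&d_remote, handle, cudaIpcMemLazyEnablePeerAccess);
    return (float*)d_remote;  // release later with cudaIpcCloseMemHandle(d_remote)
}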
OsdCudaTable::~OsdCudaTable() {
    if (_devicePtr) cudaFree(_devicePtr);
}
int main(int argc, char *argv[])
{
    // needed to work correctly with piped benchmarkrunner
    setlinebuf(stdout);
    setlinebuf(stdin);

    int n_indices = 1;
    int n_dimensions = 1;

    char inBuf[200]; // ridiculously large input buffer.
    bool isFirst = true;

    do
    {
        // Allocate memory for the arrays
        int *h_indices = 0;
        double *h_outputGPU = 0;

        try
        {
            h_indices = new int[n_indices * n_dimensions];
            h_outputGPU = new double[n_indices * n_dimensions];
        }
        catch (const std::exception &e)
        {
            std::cerr << "Caught exception: " << e.what() << std::endl;
            std::cerr << "Unable to allocate CPU memory (try running with fewer vectors/dimensions)" << std::endl;
            return -1;
        }

        int *d_indices;
        double *d_output;

        try
        {
            cudaError_t cudaResult;
            cudaResult = cudaMalloc((void **)&d_indices, n_dimensions * n_indices * sizeof(int));
            if (cudaResult != cudaSuccess)
            {
                throw std::runtime_error(cudaGetErrorString(cudaResult));
            }
        }
        catch (const std::runtime_error &e)
        {
            std::cerr << "Caught exception: " << e.what() << std::endl;
            std::cerr << "Unable to allocate GPU memory (try running with fewer vectors/dimensions)" << std::endl;
            return -1;
        }

        // Initialize the indices (done on the host)
        for (int i = 0; i < n_indices; i++)
        {
            h_indices[i] = i;
        }

        // Copy the indices to the device
        cudaMemcpy(d_indices, h_indices, n_dimensions * n_indices * sizeof(int), cudaMemcpyHostToDevice);
        cudaDeviceSynchronize();

        // Execute the QRNG on the device (d_output is allocated by the callee)
        int n_vec;
        sobol_nikola_unsimplified(n_indices, d_indices, n_indices, &d_output, &n_vec);
        cudaDeviceSynchronize();

        cudaMemcpy(h_outputGPU, d_output, n_indices * n_dimensions * sizeof(double), cudaMemcpyDeviceToHost);

        // Cleanup and terminate (arrays from new[] require delete[])
        delete[] h_indices;
        cudaFree(d_indices);
        cudaFree(d_output);

        if (!isFirst)
        {
            printf("RESULT ");
            for (int i = 0; i < std::min(n_indices, 10); i++)
                printf("%f ", h_outputGPU[i]);
            printf("\n");
        }
        else
        {
            printf("OK\n");
            isFirst = false;
        }

        delete[] h_outputGPU;

        fgets(inBuf, 200, stdin);
        if (sscanf(inBuf, "%d", &n_indices) == 0)
        {
            // if input is not a number, it has to be "EXIT"
            if (strncmp("EXIT", inBuf, 4) == 0)
            {
                printf("OK\n");
                break;
            }
            else
            {
                printf("ERROR. Bad input: %s\n", inBuf);
                break;
            }
        }
    } while (true);

    cudaDeviceReset();
    return -1;
}
PhysicsProcessor::~PhysicsProcessor(void) {
    gpuErrchk(cudaFree(d_V));
}
void ControlCubeCache::_reSizeCache()
{
    _nLevels = _nextnLevels;
    _levelCube = _nextLevelCube;
    _offset = _nextOffset;
    _nextnLevels = 0;
    _nextLevelCube = 0;

    _dimCube = exp2(_nLevels - _levelCube) + 2 * CUBE_INC;
    _sizeElement = pow(_dimCube, 3);

    int dimV = exp2(_nLevels);
    _minValue = coordinateToIndex(vmml::vector<3,int>(0,0,0), _levelCube, _nLevels);
    _maxValue = coordinateToIndex(vmml::vector<3,int>(dimV-1, dimV-1, dimV-1), _levelCube, _nLevels);

    int dc = exp2(_nLevels - _levelCube);
    vmml::vector<3,int> mn = _cpuCache->getMinCoord();
    vmml::vector<3,int> mx = _cpuCache->getMaxCoord();
    _maxC = mx - mn;
    if ((mx.x() - mn.x()) % dc != 0) _maxC[0] += dc;
    if ((mx.y() - mn.y()) % dc != 0) _maxC[1] += dc;
    if ((mx.z() - mn.z()) % dc != 0) _maxC[2] += dc;

    if (cudaSuccess != cudaSetDevice(_device))
    {
        std::cerr << "Control Cube Cache, error setting device: " << cudaGetErrorString(cudaGetLastError()) << std::endl;
        throw;
    }

    if (_memory != 0)
        if (cudaSuccess != cudaFree((void*)_memory))
        {
            std::cerr << "Control Cube Cache, error resizing cache: " << cudaGetErrorString(cudaGetLastError()) << std::endl;
            throw;
        }

    size_t total = 0;
    size_t free = 0;
    if (cudaSuccess != cudaMemGetInfo(&free, &total))
    {
        std::cerr << "Control Cube Cache, error resizing cache: " << cudaGetErrorString(cudaGetLastError()) << std::endl;
        throw;
    }

    float memorySize = (0.80f * free); // Take 80% of the free memory

    _maxNumCubes = memorySize / (_sizeElement * sizeof(float));
    if (_maxNumCubes == 0)
    {
        std::cerr << "Control Cube Cache: available memory is not enough " << memorySize/1024/1024 << " MB" << std::endl;
        throw;
    }

    if (cudaSuccess != cudaMalloc((void**)&_memory, _maxNumCubes * _sizeElement * sizeof(float)))
    {
        std::cerr << "Control Cube Cache, error resizing cache: " << cudaGetErrorString(cudaGetLastError()) << std::endl;
        throw;
    }

    _freeSlots = _maxNumCubes;

    ControlElementCache::_reSizeCache();
}
void mpla_redistribute_vector_for_dgesv(struct mpla_vector* b_redist, struct mpla_vector* b, struct mpla_matrix* A, struct mpla_instance* instance)
{
    // attention: this code does no correctness check for the input data

    // b_redist->vec_row_count = b->vec_row_count;
    //
    // // allocating memory for process-wise vector information
    // vector->proc_row_count = new int*[instance->proc_rows];
    // vector->proc_row_offset = new int*[instance->proc_rows];
    // for (int i=0; i<instance->proc_rows; i++)
    // {
    //     b_redist->proc_row_count[i] = new int[instance->proc_cols];
    //     b_redist->proc_row_offset[i] = new int[instance->proc_cols];
    // }
    //
    // // set sizes
    // for (int i=0; i<instance->proc_rows; i++)
    //     for (int j=0; j<instance->proc_cols; j++)
    //     {
    //         b_redist->proc_row_count[i][j] = A->proc_col_count[i][j];
    //         b_redist->proc_row_offset[i][j] = A->proc_col_offset[i][j];
    //     }
    //
    // // retrieving local data for current process
    // b_redist->cur_proc_row_count = A->cur_proc_col_count;
    // b_redist->cur_proc_row_offset = A->cur_proc_col_offset;
    //
    // // allocating temporary vector storage
    // cudaMalloc((void**)&(b_redist->data), sizeof(double)*b_redist->cur_proc_row_count);

    // WARNING: The following code is not efficient for a strong parallelization !!!!!

    // create sub-communicator for each process column
    int remain_dims[2];
    remain_dims[0] = 1;
    remain_dims[1] = 0;
    MPI_Comm column_comm;
    MPI_Cart_sub(instance->comm, remain_dims, &column_comm);
    int column_rank;
    MPI_Comm_rank(column_comm, &column_rank);

    // column-wise creation of the full vector
    double* full_vector;
    int* recvcounts = new int[instance->proc_rows];
    int* displs = new int[instance->proc_rows];
    for (int i = 0; i < instance->proc_rows; i++)
    {
        recvcounts[i] = b->proc_row_count[i][instance->cur_proc_col];
        displs[i] = b->proc_row_offset[i][instance->cur_proc_col];
    }
    cudaMalloc((void**)&full_vector, sizeof(double) * b->vec_row_count);
    cudaThreadSynchronize(); // deprecated; cudaDeviceSynchronize() in current CUDA
    checkCUDAError("cudaMalloc");

    // gathers directly from/to device buffers, which requires a CUDA-aware MPI build
    MPI_Allgatherv(b->data, b->cur_proc_row_count, MPI_DOUBLE, full_vector, recvcounts, displs, MPI_DOUBLE, column_comm);

    // extract column-wise local part of full vector
    cudaMemcpy(b_redist->data, &(full_vector[b_redist->cur_proc_row_offset]), sizeof(double) * b_redist->cur_proc_row_count, cudaMemcpyDeviceToDevice);

    // memory cleanup
    cudaFree(full_vector);
    delete[] recvcounts;
    delete[] displs;
    MPI_Comm_free(&column_comm);
}
RealKernel::~RealKernel() {
    delete[] data;
#ifdef GPU_ENABLED
    cudaFree(dataGpu);
#endif
}
void run_2D_GLOBAL_MEMORY()
{
    int arrayWidth = 4;
    int arrayHeight = 4;
    bool SEQ = true;

    /* Host allocation */
    float* inArr_1_H = (float*) malloc(arrayWidth * arrayHeight * sizeof(float));
    float* inArr_2_H = (float*) malloc(arrayWidth * arrayHeight * sizeof(float));
    float* outArr_H  = (float*) malloc(arrayWidth * arrayHeight * sizeof(float));

    /* Fill arrays */
    int index = 0;
    if (SEQ)
    {
        int ctr = 0;
        for (int j = 0; j < arrayHeight; j++)
        {
            for (int i = 0; i < arrayWidth; i++)
            {
                index = (j * arrayWidth) + i;
                inArr_1_H[index] = (float) ctr++;
                inArr_2_H[index] = (float) ctr++;
                outArr_H[index] = (float) 0;
            }
        }
    }
    else
    {
        for (int j = 0; j < arrayHeight; j++)
        {
            for (int i = 0; i < arrayWidth; i++)
            {
                index = (j * arrayWidth) + i;
                inArr_1_H[index] = (float)rand() / (float)RAND_MAX;
                inArr_2_H[index] = (float)rand() / (float)RAND_MAX;
                outArr_H[index] = 0;
            }
        }
    }

    /* Print host arrays */
    printf("inArr_1_H \n");
    print_2D_Array(inArr_1_H, arrayWidth, arrayHeight);
    printf("inArr_2_H \n");
    print_2D_Array(inArr_2_H, arrayWidth, arrayHeight);

    /* Device allocation + <__pitch>. Note: the width-in-bytes and height
       arguments are passed as (arrayHeight * sizeof(float), arrayWidth);
       this is used consistently below, and the two are equal here anyway. */
    float *inArr_1_D, *inArr_2_D, *outArr_D;
    size_t __pitch;
    cudaMallocPitch((void**)&inArr_1_D, &__pitch, arrayHeight * sizeof(float), arrayWidth);
    cudaMallocPitch((void**)&inArr_2_D, &__pitch, arrayHeight * sizeof(float), arrayWidth);
    cudaMallocPitch((void**)&outArr_D, &__pitch, arrayHeight * sizeof(float), arrayWidth);

    /* Print __pitch */
    printf("__pitch %zu \n", (__pitch / sizeof(float)));

    /* Upload data */
    cudaMemcpy2D(inArr_1_D, __pitch, inArr_1_H, arrayHeight * sizeof(float), arrayHeight * sizeof(float), arrayWidth, cudaMemcpyHostToDevice);
    cudaMemcpy2D(inArr_2_D, __pitch, inArr_2_H, arrayHeight * sizeof(float), arrayHeight * sizeof(float), arrayWidth, cudaMemcpyHostToDevice);

    /* Gridding */
    dim3 __numBlocks(1, 1, 1);
    dim3 __numThreadsPerBlock(BLOCK_SIZE, BLOCK_SIZE, 1);
    __numBlocks.x = (arrayWidth / BLOCK_SIZE) + ((arrayWidth % BLOCK_SIZE) == 0 ? 0 : 1);
    __numBlocks.y = (arrayHeight / BLOCK_SIZE) + ((arrayHeight % BLOCK_SIZE) == 0 ? 0 : 1);

    /* Kernel invocation */
    add_2D_Array(inArr_1_D, inArr_2_D, outArr_D, arrayWidth, arrayHeight, __pitch, __numBlocks, __numThreadsPerBlock);

    /* Synchronization */
    cudaThreadSynchronize(); // deprecated; cudaDeviceSynchronize() in current CUDA

    /* Download result */
    cudaMemcpy2D(outArr_H, arrayHeight * sizeof(float), outArr_D, __pitch, arrayHeight * sizeof(float), arrayWidth, cudaMemcpyDeviceToHost);

    /* Free device arrays */
    cudaFree(inArr_1_D);
    cudaFree(inArr_2_D);
    cudaFree(outArr_D);

    /* Display results */
    printf("outArr \n");
    print_2D_Array(outArr_H, arrayWidth, arrayHeight);

    /* Free host arrays */
    free(inArr_1_H);
    free(inArr_2_H);
    free(outArr_H);
}
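// The pitched buffers above must be indexed through the pitch in bytes: row y
// does not start at base + y * width. A hypothetical kernel sketch showing the
// addressing rule that the kernel behind the add_2D_Array wrapper would need
// (the kernel name and signature here are assumptions, not from the snippet):
__global__ void add2D_pitched(const float* a, const float* b, float* out,
                              int width, int height, size_t pitch) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < width && y < height) {
        // Row y starts pitch *bytes* (not width elements) after row y-1.
        const float* rowA = (const float*)((const char*)a + y * pitch);
        const float* rowB = (const float*)((const char*)b + y * pitch);
        float*       rowO = (float*)((char*)out + y * pitch);
        rowO[x] = rowA[x] + rowB[x];
    }
}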
void gpu_data::
set_size(size_t new_size)
{
    if (new_size == 0)
    {
        if (device_in_use)
        {
            // Wait for any possible CUDA kernels that might be using our memory block to
            // complete before we free the memory.
            synchronize_stream(0);
            device_in_use = false;
        }
        wait_for_transfer_to_finish();
        data_size = 0;
        host_current = true;
        device_current = true;
        device_in_use = false;
        data_host.reset();
        data_device.reset();
    }
    else if (new_size != data_size)
    {
        if (device_in_use)
        {
            // Wait for any possible CUDA kernels that might be using our memory block to
            // complete before we free the memory.
            synchronize_stream(0);
            device_in_use = false;
        }
        wait_for_transfer_to_finish();
        data_size = new_size;
        host_current = true;
        device_current = true;
        device_in_use = false;

        try
        {
            CHECK_CUDA(cudaGetDevice(&the_device_id));

            // free memory blocks before we allocate new ones.
            data_host.reset();
            data_device.reset();

            void* data;
            CHECK_CUDA(cudaMallocHost(&data, new_size*sizeof(float)));
            // Note that we don't throw exceptions since the free calls are invariably
            // called in destructors.  They also shouldn't fail anyway unless someone
            // is resetting the GPU card in the middle of their program.
            data_host.reset((float*)data, [](float* ptr){
                auto err = cudaFreeHost(ptr);
                if (err != cudaSuccess)
                    std::cerr << "cudaFreeHost() failed. Reason: " << cudaGetErrorString(err) << std::endl;
            });

            CHECK_CUDA(cudaMalloc(&data, new_size*sizeof(float)));
            data_device.reset((float*)data, [](float* ptr){
                auto err = cudaFree(ptr);
                if (err != cudaSuccess)
                    std::cerr << "cudaFree() failed. Reason: " << cudaGetErrorString(err) << std::endl;
            });

            if (!cuda_stream)
            {
                cudaStream_t cstream;
                CHECK_CUDA(cudaStreamCreateWithFlags(&cstream, cudaStreamNonBlocking));
                cuda_stream.reset(cstream, [](void* ptr){
                    auto err = cudaStreamDestroy((cudaStream_t)ptr);
                    if (err != cudaSuccess)
                        std::cerr << "cudaStreamDestroy() failed. Reason: " << cudaGetErrorString(err) << std::endl;
                });
            }
        }
        catch(...)
        {
            set_size(0);
            throw;
        }
    }
}
void TxVectorOptimizationDataCU::freeResources() {
    if (devicePtr) {
        cudaError_t err = cudaFree(devicePtr);
        CHKCUDAERR(err);
    }
}
int main(int argc, char **argv)
{
    // Start logs
    printf("%s Starting...\n\n", argv[0]);

    unsigned int useDoublePrecision;
    char *precisionChoice;
    getCmdLineArgumentString(argc, (const char **)argv, "type", &precisionChoice);
    if (precisionChoice == NULL)
    {
        useDoublePrecision = 0;
    }
    else
    {
        if (!STRCASECMP(precisionChoice, "double"))
            useDoublePrecision = 1;
        else
            useDoublePrecision = 0;
    }

    unsigned int tableCPU[QRNG_DIMENSIONS][QRNG_RESOLUTION];

    float *h_OutputGPU, *d_Output;

    int dim, pos;
    double delta, ref, sumDelta, sumRef, L1norm, gpuTime;

    StopWatchInterface *hTimer = NULL;

    if (sizeof(INT64) != 8)
    {
        printf("sizeof(INT64) != 8\n");
        return 0;
    }

    // use command-line specified CUDA device, otherwise use device with highest Gflops/s
    int dev = findCudaDevice(argc, (const char **)argv);

    sdkCreateTimer(&hTimer);

    int deviceIndex;
    checkCudaErrors(cudaGetDevice(&deviceIndex));
    cudaDeviceProp deviceProp;
    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, deviceIndex));
    int version = deviceProp.major * 10 + deviceProp.minor;

    if (useDoublePrecision && version < 13)
    {
        printf("Double precision not supported.\n");
        // cudaDeviceReset causes the driver to clean up all state. While
        // not mandatory in normal operation, it is good practice. It is also
        // needed to ensure correct operation when the application is being
        // profiled. Calling cudaDeviceReset causes all profile data to be
        // flushed before the application exits
        cudaDeviceReset();
        return 0;
    }

    printf("Allocating GPU memory...\n");
    checkCudaErrors(cudaMalloc((void **)&d_Output, QRNG_DIMENSIONS * N * sizeof(float)));

    printf("Allocating CPU memory...\n");
    h_OutputGPU = (float *)malloc(QRNG_DIMENSIONS * N * sizeof(float));

    printf("Initializing QRNG tables...\n\n");
    initQuasirandomGenerator(tableCPU);

    if (useDoublePrecision)
        initTable_SM13(tableCPU);
    else
        initTable_SM10(tableCPU);

    printf("Testing QRNG...\n\n");
    checkCudaErrors(cudaMemset(d_Output, 0, QRNG_DIMENSIONS * N * sizeof(float)));
    int numIterations = 20;

    for (int i = -1; i < numIterations; i++)
    {
        // i == -1 is a warmup pass; timing starts at the first real iteration
        if (i == 0)
        {
            checkCudaErrors(cudaDeviceSynchronize());
            sdkResetTimer(&hTimer);
            sdkStartTimer(&hTimer);
        }
        if (useDoublePrecision)
            quasirandomGenerator_SM13(d_Output, 0, N);
        else
            quasirandomGenerator_SM10(d_Output, 0, N);
    }

    checkCudaErrors(cudaDeviceSynchronize());
    sdkStopTimer(&hTimer);
    gpuTime = sdkGetTimerValue(&hTimer) / (double)numIterations * 1e-3;
    printf("quasirandomGenerator, Throughput = %.4f GNumbers/s, Time = %.5f s, Size = %u Numbers, NumDevsUsed = %u, Workgroup = %u\n",
           (double)QRNG_DIMENSIONS * (double)N * 1.0E-9 / gpuTime, gpuTime, QRNG_DIMENSIONS * N, 1, 128 * QRNG_DIMENSIONS);

    printf("\nReading GPU results...\n");
    checkCudaErrors(cudaMemcpy(h_OutputGPU, d_Output, QRNG_DIMENSIONS * N * sizeof(float), cudaMemcpyDeviceToHost));

    printf("Comparing to the CPU results...\n\n");
    sumDelta = 0;
    sumRef = 0;

    for (dim = 0; dim < QRNG_DIMENSIONS; dim++)
        for (pos = 0; pos < N; pos++)
        {
            ref = getQuasirandomValue63(pos, dim);
            delta = (double)h_OutputGPU[dim * N + pos] - ref;
            sumDelta += fabs(delta);
            sumRef += fabs(ref);
        }

    printf("L1 norm: %E\n", sumDelta / sumRef);

    printf("\nTesting inverseCNDgpu()...\n\n");
    checkCudaErrors(cudaMemset(d_Output, 0, QRNG_DIMENSIONS * N * sizeof(float)));

    for (int i = -1; i < numIterations; i++)
    {
        if (i == 0)
        {
            checkCudaErrors(cudaDeviceSynchronize());
            sdkResetTimer(&hTimer);
            sdkStartTimer(&hTimer);
        }
        if (useDoublePrecision)
            inverseCND_SM13(d_Output, NULL, QRNG_DIMENSIONS * N);
        else
            inverseCND_SM10(d_Output, NULL, QRNG_DIMENSIONS * N);
    }

    checkCudaErrors(cudaDeviceSynchronize());
    sdkStopTimer(&hTimer);
    gpuTime = sdkGetTimerValue(&hTimer) / (double)numIterations * 1e-3;
    printf("quasirandomGenerator-inverse, Throughput = %.4f GNumbers/s, Time = %.5f s, Size = %u Numbers, NumDevsUsed = %u, Workgroup = %u\n",
           (double)QRNG_DIMENSIONS * (double)N * 1E-9 / gpuTime, gpuTime, QRNG_DIMENSIONS * N, 1, 128);

    printf("Reading GPU results...\n");
    checkCudaErrors(cudaMemcpy(h_OutputGPU, d_Output, QRNG_DIMENSIONS * N * sizeof(float), cudaMemcpyDeviceToHost));

    printf("\nComparing to the CPU results...\n");
    sumDelta = 0;
    sumRef = 0;
    unsigned int distance = ((unsigned int)-1) / (QRNG_DIMENSIONS * N + 1);

    for (pos = 0; pos < QRNG_DIMENSIONS * N; pos++)
    {
        unsigned int d = (pos + 1) * distance;
        ref = MoroInvCNDcpu(d);
        delta = (double)h_OutputGPU[pos] - ref;
        sumDelta += fabs(delta);
        sumRef += fabs(ref);
    }

    printf("L1 norm: %E\n\n", L1norm = sumDelta / sumRef);

    printf("Shutting down...\n");
    sdkDeleteTimer(&hTimer);
    free(h_OutputGPU);
    checkCudaErrors(cudaFree(d_Output));

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice. It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
    cudaDeviceReset();
    exit(L1norm < 1e-6 ? EXIT_SUCCESS : EXIT_FAILURE);
}
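// The QRNG sample above times kernels with the SDK StopWatchInterface plus
// cudaDeviceSynchronize(). The same measurement can be made with CUDA events
// alone; a minimal sketch under that assumption (the timed kernel is left as
// a commented placeholder, not a call from the sample):
#include <cuda_runtime.h>

float time_kernel_ms() {
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start, 0);   // enqueue start marker on the default stream
    // someKernel<<<grid, block>>>(...);  // work being measured goes here
    cudaEventRecord(stop, 0);    // enqueue stop marker
    cudaEventSynchronize(stop);  // block until the stop event completes

    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);  // elapsed GPU time in milliseconds
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return ms;
}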
// Main ------------------------------------------------------------------------------------------ int main(int argc, char **argv) { const Params p(argc, argv); CUDASetup setcuda(p.device); Timer timer; cudaError_t cudaStatus; int it_cpu = 0; int it_gpu = 0; int err = 0; #ifdef LOGS set_iter_interval_print(10); char test_info[500]; snprintf(test_info, 500, "-i %d -g %d -t %d -f %s -l %d\n",p.n_gpu_threads, p.n_gpu_blocks,p.n_threads, p.file_name,p.switching_limit); start_log_file("cudaSingleSourceShortestPath", test_info); //printf("Com LOG\n"); #endif // Allocate int n_nodes, n_edges; // int n_nodes_o; read_input_size(n_nodes, n_edges, p); timer.start("Allocation"); Node * h_nodes = (Node *) malloc(sizeof(Node) * n_nodes); //*************************** Alocando Memoria para o Gold ************************************* Gold * gold = (Gold *) malloc(sizeof(Gold) * n_nodes); if (p.mode == 1) { // ********************** Lendo O gold ********************************* read_gold(gold, p); // ********************************************************************** } //*********************************************************************************************** Node * d_nodes; cudaStatus = cudaMalloc((void**) &d_nodes, sizeof(Node) * n_nodes); Edge * h_edges = (Edge *) malloc(sizeof(Edge) * n_edges); Edge * d_edges; cudaStatus = cudaMalloc((void**) &d_edges, sizeof(Edge) * n_edges); std::atomic_int *h_color = (std::atomic_int *) malloc( sizeof(std::atomic_int) * n_nodes); int * d_color; cudaStatus = cudaMalloc((void**) &d_color, sizeof(int) * n_nodes); std::atomic_int *h_cost = (std::atomic_int *) malloc( sizeof(std::atomic_int) * n_nodes); int * d_cost; cudaStatus = cudaMalloc((void**) &d_cost, sizeof(int) * n_nodes); int * h_q1 = (int *) malloc(n_nodes * sizeof(int)); int * d_q1; cudaStatus = cudaMalloc((void**) &d_q1, sizeof(int) * n_nodes); int * h_q2 = (int *) malloc(n_nodes * sizeof(int)); int * d_q2; cudaStatus = cudaMalloc((void**) &d_q2, sizeof(int) * n_nodes); std::atomic_int h_head[1]; int * d_head; cudaStatus = cudaMalloc((void**) &d_head, sizeof(int)); std::atomic_int h_tail[1]; int * d_tail; cudaStatus = cudaMalloc((void**) &d_tail, sizeof(int)); std::atomic_int h_threads_end[1]; int * d_threads_end; cudaStatus = cudaMalloc((void**) &d_threads_end, sizeof(int)); std::atomic_int h_threads_run[1]; int * d_threads_run; cudaStatus = cudaMalloc((void**) &d_threads_run, sizeof(int)); int h_num_t[1]; int * d_num_t; cudaStatus = cudaMalloc((void**) &d_num_t, sizeof(int)); int h_overflow[1]; int * d_overflow; cudaStatus = cudaMalloc((void**) &d_overflow, sizeof(int)); std::atomic_int h_gray_shade[1]; int * d_gray_shade; cudaStatus = cudaMalloc((void**) &d_gray_shade, sizeof(int)); std::atomic_int h_iter[1]; int * d_iter; cudaStatus = cudaMalloc((void**) &d_iter, sizeof(int)); cudaDeviceSynchronize(); CUDA_ERR(); ALLOC_ERR(h_nodes, h_edges, h_color, h_cost, h_q1, h_q2); timer.stop("Allocation"); // Initialize timer.start("Initialization"); const int max_gpu_threads = setcuda.max_gpu_threads(); int source; read_input(source, h_nodes, h_edges, p); for (int i = 0; i < n_nodes; i++) { h_cost[i].store(INF); } h_cost[source].store(0); for (int i = 0; i < n_nodes; i++) { h_color[i].store(WHITE); } h_tail[0].store(0); h_head[0].store(0); h_threads_end[0].store(0); h_threads_run[0].store(0); h_q1[0] = source; h_iter[0].store(0); h_overflow[0] = 0; h_gray_shade[0].store(GRAY0); timer.stop("Initialization"); //timer.print("Initialization", 1); // Copy to device timer.start("Copy To Device"); cudaStatus = 
cudaMemcpy(d_nodes, h_nodes, sizeof(Node) * n_nodes, cudaMemcpyHostToDevice); cudaStatus = cudaMemcpy(d_edges, h_edges, sizeof(Edge) * n_edges, cudaMemcpyHostToDevice); cudaDeviceSynchronize(); CUDA_ERR(); timer.stop("Copy To Device"); for (int rep = 0; rep < p.n_reps; rep++) { // Reset for (int i = 0; i < n_nodes; i++) { h_cost[i].store(INF); } h_cost[source].store(0); for (int i = 0; i < n_nodes; i++) { h_color[i].store(WHITE); } it_cpu = 0; it_gpu = 0; h_tail[0].store(0); h_head[0].store(0); h_threads_end[0].store(0); h_threads_run[0].store(0); h_q1[0] = source; h_iter[0].store(0); h_overflow[0] = 0; h_gray_shade[0].store(GRAY0); // if(rep >= p.n_warmup) timer.start("Kernel"); #ifdef LOGS start_iteration(); #endif // Run first iteration in master CPU thread h_num_t[0] = 1; int pid; int index_i, index_o; for (index_i = 0; index_i < h_num_t[0]; index_i++) { pid = h_q1[index_i]; h_color[pid].store(BLACK); int cur_cost = h_cost[pid].load(); for (int i = h_nodes[pid].x; i < (h_nodes[pid].y + h_nodes[pid].x); i++) { int id = h_edges[i].x; int cost = h_edges[i].y; cost += cur_cost; h_cost[id].store(cost); h_color[id].store(GRAY0); index_o = h_tail[0].fetch_add(1); h_q2[index_o] = id; } } h_num_t[0] = h_tail[0].load(); h_tail[0].store(0); h_threads_run[0].fetch_add(1); h_gray_shade[0].store(GRAY1); h_iter[0].fetch_add(1); // if(rep >= p.n_warmup) timer.stop("Kernel"); // Pointers to input and output queues int * h_qin = h_q2; int * h_qout = h_q1; int * d_qin = d_q2; int * d_qout = d_q1; const int CPU_EXEC = (p.n_threads > 0) ? 1 : 0; const int GPU_EXEC = (p.n_gpu_blocks > 0 && p.n_gpu_threads > 0) ? 1 : 0; // Run subsequent iterations on CPU or GPU until number of input queue elements is 0 while (*h_num_t != 0) { if ((*h_num_t < p.switching_limit || GPU_EXEC == 0) && CPU_EXEC == 1) { // If the number of input queue elements is lower than switching_limit it_cpu = it_cpu + 1; // if(rep >= p.n_warmup) timer.start("Kernel"); // Continue until switching_limit condition is not satisfied while ((*h_num_t != 0) && (*h_num_t < p.switching_limit || GPU_EXEC == 0) && CPU_EXEC == 1) { // Swap queues if (h_iter[0] % 2 == 0) { h_qin = h_q1; h_qout = h_q2; } else { h_qin = h_q2; h_qout = h_q1; } std::thread main_thread(run_cpu_threads, h_nodes, h_edges, h_cost, h_color, h_qin, h_qout, h_num_t, h_head, h_tail, h_threads_end, h_threads_run, h_gray_shade, h_iter, p.n_threads, p.switching_limit, GPU_EXEC); main_thread.join(); h_num_t[0] = h_tail[0].load(); // Number of elements in output queue h_tail[0].store(0); h_head[0].store(0); if (h_iter[0].load() % 2 == 0) h_gray_shade[0].store(GRAY0); else h_gray_shade[0].store(GRAY1); } // if(rep >= p.n_warmup) timer.stop("Kernel"); } else if ((*h_num_t >= p.switching_limit || CPU_EXEC == 0) && GPU_EXEC == 1) { // If the number of input queue elements is higher than or equal to switching_limit it_gpu = it_gpu + 1; // if(rep >= p.n_warmup) timer.start("Copy To Device"); cudaStatus = cudaMemcpy(d_cost, h_cost, sizeof(int) * n_nodes, cudaMemcpyHostToDevice); cudaStatus = cudaMemcpy(d_color, h_color, sizeof(int) * n_nodes, cudaMemcpyHostToDevice); cudaStatus = cudaMemcpy(d_threads_run, h_threads_run, sizeof(int), cudaMemcpyHostToDevice); cudaStatus = cudaMemcpy(d_threads_end, h_threads_end, sizeof(int), cudaMemcpyHostToDevice); cudaStatus = cudaMemcpy(d_overflow, h_overflow, sizeof(int), cudaMemcpyHostToDevice); cudaStatus = cudaMemcpy(d_q1, h_q1, sizeof(int) * n_nodes, cudaMemcpyHostToDevice); cudaStatus = cudaMemcpy(d_q2, h_q2, sizeof(int) * n_nodes, 
cudaMemcpyHostToDevice); cudaStatus = cudaMemcpy(d_iter, h_iter, sizeof(int), cudaMemcpyHostToDevice); cudaDeviceSynchronize(); CUDA_ERR(); // if(rep >= p.n_warmup) timer.stop("Copy To Device"); // Continue until switching_limit condition is not satisfied while ((*h_num_t != 0) && (*h_num_t >= p.switching_limit || CPU_EXEC == 0) && GPU_EXEC == 1) { // Swap queues if (h_iter[0] % 2 == 0) { d_qin = d_q1; d_qout = d_q2; } else { d_qin = d_q2; d_qout = d_q1; } // if(rep >= p.n_warmup) timer.start("Copy To Device"); cudaStatus = cudaMemcpy(d_num_t, h_num_t, sizeof(int), cudaMemcpyHostToDevice); cudaStatus = cudaMemcpy(d_tail, h_tail, sizeof(int), cudaMemcpyHostToDevice); cudaStatus = cudaMemcpy(d_head, h_head, sizeof(int), cudaMemcpyHostToDevice); cudaStatus = cudaMemcpy(d_gray_shade, h_gray_shade, sizeof(int), cudaMemcpyHostToDevice); cudaDeviceSynchronize(); CUDA_ERR(); // if(rep >= p.n_warmup) timer.stop("Copy To Device"); // if(rep >= p.n_warmup) timer.start("Kernel"); assert( p.n_gpu_threads <= max_gpu_threads && "The thread block size is greater than the maximum thread block size that can be used on this device"); cudaStatus = call_SSSP_gpu(p.n_gpu_blocks, p.n_gpu_threads, d_nodes, d_edges, d_cost, d_color, d_qin, d_qout, d_num_t, d_head, d_tail, d_threads_end, d_threads_run, d_overflow, d_gray_shade, d_iter, p.switching_limit, CPU_EXEC, sizeof(int) * (W_QUEUE_SIZE + 3)); cudaDeviceSynchronize(); CUDA_ERR(); // if(rep >= p.n_warmup) timer.stop("Kernel"); // if(rep >= p.n_warmup) timer.start("Copy Back and Merge"); cudaStatus = cudaMemcpy(h_tail, d_tail, sizeof(int), cudaMemcpyDeviceToHost); cudaStatus = cudaMemcpy(h_iter, d_iter, sizeof(int), cudaMemcpyDeviceToHost); cudaDeviceSynchronize(); CUDA_ERR(); // if(rep >= p.n_warmup) timer.stop("Copy Back and Merge"); h_num_t[0] = h_tail[0].load(); // Number of elements in output queue h_tail[0].store(0); h_head[0].store(0); if (h_iter[0].load() % 2 == 0) h_gray_shade[0].store(GRAY0); else h_gray_shade[0].store(GRAY1); } // if(rep >= p.n_warmup) timer.start("Copy Back and Merge"); cudaStatus = cudaMemcpy(h_cost, d_cost, sizeof(int) * n_nodes, cudaMemcpyDeviceToHost); cudaStatus = cudaMemcpy(h_color, d_color, sizeof(int) * n_nodes, cudaMemcpyDeviceToHost); cudaStatus = cudaMemcpy(h_threads_run, d_threads_run, sizeof(int), cudaMemcpyDeviceToHost); cudaStatus = cudaMemcpy(h_threads_end, d_threads_end, sizeof(int), cudaMemcpyDeviceToHost); cudaStatus = cudaMemcpy(h_overflow, d_overflow, sizeof(int), cudaMemcpyDeviceToHost); cudaStatus = cudaMemcpy(h_q1, d_q1, sizeof(int) * n_nodes, cudaMemcpyDeviceToHost); cudaStatus = cudaMemcpy(h_q2, d_q2, sizeof(int) * n_nodes, cudaMemcpyDeviceToHost); cudaDeviceSynchronize(); CUDA_ERR(); // if(rep >= p.n_warmup) timer.stop("Copy Back and Merge"); } } #ifdef LOGS end_iteration(); #endif // printf("IT CPU:%d\t",it_cpu); //printf("IT GPU:%d\n",it_gpu); if (p.mode == 1) { err = newest_verify(h_cost, n_nodes, n_nodes, gold, it_cpu, it_gpu); } //err=new_verify(h_cost, n_nodes,,it_cpu,it_gpu); if (err > 0) { printf("Errors: %d\n", err); read_input(source, h_nodes, h_edges, p); read_gold(gold, p); } else { printf(".ITERATION %d\n", rep); } #ifdef LOGS log_error_count(err); #endif // Ler a entrada novamente //read_input(source, h_nodes, h_edges, p); //read_gold(gold,p); } // end of iteration #ifdef LOGS end_log_file(); #endif // timer.print("Allocation", 1); //timer.print("Copy To Device", p.n_reps); // timer.print("Kernel", p.n_reps); // timer.print("Copy Back and Merge", p.n_reps); if (p.mode == 0) { 
create_output(h_cost, n_nodes, n_edges, std::string(p.comparison_file)); } // Verify answer verify(h_cost, n_nodes, p.comparison_file); // Free memory timer.start("Deallocation"); free(h_nodes); free(h_edges); free(h_color); free(h_cost); free(h_q1); free(h_q2); cudaStatus = cudaFree(d_nodes); cudaStatus = cudaFree(d_edges); cudaStatus = cudaFree(d_cost); cudaStatus = cudaFree(d_color); cudaStatus = cudaFree(d_q1); cudaStatus = cudaFree(d_q2); cudaStatus = cudaFree(d_num_t); cudaStatus = cudaFree(d_head); cudaStatus = cudaFree(d_tail); cudaStatus = cudaFree(d_threads_end); cudaStatus = cudaFree(d_threads_run); cudaStatus = cudaFree(d_overflow); cudaStatus = cudaFree(d_iter); cudaStatus = cudaFree(d_gray_shade); CUDA_ERR(); cudaDeviceSynchronize(); timer.stop("Deallocation"); //timer.print("Deallocation", 1); // Release timers timer.release("Allocation"); timer.release("Initialization"); timer.release("Copy To Device"); timer.release("Kernel"); timer.release("Copy Back and Merge"); timer.release("Deallocation"); printf("Test Passed\n"); return 0; }
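/*
 * A minimal sketch of per-call error checking for the benchmark above. Every
 * CUDA call there assigns its return value to cudaStatus, but errors are only
 * inspected by CUDA_ERR() (defined elsewhere in the benchmark) after a whole
 * batch of calls. A macro like the one below (our name, not the benchmark's)
 * reports the exact failing call site instead.
 */
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

#define CHECK_CUDA(call)                                              \
    do {                                                              \
        cudaError_t e_ = (call);                                      \
        if (e_ != cudaSuccess) {                                      \
            fprintf(stderr, "CUDA error '%s' at %s:%d\n",             \
                    cudaGetErrorString(e_), __FILE__, __LINE__);      \
            exit(EXIT_FAILURE);                                       \
        }                                                             \
    } while (0)

/* Hypothetical usage with the buffers above:
 * CHECK_CUDA(cudaMemcpy(d_cost, h_cost, sizeof(int) * n_nodes, cudaMemcpyHostToDevice));
 */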
void ElasticVectorAddition::freeResources() { CUDA_CHECK_RETURN(cudaFree(this->a)); CUDA_CHECK_RETURN(cudaFree(this->b)); }
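/*
 * A minimal sketch of making a freeResources()-style method safe to call more
 * than once: cudaFree(nullptr) is documented as a no-op, so nulling each
 * pointer after freeing it turns a second call into a harmless one. The type
 * and member names below are hypothetical, not from ElasticVectorAddition.
 */
#include <cuda_runtime.h>

struct DeviceBuffers {
    float *a = nullptr;
    float *b = nullptr;
    void freeResources() {
        cudaFree(a); a = nullptr;   // repeat calls become no-ops
        cudaFree(b); b = nullptr;
    }
    ~DeviceBuffers() { freeResources(); }
};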
// points only need to be cudaMalloced once for point cloud and voxels. Move outside for efficiency TODO void CloudConstructor::calcPoints(bool freeResultPointsOnGPU) { // size_t free, tot; // cudaMemGetInfo(&free, &tot); // std::cout << "ENTER calcPoints::cudaMemGetInfo " << free << " " << tot << std::endl; if(isInitNeeded) { isInitNeeded = false; // assume all images have same dims // TODO: is this safe? if(camCnt >= 1) { imgWidth = cams[0]->imgWidth; imgHeight = cams[0]->imgHeight; imgSize = imgWidth * imgHeight; } else { imgWidth = 0; imgHeight = 0; imgSize = 0; } std::cout << "image dims " << imgWidth << " x " << imgHeight << std::endl; pointCnt = imgWidth*imgHeight*camCnt + 1; //zeroth point is for junk! //assume params don't change after first grab // TODO: is this safe? // CamParams *ptParam = &CamParams[0]; int paramI = 0; for(int i = 0; i < camCnt; i++) { camParams[paramI++] = cams[i]->params.cx; camParams[paramI++] = cams[i]->params.cy; camParams[paramI++] = cams[i]->params.imageCenterU; camParams[paramI++] = cams[i]->params.imageCenterV; camParams[paramI++] = cams[i]->params.tx; camParams[paramI++] = cams[i]->params.ty; camParams[paramI++] = cams[i]->params.tz; camParams[paramI++] = cams[i]->params.rx; camParams[paramI++] = cams[i]->params.ry; camParams[paramI++] = cams[i]->params.rz; } paramI = 0; for(int i = 0; i < camCnt; i++) { std::cout << i << " cx " << camParams[paramI++] << std::endl; std::cout << i << " cy " << camParams[paramI++] << std::endl; std::cout << i << " cu " << camParams[paramI++] << std::endl; std::cout << i << " cv " << camParams[paramI++] << std::endl; std::cout << i << " tx " << camParams[paramI++] << std::endl; std::cout << i << " ty " << camParams[paramI++] << std::endl; std::cout << i << " tz " << camParams[paramI++] << std::endl; std::cout << i << " rx " << camParams[paramI++] << std::endl; std::cout << i << " ry " << camParams[paramI++] << std::endl; std::cout << i << " rz " << camParams[paramI++] << std::endl; } points = (float *)malloc(pointCnt * 3 * sizeof(float)); // points = new float[pointCnt * 3]; if(points==NULL) { std::cerr << "CloudConstructor::calcPoints unable to allocate memory for " << pointCnt << " points" << std::endl; exit(1); } memset(points, 0, pointCnt * 3 * sizeof(float)); } #ifdef USE_CPU // extra allocation and de-allocation is done here to stay consistent with GPU code size_t paramSize = sizeof(double) * camCnt * CAM_PARAM_CNT; double *d_params; d_params = (double*) malloc(paramSize); // need to look up safecall TODO memcpy(d_params, camParams, paramSize); size_t transformSize = sizeof(double) * camCnt * 12; double *d_transforms; d_transforms = (double*) malloc(transformSize); // need to look up safecall TODO double* transPtr = d_transforms; size_t singleTransformSize = sizeof(double) * 12; for(int i = 0; i < camCnt; i++) { memcpy(transPtr, cams[i]->tMatrix, singleTransformSize); transPtr += 12; } size_t singleZImageArrSize = imgSize * sizeof(unsigned short); unsigned short *d_zimg; d_zimg = (unsigned short*) malloc(singleZImageArrSize * camCnt); // need to look up safecall TODO for(int i = 0; i < camCnt; i++) { memcpy(&d_zimg[i*imgSize], cams[i]->getZImage(), singleZImageArrSize); } size_t resultSize = pointCnt * 3 * sizeof(float); d_resultPoints = (float*) malloc(resultSize); cpu_calcPointCloud(camCnt, imgWidth, imgHeight, d_params, d_transforms, d_zimg, d_resultPoints); // std::cout << "done calculating; doing mem copy for " << resultSize << " points"; memcpy(points, d_resultPoints, resultSize); // std::cout << "done with mem copy";
free(d_params); free(d_transforms); free(d_zimg); free(d_resultPoints); d_resultPoints = NULL; // free() does not null the pointer for you #else // use gpu size_t paramSize = sizeof(double) * camCnt * CAM_PARAM_CNT; double *d_params; cutilSafeCall( cudaMalloc((void**)&d_params, paramSize) ); // need to look up safecall TODO cutilSafeCall( cudaMemcpy(d_params, camParams, paramSize, cudaMemcpyHostToDevice) ); size_t transformSize = sizeof(double) * camCnt * 12; // size_t transformSize = sizeof(double) * camCnt * 16; double *d_transforms; cutilSafeCall( cudaMalloc((void**)&d_transforms, transformSize) ); // need to look up safecall TODO double* transPtr = d_transforms; // size_t singleTransformSize = sizeof(double) * 16; size_t singleTransformSize = sizeof(double) * 12; for(int i = 0; i < camCnt; i++) { cutilSafeCall( cudaMemcpy(transPtr, cams[i]->tMatrix, singleTransformSize, cudaMemcpyHostToDevice) ); transPtr += 12; } size_t singleZImageArrSize = imgSize * sizeof(unsigned short); unsigned short *d_zimg; cutilSafeCall( cudaMalloc((void**)&d_zimg, singleZImageArrSize * camCnt) ); // need to look up safecall TODO for(int i = 0; i < camCnt; i++) { cutilSafeCall( cudaMemcpy(&d_zimg[i*imgSize], cams[i]->getZImage(), singleZImageArrSize, cudaMemcpyHostToDevice) ); } size_t resultSize = pointCnt * 3 * sizeof(float); cutilSafeCall( cudaMalloc((void**)&d_resultPoints, resultSize)); gpu_calcPointCloud(camCnt, imgWidth, imgHeight, d_params, d_transforms, d_zimg, d_resultPoints); // std::cout << "done calculating; doing mem copy for " << resultSize << " points"; cutilSafeCall(cudaMemcpy(points, d_resultPoints, resultSize, cudaMemcpyDeviceToHost)); // std::cout << "done with mem copy"; cudaFree(d_params); cudaFree(d_transforms); cudaFree(d_zimg); if(freeResultPointsOnGPU) freeGPUPoints(); #endif // USE_CPU // cudaFree(d_resultPoints); // cudaMemGetInfo(&free, &tot); // std::cout << "EXIT calcPoints::cudaMemGetInfo " << free << " " << tot << std::endl; }
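/*
 * A sketch of the "cudaMalloc once, reuse every call" idea flagged in the TODO
 * at the top of calcPoints. The struct and names below are hypothetical: a
 * grow-only device buffer amortizes allocation cost across frames instead of
 * paying cudaMalloc/cudaFree on every invocation.
 */
#include <cstddef>
#include <cuda_runtime.h>

struct GrowOnlyDeviceBuffer {
    void *ptr = nullptr;
    size_t capacity = 0;
    void *ensure(size_t bytes) {
        if (bytes > capacity) {
            cudaFree(ptr);            // release the smaller block (no-op on first use)
            cudaMalloc(&ptr, bytes);  // reallocate at the new size
            capacity = bytes;
        }
        return ptr;
    }
    ~GrowOnlyDeviceBuffer() { cudaFree(ptr); }
};

/* Hypothetical per-frame use:
 * float *d_result = (float *)resultBuf.ensure(pointCnt * 3 * sizeof(float));
 */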
//////////////////////////////////////////////////////////////////////////////// //! Run a simple test matrix multiply using CUBLAS //////////////////////////////////////////////////////////////////////////////// int matrixMultiply(int argc, char **argv, int devID, sMatrixSize &matrix_size) { cudaDeviceProp deviceProp; checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID)); // use a larger block size for Fermi and above int block_size = (deviceProp.major < 2) ? 16 : 32; // set seed for rand() srand(2006); // allocate host memory for matrices A and B unsigned int size_A = matrix_size.uiWA * matrix_size.uiHA; unsigned int mem_size_A = sizeof(float) * size_A; float *h_A = (float *)malloc(mem_size_A); unsigned int size_B = matrix_size.uiWB * matrix_size.uiHB; unsigned int mem_size_B = sizeof(float) * size_B; float *h_B = (float *)malloc(mem_size_B); // initialize host memory randomInit(h_A, size_A); randomInit(h_B, size_B); // allocate device memory float *d_A, *d_B, *d_C; unsigned int size_C = matrix_size.uiWC * matrix_size.uiHC; unsigned int mem_size_C = sizeof(float) * size_C; // allocate host memory for the result float *h_C = (float *) malloc(mem_size_C); float *h_CUBLAS = (float *) malloc(mem_size_C); checkCudaErrors(cudaMalloc((void **) &d_A, mem_size_A)); checkCudaErrors(cudaMalloc((void **) &d_B, mem_size_B)); checkCudaErrors(cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice)); checkCudaErrors(cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice)); checkCudaErrors(cudaMalloc((void **) &d_C, mem_size_C)); // setup execution parameters dim3 threads(block_size, block_size); dim3 grid(matrix_size.uiWC / threads.x, matrix_size.uiHC / threads.y); // create and start timer printf("Computing result using CUBLAS..."); // execute the kernel int nIter = 30; // CUBLAS version 2.0 { const float alpha = 1.0f; const float beta = 0.0f; cublasHandle_t handle; cudaEvent_t start, stop; checkCudaErrors(cublasCreate(&handle)); // Perform warmup operation with cublas checkCudaErrors(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA, &alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWB)); // Allocate CUDA events that we'll use for timing checkCudaErrors(cudaEventCreate(&start)); checkCudaErrors(cudaEventCreate(&stop)); // Record the start event checkCudaErrors(cudaEventRecord(start, NULL)); for (int j = 0; j < nIter; j++) { // note: cuBLAS is column-major!
// so the operands are passed in transposed order checkCudaErrors(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA, &alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWB)); } printf("done.\n"); // Record the stop event checkCudaErrors(cudaEventRecord(stop, NULL)); // Wait for the stop event to complete checkCudaErrors(cudaEventSynchronize(stop)); float msecTotal = 0.0f; checkCudaErrors(cudaEventElapsedTime(&msecTotal, start, stop)); // Compute and print the performance float msecPerMatrixMul = msecTotal / nIter; double flopsPerMatrixMul = 2.0 * (double)matrix_size.uiWA * (double)matrix_size.uiHA * (double)matrix_size.uiWB; double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f); printf( "Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops\n", gigaFlops, msecPerMatrixMul, flopsPerMatrixMul); // copy result from device to host checkCudaErrors(cudaMemcpy(h_CUBLAS, d_C, mem_size_C, cudaMemcpyDeviceToHost)); // Destroy the handle checkCudaErrors(cublasDestroy(handle)); } // compute reference solution printf("Computing result using host CPU..."); float *reference = (float *)malloc(mem_size_C); matrixMulCPU(reference, h_A, h_B, matrix_size.uiHA, matrix_size.uiWA, matrix_size.uiWB); printf("done.\n"); // check result (CUBLAS) bool resCUBLAS = sdkCompareL2fe(reference, h_CUBLAS, size_C, 1.0e-6f); if (resCUBLAS != true) { printDiff(reference, h_CUBLAS, matrix_size.uiWC, matrix_size.uiHC, 100, 1.0e-5f); } printf("Comparing CUBLAS Matrix Multiply with CPU results: %s\n", (true == resCUBLAS) ? "PASS" : "FAIL"); printf("\nNOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.\n"); // clean up memory free(h_A); free(h_B); free(h_C); free(reference); checkCudaErrors(cudaFree(d_A)); checkCudaErrors(cudaFree(d_B)); checkCudaErrors(cudaFree(d_C)); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); if (resCUBLAS == true) { return EXIT_SUCCESS; // exit code 0 } else { return EXIT_FAILURE; // exit code 1 } }
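/*
 * Why the operands above are passed "backwards": cuBLAS assumes column-major
 * storage, and a row-major buffer of a matrix M reads as M^T in column-major.
 * Since C^T = B^T * A^T, calling sgemm with the B buffer first and the A
 * buffer second yields C in row-major layout with no explicit transpose.
 * A sketch with the same dimension names (A is HA x WA, B is WA x WB); the
 * leading dimension of C must be WB, the row width of C.
 */
#include <cublas_v2.h>

static cublasStatus_t rowMajorSgemm(cublasHandle_t handle, int HA, int WA, int WB,
                                    const float *d_A, const float *d_B, float *d_C) {
    const float alpha = 1.0f, beta = 0.0f;
    // m = WB, n = HA, k = WA; leading dimensions WB, WA, WB
    return cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, WB, HA, WA,
                       &alpha, d_B, WB, d_A, WA, &beta, d_C, WB);
}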
static void TearDownTestCase() { cudaFree(data); }
FieldContainer_Kokkos<Scalar,ScalarPointer,Kokkos::LayoutRight,Kokkos::Cuda>::~FieldContainer_Kokkos() { count_ = count_ - 1; if (count_ == 0 && intrepidManaged) { cudaFree(containerMemory); } }
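/*
 * The destructor above frees containerMemory only when a shared count drops to
 * zero. A minimal sketch of that shared-ownership idea with hypothetical names
 * (copy assignment omitted for brevity): only the last owner calls cudaFree,
 * so shallow copies never double-free device memory.
 */
#include <cstddef>
#include <cuda_runtime.h>

struct SharedDeviceBlock {
    float *mem = nullptr;
    int *count = nullptr;
    explicit SharedDeviceBlock(size_t n) : count(new int(1)) {
        cudaMalloc(reinterpret_cast<void **>(&mem), n * sizeof(float));
    }
    SharedDeviceBlock(const SharedDeviceBlock &o) : mem(o.mem), count(o.count) { ++*count; }
    ~SharedDeviceBlock() {
        if (--*count == 0) { cudaFree(mem); delete count; }  // last owner releases
    }
};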
/* * Function to be called */ void* device_thread(void* passing_ptr) { DataArray* data_arr_ptr = (DataArray*) passing_ptr; // casting passed pointer cuDoubleComplex* data_r_dev; cuDoubleComplex* data_k_dev; // init device, allocate suitable variables in gpu memory ... //alloc_data_device(data_arr_ptr); cudaMalloc((void**) &data_r_dev, sizeof(cuDoubleComplex)*N); // device memory <- check cudaMallocHost for the host-side buffers (could be faster) cudaMalloc((void**) &data_k_dev, sizeof(cuDoubleComplex)*N); // device memory data_arr_ptr->data_r_dev = &data_r_dev; // in this way it is easier to handle pointers to arrays data_arr_ptr->data_k_dev = &data_k_dev; printf("data allocated by device thread\n"); // Does each thread create a new stream automatically? // http://devblogs.nvidia.com/parallelforall/gpu-pro-tip-cuda-7-streams-simplify-concurrency/ cudaStreamCreateWithFlags(streams_arr, cudaStreamNonBlocking); cudaStreamCreateWithFlags(streams_arr+1, cudaStreamNonBlocking); printf("streams created\n"); // synchronize after allocating memory - data on host should be allocated and ready for copying cudaDeviceSynchronize(); // CHECK IF THIS DOES NOT CAUSE ERRORS! - should synchronize host and device regardless of pthreads // cudaStreamSynchronize( <enum stream> ); // to synchronize only with stream !!! pthread_barrier_wait (&barrier); printf("1st barrier device thread - allocating mem on gpu\n"); // copying data cudaMemcpyAsync( *(data_arr_ptr->data_r_dev), *(data_arr_ptr->data_r), N*sizeof(cuDoubleComplex), cudaMemcpyHostToDevice, streams_arr[MEMORY_STREAM] ); // synchronize after copying data cudaDeviceSynchronize(); // should be used on pthread_barrier_wait (&barrier); printf("2nd barrier device thread - copying data on gpu\n"); printf("data visible in device thread:\n"); /*for (uint64_t ii = 0; ii < (data_arr_ptr->size <= 32) ? data_arr_ptr->size : 32 ; ii++) { printf("%lu.\t",ii); printf("%lf + %lfj\t", creal( (*(data_arr_ptr->data_r))[ii] ), cimag( (*(data_arr_ptr->data_r))[ii] )); printf("%lf + %lfj\n", creal( (*(data_arr_ptr->data_k))[ii] ), cimag( (*(data_arr_ptr->data_k))[ii] )); }*/ // synchronize after copying pthread_barrier_wait (&barrier); printf("3rd barrier device thread - \n"); // copying data back //cudaMemcpyAsync( *(data_arr_ptr->data_r), *(data_arr_ptr->data_r_dev), N*sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost, streams_arr[MEMORY_STREAM] ); cudaMemcpyAsync( *(data_arr_ptr->data_r), data_r_dev, N*sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost, streams_arr[MEMORY_STREAM] ); // synchronize after copying back data cudaDeviceSynchronize(); // should be used on pthread_barrier_wait (&barrier); printf("4th barrier device thread - \n"); cudaStreamDestroy(streams_arr[KERNEL_STREAM]); cudaStreamDestroy(streams_arr[MEMORY_STREAM]); cudaFree(data_r_dev); printf("device r space freed\n"); cudaFree(data_k_dev); cudaDeviceSynchronize(); printf("device k space freed\n"); printf("closing device thread\n"); pthread_exit(NULL); }
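/*
 * On the "check cudaMallocHost (could be faster)" note above: cudaMemcpyAsync
 * only overlaps with other work when the host buffer is page-locked; with
 * pageable memory the copy is staged and behaves essentially synchronously.
 * A minimal sketch of the pinned variant; N and the stream are assumptions,
 * not names from the code above.
 */
#include <cuComplex.h>
#include <cuda_runtime.h>

static void pinned_roundtrip(size_t N, cudaStream_t stream) {
    cuDoubleComplex *h_buf, *d_buf;
    cudaMallocHost(reinterpret_cast<void **>(&h_buf), N * sizeof(cuDoubleComplex)); // pinned host memory
    cudaMalloc(reinterpret_cast<void **>(&d_buf), N * sizeof(cuDoubleComplex));
    cudaMemcpyAsync(d_buf, h_buf, N * sizeof(cuDoubleComplex),
                    cudaMemcpyHostToDevice, stream);
    cudaStreamSynchronize(stream);  // waits on this stream only, unlike cudaDeviceSynchronize
    cudaFree(d_buf);
    cudaFreeHost(h_buf);            // pinned memory pairs with cudaFreeHost, not free() or cudaFree()
}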
int main(int argc, char **argv) { printf("Computing Game Of Life On %d x %d Board.\n", DIM_X, DIM_Y); int *host_current, *host_future, *host_future_naive, *host_future_cached; int *gpu_current, *gpu_future; clock_t start, stop; cudaMallocHost((void**) &host_current, DIM_X * DIM_Y * sizeof(int)); cudaMallocHost((void**) &host_future, DIM_X * DIM_Y * sizeof(int)); cudaMallocHost((void**) &host_future_naive, DIM_X * DIM_Y * sizeof(int)); cudaMallocHost((void**) &host_future_cached, DIM_X * DIM_Y * sizeof(int)); assert(cudaGetLastError() == cudaSuccess); cudaMalloc((void**) &gpu_current, DIM_X * DIM_Y * sizeof(int)); cudaMalloc((void**) &gpu_future, DIM_X * DIM_Y * sizeof(int)); printf("%s\n", cudaGetErrorString(cudaGetLastError())); assert(cudaGetLastError() == cudaSuccess); fill_board(host_current, 40); add_glider(host_current); cudaMemcpy(gpu_current, host_current, DIM_X * DIM_Y * sizeof(int), cudaMemcpyHostToDevice); // print_board(host_current); float time_naive, time_cached, time_cpu; for(int i = 1; i < STEPS; i++) { printf("=========\n"); start = clock(); naive_game_of_life_wrapper(gpu_current, gpu_future); cudaMemcpy(host_future_naive, gpu_future, DIM_X * DIM_Y * sizeof(int), cudaMemcpyDeviceToHost); stop = clock(); time_naive = (float)(stop - start)/CLOCKS_PER_SEC; printf("Time for Naive GPU To Compute Next Phase: %.5f s\n", time_naive); start = clock(); cached_game_of_life_wrapper(gpu_current, gpu_future); cudaMemcpy(host_future_cached, gpu_future, DIM_X * DIM_Y * sizeof(int), cudaMemcpyDeviceToHost); stop = clock(); time_cached = (float)(stop - start)/CLOCKS_PER_SEC; printf("Time for Cached GPU To Compute Next Phase: %.5f s\n", time_cached); start = clock(); update_board(host_current, host_future); stop = clock(); time_cpu = (float)(stop - start)/CLOCKS_PER_SEC; printf("Time for CPU To Compute Next Phase: %.5f s\n", time_cpu); printf("speedup for naive = %.2f; speedup for cached = %.2f; speedup for cached over naive = %.2f\n", time_cpu/time_naive, time_cpu/time_cached, time_naive/time_cached); check_boards(host_future, host_future_naive); check_boards(host_future, host_future_cached); int *temp; temp = host_current; host_current = host_future; host_future = temp; temp = gpu_current; gpu_current = gpu_future; gpu_future = temp; } // buffers from cudaMallocHost must be released with cudaFreeHost, not cudaFree cudaFreeHost(host_future); cudaFreeHost(host_future_naive); cudaFreeHost(host_future_cached); cudaFreeHost(host_current); cudaFree(gpu_current); cudaFree(gpu_future); return 0; }
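/*
 * The loop above times each phase with clock(), which measures CPU time and
 * can misreport intervals spent blocked inside cudaMemcpy. cudaEvent timing,
 * as in the CUBLAS snippet earlier, measures GPU wall time directly. A minimal
 * sketch; kernel_wrapper is a stand-in for naive_game_of_life_wrapper and
 * friends, assumed to launch into the default stream.
 */
#include <cuda_runtime.h>

static float time_phase_ms(void (*kernel_wrapper)(int *, int *), int *d_in, int *d_out) {
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);
    kernel_wrapper(d_in, d_out);   // enqueue the phase being measured
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);    // block until the phase has finished
    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return ms;                     // milliseconds of GPU wall time
}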