cumem(int newsz) {
    sz = newsz;
    cudaMalloc(&data, sz);
    status = inuse;
    next = NULL;
}
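// A minimal matching release sketch (not from the original source): the
// constructor above cudaMalloc's `data` and flags the block `inuse`, so a
// pool entry presumably needs a counterpart that returns the memory. The
// destructor form and the null-reset are assumptions.
~cumem() {
    if (data) {
        cudaFree(data);
        data = NULL;
    }
}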
ImGpu::ImGpu(const char* filename)
{
    FILE *fp = 0;
    int t1, t2, t3, t4;
    cudaError_t cudaStatus;

    sscanf_s(filename, "%dx%dx%dx%d_", &t1, &t2, &t3, &t4);
    width = t1;
    height = t2;
    bpp = t3;
    dimension = t4;

    /* Bytes per element: 1 for 8 bpp, 2 for 16 bpp. The original sized the
       host buffer, fread and memcpy for 1-byte elements even in the 16-bpp
       case; the sizes below are consistent for both depths. */
    size_t elemSize = (16 == bpp) ? sizeof(unsigned short) : sizeof(char);
    size_t numElems = (size_t)width * height * dimension;
    char *pxl = 0;

    /* Allocate memory for the pixels on the GPU */
    cudaStatus = cudaMalloc((void**)&dev_pxl, numElems * elemSize);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaMemset(dev_pxl, 255, numElems * elemSize);
    pxl = new char[numElems * elemSize];

    /* Open the file to read the pixels */
    fopen_s(&fp, filename, "rb"); /* open for reading */
    if (0 != fp) {
        std::fread(pxl, elemSize, numElems, fp);
        fclose(fp); /* close the file */
    }

    /* Copy input pixels from host memory to the GPU buffer */
    cudaStatus = cudaMemcpy(dev_pxl, pxl, numElems * elemSize, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    delete[] pxl;
    return;

Error:
    cudaFree(dev_pxl);
    delete[] pxl;
}
/* Main */
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[])
{
    if (nrhs != 7) {
        mexErrMsgTxt("sgemm requires 7 input arguments");
    } else if (nlhs != 1) {
        mexErrMsgTxt("sgemm requires 1 output argument");
    }

    if (!mxIsSingle(prhs[4]) || !mxIsSingle(prhs[5]) || !mxIsSingle(prhs[6])) {
        mexErrMsgTxt("Input arrays must be single precision.");
    }

    int ta = (int) mxGetScalar(prhs[0]);
    int tb = (int) mxGetScalar(prhs[1]);
    float alpha = (float) mxGetScalar(prhs[2]);
    float beta = (float) mxGetScalar(prhs[3]);
    float *h_A = (float*) mxGetData(prhs[4]);
    float *h_B = (float*) mxGetData(prhs[5]);
    float *h_C = (float*) mxGetData(prhs[6]);

    int M = mxGetM(prhs[4]);   /* gets number of rows of A */
    int K = mxGetN(prhs[4]);   /* gets number of columns of A */
    int L = mxGetM(prhs[5]);   /* gets number of rows of B */
    int N = mxGetN(prhs[5]);   /* gets number of columns of B */

    cublasOperation_t transa, transb;
    int MM, KK, NN;
    if (ta == 0) {
        transa = CUBLAS_OP_N;
        MM = M;
        KK = K;
    } else {
        transa = CUBLAS_OP_T;
        MM = K;
        KK = M;
    }
    if (tb == 0) {
        transb = CUBLAS_OP_N;
        NN = N;
    } else {
        transb = CUBLAS_OP_T;
        NN = L;
    }

    /*
    printf("transa=%c\n",transa);
    printf("transb=%c\n",transb);
    printf("alpha=%f\n",alpha);
    printf("beta=%f\n",beta);
    */

    /* Left hand side matrix set up */
    mwSize dims0[2];
    dims0[0] = MM;
    dims0[1] = NN;
    plhs[0] = mxCreateNumericArray(2, dims0, mxSINGLE_CLASS, mxREAL);
    float *h_C_out = (float*) mxGetData(plhs[0]);

    cublasStatus_t status;
    cublasHandle_t handle;
    status = cublasCreate(&handle);
    if (status != CUBLAS_STATUS_SUCCESS) {
        mexErrMsgTxt("!!!! CUBLAS initialization error\n");
    }

    float* d_A = 0;
    float* d_B = 0;
    float* d_C = 0;

    /* Allocate device memory for the matrices */
    if (cudaMalloc((void**)&d_A, M * K * sizeof(d_A[0])) != cudaSuccess) {
        mexErrMsgTxt("!!!! device memory allocation error (allocate A)\n");
    }
    if (cudaMalloc((void**)&d_B, L * N * sizeof(d_B[0])) != cudaSuccess) {
        mexErrMsgTxt("!!!! device memory allocation error (allocate B)\n");
    }
    if (cudaMalloc((void**)&d_C, MM * NN * sizeof(d_C[0])) != cudaSuccess) {
        mexErrMsgTxt("!!!! device memory allocation error (allocate C)\n");
    }

    /* Initialize the device matrices with the host matrices */
    status = cublasSetVector(M * K, sizeof(h_A[0]), h_A, 1, d_A, 1);
    if (status != CUBLAS_STATUS_SUCCESS) {
        mexErrMsgTxt("!!!! device access error (write A)\n");
    }
    status = cublasSetVector(L * N, sizeof(h_B[0]), h_B, 1, d_B, 1);
    if (status != CUBLAS_STATUS_SUCCESS) {
        mexErrMsgTxt("!!!! device access error (write B)\n");
    }
    status = cublasSetVector(MM * NN, sizeof(h_C[0]), h_C, 1, d_C, 1);
    if (status != CUBLAS_STATUS_SUCCESS) {
        mexErrMsgTxt("!!!! device access error (write C)\n");
    }

    /* Performs operation using cublas */
    status = cublasSgemm(handle, transa, transb, MM, NN, KK, &alpha,
                         d_A, M, d_B, L, &beta, d_C, MM);
    if (status != CUBLAS_STATUS_SUCCESS) {
        mexErrMsgTxt("!!!! kernel execution error.\n");
    }

    /* Read the result back */
    status = cublasGetVector(MM * NN, sizeof(h_C[0]), d_C, 1, h_C_out, 1);
    if (status != CUBLAS_STATUS_SUCCESS) {
        mexErrMsgTxt("!!!! device access error (read C)\n");
    }

    if (cudaFree(d_A) != cudaSuccess) {
        mexErrMsgTxt("!!!! memory free error (A)\n");
    }
    if (cudaFree(d_B) != cudaSuccess) {
        mexErrMsgTxt("!!!! memory free error (B)\n");
    }
    if (cudaFree(d_C) != cudaSuccess) {
        mexErrMsgTxt("!!!! memory free error (C)\n");
    }

    /* Shutdown */
    status = cublasDestroy(handle);
    if (status != CUBLAS_STATUS_SUCCESS) {
        mexErrMsgTxt("!!!! shutdown error (A)\n");
    }
}
int main(int argc, char **argv)
{
    uchar *h_Data;
    uint  *h_HistogramCPU, *h_HistogramGPU;
    uchar *d_Data;
    uint  *d_Histogram;
    uint   hTimer;
    int    PassFailFlag = 1;
    uint   byteCount = 64 * 1048576;
    uint   uiSizeMult = 1;

    cudaDeviceProp deviceProp;
    deviceProp.major = 0;
    deviceProp.minor = 0;
    int dev;

    shrQAStart(argc, argv);

    // set logfile name and start logs
    shrSetLogFileName("histogram.txt");

    // Use command-line specified CUDA device, otherwise use device with highest Gflops/s
    if (shrCheckCmdLineFlag(argc, (const char**)argv, "device")) {
        dev = cutilDeviceInit(argc, argv);
        if (dev < 0) {
            printf("No CUDA Capable Devices found, exiting...\n");
            shrQAFinishExit(argc, (const char **)argv, QA_WAIVED);
        }
    } else {
        cudaSetDevice(dev = cutGetMaxGflopsDeviceId());
        cutilSafeCall(cudaChooseDevice(&dev, &deviceProp));
    }
    cutilSafeCall(cudaGetDeviceProperties(&deviceProp, dev));

    printf("CUDA device [%s] has %d Multi-Processors, Compute %d.%d\n",
           deviceProp.name, deviceProp.multiProcessorCount, deviceProp.major, deviceProp.minor);

    int version = deviceProp.major * 0x10 + deviceProp.minor;
    if (version < 0x11) {
        printf("There is no device supporting a minimum of CUDA compute capability 1.1 for this SDK sample\n");
        cutilDeviceReset();
        shrQAFinishExit(argc, (const char **)argv, QA_WAIVED);
    }

    cutilCheckError(cutCreateTimer(&hTimer));

    // Optional command-line multiplier to increase size of array to histogram
    if (shrGetCmdLineArgumentu(argc, (const char**)argv, "sizemult", &uiSizeMult)) {
        uiSizeMult = CLAMP(uiSizeMult, 1, 10);
        byteCount *= uiSizeMult;
    }

    shrLog("Initializing data...\n");
    shrLog("...allocating CPU memory.\n");
    h_Data         = (uchar *)malloc(byteCount);
    h_HistogramCPU = (uint *)malloc(HISTOGRAM256_BIN_COUNT * sizeof(uint));
    h_HistogramGPU = (uint *)malloc(HISTOGRAM256_BIN_COUNT * sizeof(uint));

    shrLog("...generating input data\n");
    srand(2009);
    for (uint i = 0; i < byteCount; i++)
        h_Data[i] = rand() % 256;

    shrLog("...allocating GPU memory and copying input data\n\n");
    cutilSafeCall(cudaMalloc((void **)&d_Data, byteCount));
    cutilSafeCall(cudaMalloc((void **)&d_Histogram, HISTOGRAM256_BIN_COUNT * sizeof(uint)));
    cutilSafeCall(cudaMemcpy(d_Data, h_Data, byteCount, cudaMemcpyHostToDevice));

    //-----
    // 64 bin histogram
    //-----
    {
        shrLog("Starting up 64-bin histogram...\n\n");
        initHistogram64();

        shrLog("Running 64-bin GPU histogram for %u bytes (%u runs)...\n\n", byteCount, numRuns);
        for (int iter = -1; iter < numRuns; iter++) {
            // iter == -1 -- warmup iteration
            if (iter == 0) {
                cutilSafeCall(cutilDeviceSynchronize());
                cutilCheckError(cutResetTimer(hTimer));
                cutilCheckError(cutStartTimer(hTimer));
            }
            histogram64(d_Histogram, d_Data, byteCount);
        }

        cutilSafeCall(cutilDeviceSynchronize());
        cutilCheckError(cutStopTimer(hTimer));
        double dAvgSecs = 1.0e-3 * (double)cutGetTimerValue(hTimer) / (double)numRuns;
        shrLog("histogram64() time (average) : %.5f sec, %.4f MB/sec\n\n",
               dAvgSecs, ((double)byteCount * 1.0e-6) / dAvgSecs);
        shrLogEx(LOGBOTH | MASTER, 0,
                 "histogram64, Throughput = %.4f MB/s, Time = %.5f s, Size = %u Bytes, NumDevsUsed = %u, Workgroup = %u\n",
                 (1.0e-6 * (double)byteCount / dAvgSecs), dAvgSecs, byteCount, 1, HISTOGRAM64_THREADBLOCK_SIZE);

        shrLog("\nValidating GPU results...\n");
        shrLog(" ...reading back GPU results\n");
        cutilSafeCall(cudaMemcpy(h_HistogramGPU, d_Histogram, HISTOGRAM64_BIN_COUNT * sizeof(uint), cudaMemcpyDeviceToHost));

        shrLog(" ...histogram64CPU()\n");
        histogram64CPU(h_HistogramCPU, h_Data, byteCount);

        shrLog(" ...comparing the results...\n");
        for (uint i = 0; i < HISTOGRAM64_BIN_COUNT; i++)
            if (h_HistogramGPU[i] != h_HistogramCPU[i])
                PassFailFlag = 0;
        shrLog(PassFailFlag ? " ...64-bin histograms match\n\n" : " ***64-bin histograms do not match!!!***\n\n");

        shrLog("Shutting down 64-bin histogram...\n\n\n");
        closeHistogram64();
    }

    //-----
    // Histogram 256
    //-----
    {
        shrLog("Initializing 256-bin histogram...\n");
        initHistogram256();

        shrLog("Running 256-bin GPU histogram for %u bytes (%u runs)...\n\n", byteCount, numRuns);
        for (int iter = -1; iter < numRuns; iter++) {
            // iter == -1 -- warmup iteration
            if (iter == 0) {
                cutilSafeCall(cutilDeviceSynchronize());
                cutilCheckError(cutResetTimer(hTimer));
                cutilCheckError(cutStartTimer(hTimer));
            }
            histogram256(d_Histogram, d_Data, byteCount);
        }

        cutilSafeCall(cutilDeviceSynchronize());
        cutilCheckError(cutStopTimer(hTimer));
        double dAvgSecs = 1.0e-3 * (double)cutGetTimerValue(hTimer) / (double)numRuns;
        shrLog("histogram256() time (average) : %.5f sec, %.4f MB/sec\n\n",
               dAvgSecs, ((double)byteCount * 1.0e-6) / dAvgSecs);
        shrLogEx(LOGBOTH | MASTER, 0,
                 "histogram256, Throughput = %.4f MB/s, Time = %.5f s, Size = %u Bytes, NumDevsUsed = %u, Workgroup = %u\n",
                 (1.0e-6 * (double)byteCount / dAvgSecs), dAvgSecs, byteCount, 1, HISTOGRAM256_THREADBLOCK_SIZE);

        shrLog("\nValidating GPU results...\n");
        shrLog(" ...reading back GPU results\n");
        cutilSafeCall(cudaMemcpy(h_HistogramGPU, d_Histogram, HISTOGRAM256_BIN_COUNT * sizeof(uint), cudaMemcpyDeviceToHost));

        shrLog(" ...histogram256CPU()\n");
        histogram256CPU(h_HistogramCPU, h_Data, byteCount);

        shrLog(" ...comparing the results\n");
        for (uint i = 0; i < HISTOGRAM256_BIN_COUNT; i++)
            if (h_HistogramGPU[i] != h_HistogramCPU[i])
                PassFailFlag = 0;
        shrLog(PassFailFlag ? " ...256-bin histograms match\n\n" : " ***256-bin histograms do not match!!!***\n\n");

        shrLog("Shutting down 256-bin histogram...\n\n\n");
        closeHistogram256();
    }

    //-----
    // Histogram Trish
    //-----
    {
        shrLog("Initializing 256-bin TRISH histogram...\n");
        initTrish256();

        shrLog("Running 256-bin GPU TRISH histogram for %u bytes (%u runs)...\n\n", byteCount, numRuns);
        for (int iter = -1; iter < numRuns; iter++) {
            // iter == -1 -- warmup iteration
            if (iter == 0) {
                cutilSafeCall(cutilDeviceSynchronize());
                cutilCheckError(cutResetTimer(hTimer));
                cutilCheckError(cutStartTimer(hTimer));
            }
            histogramTrish256(d_Histogram, d_Data, byteCount);
        }

        cutilSafeCall(cutilDeviceSynchronize());
        cutilCheckError(cutStopTimer(hTimer));
        double dAvgSecs = 1.0e-3 * (double)cutGetTimerValue(hTimer) / (double)numRuns;
        shrLog("histogramTRISH() time (average) : %.5f sec, %.4f MB/sec\n\n",
               dAvgSecs, ((double)byteCount * 1.0e-6) / dAvgSecs);
        shrLogEx(LOGBOTH | MASTER, 0,
                 "histogramTRISH, Throughput = %.4f MB/s, Time = %.5f s, Size = %u Bytes, NumDevsUsed = %u, Workgroup = %u\n",
                 (1.0e-6 * (double)byteCount / dAvgSecs), dAvgSecs, byteCount, 1, HISTOGRAM256_THREADBLOCK_SIZE);

        shrLog("\nValidating GPU results...\n");
        shrLog(" ...reading back GPU results\n");
        cutilSafeCall(cudaMemcpy(h_HistogramGPU, d_Histogram, HISTOGRAM256_BIN_COUNT * sizeof(uint), cudaMemcpyDeviceToHost));

        shrLog(" ...histogram256CPU()\n");
        histogram256CPU(h_HistogramCPU, h_Data, byteCount);

        shrLog(" ...comparing the results\n");
        for (uint i = 0; i < HISTOGRAM256_BIN_COUNT; i++)
            if (h_HistogramGPU[i] != h_HistogramCPU[i])
                PassFailFlag = 0;
        shrLog(PassFailFlag ? " ...256-bin histograms match\n\n" : " ***256-bin histograms do not match!!!***\n\n");

        shrLog("Shutting down 256-bin TRISH histogram...\n\n\n");
        closeTrish256();
    }

    shrLog("Shutting down...\n");
    cutilCheckError(cutDeleteTimer(hTimer));
    cutilSafeCall(cudaFree(d_Histogram));
    cutilSafeCall(cudaFree(d_Data));
    free(h_HistogramGPU);
    free(h_HistogramCPU);
    free(h_Data);

    cutilDeviceReset();
    shrLog("%s - Test Summary\n", sSDKsample);

    // pass or fail (for both 64 bit and 256 bit histograms)
    shrQAFinishExit(argc, (const char **)argv, (PassFailFlag ? QA_PASSED : QA_FAILED));
}
void allocate()
{
    this->destroy();
    check_error( cudaMalloc((void**)&_dptr, sizeof(value_type)) );
}
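// Hedged sketch (not in the original snippet): allocate() calls this->destroy()
// before re-allocating, which implies an idempotent release roughly like the
// following; the null check and reset are assumptions.
void destroy()
{
    if (_dptr) {
        check_error( cudaFree(_dptr) );
        _dptr = NULL;
    }
}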
void sparse_matrix_t::alloc_device()
{
    cudaMalloc((void**)&devJc,    (numCols+1)     * sizeof(int));
    cudaMalloc((void**)&devIr,    numNonZeroElems * sizeof(int));
    cudaMalloc((void**)&devRVals, numNonZeroElems * sizeof(float));
}
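// Hedged counterpart sketch (not in the original snippet): alloc_device()
// above allocates the three CSC-style device arrays, so a release helper
// would presumably free the same pointers; the method name free_device is
// an assumption.
void sparse_matrix_t::free_device()
{
    cudaFree(devJc);
    cudaFree(devIr);
    cudaFree(devRVals);
}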
bool runTestMax(int argc, char** argv, ReduceType datatype)
{
    int size = 1<<24;        // number of elements to reduce
    int maxThreads = 256;    // number of threads per block
    int whichKernel = 6;
    int maxBlocks = 64;
    bool cpuFinalReduction = false;
    int cpuFinalThreshold = 1;

    cutGetCmdLineArgumenti(argc, (const char**) argv, "n", &size);
    cutGetCmdLineArgumenti(argc, (const char**) argv, "threads", &maxThreads);
    cutGetCmdLineArgumenti(argc, (const char**) argv, "kernel", &whichKernel);
    cutGetCmdLineArgumenti(argc, (const char**) argv, "maxblocks", &maxBlocks);

    shrLog("METHOD: MAX\n");
    shrLog("%d elements\n", size);
    shrLog("%d threads (max)\n", maxThreads);

    cpuFinalReduction = (cutCheckCmdLineFlag(argc, (const char**) argv, "cpufinal") == CUTTrue);
    cutGetCmdLineArgumenti(argc, (const char**) argv, "cputhresh", &cpuFinalThreshold);

    bool runShmoo = (cutCheckCmdLineFlag(argc, (const char**) argv, "shmoo") == CUTTrue);

    if (runShmoo) {
        shmoo<T>(1, 33554432, maxThreads, maxBlocks, datatype);
    } else {
        // create random input data on CPU
        unsigned int bytes = size * sizeof(T);
        T *h_idata = (T *) malloc(bytes);

        for (int i = 0; i < size; i++) {
            // Keep the numbers small so we don't get truncation error in the sum
            if (datatype == REDUCE_INT)
                h_idata[i] = (T)(rand() & 0xFF);
            else
                h_idata[i] = (rand() & 0xFF) / (T)RAND_MAX;
        }

        int numBlocks = 0;
        int numThreads = 0;
        getNumBlocksAndThreads(whichKernel, size, maxBlocks, maxThreads, numBlocks, numThreads);
        if (numBlocks == 1) cpuFinalThreshold = 1;

        // allocate mem for the result on host side
        T* h_odata = (T*) malloc(numBlocks*sizeof(T));

        shrLog("%d blocks\n\n", numBlocks);

        // allocate device memory and data
        T* d_idata = NULL;
        T* d_odata = NULL;

        cutilSafeCallNoSync( cudaMalloc((void**) &d_idata, bytes) );
        cutilSafeCallNoSync( cudaMalloc((void**) &d_odata, numBlocks*sizeof(T)) );

        // copy data directly to device memory
        cutilSafeCallNoSync( cudaMemcpy(d_idata, h_idata, bytes, cudaMemcpyHostToDevice) );
        cutilSafeCallNoSync( cudaMemcpy(d_odata, h_idata, numBlocks*sizeof(T), cudaMemcpyHostToDevice) );

        // warm-up
        maxreduce<T>(size, numThreads, numBlocks, whichKernel, d_idata, d_odata);

        int testIterations = 100;

        unsigned int timer = 0;
        cutilCheckError( cutCreateTimer(&timer) );

        T gpu_result = 0;

        gpu_result = benchmarkReduceMax<T>(size, numThreads, numBlocks, maxThreads, maxBlocks,
                                           whichKernel, testIterations, cpuFinalReduction,
                                           cpuFinalThreshold, timer, h_odata, d_idata, d_odata);

        double reduceTime = cutGetAverageTimerValue(timer) * 1e-3;
        shrLogEx(LOGBOTH | MASTER, 0,
                 "Reduction, Throughput = %.4f GB/s, Time = %.5f s, Size = %u Elements, NumDevsUsed = %d, Workgroup = %u\n",
                 1.0e-9 * ((double)bytes)/reduceTime, reduceTime, size, 1, numThreads);

        // compute reference solution
        T cpu_result = maxreduceCPU<T>(h_idata, size);

        double threshold = 1e-12;
        double diff = 0;

        if (datatype == REDUCE_INT) {
            shrLog("\nGPU result = %d\n", gpu_result);
            shrLog("CPU result = %d\n\n", cpu_result);
        } else {
            shrLog("\nGPU result = %f\n", gpu_result);
            shrLog("CPU result = %f\n\n", cpu_result);
            if (datatype == REDUCE_FLOAT)
                threshold = 1e-8 * size;
            diff = fabs((double)gpu_result - (double)cpu_result);
        }

        // cleanup
        cutilCheckError( cutDeleteTimer(timer) );
        free(h_idata);
        free(h_odata);

        cutilSafeCallNoSync(cudaFree(d_idata));
        cutilSafeCallNoSync(cudaFree(d_odata));

        if (datatype == REDUCE_INT) {
            return (gpu_result == cpu_result);
        } else {
            return (diff < threshold);
        }
    }

    return true;
}
void iB_FFTShift::FFTShift_2D_Float(int size_X, int size_Y, Sheet* xlSheet, int nLoop)
{
    LOG();

    INFO("2D FFT Shift Float - CPU " + ITS(size_X) + "x" + ITS(size_Y));

    /**********************************************************
     * Float Case
     **********************************************************/
    if (xlSheet) {
        for (int iLoop = 0; iLoop < nLoop; iLoop++) {
            // Headers
            xlSheet->writeStr(1, ((iLoop * 4) + 0), "I-CPU");
            xlSheet->writeStr(1, ((iLoop * 4) + 1), "O-CPU");
            xlSheet->writeStr(1, ((iLoop * 4) + 2), "I-GPU");
            xlSheet->writeStr(1, ((iLoop * 4) + 3), "O-GPU");

            // Allocation: 2D, Flat, Device
            arr_2D_float = MEM_ALLOC_2D_FLOAT(size_X, size_Y);
            arr_2D_flat_float = MEM_ALLOC_1D_FLOAT(size_X * size_Y);
            int devMem = size_X * size_Y * sizeof(float);
            cudaMalloc((void**)(&dev_arr_2D_flat_float), devMem);

            // Filling arrays: 2D, Flat
            Array::fillArray_2D_float(arr_2D_float, size_X, size_Y, 1);
            Array::fillArray_2D_flat_float(arr_2D_flat_float, size_X, size_Y, 1);

            // Printing input
            ctr = 0;
            for (int i = 0; i < size_X; i++)
                for (int j = 0; j < size_Y; j++)
                    xlSheet->writeNum((ctr++) + 2, iLoop * 4, arr_2D_float[i][j]);

            // FFT shift operation - CPU
            arr_2D_float = FFT::FFT_Shift_2D_float(arr_2D_float, size_X, size_Y);

            // Printing CPU output
            ctr = 0;
            for (int i = 0; i < size_X; i++)
                for (int j = 0; j < size_Y; j++)
                    xlSheet->writeNum((ctr++) + 2, ((iLoop * 4) + 1), arr_2D_float[i][j]);

            // Printing GPU input
            ctr = 0;
            for (int i = 0; i < size_X; i++)
                for (int j = 0; j < size_Y; j++) {
                    xlSheet->writeNum(ctr + 2, ((iLoop * 4) + 2), arr_2D_flat_float[ctr]);
                    ctr++;
                }

            // Uploading array
            cuUtils::upload_2D_float(arr_2D_flat_float, dev_arr_2D_flat_float, size_X, size_Y);

            // CUDA gridding: a 512x512 block (262,144 threads) exceeds the
            // per-block thread limit, so use a 16x16 block and scale the grid
            dim3 cuBlock(16, 16, 1);
            dim3 cuGrid(size_X / cuBlock.x, size_Y / cuBlock.y, 1);

            // FFT shift
            cuFFTShift_2D(cuBlock, cuGrid, dev_arr_2D_flat_float, dev_arr_2D_flat_float, size_X);

            // Downloading array
            cuUtils::download_2D_float(arr_2D_flat_float, dev_arr_2D_flat_float, size_X, size_Y);

            // Printing output
            ctr = 0;
            for (int i = 0; i < size_X; i++)
                for (int j = 0; j < size_Y; j++) {
                    xlSheet->writeNum((ctr) + 2, ((iLoop * 4) + 3), arr_2D_flat_float[ctr]);
                    ctr++;
                }

            // Deallocating memory (the device buffer is freed as well, so the
            // per-iteration cudaMalloc above does not leak)
            FREE_MEM_2D_FLOAT(arr_2D_float, size_X, size_Y);
            cudaFree(dev_arr_2D_flat_float);
        }
    } else {
        INFO("No valid xlSheet was created, EXITING ...");
        EXIT(0);
    }
}
static cudaError_t cudaMallocWrapper(void* ctx, void** devPtr, size_t size, cudaStream_t stream)
{
    return cudaMalloc(devPtr, size);
}
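// Hedged companion sketch (not from the original source): the wrapper above
// matches a pool-allocator callback shape (opaque ctx, out-pointer, size,
// stream), so a paired release callback would plausibly look like this; the
// name cudaFreeWrapper and its exact signature are assumptions.
static cudaError_t cudaFreeWrapper(void* ctx, void* devPtr, cudaStream_t stream)
{
    // ctx and stream are ignored, mirroring cudaMallocWrapper
    return cudaFree(devPtr);
}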
/** Documented at declaration */
struct gpujpeg_encoder*
gpujpeg_encoder_create(struct gpujpeg_parameters* param, struct gpujpeg_image_parameters* param_image)
{
    assert(param_image->comp_count == 1 || param_image->comp_count == 3);
    assert(param_image->comp_count <= GPUJPEG_MAX_COMPONENT_COUNT);
    assert(param->quality >= 0 && param->quality <= 100);
    assert(param->restart_interval >= 0);
    assert(param->interleaved == 0 || param->interleaved == 1);

    struct gpujpeg_encoder* encoder = (struct gpujpeg_encoder*) malloc(sizeof(struct gpujpeg_encoder));
    if ( encoder == NULL )
        return NULL;

    // Get coder
    struct gpujpeg_coder* coder = &encoder->coder;

    // Set parameters
    memset(encoder, 0, sizeof(struct gpujpeg_encoder));
    coder->param_image = *param_image;
    coder->param = *param;

    int result = 1;

    // Create writer
    encoder->writer = gpujpeg_writer_create(encoder);
    if ( encoder->writer == NULL )
        result = 0;

    // Initialize coder
    if ( gpujpeg_coder_init(coder) != 0 )
        result = 0;

    // Init preprocessor
    if ( gpujpeg_preprocessor_encoder_init(&encoder->coder) != 0 ) {
        fprintf(stderr, "Failed to init preprocessor!");
        result = 0;
    }

    // Allocate quantization tables in device memory
    for ( int comp_type = 0; comp_type < GPUJPEG_COMPONENT_TYPE_COUNT; comp_type++ ) {
        if ( cudaSuccess != cudaMalloc((void**)&encoder->table_quantization[comp_type].d_table, 64 * sizeof(uint16_t)) )
            result = 0;
        if ( cudaSuccess != cudaMalloc((void**)&encoder->table_quantization[comp_type].d_table_forward, 64 * sizeof(float)) )
            result = 0;
    }
    gpujpeg_cuda_check_error("Encoder table allocation", return NULL);

    // Init quantization tables for encoder
    for ( int comp_type = 0; comp_type < GPUJPEG_COMPONENT_TYPE_COUNT; comp_type++ ) {
        if ( gpujpeg_table_quantization_encoder_init(&encoder->table_quantization[comp_type],
                (enum gpujpeg_component_type)comp_type, coder->param.quality) != 0 )
            result = 0;
    }
    gpujpeg_cuda_check_error("Quantization init", return NULL);

    // Init huffman tables for encoder
    for ( int comp_type = 0; comp_type < GPUJPEG_COMPONENT_TYPE_COUNT; comp_type++ ) {
        for ( int huff_type = 0; huff_type < GPUJPEG_HUFFMAN_TYPE_COUNT; huff_type++ ) {
            if ( gpujpeg_table_huffman_encoder_init(&encoder->table_huffman[comp_type][huff_type],
                    (enum gpujpeg_component_type)comp_type, (enum gpujpeg_huffman_type)huff_type) != 0 )
                result = 0;
        }
    }
    gpujpeg_cuda_check_error("Encoder table init", return NULL);

    // Init huffman encoder
    if ( gpujpeg_huffman_gpu_encoder_init(encoder) != 0 )
        result = 0;

    if ( result == 0 ) {
        gpujpeg_encoder_destroy(encoder);
        return NULL;
    }

    // Timers
    GPUJPEG_CUSTOM_TIMER_CREATE(encoder->def);
    GPUJPEG_CUSTOM_TIMER_CREATE(encoder->in_gpu);

    return encoder;
}
rk4_mem *SOLVER(rk4, init, TARGET, SIMENGINE_STORAGE, solver_props *props)
{
#if defined TARGET_GPU
    GPU_ENTRY(init, SIMENGINE_STORAGE);

    // Temporary CPU copies of GPU datastructures
    rk4_mem tmem;
    // GPU datastructures
    rk4_mem *dmem;

    // Computes GPU kernel geometry
    size_t shmem_per_thread, total_shmem = 1<<14;
    int warp_size = 1<<5;
    uint threads_per_block;
    uint num_gpu_threads;
    uint num_gpu_blocks;

    // shared space for model states and solver overhead
    shmem_per_thread = sizeof(CDATAFORMAT) * props->statesize * 6; // 6 = magic for rk4
    // shared space for a vector of time
    shmem_per_thread += sizeof(CDATAFORMAT);
    // shared space for a vector of `running' flags
    shmem_per_thread += sizeof(int);

    threads_per_block = total_shmem / shmem_per_thread;
    threads_per_block = warp_size * (threads_per_block / warp_size);

    num_gpu_threads = threads_per_block < props->num_models ? threads_per_block : props->num_models;
    num_gpu_blocks = (props->num_models + threads_per_block - 1) / threads_per_block;

    props->gpu.blockx = num_gpu_threads;
    props->gpu.blocky = 1;
    props->gpu.blockz = 1;
    props->gpu.gridx = num_gpu_blocks;
    props->gpu.gridy = 1;
    props->gpu.gridz = 1;
    props->gpu.shmem_per_block = shmem_per_thread * num_gpu_threads;

    // Allocate GPU space for mem and pointer fields of mem (other than props)
    cutilSafeCall(cudaMalloc((void**)&dmem, sizeof(rk4_mem)));
    tmem.props = GPU_ENTRY(init_props, SIMENGINE_STORAGE, props);
    cutilSafeCall(cudaMalloc((void**)&tmem.k1, props->statesize*props->num_models*sizeof(CDATAFORMAT)));
    cutilSafeCall(cudaMalloc((void**)&tmem.k2, props->statesize*props->num_models*sizeof(CDATAFORMAT)));
    cutilSafeCall(cudaMalloc((void**)&tmem.k3, props->statesize*props->num_models*sizeof(CDATAFORMAT)));
    cutilSafeCall(cudaMalloc((void**)&tmem.k4, props->statesize*props->num_models*sizeof(CDATAFORMAT)));
    cutilSafeCall(cudaMalloc((void**)&tmem.temp, props->statesize*props->num_models*sizeof(CDATAFORMAT)));

    // Copy mem structure to GPU
    cutilSafeCall(cudaMemcpy(dmem, &tmem, sizeof(rk4_mem), cudaMemcpyHostToDevice));

    return dmem;
#else
    // Used for CPU and OPENMP targets
    rk4_mem *mem = (rk4_mem*)malloc(sizeof(rk4_mem));

    mem->props = props;
    mem->k1 = (CDATAFORMAT*)malloc(props->statesize*props->num_models*sizeof(CDATAFORMAT));
    mem->k2 = (CDATAFORMAT*)malloc(props->statesize*props->num_models*sizeof(CDATAFORMAT));
    mem->k3 = (CDATAFORMAT*)malloc(props->statesize*props->num_models*sizeof(CDATAFORMAT));
    mem->k4 = (CDATAFORMAT*)malloc(props->statesize*props->num_models*sizeof(CDATAFORMAT));
    mem->temp = (CDATAFORMAT*)malloc(props->statesize*props->num_models*sizeof(CDATAFORMAT));

    return mem;
#endif
}
float WFIRFilterCuda::cudaFilter( WLEMData::ScalarT* const output, const WLEMData::ScalarT* const input,
                const WLEMData::ScalarT* const previous, size_t channels, size_t samples,
                const WLEMData::ScalarT* const coeffs, size_t coeffSize )
{
    CuScalarT *dev_in = NULL;
    size_t pitchIn;

    CuScalarT *dev_prev = NULL;
    size_t pitchPrev;

    CuScalarT *dev_out = NULL;
    size_t pitchOut;

    CuScalarT *dev_co = NULL;

    try
    {
        CudaThrowsCall( cudaMallocPitch( ( void** )&dev_in, &pitchIn, samples * sizeof( CuScalarT ), channels ) );
        CudaThrowsCall( cudaMemcpy2D( dev_in, pitchIn, input, samples * sizeof( CuScalarT ),
                        samples * sizeof( CuScalarT ), channels, cudaMemcpyHostToDevice ) );

        CudaThrowsCall( cudaMallocPitch( ( void** )&dev_prev, &pitchPrev, coeffSize * sizeof( CuScalarT ), channels ) );
        CudaThrowsCall( cudaMemcpy2D( dev_prev, pitchPrev, previous, coeffSize * sizeof( CuScalarT ),
                        coeffSize * sizeof( CuScalarT ), channels, cudaMemcpyHostToDevice ) );

        CudaThrowsCall( cudaMallocPitch( ( void** )&dev_out, &pitchOut, samples * sizeof( CuScalarT ), channels ) );

        CudaThrowsCall( cudaMalloc( ( void** )&dev_co, coeffSize * sizeof( CuScalarT ) ) );
        CudaThrowsCall( cudaMemcpy( dev_co, coeffs, coeffSize * sizeof( CuScalarT ), cudaMemcpyHostToDevice ) );
    }
    catch( const WException& e )
    {
        wlog::error( CLASS ) << e.what();
        if( dev_in )
        {
            CudaSafeCall( cudaFree( ( void* )dev_in ) );
        }
        if( dev_prev )
        {
            CudaSafeCall( cudaFree( ( void* )dev_prev ) );
        }
        if( dev_out )
        {
            CudaSafeCall( cudaFree( ( void* )dev_out ) );
        }
        if( dev_co )
        {
            CudaSafeCall( cudaFree( ( void* )dev_co ) );
        }
        throw WLBadAllocException( "Could not allocate CUDA memory!" );
    }

    size_t threadsPerBlock = 32;
    size_t blocksPerGrid = ( samples + threadsPerBlock - 1 ) / threadsPerBlock;
    size_t sharedMem = coeffSize * sizeof( CuScalarT );

    cudaEvent_t start, stop;
    cudaEventCreate( &start );
    cudaEventCreate( &stop );

    cudaEventRecord( start, 0 );
    cuFirFilter( blocksPerGrid, threadsPerBlock, sharedMem, dev_out, dev_in, dev_prev, channels, samples,
                    dev_co, coeffSize, pitchOut, pitchIn, pitchPrev );
    cudaError_t kernelError = cudaGetLastError();
    cudaEventRecord( stop, 0 );
    cudaEventSynchronize( stop );

    float elapsedTime;
    cudaEventElapsedTime( &elapsedTime, start, stop );
    cudaEventDestroy( start );
    cudaEventDestroy( stop );

    try
    {
        if( kernelError != cudaSuccess )
        {
            const std::string err( cudaGetErrorString( kernelError ) );
            throw WException( "CUDA kernel failed: " + err );
        }
        CudaThrowsCall( cudaMemcpy2D( output, samples * sizeof( CuScalarT ), dev_out, pitchOut,
                        samples * sizeof( CuScalarT ), channels, cudaMemcpyDeviceToHost ) );
    }
    catch( const WException& e )
    {
        wlog::error( CLASS ) << e.what();
        elapsedTime = -1.0;
    }

    CudaSafeCall( cudaFree( ( void* )dev_in ) );
    CudaSafeCall( cudaFree( ( void* )dev_prev ) );
    CudaSafeCall( cudaFree( ( void* )dev_out ) );
    CudaSafeCall( cudaFree( ( void* )dev_co ) );

    if( elapsedTime > -1.0 )
    {
        return elapsedTime;
    }
    else
    {
        throw WException( "Error in cudaFilter()" );
    }
}
template <typename Dtype>
BaseData<Dtype>::BaseData(const int length)
{
    length_ = length;
    checkCudaErrors(cudaHostAlloc(&cpu_data_, sizeof(Dtype)*length_, cudaHostAllocDefault));
    checkCudaErrors(cudaMalloc(&gpu_data_, sizeof(Dtype)*length_));
}
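// Hedged destructor sketch (not in the original snippet): the constructor
// pairs a pinned host buffer with a device buffer, so teardown presumably
// releases both with the matching APIs; member names follow the constructor.
template <typename Dtype>
BaseData<Dtype>::~BaseData()
{
    checkCudaErrors(cudaFreeHost(cpu_data_));  // pinned host allocation
    checkCudaErrors(cudaFree(gpu_data_));      // device allocation
}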
void ckm(struct svm_problem *prob, struct svm_problem *pecm, float *gamma)
{
    cublasStatus_t status;
    double g_val = *gamma;

    long int nfa;
    int len_tv;
    int ntv;
    int i_v;
    int i_el;
    int i_r, i_c;
    int trvei;

    double *tv_sq;
    double *v_f_g;
    float *tr_ar;
    float *tva, *vtm, *DP;
    float *g_tva = 0, *g_vtm = 0, *g_DotProd = 0;

    /* SGEMV coefficients; the original fragment used alpha/beta without
       declaring them, so y = 1*A*x + 0*y is assumed here */
    float alpha = 1.0f;
    float beta = 0.0f;

    cudaError_t cudaStat;
    cublasHandle_t handle;
    status = cublasCreate(&handle);

    len_tv = prob->x[0].dim;
    ntv = prob->l;
    nfa = len_tv * ntv;

    tva   = (float*) malloc(len_tv * ntv * sizeof(float));
    vtm   = (float*) malloc(len_tv * sizeof(float));
    DP    = (float*) malloc(ntv * sizeof(float));
    tr_ar = (float*) malloc(len_tv * ntv * sizeof(float));
    tv_sq = (double*) malloc(ntv * sizeof(double));
    v_f_g = (double*) malloc(ntv * sizeof(double));

    for (i_r = 0; i_r < ntv; i_r++) {
        for (i_c = 0; i_c < len_tv; i_c++)
            tva[i_r * len_tv + i_c] = (float)prob->x[i_r].values[i_c];
    }

    cudaStat = cudaMalloc((void**)&g_tva, len_tv * ntv * sizeof(float));
    if (cudaStat != cudaSuccess) {
        free( tva );
        free( vtm );
        free( DP );
        free( v_f_g );
        free( tv_sq );
        cudaFree( g_tva );
        cublasDestroy( handle );
        fprintf(stderr, "!!!! Device memory allocation error (A)\n");
        getchar();
        return;
    }
    cudaStat = cudaMalloc((void**)&g_vtm, len_tv * sizeof(float));
    cudaStat = cudaMalloc((void**)&g_DotProd, ntv * sizeof(float));

    /* Transpose to column-major order for CUBLAS */
    for (i_r = 0; i_r < ntv; i_r++)
        for (i_c = 0; i_c < len_tv; i_c++)
            tr_ar[i_c * ntv + i_r] = tva[i_r * len_tv + i_c];

    // Copy cpu vector to gpu vector
    status = cublasSetVector(len_tv * ntv, sizeof(float), tr_ar, 1, g_tva, 1);
    free(tr_ar);

    for (i_v = 0; i_v < ntv; i_v++) {
        tv_sq[i_v] = 0;
        for (i_el = 0; i_el < len_tv; i_el++)
            tv_sq[i_v] += pow(tva[i_v*len_tv + i_el], (float)2.0);
    }

    for (trvei = 0; trvei < ntv; trvei++) {
        status = cublasSetVector(len_tv, sizeof(float), &tva[trvei * len_tv], 1, g_vtm, 1);

        status = cublasSgemv(handle, CUBLAS_OP_N, ntv, len_tv, &alpha, g_tva, ntv,
                             g_vtm, 1, &beta, g_DotProd, 1);

        status = cublasGetVector(ntv, sizeof(float), g_DotProd, 1, DP, 1);

        for (i_c = 0; i_c < ntv; i_c++)
            v_f_g[i_c] = exp(-g_val * (tv_sq[trvei] + tv_sq[i_c] - ((double)2.0) * (double)DP[i_c]));

        pecm->x[trvei].values[0] = trvei + 1;
        for (i_c = 0; i_c < ntv; i_c++)
            pecm->x[trvei].values[i_c + 1] = v_f_g[i_c];
    }

    free( tva );
    free( vtm );
    free( DP );
    free( v_f_g );
    free( tv_sq );

    cudaFree( g_tva );
    cudaFree( g_vtm );
    cudaFree( g_DotProd );

    cublasDestroy( handle );
}
void runAutoTest(int argc, char *argv[])
{
    printf("[%s] (automated testing w/ readback)\n", sSDKsample);

    int devID = findCudaDevice(argc, (const char **)argv);

    // Ensure that SM 2.0 or higher device is available before running
    checkDeviceMeetComputeSpec(argc, argv);

    loadDefaultImage(argv[0]);

    Pixel *d_result;
    checkCudaErrors(cudaMalloc((void **)&d_result, imWidth*imHeight*sizeof(Pixel)));

    char *ref_file = NULL;
    char dump_file[256];

    int mode = 0;
    mode = getCmdLineArgumentInt(argc, (const char **)argv, "mode");
    getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file);

    switch (mode)
    {
        case 0:
            g_SobelDisplayMode = SOBELDISPLAY_IMAGE;
            sprintf(dump_file, "lena_orig.pgm");
            break;

        case 1:
            g_SobelDisplayMode = SOBELDISPLAY_SOBELTEX;
            sprintf(dump_file, "lena_tex.pgm");
            break;

        case 2:
            g_SobelDisplayMode = SOBELDISPLAY_SOBELSHARED;
            sprintf(dump_file, "lena_shared.pgm");
            break;

        default:
            printf("Invalid Filter Mode File\n");
            exit(EXIT_FAILURE);
            break;
    }

    printf("AutoTest: %s <%s>\n", sSDKsample, filterMode[g_SobelDisplayMode]);
    sobelFilter(d_result, imWidth, imHeight, g_SobelDisplayMode, imageScale, blockOp, pointOp);
    checkCudaErrors(cudaDeviceSynchronize());

    unsigned char *h_result = (unsigned char *)malloc(imWidth*imHeight*sizeof(Pixel));
    checkCudaErrors(cudaMemcpy(h_result, d_result, imWidth*imHeight*sizeof(Pixel), cudaMemcpyDeviceToHost));
    sdkSavePGM(dump_file, h_result, imWidth, imHeight);

    if (!sdkComparePGM(dump_file, sdkFindFilePath(ref_file, argv[0]), MAX_EPSILON_ERROR, 0.15f, false))
    {
        g_TotalErrors++;
    }

    checkCudaErrors(cudaFree(d_result));
    free(h_result);

    if (g_TotalErrors != 0)
    {
        printf("Test failed!\n");
        exit(EXIT_FAILURE);
    }

    printf("Test passed!\n");
    exit(EXIT_SUCCESS);
}
int main(int argc, char* argv[])
{
    int disp_size = 64;
    const int bits = 8;

    if (argc >= 2) {
        disp_size = atoi(argv[1]);
    }

    // init zed cam
    auto cap = new sl::zed::Camera(sl::zed::ZEDResolution_mode::VGA);
    sl::zed::ERRCODE err = cap->init(sl::zed::MODE::PERFORMANCE, 0, true);
    if (err != sl::zed::ERRCODE::SUCCESS) {
        std::cout << sl::zed::errcode2str(err) << std::endl;
        exit(EXIT_FAILURE);
    }

    int width = cap->getImageSize().width;
    int height = cap->getImageSize().height;

    sgm::StereoSGM ssgm(width, height, disp_size, 8, 16, sgm::EXECUTE_INOUT_CUDA2CUDA);

    SGMDemo demo(width, height);
    if (demo.init()) {
        printf("fail to init SGM Demo\n");
        std::exit(EXIT_FAILURE);
    }

    Renderer renderer(width, height);

    uint16_t* d_output_buffer = NULL;
    uint8_t* d_input_left = NULL;
    uint8_t* d_input_right = NULL;
    cudaMalloc((void**)&d_input_left, width * height);
    cudaMalloc((void**)&d_input_right, width * height);

    const NppiSize roi = { width, height };

    cv::Mat h_input_left(height, width, CV_8UC1);

    while (!demo.should_close()) {
        cap->grab(sl::zed::SENSING_MODE::FULL, false, false);
        sl::zed::Mat left_zm = cap->retrieveImage_gpu(sl::zed::SIDE::LEFT);
        sl::zed::Mat right_zm = cap->retrieveImage_gpu(sl::zed::SIDE::RIGHT);

        nppiRGBToGray_8u_AC4C1R(left_zm.data, width * 4, d_input_left, width, roi);
        nppiRGBToGray_8u_AC4C1R(right_zm.data, width * 4, d_input_right, width, roi);

        ssgm.execute(d_input_left, d_input_right, (void**)&d_output_buffer);

        switch (demo.get_flag()) {
        case 0:
            cudaMemcpy(h_input_left.data, d_input_left, width * height, cudaMemcpyDeviceToHost);
            renderer.render_input((uint8_t*)h_input_left.data);
            break;
        case 1:
            renderer.render_disparity(d_output_buffer, disp_size);
            break;
        case 2:
            renderer.render_disparity_color(d_output_buffer, disp_size);
            break;
        }

        demo.swap_buffer();
    }

    cudaFree(d_input_left);
    cudaFree(d_input_right);
    delete cap;
}
void matrix_t::alloc_device()
{
    CUDA_SAFE_CALL( cudaMalloc((void**)&device, elems * sizeof(cufftComplex)) );
}
fmat ModelWPAMGPU::ffun(fmat *current)
{
    fmat prediction(current->n_rows, current->n_cols);
    fmat pNoiseSample = pNoise.sample(current->n_cols);
    fmat u = U.sample(current->n_cols);

    float* lastState_dev;
    float* F_dev;
    float* U_dev;
    float* pNoise_dev;
    int stateDimension = current->n_rows;
    int numberOfSamples = current->n_cols;
    float* newState_dev;

    // allocate memory on gpu
    cudaMalloc( &lastState_dev, (size_t) current->n_elem * sizeof(float) );
    cudaMalloc( &F_dev, (size_t) F.n_elem * sizeof(float) );
    cudaMalloc( &U_dev, (size_t) u.n_elem * sizeof(float) );
    cudaMalloc( &pNoise_dev, (size_t) pNoiseSample.n_elem * sizeof(float) );
    cudaMalloc( &newState_dev, (size_t) prediction.n_elem * sizeof(float) );

    // Copy particles and weights to the gpu
    cudaMemcpy(lastState_dev, current->memptr(), (size_t) current->n_elem * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(F_dev, F.memptr(), (size_t) F.n_elem * sizeof(float), cudaMemcpyHostToDevice);
    //cudaMemcpy(U_dev, u.memptr(), (size_t) u.n_elem * sizeof(float), cudaMemcpyHostToDevice);
    //cudaMemcpy(pNoise_dev, pNoiseSample.memptr(), (size_t) pNoiseSample.n_elem * sizeof(float), cudaMemcpyHostToDevice);

    // pNoise
    curandGenerateNormal(gen, pNoise_dev, numberOfSamples, 0.0f, 50.0e-6f);
    curandGenerateNormal(gen, pNoise_dev+numberOfSamples, numberOfSamples, 0.0f, 50.0e-6f);
    curandGenerateNormal(gen, pNoise_dev+2*numberOfSamples, numberOfSamples, 0.0f, 50.0e-6f);
    curandGenerateNormal(gen, pNoise_dev+3*numberOfSamples, numberOfSamples, 0.0f, 10.0e-6f);
    curandGenerateNormal(gen, pNoise_dev+4*numberOfSamples, numberOfSamples, 0.0f, 10.0e-6f);
    curandGenerateNormal(gen, pNoise_dev+5*numberOfSamples, numberOfSamples, 0.0f, 10.0e-6f);
    curandGenerateNormal(gen, pNoise_dev+6*numberOfSamples, numberOfSamples, 0.0f, 100.0e-6f);
    curandGenerateNormal(gen, pNoise_dev+7*numberOfSamples, numberOfSamples, 0.0f, 100.0e-6f);
    curandGenerateNormal(gen, pNoise_dev+8*numberOfSamples, numberOfSamples, 0.0f, 100.0e-6f);

    // U
    U.batch.at(0);
    for (unsigned int i = 0; i < 9; ++i) {
        curandGenerateNormal(gen, U_dev + i*numberOfSamples, numberOfSamples, U.batch.at(i)->a, U.batch.at(i)->b);
    }

    /*curandGenerateNormal(gen, oNoise_dev, numberOfSamples, 0.0f, 50.0e-6f);
    curandGenerateNormal(gen, oNoise_dev+numberOfSamples, numberOfSamples, 0.0f, 50.0e-6f);
    curandGenerateNormal(gen, oNoise_dev+2*numberOfSamples, numberOfSamples, 0.0f, 50.0e-6f);
    curandGenerateNormal(gen, oNoise_dev+3*numberOfSamples, numberOfSamples, 0.0f, 10.0e-6f);
    curandGenerateNormal(gen, oNoise_dev+4*numberOfSamples, numberOfSamples, 0.0f, 10.0e-6f);
    curandGenerateNormal(gen, oNoise_dev+5*numberOfSamples, numberOfSamples, 0.0f, 10.0e-6f);
    curandGenerateNormal(gen, oNoise_dev+6*numberOfSamples, numberOfSamples, 0.0f, 100.0e-6f);
    curandGenerateNormal(gen, oNoise_dev+7*numberOfSamples, numberOfSamples, 0.0f, 100.0e-6f);
    curandGenerateNormal(gen, oNoise_dev+8*numberOfSamples, numberOfSamples, 0.0f, 100.0e-6f);*/

    //prediction = F * current + pNoiseSample + u;
    callFfunKernel(lastState_dev, F_dev, U_dev, pNoise_dev, stateDimension, numberOfSamples, newState_dev);
    //printf("%s\n", cudaGetErrorString(cudaGetLastError()));

    // get estimation from gpu
    cudaMemcpy(prediction.memptr(), newState_dev, current->n_elem * sizeof(float), cudaMemcpyDeviceToHost);

    // clean up the graphics card
    cudaFree(lastState_dev);
    cudaFree(newState_dev);
    cudaFree(F_dev);
    cudaFree(U_dev);
    cudaFree(pNoise_dev);

    return prediction;
}
void shmoo(int minN, int maxN, int maxThreads, int maxBlocks, ReduceType datatype)
{
    fprintf(stderr, "Shmoo wasn't implemented in this modified kernel!\n");
    exit(1);

    // create random input data on CPU
    unsigned int bytes = maxN * sizeof(T);
    T *h_idata = (T*) malloc(bytes);

    for (int i = 0; i < maxN; i++) {
        // Keep the numbers small so we don't get truncation error in the sum
        if (datatype == REDUCE_INT)
            h_idata[i] = (T)(rand() & 0xFF);
        else
            h_idata[i] = (rand() & 0xFF) / (T)RAND_MAX;
    }

    int maxNumBlocks = MIN(maxN / maxThreads, MAX_BLOCK_DIM_SIZE);

    // allocate mem for the result on host side
    T* h_odata = (T*) malloc(maxNumBlocks*sizeof(T));

    // allocate device memory and data
    T* d_idata = NULL;
    T* d_odata = NULL;

    cutilSafeCallNoSync( cudaMalloc((void**) &d_idata, bytes) );
    cutilSafeCallNoSync( cudaMalloc((void**) &d_odata, maxNumBlocks*sizeof(T)) );

    // copy data directly to device memory
    cutilSafeCallNoSync( cudaMemcpy(d_idata, h_idata, bytes, cudaMemcpyHostToDevice) );
    cutilSafeCallNoSync( cudaMemcpy(d_odata, h_idata, maxNumBlocks*sizeof(T), cudaMemcpyHostToDevice) );

    // warm-up
    for (int kernel = 0; kernel < 7; kernel++) {
        sumreduce<T>(maxN, maxThreads, maxNumBlocks, kernel, d_idata, d_odata);
    }

    int testIterations = 100;

    unsigned int timer = 0;
    cutilCheckError( cutCreateTimer(&timer) );

    // print headers
    shrLog("Time in milliseconds for various numbers of elements for each kernel\n\n\n");
    shrLog("Kernel");
    for (int i = minN; i <= maxN; i *= 2) {
        shrLog(", %d", i);
    }

    for (int kernel = 0; kernel < 7; kernel++) {
        shrLog("\n%d", kernel);
        for (int i = minN; i <= maxN; i *= 2) {
            cutResetTimer(timer);
            int numBlocks = 0;
            int numThreads = 0;
            getNumBlocksAndThreads(kernel, i, maxBlocks, maxThreads, numBlocks, numThreads);

            float reduceTime;
            if (numBlocks <= MAX_BLOCK_DIM_SIZE) {
                benchmarkReduceSum(i, numThreads, numBlocks, maxThreads, maxBlocks, kernel,
                                   testIterations, false, 1, timer, h_odata, d_idata, d_odata);
                reduceTime = cutGetAverageTimerValue(timer);
            } else {
                reduceTime = -1.0;
            }
            shrLog(", %.5f", reduceTime);
        }
    }

    // cleanup
    cutilCheckError(cutDeleteTimer(timer));
    free(h_idata);
    free(h_odata);

    cutilSafeCallNoSync(cudaFree(d_idata));
    cutilSafeCallNoSync(cudaFree(d_odata));
}
int CORE_dtstrf_cublas(int M, int N, int IB, int NB,
                       double *U, int LDU,
                       double *A, int LDA,
                       double *L, int LDL,
                       int *IPIV,
                       double *WORK, int LDWORK,
                       int *INFO)
{
    static double zzero = 0.0;
    static double mzone = -1.0;

    cublasStatus_t status;
    cudaError_t err;
    double alpha;
    int i, j, ii, sb;
    int im, ip;

#if CONFIG_VERBOSE
    fprintf(stdout, "%s: M=%d N=%d IB=%d NB=%d U=%p LDU=%d A=%p LDA=%d L=%p LDL=%d IPIV=%p WORK=%p LDWORK=%d\n",
            __FUNCTION__, M, N, IB, NB, U, LDU, A, LDA, L, LDL, IPIV, WORK, LDWORK);
    fflush(stdout);
#endif

    /* Check input arguments */
    *INFO = 0;
    if (M < 0) {
        coreblas_error(1, "Illegal value of M");
        return -1;
    }
    if (N < 0) {
        coreblas_error(2, "Illegal value of N");
        return -2;
    }
    if (IB < 0) {
        coreblas_error(3, "Illegal value of IB");
        return -3;
    }
    if ((LDU < max(1,NB)) && (NB > 0)) {
        coreblas_error(6, "Illegal value of LDU");
        return -6;
    }
    if ((LDA < max(1,M)) && (M > 0)) {
        coreblas_error(8, "Illegal value of LDA");
        return -8;
    }
    if ((LDL < max(1,IB)) && (IB > 0)) {
        coreblas_error(10, "Illegal value of LDL");
        return -10;
    }

    /* Quick return */
    if ((M == 0) || (N == 0) || (IB == 0))
        return PLASMA_SUCCESS;

    /* Set L to 0 */
    err = cudaMemset(L, 0, LDL*N*sizeof(double));
    PLASMA_CUDA_ASSERT(err);

    double* dev_ptr = 0;
    err = cudaMalloc((void**)&dev_ptr, 2*sizeof(double));
    PLASMA_CUDA_ASSERT(err);

    double* host_ptr;
    err = cudaMallocHost((void**)&host_ptr, 2*sizeof(double));
    PLASMA_CUDA_ASSERT(err);

    int* piv = kaapi_memory_get_host_pointer_and_validate(IPIV);

    ip = 0;
    for (ii = 0; ii < N; ii += IB) {
        sb = min(N-ii, IB);

        for (i = 0; i < sb; i++) {
            status = cublasIdamax(kaapi_cuda_cublas_handle(), M, &A[LDA*(ii+i)], 1, &im);
            PLASMA_CUBLAS_ASSERT(status);

            /* get im */
            err = cudaStreamSynchronize(kaapi_cuda_kernel_stream());
            PLASMA_CUDA_ASSERT(err);

            /* adjust index, CUBLAS is 1-based indexing */
            im--;

            piv[ip] = ii+i+1;

            core_dtstrf_cmp(kaapi_cuda_kernel_stream(), &A[LDA*(ii+i)+im], &U[LDU*(ii+i)+ii+i], dev_ptr, host_ptr);
            err = cudaStreamSynchronize(kaapi_cuda_kernel_stream());
            PLASMA_CUDA_ASSERT(err);

            if (host_ptr[0] == 1.0f) {
                /*
                 * Swap behind.
                 */
                status = cublasDswap(kaapi_cuda_cublas_handle(), i, &L[LDL*ii+i], LDL, &WORK[im], LDWORK);
                PLASMA_CUBLAS_ASSERT(status);
                /*
                 * Swap ahead.
                 */
                status = cublasDswap(kaapi_cuda_cublas_handle(), sb-i, &U[LDU*(ii+i)+ii+i], LDU, &A[LDA*(ii+i)+im], LDA);
                PLASMA_CUBLAS_ASSERT(status);
                /*
                 * Set IPIV.
                 */
                piv[ip] = NB + im + 1;

                core_dtstrf_set_zero(kaapi_cuda_kernel_stream(), A, LDA, i, ii, im, zzero);
            }

            core_dtstrf_cmp_zzero_and_get_alpha(kaapi_cuda_kernel_stream(),
                    &A[LDA*(ii+i)+im], &U[LDU*(ii+i)+ii+i], zzero, dev_ptr, host_ptr);
            err = cudaStreamSynchronize(kaapi_cuda_kernel_stream());
            PLASMA_CUDA_ASSERT(err);

            if ((*INFO == 0) && (host_ptr[0] == 1.0f)) {
                *INFO = ii+i+1;
            }

            // alpha = ((double)1. / U[LDU*(ii+i)+ii+i]);
            alpha = host_ptr[1];
            status = cublasDscal(kaapi_cuda_cublas_handle(), M, &alpha, &A[LDA*(ii+i)], 1);
            PLASMA_CUBLAS_ASSERT(status);

            status = cublasDcopy(kaapi_cuda_cublas_handle(), M, &A[LDA*(ii+i)], 1, &WORK[LDWORK*i], 1);
            PLASMA_CUBLAS_ASSERT(status);

            status = cublasDger(kaapi_cuda_cublas_handle(), M, sb-i-1,
                                &mzone, &A[LDA*(ii+i)], 1,
                                &U[LDU*(ii+i+1)+ii+i], LDU,
                                &A[LDA*(ii+i+1)], LDA);
            PLASMA_CUBLAS_ASSERT(status);

            ip = ip+1;
        }

        /*
         * Apply the subpanel to the rest of the panel.
         */
        if (ii+i < N) {
            for (j = ii; j < ii+sb; j++) {
                if (piv[j] <= NB) {
                    piv[j] = piv[j] - ii;
                }
            }

            CORE_dssssm_cublas_v2(NB, N-(ii+sb), M, N-(ii+sb), sb, sb,
                                  &U[LDU*(ii+sb)+ii], LDU,
                                  &A[LDA*(ii+sb)], LDA,
                                  &L[LDL*ii], LDL,
                                  WORK, LDWORK,
                                  &piv[ii]);
            err = cudaStreamSynchronize(kaapi_cuda_kernel_stream());
            PLASMA_CUDA_ASSERT(err);

            for (j = ii; j < ii+sb; j++) {
                if (piv[j] <= NB) {
                    piv[j] = piv[j] + ii;
                }
            }
        }
    }

    cudaFreeHost(host_ptr);
    cudaFree(dev_ptr);

    return PLASMA_SUCCESS;
}
void compute_process()
{
    int np, pid;
    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
    MPI_Comm_size(MPI_COMM_WORLD, &np);
    int server_process = np - 1;
    MPI_Status status;

    int num_comp_nodes = np - 1;
    unsigned int num_bytes = sizeof(sAgents);
    unsigned int num_halo_points = RADIO * world_width;
    unsigned int num_halo_bytes = num_halo_points * sizeof(int);

    size_t size_world = world_width * world_height * sizeof(int);
    int *h_world = (int *)malloc(size_world);
    int *d_world;

    int left_neighbor = (pid > 0) ? (pid - 1) : MPI_PROC_NULL;
    int right_neighbor = (pid < np - 2) ? (pid + 1) : MPI_PROC_NULL;

    for (int j = 0; j < world_width * world_height; j++) {
        h_world[j] = 0;
    }

    sAgents h_agents_in, h_agents_left_node, h_agents_right_node;
    float4 h_agents_pos[agents_total], h_agents_ids[agents_total];
    float4 *d_agents_pos, *d_agents_ids;
    unsigned int num_bytes_agents = agents_total * sizeof(float4);

    int world_height_node = world_height / num_comp_nodes;

    // Error code to check return values for CUDA calls
    cudaError_t err = cudaSuccess;

    // Allocate the device pointers
    err = cudaMalloc((void **)&d_world, size_world);
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to allocate device pointer (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    err = cudaMalloc((void **)&d_agents_pos, num_bytes_agents);
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to allocate device pointer (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    err = cudaMalloc((void **)&d_agents_ids, num_bytes_agents);
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to allocate device pointer (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    MPI_Recv(&h_agents_in, num_bytes, MPI_BYTE, server_process, 0, MPI_COMM_WORLD, &status);

    for (int i = 0; i < agents_total; i++) {
        // identify the active agents according to the y coordinate and set the busy cells in the world
        if (( round(h_agents_in.pos[i].y) >= (pid * world_height_node) ) and
            ( round(h_agents_in.pos[i].y) < ( (pid + 1) * world_height_node ) )) {
            h_agents_in.ids[i].y = 1;
            h_world[(int)round( (world_width * (h_agents_in.pos[i].y - 1) ) + h_agents_in.pos[i].x )] = h_agents_in.ids[i].x;
        }

        // copy the data to local arrays
        h_agents_pos[i] = h_agents_in.pos[i];
        h_agents_ids[i] = h_agents_in.ids[i];
    }

    err = cudaMemcpy(d_world, h_world, size_world, cudaMemcpyHostToDevice);
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to copy pointer from host to device (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    //for(int it = 0; it < nreps ; it++)
    while (1) {
        int it = 4;

        err = cudaMemcpy(d_agents_pos, h_agents_pos, num_bytes_agents, cudaMemcpyHostToDevice);
        if (err != cudaSuccess) {
            fprintf(stderr, "Failed to copy pointer from host to device (error code %s)!\n", cudaGetErrorString(err));
            exit(EXIT_FAILURE);
        }
        err = cudaMemcpy(d_agents_ids, h_agents_ids, num_bytes_agents, cudaMemcpyHostToDevice);
        if (err != cudaSuccess) {
            fprintf(stderr, "Failed to copy pointer from host to device (error code %s)!\n", cudaGetErrorString(err));
            exit(EXIT_FAILURE);
        }

        launch_kernel(d_agents_pos, d_agents_ids, d_world, world_width, world_height,
                      agent_width, agent_height, world_height_node, pid);

        cudaMemcpy(h_agents_pos, d_agents_pos, num_bytes_agents, cudaMemcpyDeviceToHost);
        cudaMemcpy(h_agents_ids, d_agents_ids, num_bytes_agents, cudaMemcpyDeviceToHost);

        // copy the data to the struct
        for (int i = 0; i < agents_total; i++) {
            h_agents_in.pos[i] = h_agents_pos[i];
            h_agents_in.ids[i] = h_agents_ids[i];
        }

        MPI_Barrier(MPI_COMM_WORLD);
        MPI_Send(&h_agents_in, num_bytes, MPI_BYTE, server_process, DATA_COLLECT, MPI_COMM_WORLD);

#ifdef DEBUG
        //printf("pid: %d\n", pid);
        //display_data(h_agents_in);
#endif

        // send data to left, get data from right
        MPI_Sendrecv(&h_agents_in, num_bytes, MPI_BYTE, left_neighbor, it,
                     &h_agents_right_node, num_bytes, MPI_BYTE, right_neighbor, it,
                     MPI_COMM_WORLD, &status);
        // send data to right, get data from left
        MPI_Sendrecv(&h_agents_in, num_bytes, MPI_BYTE, right_neighbor, it,
                     &h_agents_left_node, num_bytes, MPI_BYTE, left_neighbor, it,
                     MPI_COMM_WORLD, &status);

        for (int i = 0; i < agents_total; i++) {
            if (pid != np-2) {
                if (h_agents_right_node.ids[i].y == 2) {
                    h_agents_in.pos[i] = h_agents_right_node.pos[i];
                    h_agents_pos[i] = h_agents_right_node.pos[i];
                    h_agents_in.ids[i].y = 1;
                    h_agents_ids[i].y = 1;
                }
            }
            if (pid != 0) {
                if (h_agents_left_node.ids[i].y == 3) {
                    h_agents_in.pos[i] = h_agents_left_node.pos[i];
                    h_agents_pos[i] = h_agents_left_node.pos[i];
                    h_agents_in.ids[i].y = 1;
                    h_agents_ids[i].y = 1;
                }
            }
        }

        /***
        if(pid == 1) {
            printf("pid: %d\n", pid);
            display_data(h_agents_in);
            display_data(h_agents_right_node);
            display_data(h_agents_left_node);
        }
        ***/
    }

    /* Release resources */
    // free(h_agents_in);
    /*
    free(h_output);
    cudaFreeHost(h_left_boundary);
    cudaFreeHost(h_right_boundary);
    cudaFreeHost(h_left_halo);
    cudaFreeHost(h_right_halo);
    cudaFree(d_input);
    cudaFree(d_output);
    */
}
int main(void)
{
    //std::cout << "Generating a time series on device " << tim.get_nsamps() << std::endl;
    //DeviceTimeSeries<float> d_tim(8388608);
    //d_tim.set_tsamp(0.000064);

    TimeSeries<float> tim;
    tim.from_file("/lustre/home/ebarr/Soft/peasoup/tmp5.tim");

    DeviceTimeSeries<float> d_tim(tim);
    unsigned int size = d_tim.get_nsamps();

    TimeSeriesFolder folder(size);
    //DeviceTimeSeries<float> d_tim_r(fft_size); //<----for resampled data
    //TimeDomainResampler resampler;

    float* folded_buffer;
    cudaError_t error;
    cufftResult result;

    error = cudaMalloc((void**)&folded_buffer, sizeof(float)*size);
    ErrorChecker::check_cuda_error(error);

    unsigned nints = 64;
    unsigned nbins = 32;

    cufftComplex* fft_out;
    error = cudaMalloc((void**)&fft_out, sizeof(cufftComplex)*nints*nbins);
    ErrorChecker::check_cuda_error(error);

    cufftHandle plan;
    result = cufftPlan1d(&plan, nbins, CUFFT_R2C, nints);
    ErrorChecker::check_cufft_error(result);

    Stopwatch timer;

    FoldedSubints<float> folded_array(nbins, nints);
    //folder.fold(d_tim, folded_array, 0.007453079228);
    std::cout << "made it here" << std::endl;

    FoldOptimiser optimiser(nbins, nints);
    timer.start();
    for (int ii = 0; ii < 1; ii++) {
        //FoldedSubints<float> folded_array(nbins, nints);
        folder.fold(d_tim, folded_array, 0.007453099228);
        Utils::dump_device_buffer<float>(folded_array.get_data(), nints*nbins, "original_fold.bin");
        optimiser.optimise(folded_array);
    }
    timer.stop();

    /*
    float* temp = new float[nints*nbins];
    cudaMemcpy(temp, folded_buffer, nints*nbins*sizeof(float), cudaMemcpyDeviceToHost);
    ErrorChecker::check_cuda_error();
    for (int ii = 0; ii < nints*nbins; ii++)
        std::cout << temp[ii] << std::endl;
    */

    std::cout << "Total execution time (s): " << timer.getTime() << std::endl;
    std::cout << "Average execution time (s): " << timer.getTime()/1000.0 << std::endl;

    return 0;
}
void allocate_memory(void)
{
    //allocate host arrays
    a0 = (float*)malloc(memsize);
    a1 = (float*)malloc(memsize);
    c1 = (float*)malloc(memsize);
    c2 = (float*)malloc(memsize);
    a2 = (float*)malloc(memsize);
    a3 = (float*)malloc(memsize);
    b0 = (float*)malloc(memsize);
    b1 = (float*)malloc(memsize);
    b2 = (float*)malloc(memsize);
    c0 = (float*)malloc(memsize);
    wrk = (float*)malloc(memsize);
    bnd = (float*)malloc(memsize);
    if (!a0 || !a1 || !a2 || !a3 || !b0 || !b1 || !b2 || !c0 || !c1 || !c2 || !wrk || !bnd) {
        fprintf(stderr, "Host allocation error in file '%s' in line %i\n", __FILE__, __LINE__);
        exit(EXIT_FAILURE);
    }

    //allocate pressure array page-locked
    CUDA_SAFE_CALL(cudaMallocHost((void**)&p1, memsize));
    CUDA_SAFE_CALL(cudaMallocHost((void**)&p2, memsize));

    //allocate page-locked gosa variables
    CUDA_SAFE_CALL(cudaMallocHost((void**)&gosa_btm, sizeof(float)));
    CUDA_SAFE_CALL(cudaMallocHost((void**)&gosa_top, sizeof(float)));

#if defined(USE_PAD)
    // This padding & offsetting is a workaround for a problem with the r3.0
    // global memory allocator
    #define PAD (1024*1024)
    #define OFS (6*1024)
#else
    #define PAD (0)
    #define OFS (0)
#endif /* defined(USE_PAD) */

    //allocate device arrays
    CUDA_SAFE_CALL(cudaMalloc((void**)&gosa_d_orig, GRID_X * GRID_Y * sizeof(gosa_d[0])));
    gosa_d = 0*OFS + gosa_d_orig;
    DBGMSG(("gosa_d = %10p size = %10lu\n", gosa_d, GRID_X * GRID_Y * sizeof(float)));

    CUDA_SAFE_CALL(cudaMalloc((void**)&a0_d_orig, memsize + PAD));
    a0_d = 1*OFS + a0_d_orig;
    DBGMSG(("a0_d   = %10p size = %10d pad = %10d\n", a0_d, memsize, PAD));

    CUDA_SAFE_CALL(cudaMalloc((void**)&a1_d_orig, memsize + PAD));
    a1_d = 2*OFS + a1_d_orig;
    DBGMSG(("a1_d   = %10p size = %10d pad = %10d\n", a1_d, memsize, PAD));

    CUDA_SAFE_CALL(cudaMalloc((void**)&c1_d_orig, memsize + PAD));
    c1_d = 9*OFS + c1_d_orig;
    DBGMSG(("c1_d   = %10p size = %10d pad = %10d\n", c1_d, memsize, PAD));

    CUDA_SAFE_CALL(cudaMalloc((void**)&c2_d_orig, memsize + PAD));
    c2_d = 10*OFS + c2_d_orig;
    DBGMSG(("c2_d   = %10p size = %10d pad = %10d\n", c2_d, memsize, PAD));

    CUDA_SAFE_CALL(cudaMalloc((void**)&a2_d_orig, memsize + PAD));
    a2_d = 3*OFS + a2_d_orig;
    DBGMSG(("a2_d   = %10p size = %10d pad = %10d\n", a2_d, memsize, PAD));

    CUDA_SAFE_CALL(cudaMalloc((void**)&a3_d_orig, memsize + PAD));
    a3_d = 4*OFS + a3_d_orig;
    DBGMSG(("a3_d   = %10p size = %10d pad = %10d\n", a3_d, memsize, PAD));

    CUDA_SAFE_CALL(cudaMalloc((void**)&b0_d_orig, memsize + PAD));
    b0_d = 5*OFS + b0_d_orig;
    DBGMSG(("b0_d   = %10p size = %10d pad = %10d\n", b0_d, memsize, PAD));

    CUDA_SAFE_CALL(cudaMalloc((void**)&b1_d_orig, memsize + PAD));
    b1_d = 6*OFS + b1_d_orig;
    DBGMSG(("b1_d   = %10p size = %10d pad = %10d\n", b1_d, memsize, PAD));

    CUDA_SAFE_CALL(cudaMalloc((void**)&b2_d_orig, memsize + PAD));
    b2_d = 7*OFS + b2_d_orig;
    DBGMSG(("b2_d   = %10p size = %10d pad = %10d\n", b2_d, memsize, PAD));

    CUDA_SAFE_CALL(cudaMalloc((void**)&c0_d_orig, memsize + PAD));
    c0_d = 8*OFS + c0_d_orig;
    DBGMSG(("c0_d   = %10p size = %10d pad = %10d\n", c0_d, memsize, PAD));

    CUDA_SAFE_CALL(cudaMalloc((void**)&wrk_d_orig, memsize + PAD));
    wrk_d = 11*OFS + wrk_d_orig;
    DBGMSG(("wrk_d  = %10p size = %10d pad = %10d\n", wrk_d, memsize, PAD));

    CUDA_SAFE_CALL(cudaMalloc((void**)&bnd_d_orig, memsize + PAD));
    bnd_d = 12*OFS + bnd_d_orig;
    DBGMSG(("bnd_d  = %10p size = %10d pad = %10d\n", bnd_d, memsize, PAD));

    CUDA_SAFE_CALL(cudaMalloc((void**)&p1_d_orig, memsize + PAD));
    p1_d = 13*OFS + p1_d_orig;
    DBGMSG(("p1_d   = %10p size = %10d pad = %10d\n", p1_d, memsize, PAD));

    CUDA_SAFE_CALL(cudaMalloc((void**)&p2_d_orig, memsize + PAD));
    p2_d = 14*OFS + p2_d_orig;
    DBGMSG(("p2_d   = %10p size = %10d pad = %10d\n", p2_d, memsize, PAD));
}
int main(int argc, char *argv[])
{
    int rank, size, n, len, numbytes;
    void *a_h, *a_d;
    struct timeval time[2];
    double bandwidth;
    char name[MPI_MAX_PROCESSOR_NAME];
    MPI_Status status;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Get_processor_name(name, &len);

    printf("Process %d is on %s\n", rank, name);

    printf("Using regular memory \n");
    a_h = malloc(NBYTES);

    cudaMalloc((void **) &a_d, NBYTES);

    /* Test host -> device bandwidth. */
    MPI_Barrier(MPI_COMM_WORLD);

    gettimeofday(&time[0], NULL);
    for (n = 0; n < NREPEAT; n++) {
        cudaMemcpy(a_d, a_h, NBYTES, cudaMemcpyHostToDevice);
    }
    gettimeofday(&time[1], NULL);

    /* elapsed seconds plus the fractional microsecond part */
    bandwidth  = time[1].tv_sec - time[0].tv_sec;
    bandwidth += 1.e-6 * (time[1].tv_usec - time[0].tv_usec);
    bandwidth  = NBYTES * NREPEAT / 1.e6 / bandwidth;

    printf("Host->device bandwidth for process %d: %f MB/sec\n", rank, bandwidth);

    /* Test MPI send/recv bandwidth. */
    MPI_Barrier(MPI_COMM_WORLD);
    gettimeofday(&time[0], NULL);
    for (n = 0; n < NREPEAT; n++) {
        if (rank == 0)
            MPI_Send(a_h, NBYTES/sizeof(int), MPI_INT, 1, 0, MPI_COMM_WORLD);
        else
            MPI_Recv(a_h, NBYTES/sizeof(int), MPI_INT, 0, 0, MPI_COMM_WORLD, &status);
    }
    gettimeofday(&time[1], NULL);

    bandwidth  = time[1].tv_sec - time[0].tv_sec;
    bandwidth += 1.e-6 * (time[1].tv_usec - time[0].tv_usec);
    bandwidth  = NBYTES * NREPEAT / 1.e6 / bandwidth;

    if (rank == 0)
        printf("MPI send/recv bandwidth: %f MB/sec\n", bandwidth);

    cudaFree(a_d);
    free(a_h);

    MPI_Finalize();
    return 0;
}
int main(int argc, char **argv)
{
    int OPT_N = 4000000;
    int OPT_SZ = OPT_N * sizeof(float);

    BlackScholes bs;

    printf("Initializing data...\n");
    float *callResult, *putResult, *stockPrice, *optionStrike, *optionYears;
    checkCuda( cudaMallocHost((void**)&callResult,   OPT_SZ) );
    checkCuda( cudaMallocHost((void**)&putResult,    OPT_SZ) );
    checkCuda( cudaMallocHost((void**)&stockPrice,   OPT_SZ) );
    checkCuda( cudaMallocHost((void**)&optionStrike, OPT_SZ) );
    checkCuda( cudaMallocHost((void**)&optionYears,  OPT_SZ) );

    initOptions(OPT_N, stockPrice, optionStrike, optionYears);

    printf("Running Host Version...\n");
    StartTimer();

    // run BlackScholes operator on host
    bs(callResult, putResult, stockPrice, optionStrike, optionYears, RISKFREE, VOLATILITY, OPT_N);

    printf("Option 0 call: %f\n", callResult[0]);
    printf("Option 0 put: %f\n", putResult[0]);

    double ms = GetTimer();

    // Both call and put are calculated
    printf("Options count : %i \n", 2 * OPT_N);
    printf("\tBlackScholes() time : %f msec\n", ms);
    printf("\t%f GB/s, %f GOptions/s\n",
           ((double)(5 * OPT_N * sizeof(float)) * 1E-9) / (ms * 1E-3),
           ((double)(2 * OPT_N) * 1E-9) / (ms * 1E-3));

    float *d_callResult, *d_putResult;
    float *d_stockPrice, *d_optionStrike, *d_optionYears;
    checkCuda( cudaMalloc((void**)&d_callResult,   OPT_SZ) );
    checkCuda( cudaMalloc((void**)&d_putResult,    OPT_SZ) );
    checkCuda( cudaMalloc((void**)&d_stockPrice,   OPT_SZ) );
    checkCuda( cudaMalloc((void**)&d_optionStrike, OPT_SZ) );
    checkCuda( cudaMalloc((void**)&d_optionYears,  OPT_SZ) );

    printf("Running Device Version...\n");
    StartTimer();

    // Launch Black-Scholes operator on device
#ifdef HEMI_CUDA_COMPILER
    cudaMemcpy(d_stockPrice,   stockPrice,   OPT_SZ, cudaMemcpyHostToDevice);
    cudaMemcpy(d_optionStrike, optionStrike, OPT_SZ, cudaMemcpyHostToDevice);
    cudaMemcpy(d_optionYears,  optionYears,  OPT_SZ, cudaMemcpyHostToDevice);

    hemi::launch(bs, d_callResult, d_putResult, d_stockPrice, d_optionStrike, d_optionYears,
                 RISKFREE, VOLATILITY, OPT_N);

    cudaMemcpy(callResult, d_callResult, OPT_SZ, cudaMemcpyDeviceToHost);
    cudaMemcpy(putResult,  d_putResult,  OPT_SZ, cudaMemcpyDeviceToHost);
#else
    // demonstrates that "launch" goes to host when not compiled with NVCC
    hemi::launch(bs, callResult, putResult, stockPrice, optionStrike, optionYears,
                 RISKFREE, VOLATILITY, OPT_N);
#endif

    printf("Option 0 call: %f\n", callResult[0]);
    printf("Option 0 put: %f\n", putResult[0]);

    ms = GetTimer();

    // Both call and put are calculated
    printf("Options count : %i \n", 2 * OPT_N);
    printf("\tBlackScholes() time : %f msec\n", ms);
    printf("\t%f GB/s, %f GOptions/s\n",
           ((double)(5 * OPT_N * sizeof(float)) * 1E-9) / (ms * 1E-3),
           ((double)(2 * OPT_N) * 1E-9) / (ms * 1E-3));

    // free all device buffers, including the two result arrays
    checkCuda( cudaFree(d_callResult) );
    checkCuda( cudaFree(d_putResult) );
    checkCuda( cudaFree(d_stockPrice) );
    checkCuda( cudaFree(d_optionStrike) );
    checkCuda( cudaFree(d_optionYears) );

    checkCuda( cudaFreeHost(callResult) );
    checkCuda( cudaFreeHost(putResult) );
    checkCuda( cudaFreeHost(stockPrice) );
    checkCuda( cudaFreeHost(optionStrike) );
    checkCuda( cudaFreeHost(optionYears) );
}
//////////////////////////////////////////////////////////////////////////////// // initialize marching cubes //////////////////////////////////////////////////////////////////////////////// void initMC(int argc, char** argv) { // parse command line arguments int n; if (cutGetCmdLineArgumenti( argc, (const char**) argv, "grid", &n)) { gridSizeLog2.x = gridSizeLog2.y = gridSizeLog2.z = n; } if (cutGetCmdLineArgumenti( argc, (const char**) argv, "gridx", &n)) { gridSizeLog2.x = n; } if (cutGetCmdLineArgumenti( argc, (const char**) argv, "gridy", &n)) { gridSizeLog2.y = n; } if (cutGetCmdLineArgumenti( argc, (const char**) argv, "gridz", &n)) { gridSizeLog2.z = n; } char *filename; if (cutGetCmdLineArgumentstr( argc, (const char**) argv, "file", &filename)) { volumeFilename = filename; } gridSize = make_uint3(1<<gridSizeLog2.x, 1<<gridSizeLog2.y, 1<<gridSizeLog2.z); gridSizeMask = make_uint3(gridSize.x-1, gridSize.y-1, gridSize.z-1); gridSizeShift = make_uint3(0, gridSizeLog2.x, gridSizeLog2.x+gridSizeLog2.y); numVoxels = gridSize.x*gridSize.y*gridSize.z; voxelSize = make_float3(2.0f / gridSize.x, 2.0f / gridSize.y, 2.0f / gridSize.z); maxVerts = gridSize.x*gridSize.y*100; printf("grid: %d x %d x %d = %d voxels\n", gridSize.x, gridSize.y, gridSize.z, numVoxels); printf("max verts = %d\n", maxVerts); #if SAMPLE_VOLUME // load volume data char* path = cutFindFilePath(volumeFilename, argv[0]); if (path == 0) { fprintf(stderr, "Error finding file '%s'\n", volumeFilename); cudaThreadExit(); exit(EXIT_FAILURE); } int size = gridSize.x*gridSize.y*gridSize.z*sizeof(uchar); uchar *volume = loadRawFile(path, size); cutilSafeCall(cudaMalloc((void**) &d_volume, size)); cutilSafeCall(cudaMemcpy(d_volume, volume, size, cudaMemcpyHostToDevice) ); free(volume); bindVolumeTexture(d_volume); #endif if (g_bQAReadback) { cudaMalloc((void **)&(d_pos), maxVerts*sizeof(float)*4); cudaMalloc((void **)&(d_normal), maxVerts*sizeof(float)*4); } else { // create VBOs createVBO(&posVbo, maxVerts*sizeof(float)*4); // DEPRECATED: cutilSafeCall( cudaGLRegisterBufferObject(posVbo) ); cutilSafeCall(cudaGraphicsGLRegisterBuffer(&cuda_posvbo_resource, posVbo, cudaGraphicsMapFlagsWriteDiscard)); createVBO(&normalVbo, maxVerts*sizeof(float)*4); // DEPRECATED: cutilSafeCall(cudaGLRegisterBufferObject(normalVbo)); cutilSafeCall(cudaGraphicsGLRegisterBuffer(&cuda_normalvbo_resource, normalVbo, cudaGraphicsMapFlagsWriteDiscard)); } // allocate textures allocateTextures( &d_edgeTable, &d_triTable, &d_numVertsTable ); // allocate device memory unsigned int memSize = sizeof(uint) * numVoxels; cutilSafeCall(cudaMalloc((void**) &d_voxelVerts, memSize)); cutilSafeCall(cudaMalloc((void**) &d_voxelVertsScan, memSize)); cutilSafeCall(cudaMalloc((void**) &d_voxelOccupied, memSize)); cutilSafeCall(cudaMalloc((void**) &d_voxelOccupiedScan, memSize)); cutilSafeCall(cudaMalloc((void**) &d_compVoxelArray, memSize)); // initialize CUDPP scan CUDPPConfiguration config; config.algorithm = CUDPP_SCAN; config.datatype = CUDPP_UINT; config.op = CUDPP_ADD; config.options = CUDPP_OPTION_FORWARD | CUDPP_OPTION_EXCLUSIVE; cudppPlan(&scanplan, config, numVoxels, 1, 0); }
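/* loadRawFile() is referenced above but not defined in this excerpt; a minimal sketch assuming the volume is a raw dump of exactly `size` bytes (the SDK version may differ): */
#include <stdio.h>
#include <stdlib.h>

typedef unsigned char uchar;

uchar *loadRawFile(const char *filename, int size)
{
    FILE *fp = fopen(filename, "rb");
    if (!fp) return NULL;                         /* caller must check for NULL */
    uchar *data = (uchar *)malloc(size);
    size_t n = fread(data, 1, (size_t)size, fp);  /* read the whole volume */
    fclose(fp);
    if (n != (size_t)size) { free(data); return NULL; }
    return data;
}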
attention_layer<dType>::attention_layer(int LSTM_size,int minibatch_size, int device_number, int D, int longest_sent,cublasHandle_t &handle,neuralMT_model<dType> *model, bool feed_input,bool clip_gradients,dType norm_clip) { this->handle = handle; this->model = model; this->device_number = device_number; this->LSTM_size = LSTM_size; this->minibatch_size = minibatch_size; this->clip_gradients = clip_gradients; this->norm_clip = norm_clip; this->feed_input = feed_input; this->longest_sent = longest_sent; cudaSetDevice(device_number); layer_info.init(device_number,D); dType *h_temp; full_matrix_setup(&h_temp,&d_W_a,LSTM_size,LSTM_size); full_matrix_setup(&h_temp,&d_W_p,LSTM_size,LSTM_size); full_matrix_setup(&h_temp,&d_v_p,1,LSTM_size); full_matrix_setup(&h_temp,&d_output_bias,LSTM_size,1); full_matrix_setup(&h_temp,&d_W_c_p1,LSTM_size,LSTM_size); full_matrix_setup(&h_temp,&d_W_c_p2,LSTM_size,LSTM_size); full_matrix_setup(&h_temp,&d_W_a_grad,LSTM_size,LSTM_size); full_matrix_setup(&h_temp,&d_W_p_grad,LSTM_size,LSTM_size); full_matrix_setup(&h_temp,&d_v_p_grad,1,LSTM_size); full_matrix_setup(&h_temp,&d_output_bias_grad,LSTM_size,1); full_matrix_setup(&h_temp,&d_W_c_p1_grad,LSTM_size,LSTM_size); full_matrix_setup(&h_temp,&d_W_c_p2_grad,LSTM_size,LSTM_size); full_matrix_setup(&h_temp,&d_ERRnTOt_tan_htild,LSTM_size,minibatch_size); full_matrix_setup(&h_temp,&d_ERRnTOt_ct,LSTM_size,minibatch_size); full_matrix_setup(&h_temp,&d_ERRnTOt_ht_p1,LSTM_size,minibatch_size); full_matrix_setup(&h_temp,&d_ERRnTOt_as,2*D+1,minibatch_size); full_matrix_setup(&h_temp,&d_ERRnTOt_pt,1,minibatch_size); full_matrix_setup(&h_temp,&d_temp_1,LSTM_size,minibatch_size); full_matrix_setup(&h_temp,&d_h_t_Wa_factor,2*D+1,minibatch_size); full_vector_setup_ones(&h_temp,&d_ones_minibatch,minibatch_size); thrust_d_W_a_grad = thrust::device_pointer_cast(d_W_a_grad); thrust_d_v_p_grad = thrust::device_pointer_cast(d_v_p_grad); thrust_d_W_p_grad = thrust::device_pointer_cast(d_W_p_grad); thrust_d_W_c_p1_grad = thrust::device_pointer_cast(d_W_c_p1_grad); thrust_d_W_c_p2_grad = thrust::device_pointer_cast(d_W_c_p2_grad); thrust_d_output_bias_grad = thrust::device_pointer_cast(d_output_bias_grad); CUDA_ERROR_WRAPPER(cudaMalloc((void**)&d_result, 1*sizeof(dType)),"GPU memory allocation failed\n"); CUDA_ERROR_WRAPPER(cudaMalloc((void**)&d_temp_result, NORM_THREADS*sizeof(dType)),"GPU memory allocation failed\n"); clear_gradients(); CUDA_ERROR_WRAPPER(cudaMalloc((void**)&d_h_t_sum, LSTM_size*minibatch_size*sizeof(dType)),"GPU memory allocation failed\n"); CUDA_ERROR_WRAPPER(cudaMalloc((void**)&d_h_s_sum, LSTM_size*minibatch_size*sizeof(dType)),"GPU memory allocation failed\n"); for(int i=0; i<longest_sent; i++) { nodes.push_back( attention_node<dType>(LSTM_size,minibatch_size,device_number,D,feed_input,this,i) ); } /* now construct d_total_hs_mat */ dType **h_total_hs_mat = (dType **)malloc(longest_sent*sizeof(dType*)); dType **h_total_hs_error = (dType **)malloc(longest_sent*sizeof(dType*)); for(int i=0; i<longest_sent; i++) { if(model->source_hidden_layers.size() == 0) { h_total_hs_mat[i] = model->input_layer_source.nodes[i].d_h_t; h_total_hs_error[i] = model->input_layer_source.nodes[i].d_d_ERRt_ht; } else { h_total_hs_mat[i] = model->source_hidden_layers[model->source_hidden_layers.size()-1].nodes[i].d_h_t; h_total_hs_error[i] = model->source_hidden_layers[model->source_hidden_layers.size()-1].nodes[i].d_d_ERRt_ht; } } CUDA_ERROR_WRAPPER(cudaMalloc((void**)&d_total_hs_mat, longest_sent*sizeof(dType*)),"GPU memory allocation failed\n"); CUDA_ERROR_WRAPPER(cudaMalloc((void**)&d_total_hs_error, longest_sent*sizeof(dType*)),"GPU memory allocation failed\n"); CUDA_ERROR_WRAPPER(cudaMalloc((void**)&d_batch_info, 2*minibatch_size*sizeof(int)),"GPU memory allocation failed\n"); cudaMemcpy(d_total_hs_mat,h_total_hs_mat,longest_sent*sizeof(dType*),cudaMemcpyHostToDevice); cudaMemcpy(d_total_hs_error,h_total_hs_error,longest_sent*sizeof(dType*),cudaMemcpyHostToDevice); free(h_total_hs_mat); free(h_total_hs_error); }
int dfft_cuda_create_plan(dfft_plan *p, int ndim, int *gdim, int *inembed, int *oembed, int *pdim, int *pidx, int row_m, int input_cyclic, int output_cyclic, MPI_Comm comm, int *proc_map) { int res = dfft_create_plan_common(p, ndim, gdim, inembed, oembed, pdim, pidx, row_m, input_cyclic, output_cyclic, comm, proc_map, 1);
#ifndef ENABLE_MPI_CUDA
/* allocate staging bufs */ /* we need to use posix_memalign/cudaHostRegister instead of cudaHostAlloc, because cudaHostAlloc doesn't have hooks in the MPI library, and using it would lead to data corruption */ int size = p->scratch_size*sizeof(cuda_cpx_t); int page_size = getpagesize(); size = ((size + page_size - 1) / page_size) * page_size; posix_memalign((void **)&(p->h_stage_in),page_size,size); posix_memalign((void **)&(p->h_stage_out),page_size,size); cudaHostRegister(p->h_stage_in, size, cudaHostRegisterDefault); CHECK_CUDA(); cudaHostRegister(p->h_stage_out, size, cudaHostRegisterDefault); CHECK_CUDA();
#endif
/* allocate memory for passing variables */ cudaMalloc((void **)&(p->d_pidx), sizeof(int)*ndim); CHECK_CUDA(); cudaMalloc((void **)&(p->d_pdim), sizeof(int)*ndim); CHECK_CUDA(); cudaMalloc((void **)&(p->d_iembed), sizeof(int)*ndim); CHECK_CUDA(); cudaMalloc((void **)&(p->d_oembed), sizeof(int)*ndim); CHECK_CUDA(); cudaMalloc((void **)&(p->d_length), sizeof(int)*ndim); CHECK_CUDA(); /* initialize cuda buffers */ int *h_length = (int *)malloc(sizeof(int)*ndim); int i; for (i = 0; i < ndim; ++i) h_length[i] = gdim[i]/pdim[i]; cudaMemcpy(p->d_pidx, pidx, sizeof(int)*ndim, cudaMemcpyDefault); CHECK_CUDA(); cudaMemcpy(p->d_pdim, pdim, sizeof(int)*ndim, cudaMemcpyDefault); CHECK_CUDA(); cudaMemcpy(p->d_iembed, p->inembed, sizeof(int)*ndim, cudaMemcpyDefault); CHECK_CUDA(); cudaMemcpy(p->d_oembed, p->oembed, sizeof(int)*ndim, cudaMemcpyDefault); CHECK_CUDA(); cudaMemcpy(p->d_length, h_length, sizeof(int)*ndim, cudaMemcpyDefault); CHECK_CUDA(); free(h_length); int dmax = p->max_depth + 2; p->d_rev_j1 = (int **) malloc(sizeof(int *)*dmax); p->d_rev_global = (int **) malloc(sizeof(int *)*dmax); p->d_rev_partial = (int **) malloc(sizeof(int *)*dmax); p->d_c0 = (int **) malloc(sizeof(int *)*dmax); p->d_c1 = (int **) malloc(sizeof(int *)*dmax); if (p->max_depth) { p->h_alpha = (cuda_scalar_t **) malloc(sizeof(cuda_scalar_t *)*p->max_depth); p->d_alpha = (cuda_scalar_t **) malloc(sizeof(cuda_scalar_t *)*p->max_depth); } int d; for (d = 0; d < dmax; ++d) { cudaMalloc((void **)&(p->d_rev_j1[d]), sizeof(int)*ndim); CHECK_CUDA(); cudaMalloc((void **)&(p->d_rev_partial[d]), sizeof(int)*ndim); CHECK_CUDA(); cudaMalloc((void **)&(p->d_rev_global[d]), sizeof(int)*ndim); CHECK_CUDA(); cudaMalloc((void **)&(p->d_c0[d]), sizeof(int)*ndim); CHECK_CUDA(); cudaMalloc((void **)&(p->d_c1[d]), sizeof(int)*ndim); CHECK_CUDA(); } for (d = 0; d < p->max_depth; ++d) { cudaMalloc((void **)&(p->d_alpha[d]), sizeof(cuda_scalar_t)*ndim); CHECK_CUDA(); p->h_alpha[d] = (cuda_scalar_t *) malloc(sizeof(cuda_scalar_t)*ndim); } /* perform initialization run */ dfft_cuda_execute(NULL, NULL, 0, p); /* initialization finished */ p->init = 0; return res; }
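/* A sketch of the page-aligned pin-after-alloc pattern above, with the return-code checks the plan code omits (alloc_staging is an illustrative name, not part of the library): */
#include <stdlib.h>
#include <unistd.h>
#include <cuda_runtime.h>

static void *alloc_staging(size_t nbytes)
{
    size_t page = (size_t)getpagesize();
    nbytes = ((nbytes + page - 1) / page) * page;   /* round up to a page multiple */
    void *buf = NULL;
    if (posix_memalign(&buf, page, nbytes) != 0) return NULL;
    /* pin the pages so the MPI library and cudaMemcpy can DMA from them */
    if (cudaHostRegister(buf, nbytes, cudaHostRegisterDefault) != cudaSuccess) {
        free(buf);
        return NULL;
    }
    return buf;
}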
int main(int argc, char **argv) { /* Start logs */ shrSetLogFileName ("quasirandomGenerator.txt"); shrLog("%s Starting...\n\n", argv[0]); unsigned int useDoublePrecision; char *precisionChoice; cutGetCmdLineArgumentstr(argc, (const char **)argv, "type", &precisionChoice); if(precisionChoice == NULL) useDoublePrecision = 0; else{ if(!strcasecmp(precisionChoice, "double")) useDoublePrecision = 1; else useDoublePrecision = 0; } unsigned int tableCPU[QRNG_DIMENSIONS][QRNG_RESOLUTION]; float *h_OutputGPU; float *d_Output; int dim, pos; double delta, ref, sumDelta, sumRef, L1norm, gpuTime; unsigned int hTimer; if(sizeof(INT64) != 8){ shrLog("sizeof(INT64) != 8\n"); return 0; } /* use command-line specified CUDA device, otherwise use device with highest Gflops/s */ if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) cutilDeviceInit(argc, argv); else cudaSetDevice( cutGetMaxGflopsDeviceId() ); cutilCheckError(cutCreateTimer(&hTimer)); int deviceIndex; cutilSafeCall(cudaGetDevice(&deviceIndex)); cudaDeviceProp deviceProp; cutilSafeCall(cudaGetDeviceProperties(&deviceProp, deviceIndex)); int version = deviceProp.major * 10 + deviceProp.minor; if(useDoublePrecision && version < 13){ shrLog("Double precision not supported.\n"); cudaThreadExit(); return 0; } shrLog("Allocating GPU memory...\n"); cutilSafeCall( cudaMalloc((void **)&d_Output, QRNG_DIMENSIONS * N * sizeof(float)) ); shrLog("Allocating CPU memory...\n"); h_OutputGPU = (float *)malloc(QRNG_DIMENSIONS * N * sizeof(float)); shrLog("Initializing QRNG tables...\n\n"); initQuasirandomGenerator(tableCPU); if(useDoublePrecision) initTable_SM13(tableCPU); else initTable_SM10(tableCPU); shrLog("Testing QRNG...\n\n"); cutilSafeCall( cudaMemset(d_Output, 0, QRNG_DIMENSIONS * N * sizeof(float)) ); int numIterations = 20; for (int i = -1; i < numIterations; i++) { if (i == 0) { cutilSafeCall( cudaThreadSynchronize() ); cutilCheckError( cutResetTimer(hTimer) ); cutilCheckError( cutStartTimer(hTimer) ); } if(useDoublePrecision) quasirandomGenerator_SM13(d_Output, 0, N); else quasirandomGenerator_SM10(d_Output, 0, N); } cutilSafeCall( cudaThreadSynchronize() ); cutilCheckError(cutStopTimer(hTimer)); gpuTime = cutGetTimerValue(hTimer)/(double)numIterations*1e-3; shrLogEx(LOGBOTH | MASTER, 0, "quasirandomGenerator, Throughput = %.4f GNumbers/s, Time = %.5f s, Size = %u Numbers, NumDevsUsed = %u, Workgroup = %u\n", (double)QRNG_DIMENSIONS * (double)N * 1.0E-9 / gpuTime, gpuTime, QRNG_DIMENSIONS*N, 1, 128*QRNG_DIMENSIONS); shrLog("\nReading GPU results...\n"); cutilSafeCall( cudaMemcpy(h_OutputGPU, d_Output, QRNG_DIMENSIONS * N * sizeof(float), cudaMemcpyDeviceToHost) ); shrLog("Comparing to the CPU results...\n\n"); sumDelta = 0; sumRef = 0; for(dim = 0; dim < QRNG_DIMENSIONS; dim++) for(pos = 0; pos < N; pos++){ ref = getQuasirandomValue63(pos, dim); delta = (double)h_OutputGPU[dim * N + pos] - ref; sumDelta += fabs(delta); sumRef += fabs(ref); } shrLog("L1 norm: %E\n", sumDelta / sumRef); shrLog("\nTesting inverseCNDgpu()...\n\n"); cutilSafeCall( cudaMemset(d_Output, 0, QRNG_DIMENSIONS * N * sizeof(float)) ); for (int i = -1; i < numIterations; i++) { if (i == 0) { cutilSafeCall( cudaThreadSynchronize() ); cutilCheckError( cutResetTimer(hTimer) ); cutilCheckError( cutStartTimer(hTimer) ); } if(useDoublePrecision) inverseCND_SM13(d_Output, NULL, QRNG_DIMENSIONS * N); else inverseCND_SM10(d_Output, NULL, QRNG_DIMENSIONS * N); } cutilSafeCall( cudaThreadSynchronize() ); cutilCheckError(cutStopTimer(hTimer)); gpuTime = cutGetTimerValue(hTimer)/(double)numIterations*1e-3; shrLogEx(LOGBOTH | MASTER, 0, "quasirandomGenerator-inverse, Throughput = %.4f GNumbers/s, Time = %.5f s, Size = %u Numbers, NumDevsUsed = %u, Workgroup = %u\n", (double)QRNG_DIMENSIONS * (double)N * 1E-9 / gpuTime, gpuTime, QRNG_DIMENSIONS*N, 1, 128); shrLog("Reading GPU results...\n"); cutilSafeCall( cudaMemcpy(h_OutputGPU, d_Output, QRNG_DIMENSIONS * N * sizeof(float), cudaMemcpyDeviceToHost) ); shrLog("\nComparing to the CPU results...\n"); sumDelta = 0; sumRef = 0; for(pos = 0; pos < QRNG_DIMENSIONS * N; pos++){ double p = (double)(pos + 1) / (double)(QRNG_DIMENSIONS * N + 1); ref = MoroInvCNDcpu(p); delta = (double)h_OutputGPU[pos] - ref; sumDelta += fabs(delta); sumRef += fabs(ref); } shrLog("L1 norm: %E\n\n", L1norm = sumDelta / sumRef); shrLog((L1norm < 1E-6) ? "PASSED\n\n" : "FAILED\n\n"); shrLog("Shutting down...\n"); cutilCheckError(cutDeleteTimer(hTimer)); free(h_OutputGPU); cutilSafeCall( cudaFree(d_Output) ); cudaThreadExit(); shrEXIT(argc, (const char**)argv); }
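/* The pass/fail criterion above is a relative L1 norm; as a standalone helper (a sketch restating the comparison loops above; relative_L1_norm is an illustrative name): */
#include <math.h>

static double relative_L1_norm(const float *gpu, const double *ref, int n)
{
    double sumDelta = 0.0, sumRef = 0.0;
    for (int i = 0; i < n; i++) {
        sumDelta += fabs((double)gpu[i] - ref[i]); /* accumulated absolute error */
        sumRef   += fabs(ref[i]);                  /* accumulated reference magnitude */
    }
    return sumDelta / sumRef; /* PASSED when this falls below 1E-6 */
}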
void compute_process(int agents_total, int nreps, int world_width, int world_height) { int np, pid; MPI_Comm_rank(MPI_COMM_WORLD, &pid); MPI_Comm_size(MPI_COMM_WORLD, &np); int server_process = np - 1; MPI_Status status; /* create a type for struct agent */ const int nitems=5; int blocklengths[5] = {1,1,1,1,1}; MPI_Datatype types[5] = {MPI_INT, MPI_INT, MPI_INT, MPI_FLOAT, MPI_FLOAT}; MPI_Datatype mpi_agent_type; MPI_Aint offsets[5]; offsets[0] = offsetof(agent, id); offsets[1] = offsetof(agent, x); offsets[2] = offsetof(agent, y); offsets[3] = offsetof(agent, z); offsets[4] = offsetof(agent, w); MPI_Type_create_struct(nitems, blocklengths, offsets, types, &mpi_agent_type); MPI_Type_commit(&mpi_agent_type); unsigned int num_bytes = agents_total * sizeof(float4); unsigned int num_halo_points = RADIO * world_width; unsigned int num_halo_bytes = num_halo_points * sizeof(short int); /* unsigned int world_node_height = (world_height / (np-1)) + (RADIO * 2); if(pid == 0 or pid == np - 2) world_node_height -= RADIO; */ size_t size_world = world_width * world_height * sizeof(short int); short int *h_world = (short int *)malloc(size_world); short int *d_world; for(int j = 0; j < world_width * world_height; j++) { h_world[j] = 0; } /* alloc host memory */ agent *h_agents_in = (agent *)malloc(agents_total * sizeof(agent)); /* agent *d_agents_in; */ float4 *h_agents_pos = (float4 *)malloc(num_bytes); float4 *d_agents_pos; /* MPI_Recv(rcv_address, num_points, MPI_FLOAT, server_process, MPI_ANY_TAG, MPI_COMM_WORLD, &status); */ MPI_Recv(h_agents_in, agents_total, mpi_agent_type, server_process, 0, MPI_COMM_WORLD, &status); /* initialize world */ for( int i = 0; i < agents_total; i++) { h_world[(world_width * (h_agents_in[i].y - 1) ) + h_agents_in[i].x] = (h_agents_in[i].x!=0?1:0); /* if(h_world[(world_width * (h_agents_in[i].y - 1) ) + h_agents_in[i].x] == 1) printf("world x: %d, y: %d\n", h_agents_in[i].x, h_agents_in[i].y); */ h_agents_pos[i].x = h_agents_in[i].x; h_agents_pos[i].y = h_agents_in[i].y; h_agents_pos[i].z = h_agents_in[i].z; h_agents_pos[i].w = h_agents_in[i].w; } /*** if(pid ==1) { int k=0; for(int j = 0; j < world_width * world_height; j++) { if ( j%96 == 0 and j>0) { k++; printf("%d row: %d\n", h_world[j], k); } else printf("%d ", h_world[j]); } } ***/ /* Error code to check return values for CUDA calls */ cudaError_t err = cudaSuccess; /* Allocate the device world buffer */ err = cudaMalloc((void **)&d_world, size_world); if (err != cudaSuccess) { fprintf(stderr, "Failed to allocate device pointer (error code %s)!\n", cudaGetErrorString(err)); exit(EXIT_FAILURE); } err = cudaMemcpy(d_world, h_world, size_world, cudaMemcpyHostToDevice); if (err != cudaSuccess) { fprintf(stderr, "Failed to copy pointer from host to device (error code %s)!\n", cudaGetErrorString(err)); exit(EXIT_FAILURE); } /* http://cuda-programming.blogspot.com.es/2013/02/cuda-array-in-cuda-how-to-use-cuda.html */ /* http://stackoverflow.com/questions/17924705/structure-of-arrays-vs-array-of-structures-in-cuda */ /* Allocate the device agent-position buffer */ err = cudaMalloc((void **)&d_agents_pos, num_bytes); if (err != cudaSuccess) { fprintf(stderr, "Failed to allocate device pointer (error code %s)!\n", cudaGetErrorString(err)); exit(EXIT_FAILURE); } err = cudaMemcpy(d_agents_pos, h_agents_pos, num_bytes, cudaMemcpyHostToDevice); if (err != cudaSuccess) { fprintf(stderr, "Failed to copy pointer from host to device (error code %s)!\n", cudaGetErrorString(err)); exit(EXIT_FAILURE); } launch_kernel(d_agents_pos, d_world, world_width, world_height ); MPI_Barrier( MPI_COMM_WORLD);
#ifdef DEBUG
/* printf("pid: %d\n", pid); */ /* display_data(h_agents_in, agents_total ); */
#endif
MPI_Send(h_agents_in, agents_total, mpi_agent_type, server_process, DATA_COLLECT, MPI_COMM_WORLD); /* Release resources */ free(h_agents_in); free(h_agents_pos); free(h_world); cudaFree(d_agents_pos); cudaFree(d_world); /* free(h_output); cudaFreeHost(h_left_boundary); cudaFreeHost(h_right_boundary); cudaFreeHost(h_left_halo); cudaFreeHost(h_right_halo); cudaFree(d_input); cudaFree(d_output); */ }