const cudaDeviceProp& getCurrentDeviceProperties() { int device = 0; auto err = cudaGetDevice(&device); checkCuda(err, std::string("CUDA ERROR: cudaGetDeviceCount ")); return getDeviceProperties(device); }
/*! \brief Destroy distributed L & U matrices. */ void Destroy_LU(int_t n, gridinfo_t *grid, LUstruct_t *LUstruct) { int_t i, nb, nsupers; Glu_persist_t *Glu_persist = LUstruct->Glu_persist; LocalLU_t *Llu = LUstruct->Llu; #if ( DEBUGlevel>=1 ) int iam; MPI_Comm_rank( MPI_COMM_WORLD, &iam ); CHECK_MALLOC(iam, "Enter Destroy_LU()"); #endif nsupers = Glu_persist->supno[n-1] + 1; nb = CEILING(nsupers, grid->npcol); for (i = 0; i < nb; ++i) if ( Llu->Lrowind_bc_ptr[i] ) { SUPERLU_FREE (Llu->Lrowind_bc_ptr[i]); #ifdef GPU_ACC checkCuda(cudaFreeHost(Llu->Lnzval_bc_ptr[i])); #else SUPERLU_FREE (Llu->Lnzval_bc_ptr[i]); #endif } SUPERLU_FREE (Llu->Lrowind_bc_ptr); SUPERLU_FREE (Llu->Lnzval_bc_ptr); nb = CEILING(nsupers, grid->nprow); for (i = 0; i < nb; ++i) if ( Llu->Ufstnz_br_ptr[i] ) { SUPERLU_FREE (Llu->Ufstnz_br_ptr[i]); SUPERLU_FREE (Llu->Unzval_br_ptr[i]); } SUPERLU_FREE (Llu->Ufstnz_br_ptr); SUPERLU_FREE (Llu->Unzval_br_ptr); /* The following can be freed after factorization. */ SUPERLU_FREE(Llu->ToRecv); SUPERLU_FREE(Llu->ToSendD); SUPERLU_FREE(Llu->ToSendR[0]); SUPERLU_FREE(Llu->ToSendR); /* The following can be freed only after iterative refinement. */ SUPERLU_FREE(Llu->ilsum); SUPERLU_FREE(Llu->fmod); SUPERLU_FREE(Llu->fsendx_plist[0]); SUPERLU_FREE(Llu->fsendx_plist); SUPERLU_FREE(Llu->bmod); SUPERLU_FREE(Llu->bsendx_plist[0]); SUPERLU_FREE(Llu->bsendx_plist); SUPERLU_FREE(Llu->mod_bit); SUPERLU_FREE(Glu_persist->xsup); SUPERLU_FREE(Glu_persist->supno); #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Exit Destroy_LU()"); #endif }
int main() { // Initialize variables double *h_u; h_u = (double*)malloc(sizeof(double)*(NX*NY*NZ)); // Set Domain Initial Condition and BCs Call_IC(h_u); // GPU Memory Arrays double *d_u; checkCuda(cudaMalloc((void**)&d_u, sizeof(double)*(NX*NY))); double *d_un; checkCuda(cudaMalloc((void**)&d_un,sizeof(double)*(NX*NY))); // Copy Initial Condition from host to device checkCuda(cudaMemcpy(d_u, h_u,sizeof(double)*(NX*NY),cudaMemcpyHostToDevice)); checkCuda(cudaMemcpy(d_un,h_u,sizeof(double)*(NX*NY),cudaMemcpyHostToDevice)); // GPU kernel launch parameters dim3 dimBlock(BLOCK_SIZE_X, BLOCK_SIZE_Y, BLOCK_SIZE_Z); dim3 dimGrid (DIVIDE_INTO(NX, BLOCK_SIZE_X), DIVIDE_INTO(NY, BLOCK_SIZE_Y), DIVIDE_INTO(NZ, BLOCK_SIZE_Z)); // Request computer current time time_t t = clock(); // Solver Loop for (int step=0; step < NO_STEPS; step+=2) { if (step%10000==0) printf("Step %d of %d\n",step,(int)NO_STEPS); // Compute Laplace Call_Laplace(dimGrid,dimBlock,d_u,d_un); // Call_Laplace_Texture(dimGrid,dimBlock,d_u,d_un); } if (DEBUG) printf("CUDA error (Jacobi_Method) %s\n",cudaGetErrorString(cudaPeekAtLastError())); // Measure and Report computation time t = clock()-t; printf("Computing time (%f seconds).\n",((float)t)/CLOCKS_PER_SEC); // Copy data from device to host checkCuda(cudaMemcpy(h_u,d_u,sizeof(double)*(NX*NY*NZ),cudaMemcpyDeviceToHost)); // uncomment to print solution to terminal if (DEBUG) Print2D(h_u); // Write solution to file Save_Results(h_u); // Free device memory checkCuda(cudaFree(d_u)); checkCuda(cudaFree(d_un)); // Reset device checkCuda(cudaDeviceReset()); // Free memory on host and device free(h_u); return 0; }
void ChannelInfo::free() { if (use_gpu) { #ifdef __CUDACC__ checkCuda(cudaFree(channels)); #else assert(false); #endif } else { delete[] channels; } }
ChannelInfo::ChannelInfo(const std::vector<Channels> &channels, bool use_gpu) : use_gpu(use_gpu) { num_channels = (int)channels.size(); radiance_dimension = -1; num_total_dimensions = compute_num_channels(channels); if (use_gpu) { #ifdef __CUDACC__ checkCuda(cudaMallocManaged(&this->channels, channels.size() * sizeof(Channels))); #else assert(false); #endif } else { this->channels = new Channels[channels.size()]; } for (int i = 0; i < (int)channels.size(); i++) { if (channels[i] == Channels::radiance) { if (radiance_dimension != -1) { throw std::runtime_error("Duplicated radiance channel"); } radiance_dimension = i; } this->channels[i] = channels[i]; } }
//////////////////////////////////////////////////////////////////////////////// // Program Main //////////////////////////////////////////////////////////////////////////////// int main(int argc, char *argv[]) { int Nx, Ny, Nz, max_iters; int blockX, blockY, blockZ; if (argc == 8) { Nx = atoi(argv[1]); Ny = atoi(argv[2]); Nz = atoi(argv[3]); max_iters = atoi(argv[4]); blockX = atoi(argv[5]); blockY = atoi(argv[6]); blockZ = atoi(argv[7]); } else { printf("Usage: %s nx ny nz i block_x block_y block_z number_of_threads\n", argv[0]); exit(1); } // Get the number of GPUS int number_of_devices; checkCuda(cudaGetDeviceCount(&number_of_devices)); if (number_of_devices < 2) { printf("Less than two devices were found.\n"); printf("Exiting...\n"); return -1; } // Decompose along the Z-axis int _Nz = Nz/number_of_devices; // Define constants const _DOUBLE_ L = 1.0; const _DOUBLE_ h = L/(Nx+1); const _DOUBLE_ dt = h*h/6.0; const _DOUBLE_ beta = dt/(h*h); const _DOUBLE_ c0 = beta; const _DOUBLE_ c1 = (1-6*beta); // Check if ECC is turned on ECCCheck(number_of_devices); // Set the number of OpenMP threads omp_set_num_threads(number_of_devices); #pragma omp parallel { unsigned int tid = omp_get_num_threads(); #pragma omp single { printf("Number of OpenMP threads: %d\n", tid); } } // CPU memory operations int dt_size = sizeof(_DOUBLE_); _DOUBLE_ *u_new, *u_old; u_new = (_DOUBLE_ *)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(Nz+2)); u_old = (_DOUBLE_ *)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(Nz+2)); init(u_old, u_new, h, Nx, Ny, Nz); // Allocate and generate arrays on the host size_t pitch_bytes; size_t pitch_gc_bytes; _DOUBLE_ *h_Unew, *h_Uold; _DOUBLE_ *h_s_Uolds[number_of_devices], *h_s_Unews[number_of_devices]; _DOUBLE_ *left_send_buffer[number_of_devices], *left_receive_buffer[number_of_devices]; _DOUBLE_ *right_send_buffer[number_of_devices], *right_receive_buffer[number_of_devices]; h_Unew = (_DOUBLE_ *)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(Nz+2)); h_Uold = (_DOUBLE_ *)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(Nz+2)); init(h_Uold, h_Unew, h, Nx, Ny, Nz); #pragma omp parallel { unsigned int tid = omp_get_thread_num(); h_s_Unews[tid] = (_DOUBLE_ *)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(_Nz+2)); h_s_Uolds[tid] = (_DOUBLE_ *)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(_Nz+2)); right_send_buffer[tid] = (_DOUBLE_ *)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(_GC_DEPTH)); left_send_buffer[tid] = (_DOUBLE_ *)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(_GC_DEPTH)); right_receive_buffer[tid] = (_DOUBLE_ *)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(_GC_DEPTH)); left_receive_buffer[tid] = (_DOUBLE_ *)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(_GC_DEPTH)); checkCuda(cudaHostAlloc((void**)&h_s_Unews[tid], dt_size*(Nx+2)*(Ny+2)*(_Nz+2), cudaHostAllocPortable)); checkCuda(cudaHostAlloc((void**)&h_s_Uolds[tid], dt_size*(Nx+2)*(Ny+2)*(_Nz+2), cudaHostAllocPortable)); checkCuda(cudaHostAlloc((void**)&right_send_buffer[tid], dt_size*(Nx+2)*(Ny+2)*(_GC_DEPTH), cudaHostAllocPortable)); checkCuda(cudaHostAlloc((void**)&left_send_buffer[tid], dt_size*(Nx+2)*(Ny+2)*(_GC_DEPTH), cudaHostAllocPortable)); checkCuda(cudaHostAlloc((void**)&right_receive_buffer[tid], dt_size*(Nx+2)*(Ny+2)*(_GC_DEPTH), cudaHostAllocPortable)); checkCuda(cudaHostAlloc((void**)&left_receive_buffer[tid], dt_size*(Nx+2)*(Ny+2)*(_GC_DEPTH), cudaHostAllocPortable)); init_subdomain(h_s_Uolds[tid], h_Uold, Nx, Ny, _Nz, tid); } // GPU memory operations _DOUBLE_ *d_s_Unews[number_of_devices], *d_s_Uolds[number_of_devices]; _DOUBLE_ *d_right_send_buffer[number_of_devices], *d_left_send_buffer[number_of_devices]; _DOUBLE_ *d_right_receive_buffer[number_of_devices], *d_left_receive_buffer[number_of_devices]; #pragma omp parallel { unsigned int tid = omp_get_thread_num(); checkCuda(cudaSetDevice(tid)); CopyToConstantMemory(c0, c1); checkCuda(cudaMallocPitch((void**)&d_s_Uolds[tid], &pitch_bytes, dt_size*(Nx+2), (Ny+2)*(_Nz+2))); checkCuda(cudaMallocPitch((void**)&d_s_Unews[tid], &pitch_bytes, dt_size*(Nx+2), (Ny+2)*(_Nz+2))); checkCuda(cudaMallocPitch((void**)&d_left_receive_buffer[tid], &pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH))); checkCuda(cudaMallocPitch((void**)&d_right_receive_buffer[tid], &pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH))); checkCuda(cudaMallocPitch((void**)&d_left_send_buffer[tid], &pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH))); checkCuda(cudaMallocPitch((void**)&d_right_send_buffer[tid], &pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH))); } // Copy data from host to the device double HtD_timer = 0.; HtD_timer -= omp_get_wtime(); #pragma omp parallel { unsigned int tid = omp_get_thread_num(); checkCuda(cudaSetDevice(tid)); checkCuda(cudaMemcpy2D(d_s_Uolds[tid], pitch_bytes, h_s_Uolds[tid], dt_size*(Nx+2), dt_size*(Nx+2), ((Ny+2)*(_Nz+2)), cudaMemcpyDefault)); checkCuda(cudaMemcpy2D(d_s_Unews[tid], pitch_bytes, h_s_Unews[tid], dt_size*(Nx+2), dt_size*(Nx+2), ((Ny+2)*(_Nz+2)), cudaMemcpyDefault)); } HtD_timer += omp_get_wtime(); int pitch = pitch_bytes/dt_size; int gc_pitch = pitch_gc_bytes/dt_size; // GPU kernel launch parameters dim3 threads_per_block(blockX, blockY, blockZ); unsigned int blocksInX = getBlock(Nx, blockX); unsigned int blocksInY = getBlock(Ny, blockY); unsigned int blocksInZ = getBlock(_Nz-2, k_loop); dim3 thread_blocks(blocksInX, blocksInY, blocksInZ); dim3 thread_blocks_halo(blocksInX, blocksInY); double compute_timer = 0.; compute_timer -= omp_get_wtime(); #pragma omp parallel { unsigned int tid = omp_get_thread_num(); for(int iterations = 0; iterations < max_iters; iterations++) { // Compute inner nodes checkCuda(cudaSetDevice(tid)); ComputeInnerPoints(thread_blocks, threads_per_block, d_s_Unews[tid], d_s_Uolds[tid], pitch, Nx, Ny, _Nz); // Copy right boundary data to host if (tid == 0) { checkCuda(cudaSetDevice(tid)); CopyBoundaryRegionToGhostCell(thread_blocks_halo, threads_per_block, d_s_Unews[tid], d_right_send_buffer[tid], Nx, Ny, _Nz, pitch, gc_pitch, 0); checkCuda(cudaMemcpy2D(right_send_buffer[tid], dt_size*(Nx+2), d_right_send_buffer[tid], pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH), cudaMemcpyDefault)); } // Copy left boundary data to host if (tid == 1) { checkCuda(cudaSetDevice(tid)); CopyBoundaryRegionToGhostCell(thread_blocks_halo, threads_per_block, d_s_Unews[tid], d_left_send_buffer[tid], Nx, Ny, _Nz, pitch, gc_pitch, 1); checkCuda(cudaMemcpy2D(left_send_buffer[tid], dt_size*(Nx+2), d_left_send_buffer[tid], pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH), cudaMemcpyDefault)); } #pragma omp barrier // Copy right boundary data to device 1 if (tid == 1) { checkCuda(cudaSetDevice(tid)); checkCuda(cudaMemcpy2D(d_left_receive_buffer[tid], pitch_gc_bytes, right_send_buffer[tid-1], dt_size*(Nx+2), dt_size*(Nx+2), ((Ny+2)*(_GC_DEPTH)), cudaMemcpyDefault)); CopyGhostCellToBoundaryRegion(thread_blocks_halo, threads_per_block, d_s_Unews[tid], d_left_receive_buffer[tid], Nx, Ny, _Nz, pitch, gc_pitch, 1); } // Copy left boundary data to device 0 if (tid == 0) { checkCuda(cudaSetDevice(tid)); checkCuda(cudaMemcpy2D(d_right_receive_buffer[tid], pitch_gc_bytes, left_send_buffer[tid+1], dt_size*(Nx+2), dt_size*(Nx+2), ((Ny+2)*(_GC_DEPTH)), cudaMemcpyDefault)); CopyGhostCellToBoundaryRegion(thread_blocks_halo, threads_per_block, d_s_Unews[tid], d_right_receive_buffer[tid], Nx, Ny, _Nz, pitch, gc_pitch, 0); } // Swap pointers on the host #pragma omp barrier checkCuda(cudaSetDevice(tid)); checkCuda(cudaDeviceSynchronize()); swap(_DOUBLE_*, d_s_Unews[tid], d_s_Uolds[tid]); } } compute_timer += omp_get_wtime(); // Copy data from device to host double DtH_timer = 0; DtH_timer -= omp_get_wtime(); #pragma omp parallel { unsigned int tid = omp_get_thread_num(); checkCuda(cudaSetDevice(tid)); checkCuda(cudaMemcpy2D(h_s_Uolds[tid], dt_size*(Nx+2), d_s_Uolds[tid], pitch_bytes, dt_size*(Nx+2), (Ny+2)*(_Nz+2), cudaMemcpyDeviceToHost)); } DtH_timer += omp_get_wtime(); // Merge sub-domains into a one big domain #pragma omp parallel { unsigned int tid = omp_get_thread_num(); merge_domains(h_s_Uolds[tid], h_Uold, Nx, Ny, _Nz, tid); } // Calculate on host #if defined(DEBUG) || defined(_DEBUG) cpu_heat3D(u_new, u_old, c0, c1, max_iters, Nx, Ny, Nz); #endif float gflops = CalcGflops(compute_timer, max_iters, Nx, Ny, Nz); PrintSummary("3D Heat (7-pt)", "Plane sweeping", compute_timer, HtD_timer, DtH_timer, gflops, max_iters, Nx); _DOUBLE_ t = max_iters * dt; CalcError(h_Uold, u_old, t, h, Nx, Ny, Nz); #if defined(DEBUG) || defined(_DEBUG) //exportToVTK(h_Uold, h, "heat3D.vtk", Nx, Ny, Nz); #endif #pragma omp parallel { unsigned int tid = omp_get_thread_num(); checkCuda(cudaSetDevice(tid)); checkCuda(cudaFree(d_s_Unews[tid])); checkCuda(cudaFree(d_s_Uolds[tid])); checkCuda(cudaFree(d_right_send_buffer[tid])); checkCuda(cudaFree(d_left_send_buffer[tid])); checkCuda(cudaFree(d_right_receive_buffer[tid])); checkCuda(cudaFree(d_left_receive_buffer[tid])); checkCuda(cudaFreeHost(h_s_Unews[tid])); checkCuda(cudaFreeHost(h_s_Uolds[tid])); checkCuda(cudaFreeHost(left_send_buffer[tid])); checkCuda(cudaFreeHost(right_send_buffer[tid])); checkCuda(cudaFreeHost(left_receive_buffer[tid])); checkCuda(cudaFreeHost(right_receive_buffer[tid])); checkCuda(cudaDeviceReset()); } free(u_old); free(u_new); return 0; }
int main(int argc, char **argv) { int OPT_N = 4000000; int OPT_SZ = OPT_N * sizeof(float); printf("Initializing data...\n"); float *callResult, *putResult, *stockPrice, *optionStrike, *optionYears; checkCuda( cudaMallocHost((void**)&callResult, OPT_SZ) ); checkCuda( cudaMallocHost((void**)&putResult, OPT_SZ) ); checkCuda( cudaMallocHost((void**)&stockPrice, OPT_SZ) ); checkCuda( cudaMallocHost((void**)&optionStrike, OPT_SZ) ); checkCuda( cudaMallocHost((void**)&optionYears, OPT_SZ) ); initOptions(OPT_N, stockPrice, optionStrike, optionYears); printf("Running Host Version...\n"); StartTimer(); BlackScholesLaunch_host(callResult, putResult, stockPrice, optionStrike, optionYears, RISKFREE, VOLATILITY, OPT_N); printf("Option 0 call: %f\n", callResult[0]); printf("Option 0 put: %f\n", putResult[0]); double ms = GetTimer(); //Both call and put is calculated printf("Options count : %i \n", 2 * OPT_N); printf("\tBlackScholes() time : %f msec\n", ms); printf("\t%f GB/s, %f GOptions/s\n", ((double)(5 * OPT_N * sizeof(float)) * 1E-9) / (ms * 1E-3), ((double)(2 * OPT_N) * 1E-9) / (ms * 1E-3)); float *d_callResult, *d_putResult; float *d_stockPrice, *d_optionStrike, *d_optionYears; checkCuda( cudaMalloc ((void**)&d_callResult, OPT_SZ) ); checkCuda( cudaMalloc ((void**)&d_putResult, OPT_SZ) ); checkCuda( cudaMalloc ((void**)&d_stockPrice, OPT_SZ) ); checkCuda( cudaMalloc ((void**)&d_optionStrike, OPT_SZ) ); checkCuda( cudaMalloc ((void**)&d_optionYears, OPT_SZ) ); printf("Running Device Version...\n"); StartTimer(); cudaMemcpy(d_stockPrice, stockPrice, OPT_SZ, cudaMemcpyHostToDevice); cudaMemcpy(d_optionStrike, optionStrike, OPT_SZ, cudaMemcpyHostToDevice); cudaMemcpy(d_optionYears, optionYears, OPT_SZ, cudaMemcpyHostToDevice); BlackScholesLaunch_device(d_callResult, d_putResult, d_stockPrice, d_optionStrike, d_optionYears, RISKFREE, VOLATILITY, OPT_N); cudaMemcpy(callResult, d_callResult, OPT_SZ, cudaMemcpyDeviceToHost); cudaMemcpy(putResult, d_putResult, OPT_SZ, cudaMemcpyDeviceToHost); printf("Option 0 call: %f\n", callResult[0]); printf("Option 0 put: %f\n", putResult[0]); ms = GetTimer(); //Both call and put is calculated printf("Options count : %i \n", 2 * OPT_N); printf("\tBlackScholes() time : %f msec\n", ms); printf("\t%f GB/s, %f GOptions/s\n", ((double)(5 * OPT_N * sizeof(float)) * 1E-9) / (ms * 1E-3), ((double)(2 * OPT_N) * 1E-9) / (ms * 1E-3)); checkCuda( cudaFree(d_stockPrice) ); checkCuda( cudaFree(d_optionStrike) ); checkCuda( cudaFree(d_optionYears) ); checkCuda( cudaFreeHost(callResult) ); checkCuda( cudaFreeHost(putResult) ); checkCuda( cudaFreeHost(stockPrice) ); checkCuda( cudaFreeHost(optionStrike) ); checkCuda( cudaFreeHost(optionYears) ); }
int main(int argc, char **argv) { int OPT_N = 4000000; int OPT_SZ = OPT_N * sizeof(float); BlackScholes bs; printf("Initializing data...\n"); float *callResult, *putResult, *stockPrice, *optionStrike, *optionYears; checkCuda( cudaMallocHost((void**)&callResult, OPT_SZ) ); checkCuda( cudaMallocHost((void**)&putResult, OPT_SZ) ); checkCuda( cudaMallocHost((void**)&stockPrice, OPT_SZ) ); checkCuda( cudaMallocHost((void**)&optionStrike, OPT_SZ) ); checkCuda( cudaMallocHost((void**)&optionYears, OPT_SZ) ); initOptions(OPT_N, stockPrice, optionStrike, optionYears); printf("Running Host Version...\n"); StartTimer(); // run BlackScholes operator on host bs(callResult, putResult, stockPrice, optionStrike, optionYears, RISKFREE, VOLATILITY, OPT_N); printf("Option 0 call: %f\n", callResult[0]); printf("Option 0 put: %f\n", putResult[0]); double ms = GetTimer(); //Both call and put is calculated printf("Options count : %i \n", 2 * OPT_N); printf("\tBlackScholes() time : %f msec\n", ms); printf("\t%f GB/s, %f GOptions/s\n", ((double)(5 * OPT_N * sizeof(float)) * 1E-9) / (ms * 1E-3), ((double)(2 * OPT_N) * 1E-9) / (ms * 1E-3)); float *d_callResult, *d_putResult; float *d_stockPrice, *d_optionStrike, *d_optionYears; checkCuda( cudaMalloc ((void**)&d_callResult, OPT_SZ) ); checkCuda( cudaMalloc ((void**)&d_putResult, OPT_SZ) ); checkCuda( cudaMalloc ((void**)&d_stockPrice, OPT_SZ) ); checkCuda( cudaMalloc ((void**)&d_optionStrike, OPT_SZ) ); checkCuda( cudaMalloc ((void**)&d_optionYears, OPT_SZ) ); printf("Running Device Version...\n"); StartTimer(); // Launch Black-Scholes operator on device #ifdef HEMI_CUDA_COMPILER cudaMemcpy(d_stockPrice, stockPrice, OPT_SZ, cudaMemcpyHostToDevice); cudaMemcpy(d_optionStrike, optionStrike, OPT_SZ, cudaMemcpyHostToDevice); cudaMemcpy(d_optionYears, optionYears, OPT_SZ, cudaMemcpyHostToDevice); hemi::launch(bs, d_callResult, d_putResult, d_stockPrice, d_optionStrike, d_optionYears, RISKFREE, VOLATILITY, OPT_N); cudaMemcpy(callResult, d_callResult, OPT_SZ, cudaMemcpyDeviceToHost); cudaMemcpy(putResult, d_putResult, OPT_SZ, cudaMemcpyDeviceToHost); #else // demonstrates that "launch" goes to host when not compiled with NVCC hemi::launch(bs, callResult, putResult, stockPrice, optionStrike, optionYears, RISKFREE, VOLATILITY, OPT_N); #endif printf("Option 0 call: %f\n", callResult[0]); printf("Option 0 put: %f\n", putResult[0]); ms = GetTimer(); //Both call and put is calculated printf("Options count : %i \n", 2 * OPT_N); printf("\tBlackScholes() time : %f msec\n", ms); printf("\t%f GB/s, %f GOptions/s\n", ((double)(5 * OPT_N * sizeof(float)) * 1E-9) / (ms * 1E-3), ((double)(2 * OPT_N) * 1E-9) / (ms * 1E-3)); checkCuda( cudaFree(d_stockPrice) ); checkCuda( cudaFree(d_optionStrike) ); checkCuda( cudaFree(d_optionYears) ); checkCuda( cudaFreeHost(callResult) ); checkCuda( cudaFreeHost(putResult) ); checkCuda( cudaFreeHost(stockPrice) ); checkCuda( cudaFreeHost(optionStrike) ); checkCuda( cudaFreeHost(optionYears) ); }
/////////////////////// // Main program entry /////////////////////// int main(int argc, char** argv) { unsigned int max_iters, Nx, Ny, Nz, blockX, blockY, blockZ; int rank, numberOfProcesses; if (argc == 8) { Nx = atoi(argv[1]); Ny = atoi(argv[2]); Nz = atoi(argv[3]); max_iters = atoi(argv[4]); blockX = atoi(argv[5]); blockY = atoi(argv[6]); blockZ = atoi(argv[7]); } else { printf("Usage: %s nx ny nz i block_x block_y block_z\n", argv[0]); exit(1); } InitializeMPI(&argc, &argv, &rank, &numberOfProcesses); AssignDevices(rank); ECCCheck(rank); // Define constants const _DOUBLE_ L = 1.0; const _DOUBLE_ h = L/(Nx+1); const _DOUBLE_ dt = h*h/6.0; const _DOUBLE_ beta = dt/(h*h); const _DOUBLE_ c0 = beta; const _DOUBLE_ c1 = (1-6*beta); // Copy constants to Constant Memory on the GPUs CopyToConstantMemory(c0, c1); // Decompose along the z-axis const int _Nz = Nz/numberOfProcesses; const int dt_size = sizeof(_DOUBLE_); // Host memory allocations _DOUBLE_ *u_new, *u_old; _DOUBLE_ *h_Uold; u_new = (_DOUBLE_*)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(Nz+2)); u_old = (_DOUBLE_*)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(Nz+2)); if (rank == 0) { h_Uold = (_DOUBLE_*)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(Nz+2)); } init(u_old, u_new, h, Nx, Ny, Nz); // Allocate and generate host subdomains _DOUBLE_ *h_s_Uolds, *h_s_Unews, *h_s_rbuf[numberOfProcesses]; _DOUBLE_ *left_send_buffer, *left_receive_buffer; _DOUBLE_ *right_send_buffer, *right_receive_buffer; h_s_Unews = (_DOUBLE_*)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(_Nz+2)); h_s_Uolds = (_DOUBLE_*)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(_Nz+2)); #if defined(DEBUG) || defined(_DEBUG) if (rank == 0) { for (int i = 0; i < numberOfProcesses; i++) { h_s_rbuf[i] = (_DOUBLE_*)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(_Nz+2)); checkCuda(cudaHostAlloc((void**)&h_s_rbuf[i], dt_size*(Nx+2)*(Ny+2)*(_Nz+2), cudaHostAllocPortable)); } } #endif right_send_buffer = (_DOUBLE_*)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(_GC_DEPTH)); left_send_buffer = (_DOUBLE_*)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(_GC_DEPTH)); right_receive_buffer = (_DOUBLE_*)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(_GC_DEPTH)); left_receive_buffer = (_DOUBLE_*)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(_GC_DEPTH)); checkCuda(cudaHostAlloc((void**)&h_s_Unews, dt_size*(Nx+2)*(Ny+2)*(_Nz+2), cudaHostAllocPortable)); checkCuda(cudaHostAlloc((void**)&h_s_Uolds, dt_size*(Nx+2)*(Ny+2)*(_Nz+2), cudaHostAllocPortable)); checkCuda(cudaHostAlloc((void**)&right_send_buffer, dt_size*(Nx+2)*(Ny+2)*(_GC_DEPTH), cudaHostAllocPortable)); checkCuda(cudaHostAlloc((void**)&left_send_buffer, dt_size*(Nx+2)*(Ny+2)*(_GC_DEPTH), cudaHostAllocPortable)); checkCuda(cudaHostAlloc((void**)&right_receive_buffer, dt_size*(Nx+2)*(Ny+2)*(_GC_DEPTH), cudaHostAllocPortable)); checkCuda(cudaHostAlloc((void**)&left_receive_buffer, dt_size*(Nx+2)*(Ny+2)*(_GC_DEPTH), cudaHostAllocPortable)); init_subdomain(h_s_Uolds, u_old, Nx, Ny, _Nz, rank); // GPU stream operations cudaStream_t compute_stream; cudaStream_t data_stream; checkCuda(cudaStreamCreate(&compute_stream)); checkCuda(cudaStreamCreate(&data_stream)); // GPU Memory Operations size_t pitch_bytes, pitch_gc_bytes; _DOUBLE_ *d_s_Unews, *d_s_Uolds; _DOUBLE_ *d_right_send_buffer, *d_left_send_buffer; _DOUBLE_ *d_right_receive_buffer, *d_left_receive_buffer; checkCuda(cudaMallocPitch((void**)&d_s_Uolds, &pitch_bytes, dt_size*(Nx+2), (Ny+2)*(_Nz+2))); checkCuda(cudaMallocPitch((void**)&d_s_Unews, &pitch_bytes, dt_size*(Nx+2), (Ny+2)*(_Nz+2))); checkCuda(cudaMallocPitch((void**)&d_left_send_buffer, &pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH))); checkCuda(cudaMallocPitch((void**)&d_left_receive_buffer, &pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH))); checkCuda(cudaMallocPitch((void**)&d_right_send_buffer, &pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH))); checkCuda(cudaMallocPitch((void**)&d_right_receive_buffer, &pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH))); // Copy subdomains from host to device and get walltime double HtD_timer = 0.; MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); HtD_timer -= MPI_Wtime(); MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); checkCuda(cudaMemcpy2D(d_s_Uolds, pitch_bytes, h_s_Uolds, dt_size*(Nx+2), dt_size*(Nx+2), ((Ny+2)*(_Nz+2)), cudaMemcpyDefault)); checkCuda(cudaMemcpy2D(d_s_Unews, pitch_bytes, h_s_Unews, dt_size*(Nx+2), dt_size*(Nx+2), ((Ny+2)*(_Nz+2)), cudaMemcpyDefault)); MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); HtD_timer += MPI_Wtime(); MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); unsigned int ghost_width = 1; int pitch = pitch_bytes/dt_size; int gc_pitch = pitch_gc_bytes/dt_size; // GPU kernel launch parameters dim3 threads_per_block(blockX, blockY, blockZ); unsigned int blocksInX = getBlock(Nx, blockX); unsigned int blocksInY = getBlock(Ny, blockY); unsigned int blocksInZ = getBlock(_Nz-2, k_loop); dim3 thread_blocks(blocksInX, blocksInY, blocksInZ); dim3 thread_blocks_halo(blocksInX, blocksInY); //MPI_Status status; MPI_Status status[numberOfProcesses]; MPI_Request gather_send_request[numberOfProcesses]; MPI_Request right_send_request[numberOfProcesses], left_send_request[numberOfProcesses]; MPI_Request right_receive_request[numberOfProcesses], left_receive_request[numberOfProcesses]; double compute_timer = 0.; MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); compute_timer -= MPI_Wtime(); MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); for(unsigned int iterations = 0; iterations < max_iters; iterations++) { // Compute right boundary data on device 0 if (rank == 0) { int kstart = (_Nz+1)-ghost_width; int kstop = _Nz+1; ComputeInnerPointsAsync(thread_blocks_halo, threads_per_block, data_stream, d_s_Unews, d_s_Uolds, pitch, Nx, Ny, _Nz, kstart, kstop); CopyBoundaryRegionToGhostCellAsync(thread_blocks_halo, threads_per_block, data_stream, d_s_Unews, d_right_send_buffer, Nx, Ny, _Nz, pitch, gc_pitch, 0); checkCuda(cudaMemcpy2DAsync(right_send_buffer, dt_size*(Nx+2), d_right_send_buffer, pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH), cudaMemcpyDefault, data_stream)); checkCuda(cudaStreamSynchronize(data_stream)); MPI_CHECK(MPI_Isend(right_send_buffer, (Nx+2)*(Ny+2)*(_GC_DEPTH), MPI_DOUBLE, 1, 0, MPI_COMM_WORLD, &right_send_request[rank])); } else { int kstart = 1; int kstop = 1+ghost_width; ComputeInnerPointsAsync(thread_blocks_halo, threads_per_block, data_stream, d_s_Unews, d_s_Uolds, pitch, Nx, Ny, _Nz, kstart, kstop); CopyBoundaryRegionToGhostCellAsync(thread_blocks_halo, threads_per_block, data_stream, d_s_Unews, d_left_send_buffer, Nx, Ny, _Nz, pitch, gc_pitch, 1); checkCuda(cudaMemcpy2DAsync(left_send_buffer, dt_size*(Nx+2), d_left_send_buffer, pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH), cudaMemcpyDefault, data_stream)); checkCuda(cudaStreamSynchronize(data_stream)); MPI_CHECK(MPI_Isend(left_send_buffer, (Nx+2)*(Ny+2)*(_GC_DEPTH), MPI_DOUBLE, 0, 1, MPI_COMM_WORLD, &left_send_request[rank])); } // Compute inner nodes for device 0 if (rank == 0) { int kstart = 1; int kstop = (_Nz+1)-ghost_width; ComputeInnerPointsAsync(thread_blocks, threads_per_block, compute_stream, d_s_Unews, d_s_Uolds, pitch, Nx, Ny, _Nz, kstart, kstop); } // Compute inner nodes for device 1 else { int kstart = 1+ghost_width; int kstop = _Nz+1; ComputeInnerPointsAsync(thread_blocks, threads_per_block, compute_stream, d_s_Unews, d_s_Uolds, pitch, Nx, Ny, _Nz, kstart, kstop); } // Receive data from device 1 if (rank == 0) { MPI_CHECK(MPI_Irecv(left_send_buffer, (Nx+2)*(Ny+2)*(_GC_DEPTH), MPI_DOUBLE, 1, 1, MPI_COMM_WORLD, &right_receive_request[rank])); } else { MPI_CHECK(MPI_Irecv(right_send_buffer, (Nx+2)*(Ny+2)*(_GC_DEPTH), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD, &left_receive_request[rank])); } if (rank == 0) { MPI_CHECK(MPI_Wait(&right_receive_request[rank], &status[rank])); checkCuda(cudaMemcpy2DAsync(d_right_receive_buffer, pitch_gc_bytes, left_send_buffer, dt_size*(Nx+2), dt_size*(Nx+2), ((Ny+2)*(_GC_DEPTH)), cudaMemcpyDefault, data_stream)); CopyGhostCellToBoundaryRegionAsync(thread_blocks_halo, threads_per_block, data_stream, d_s_Unews, d_right_receive_buffer, Nx, Ny, _Nz, pitch, gc_pitch, 0); } else { MPI_CHECK(MPI_Wait(&left_receive_request[rank], &status[rank])); checkCuda(cudaMemcpy2DAsync(d_left_receive_buffer, pitch_gc_bytes, right_send_buffer, dt_size*(Nx+2), dt_size*(Nx+2), ((Ny+2)*(_GC_DEPTH)), cudaMemcpyDefault, data_stream)); CopyGhostCellToBoundaryRegionAsync(thread_blocks_halo, threads_per_block, data_stream, d_s_Unews, d_left_receive_buffer, Nx, Ny, _Nz, pitch, gc_pitch, 1); } if (rank == 0) { MPI_CHECK(MPI_Wait(&right_send_request[rank], MPI_STATUS_IGNORE)); } else { MPI_CHECK(MPI_Wait(&left_send_request[rank], MPI_STATUS_IGNORE)); } // Swap pointers on the host checkCuda(cudaDeviceSynchronize()); swap(_DOUBLE_*, d_s_Unews, d_s_Uolds); } MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); compute_timer += MPI_Wtime(); MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); // Copy data from device to host double DtH_timer = 0; MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); DtH_timer -= MPI_Wtime(); MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); checkCuda(cudaMemcpy2D(h_s_Uolds, dt_size*(Nx+2), d_s_Uolds, pitch_bytes, dt_size*(Nx+2), (Ny+2)*(_Nz+2), cudaMemcpyDefault)); MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); DtH_timer += MPI_Wtime(); MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); // Gather results from subdomains MPI_CHECK(MPI_Isend(h_s_Uolds, (Nx+2)*(Ny+2)*(_Nz+2), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD, &gather_send_request[rank])); if (rank == 0) { for (int i = 0; i < numberOfProcesses; i++) { MPI_CHECK(MPI_Recv(h_s_rbuf[i], (Nx+2)*(Ny+2)*(_Nz+2), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, &status[rank])); merge_domains(h_s_rbuf[i], h_Uold, Nx, Ny, _Nz, i); } } // Calculate on host #if defined(DEBUG) || defined(_DEBUG) if (rank == 0) { cpu_heat3D(u_new, u_old, c0, c1, max_iters, Nx, Ny, Nz); } #endif if (rank == 0) { float gflops = CalcGflops(compute_timer, max_iters, Nx, Ny, Nz); PrintSummary("3D Heat (7-pt)", "Plane sweeping", compute_timer, HtD_timer, DtH_timer, gflops, max_iters, Nx); _DOUBLE_ t = max_iters * dt; CalcError(h_Uold, u_old, t, h, Nx, Ny, Nz); } Finalize(); // Free device memory checkCuda(cudaFree(d_s_Unews)); checkCuda(cudaFree(d_s_Uolds)); checkCuda(cudaFree(d_right_send_buffer)); checkCuda(cudaFree(d_left_send_buffer)); checkCuda(cudaFree(d_right_receive_buffer)); checkCuda(cudaFree(d_left_receive_buffer)); // Free host memory checkCuda(cudaFreeHost(h_s_Unews)); checkCuda(cudaFreeHost(h_s_Uolds)); #if defined(DEBUG) || defined(_DEBUG) if (rank == 0) { for (int i = 0; i < numberOfProcesses; i++) { checkCuda(cudaFreeHost(h_s_rbuf[i])); } free(h_Uold); } #endif checkCuda(cudaFreeHost(left_send_buffer)); checkCuda(cudaFreeHost(left_receive_buffer)); checkCuda(cudaFreeHost(right_send_buffer)); checkCuda(cudaFreeHost(right_receive_buffer)); checkCuda(cudaDeviceReset()); free(u_old); free(u_new); return 0; }
int getDevice() { int dev; checkCuda(cudaGetDevice(&dev), std::string("CUDA ERROR: cudaGetDevice ")); return dev; }
int main(int argc, char **argv) { int OPT_N = 4000000; int OPT_SZ = OPT_N * sizeof(float); printf("Initializing data...\n"); float *callResult, *putResult, *stockPrice, *optionStrike, *optionYears; float *d_callResult, *d_putResult; float *d_stockPrice, *d_optionStrike, *d_optionYears; #ifdef HEMI_CUDA_COMPILER checkCuda( cudaMallocHost((void**)&callResult, OPT_SZ) ); checkCuda( cudaMallocHost((void**)&putResult, OPT_SZ) ); checkCuda( cudaMallocHost((void**)&stockPrice, OPT_SZ) ); checkCuda( cudaMallocHost((void**)&optionStrike, OPT_SZ) ); checkCuda( cudaMallocHost((void**)&optionYears, OPT_SZ) ); checkCuda( cudaMalloc ((void**)&d_callResult, OPT_SZ) ); checkCuda( cudaMalloc ((void**)&d_putResult, OPT_SZ) ); checkCuda( cudaMalloc ((void**)&d_stockPrice, OPT_SZ) ); checkCuda( cudaMalloc ((void**)&d_optionStrike, OPT_SZ) ); checkCuda( cudaMalloc ((void**)&d_optionYears, OPT_SZ) ); #else callResult = (float*)malloc(OPT_SZ); putResult = (float*)malloc(OPT_SZ); stockPrice = (float*)malloc(OPT_SZ); optionStrike = (float*)malloc(OPT_SZ); optionYears = (float*)malloc(OPT_SZ); #endif initOptions(OPT_N, stockPrice, optionStrike, optionYears); int blockDim = 128; // blockDim, gridDim ignored by host code int gridDim = std::min<int>(1024, (OPT_N + blockDim - 1) / blockDim); printf("Running %s Version...\n", HEMI_LOC_STRING); StartTimer(); #ifdef HEMI_CUDA_COMPILER checkCuda( cudaMemcpy(d_stockPrice, stockPrice, OPT_SZ, cudaMemcpyHostToDevice) ); checkCuda( cudaMemcpy(d_optionStrike, optionStrike, OPT_SZ, cudaMemcpyHostToDevice) ); checkCuda( cudaMemcpy(d_optionYears, optionYears, OPT_SZ, cudaMemcpyHostToDevice) ); #else d_callResult = callResult; d_putResult = putResult; d_stockPrice = stockPrice; d_optionStrike = optionStrike; d_optionYears = optionYears; #endif HEMI_KERNEL_LAUNCH(BlackScholes, gridDim, blockDim, 0, 0, d_callResult, d_putResult, d_stockPrice, d_optionStrike, d_optionYears, RISKFREE, VOLATILITY, OPT_N); #ifdef HEMI_CUDA_COMPILER checkCuda( cudaMemcpy(callResult, d_callResult, OPT_SZ, cudaMemcpyDeviceToHost) ); checkCuda( cudaMemcpy(putResult, d_putResult, OPT_SZ, cudaMemcpyDeviceToHost) ); #endif printf("Option 0 call: %f\n", callResult[0]); printf("Option 0 put: %f\n", putResult[0]); double ms = GetTimer(); //Both call and put is calculated printf("Options count : %i \n", 2 * OPT_N); printf("\tBlackScholes() time : %f msec\n", ms); printf("\t%f GB/s, %f GOptions/s\n", ((double)(5 * OPT_N * sizeof(float)) * 1E-9) / (ms * 1E-3), ((double)(2 * OPT_N) * 1E-9) / (ms * 1E-3)); #ifdef HEMI_CUDA_COMPILER checkCuda( cudaFree(d_stockPrice) ); checkCuda( cudaFree(d_optionStrike) ); checkCuda( cudaFree(d_optionYears) ); checkCuda( cudaFreeHost(callResult) ); checkCuda( cudaFreeHost(putResult) ); checkCuda( cudaFreeHost(stockPrice) ); checkCuda( cudaFreeHost(optionStrike) ); checkCuda( cudaFreeHost(optionYears) ); #else free(callResult); free(putResult); free(stockPrice); free(optionStrike); free(optionYears); #endif // HEMI_CUDA_COMPILER }