void step3_gpu(int *n) { int nprocs, procid; MPI_Comm_rank(MPI_COMM_WORLD, &procid); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); /* Create Cartesian Communicator */ int c_dims[2]={0}; MPI_Comm c_comm; accfft_create_comm(MPI_COMM_WORLD,c_dims,&c_comm); Complexf *data, *data_cpu; Complexf *data_hat; double f_time=0*MPI_Wtime(),i_time=0, setup_time=0; int alloc_max=0; int isize[3],osize[3],istart[3],ostart[3]; /* Get the local pencil size and the allocation size */ alloc_max=accfft_local_size_dft_c2c_gpuf(n,isize,istart,osize,ostart,c_comm); #ifdef INPLACE data_cpu=(Complexf*)malloc(alloc_max); cudaMalloc((void**) &data, alloc_max); #else data_cpu=(Complexf*)malloc(isize[0]*isize[1]*isize[2]*2*sizeof(float)); cudaMalloc((void**) &data,isize[0]*isize[1]*isize[2]*2*sizeof(float)); cudaMalloc((void**) &data_hat, alloc_max); #endif //accfft_init(nthreads); setup_time=-MPI_Wtime(); /* Create FFT plan */ #ifdef INPLACE accfft_plan_gpuf * plan=accfft_plan_dft_3d_c2c_gpuf(n,data,data,c_comm,ACCFFT_MEASURE); #else accfft_plan_gpuf * plan=accfft_plan_dft_3d_c2c_gpuf(n,data,data_hat,c_comm,ACCFFT_MEASURE); #endif setup_time+=MPI_Wtime(); /* Warmup Runs */ #ifdef INPLACE accfft_execute_c2c_gpuf(plan,ACCFFT_FORWARD,data,data); accfft_execute_c2c_gpuf(plan,ACCFFT_FORWARD,data,data); #else accfft_execute_c2c_gpuf(plan,ACCFFT_FORWARD,data,data_hat); accfft_execute_c2c_gpuf(plan,ACCFFT_FORWARD,data,data_hat); #endif /* Initialize data */ initialize(data_cpu,n,c_comm); #ifdef INPLACE cudaMemcpy(data, data_cpu,alloc_max, cudaMemcpyHostToDevice); #else cudaMemcpy(data, data_cpu,isize[0]*isize[1]*isize[2]*2*sizeof(float), cudaMemcpyHostToDevice); #endif MPI_Barrier(c_comm); /* Perform forward FFT */ f_time-=MPI_Wtime(); #ifdef INPLACE accfft_execute_c2c_gpuf(plan,ACCFFT_FORWARD,data,data); #else accfft_execute_c2c_gpuf(plan,ACCFFT_FORWARD,data,data_hat); #endif f_time+=MPI_Wtime(); MPI_Barrier(c_comm); #ifndef INPLACE Complexf *data2_cpu, *data2; cudaMalloc((void**) &data2, isize[0]*isize[1]*isize[2]*2*sizeof(float)); data2_cpu=(Complexf*) malloc(isize[0]*isize[1]*isize[2]*2*sizeof(float)); #endif /* Perform backward FFT */ i_time-=MPI_Wtime(); #ifdef INPLACE accfft_execute_c2c_gpuf(plan,ACCFFT_BACKWARD,data,data); #else accfft_execute_c2c_gpuf(plan,ACCFFT_BACKWARD,data_hat,data2); #endif i_time+=MPI_Wtime(); /* copy back results on CPU and check error*/ #ifdef INPLACE cudaMemcpy(data_cpu, data, alloc_max, cudaMemcpyDeviceToHost); check_err(data_cpu,n,c_comm); #else cudaMemcpy(data2_cpu, data2, isize[0]*isize[1]*isize[2]*2*sizeof(float), cudaMemcpyDeviceToHost); check_err(data2_cpu,n,c_comm); #endif /* Compute some timings statistics */ double g_f_time, g_i_time, g_setup_time; MPI_Reduce(&f_time,&g_f_time,1, MPI_DOUBLE, MPI_MAX,0, MPI_COMM_WORLD); MPI_Reduce(&i_time,&g_i_time,1, MPI_DOUBLE, MPI_MAX,0, MPI_COMM_WORLD); MPI_Reduce(&setup_time,&g_setup_time,1, MPI_DOUBLE, MPI_MAX,0, MPI_COMM_WORLD); #ifdef INPLACE PCOUT<<"GPU Timing for Inplace FFT of size "<<n[0]<<"*"<<n[1]<<"*"<<n[2]<<std::endl; #else PCOUT<<"GPU Timing for Outplace FFT of size "<<n[0]<<"*"<<n[1]<<"*"<<n[2]<<std::endl; #endif PCOUT<<"Setup \t"<<g_setup_time<<std::endl; PCOUT<<"FFT \t"<<g_f_time<<std::endl; PCOUT<<"IFFT \t"<<g_i_time<<std::endl; MPI_Barrier(c_comm); cudaDeviceSynchronize(); free(data_cpu); cudaFree(data); #ifndef INPLACE cudaFree(data_hat); free(data2_cpu); cudaFree(data2); #endif accfft_destroy_plan_gpu(plan); accfft_cleanup_gpuf(); MPI_Comm_free(&c_comm); return ; } // end step3_gpu
void copy_device_to_host(const size_t size, double *h_input, double *h_output,
                         double *d_input, double *d_output)
{
  CHECK_CUDA(cudaMemcpy(h_output, d_output, size, cudaMemcpyDeviceToHost));
  CHECK_CUDA(cudaMemcpy(h_input, d_input, size, cudaMemcpyDeviceToHost));
}
cudaError_t WINAPI wine_cudaMemcpy(void *dst, const void *src, size_t count,
                                   enum cudaMemcpyKind kind)
{
  WINE_TRACE("\n");
  return cudaMemcpy(dst, src, count, kind);
}
void init_arrays(Arrays *arr, FLOAT_TYPE** cu_F, Command_line_opts *opts, Detector_settings *sett) { // Allocates and initializes to zero the data, detector ephemeris // and the F-statistic arrays // arr->xDat = (double *) calloc (sett->N, sizeof (double)); CudaSafeCall( cudaMallocHost((void**)&arr->xDat, sizeof(double)*sett->N)); CudaSafeCall ( cudaMalloc((void**)&arr->cu_xDat, sizeof(double)*sett->N)); // arr->DetSSB = (double *) calloc (3*sett->N, sizeof (double)); CudaSafeCall( cudaMallocHost((void**)&arr->DetSSB, sizeof(double)*3*sett->N) ); CudaSafeCall ( cudaMalloc((void**)&arr->cu_DetSSB, sizeof(double)*3*sett->N)); CudaSafeCall ( cudaMalloc((void**)cu_F, sizeof(FLOAT_TYPE)*sett->fftpad*sett->nfft)); CudaSafeCall ( cudaMemset(*cu_F, 0, sizeof(FLOAT_TYPE)*sett->fftpad*sett->nfft)); char filename[CHAR_BUFFER_SIZE]; FILE *data; // Input time-domain data handling sprintf (filename, "%s/%03d/xdatc_%03d_%03d%s.bin", opts->dtaprefix, opts->ident, \ opts->ident, opts->band, opts->label); if ((data = fopen (filename, "r")) != NULL) { fread ((void *)(arr->xDat), sizeof (double), sett->N, data); // !!! wczytanie danych fclose (data); } else { perror (filename); printf("Problem with %s... Exiting...\n", filename); exit(1); } //copy to device CudaSafeCall ( cudaMemcpy(arr->cu_xDat, arr->xDat, sizeof(double)*sett->N, cudaMemcpyHostToDevice)); int Nzeros=0; int i; // Checking for null values in the data for(i=0; i < sett->N; i++) if(!arr->xDat[i]) Nzeros++; // factor N/(N - Nzeros) to account for null values in the data sett->crf0 = (double)sett->N/(sett->N-Nzeros); //if white noise... if (opts->white_flag) sett->sig2 = sett->N*var (arr->xDat, sett->N); else sett->sig2 = -1.; double epsm, phir; /* ############ Efemerydy ################ */ // Ephemeris file handling sprintf (filename, "%s/%03d/DetSSB.bin", opts->dtaprefix, opts->ident); if ((data = fopen (filename, "r")) != NULL) { // Detector position w.r.t solar system baricenter // for every datapoint fread ((void *)(arr->DetSSB), sizeof (double), 3*sett->N, data); // Deterministic phase defining the position of the Earth // in its diurnal motion at t=0 fread ((void *)(&phir), sizeof (double), 1, data); // Earth's axis inclination to the ecliptic at t=0 fread ((void *)(&epsm), sizeof (double), 1, data); fclose (data); } else { perror (filename); printf("Problem with %s... Exiting...\n", filename); exit(1); } //copy DetSSB to device CudaSafeCall ( cudaMemcpy(arr->cu_DetSSB, arr->DetSSB, sizeof(double)*sett->N*3, cudaMemcpyHostToDevice)); /* ############ Sincos ################ */ sett->sphir = sin (phir); sett->cphir = cos (phir); sett->sepsm = sin (epsm); sett->cepsm = cos (epsm); //misc. 
  // misc. arrays
  //arr->aa = (double*) malloc(sizeof(double)*sett->N);
  //arr->bb = (double*) malloc(sizeof(double)*sett->N);
  CudaSafeCall( cudaMallocHost((void**)&arr->aa, sizeof(double)*sett->N) );
  CudaSafeCall( cudaMallocHost((void**)&arr->bb, sizeof(double)*sett->N) );

  CudaSafeCall( cudaMalloc((void**)&arr->cu_aa, sizeof(double)*sett->nfft));
  CudaSafeCall( cudaMalloc((void**)&arr->cu_bb, sizeof(double)*sett->nfft));

  CudaSafeCall( cudaMalloc((void**)&arr->cu_shft, sizeof(double)*sett->N));
  CudaSafeCall( cudaMalloc((void**)&arr->cu_shftf, sizeof(double)*sett->N));
  CudaSafeCall( cudaMalloc((void**)&arr->cu_tshift, sizeof(double)*sett->N));

  //for splines
  init_spline_matrices(&arr->cu_d, &arr->cu_dl, &arr->cu_du, &arr->cu_B, sett->Ninterp);

  arr->cand_params_size = (sett->nmax - sett->nmin);
  arr->cand_buffer_size = (sett->nmax - sett->nmin)*CANDIDATE_BUFFER_SCALE;

  //parameters of found signal
  CudaSafeCall( cudaMalloc((void**)&arr->cu_cand_params, sizeof(FLOAT_TYPE)*arr->cand_params_size));
  CudaSafeCall( cudaMalloc((void**)&arr->cu_cand_buffer, sizeof(FLOAT_TYPE)*arr->cand_buffer_size));
  CudaSafeCall( cudaMalloc((void**)&arr->cu_cand_count, sizeof(int)));

  //arr->cand_buffer = (FLOAT_TYPE*)malloc(sizeof(FLOAT_TYPE)*arr->cand_buffer_size);
  CudaSafeCall( cudaMallocHost((void**)&arr->cand_buffer, sizeof(FLOAT_TYPE)*arr->cand_buffer_size) );
}
// Host code
int main(int argc, char** argv)
{
  ParseArguments(argc, argv);

  float s_SobelMatrix[25];
  s_SobelMatrix[0]  = 1;  s_SobelMatrix[1]  = 2;  s_SobelMatrix[2]  = 0;  s_SobelMatrix[3]  = -2;  s_SobelMatrix[4]  = -1;
  s_SobelMatrix[5]  = 4;  s_SobelMatrix[6]  = 8;  s_SobelMatrix[7]  = 0;  s_SobelMatrix[8]  = -8;  s_SobelMatrix[9]  = -4;
  s_SobelMatrix[10] = 6;  s_SobelMatrix[11] = 12; s_SobelMatrix[12] = 0;  s_SobelMatrix[13] = -12; s_SobelMatrix[14] = -6;
  s_SobelMatrix[15] = 4;  s_SobelMatrix[16] = 8;  s_SobelMatrix[17] = 0;  s_SobelMatrix[18] = -8;  s_SobelMatrix[19] = -4;
  s_SobelMatrix[20] = 1;  s_SobelMatrix[21] = 2;  s_SobelMatrix[22] = 0;  s_SobelMatrix[23] = -2;  s_SobelMatrix[24] = -1;

  unsigned char *palete = NULL;
  unsigned char *data = NULL, *out = NULL;
  PPMImage *input_image = NULL, *output_image = NULL;

  output_image = (PPMImage *)malloc(sizeof(PPMImage));
  input_image = readPPM(PPMInFileL);

  printf("Running %s filter\n", Filter);

  // Output buffer: one byte per pixel, sized to match the later
  // device-to-host copy of dib.image_size bytes.
  out = (unsigned char *)malloc(dib.image_size * sizeof(unsigned char));

  printf("Computing the CPU output\n");
  printf("Image details: %d by %d = %d , imagesize = %d\n", input_image->x, input_image->y,
         input_image->x * input_image->y, input_image->x * input_image->y);

  cutilCheckError(cutStartTimer(time_CPU));
  if (FilterMode == SOBEL_FILTER) {
    printf("Running Sobel\n");
    CPU_Sobel(input_image->data, output_image, input_image->x, input_image->y);
  }
  else if (FilterMode == HIGH_BOOST_FILTER) {
    printf("Running boost\n");
    CPU_Boost(data, out, dib.width, dib.height);
  }
  cutilCheckError(cutStopTimer(time_CPU));

  if (FilterMode == SOBEL_FILTER || FilterMode == SOBEL_FILTER5)
    BitMapWrite("CPU_sobel.bmp", &bmp, &dib, out, palete);
  else if (FilterMode == AVERAGE_FILTER)
    BitMapWrite("CPU_average.bmp", &bmp, &dib, out, palete);
  else if (FilterMode == HIGH_BOOST_FILTER)
    BitMapWrite("CPU_boost.bmp", &bmp, &dib, out, palete);

  printf("Done with CPU output\n");
  printf("CPU execution time %f \n", cutGetTimerValue(time_CPU));

  printf("Allocating %d bytes for image \n", dib.image_size);
  cutilSafeCall( cudaMalloc( (void **)&d_In,  dib.image_size * sizeof(unsigned char)) );
  cutilSafeCall( cudaMalloc( (void **)&d_Out, dib.image_size * sizeof(unsigned char)) );

  // creating space for filter matrix
  cutilSafeCall( cudaMalloc( (void **)&sobel_matrix, 25 * sizeof(float)) );

  cutilCheckError(cutStartTimer(time_mem));
  cudaMemcpy(d_In, data, dib.image_size * sizeof(unsigned char), cudaMemcpyHostToDevice);
  cudaMemcpy(sobel_matrix, s_SobelMatrix, 25 * sizeof(float), cudaMemcpyHostToDevice);
  cutilCheckError(cutStopTimer(time_mem));

  FilterWrapper(data, dib.width, dib.height);

  // Copy image back to host
  cutilCheckError(cutStartTimer(time_mem));
  cudaMemcpy(out, d_Out, dib.image_size * sizeof(unsigned char), cudaMemcpyDeviceToHost);
  cutilCheckError(cutStopTimer(time_mem));

  printf("GPU execution time %f Memtime %f \n", cutGetTimerValue(time_GPU), cutGetTimerValue(time_mem));
  printf("Total GPU = %f \n", (cutGetTimerValue(time_GPU) + cutGetTimerValue(time_mem)));

  // Write output image
  BitMapWrite(BMPOutFile, &bmp, &dib, out, palete);

  Cleanup();
}
void toHost(T* base) const
{
  cudaCheck(cudaMemcpy(base, vals_, n_ * sizeof(T), cudaMemcpyDeviceToHost));
}
//return types are void since any internal error will be handled by quitting //no point in returning error codes... //returns a pointer to an RGBA version of the input image //and a pointer to the single channel grey-scale output //on both the host and device void preProcess(uchar4 **h_inputImageRGBA, uchar4 **h_outputImageRGBA, uchar4 **d_inputImageRGBA, uchar4 **d_outputImageRGBA, unsigned char **d_redBlurred, unsigned char **d_greenBlurred, unsigned char **d_blueBlurred, float **h_filter, int *filterWidth, const std::string &filename) { //make sure the context initializes ok checkCudaErrors(cudaFree(0)); cv::Mat image = cv::imread(filename.c_str(), CV_LOAD_IMAGE_COLOR); if (image.empty()) { std::cerr << "Couldn't open file: " << filename << std::endl; exit(1); } cv::cvtColor(image, imageInputRGBA, CV_BGR2RGBA); //allocate memory for the output imageOutputRGBA.create(image.rows, image.cols, CV_8UC4); //This shouldn't ever happen given the way the images are created //at least based upon my limited understanding of OpenCV, but better to check if (!imageInputRGBA.isContinuous() || !imageOutputRGBA.isContinuous()) { std::cerr << "Images aren't continuous!! Exiting." << std::endl; exit(1); } *h_inputImageRGBA = (uchar4 *)imageInputRGBA.ptr<unsigned char>(0); *h_outputImageRGBA = (uchar4 *)imageOutputRGBA.ptr<unsigned char>(0); const size_t numPixels = numRows() * numCols(); //allocate memory on the device for both input and output checkCudaErrors(cudaMalloc(d_inputImageRGBA, sizeof(uchar4) * numPixels)); checkCudaErrors(cudaMalloc(d_outputImageRGBA, sizeof(uchar4) * numPixels)); checkCudaErrors(cudaMemset(*d_outputImageRGBA, 0, numPixels * sizeof(uchar4))); //make sure no memory is left laying around //copy input array to the GPU checkCudaErrors(cudaMemcpy(*d_inputImageRGBA, *h_inputImageRGBA, sizeof(uchar4) * numPixels, cudaMemcpyHostToDevice)); d_inputImageRGBA__ = *d_inputImageRGBA; d_outputImageRGBA__ = *d_outputImageRGBA; //now create the filter that they will use const int blurKernelWidth = 9; const float blurKernelSigma = 2.; *filterWidth = blurKernelWidth; //create and fill the filter we will convolve with *h_filter = new float[blurKernelWidth * blurKernelWidth]; h_filter__ = *h_filter; float filterSum = 0.f; //for normalization for (int r = -blurKernelWidth/2; r <= blurKernelWidth/2; ++r) { for (int c = -blurKernelWidth/2; c <= blurKernelWidth/2; ++c) { float filterValue = expf( -(float)(c * c + r * r) / (2.f * blurKernelSigma * blurKernelSigma)); (*h_filter)[(r + blurKernelWidth/2) * blurKernelWidth + c + blurKernelWidth/2] = filterValue; filterSum += filterValue; } } float normalizationFactor = 1.f / filterSum; for (int r = -blurKernelWidth/2; r <= blurKernelWidth/2; ++r) { for (int c = -blurKernelWidth/2; c <= blurKernelWidth/2; ++c) { (*h_filter)[(r + blurKernelWidth/2) * blurKernelWidth + c + blurKernelWidth/2] *= normalizationFactor; } } //blurred checkCudaErrors(cudaMalloc(d_redBlurred, sizeof(unsigned char) * numPixels)); checkCudaErrors(cudaMalloc(d_greenBlurred, sizeof(unsigned char) * numPixels)); checkCudaErrors(cudaMalloc(d_blueBlurred, sizeof(unsigned char) * numPixels)); checkCudaErrors(cudaMemset(*d_redBlurred, 0, sizeof(unsigned char) * numPixels)); checkCudaErrors(cudaMemset(*d_greenBlurred, 0, sizeof(unsigned char) * numPixels)); checkCudaErrors(cudaMemset(*d_blueBlurred, 0, sizeof(unsigned char) * numPixels)); }
int main(int argc, char **argv) { // Start logs printf("%s Starting...\n\n", argv[0]); unsigned int tableCPU[QRNG_DIMENSIONS][QRNG_RESOLUTION]; float *h_OutputGPU, *d_Output; int dim, pos; double delta, ref, sumDelta, sumRef, L1norm, gpuTime; StopWatchInterface *hTimer = NULL; if (sizeof(INT64) != 8) { printf("sizeof(INT64) != 8\n"); return 0; } cudaDeviceProp deviceProp; int dev = findCudaDevice(argc, (const char **)argv); checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev)); if (((deviceProp.major << 4) + deviceProp.minor) < 0x20) { fprintf(stderr, "quasirandomGenerator requires Compute Capability of SM 2.0 or higher to run.\n"); cudaDeviceReset(); exit(EXIT_WAIVED); } sdkCreateTimer(&hTimer); printf("Allocating GPU memory...\n"); checkCudaErrors(cudaMalloc((void **)&d_Output, QRNG_DIMENSIONS * N * sizeof(float))); printf("Allocating CPU memory...\n"); h_OutputGPU = (float *)malloc(QRNG_DIMENSIONS * N * sizeof(float)); printf("Initializing QRNG tables...\n\n"); initQuasirandomGenerator(tableCPU); initTableGPU(tableCPU); printf("Testing QRNG...\n\n"); checkCudaErrors(cudaMemset(d_Output, 0, QRNG_DIMENSIONS * N * sizeof(float))); int numIterations = 20; for (int i = -1; i < numIterations; i++) { if (i == 0) { checkCudaErrors(cudaDeviceSynchronize()); sdkResetTimer(&hTimer); sdkStartTimer(&hTimer); } quasirandomGeneratorGPU(d_Output, 0, N); } checkCudaErrors(cudaDeviceSynchronize()); sdkStopTimer(&hTimer); gpuTime = sdkGetTimerValue(&hTimer)/(double)numIterations*1e-3; printf("quasirandomGenerator, Throughput = %.4f GNumbers/s, Time = %.5f s, Size = %u Numbers, NumDevsUsed = %u, Workgroup = %u\n", (double)QRNG_DIMENSIONS * (double)N * 1.0E-9 / gpuTime, gpuTime, QRNG_DIMENSIONS*N, 1, 128*QRNG_DIMENSIONS); printf("\nReading GPU results...\n"); checkCudaErrors(cudaMemcpy(h_OutputGPU, d_Output, QRNG_DIMENSIONS * N * sizeof(float), cudaMemcpyDeviceToHost)); printf("Comparing to the CPU results...\n\n"); sumDelta = 0; sumRef = 0; for (dim = 0; dim < QRNG_DIMENSIONS; dim++) for (pos = 0; pos < N; pos++) { ref = getQuasirandomValue63(pos, dim); delta = (double)h_OutputGPU[dim * N + pos] - ref; sumDelta += fabs(delta); sumRef += fabs(ref); } printf("L1 norm: %E\n", sumDelta / sumRef); printf("\nTesting inverseCNDgpu()...\n\n"); checkCudaErrors(cudaMemset(d_Output, 0, QRNG_DIMENSIONS * N * sizeof(float))); for (int i = -1; i < numIterations; i++) { if (i == 0) { checkCudaErrors(cudaDeviceSynchronize()); sdkResetTimer(&hTimer); sdkStartTimer(&hTimer); } inverseCNDgpu(d_Output, NULL, QRNG_DIMENSIONS * N); } checkCudaErrors(cudaDeviceSynchronize()); sdkStopTimer(&hTimer); gpuTime = sdkGetTimerValue(&hTimer)/(double)numIterations*1e-3; printf("quasirandomGenerator-inverse, Throughput = %.4f GNumbers/s, Time = %.5f s, Size = %u Numbers, NumDevsUsed = %u, Workgroup = %u\n", (double)QRNG_DIMENSIONS * (double)N * 1E-9 / gpuTime, gpuTime, QRNG_DIMENSIONS*N, 1, 128); printf("Reading GPU results...\n"); checkCudaErrors(cudaMemcpy(h_OutputGPU, d_Output, QRNG_DIMENSIONS * N * sizeof(float), cudaMemcpyDeviceToHost)); printf("\nComparing to the CPU results...\n"); sumDelta = 0; sumRef = 0; unsigned int distance = ((unsigned int)-1) / (QRNG_DIMENSIONS * N + 1); for (pos = 0; pos < QRNG_DIMENSIONS * N; pos++) { unsigned int d = (pos + 1) * distance; ref = MoroInvCNDcpu(d); delta = (double)h_OutputGPU[pos] - ref; sumDelta += fabs(delta); sumRef += fabs(ref); } printf("L1 norm: %E\n\n", L1norm = sumDelta / sumRef); printf("Shutting down...\n"); sdkDeleteTimer(&hTimer); free(h_OutputGPU); 
  checkCudaErrors(cudaFree(d_Output));

  // cudaDeviceReset causes the driver to clean up all state. While
  // not mandatory in normal operation, it is good practice. It is also
  // needed to ensure correct operation when the application is being
  // profiled. Calling cudaDeviceReset causes all profile data to be
  // flushed before the application exits.
  cudaDeviceReset();
  exit(L1norm < 1e-6 ? EXIT_SUCCESS : EXIT_FAILURE);
}
int main(int argc, char **argv) { uchar4 *h_inputImageRGBA, *d_inputImageRGBA; uchar4 *h_outputImageRGBA, *d_outputImageRGBA; unsigned char *d_redBlurred, *d_greenBlurred, *d_blueBlurred; float *h_filter; int filterWidth; std::string input_file; std::string output_file; std::string reference_file; double perPixelError = 0.0; double globalError = 0.0; bool useEpsCheck = false; std::string blur_impl = "hw"; switch (argc) { case 2: input_file = std::string(argv[1]); output_file = "HW2_output.png"; reference_file = "HW2_reference.png"; break; case 3: input_file = std::string(argv[1]); output_file = std::string(argv[2]); reference_file = "HW2_reference.png"; break; case 4: input_file = std::string(argv[1]); output_file = std::string(argv[2]); reference_file = std::string(argv[3]); break; case 5: input_file = std::string(argv[1]); output_file = std::string(argv[2]); reference_file = std::string(argv[3]); blur_impl = std::string(argv[4]); break; default: std::cerr << "Usage: ./HW2 input_file [output_filename] " "[reference_filename] [blur_impl]]" << std::endl; exit(1); } // load the image and give us our input and output pointers preProcess(&h_inputImageRGBA, &h_outputImageRGBA, &d_inputImageRGBA, &d_outputImageRGBA, &d_redBlurred, &d_greenBlurred, &d_blueBlurred, &h_filter, &filterWidth, input_file); allocateMemoryAndCopyToGPU(numRows(), numCols(), h_filter, filterWidth); GpuTimer timer; timer.Start(); // call the students' code if (blur_impl == "hw") { your_gaussian_blur(h_inputImageRGBA, d_inputImageRGBA, d_outputImageRGBA, numRows(), numCols(), d_redBlurred, d_greenBlurred, d_blueBlurred, filterWidth); } else if (blur_impl == "shared") { gaussian_blur_shared(h_inputImageRGBA, d_inputImageRGBA, d_outputImageRGBA, numRows(), numCols(), d_redBlurred, d_greenBlurred, d_blueBlurred, filterWidth); } timer.Stop(); cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError()); int err = printf("Your code ran in: %f msecs.\n", timer.Elapsed()); if (err < 0) { // Couldn't print! Probably the student closed stdout - bad news std::cerr << "Couldn't print timing information! STDOUT Closed!" << std::endl; exit(1); } // check results and output the blurred image size_t numPixels = numRows() * numCols(); // copy the output back to the host checkCudaErrors(cudaMemcpy(h_outputImageRGBA, d_outputImageRGBA__, sizeof(uchar4) * numPixels, cudaMemcpyDeviceToHost)); std::cerr << "postProcess output...\n"; postProcess(output_file, h_outputImageRGBA); timer.Start(); referenceCalculation(h_inputImageRGBA, h_outputImageRGBA, numRows(), numCols(), h_filter, filterWidth); timer.Stop(); std::cerr << "referenceCalculation elapsed: " << timer.Elapsed() << " ms\n"; std::cerr << "postProcess reference...\n"; postProcess(reference_file, h_outputImageRGBA); // Cheater easy way with OpenCV // generateReferenceImage(input_file, reference_file, filterWidth); compareImages(reference_file, output_file, useEpsCheck, perPixelError, globalError); checkCudaErrors(cudaFree(d_redBlurred)); checkCudaErrors(cudaFree(d_greenBlurred)); checkCudaErrors(cudaFree(d_blueBlurred)); cleanUp(); return 0; }
void pcl::gpu::DeviceMemory::upload(const void *host_ptr_arg, size_t sizeBytes_arg)
{
  create(sizeBytes_arg);
  cudaSafeCall( cudaMemcpy(data_, host_ptr_arg, sizeBytes_, cudaMemcpyHostToDevice) );
  cudaSafeCall( cudaDeviceSynchronize() );
}
void pcl::gpu::DeviceMemory::download(void *host_ptr_arg) const
{
  cudaSafeCall( cudaMemcpy(host_ptr_arg, data_, sizeBytes_, cudaMemcpyDeviceToHost) );
  cudaSafeCall( cudaDeviceSynchronize() );
}
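// Hedged usage sketch for the upload()/download() members above. It assumes
// pcl::gpu::DeviceMemory is default-constructible and releases its buffer on
// destruction, as suggested by the create()/data_/sizeBytes_ members used in
// those definitions; the buffer name and size here are illustrative only.
void device_memory_roundtrip_example()
{
  float host_src[1024], host_dst[1024];
  for (int i = 0; i < 1024; ++i) { host_src[i] = 1.0f; host_dst[i] = 0.0f; }

  pcl::gpu::DeviceMemory buf;
  buf.upload(host_src, sizeof(host_src)); // create() + host-to-device copy
  buf.download(host_dst);                 // device-to-host copy of sizeBytes_
}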
DeepCopy<CudaSpace,HostSpace>::DeepCopy( void * dst , const void * src , size_t n )
{
  CUDA_SAFE_CALL( cudaMemcpy( dst , src , n , cudaMemcpyDefault ) );
}
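// Minimal usage sketch for the DeepCopy specialization above. The CudaSpace /
// HostSpace tag types and the CUDA_SAFE_CALL macro come from that snippet;
// the buffer names and size below are illustrative assumptions.
void deep_copy_example()
{
  const size_t bytes = 1024 * sizeof(double);
  double host_buf[1024] = {0};
  double *device_buf = 0;
  CUDA_SAFE_CALL( cudaMalloc((void**)&device_buf, bytes) );

  // cudaMemcpyDefault lets the driver infer the copy direction (requires UVA),
  // so the same constructor also serves host-to-device copies.
  DeepCopy<CudaSpace,HostSpace>( device_buf, host_buf, bytes );

  CUDA_SAFE_CALL( cudaFree(device_buf) );
}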
int main2(int sockfd) { cufftHandle plan; cufftComplex *devPtr; cufftReal indata[NX*BATCH]; cufftComplex data[NX*BATCH]; int i,timer,j,k; char fname[15]; FILE *f; #define BUFSIZE (21*4096*sizeof(int)) int buffer[BUFSIZE]; int p,nread; f = fopen("21-4096","rb"); nread=fread(buffer,BUFSIZE,1,f); printf("nread=%i\n",nread); fclose(f); i=0; for (j=0;j<BATCH;j++) { for (k=0;k<NX;k++) { data[j*NX+k].x = buffer[j*NX+k]; data[j*NX+k].y = 0; } } //f=fopen("y.txt","r"); /* source data creation */ //int sockfd = myconnect(); //printf("connected\n"); /* WORKING!!!!!!!! i=0; for (j=0;j<BATCH;j++) { sprintf(fname,"%i.txt",j); printf("%s\n",fname); f = fopen(fname,"r"); for (k=0;k<NX;k++) { fscanf(f,"%i\n",&p); data[j*NX+k].x = p; data[j*NX+k].y = 0; } fclose(f); */ /* for(i= 0 ; i < NX*BATCH ; i++){ //fscanf(f,"%i\n",&p); //data[i].x= p; data[i].x= 1.0f; //printf("%f\n",data[i].x); data[i].y = 0.0f; } //fclose(f) */ //} /* creates 1D FFT plan */ cufftPlan1d(&plan, NX, CUFFT_C2C, BATCH); /* cutCreateTimer(&timer); cutResetTimer(timer); cutStartTimer(timer); */ /* GPU memory allocation */ cudaMalloc((void**)&devPtr, sizeof(cufftComplex)*NX*BATCH); /* transfer to GPU memory */ cudaMemcpy(devPtr, data, sizeof(cufftComplex)*NX*BATCH, cudaMemcpyHostToDevice); /* executes FFT processes */ cufftExecC2C(plan, devPtr, devPtr, CUFFT_FORWARD); /* executes FFT processes (inverse transformation) */ //cufftExecC2C(plan, devPtr, devPtr, CUFFT_INVERSE); /* transfer results from GPU memory */ cudaMemcpy(data, devPtr, sizeof(cufftComplex)*NX*BATCH, cudaMemcpyDeviceToHost); /* deletes CUFFT plan */ cufftDestroy(plan); /* frees GPU memory */ cudaFree(devPtr); /* cudaThreadSynchronize(); cutStopTimer(timer); printf("%f\n",cutGetTimerValue(timer)/(float)1000); cutDeleteTimer(timer); */ /* float mag; for(i = 0 ; i < NX*BATCH ; i++){ //printf("data[%d] %f %f\n", i, data[i].x, data[i].y); //printf("%f\n", data[i].x); mag = sqrtf(data[i].x*data[i].x+data[i].y*data[i].y)*2.0/NX; printf("%f\n",mag); } */ /* // save as text file float mag; i=0; for (j=0;j<BATCH;j++) { sprintf(fname,"%i-mag.txt",j); printf("%s\n",fname); f = fopen(fname,"w"); for (k=0;k<NX;k++) { //fscanf(f,"%i\n",&p); if (k>50) continue; i = j*NX+k; mag = sqrtf(data[i].x*data[i].x+data[i].y*data[i].y)*2.0/NX; fprintf(f,"%f\n",mag); } fclose(f); } */ float mag; i=0; float mags[NX]; int magsint[NX*BATCH]; memset(magsint,0,sizeof(int)*NX*BATCH); int u = 0; printf("%f %f %f %f\n",data[0].x,data[1].x,data[2].x,data[3].x); //printf("%i %i %i %i\n",magsint[0],magsint[1],magsint[2],magsint[3]); // f = fopen("ffts.bin","wb"); for (j=0;j<BATCH;j++) { // sprintf(fname,"%i-bin.dat",j); // printf("%s\n",fname); for (k=0;k<NX;k++) { //fscanf(f,"%i\n",&p); if (k>50) continue; i = j*NX+k; mags[k] = sqrtf(data[i].x*data[i].x+data[i].y*data[i].y)*2.0/NX; magsint[u]=mags[k] ; u++; //fprintf(f,"%f\n",mag); } //f = fopen(fname,"wb"); // fwrite(magsint,sizeof(int)*50,1,f); } int n; n = write(sockfd,magsint,sizeof(int)*BATCH*50); printf("%i %i %i %i\n",magsint[0],magsint[1],magsint[2],magsint[3]); printf("send ok, size: %i\n",n); //fclose(f); return 0; }
int main(int argc, char **argv) { char *h_Worig; uint8_t *h_We, *d_We; uint64_t *h_nWe, *d_nWe; uint32_t *h_k, *h_l, *d_k, *d_l; uint32_t *h_ki, *h_li, *d_ki, *d_li; intmax_t *h_k2, *h_l2; intmax_t *h_ki2, *h_li2; bwt_index backward, forward; exome ex; comp_matrix h_O, d_O, h_Oi, d_Oi; vector h_C, d_C, h_C1, d_C1; comp_vector h_S, h_Si; results_list *r_lists; uint32_t *k, *l; cudaSetDevice(0); cudaError_t error; if (argc!=8) { printf("Sintaxis:\n\t%s fichero_bus dir_entrada fichero_sal max_bus_gpu repeticiones max_length nucleotides\n", argv[0]); return 1; } timevars(); init_replace_table(argv[7]); queries_file = fopen(argv[1], "r"); check_file_open(queries_file, argv[1]); output_file = fopen(argv[3], "w"); check_file_open(output_file, argv[3]); MAX_BUS_GPU = atoi(argv[4]); MAXLINE = atoi(argv[6]); tic("Cargando FM-Index"); read_vector(&h_C, argv[2], "C"); read_vector(&h_C1, argv[2], "C1"); copy_vector_gpu(&d_C, &h_C); copy_vector_gpu(&d_C1, &h_C1); read_comp_matrix_gpu(&h_O, argv[2], "O"); read_comp_matrix_gpu(&h_Oi, argv[2], "Oi"); copy_comp_matrix_gpu(&d_O, &h_O); copy_comp_matrix_gpu(&d_Oi, &h_Oi); read_comp_vector(&h_S, argv[2], "S"); read_comp_vector(&h_Si, argv[2], "Si"); backward.C = h_C; backward.C1 = h_C1; backward.O = h_O; backward.S = h_S; forward.C = h_C; forward.C1 = h_C1; forward.O = h_Oi; forward.S = h_Si; load_exome_file(&ex, argv[2]); h_Worig = (char*)malloc(MAX_BUS_GPU * MAXLINE * sizeof(char)); cudaMallocHost((void**) &h_We, MAX_BUS_GPU * MAXLINE * sizeof(uint8_t)); cudaMallocHost((void**) &h_nWe, MAX_BUS_GPU * sizeof(uint64_t)); cudaMalloc((void**) &d_We, MAX_BUS_GPU * MAXLINE * sizeof(uint8_t)); manageCudaError(); cudaMalloc((void**) &d_nWe, MAX_BUS_GPU * sizeof(uint64_t)); manageCudaError(); cudaMallocHost((void**) &h_k, MAX_BUS_GPU * MAXLINE * sizeof(uint32_t)); cudaMallocHost((void**) &h_l, MAX_BUS_GPU * MAXLINE * sizeof(uint32_t)); cudaMallocHost((void**) &h_ki, MAX_BUS_GPU * MAXLINE * sizeof(uint32_t)); cudaMallocHost((void**) &h_li, MAX_BUS_GPU * MAXLINE * sizeof(uint32_t)); cudaMallocHost((void**) &h_k2, MAX_BUS_GPU * MAXLINE * sizeof(intmax_t)); cudaMallocHost((void**) &h_l2, MAX_BUS_GPU * MAXLINE * sizeof(intmax_t)); cudaMallocHost((void**) &h_ki2, MAX_BUS_GPU * MAXLINE * sizeof(intmax_t)); cudaMallocHost((void**) &h_li2, MAX_BUS_GPU * MAXLINE * sizeof(intmax_t)); cudaMalloc((void**) &d_k, MAX_BUS_GPU * MAXLINE * sizeof(uint32_t)); manageCudaError(); cudaMalloc((void**) &d_l, MAX_BUS_GPU * MAXLINE * sizeof(uint32_t)); manageCudaError(); cudaMalloc((void**) &d_ki, MAX_BUS_GPU * MAXLINE * sizeof(uint32_t)); manageCudaError(); cudaMalloc((void**) &d_li, MAX_BUS_GPU * MAXLINE * sizeof(uint32_t)); manageCudaError(); r_lists = (results_list *) malloc(MAX_BUS_GPU * sizeof(results_list)); for (int i=0; i<MAX_BUS_GPU; i++) { new_results_list(&r_lists[i], RESULTS); } k = (uint32_t*)malloc(RESULTS * sizeof(uint32_t)); l = (uint32_t*)malloc(RESULTS * sizeof(uint32_t)); toc(); int TAM_BUS_GPU=0, NUM_BLOQUES_GPU=0; NUM_REP = atoi(argv[5]); tic("Leer de disco"); while(nextFASTAToken(queries_file, h_Worig + TAM_BUS_GPU * MAXLINE, h_We + TAM_BUS_GPU * MAXLINE, h_nWe + TAM_BUS_GPU)) { TAM_BUS_GPU++; if (TAM_BUS_GPU == MAX_BUS_GPU) break; } toc(); NUM_BLOQUES_GPU = (TAM_BUS_GPU / TAM_BLOQUE_GPU); cudaThreadSynchronize(); tic("CPU -> GPU"); cudaMemcpy(d_We, h_We, TAM_BUS_GPU * MAXLINE * sizeof(uint8_t), cudaMemcpyHostToDevice); manageCudaError(); cudaMemcpy(d_nWe, h_nWe, TAM_BUS_GPU * sizeof(uint64_t), cudaMemcpyHostToDevice); manageCudaError(); cudaThreadSynchronize(); toc(); 
cudaThreadSynchronize(); tic("GPU Kernel"); BWExactSearchBackwardVectorGPUWrapper(NUM_BLOQUES_GPU, TAM_BLOQUE_GPU, d_We, d_nWe, MAXLINE, d_k, d_l, 0, d_O.siz-2, &d_C, &d_C1, &d_O); BWExactSearchForwardVectorGPUWrapper(NUM_BLOQUES_GPU, TAM_BLOQUE_GPU, d_We, d_nWe, MAXLINE, d_ki, d_li, 0, d_Oi.siz-2, &d_C, &d_C1, &d_Oi); cudaThreadSynchronize(); toc(); cudaThreadSynchronize(); tic("GPU -> CPU"); cudaMemcpy(h_k, d_k, sizeof(uint32_t) * TAM_BUS_GPU * MAXLINE, cudaMemcpyDeviceToHost); manageCudaError(); cudaMemcpy(h_l, d_l, sizeof(uint32_t) * TAM_BUS_GPU * MAXLINE, cudaMemcpyDeviceToHost); manageCudaError(); cudaMemcpy(h_ki, d_ki, sizeof(uint32_t) * TAM_BUS_GPU * MAXLINE, cudaMemcpyDeviceToHost); manageCudaError(); cudaMemcpy(h_li, d_li, sizeof(uint32_t) * TAM_BUS_GPU * MAXLINE, cudaMemcpyDeviceToHost); manageCudaError(); cudaThreadSynchronize(); toc(); tic("CPU Vector"); for (int i=0; i<TAM_BUS_GPU; i++) { BWExactSearchVectorBackward(h_We + MAXLINE*i, 0, h_nWe[i]-1, 0, d_O.siz-2, h_k2 + MAXLINE*i, h_l2 + MAXLINE*i, &backward); BWExactSearchVectorForward(h_We + MAXLINE*i, 0, h_nWe[i]-1, 0, d_Oi.siz-2, h_ki2 + MAXLINE*i, h_li2 + MAXLINE*i, &forward); } toc(); tic("CPU Search 1 error"); for (int i=0; i<TAM_BUS_GPU; i++) { result res; init_result(&res, 1); bound_result(&res, 0, h_nWe[i]-1); change_result(&res, 0, h_O.siz-2, h_nWe[i]-1); r_lists[i].num_results = 0; r_lists[i].read_index = i; BWSearch1CPU( h_We + i * MAXLINE, &backward, &forward, &res, &r_lists[i]); } toc(); tic("CPU Search 1 Error Helper"); for (int i=0; i<TAM_BUS_GPU; i++) { r_lists[i].num_results = 0; r_lists[i].read_index = i; BWSearch1GPUHelper( h_We + i * MAXLINE, 0, h_nWe[i]-1, h_k + i*MAXLINE, h_l + i*MAXLINE, h_ki + i*MAXLINE, h_li + i*MAXLINE, &backward, &forward, &r_lists[i] ); } toc(); tic("Write results"); for (int i=0; i<TAM_BUS_GPU; i++) { write_results_gpu(&r_lists[i], k, l, &ex, &backward, &forward, h_Worig + i*MAXLINE, h_nWe[i], 1, output_file); } toc(); /* for (int i=0; i<TAM_BUS_GPU; i++) { for (int j=0; j<h_nWe[i]; j++) { if (h_k[i*MAXLINE + j] != h_k2[i*MAXLINE + j]) { printf("Diferente %d %d\n", i, j); goto salir; } } } salir: */ /* for (int i=0; i<h_nWe[0]; i++) { printf("%u ", h_k[i]); } printf("\n"); printf("\n"); for (int i=0; i<h_nWe[0]; i++) { printf("%u ", h_k2[i]); } printf("\n"); */ cudaFreeHost(h_k); cudaFree(d_k); cudaFreeHost(h_l); cudaFree(d_l); cudaFreeHost(h_We); cudaFreeHost(h_nWe); cudaFree(d_We); cudaFree(d_nWe); free(h_C.vector); cudaFree(d_C.vector); free(h_C1.vector); cudaFree(d_C1.vector); free_comp_matrix_gpu_host(NULL, &h_O); free_comp_matrix_gpu_device(NULL, &d_O); fclose(queries_file); return 0; }
//-------------------------------------------------------
// copy a buffer from device memory to host memory
//
// param : des
// param : src
// param : size
//-------------------------------------------------------
void D_MEMCPY_D2H(void *des, void *src, size_t size)
{
  CUDA_SAFE_CALL(cudaMemcpy(des, src, size, cudaMemcpyDeviceToHost));
}
int main(int argc, char **argv) { int OPT_N = 4000000; int OPT_SZ = OPT_N * sizeof(float); printf("Initializing data...\n"); float *callResult, *putResult, *stockPrice, *optionStrike, *optionYears; float *d_callResult, *d_putResult; float *d_stockPrice, *d_optionStrike, *d_optionYears; #ifdef HEMI_CUDA_COMPILER checkCuda( cudaMallocHost((void**)&callResult, OPT_SZ) ); checkCuda( cudaMallocHost((void**)&putResult, OPT_SZ) ); checkCuda( cudaMallocHost((void**)&stockPrice, OPT_SZ) ); checkCuda( cudaMallocHost((void**)&optionStrike, OPT_SZ) ); checkCuda( cudaMallocHost((void**)&optionYears, OPT_SZ) ); checkCuda( cudaMalloc ((void**)&d_callResult, OPT_SZ) ); checkCuda( cudaMalloc ((void**)&d_putResult, OPT_SZ) ); checkCuda( cudaMalloc ((void**)&d_stockPrice, OPT_SZ) ); checkCuda( cudaMalloc ((void**)&d_optionStrike, OPT_SZ) ); checkCuda( cudaMalloc ((void**)&d_optionYears, OPT_SZ) ); #else callResult = (float*)malloc(OPT_SZ); putResult = (float*)malloc(OPT_SZ); stockPrice = (float*)malloc(OPT_SZ); optionStrike = (float*)malloc(OPT_SZ); optionYears = (float*)malloc(OPT_SZ); #endif initOptions(OPT_N, stockPrice, optionStrike, optionYears); int blockDim = 128; // blockDim, gridDim ignored by host code int gridDim = std::min<int>(1024, (OPT_N + blockDim - 1) / blockDim); printf("Running %s Version...\n", HEMI_LOC_STRING); StartTimer(); #ifdef HEMI_CUDA_COMPILER checkCuda( cudaMemcpy(d_stockPrice, stockPrice, OPT_SZ, cudaMemcpyHostToDevice) ); checkCuda( cudaMemcpy(d_optionStrike, optionStrike, OPT_SZ, cudaMemcpyHostToDevice) ); checkCuda( cudaMemcpy(d_optionYears, optionYears, OPT_SZ, cudaMemcpyHostToDevice) ); #else d_callResult = callResult; d_putResult = putResult; d_stockPrice = stockPrice; d_optionStrike = optionStrike; d_optionYears = optionYears; #endif HEMI_KERNEL_LAUNCH(BlackScholes, gridDim, blockDim, 0, 0, d_callResult, d_putResult, d_stockPrice, d_optionStrike, d_optionYears, RISKFREE, VOLATILITY, OPT_N); #ifdef HEMI_CUDA_COMPILER checkCuda( cudaMemcpy(callResult, d_callResult, OPT_SZ, cudaMemcpyDeviceToHost) ); checkCuda( cudaMemcpy(putResult, d_putResult, OPT_SZ, cudaMemcpyDeviceToHost) ); #endif printf("Option 0 call: %f\n", callResult[0]); printf("Option 0 put: %f\n", putResult[0]); double ms = GetTimer(); //Both call and put is calculated printf("Options count : %i \n", 2 * OPT_N); printf("\tBlackScholes() time : %f msec\n", ms); printf("\t%f GB/s, %f GOptions/s\n", ((double)(5 * OPT_N * sizeof(float)) * 1E-9) / (ms * 1E-3), ((double)(2 * OPT_N) * 1E-9) / (ms * 1E-3)); #ifdef HEMI_CUDA_COMPILER checkCuda( cudaFree(d_stockPrice) ); checkCuda( cudaFree(d_optionStrike) ); checkCuda( cudaFree(d_optionYears) ); checkCuda( cudaFreeHost(callResult) ); checkCuda( cudaFreeHost(putResult) ); checkCuda( cudaFreeHost(stockPrice) ); checkCuda( cudaFreeHost(optionStrike) ); checkCuda( cudaFreeHost(optionYears) ); #else free(callResult); free(putResult); free(stockPrice); free(optionStrike); free(optionYears); #endif // HEMI_CUDA_COMPILER }
CUDA(const T* base, size_t n) : n_(n)
{
  cudaCheck(cudaMalloc(&vals_, n_ * sizeof(T)));
  cudaCheck(cudaMemcpy(vals_, base, n_ * sizeof(T), cudaMemcpyHostToDevice));
}
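// The constructor above and the toHost() member shown earlier appear to belong
// to the same device-buffer wrapper (both use vals_, n_ and cudaCheck). Below is
// a minimal sketch of how such a wrapper could fit together, assuming the class
// is a template named CUDA<T>; the destructor and data members are assumptions.
template <typename T>
class CUDA {
public:
  CUDA(const T* base, size_t n) : n_(n) {
    cudaCheck(cudaMalloc(&vals_, n_ * sizeof(T)));                              // device allocation
    cudaCheck(cudaMemcpy(vals_, base, n_ * sizeof(T), cudaMemcpyHostToDevice)); // host-to-device copy
  }
  void toHost(T* base) const {
    cudaCheck(cudaMemcpy(base, vals_, n_ * sizeof(T), cudaMemcpyDeviceToHost)); // device-to-host copy
  }
  ~CUDA() { cudaFree(vals_); } // release the device buffer (assumed)
private:
  T* vals_ = nullptr;
  size_t n_ = 0;
};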
void MultiStageMeanfieldLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) { init_cpu = false; init_gpu = false; const caffe::MultiStageMeanfieldParameter meanfield_param = this->layer_param_.multi_stage_meanfield_param(); num_iterations_ = meanfield_param.num_iterations(); CHECK_GT(num_iterations_, 1) << "Number of iterations must be greater than 1."; theta_alpha_ = meanfield_param.theta_alpha(); theta_beta_ = meanfield_param.theta_beta(); theta_gamma_ = meanfield_param.theta_gamma(); count_ = bottom[0]->count(); num_ = bottom[0]->num(); channels_ = bottom[0]->channels(); height_ = bottom[0]->height(); width_ = bottom[0]->width(); num_pixels_ = height_ * width_; LOG(INFO) << "This implementation has not been tested batch size > 1."; top[0]->Reshape(num_, channels_, height_, width_); // Initialize the parameters that will updated by backpropagation. if (this->blobs_.size() > 0) { LOG(INFO) << "Multimeanfield layer skipping parameter initialization."; } else { this->blobs_.resize(3);// blobs_[0] - spatial kernel weights, blobs_[1] - bilateral kernel weights, blobs_[2] - compatability matrix // Allocate space for kernel weights. this->blobs_[0].reset(new Blob<Dtype>(1, 1, channels_, channels_)); this->blobs_[1].reset(new Blob<Dtype>(1, 1, channels_, channels_)); caffe_set(channels_ * channels_, Dtype(0.), this->blobs_[0]->mutable_cpu_data()); caffe_set(channels_ * channels_, Dtype(0.), this->blobs_[1]->mutable_cpu_data()); // Initialize the kernels weights. The two files spatial.par and bilateral.par should be available. FILE * pFile; pFile = fopen("spatial.par", "r"); CHECK(pFile) << "The file 'spatial.par' is not found. Please create it with initial spatial kernel weights."; for (int i = 0; i < channels_; i++) { fscanf(pFile, "%lf", &this->blobs_[0]->mutable_cpu_data()[i * channels_ + i]); } fclose(pFile); pFile = fopen("bilateral.par", "r"); CHECK(pFile) << "The file 'bilateral.par' is not found. Please create it with initial bilateral kernel weights."; for (int i = 0; i < channels_; i++) { fscanf(pFile, "%lf", &this->blobs_[1]->mutable_cpu_data()[i * channels_ + i]); } fclose(pFile); // Initialize the compatibility matrix. this->blobs_[2].reset(new Blob<Dtype>(1, 1, channels_, channels_)); caffe_set(channels_ * channels_, Dtype(0.), this->blobs_[2]->mutable_cpu_data()); // Initialize it to have the Potts model. for (int c = 0; c < channels_; ++c) { (this->blobs_[2]->mutable_cpu_data())[c * channels_ + c] = Dtype(-1.); } } float spatial_kernel[2 * num_pixels_]; float *spatial_kernel_gpu_; compute_spatial_kernel(spatial_kernel); spatial_lattice_.reset(new ModifiedPermutohedral()); spatial_norm_.Reshape(1, 1, height_, width_); Dtype* norm_data_gpu ; Dtype* norm_data; // Initialize the spatial lattice. This does not need to be computed for every image because we use a fixed size. switch (Caffe::mode()) { case Caffe::CPU: norm_data = spatial_norm_.mutable_cpu_data(); spatial_lattice_->init(spatial_kernel, 2, width_, height_); // Calculate spatial filter normalization factors. 
norm_feed_= new Dtype[num_pixels_]; caffe_set(num_pixels_, Dtype(1.0), norm_feed_); // pass norm_feed and norm_data to gpu spatial_lattice_->compute(norm_data, norm_feed_, 1); bilateral_kernel_buffer_ = new float[5 * num_pixels_]; init_cpu = true; break; #ifndef CPU_ONLY case Caffe::GPU: CUDA_CHECK(cudaMalloc((void**)&spatial_kernel_gpu_, 2*num_pixels_ * sizeof(float))) ; CUDA_CHECK(cudaMemcpy(spatial_kernel_gpu_, spatial_kernel, 2*num_pixels_ * sizeof(float), cudaMemcpyHostToDevice)) ; spatial_lattice_->init(spatial_kernel_gpu_, 2, width_, height_); CUDA_CHECK(cudaMalloc((void**)&norm_feed_, num_pixels_ * sizeof(Dtype))) ; caffe_gpu_set(num_pixels_, Dtype(1.0), norm_feed_); norm_data_gpu = spatial_norm_.mutable_gpu_data(); spatial_lattice_->compute(norm_data_gpu, norm_feed_, 1); norm_data = spatial_norm_.mutable_cpu_data(); CUDA_CHECK(cudaMalloc((void**)&bilateral_kernel_buffer_, 5 * num_pixels_ * sizeof(float))) ; CUDA_CHECK(cudaFree(spatial_kernel_gpu_)); init_gpu = true; break; #endif default: LOG(FATAL) << "Unknown caffe mode."; } for (int i = 0; i < num_pixels_; ++i) { norm_data[i] = 1.0f / (norm_data[i] + 1e-20f); } bilateral_norms_.Reshape(num_, 1, height_, width_); // Configure the split layer that is used to make copies of the unary term. One copy for each iteration. // It may be possible to optimize this calculation later. split_layer_bottom_vec_.clear(); split_layer_bottom_vec_.push_back(bottom[0]); split_layer_top_vec_.clear(); split_layer_out_blobs_.resize(num_iterations_); for (int i = 0; i < num_iterations_; i++) { split_layer_out_blobs_[i].reset(new Blob<Dtype>()); split_layer_top_vec_.push_back(split_layer_out_blobs_[i].get()); } LayerParameter split_layer_param; split_layer_.reset(new SplitLayer<Dtype>(split_layer_param)); split_layer_->SetUp(split_layer_bottom_vec_, split_layer_top_vec_); // Make blobs to store outputs of each meanfield iteration. Output of the last iteration is stored in top[0]. // So we need only (num_iterations_ - 1) blobs. iteration_output_blobs_.resize(num_iterations_ - 1); for (int i = 0; i < num_iterations_ - 1; ++i) { iteration_output_blobs_[i].reset(new Blob<Dtype>(num_, channels_, height_, width_)); } // Make instances of MeanfieldIteration and initialize them. meanfield_iterations_.resize(num_iterations_); for (int i = 0; i < num_iterations_; ++i) { meanfield_iterations_[i].reset(new MeanfieldIteration<Dtype>()); meanfield_iterations_[i]->OneTimeSetUp( split_layer_out_blobs_[i].get(), // unary terms (i == 0) ? bottom[1] : iteration_output_blobs_[i - 1].get(), // softmax input (i == num_iterations_ - 1) ? top[0] : iteration_output_blobs_[i].get(), // output blob spatial_lattice_, // spatial lattice &spatial_norm_); // spatial normalization factors. } this->param_propagate_down_.resize(this->blobs_.size(), true); LOG(INFO) << ("MultiStageMeanfieldLayer initialized."); }
void CUDARunner::FindBestConfiguration() { unsigned long lowb=16; unsigned long highb=128; unsigned long lowt=16; unsigned long hight=256; unsigned long bestb=16; unsigned long bestt=16; int64 besttime=std::numeric_limits<int64>::max(); if(m_requestedgrid>0 && m_requestedgrid<=65536) { lowb=m_requestedgrid; highb=m_requestedgrid; } if(m_requestedthreads>0 && m_requestedthreads<=65536) { lowt=m_requestedthreads; hight=m_requestedthreads; } for(int numb=lowb; numb<=highb; numb*=2) { for(int numt=lowt; numt<=hight; numt*=2) { AllocateResources(numb,numt); // clear out any existing error cudaError_t err=cudaGetLastError(); err=cudaSuccess; int64 st=GetTimeMillis(); for(int it=0; it<128*256*2 && err==0; it+=(numb*numt)) { cutilSafeCall(cudaMemcpy(m_devin,m_in,sizeof(cuda_in),cudaMemcpyHostToDevice)); cuda_process_helper(m_devin,m_devout,64,6,numb,numt); cutilSafeCall(cudaMemcpy(m_out,m_devout,numb*numt*sizeof(cuda_out),cudaMemcpyDeviceToHost)); err=cudaGetLastError(); if(err!=cudaSuccess) { printf("CUDA error %d\n",err); } } int64 et=GetTimeMillis(); printf("Finding best configuration step end (%d,%d) %"PRI64d"ms prev best=%"PRI64d"ms\n",numb,numt,et-st,besttime); if((et-st)<besttime && err==cudaSuccess) { bestb=numb; bestt=numt; besttime=et-st; } } } m_numb=bestb; m_numt=bestt; AllocateResources(m_numb,m_numt); }
void GRTRegenerateChains::RunGPUWorkunit(GRTWorkunitElement *WU, GRTRegenerateThreadRunData *data) { //printf("In RunGPUWorkunit!\n"); //printf("Got startPoint %d\n", WU->StartPoint); //printf("Got endpoint %d\n", WU->EndPoint); //printf("Chains to regen: %d\n", WU->EndPoint - WU->StartPoint); unsigned char *DEVICE_Chains_To_Regen, *HOST_Interleaved_Chains_To_Regen; UINT4 i, j, k; UINT4 ChainsCompleted, CurrentChainStartOffset, PasswordSpaceOffset, CharsetOffset, StepsPerInvocation; int ActiveThreads; CHHiresTimer kernelTimer; // Calculate the number of chains being regen'd by this thread. uint32_t NumberChainsToRegen = WU->EndPoint - WU->StartPoint + 1; this->setNumberOfChainsToRegen(NumberChainsToRegen); // Allocate device memory for chains. if (cudaErrorMemoryAllocation == cudaMalloc((void**)&DEVICE_Chains_To_Regen, this->HashLengthBytes * NumberChainsToRegen * sizeof(unsigned char))) { printf("ERROR: Cannot allocate GPU memory. Try rebooting?\n"); exit(1); } // Interleave chains for better GPU performance and coalescing. HOST_Interleaved_Chains_To_Regen = (unsigned char *)malloc(this->HashLengthBytes * NumberChainsToRegen * sizeof(unsigned char)); memset(HOST_Interleaved_Chains_To_Regen, 0, this->HashLengthBytes * NumberChainsToRegen * sizeof(unsigned char)); hashPasswordData chainData; // Password in our space to regen for (i = 0; i < NumberChainsToRegen; i++) { UINT4 base_offset; // Get the chain being interleaved //printf("Adding chain %d\n", (i + WU->StartPoint)); chainData = this->ChainsToRegen->at(i + WU->StartPoint); for (j = 0; j < (this->HashLengthBytes / 4); j++) { base_offset = 4 * j * NumberChainsToRegen; base_offset += i * 4; for (k = 0; k < 4; k++) { HOST_Interleaved_Chains_To_Regen[base_offset + k] = chainData.password[j*4 + k]; } } } cudaMemset((void *)DEVICE_Chains_To_Regen, 0, this->HashLengthBytes * NumberChainsToRegen * sizeof(unsigned char)); cudaMemcpy(DEVICE_Chains_To_Regen, HOST_Interleaved_Chains_To_Regen, this->HashLengthBytes * NumberChainsToRegen * sizeof(unsigned char), cudaMemcpyHostToDevice); // Kernel time! // Number of chains completed ChainsCompleted = 0; // Where we are in the current chain CurrentChainStartOffset = 0; // Calculated on the host for modulus reasons CharsetOffset = 0; PasswordSpaceOffset = 0; StepsPerInvocation = 1000; // If kernel time is set to zero, run full chains. if (!this->ThreadData[data->threadID].kernelTimeMs) { StepsPerInvocation = this->TableHeader->getChainLength(); } // While we haven't finished all the chains: while (ChainsCompleted < NumberChainsToRegen) { CurrentChainStartOffset = 0; while (CurrentChainStartOffset < this->TableHeader->getChainLength()) { // Calculate the right charset offset CharsetOffset = CurrentChainStartOffset % this->hostConstantCharsetLengths[0]; // PasswordSpaceOffset: The offset into the password space we are using. // 0, 1, 2, etc. 
PasswordSpaceOffset = (ChainsCompleted / (this->ThreadData[data->threadID].CUDABlocks * this->ThreadData[data->threadID].CUDAThreads)); kernelTimer.start(); // Don't overrun the end of the chain if ((CurrentChainStartOffset + StepsPerInvocation) > this->TableHeader->getChainLength()) { StepsPerInvocation = this->TableHeader->getChainLength() - CurrentChainStartOffset; } this->Launch_CUDA_Kernel(DEVICE_Chains_To_Regen, this->DEVICE_Passwords[data->threadID], this->DEVICE_Hashes[data->threadID], PasswordSpaceOffset, CurrentChainStartOffset, StepsPerInvocation, CharsetOffset, this->DEVICE_Success[data->threadID], data); cudaThreadSynchronize(); // Copy the success and password data to the host cudaMemcpy(this->HOST_Success[data->threadID], this->DEVICE_Success[data->threadID], this->NumberOfHashes * sizeof(unsigned char), cudaMemcpyDeviceToHost); cudaMemcpy(this->HOST_Passwords[data->threadID], this->DEVICE_Passwords[data->threadID], this->NumberOfHashes * MAX_PASSWORD_LENGTH * sizeof(unsigned char), cudaMemcpyDeviceToHost); // Do something with the passwords... this->outputFoundHashes(data); // If all hashes are found, no point in continuing. if (this->HashFile->GetUncrackedHashCount() == 0) { return; } float ref_time = kernelTimer.getElapsedTimeInMilliSec(); ActiveThreads = (this->ThreadData[data->threadID].CUDABlocks * this->ThreadData[data->threadID].CUDAThreads); if ((NumberChainsToRegen - ChainsCompleted) < ActiveThreads) { ActiveThreads = (NumberChainsToRegen - ChainsCompleted); } if (!silent) { if (this->Display) { this->Display->setThreadFractionDone(data->threadID, (float)(((float)((float)ChainsCompleted * (float)this->TableHeader->getChainLength() + (float)CurrentChainStartOffset * (float)ActiveThreads) / (float)((float)NumberChainsToRegen * (float)this->TableHeader->getChainLength())))); this->Display->setThreadCrackSpeed(data->threadID, GPU_THREAD, ((ActiveThreads * StepsPerInvocation) / 1000) / ref_time); } else { printf("Kernel Time: %0.3f ms Step rate: %0.2f M/s Done: %0.2f%% \r",ref_time, ((ActiveThreads * StepsPerInvocation) / 1000) / ref_time, (float)(100 * ((float)((float)ChainsCompleted * (float)this->TableHeader->getChainLength() + (float)CurrentChainStartOffset * (float)ActiveThreads) / (float)((float)NumberChainsToRegen * (float)this->TableHeader->getChainLength())))); fflush(stdout); } } CurrentChainStartOffset += StepsPerInvocation; if (this->ThreadData[data->threadID].kernelTimeMs) { // Adjust the steps per invocation if needed. if ((ref_time > 1.1 * (float)this->ThreadData[data->threadID].kernelTimeMs) || (ref_time < 0.9 * (float)this->ThreadData[data->threadID].kernelTimeMs)) { StepsPerInvocation = (UINT4)((float)StepsPerInvocation * ((float)this->ThreadData[data->threadID].kernelTimeMs / ref_time)); //printf("Adjusted SPI to %d\n", StepsPerInvocation); } } } ChainsCompleted += (this->ThreadData[data->threadID].CUDABlocks * this->ThreadData[data->threadID].CUDAThreads); } //printf("Freeing chains to regen.\n"); // Free the chains we were working on. cudaFree(DEVICE_Chains_To_Regen); free(HOST_Interleaved_Chains_To_Regen); }
int main(int argc, char* argv[]) { // Parse test command line arguments, perform early // initializations. const char *name, *mode; int n, nt, sx, sy, ss, rank, szcomm; #ifdef CUDA struct cudaDeviceProp props; #endif test_parse(argc, argv, &name, &mode, &n, &nt, &sx, &sy, &ss, &rank, &szcomm #ifdef CUDA , &props #endif ); #ifdef CUDA int cpu = !strcmp(mode, "CPU"); int gpu = !strcmp(mode, "GPU"); #else int cpu = 1; int gpu = 0; #endif // Create test configuration. struct test_config_t* t = test_init( name, mode, n, nt, sx, sy, ss, rank, szcomm, xmin, ymin, zmin, xmax, ymax, zmax, bx, by, bs, ex, ey, es #ifdef CUDA , &props #endif ); // Create another test configuration to check correctness. struct test_config_t* t_check = NULL; #ifdef MPI if (t->rank == MPI_ROOT_NODE) #endif { t_check = test_init( name, mode, n, nt, 1, 1, 1, 0, 1, xmin, ymin, zmin, xmax, ymax, zmax, bx, by, bs, ex, ey, es #ifdef CUDA , &props #endif ); } // Generate the initial data disrtibution and load it // onto compute nodes. integer* array = (integer*)malloc(t->cpu.parent->grid->extsize * sizeof(integer)); genirand(t->cpu.parent->grid->extsize, array); test_load(t, n, sx, sy, ss, sizeof(integer), (char*)array); #ifdef MPI if (t->rank == MPI_ROOT_NODE) #endif { size_t nxysb = n * n * n * sizeof(integer); // Copy the data array. memcpy(t_check->cpu.arrays[0], array, nxysb); // Duplicate initial distribution to the second level array. memcpy(t_check->cpu.arrays[1], t_check->cpu.arrays[0], nxysb); } free(array); #ifdef VERBOSE printf("step 0\n"); printf("step 1\n"); #endif // The time iterations loop, CPU and GPU versions. for (int it = 2; it < t->nt; it++) { // Run one iteration of the stencil, measuring its time. // In case of MPI, the time of iteration is measured together // with the time of data sync. struct timespec start, stop; #ifdef MPI if (t->rank == MPI_ROOT_NODE) #endif { stenfw_get_time(&start); } #ifdef MPI struct grid_domain_t* subdomains = t->cpu.subdomains; int nsubdomains = t->cpu.nsubdomains; // Copy the current iteration data into boundary slices // and compute stencil in them. // Boundary slices themselves are subdomains with respect // to each MPI decomposition domains. { // Set subdomain data copying callbacks: // use simple memcpy in this case. for (int i = 0; i < nsubdomains; i++) { struct grid_domain_t* sub = subdomains + i; sub->scatter_memcpy = &grid_subcpy; sub->gather_memcpy = &grid_subcpy; } // Scatter domain edges for separate computation. grid_scatter(subdomains, &t->cpu, 0, LAYOUT_MODE_CUSTOM); // Process edges subdomains. for (int i = 0; i < nsubdomains; i++) { struct grid_domain_t* sub = subdomains + i; int nx = sub->grid[0].bx + sub->grid[0].nx + sub->grid[0].ex; int ny = sub->grid[0].by + sub->grid[0].ny + sub->grid[0].ey; int ns = sub->grid[0].bs + sub->grid[0].ns + sub->grid[0].es; isum13pt_cpu(nx, ny, ns, (integer(*)[ny][nx])sub->arrays[0], (integer(*)[ny][nx])sub->arrays[1], (integer(*)[ny][nx])sub->arrays[2]); } } // Start sharing boundary slices between linked subdomains. 
MPI_Request* reqs = (MPI_Request*)malloc(sizeof(MPI_Request) * 2 * nsubdomains); for (int i = 0; i < nsubdomains; i++) { struct grid_domain_t* subdomain = subdomains + i; struct grid_domain_t* neighbor = *(subdomain->links.dense[0]); assert(neighbor->grid[1].extsize == subdomain->grid[0].extsize); int szelem = sizeof(integer); size_t dnx = neighbor->grid[1].nx * szelem; size_t dny = neighbor->grid[1].ny; size_t dns = neighbor->grid[1].ns; size_t snx = subdomain->grid[0].nx * szelem; size_t sbx = subdomain->grid[0].bx * szelem; size_t sex = subdomain->grid[0].ex * szelem; size_t sny = subdomain->grid[0].ny, sns = subdomain->grid[0].ns; size_t sby = subdomain->grid[0].by, sbs = subdomain->grid[0].bs; size_t sey = subdomain->grid[0].ey, ses = subdomain->grid[0].es; size_t soffset = sbx + (sbx + snx + sex) * (sby + sbs * (sby + sny + sey)); struct grid_domain_t obuf; memset(&obuf, 0, sizeof(struct grid_domain_t)); obuf.arrays = subdomain->arrays + 1; obuf.narrays = 1; obuf.offset = 0; obuf.grid[0].nx = dnx; obuf.grid[0].ny = dny; obuf.grid[0].ns = dns; obuf.grid->size = dnx * dny * dns; struct grid_domain_t scpy = *subdomain; scpy.arrays = subdomain->arrays + 2; scpy.narrays = 1; scpy.offset = soffset; scpy.grid[0].nx = sbx + snx + sex; scpy.grid[0].ny = sby + sny + sey; scpy.grid[0].ns = sbs + sns + ses; // Copy data to the temporary buffer. grid_subcpy(dnx, dny, dns, &obuf, &scpy); // Exchange temporary buffers with the subdomain neighbour. int subdomain_rank = grid_rank1d(subdomain->parent->parent, subdomain->parent->grid); int neighbor_rank = grid_rank1d(neighbor->parent->parent, neighbor->parent->grid); MPI_SAFE_CALL(MPI_Isend(subdomain->arrays[1], obuf.grid->size, MPI_BYTE, neighbor_rank, 0, MPI_COMM_WORLD, &reqs[2 * i])); MPI_SAFE_CALL(MPI_Irecv(subdomain->arrays[0], obuf.grid->size, MPI_BYTE, neighbor_rank, 0, MPI_COMM_WORLD, &reqs[2 * i + 1])); #ifdef VERBOSE printf("sharing: send %d->%d\n", subdomain_rank, neighbor_rank); printf("sharing: recv %d->%d\n", neighbor_rank, subdomain_rank); #endif } #endif // MPI // Compute inner grid points of the subdomain. int nx = t->cpu.grid->bx + t->cpu.grid->nx + t->cpu.grid->ex; int ny = t->cpu.grid->by + t->cpu.grid->ny + t->cpu.grid->ey; int ns = t->cpu.grid->bs + t->cpu.grid->ns + t->cpu.grid->es; if (cpu) { isum13pt_cpu(nx, ny, ns, (integer(*)[ny][nx])t->cpu.arrays[0], (integer(*)[ny][nx])t->cpu.arrays[1], (integer(*)[ny][nx])t->cpu.arrays[2]); } #ifdef CUDA if (gpu) { isum13pt_gpu(nx, ny, ns, (integer*)t->gpu.arrays[0], (integer*)t->gpu.arrays[1], (integer*)t->gpu.arrays[2]); #ifdef VISUALIZE #ifndef CUDA_MAPPED // If GPU is not using mapped host memory, then need to fetch // the current iteration solution explicitly. // TODO: in case of MPI/CUDA/!MAPPED this copy must go AFTER // boundaries gathering. CUDA_SAFE_CALL(cudaMemcpy(t->cpu.arrays[2], t->gpu.arrays[2], t->gpu.grid->extsize * sizeof(real), cudaMemcpyDeviceToHost)); #endif // CUDA_MAPPED #endif } #endif // CUDA #ifdef MPI // Wait for boundaries sharing completion. 
MPI_Status* statuses = (MPI_Status*)malloc(2 * nsubdomains * sizeof(MPI_Status)); MPI_SAFE_CALL(MPI_Waitall(2 * nsubdomains, reqs, statuses)); for (int i = 0; i < 2 * nsubdomains; i++) MPI_SAFE_CALL(statuses[i].MPI_ERROR); free(statuses); free(reqs); for (int i = 0; i < nsubdomains; i++) { struct grid_domain_t* subdomain = subdomains + i; int szelem = sizeof(integer); size_t dnx = subdomain->grid[1].nx * szelem; size_t dbx = subdomain->grid[1].bx * szelem; size_t dex = subdomain->grid[1].ex * szelem; size_t dny = subdomain->grid[1].ny, dns = subdomain->grid[1].ns; size_t dby = subdomain->grid[1].by, dbs = subdomain->grid[1].bs; size_t dey = subdomain->grid[1].ey, des = subdomain->grid[1].es; size_t doffset = dbx + (dbx + dnx + dex) * (dby + dbs * (dby + dny + dey)); struct grid_domain_t dcpy = *subdomain; dcpy.arrays = subdomain->arrays + 2; dcpy.narrays = 1; dcpy.offset = doffset; dcpy.grid[0].nx = dbx + dnx + dex; dcpy.grid[0].ny = dby + dny + dey; dcpy.grid[0].ns = dbs + dns + des; struct grid_domain_t ibuf; memset(&ibuf, 0, sizeof(struct grid_domain_t)); ibuf.arrays = subdomain->arrays; ibuf.narrays = 1; ibuf.offset = 0; ibuf.grid[0].nx = dnx; ibuf.grid[0].ny = dny; ibuf.grid[0].ns = dns; // Copy data to temporary buffer. grid_subcpy(dnx, dny, dns, &dcpy, &ibuf); // Swap pointers to make the last iteration in the bottom. char* w = subdomain->arrays[0]; subdomain->arrays[0] = subdomain->arrays[2]; subdomain->arrays[2] = w; } // Gather bounradies on for the next time step. Insert the // separately computed boundaries back into the sudomains // for the next time step. struct grid_domain_t target = t->cpu; target.narrays = 1; target.arrays = t->cpu.arrays + 2; grid_gather(&target, subdomains, 1, LAYOUT_MODE_CUSTOM); if (t->rank != MPI_ROOT_NODE) { #ifdef VERBOSE printf("step %d\n", it); #endif } else #endif // MPI { stenfw_get_time(&stop); printf("step %d time = ", it); stenfw_print_time_diff(start, stop); printf(" sec\n"); } #ifdef MPI if (t->rank == MPI_ROOT_NODE) #endif { // Compute inner grid points of the control solution subdomain. int nx = t_check->cpu.grid->bx + t_check->cpu.grid->nx + t_check->cpu.grid->ex; int ny = t_check->cpu.grid->by + t_check->cpu.grid->ny + t_check->cpu.grid->ey; int ns = t_check->cpu.grid->bs + t_check->cpu.grid->ns + t_check->cpu.grid->es; isum13pt_cpu(nx, ny, ns, (integer(*)[ny][nx])t_check->cpu.arrays[0], (integer(*)[ny][nx])t_check->cpu.arrays[1], (integer(*)[ny][nx])t_check->cpu.arrays[2]); } // Print the stats of difference between the solution and // the control solution. test_write_imaxabsdiff(t, t_check, 2, it); // Swap pointers to rewrite the oldest iteration with // the next one. char* w = t->cpu.arrays[0]; t->cpu.arrays[0] = t->cpu.arrays[1]; t->cpu.arrays[1] = t->cpu.arrays[2]; t->cpu.arrays[2] = w; #ifdef CUDA if (gpu) { // Also swap the corresponding GPU arrays pointers. w = t->gpu.arrays[0]; t->gpu.arrays[0] = t->gpu.arrays[1]; t->gpu.arrays[1] = t->gpu.arrays[2]; t->gpu.arrays[2] = w; } #endif #ifdef MPI if (t->rank == MPI_ROOT_NODE) #endif { // Swap pointers to rewrite the oldest control solution // iteration with the next one. char* w = t_check->cpu.arrays[0]; t_check->cpu.arrays[0] = t_check->cpu.arrays[1]; t_check->cpu.arrays[1] = t_check->cpu.arrays[2]; t_check->cpu.arrays[2] = w; } } // Dispose the test configurations. #ifdef MPI if (t->rank == MPI_ROOT_NODE) #endif { test_dispose(t_check); } test_dispose(t); return 0; }
/* * cl_update (CUDA version) */ static void update_func_cuda(void *descr[], void *arg) { struct block_description *block = arg; int workerid = starpu_worker_get_id(); DEBUG( "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"); if (block->bz == 0) fprintf(stderr,"!!! DO update_func_cuda z %d CUDA%d !!!\n", block->bz, workerid); else DEBUG( "!!! DO update_func_cuda z %d CUDA%d !!!\n", block->bz, workerid); #ifdef STARPU_USE_MPI int rank = 0; MPI_Comm_rank(MPI_COMM_WORLD, &rank); DEBUG( "!!! RANK %d !!!\n", rank); #endif DEBUG( "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"); unsigned block_size_z = get_block_size(block->bz); unsigned i; update_per_worker[workerid]++; struct timeval tv, tv2, diff, delta = {.tv_sec = 0, .tv_usec = get_ticks()*1000}; gettimeofday(&tv, NULL); timersub(&tv, &start, &tv2); timersub(&tv2, &last_tick[block->bz], &diff); while (timercmp(&diff, &delta, >=)) { timeradd(&last_tick[block->bz], &delta, &last_tick[block->bz]); timersub(&tv2, &last_tick[block->bz], &diff); if (who_runs_what_index[block->bz] < who_runs_what_len) who_runs_what[block->bz + (who_runs_what_index[block->bz]++) * get_nbz()] = -1; } if (who_runs_what_index[block->bz] < who_runs_what_len) who_runs_what[block->bz + (who_runs_what_index[block->bz]++) * get_nbz()] = global_workerid(workerid); /* * Load neighbours' boundaries : TOP */ /* The offset along the z axis is (block_size_z + K) */ load_subblock_from_buffer_cuda(descr[0], descr[2], block_size_z+K); load_subblock_from_buffer_cuda(descr[1], descr[3], block_size_z+K); /* * Load neighbours' boundaries : BOTTOM */ load_subblock_from_buffer_cuda(descr[0], descr[4], 0); load_subblock_from_buffer_cuda(descr[1], descr[5], 0); /* * Stencils ... do the actual work here :) TODO */ for (i=1; i<=K; i++) { starpu_block_interface_t *oldb = descr[i%2], *newb = descr[(i+1)%2]; TYPE *old = (void*) oldb->ptr, *new = (void*) newb->ptr; /* Shadow data */ cuda_shadow_host(block->bz, old, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i); /* And perform actual computation */ #ifdef LIFE cuda_life_update_host(block->bz, old, new, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i); #else cudaMemcpy(new, old, oldb->nx * oldb->ny * oldb->nz * sizeof(*new), cudaMemcpyDeviceToDevice); #endif /* LIFE */ } cudaError_t cures; if ((cures = cudaThreadSynchronize()) != cudaSuccess) STARPU_CUDA_REPORT_ERROR(cures); } #endif /* STARPU_USE_CUDA */ /* * cl_update (CPU version) */ static void update_func_cpu(void *descr[], void *arg) { struct block_description *block = arg; int workerid = starpu_worker_get_id(); DEBUG( "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"); if (block->bz == 0) fprintf(stderr,"!!! DO update_func_cpu z %d CPU%d !!!\n", block->bz, workerid); else DEBUG( "!!! DO update_func_cpu z %d CPU%d !!!\n", block->bz, workerid); #ifdef STARPU_USE_MPI int rank = 0; MPI_Comm_rank(MPI_COMM_WORLD, &rank); DEBUG( "!!! 
RANK %d !!!\n", rank); #endif DEBUG( "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"); unsigned block_size_z = get_block_size(block->bz); unsigned i; update_per_worker[workerid]++; struct timeval tv, tv2, diff, delta = {.tv_sec = 0, .tv_usec = get_ticks() * 1000}; gettimeofday(&tv, NULL); timersub(&tv, &start, &tv2); timersub(&tv2, &last_tick[block->bz], &diff); while (timercmp(&diff, &delta, >=)) { timeradd(&last_tick[block->bz], &delta, &last_tick[block->bz]); timersub(&tv2, &last_tick[block->bz], &diff); if (who_runs_what_index[block->bz] < who_runs_what_len) who_runs_what[block->bz + (who_runs_what_index[block->bz]++) * get_nbz()] = -1; } if (who_runs_what_index[block->bz] < who_runs_what_len) who_runs_what[block->bz + (who_runs_what_index[block->bz]++) * get_nbz()] = global_workerid(workerid); /* * Load neighbours' boundaries : TOP */ /* The offset along the z axis is (block_size_z + K) */ load_subblock_from_buffer_cpu(descr[0], descr[2], block_size_z+K); load_subblock_from_buffer_cpu(descr[1], descr[3], block_size_z+K); /* * Load neighbours' boundaries : BOTTOM */ load_subblock_from_buffer_cpu(descr[0], descr[4], 0); load_subblock_from_buffer_cpu(descr[1], descr[5], 0); /* * Stencils ... do the actual work here :) TODO */ for (i=1; i<=K; i++) { starpu_block_interface_t *oldb = descr[i%2], *newb = descr[(i+1)%2]; TYPE *old = (void*) oldb->ptr, *new = (void*) newb->ptr; /* Shadow data */ unsigned ldy = oldb->ldy, ldz = oldb->ldz; unsigned nx = oldb->nx, ny = oldb->ny, nz = oldb->nz; unsigned x, y, z; unsigned stepx = 1; unsigned stepy = 1; unsigned stepz = 1; unsigned idx = 0; unsigned idy = 0; unsigned idz = 0; TYPE *ptr = old; # include "shadow.h" /* And perform actual computation */ #ifdef LIFE life_update(block->bz, old, new, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i); #else memcpy(new, old, oldb->nx * oldb->ny * oldb->nz * sizeof(*new)); #endif /* LIFE */ } } /* Performance model and codelet structure */ static struct starpu_perfmodel_t cl_update_model = { .type = STARPU_HISTORY_BASED, .symbol = "cl_update" }; starpu_codelet cl_update = { .where = #ifdef STARPU_USE_CUDA STARPU_CUDA| #endif STARPU_CPU, .cpu_func = update_func_cpu, #ifdef STARPU_USE_CUDA .cuda_func = update_func_cuda, #endif .model = &cl_update_model, .nbuffers = 6 }; /* * Save the block internal boundaries to give them to our neighbours. */ /* CPU version */ static void load_subblock_into_buffer_cpu(starpu_block_interface_t *block, starpu_block_interface_t *boundary, unsigned firstz) { /* Sanity checks */ STARPU_ASSERT(block->nx == boundary->nx); STARPU_ASSERT(block->ny == boundary->ny); STARPU_ASSERT(boundary->nz == K); /* NB: this is not fully guaranteed ... but it's *very* likely and that * makes our life much simpler */ STARPU_ASSERT(block->ldy == boundary->ldy); STARPU_ASSERT(block->ldz == boundary->ldz); /* We do a contiguous memory transfer */ size_t boundary_size = K*block->ldz*block->elemsize; unsigned offset = firstz*block->ldz; TYPE *block_data = (TYPE *)block->ptr; TYPE *boundary_data = (TYPE *)boundary->ptr; memcpy(boundary_data, &block_data[offset], boundary_size); } /* CUDA version */ #ifdef STARPU_USE_CUDA static void load_subblock_into_buffer_cuda(starpu_block_interface_t *block, starpu_block_interface_t *boundary, unsigned firstz) { /* Sanity checks */ STARPU_ASSERT(block->nx == boundary->nx); STARPU_ASSERT(block->ny == boundary->ny); STARPU_ASSERT(boundary->nz == K); /* NB: this is not fully guaranteed ...
but it's *very* likely and that * makes our life much simpler */ STARPU_ASSERT(block->ldy == boundary->ldy); STARPU_ASSERT(block->ldz == boundary->ldz); /* We do a contiguous memory transfer */ size_t boundary_size = K*block->ldz*block->elemsize; unsigned offset = firstz*block->ldz; TYPE *block_data = (TYPE *)block->ptr; TYPE *boundary_data = (TYPE *)boundary->ptr; cudaMemcpy(boundary_data, &block_data[offset], boundary_size, cudaMemcpyDeviceToDevice); } #endif /* STARPU_USE_CUDA */ /* Record how many top/bottom saves each worker performed */ unsigned top_per_worker[STARPU_NMAXWORKERS]; unsigned bottom_per_worker[STARPU_NMAXWORKERS]; /* top save, CPU version */ static void dummy_func_top_cpu(void *descr[] __attribute__((unused)), void *arg) { struct block_description *block = arg; int workerid = starpu_worker_get_id(); top_per_worker[workerid]++; DEBUG( "DO SAVE Bottom block %d\n", block->bz); /* The offset along the z axis is (block_size_z + K)- K */ unsigned block_size_z = get_block_size(block->bz); load_subblock_into_buffer_cpu(descr[0], descr[2], block_size_z); load_subblock_into_buffer_cpu(descr[1], descr[3], block_size_z); } /* bottom save, CPU version */ static void dummy_func_bottom_cpu(void *descr[] __attribute__((unused)), void *arg) { struct block_description *block = arg; int workerid = starpu_worker_get_id(); bottom_per_worker[workerid]++; DEBUG( "DO SAVE Top block %d\n", block->bz); load_subblock_into_buffer_cpu(descr[0], descr[2], K); load_subblock_into_buffer_cpu(descr[1], descr[3], K); } /* top save, CUDA version */ #ifdef STARPU_USE_CUDA static void dummy_func_top_cuda(void *descr[] __attribute__((unused)), void *arg) { struct block_description *block = arg; int workerid = starpu_worker_get_id(); top_per_worker[workerid]++; DEBUG( "DO SAVE Top block %d\n", block->bz); /* The offset along the z axis is (block_size_z + K)- K */ unsigned block_size_z = get_block_size(block->bz); load_subblock_into_buffer_cuda(descr[0], descr[2], block_size_z); load_subblock_into_buffer_cuda(descr[1], descr[3], block_size_z); cudaThreadSynchronize(); } /* bottom save, CUDA version */ static void dummy_func_bottom_cuda(void *descr[] __attribute__((unused)), void *arg) { struct block_description *block = arg; int workerid = starpu_worker_get_id(); bottom_per_worker[workerid]++; DEBUG( "DO SAVE Bottom block %d on CUDA\n", block->bz); load_subblock_into_buffer_cuda(descr[0], descr[2], K); load_subblock_into_buffer_cuda(descr[1], descr[3], K); cudaThreadSynchronize(); } #endif /* STARPU_USE_CUDA */ /* Performance models and codelet for save */ static struct starpu_perfmodel_t save_cl_bottom_model = { .type = STARPU_HISTORY_BASED, .symbol = "save_cl_bottom" }; static struct starpu_perfmodel_t save_cl_top_model = { .type = STARPU_HISTORY_BASED, .symbol = "save_cl_top" }; starpu_codelet save_cl_bottom = { .where = #ifdef STARPU_USE_CUDA STARPU_CUDA| #endif STARPU_CPU, .cpu_func = dummy_func_bottom_cpu, #ifdef STARPU_USE_CUDA .cuda_func = dummy_func_bottom_cuda, #endif .model = &save_cl_bottom_model, .nbuffers = 4 }; starpu_codelet save_cl_top = { .where = #ifdef STARPU_USE_CUDA STARPU_CUDA| #endif STARPU_CPU, .cpu_func = dummy_func_top_cpu, #ifdef STARPU_USE_CUDA .cuda_func = dummy_func_top_cuda, #endif .model = &save_cl_top_model, .nbuffers = 4 };
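/*
 * Note on the CUDA codelets above: cudaThreadSynchronize() is deprecated in
 * newer CUDA toolkits; cudaDeviceSynchronize() is the drop-in replacement.
 * A minimal sketch of the same error check with the current API (hypothetical
 * helper name, shown only for illustration):
 */
#ifdef STARPU_USE_CUDA
static void synchronize_device_sketch(void)
{
	cudaError_t cures = cudaDeviceSynchronize();
	if (cures != cudaSuccess)
		STARPU_CUDA_REPORT_ERROR(cures);
}
#endif /* STARPU_USE_CUDA */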
int main (int argc, char *argv[]) { int rank, nprocs, ilen; char processor[MPI_MAX_PROCESSOR_NAME]; double tstart = 0.0, tend = 0.0; MPI_Status reqstat; MPI_Request send_request; MPI_Request recv_request; MPI_Init(&argc, &argv); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Get_processor_name(processor, &ilen); if (nprocs != 2) { if(rank == 0) printf("This test requires exactly two processes\n"); MPI_Finalize(); exit(EXIT_FAILURE); } int other_proc = (rank == 1 ? 0 : 1); // Hard code GPU affinity since this example only works with 2 GPUs. int igpu = 0; if(rank == 0 ) printf("%s allocates %d MB pinned memory with regular MPI and " "bidirectional bandwidth\n", argv[0], MAX_MSG_SIZE / 1024 / 1024); printf("node=%d(%s): my other_proc = %d and using GPU=%d\n", rank, processor, other_proc, igpu); char *h_src, *h_rcv; CHECK(cudaSetDevice(igpu)); CHECK(cudaMallocHost((void**)&h_src, MYBUFSIZE)); CHECK(cudaMallocHost((void**)&h_rcv, MYBUFSIZE)); char *d_src, *d_rcv; CHECK(cudaSetDevice(igpu)); CHECK(cudaMalloc((void **)&d_src, MYBUFSIZE)); CHECK(cudaMalloc((void **)&d_rcv, MYBUFSIZE)); initalData(h_src, h_rcv, MYBUFSIZE); CHECK(cudaMemcpy(d_src, h_src, MYBUFSIZE, cudaMemcpyDefault)); CHECK(cudaMemcpy(d_rcv, h_rcv, MYBUFSIZE, cudaMemcpyDefault)); // latency test for(int size = 1; size <= MAX_MSG_SIZE; size = size * 2) { MPI_Barrier(MPI_COMM_WORLD); if(rank == 0) { tstart = MPI_Wtime(); for(int i = 0; i < loop; i++) { /* * Transfer data from the GPU to the host to be transmitted to * the other MPI process. */ CHECK(cudaMemcpy(h_src, d_src, size, cudaMemcpyDeviceToHost)); MPI_Send(h_src, size, MPI_CHAR, other_proc, 100, MPI_COMM_WORLD); MPI_Recv(h_rcv, size, MPI_CHAR, other_proc, 10, MPI_COMM_WORLD, &reqstat); // bi-directional transmission /* MPI_Send(h_src, size, MPI_CHAR, other_proc, 100, MPI_COMM_WORLD); MPI_Recv(h_rcv, size, MPI_CHAR, other_proc, 10, MPI_COMM_WORLD, &reqstat); */ // MPI_Waitall(1, &recv_request, &reqstat); // MPI_Waitall(1, &send_request, &reqstat); /* * Transfer the data received from the other MPI process to * the device. */ CHECK(cudaMemcpy(d_rcv, h_rcv, size, cudaMemcpyHostToDevice)); } tend = MPI_Wtime(); } else { for(int i = 0; i < loop; i++) { /* * Transfer data from the GPU to the host to be transmitted to * the other MPI process. CHECK(cudaMemcpy(d_rcv, h_rcv, size, cudaMemcpyHostToDevice)); */ CHECK(cudaMemcpy(h_src, d_src, size, cudaMemcpyDeviceToHost)); MPI_Recv(h_rcv, size, MPI_CHAR, other_proc, 100, MPI_COMM_WORLD, &reqstat); MPI_Send(h_src, size, MPI_CHAR, other_proc, 10, MPI_COMM_WORLD); CHECK(cudaMemcpy(d_rcv, h_rcv, size, cudaMemcpyHostToDevice)); // bi-directional transmission /* MPI_Recv(h_rcv, size, MPI_CHAR, other_proc, 100, MPI_COMM_WORLD, &reqstat); MPI_Send(h_src, size, MPI_CHAR, other_proc, 10, MPI_COMM_WORLD); */ //MPI_Waitall(1, &recv_request, &reqstat); //MPI_Waitall(1, &send_request, &reqstat); /* * Transfer the data received from the other MPI process to * the device. */ } } MPI_Barrier(MPI_COMM_WORLD); if(rank == 0) { double latency = (tend - tstart) * 1e6 / (2.0 * loop); float performance = (float) size / (float) latency; printf("%6d %s %10.2f μs %10.2f MB/sec\n", (size >= 1024 * 1024) ? size / 1024 / 1024 : size / 1024, (size >= 1024 * 1024) ? "MB" : "KB", latency, performance); fflush(stdout); } } CHECK(cudaFreeHost(h_src)); CHECK(cudaFreeHost(h_rcv)); CHECK(cudaSetDevice(igpu)); CHECK(cudaFree(d_src)); CHECK(cudaFree(d_rcv)); MPI_Finalize(); return EXIT_SUCCESS; }
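/*
 * The latency test above stages every message through pinned host buffers
 * (cudaMemcpy before MPI_Send and after MPI_Recv). With a CUDA-aware MPI
 * library (an assumption about the MPI build, not something the example
 * requires), the device buffers could be handed to MPI directly. A minimal
 * sketch of one rank-0 iteration reusing the variable names from the test
 * (pingpong_cuda_aware is a hypothetical helper):
 */
static void pingpong_cuda_aware(char *d_src, char *d_rcv, int size,
                                int other_proc, int loop)
{
	MPI_Status reqstat;
	for (int i = 0; i < loop; i++) {
		/* Device pointers passed straight to MPI; no host staging copies. */
		MPI_Send(d_src, size, MPI_CHAR, other_proc, 100, MPI_COMM_WORLD);
		MPI_Recv(d_rcv, size, MPI_CHAR, other_proc, 10, MPI_COMM_WORLD, &reqstat);
	}
}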
void update_kt_factor( value_type* host_kt_factor ) { cuda_assert( cudaMemcpy( reinterpret_cast<void*>(data.kt_factor), reinterpret_cast<const void*>(host_kt_factor), config.tilt_size*sizeof(value_type)*3, cudaMemcpyHostToDevice ) ); }
void copy_host_to_device(const size_t size, double *h_input,double *h_output,double *d_input,double *d_output){ CHECK_CUDA(cudaMemcpy(d_output, h_output, size, cudaMemcpyHostToDevice)); CHECK_CUDA(cudaMemcpy(d_input, h_input, size, cudaMemcpyHostToDevice)); }
psaError_t transferGPUtoCPU(alignments_t *alignments) { CUDA_ERROR(cudaMemcpy(alignments->h_results, alignments->d_results, alignments->num * sizeof(alignmentEntry_t), cudaMemcpyDeviceToHost)); return (SUCCESS); }
int _tmain(int argc, _TCHAR* argv[]) { uchar4 *h_inputImageRGBA, *d_inputImageRGBA; uchar4 *h_outputImageRGBA, *d_outputImageRGBA; unsigned char *d_redBlurred, *d_greenBlurred, *d_blueBlurred; float *h_filter; int filterWidth; //PreProcess const std::string *filename = new std::string("./cinque_terre_small.jpg"); cv::Mat imageInputRGBA; cv::Mat imageOutputRGBA; //make sure the context initializes ok checkCudaErrors(cudaFree(0)); cv::Mat image = cv::imread(filename->c_str(), CV_LOAD_IMAGE_COLOR); if (image.empty()) { std::cerr << "Couldn't open file: " << *filename << std::endl; cv::waitKey(0); exit(1); } cv::cvtColor(image, imageInputRGBA, CV_BGR2RGBA); //allocate memory for the output imageOutputRGBA.create(image.rows, image.cols, CV_8UC4); //This shouldn't ever happen given the way the images are created //at least based upon my limited understanding of OpenCV, but better to check if (!imageInputRGBA.isContinuous() || !imageOutputRGBA.isContinuous()) { std::cerr << "Images aren't continuous!! Exiting." << std::endl; exit(1); } h_inputImageRGBA = (uchar4 *)imageInputRGBA.ptr<unsigned char>(0); h_outputImageRGBA = (uchar4 *)imageOutputRGBA.ptr<unsigned char>(0); const size_t numPixels = image.rows * image.cols; //allocate memory on the device for both input and output checkCudaErrors(cudaMalloc(&d_inputImageRGBA, sizeof(uchar4) * numPixels)); checkCudaErrors(cudaMalloc(&d_outputImageRGBA, sizeof(uchar4) * numPixels)); checkCudaErrors(cudaMemset(d_outputImageRGBA, 0, numPixels * sizeof(uchar4))); //make sure no memory is left lying around //copy input array to the GPU checkCudaErrors(cudaMemcpy(d_inputImageRGBA, h_inputImageRGBA, sizeof(uchar4) * numPixels, cudaMemcpyHostToDevice)); //now create the filter that they will use const int blurKernelWidth = 9; const float blurKernelSigma = 2.; filterWidth = blurKernelWidth; //create and fill the filter we will convolve with h_filter = new float[blurKernelWidth * blurKernelWidth]; float filterSum = 0.f; //for normalization for (int r = -blurKernelWidth/2; r <= blurKernelWidth/2; ++r) { for (int c = -blurKernelWidth/2; c <= blurKernelWidth/2; ++c) { float filterValue = expf( -(float)(c * c + r * r) / (2.f * blurKernelSigma * blurKernelSigma)); h_filter[(r + blurKernelWidth/2) * blurKernelWidth + c + blurKernelWidth/2] = filterValue; filterSum += filterValue; } } float normalizationFactor = 1.f / filterSum; for (int r = -blurKernelWidth/2; r <= blurKernelWidth/2; ++r) { for (int c = -blurKernelWidth/2; c <= blurKernelWidth/2; ++c) { h_filter[(r + blurKernelWidth/2) * blurKernelWidth + c + blurKernelWidth/2] *= normalizationFactor; } } //blurred checkCudaErrors(cudaMalloc(&d_redBlurred, sizeof(unsigned char) * numPixels)); checkCudaErrors(cudaMalloc(&d_greenBlurred, sizeof(unsigned char) * numPixels)); checkCudaErrors(cudaMalloc(&d_blueBlurred, sizeof(unsigned char) * numPixels)); checkCudaErrors(cudaMemset(d_redBlurred, 0, sizeof(unsigned char) * numPixels)); checkCudaErrors(cudaMemset(d_greenBlurred, 0, sizeof(unsigned char) * numPixels)); checkCudaErrors(cudaMemset(d_blueBlurred, 0, sizeof(unsigned char) * numPixels)); allocateMemoryAndCopyToGPU(image.rows, image.cols, h_filter, filterWidth); GpuTimer timer; timer.Start(); //call the students' code your_gaussian_blur(h_inputImageRGBA, d_inputImageRGBA, d_outputImageRGBA, image.rows, image.cols, d_redBlurred, d_greenBlurred, d_blueBlurred, filterWidth); timer.Stop(); cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError()); int err = printf("%f msecs.\n", timer.Elapsed()); if (err < 0) {
//Couldn't print! Probably the student closed stdout - bad news std::cerr << "Couldn't print timing information! STDOUT Closed!" << std::endl; exit(1); } cleanup(); //check results and output the blurred image //PostProcess //copy the output back to the host checkCudaErrors(cudaMemcpy(imageOutputRGBA.ptr<unsigned char>(0), d_outputImageRGBA, sizeof(uchar4) * numPixels, cudaMemcpyDeviceToHost)); cv::Mat imageOutputBGR; cv::cvtColor(imageOutputRGBA, imageOutputBGR, CV_RGBA2BGR); //output the image cv::imwrite("./blurredResult.jpg", imageOutputBGR); cv::namedWindow( "Display window", CV_WINDOW_NORMAL); cv::imshow("Display window", imageOutputBGR); cv::waitKey(0); checkCudaErrors(cudaFree(d_redBlurred)); checkCudaErrors(cudaFree(d_greenBlurred)); checkCudaErrors(cudaFree(d_blueBlurred)); return 0; }
//------------------------------------------------------- //copy a buffer from host memory to device memory // //param : des //param : src //param : size //------------------------------------------------------- void D_MEMCPY_H2D(void *des, void *src, size_t size) { CUDA_SAFE_CALL(cudaMemcpy(des, src, size, cudaMemcpyHostToDevice)); }
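/*
 * The CHECK / CHECK_CUDA / CUDA_SAFE_CALL / checkCudaErrors macros used by the
 * wrappers above are defined elsewhere in the sources. A minimal sketch of such
 * an error-checking wrapper (hypothetical name CUDA_CHECK_SKETCH, shown only for
 * illustration of the pattern):
 */
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

#define CUDA_CHECK_SKETCH(call)                                           \
	do {                                                              \
		cudaError_t err__ = (call);                               \
		if (err__ != cudaSuccess) {                               \
			fprintf(stderr, "CUDA error '%s' at %s:%d\n",     \
			        cudaGetErrorString(err__),                \
			        __FILE__, __LINE__);                      \
			exit(EXIT_FAILURE);                               \
		}                                                         \
	} while (0)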
void testCusolver(int rows, int cols, int nnz, int *row_ptr, int *col_index, double *values, double *valuesB){ // --- Initialize cuSolver and cuSPARSE cusolverSpHandle_t cusolver_handle = NULL; checkCudaErrors(cusolverSpCreate(&cusolver_handle)); cusparseHandle_t cusparse_handle = NULL; checkCudaErrors(cusparseCreate(&cusparse_handle)); cudaStream_t cudaStream = NULL; checkCudaErrors(cudaStreamCreate(&cudaStream)); checkCudaErrors(cusolverSpSetStream(cusolver_handle, cudaStream)); checkCudaErrors(cusparseSetStream(cusparse_handle, cudaStream)); cusparseMatDescr_t descrA; checkCudaErrors(cusparseCreateMatDescr(&descrA)); checkCudaErrors(cusparseSetMatType (descrA, CUSPARSE_MATRIX_TYPE_GENERAL)); checkCudaErrors(cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ONE)); double start, stop, time_to_solve; start = second(); // --- Device side dense matrix printf("\nAlloc GPU memory...\n"); double *d_A; checkCudaErrors(cudaMalloc(&d_A, nnz * sizeof(double))); int *d_A_RowIndices; checkCudaErrors(cudaMalloc(&d_A_RowIndices, (rows + 1) * sizeof(int))); int *d_A_ColIndices; checkCudaErrors(cudaMalloc(&d_A_ColIndices, nnz * sizeof(int))); double *d_x; checkCudaErrors(cudaMalloc(&d_x, rows * sizeof(double))); checkCudaErrors(cudaMemcpy(d_A, values, nnz * sizeof(double), cudaMemcpyHostToDevice)); checkCudaErrors(cudaMemcpy(d_A_RowIndices, row_ptr, (rows + 1) * sizeof(int), cudaMemcpyHostToDevice)); checkCudaErrors(cudaMemcpy(d_A_ColIndices, col_index, nnz * sizeof(int), cudaMemcpyHostToDevice)); double *d_b; checkCudaErrors(cudaMalloc(&d_b, rows * sizeof(double))); checkCudaErrors(cudaMemcpy(d_b, valuesB, rows * sizeof(double), cudaMemcpyHostToDevice)); double *h_x = (double *)malloc(rows * sizeof(double)); double tol = 1.e-12; int reorder = 0; int singularity = 0; printf("\nProcessing in GPU using cusolver QR...\n"); //checkCudaErrors(cusolverSpDcsrlsvluHost(cusolver_handle, Nrows, nnz, descrA, sparse.Values(), // sparse.RowPtr(), sparse.ColIdx(), mB.values, tol, reorder, h_x, &singularity)); checkCudaErrors(cusolverSpDcsrlsvqr(cusolver_handle, rows, nnz, descrA, d_A, d_A_RowIndices, d_A_ColIndices, d_b, tol, reorder, d_x, &singularity)); checkCudaErrors(cudaDeviceSynchronize()); stop = second(); time_to_solve = stop - start; checkCudaErrors(cudaMemcpy(h_x, d_x, rows * sizeof(double), cudaMemcpyDeviceToHost)); double minusOne = -1.0; double one = 1.0; double *d_r; checkCudaErrors(cudaMalloc((void **)&d_r, sizeof(double)*rows)); checkCudaErrors(cudaMemcpy(d_r, d_b, sizeof(double)*rows, cudaMemcpyDeviceToDevice)); checkCudaErrors(cusparseDcsrmv(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, rows, cols, nnz, &minusOne, descrA, d_A, d_A_RowIndices, d_A_ColIndices, d_x, &one, d_r)); double *h_r; h_r = (double*) malloc(rows * sizeof(double)); checkCudaErrors(cudaMemcpy(h_r, d_r, rows * sizeof(double), cudaMemcpyDeviceToHost)); double r_inf = vec_norminf(rows, h_r); printf("(GPU - cuSolver) Time (sec): %f\n", time_to_solve); printf("(GPU - cuSolver) |b - A*x| = %E \n", r_inf); checkCudaErrors(cusparseDestroy(cusparse_handle)); checkCudaErrors(cusolverSpDestroy(cusolver_handle)); checkCudaErrors(cudaStreamDestroy(cudaStream)); checkCudaErrors(cudaFree(d_b)); checkCudaErrors(cudaFree(d_x)); checkCudaErrors(cudaFree(d_r)); checkCudaErrors(cudaFree(d_A)); checkCudaErrors(cudaFree(d_A_RowIndices)); checkCudaErrors(cudaFree(d_A_ColIndices)); free(h_x); free(h_r); }
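/*
 * cusolverSpDcsrlsvqr reports rank deficiency through its last argument; the call
 * above stores it in 'singularity' but never inspects it. To my understanding of
 * the cuSOLVER documentation, -1 means the matrix is non-singular under the given
 * tolerance, so a check placed right after the solve could look like the sketch
 * below (report_singularity is a hypothetical helper, not part of the code above):
 */
#include <stdio.h>

static void report_singularity(int singularity, double tol)
{
	/* -1 means the matrix was found non-singular under tol. */
	if (singularity >= 0)
		fprintf(stderr, "WARNING: the matrix is singular at row %d (tol = %E)\n",
		        singularity, tol);
}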
int main(int argc, char* argv[]) { double AllTime=0; double x0=0.0, y0=0.0, z0=0.0; double xn=10.0, yn=10.0, zn=10.0; int Sx=100, Sy=100, Sz=100, St=100; if (argc>1) Sx=Sy=Sz=atoi(argv[1]); double * masprev; double * masnext; masprev=new double[Sx*Sy*Sz]; masnext=new double[Sx*Sy*Sz]; double dx=(xn-x0)/Sx, dy=(yn-y0)/Sy, dz=(zn-z0)/Sz; FILE* filex=fopen("filex.txt","w"); FILE* filey=fopen("filey.txt","w"); FILE* filez=fopen("filez.txt","w"); fprintf(filex,"%d %f %f\n",Sx,x0,xn); fprintf(filey,"%d %f %f\n",Sy,y0,yn); fprintf(filez,"%d %f %f\n",Sz,z0,zn); fprintf(filex,"%f %f\n",4.0,0.0); fprintf(filey,"%f %f\n",4.0,0.0); fprintf(filez,"%f %f\n",4.0,0.0); double dt=0.000001; /* choose dt */ memset(masprev, 0, Sx*Sy*Sz*sizeof(double)); memset(masnext, 0, Sx*Sy*Sz*sizeof(double)); for (int x=1; x<Sx-1; x++) for(int y=1; y<Sy-1; y++) for(int z=1; z<Sz-1; z++) masprev[x+y*Sx+z*Sx*Sy]=u(x0+dx*x, y0+dy*y, z0+dz*z); int maxi = 0; double max = 0.0; for( int i = 0; i < Sx*Sy*Sz; i ++ ){ if( masprev[i] > max ){ max = masprev[i]; maxi = i; } } printf("i=%d, max=%f\n", maxi, max); for(int x=1; x<Sx-1; x++) fprintf(filex,"%f ", masprev[x+25*(Sx)+25*(Sx)*(Sy)]); for(int y=1; y<Sy-1; y++) fprintf(filey,"%f ", masprev[25+y*(Sx)+25*(Sx)*(Sy)]); /* write the initial slices to the files */ for(int z=1; z<Sz-1; z++) fprintf(filez,"%f ", masprev[25+25*(Sx)+z*(Sx)*(Sy)]); fprintf(filex,"\n"); fprintf(filey,"\n"); fprintf(filez,"\n"); /* start a new line in each file */ double Time; double _Time; double *dev_a,*dev_b; cudaMalloc( (void**)&dev_a, Sx*Sy*Sz*sizeof(double)); cudaMalloc( (void**)&dev_b, Sx*Sy*Sz*sizeof(double)); cudaMemset(dev_a,0, Sx*Sy*Sz*sizeof(double)); cudaMemset(dev_b,0,Sx*Sy*Sz*sizeof(double)); cudaMemcpy(dev_a,masprev,Sx*Sy*Sz*sizeof(double),cudaMemcpyHostToDevice); double ddx=1.0; /*(dx*dx)*/ double ddy=1.0; /*(dy*dy)*/ double ddz=1.0; /*(dz*dz)*/ Time=PortableGetTime(); for (int t=1; t<St; t++) { StartCuda(dev_a,dev_b,Sx, Sy, Sz,dx,dy,dz,x0,y0,z0,dt,ddx,ddy,ddz); cudaMemcpy(masprev,dev_b,Sx*Sy*Sz*sizeof(double),cudaMemcpyDeviceToHost); double* tmp=dev_b; dev_b=dev_a; dev_a=tmp; for(int x=1; x<Sx-1; x++) fprintf(filex,"%f ", masprev[x+25*(Sx)+25*(Sx)*(Sy)]); for(int y=1; y<Sy-1; y++) fprintf(filey,"%f ", masprev[25+y*(Sx)+25*(Sx)*(Sy)]); /* write the current slices to the files */ for(int z=1; z<Sz-1; z++) fprintf(filez,"%f ", masprev[25+25*(Sx)+z*(Sx)*(Sy)]); fprintf(filex,"\n"); fprintf(filey,"\n"); fprintf(filez,"\n"); /* start a new line in each file */ } for(int x=1; x<Sx-1; x++) fprintf(filex,"%f ", masprev[x+25*(Sx)+25*(Sx)*(Sy)]); for(int y=1; y<Sy-1; y++) fprintf(filey,"%f ", masprev[25+y*(Sx)+25*(Sx)*(Sy)]); /* write the final slices to the files */ for(int z=1; z<Sz-1; z++) fprintf(filez,"%f ", masprev[25+25*(Sx)+z*(Sx)*(Sy)]); fprintf(filex,"\n"); fprintf(filey,"\n"); fprintf(filez,"\n"); /* start a new line in each file */ _Time=PortableGetTime(); AllTime=_Time-Time; printf(" %lf \n",AllTime); system("PAUSE"); fclose(filex); fclose(filey); fclose(filez); delete[] masprev; delete[] masnext; cudaFree(dev_a); cudaFree(dev_b); return 0; }
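/*
 * The solver above addresses the 3D grid through the flattened index
 * x + y*Sx + z*Sx*Sy. A small helper (hypothetical name idx3d, not used by the
 * code above) makes that row-major mapping explicit:
 */
static inline size_t idx3d(int x, int y, int z, int Sx, int Sy)
{
	/* Layout shared by masprev/masnext and the device buffers dev_a/dev_b. */
	return (size_t)x + (size_t)y * Sx + (size_t)z * Sx * Sy;
}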