Example #1
void step3_gpu(int *n) {

  int nprocs, procid;
  MPI_Comm_rank(MPI_COMM_WORLD, &procid);
  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

  /* Create Cartesian Communicator */
  int c_dims[2]={0};
  MPI_Comm c_comm;
  accfft_create_comm(MPI_COMM_WORLD,c_dims,&c_comm);

  Complexf *data, *data_cpu;
  Complexf *data_hat;
  double f_time=0, i_time=0, setup_time=0;
  int alloc_max=0;

  int isize[3],osize[3],istart[3],ostart[3];
  /* Get the local pencil size and the allocation size */
  alloc_max=accfft_local_size_dft_c2c_gpuf(n,isize,istart,osize,ostart,c_comm);

#ifdef INPLACE
  data_cpu=(Complexf*)malloc(alloc_max);
  cudaMalloc((void**) &data, alloc_max);
#else
  data_cpu=(Complexf*)malloc(isize[0]*isize[1]*isize[2]*2*sizeof(float));
  cudaMalloc((void**) &data,isize[0]*isize[1]*isize[2]*2*sizeof(float));
  cudaMalloc((void**) &data_hat, alloc_max);
#endif

  //accfft_init(nthreads);
  setup_time=-MPI_Wtime();

  /* Create FFT plan */
#ifdef INPLACE
  accfft_plan_gpuf * plan=accfft_plan_dft_3d_c2c_gpuf(n,data,data,c_comm,ACCFFT_MEASURE);
#else
  accfft_plan_gpuf * plan=accfft_plan_dft_3d_c2c_gpuf(n,data,data_hat,c_comm,ACCFFT_MEASURE);
#endif
  setup_time+=MPI_Wtime();

  /* Warmup Runs */
#ifdef INPLACE
  accfft_execute_c2c_gpuf(plan,ACCFFT_FORWARD,data,data);
  accfft_execute_c2c_gpuf(plan,ACCFFT_FORWARD,data,data);
#else
  accfft_execute_c2c_gpuf(plan,ACCFFT_FORWARD,data,data_hat);
  accfft_execute_c2c_gpuf(plan,ACCFFT_FORWARD,data,data_hat);
#endif

  /*  Initialize data */
  initialize(data_cpu,n,c_comm);
#ifdef INPLACE
  cudaMemcpy(data, data_cpu,alloc_max, cudaMemcpyHostToDevice);
#else
  cudaMemcpy(data, data_cpu,isize[0]*isize[1]*isize[2]*2*sizeof(float), cudaMemcpyHostToDevice);
#endif

  MPI_Barrier(c_comm);


  /* Perform forward FFT */
  f_time-=MPI_Wtime();
#ifdef INPLACE
  accfft_execute_c2c_gpuf(plan,ACCFFT_FORWARD,data,data);
#else
  accfft_execute_c2c_gpuf(plan,ACCFFT_FORWARD,data,data_hat);
#endif
  f_time+=MPI_Wtime();

  MPI_Barrier(c_comm);

#ifndef INPLACE
  Complexf *data2_cpu, *data2;
  cudaMalloc((void**) &data2, isize[0]*isize[1]*isize[2]*2*sizeof(float));
  data2_cpu=(Complexf*) malloc(isize[0]*isize[1]*isize[2]*2*sizeof(float));
#endif

  /* Perform backward FFT */
  i_time-=MPI_Wtime();
#ifdef INPLACE
  accfft_execute_c2c_gpuf(plan,ACCFFT_BACKWARD,data,data);
#else
  accfft_execute_c2c_gpuf(plan,ACCFFT_BACKWARD,data_hat,data2);
#endif
  i_time+=MPI_Wtime();

  /* Copy results back to the CPU and check the error */
#ifdef INPLACE
  cudaMemcpy(data_cpu, data, alloc_max, cudaMemcpyDeviceToHost);
  check_err(data_cpu,n,c_comm);
#else
  cudaMemcpy(data2_cpu, data2, isize[0]*isize[1]*isize[2]*2*sizeof(float), cudaMemcpyDeviceToHost);
  check_err(data2_cpu,n,c_comm);
#endif


  /* Compute some timing statistics */
  double g_f_time, g_i_time, g_setup_time;
  MPI_Reduce(&f_time,&g_f_time,1, MPI_DOUBLE, MPI_MAX,0, MPI_COMM_WORLD);
  MPI_Reduce(&i_time,&g_i_time,1, MPI_DOUBLE, MPI_MAX,0, MPI_COMM_WORLD);
  MPI_Reduce(&setup_time,&g_setup_time,1, MPI_DOUBLE, MPI_MAX,0, MPI_COMM_WORLD);

#ifdef INPLACE
  PCOUT<<"GPU Timing for Inplace FFT of size "<<n[0]<<"*"<<n[1]<<"*"<<n[2]<<std::endl;
#else
  PCOUT<<"GPU Timing for Outplace FFT of size "<<n[0]<<"*"<<n[1]<<"*"<<n[2]<<std::endl;
#endif
  PCOUT<<"Setup \t"<<g_setup_time<<std::endl;
  PCOUT<<"FFT \t"<<g_f_time<<std::endl;
  PCOUT<<"IFFT \t"<<g_i_time<<std::endl;

  MPI_Barrier(c_comm);
  cudaDeviceSynchronize();
  free(data_cpu);
  cudaFree(data);
#ifndef INPLACE
  cudaFree(data_hat);
  free(data2_cpu);
  cudaFree(data2);
#endif
  accfft_destroy_plan_gpu(plan);
  accfft_cleanup_gpuf();
  MPI_Comm_free(&c_comm);
  return;

} // end step3_gpu
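
The cudaMalloc and cudaMemcpy calls in this example ignore their return codes. Below is a minimal sketch of an error-checking macro in the spirit of the CHECK_CUDA used in the next example; the exact definition these projects use is an assumption.

/* Minimal error-checking macro sketch; the exact definition used by
   these projects is an assumption. */
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

#define CHECK_CUDA(call)                                            \
  do {                                                              \
    cudaError_t err = (call);                                       \
    if (err != cudaSuccess) {                                       \
      fprintf(stderr, "CUDA error %s at %s:%d\n",                   \
              cudaGetErrorString(err), __FILE__, __LINE__);         \
      exit(EXIT_FAILURE);                                           \
    }                                                               \
  } while (0)

/* Usage: CHECK_CUDA(cudaMalloc((void**)&data, alloc_max)); */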
Example #2
void copy_device_to_host(const size_t size, double *h_input, double *h_output, double *d_input, double *d_output) {

        CHECK_CUDA(cudaMemcpy(h_output, d_output, size, cudaMemcpyDeviceToHost));
        CHECK_CUDA(cudaMemcpy(h_input, d_input, size, cudaMemcpyDeviceToHost));
}
Example #3
cudaError_t WINAPI wine_cudaMemcpy(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind) {
    WINE_TRACE("\n");
    return cudaMemcpy(dst, src, count, kind);
}
Example #4
void init_arrays(Arrays *arr, FLOAT_TYPE** cu_F,
		 Command_line_opts *opts, Detector_settings *sett)
{

  // Allocates and initializes to zero the data, detector ephemeris
  // and the F-statistic arrays

//  arr->xDat = (double *) calloc (sett->N, sizeof (double));
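  // cudaMallocHost allocates page-locked (pinned) host memory, which speeds up the host<->device cudaMemcpy transfers below.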
  CudaSafeCall( cudaMallocHost((void**)&arr->xDat, sizeof(double)*sett->N));
  CudaSafeCall ( cudaMalloc((void**)&arr->cu_xDat, sizeof(double)*sett->N));

//  arr->DetSSB = (double *) calloc (3*sett->N, sizeof (double));
  CudaSafeCall( cudaMallocHost((void**)&arr->DetSSB, sizeof(double)*3*sett->N) );
  CudaSafeCall ( cudaMalloc((void**)&arr->cu_DetSSB, sizeof(double)*3*sett->N));

  CudaSafeCall ( cudaMalloc((void**)cu_F, sizeof(FLOAT_TYPE)*sett->fftpad*sett->nfft));
  CudaSafeCall ( cudaMemset(*cu_F, 0, sizeof(FLOAT_TYPE)*sett->fftpad*sett->nfft));

  char filename[CHAR_BUFFER_SIZE];
  FILE *data;
  // Input time-domain data handling
  sprintf (filename, "%s/%03d/xdatc_%03d_%03d%s.bin", opts->dtaprefix, opts->ident, \
	   opts->ident, opts->band, opts->label);
  if ((data = fopen (filename, "r")) != NULL) {
    fread ((void *)(arr->xDat), sizeof (double), sett->N, data); // !!! reading in the data
    fclose (data);
  } else {
    perror (filename);
    printf("Problem with %s... Exiting...\n", filename);
    exit(1);
  }
  //copy to device
  CudaSafeCall ( cudaMemcpy(arr->cu_xDat, arr->xDat, sizeof(double)*sett->N, cudaMemcpyHostToDevice));


  int Nzeros=0;
  int i;
  // Checking for null values in the data
  for(i=0; i < sett->N; i++)
    if(!arr->xDat[i]) Nzeros++;

  // factor N/(N - Nzeros) to account for null values in the data
  sett->crf0 = (double)sett->N/(sett->N-Nzeros);


  //if white noise...
  if (opts->white_flag)
    sett->sig2 = sett->N*var (arr->xDat, sett->N);
  else
    sett->sig2 = -1.;

  double epsm, phir;

  /*
    ############ Ephemerides ################
  */

  // Ephemeris file handling
  sprintf (filename, "%s/%03d/DetSSB.bin", opts->dtaprefix, opts->ident);
  if ((data = fopen (filename, "r")) != NULL) {
    // Detector position w.r.t. the solar system barycenter
    // for every datapoint
    fread ((void *)(arr->DetSSB), sizeof (double), 3*sett->N, data);
    // Deterministic phase defining the position of the Earth
    // in its diurnal motion at t=0
    fread ((void *)(&phir), sizeof (double), 1, data);
    // Earth's axis inclination to the ecliptic at t=0
    fread ((void *)(&epsm), sizeof (double), 1, data);
    fclose (data);
  } else {
    perror (filename);
    printf("Problem with %s... Exiting...\n", filename);
    exit(1);
  }

  //copy DetSSB to device
  CudaSafeCall ( cudaMemcpy(arr->cu_DetSSB, arr->DetSSB, sizeof(double)*sett->N*3, cudaMemcpyHostToDevice));


  /*
    ############ Sincos ################
  */


  sett->sphir = sin (phir);
  sett->cphir = cos (phir);
  sett->sepsm = sin (epsm);
  sett->cepsm = cos (epsm);

  //misc. arrays
  //arr->aa = (double*) malloc(sizeof(double)*sett->N);
  //arr->bb = (double*) malloc(sizeof(double)*sett->N);
  CudaSafeCall( cudaMallocHost((void**)&arr->aa, sizeof(double)*sett->N) );
  CudaSafeCall( cudaMallocHost((void**)&arr->bb, sizeof(double)*sett->N) );
  CudaSafeCall ( cudaMalloc((void**)&arr->cu_aa, sizeof(double)*sett->nfft));
  CudaSafeCall ( cudaMalloc((void**)&arr->cu_bb, sizeof(double)*sett->nfft));

  CudaSafeCall ( cudaMalloc((void**)&arr->cu_shft, sizeof(double)*sett->N));
  CudaSafeCall ( cudaMalloc((void**)&arr->cu_shftf, sizeof(double)*sett->N));
  CudaSafeCall ( cudaMalloc((void**)&arr->cu_tshift, sizeof(double)*sett->N));

  //for splines
  init_spline_matrices(&arr->cu_d,
		       &arr->cu_dl,
		       &arr->cu_du,
		       &arr->cu_B,
		       sett->Ninterp);

  arr->cand_params_size = (sett->nmax - sett->nmin);
  arr->cand_buffer_size = (sett->nmax - sett->nmin)*CANDIDATE_BUFFER_SCALE;

  //parameters of found signal

  CudaSafeCall (cudaMalloc((void**)&arr->cu_cand_params, sizeof(FLOAT_TYPE)*arr->cand_params_size));
  CudaSafeCall (cudaMalloc((void**)&arr->cu_cand_buffer, sizeof(FLOAT_TYPE)*arr->cand_buffer_size));
  CudaSafeCall (cudaMalloc((void**)&arr->cu_cand_count, sizeof(int)));

  //arr->cand_buffer = (FLOAT_TYPE*)malloc(sizeof(FLOAT_TYPE)*arr->cand_buffer_size);
  CudaSafeCall( cudaMallocHost((void**)&arr->cand_buffer, sizeof(FLOAT_TYPE)*arr->cand_buffer_size) );
}
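
For symmetry, a cleanup sketch mirroring the allocations above; the function name free_arrays and its exact coverage are assumptions, not part of the original code.

void free_arrays(Arrays *arr, FLOAT_TYPE *cu_F)
{
  // Hypothetical cleanup mirroring init_arrays (name and call site assumed)
  CudaSafeCall( cudaFreeHost(arr->xDat) );
  CudaSafeCall( cudaFree(arr->cu_xDat) );
  CudaSafeCall( cudaFreeHost(arr->DetSSB) );
  CudaSafeCall( cudaFree(arr->cu_DetSSB) );
  CudaSafeCall( cudaFree(cu_F) );
  CudaSafeCall( cudaFreeHost(arr->aa) );
  CudaSafeCall( cudaFreeHost(arr->bb) );
  CudaSafeCall( cudaFree(arr->cu_aa) );
  CudaSafeCall( cudaFree(arr->cu_bb) );
  CudaSafeCall( cudaFree(arr->cu_shft) );
  CudaSafeCall( cudaFree(arr->cu_shftf) );
  CudaSafeCall( cudaFree(arr->cu_tshift) );
  CudaSafeCall( cudaFree(arr->cu_cand_params) );
  CudaSafeCall( cudaFree(arr->cu_cand_buffer) );
  CudaSafeCall( cudaFree(arr->cu_cand_count) );
  CudaSafeCall( cudaFreeHost(arr->cand_buffer) );
}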
Example #5
// Host code
int main(int argc, char** argv)
{
    ParseArguments(argc, argv);
	
	float s_SobelMatrix[25] = {
		1,   2,  0,  -2,  -1,
		4,   8,  0,  -8,  -4,
		6,  12,  0, -12,  -6,
		4,   8,  0,  -8,  -4,
		1,   2,  0,  -2,  -1
	};
	
    unsigned char *palete = NULL;
    unsigned char *data = NULL, *out = NULL;
    PPMImage *input_image=NULL, *output_image=NULL;
    output_image = (PPMImage *)malloc(sizeof(PPMImage));
    input_image = readPPM(PPMInFileL);
    printf("Running %s filter\n", Filter);
    out = (unsigned char *)malloc(input_image->x * input_image->y); /* one byte per pixel (assumed) */

    printf("Computing the CPU output\n");
    printf("Image details: %d by %d = %d , imagesize = %d\n", input_image->x, input_image->y, input_image->x * input_image->y, input_image->x * input_image->y);
    
	cutilCheckError(cutStartTimer(time_CPU));
	if(FilterMode == SOBEL_FILTER){
		printf("Running Sobel\n");
		CPU_Sobel(input_image->data, output_image, input_image->x, input_image->y);
	}
	else if(FilterMode == HIGH_BOOST_FILTER){
		printf("Running boost\n");
		CPU_Boost(data, out, dib.width, dib.height);
	}
	cutilCheckError(cutStopTimer(time_CPU));
	if(FilterMode == SOBEL_FILTER || FilterMode == SOBEL_FILTER5)
		BitMapWrite("CPU_sobel.bmp", &bmp, &dib, out, palete);
	else if(FilterMode == AVERAGE_FILTER)
		BitMapWrite("CPU_average.bmp", &bmp, &dib, out, palete);
	else if(FilterMode == HIGH_BOOST_FILTER)
		BitMapWrite("CPU_boost.bmp", &bmp, &dib, out, palete);
	
    printf("Done with CPU output\n");
	printf("CPU execution time %f \n", cutGetTimerValue(time_CPU));
	
	
    printf("Allocating %d bytes for image \n", dib.image_size);
	
    cutilSafeCall( cudaMalloc( (void **)&d_In, dib.image_size*sizeof(unsigned char)) );
    cutilSafeCall( cudaMalloc( (void **)&d_Out, dib.image_size*sizeof(unsigned char)) );
    
	// creating space for filter matrix
	cutilSafeCall( cudaMalloc( (void **)&sobel_matrix, 25*sizeof(float)) );
	
	cutilCheckError(cutStartTimer(time_mem));
	
	cudaMemcpy(d_In, data, dib.image_size*sizeof(unsigned char), cudaMemcpyHostToDevice);
	
	cudaMemcpy(sobel_matrix, s_SobelMatrix, 25*sizeof(float), cudaMemcpyHostToDevice);
	
	cutilCheckError(cutStopTimer(time_mem));
    
	FilterWrapper(data, dib.width, dib.height);

    // Copy image back to host
	
	cutilCheckError(cutStartTimer(time_mem));
    cudaMemcpy(out, d_Out, dib.image_size*sizeof(unsigned char), cudaMemcpyDeviceToHost);
	cutilCheckError(cutStopTimer(time_mem));
	
	printf("GPU execution time %f Memtime %f \n", cutGetTimerValue(time_GPU), cutGetTimerValue(time_mem));
    printf("Total GPU = %f \n", (cutGetTimerValue(time_GPU) + cutGetTimerValue(time_mem)));
	// Write output image   
    BitMapWrite(BMPOutFile, &bmp, &dib, out, palete);

    Cleanup();
}
Example #6
 void toHost(T* base) const {
   cudaCheck(cudaMemcpy(base, vals_, n_ * sizeof(T), cudaMemcpyDeviceToHost));
 }
Example #7
//return types are void since any internal error will be handled by quitting
//no point in returning error codes...
//returns a pointer to an RGBA version of the input image
//and a pointer to the single channel grey-scale output
//on both the host and device
void preProcess(uchar4 **h_inputImageRGBA, uchar4 **h_outputImageRGBA,
                uchar4 **d_inputImageRGBA, uchar4 **d_outputImageRGBA,
                unsigned char **d_redBlurred,
                unsigned char **d_greenBlurred,
                unsigned char **d_blueBlurred,
                float **h_filter, int *filterWidth,
                const std::string &filename) {

  //make sure the context initializes ok
  checkCudaErrors(cudaFree(0));

  cv::Mat image = cv::imread(filename.c_str(), CV_LOAD_IMAGE_COLOR);
  if (image.empty()) {
    std::cerr << "Couldn't open file: " << filename << std::endl;
    exit(1);
  }

  cv::cvtColor(image, imageInputRGBA, CV_BGR2RGBA);

  //allocate memory for the output
  imageOutputRGBA.create(image.rows, image.cols, CV_8UC4);

  //This shouldn't ever happen given the way the images are created
  //at least based upon my limited understanding of OpenCV, but better to check
  if (!imageInputRGBA.isContinuous() || !imageOutputRGBA.isContinuous()) {
    std::cerr << "Images aren't continuous!! Exiting." << std::endl;
    exit(1);
  }

  *h_inputImageRGBA  = (uchar4 *)imageInputRGBA.ptr<unsigned char>(0);
  *h_outputImageRGBA = (uchar4 *)imageOutputRGBA.ptr<unsigned char>(0);

  const size_t numPixels = numRows() * numCols();
  //allocate memory on the device for both input and output
  checkCudaErrors(cudaMalloc(d_inputImageRGBA, sizeof(uchar4) * numPixels));
  checkCudaErrors(cudaMalloc(d_outputImageRGBA, sizeof(uchar4) * numPixels));
  checkCudaErrors(cudaMemset(*d_outputImageRGBA, 0, numPixels * sizeof(uchar4))); //make sure no memory is left laying around

  //copy input array to the GPU
  checkCudaErrors(cudaMemcpy(*d_inputImageRGBA, *h_inputImageRGBA, sizeof(uchar4) * numPixels, cudaMemcpyHostToDevice));

  d_inputImageRGBA__  = *d_inputImageRGBA;
  d_outputImageRGBA__ = *d_outputImageRGBA;

  //now create the filter that they will use
  const int blurKernelWidth = 9;
  const float blurKernelSigma = 2.;

  *filterWidth = blurKernelWidth;

  //create and fill the filter we will convolve with
  *h_filter = new float[blurKernelWidth * blurKernelWidth];
  h_filter__ = *h_filter;

  float filterSum = 0.f; //for normalization

  for (int r = -blurKernelWidth/2; r <= blurKernelWidth/2; ++r) {
    for (int c = -blurKernelWidth/2; c <= blurKernelWidth/2; ++c) {
      float filterValue = expf( -(float)(c * c + r * r) / (2.f * blurKernelSigma * blurKernelSigma));
      (*h_filter)[(r + blurKernelWidth/2) * blurKernelWidth + c + blurKernelWidth/2] = filterValue;
      filterSum += filterValue;
    }
  }

  float normalizationFactor = 1.f / filterSum;

  for (int r = -blurKernelWidth/2; r <= blurKernelWidth/2; ++r) {
    for (int c = -blurKernelWidth/2; c <= blurKernelWidth/2; ++c) {
      (*h_filter)[(r + blurKernelWidth/2) * blurKernelWidth + c + blurKernelWidth/2] *= normalizationFactor;
    }
  }

  //blurred
  checkCudaErrors(cudaMalloc(d_redBlurred,    sizeof(unsigned char) * numPixels));
  checkCudaErrors(cudaMalloc(d_greenBlurred,  sizeof(unsigned char) * numPixels));
  checkCudaErrors(cudaMalloc(d_blueBlurred,   sizeof(unsigned char) * numPixels));
  checkCudaErrors(cudaMemset(*d_redBlurred,   0, sizeof(unsigned char) * numPixels));
  checkCudaErrors(cudaMemset(*d_greenBlurred, 0, sizeof(unsigned char) * numPixels));
  checkCudaErrors(cudaMemset(*d_blueBlurred,  0, sizeof(unsigned char) * numPixels));
}
Example #8
int main(int argc, char **argv)
{
    // Start logs
    printf("%s Starting...\n\n", argv[0]);

    unsigned int tableCPU[QRNG_DIMENSIONS][QRNG_RESOLUTION];

    float *h_OutputGPU, *d_Output;

    int dim, pos;
    double delta, ref, sumDelta, sumRef, L1norm, gpuTime;

    StopWatchInterface *hTimer = NULL;

    if (sizeof(INT64) != 8)
    {
        printf("sizeof(INT64) != 8\n");
        return 0;
    }

    cudaDeviceProp deviceProp;
    int dev = findCudaDevice(argc, (const char **)argv);
    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev));

    if (((deviceProp.major << 4) + deviceProp.minor) < 0x20)
    {
        fprintf(stderr, "quasirandomGenerator requires Compute Capability of SM 2.0 or higher to run.\n");
        cudaDeviceReset();
        exit(EXIT_WAIVED);
    }

    sdkCreateTimer(&hTimer);

    printf("Allocating GPU memory...\n");
    checkCudaErrors(cudaMalloc((void **)&d_Output, QRNG_DIMENSIONS * N * sizeof(float)));

    printf("Allocating CPU memory...\n");
    h_OutputGPU = (float *)malloc(QRNG_DIMENSIONS * N * sizeof(float));

    printf("Initializing QRNG tables...\n\n");
    initQuasirandomGenerator(tableCPU);

    initTableGPU(tableCPU);

    printf("Testing QRNG...\n\n");
    checkCudaErrors(cudaMemset(d_Output, 0, QRNG_DIMENSIONS * N * sizeof(float)));
    int numIterations = 20;
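    // The i == -1 pass below is a warmup; the timer is reset and started once i reaches 0.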

    for (int i = -1; i < numIterations; i++)
    {
        if (i == 0)
        {
            checkCudaErrors(cudaDeviceSynchronize());
            sdkResetTimer(&hTimer);
            sdkStartTimer(&hTimer);
        }

        quasirandomGeneratorGPU(d_Output, 0, N);
    }

    checkCudaErrors(cudaDeviceSynchronize());
    sdkStopTimer(&hTimer);
    gpuTime = sdkGetTimerValue(&hTimer)/(double)numIterations*1e-3;
    printf("quasirandomGenerator, Throughput = %.4f GNumbers/s, Time = %.5f s, Size = %u Numbers, NumDevsUsed = %u, Workgroup = %u\n",
           (double)QRNG_DIMENSIONS * (double)N * 1.0E-9 / gpuTime, gpuTime, QRNG_DIMENSIONS*N, 1, 128*QRNG_DIMENSIONS);

    printf("\nReading GPU results...\n");
    checkCudaErrors(cudaMemcpy(h_OutputGPU, d_Output, QRNG_DIMENSIONS * N * sizeof(float), cudaMemcpyDeviceToHost));

    printf("Comparing to the CPU results...\n\n");
    sumDelta = 0;
    sumRef = 0;

    for (dim = 0; dim < QRNG_DIMENSIONS; dim++)
        for (pos = 0; pos < N; pos++)
        {
            ref       = getQuasirandomValue63(pos, dim);
            delta     = (double)h_OutputGPU[dim * N + pos] - ref;
            sumDelta += fabs(delta);
            sumRef   += fabs(ref);
        }

    printf("L1 norm: %E\n", sumDelta / sumRef);

    printf("\nTesting inverseCNDgpu()...\n\n");
    checkCudaErrors(cudaMemset(d_Output, 0, QRNG_DIMENSIONS * N * sizeof(float)));
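    // As above, i == -1 is an untimed warmup pass.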

    for (int i = -1; i < numIterations; i++)
    {
        if (i == 0)
        {
            checkCudaErrors(cudaDeviceSynchronize());
            sdkResetTimer(&hTimer);
            sdkStartTimer(&hTimer);
        }

        inverseCNDgpu(d_Output, NULL, QRNG_DIMENSIONS * N);
    }

    checkCudaErrors(cudaDeviceSynchronize());
    sdkStopTimer(&hTimer);
    gpuTime = sdkGetTimerValue(&hTimer)/(double)numIterations*1e-3;
    printf("quasirandomGenerator-inverse, Throughput = %.4f GNumbers/s, Time = %.5f s, Size = %u Numbers, NumDevsUsed = %u, Workgroup = %u\n",
           (double)QRNG_DIMENSIONS * (double)N * 1E-9 / gpuTime, gpuTime, QRNG_DIMENSIONS*N, 1, 128);

    printf("Reading GPU results...\n");
    checkCudaErrors(cudaMemcpy(h_OutputGPU, d_Output, QRNG_DIMENSIONS * N * sizeof(float), cudaMemcpyDeviceToHost));

    printf("\nComparing to the CPU results...\n");
    sumDelta = 0;
    sumRef = 0;
    unsigned int distance = ((unsigned int)-1) / (QRNG_DIMENSIONS * N + 1);

    for (pos = 0; pos < QRNG_DIMENSIONS * N; pos++)
    {
        unsigned int d = (pos + 1) * distance;
        ref       = MoroInvCNDcpu(d);
        delta     = (double)h_OutputGPU[pos] - ref;
        sumDelta += fabs(delta);
        sumRef   += fabs(ref);
    }

    printf("L1 norm: %E\n\n", L1norm = sumDelta / sumRef);

    printf("Shutting down...\n");
    sdkDeleteTimer(&hTimer);
    free(h_OutputGPU);
    checkCudaErrors(cudaFree(d_Output));

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice.  It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
    cudaDeviceReset();
    exit(L1norm < 1e-6 ? EXIT_SUCCESS : EXIT_FAILURE);
}
Example #9
int main(int argc, char **argv) {
  uchar4 *h_inputImageRGBA, *d_inputImageRGBA;
  uchar4 *h_outputImageRGBA, *d_outputImageRGBA;
  unsigned char *d_redBlurred, *d_greenBlurred, *d_blueBlurred;

  float *h_filter;
  int filterWidth;

  std::string input_file;
  std::string output_file;
  std::string reference_file;
  double perPixelError = 0.0;
  double globalError = 0.0;
  bool useEpsCheck = false;
  std::string blur_impl = "hw";
  switch (argc) {
    case 2:
      input_file = std::string(argv[1]);
      output_file = "HW2_output.png";
      reference_file = "HW2_reference.png";
      break;
    case 3:
      input_file = std::string(argv[1]);
      output_file = std::string(argv[2]);
      reference_file = "HW2_reference.png";
      break;
    case 4:
      input_file = std::string(argv[1]);
      output_file = std::string(argv[2]);
      reference_file = std::string(argv[3]);
      break;
    case 5:
      input_file = std::string(argv[1]);
      output_file = std::string(argv[2]);
      reference_file = std::string(argv[3]);
      blur_impl = std::string(argv[4]);
      break;
    default:
      std::cerr << "Usage: ./HW2 input_file [output_filename] "
                   "[reference_filename] [blur_impl]]"
                << std::endl;
      exit(1);
  }
  // load the image and give us our input and output pointers
  preProcess(&h_inputImageRGBA, &h_outputImageRGBA, &d_inputImageRGBA,
             &d_outputImageRGBA, &d_redBlurred, &d_greenBlurred, &d_blueBlurred,
             &h_filter, &filterWidth, input_file);

  allocateMemoryAndCopyToGPU(numRows(), numCols(), h_filter, filterWidth);
  GpuTimer timer;
  timer.Start();
  // call the students' code
  if (blur_impl == "hw") {
    your_gaussian_blur(h_inputImageRGBA, d_inputImageRGBA, d_outputImageRGBA,
                       numRows(), numCols(), d_redBlurred, d_greenBlurred,
                       d_blueBlurred, filterWidth);
  } else if (blur_impl == "shared") {
    gaussian_blur_shared(h_inputImageRGBA, d_inputImageRGBA, d_outputImageRGBA,
                       numRows(), numCols(), d_redBlurred, d_greenBlurred,
                       d_blueBlurred, filterWidth);
  }

  timer.Stop();
  cudaDeviceSynchronize();
  checkCudaErrors(cudaGetLastError());
  int err = printf("Your code ran in: %f msecs.\n", timer.Elapsed());

  if (err < 0) {
    // Couldn't print! Probably the student closed stdout - bad news
    std::cerr << "Couldn't print timing information! STDOUT Closed!"
              << std::endl;
    exit(1);
  }

  // check results and output the blurred image

  size_t numPixels = numRows() * numCols();
  // copy the output back to the host
  checkCudaErrors(cudaMemcpy(h_outputImageRGBA, d_outputImageRGBA__,
                             sizeof(uchar4) * numPixels,
                             cudaMemcpyDeviceToHost));

  std::cerr << "postProcess output...\n";
  postProcess(output_file, h_outputImageRGBA);

  timer.Start();
  referenceCalculation(h_inputImageRGBA, h_outputImageRGBA, numRows(),
                       numCols(), h_filter, filterWidth);
  timer.Stop();
  std::cerr << "referenceCalculation elapsed: " << timer.Elapsed() << " ms\n";

  std::cerr << "postProcess reference...\n";
  postProcess(reference_file, h_outputImageRGBA);

  //  Cheater easy way with OpenCV
  // generateReferenceImage(input_file, reference_file, filterWidth);

  compareImages(reference_file, output_file, useEpsCheck, perPixelError,
                globalError);

  checkCudaErrors(cudaFree(d_redBlurred));
  checkCudaErrors(cudaFree(d_greenBlurred));
  checkCudaErrors(cudaFree(d_blueBlurred));

  cleanUp();

  return 0;
}
Example #10
void pcl::gpu::DeviceMemory::upload(const void *host_ptr_arg, size_t sizeBytes_arg)
{
    create(sizeBytes_arg);
    cudaSafeCall( cudaMemcpy(data_, host_ptr_arg, sizeBytes_, cudaMemcpyHostToDevice) );
    cudaSafeCall( cudaDeviceSynchronize() );
}
Example #11
void pcl::gpu::DeviceMemory::download(void *host_ptr_arg) const
{    
    cudaSafeCall( cudaMemcpy(host_ptr_arg, data_, sizeBytes_, cudaMemcpyDeviceToHost) );
    cudaSafeCall( cudaDeviceSynchronize() );
}          
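
A round-trip usage sketch for the upload/download pair above (Examples #10 and #11); the buffer size and contents are placeholder assumptions.

// Round-trip sketch: upload() calls create() and copies host -> device,
// download() copies the same sizeBytes_ back device -> host.
float host_src[256] = {0}, host_dst[256];
pcl::gpu::DeviceMemory mem;
mem.upload(host_src, sizeof(host_src));
mem.download(host_dst);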
Example #12
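// cudaMemcpyDefault lets the runtime infer the transfer direction from the pointer addresses (requires unified virtual addressing).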
DeepCopy<CudaSpace,HostSpace>::DeepCopy( void * dst , const void * src , size_t n )
{ CUDA_SAFE_CALL( cudaMemcpy( dst , src , n , cudaMemcpyDefault ) ); }
Example #13
int main2(int sockfd)
{
        cufftHandle plan;
        cufftComplex *devPtr;
        cufftReal indata[NX*BATCH];
        cufftComplex data[NX*BATCH];
        int i,timer,j,k;
        char fname[15];
        FILE *f;
	#define BUFSIZE (21*4096*sizeof(int))
	int buffer[BUFSIZE/sizeof(int)];  /* BUFSIZE is in bytes */

        int p,nread;


	f = fopen("21-4096","rb");
	nread=fread(buffer,BUFSIZE,1,f);
	printf("nread=%i\n",nread);
	fclose(f);

        i=0;
        for (j=0;j<BATCH;j++) {
            for (k=0;k<NX;k++) {
                data[j*NX+k].x = buffer[j*NX+k];
                data[j*NX+k].y = 0;
            }
	}


        //f=fopen("y.txt","r");
    /* source data creation */

        //int sockfd = myconnect();
        //printf("connected\n");
	
		

        /* WORKING!!!!!!!!
        i=0;
        for (j=0;j<BATCH;j++) {
            sprintf(fname,"%i.txt",j);
            printf("%s\n",fname);
            f = fopen(fname,"r");
            for (k=0;k<NX;k++) {
                fscanf(f,"%i\n",&p);
                data[j*NX+k].x = p;
                data[j*NX+k].y = 0;
            }
            fclose(f);
	*/
/*
        for(i=  0 ; i < NX*BATCH ; i++){
                //fscanf(f,"%i\n",&p);
                //data[i].x= p;
                data[i].x= 1.0f;
                //printf("%f\n",data[i].x);
                data[i].y = 0.0f;
        }
        //fclose(f)
        */
        //}


        /* creates 1D FFT plan */
        cufftPlan1d(&plan, NX, CUFFT_C2C, BATCH);


        /*
        cutCreateTimer(&timer);
        cutResetTimer(timer);
        cutStartTimer(timer);
        */
        
    /* GPU memory allocation */
        cudaMalloc((void**)&devPtr, sizeof(cufftComplex)*NX*BATCH);

    /* transfer to GPU memory */
        cudaMemcpy(devPtr, data, sizeof(cufftComplex)*NX*BATCH, cudaMemcpyHostToDevice);


        /* executes FFT processes */
        cufftExecC2C(plan, devPtr, devPtr, CUFFT_FORWARD);

        /* executes FFT processes (inverse transformation) */
       //cufftExecC2C(plan, devPtr, devPtr, CUFFT_INVERSE);

    /* transfer results from GPU memory */
        cudaMemcpy(data, devPtr, sizeof(cufftComplex)*NX*BATCH, cudaMemcpyDeviceToHost);

        /* deletes CUFFT plan */
        cufftDestroy(plan);

    /* frees GPU memory */
        cudaFree(devPtr);

        /*
        cudaThreadSynchronize();
        cutStopTimer(timer);
        printf("%f\n",cutGetTimerValue(timer)/(float)1000);
        cutDeleteTimer(timer);
        */

        /*
        float mag;
        for(i = 0 ; i < NX*BATCH ; i++){
                //printf("data[%d] %f %f\n", i, data[i].x, data[i].y);
                //printf("%f\n", data[i].x);
                mag = sqrtf(data[i].x*data[i].x+data[i].y*data[i].y)*2.0/NX;
                printf("%f\n",mag);

        }
        */

/*
        // save as text file
        float mag;
        i=0;
        for (j=0;j<BATCH;j++) {
            sprintf(fname,"%i-mag.txt",j);
            printf("%s\n",fname);
            f = fopen(fname,"w");
            for (k=0;k<NX;k++) {
                //fscanf(f,"%i\n",&p);
                if (k>50)
                    continue;
                i = j*NX+k;
                mag = sqrtf(data[i].x*data[i].x+data[i].y*data[i].y)*2.0/NX;
                fprintf(f,"%f\n",mag);
            }
            fclose(f);
        }
*/


        float mag;
        i=0;
        float mags[NX];
        int magsint[NX*BATCH];
        memset(magsint,0,sizeof(int)*NX*BATCH);
        int u = 0;

        printf("%f %f %f %f\n",data[0].x,data[1].x,data[2].x,data[3].x);

        //printf("%i %i %i %i\n",magsint[0],magsint[1],magsint[2],magsint[3]);

//        f = fopen("ffts.bin","wb");
        for (j=0;j<BATCH;j++) {
//            sprintf(fname,"%i-bin.dat",j);
//            printf("%s\n",fname);

            for (k=0;k<NX;k++) {
                //fscanf(f,"%i\n",&p);
                if (k>50)
                    continue;
                i = j*NX+k;
                mags[k] = sqrtf(data[i].x*data[i].x+data[i].y*data[i].y)*2.0/NX;
                magsint[u] = mags[k];
                u++;
                //fprintf(f,"%f\n",mag);
                
            }

            //f = fopen(fname,"wb");
  //          fwrite(magsint,sizeof(int)*50,1,f);
        }
        int n;
        n = write(sockfd,magsint,sizeof(int)*BATCH*50);
        printf("%i %i %i %i\n",magsint[0],magsint[1],magsint[2],magsint[3]);
        printf("send ok, size: %i\n",n);
        //fclose(f);
        
        
        return 0;
}
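
None of the cuFFT calls above are checked. A minimal sketch of checking their cufftResult return codes follows; the exit-on-error policy is an assumption.

/* Sketch: checking cuFFT return codes (error policy assumed) */
cufftResult res = cufftPlan1d(&plan, NX, CUFFT_C2C, BATCH);
if (res != CUFFT_SUCCESS) {
    fprintf(stderr, "cufftPlan1d failed: %d\n", (int)res);
    return 1;
}
res = cufftExecC2C(plan, devPtr, devPtr, CUFFT_FORWARD);
if (res != CUFFT_SUCCESS) {
    fprintf(stderr, "cufftExecC2C failed: %d\n", (int)res);
    return 1;
}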
Example #14
int main(int argc, char **argv) {
  char *h_Worig;
  uint8_t  *h_We, *d_We;
  uint64_t *h_nWe, *d_nWe;

  uint32_t *h_k, *h_l, *d_k, *d_l;
  uint32_t *h_ki, *h_li, *d_ki, *d_li;
	intmax_t *h_k2, *h_l2;
	intmax_t *h_ki2, *h_li2;

	bwt_index backward, forward;

	exome ex;

  comp_matrix h_O, d_O, h_Oi, d_Oi;
  vector h_C, d_C, h_C1, d_C1;
  comp_vector h_S, h_Si;

	results_list *r_lists;
	uint32_t *k, *l;

  cudaSetDevice(0);

  cudaError_t error;

  if (argc!=8) {
    printf("Sintaxis:\n\t%s fichero_bus dir_entrada fichero_sal max_bus_gpu repeticiones max_length nucleotides\n", argv[0]);
    return 1;
  }

  timevars();
  init_replace_table(argv[7]);

	queries_file = fopen(argv[1], "r");
	check_file_open(queries_file, argv[1]);
	output_file = fopen(argv[3], "w");
	check_file_open(output_file, argv[3]);

  MAX_BUS_GPU = atoi(argv[4]);
	MAXLINE = atoi(argv[6]);

	tic("Cargando FM-Index");

  read_vector(&h_C, argv[2], "C");
  read_vector(&h_C1, argv[2], "C1");

  copy_vector_gpu(&d_C,   &h_C);
  copy_vector_gpu(&d_C1,  &h_C1);

  read_comp_matrix_gpu(&h_O, argv[2], "O");
  read_comp_matrix_gpu(&h_Oi, argv[2], "Oi");

  copy_comp_matrix_gpu(&d_O, &h_O);
  copy_comp_matrix_gpu(&d_Oi, &h_Oi);

  read_comp_vector(&h_S, argv[2], "S");
  read_comp_vector(&h_Si, argv[2], "Si");

	backward.C  = h_C;
	backward.C1 = h_C1;
	backward.O  = h_O;
	backward.S  = h_S;

	forward.C  = h_C;
	forward.C1 = h_C1;
	forward.O  = h_Oi;
	forward.S  = h_Si;

	load_exome_file(&ex, argv[2]);

  h_Worig = (char*)malloc(MAX_BUS_GPU * MAXLINE     * sizeof(char));
 
  cudaMallocHost((void**) &h_We, MAX_BUS_GPU * MAXLINE * sizeof(uint8_t));
  cudaMallocHost((void**) &h_nWe, MAX_BUS_GPU * sizeof(uint64_t));

  cudaMalloc((void**) &d_We,  MAX_BUS_GPU * MAXLINE * sizeof(uint8_t));
  manageCudaError();
  cudaMalloc((void**) &d_nWe, MAX_BUS_GPU * sizeof(uint64_t));
  manageCudaError();

  cudaMallocHost((void**) &h_k, MAX_BUS_GPU * MAXLINE * sizeof(uint32_t));
  cudaMallocHost((void**) &h_l, MAX_BUS_GPU * MAXLINE * sizeof(uint32_t));
  cudaMallocHost((void**) &h_ki, MAX_BUS_GPU * MAXLINE * sizeof(uint32_t));
  cudaMallocHost((void**) &h_li, MAX_BUS_GPU * MAXLINE * sizeof(uint32_t));

  cudaMallocHost((void**) &h_k2, MAX_BUS_GPU * MAXLINE * sizeof(intmax_t));
  cudaMallocHost((void**) &h_l2, MAX_BUS_GPU * MAXLINE * sizeof(intmax_t));
  cudaMallocHost((void**) &h_ki2, MAX_BUS_GPU * MAXLINE * sizeof(intmax_t));
  cudaMallocHost((void**) &h_li2, MAX_BUS_GPU * MAXLINE * sizeof(intmax_t));

	cudaMalloc((void**) &d_k, MAX_BUS_GPU * MAXLINE * sizeof(uint32_t));
  manageCudaError();
  cudaMalloc((void**) &d_l, MAX_BUS_GPU * MAXLINE * sizeof(uint32_t));
  manageCudaError();
	cudaMalloc((void**) &d_ki, MAX_BUS_GPU * MAXLINE * sizeof(uint32_t));
  manageCudaError();
  cudaMalloc((void**) &d_li, MAX_BUS_GPU * MAXLINE * sizeof(uint32_t));
  manageCudaError();

	r_lists = (results_list *) malloc(MAX_BUS_GPU * sizeof(results_list));

	for (int i=0; i<MAX_BUS_GPU; i++) {
			new_results_list(&r_lists[i], RESULTS);
	}

	k = (uint32_t*)malloc(RESULTS * sizeof(uint32_t));
	l = (uint32_t*)malloc(RESULTS * sizeof(uint32_t));	

  toc();

  int TAM_BUS_GPU=0, NUM_BLOQUES_GPU=0;

  NUM_REP          = atoi(argv[5]);

  tic("Leer de disco");

  while(nextFASTAToken(queries_file, h_Worig + TAM_BUS_GPU * MAXLINE, h_We + TAM_BUS_GPU * MAXLINE, h_nWe + TAM_BUS_GPU)) {

    TAM_BUS_GPU++;

    if (TAM_BUS_GPU == MAX_BUS_GPU) break;

  }

  toc();

  NUM_BLOQUES_GPU = (TAM_BUS_GPU / TAM_BLOQUE_GPU);

  cudaThreadSynchronize();
  tic("CPU -> GPU");
  cudaMemcpy(d_We, h_We, TAM_BUS_GPU * MAXLINE * sizeof(uint8_t), cudaMemcpyHostToDevice);
  manageCudaError();
  cudaMemcpy(d_nWe,  h_nWe,  TAM_BUS_GPU * sizeof(uint64_t), cudaMemcpyHostToDevice);
  manageCudaError();
  cudaThreadSynchronize();
  toc();

	cudaThreadSynchronize();
  tic("GPU Kernel");
  BWExactSearchBackwardVectorGPUWrapper(NUM_BLOQUES_GPU, TAM_BLOQUE_GPU, d_We, d_nWe, MAXLINE, d_k, d_l, 0, d_O.siz-2, &d_C, &d_C1, &d_O);
  BWExactSearchForwardVectorGPUWrapper(NUM_BLOQUES_GPU, TAM_BLOQUE_GPU, d_We, d_nWe, MAXLINE, d_ki, d_li, 0, d_Oi.siz-2, &d_C, &d_C1, &d_Oi);
  cudaThreadSynchronize();
  toc();

  cudaThreadSynchronize();
  tic("GPU -> CPU");
  cudaMemcpy(h_k, d_k, sizeof(uint32_t) * TAM_BUS_GPU * MAXLINE, cudaMemcpyDeviceToHost);
  manageCudaError();
  cudaMemcpy(h_l, d_l, sizeof(uint32_t) * TAM_BUS_GPU * MAXLINE, cudaMemcpyDeviceToHost);
  manageCudaError();
  cudaMemcpy(h_ki, d_ki, sizeof(uint32_t) * TAM_BUS_GPU * MAXLINE, cudaMemcpyDeviceToHost);
  manageCudaError();
  cudaMemcpy(h_li, d_li, sizeof(uint32_t) * TAM_BUS_GPU * MAXLINE, cudaMemcpyDeviceToHost);
  manageCudaError();  
  cudaThreadSynchronize();
  toc();

  tic("CPU Vector");
  for (int i=0; i<TAM_BUS_GPU; i++) {
    BWExactSearchVectorBackward(h_We + MAXLINE*i, 0, h_nWe[i]-1, 0, d_O.siz-2, h_k2 + MAXLINE*i, h_l2 + MAXLINE*i, &backward);
	  BWExactSearchVectorForward(h_We + MAXLINE*i, 0, h_nWe[i]-1, 0, d_Oi.siz-2, h_ki2 + MAXLINE*i, h_li2 + MAXLINE*i, &forward);

  }
  toc();

  tic("CPU Search 1 error");
  for (int i=0; i<TAM_BUS_GPU; i++) {

	 result res;
   init_result(&res, 1);
   bound_result(&res, 0, h_nWe[i]-1);
   change_result(&res, 0, h_O.siz-2, h_nWe[i]-1);

	 r_lists[i].num_results = 0;
	 r_lists[i].read_index = i;

	 BWSearch1CPU(
			h_We + i * MAXLINE,
			&backward,
		  &forward,
			&res,
		  &r_lists[i]);

	}
	toc();

  tic("CPU Search 1 Error Helper");
  for (int i=0; i<TAM_BUS_GPU; i++) {

		r_lists[i].num_results = 0;
		r_lists[i].read_index = i;

		BWSearch1GPUHelper(
				h_We + i * MAXLINE,
				0,
				h_nWe[i]-1,
				h_k  + i*MAXLINE,
				h_l  + i*MAXLINE,
				h_ki + i*MAXLINE,
				h_li + i*MAXLINE,
				&backward,
				&forward,
				&r_lists[i]
				);

	}
	toc();

	tic("Write results");
	for (int i=0; i<TAM_BUS_GPU; i++) {
		write_results_gpu(&r_lists[i], k, l, &ex, &backward, &forward, h_Worig + i*MAXLINE, h_nWe[i], 1, output_file);
	}
	toc();

	/*
  for (int i=0; i<TAM_BUS_GPU; i++) {

    for (int j=0; j<h_nWe[i]; j++) {

      if (h_k[i*MAXLINE + j] != h_k2[i*MAXLINE + j]) {
	printf("Diferente %d %d\n", i, j);
	goto salir;
      }

    }

  }

  salir:
  */
  /*
  for (int i=0; i<h_nWe[0]; i++) {
    printf("%u ", h_k[i]);
  }
  printf("\n");

  printf("\n");

  for (int i=0; i<h_nWe[0]; i++) {
    printf("%u ", h_k2[i]);
  }
  printf("\n");
  */

  cudaFreeHost(h_k);
  cudaFree(d_k);
  cudaFreeHost(h_l);
  cudaFree(d_l);

  cudaFreeHost(h_We);
  cudaFreeHost(h_nWe);
  cudaFree(d_We);
  cudaFree(d_nWe);

  free(h_C.vector);
  cudaFree(d_C.vector);
  free(h_C1.vector);
  cudaFree(d_C1.vector);

  free_comp_matrix_gpu_host(NULL, &h_O);
  free_comp_matrix_gpu_device(NULL, &d_O);

  fclose(queries_file);

  return 0;

}
Example #15
//-------------------------------------------------------
//copy a buffer from device memory to host memory
//
//param	: des
//param	: src
//param	: size
//-------------------------------------------------------
void D_MEMCPY_D2H(void *des, void *src, size_t size)
{
	CUDA_SAFE_CALL(cudaMemcpy(des, src, size, cudaMemcpyDeviceToHost));
}
Example #16
int main(int argc, char **argv)
{
    int OPT_N  = 4000000;
    int OPT_SZ = OPT_N * sizeof(float);

    printf("Initializing data...\n");
    
    float *callResult, *putResult, *stockPrice, *optionStrike, *optionYears;
    float *d_callResult, *d_putResult;
    float *d_stockPrice, *d_optionStrike, *d_optionYears;

#ifdef HEMI_CUDA_COMPILER
    checkCuda( cudaMallocHost((void**)&callResult,     OPT_SZ) );
    checkCuda( cudaMallocHost((void**)&putResult,      OPT_SZ) );
    checkCuda( cudaMallocHost((void**)&stockPrice,     OPT_SZ) );
    checkCuda( cudaMallocHost((void**)&optionStrike,   OPT_SZ) );
    checkCuda( cudaMallocHost((void**)&optionYears,    OPT_SZ) );
    checkCuda( cudaMalloc    ((void**)&d_callResult,   OPT_SZ) );
    checkCuda( cudaMalloc    ((void**)&d_putResult,    OPT_SZ) );
    checkCuda( cudaMalloc    ((void**)&d_stockPrice,   OPT_SZ) );
    checkCuda( cudaMalloc    ((void**)&d_optionStrike, OPT_SZ) );
    checkCuda( cudaMalloc    ((void**)&d_optionYears,  OPT_SZ) );
#else
    callResult   = (float*)malloc(OPT_SZ);
    putResult    = (float*)malloc(OPT_SZ);
    stockPrice   = (float*)malloc(OPT_SZ);
    optionStrike = (float*)malloc(OPT_SZ);
    optionYears  = (float*)malloc(OPT_SZ);
#endif

    initOptions(OPT_N, stockPrice, optionStrike, optionYears);
        
    int blockDim = 128; // blockDim, gridDim ignored by host code
    int gridDim  = std::min<int>(1024, (OPT_N + blockDim - 1) / blockDim);

    printf("Running %s Version...\n", HEMI_LOC_STRING);

    StartTimer();

#ifdef HEMI_CUDA_COMPILER 
    checkCuda( cudaMemcpy(d_stockPrice,   stockPrice,   OPT_SZ, cudaMemcpyHostToDevice) );
    checkCuda( cudaMemcpy(d_optionStrike, optionStrike, OPT_SZ, cudaMemcpyHostToDevice) );
    checkCuda( cudaMemcpy(d_optionYears,  optionYears,  OPT_SZ, cudaMemcpyHostToDevice) );
#else
    d_callResult   = callResult; 
    d_putResult    = putResult;
    d_stockPrice   = stockPrice; 
    d_optionStrike = optionStrike;
    d_optionYears  = optionYears;
#endif

    HEMI_KERNEL_LAUNCH(BlackScholes, gridDim, blockDim, 0, 0,
                       d_callResult, d_putResult, d_stockPrice, d_optionStrike, 
                       d_optionYears, RISKFREE, VOLATILITY, OPT_N);
       
#ifdef HEMI_CUDA_COMPILER 
    checkCuda( cudaMemcpy(callResult, d_callResult, OPT_SZ, cudaMemcpyDeviceToHost) );
    checkCuda( cudaMemcpy(putResult,  d_putResult,  OPT_SZ, cudaMemcpyDeviceToHost) );
#endif

    printf("Option 0 call: %f\n", callResult[0]); 
    printf("Option 0 put:  %f\n", putResult[0]);

    double ms = GetTimer();

    //Both call and put are calculated
    printf("Options count             : %i     \n", 2 * OPT_N);
    printf("\tBlackScholes() time    : %f msec\n", ms);
    printf("\t%f GB/s, %f GOptions/s\n", 
           ((double)(5 * OPT_N * sizeof(float)) * 1E-9) / (ms * 1E-3),
           ((double)(2 * OPT_N) * 1E-9) / (ms * 1E-3));

#ifdef HEMI_CUDA_COMPILER 
    checkCuda( cudaFree(d_stockPrice) );
    checkCuda( cudaFree(d_optionStrike) );
    checkCuda( cudaFree(d_optionYears) );
    checkCuda( cudaFreeHost(callResult) );
    checkCuda( cudaFreeHost(putResult) );
    checkCuda( cudaFreeHost(stockPrice) );
    checkCuda( cudaFreeHost(optionStrike) );
    checkCuda( cudaFreeHost(optionYears) );
#else
    free(callResult);
    free(putResult);
    free(stockPrice);
    free(optionStrike);
    free(optionYears);
#endif // HEMI_CUDA_COMPILER
}
Example #17
 CUDA(const T* base, size_t n) :
 n_(n) {
   cudaCheck(cudaMalloc(&vals_, n_ * sizeof(T)));
   cudaCheck(cudaMemcpy(vals_, base, n_ * sizeof(T), cudaMemcpyHostToDevice));
 }
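
A round-trip sketch combining this constructor with the toHost() member from Example #6; that the enclosing class is a template CUDA<T> with exactly these two members is an assumption based on the snippets.

// Host -> device via the constructor, device -> host via toHost()
float host_in[4] = {1.f, 2.f, 3.f, 4.f};
float host_out[4];
CUDA<float> buf(host_in, 4);  // cudaMalloc + cudaMemcpy H2D
buf.toHost(host_out);         // cudaMemcpy D2H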
Example #18
template <typename Dtype>
void MultiStageMeanfieldLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {

  init_cpu = false;
  init_gpu = false;
  const caffe::MultiStageMeanfieldParameter meanfield_param = this->layer_param_.multi_stage_meanfield_param();

  num_iterations_ = meanfield_param.num_iterations();

  CHECK_GT(num_iterations_, 1) << "Number of iterations must be greater than 1.";

  theta_alpha_ = meanfield_param.theta_alpha();
  theta_beta_ = meanfield_param.theta_beta();
  theta_gamma_ = meanfield_param.theta_gamma();

  count_ = bottom[0]->count();
  num_ = bottom[0]->num();
  channels_ = bottom[0]->channels();
  height_ = bottom[0]->height();
  width_ = bottom[0]->width();
  num_pixels_ = height_ * width_;

  LOG(INFO) << "This implementation has not been tested batch size > 1.";

  top[0]->Reshape(num_, channels_, height_, width_);

  // Initialize the parameters that will be updated by backpropagation.
  if (this->blobs_.size() > 0) {
    LOG(INFO) << "Multimeanfield layer skipping parameter initialization.";
  } else {

    this->blobs_.resize(3); // blobs_[0] - spatial kernel weights, blobs_[1] - bilateral kernel weights, blobs_[2] - compatibility matrix

    // Allocate space for kernel weights.
    this->blobs_[0].reset(new Blob<Dtype>(1, 1, channels_, channels_));
    this->blobs_[1].reset(new Blob<Dtype>(1, 1, channels_, channels_));

    caffe_set(channels_ * channels_, Dtype(0.), this->blobs_[0]->mutable_cpu_data());
    caffe_set(channels_ * channels_, Dtype(0.), this->blobs_[1]->mutable_cpu_data());

    // Initialize the kernels weights. The two files spatial.par and bilateral.par should be available.
    FILE * pFile;
    pFile = fopen("spatial.par", "r");
    CHECK(pFile) << "The file 'spatial.par' is not found. Please create it with initial spatial kernel weights.";
    for (int i = 0; i < channels_; i++) {
      fscanf(pFile, "%lf", &this->blobs_[0]->mutable_cpu_data()[i * channels_ + i]);
    }
    fclose(pFile);

    pFile = fopen("bilateral.par", "r");
    CHECK(pFile) << "The file 'bilateral.par' is not found. Please create it with initial bilateral kernel weights.";
    for (int i = 0; i < channels_; i++) {
      fscanf(pFile, "%lf", &this->blobs_[1]->mutable_cpu_data()[i * channels_ + i]);
    }
    fclose(pFile);

    // Initialize the compatibility matrix.
    this->blobs_[2].reset(new Blob<Dtype>(1, 1, channels_, channels_));
    caffe_set(channels_ * channels_, Dtype(0.), this->blobs_[2]->mutable_cpu_data());

    // Initialize it to have the Potts model.
    for (int c = 0; c < channels_; ++c) {
      (this->blobs_[2]->mutable_cpu_data())[c * channels_ + c] = Dtype(-1.);
    }
  }

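  // Note: spatial_kernel below is a variable-length array on the stack; 2 * num_pixels_ floats can overflow the stack for large images.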
  float spatial_kernel[2 * num_pixels_];
  float *spatial_kernel_gpu_;
  compute_spatial_kernel(spatial_kernel);
  spatial_lattice_.reset(new ModifiedPermutohedral());
  spatial_norm_.Reshape(1, 1, height_, width_);
  Dtype* norm_data_gpu ;
  Dtype*  norm_data;
  // Initialize the spatial lattice. This does not need to be computed for every image because we use a fixed size.
  switch (Caffe::mode()) {
    case Caffe::CPU:
      norm_data = spatial_norm_.mutable_cpu_data();
      spatial_lattice_->init(spatial_kernel, 2, width_, height_);
      // Calculate spatial filter normalization factors.
      norm_feed_= new Dtype[num_pixels_];
      caffe_set(num_pixels_, Dtype(1.0), norm_feed_);
      // pass norm_feed and norm_data to gpu
      spatial_lattice_->compute(norm_data, norm_feed_, 1);
      bilateral_kernel_buffer_ = new float[5 * num_pixels_];
      init_cpu = true;
      break;
    #ifndef CPU_ONLY
    case Caffe::GPU:
      CUDA_CHECK(cudaMalloc((void**)&spatial_kernel_gpu_, 2*num_pixels_ * sizeof(float))) ;
      CUDA_CHECK(cudaMemcpy(spatial_kernel_gpu_, spatial_kernel, 2*num_pixels_ * sizeof(float), cudaMemcpyHostToDevice)) ;
      spatial_lattice_->init(spatial_kernel_gpu_, 2, width_, height_);
      CUDA_CHECK(cudaMalloc((void**)&norm_feed_, num_pixels_ * sizeof(Dtype))) ;
      caffe_gpu_set(num_pixels_, Dtype(1.0), norm_feed_);
      norm_data_gpu = spatial_norm_.mutable_gpu_data();
      spatial_lattice_->compute(norm_data_gpu, norm_feed_, 1); 
      norm_data = spatial_norm_.mutable_cpu_data();
      CUDA_CHECK(cudaMalloc((void**)&bilateral_kernel_buffer_, 5 * num_pixels_ * sizeof(float))) ;
      CUDA_CHECK(cudaFree(spatial_kernel_gpu_));
      init_gpu = true;
      break;
    #endif
    default:
    LOG(FATAL) << "Unknown caffe mode.";
  }
  
  for (int i = 0; i < num_pixels_; ++i) {
    norm_data[i] = 1.0f / (norm_data[i] + 1e-20f);
  }
  bilateral_norms_.Reshape(num_, 1, height_, width_);  

  // Configure the split layer that is used to make copies of the unary term. One copy for each iteration.
  // It may be possible to optimize this calculation later.
  split_layer_bottom_vec_.clear();
  split_layer_bottom_vec_.push_back(bottom[0]);

  split_layer_top_vec_.clear();

  split_layer_out_blobs_.resize(num_iterations_);
  for (int i = 0; i < num_iterations_; i++) {
    split_layer_out_blobs_[i].reset(new Blob<Dtype>());
    split_layer_top_vec_.push_back(split_layer_out_blobs_[i].get());
  }

  LayerParameter split_layer_param;
  split_layer_.reset(new SplitLayer<Dtype>(split_layer_param));
  split_layer_->SetUp(split_layer_bottom_vec_, split_layer_top_vec_);

  // Make blobs to store outputs of each meanfield iteration. Output of the last iteration is stored in top[0].
  // So we need only (num_iterations_ - 1) blobs.
  iteration_output_blobs_.resize(num_iterations_ - 1);
  for (int i = 0; i < num_iterations_ - 1; ++i) {
    iteration_output_blobs_[i].reset(new Blob<Dtype>(num_, channels_, height_, width_));
  }
  // Make instances of MeanfieldIteration and initialize them.
  meanfield_iterations_.resize(num_iterations_);
  for (int i = 0; i < num_iterations_; ++i) {
    meanfield_iterations_[i].reset(new MeanfieldIteration<Dtype>());
    meanfield_iterations_[i]->OneTimeSetUp(
        split_layer_out_blobs_[i].get(), // unary terms
        (i == 0) ? bottom[1] : iteration_output_blobs_[i - 1].get(), // softmax input
        (i == num_iterations_ - 1) ? top[0] : iteration_output_blobs_[i].get(), // output blob
        spatial_lattice_, // spatial lattice
        &spatial_norm_); // spatial normalization factors.
  }
  this->param_propagate_down_.resize(this->blobs_.size(), true);
  LOG(INFO) << ("MultiStageMeanfieldLayer initialized.");
}
Example #19
void CUDARunner::FindBestConfiguration()
{
	unsigned long lowb=16;
	unsigned long highb=128;
	unsigned long lowt=16;
	unsigned long hight=256;
	unsigned long bestb=16;
	unsigned long bestt=16;
	int64 besttime=std::numeric_limits<int64>::max();

	if(m_requestedgrid>0 && m_requestedgrid<=65536)
	{
		lowb=m_requestedgrid;
		highb=m_requestedgrid;
	}

	if(m_requestedthreads>0 && m_requestedthreads<=65536)
	{
		lowt=m_requestedthreads;
		hight=m_requestedthreads;
	}

	for(int numb=lowb; numb<=highb; numb*=2)
	{
		for(int numt=lowt; numt<=hight; numt*=2)
		{
			AllocateResources(numb,numt);
			// clear out any existing error
			cudaError_t err=cudaGetLastError();
			err=cudaSuccess;

			int64 st=GetTimeMillis();

			for(int it=0; it<128*256*2 && err==0; it+=(numb*numt))
			{
				cutilSafeCall(cudaMemcpy(m_devin,m_in,sizeof(cuda_in),cudaMemcpyHostToDevice));

				cuda_process_helper(m_devin,m_devout,64,6,numb,numt);

				cutilSafeCall(cudaMemcpy(m_out,m_devout,numb*numt*sizeof(cuda_out),cudaMemcpyDeviceToHost));

				err=cudaGetLastError();
				if(err!=cudaSuccess)
				{
					printf("CUDA error %d\n",err);
				}
			}

			int64 et=GetTimeMillis();

			printf("Finding best configuration step end (%d,%d) %"PRI64d"ms  prev best=%"PRI64d"ms\n",numb,numt,et-st,besttime);

			if((et-st)<besttime && err==cudaSuccess)
			{
				bestb=numb;
				bestt=numt;
				besttime=et-st;
			}
		}
	}

	m_numb=bestb;
	m_numt=bestt;

	AllocateResources(m_numb,m_numt);

}
Example #20
void GRTRegenerateChains::RunGPUWorkunit(GRTWorkunitElement *WU, GRTRegenerateThreadRunData *data) {
    //printf("In RunGPUWorkunit!\n");
    //printf("Got startPoint %d\n", WU->StartPoint);
    //printf("Got endpoint %d\n", WU->EndPoint);
    //printf("Chains to regen: %d\n", WU->EndPoint - WU->StartPoint);


    unsigned char *DEVICE_Chains_To_Regen, *HOST_Interleaved_Chains_To_Regen;
    UINT4 i, j, k;
    UINT4 ChainsCompleted, CurrentChainStartOffset, PasswordSpaceOffset, CharsetOffset, StepsPerInvocation;
    int ActiveThreads;
    CHHiresTimer kernelTimer;

    // Calculate the number of chains being regen'd by this thread.
    uint32_t NumberChainsToRegen = WU->EndPoint - WU->StartPoint + 1;

    this->setNumberOfChainsToRegen(NumberChainsToRegen);

    // Allocate device memory for chains.
    if (cudaErrorMemoryAllocation ==
            cudaMalloc((void**)&DEVICE_Chains_To_Regen, this->HashLengthBytes *
                NumberChainsToRegen * sizeof(unsigned char))) {
        printf("ERROR: Cannot allocate GPU memory.  Try rebooting?\n");
        exit(1);
    }

    // Interleave chains for better GPU performance and coalescing.
    HOST_Interleaved_Chains_To_Regen = (unsigned char *)malloc(this->HashLengthBytes * NumberChainsToRegen * sizeof(unsigned char));
    memset(HOST_Interleaved_Chains_To_Regen, 0, this->HashLengthBytes * NumberChainsToRegen * sizeof(unsigned char));

    hashPasswordData chainData;
    // For each chain in our assigned range, interleave its password data
    for (i = 0; i < NumberChainsToRegen; i++) {
        UINT4 base_offset;
        // Get the chain being interleaved
        //printf("Adding chain %d\n", (i + WU->StartPoint));
        chainData = this->ChainsToRegen->at(i + WU->StartPoint);
        for (j = 0; j < (this->HashLengthBytes / 4); j++) {
            base_offset = 4 * j * NumberChainsToRegen;
            base_offset += i * 4;
            for (k = 0; k < 4; k++) {
                HOST_Interleaved_Chains_To_Regen[base_offset + k] = chainData.password[j*4 + k]; 
            }
        }
    }

    cudaMemset((void *)DEVICE_Chains_To_Regen, 0, this->HashLengthBytes * NumberChainsToRegen * sizeof(unsigned char));
    cudaMemcpy(DEVICE_Chains_To_Regen, HOST_Interleaved_Chains_To_Regen, this->HashLengthBytes * NumberChainsToRegen * sizeof(unsigned char), cudaMemcpyHostToDevice);


    // Kernel time!
    // Number of chains completed
    ChainsCompleted = 0;
    // Where we are in the current chain
    CurrentChainStartOffset = 0;
    // Calculated on the host for modulus reasons
    CharsetOffset = 0;
    PasswordSpaceOffset = 0;
    StepsPerInvocation = 1000;

    // If kernel time is set to zero, run full chains.
    if (!this->ThreadData[data->threadID].kernelTimeMs) {
        StepsPerInvocation = this->TableHeader->getChainLength();
    }
    // While we haven't finished all the chains:
    while (ChainsCompleted < NumberChainsToRegen) {
        CurrentChainStartOffset = 0;

        while (CurrentChainStartOffset < this->TableHeader->getChainLength()) {
            // Calculate the right charset offset
            CharsetOffset = CurrentChainStartOffset % this->hostConstantCharsetLengths[0];
            // PasswordSpaceOffset: The offset into the password space we are using.
            // 0, 1, 2, etc.
            PasswordSpaceOffset = (ChainsCompleted / 
                    (this->ThreadData[data->threadID].CUDABlocks * this->ThreadData[data->threadID].CUDAThreads));

            kernelTimer.start();

            // Don't overrun the end of the chain
            if ((CurrentChainStartOffset + StepsPerInvocation) > this->TableHeader->getChainLength()) {
                StepsPerInvocation = this->TableHeader->getChainLength() - CurrentChainStartOffset;
            }

            this->Launch_CUDA_Kernel(DEVICE_Chains_To_Regen, this->DEVICE_Passwords[data->threadID],
                this->DEVICE_Hashes[data->threadID], PasswordSpaceOffset, CurrentChainStartOffset,
                StepsPerInvocation, CharsetOffset, this->DEVICE_Success[data->threadID], data);

            cudaThreadSynchronize();
            // Copy the success and password data to the host
            cudaMemcpy(this->HOST_Success[data->threadID], this->DEVICE_Success[data->threadID],
                this->NumberOfHashes * sizeof(unsigned char), cudaMemcpyDeviceToHost);
            cudaMemcpy(this->HOST_Passwords[data->threadID], this->DEVICE_Passwords[data->threadID],
                this->NumberOfHashes * MAX_PASSWORD_LENGTH * sizeof(unsigned char), cudaMemcpyDeviceToHost);

            // Do something with the passwords...
            this->outputFoundHashes(data);

            // If all hashes are found, no point in continuing.
            if (this->HashFile->GetUncrackedHashCount() == 0) {
                cudaFree(DEVICE_Chains_To_Regen);
                free(HOST_Interleaved_Chains_To_Regen);
                return;
            }

            float ref_time = kernelTimer.getElapsedTimeInMilliSec();

            ActiveThreads = (this->ThreadData[data->threadID].CUDABlocks * this->ThreadData[data->threadID].CUDAThreads);
            
            if ((NumberChainsToRegen - ChainsCompleted) < ActiveThreads) {
                ActiveThreads = (NumberChainsToRegen - ChainsCompleted);
            }

            if (!silent) {
                if (this->Display) {
                    this->Display->setThreadFractionDone(data->threadID,
                            (float)(((float)((float)ChainsCompleted * (float)this->TableHeader->getChainLength() +
                        (float)CurrentChainStartOffset * (float)ActiveThreads) /
                        (float)((float)NumberChainsToRegen * (float)this->TableHeader->getChainLength()))));
                    this->Display->setThreadCrackSpeed(data->threadID, GPU_THREAD, ((ActiveThreads * StepsPerInvocation) / 1000) / ref_time);
                } else {
                    printf("Kernel Time: %0.3f ms  Step rate: %0.2f M/s Done: %0.2f%%    \r",ref_time,
                        ((ActiveThreads * StepsPerInvocation) / 1000) / ref_time,
                        (float)(100 * ((float)((float)ChainsCompleted * (float)this->TableHeader->getChainLength() +
                        (float)CurrentChainStartOffset * (float)ActiveThreads) /
                        (float)((float)NumberChainsToRegen * (float)this->TableHeader->getChainLength()))));
                    fflush(stdout);
                }
            }

            CurrentChainStartOffset += StepsPerInvocation;
            
            if (this->ThreadData[data->threadID].kernelTimeMs) {
                // Adjust the steps per invocation if needed.
                if ((ref_time > 1.1 * (float)this->ThreadData[data->threadID].kernelTimeMs) || (ref_time < 0.9 * (float)this->ThreadData[data->threadID].kernelTimeMs)) {
                    StepsPerInvocation = (UINT4)((float)StepsPerInvocation * ((float)this->ThreadData[data->threadID].kernelTimeMs / ref_time));
                    //printf("Adjusted SPI to %d\n", StepsPerInvocation);
                }
            }

        }
        ChainsCompleted += (this->ThreadData[data->threadID].CUDABlocks * this->ThreadData[data->threadID].CUDAThreads);
    }
    

    //printf("Freeing chains to regen.\n");
    // Free the chains we were working on.
    cudaFree(DEVICE_Chains_To_Regen);
    free(HOST_Interleaved_Chains_To_Regen);
}
Example #21
int main(int argc, char* argv[])
{
	// Parse test command line arguments, perform early
	// initializations.
	const char *name, *mode;
	int n, nt, sx, sy, ss, rank, szcomm;
#ifdef CUDA
	struct cudaDeviceProp props;
#endif
	test_parse(argc, argv, &name, &mode,
		&n, &nt, &sx, &sy, &ss, &rank, &szcomm
#ifdef CUDA
		, &props
#endif
		);

#ifdef CUDA
	int cpu = !strcmp(mode, "CPU");
	int gpu = !strcmp(mode, "GPU");
#else
	int cpu = 1;
	int gpu = 0;
#endif

	// Create test configuration.
	struct test_config_t* t = test_init(
		name, mode, n, nt, sx, sy, ss, rank, szcomm,
		xmin, ymin, zmin, xmax, ymax, zmax,
		bx, by, bs, ex, ey, es
#ifdef CUDA
		, &props
#endif
		);
	
	// Create another test configuration to check correctness.
	struct test_config_t* t_check = NULL;
#ifdef MPI
	if (t->rank == MPI_ROOT_NODE)
#endif
	{
		t_check = test_init(
			name, mode, n, nt, 1, 1, 1, 0, 1,
			xmin, ymin, zmin, xmax, ymax, zmax,
			bx, by, bs, ex, ey, es
#ifdef CUDA
			, &props
#endif
			);
	}
	
	// Generate the initial data distribution and load it
	// onto compute nodes.
	integer* array = (integer*)malloc(t->cpu.parent->grid->extsize * sizeof(integer));
	genirand(t->cpu.parent->grid->extsize, array);
	test_load(t, n, sx, sy, ss, sizeof(integer), (char*)array);
#ifdef MPI
	if (t->rank == MPI_ROOT_NODE)
#endif
	{
		size_t nxysb = n * n * n * sizeof(integer);
	
		// Copy the data array.
		memcpy(t_check->cpu.arrays[0], array, nxysb);
		
		// Duplicate initial distribution to the second level array.
		memcpy(t_check->cpu.arrays[1], t_check->cpu.arrays[0], nxysb);
	}
	free(array);
#ifdef VERBOSE
	printf("step 0\n");
	printf("step 1\n");
#endif
	// The time iterations loop, CPU and GPU versions.
	for (int it = 2; it < t->nt; it++)
	{
		// Run one iteration of the stencil, measuring its time.
		// In case of MPI, the time of iteration is measured together
		// with the time of data sync.
		struct timespec start, stop;
#ifdef MPI
		if (t->rank == MPI_ROOT_NODE)
#endif
		{
			stenfw_get_time(&start);
		}
#ifdef MPI
		struct grid_domain_t* subdomains = t->cpu.subdomains;

		int nsubdomains = t->cpu.nsubdomains;

		// Copy the current iteration data into boundary slices
		// and compute stencil in them.
		// Boundary slices themselves are subdomains with respect
		// to each MPI decomposition domains.
		{
			// Set subdomain data copying callbacks:
			// use simple memcpy in this case.
			for (int i = 0; i < nsubdomains; i++)
			{
				struct grid_domain_t* sub = subdomains + i;
				sub->scatter_memcpy = &grid_subcpy;
				sub->gather_memcpy = &grid_subcpy;
			}

			// Scatter domain edges for separate computation.
			grid_scatter(subdomains, &t->cpu, 0, LAYOUT_MODE_CUSTOM);
			
			// Process edge subdomains.
			for (int i = 0; i < nsubdomains; i++)
			{
				struct grid_domain_t* sub = subdomains + i;

				int nx = sub->grid[0].bx + sub->grid[0].nx + sub->grid[0].ex;
				int ny = sub->grid[0].by + sub->grid[0].ny + sub->grid[0].ey;
				int ns = sub->grid[0].bs + sub->grid[0].ns + sub->grid[0].es;

				isum13pt_cpu(nx, ny, ns,
					(integer(*)[ny][nx])sub->arrays[0],
					(integer(*)[ny][nx])sub->arrays[1],
					(integer(*)[ny][nx])sub->arrays[2]);
			}
		}
		
		// Start sharing boundary slices between linked subdomains.
		MPI_Request* reqs = (MPI_Request*)malloc(sizeof(MPI_Request) * 2 * nsubdomains);
		for (int i = 0; i < nsubdomains; i++)
		{
			struct grid_domain_t* subdomain = subdomains + i;
			struct grid_domain_t* neighbor = *(subdomain->links.dense[0]);

			assert(neighbor->grid[1].extsize == subdomain->grid[0].extsize);
		
			int szelem = sizeof(integer);

			size_t dnx = neighbor->grid[1].nx * szelem;			
			size_t dny = neighbor->grid[1].ny;
			size_t dns = neighbor->grid[1].ns;

			size_t snx = subdomain->grid[0].nx * szelem;
			size_t sbx = subdomain->grid[0].bx * szelem;
			size_t sex = subdomain->grid[0].ex * szelem;
			
			size_t sny = subdomain->grid[0].ny, sns = subdomain->grid[0].ns;
			size_t sby = subdomain->grid[0].by, sbs = subdomain->grid[0].bs;
			size_t sey = subdomain->grid[0].ey, ses = subdomain->grid[0].es;

			size_t soffset = sbx + (sbx + snx + sex) *
				(sby + sbs * (sby + sny + sey));

			struct grid_domain_t obuf;
			memset(&obuf, 0, sizeof(struct grid_domain_t));
			obuf.arrays = subdomain->arrays + 1;
			obuf.narrays = 1;
			obuf.offset = 0;
			obuf.grid[0].nx = dnx;
			obuf.grid[0].ny = dny;
			obuf.grid[0].ns = dns;
			obuf.grid->size = dnx * dny * dns;
		
			struct grid_domain_t scpy = *subdomain;
			scpy.arrays = subdomain->arrays + 2;
			scpy.narrays = 1;
			scpy.offset = soffset;
			scpy.grid[0].nx = sbx + snx + sex;
			scpy.grid[0].ny = sby + sny + sey;
			scpy.grid[0].ns = sbs + sns + ses;
			
			// Copy data to the temporary buffer.
			grid_subcpy(dnx, dny, dns, &obuf, &scpy);

			// Exchange temporary buffers with the subdomain neighbour.
			int subdomain_rank = grid_rank1d(subdomain->parent->parent, subdomain->parent->grid);
			int neighbor_rank = grid_rank1d(neighbor->parent->parent, neighbor->parent->grid);
			MPI_SAFE_CALL(MPI_Isend(subdomain->arrays[1], obuf.grid->size,
				MPI_BYTE, neighbor_rank, 0, MPI_COMM_WORLD, &reqs[2 * i]));
			MPI_SAFE_CALL(MPI_Irecv(subdomain->arrays[0], obuf.grid->size,
				MPI_BYTE, neighbor_rank, 0, MPI_COMM_WORLD, &reqs[2 * i + 1]));
#ifdef VERBOSE
			printf("sharing: send %d->%d\n", subdomain_rank, neighbor_rank);
			printf("sharing: recv %d->%d\n", neighbor_rank, subdomain_rank);
#endif
		}
#endif // MPI
		// Compute inner grid points of the subdomain.
		int nx = t->cpu.grid->bx + t->cpu.grid->nx + t->cpu.grid->ex;
		int ny = t->cpu.grid->by + t->cpu.grid->ny + t->cpu.grid->ey;
		int ns = t->cpu.grid->bs + t->cpu.grid->ns + t->cpu.grid->es;

		if (cpu)
		{
			isum13pt_cpu(nx, ny, ns,
				(integer(*)[ny][nx])t->cpu.arrays[0],
				(integer(*)[ny][nx])t->cpu.arrays[1],
				(integer(*)[ny][nx])t->cpu.arrays[2]);	
		}
#ifdef CUDA
		if (gpu)
		{
			isum13pt_gpu(nx, ny, ns,
				(integer*)t->gpu.arrays[0],
				(integer*)t->gpu.arrays[1],
				(integer*)t->gpu.arrays[2]);
#ifdef VISUALIZE
#ifndef CUDA_MAPPED
			// If GPU is not using mapped host memory, then need to fetch
			// the current iteration solution explicitly.
			// TODO: in case of MPI/CUDA/!MAPPED this copy must go AFTER
			// boundaries gathering.
			CUDA_SAFE_CALL(cudaMemcpy(t->cpu.arrays[2], t->gpu.arrays[2],
				t->gpu.grid->extsize * sizeof(real), cudaMemcpyDeviceToHost));
#endif // CUDA_MAPPED
#endif
		}
#endif // CUDA
#ifdef MPI
		// Wait for boundaries sharing completion.
		MPI_Status* statuses = (MPI_Status*)malloc(2 * nsubdomains * sizeof(MPI_Status));
		MPI_SAFE_CALL(MPI_Waitall(2 * nsubdomains, reqs, statuses));
		for (int i = 0; i < 2 * nsubdomains; i++)
			MPI_SAFE_CALL(statuses[i].MPI_ERROR);
		free(statuses);
		free(reqs);
		for (int i = 0; i < nsubdomains; i++)
		{
			struct grid_domain_t* subdomain = subdomains + i;
			
			int szelem = sizeof(integer);

			size_t dnx = subdomain->grid[1].nx * szelem;
			size_t dbx = subdomain->grid[1].bx * szelem;
			size_t dex = subdomain->grid[1].ex * szelem;
			
			size_t dny = subdomain->grid[1].ny, dns = subdomain->grid[1].ns;
			size_t dby = subdomain->grid[1].by, dbs = subdomain->grid[1].bs;
			size_t dey = subdomain->grid[1].ey, des = subdomain->grid[1].es;

			size_t doffset = dbx + (dbx + dnx + dex) *
				(dby + dbs * (dby + dny + dey));

			struct grid_domain_t dcpy = *subdomain;
			dcpy.arrays = subdomain->arrays + 2;
			dcpy.narrays = 1;
			dcpy.offset = doffset;
			dcpy.grid[0].nx = dbx + dnx + dex;
			dcpy.grid[0].ny = dby + dny + dey;
			dcpy.grid[0].ns = dbs + dns + des;

			struct grid_domain_t ibuf;
			memset(&ibuf, 0, sizeof(struct grid_domain_t));
			ibuf.arrays = subdomain->arrays;
			ibuf.narrays = 1;
			ibuf.offset = 0;
			ibuf.grid[0].nx = dnx;
			ibuf.grid[0].ny = dny;
			ibuf.grid[0].ns = dns;
		
			// Copy data to temporary buffer.
			grid_subcpy(dnx, dny, dns, &dcpy, &ibuf);

			// Swap pointers to make the last iteration in the bottom.
			char* w = subdomain->arrays[0];
			subdomain->arrays[0] = subdomain->arrays[2];
			subdomain->arrays[2] = w;
		}

		// Gather boundaries for the next time step: insert the
		// separately computed boundaries back into the subdomains.
		struct grid_domain_t target = t->cpu;
		target.narrays = 1;
		target.arrays = t->cpu.arrays + 2;
		grid_gather(&target, subdomains, 1, LAYOUT_MODE_CUSTOM);
		
		if (t->rank != MPI_ROOT_NODE)
		{
#ifdef VERBOSE
			printf("step %d\n", it);
#endif
		}
		else
#endif // MPI		
		{
			stenfw_get_time(&stop);
			printf("step %d time = ", it);
			stenfw_print_time_diff(start, stop);
			printf(" sec\n");
		}
#ifdef MPI
		if (t->rank == MPI_ROOT_NODE)
#endif
		{
			// Compute inner grid points of the control solution subdomain.
			int nx = t_check->cpu.grid->bx + t_check->cpu.grid->nx + t_check->cpu.grid->ex;
			int ny = t_check->cpu.grid->by + t_check->cpu.grid->ny + t_check->cpu.grid->ey;
			int ns = t_check->cpu.grid->bs + t_check->cpu.grid->ns + t_check->cpu.grid->es;

			isum13pt_cpu(nx, ny, ns,
				(integer(*)[ny][nx])t_check->cpu.arrays[0],
				(integer(*)[ny][nx])t_check->cpu.arrays[1],
				(integer(*)[ny][nx])t_check->cpu.arrays[2]);
		}

		// Print the stats of difference between the solution and
		// the control solution.
		test_write_imaxabsdiff(t, t_check, 2, it);

		// Swap pointers to rewrite the oldest iteration with
		// the next one.
		char* w = t->cpu.arrays[0];
		t->cpu.arrays[0] = t->cpu.arrays[1];
		t->cpu.arrays[1] = t->cpu.arrays[2];
		t->cpu.arrays[2] = w;
#ifdef CUDA
		if (gpu)
		{
			// Also swap the corresponding GPU arrays pointers.
			w = t->gpu.arrays[0];
			t->gpu.arrays[0] = t->gpu.arrays[1];
			t->gpu.arrays[1] = t->gpu.arrays[2];
			t->gpu.arrays[2] = w;
		}
#endif
#ifdef MPI
		if (t->rank == MPI_ROOT_NODE)
#endif
		{
			// Swap pointers to rewrite the oldest control solution
			// iteration with the next one.
			char* w = t_check->cpu.arrays[0];
			t_check->cpu.arrays[0] = t_check->cpu.arrays[1];
			t_check->cpu.arrays[1] = t_check->cpu.arrays[2];
			t_check->cpu.arrays[2] = w;
		}
	}

	// Dispose the test configurations.
#ifdef MPI
	if (t->rank == MPI_ROOT_NODE)
#endif
	{
		test_dispose(t_check);
	}
	test_dispose(t);

	return 0;
}
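Example #21 hides communication latency by posting nonblocking sends and receives for the boundary slices, computing the interior points, and only then waiting on the requests. A reduced sketch of that overlap pattern, assuming one pre-packed contiguous buffer per neighbour:

#include <mpi.h>

/* Post the halo exchange, do useful interior work while messages are in
 * flight, then wait. Sketch under the stated assumptions. */
void exchange_and_compute(char *sendbuf, char *recvbuf, int count,
                          int neighbor, MPI_Comm comm,
                          void (*compute_interior)(void))
{
    MPI_Request reqs[2];
    MPI_Isend(sendbuf, count, MPI_BYTE, neighbor, 0, comm, &reqs[0]);
    MPI_Irecv(recvbuf, count, MPI_BYTE, neighbor, 0, comm, &reqs[1]);

    compute_interior();   /* interior stencil overlaps the communication */

    MPI_Waitall(2, reqs, MPI_STATUSES_IGNORE);
}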
Example #22
/*
 * cl_update (CUDA version)
 */
static void update_func_cuda(void *descr[], void *arg)
{
    struct block_description *block = arg;
    int workerid = starpu_worker_get_id();
    DEBUG( "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
    if (block->bz == 0)
        fprintf(stderr,"!!! DO update_func_cuda z %d CUDA%d !!!\n", block->bz, workerid);
    else
        DEBUG( "!!! DO update_func_cuda z %d CUDA%d !!!\n", block->bz, workerid);
#ifdef STARPU_USE_MPI
    int rank = 0;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    DEBUG( "!!!           RANK %d              !!!\n", rank);
#endif
    DEBUG( "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");

    unsigned block_size_z = get_block_size(block->bz);
    unsigned i;
    update_per_worker[workerid]++;

    struct timeval tv, tv2, diff, delta = {.tv_sec = 0, .tv_usec = get_ticks()*1000};
    gettimeofday(&tv, NULL);
    timersub(&tv, &start, &tv2);
    timersub(&tv2, &last_tick[block->bz], &diff);
    while (timercmp(&diff, &delta, >=)) {
        timeradd(&last_tick[block->bz], &delta, &last_tick[block->bz]);
        timersub(&tv2, &last_tick[block->bz], &diff);
        if (who_runs_what_index[block->bz] < who_runs_what_len)
            who_runs_what[block->bz + (who_runs_what_index[block->bz]++) * get_nbz()] = -1;
    }

    if (who_runs_what_index[block->bz] < who_runs_what_len)
        who_runs_what[block->bz + (who_runs_what_index[block->bz]++) * get_nbz()] = global_workerid(workerid);

    /*
     *	Load neighbours' boundaries : TOP
     */

    /* The offset along the z axis is (block_size_z + K) */
    load_subblock_from_buffer_cuda(descr[0], descr[2], block_size_z+K);
    load_subblock_from_buffer_cuda(descr[1], descr[3], block_size_z+K);

    /*
     *	Load neighbours' boundaries : BOTTOM
     */
    load_subblock_from_buffer_cuda(descr[0], descr[4], 0);
    load_subblock_from_buffer_cuda(descr[1], descr[5], 0);

    /*
     *	Stencils ... do the actual work here :) TODO
     */

    for (i=1; i<=K; i++)
    {
        starpu_block_interface_t *oldb = descr[i%2], *newb = descr[(i+1)%2];
        TYPE *old = (void*) oldb->ptr, *new = (void*) newb->ptr;

        /* Shadow data */
        cuda_shadow_host(block->bz, old, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);

        /* And perform actual computation */
#ifdef LIFE
        cuda_life_update_host(block->bz, old, new, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
#else
        cudaMemcpy(new, old, oldb->nx * oldb->ny * oldb->nz * sizeof(*new), cudaMemcpyDeviceToDevice);
#endif /* LIFE */
    }

    cudaError_t cures;
    if ((cures = cudaThreadSynchronize()) != cudaSuccess)
        STARPU_CUDA_REPORT_ERROR(cures);

}
#endif /* STARPU_USE_CUDA */

/*
 * cl_update (CPU version)
 */
static void update_func_cpu(void *descr[], void *arg)
{
    struct block_description *block = arg;
    int workerid = starpu_worker_get_id();
    DEBUG( "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
    if (block->bz == 0)
        fprintf(stderr,"!!! DO update_func_cpu z %d CPU%d !!!\n", block->bz, workerid);
    else
        DEBUG( "!!! DO update_func_cpu z %d CPU%d !!!\n", block->bz, workerid);
#ifdef STARPU_USE_MPI
    int rank = 0;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    DEBUG( "!!!           RANK %d            !!!\n", rank);
#endif
    DEBUG( "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");

    unsigned block_size_z = get_block_size(block->bz);
    unsigned i;
    update_per_worker[workerid]++;

    struct timeval tv, tv2, diff, delta = {.tv_sec = 0, .tv_usec = get_ticks() * 1000};
    gettimeofday(&tv, NULL);
    timersub(&tv, &start, &tv2);
    timersub(&tv2, &last_tick[block->bz], &diff);
    while (timercmp(&diff, &delta, >=)) {
        timeradd(&last_tick[block->bz], &delta, &last_tick[block->bz]);
        timersub(&tv2, &last_tick[block->bz], &diff);
        if (who_runs_what_index[block->bz] < who_runs_what_len)
            who_runs_what[block->bz + (who_runs_what_index[block->bz]++) * get_nbz()] = -1;
    }

    if (who_runs_what_index[block->bz] < who_runs_what_len)
        who_runs_what[block->bz + (who_runs_what_index[block->bz]++) * get_nbz()] = global_workerid(workerid);

    /*
     *	Load neighbours' boundaries : TOP
     */

    /* The offset along the z axis is (block_size_z + K) */
    load_subblock_from_buffer_cpu(descr[0], descr[2], block_size_z+K);
    load_subblock_from_buffer_cpu(descr[1], descr[3], block_size_z+K);

    /*
     *	Load neighbours' boundaries : BOTTOM
     */
    load_subblock_from_buffer_cpu(descr[0], descr[4], 0);
    load_subblock_from_buffer_cpu(descr[1], descr[5], 0);

    /*
     *	Stencils ... do the actual work here :) TODO
     */

    for (i=1; i<=K; i++)
    {
        starpu_block_interface_t *oldb = descr[i%2], *newb = descr[(i+1)%2];
        TYPE *old = (void*) oldb->ptr, *new = (void*) newb->ptr;

        /* Shadow data */
        unsigned ldy = oldb->ldy, ldz = oldb->ldz;
        unsigned nx = oldb->nx, ny = oldb->ny, nz = oldb->nz;
        unsigned x, y, z;
        unsigned stepx = 1;
        unsigned stepy = 1;
        unsigned stepz = 1;
        unsigned idx = 0;
        unsigned idy = 0;
        unsigned idz = 0;
        TYPE *ptr = old;

#		include "shadow.h"

        /* And perform actual computation */
#ifdef LIFE
        life_update(block->bz, old, new, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
#else
        memcpy(new, old, oldb->nx * oldb->ny * oldb->nz * sizeof(*new));
#endif /* LIFE */
    }
}

/* Performance model and codelet structure */
static struct starpu_perfmodel_t cl_update_model = {
    .type = STARPU_HISTORY_BASED,
    .symbol = "cl_update"
};

starpu_codelet cl_update = {
    .where =
#ifdef STARPU_USE_CUDA
    STARPU_CUDA|
#endif
    STARPU_CPU,
    .cpu_func = update_func_cpu,
#ifdef STARPU_USE_CUDA
    .cuda_func = update_func_cuda,
#endif
    .model = &cl_update_model,
    .nbuffers = 6
};

/*
 * Save the block internal boundaries to give them to our neighbours.
 */

/* CPU version */
static void load_subblock_into_buffer_cpu(starpu_block_interface_t *block,
        starpu_block_interface_t *boundary,
        unsigned firstz)
{
    /* Sanity checks */
    STARPU_ASSERT(block->nx == boundary->nx);
    STARPU_ASSERT(block->ny == boundary->ny);
    STARPU_ASSERT(boundary->nz == K);

    /* NB: this is not fully guaranteed ... but it's *very* likely and that
     * makes our life much simpler */
    STARPU_ASSERT(block->ldy == boundary->ldy);
    STARPU_ASSERT(block->ldz == boundary->ldz);

    /* We do a contiguous memory transfer */
    size_t boundary_size = K*block->ldz*block->elemsize;

    unsigned offset = firstz*block->ldz;
    TYPE *block_data = (TYPE *)block->ptr;
    TYPE *boundary_data = (TYPE *)boundary->ptr;
    memcpy(boundary_data, &block_data[offset], boundary_size);
}

/* CUDA version */
#ifdef STARPU_USE_CUDA
static void load_subblock_into_buffer_cuda(starpu_block_interface_t *block,
        starpu_block_interface_t *boundary,
        unsigned firstz)
{
    /* Sanity checks */
    STARPU_ASSERT(block->nx == boundary->nx);
    STARPU_ASSERT(block->ny == boundary->ny);
    STARPU_ASSERT(boundary->nz == K);

    /* NB: this is not fully guaranteed ... but it's *very* likely and that
     * makes our life much simpler */
    STARPU_ASSERT(block->ldy == boundary->ldy);
    STARPU_ASSERT(block->ldz == boundary->ldz);

    /* We do a contiguous memory transfer */
    size_t boundary_size = K*block->ldz*block->elemsize;

    unsigned offset = firstz*block->ldz;
    TYPE *block_data = (TYPE *)block->ptr;
    TYPE *boundary_data = (TYPE *)boundary->ptr;
    cudaMemcpy(boundary_data, &block_data[offset], boundary_size, cudaMemcpyDeviceToDevice);
}
#endif /* STARPU_USE_CUDA */

/* Record how many top/bottom saves each worker performed */
unsigned top_per_worker[STARPU_NMAXWORKERS];
unsigned bottom_per_worker[STARPU_NMAXWORKERS];

/* top save, CPU version */
static void dummy_func_top_cpu(void *descr[] __attribute__((unused)), void *arg)
{
    struct block_description *block = arg;
    int workerid = starpu_worker_get_id();
    top_per_worker[workerid]++;

    DEBUG( "DO SAVE Bottom block %d\n", block->bz);

    /* The offset along the z axis is (block_size_z + K)- K */
    unsigned block_size_z = get_block_size(block->bz);

    load_subblock_into_buffer_cpu(descr[0], descr[2], block_size_z);
    load_subblock_into_buffer_cpu(descr[1], descr[3], block_size_z);
}

/* bottom save, CPU version */
static void dummy_func_bottom_cpu(void *descr[] __attribute__((unused)), void *arg)
{
    struct block_description *block = arg;
    int workerid = starpu_worker_get_id();
    bottom_per_worker[workerid]++;

    DEBUG( "DO SAVE Top block %d\n", block->bz);

    load_subblock_into_buffer_cpu(descr[0], descr[2], K);
    load_subblock_into_buffer_cpu(descr[1], descr[3], K);
}

/* top save, CUDA version */
#ifdef STARPU_USE_CUDA
static void dummy_func_top_cuda(void *descr[] __attribute__((unused)), void *arg)
{
    struct block_description *block = arg;
    int workerid = starpu_worker_get_id();
    top_per_worker[workerid]++;

    DEBUG( "DO SAVE Top block %d\n", block->bz);

    /* The offset along the z axis is (block_size_z + K) - K */
    unsigned block_size_z = get_block_size(block->bz);

    load_subblock_into_buffer_cuda(descr[0], descr[2], block_size_z);
    load_subblock_into_buffer_cuda(descr[1], descr[3], block_size_z);
    cudaThreadSynchronize();
}

/* bottom save, CUDA version */
static void dummy_func_bottom_cuda(void *descr[] __attribute__((unused)), void *arg)
{
    struct block_description *block = arg;
    int workerid = starpu_worker_get_id();
    bottom_per_worker[workerid]++;

    DEBUG( "DO SAVE Bottom block %d on CUDA\n", block->bz);

    load_subblock_into_buffer_cuda(descr[0], descr[2], K);
    load_subblock_into_buffer_cuda(descr[1], descr[3], K);
    cudaThreadSynchronize();
}
#endif /* STARPU_USE_CUDA */

/* Performance models and codelet for save */
static struct starpu_perfmodel_t save_cl_bottom_model = {
    .type = STARPU_HISTORY_BASED,
    .symbol = "save_cl_bottom"
};

static struct starpu_perfmodel_t save_cl_top_model = {
    .type = STARPU_HISTORY_BASED,
    .symbol = "save_cl_top"
};

starpu_codelet save_cl_bottom = {
    .where =
#ifdef STARPU_USE_CUDA
    STARPU_CUDA|
#endif
    STARPU_CPU,
    .cpu_func = dummy_func_bottom_cpu,
#ifdef STARPU_USE_CUDA
    .cuda_func = dummy_func_bottom_cuda,
#endif
    .model = &save_cl_bottom_model,
    .nbuffers = 4
};

starpu_codelet save_cl_top = {
    .where =
#ifdef STARPU_USE_CUDA
    STARPU_CUDA|
#endif
    STARPU_CPU,
    .cpu_func = dummy_func_top_cpu,
#ifdef STARPU_USE_CUDA
    .cuda_func = dummy_func_top_cuda,
#endif
    .model = &save_cl_top_model,
    .nbuffers = 4
};
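Both update functions above alternate the roles of the two data buffers across the K sub-iterations through descr[i%2] and descr[(i+1)%2]. The index arithmetic is easy to misread; a reduced sketch of the same ping-pong, with plain arrays standing in for the StarPU block interfaces:

/* Ping-pong over two buffers, mirroring the i%2 / (i+1)%2 indexing in the
 * update functions above. buf[1] is read first; after K steps the newest
 * data sits in buf[(K+1)%2]. Illustrative sketch only. */
void pingpong(float *buf[2], unsigned n, unsigned K,
              void (*step)(const float *src, float *dst, unsigned n))
{
    for (unsigned i = 1; i <= K; i++)
        step(buf[i % 2], buf[(i + 1) % 2], n);
}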
Example #23
int main (int argc, char *argv[])
{
    int rank, nprocs, ilen;
    char processor[MPI_MAX_PROCESSOR_NAME];
    double tstart = 0.0, tend = 0.0;

    MPI_Status reqstat;

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Get_processor_name(processor, &ilen);

    if (nprocs != 2)
    {
        if(rank == 0) printf("This test requires exactly two processes\n");

        MPI_Finalize();
        exit(EXIT_FAILURE);
    }

    int other_proc = (rank == 1 ? 0 : 1);

    // Hard code GPU affinity since this example only works with 2 GPUs.
    int igpu = 0;

    if(rank == 0 )
        printf("%s allocates %d MB pinned memory with regual mpi and "
               "bidirectional bandwidth\n", argv[0],
               MAX_MSG_SIZE / 1024 / 1024);

    printf("node=%d(%s): my other _proc = %d and using GPU=%d\n", rank,
            processor, other_proc, igpu);

    char *h_src, *h_rcv;
    CHECK(cudaSetDevice(igpu));
    CHECK(cudaMallocHost((void**)&h_src, MYBUFSIZE));
    CHECK(cudaMallocHost((void**)&h_rcv, MYBUFSIZE));

    char *d_src, *d_rcv;
    CHECK(cudaSetDevice(igpu));
    CHECK(cudaMalloc((void **)&d_src, MYBUFSIZE));
    CHECK(cudaMalloc((void **)&d_rcv, MYBUFSIZE));

    initalData(h_src, h_rcv, MYBUFSIZE);

    CHECK(cudaMemcpy(d_src, h_src, MYBUFSIZE, cudaMemcpyDefault));
    CHECK(cudaMemcpy(d_rcv, h_rcv, MYBUFSIZE, cudaMemcpyDefault));

    // latency test
    for(int size = 1; size <= MAX_MSG_SIZE; size = size * 2)
    {
        MPI_Barrier(MPI_COMM_WORLD);

        if(rank == 0)
        {
            tstart = MPI_Wtime();

            for(int i = 0; i < loop; i++)
            {
                /*
                 * Transfer data from the GPU to the host to be transmitted to
                 * the other MPI process.
                 */
                CHECK(cudaMemcpy(h_src, d_src, size, cudaMemcpyDeviceToHost));

                MPI_Send(h_src, size, MPI_CHAR, other_proc, 100, MPI_COMM_WORLD);
                MPI_Recv(h_rcv, size, MPI_CHAR, other_proc, 10, MPI_COMM_WORLD,
                         &reqstat);

                /*
                 * Transfer the data received from the other MPI process to
                 * the device.
                 */
                CHECK(cudaMemcpy(d_rcv, h_rcv, size, cudaMemcpyHostToDevice));
            }

            tend = MPI_Wtime();
        }
        else
        {
            for(int i = 0; i < loop; i++)
            {
                /*
                 * Transfer data from the GPU to the host to be transmitted to
                 * the other MPI process.
                 */
                CHECK(cudaMemcpy(h_src, d_src, size, cudaMemcpyDeviceToHost));

                MPI_Recv(h_rcv, size, MPI_CHAR, other_proc, 100, MPI_COMM_WORLD,
                         &reqstat);
                MPI_Send(h_src, size, MPI_CHAR, other_proc, 10, MPI_COMM_WORLD);

                /*
                 * Transfer the data received from the other MPI process to
                 * the device.
                 */
                CHECK(cudaMemcpy(d_rcv, h_rcv, size, cudaMemcpyHostToDevice));
            }
        }

        MPI_Barrier(MPI_COMM_WORLD);

        if(rank == 0)
        {
            double latency = (tend - tstart) * 1e6 / (2.0 * loop);
            float performance = (float) size / (float) latency;
            printf("%6d %s %10.2f μs %10.2f MB/sec\n",
                   (size >= 1024 * 1024) ? size / 1024 / 1024 :
                   (size >= 1024) ? size / 1024 : size,
                   (size >= 1024 * 1024) ? "MB" :
                   (size >= 1024) ? "KB" : "B", latency, performance);

            fflush(stdout);
        }
    }

    CHECK(cudaFreeHost(h_src));
    CHECK(cudaFreeHost(h_rcv));

    CHECK(cudaSetDevice(igpu));
    CHECK(cudaFree(d_src));
    CHECK(cudaFree(d_rcv));

    MPI_Finalize();

    return EXIT_SUCCESS;
}
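The divisor 2.0 * loop in the rank-0 timing turns the total elapsed time into a one-way latency: each iteration is one full round trip, so half of the averaged round-trip time is attributed to each direction. The same arithmetic in isolation, as an illustrative helper that is not part of the original test:

/* One-way latency in microseconds from a timed ping-pong of `loop`
 * round trips. Illustrative helper only. */
double one_way_latency_us(double tstart, double tend, int loop)
{
    return (tend - tstart) * 1e6 / (2.0 * loop);
}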
 void update_kt_factor( value_type* host_kt_factor )
 {
     cuda_assert( cudaMemcpy( reinterpret_cast<void*>(data.kt_factor), reinterpret_cast<const void*>(host_kt_factor), config.tilt_size*sizeof(value_type)*3, cudaMemcpyHostToDevice ) );
 }
void copy_host_to_device(const size_t size, double *h_input,double *h_output,double *d_input,double *d_output){

        CHECK_CUDA(cudaMemcpy(d_output, h_output, size, cudaMemcpyHostToDevice));
        CHECK_CUDA(cudaMemcpy(d_input, h_input, size, cudaMemcpyHostToDevice));
}
Example #26
psaError_t transferGPUtoCPU(alignments_t *alignments)
{	
	CUDA_ERROR(cudaMemcpy(alignments->h_results, alignments->d_results, alignments->num * sizeof(alignmentEntry_t), cudaMemcpyDeviceToHost));

	return (SUCCESS);
}
int _tmain(int argc, _TCHAR* argv[]) 
{
	uchar4 *h_inputImageRGBA,  *d_inputImageRGBA;
	uchar4 *h_outputImageRGBA, *d_outputImageRGBA;
	unsigned char *d_redBlurred, *d_greenBlurred, *d_blueBlurred;

	float *h_filter;
	int    filterWidth;

	//PreProcess
	const std::string filename("./cinque_terre_small.jpg");
	cv::Mat imageInputRGBA;
	cv::Mat imageOutputRGBA;

	//make sure the context initializes ok
	checkCudaErrors(cudaFree(0));

	cv::Mat image = cv::imread(filename.c_str(), CV_LOAD_IMAGE_COLOR);
  
	if (image.empty())
	{
		std::cerr << "Couldn't open file: " << filename << std::endl;
		cv::waitKey(0);
		exit(1);
	}

	cv::cvtColor(image, imageInputRGBA, CV_BGR2RGBA);

	//allocate memory for the output
	imageOutputRGBA.create(image.rows, image.cols, CV_8UC4);

	//This shouldn't ever happen given the way the images are created
	//at least based upon my limited understanding of OpenCV, but better to check
	if (!imageInputRGBA.isContinuous() || !imageOutputRGBA.isContinuous()) {
		std::cerr << "Images aren't continuous!! Exiting." << std::endl;
		exit(1);
	}

	h_inputImageRGBA  = (uchar4 *)imageInputRGBA.ptr<unsigned char>(0);
	h_outputImageRGBA = (uchar4 *)imageOutputRGBA.ptr<unsigned char>(0);

	const size_t numPixels = image.rows * image.cols;
	//allocate memory on the device for both input and output
	checkCudaErrors(cudaMalloc(&d_inputImageRGBA, sizeof(uchar4) * numPixels));
	checkCudaErrors(cudaMalloc(&d_outputImageRGBA, sizeof(uchar4) * numPixels));
	checkCudaErrors(cudaMemset(d_outputImageRGBA, 0, numPixels * sizeof(uchar4))); //make sure no memory is left laying around

	//copy input array to the GPU
	checkCudaErrors(cudaMemcpy(d_inputImageRGBA, h_inputImageRGBA, sizeof(uchar4) * numPixels, cudaMemcpyHostToDevice));

	//now create the filter that they will use
	const int blurKernelWidth = 9;
	const float blurKernelSigma = 2.;

	filterWidth = blurKernelWidth;

	//create and fill the filter we will convolve with
	h_filter = new float[blurKernelWidth * blurKernelWidth];

	float filterSum = 0.f; //for normalization

	for (int r = -blurKernelWidth/2; r <= blurKernelWidth/2; ++r) 
	{
		for (int c = -blurKernelWidth/2; c <= blurKernelWidth/2; ++c)
		{
			float filterValue = expf( -(float)(c * c + r * r) / (2.f * blurKernelSigma * blurKernelSigma));
			h_filter[(r + blurKernelWidth/2) * blurKernelWidth + c + blurKernelWidth/2] = filterValue;
			filterSum += filterValue;
		}
	}

	float normalizationFactor = 1.f / filterSum;

	for (int r = -blurKernelWidth/2; r <= blurKernelWidth/2; ++r)
	{
		for (int c = -blurKernelWidth/2; c <= blurKernelWidth/2; ++c)
		{
			h_filter[(r + blurKernelWidth/2) * blurKernelWidth + c + blurKernelWidth/2] *= normalizationFactor;
		}
	}

	//blurred
	checkCudaErrors(cudaMalloc(&d_redBlurred,    sizeof(unsigned char) * numPixels));
	checkCudaErrors(cudaMalloc(&d_greenBlurred,  sizeof(unsigned char) * numPixels));
	checkCudaErrors(cudaMalloc(&d_blueBlurred,   sizeof(unsigned char) * numPixels));
	checkCudaErrors(cudaMemset(d_redBlurred,   0, sizeof(unsigned char) * numPixels));
	checkCudaErrors(cudaMemset(d_greenBlurred, 0, sizeof(unsigned char) * numPixels));
	checkCudaErrors(cudaMemset(d_blueBlurred,  0, sizeof(unsigned char) * numPixels));


	allocateMemoryAndCopyToGPU(image.rows, image.cols, h_filter, filterWidth);
	GpuTimer timer;
	timer.Start();
	//call the students' code
	your_gaussian_blur(h_inputImageRGBA, d_inputImageRGBA, d_outputImageRGBA, image.rows, image.cols,
						d_redBlurred, d_greenBlurred, d_blueBlurred, filterWidth);
	timer.Stop();
	cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
	int err = printf("%f msecs.\n", timer.Elapsed());

	if (err < 0) {
		//Couldn't print! Probably the student closed stdout - bad news
		std::cerr << "Couldn't print timing information! STDOUT Closed!" << std::endl;
		exit(1);
	}

	cleanup();

	//check results and output the blurred image
	//PostProcess

	//copy the output back to the host
	checkCudaErrors(cudaMemcpy(imageOutputRGBA.ptr<unsigned char>(0), d_outputImageRGBA, sizeof(uchar4) * numPixels, cudaMemcpyDeviceToHost));

	cv::Mat imageOutputBGR;
	cv::cvtColor(imageOutputRGBA, imageOutputBGR, CV_RGBA2BGR);
	//output the image
	cv::imwrite("./blurredResult.jpg", imageOutputBGR);

	cv::namedWindow( "Display window", CV_WINDOW_NORMAL);
	cv::imshow("Display window", imageOutputBGR);
	
	cv::waitKey(0);


	checkCudaErrors(cudaFree(d_inputImageRGBA));
	checkCudaErrors(cudaFree(d_outputImageRGBA));
	checkCudaErrors(cudaFree(d_redBlurred));
	checkCudaErrors(cudaFree(d_greenBlurred));
	checkCudaErrors(cudaFree(d_blueBlurred));
	delete[] h_filter;

	return 0;
}
Example #28
//-------------------------------------------------------
//copy a buffer from host memory to device memory
//
//param	: des
//param	: src
//param	: size
//-------------------------------------------------------
void D_MEMCPY_H2D(void *des, void *src, size_t size)
{
	CUDA_SAFE_CALL(cudaMemcpy(des, src, size, cudaMemcpyHostToDevice));
}
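Several wrappers used throughout these examples (CHECK, CHECK_CUDA, CUDA_SAFE_CALL, checkCudaErrors) are not defined in the excerpts. A plausible reconstruction, offered only as a sketch of what such a macro typically does, checks the returned cudaError_t and aborts with the failing location:

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

/* Plausible reconstruction of the error-check wrappers used in these
 * examples; the real projects may define them differently. */
#define CUDA_SAFE_CALL(call)                                            \
    do {                                                                \
        cudaError_t err_ = (call);                                      \
        if (err_ != cudaSuccess) {                                      \
            fprintf(stderr, "CUDA error %s at %s:%d\n",                 \
                    cudaGetErrorString(err_), __FILE__, __LINE__);      \
            exit(EXIT_FAILURE);                                         \
        }                                                               \
    } while (0)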
Example #29
void testCusolver(int rows, int cols, int nnz, int *row_ptr, int *col_index, double *values,
		double *valuesB){
    // --- Initialize cuSolver and cuSPARSE
 	cusolverSpHandle_t cusolver_handle = NULL; checkCudaErrors(cusolverSpCreate(&cusolver_handle));
 	cusparseHandle_t cusparse_handle = NULL; checkCudaErrors(cusparseCreate(&cusparse_handle));
 	cudaStream_t cudaStream = NULL; checkCudaErrors(cudaStreamCreate(&cudaStream));
 	checkCudaErrors(cusolverSpSetStream(cusolver_handle, cudaStream));
 	checkCudaErrors(cusparseSetStream(cusparse_handle, cudaStream));


    cusparseMatDescr_t descrA;      checkCudaErrors(cusparseCreateMatDescr(&descrA));
    checkCudaErrors(cusparseSetMatType     (descrA, CUSPARSE_MATRIX_TYPE_GENERAL));
    checkCudaErrors(cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ONE));

    double start, stop, time_to_solve;
    start = second();

    // --- Device side dense matrix
    printf("\nAlloc GPU memory...\n");
    double *d_A;            checkCudaErrors(cudaMalloc(&d_A, nnz * sizeof(double)));
    int *d_A_RowIndices;    checkCudaErrors(cudaMalloc(&d_A_RowIndices, (rows + 1) * sizeof(int)));
    int *d_A_ColIndices;    checkCudaErrors(cudaMalloc(&d_A_ColIndices, nnz * sizeof(int)));
    double *d_x;        checkCudaErrors(cudaMalloc(&d_x, rows * sizeof(double)));

    checkCudaErrors(cudaMemcpy(d_A, values, nnz * sizeof(double), cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_A_RowIndices, row_ptr, (rows + 1) * sizeof(int), cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_A_ColIndices, col_index, nnz * sizeof(int), cudaMemcpyHostToDevice));

    double *d_b; checkCudaErrors(cudaMalloc(&d_b, rows * sizeof(double)));
    checkCudaErrors(cudaMemcpy(d_b, valuesB, rows * sizeof(double), cudaMemcpyHostToDevice));

    double *h_x = (double *)malloc(rows * sizeof(double));

    double tol = 1.e-12;
    int reorder = 0;
    int singularity = 0;
	printf("\nProcessing in GPU using cusolver QR...\n");


    //checkCudaErrors(cusolverSpDcsrlsvluHost(cusolver_handle, Nrows, nnz, descrA, sparse.Values(),
    	//	sparse.RowPtr(), sparse.ColIdx(), mB.values, tol, reorder, h_x, &singularity));
    checkCudaErrors(cusolverSpDcsrlsvqr(cusolver_handle, rows, nnz, descrA, d_A, d_A_RowIndices,
           		d_A_ColIndices, d_b, tol, reorder, d_x, &singularity));
    checkCudaErrors(cudaDeviceSynchronize());
    stop = second();
    time_to_solve = stop - start;
    checkCudaErrors(cudaMemcpy(h_x, d_x, rows * sizeof(double), cudaMemcpyDeviceToHost));

    double minusOne = -1.0;
    double one = 1.0;
    double *d_r; checkCudaErrors(cudaMalloc((void **)&d_r, sizeof(double)*rows));
    checkCudaErrors(cudaMemcpy(d_r, d_b, sizeof(double)*rows, cudaMemcpyDeviceToDevice));
    checkCudaErrors(cusparseDcsrmv(cusparse_handle,
            CUSPARSE_OPERATION_NON_TRANSPOSE,
            rows,
            cols,
            nnz,
            &minusOne,
            descrA,
            d_A,
            d_A_RowIndices,
            d_A_ColIndices,
            d_x,
            &one,
            d_r));
    double *h_r = (double*) malloc(rows * sizeof(double));
    checkCudaErrors(cudaMemcpy(h_r, d_r, rows * sizeof(double), cudaMemcpyDeviceToHost));

    double r_inf = vec_norminf(rows, h_r);

    printf("(GPU - cuSolver) Time (sec): %f\n", time_to_solve);
    printf("(Eigen) |b - A*x| = %E \n", r_inf);

    checkCudaErrors(cusparseDestroy(cusparse_handle));
    checkCudaErrors(cusolverSpDestroy(cusolver_handle));
    checkCudaErrors(cudaStreamDestroy(cudaStream));
    checkCudaErrors(cudaFree(d_b));
    checkCudaErrors(cudaFree(d_x));
    checkCudaErrors(cudaFree(d_r));

    checkCudaErrors(cudaFree(d_A));
    checkCudaErrors(cudaFree(d_A_RowIndices));
    checkCudaErrors(cudaFree(d_A_ColIndices));

    free(h_x);
    free(h_r);
}
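The residual check above computes r = b - A*x with cusparseDcsrmv and then takes its infinity norm through a helper vec_norminf whose definition is not shown. Assuming the helper does exactly what its name suggests, a host-side implementation might look like:

#include <math.h>

/* Infinity norm of an n-vector: the assumed behaviour of the unshown
 * vec_norminf helper used in the residual check. */
double vec_norminf(int n, const double *x)
{
    double norm = 0.0;
    for (int i = 0; i < n; i++) {
        double xi = fabs(x[i]);
        if (xi > norm) norm = xi;
    }
    return norm;
}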
Example #30
int main(int argc, char* argv[])
{
	double AllTime=0;
    double x0=0.0, y0=0.0, z0=0.0;
    double xn=10.0, yn=10.0, zn=10.0;
    int Sx=100, Sy=100, Sz=100, St=100;
	if (argc>1) Sx=Sy=Sz=atoi(argv[1]);
    double * masprev;
    double * masnext;
    masprev=new double[Sx*Sy*Sz];
    masnext=new double[Sx*Sy*Sz];
    double dx=(xn-x0)/Sx, dy=(yn-y0)/Sy, dz=(zn-z0)/Sz;
   
    FILE* filex=fopen("filex.txt","w");
    FILE* filey=fopen("filey.txt","w");
    FILE* filez=fopen("filez.txt","w");	

	fprintf(filex,"%d %f %f\n",Sx,x0,xn);
	fprintf(filey,"%d %f %f\n",Sy,y0,xn);
	fprintf(filez,"%d %f %f\n",Sz,z0,xn);
	
	fprintf(filex,"%f %f\n",4.0,0);
	fprintf(filey,"%f %f\n",4.0,0);
	fprintf(filez,"%f %f\n",4.0,0);
	
    double dt=0.000001;			// choose the time step dt
    memset(masprev, 0, Sx*Sy*Sz*sizeof(double));
	memset(masnext, 0, Sx*Sy*Sz*sizeof(double));
    for (int x=1; x<Sx-1; x++)
        for(int y=1; y<Sy-1; y++)
            for(int z=1; z<Sz-1; z++)
                masprev[x+y*Sx+z*Sx*Sy]=u(x0+dx*x, y0+dy*y, z0+dz*z);
	int maxi = 0; double max = 0.0;
	for( int i = 0; i < Sx*Sy*Sz; i ++ ){
		if( masprev[i] > max ){
			max = masprev[i];
			maxi = i;
		}
	}
	printf("i=%d, max=%f\n", maxi, max);
	for(int x=1; x<Sx-1; x++)
		fprintf(filex,"%f ", masprev[x+25*(Sx)+25*(Sx)*(Sy)]);	// write the initial
	for(int y=1; y<Sy-1; y++)
		fprintf(filey,"%f ", masprev[25+y*(Sx)+25*(Sx)*(Sy)]);	// slices along each
	for(int z=1; z<Sz-1; z++)
		fprintf(filez,"%f ", masprev[25+25*(Sx)+z*(Sx)*(Sy)]);	// axis to the files

	fprintf(filex,"\n");	// terminate each
	fprintf(filey,"\n");	// file's row with
	fprintf(filez,"\n");	// a newline
	
	double Time;
	double _Time;
    double *dev_a,*dev_b;
	cudaMalloc( (void**)&dev_a, Sx*Sy*Sz*sizeof(double));
	cudaMalloc( (void**)&dev_b, Sx*Sy*Sz*sizeof(double));
	cudaMemset(dev_a,0, Sx*Sy*Sz*sizeof(double));
	cudaMemset(dev_b,0,Sx*Sy*Sz*sizeof(double));
	cudaMemcpy(dev_a,masprev,Sx*Sy*Sz*sizeof(double),cudaMemcpyHostToDevice);
	
	double ddx=1.0;//(dx*dx);
	double ddy=1.0;//(dy*dy);
	double ddz=1.0;//(dz*dz);

	Time=PortableGetTime();
	for (int t=1; t<St; t++)
    {
		StartCuda(dev_a,dev_b,Sx, Sy, Sz,dx,dy,dz,x0,y0,z0,dt,ddx,ddy,ddz);
		cudaMemcpy(masprev,dev_b,Sx*Sy*Sz*sizeof(double),cudaMemcpyDeviceToHost);
		double* tmp=dev_b;
		dev_b=dev_a;
		dev_a=tmp;
		for(int x=1; x<Sx-1; x++)
			fprintf(filex,"%f ", masprev[x+25*(Sx)+25*(Sx)*(Sy)]);	// write the current
		for(int y=1; y<Sy-1; y++)
			fprintf(filey,"%f ", masprev[25+y*(Sx)+25*(Sx)*(Sy)]);	// slices along each
		for(int z=1; z<Sz-1; z++)
			fprintf(filez,"%f ", masprev[25+25*(Sx)+z*(Sx)*(Sy)]);	// axis to the files

		fprintf(filex,"\n");	// terminate each
		fprintf(filey,"\n");	// file's row with
		fprintf(filez,"\n");	// a newline

    }
	for(int x=1; x<Sx-1; x++)
		fprintf(filex,"%f ", masprev[x+25*(Sx)+25*(Sx)*(Sy)]);	// write the final
	for(int y=1; y<Sy-1; y++)
		fprintf(filey,"%f ", masprev[25+y*(Sx)+25*(Sx)*(Sy)]);	// slices along each
	for(int z=1; z<Sz-1; z++)
		fprintf(filez,"%f ", masprev[25+25*(Sx)+z*(Sx)*(Sy)]);	// axis to the files

	fprintf(filex,"\n");	// terminate each
	fprintf(filey,"\n");	// file's row with
	fprintf(filez,"\n");	// a newline
	
	_Time=PortableGetTime();
	AllTime=_Time-Time;
	
	printf(" %lf \n",AllTime);

	system("PAUSE");
	fclose(filex);
    fclose(filey);
    fclose(filez);
	delete[] masprev;
	delete[] masnext;
	cudaFree(dev_a);
	cudaFree(dev_b);
    return 0;
}
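Example #30 addresses the 3D grid through the flattened expression x + y*Sx + z*Sx*Sy in every loop. A small helper makes that layout explicit; this is an illustrative addition, not part of the original:

/* x-fastest linearization of an Sx*Sy*Sz grid, matching the indexing used
 * throughout Example #30: consecutive x values are adjacent in memory. */
static inline int idx3d(int x, int y, int z, int Sx, int Sy)
{
    return x + y * Sx + z * Sx * Sy;
}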