Ejemplo n.º 1
0
void Core::Render()
{
#ifndef FNK_FINAL
	static GpuTimer gpuTimer;
	if ( m_showAnalyticsGui ) 
	{
		gpuTimer.Begin();
	}
#endif

	Timer timer;
	{
		BaseEntityGroup::Render();
		VirtualMachine::Get()->Render();	
	}
	m_msRender = timer.GetTimeMs();
	VirtualMachine::Get()->Gui();

	if ( m_showAnalyticsGui ) 
	{
		GuiStats();

#ifndef FNK_FINAL
		gpuTimer.End();
		m_msGpu = gpuTimer.GetTimeMs();
#endif
	}
}
Ejemplo n.º 2
0
int main(int argc, char **argv) {
  uchar4 *h_inputImageRGBA,  *d_inputImageRGBA;
  uchar4 *h_outputImageRGBA, *d_outputImageRGBA;
  unsigned char *d_redBlurred, *d_greenBlurred, *d_blueBlurred;

  float *h_filter;
  int    filterWidth;

  std::string input_file;
  std::string output_file;
  if (argc == 3) {
    input_file  = std::string(argv[1]);
    output_file = std::string(argv[2]);
  }
  else {
    std::cerr << "Usage: ./hw input_file output_file" << std::endl;
    exit(1);
  }
  //load the image and give us our input and output pointers
  preProcess(&h_inputImageRGBA, &h_outputImageRGBA, &d_inputImageRGBA, &d_outputImageRGBA,
             &d_redBlurred, &d_greenBlurred, &d_blueBlurred,
             &h_filter, &filterWidth, input_file);

  allocateMemoryAndCopyToGPU(numRows(), numCols(), h_filter, filterWidth);
  GpuTimer timer;
  timer.Start();
  //call the students' code
  your_gaussian_blur(h_inputImageRGBA, d_inputImageRGBA, d_outputImageRGBA, numRows(), numCols(),
                     d_redBlurred, d_greenBlurred, d_blueBlurred, filterWidth);
  timer.Stop();
  cudaDeviceSynchronize(); //checkCudaErrors(cudaGetLastError());
  int err = printf("%f msecs.\n", timer.Elapsed());

  if (err < 0) {
    //Couldn't print! Probably the student closed stdout - bad news
    std::cerr << "Couldn't print timing information! STDOUT Closed!" << std::endl;
    exit(1);
  }

  cleanup();
  //check results and output the blurred image
  postProcess(output_file);

  checkCudaErrors(cudaFree(d_redBlurred));
  checkCudaErrors(cudaFree(d_greenBlurred));
  checkCudaErrors(cudaFree(d_blueBlurred));

  return 0;
}
Ejemplo n.º 3
0
int main(int argc, char **argv)
{
	//uchar4 is defined in vector_types.h in the SDK
	//uchar4 { unsigned char x,y,z,w; {
	uchar4 *h_rgbaImage, *d_rgbaImage;
	unsigned char *h_greyImage, *d_greyImage;

	string input_file;
	string output_file;

	input_file = "cinque_terre_small.jpg";
	output_file = "output_terre_small.jpg";	

	/*Load image and give input and output pointers*/
	preProcess(&h_rgbaImage,&h_greyImage,&d_rgbaImage,&d_greyImage, input_file);

	/*Start Timer*/
	GpuTimer timer;
	timer.Start();

	/*Execute Function*/
	rgb_to_grey(h_rgbaImage,d_rgbaImage,d_greyImage,numRows(),numCols());

	/*Stop Timer*/
	timer.Stop();	
	//cudaDeviceSynchronize() forces the program to wait for all previously issued commands in all streams on the device to finish before continuing
	cudaDeviceSynchronize();
	checkCudaErrors(cudaGetLastError());
	cout<<"Code ran in: "<<timer.Elapsed())<<" seconds"<<endl;

	//Copy the image from the device back to the host
	size_t numPixels = numRows() * numCols();
	checkCudaErrors(cudaMemCpy(h_greyImage,d_greyImage,sizeof(unsigned char) * numPixels, cudaMemcpyDeviceToHost));
	
	//check results and output the grey image	
	
	
	
	

	

	
	
	
	
	

}//end of main
Ejemplo n.º 4
0
int main(int argc, char **argv) {
  uchar4 *h_sourceImg, *h_destImg, *h_blendedImg;
  size_t numRowsSource, numColsSource;

  std::string input_source_file;
  std::string input_dest_file;
  std::string output_file;
  if (argc == 4) {
    input_source_file = std::string(argv[1]);
    input_dest_file   = std::string(argv[2]);
    output_file       = std::string(argv[3]);
  }
  else {
    std::cerr << "Usage: ./hw input_source_file input_dest_file output_file" << std::endl;
    exit(1);
  }
  //load the image and give us our input and output pointers
  preProcess(&h_sourceImg, numRowsSource, numColsSource,
             &h_destImg,
             &h_blendedImg, input_source_file, input_dest_file);

  GpuTimer timer;
  timer.Start();

  //call the students' code
  your_blend(h_sourceImg, numRowsSource, numColsSource,
             h_destImg,
             h_blendedImg);

  timer.Stop();
  cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
  int err = printf("e57__TIMING__f82 %f msecs.\n", timer.Elapsed());

  if (err < 0) {
    //Couldn't print! Probably the student closed stdout - bad news
    std::cerr << "Couldn't print timing information! STDOUT Closed!" << std::endl;
    exit(1);
  }

  //check results and output the tone-mapped image
  postProcess(h_blendedImg, numRowsSource, numColsSource, output_file);

  delete[] h_destImg;
  delete[] h_sourceImg;
  delete[] h_blendedImg;
  return 0;
}
Ejemplo n.º 5
0
int main(int argc, char **argv) {
  float *d_luminance;
  unsigned int *d_cdf;

  size_t numRows, numCols;
  unsigned int numBins;

  std::string input_file;
  std::string output_file;
  if (argc == 3) {
    input_file  = std::string(argv[1]);
    output_file = std::string(argv[2]);
  }
  else {
    std::cerr << "Usage: ./hw input_file output_file" << std::endl;
    exit(1);
  }
  //load the image and give us our input and output pointers
  preProcess(&d_luminance, &d_cdf,
             &numRows, &numCols, &numBins, input_file);

  GpuTimer timer;
  float min_logLum, max_logLum;
  min_logLum = 0.f;
  max_logLum = 1.f;
  timer.Start();
  //call the students' code
  your_histogram_and_prefixsum(d_luminance, d_cdf, min_logLum, max_logLum,
                               numRows, numCols, numBins);
  timer.Stop();
  cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
  int err = printf("%f msecs.\n", timer.Elapsed());

  if (err < 0) {
    //Couldn't print! Probably the student closed stdout - bad news
    std::cerr << "Couldn't print timing information! STDOUT Closed!" << std::endl;
    exit(1);
  }

  //check results and output the tone-mapped image
  postProcess(output_file, numRows, numCols, min_logLum, max_logLum);

  return 0;
}
Ejemplo n.º 6
0
int main(int argc, char **argv) {
  uchar4        *h_rgbaImage, *d_rgbaImage;
  unsigned char *h_greyImage, *d_greyImage;

  std::string input_file;
  std::string output_file;
  if (argc == 3) {
    input_file  = std::string(argv[1]);
    output_file = std::string(argv[2]);
  }
  else {
    std::cerr << "Usage: ./hw input_file output_file" << std::endl;
    exit(1);
  }
  //load the image and give us our input and output pointers
  preProcess(&h_rgbaImage, &h_greyImage, &d_rgbaImage, &d_greyImage, input_file);

  GpuTimer timer;
  timer.Start();
  //call the students' code
  your_rgba_to_greyscale(h_rgbaImage, d_rgbaImage, d_greyImage, numRows(), numCols());
  timer.Stop();
  cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
  printf("\n");
  int err = printf("%f msecs.\n", timer.Elapsed());

  if (err < 0) {
    //Couldn't print! Probably the student closed stdout - bad news
    std::cerr << "Couldn't print timing information! STDOUT Closed!" << std::endl;
    exit(1);
  }

  //check results and output the grey image
  postProcess(output_file);

  return 0;
}
Ejemplo n.º 7
0
int main(int argc, char **argv) {
  unsigned int *inputVals;
  unsigned int *inputPos;
  unsigned int *outputVals;
  unsigned int *outputPos;

  size_t numElems;

  std::string input_file;
  std::string template_file;
  std::string output_file;
  std::string reference_file;
  double perPixelError = 0.0;
  double globalError   = 0.0;
  bool useEpsCheck = false;

  switch (argc)
  {
  case 3:
    input_file  = std::string(argv[1]);
      template_file = std::string(argv[2]);
    output_file = "HW4_output.png";
    break;
  case 4:
    input_file  = std::string(argv[1]);
      template_file = std::string(argv[2]);
    output_file = std::string(argv[3]);
    break;
  default:
          std::cerr << "Usage: ./HW4 input_file template_file [output_filename]" << std::endl;
          exit(1);
  }
  //load the image and give us our input and output pointers
  preProcess(&inputVals, &inputPos, &outputVals, &outputPos, numElems, input_file, template_file);

  GpuTimer timer;
  timer.Start();

  //call the students' code
  your_sort(inputVals, inputPos, outputVals, outputPos, numElems);

  timer.Stop();
  cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
  printf("\n");
  int err = printf("Your code ran in: %f msecs.\n", timer.Elapsed());

  if (err < 0) {
    //Couldn't print! Probably the student closed stdout - bad news
    std::cerr << "Couldn't print timing information! STDOUT Closed!" << std::endl;
    exit(1);
  }

  //check results and output the red-eye corrected image
  postProcess(outputVals, outputPos, numElems, output_file);

  // check code moved from HW4.cu
  /****************************************************************************
  * You can use the code below to help with debugging, but make sure to       *
  * comment it out again before submitting your assignment for grading,       *
  * otherwise this code will take too much time and make it seem like your    *
  * GPU implementation isn't fast enough.                                     *
  *                                                                           *
  * This code MUST RUN BEFORE YOUR CODE in case you accidentally change       *
  * the input values when implementing your radix sort.                       *
  *                                                                           *
  * This code performs the reference radix sort on the host and compares your *
  * sorted values to the reference.                                           *
  *                                                                           *
  * Thrust containers are used for copying memory from the GPU                *
  * ************************************************************************* */
  thrust::device_ptr<unsigned int> d_inputVals(inputVals);
  thrust::device_ptr<unsigned int> d_inputPos(inputPos);

  thrust::host_vector<unsigned int> h_inputVals(d_inputVals,
                                                d_inputVals+numElems);
  thrust::host_vector<unsigned int> h_inputPos(d_inputPos,
                                               d_inputPos + numElems);

  thrust::host_vector<unsigned int> h_outputVals(numElems);
  thrust::host_vector<unsigned int> h_outputPos(numElems);

  reference_calculation(&h_inputVals[0], &h_inputPos[0],
            &h_outputVals[0], &h_outputPos[0],
            numElems);

  //postProcess(&h_outputVals[0], &h_outputPos[0], numElems, reference_file);

  compareImages(reference_file, output_file, useEpsCheck, perPixelError, globalError);

  thrust::device_ptr<unsigned int> d_outputVals(outputVals);
  thrust::device_ptr<unsigned int> d_outputPos(outputPos);

  thrust::host_vector<unsigned int> h_yourOutputVals(d_outputVals,
                                                     d_outputVals + numElems);
  thrust::host_vector<unsigned int> h_yourOutputPos(d_outputPos,
                                                    d_outputPos + numElems);

  checkResultsExact(&h_outputVals[0], &h_yourOutputVals[0], numElems);
  checkResultsExact(&h_outputPos[0], &h_yourOutputPos[0], numElems);

  checkCudaErrors(cudaFree(inputVals));
  checkCudaErrors(cudaFree(inputPos));
  checkCudaErrors(cudaFree(outputVals));
  checkCudaErrors(cudaFree(outputPos));

  return 0;
}
Ejemplo n.º 8
0
int main(int argc, char **argv) {
  uchar4        *h_rgbaImage, *d_rgbaImage;
  unsigned char *h_greyImage, *d_greyImage;

  std::string input_file;
  std::string output_file;
  std::string reference_file;
  double perPixelError = 0.0;
  double globalError   = 0.0;
  bool useEpsCheck = false;
  switch (argc)
  {
	case 2:
	  input_file = std::string(argv[1]);
	  output_file = "HW1_output.png";
	  reference_file = "HW1_reference.png";
	  break;
	case 3:
	  input_file  = std::string(argv[1]);
      output_file = std::string(argv[2]);
	  reference_file = "HW1_reference.png";
	  break;
	case 4:
	  input_file  = std::string(argv[1]);
      output_file = std::string(argv[2]);
	  reference_file = std::string(argv[3]);
	  break;
	case 6:
	  useEpsCheck=true;
	  input_file  = std::string(argv[1]);
	  output_file = std::string(argv[2]);
	  reference_file = std::string(argv[3]);
	  perPixelError = atof(argv[4]);
      globalError   = atof(argv[5]);
	  break;
	default:
      std::cerr << "Usage: ./HW1 input_file [output_filename] [reference_filename] [perPixelError] [globalError]" << std::endl;
      exit(1);
  }
  //load the image and give us our input and output pointers
  preProcess(&h_rgbaImage, &h_greyImage, &d_rgbaImage, &d_greyImage, input_file);

  GpuTimer timer;
  timer.Start();
  //call the students' code
  your_rgba_to_greyscale(h_rgbaImage, d_rgbaImage, d_greyImage, numRows(), numCols());
  timer.Stop();
  cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());

  int err = printf("Your code ran in: %f msecs.\n", timer.Elapsed());

  if (err < 0) {
    //Couldn't print! Probably the student closed stdout - bad news
    std::cerr << "Couldn't print timing information! STDOUT Closed!" << std::endl;
    exit(1);
  }

  size_t numPixels = numRows()*numCols();
  checkCudaErrors(cudaMemcpy(h_greyImage, d_greyImage, sizeof(unsigned char) * numPixels, cudaMemcpyDeviceToHost));

  //check results and output the grey image
  postProcess(output_file, h_greyImage);

  referenceCalculation(h_rgbaImage, h_greyImage, numRows(), numCols());

  postProcess(reference_file, h_greyImage);

  //generateReferenceImage(input_file, reference_file);
  compareImages(reference_file, output_file, useEpsCheck, perPixelError, 
                globalError);

  cleanup();

  return 0;
}
Ejemplo n.º 9
0
int main(int argc, char **argv) {
  float *d_luminance;
  unsigned int *d_cdf;

  size_t numRows, numCols;
  unsigned int numBins;

  std::string input_file;
  std::string output_file;
  std::string reference_file;
  double perPixelError = 0.0;
  double globalError   = 0.0;
  bool useEpsCheck = false;

  switch (argc)
  {
	case 2:
	  input_file = std::string(argv[1]);
	  output_file = "HW3_output.png";
	  reference_file = "HW3_reference.png";
	  break;
	case 3:
	  input_file  = std::string(argv[1]);
      output_file = std::string(argv[2]);
	  reference_file = "HW3_reference.png";
	  break;
	case 4:
	  input_file  = std::string(argv[1]);
      output_file = std::string(argv[2]);
	  reference_file = std::string(argv[3]);
	  break;
	case 6:
	  useEpsCheck=true;
	  input_file  = std::string(argv[1]);
	  output_file = std::string(argv[2]);
	  reference_file = std::string(argv[3]);
	  perPixelError = atof(argv[4]);
      globalError   = atof(argv[5]);
	  break;
	default:
      std::cerr << "Usage: ./HW3 input_file [output_filename] [reference_filename] [perPixelError] [globalError]" << std::endl;
	  system("Pause");
      exit(1);
  }
  //load the image and give us our input and output pointers
  preProcess(&d_luminance, &d_cdf,
             &numRows, &numCols, &numBins, input_file);

  GpuTimer timer;
  float min_logLum, max_logLum;
  min_logLum = 0.f;
  max_logLum = 1.f;
  timer.Start();
  //call the students' code
  your_histogram_and_prefixsum(d_luminance, d_cdf, min_logLum, max_logLum,
                               numRows, numCols, numBins);
  timer.Stop();
  cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
  int err = printf("Your code ran in: %f msecs.\n", timer.Elapsed());

  if (err < 0) {
    //Couldn't print! Probably the student closed stdout - bad news
    std::cerr << "Couldn't print timing information! STDOUT Closed!" << std::endl;
	system("Pause");
    exit(1);
  }

  float *h_luminance = (float *) malloc(sizeof(float)*numRows*numCols);
  unsigned int *h_cdf = (unsigned int *) malloc(sizeof(unsigned int)*numBins);

  checkCudaErrors(cudaMemcpy(h_luminance, d_luminance, numRows*numCols*sizeof(float), cudaMemcpyDeviceToHost));

  //check results and output the tone-mapped image
  postProcess(output_file, numRows, numCols, min_logLum, max_logLum);

  for (size_t i = 1; i < numCols * numRows; ++i) {
	min_logLum = std::min(h_luminance[i], min_logLum);
    max_logLum = std::max(h_luminance[i], max_logLum);
  }

  referenceCalculation(h_luminance, h_cdf, numRows, numCols, numBins, min_logLum, max_logLum);

  checkCudaErrors(cudaMemcpy(d_cdf, h_cdf, sizeof(unsigned int) * numBins, cudaMemcpyHostToDevice));

  //check results and output the tone-mapped image
  postProcess(reference_file, numRows, numCols, min_logLum, max_logLum);

  cleanupGlobalMemory();

  compareImages(reference_file, output_file, useEpsCheck, perPixelError, globalError);

  system("pause");

  return 0;
}
Ejemplo n.º 10
0
int main(int argc, char **argv) {
  uchar4 *h_inputImageRGBA,  *d_inputImageRGBA;
  uchar4 *h_outputImageRGBA, *d_outputImageRGBA;
  unsigned char *d_redBlurred, *d_greenBlurred, *d_blueBlurred;

  float *h_filter;
  int    filterWidth;

  std::string input_file;
  std::string output_file;
  std::string reference_file;
  double perPixelError = 0.0;
  double globalError   = 0.0;
  bool useEpsCheck = false;
  switch (argc)
  {
	case 2:
	  input_file = std::string(argv[1]);
	  output_file = "HW2_output.png";
	  reference_file = "HW2_reference.png";
	  break;
	case 3:
	  input_file  = std::string(argv[1]);
      output_file = std::string(argv[2]);
	  reference_file = "HW2_reference.png";
	  break;
	case 4:
	  input_file  = std::string(argv[1]);
      output_file = std::string(argv[2]);
	  reference_file = std::string(argv[3]);
	  break;
	case 6:
	  useEpsCheck=true;
	  input_file  = std::string(argv[1]);
	  output_file = std::string(argv[2]);
	  reference_file = std::string(argv[3]);
	  perPixelError = atof(argv[4]);
      globalError   = atof(argv[5]);
	  break;
	default:
      std::cerr << "Usage: ./HW2 input_file [output_filename] [reference_filename] [perPixelError] [globalError]" << std::endl;
      exit(1);
  }
  //load the image and give us our input and output pointers
  preProcess(&h_inputImageRGBA, &h_outputImageRGBA, &d_inputImageRGBA, &d_outputImageRGBA,
             &d_redBlurred, &d_greenBlurred, &d_blueBlurred,
             &h_filter, &filterWidth, input_file);

  allocateMemoryAndCopyToGPU(numRows(), numCols(), h_filter, filterWidth);
  /*
  for(int i=0;i<filterWidth; ++i) {
    for(int j=0;j<filterWidth; ++j) {
      std::cerr<<h_filter[i*filterWidth + j]<<" ";
    }
    std::cerr<<std::endl;
  }
  */
  GpuTimer timer;
  timer.Start();
  //call the students' code
  your_gaussian_blur(h_inputImageRGBA, d_inputImageRGBA, d_outputImageRGBA, numRows(), numCols(),
                     d_redBlurred, d_greenBlurred, d_blueBlurred, filterWidth);
  timer.Stop();
  cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
  int err = printf("Your code ran in: %f msecs.\n", timer.Elapsed());

  if (err < 0) {
    //Couldn't print! Probably the student closed stdout - bad news
    std::cerr << "Couldn't print timing information! STDOUT Closed!" << std::endl;
    exit(1);
  }

  //check results and output the blurred image

  size_t numPixels = numRows()*numCols();
  //copy the output back to the host
  checkCudaErrors(cudaMemcpy(h_outputImageRGBA, d_outputImageRGBA__, sizeof(uchar4) * numPixels, cudaMemcpyDeviceToHost));

  postProcess(output_file, h_outputImageRGBA);

  referenceCalculation(h_inputImageRGBA, h_outputImageRGBA,
                       numRows(), numCols(),
                       h_filter, filterWidth);

  postProcess(reference_file, h_outputImageRGBA);

    //  Cheater easy way with OpenCV
    //generateReferenceImage(input_file, reference_file, filterWidth);

  compareImages(reference_file, output_file, useEpsCheck, perPixelError, globalError);

  checkCudaErrors(cudaFree(d_redBlurred));
  checkCudaErrors(cudaFree(d_greenBlurred));
  checkCudaErrors(cudaFree(d_blueBlurred));

  cleanUp();

  return 0;
}
int _tmain(int argc, _TCHAR* argv[]) 
{
	uchar4 *h_inputImageRGBA,  *d_inputImageRGBA;
	uchar4 *h_outputImageRGBA, *d_outputImageRGBA;
	unsigned char *d_redBlurred, *d_greenBlurred, *d_blueBlurred;

	float *h_filter;
	int    filterWidth;

	//PreProcess
	const std::string *filename = new std::string("./cinque_terre_small.jpg");
	cv::Mat imageInputRGBA;
	cv::Mat imageOutputRGBA;

	//make sure the context initializes ok
	checkCudaErrors(cudaFree(0));

	cv::Mat image = cv::imread(filename->c_str(), CV_LOAD_IMAGE_COLOR);
  
	if (image.empty()) 
	{
	std::cerr << "Couldn't open file: " << filename << std::endl;
	cv::waitKey(0);
	exit(1);
	}

	cv::cvtColor(image, imageInputRGBA, CV_BGR2RGBA);

	//allocate memory for the output
	imageOutputRGBA.create(image.rows, image.cols, CV_8UC4);

	//This shouldn't ever happen given the way the images are created
	//at least based upon my limited understanding of OpenCV, but better to check
	if (!imageInputRGBA.isContinuous() || !imageOutputRGBA.isContinuous()) {
	std::cerr << "Images aren't continuous!! Exiting." << std::endl;
	exit(1);
	}

	h_inputImageRGBA  = (uchar4 *)imageInputRGBA.ptr<unsigned char>(0);
	h_outputImageRGBA = (uchar4 *)imageOutputRGBA.ptr<unsigned char>(0);

	const size_t numPixels = image.rows * image.cols;
	//allocate memory on the device for both input and output
	checkCudaErrors(cudaMalloc(&d_inputImageRGBA, sizeof(uchar4) * numPixels));
	checkCudaErrors(cudaMalloc(&d_outputImageRGBA, sizeof(uchar4) * numPixels));
	checkCudaErrors(cudaMemset(d_outputImageRGBA, 0, numPixels * sizeof(uchar4))); //make sure no memory is left laying around

	//copy input array to the GPU
	checkCudaErrors(cudaMemcpy(d_inputImageRGBA, h_inputImageRGBA, sizeof(uchar4) * numPixels, cudaMemcpyHostToDevice));

	//now create the filter that they will use
	const int blurKernelWidth = 9;
	const float blurKernelSigma = 2.;

	filterWidth = blurKernelWidth;

	//create and fill the filter we will convolve with
	h_filter = new float[blurKernelWidth * blurKernelWidth];

	float filterSum = 0.f; //for normalization

	for (int r = -blurKernelWidth/2; r <= blurKernelWidth/2; ++r) 
	{
		for (int c = -blurKernelWidth/2; c <= blurKernelWidth/2; ++c)
		{
			float filterValue = expf( -(float)(c * c + r * r) / (2.f * blurKernelSigma * blurKernelSigma));
			h_filter[(r + blurKernelWidth/2) * blurKernelWidth + c + blurKernelWidth/2] = filterValue;
			filterSum += filterValue;
		}
	}

	float normalizationFactor = 1.f / filterSum;

	for (int r = -blurKernelWidth/2; r <= blurKernelWidth/2; ++r)
	{
		for (int c = -blurKernelWidth/2; c <= blurKernelWidth/2; ++c)
		{
			h_filter[(r + blurKernelWidth/2) * blurKernelWidth + c + blurKernelWidth/2] *= normalizationFactor;
		}
	}

	//blurred
	checkCudaErrors(cudaMalloc(&d_redBlurred,    sizeof(unsigned char) * numPixels));
	checkCudaErrors(cudaMalloc(&d_greenBlurred,  sizeof(unsigned char) * numPixels));
	checkCudaErrors(cudaMalloc(&d_blueBlurred,   sizeof(unsigned char) * numPixels));
	checkCudaErrors(cudaMemset(d_redBlurred,   0, sizeof(unsigned char) * numPixels));
	checkCudaErrors(cudaMemset(d_greenBlurred, 0, sizeof(unsigned char) * numPixels));
	checkCudaErrors(cudaMemset(d_blueBlurred,  0, sizeof(unsigned char) * numPixels));


	allocateMemoryAndCopyToGPU(image.rows, image.cols, h_filter, filterWidth);
	GpuTimer timer;
	timer.Start();
	//call the students' code
	your_gaussian_blur(h_inputImageRGBA, d_inputImageRGBA, d_outputImageRGBA, image.rows, image.cols,
						d_redBlurred, d_greenBlurred, d_blueBlurred, filterWidth);
	timer.Stop();
	cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
	int err = printf("%f msecs.\n", timer.Elapsed());

	if (err < 0) {
	//Couldn't print! Probably the student closed stdout - bad news
	std::cerr << "Couldn't print timing information! STDOUT Closed!" << std::endl;
	exit(1);
	}

	cleanup();

	//check results and output the blurred image
	//PostProcess

	//copy the output back to the host
	checkCudaErrors(cudaMemcpy(imageOutputRGBA.ptr<unsigned char>(0), d_outputImageRGBA, sizeof(uchar4) * numPixels, cudaMemcpyDeviceToHost));

	cv::Mat imageOutputBGR;
	cv::cvtColor(imageOutputRGBA, imageOutputBGR, CV_RGBA2BGR);
	//output the image
	cv::imwrite("./blurredResult.jpg", imageOutputBGR);

	cv::namedWindow( "Display window", CV_WINDOW_NORMAL);
	cv::imshow("Display window", imageOutputBGR);
	
	cv::waitKey(0);


	checkCudaErrors(cudaFree(d_redBlurred));
	checkCudaErrors(cudaFree(d_greenBlurred));
	checkCudaErrors(cudaFree(d_blueBlurred));

	return 0;
}
int main(int argc, char **argv) {
  float *d_luminance;
  unsigned int *d_cdf;

  size_t numRows, numCols;
  unsigned int numBins;

  std::string input_file;
  std::string output_file;
  std::string reference_file;
  double perPixelError = 0.0;
  double globalError   = 0.0;
  bool useEpsCheck = false;

  switch (argc)
  {
	case 2:
	  input_file = std::string(argv[1]);
	  output_file = "output.png";
	  break;
	case 3:
	  input_file  = std::string(argv[1]);
      output_file = std::string(argv[2]);
	  break;
	default:
      std::cerr << "Usage: ./tone input_file [output_filename] [reference_filename]" << std::endl;
      exit(1);
  }

  //load the image and give us our input and output pointers
  preProcess(&d_luminance, &d_cdf,
             &numRows, &numCols, &numBins, input_file);

  GpuTimer timer;
  float min_logLum, max_logLum;
  min_logLum = 0.f;
  max_logLum = 1.f;
  timer.Start();

  //call the students' code
  calculate_cdf(d_luminance, d_cdf, min_logLum, max_logLum,
                               numRows, numCols, numBins);
  timer.Stop();
  cudaDeviceSynchronize(); 
  int err = printf("Your code ran in: %f msecs.\n", timer.Elapsed());

  if (err < 0) {
    //Couldn't print! Probably the student closed stdout - bad news
    std::cerr << "Couldn't print timing information! STDOUT Closed!" << std::endl;
    exit(1);
  }

  float *h_luminance = (float *) malloc(sizeof(float)*numRows*numCols);
  unsigned int *h_cdf = (unsigned int *) malloc(sizeof(unsigned int)*numBins);

  cudaMemcpy(h_luminance, d_luminance, numRows*numCols*sizeof(float), cudaMemcpyDeviceToHost);

  //check results and output the tone-mapped image
  postProcess(output_file, numRows, numCols, min_logLum, max_logLum);

  cleanupGlobalMemory();

  return 0;
}