Esempio n. 1
0
void PA::close()
{
	if (is_open()) {
		Pa_StopStream(stream_);
		Pa_CloseStream(stream_);

		if (stream_type_ == PA_STREAM_TYPE_INPUT) {
			printf("PA::close() : close input device...in:%d:%s\n", input_device_idx_, input_device_info().name().c_str());
		}
		else if (stream_type_ == PA_STREAM_TYPE_OUTPUT) {
			printf("PA::close() : close output device...out:%d:%s\n", output_device_idx_, output_device_info().name().c_str());
		}
		else if (stream_type_ == PA_STREAM_TYPE_WIRE) {
			printf("PA::close() : close wire device...in:%d:%s, out:%d:%s\n", 
				input_device_idx_, input_device_info().name().c_str(),
				output_device_idx_, output_device_info().name().c_str());
		}

		stream_ = NULL;
		stream_type_ = PA_STREAM_TYPE_NONE;
		input_device_idx_ = -1;
		output_device_idx_ = -1;
		channels_ = 0;
	}
}
Esempio n. 2
0
bool PA::open_output(const int &dev_idx, const int &channels, const int &sampling_rate, const int &buf_size)
{
	PaError err;
	PaStreamParameters output_param;

	if (is_open()) return false;

	if (dev_idx < 0 || get_device_count() <= dev_idx) {
		printf("error :  PA::open_output() : out of index...dev_idx=%d", dev_idx);
		return false;
	}

	output_param.device = dev_idx;
	output_param.channelCount = channels;
	output_param.sampleFormat = paFloat32;
	output_param.suggestedLatency = Pa_GetDeviceInfo(output_param.device)->defaultLowOutputLatency;
	output_param.hostApiSpecificStreamInfo = NULL;

	err = Pa_OpenStream(
		&stream_,
		NULL,
		&output_param,
		sampling_rate,
		buf_size,  // frames_per_buffer
		paClipOff,
		play_callback_,
		this);

	if (err != paNoError) {
		printf("error :  PA::open_output() : Pa_OpenStream() failed...dev_idx=%d", dev_idx);
		close();
		return false;
	}

	err = Pa_StartStream(stream_);
	if (err != paNoError) {
		printf("error :  PA::open_output() : Pa_StartStream() failed...dev_idx=%d", dev_idx);
		close();
		return false;
	}

	stream_type_ = PA_STREAM_TYPE_OUTPUT;
	output_device_idx_ = dev_idx;
	channels_ = channels;

	PADeviceInfo info = output_device_info();
	printf("PA::open_output() : open device...dev_idx=%d, name=%s\n", output_device_idx_, info.name().c_str());

	return true;
}
Esempio n. 3
0
int main(int argc, char** argv)
{
  // Error code returned from openCL calls
  int err;

  // A, B and C arrays
  float *hA = (float *)calloc(LENGTH, sizeof(float));
  float *hB = (float *)calloc(LENGTH, sizeof(float));
  float *hC = (float *)calloc(LENGTH, sizeof(float));

  // Define the scene
  const unsigned int kScreenWidth = 800;
  const unsigned int kScreenHeight = 600;
  float zoomFactor = -4.f;
  float aliasFactor = 3.f;

  size_t globalWorkSize = kScreenWidth * kScreenHeight;
  size_t localWorkSize = 256;

  // Colours
  Vec whiteCol;
  vinit(whiteCol, 8.f, 8.f, 8.f);
  Vec lowerWhite;
  vinit(lowerWhite, 0.5f, 0.5f, 0.5f);
  Vec redCol;
  vinit(redCol, 0.8f, 1.f, 0.7f);
  Vec greenCol;
  vinit(greenCol, 0.4f, 0.5f, 0.7f);
  Vec col1;
  vinit(col1, 0.01f, 0.8f, 0.01f);

  // Setup materials
  struct Material ballMaterial1; // White
  Vec bm1Gloss; vassign(bm1Gloss, redCol);
  Vec bm1Matte; vassign(bm1Matte, greenCol);
  setMatOpacity(&ballMaterial1, 0.8f);
  setMatteGlossBalance(&ballMaterial1, 0.2f, &bm1Matte, &bm1Gloss);
  setMatRefractivityIndex(&ballMaterial1, 1.5500f);

  struct Material ballMaterial2; // Red
  Vec bm2Gloss; vassign(bm2Gloss, redCol);
  Vec bm2Matte; vassign(bm2Matte, greenCol);
  setMatOpacity(&ballMaterial2, 0.3f);
  setMatteGlossBalance(&ballMaterial2, 0.95f, &bm2Matte, &bm2Gloss);
  setMatRefractivityIndex(&ballMaterial2, 1.5500f);

  struct Material ballMaterial3; // Red
  Vec bm3Gloss; vassign(bm3Gloss, col1);
  Vec bm3Matte; vassign(bm3Matte, col1);
  setMatOpacity(&ballMaterial3, 0.6f);
  setMatteGlossBalance(&ballMaterial3, 0.0, &bm3Matte, &bm3Gloss);
  setMatRefractivityIndex(&ballMaterial3, 1.5500f);

  // Setup spheres
  unsigned int sphNum = 3;
  struct Sphere *hSpheres =
    (struct Sphere *)calloc(sphNum, sizeof(struct Sphere));
  hSpheres[0].material = ballMaterial1;
  vinit(hSpheres[0].pos, -9.f, 0.f, -13.f);
  hSpheres[0].radius = 5.f;
  hSpheres[1].material = ballMaterial2;
  vinit(hSpheres[1].pos, -4.f, 1.5f, -5.f);
  hSpheres[1].radius = 2.f;
  hSpheres[2].material = ballMaterial3;
  vinit(hSpheres[2].pos, 1.f, -1.f, -7.f);
  hSpheres[2].radius = 3.f;

  // Setup light sources
  unsigned int lgtNum = 2;
  struct Light *hLights =
    (struct Light *)calloc(lgtNum, sizeof(struct Light));
  vinit(hLights[0].pos, -45.f, 10.f, 85.f);
  vassign(hLights[0].col, lowerWhite);
  vinit(hLights[1].pos, 20.f, 60.f, -5.f);
  vassign(hLights[1].col, lowerWhite);



  // Fill vectors a and b with random float values
  int count = LENGTH;
  for (int i = 0; i < count; i++){
    hA[i] = rand() / (float)RAND_MAX;
    hB[i] = rand() / (float)RAND_MAX;
  }




  cl_uint numPlatforms;

  // Find number of platforms
  err = clGetPlatformIDs(0, NULL, &numPlatforms);
  checkError(err, "Finding platforms");
  if (numPlatforms == 0) {
    printf("Found 0 platforms!\n");
    return EXIT_FAILURE;
  }




  // Get all platforms
  cl_platform_id *platform = (cl_platform_id *)malloc(sizeof(cl_platform_id)* numPlatforms);
  err = clGetPlatformIDs(numPlatforms, platform, NULL);
  checkError(err, "Getting platforms");




  // Define an ID for the device
  cl_device_id deviceId = 0;
  // Secure a GPU
  for (int i = 0; i < numPlatforms; i++) {
    err = clGetDeviceIDs(platform[i], CL_DEVICE_TYPE_GPU, 1, &deviceId, NULL);
    if (err == CL_SUCCESS) {
      break;
    }
  }

  // Once a device has been obtained, print out its info
  err = output_device_info(deviceId);
  checkError(err, "Printing device output");



  // Create a context for the GPU
  cl_context gpuContext;
  gpuContext = clCreateContext(NULL, 1, &deviceId, NULL, NULL, &err);
  checkError(err, "Creating context");




  // Create a command queue
  cl_command_queue commandsGPU;
  commandsGPU = clCreateCommandQueue(gpuContext, deviceId, NULL, &err);
  checkError(err, "Creating command queue");




  // Load the kernel code
  std::ifstream sourceFstream("raytrace_kernel.cl");
  std::string source((std::istreambuf_iterator<char>(sourceFstream)),
    std::istreambuf_iterator<char>());

  // Create a program from the source
  const char* str = source.c_str();
  cl_program program;
  program = clCreateProgramWithSource(gpuContext, 1, &str, NULL, &err);
  checkError(err, "Creating program");




  // Compile the program
  err = clBuildProgram(program, 0, NULL, "-I C:\Drive\Alberto\Projects\Code\C++\raytracer_gamma\raytracer_gamma", NULL, NULL);
    // If there were compilation errors
    if (err != CL_SUCCESS) {
      // Print out compilation log
      size_t len;
      char buffer[2048];

      printf("Error: Failed to build program executable!\n%s\n", err_code(err));
      clGetProgramBuildInfo(program, deviceId, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
      printf("%s\n", buffer);

      // Exit
      return EXIT_FAILURE;
    }




  // Create the kernel
  cl_kernel koRTG;
  koRTG = clCreateKernel(program, "raytrace", &err);
  checkError(err, "Creating kernel");




  // Create the list of spheres and lights in device memory
  cl_mem dSpheres = clCreateBuffer(gpuContext, CL_MEM_READ_WRITE,
    sizeof(struct Sphere) * sphNum, NULL, &err);
  checkError(err, "Creating buffer for spheres");
  cl_mem dLights = clCreateBuffer(gpuContext, CL_MEM_READ_WRITE,
    sizeof(struct Light) * lgtNum, NULL, &err);
  checkError(err, "Creating buffer for lights");
  cl_mem dPixelBuffer = clCreateBuffer(gpuContext, CL_MEM_WRITE_ONLY,
    kScreenWidth * kScreenHeight * sizeof(Vec), NULL, &err);
  checkError(err, "Creating buffer for pixels");

  // Write data from host into device memory (fill the buffers with
  // the host arrays)
  err = clEnqueueWriteBuffer(commandsGPU, dSpheres, CL_TRUE, 0,
    sizeof(struct Sphere) * sphNum, hSpheres, 0, NULL, NULL);
  checkError(err, "Copying hSperes in dSpheres");
  err = clEnqueueWriteBuffer(commandsGPU, dLights, CL_TRUE, 0,
    sizeof(struct Light) * lgtNum, hLights, 0, NULL, NULL);
  checkError(err, "Copying hLights into dLights");

  cl_int   status;
  cl_uint maxDims;
  cl_event events[2];
  size_t maxWorkGroupSize;

  /**
  * Query device capabilities. Maximum
  * work item dimensions and the maximmum
  * work item sizes
  */
  status = clGetDeviceInfo(
	  deviceId,
	  CL_DEVICE_MAX_WORK_GROUP_SIZE,
	  sizeof(size_t),
	  (void*)&maxWorkGroupSize,
	  NULL);
  if (status != CL_SUCCESS)
  {
	  fprintf(stderr, "Error: Getting Device Info. (clGetDeviceInfo)\n");
	  return 1;
  }

  status = clGetDeviceInfo(
	  deviceId,
	  CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS,
	  sizeof(cl_uint),
	  (void*)&maxDims,
	  NULL);
  if (status != CL_SUCCESS)
  {
	  fprintf(stderr, "Error: Getting Device Info. (clGetDeviceInfo)\n");
	  return 1;
  }

  localWorkSize = maxWorkGroupSize;

  if (globalWorkSize % localWorkSize != 0) {
    globalWorkSize = (globalWorkSize / localWorkSize + 1) * localWorkSize;
  }



  // Set kernel arguments
  err = clSetKernelArg(koRTG, 0, sizeof(cl_mem), &dSpheres);
  err |= clSetKernelArg(koRTG, 1, sizeof(unsigned int), &sphNum);
  err |= clSetKernelArg(koRTG, 2, sizeof(cl_mem), &dLights);
  err |= clSetKernelArg(koRTG, 3, sizeof(unsigned int), &lgtNum);
  err |= clSetKernelArg(koRTG, 4, sizeof(unsigned int), &kScreenWidth);
  err |= clSetKernelArg(koRTG, 5, sizeof(unsigned int), &kScreenHeight);
  err |= clSetKernelArg(koRTG, 6, sizeof(float), &zoomFactor);
  err |= clSetKernelArg(koRTG, 7, sizeof(float), &aliasFactor);
  err |= clSetKernelArg(koRTG, 8, sizeof(cl_mem), &dPixelBuffer);
  err |= clSetKernelArg(koRTG, 9, sizeof(struct Sphere) * sphNum, NULL);
  err |= clSetKernelArg(koRTG, 10, sizeof(struct Light) * lgtNum, NULL);
  checkError(err, "Setting kernel arguments");

  // Start counting the time between kernel enqueuing and completion
  std::chrono::steady_clock::time_point startTime = std::chrono::steady_clock::now();

  // Execute the kernel over the entire range of our 1d input data set
  // letting the OpenCL runtime choose the work-group size
  err = clEnqueueNDRangeKernel(commandsGPU, koRTG, 1, NULL,
    &globalWorkSize, &localWorkSize, 0, NULL, NULL);
  checkError(err, "Enqueueing kernel");

  // Wait for the commands in the queue to be executed
  err = clFinish(commandsGPU);
  checkError(err, "Waiting for commands to finish");

  // Read the time after the kernel has executed
  std::chrono::steady_clock::time_point endTime = std::chrono::steady_clock::now();

  // Compute the duration
  double kernelExecTime = std::chrono::duration_cast<std::chrono::milliseconds>(endTime - startTime).count();

  

  // Print the duration
  printf("Exec time: %.5f ms", kernelExecTime);



  // Image
  //float *imagePtr = (float *)malloc(globalWorkSize * sizeof(Vec));



  //// Screen in world coordinates
  //const float kImageWorldWidth = 16.f;
  //const float kImageWorldHeight = 12.f;

  //// Amount to increase each step for the ray direction
  //const float kRayXStep = kImageWorldWidth / ((float)kScreenWidth);
  //const float kRayYStep = kImageWorldHeight / ((float)kScreenHeight);
  //const float aspectRatio = kImageWorldWidth / kImageWorldHeight;

  //// Variables holding the current step in world coordinates
  //float rayX = 0.f, rayY = 0.f;

  //int pixelsCounter = 0;

  //// Calculate size of an alias step in world coordinates
  //const float kAliasFactorStepInv = kRayXStep / aliasFactor;
  //// Calculate total size of samples to be taken
  //const float kSamplesTot = aliasFactor * aliasFactor;
  //// Also its inverse
  //const float kSamplesTotinv = 1.f / kSamplesTot;

  //for (int y = 0; y < kScreenWidth * kScreenHeight; ++y, pixelsCounter += 3) {
  //  // Retrieve the global ID of the kernel
  //  const unsigned gid = y;

  //  

  //  // Calculate world position of pixel being currently worked on
  //  const float kPxWorldX = ((((float)(gid % kScreenWidth) - 
  //    (kScreenWidth * 0.5f))) * kRayXStep);
  //  const float kPxWorldY = ((kScreenHeight *0.5f) - ((float)(gid / kScreenWidth))) * kRayYStep;

  //  // The ray to be shot. The vantage point (camera) is at the origin,
  //  // and its intensity is maximum
  //  struct Ray ray; vinit(ray.origin, 0.f, 0.f, 0.f); vinit(ray.intensity, 1.f, 1.f, 1.f);

  //  // The colour of the pixel to be computed
  //  Vec pixelCol = { 0.f, 0.f, 0.f };

  //  // Mock background material
  //  struct Material bgMaterial;
  //  Vec black; vinit(black, 0.f, 0.f, 0.f);
  //  setMatteGlossBalance(&bgMaterial, 0.f, &black, &black);
  //  setMatRefractivityIndex(&bgMaterial, 1.00f);

  //  // For each sample to be taken
  //  for (int i = 0; i < aliasFactor; ++i) {
  //    for (int j = 0; j < aliasFactor; ++j) {
  //      // Calculate the direction of the ray
  //      float x = (kPxWorldX + (float)(((float)j) * kAliasFactorStepInv)) * aspectRatio;
  //      float y = (kPxWorldY + (float)(((float)i) * kAliasFactorStepInv));

  //      // Set the ray's dir and normalise it
  //      vinit(ray.dir, x, y, zoomFactor); vnorm(ray.dir);

  //      // Raytrace for the current sample
  //      Vec currentSampleCol = rayTrace(hSpheres, sphNum, hLights, lgtNum,
  //        ray, bgMaterial, 0);

  //      vsmul(currentSampleCol, kSamplesTotinv, currentSampleCol);

  //      // Compute the average
  //      vadd(pixelCol, pixelCol, currentSampleCol);
  //    }
  //  }

  //  // Write result in destination buffer
  //  *(imagePtr + pixelsCounter) = pixelCol.x;
  //  *(imagePtr + pixelsCounter + 1) = pixelCol.y;
  //  *(imagePtr + pixelsCounter + 2) = pixelCol.z;
  //}

  // Create a buffer to hold the result of the computation on the device
  Vec *pixelsIntermediate = (Vec *)calloc(kScreenHeight * kScreenWidth, sizeof(Vec));
  //Vec *pixelsIntermediate = (Vec *)(imagePtr);

  // Read the results back from the device into the host
  err = clEnqueueReadBuffer(commandsGPU, dPixelBuffer, CL_TRUE, 0,
	  kScreenWidth * kScreenHeight * sizeof(Vec), pixelsIntermediate, 0, NULL, NULL);
  // If the reading operation didn't complete successfully
  if (err != CL_SUCCESS) {
    printf("Error: Failed to read output buffer!\n%s\n", err_code(err));

    // Exit
    exit(1);
  }

  // Calculate the maximum colour value across the whole picture
  float maxColourValue = maxColourValuePixelBuffer(pixelsIntermediate,
    kScreenWidth * kScreenHeight);

  // Cast the buffer to the type accepted by the savePPM function
  RGB *pixels = (RGB *)(pixelsIntermediate);

  // Print execution time
  /*rtime = wtime() - rtime;
  printf("\nThe kernel ran in %lf seconds\n", rtime);*/
 

  // Cleanup
  clReleaseMemObject(dPixelBuffer);
  clReleaseMemObject(dLights);
  clReleaseMemObject(dSpheres);
  clReleaseProgram(program);
  clReleaseKernel(koRTG);
  clReleaseCommandQueue(commandsGPU);
  clReleaseContext(gpuContext);
  // ... Also on host

  free(hLights);
  free(hSpheres);
  free(hA);
  free(hB);
  free(hC);
  free(platform);


  // Try to save a PPM picture
  savePPM(pixels, "testPPM.ppm", kScreenWidth, kScreenHeight, maxColourValue);
  free(pixelsIntermediate);
  //free(imagePtr);
 
  getchar();

  return 0;
}
Esempio n. 4
0
int main(int argc, char** argv)
{
    cl_int          err;               // error code returned from OpenCL calls

    size_t dataSize = sizeof(float) * LENGTH;
    float*       h_a = (float *)malloc(dataSize);       // a vector
    float*       h_b = (float *)malloc(dataSize);       // b vector
    float*       h_c = (float *)malloc(dataSize);       // c vector (result)
    float*       h_d = (float *)malloc(dataSize);       // d vector (result)
    float*       h_e = (float *)malloc(dataSize);       // e vector
    float*       h_f = (float *)malloc(dataSize);       // f vector (result)
    float*       h_g = (float *)malloc(dataSize);       // g vector
    unsigned int correct;           // number of correct results

    size_t global;                  // global domain size

    cl_device_id     device_id;     // compute device id
    cl_context       context;       // compute context
    cl_command_queue commands;      // compute command queue
    cl_program       program;       // compute program
    cl_kernel        ko_vadd;       // compute kernel

    cl_mem d_a;                     // device memory used for the input  a vector
    cl_mem d_b;                     // device memory used for the input  b vector
    cl_mem d_c;                     // device memory used for the output c vector
    cl_mem d_d;                     // device memory used for the output d vector
    cl_mem d_e;                     // device memory used for the input e vector
    cl_mem d_f;                     // device memory used for the output f vector
    cl_mem d_g;                     // device memory used for the input g vector

    // Fill vectors a and b with random float values
    int i = 0;
    for(i = 0; i < LENGTH; i++){
        h_a[i] = rand() / (float)RAND_MAX;
        h_b[i] = rand() / (float)RAND_MAX;
        h_e[i] = rand() / (float)RAND_MAX;
        h_g[i] = rand() / (float)RAND_MAX;
    }

    // Set up platform and GPU device

    cl_uint numPlatforms;

    // Find number of platforms
    err = clGetPlatformIDs(0, NULL, &numPlatforms);
    checkError(err, "Finding platforms");
    if (numPlatforms == 0)
    {
        printf("Found 0 platforms!\n");
        return EXIT_FAILURE;
    }

    // Get all platforms
    cl_platform_id Platform[numPlatforms];
    err = clGetPlatformIDs(numPlatforms, Platform, NULL);
    checkError(err, "Getting platforms");

    // Secure a GPU
    for (i = 0; i < numPlatforms; i++)
    {
        err = clGetDeviceIDs(Platform[i], DEVICE, 1, &device_id, NULL);
        if (err == CL_SUCCESS)
        {
            break;
        }
    }

    if (device_id == NULL)
        checkError(err, "Getting device");

    err = output_device_info(device_id);
    checkError(err, "Outputting device info");
  
    // Create a compute context 
    context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
    checkError(err, "Creating context");

    // Create a command queue
    commands = clCreateCommandQueue(context, device_id, 0, &err);
    checkError(err, "Creating command queue");

    // Create the compute program from the source buffer
    program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err);
    checkError(err, "Creating program");

    // Build the program  
    err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        size_t len;
        char buffer[2048];

        printf("Error: Failed to build program executable!\n%s\n", err_code(err));
        clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
        printf("%s\n", buffer);
        return EXIT_FAILURE;
    }

    // Create the compute kernel from the program 
    ko_vadd = clCreateKernel(program, "vadd", &err);
    checkError(err, "Creating kernel");

    // Create the input (a, b, e, g) arrays in device memory
    // NB: we copy the host pointers here too
    d_a  = clCreateBuffer(context,  CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,  dataSize, h_a, &err);
    checkError(err, "Creating buffer d_a");
    d_b  = clCreateBuffer(context,  CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,  dataSize, h_b, &err);
    checkError(err, "Creating buffer d_b");
    d_e  = clCreateBuffer(context,  CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,  dataSize, h_e, &err);
    checkError(err, "Creating buffer d_e");
    d_g  = clCreateBuffer(context,  CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,  dataSize, h_g, &err);
    checkError(err, "Creating buffer d_g");
    
    // Create the output arrays in device memory
    d_c  = clCreateBuffer(context,  CL_MEM_READ_WRITE, dataSize, NULL, &err);
    checkError(err, "Creating buffer d_c");
    d_d  = clCreateBuffer(context,  CL_MEM_READ_WRITE, dataSize, NULL, &err);
    checkError(err, "Creating buffer d_d");
    d_f  = clCreateBuffer(context,  CL_MEM_WRITE_ONLY, dataSize, NULL, &err);
    checkError(err, "Creating buffer d_f"); 

    const int count = LENGTH;

    // Enqueue kernel - first time
    // Set the arguments to our compute kernel
    err  = clSetKernelArg(ko_vadd, 0, sizeof(cl_mem), &d_a);
    err |= clSetKernelArg(ko_vadd, 1, sizeof(cl_mem), &d_b);
    err |= clSetKernelArg(ko_vadd, 2, sizeof(cl_mem), &d_c);
    err |= clSetKernelArg(ko_vadd, 3, sizeof(unsigned int), &count);
    checkError(err, "Setting kernel arguments"); 
	
    // Execute the kernel over the entire range of our 1d input data set
    // letting the OpenCL runtime choose the work-group size
    global = count;
    err = clEnqueueNDRangeKernel(commands, ko_vadd, 1, NULL, &global, NULL, 0, NULL, NULL);
    checkError(err, "Enqueueing kernel 1st time");

    // Enqueue kernel - second time
    // Set different arguments to our compute kernel
    err  = clSetKernelArg(ko_vadd, 0, sizeof(cl_mem), &d_e);
    err |= clSetKernelArg(ko_vadd, 1, sizeof(cl_mem), &d_c);
    err |= clSetKernelArg(ko_vadd, 2, sizeof(cl_mem), &d_d);
    checkError(err, "Setting kernel arguments");
    
    // Enqueue the kernel again    
    err = clEnqueueNDRangeKernel(commands, ko_vadd, 1, NULL, &global, NULL, 0, NULL, NULL);
    checkError(err, "Enqueueing kernel 2nd time");

    // Enqueue kernel - third time
    // Set different (again) arguments to our compute kernel
    err  = clSetKernelArg(ko_vadd, 0, sizeof(cl_mem), &d_g);
    err |= clSetKernelArg(ko_vadd, 1, sizeof(cl_mem), &d_d);
    err |= clSetKernelArg(ko_vadd, 2, sizeof(cl_mem), &d_f);
    checkError(err, "Setting kernel arguments");

    // Enqueue the kernel again    
    err = clEnqueueNDRangeKernel(commands, ko_vadd, 1, NULL, &global, NULL, 0, NULL, NULL);
    checkError(err, "Enqueueing kernel 3rd time");

    // Read back the result from the compute device
    err = clEnqueueReadBuffer( commands, d_f, CL_TRUE, 0, sizeof(float) * count, h_f, 0, NULL, NULL );  
    checkError(err, "Reading back d_f");
    
    // Test the results
    correct = 0;
    float tmp;
    
    for(i = 0; i < count; i++)
    {
        tmp = h_a[i] + h_b[i] + h_e[i] + h_g[i];     // assign element i of a+b+e+g to tmp
        tmp -= h_f[i];                               // compute deviation of expected and output result
        if(tmp*tmp < TOL*TOL)                        // correct if square deviation is less than tolerance squared
            correct++;
        else {
            printf(" tmp %f h_a %f h_b %f h_e %f h_g %f h_f %f\n",tmp, h_a[i], h_b[i], h_e[i], h_g[i], h_f[i]);
        }
    }

    // summarize results
    printf("C = A+B+E+G:  %d out of %d results were correct.\n", correct, count);

    // cleanup then shutdown
    clReleaseMemObject(d_a);
    clReleaseMemObject(d_b);
    clReleaseMemObject(d_c);
    clReleaseMemObject(d_d);
    clReleaseMemObject(d_e);
    clReleaseMemObject(d_f);
    clReleaseMemObject(d_g);
    clReleaseProgram(program);
    clReleaseKernel(ko_vadd);
    clReleaseCommandQueue(commands);
    clReleaseContext(context);

    free(h_a);
    free(h_b);
    free(h_c);
    free(h_d);
    free(h_e);
    free(h_f);
    free(h_g);

    return 0;
}
/* main program */
int main(int argc, char* argv[]) 
{
    cl_float a_data[LENGTH];
    cl_float b_data[LENGTH];
    cl_float c_res [LENGTH];

    cl_int rval;

    size_t domain_size[1];    /* global domain size */
    size_t workgroup_size[1];

    cl_device_id device_id;
    cl_context context;
    cl_command_queue commands;
    cl_program program_obj;
    cl_kernel kernel;
    
    cl_mem a_in;     /* device memory for input vector a */
    cl_mem b_in;     /* device memory for input vector b */
    cl_mem c_out;    /* device memory for output vector c */
    
    cl_uint count = LENGTH;

    double start_time, end_time;

    /* fill input vectors with random values */
    for(int i = 0; i < count; ++i){
        a_data[i] = rand() / (cl_float)RAND_MAX;
        b_data[i] = rand() / (cl_float)RAND_MAX;
    }

    start_time = get_time();
    
    cl_platform_id platform;
    rval = clGetPlatformIDs(1, &platform, NULL);
    if (rval != CL_SUCCESS)
        error_exit("Could not get platform", rval);
    rval = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
    if (rval != CL_SUCCESS)
        error_exit("Could not get device ID", rval);
    printf("Compute device:\n");
    output_device_info(stdout, device_id, CL_FALSE);
  
    /* create compute context */
    context = clCreateContext(0, 1, &device_id, NULL, NULL, &rval);
    if ((rval != CL_SUCCESS) || (context == NULL))
        error_exit("Could not create compute context", rval);

    /* create command queue */
    commands = clCreateCommandQueue(context, device_id, 0, &rval);
    if ((rval != CL_SUCCESS) || (commands == NULL))
        error_exit("Could not create command queue", rval);

    /* create compute program object from source buffer */
    program_obj = clCreateProgramWithSource(context, 
            1, (const char **) &kernel_source, NULL, &rval);
    if ((rval != CL_SUCCESS) || (program_obj == NULL))
        error_exit("Could not create program object from source", rval);

    /* build program object */
    rval = clBuildProgram(program_obj, 0, NULL, NULL, NULL, NULL);
    if (rval != CL_SUCCESS) {
        size_t len;
        char buffer[2048];

        describe_error("Could not build executable", rval, stderr);
        clGetProgramBuildInfo(program_obj, device_id, CL_PROGRAM_BUILD_LOG, 
                sizeof(buffer), buffer, &len);
        fprintf(stderr, "%s\n", buffer);
        return EXIT_FAILURE;
    }

    /* create compute kernel from program object */
    kernel = clCreateKernel(program_obj, "vadd", &rval);
    if ((rval != CL_SUCCESS) || (kernel  == NULL))
        error_exit("Could not create compute kernel", rval);

    end_time = get_time();
    printf("\nInitialization time %f seconds\n", end_time - start_time);

    start_time = get_time();

    /* create input (a, b) and output (c) arrays in device memory  */
    a_in = clCreateBuffer(context,  CL_MEM_READ_ONLY,  
            sizeof(cl_float) * count, NULL, NULL);
    b_in = clCreateBuffer(context,  CL_MEM_READ_ONLY,  
            sizeof(cl_float) * count, NULL, NULL);
    c_out= clCreateBuffer(context,  CL_MEM_WRITE_ONLY, 
            sizeof(cl_float) * count, NULL, NULL);
    if ((a_in == NULL) || (b_in == NULL) || (c_out== NULL))
        error_exit("Could not allocate device memory", rval);
    
    /* write a and b vectors into compute device memory */
    rval = clEnqueueWriteBuffer(commands, a_in, CL_TRUE, 0, 
            sizeof(cl_float) * count, a_data, 0, NULL, NULL);
    if (rval != CL_SUCCESS)
        error_exit("Could not copy a_data to device memory", rval);
    rval = clEnqueueWriteBuffer(commands, b_in, CL_TRUE, 0, 
            sizeof(cl_float) * count, b_data, 0, NULL, NULL);
    if (rval != CL_SUCCESS)
        error_exit("Could not copy b_data to device memory", rval);

    end_time = get_time();
    printf("Time to copy data %f seconds\n", end_time - start_time);
    
    /* set arguments to compute kernel */
    rval = 0;
    rval  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &a_in);
    rval |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &b_in);
    rval |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &c_out);
    rval |= clSetKernelArg(kernel, 3, sizeof(count), &count);
    if (rval != CL_SUCCESS)
        error_exit("Could not set kernel arguments", rval);

    /* get maximum work group size for executing kernel on device */
    rval = clGetKernelWorkGroupInfo(kernel, device_id, 
            CL_KERNEL_WORK_GROUP_SIZE, 
            sizeof(workgroup_size[0]), workgroup_size, NULL);
    if (rval != CL_SUCCESS)
        error_exit("Could not retrieve kernel work group info", rval);

    start_time = get_time();
    
    /* execute kernel over entire range of 1d input data set
       using maximum number of work group items for device */
    domain_size[0] = count;
    rval = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, 
            domain_size, workgroup_size, 0, NULL, NULL);
    if (rval != CL_SUCCESS)
        error_exit("Failed to execute kernel", rval); 

    /* wait for commands to complete before reading back results */
    clFinish(commands);

    end_time = get_time();
    printf("TIme for kernel computation %f seconds\n", end_time - start_time);

    /* read back results from compute device */
    rval = clEnqueueReadBuffer(commands, c_out, CL_TRUE, 0, 
            sizeof(cl_float) * count, c_res, 0, NULL, NULL);  
    if (rval != CL_SUCCESS)
        error_exit("Could not read output array", rval);
    
    /* check results */
    int correct = 0;
    cl_float diff;
    for(cl_uint i = 0; i < count; ++i)
    {
        diff = a_data[i] + b_data[i];
        diff -= c_res[i];
        if(diff*diff < TOL*TOL)
            ++correct;
        else {
            printf("difference %f in element %d:  a %f b %f c %f\n",
                    diff, i, a_data[i], b_data[i], c_res[i]);
        }
    }
    
    /* summarize results */
    printf("\nC = A+B:  %d out of %d results were correct\n", 
            correct, count);
    
    /* clean up */
    clReleaseMemObject(a_in);
    clReleaseMemObject(b_in);
    clReleaseMemObject(c_out);
    clReleaseProgram(program_obj);
    clReleaseKernel(kernel);
    clReleaseCommandQueue(commands);
    clReleaseContext(context);

    return EXIT_SUCCESS;
}
Esempio n. 6
0
int main(void)
{
    float *h_psum;              // vector to hold partial sum
    int in_nsteps = INSTEPS;    // default number of steps (updated later to device preferable)
    int niters = ITERS;         // number of iterations
    int nsteps;
    float step_size;
    size_t nwork_groups;
    size_t max_size, work_group_size = 8;
    float pi_res;

    cl_mem d_partial_sums;

    char *kernelsource = getKernelSource("../pi_ocl.cl");             // Kernel source

    cl_int err;
    cl_device_id     device_id;     // compute device id 
    cl_context       context;       // compute context
    cl_command_queue commands;      // compute command queue
    cl_program       program;       // compute program
    cl_kernel        kernel_pi;     // compute kernel

    // Set up OpenCL context. queue, kernel, etc.
    cl_uint numPlatforms;
    // Find number of platforms
    err = clGetPlatformIDs(0, NULL, &numPlatforms);
    if (err != CL_SUCCESS || numPlatforms <= 0)
    {
        printf("Error: Failed to find a platform!\n%s\n",err_code(err));
        return EXIT_FAILURE;
    }
    // Get all platforms
    cl_platform_id Platform[numPlatforms];
    err = clGetPlatformIDs(numPlatforms, Platform, NULL);
    if (err != CL_SUCCESS || numPlatforms <= 0)
    {
        printf("Error: Failed to get the platform!\n%s\n",err_code(err));
        return EXIT_FAILURE;
    }
    // Secure a device
    for (int i = 0; i < numPlatforms; i++)
    {
        err = clGetDeviceIDs(Platform[i], DEVICE, 1, &device_id, NULL);
        if (err == CL_SUCCESS)
            break;
    }
    if (device_id == NULL)
    {
        printf("Error: Failed to create a device group!\n%s\n",err_code(err));
        return EXIT_FAILURE;
    }
    // Output information
    err = output_device_info(device_id);
    // Create a compute context
    context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
    if (!context)
    {
        printf("Error: Failed to create a compute context!\n%s\n", err_code(err));
        return EXIT_FAILURE;
    }
    // Create a command queue
    commands = clCreateCommandQueue(context, device_id, 0, &err);
    if (!commands)
    {
        printf("Error: Failed to create a command commands!\n%s\n", err_code(err));
        return EXIT_FAILURE;
    }
    // Create the compute program from the source buffer
    program = clCreateProgramWithSource(context, 1, (const char **) & kernelsource, NULL, &err);
    if (!program)
    {
        printf("Error: Failed to create compute program!\n%s\n", err_code(err));
        return EXIT_FAILURE;
    }
    // Build the program  
    err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        size_t len;
        char buffer[2048];

        printf("Error: Failed to build program executable!\n%s\n", err_code(err));
        clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
        printf("%s\n", buffer);
        return EXIT_FAILURE;
    }
    // Create the compute kernel from the program 
    kernel_pi = clCreateKernel(program, "pi", &err);
    if (!kernel_pi || err != CL_SUCCESS)
    {
        printf("Error: Failed to create compute kernel!\n%s\n", err_code(err));
        return EXIT_FAILURE;
    }

    // Find kernel work-group size
    err = clGetKernelWorkGroupInfo (kernel_pi, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &work_group_size, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to get kernel work-group info\n%s\n", err_code(err));
        return EXIT_FAILURE;
    }
    // Now that we know the size of the work-groups, we can set the number of
    // work-groups, the actual number of steps, and the step size
    nwork_groups = in_nsteps/(work_group_size*niters);

    if (nwork_groups < 1)
    {
        err = clGetDeviceInfo(device_id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(size_t), &nwork_groups, NULL);
        work_group_size = in_nsteps / (nwork_groups * niters);
    }

    nsteps = work_group_size * niters * nwork_groups;
    step_size = 1.0f/(float)nsteps;
    h_psum = calloc(sizeof(float), nwork_groups);
    if (!h_psum)
    {
        printf("Error: could not allocate host memory for h_psum\n");
        return EXIT_FAILURE;
    }

    printf(" %ld work-groups of size %ld. %d Integration steps\n",
            nwork_groups,
            work_group_size,
            nsteps);

    d_partial_sums = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * nwork_groups, NULL, &err);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to create buffer\n%s\n", err_code(err));
        return EXIT_FAILURE;
    }

    // Set kernel arguments
    err  = clSetKernelArg(kernel_pi, 0, sizeof(int), &niters);
    err |= clSetKernelArg(kernel_pi, 1, sizeof(float), &step_size);
    err |= clSetKernelArg(kernel_pi, 2, sizeof(float) * work_group_size, NULL);
    err |= clSetKernelArg(kernel_pi, 3, sizeof(cl_mem), &d_partial_sums);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to set kernel arguments!\n");
        return EXIT_FAILURE;
    }

    // Execute the kernel over the entire range of our 1D input data set
    // using the maximum number of work items for this device
    size_t global = nwork_groups * work_group_size;
    size_t local = work_group_size;
    double rtime = wtime();
    err = clEnqueueNDRangeKernel(
        commands,
        kernel_pi,
        1, NULL,
        &global,
        &local,
        0, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to execute kernel\n%s\n", err_code(err));
        return EXIT_FAILURE;
    }


    err = clEnqueueReadBuffer(
        commands,
        d_partial_sums,
        CL_TRUE,
        0,
        sizeof(float) * nwork_groups,
        h_psum,
        0, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to read buffer\n%s\n", err_code(err));
        return EXIT_FAILURE;
    }

    // complete the sum and compute the final integral value on the host
    pi_res = 0.0f;
    for (unsigned int i = 0; i < nwork_groups; i++)
    {
        pi_res += h_psum[i];
    }
    pi_res *= step_size;

    rtime = wtime() - rtime;

    printf("\nThe calculation ran in %lf seconds\n", rtime);
    printf(" pi = %f for %d steps\n", pi_res, nsteps);

    // clean up
    clReleaseMemObject(d_partial_sums);
    clReleaseProgram(program);
    clReleaseKernel(kernel_pi);
    clReleaseCommandQueue(commands);
    clReleaseContext(context);
    free(kernelsource);
    free(h_psum);
}
int main(int argc, char** argv)
{
    int          rank, size;         // MPI rank & size
    int          err;                // error code returned from OpenCL calls
    float        h_a[LENGTH];        // a vector
    float        h_b[LENGTH];        // b vector
    float        h_c[LENGTH];        // c vector (a+b) returned from the compute device (local per task)
    float        _h_c[LENGTH];       // c vector (a+b) returned from the compute device (global for master)
    unsigned int correct;            // number of correct results

    size_t global;                   // global domain size
    size_t local;                    // local  domain size

    cl_device_id     device_id;      // compute device id
    cl_context       context;        // compute context
    cl_command_queue commands;       // compute command queue
    cl_program       program;        // compute program
    cl_kernel        ko_vadd;        // compute kernel

    cl_mem d_a;                      // device memory used for the input  a vector
    cl_mem d_b;                      // device memory used for the input  b vector
    cl_mem d_c;                      // device memory used for the output c vector

    int mycount, i;

    err = MPI_Init (&argc, &argv);

    if (err != MPI_SUCCESS)
    {
        printf ("MPI_Init failed!\n");
        exit (-1);
    }

    err = MPI_Comm_rank (MPI_COMM_WORLD, &rank);
    if (err != MPI_SUCCESS)
    {
        printf ("MPI_Comm_rank failed!\n");
        exit (-1);
    }

    err = MPI_Comm_size (MPI_COMM_WORLD, &size);
    if (err != MPI_SUCCESS)
    {
        printf ("MPI_Comm_size failed\n");
        exit (-1);
    }

    if (LENGTH % size != 0)
    {
        printf ("Number of MPI processes must divide LENGTH (%d)\n", LENGTH);
        exit (-1);
    }

    mycount = LENGTH / size;

    if (rank == 0)
    {
        for (i = 0; i < LENGTH; i++)
        {
            h_a[i] = rand() / (float)RAND_MAX;
            h_b[i] = rand() / (float)RAND_MAX;
            h_a[i] = i;
            h_b[i] = i*2;
        }
        err = MPI_Bcast (h_a, LENGTH, MPI_FLOAT, 0, MPI_COMM_WORLD);
        if (err != MPI_SUCCESS)
        {
            printf ("MPI_Bcast failed transferring h_a\n");
            exit (-1);
        }
        err = MPI_Bcast (h_b, LENGTH, MPI_FLOAT, 0, MPI_COMM_WORLD);
        if (err != MPI_SUCCESS)
        {
            printf ("MPI_Bcast failed transferring h_b\n");
            exit (-1);
        }
    }
    else
    {
        err = MPI_Bcast (h_a, LENGTH, MPI_FLOAT, 0, MPI_COMM_WORLD);
        if (err != MPI_SUCCESS)
        {
            printf ("MPI_Bcast failed receiving h_a\n");
            exit (-1);
        }
        err = MPI_Bcast (h_b, LENGTH, MPI_FLOAT, 0, MPI_COMM_WORLD);
        if (err != MPI_SUCCESS)
        {
            printf ("MPI_Bcast failed receiving h_b\n");
            exit (-1);
        }
    }

    // Set up platform
    cl_uint numPlatforms;

    // Find number of platforms
    err = clGetPlatformIDs(0, NULL, &numPlatforms);
    if (err != CL_SUCCESS || numPlatforms <= 0)
    {
        printf("Error: Failed to find a platform!\n");
        return EXIT_FAILURE;
    }

    // Get all platforms
    cl_platform_id Platform[numPlatforms];
    err = clGetPlatformIDs(numPlatforms, Platform, NULL);
    if (err != CL_SUCCESS || numPlatforms <= 0)
    {
        printf("Error: Failed to get the platform!\n");
        return EXIT_FAILURE;
    }

    // Secure a GPU
    for (i = 0; i < numPlatforms; i++)
    {
        err = clGetDeviceIDs(Platform[i], DEVICE, 1, &device_id, NULL);
        if (err == CL_SUCCESS)
            break;
    }

    if (device_id == NULL)
    {
        printf("Error: Failed to create a device group!\n");
        return EXIT_FAILURE;
    }
    else
    {
        if (output_device_info (rank, device_id) != CL_SUCCESS)
            return EXIT_FAILURE;
    }

    // Create a compute context
    context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
    if (!context)
    {
        printf("Error: Failed to create a compute context!\n");
        return EXIT_FAILURE;
    }

    // Create a command queue
    commands = clCreateCommandQueue(context, device_id, 0, &err);
    if (!commands)
    {
        printf("Error: Failed to create a command commands!\n");
        return EXIT_FAILURE;
    }

    // Create the compute program from the source buffer
    program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err);
    if (!program)
    {
        printf("Error: Failed to create compute program!\n");
        return EXIT_FAILURE;
    }

    // Build the program
    err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        size_t len;
        char buffer[2048];

        printf("Error: Failed to build program executable!\n");
        clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
        printf("%s\n", buffer);
        exit(1);
    }

    // Create the compute kernel from the program
    ko_vadd = clCreateKernel(program, "vadd", &err);
    if (!ko_vadd || err != CL_SUCCESS)
    {
        printf("Error: Failed to create compute kernel!\n");
        exit(1);
    }

    // Create the input (a, b) and output (c) arrays in device memory
    d_a = clCreateBuffer(context,  CL_MEM_READ_ONLY,  sizeof(float) * mycount, NULL, NULL);
    d_b = clCreateBuffer(context,  CL_MEM_READ_ONLY,  sizeof(float) * mycount, NULL, NULL);
    d_c = clCreateBuffer(context,  CL_MEM_WRITE_ONLY, sizeof(float) * mycount, NULL, NULL);
    if (!d_a || !d_b || !d_c)
    {
        printf("Error: Failed to allocate device memory!\n");
        exit(1);
    }

    // Write a and b vectors into compute device memory
    err = clEnqueueWriteBuffer(commands, d_a, CL_TRUE, 0, sizeof(float) * mycount, &h_a[rank*mycount], 0, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to write h_a to source array!\n");
        exit(1);
    }

    err = clEnqueueWriteBuffer(commands, d_b, CL_TRUE, 0, sizeof(float) * mycount, &h_b[rank*mycount], 0, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to write h_b to source array!\n");
        exit(1);
    }

    // Set the arguments to our compute kernel
    err  = clSetKernelArg(ko_vadd, 0, sizeof(cl_mem), &d_a);
    err |= clSetKernelArg(ko_vadd, 1, sizeof(cl_mem), &d_b);
    err |= clSetKernelArg(ko_vadd, 2, sizeof(cl_mem), &d_c);
    err |= clSetKernelArg(ko_vadd, 3, sizeof(unsigned int), &mycount);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to set kernel arguments! %d\n", err);
        exit(1);
    }

    // Get the maximum work group size for executing the kernel on the device
    err = clGetKernelWorkGroupInfo(ko_vadd, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to retrieve kernel work group info! %d\n", err);
        exit(1);
    }

    // Execute the kernel over the entire range of our 1d input data set
    // using the maximum number of work group items for this device
    global = LENGTH;
    err = clEnqueueNDRangeKernel(commands, ko_vadd, 1, NULL, &global, &local, 0, NULL, NULL);
    if (err)
    {
        printf("Error: Failed to execute kernel!\n");
        return EXIT_FAILURE;
    }

    // Wait for the commands to complete before reading back results
    clFinish(commands);

    // Read back the results from the compute device
    err = clEnqueueReadBuffer( commands, d_c, CL_TRUE, 0, sizeof(float) * mycount, &h_c, 0, NULL, NULL );
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to read output array! %d\n", err);
        exit(1);
    }

    err = MPI_Gather (h_c, mycount, MPI_FLOAT, _h_c, mycount, MPI_FLOAT, 0, MPI_COMM_WORLD);
    if (err != MPI_SUCCESS)
    {
        printf ("MPI_Gather failed receiving h_c\n");
        exit (-1);
    }

    if (rank == 0)
    {
        // Test the results
        correct = 0;
        float tmp;

        for(i = 0; i < LENGTH; i++)
        {
            tmp = h_a[i] + h_b[i];     // assign element i of a+b to tmp
            tmp -= _h_c[i];             // compute deviation of expected and output result
            if(tmp*tmp < TOL*TOL)      // correct if square deviation is less than tolerance squared
                correct++;
            else
                printf(" tmp %f h_a %f h_b %f h_c %f \n",tmp, h_a[i], h_b[i], _h_c[i]);
        }

        // summarize results
        printf("C = A+B:  %d out of %d results were correct.\n", correct, LENGTH);
    }

    // cleanup then shutdown
    clReleaseMemObject(d_a);
    clReleaseMemObject(d_b);
    clReleaseMemObject(d_c);
    clReleaseProgram(program);
    clReleaseKernel(ko_vadd);
    clReleaseCommandQueue(commands);
    clReleaseContext(context);

    err = MPI_Finalize ();
    if (err != MPI_SUCCESS)
    {
        printf ("MPI_Finalize failed!\n");
        exit (-1);
    }

    return 0;
}
int main(void) {
	//###############################################
	//
	// Declare variables for OpenCL
	//
	//###############################################
	int err;               // error code returned from OpenCL calls

	size_t global;                  // global domain size

	cl_device_id device_id;     // compute device id
	cl_context context;       // compute context
	cl_command_queue commands;      // compute command queue
	cl_program program;       // compute program
	cl_kernel ko_calculate_imagerowdots_iterations;       // compute kernel
	cl_kernel ko_calculate_colorrow;       // compute kernel

	cl_mem d_a;                    // device memory used for the input  a vector
	cl_mem d_b;                    // device memory

	int i;

	//###############################################
	//
	// Set values for mandelbrot
	//
	//###############################################

	//plane section values
	float x_ebene_min = -1;
	float y_ebene_min = -1;
	float x_ebene_max = 2;
	float y_ebene_max = 1;

	//monitor resolution values
	const long x_mon = 640;
	const long y_mon = 480;

	//Iterations
	long itr = 100;

	//abort condition
	float abort_value = 2;

	//Number of images per second
	long fps = 24;

	//video duration in seconds
	long video_duration = 3;

	//zoom speed in percentage
	float reduction = 5;

	//zoom dot
	my_complex_t zoom_dot;

	//###############################################
	//
	// Set up platform and GPU device
	//
	//###############################################

	cl_uint numPlatforms;

	// Find number of platforms
	err = clGetPlatformIDs(0, NULL, &numPlatforms);
	checkError(err, "Finding platforms");
	if (numPlatforms == 0) {
		printf("Found 0 platforms!\n");
		return EXIT_FAILURE;
	}

	// Get all platforms
	cl_platform_id Platform[numPlatforms];
	err = clGetPlatformIDs(numPlatforms, Platform, NULL);
	checkError(err, "Getting platforms");

	// Secure a GPU
	for (i = 0; i < numPlatforms; i++) {
		err = clGetDeviceIDs(Platform[i], DEVICE, 1, &device_id, NULL);
		if (err == CL_SUCCESS) {
			break;
		}
	}

	if (device_id == NULL)
		checkError(err, "Finding a device");

	err = output_device_info(device_id);
	checkError(err, "Printing device output");

	//###############################################
	//
	// Create context, command queue and kernel
	//
	//###############################################

	// Create a compute context
	context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
	checkError(err, "Creating context");

	// Create a command queue
	commands = clCreateCommandQueue(context, device_id, 0, &err);
	checkError(err, "Creating command queue");

	//Read Kernel source
	FILE *fp;
	char *source_str;
	size_t source_size, program_size;

	fp = fopen("./kernel/calculate_iterations.cl", "r");
	if (!fp) {
		printf("Failed to load kernel\n");
		return 1;
	}

	fseek(fp, 0, SEEK_END);
	program_size = ftell(fp);
	rewind(fp);
	source_str = (char*) malloc(program_size + 1);
	source_str[program_size] = '\0';
	fread(source_str, sizeof(char), program_size, fp);
	fclose(fp);

	// Create the compute program from the source buffer
	program = clCreateProgramWithSource(context, 1, (const char **) &source_str,
	NULL, &err);

	checkError(err, "Creating program");

	// Build the program
	err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
	if (err != CL_SUCCESS) {
		size_t len;
		char buffer[2048];

		printf("Error: Failed to build program executable!\n%s\n",
				err_code(err));
		clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG,
				sizeof(buffer), buffer, &len);
		printf("%s\n", buffer);

		// Determine the size of the log
		size_t log_size;
		clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL,
				&log_size);

		// Allocate memory for the log
		char *log = (char *) malloc(log_size);

		// Get the log
		clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG,
				log_size, log, NULL);

		// Print the log
		printf("%s\n", log);

		return EXIT_FAILURE;
	}

	// Create the compute kernel from the program
	ko_calculate_imagerowdots_iterations = clCreateKernel(program,
			"calculate_imagerowdots_iterations", &err);
	checkError(err, "Creating kernel");

	// Create the compute kernel from the program
	ko_calculate_colorrow = clCreateKernel(program, "calculate_colorrow", &err);
	checkError(err, "Creating kernel");

	int number_images = 0;
	do {
		//Get memory for image
		long* h_image = (long*) calloc(x_mon * y_mon, sizeof(long));
		unsigned char* h_image_pixel = (unsigned char*) calloc(
				x_mon * y_mon * 3, sizeof(unsigned char));

		//###############################################
		//###############################################
		//
		// Loop to calculate image dot iterations
		//
		//###############################################
		//###############################################

		float y_value = y_ebene_max;
		float delta_y = delta(y_ebene_min, y_ebene_max, y_mon);

		for (int row = 0; row < y_mon; ++row) {
			//###############################################
			//
			// Create and write buffer
			//
			//###############################################

			//Get memory for row
			long* h_image_row = (long*) calloc(x_mon, sizeof(long)); // a vector

			d_a = clCreateBuffer(context, CL_MEM_READ_WRITE,
					sizeof(long) * x_mon,
					NULL, &err);
			checkError(err, "Creating buffer d_a");

			// Write a vector into compute device memory
			err = clEnqueueWriteBuffer(commands, d_a, CL_TRUE, 0,
					sizeof(long) * x_mon, h_image_row, 0, NULL, NULL);
			checkError(err, "Copying h_a to device at d_a");

			//###############################################
			//
			// Set the arguments to our compute kernel
			//
			//###############################################

			err = clSetKernelArg(ko_calculate_imagerowdots_iterations, 0,
					sizeof(float), &x_ebene_min);
			err |= clSetKernelArg(ko_calculate_imagerowdots_iterations, 1,
					sizeof(float), &x_ebene_max);
			err |= clSetKernelArg(ko_calculate_imagerowdots_iterations, 2,
					sizeof(float), &y_value);
			err |= clSetKernelArg(ko_calculate_imagerowdots_iterations, 3,
					sizeof(long), &x_mon);
			err |= clSetKernelArg(ko_calculate_imagerowdots_iterations, 4,
					sizeof(float), &abort_value);
			err |= clSetKernelArg(ko_calculate_imagerowdots_iterations, 5,
					sizeof(long), &itr);
			err |= clSetKernelArg(ko_calculate_imagerowdots_iterations, 6,
					sizeof(cl_mem), &d_a);
			checkError(err, "Setting kernel arguments");

			/*__kernel void calculate_imagerowdots_iterations(const float x_min, const float x_max,
			 const float y_value, const long x_mon, const float abort_value, const long itr,
			 __global long * imagerow)*/

			// Execute the kernel over the entire range of our 1d input data set
			// letting the OpenCL runtime choose the work-group size
			global = x_mon;
			err = clEnqueueNDRangeKernel(commands,
					ko_calculate_imagerowdots_iterations, 1, NULL, &global,
					NULL, 0,
					NULL, NULL);
			checkError(err, "Enqueueing kernel");

			// Wait for the commands to complete
			err = clFinish(commands);
			checkError(err, "Waiting for kernel to finish");

			// Read back the results from the compute device
			err = clEnqueueReadBuffer(commands, d_a, CL_TRUE, 0,
					sizeof(long) * x_mon, h_image_row, 0, NULL, NULL);
			if (err != CL_SUCCESS) {
				printf("Error: Failed to read output array!\n%s\n",
						err_code(err));
				exit(1);
			}

			//reduce y
			y_value -= delta_y;

			//cope row to image
			memcpy(h_image + row * x_mon, h_image_row, sizeof(long) * x_mon);

			free(h_image_row);
		}

//		for (i = 0; i < x_mon * y_mon; ++i) {
//			printf("%ld ", h_image[i]);
//		}
//		fflush(stdout);

		//###############################################
		//###############################################
		//
		// End of loop to calculate image dot iterations
		//
		//###############################################
		//###############################################

		//###############################################
		//###############################################
		//
		// Beginn color calculation
		//
		//###############################################
		//###############################################

		for (int row = 0; row < y_mon; ++row) {
			//Get memory for row
			long* h_image_row = (long*) calloc(x_mon, sizeof(long)); // a vector
			memcpy(h_image_row, h_image + row * x_mon, sizeof(long) * x_mon);

			d_a = clCreateBuffer(context, CL_MEM_READ_ONLY,
					sizeof(long) * x_mon,
					NULL, &err);
			checkError(err, "Creating buffer d_a");

			// Write a vector into compute device memory
			err = clEnqueueWriteBuffer(commands, d_a, CL_TRUE, 0,
					sizeof(long) * x_mon, h_image_row, 0, NULL, NULL);
			checkError(err, "Copying h_image_row to device at d_a");

			unsigned char* h_imagepixel_row = (unsigned char*) calloc(x_mon * 3,
					sizeof(unsigned char));     // a vector

			d_b = clCreateBuffer(context, CL_MEM_READ_WRITE,
					sizeof(unsigned char) * x_mon * 3,
					NULL, &err);
			checkError(err, "Creating buffer d_b");

			// Write a vector into compute device memory
			err = clEnqueueWriteBuffer(commands, d_b, CL_TRUE, 0,
					sizeof(unsigned char) * x_mon * 3, h_imagepixel_row, 0,
					NULL, NULL);
			checkError(err, "Copying h_imagepixel_row to device at d_b");

			//###############################################
			//
			// Set the arguments to our compute kernel
			//
			//###############################################

			err = clSetKernelArg(ko_calculate_colorrow, 0, sizeof(long),
					&x_mon);
			err |= clSetKernelArg(ko_calculate_colorrow, 1, sizeof(long), &itr);
			err |= clSetKernelArg(ko_calculate_colorrow, 2, sizeof(cl_mem),
					&d_a);
			err |= clSetKernelArg(ko_calculate_colorrow, 3, sizeof(cl_mem),
					&d_b);
			checkError(err, "Setting kernel arguments");

			/*__kernel void calculate_colorrow(const long width, long itr, long * imagerowvalues,
			 unsigned char * imagerow)*/

			// Execute the kernel over the entire range of our 1d input data set
			// letting the OpenCL runtime choose the work-group size
			global = x_mon;
			err = clEnqueueNDRangeKernel(commands, ko_calculate_colorrow, 1,
			NULL, &global, NULL, 0,
			NULL, NULL);
			checkError(err, "Enqueueing kernel");

			// Wait for the commands to complete
			err = clFinish(commands);
			checkError(err, "Waiting for kernel to finish");

			// Read back the results from the compute device
			err = clEnqueueReadBuffer(commands, d_b, CL_TRUE, 0,
					sizeof(unsigned char) * x_mon * 3, h_imagepixel_row, 0,
					NULL, NULL);
			if (err != CL_SUCCESS) {
				printf("Error: Failed to read output array!\n%s\n",
						err_code(err));
				exit(1);
			}

			memcpy(h_image_pixel + row * x_mon * 3, h_imagepixel_row,
					sizeof(unsigned char) * x_mon * 3);

			free(h_image_row);
			free(h_imagepixel_row);
		}

		if (number_images == 0) {
			zoom_dot = find_dot_to_zoom(x_ebene_min, x_ebene_max, y_ebene_min,
					y_ebene_max, h_image, y_mon, x_mon, itr);
		}

		reduce_plane_section_focus_dot(&x_ebene_min, &x_ebene_max, &y_ebene_min,
				&y_ebene_max, reduction, zoom_dot);


		// save the image
		char filename[50];
		sprintf(filename, "img-%d.bmp", number_images);

		safe_image_to_bmp(x_mon, y_mon, h_image_pixel, filename);

		free(h_image);
		free(h_image_pixel);

		number_images++;
		itr = (long) (itr + itr * reduction / 100);
		printf("%d\n", number_images);
		fflush(stdout);
	} while (number_images < (fps * video_duration));

	//###############################################
	//
	// cleanup then shutdown
	//
	//###############################################

	clReleaseMemObject(d_a);
	clReleaseMemObject(d_b);
	clReleaseProgram(program);
	clReleaseKernel(ko_calculate_imagerowdots_iterations);
	clReleaseCommandQueue(commands);
	clReleaseContext(context);

	return 0;
}