Exemplo n.º 1
0
cl_kernel kernel_from_string(cl_context ctx,
    char const *knl, char const *knl_name, char const *options)
{
  // create an OpenCL program (may have multiple kernels)
  size_t sizes[] = { strlen(knl) };

  cl_int status;
  cl_program program = clCreateProgramWithSource(ctx, 1, &knl, sizes, &status);
  CHECK_CL_ERROR(status, "clCreateProgramWithSource");

  // build it
  status = clBuildProgram(program, 0, NULL, options, NULL, NULL);

  if (status != CL_SUCCESS)
  {
    // build failed, get build log and print it

    cl_device_id dev;
    CALL_CL_GUARDED(clGetProgramInfo, (program, CL_PROGRAM_DEVICES,
          sizeof(dev), &dev, NULL));

    size_t log_size;
    CALL_CL_GUARDED(clGetProgramBuildInfo, (program, dev, CL_PROGRAM_BUILD_LOG,
          0, NULL, &log_size));

    char *log = (char *) malloc(log_size);
    CHECK_SYS_ERROR(!log, "kernel_from_string: allocate log");

    char devname[MAX_NAME_LEN];
    CALL_CL_GUARDED(clGetDeviceInfo, (dev, CL_DEVICE_NAME,
          sizeof(devname), devname, NULL));

    CALL_CL_GUARDED(clGetProgramBuildInfo, (program, dev, CL_PROGRAM_BUILD_LOG,
          log_size, log, NULL));
    fprintf(stderr, "*** build of '%s' on '%s' failed:\n%s\n*** (end of error)\n",
        knl_name, devname, log);
    abort();
  }
  else
    CHECK_CL_ERROR(status, "clBuildProgram");

  // fish the kernel out of the program
  cl_kernel kernel = clCreateKernel(program, knl_name, &status);
  CHECK_CL_ERROR(status, "clCreateKernel");

  CALL_CL_GUARDED(clReleaseProgram, (program));

  return kernel;
}
Exemplo n.º 2
0
 std::string get_event_type(cl_event event) {
     cl_command_type type;
     CALL_CL_GUARDED(clGetEventInfo,
             (event, CL_EVENT_COMMAND_TYPE, sizeof(type), &type, NULL));
     std::string result;
     switch (type) {
         case CL_COMMAND_NDRANGE_KERNEL:
             result = "NDRANGE_KERNEL";
             break;
         case CL_COMMAND_TASK:
             result = "TASK";
             break;
         case CL_COMMAND_NATIVE_KERNEL:
             result = "NATIVE_KERNEL";
             break;
         case CL_COMMAND_READ_BUFFER:
             result = "READ_BUFFER";
             break;
         case CL_COMMAND_WRITE_BUFFER:
             result = "WRITE_BUFFER";
             break;
         case CL_COMMAND_COPY_BUFFER:
             result =  "COPY_BUFFER";
             break;
         case CL_COMMAND_READ_IMAGE:
             result =  "READ_IMAGE";
             break;
         case CL_COMMAND_WRITE_IMAGE:
             result =  "WRITE_IMAGE";
             break;
         case CL_COMMAND_COPY_IMAGE:
             result =  "COPY_IMAGE";
             break;
         case CL_COMMAND_COPY_BUFFER_TO_IMAGE:
             result =  "COPY_BUFFER_TO_IMAGE";
             break;
         case CL_COMMAND_COPY_IMAGE_TO_BUFFER:
             result =  "COPY_IMAGE_TO_BUFFER";
             break;
         case CL_COMMAND_MAP_BUFFER:
             result =  "MAP_BUFFER";
             break;
         case CL_COMMAND_MAP_IMAGE:
             result =  "MAP_IMAGE";
             break;
         case CL_COMMAND_UNMAP_MEM_OBJECT:
             result =  "UNMAP_MEM_OBJECT";
             break;
         case CL_COMMAND_MARKER:
             result =  "MARKER";
             break;
         case CL_COMMAND_ACQUIRE_GL_OBJECTS:
             result =  "ACQUIRE_GL_OBJECTS";
             break;
         case CL_COMMAND_RELEASE_GL_OBJECTS:
             result = "RELEASE_GL_OBJECTS";
             break;
     }
     return result;
 }
Exemplo n.º 3
0
void print_device_info_from_queue(cl_command_queue queue)
{
  cl_device_id dev;
  CALL_CL_GUARDED(clGetCommandQueueInfo,
      (queue, CL_QUEUE_DEVICE, sizeof dev, &dev, NULL));

  print_device_info(dev);
}
Exemplo n.º 4
0
//TODO remove this at some point (deprecated)
cl_kernel kernel_from_string(cl_context ctx, 
    char const *knl, char const *knl_name, char const *options)
{
  size_t sizes[] = { strlen(knl) };

  cl_int status;
  cl_program program = clCreateProgramWithSource(ctx, 1, &knl, sizes, &status);
  CHECK_CL_ERROR(status, "clCreateProgramWithSource");

  status = clBuildProgram(program, 0, NULL, options, NULL, NULL);

  if (status != CL_SUCCESS)
  {
    // build failed, get build log.

    cl_device_id dev;
    CALL_CL_GUARDED(clGetProgramInfo, (program, CL_PROGRAM_DEVICES,
          sizeof(dev), &dev, NULL));

    size_t log_size;
    CALL_CL_GUARDED(clGetProgramBuildInfo, (program, dev, CL_PROGRAM_BUILD_LOG,
          0, NULL, &log_size));

    char *log = malloc(log_size);
    CHECK_SYS_ERROR(!log, "kernel_from_string: allocate log");

    char devname[100];
    CALL_CL_GUARDED(clGetDeviceInfo, (dev, CL_DEVICE_NAME,
          sizeof(devname), devname, NULL));

    CALL_CL_GUARDED(clGetProgramBuildInfo, (program, dev, CL_PROGRAM_BUILD_LOG,
          log_size, log, NULL));
    THError("*** build of '%s' on '%s' failed:\n%s\n*** (end of error)\n",
            knl_name, devname, log);
  }
  else
    CHECK_CL_ERROR(status, "clBuildProgram");

  cl_kernel kernel = clCreateKernel(program, knl_name, &status);
  CHECK_CL_ERROR(status, "clCreateKernel");

  CALL_CL_GUARDED(clReleaseProgram, (program));

  return kernel;
}
Exemplo n.º 5
0
void get_device_name_from_queue(cl_command_queue queue, char * buf, int bufsize)
{
   cl_device_id dev;
   CALL_CL_GUARDED(clGetCommandQueueInfo,
                   (queue, CL_QUEUE_DEVICE, sizeof dev, &dev, NULL));
   
   clGetDeviceInfo(dev, CL_DEVICE_NAME, bufsize, buf, NULL);
   
}
void fft_1D(cl_mem a,cl_mem b,cl_mem c, int N, cl_kernel init, cl_kernel knl,cl_command_queue queue,int direction,int offset_line)
{
	//handle complex-to-complex fft, accutal size = 2 * N

	//size_t ldim[] = { 128 };
	//size_t gdim[] = { (N /ldim[0])/2};
	int Ns = 1;
	int y =0;
	SET_7_KERNEL_ARGS(init, a, b, N, Ns,direction,offset_line,y);


	size_t ldim[] = { 1 };
	size_t gdim[] = { N/4 };

	CALL_CL_GUARDED(clEnqueueNDRangeKernel,
			(queue, init,
			 1, NULL, gdim, ldim,
			0, NULL, NULL));
	
	for(Ns=4; Ns<N; Ns<<=2)
	{



			SET_6_KERNEL_ARGS(knl, b, c, N, Ns,direction,offset_line);
			size_t ldim[] = { 1 };
			size_t gdim[] = { N/4 };
			
			CALL_CL_GUARDED(clEnqueueNDRangeKernel,
					(queue, knl,
					 1, NULL, gdim, ldim,
					0, NULL, NULL));
			clEnqueueCopyBuffer(queue,c,b,
					offset_line*N*2*sizeof(float),
					offset_line*N*2*sizeof(float),
					sizeof(float)*N*2,0,NULL,NULL);
			//VecCopy(c,b,N,offset_line,vec_copy,queue);
			
			
			
		  
	}
	
}
Exemplo n.º 7
0
void update_walker_positions_device(sampler *samp){
    /*
     Update walker positions and corresponding PDF values on device.

     Input:
          sampler *samp                  Pointer to sampler structure which has been initialized.

     Output:
                                         Walker positions updated on device.
                                         Log PDF values updated on device.
     */



    // --------------------------------------------------------------------------
    // transfer to device
    // --------------------------------------------------------------------------

    CALL_CL_GUARDED(clEnqueueWriteBuffer, (
        samp->queue, samp->X_red_device, /*blocking*/ CL_TRUE, /*offset*/ 0,
        samp->N * samp->K_over_two * sizeof(cl_float), samp->X_red_host,
        0, NULL, NULL));

    CALL_CL_GUARDED(clEnqueueWriteBuffer, (
        samp->queue, samp->log_pdf_red_device, /*blocking*/ CL_TRUE, /*offset*/ 0,
        samp->K_over_two * sizeof(cl_float), samp->log_pdf_red_host,
        0, NULL, NULL));

    CALL_CL_GUARDED(clEnqueueWriteBuffer, (
        samp->queue, samp->X_black_device, /*blocking*/ CL_TRUE, /*offset*/ 0,
        samp->N * samp->K_over_two * sizeof(cl_float), samp->X_black_host,
        0, NULL, NULL));

    CALL_CL_GUARDED(clEnqueueWriteBuffer, (
        samp->queue, samp->log_pdf_black_device, /*blocking*/ CL_TRUE, /*offset*/ 0,
        samp->K_over_two * sizeof(cl_float), samp->log_pdf_black_host,
        0, NULL, NULL));

    CALL_CL_GUARDED(clFinish, (samp->queue));

    if(OUTPUT_LEVEL > 0) printf("Walker update to device completed.\n");

}
Exemplo n.º 8
0
 void print_profile(cl_event event, cl_int status, void* data) {
     cl_long t_enqueue, t_submit, t_start, t_end;
     CALL_CL_GUARDED(clGetEventProfilingInfo,
             (event, CL_PROFILING_COMMAND_QUEUED,
             sizeof(t_enqueue), &t_enqueue, NULL));
     CALL_CL_GUARDED(clGetEventProfilingInfo,
             (event, CL_PROFILING_COMMAND_SUBMIT,
             sizeof(t_submit), &t_submit, NULL));
     CALL_CL_GUARDED(clGetEventProfilingInfo,
             (event, CL_PROFILING_COMMAND_START,
             sizeof(t_start), &t_start, NULL));
     CALL_CL_GUARDED(clGetEventProfilingInfo,
             (event, CL_PROFILING_COMMAND_END,
             sizeof(t_end), &t_end, NULL));
     std::cout << get_event_type(event) << "  status: " << status << std::endl;
     std::cout << "time on queue : "
         << (t_submit - t_enqueue) / 1.0e9 << "s" << std::endl;
     std::cout << "time submision: "
         << (t_start - t_submit) / 1.0e9 << "s" << std::endl;
     std::cout << "time execution: "
         << (t_end - t_start) / 1.0e9 << "s" << std::endl;
 }
void mat__trans(cl_mem a, cl_mem b, int N, cl_kernel mat_trans, cl_command_queue queue,int option, float epsilon,float k,float s)
{
	cl_long offset = 0;
	SET_8_KERNEL_ARGS(mat_trans, a, b, N, option,epsilon,k,s,offset);

	size_t ldim[] = { 16, 16 };
	size_t gdim[] = { N, N };
	CALL_CL_GUARDED(clEnqueueNDRangeKernel,
	(queue, mat_trans,
	/*dimensions*/ 2, NULL, gdim, ldim,
	0, NULL, NULL));


}
Exemplo n.º 10
0
void print_platforms_devices()
{
  // get number of platforms
  cl_uint plat_count;
  CALL_CL_GUARDED(clGetPlatformIDs, (0, NULL, &plat_count));

  // allocate memory, get list of platforms
  cl_platform_id *platforms =
    (cl_platform_id *) malloc(plat_count*sizeof(cl_platform_id));
  CHECK_SYS_ERROR(!platforms, "allocating platform array");

  CALL_CL_GUARDED(clGetPlatformIDs, (plat_count, platforms, NULL));

  // iterate over platforms
  for (cl_uint i = 0; i < plat_count; ++i)
  {
    // get platform vendor name
    char buf[MAX_NAME_LEN];
    CALL_CL_GUARDED(clGetPlatformInfo, (platforms[i], CL_PLATFORM_VENDOR,
          sizeof(buf), buf, NULL));
    printf("platform %d: vendor '%s'\n", i, buf);

    // get number of devices in platform
    cl_uint dev_count;
    CALL_CL_GUARDED(clGetDeviceIDs, (platforms[i], CL_DEVICE_TYPE_ALL,
          0, NULL, &dev_count));

    cl_device_id *devices =
      (cl_device_id *) malloc(dev_count*sizeof(cl_device_id));
    CHECK_SYS_ERROR(!devices, "allocating device array");

    // get list of devices in platform
    CALL_CL_GUARDED(clGetDeviceIDs, (platforms[i], CL_DEVICE_TYPE_ALL,
          dev_count, devices, NULL));

    // iterate over devices
    for (cl_uint j = 0; j < dev_count; ++j)
    {
      char buf[MAX_NAME_LEN];
      CALL_CL_GUARDED(clGetDeviceInfo, (devices[j], CL_DEVICE_NAME,
            sizeof(buf), buf, NULL));
      printf("  device %d: '%s'\n", j, buf);
    }

    free(devices);
  }

  free(platforms);
}
Exemplo n.º 11
0
//TODO move this to the routine opencl.printAllPlatform()
void print_platforms_devices()
{
  cl_uint plat_count;

  CALL_CL_GUARDED(clGetPlatformIDs, (0, NULL, &plat_count));

  cl_platform_id *platforms = 
    (cl_platform_id *) malloc(plat_count*sizeof(cl_platform_id));
  CHECK_SYS_ERROR(!platforms, "allocating platform array");

  CALL_CL_GUARDED(clGetPlatformIDs, (plat_count, platforms, NULL));
  cl_uint i;
  for (i = 0; i < plat_count; ++i)
  {
    char buf[100];
    CALL_CL_GUARDED(clGetPlatformInfo, (platforms[i], CL_PLATFORM_VENDOR, 
          sizeof(buf), buf, NULL));
    printf("plat %d: vendor '%s'\n", i, buf);

    cl_uint dev_count;
    CALL_CL_GUARDED(clGetDeviceIDs, (platforms[i], CL_DEVICE_TYPE_ALL,
          0, NULL, &dev_count));

    cl_device_id *devices = 
      (cl_device_id *) malloc(dev_count*sizeof(cl_device_id));
    CHECK_SYS_ERROR(!devices, "allocating device array");

    CALL_CL_GUARDED(clGetDeviceIDs, (platforms[i], CL_DEVICE_TYPE_ALL,
          dev_count, devices, NULL));

    cl_uint j;
    for (j = 0; j < dev_count; ++j)
    {
      char buf[100];
      CALL_CL_GUARDED(clGetDeviceInfo, (devices[j], CL_DEVICE_NAME,
            sizeof(buf), buf, NULL));
      printf("  dev %d '%s'\n", j, buf);
    }

    free(devices);
  }

  free(platforms);
}
Exemplo n.º 12
0
void run_simulated_annealing(sampler *samp, cl_float *cooling_schedule, cl_int annealing_loops, cl_int steps_per_loop){
    /*
     Run the simulated annealing to allow the walkers to explore the space
         and (hopefully) increase convergence speed.
     Discard all the samples generated by this routine.
     Reset all the counters for acceptance rates.

     Input:
          sampler *samp                  Pointer to sampler structure which has been initialized.
          cl_float *cooling_schedule     Values of beta for the simulated annealing
                                         Values should be increasing and the final value should be one
          cl_int annealing_loops         Number of loops
          cl_int steps_per_loop          Iterations per loop

     Output:
                                         Pre-allocated sampler arrays now have had simulated annealing performed.
     */

    for(int annealing_step=0; annealing_step<annealing_loops; annealing_step++){

        // set the beta value for this iteration
        (samp->data_st)->beta = cooling_schedule[annealing_step];
        (samp->data_st)->save = 0;

        // update the data structure accordingly
        CALL_CL_GUARDED(clEnqueueWriteBuffer, (
            samp->queue, samp->data_st_device, /*blocking*/ CL_TRUE, /*offset*/ 0,
            sizeof(data_struct), samp->data_st,
            0, NULL, NULL));
        CALL_CL_GUARDED(clFinish, (samp->queue));

        for(int it=0; it<steps_per_loop; it++){
            SET_9_KERNEL_ARGS(samp->stretch_knl,
                  samp->X_red_device,
                  samp->log_pdf_red_device,
                  samp->X_black_device,
                  samp->ranluxcltab,
                  samp->accepted_device,
                  samp->data_device,
                  samp->data_st_device,
                  samp->indices_to_save_device,
                  samp->X_red_save);

            CALL_CL_GUARDED(clEnqueueNDRangeKernel,
                  (samp->queue, samp->stretch_knl,
                   1, NULL, samp->gdim, samp->ldim,
                   0, NULL, NULL));

            SET_9_KERNEL_ARGS(samp->stretch_knl,
                  samp->X_black_device,
                  samp->log_pdf_black_device,
                  samp->X_red_device,
                  samp->ranluxcltab,
                  samp->accepted_device,
                  samp->data_device,
                  samp->data_st_device,
                  samp->indices_to_save_device,
                  samp->X_black_save);

            CALL_CL_GUARDED(clEnqueueNDRangeKernel,
                  (samp->queue, samp->stretch_knl,
                   1, NULL, samp->gdim, samp->ldim,
                   0, NULL, NULL));

            CALL_CL_GUARDED(clFinish, (samp->queue));
        }

        if(OUTPUT_LEVEL > 0) printf("Annealing iteration %d\n", annealing_step * steps_per_loop);
    }

    // reset the acceptance counter after the annealing
    for(int i=0; i< (samp->K_over_two); i++) samp->accepted_host[i] = 0;
    CALL_CL_GUARDED(clEnqueueWriteBuffer, (
        samp->queue, samp->accepted_device, /*blocking*/ CL_TRUE, /*offset*/ 0,
        samp->K_over_two * sizeof(cl_ulong), samp->accepted_host,
        0, NULL, NULL));

    CALL_CL_GUARDED(clFinish, (samp->queue));

}
Exemplo n.º 13
0
void run_burn_in(sampler *samp, int burn_length){
    /*
     Run the sampler to burn in.
     Discard all the samples generated by this routine.
     Reset all the counters for acceptance rates.

     Input:
          sampler *samp        Pointer to sampler structure which has been initialized.
          int burn_length      Number of burn in steps to run.

     Output:
          Pre-allocated sampler arrays now have had burn-in performed.
     */

    // reset beta
    (samp->data_st)->beta = 1.0f;
    (samp->data_st)->save = 0;

    // update the data structure accordingly
    CALL_CL_GUARDED(clEnqueueWriteBuffer, (
        samp->queue, samp->data_st_device, /*blocking*/ CL_TRUE, /*offset*/ 0,
        sizeof(data_struct), samp->data_st,
        0, NULL, NULL));
    CALL_CL_GUARDED(clFinish, (samp->queue));

    // do the burn in
    for(int it=0; it<burn_length; it++){

        SET_9_KERNEL_ARGS(samp->stretch_knl,
              samp->X_red_device,
              samp->log_pdf_red_device,
              samp->X_black_device,
              samp->ranluxcltab,
              samp->accepted_device,
              samp->data_device,
              samp->data_st_device,
              samp->indices_to_save_device,
              samp->X_red_save );

        CALL_CL_GUARDED(clEnqueueNDRangeKernel,
              (samp->queue, samp->stretch_knl,
               1, NULL, samp->gdim, samp->ldim,
               0, NULL, NULL));

        SET_9_KERNEL_ARGS(samp->stretch_knl,
              samp->X_black_device,
              samp->log_pdf_black_device,
              samp->X_red_device,
              samp->ranluxcltab,
              samp->accepted_device,
              samp->data_device,
              samp->data_st_device,
              samp->indices_to_save_device,
              samp->X_black_save);

        CALL_CL_GUARDED(clEnqueueNDRangeKernel,
              (samp->queue, samp->stretch_knl,
               1, NULL, samp->gdim, samp->ldim,
               0, NULL, NULL));

        CALL_CL_GUARDED(clFinish, (samp->queue));

        if( ((it % MAX((burn_length/10),1)) == 0) && (OUTPUT_LEVEL > 0))
                printf("Burn iteration %d\n", it);
    }


    // make sure everything is done with the burn in
    CALL_CL_GUARDED(clFinish, (samp->queue));

    // reset the acceptance counter after the burn in
    for(int i=0; i< (samp->K_over_two); i++)
        samp->accepted_host[i] = 0;

    CALL_CL_GUARDED(clEnqueueWriteBuffer, (
        samp->queue, samp->accepted_device, /*blocking*/ CL_TRUE, /*offset*/ 0,
        samp->K_over_two * sizeof(cl_ulong), samp->accepted_host,
        0, NULL, NULL));

    CALL_CL_GUARDED(clFinish, (samp->queue));

    if(OUTPUT_LEVEL > 0) printf("Burn in complete.\n");

}
Exemplo n.º 14
0
sampler* initialize_sampler(cl_int chain_length, cl_int dimension,
                            cl_int walkers_per_group, size_t work_group_size,
                            double a, cl_int pdf_number,
                            cl_int data_length, cl_float *data,
                            cl_int num_to_save, cl_int *indices_to_save,
                            const char *plat_name, const char *dev_name){

    /*
     Initialize stretch move MCMC sampler struct.
     Arrange parameters into sampler struct pointer.
     Allocate arrays on host, initialize walkers and other values as appropriate.
     Start OpenCL context and queue.
     Allocate device memory and transfer from host.
     Compile and initialize random number generator.
     Compile stretch move OpenCL kernel.

     Input:
          cl_int chain_length                Allocate space for this many samples in the sampler struct.
                                                 Sampler fills this array when run_sampler is called.
          cl_int dimension                   Dimension of state vector of Markov chain.
          cl_int walkers_per_group           Number of walkers in each of two groups. Total walkers is twice this.
          size_t work_group_size             Work group size.
                                                 For CPU this must be set to one.
                                                 For GPU this should be set larger, powers of two are optimal, try 64, 128 or 256.
                                                 This number must divide walkers_per_group.
          double a                           Coefficient for range of 'z' random variable.
                                                 Must be greater than one.
                                                 Standard value is 2.
                                                 Decrease a to increase low acceptance rate, especially in high dimensions.
          cl_int pdf_number                  Which PDF to sample. Passed to pdf.h as a compile time definition.
          cl_int data_length                 Length of observation data. If no data set this to zero.
          cl_float *data                     Observation data.
          cl_int num_to_save                 Number of components to save in the chain
          cl_int *indices_to_save            Indices of components to save in the chain
          const char *plat_name              String for platform name. Set to CHOOSE_INTERACTIVELY (no quotes) to do so.
          const char *dev_name               String for device name. Set to CHOOSE_INTERACTIVELY (no quotes) to do so.

     Output:
          returned: sampler *samp            Pointer to sampler struct with parameters, arrays, context, queue, kernel initialized.
     */


    if(OUTPUT_LEVEL > 0) printf("Initializing Stretch Move sampler.\n");


    // --------------------------------------------------------------------------
    // Set parameters
    // --------------------------------------------------------------------------

    // This environment variable forces headers to be reloaded each time
    // If not set and pdf if changed, changes may not be updated
    setenv("CUDA_CACHE_DISABLE", "1", 1);

    // allocate the structure for all the sampler parameters and arrays
    sampler * samp = (sampler *) malloc(sizeof(sampler));
    if(!samp) { perror("Allocation failure sampler"); abort(); }

    // user set parameters
    samp->M = chain_length;                           // Number of steps to run
    samp->N = dimension;                              // Dimension of the problem and the walkers
    samp->K_over_two = walkers_per_group ;            // Number of walkers in each group

    // derived parameters
    samp->K = 2 * samp->K_over_two;                   // Total walkers
    samp->total_samples = samp->M * samp->K;          // Total samples produced

    // indices to save
    samp->num_to_save = num_to_save;
    samp->indices_to_save_host = indices_to_save;

    // Allocate the structure and set values
    samp->data_st = (data_struct *) malloc(sizeof(data_struct));
    if(!(samp->data_st)) { perror("Allocation failure data_struct"); abort(); }

    // default value one, unless performing simulated annealing
    (samp->data_st)->beta         = 1.0f;
    (samp->data_st)->save         = 1;
    (samp->data_st)->num_to_save  = num_to_save;

    // coefficient on Z random variable
    samp->a = a;
    double a_coeffs[3];
    a_coeffs[0] = 1.0 / a;
    a_coeffs[1] = 2.0 * (1.0 - 1.0/a);
    a_coeffs[2] = a - 2.0 + 1.0/a;


    // error check on dimensions
    if(samp->K <= samp->N){
        fprintf(stderr, "Error: Must have more walkers than the dimension.\nExiting\n");
        abort();
    }

    // error check on work sizes
    if( (samp->K_over_two % work_group_size) != 0){
        fprintf(stderr, "Error: Number of walkers in each group must be multiple of work group size.\nExiting\n");
        abort();
    }

    // error check on dimensions to save
    for(int i=0; i<num_to_save; i++){
        if(samp->indices_to_save_host[i] >= samp->N){
            fprintf(stderr, "Error: Cannot save an index larger than the dimension of the problem.\nExiting\n");
            abort();
        }
    }

    if(a <= 1.0){
        fprintf(stderr, "Error: Value of a must be greater than one.\nDefaulting to 2.\n");
        samp->a = 2.0;
    }


    // for later output
    samp->acor_times  = (double *) malloc(samp->num_to_save * sizeof(double));
    if(!samp->acor_times) { perror("Allocation failure"); abort(); }
    samp->acor_pass   = (char   *) malloc(samp->num_to_save * sizeof(char));
    if(!samp->acor_pass) { perror("Allocation failure"); abort(); }
    samp->sigma       = (double *) malloc(samp->num_to_save * sizeof(double));
    if(!samp->sigma)      { perror("Allocation failure"); abort(); }
    samp->means       = (double *) malloc(samp->num_to_save * sizeof(double));
    if(!samp->means)      { perror("Allocation failure"); abort(); }
    samp->err_bar     = (double *) malloc(samp->num_to_save * sizeof(double));
    if(!samp->err_bar)    { perror("Allocation failure"); abort(); }

    // write parameter file for plotting
    write_parameter_file_matlab(samp->M, samp->N, samp->K, "Stretch Move",
                            samp->indices_to_save_host, samp->num_to_save, pdf_number);

    // --------------------------------------------------------------------------
    // Set up OpenCL context and queues
    // --------------------------------------------------------------------------
    if(OUTPUT_LEVEL > 0) printf("Begin opencl contexts.\n");

    create_context_on(plat_name, dev_name, 0, &(samp->ctx), NULL, 0);

    {
      cl_int status;
      cl_device_id my_dev;

      CALL_CL_GUARDED(clGetContextInfo, (samp->ctx, CL_CONTEXT_DEVICES,
            sizeof(my_dev), &my_dev, NULL));

      samp->queue = clCreateCommandQueue(samp->ctx, my_dev, 0, &status);
      CHECK_CL_ERROR(status, "clCreateCommandQueue");
      samp->queue_mem = clCreateCommandQueue(samp->ctx, my_dev, 0, &status);
      CHECK_CL_ERROR(status, "clCreateCommandQueue");
    }

    // print information on selected device
    if(OUTPUT_LEVEL > 1)  print_device_info_from_queue(samp->queue);

    // set the work group sizes
    samp->ldim[0] = work_group_size;
    samp->gdim[0] = samp->K_over_two;

    if(OUTPUT_LEVEL > 0) printf("Context built.\n");


    // --------------------------------------------------------------------------
    // Start total timing
    // --------------------------------------------------------------------------
    if(OUTPUT_LEVEL > 0) printf("Begin total timing.\n");
    get_timestamp(&(samp->time1_total));


    // --------------------------------------------------------------------------
    // Allocate host memory
    // --------------------------------------------------------------------------

    // counter for number of samples accepted
    samp->accepted_host = (cl_ulong *) malloc(samp->K_over_two * sizeof(cl_ulong));
    if(!(samp->accepted_host)){ perror("Allocation failure accepted host"); abort(); }
    for(int i=0; i< (samp->K_over_two); i++) samp->accepted_host[i] = 0;

    // Adjacent memory on x_red moves with in the walker
    // To access the ith component of walker j, take x_red[i + j*N];

    // red walkers
    samp->X_red_host = (cl_float *) malloc(samp->N * samp->K_over_two * sizeof(cl_float));
    if(!(samp->X_red_host)){ perror("Allocation failure X_red_host"); abort(); }

    // log likelihood
    samp->log_pdf_red_host = (cl_float *) malloc(samp->K_over_two * sizeof(cl_float));
    if(!(samp->log_pdf_red_host)){ perror("Allocation failure X_red_host"); abort(); }
    for(int i=0; i<(samp->K_over_two); i++) samp->log_pdf_red_host[i] = (-1.0f) / 0.0f;

    // black walkers
    samp->X_black_host = (cl_float *) malloc(samp->N * samp->K_over_two * sizeof(cl_float));
    if(!(samp->X_black_host)){ perror("Allocation failure X_black_host"); abort(); }

    // log likelihood
    samp->log_pdf_black_host = (cl_float *) malloc(samp->K_over_two * sizeof(cl_float));
    if(!(samp->log_pdf_black_host)){ perror("Allocation failure X_red_host"); abort(); }
    for(int i=0; i< (samp->K_over_two); i++) samp->log_pdf_black_host[i] = (-1.0f) / 0.0f;

    // samples on host
    cl_int samples_length = samp->num_to_save * samp->M * samp->K;                // length of the samples array
    samp->samples_host = (cl_float *) malloc(samples_length * sizeof(cl_float));         // samples to return
    if(!(samp->samples_host)){ perror("Allocation failure samples_host"); abort(); }


    // intialize the walkers to random values
    // set the seed value
    srand48(0);

    // initialize the walkers to small random values
    for(int j=0; j < samp->N * samp->K_over_two; j++){
        if(NONNEGATIVE_BOX){
            samp->X_black_host[j] = (cl_float) drand48();
            samp->X_red_host[j]   = (cl_float) drand48();
        }
        else{
            samp->X_black_host[j] = (cl_float) (0.1 * (drand48()-0.5));
            samp->X_red_host[j]   = (cl_float) (0.1 * (drand48()-0.5));
        }

    }


    // set up observations
    samp->data_length = data_length;

    // there are lots of complications that appear if this is empty
    // make it length one instead
    if(samp->data_length == 0){
        samp->data_length = 1;
        samp->data_host = (cl_float *) malloc(samp->data_length * sizeof(cl_float)) ;
        if(!(samp->data_host)){ perror("Allocation failure data_host"); abort(); }
        samp->data_host[0] = 0.0f;
    }
    else{
        // standard case
        samp->data_host = data;
    }


    // --------------------------------------------------------------------------
    // load kernels
    // --------------------------------------------------------------------------

    // stretch move kernel
    char *knl_text = read_file("stretch_move.cl");
    char options[300];
    sprintf(options, "-D NN=%d -D K_OVER_TWO=%d -D WORK_GROUP_SIZE=%d -D DATA_LEN=%d -D PDF_NUMBER=%d -D A_COEFF_0=%.10ff -D A_COEFF_1=%.10ff -D A_COEFF_2=%.10ff  -I . ",
            samp->N, samp->K_over_two, (int) work_group_size, samp->data_length, pdf_number, a_coeffs[0], a_coeffs[1], a_coeffs[2]);

    if(OUTPUT_LEVEL > 0) printf("Options string for stretch move kernel:%s\n", options);

    samp->stretch_knl = kernel_from_string(samp->ctx, knl_text, "stretch_move", options);
    free(knl_text);

    if(OUTPUT_LEVEL > 0) printf("Stretch Move kernel compiled.\n");

    // random number generator initialization
    char * knl_text_rand = read_file("Kernel_Ranluxcl_Init.cl");
    char options_rand_lux[100];

    if(AMD)
        sprintf(options_rand_lux, "-DRANLUXCL_LUX=4 -I .");
    else
        sprintf(options_rand_lux, "-DRANLUXCL_LUX=4");

    samp->init_rand_lux_knl = kernel_from_string(samp->ctx, knl_text_rand, "Kernel_Ranluxcl_Init", options_rand_lux);
    free(knl_text_rand);

    if(OUTPUT_LEVEL > 0) printf("Ranluxcl init kernel compiled.\n");



    // --------------------------------------------------------------------------
    // allocate device memory
    // --------------------------------------------------------------------------
    cl_int status;

    samp->X_red_device = clCreateBuffer(samp->ctx, CL_MEM_READ_WRITE,
      sizeof(cl_float) * samp->N * samp->K_over_two, 0, &status);
    CHECK_CL_ERROR(status, "clCreateBuffer");

    samp->log_pdf_red_device = clCreateBuffer(samp->ctx, CL_MEM_READ_WRITE,
      sizeof(cl_float) * samp->K_over_two, 0, &status);
    CHECK_CL_ERROR(status, "clCreateBuffer");

    samp->X_red_save = clCreateBuffer(samp->ctx, CL_MEM_WRITE_ONLY,
      sizeof(cl_float) * samp->num_to_save * samp->K_over_two, 0, &status);
    CHECK_CL_ERROR(status, "clCreateBuffer");

    samp->X_black_device = clCreateBuffer(samp->ctx, CL_MEM_READ_WRITE,
      sizeof(cl_float) * samp->N * samp->K_over_two, 0, &status);
    CHECK_CL_ERROR(status, "clCreateBuffer");

    samp->log_pdf_black_device = clCreateBuffer(samp->ctx, CL_MEM_READ_WRITE,
      sizeof(cl_float) * samp->K_over_two, 0, &status);
    CHECK_CL_ERROR(status, "clCreateBuffer");

    samp->X_black_save = clCreateBuffer(samp->ctx, CL_MEM_WRITE_ONLY,
      sizeof(cl_float) * samp->num_to_save * samp->K_over_two, 0, &status);
    CHECK_CL_ERROR(status, "clCreateBuffer");

    samp->accepted_device = clCreateBuffer(samp->ctx, CL_MEM_READ_WRITE,
            samp->K_over_two * sizeof(cl_ulong), 0, &status);
    CHECK_CL_ERROR(status, "clCreateBuffer");

    samp->indices_to_save_device = clCreateBuffer(samp->ctx, CL_MEM_READ_ONLY,
            samp->num_to_save * sizeof(cl_int), 0, &status);
    CHECK_CL_ERROR(status, "clCreateBuffer");


    // allocate for the observations
    samp->data_device = clCreateBuffer(samp->ctx, CL_MEM_READ_WRITE,
      sizeof(cl_float) * samp->data_length, 0, &status);
    CHECK_CL_ERROR(status, "clCreateBuffer");

    // data struct on device
    samp->data_st_device = clCreateBuffer(samp->ctx, CL_MEM_READ_WRITE,
            sizeof(data_struct), 0, &status);
    CHECK_CL_ERROR(status, "clCreateBuffer");


    // allocate for the state array for randluxcl
    // use a 1d work group
    size_t rand_lux_state_buffer_size = samp->gdim[0] * 7 * sizeof(cl_float4);
    samp->ranluxcltab = clCreateBuffer(samp->ctx, CL_MEM_READ_WRITE,
        rand_lux_state_buffer_size, 0, &status);
    CHECK_CL_ERROR(status, "clCreateBuffer");


    // --------------------------------------------------------------------------
    // transfer to device
    // --------------------------------------------------------------------------

    CALL_CL_GUARDED(clEnqueueWriteBuffer, (
        samp->queue, samp->X_red_device, /*blocking*/ CL_TRUE, /*offset*/ 0,
        samp->N * samp->K_over_two * sizeof(cl_float), samp->X_red_host,
        0, NULL, NULL));

    CALL_CL_GUARDED(clEnqueueWriteBuffer, (
        samp->queue, samp->log_pdf_red_device, /*blocking*/ CL_TRUE, /*offset*/ 0,
        samp->K_over_two * sizeof(cl_float), samp->log_pdf_red_host,
        0, NULL, NULL));

    CALL_CL_GUARDED(clEnqueueWriteBuffer, (
        samp->queue, samp->X_black_device, /*blocking*/ CL_TRUE, /*offset*/ 0,
        samp->N * samp->K_over_two * sizeof(cl_float), samp->X_black_host,
        0, NULL, NULL));

    CALL_CL_GUARDED(clEnqueueWriteBuffer, (
        samp->queue, samp->log_pdf_black_device, /*blocking*/ CL_TRUE, /*offset*/ 0,
        samp->K_over_two * sizeof(cl_float), samp->log_pdf_black_host,
        0, NULL, NULL));

    CALL_CL_GUARDED(clEnqueueWriteBuffer, (
        samp->queue, samp->data_device, /*blocking*/ CL_TRUE, /*offset*/ 0,
        samp->data_length * sizeof(cl_float), samp->data_host,
        0, NULL, NULL));

    CALL_CL_GUARDED(clEnqueueWriteBuffer, (
        samp->queue, samp->data_st_device, /*blocking*/ CL_TRUE, /*offset*/ 0,
        sizeof(data_struct), samp->data_st,
        0, NULL, NULL));

    CALL_CL_GUARDED(clEnqueueWriteBuffer, (
        samp->queue, samp->indices_to_save_device, /*blocking*/ CL_TRUE, /*offset*/ 0,
        samp->num_to_save * sizeof(cl_int), samp->indices_to_save_host,
        0, NULL, NULL));

    CALL_CL_GUARDED(clFinish, (samp->queue));


    // --------------------------------------------------------------------------
    // Initialize random number generator
    // --------------------------------------------------------------------------

    // int for state variable initialization
    cl_int ins = 1;
    SET_2_KERNEL_ARGS(samp->init_rand_lux_knl, ins, samp->ranluxcltab);

    CALL_CL_GUARDED(clEnqueueNDRangeKernel,
          (samp->queue, samp->init_rand_lux_knl,
           /*dimensions*/ 1, NULL, samp->gdim, samp->ldim,
           0, NULL, NULL));

    CALL_CL_GUARDED(clFinish, (samp->queue));

    // --------------------------------------------------------------------------
    // Sampler initialization is done
    // --------------------------------------------------------------------------
    if(OUTPUT_LEVEL > 0) printf("Sampler initialized.\n");
    return samp;
}
void fft2D_new(cl_mem a, cl_mem c, cl_mem b,cl_mem d, int N, cl_kernel init,cl_kernel interm,
		cl_kernel fft1D,cl_kernel mat_trans, cl_command_queue queue,int direction)
{
#if 0	
		int Ns = 1;
		int y =0;
		int x =N*N;
		SET_7_KERNEL_ARGS(init, a, b, N, Ns,direction,y,y);


		size_t ldim[] = { 1 };
		size_t gdim[] = { N*N/4 };

		CALL_CL_GUARDED(clEnqueueNDRangeKernel,
			(queue, init,
			 1, NULL, gdim, ldim,
			0, NULL, NULL));
#endif

#if 1
	int Ns = 1;
	int stride = 64;
	for(int blk=0; blk<stride;blk++)
		for(int j= 0;j<N/stride;j++)
		{
			int offset = blk*N/stride +j;
		
		int y =0;
		SET_7_KERNEL_ARGS(init, a, b, N, Ns,direction,offset,y);


		size_t ldim[] = { 1 };
		size_t gdim[] = { N/4 };

		CALL_CL_GUARDED(clEnqueueNDRangeKernel,
			(queue, init,
			 1, NULL, gdim, ldim,
			0, NULL, NULL));
	}
#if 1
	for(int blk=0; blk<stride;blk++)
		for(int j= 0;j<N/stride;j++)
		{
			int offset = blk*N/stride +j;
		if(N >= 4)
		{
		Ns = 4;

		SET_6_KERNEL_ARGS(interm, b, c, N, Ns,direction,offset);
		size_t ldim[] = { 16 };
		size_t gdim[] = { N/4 };
		CALL_CL_GUARDED(clEnqueueNDRangeKernel,
		(queue, interm,
		 1, NULL, gdim, ldim,
		0, NULL, NULL));

		}
}
		
	
		clEnqueueCopyBuffer(queue,c,b,
			0,
			0,
			sizeof(float)*N*N*2,0,NULL,NULL);


	for(int blk=0; blk<stride;blk++)
		for(int j= 0;j<N/stride;j++)
		{
			int offset = blk*N/stride +j;
		if(N>=16)
		{
		Ns = 16;

		SET_6_KERNEL_ARGS(interm, b, c, N, Ns,direction,offset);
		size_t ldim[] = { 16 };
		size_t gdim[] = { N/4 };

		CALL_CL_GUARDED(clEnqueueNDRangeKernel,
		(queue, interm,
		1, NULL, gdim, ldim,
		0, NULL, NULL));

		}

}

		clEnqueueCopyBuffer(queue,c,b,
		0,
		0,
		sizeof(float)*N*N*2,0,NULL,NULL);

	if(N >=64) 

		
#endif
		for(Ns=64; Ns<N; Ns<<=2)
		{

	for(int blk=0; blk<stride;blk++)
		for(int j= 0;j<N/stride;j++)
		{
			int offset = blk*N/stride +j;

		SET_6_KERNEL_ARGS(fft1D, b, c, N, Ns,direction,offset);
		size_t ldim[] = { 1 };
		size_t gdim[] = { N/4 };

		CALL_CL_GUARDED(clEnqueueNDRangeKernel,
				(queue, fft1D,
				 1, NULL, gdim, ldim,
				0, NULL, NULL));

		//VecCopy(c,b,N,offset_line,vec_copy,queue);




		}

		clEnqueueCopyBuffer(queue,c,b,
				0,
				0,
				sizeof(float)*N*N*2,0,NULL,NULL);
		}

#endif
	//CALL_CL_GUARDED(clFinish, (queue));
	//printf("1D fine \n");

	mat__trans(b,c,N,mat_trans,queue,0,1,1,1);

	#if 0
	float test;
	CALL_CL_GUARDED(clFinish, (queue));
	CALL_CL_GUARDED(clEnqueueReadBuffer, (
        	queue, c, /*blocking*/ CL_TRUE, /*offset*/ 2*sizeof(float)*N,
       		sizeof(float), &test,
        	0, NULL, NULL));
	

	printf("test = %f\n",test);
	#endif



	//CALL_CL_GUARDED(clFinish, (queue));

#if 0
	for(int j= 0;j<N;j++)
	{
		fft_1D_new(c,b,d,N,init,interm,fft1D,queue,direction,j);
	}
#endif
#if 1
	Ns = 1;	
	

	for(int blk=0; blk<stride;blk++)
		for(int j= 0;j<N/stride;j++)
		{
			int offset = blk*N/stride +j;
		
		int y =0;
		SET_7_KERNEL_ARGS(init, c, b, N, Ns,direction,offset,y);


		size_t ldim[] = { 1 };
		size_t gdim[] = { N/4 };

		CALL_CL_GUARDED(clEnqueueNDRangeKernel,
			(queue, init,
			 1, NULL, gdim, ldim,
			0, NULL, NULL));
	}
#if 1
	for(int blk=0; blk<stride;blk++)
		for(int j= 0;j<N/stride;j++)
		{
			int offset = blk*N/stride +j;
		if(N >= 4)
		{
		Ns = 4;

		SET_6_KERNEL_ARGS(interm, b, d, N, Ns,direction,offset);
		size_t ldim[] = { 16 };
		size_t gdim[] = { N/4 };
		CALL_CL_GUARDED(clEnqueueNDRangeKernel,
		(queue, interm,
		 1, NULL, gdim, ldim,
		0, NULL, NULL));

		}
}
		
	
		clEnqueueCopyBuffer(queue,d,b,
			0,
			0,
			sizeof(float)*N*N*2,0,NULL,NULL);


	for(int blk=0; blk<stride;blk++)
		for(int j= 0;j<N/stride;j++)
		{
			int offset = blk*N/stride +j;
		if(N>=16)
		{
		Ns = 16;

		SET_6_KERNEL_ARGS(interm, b, d, N, Ns,direction,offset);
		size_t ldim[] = { 16 };
		size_t gdim[] = { N/4 };

		CALL_CL_GUARDED(clEnqueueNDRangeKernel,
		(queue, interm,
		1, NULL, gdim, ldim,
		0, NULL, NULL));

		}

}

		clEnqueueCopyBuffer(queue,d,b,
		0,
		0,
		sizeof(float)*N*N*2,0,NULL,NULL);

	if(N >=64) 

#endif		

		for(Ns=64; Ns<N; Ns<<=2)
		{

	for(int blk=0; blk<stride;blk++)
		for(int j= 0;j<N/stride;j++)
		{
			int offset = blk*N/stride +j;

		SET_6_KERNEL_ARGS(fft1D, b, d, N, Ns,direction,offset);
		size_t ldim[] = { 1 };
		size_t gdim[] = { N/4 };

		CALL_CL_GUARDED(clEnqueueNDRangeKernel,
				(queue, fft1D,
				 1, NULL, gdim, ldim,
				0, NULL, NULL));

		//VecCopy(c,b,N,offset_line,vec_copy,queue);




		}

		clEnqueueCopyBuffer(queue,d,b,
				0,
				0,
				sizeof(float)*N*N*2,0,NULL,NULL);
		}
#endif

	//CALL_CL_GUARDED(clFinish, (queue));
	if(direction == 1)
		mat__trans(b,c,N,mat_trans,queue,0,1,1,1);
	else 
		mat__trans(b,c,N,mat_trans,queue,-1,1,1,1);
	
}
Exemplo n.º 16
0
int
main (int argv, char **argc)
{
  /////////////////////////
  ////// SAME IN EVERY FILE
  /////////////////////////

  // create context and command queue
  cl_context       __sheets_context;
  cl_command_queue __sheets_queue;
  int              _i;
  cl_int           __cl_err;
  
  create_context_on(SHEETS_PLAT_NAME,
		    SHEETS_DEV_NAME,
		    0,		/* choose the first (only) available device */
		    &__sheets_context,
		    &__sheets_queue,
		    0);

  // compile kernels
  for (_i = 0; _i < NKERNELS; _i++) {
    compiled_kernels[_i] = kernel_from_string(__sheets_context,
					      kernel_strings[_i],
					      kernel_names[_i],
					      SHEETS_KERNEL_COMPILE_OPTS);
  }

  ////// [END]

  size_t __SIZE_wav = atoi(argc[1]);

  float wav[__SIZE_wav];
  const char *file_name = "mytune.wav";
  int in_thrsh_cnt = 0;

  timestamp_type st;
  timestamp_type end;

  get_timestamp(&st);		
  for (_i = 0; _i < __SIZE_wav; _i++) {
    wav[_i] = (float) rand() / RAND_MAX;
    if (in_thrsh(wav[_i], 0.1112, 0.7888))
      in_thrsh_cnt++;
  }
  get_timestamp(&end);

  printf("cpu execution took %f seconds\n", timestamp_diff_in_seconds(st, end));

  get_timestamp(&st);

  /////////////////
  ////// GFUNC CALL
  /////////////////

  /// create variables for function arguments given as literals
  float __PRIM_band_restrict_ARG2 = 0.1112f;
  float __PRIM_band_restrict_ARG3 = 0.7888f;

  /// return array (always arg0)
  cl_mem __CLMEM_band_restrict_ARG0 = clCreateBuffer(__sheets_context, 
						     CL_MEM_WRITE_ONLY, 
						     sizeof(float) * __SIZE_wav, 
						     NULL, 
						     &__cl_err);
  CHECK_CL_ERROR(__cl_err, "clCreateBuffer");
					      
  /// input arrays
  cl_mem __CLMEM_band_restrict_ARG1 = clCreateBuffer(__sheets_context, 
						     CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 
						     sizeof(float) * __SIZE_wav, 
						     (void *) wav, 
						     &__cl_err);
  CHECK_CL_ERROR(__cl_err, "clCreateBuffer");

  /// write to device memory
  CALL_CL_GUARDED(clEnqueueWriteBuffer,
		 (__sheets_queue,
		  __CLMEM_band_restrict_ARG1,
		  CL_TRUE,	/* blocking write */
		  0, 		/* no offset */
		  sizeof(float) * __SIZE_wav,
		  wav,
		  0,		/* no wait list */
		  NULL,
		  NULL)
		  );
  
  /// set up kernel arguments
  SET_4_KERNEL_ARGS(compiled_kernels[0],
		    __CLMEM_band_restrict_ARG0,
		    __CLMEM_band_restrict_ARG1,
		    __PRIM_band_restrict_ARG2,
		    __PRIM_band_restrict_ARG3);

  /// enqueue kernel
  cl_event __CLEVENT_band_restrict_CALL;
  CALL_CL_GUARDED(clEnqueueNDRangeKernel,
		  (__sheets_queue,
		   compiled_kernels[0],
		   1,		/* 1 dimension */
		   0,		/* 0 offset */
		   &__SIZE_wav,
		   NULL,	/* let OpenCL break things up */
		   0,		/* no events in wait list */
		   NULL,	/* empty wait list */
		   &__CLEVENT_band_restrict_CALL)
		  );

  /// allocate space for cpu return array
  float out[__SIZE_wav];
  
  CALL_CL_GUARDED(clEnqueueReadBuffer,
		  (__sheets_queue,
		   __CLMEM_band_restrict_ARG0,
		   CL_TRUE,	 /* blocking read */
		   0,		 /* 0 offset */
		   sizeof(float) * __SIZE_wav, 	 /* read whole buffer */
		   (void *) out, /* host pointer */
		   1,		 /* wait for gfunc to finish */
		   &__CLEVENT_band_restrict_CALL, /* "" */
		   NULL)			  /* no need to wait for this call though */
		  );
  
  ////// [END] GFUNC CALL

  get_timestamp(&end);

  printf("gfunc call took %f seconds\n", timestamp_diff_in_seconds(st, end));

  ////// Validate call
  int c = 0;

  for (_i = 0; _i < __SIZE_wav; _i++) {
    if (in_thrsh(out[_i], 0.1112, 0.7888)) {
      c++;
    } else if(out[_i]) {
      exit(1);
    }
  }

  printf("\n");
	 
  assert(in_thrsh_cnt == c);

  //////////////
  ////// CLEANUP
  //////////////

  CALL_CL_GUARDED(clReleaseMemObject, (__CLMEM_band_restrict_ARG0));
  CALL_CL_GUARDED(clReleaseMemObject, (__CLMEM_band_restrict_ARG1));
  for (_i = 0; _i < NKERNELS; _i++) {
    CALL_CL_GUARDED(clReleaseKernel, (compiled_kernels[_i]));
  }
  CALL_CL_GUARDED(clReleaseCommandQueue, (__sheets_queue));
  CALL_CL_GUARDED(clReleaseContext, (__sheets_context));

  return 0;
}
Exemplo n.º 17
0
void create_context_on(const char *plat_name, const char*dev_name, cl_uint idx,
    cl_context *ctx, cl_command_queue *queue, int enable_profiling)
{
  cl_uint plat_count;

  CALL_CL_GUARDED(clGetPlatformIDs, (0, NULL, &plat_count));

  cl_platform_id *platforms = 
    (cl_platform_id *) malloc(plat_count*sizeof(cl_platform_id));
  CHECK_SYS_ERROR(!platforms, "allocating platform array");

  CALL_CL_GUARDED(clGetPlatformIDs, (plat_count, platforms, NULL));
  for (cl_uint i = 0; i < plat_count; ++i)
  {
    char buf[100];
    CALL_CL_GUARDED(clGetPlatformInfo, (platforms[i], CL_PLATFORM_VENDOR, 
          sizeof(buf), buf, NULL));

    if (!plat_name || strstr(buf, plat_name))
    {
      cl_uint dev_count;
      CALL_CL_GUARDED(clGetDeviceIDs, (platforms[i], CL_DEVICE_TYPE_ALL,
            0, NULL, &dev_count));

      cl_device_id *devices = 
        (cl_device_id *) malloc(dev_count*sizeof(cl_device_id));
      CHECK_SYS_ERROR(!devices, "allocating device array");

      CALL_CL_GUARDED(clGetDeviceIDs, (platforms[i], CL_DEVICE_TYPE_ALL,
            dev_count, devices, NULL));

      for (cl_uint j = 0; j < dev_count; ++j)
      {
        char buf[100];
        CALL_CL_GUARDED(clGetDeviceInfo, (devices[j], CL_DEVICE_NAME,
              sizeof(buf), buf, NULL));
        if (!dev_name || strstr(buf, dev_name))
        {
          if (idx == 0)
          {
            cl_platform_id plat = platforms[i];
            cl_device_id dev = devices[j];

            free(devices);
            free(platforms);

            cl_context_properties cps[3] = { 
              CL_CONTEXT_PLATFORM, (cl_context_properties) plat, 0 };

            cl_int status;
            *ctx = clCreateContext(
                cps, 1, &dev, NULL, NULL, &status);
            CHECK_CL_ERROR(status, "clCreateContext");


            cl_command_queue_properties qprops = 0;
            if (enable_profiling)
              qprops |= CL_QUEUE_PROFILING_ENABLE;

            *queue = clCreateCommandQueue(*ctx, dev, qprops, &status);
            CHECK_CL_ERROR(status, "clCreateCommandQueue");

            return;
          }
          else
            --idx;
        }
      }

      free(devices);
    }
  }

  free(platforms);

  fputs("create_context_on: specified device not found.\n", stderr);
  abort();
}
void fft_1D_big(cl_mem a,cl_mem b,cl_mem c, int N, cl_kernel init_big, cl_kernel clean,cl_kernel mat_trans,cl_command_queue queue,int direction,int offset_line)
{
	//handle complex-to-complex fft, accutal size = 2 * N

	//size_t ldim[] = { 128 };
	//size_t gdim[] = { (N /ldim[0])/2};
	int Ns = 1;
	int y =0;
	SET_7_KERNEL_ARGS(init_big, a, b, N, Ns,direction,offset_line,y);


	size_t ldim[] = { 16 };
	size_t gdim[] = { N/4 };

	CALL_CL_GUARDED(clEnqueueNDRangeKernel,
			(queue, init_big,
			 1, NULL, gdim, ldim,
			0, NULL, NULL));
	if (N ==64 )
		return;
	else
	if( N == 256 || N == 1024)
	{
		cl_long offset = offset_line * N;
		SET_7_KERNEL_ARGS(clean, b, c, N, Ns,direction,offset_line,y);
		ldim[0] =4;

		CALL_CL_GUARDED(clEnqueueNDRangeKernel,
			(queue, clean,
			 1, NULL, gdim, ldim,
			0, NULL, NULL));
		if(N == 1024)
		{
			int option =0;
			float k =0;
			int n = 16;			
			SET_8_KERNEL_ARGS(mat_trans, c, b, n, option,k,k,k,offset);

			size_t ldim[] = { 16, 16 };
			size_t gdim[] = { 16, 64 };
			CALL_CL_GUARDED(clEnqueueNDRangeKernel,
				(queue, mat_trans,
				2, NULL, gdim, ldim,
				0, NULL, NULL));

		}
		else if(N ==256)
		{
			int option =0;
			float k =0;
			int n = 4;			
			SET_8_KERNEL_ARGS(mat_trans, c, b, n, option,k,k,k,offset);

			size_t ldim[] = { 4, 4 };
			size_t gdim[] = { 4, 64 };
			CALL_CL_GUARDED(clEnqueueNDRangeKernel,
				(queue, mat_trans,
				2, NULL, gdim, ldim,
				0, NULL, NULL));

		}	
		
	}

	else
	{
		printf("FFT not implemented for this size!!!\n");

		return;
	}	
}
Exemplo n.º 19
0
void mgv(ftype f[], ftype u[], ftype dx, unsigned n1,unsigned n2,unsigned n3, size_t field_size, unsigned points, int use_alignment, unsigned dim_x, cl_context ctx, cl_command_queue queue, cl_kernel poisson_knl, int wg_dims , int wg_x, int wg_y, int wg_z, int z_div, int fetch_per_pt, int flops_per_pt){
  // mgv does one v-cycle for the Poisson problem on a grid with mesh size dx
  // Inputs: f is right hand side, u is current approx dx is mesh size, n1 number of sweeps
  // on downward branch, n2 number of sweeps on upwardbranch, n3 number of sweeps on
  // coarsest grid.
  // Output:  It just returns an updated version of u
  cl_ulong start_big;
  #ifdef DO_TIMING
  cl_event evt;
  cl_event *evt_ptr = &evt;
  #else
  cl_event *evt_ptr = NULL;
  #endif
  size_t i, isweep;
  item * ugrid, * head, * curr;
  int l = 0;
  ftype dxval[POINTS/2] = {0};  // this is huge and unnecessary.  Try to cut downif time!!  
  ftype h;
  unsigned nx[POINTS/2] = {0};
  // --- Allocate common gpu memory----
  cl_int status;
  cl_mem dev_buf_u = clCreateBuffer(ctx, CL_MEM_READ_WRITE, field_size * sizeof(ftype), 0, &status);
  CHECK_CL_ERROR(status, "clCreateBuffer");
  cl_mem dev_buf_f = clCreateBuffer(ctx, CL_MEM_READ_ONLY, field_size * sizeof(ftype), 0, &status);
  CHECK_CL_ERROR(status, "clCreateBuffer");
  cl_mem dev_buf_hist_u = clCreateBuffer(ctx, CL_MEM_READ_ONLY, field_size * sizeof(ftype), 0, &status);
  CHECK_CL_ERROR(status, "clCreateBuffer");
  //cl_mem read_buf = clCreateBuffer(ctx, CL_MEM_READ_ONLY, field_size * sizeof(ftype), 0, &status);
  CHECK_CL_ERROR(status, "clCreateBuffer");
  // -----------------------------------
  dxval[0] = dx;
  nx[0] = points;
  //const size_t max_size  = POINTS * POINTS * ((POINTS + 15)/16) * 16;
  // --------------- Allocatig the finest grid --------------------
  ugrid = (item *)malloc(sizeof(item));
  ugrid->uvec = malloc(field_size * sizeof(ftype));
  ugrid->fvec = malloc(field_size * sizeof(ftype));
  ugrid->rvec = malloc(field_size * sizeof(ftype));
  ugrid->dim_other = nx[0];
  ugrid->dim_x = dim_x;
  for(i = 0; i < field_size; i++){
     ugrid->uvec[i] = u[i];
     ugrid->fvec[i] = f[i];
     ugrid->rvec[i] = 0;
  }
  head = ugrid;  // head will always be the first one

  // ---------------- Set up the coarse grids ----------------------
  while((nx[l] - 1) % 2 == 0 && nx[l] > 3){
    l = l+1;
    nx[l] = (nx[l - 1] - 1) / 2 + 1;
    dxval[l] = 2 * dxval[l-1]; 
    curr = (item *)malloc(sizeof(item));
    curr->uvec = malloc(field_size * sizeof(ftype));
    curr->fvec = malloc(field_size * sizeof(ftype));
    curr->rvec = malloc(field_size * sizeof(ftype));

    curr->dim_other = nx[l];
    curr->field_start = 0;
    curr->dim_x = curr->dim_other;
    if(use_alignment)
    	curr->dim_x = ((nx[l] + 15)/16) * 16;

    // initialize vectors
    for(i = 0; i < field_size; i++){
 	curr->uvec[i] = 0;
	curr->fvec[i] = 0;
     	curr->rvec[i] = 0;
    }
    ugrid->next = curr; // curr gets attached to ugrid
    curr->prev = ugrid;
    ugrid = curr;
  }
  int nl = l; // this is the maximum number of grids that were created
  // --- at this point head contains the finest grid and ugrid contains the coarsest -----
  curr = head;
  head->prev = NULL;
  ugrid->next = NULL;

  // ---------------- Now relax each of the different grids descending--------
  for(l = 0; l < nl; l++){  // I stop right before nl (will be treated different)
     // ----------------------------------------------------------------------
     // -------------------- GPU DESCENDING V-CYCLE --------------------------
     // ----------------------------------------------------------------------
     {
     if(curr->dim_other < CUTOFF){
	for(isweep = 0; isweep < n1; isweep++){
	     gsrelax(curr, dxval[l]);
	}
     }

     else{
  	// ---GPU------GPU------GPU------GPU------GPU------GPU------GPU------GPU--- //
  	// fill in the buffers inside the GPU with the current data
  	CALL_CL_GUARDED(clEnqueueWriteBuffer, (queue, dev_buf_u, CL_TRUE, 0, field_size * sizeof(ftype), curr->uvec, 0, NULL, NULL));
  	CALL_CL_GUARDED(clEnqueueWriteBuffer, (queue, dev_buf_f, CL_TRUE, 0, field_size * sizeof(ftype), curr->fvec, 0, NULL, NULL));
	CALL_CL_GUARDED(clEnqueueWriteBuffer, (queue, dev_buf_hist_u, CL_TRUE, 0, field_size * sizeof(ftype), curr->uvec, 0, NULL, NULL));
  	h = dxval[l] * dxval[l];
        size_t gdim[] = { curr->dim_x-16, curr->dim_x-16, curr->dim_x/z_div };
        size_t ldim[] = { wg_x, wg_y, wg_z };

  	for(i = 0; i < n1; i++){
     	   // ----------------------------------------------------------------------
     	   // invoke poisson kernel
     	   // ----------------------------------------------------------------------
	   //size_t u_size;
	   //CALL_CL_GUARDED(clGetMemObjectInfo, (dev_buf_u, CL_MEM_SIZE, sizeof(u_size), &u_size, 0));
	   //int u_size_i = u_size;
	   //printf("u_size=%d fstart=%d dim_x=%d dim_other=%d\n" , u_size_i, curr->field_start, curr->dim_x, curr->dim_other);
	   curr->field_start = 0;
     	   SET_7_KERNEL_ARGS(poisson_knl, dev_buf_u, dev_buf_f, dev_buf_hist_u, curr->field_start, curr->dim_x, curr->dim_other, h);
     	   // run the kernel
     	   CALL_CL_GUARDED(clEnqueueNDRangeKernel, (queue, poisson_knl, /*dimensions*/ wg_dims, NULL, gdim, ldim, 0, NULL, evt_ptr));
     	   #ifdef DO_TIMING
      	   // If timing is enabled, this wait can mean a significant performance hit.
      	   CALL_CL_GUARDED(clWaitForEvents, (1, &evt));
 
      	   cl_ulong start, end;
      	   CALL_CL_GUARDED(clGetEventProfilingInfo, (evt, CL_PROFILING_COMMAND_START, sizeof(start), &start, NULL));
      	   CALL_CL_GUARDED(clGetEventProfilingInfo, (evt, CL_PROFILING_COMMAND_END, sizeof(start), &end, NULL));
 
      	   gbytes_accessed += 1e-9*(sizeof(ftype) * field_size * fetch_per_pt);
	   start_big = start;
      	   seconds_taken += 1e-9*(end-start);
      	   mcells_updated += curr->dim_other*curr->dim_other*curr->dim_other/1e6;
      	   gflops_performed += 1e-9*curr->dim_x*curr->dim_x*curr->dim_x * flops_per_pt;

      	   CALL_CL_GUARDED(clReleaseEvent, (evt));
     	   #endif
      	   CALL_CL_GUARDED(clFinish, (queue)); //ira adentro??
	   cl_mem tmp = dev_buf_u;
	   dev_buf_u = dev_buf_hist_u;
	   dev_buf_hist_u = tmp;
     	}
        //when I'm done, read from buffer
        CALL_CL_GUARDED(clEnqueueReadBuffer, (queue, dev_buf_u, /*blocking*/ CL_TRUE, /*offset*/ 0, field_size * sizeof(ftype), curr->uvec, 0, NULL, NULL));
      }
     }
     resid2(curr, dxval[l]);
     injf2c(curr, curr->next); //this function updates f_{i+1}
     curr = curr->next;
  }
  // ----------------------------------------------------------------------
  // --------------- GPU ON THE COARSEST GRID -----------------------------
  // ----------------------------------------------------------------------
  {
    if(curr->dim_other < CUTOFF){
	for(i = 0; i < n3; i++){
	     gsrelax(curr, dxval[nl]);
	}
    }

    else{
  	// ---GPU------GPU------GPU------GPU------GPU------GPU------GPU------GPU--- //
  	// fill in the buffers inside the GPU with the current data
  	CALL_CL_GUARDED(clEnqueueWriteBuffer, (queue, dev_buf_u, CL_TRUE, 0, field_size * sizeof(ftype), curr->uvec, 0, NULL, NULL));
  	CALL_CL_GUARDED(clEnqueueWriteBuffer, (queue, dev_buf_f, CL_TRUE, 0, field_size * sizeof(ftype), curr->fvec, 0, NULL, NULL));
  	CALL_CL_GUARDED(clEnqueueWriteBuffer, (queue, dev_buf_hist_u, CL_TRUE, 0, field_size * sizeof(ftype), curr->uvec, 0, NULL, NULL));
  	h = dxval[nl] * dxval[nl];
        size_t gdim[] = { curr->dim_x - 16, curr->dim_x - 16, curr->dim_x/z_div };
        size_t ldim[] = { wg_x, wg_y, wg_z };

  	for(i = 0; i < n3; i++){
     	   // ----------------------------------------------------------------------
     	   // invoke poisson kernel
     	   // ----------------------------------------------------------------------
	   curr->field_start = 0;
     	   SET_7_KERNEL_ARGS(poisson_knl, dev_buf_u, dev_buf_f, dev_buf_hist_u,curr->field_start, curr->dim_x, curr->dim_other, h);
     	   // run the kernel
	   curr->field_start = 0;
     	   CALL_CL_GUARDED(clEnqueueNDRangeKernel, (queue, poisson_knl, /*dimensions*/ wg_dims, NULL, gdim, ldim, 0, NULL, evt_ptr));
     	   #ifdef DO_TIMING
      	   // If timing is enabled, this wait can mean a significant performance hit.
      	   CALL_CL_GUARDED(clWaitForEvents, (1, &evt));
 
      	   cl_ulong start, end;
      	   CALL_CL_GUARDED(clGetEventProfilingInfo, (evt, CL_PROFILING_COMMAND_START, sizeof(start), &start, NULL));
      	   CALL_CL_GUARDED(clGetEventProfilingInfo, (evt, CL_PROFILING_COMMAND_END, sizeof(start), &end, NULL));
 
      	   gbytes_accessed += 1e-9*(sizeof(ftype) * field_size * fetch_per_pt);
      	   seconds_taken += 1e-9*(end-start);
      	   mcells_updated += curr->dim_other*curr->dim_other*curr->dim_other/1e6;
      	   gflops_performed += 1e-9*curr->dim_x*curr->dim_x*curr->dim_x * flops_per_pt;

      	   CALL_CL_GUARDED(clReleaseEvent, (evt));
     	   #endif
      	   CALL_CL_GUARDED(clFinish, (queue)); //ira adentro??
	   cl_mem tmp = dev_buf_u;
	   dev_buf_u = dev_buf_hist_u;
	   dev_buf_hist_u = tmp;
     	}
        //when I'm done, read from buffer
        CALL_CL_GUARDED(clEnqueueReadBuffer, (queue, dev_buf_u, /*blocking*/ CL_TRUE, /*offset*/ 0, field_size * sizeof(ftype), curr->uvec, 0, NULL, NULL));
     }
  }
  // ----------------------------------------------------------------------
  // -----------Upward branch of the V-cycle ------------------------------
  // ----------------------------------------------------------------------
  for(l = nl-1; l >= 0; --l){
     ctof(curr->prev, curr, field_size); //curr->prev is the finer of the two
     free(curr->uvec);  //curr won't be needed anymore
     free(curr->fvec);
     free(curr->rvec);
     curr = curr->prev;
     curr->next = NULL;
     for(isweep = 0; isweep < n2; isweep++){
	   gsrelax(curr, dxval[l]);
     }
     // Update the grids n1 times using the GPU when necessary
     {
     if(curr->dim_other < CUTOFF){
	for(isweep = 0; isweep < n2; isweep++){
	     gsrelax(curr, dxval[l]);
	}
     }

     else{
  	// ---GPU------GPU------GPU------GPU------GPU------GPU------GPU------GPU--- //
  	// fill in the buffers inside the GPU with the current data
  	CALL_CL_GUARDED(clEnqueueWriteBuffer, (queue, dev_buf_u, CL_TRUE, 0, field_size * sizeof(ftype), curr->uvec, 0, NULL, NULL));
  	CALL_CL_GUARDED(clEnqueueWriteBuffer, (queue, dev_buf_f, CL_TRUE, 0, field_size * sizeof(ftype), curr->fvec, 0, NULL, NULL));
  	CALL_CL_GUARDED(clEnqueueWriteBuffer, (queue, dev_buf_hist_u, CL_TRUE, 0, field_size * sizeof(ftype), curr->uvec, 0, NULL, NULL));
  	h = dxval[l] * dxval[l];
        size_t gdim[] = { curr->dim_x-16, curr->dim_x-16, curr->dim_x/z_div };
        size_t ldim[] = { wg_x, wg_y, wg_z };

  	for(i = 0; i < n1; i++){
     	   // ----------------------------------------------------------------------
     	   // invoke poisson kernel
     	   // ----------------------------------------------------------------------
	   curr->field_start = 0;
     	   SET_7_KERNEL_ARGS(poisson_knl, dev_buf_u, dev_buf_f, dev_buf_hist_u, curr->field_start, curr->dim_x, curr->dim_other, h);
     	   // run the kernel
     	   CALL_CL_GUARDED(clEnqueueNDRangeKernel, (queue, poisson_knl, /*dimensions*/ wg_dims, NULL, gdim, ldim, 0, NULL, evt_ptr));
     	   #ifdef DO_TIMING
      	   // If timing is enabled, this wait can mean a significant performance hit.
      	   CALL_CL_GUARDED(clWaitForEvents, (1, &evt));
 
      	   cl_ulong start, end;
      	   CALL_CL_GUARDED(clGetEventProfilingInfo, (evt, CL_PROFILING_COMMAND_START, sizeof(start), &start, NULL));
      	   CALL_CL_GUARDED(clGetEventProfilingInfo, (evt, CL_PROFILING_COMMAND_END, sizeof(start), &end, NULL));
 
      	   gbytes_accessed += 1e-9*(sizeof(ftype) * field_size * fetch_per_pt);
      	   seconds_taken += 1e-9*(end-start);
	   tot_secs += 1e-9*(end-start_big);
      	   mcells_updated += curr->dim_other*curr->dim_other*curr->dim_other/1e6;
      	   gflops_performed += 1e-9*curr->dim_x*curr->dim_x*curr->dim_x * flops_per_pt;

      	   CALL_CL_GUARDED(clReleaseEvent, (evt));
     	   #endif
      	   CALL_CL_GUARDED(clFinish, (queue)); //ira adentro??
	   cl_mem tmp = dev_buf_u;
	   dev_buf_u = dev_buf_hist_u;
	   dev_buf_hist_u = tmp;
     	}
        //when I'm done, read from buffer
        CALL_CL_GUARDED(clEnqueueReadBuffer, (queue, dev_buf_u, /*blocking*/ CL_TRUE, /*offset*/ 0, field_size * sizeof(ftype), curr->uvec, 0, NULL, NULL));
      }
     }
  }
  // ---------- and the solution is right there in the last curr curr->uvec
  for(i = 0; i < field_size; i++)
     u[i] = curr->uvec[i];
  free(curr->uvec);
  //free(curr->fvec);
  free(curr->rvec);
  //free(ugrid->uvec);
  //free(ugrid->fvec);
  //free(ugrid->rvec);
  free(curr);
  CALL_CL_GUARDED(clReleaseMemObject, (dev_buf_u));
  CALL_CL_GUARDED(clReleaseMemObject, (dev_buf_f));
  CALL_CL_GUARDED(clReleaseMemObject, (dev_buf_hist_u));
}
void fft2D_big_new(cl_mem a, cl_mem c, cl_mem b,cl_mem d, int N, cl_kernel init_big,
		cl_kernel clean,cl_kernel mat_trans, cl_kernel mat_trans_3D, cl_command_queue queue,int direction)
{
	


		int offset_line = 0;
			int Ns = 1;
			int y =0;
		SET_7_KERNEL_ARGS(init_big, a, b, N, Ns,direction,offset_line,y);


		size_t ldim[] = { 16 };
		size_t gdim[] = { N*N/4 };

		CALL_CL_GUARDED(clEnqueueNDRangeKernel,
				(queue, init_big,
				 1, NULL, gdim, ldim,
				0, NULL, NULL));
		
	if(N!=64)
		if(N == 1024)
		{
		
			int Ns =1;
			int y =0;			
			//cl_long offset = offset_line * N;
			SET_7_KERNEL_ARGS(clean, b, c, N, Ns,direction,offset_line,y);
			size_t ldim[]={ 4 };
			size_t gdim[] ={ N*N/4 };
			CALL_CL_GUARDED(clEnqueueNDRangeKernel,
				(queue, clean,
				 1, NULL, gdim, ldim,
				0, NULL, NULL));

		

			
			int option =0;
			float k =0;
			int n = 16;			
						
			SET_8_KERNEL_ARGS(mat_trans_3D, c, b, n, option,k,k,k,N);

				size_t ldim2[] = { 16, 16 ,1};
				size_t gdim2[] = { 16, 64 ,N};

			CALL_CL_GUARDED(clEnqueueNDRangeKernel,
				(queue, mat_trans_3D,
				3, NULL, gdim2, ldim2,
				0, NULL, NULL));

		}
		else if(N ==256)
		{

			int Ns =1;
			int y =0;			
			offset_line =0;
			SET_7_KERNEL_ARGS(clean, b, c, N, Ns,direction,offset_line,y);
			size_t ldim[] ={4};
			size_t gdim[] ={N*N/4};

			CALL_CL_GUARDED(clEnqueueNDRangeKernel,
				(queue, clean,
				 1, NULL, gdim, ldim,
				0, NULL, NULL));


							
			int option =0;
			float k =0;
			int n = 4;
						
			SET_8_KERNEL_ARGS(mat_trans_3D, c, b, n, option,k,k,k,N);

				size_t ldim2[] = { 4, 4 ,1};
				size_t gdim2[] = { 4, 64, N };

			CALL_CL_GUARDED(clEnqueueNDRangeKernel,
				(queue, mat_trans_3D,
				3, NULL, gdim2, ldim2,
				0, NULL, NULL));

			
		}
		
		else
		{
			printf("FFT not implemented for this size!!!\n");

			return;
		}	
	
	//CALL_CL_GUARDED(clFinish, (queue));
	//printf("1D fine \n");

	mat__trans(b,c,N,mat_trans,queue,0,1,1,1);

	//CALL_CL_GUARDED(clFinish, (queue));
/*	for(int j= 0;j<N;j++)
	{
		//fft_1D(c,b,d,N,fft_init,fft1D,queue,direction,j);
		fft_1D_big(c, b,d,N, init_big, clean,mat_trans,queue,direction,j);
	}
*/

		Ns =1;
		SET_7_KERNEL_ARGS(init_big, c, b, N, Ns,direction,offset_line,y);



		CALL_CL_GUARDED(clEnqueueNDRangeKernel,
				(queue, init_big,
				 1, NULL, gdim, ldim,
				0, NULL, NULL));

		
if (N !=64 )
		

		if( N == 256 || N == 1024)
		{
			int Ns =1;
			int y = 0;			
			int offset_line = 0;
			SET_7_KERNEL_ARGS(clean, b, d, N, Ns,direction,offset_line,y);
			size_t ldim[] = { 4 };
			size_t gdim[] = { N*N/4 };

			CALL_CL_GUARDED(clEnqueueNDRangeKernel,
				(queue, clean,
				 1, NULL, gdim, ldim,
				0, NULL, NULL));

			if(N == 1024)
			{
			
			int option =0;
			float k =0;
			int n = 16;			
						
			SET_8_KERNEL_ARGS(mat_trans_3D, d, b, n, option,k,k,k,N);

				size_t ldim2[] = { 16, 16 ,1};
				size_t gdim2[] = { 16, 64 ,N};

			CALL_CL_GUARDED(clEnqueueNDRangeKernel,
				(queue, mat_trans_3D,
				3, NULL, gdim2, ldim2,
				0, NULL, NULL));
			
			}
			else if(N ==256)
			{
		
			int option =0;
			float k =0;
			int n = 4;
						
			SET_8_KERNEL_ARGS(mat_trans_3D, d, b, n, option,k,k,k,N);

				size_t ldim2[] = { 4, 4 ,1};
				size_t gdim2[] = { 4, 64, N };

			CALL_CL_GUARDED(clEnqueueNDRangeKernel,
				(queue, mat_trans_3D,
				3, NULL, gdim2, ldim2,
				0, NULL, NULL));

			
			}	
		
		}

		else
		{
			printf("FFT not implemented for this size!!!\n");

			return;
		}	
	

	//CALL_CL_GUARDED(clFinish, (queue));
	if(direction == 1)
		mat__trans(b,c,N,mat_trans,queue,0,1,1,1);
	else 
		mat__trans(b,c,N,mat_trans,queue,-1,1,1,1);
	
}
Exemplo n.º 21
0
void run_sampler(sampler *samp){
    /*
     Run the sampler and save output.
     Overlap sampling and communication with the device using two queues.
     While red walkers are being sampled, black walkers are being sent.
     This means that the first sampling iteration reads black walkers from the burn in,
     and the final iteration is thrown out.

     Runs in the following order.
         - Sample X_red non-blocking
         - Copy X_black to host non-blocking
         - Check both are finished
         - Sample X_black non-blocking
         - Copy X_red to host non-blocking
         - Check both are finished

     Input:
          sampler *samp        Pointer to sampler structure which has been initialized.
                                    Burn-in should also be performed before running this routine.
                                    Run for samp->M total times.

     Output:
                                    Array samp->samples_host is filled with new samples.
     */


    // start the kernel timer
    get_timestamp(& (samp->time1));

    // reset beta and set to save
    (samp->data_st)->beta = 1.0f;
    (samp->data_st)->save = 1;

    // update the data structure accordingly
    CALL_CL_GUARDED(clEnqueueWriteBuffer, (
        samp->queue, samp->data_st_device, /*blocking*/ CL_TRUE, /*offset*/ 0,
        sizeof(data_struct), samp->data_st,
        0, NULL, NULL));
    CALL_CL_GUARDED(clFinish, (samp->queue));


    // run the sampler
    unsigned int buffer_position = 0;

    // since samples are read while update takes place, do not read the first set of samples
    char read_samples = 0;

    // main sampling loop
    for(int it=0; it < samp->M + 1; it++){

        // update X_red
        SET_9_KERNEL_ARGS(samp->stretch_knl,
              samp->X_red_device,
              samp->log_pdf_red_device,
              samp->X_black_device,
              samp->ranluxcltab,
              samp->accepted_device,
              samp->data_device,
              samp->data_st_device,
              samp->indices_to_save_device,
              samp->X_red_save);

        CALL_CL_GUARDED(clEnqueueNDRangeKernel,
              (samp->queue, samp->stretch_knl,
               /*dimensions*/ 1, NULL, samp->gdim, samp->ldim,
               0, NULL, NULL));

        if(read_samples){
            // read the constant samples while others are updating
            CALL_CL_GUARDED(clEnqueueReadBuffer, (
                samp->queue_mem, samp->X_black_save, CL_FALSE, 0,
                samp->num_to_save * samp->K_over_two * sizeof(cl_float), samp->samples_host + buffer_position,
                0, NULL, NULL));

            buffer_position += samp->num_to_save * samp->K_over_two;
        }

        // both must finish before next iteration
        CALL_CL_GUARDED(clFinish, (samp->queue_mem));
        CALL_CL_GUARDED(clFinish, (samp->queue));


        // update X_black
        SET_9_KERNEL_ARGS(samp->stretch_knl,
              samp->X_black_device,
              samp->log_pdf_black_device,
              samp->X_red_device,
              samp->ranluxcltab,
              samp->accepted_device,
              samp->data_device,
              samp->data_st_device,
              samp->indices_to_save_device,
              samp->X_black_save);

        CALL_CL_GUARDED(clEnqueueNDRangeKernel,
              (samp->queue, samp->stretch_knl,
               /*dimensions*/ 1, NULL, samp->gdim, samp->ldim,
               0, NULL, NULL));

        if(read_samples){
            // read the constant samples while others are updating
            CALL_CL_GUARDED(clEnqueueReadBuffer, (
                samp->queue_mem, samp->X_red_save, CL_FALSE, 0,
                samp->num_to_save * samp->K_over_two * sizeof(cl_float), samp->samples_host + buffer_position,
                0, NULL, NULL));

            buffer_position += samp->num_to_save * samp->K_over_two;
        }

        // both must finish before next iteration
        CALL_CL_GUARDED(clFinish, (samp->queue_mem));
        CALL_CL_GUARDED(clFinish, (samp->queue));

        if( ((it % (MAX(samp->M/10,1))) == 0) && (OUTPUT_LEVEL > 0) )
            printf("Sample iteration %d\n", it);

        read_samples = 1;
    }

    // make sure everything is back in place
    CALL_CL_GUARDED(clFinish, (samp->queue));

    // take the end of the kernel timing
    get_timestamp(&(samp->time2));

    // save the acceptance probability
    CALL_CL_GUARDED(clEnqueueReadBuffer, (
        samp->queue, samp->accepted_device, CL_TRUE, 0,
        samp->K_over_two * sizeof(cl_ulong), samp->accepted_host,
        0, NULL, NULL));

    // ensure that all reads are finished
    CALL_CL_GUARDED(clFinish, (samp->queue));

    samp->accepted_total = 0;
    for(int i=0; i<samp->K_over_two; i++)
        samp->accepted_total += samp->accepted_host[i];

    // end total timing
    get_timestamp(&(samp->time2_total));

    if(OUTPUT_LEVEL > 0) printf("Sampler kernel ran and completed.\n\n");

}
Exemplo n.º 22
0
void flush_cl_queue()
{
   CALL_CL_GUARDED(clFinish, (clData.queue));
}
Exemplo n.º 23
0
void print_device_info(cl_device_id device)
{
  // adapted from http://graphics.stanford.edu/~yoel/notes/clInfo.c

#define LONG_PROPS \
  defn(VENDOR_ID), \
  defn(MAX_COMPUTE_UNITS), \
  defn(MAX_WORK_ITEM_DIMENSIONS), \
  defn(MAX_WORK_GROUP_SIZE), \
  defn(PREFERRED_VECTOR_WIDTH_CHAR), \
  defn(PREFERRED_VECTOR_WIDTH_SHORT), \
  defn(PREFERRED_VECTOR_WIDTH_INT), \
  defn(PREFERRED_VECTOR_WIDTH_LONG), \
  defn(PREFERRED_VECTOR_WIDTH_FLOAT), \
  defn(PREFERRED_VECTOR_WIDTH_DOUBLE), \
  defn(MAX_CLOCK_FREQUENCY), \
  defn(ADDRESS_BITS), \
  defn(MAX_MEM_ALLOC_SIZE), \
  defn(IMAGE_SUPPORT), \
  defn(MAX_READ_IMAGE_ARGS), \
  defn(MAX_WRITE_IMAGE_ARGS), \
  defn(IMAGE2D_MAX_WIDTH), \
  defn(IMAGE2D_MAX_HEIGHT), \
  defn(IMAGE3D_MAX_WIDTH), \
  defn(IMAGE3D_MAX_HEIGHT), \
  defn(IMAGE3D_MAX_DEPTH), \
  defn(MAX_SAMPLERS), \
  defn(MAX_PARAMETER_SIZE), \
  defn(MEM_BASE_ADDR_ALIGN), \
  defn(MIN_DATA_TYPE_ALIGN_SIZE), \
  defn(GLOBAL_MEM_CACHELINE_SIZE), \
  defn(GLOBAL_MEM_CACHE_SIZE), \
  defn(GLOBAL_MEM_SIZE), \
  defn(MAX_CONSTANT_BUFFER_SIZE), \
  defn(MAX_CONSTANT_ARGS), \
  defn(LOCAL_MEM_SIZE), \
  defn(ERROR_CORRECTION_SUPPORT), \
  defn(PROFILING_TIMER_RESOLUTION), \
  defn(ENDIAN_LITTLE), \
  defn(AVAILABLE), \
  defn(COMPILER_AVAILABLE),

#define STR_PROPS \
  defn(NAME), \
  defn(VENDOR), \
  defn(PROFILE), \
  defn(VERSION), \
  defn(EXTENSIONS),

#define HEX_PROPS \
  defn(SINGLE_FP_CONFIG), \
  defn(QUEUE_PROPERTIES),


  printf("---------------------------------------------------------------------\n");
  

  static struct { cl_device_info param; const char *name; } longProps[] = {
#define defn(X) { CL_DEVICE_##X, #X }
    LONG_PROPS
#undef defn
    { 0, NULL },
  };
  static struct { cl_device_info param; const char *name; } hexProps[] = {
#define defn(X) { CL_DEVICE_##X, #X }
    HEX_PROPS
#undef defn
    { 0, NULL },
  };
  static struct { cl_device_info param; const char *name; } strProps[] = {
#define defn(X) { CL_DEVICE_##X, #X }
    STR_PROPS
#undef defn
    { CL_DRIVER_VERSION, "DRIVER_VERSION" },
    { 0, NULL },
  };
  cl_int status;
  size_t size;
  char buf[65536];
  long long val; /* Avoids unpleasant surprises for some params */
  int ii;

  for (ii = 0; strProps[ii].name != NULL; ii++)
  {
    status = clGetDeviceInfo(device, strProps[ii].param, sizeof buf, buf, &size);
    if (status != CL_SUCCESS)
    {
      printf("Unable to get %s: %s!\n",
          strProps[ii].name, cl_error_to_str(status));
      continue;
    }
    if (size > sizeof buf)
    {
      printf("Large %s (%zd bytes)!  Truncating to %ld!\n",
          strProps[ii].name, size, sizeof buf);
    }
    printf("%s: %s\n",
        strProps[ii].name, buf);
  }
  printf("\n");

  status = clGetDeviceInfo(device, CL_DEVICE_TYPE, sizeof val, &val, NULL);
  if (status == CL_SUCCESS)
  {
    printf("Type: ");
    if (val & CL_DEVICE_TYPE_DEFAULT)
    {
      val &= ~CL_DEVICE_TYPE_DEFAULT;
      printf("Default ");
    }
    if (val & CL_DEVICE_TYPE_CPU)
    {
      val &= ~CL_DEVICE_TYPE_CPU;
      printf("CPU ");
    }
    if (val & CL_DEVICE_TYPE_GPU)
    {
      val &= ~CL_DEVICE_TYPE_GPU;
      printf("GPU ");
    }
    if (val & CL_DEVICE_TYPE_ACCELERATOR)
    {
      val &= ~CL_DEVICE_TYPE_ACCELERATOR;
      printf("Accelerator ");
    }
    if (val != 0) {
      printf("Unknown (0x%llx) ", val);
    }
    printf("\n");
  }
  else
  {
    printf("Unable to get TYPE: %s!\n",
        cl_error_to_str(status));
  }

  status = clGetDeviceInfo(device, CL_DEVICE_EXECUTION_CAPABILITIES,
      sizeof val, &val, NULL);
  if (status == CL_SUCCESS)
  {
    printf("EXECUTION_CAPABILITIES: ");
    if (val & CL_EXEC_KERNEL)
    {
      val &= ~CL_EXEC_KERNEL;
      printf("Kernel ");
    }
    if (val & CL_EXEC_NATIVE_KERNEL)
    {
      val &= ~CL_EXEC_NATIVE_KERNEL;
      printf("Native ");
    }
    if (val)
      printf("Unknown (0x%llx) ", val);

    printf("\n");
  }
  else
  {
    printf("Unable to get EXECUTION_CAPABILITIES: %s!\n",
        cl_error_to_str(status));
  }

  status = clGetDeviceInfo(device, CL_DEVICE_GLOBAL_MEM_CACHE_TYPE,
      sizeof val, &val, NULL);
  if (status == CL_SUCCESS)
  {
    static const char *cacheTypes[] = { "None", "Read-Only", "Read-Write" };
    static int numTypes = sizeof cacheTypes / sizeof cacheTypes[0];

    printf("GLOBAL_MEM_CACHE_TYPE: %s (%lld)\n",
        val < numTypes ? cacheTypes[val] : "???", val);
  }
  else
  {
    printf("Unable to get GLOBAL_MEM_CACHE_TYPE: %s!\n",
        cl_error_to_str(status));
  }

  status = clGetDeviceInfo(device,
      CL_DEVICE_LOCAL_MEM_TYPE, sizeof val, &val, NULL);

  if (status == CL_SUCCESS)
  {
    static const char *lmemTypes[] = { "???", "Local", "Global" };
    static int numTypes = sizeof lmemTypes / sizeof lmemTypes[0];

    printf("CL_DEVICE_LOCAL_MEM_TYPE: %s (%lld)\n",
        val < numTypes ? lmemTypes[val] : "???", val);
  }
  else
  {
    printf("Unable to get CL_DEVICE_LOCAL_MEM_TYPE: %s!\n",
        cl_error_to_str(status));
  }

  for (ii = 0; hexProps[ii].name != NULL; ii++)
  {
    status = clGetDeviceInfo(device, hexProps[ii].param, sizeof val, &val, &size);
    if (status != CL_SUCCESS)
    {
      printf("Unable to get %s: %s!\n",
          hexProps[ii].name, cl_error_to_str(status));
      continue;
    }
    if (size > sizeof val)
    {
      printf("Large %s (%zd bytes)!  Truncating to %ld!\n",
          hexProps[ii].name, size, sizeof val);
    }
    printf("%s: 0x%llx\n", hexProps[ii].name, val);
  }
  printf("\n");

  for (ii = 0; longProps[ii].name != NULL; ii++)
  {
    status = clGetDeviceInfo(device, longProps[ii].param, sizeof val, &val, &size);
    if (status != CL_SUCCESS)
    {
      printf("Unable to get %s: %s!\n",
          longProps[ii].name, cl_error_to_str(status));
      continue;
    }
    if (size > sizeof val)
    {
      printf("Large %s (%zd bytes)!  Truncating to %ld!\n",
          longProps[ii].name, size, sizeof val);
    }
    printf("%s: %lld\n", longProps[ii].name, val);
  }

  {
    size_t size = sizeof(size_t);
    CALL_CL_GUARDED(clGetDeviceInfo,
        (device, CL_DEVICE_MAX_WORK_ITEM_SIZES, 0, 0, &size));

    size_t res_vec[size/sizeof(size_t)]; // C99 VLA yay!

    CALL_CL_GUARDED(clGetDeviceInfo,
        (device, CL_DEVICE_MAX_WORK_ITEM_SIZES, size, res_vec, &size));

    printf("MAX_WORK_GROUP_SIZES: "); // a tiny lie
    for (size_t i = 0; i < size/sizeof(size_t); ++i)
      printf("%zd ", res_vec[i]);
    printf("\n");
  }
  printf("---------------------------------------------------------------------\n");
}
void main(int argc, char** argv)
{
	//int k = atoi(argv[1]);	
	//int  N  = pow(2,k);
	int	N=1024;
	int	k=10;
	
	float * a = (float *) malloc(sizeof(float)*N* N * 2);
	float * b = (float *) malloc(sizeof(float) *N*N * 2);
	float * c = (float *) malloc(sizeof(float) * N*N* 2);
	float p = 2*M_PI ;	
	for (int i =0; i< N*N; i++)
	{
		a[2*i] = 1;
		a[2*i+1] = 0;
		b[2*i] = 1;
		b[2*i+1] = 0;
	}
#if 0 
	srand(1);
	for(int i =0;i<N*N;i++)
	{	
		a[2*i]=sin(i%N *2 *M_PI);
		//printf("%f\n",uu[2*i]);
		a[2*i+1] =0 ;
	}
#endif
	print_platforms_devices();

	cl_context ctx;
	cl_command_queue queue;
	create_context_on("NVIDIA","GeForce GTX 590",0,&ctx,&queue,0);

	cl_int status;
	cl_mem buf_a = clCreateBuffer(ctx, CL_MEM_READ_WRITE, 
	sizeof(float) *N *N* 2 , 0, &status);
	CHECK_CL_ERROR(status, "clCreateBuffer");

	cl_mem buf_b = clCreateBuffer(ctx, CL_MEM_READ_WRITE, 
	sizeof(float)  * N *N* 2 , 0, &status);
	CHECK_CL_ERROR(status, "clCreateBuffer");
	
	cl_mem buf_c = clCreateBuffer(ctx, CL_MEM_READ_WRITE, 
	sizeof(float) * N *N* 2 , 0, &status);
	CHECK_CL_ERROR(status, "clCreateBuffer");

	cl_mem buf_d = clCreateBuffer(ctx, CL_MEM_READ_WRITE, 
	sizeof(float)*N *N* 2 , 0, &status);
	CHECK_CL_ERROR(status, "clCreateBuffer");
	cl_mem buf_e = clCreateBuffer(ctx, CL_MEM_READ_WRITE, 
	sizeof(float) *N *N* 2 , 0, &status);
	CHECK_CL_ERROR(status, "clCreateBuffer");

	cl_mem buf_f = clCreateBuffer(ctx, CL_MEM_READ_WRITE, 
	sizeof(float) *N *N* 2 , 0, &status);
	CHECK_CL_ERROR(status, "clCreateBuffer");

	cl_mem buf_g = clCreateBuffer(ctx, CL_MEM_READ_WRITE, 
	sizeof(float) *N *N* 2 , 0, &status);
	CHECK_CL_ERROR(status, "clCreateBuffer");

	CALL_CL_GUARDED(clEnqueueWriteBuffer, (
	queue, buf_a, /*blocking*/ CL_TRUE, /*offset*/ 0,
	sizeof(float) *N*N*2, a,
	0, NULL, NULL));

	CALL_CL_GUARDED(clEnqueueWriteBuffer, (
	queue, buf_b, /*blocking*/ CL_TRUE, /*offset*/ 0,
	sizeof(float) *N *N* 2, b,
	0, NULL, NULL));

	CALL_CL_GUARDED(clEnqueueWriteBuffer, (
	queue, buf_c, /*blocking*/ CL_TRUE, /*offset*/ 0,
	sizeof(float)  *N* N*2, c,
	0, NULL, NULL));

	char *knl_text = read_file("vec_add.cl");
	cl_kernel vec_add = kernel_from_string(ctx, knl_text, "sum", NULL);
	free(knl_text);

	knl_text = read_file("mat_etr_mul.cl");
	cl_kernel mat_etr_mul = kernel_from_string(ctx, knl_text, "mult", NULL);
	free(knl_text);


	knl_text = read_file("radix-4-float.cl");
	cl_kernel fft1D = kernel_from_string(ctx, knl_text, "fft1D", NULL);
	free(knl_text);

	knl_text = read_file("radix-4-init.cl");
	cl_kernel fft_init = kernel_from_string(ctx, knl_text, "fft1D_init", NULL);
	free(knl_text);

	knl_text = read_file("radix-4-interm.cl");
	cl_kernel fft_interm = kernel_from_string(ctx, knl_text, "fft1D", NULL);
	free(knl_text);

	knl_text = read_file("transpose-soln-gpu.cl");
	cl_kernel mat_trans = kernel_from_string(ctx, knl_text, "transpose", NULL);
	free(knl_text);

	knl_text = read_file("radix-4-modi.cl");
	cl_kernel fft_init_w = kernel_from_string(ctx, knl_text, "fft1D_init", NULL);
	free(knl_text);

	knl_text = read_file("vec_zero.cl");
	cl_kernel vec_zero = kernel_from_string(ctx, knl_text, "zero", NULL);
	free(knl_text);

	knl_text = read_file("reduction.cl");
	cl_kernel reduct_mul = kernel_from_string(ctx, knl_text, "reduction_mult", NULL);
	free(knl_text);

	knl_text = read_file("reduction1D.cl");
	cl_kernel reduct = kernel_from_string(ctx, knl_text, "reduction", NULL);
	free(knl_text);

	knl_text = read_file("reduction-init.cl");
	cl_kernel reduct_init = kernel_from_string(ctx, knl_text, "reduction_init", NULL);
	free(knl_text);


	knl_text = read_file("reduct-energy.cl");
	cl_kernel reduct_eng = kernel_from_string(ctx, knl_text, "reduction_eng", NULL);
	free(knl_text);

	knl_text = read_file("resid.cl");
	cl_kernel resid = kernel_from_string(ctx, knl_text, "resid", NULL);
	free(knl_text);

	knl_text = read_file("resid-init.cl");
	cl_kernel resid_init = kernel_from_string(ctx, knl_text, "resid_init", NULL);
	free(knl_text);


	knl_text = read_file("radix-4-big.cl");
	cl_kernel fft_big = kernel_from_string(ctx, knl_text, "fft1D_big", NULL);
	free(knl_text);
	knl_text = read_file("radix-4-big-clean.cl");
	cl_kernel fft_clean = kernel_from_string(ctx, knl_text, "fft1D_clean", NULL);
	free(knl_text);

	knl_text = read_file("radix-4-2D.cl");
	cl_kernel fft_2D = kernel_from_string(ctx, knl_text, "fft2D_big", NULL);
	free(knl_text);

	knl_text = read_file("radix-4-2D-clean.cl");
	cl_kernel fft_2D_clean = kernel_from_string(ctx, knl_text, "fft2D_clean", NULL);
	free(knl_text);


	knl_text = read_file("mat-trans-3D.cl");
	cl_kernel mat_trans_3D = kernel_from_string(ctx, knl_text, "transpose_3D", NULL);
	free(knl_text);
	int Ns =1 ;
	int direction = 1;
	timestamp_type time1, time2;
	
	struct parameter param;

	param.N = N;
	param.epsilon = 0.1;
	param.s =1;
	
	float kk =1e-4;



	param.h = 2*PI/N;
	param.N = N;
	
 	param.maxCG = 1000;
	param.maxN = 5;
	
	//Minimum and starting time step
	float mink = 1e-7;
	float startk = 1e-4;

	// Tolerances
	param.Ntol = 1e-4;
	param.cgtol = 1e-7;
	float ksafety = 0.8;
	float kfact = 1.3;
	float kfact2 = 1/1.3;
	float Nfact = 0.7;
	float CGfact = 0.7;
	double elapsed ;

	CALL_CL_GUARDED(clFinish, (queue));
	get_timestamp(&time1);

//for(int s=0;s<100;s++)
	//fft_1D_big(buf_a,buf_b,buf_c,N,fft_big,fft_clean,mat_trans,queue,direction,0);
	//fft_1D_new(buf_a,buf_b,buf_c,N,fft_init,fft_interm, fft1D,queue,direction,0);
	//fft_1D(buf_a,buf_b,buf_c,N,fft_init, fft1D,queue,direction,0);
	//fft2D(buf_a,buf_b,buf_c,buf_d,N,fft_init,fft1D,mat_trans,queue, 1);
	//fft2D_new(buf_a,buf_b,buf_c,buf_d,N,fft_init,fft_interm,fft1D,mat_trans,queue, 1);
	//fft2D_big(buf_a,buf_b,buf_c,buf_d,N,fft_big,fft_clean,mat_trans,queue,direction);
	//fft2D_big_new(buf_a,buf_b,buf_c,buf_d,N,fft_2D,fft_2D_clean,
			//mat_trans,mat_trans_3D,queue,direction);
	//fft_w(buf_a,buf_b,buf_c,buf_d,buf_e,N,0.1,0,1,fft_init_w,fft_init,fft1D,mat_trans,queue);
#if 0
	frhs(buf_a,buf_b,buf_c,buf_d,buf_e,&param,fft1D_init,fft1D,mat_trans,
		 vec_add, queue);
#endif
#if 0	
	float E1 = energy(buf_a, buf_b, buf_c,buf_d, buf_e,buf_f,1e-4, 
				&param, fft_init,fft1D,mat_trans,reduct_eng,
				reduct,queue);
#endif

	
	//float reside = residual(buf_a,buf_b,resid,resid_init,queue,N*N);
	/*fft_d_q(buf_a,buf_b,buf_c,buf_d, N,0.1,k ,1, 
		 fft1D_init,
		fft1D,mat_trans,queue);*/
	//for(int j= 0;j<N;j++)
	//{
		//fft_1D_w_orig(buf_a,buf_b,buf_c,N,fft1D_init,fft1D,queue,1,j);
	//}
	//fft_shar(buf_a,buf_b,buf_c,buf_d,N,0.1,0,1,fft1D_init,fft1D,mat_trans,queue);
	//mat__trans(buf_a,buf_b,N,mat_trans,queue,4,0.1,0,1);
	//double elapsed = reduction_mult(buf_a, buf_b,buf_c,N*N,reduct_mul,reduct,queue);

	CALL_CL_GUARDED(clFinish, (queue));
	get_timestamp(&time1);
	fft_1D_big(buf_a,buf_b,buf_c,N*N,fft_big,fft_clean,mat_trans,queue,direction,0);
	CALL_CL_GUARDED(clFinish, (queue));
	get_timestamp(&time2);
	elapsed = timestamp_diff_in_seconds(time1,time2);
	printf("Hierarchy 1D FFT of size %d  array  on gpu takes %f s\n", N*N,elapsed);
	printf("achieve %f GFLOPS \n",6*2*N*N*k/elapsed*1e-9);
	printf("---------------------------------------------\n");

	


	CALL_CL_GUARDED(clFinish, (queue));

	
	get_timestamp(&time1);
	fft2D(buf_a,buf_b,buf_c,buf_d,N,fft_init,fft1D,mat_trans,queue, 1);
	CALL_CL_GUARDED(clFinish, (queue));
	get_timestamp(&time2);
	elapsed = timestamp_diff_in_seconds(time1,time2);
	printf("Navie 2D FFT of size %d * %d matrix  on gpu takes %f s\n", N,N,elapsed);
	printf("achieve %f GFLOPS \n",6*2*N*N*k/elapsed*1e-9);
	printf("---------------------------------------------\n");
	//printf("data access from global achieve %f GB/s\n",sizeof(float)*2*16*N*N/elapsed*1e-9);
	CALL_CL_GUARDED(clFinish, (queue));
	get_timestamp(&time1);
	fft2D_new(buf_a,buf_b,buf_c,buf_d,N,fft_init,fft_interm,fft1D,mat_trans,queue, 1);
	CALL_CL_GUARDED(clFinish, (queue));
	get_timestamp(&time2);
	elapsed = timestamp_diff_in_seconds(time1,time2);
	printf("local data exchange 2D FFT of size %d * %d matrix  on gpu takes %f s\n", N,N,elapsed);
	printf("achieve %f GFLOPS \n",6*2*N*N*k/elapsed*1e-9);
	printf("---------------------------------------------\n");


	CALL_CL_GUARDED(clFinish, (queue));
	get_timestamp(&time1);
	fft2D_big(buf_a,buf_b,buf_c,buf_d,N,fft_big,fft_clean,mat_trans,queue,direction);
	CALL_CL_GUARDED(clFinish, (queue));
	get_timestamp(&time2);
	elapsed = timestamp_diff_in_seconds(time1,time2);
	printf("Hierarchy 2D FFT of size %d * %d matrix  on gpu takes %f s\n", N,N,elapsed);
	printf("achieve %f GFLOPS \n",6*2*N*N*k/elapsed*1e-9);
	printf("---------------------------------------------\n");


	CALL_CL_GUARDED(clFinish, (queue));
	get_timestamp(&time1);
	fft2D_big_new(buf_a,buf_b,buf_c,buf_d,N,fft_2D,fft_2D_clean,
			mat_trans,mat_trans_3D,queue,direction);
	CALL_CL_GUARDED(clFinish, (queue));
	get_timestamp(&time2);
	elapsed = timestamp_diff_in_seconds(time1,time2);
	printf("Using 2D kernel 2D FFT of size %d * %d matrix  on gpu takes %f s\n", N,N,elapsed);
	printf("achieve %f GFLOPS \n",6*2*N*N*k/elapsed*1e-9);
	printf("---------------------------------------------\n");



	get_timestamp(&time1);






	direction = -1;
	//fft_1D(buf_b,buf_c,buf_d,N,fft_init, fft1D,queue,direction,0);
	fft2D(buf_b,buf_c,buf_d,buf_e,N,fft_init,fft1D,mat_trans,queue, direction);
	//fft2D_new(buf_b,buf_c,buf_e,buf_d,N,fft_init,fft_interm,fft1D,mat_trans,queue, -1);
	//fft2D_big(buf_b,buf_c,buf_d,buf_e,N,fft_big,fft_clean,mat_trans,queue,direction);
	CALL_CL_GUARDED(clFinish, (queue));
	get_timestamp(&time2);
	elapsed = timestamp_diff_in_seconds(time1,time2);
	//printf("1D inverse %f s\n", elapsed);
	#if 0
	float test;
	CALL_CL_GUARDED(clFinish, (queue));
	CALL_CL_GUARDED(clEnqueueReadBuffer, (
        	queue, buf_b, /*blocking*/ CL_TRUE, /*offset*/ 0,
       		sizeof(float), &test,
        	0, NULL, NULL));
	

		printf("test success and %f \n",test);		
	

	#endif
	#if 0
	CALL_CL_GUARDED(clFinish, (queue));
	CALL_CL_GUARDED(clEnqueueReadBuffer, (
        	queue, buf_c, /*blocking*/ CL_TRUE, /*offset*/ 0,
       		2*N*N* sizeof(float), c,
        	0, NULL, NULL));
	

	/*for(int i =0; i<  N; i++)
	{
		printf("a%f+ i*",a[2*i]);		
		printf("%f\n",a[2*i+1]);
	}*/
	int T = 10<N? 10:N ;
	for(int i =0; i<  T; i++)
	{
		printf("%f + i*",a[2*i]);		
		printf("%f\t",a[2*i+1]);
		printf("%f + i*",c[2*i]);		
		printf("%f\n",c[2*i+1]);
	}

	#endif 
/*	for( Ns = 1;Ns < N; Ns *= 2 )
	{
		for (int j = 0; j<N/2; j++)
		{
			fftiteration(j,N,Ns,a,b);
		}
		float * d;
		d = a ;
		a = b;
		b = d;
		//printf("ok\n");

	}

*/


	
	CALL_CL_GUARDED(clReleaseMemObject, (buf_a));
	CALL_CL_GUARDED(clReleaseMemObject, (buf_b));
	CALL_CL_GUARDED(clReleaseMemObject, (buf_c));
	CALL_CL_GUARDED(clReleaseMemObject, (buf_d));
	CALL_CL_GUARDED(clReleaseMemObject, (buf_e));
	CALL_CL_GUARDED(clReleaseKernel, (fft1D));
	CALL_CL_GUARDED(clReleaseKernel, (fft_init));
	CALL_CL_GUARDED(clReleaseKernel, (vec_add));
	CALL_CL_GUARDED(clReleaseKernel, (reduct_mul));
	CALL_CL_GUARDED(clReleaseKernel, (reduct));
	CALL_CL_GUARDED(clReleaseKernel, (mat_trans));
	CALL_CL_GUARDED(clReleaseCommandQueue, (queue));
	CALL_CL_GUARDED(clReleaseContext, (ctx));

}
Exemplo n.º 25
0
void create_context_on(const char *plat_name, const char*dev_name, cl_uint idx,
    cl_context *ctx, cl_command_queue *queue, int enable_profiling)
{
  char dev_sel_buf[MAX_NAME_LEN];
  char platform_sel_buf[MAX_NAME_LEN];

  // get number of platforms
  cl_uint plat_count;
  CALL_CL_GUARDED(clGetPlatformIDs, (0, NULL, &plat_count));

  // allocate memory, get list of platform handles
  cl_platform_id *platforms =
    (cl_platform_id *) malloc(plat_count*sizeof(cl_platform_id));
  CHECK_SYS_ERROR(!platforms, "allocating platform array");
  CALL_CL_GUARDED(clGetPlatformIDs, (plat_count, platforms, NULL));

  // print menu, if requested
#ifndef CL_HELPER_FORCE_INTERACTIVE
  if (plat_name == CHOOSE_INTERACTIVELY) // yes, we want exactly that pointer
#endif
  {
    puts("Choose platform:");
    for (cl_uint i = 0; i < plat_count; ++i)
    {
      char buf[MAX_NAME_LEN];
      CALL_CL_GUARDED(clGetPlatformInfo, (platforms[i], CL_PLATFORM_VENDOR,
            sizeof(buf), buf, NULL));
      printf("[%d] %s\n", i, buf);
    }

    printf("Enter choice: ");
    fflush(stdout);

    char *sel = read_a_line();
    if (!sel)
    {
      fprintf(stderr, "error reading line from stdin");
      abort();
    }

    int sel_int = MIN(MAX(0, atoi(sel)), (int) plat_count-1);
    free(sel);

    CALL_CL_GUARDED(clGetPlatformInfo, (platforms[sel_int], CL_PLATFORM_VENDOR,
          sizeof(platform_sel_buf), platform_sel_buf, NULL));
    plat_name = platform_sel_buf;
  }

  // iterate over platforms
  for (cl_uint i = 0; i < plat_count; ++i)
  {
    // get platform name
    char buf[MAX_NAME_LEN];
    CALL_CL_GUARDED(clGetPlatformInfo, (platforms[i], CL_PLATFORM_VENDOR,
          sizeof(buf), buf, NULL));

    // does it match?
    if (!plat_name || strstr(buf, plat_name))
    {
      // get number of devices in platform
      cl_uint dev_count;
      CALL_CL_GUARDED(clGetDeviceIDs, (platforms[i], CL_DEVICE_TYPE_ALL,
            0, NULL, &dev_count));

      // allocate memory, get list of device handles in platform
      cl_device_id *devices =
        (cl_device_id *) malloc(dev_count*sizeof(cl_device_id));
      CHECK_SYS_ERROR(!devices, "allocating device array");

      CALL_CL_GUARDED(clGetDeviceIDs, (platforms[i], CL_DEVICE_TYPE_ALL,
            dev_count, devices, NULL));

      // {{{ print device menu, if requested
#ifndef CL_HELPER_FORCE_INTERACTIVE
      if (dev_name == CHOOSE_INTERACTIVELY) // yes, we want exactly that pointer
#endif
      {
        puts("Choose device:");
        for (cl_uint j = 0; j < dev_count; ++j)
        {
          char buf[MAX_NAME_LEN];
          CALL_CL_GUARDED(clGetDeviceInfo, (devices[j], CL_DEVICE_NAME,
                sizeof(buf), buf, NULL));
          printf("[%d] %s\n", j, buf);
        }

        printf("Enter choice: ");
        fflush(stdout);

        char *sel = read_a_line();
        if (!sel)
        {
          fprintf(stderr, "error reading line from stdin");
          abort();
        }

        int int_sel = MIN(MAX(0, atoi(sel)), (int) dev_count-1);
        free(sel);

        CALL_CL_GUARDED(clGetDeviceInfo, (devices[int_sel], CL_DEVICE_NAME,
              sizeof(dev_sel_buf), dev_sel_buf, NULL));
        dev_name = dev_sel_buf;
      }

      // }}}

      // iterate over devices
      for (cl_uint j = 0; j < dev_count; ++j)
      {
        // get device name
        char buf[MAX_NAME_LEN];
        CALL_CL_GUARDED(clGetDeviceInfo, (devices[j], CL_DEVICE_NAME,
              sizeof(buf), buf, NULL));

        // does it match?
        if (!dev_name || strstr(buf, dev_name))
        {
          if (idx == 0)
          {
            cl_platform_id plat = platforms[i];
            cl_device_id dev = devices[j];

            free(devices);
            free(platforms);

            cl_int status;
            
            // create a context
#if OPENCL_SHARE_WITH_OPENGL
  #if __APPLE__
//              CGLContextObj kCGLContext = CGLGetCurrentContext();
//              CGLShareGroupObj kCGLShareGroup = CGLGetShareGroup(kCGLContext);
//              cl_context_properties cps[] = {
//                CL_CONTEXT_PROPERTY_USE_CGL_SHAREGROUP_APPLE, (cl_context_properties)kCGLShareGroup,
//                CL_CONTEXT_PLATFORM, (cl_context_properties) plat, 0 };
//            
            
            CGLContextObj gl_context = CGLGetCurrentContext();
            CGLShareGroupObj share_group = CGLGetShareGroup(gl_context);
            
            cl_context_properties properties[] = {
              CL_CONTEXT_PROPERTY_USE_CGL_SHAREGROUP_APPLE,
              (cl_context_properties)share_group, 0
            };
            *ctx = clCreateContext(properties, 0, 0, 0, 0, 0);
            clGetGLContextInfoAPPLE(*ctx, gl_context,
                                    CL_CGL_DEVICE_FOR_CURRENT_VIRTUAL_SCREEN_APPLE, sizeof(dev),
                                    &dev, NULL);
            
            
            
  #elif WIN32
              cl_context_properties cps[] = {
                CL_GL_CONTEXT_KHR, (cl_context_properties) wglGetCurrentContext(), CL_WGL_HDC_KHR, (cl_context_properties) wglGetCurrentDC(), CL_CONTEXT_PLATFORM, (cl_context_properties) plat, 0};
            
            //Probably won't work because &dev should correspond to glContext
            *ctx = clCreateContext(cps, 1, &dev, NULL, NULL, &status);
            CHECK_CL_ERROR(status, "clCreateContext");
  #else
              // Linux
              cl_context_properties cps[] = {
                CL_GL_CONTEXT_KHR, ( cl_context_properties) glXGetCurrentContext(), CL_GLX_DISPLAY_KHR, (cl_context_properties) glXGetCurrentDisplay(), CL_CONTEXT_PLATFORM, (cl_context_properties) plat, 0 };
            //Probably won't work because &dev should correspond to glContext
            *ctx = clCreateContext(cps, 1, &dev, NULL, NULL, &status);
            CHECK_CL_ERROR(status, "clCreateContext");
#endif

#else
            // create a context
            cl_context_properties cps[3] = {
              CL_CONTEXT_PLATFORM, (cl_context_properties) plat, 0 };
            // create a command queue
            cl_command_queue_properties qprops = 0;
            if (enable_profiling)
              qprops |= CL_QUEUE_PROFILING_ENABLE;
            
            *queue = clCreateCommandQueue(*ctx, dev, qprops, &status);
            CHECK_CL_ERROR(status, "clCreateCommandQueue");
#endif
//            *ctx = clCreateContext(
//                                   cps, 1, &dev, NULL, NULL, &status);
//            CHECK_CL_ERROR(status, "clCreateContext");

//            // create a command queue
            cl_command_queue_properties qprops = 0;
            if (enable_profiling)
              qprops |= CL_QUEUE_PROFILING_ENABLE;

            *queue = clCreateCommandQueue(*ctx, dev, qprops, &status);
            CHECK_CL_ERROR(status, "clCreateCommandQueue");

            return;
          }
          else
            --idx;
        }
      }

      free(devices);
    }
  }

  free(platforms);

  fputs("create_context_on: specified device not found.\n", stderr);
  abort();
}
Exemplo n.º 26
0
int main()
{
  int enable_profiling = 0;
  #ifdef DO_TIMING
      enable_profiling = 1;
  #endif

  //print_platforms_devices();
  cl_context ctx;
  cl_command_queue queue;
  create_context_on("NVIDIA", NULL, 0, &ctx, &queue, enable_profiling);

  // --------------------------------------------------------------------------
  // load kernels
  // --------------------------------------------------------------------------
  // read the cl file

  char buf[100];
  sprintf(buf, "mg-kernel-ver%d.cl", VERSION);
  char *knl_text = read_file(buf);
  //get work group dimensions and gflop info.
  int wg_dims , wg_x, wg_y, wg_z, z_div, fetch_per_pt, flops_per_pt;
  if (sscanf(knl_text, "// workgroup: (%d,%d,%d) z_div:%d fetch_per_pt:%d flops_per_pt:%d", 
        &wg_x, &wg_y, &wg_z, &z_div, &fetch_per_pt, &flops_per_pt) == 6)
  {
    wg_dims = 3;
  }
  else if (sscanf(knl_text, "// workgroup: (%d,%d) fetch_per_pt:%d flops_per_pt:%d",
        &wg_x, &wg_y, &fetch_per_pt, &flops_per_pt) == 4)
  {
    wg_dims = 2;
    wg_z = -1;
    z_div = -1;
  }
  else
  {
    perror("reading workgroup spec");
    abort();
  }

  #ifdef USE_DOUBLE
  char *compile_opt = "-DFTYPE=double";
  #else
  char *compile_opt = "-DFTYPE=float";
  #endif

  // creation of the kernel
  cl_kernel poisson_knl = kernel_from_string(ctx, knl_text, "fd_update", compile_opt);
  free(knl_text); // my compiler complains about this one.  OJO!!
  // --------------------------------------------------------------------------
  // set up grid
  // --------------------------------------------------------------------------
  const unsigned points = POINTS;
  const ftype minus_bdry = -1, plus_bdry = 1;

  // We're dividing into (points-1) intervals.
  ftype dx = (plus_bdry-minus_bdry)/(points-1);

  // --------------------------------------------------------------------------
  // allocate and initialize CPU memory
  // --------------------------------------------------------------------------
  int use_alignment;
  unsigned dim_other = points; //if order 2 then 1 point extra on each side
  #ifdef USE_ALIGNMENT
  // adjusts dimension so that the next row starts in a number divisible by 16
  unsigned dim_x = ((dim_other + 15) / 16) * 16; 
  unsigned field_start = 0;
  use_alignment = 1; 
  #else
  unsigned dim_x = dim_other;
  unsigned field_start = 0;// this one puts me right at the beginning
  use_alignment = 0;
  #endif
  // --------Allocate forcing uexact, r and u vectors -------------------------
  const size_t field_size = 0+dim_x*dim_x*dim_x;  // extra large to fit the 2^n constrain in GPU
  ftype *f = malloc(field_size*sizeof(ftype));
  CHECK_SYS_ERROR(!f, "allocating f");
  ftype *u = malloc (field_size*sizeof(ftype));
  CHECK_SYS_ERROR(!u, "allocating u");  
  ftype *uexact = malloc (field_size*sizeof(ftype));
  CHECK_SYS_ERROR(!uexact, "allocating uexact");
  ftype *r = malloc(field_size * sizeof(ftype));
  CHECK_SYS_ERROR(!r, "allocating residual r");

  // --------------------------------------------------------------------------
  // initialize
  // --------------------------------------------------------------------------
    // zero out (necessary to initialize everything bec. I measure norms)
    for (size_t i = 0; i < field_size; ++i){
      f[i] = 0;
      u[i] = 0;
      uexact[i] = 0;
      r[i] = 0;
    }
    // set up the forcing field
    init_f (points, f, dx, field_start, dim_x, dim_other, minus_bdry);
    // Initialize u with initial boundary conditions
    init_u ( points, u , minus_bdry, plus_bdry, dx, field_start, dim_x, dim_other);
    // Initialize the exact solution
    init_uexact(points, u, uexact, dx, field_size, field_start, dim_x, dim_other);

    // --------------------------------------------------------------------------
    // Setup the v-cycles
    // --------------------------------------------------------------------------
  
    unsigned n1, n2, n3, ncycles;
    n1 = 50;
    n2 = 60;
    n3 = 1;
    ncycles = 2;
    ftype *sweeps = malloc (ncycles*sizeof(ftype));
    ftype *rnorm = malloc (ncycles*sizeof(ftype));
    ftype *enorm = malloc (ncycles*sizeof(ftype));
    ftype rtol = 1.0e-05;

    // Find the norm of the residual (choose your method)
    sweeps[0] =0;
    resid (r, f, u, dx, field_size, field_start, dim_x, dim_other);
    rnorm[0] = norm( r , field_size) * dx;
    U_error(u, uexact, r, field_size);
    enorm[0] = norm( r, field_size ) * dx;

    for(unsigned icycle = 1; icycle <= ncycles; icycle++){
       mgv(f, u, dx, n1, n2, n3, field_size, points, use_alignment, dim_x, ctx, queue, poisson_knl, wg_dims , wg_x, wg_y, wg_z, z_div, fetch_per_pt, flops_per_pt);  //update u through a v-cycle 
       sweeps[icycle] = sweeps[icycle -1] + (4 * (n1 + n2)/3);
       resid (r, f, u, dx, field_size, field_start, dim_x, dim_other);
       rnorm[icycle] = norm( r, field_size ) * dx;
       U_error(u, uexact, r, field_size);
       enorm[icycle] = norm( r, field_size ) * dx;
       //cfacts = (rnorm(icycle)/rnorm(icycle - 1))^(1 / (n1 + n2)) not necessary
       //disp something here if I want to.
       //printf("norm of the cycle %f", enorm[icycle]);
       if(rnorm[icycle] <= rtol * rnorm[0])
	  break;
    }
    #ifdef DO_TIMING
  printf(" ftype:%d ver:%d align:%d pts:%d\tgflops:%.1f\tmcells:%.1f\tgbytes:%.1f [/sec]\tout_gflops:%.6f\n", (int) sizeof(ftype), VERSION, use_alignment, points, gflops_performed/seconds_taken, mcells_updated/seconds_taken, gbytes_accessed/seconds_taken, gflops_performed/tot_secs);
#endif
  // --------------------------------------------------------------------------
  // clean up
  // --------------------------------------------------------------------------
  CALL_CL_GUARDED(clReleaseKernel, (poisson_knl));
  CALL_CL_GUARDED(clReleaseCommandQueue, (queue));
  CALL_CL_GUARDED(clReleaseContext, (ctx));
}
Exemplo n.º 27
0
void free_sampler(sampler* samp){
    /* Free all resources allocated by the sampler and the sampler itself. */

    // free up OpenCL memory
    CALL_CL_GUARDED(clReleaseMemObject, (samp->X_red_device));
    CALL_CL_GUARDED(clReleaseMemObject, (samp->log_pdf_red_device));
    CALL_CL_GUARDED(clReleaseMemObject, (samp->X_red_save));
    CALL_CL_GUARDED(clReleaseMemObject, (samp->X_black_device));
    CALL_CL_GUARDED(clReleaseMemObject, (samp->log_pdf_black_device));
    CALL_CL_GUARDED(clReleaseMemObject, (samp->X_black_save));
    CALL_CL_GUARDED(clReleaseMemObject, (samp->accepted_device));
    CALL_CL_GUARDED(clReleaseMemObject, (samp->data_device));
    CALL_CL_GUARDED(clReleaseMemObject, (samp->ranluxcltab));
    CALL_CL_GUARDED(clReleaseMemObject, (samp->data_st_device));
    CALL_CL_GUARDED(clReleaseMemObject, (samp->indices_to_save_device));

    // kernels, context and queues
    CALL_CL_GUARDED(clReleaseKernel,       (samp->stretch_knl));
    CALL_CL_GUARDED(clReleaseKernel,       (samp->init_rand_lux_knl));
    CALL_CL_GUARDED(clReleaseCommandQueue, (samp->queue));
    CALL_CL_GUARDED(clReleaseCommandQueue, (samp->queue_mem));
    CALL_CL_GUARDED(clReleaseContext,      (samp->ctx));

    // free host resources
    free(samp->X_red_host);
    free(samp->log_pdf_red_host);
    free(samp->X_black_host);
    free(samp->log_pdf_black_host);
    free(samp->samples_host);
    free(samp->accepted_host);
    free(samp->data_host);
    free(samp->data_st);
    free(samp->indices_to_save_host);

    // data resources
    free(samp->acor_times);
    free(samp->acor_pass);
    free(samp->means);
    free(samp->sigma);
    free(samp->err_bar);

    free(samp);
}
Exemplo n.º 28
0
int main(int argc, char **argv)
{
  if (argc != 3)
  {
    fprintf(stderr, "need two arguments!\n");
    abort();
  }

  const long n = atol(argv[1]);
  const long size = n*n;
  const int ntrips = atoi(argv[2]);

  cl_context ctx;
  cl_command_queue queue;
  create_context_on(CHOOSE_INTERACTIVELY, CHOOSE_INTERACTIVELY, 0, &ctx, &queue, 0);

  cl_int status;

  // --------------------------------------------------------------------------
  // load kernels 
  // --------------------------------------------------------------------------
  char *knl_text = read_file("transpose-soln.cl");
  cl_kernel knl = kernel_from_string(ctx, knl_text, "transpose", NULL);
  free(knl_text);

  // --------------------------------------------------------------------------
  // allocate and initialize CPU memory
  // --------------------------------------------------------------------------
#ifdef USE_PINNED
  cl_mem buf_a_host = clCreateBuffer(ctx,
      CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
      sizeof(value_type) * size, 0, &status);
  CHECK_CL_ERROR(status, "clCreateBuffer");
  cl_mem buf_b_host = clCreateBuffer(ctx,
      CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
      sizeof(value_type) * size, 0, &status);
  CHECK_CL_ERROR(status, "clCreateBuffer");

  value_type *a = (value_type *) clEnqueueMapBuffer(queue, buf_a_host,
      /*blocking*/ CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, 
      /*offs*/ 0, sizeof(value_type)*size, 0, NULL, NULL, &status);
  CHECK_CL_ERROR(status, "clEnqueueMapBuffer");
  value_type *b = (value_type *) clEnqueueMapBuffer(queue, buf_b_host,
      /*blocking*/ CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, 
      /*offs*/ 0, sizeof(value_type)*size, 0, NULL, NULL, &status);
  CHECK_CL_ERROR(status, "clEnqueueMapBuffer");

#else
  value_type *a = (value_type *) malloc(sizeof(value_type) * size);
  if (!a) { perror("alloc x"); abort(); }
  value_type *b = (value_type *) malloc(sizeof(value_type) * size);
  if (!b) { perror("alloc y"); abort(); }
#endif

  for (size_t j = 0; j < n; ++j)
    for (size_t i = 0; i < n; ++i)
      a[i + j*n] = i + j*n;

  // --------------------------------------------------------------------------
  // allocate device memory
  // --------------------------------------------------------------------------
  cl_mem buf_a = clCreateBuffer(ctx, CL_MEM_READ_WRITE, 
      sizeof(value_type) * size, 0, &status);
  CHECK_CL_ERROR(status, "clCreateBuffer");

  cl_mem buf_b = clCreateBuffer(ctx, CL_MEM_READ_WRITE,
      sizeof(value_type) * size, 0, &status);
  CHECK_CL_ERROR(status, "clCreateBuffer");

  // --------------------------------------------------------------------------
  // transfer to device
  // --------------------------------------------------------------------------
  CALL_CL_GUARDED(clFinish, (queue));

  timestamp_type time1, time2;
  get_timestamp(&time1);

  CALL_CL_GUARDED(clEnqueueWriteBuffer, (
        queue, buf_a, /*blocking*/ CL_FALSE, /*offset*/ 0,
        size * sizeof(value_type), a,
        0, NULL, NULL));

  CALL_CL_GUARDED(clEnqueueWriteBuffer, (
        queue, buf_b, /*blocking*/ CL_FALSE, /*offset*/ 0,
        size * sizeof(value_type), b,
        0, NULL, NULL));

  get_timestamp(&time2);
  double elapsed = timestamp_diff_in_seconds(time1,time2);
  printf("transfer: %f s\n", elapsed);
  printf("transfer: %f GB/s\n",
      2*size*sizeof(value_type)/1e9/elapsed);


  // --------------------------------------------------------------------------
  // run code on device
  // --------------------------------------------------------------------------

  CALL_CL_GUARDED(clFinish, (queue));

  get_timestamp(&time1);

  for (int trip = 0; trip < ntrips; ++trip)
  {
    SET_3_KERNEL_ARGS(knl, buf_a, buf_b, n);
    size_t ldim[] = { 16, 16 };
    size_t gdim[] = { n, n };
    CALL_CL_GUARDED(clEnqueueNDRangeKernel,
        (queue, knl,
         /*dimensions*/ 2, NULL, gdim, ldim,
         0, NULL, NULL));
  }

  CALL_CL_GUARDED(clFinish, (queue));

  get_timestamp(&time2);
  elapsed = timestamp_diff_in_seconds(time1,time2)/ntrips;
  printf("%f s\n", elapsed);
  printf("%f GB/s\n",
      2*size*sizeof(value_type)/1e9/elapsed);

  CALL_CL_GUARDED(clEnqueueReadBuffer, (
        queue, buf_b, /*blocking*/ CL_FALSE, /*offset*/ 0,
        size * sizeof(value_type), b,
        0, NULL, NULL));

  CALL_CL_GUARDED(clFinish, (queue));

  for (size_t i = 0; i < n; ++i)
    for (size_t j = 0; j < n; ++j)
      if (a[i + j*n] != b[j + i*n])
      {
        printf("bad %d %d\n", i, j);
        abort();
      }

  // --------------------------------------------------------------------------
  // clean up
  // --------------------------------------------------------------------------
  CALL_CL_GUARDED(clFinish, (queue));
  CALL_CL_GUARDED(clReleaseMemObject, (buf_a));
  CALL_CL_GUARDED(clReleaseMemObject, (buf_b));
  CALL_CL_GUARDED(clReleaseKernel, (knl));
  CALL_CL_GUARDED(clReleaseCommandQueue, (queue));
  CALL_CL_GUARDED(clReleaseContext, (ctx));

#ifdef USE_PINNED
  CALL_CL_GUARDED(clReleaseMemObject, (buf_a_host));
  CALL_CL_GUARDED(clReleaseMemObject, (buf_b_host));
#else
  free(a);
  free(b);
#endif
  return 0;
}
Exemplo n.º 29
0
void chcg(float k,struct parameter * p_param,  cl_mem temp,cl_mem rhs, cl_mem temp2, bool *fail,
		cl_mem temp3,cl_mem temp4,cl_mem temp5,cl_mem temp6,cl_mem temp7,cl_mem temp8,cl_mem temp9,cl_kernel fft_2D,
		cl_kernel fft_2D_clean, cl_kernel fft_init_w,cl_kernel vec_add, cl_kernel vec_zero,
		cl_kernel mat_trans,cl_kernel mat_trans_3D,cl_kernel reduct,cl_kernel reduct_init,
		cl_kernel reduct_mul,cl_kernel resid, cl_kernel resid_init,cl_command_queue queue)
{
	* fail = false;
	int N = p_param->N;
	// fft2(rk)./q & rk =rhs 
	fft_d_q(rhs,temp2,temp9,temp3,N,p_param->epsilon,k,
		p_param->s,fft_2D,fft_2D_clean,mat_trans,mat_trans_3D,queue);
	//CALL_CL_GUARDED(clFinish, (queue));
	//printf("I am here!\n");
	#if 0
	float test;
	CALL_CL_GUARDED(clFinish, (queue));
	CALL_CL_GUARDED(clEnqueueReadBuffer, (
        	queue, rhs, /*blocking*/ CL_TRUE, /*offset*/ 0,
       		sizeof(float), &test,
        	0, NULL, NULL));
	

		printf("test success and %f \n",test);		
	

	#endif
	fft2D(temp2,temp3,temp9,temp4,N,fft_2D,fft_2D_clean,mat_trans,mat_trans_3D, queue,-1);
//------------------------	
//temp3 = zk temp =unew 
//------------------------

	// linvzk = real(ifft2(fft2(zk)./nlap_s2));  real part;
	fft_d_nlaps2(temp3,temp2,temp9,temp4,N,p_param->epsilon,k,p_param->s,fft_2D,fft_2D_clean,mat_trans,mat_trans_3D,queue);
	#if 0
	float test;
	CALL_CL_GUARDED(clFinish, (queue));
	CALL_CL_GUARDED(clEnqueueReadBuffer, (
        	queue, temp2, /*blocking*/ CL_TRUE, /*offset*/ 0,
       		sizeof(float), &test,
        	0, NULL, NULL));
	

		printf("test success and temp2 %f \n",test);		
	

	#endif
		
	fft2D(temp2,temp4,temp9,temp5,N,fft_2D,fft_2D_clean,mat_trans,mat_trans_3D,queue,-1);
	
//-----------------------------
//linvzk = temp4
//-----------------------------


//-----------------------------------	
//pk = temp5, temp8 = rk
//-------------------------------------




	clEnqueueCopyBuffer(queue,temp3,temp5,
		0,0,
		sizeof(float)*N*N*2,0,NULL,NULL);
	//vec_copy(rhs,temp8,2*N*N,p_knl,queue);
	clEnqueueCopyBuffer(queue,rhs,temp8,
		0,0,
		sizeof(float)*N*N*2,0,NULL,NULL);

//------------------
//temp2 = xk
//--------------------
	vec__zero(temp2,(2*N*N),vec_zero,queue);

	CALL_CL_GUARDED(clFinish, (queue));
	//printf("I am here!\n");
	#if 0
	float test;
	CALL_CL_GUARDED(clFinish, (queue));
	CALL_CL_GUARDED(clEnqueueReadBuffer, (
        	queue, temp4, /*blocking*/ CL_TRUE, /*offset*/ 0,
       		sizeof(float), &test,
        	0, NULL, NULL));
	

		printf("test success and %f \n",test);		
	

	#endif
	float ipnew = reduction_mult(temp4, rhs,temp9,(N*N),reduct_mul,reduct,queue);
	
	float ipold;

	
	float reside = 1;
	int iter = 0;
	float cgalpha;
	float beta;

	
		
	
	while(reside > p_param->cgtol && iter < p_param->maxCG)
	{
		ipold = ipnew;
		//printf("ipold = %f\n",ipnew);
		//fft((3*u.^2 -1) .* pk).*nlap_s

		fft_w(temp, temp5, temp7,temp4,N,p_param->epsilon,k,p_param->s,fft_2D,fft_init_w,fft_2D_clean,mat_trans,mat_trans_3D,queue);





	
		fft2D(temp4, temp6,temp9,temp3,N,fft_2D,fft_2D_clean,mat_trans,mat_trans_3D,queue,-1);

	#if 0
		float test;
		CALL_CL_GUARDED(clFinish, (queue));
		CALL_CL_GUARDED(clEnqueueReadBuffer, (
		queue, temp6, /*blocking*/ CL_TRUE, /*offset*/ 0,
		sizeof(float), &test,
		0, NULL, NULL));


		printf("test success and Apk =  %f \n",test*k/p_param->epsilon);		


		#endif	
		//---------------------------
		// temp4 = pk + Apk1
		//---------------------------
		vec__add(temp5, temp6, temp4,1,k/p_param->epsilon,2*N*N,vec_add, queue);

    		//Apk2 = -alpha*k*epsilon*real(ifft2(fft2(pk).*sharmonic));
		fft_shar(temp5, temp6,temp9,temp7,N, p_param->epsilon,k,p_param->s,fft_2D,fft_2D_clean,mat_trans,mat_trans_3D,queue);
		fft2D(temp6, temp7,temp9,temp3,N,fft_2D,fft_2D_clean,mat_trans,mat_trans_3D,queue,-1);
		//temp4 = Apk 
		vec__add(temp4, temp7, temp4, 1, -k*p_param->epsilon, 2*N*N,vec_add, queue);

		//linvpk = real(ifft2(fft2(pk)./nlap_s2)); temp6 = linvpk
		fft_d_nlaps2(temp5,temp7,temp9,temp3,N,p_param->epsilon,k,p_param->s,fft_2D,fft_2D_clean,mat_trans,mat_trans_3D,queue);
		fft2D(temp7,temp6,temp9,temp3,N,fft_2D,fft_2D_clean,mat_trans,mat_trans_3D,queue,-1);

		if( ipold > 1e-9) 
			cgalpha = ipold / reduction_mult(temp6, temp4,temp9,N*N,reduct_mul,reduct,queue);
		else cgalpha =0;

		vec__add(temp2, temp5,temp2,1, cgalpha,2*N*N,vec_add,queue);

//---------------------
// update temp8 =rk
//---------------------

		vec__add(temp8 , temp4, temp8, 1, -cgalpha,2*N*N,vec_add,queue);

		//temp3 = zk
		fft_d_q(temp8,temp6,temp9,temp3,N,p_param->epsilon,k,p_param->s,fft_2D,fft_2D_clean,mat_trans,mat_trans_3D,queue);
		fft2D(temp6,temp3,temp9,temp4,N,fft_2D,fft_2D_clean,mat_trans,mat_trans_3D,queue,-1);

		//linvzk = real(ifft2(fft2(zk)./nlap_s2)); temp6 = linvzk
		fft_d_nlaps2(temp3,temp7,temp9,temp4,N,p_param->epsilon,k,p_param->s,fft_2D,fft_2D_clean,mat_trans,mat_trans_3D,queue);
		fft2D(temp7,temp6,temp9,temp4,N,fft_2D,fft_2D_clean,mat_trans,mat_trans_3D,queue,-1);
		
		ipnew = reduction_mult(temp8, temp6,temp9,N*N,reduct_mul,reduct,queue);
		CALL_CL_GUARDED(clFinish, (queue));
		//printf("ipnew = %f\n",ipnew);
	#if 0
		float test;
		CALL_CL_GUARDED(clFinish, (queue));
		CALL_CL_GUARDED(clEnqueueReadBuffer, (
		queue, temp8, /*blocking*/ CL_TRUE, /*offset*/ 0,
		sizeof(float), &test,
		0, NULL, NULL));


		printf("test success and %f \n",test);		


		#endif	

		if (ipold >1e-9)
		beta = ipnew /ipold;
		else beta =0;
		vec__add(temp3,temp5,temp5,1,beta,2*N*N,vec_add,queue);
				
//printf("In cg step here!\n");
		reside = residual(temp8,temp9,resid,resid_init,queue,N*N);
		CALL_CL_GUARDED(clFinish, (queue));
		//printf("cg residual is %f\n",reside);
		iter ++;
  		p_param->cgloc ++;
	}

	if( reside > p_param->cgtol)
	{
		printf("too many CG steps\n");
		*fail = true;
	}



}
Exemplo n.º 30
0
float reduct_energy(cl_mem a,cl_mem b, cl_mem c,cl_mem d, int N, float epsilon, 
			cl_kernel reduct_eng,cl_kernel reduct, cl_command_queue queue)
{
	int n = N ;
	float output;
//CALL_CL_GUARDED(clFinish, (queue));
		//printf("aha, n = %d\n",n);
	
	if(n > 128)
	{	
		SET_6_KERNEL_ARGS(reduct_eng, a, b, c, d,n,epsilon);
		size_t ldim[] = { 128 };
		size_t gdim[] = { n };
		CALL_CL_GUARDED(clEnqueueNDRangeKernel,
			(queue, reduct_eng,
			/*dimensions*/ 1, NULL, gdim, ldim,
			0, NULL, NULL));
		n /= 128;
			
		CALL_CL_GUARDED(clFinish, (queue));
		//printf("aha, n = %d\n",n);
		while(n>=128)
		{
			SET_2_KERNEL_ARGS(reduct, d, n);
			size_t ldim[] = { 128 };
			size_t gdim[] = { n };
			CALL_CL_GUARDED(clEnqueueNDRangeKernel,
				(queue, reduct,
				/*dimensions*/ 1, NULL, gdim, ldim,
				0, NULL, NULL));
			n /= 128;
		CALL_CL_GUARDED(clFinish, (queue));
		//printf("aha, n = %d\n",n);

		}

		if(n != 1)
		{
			SET_2_KERNEL_ARGS(reduct, d, n);
			size_t ldim[] = { n };
			size_t gdim[] = { n };
			CALL_CL_GUARDED(clEnqueueNDRangeKernel,
				(queue, reduct,
				/*dimensions*/ 1, NULL, gdim, ldim,
				0, NULL, NULL));
		}
					
		CALL_CL_GUARDED(clEnqueueReadBuffer, (
        		queue, d, /*blocking*/ CL_TRUE, /*offset*/ 0,
       			sizeof(float), &output,
        		0, NULL, NULL));

	}
	else 
	{
		SET_6_KERNEL_ARGS(reduct_eng, a, b, c, d,n,epsilon);
		size_t ldim[] = { n };
		size_t gdim[] = { n };
		CALL_CL_GUARDED(clEnqueueNDRangeKernel,
			(queue, reduct_eng,
			/*dimensions*/ 1, NULL, gdim, ldim,
			0, NULL, NULL));
		CALL_CL_GUARDED(clEnqueueReadBuffer, (
        		queue, d, /*blocking*/ CL_TRUE, /*offset*/ 0,
       			sizeof(float), &output,
        		0, NULL, NULL));
	}
	CALL_CL_GUARDED(clFinish, (queue));
	//printf("reduct energy = %f\n",output);
	return output;
}