cl_kernel kernel_from_string(cl_context ctx, char const *knl, char const *knl_name, char const *options) { // create an OpenCL program (may have multiple kernels) size_t sizes[] = { strlen(knl) }; cl_int status; cl_program program = clCreateProgramWithSource(ctx, 1, &knl, sizes, &status); CHECK_CL_ERROR(status, "clCreateProgramWithSource"); // build it status = clBuildProgram(program, 0, NULL, options, NULL, NULL); if (status != CL_SUCCESS) { // build failed, get build log and print it cl_device_id dev; CALL_CL_GUARDED(clGetProgramInfo, (program, CL_PROGRAM_DEVICES, sizeof(dev), &dev, NULL)); size_t log_size; CALL_CL_GUARDED(clGetProgramBuildInfo, (program, dev, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); char *log = (char *) malloc(log_size); CHECK_SYS_ERROR(!log, "kernel_from_string: allocate log"); char devname[MAX_NAME_LEN]; CALL_CL_GUARDED(clGetDeviceInfo, (dev, CL_DEVICE_NAME, sizeof(devname), devname, NULL)); CALL_CL_GUARDED(clGetProgramBuildInfo, (program, dev, CL_PROGRAM_BUILD_LOG, log_size, log, NULL)); fprintf(stderr, "*** build of '%s' on '%s' failed:\n%s\n*** (end of error)\n", knl_name, devname, log); abort(); } else CHECK_CL_ERROR(status, "clBuildProgram"); // fish the kernel out of the program cl_kernel kernel = clCreateKernel(program, knl_name, &status); CHECK_CL_ERROR(status, "clCreateKernel"); CALL_CL_GUARDED(clReleaseProgram, (program)); return kernel; }
std::string get_event_type(cl_event event) { cl_command_type type; CALL_CL_GUARDED(clGetEventInfo, (event, CL_EVENT_COMMAND_TYPE, sizeof(type), &type, NULL)); std::string result; switch (type) { case CL_COMMAND_NDRANGE_KERNEL: result = "NDRANGE_KERNEL"; break; case CL_COMMAND_TASK: result = "TASK"; break; case CL_COMMAND_NATIVE_KERNEL: result = "NATIVE_KERNEL"; break; case CL_COMMAND_READ_BUFFER: result = "READ_BUFFER"; break; case CL_COMMAND_WRITE_BUFFER: result = "WRITE_BUFFER"; break; case CL_COMMAND_COPY_BUFFER: result = "COPY_BUFFER"; break; case CL_COMMAND_READ_IMAGE: result = "READ_IMAGE"; break; case CL_COMMAND_WRITE_IMAGE: result = "WRITE_IMAGE"; break; case CL_COMMAND_COPY_IMAGE: result = "COPY_IMAGE"; break; case CL_COMMAND_COPY_BUFFER_TO_IMAGE: result = "COPY_BUFFER_TO_IMAGE"; break; case CL_COMMAND_COPY_IMAGE_TO_BUFFER: result = "COPY_IMAGE_TO_BUFFER"; break; case CL_COMMAND_MAP_BUFFER: result = "MAP_BUFFER"; break; case CL_COMMAND_MAP_IMAGE: result = "MAP_IMAGE"; break; case CL_COMMAND_UNMAP_MEM_OBJECT: result = "UNMAP_MEM_OBJECT"; break; case CL_COMMAND_MARKER: result = "MARKER"; break; case CL_COMMAND_ACQUIRE_GL_OBJECTS: result = "ACQUIRE_GL_OBJECTS"; break; case CL_COMMAND_RELEASE_GL_OBJECTS: result = "RELEASE_GL_OBJECTS"; break; } return result; }
void print_device_info_from_queue(cl_command_queue queue) { cl_device_id dev; CALL_CL_GUARDED(clGetCommandQueueInfo, (queue, CL_QUEUE_DEVICE, sizeof dev, &dev, NULL)); print_device_info(dev); }
//TODO remove this at some point (deprecated) cl_kernel kernel_from_string(cl_context ctx, char const *knl, char const *knl_name, char const *options) { size_t sizes[] = { strlen(knl) }; cl_int status; cl_program program = clCreateProgramWithSource(ctx, 1, &knl, sizes, &status); CHECK_CL_ERROR(status, "clCreateProgramWithSource"); status = clBuildProgram(program, 0, NULL, options, NULL, NULL); if (status != CL_SUCCESS) { // build failed, get build log. cl_device_id dev; CALL_CL_GUARDED(clGetProgramInfo, (program, CL_PROGRAM_DEVICES, sizeof(dev), &dev, NULL)); size_t log_size; CALL_CL_GUARDED(clGetProgramBuildInfo, (program, dev, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); char *log = malloc(log_size); CHECK_SYS_ERROR(!log, "kernel_from_string: allocate log"); char devname[100]; CALL_CL_GUARDED(clGetDeviceInfo, (dev, CL_DEVICE_NAME, sizeof(devname), devname, NULL)); CALL_CL_GUARDED(clGetProgramBuildInfo, (program, dev, CL_PROGRAM_BUILD_LOG, log_size, log, NULL)); THError("*** build of '%s' on '%s' failed:\n%s\n*** (end of error)\n", knl_name, devname, log); } else CHECK_CL_ERROR(status, "clBuildProgram"); cl_kernel kernel = clCreateKernel(program, knl_name, &status); CHECK_CL_ERROR(status, "clCreateKernel"); CALL_CL_GUARDED(clReleaseProgram, (program)); return kernel; }
void get_device_name_from_queue(cl_command_queue queue, char * buf, int bufsize) { cl_device_id dev; CALL_CL_GUARDED(clGetCommandQueueInfo, (queue, CL_QUEUE_DEVICE, sizeof dev, &dev, NULL)); clGetDeviceInfo(dev, CL_DEVICE_NAME, bufsize, buf, NULL); }
void fft_1D(cl_mem a,cl_mem b,cl_mem c, int N, cl_kernel init, cl_kernel knl,cl_command_queue queue,int direction,int offset_line) { //handle complex-to-complex fft, accutal size = 2 * N //size_t ldim[] = { 128 }; //size_t gdim[] = { (N /ldim[0])/2}; int Ns = 1; int y =0; SET_7_KERNEL_ARGS(init, a, b, N, Ns,direction,offset_line,y); size_t ldim[] = { 1 }; size_t gdim[] = { N/4 }; CALL_CL_GUARDED(clEnqueueNDRangeKernel, (queue, init, 1, NULL, gdim, ldim, 0, NULL, NULL)); for(Ns=4; Ns<N; Ns<<=2) { SET_6_KERNEL_ARGS(knl, b, c, N, Ns,direction,offset_line); size_t ldim[] = { 1 }; size_t gdim[] = { N/4 }; CALL_CL_GUARDED(clEnqueueNDRangeKernel, (queue, knl, 1, NULL, gdim, ldim, 0, NULL, NULL)); clEnqueueCopyBuffer(queue,c,b, offset_line*N*2*sizeof(float), offset_line*N*2*sizeof(float), sizeof(float)*N*2,0,NULL,NULL); //VecCopy(c,b,N,offset_line,vec_copy,queue); } }
void update_walker_positions_device(sampler *samp){ /* Update walker positions and corresponding PDF values on device. Input: sampler *samp Pointer to sampler structure which has been initialized. Output: Walker positions updated on device. Log PDF values updated on device. */ // -------------------------------------------------------------------------- // transfer to device // -------------------------------------------------------------------------- CALL_CL_GUARDED(clEnqueueWriteBuffer, ( samp->queue, samp->X_red_device, /*blocking*/ CL_TRUE, /*offset*/ 0, samp->N * samp->K_over_two * sizeof(cl_float), samp->X_red_host, 0, NULL, NULL)); CALL_CL_GUARDED(clEnqueueWriteBuffer, ( samp->queue, samp->log_pdf_red_device, /*blocking*/ CL_TRUE, /*offset*/ 0, samp->K_over_two * sizeof(cl_float), samp->log_pdf_red_host, 0, NULL, NULL)); CALL_CL_GUARDED(clEnqueueWriteBuffer, ( samp->queue, samp->X_black_device, /*blocking*/ CL_TRUE, /*offset*/ 0, samp->N * samp->K_over_two * sizeof(cl_float), samp->X_black_host, 0, NULL, NULL)); CALL_CL_GUARDED(clEnqueueWriteBuffer, ( samp->queue, samp->log_pdf_black_device, /*blocking*/ CL_TRUE, /*offset*/ 0, samp->K_over_two * sizeof(cl_float), samp->log_pdf_black_host, 0, NULL, NULL)); CALL_CL_GUARDED(clFinish, (samp->queue)); if(OUTPUT_LEVEL > 0) printf("Walker update to device completed.\n"); }
void print_profile(cl_event event, cl_int status, void* data) { cl_long t_enqueue, t_submit, t_start, t_end; CALL_CL_GUARDED(clGetEventProfilingInfo, (event, CL_PROFILING_COMMAND_QUEUED, sizeof(t_enqueue), &t_enqueue, NULL)); CALL_CL_GUARDED(clGetEventProfilingInfo, (event, CL_PROFILING_COMMAND_SUBMIT, sizeof(t_submit), &t_submit, NULL)); CALL_CL_GUARDED(clGetEventProfilingInfo, (event, CL_PROFILING_COMMAND_START, sizeof(t_start), &t_start, NULL)); CALL_CL_GUARDED(clGetEventProfilingInfo, (event, CL_PROFILING_COMMAND_END, sizeof(t_end), &t_end, NULL)); std::cout << get_event_type(event) << " status: " << status << std::endl; std::cout << "time on queue : " << (t_submit - t_enqueue) / 1.0e9 << "s" << std::endl; std::cout << "time submision: " << (t_start - t_submit) / 1.0e9 << "s" << std::endl; std::cout << "time execution: " << (t_end - t_start) / 1.0e9 << "s" << std::endl; }
void mat__trans(cl_mem a, cl_mem b, int N, cl_kernel mat_trans, cl_command_queue queue,int option, float epsilon,float k,float s) { cl_long offset = 0; SET_8_KERNEL_ARGS(mat_trans, a, b, N, option,epsilon,k,s,offset); size_t ldim[] = { 16, 16 }; size_t gdim[] = { N, N }; CALL_CL_GUARDED(clEnqueueNDRangeKernel, (queue, mat_trans, /*dimensions*/ 2, NULL, gdim, ldim, 0, NULL, NULL)); }
void print_platforms_devices() { // get number of platforms cl_uint plat_count; CALL_CL_GUARDED(clGetPlatformIDs, (0, NULL, &plat_count)); // allocate memory, get list of platforms cl_platform_id *platforms = (cl_platform_id *) malloc(plat_count*sizeof(cl_platform_id)); CHECK_SYS_ERROR(!platforms, "allocating platform array"); CALL_CL_GUARDED(clGetPlatformIDs, (plat_count, platforms, NULL)); // iterate over platforms for (cl_uint i = 0; i < plat_count; ++i) { // get platform vendor name char buf[MAX_NAME_LEN]; CALL_CL_GUARDED(clGetPlatformInfo, (platforms[i], CL_PLATFORM_VENDOR, sizeof(buf), buf, NULL)); printf("platform %d: vendor '%s'\n", i, buf); // get number of devices in platform cl_uint dev_count; CALL_CL_GUARDED(clGetDeviceIDs, (platforms[i], CL_DEVICE_TYPE_ALL, 0, NULL, &dev_count)); cl_device_id *devices = (cl_device_id *) malloc(dev_count*sizeof(cl_device_id)); CHECK_SYS_ERROR(!devices, "allocating device array"); // get list of devices in platform CALL_CL_GUARDED(clGetDeviceIDs, (platforms[i], CL_DEVICE_TYPE_ALL, dev_count, devices, NULL)); // iterate over devices for (cl_uint j = 0; j < dev_count; ++j) { char buf[MAX_NAME_LEN]; CALL_CL_GUARDED(clGetDeviceInfo, (devices[j], CL_DEVICE_NAME, sizeof(buf), buf, NULL)); printf(" device %d: '%s'\n", j, buf); } free(devices); } free(platforms); }
//TODO move this to the routine opencl.printAllPlatform() void print_platforms_devices() { cl_uint plat_count; CALL_CL_GUARDED(clGetPlatformIDs, (0, NULL, &plat_count)); cl_platform_id *platforms = (cl_platform_id *) malloc(plat_count*sizeof(cl_platform_id)); CHECK_SYS_ERROR(!platforms, "allocating platform array"); CALL_CL_GUARDED(clGetPlatformIDs, (plat_count, platforms, NULL)); cl_uint i; for (i = 0; i < plat_count; ++i) { char buf[100]; CALL_CL_GUARDED(clGetPlatformInfo, (platforms[i], CL_PLATFORM_VENDOR, sizeof(buf), buf, NULL)); printf("plat %d: vendor '%s'\n", i, buf); cl_uint dev_count; CALL_CL_GUARDED(clGetDeviceIDs, (platforms[i], CL_DEVICE_TYPE_ALL, 0, NULL, &dev_count)); cl_device_id *devices = (cl_device_id *) malloc(dev_count*sizeof(cl_device_id)); CHECK_SYS_ERROR(!devices, "allocating device array"); CALL_CL_GUARDED(clGetDeviceIDs, (platforms[i], CL_DEVICE_TYPE_ALL, dev_count, devices, NULL)); cl_uint j; for (j = 0; j < dev_count; ++j) { char buf[100]; CALL_CL_GUARDED(clGetDeviceInfo, (devices[j], CL_DEVICE_NAME, sizeof(buf), buf, NULL)); printf(" dev %d '%s'\n", j, buf); } free(devices); } free(platforms); }
void run_simulated_annealing(sampler *samp, cl_float *cooling_schedule, cl_int annealing_loops, cl_int steps_per_loop){ /* Run the simulated annealing to allow the walkers to explore the space and (hopefully) increase convergence speed. Discard all the samples generated by this routine. Reset all the counters for acceptance rates. Input: sampler *samp Pointer to sampler structure which has been initialized. cl_float *cooling_schedule Values of beta for the simulated annealing Values should be increasing and the final value should be one cl_int annealing_loops Number of loops cl_int steps_per_loop Iterations per loop Output: Pre-allocated sampler arrays now have had simulated annealing performed. */ for(int annealing_step=0; annealing_step<annealing_loops; annealing_step++){ // set the beta value for this iteration (samp->data_st)->beta = cooling_schedule[annealing_step]; (samp->data_st)->save = 0; // update the data structure accordingly CALL_CL_GUARDED(clEnqueueWriteBuffer, ( samp->queue, samp->data_st_device, /*blocking*/ CL_TRUE, /*offset*/ 0, sizeof(data_struct), samp->data_st, 0, NULL, NULL)); CALL_CL_GUARDED(clFinish, (samp->queue)); for(int it=0; it<steps_per_loop; it++){ SET_9_KERNEL_ARGS(samp->stretch_knl, samp->X_red_device, samp->log_pdf_red_device, samp->X_black_device, samp->ranluxcltab, samp->accepted_device, samp->data_device, samp->data_st_device, samp->indices_to_save_device, samp->X_red_save); CALL_CL_GUARDED(clEnqueueNDRangeKernel, (samp->queue, samp->stretch_knl, 1, NULL, samp->gdim, samp->ldim, 0, NULL, NULL)); SET_9_KERNEL_ARGS(samp->stretch_knl, samp->X_black_device, samp->log_pdf_black_device, samp->X_red_device, samp->ranluxcltab, samp->accepted_device, samp->data_device, samp->data_st_device, samp->indices_to_save_device, samp->X_black_save); CALL_CL_GUARDED(clEnqueueNDRangeKernel, (samp->queue, samp->stretch_knl, 1, NULL, samp->gdim, samp->ldim, 0, NULL, NULL)); CALL_CL_GUARDED(clFinish, (samp->queue)); } if(OUTPUT_LEVEL > 0) printf("Annealing iteration %d\n", annealing_step * steps_per_loop); } // reset the acceptance counter after the annealing for(int i=0; i< (samp->K_over_two); i++) samp->accepted_host[i] = 0; CALL_CL_GUARDED(clEnqueueWriteBuffer, ( samp->queue, samp->accepted_device, /*blocking*/ CL_TRUE, /*offset*/ 0, samp->K_over_two * sizeof(cl_ulong), samp->accepted_host, 0, NULL, NULL)); CALL_CL_GUARDED(clFinish, (samp->queue)); }
void run_burn_in(sampler *samp, int burn_length){ /* Run the sampler to burn in. Discard all the samples generated by this routine. Reset all the counters for acceptance rates. Input: sampler *samp Pointer to sampler structure which has been initialized. int burn_length Number of burn in steps to run. Output: Pre-allocated sampler arrays now have had burn-in performed. */ // reset beta (samp->data_st)->beta = 1.0f; (samp->data_st)->save = 0; // update the data structure accordingly CALL_CL_GUARDED(clEnqueueWriteBuffer, ( samp->queue, samp->data_st_device, /*blocking*/ CL_TRUE, /*offset*/ 0, sizeof(data_struct), samp->data_st, 0, NULL, NULL)); CALL_CL_GUARDED(clFinish, (samp->queue)); // do the burn in for(int it=0; it<burn_length; it++){ SET_9_KERNEL_ARGS(samp->stretch_knl, samp->X_red_device, samp->log_pdf_red_device, samp->X_black_device, samp->ranluxcltab, samp->accepted_device, samp->data_device, samp->data_st_device, samp->indices_to_save_device, samp->X_red_save ); CALL_CL_GUARDED(clEnqueueNDRangeKernel, (samp->queue, samp->stretch_knl, 1, NULL, samp->gdim, samp->ldim, 0, NULL, NULL)); SET_9_KERNEL_ARGS(samp->stretch_knl, samp->X_black_device, samp->log_pdf_black_device, samp->X_red_device, samp->ranluxcltab, samp->accepted_device, samp->data_device, samp->data_st_device, samp->indices_to_save_device, samp->X_black_save); CALL_CL_GUARDED(clEnqueueNDRangeKernel, (samp->queue, samp->stretch_knl, 1, NULL, samp->gdim, samp->ldim, 0, NULL, NULL)); CALL_CL_GUARDED(clFinish, (samp->queue)); if( ((it % MAX((burn_length/10),1)) == 0) && (OUTPUT_LEVEL > 0)) printf("Burn iteration %d\n", it); } // make sure everything is done with the burn in CALL_CL_GUARDED(clFinish, (samp->queue)); // reset the acceptance counter after the burn in for(int i=0; i< (samp->K_over_two); i++) samp->accepted_host[i] = 0; CALL_CL_GUARDED(clEnqueueWriteBuffer, ( samp->queue, samp->accepted_device, /*blocking*/ CL_TRUE, /*offset*/ 0, samp->K_over_two * sizeof(cl_ulong), samp->accepted_host, 0, NULL, NULL)); CALL_CL_GUARDED(clFinish, (samp->queue)); if(OUTPUT_LEVEL > 0) printf("Burn in complete.\n"); }
sampler* initialize_sampler(cl_int chain_length, cl_int dimension, cl_int walkers_per_group, size_t work_group_size, double a, cl_int pdf_number, cl_int data_length, cl_float *data, cl_int num_to_save, cl_int *indices_to_save, const char *plat_name, const char *dev_name){ /* Initialize stretch move MCMC sampler struct. Arrange parameters into sampler struct pointer. Allocate arrays on host, initialize walkers and other values as appropriate. Start OpenCL context and queue. Allocate device memory and transfer from host. Compile and initialize random number generator. Compile stretch move OpenCL kernel. Input: cl_int chain_length Allocate space for this many samples in the sampler struct. Sampler fills this array when run_sampler is called. cl_int dimension Dimension of state vector of Markov chain. cl_int walkers_per_group Number of walkers in each of two groups. Total walkers is twice this. size_t work_group_size Work group size. For CPU this must be set to one. For GPU this should be set larger, powers of two are optimal, try 64, 128 or 256. This number must divide walkers_per_group. double a Coefficient for range of 'z' random variable. Must be greater than one. Standard value is 2. Decrease a to increase low acceptance rate, especially in high dimensions. cl_int pdf_number Which PDF to sample. Passed to pdf.h as a compile time definition. cl_int data_length Length of observation data. If no data set this to zero. cl_float *data Observation data. cl_int num_to_save Number of components to save in the chain cl_int *indices_to_save Indices of components to save in the chain const char *plat_name String for platform name. Set to CHOOSE_INTERACTIVELY (no quotes) to do so. const char *dev_name String for device name. Set to CHOOSE_INTERACTIVELY (no quotes) to do so. Output: returned: sampler *samp Pointer to sampler struct with parameters, arrays, context, queue, kernel initialized. */ if(OUTPUT_LEVEL > 0) printf("Initializing Stretch Move sampler.\n"); // -------------------------------------------------------------------------- // Set parameters // -------------------------------------------------------------------------- // This environment variable forces headers to be reloaded each time // If not set and pdf if changed, changes may not be updated setenv("CUDA_CACHE_DISABLE", "1", 1); // allocate the structure for all the sampler parameters and arrays sampler * samp = (sampler *) malloc(sizeof(sampler)); if(!samp) { perror("Allocation failure sampler"); abort(); } // user set parameters samp->M = chain_length; // Number of steps to run samp->N = dimension; // Dimension of the problem and the walkers samp->K_over_two = walkers_per_group ; // Number of walkers in each group // derived parameters samp->K = 2 * samp->K_over_two; // Total walkers samp->total_samples = samp->M * samp->K; // Total samples produced // indices to save samp->num_to_save = num_to_save; samp->indices_to_save_host = indices_to_save; // Allocate the structure and set values samp->data_st = (data_struct *) malloc(sizeof(data_struct)); if(!(samp->data_st)) { perror("Allocation failure data_struct"); abort(); } // default value one, unless performing simulated annealing (samp->data_st)->beta = 1.0f; (samp->data_st)->save = 1; (samp->data_st)->num_to_save = num_to_save; // coefficient on Z random variable samp->a = a; double a_coeffs[3]; a_coeffs[0] = 1.0 / a; a_coeffs[1] = 2.0 * (1.0 - 1.0/a); a_coeffs[2] = a - 2.0 + 1.0/a; // error check on dimensions if(samp->K <= samp->N){ fprintf(stderr, "Error: Must have more walkers than the dimension.\nExiting\n"); abort(); } // error check on work sizes if( (samp->K_over_two % work_group_size) != 0){ fprintf(stderr, "Error: Number of walkers in each group must be multiple of work group size.\nExiting\n"); abort(); } // error check on dimensions to save for(int i=0; i<num_to_save; i++){ if(samp->indices_to_save_host[i] >= samp->N){ fprintf(stderr, "Error: Cannot save an index larger than the dimension of the problem.\nExiting\n"); abort(); } } if(a <= 1.0){ fprintf(stderr, "Error: Value of a must be greater than one.\nDefaulting to 2.\n"); samp->a = 2.0; } // for later output samp->acor_times = (double *) malloc(samp->num_to_save * sizeof(double)); if(!samp->acor_times) { perror("Allocation failure"); abort(); } samp->acor_pass = (char *) malloc(samp->num_to_save * sizeof(char)); if(!samp->acor_pass) { perror("Allocation failure"); abort(); } samp->sigma = (double *) malloc(samp->num_to_save * sizeof(double)); if(!samp->sigma) { perror("Allocation failure"); abort(); } samp->means = (double *) malloc(samp->num_to_save * sizeof(double)); if(!samp->means) { perror("Allocation failure"); abort(); } samp->err_bar = (double *) malloc(samp->num_to_save * sizeof(double)); if(!samp->err_bar) { perror("Allocation failure"); abort(); } // write parameter file for plotting write_parameter_file_matlab(samp->M, samp->N, samp->K, "Stretch Move", samp->indices_to_save_host, samp->num_to_save, pdf_number); // -------------------------------------------------------------------------- // Set up OpenCL context and queues // -------------------------------------------------------------------------- if(OUTPUT_LEVEL > 0) printf("Begin opencl contexts.\n"); create_context_on(plat_name, dev_name, 0, &(samp->ctx), NULL, 0); { cl_int status; cl_device_id my_dev; CALL_CL_GUARDED(clGetContextInfo, (samp->ctx, CL_CONTEXT_DEVICES, sizeof(my_dev), &my_dev, NULL)); samp->queue = clCreateCommandQueue(samp->ctx, my_dev, 0, &status); CHECK_CL_ERROR(status, "clCreateCommandQueue"); samp->queue_mem = clCreateCommandQueue(samp->ctx, my_dev, 0, &status); CHECK_CL_ERROR(status, "clCreateCommandQueue"); } // print information on selected device if(OUTPUT_LEVEL > 1) print_device_info_from_queue(samp->queue); // set the work group sizes samp->ldim[0] = work_group_size; samp->gdim[0] = samp->K_over_two; if(OUTPUT_LEVEL > 0) printf("Context built.\n"); // -------------------------------------------------------------------------- // Start total timing // -------------------------------------------------------------------------- if(OUTPUT_LEVEL > 0) printf("Begin total timing.\n"); get_timestamp(&(samp->time1_total)); // -------------------------------------------------------------------------- // Allocate host memory // -------------------------------------------------------------------------- // counter for number of samples accepted samp->accepted_host = (cl_ulong *) malloc(samp->K_over_two * sizeof(cl_ulong)); if(!(samp->accepted_host)){ perror("Allocation failure accepted host"); abort(); } for(int i=0; i< (samp->K_over_two); i++) samp->accepted_host[i] = 0; // Adjacent memory on x_red moves with in the walker // To access the ith component of walker j, take x_red[i + j*N]; // red walkers samp->X_red_host = (cl_float *) malloc(samp->N * samp->K_over_two * sizeof(cl_float)); if(!(samp->X_red_host)){ perror("Allocation failure X_red_host"); abort(); } // log likelihood samp->log_pdf_red_host = (cl_float *) malloc(samp->K_over_two * sizeof(cl_float)); if(!(samp->log_pdf_red_host)){ perror("Allocation failure X_red_host"); abort(); } for(int i=0; i<(samp->K_over_two); i++) samp->log_pdf_red_host[i] = (-1.0f) / 0.0f; // black walkers samp->X_black_host = (cl_float *) malloc(samp->N * samp->K_over_two * sizeof(cl_float)); if(!(samp->X_black_host)){ perror("Allocation failure X_black_host"); abort(); } // log likelihood samp->log_pdf_black_host = (cl_float *) malloc(samp->K_over_two * sizeof(cl_float)); if(!(samp->log_pdf_black_host)){ perror("Allocation failure X_red_host"); abort(); } for(int i=0; i< (samp->K_over_two); i++) samp->log_pdf_black_host[i] = (-1.0f) / 0.0f; // samples on host cl_int samples_length = samp->num_to_save * samp->M * samp->K; // length of the samples array samp->samples_host = (cl_float *) malloc(samples_length * sizeof(cl_float)); // samples to return if(!(samp->samples_host)){ perror("Allocation failure samples_host"); abort(); } // intialize the walkers to random values // set the seed value srand48(0); // initialize the walkers to small random values for(int j=0; j < samp->N * samp->K_over_two; j++){ if(NONNEGATIVE_BOX){ samp->X_black_host[j] = (cl_float) drand48(); samp->X_red_host[j] = (cl_float) drand48(); } else{ samp->X_black_host[j] = (cl_float) (0.1 * (drand48()-0.5)); samp->X_red_host[j] = (cl_float) (0.1 * (drand48()-0.5)); } } // set up observations samp->data_length = data_length; // there are lots of complications that appear if this is empty // make it length one instead if(samp->data_length == 0){ samp->data_length = 1; samp->data_host = (cl_float *) malloc(samp->data_length * sizeof(cl_float)) ; if(!(samp->data_host)){ perror("Allocation failure data_host"); abort(); } samp->data_host[0] = 0.0f; } else{ // standard case samp->data_host = data; } // -------------------------------------------------------------------------- // load kernels // -------------------------------------------------------------------------- // stretch move kernel char *knl_text = read_file("stretch_move.cl"); char options[300]; sprintf(options, "-D NN=%d -D K_OVER_TWO=%d -D WORK_GROUP_SIZE=%d -D DATA_LEN=%d -D PDF_NUMBER=%d -D A_COEFF_0=%.10ff -D A_COEFF_1=%.10ff -D A_COEFF_2=%.10ff -I . ", samp->N, samp->K_over_two, (int) work_group_size, samp->data_length, pdf_number, a_coeffs[0], a_coeffs[1], a_coeffs[2]); if(OUTPUT_LEVEL > 0) printf("Options string for stretch move kernel:%s\n", options); samp->stretch_knl = kernel_from_string(samp->ctx, knl_text, "stretch_move", options); free(knl_text); if(OUTPUT_LEVEL > 0) printf("Stretch Move kernel compiled.\n"); // random number generator initialization char * knl_text_rand = read_file("Kernel_Ranluxcl_Init.cl"); char options_rand_lux[100]; if(AMD) sprintf(options_rand_lux, "-DRANLUXCL_LUX=4 -I ."); else sprintf(options_rand_lux, "-DRANLUXCL_LUX=4"); samp->init_rand_lux_knl = kernel_from_string(samp->ctx, knl_text_rand, "Kernel_Ranluxcl_Init", options_rand_lux); free(knl_text_rand); if(OUTPUT_LEVEL > 0) printf("Ranluxcl init kernel compiled.\n"); // -------------------------------------------------------------------------- // allocate device memory // -------------------------------------------------------------------------- cl_int status; samp->X_red_device = clCreateBuffer(samp->ctx, CL_MEM_READ_WRITE, sizeof(cl_float) * samp->N * samp->K_over_two, 0, &status); CHECK_CL_ERROR(status, "clCreateBuffer"); samp->log_pdf_red_device = clCreateBuffer(samp->ctx, CL_MEM_READ_WRITE, sizeof(cl_float) * samp->K_over_two, 0, &status); CHECK_CL_ERROR(status, "clCreateBuffer"); samp->X_red_save = clCreateBuffer(samp->ctx, CL_MEM_WRITE_ONLY, sizeof(cl_float) * samp->num_to_save * samp->K_over_two, 0, &status); CHECK_CL_ERROR(status, "clCreateBuffer"); samp->X_black_device = clCreateBuffer(samp->ctx, CL_MEM_READ_WRITE, sizeof(cl_float) * samp->N * samp->K_over_two, 0, &status); CHECK_CL_ERROR(status, "clCreateBuffer"); samp->log_pdf_black_device = clCreateBuffer(samp->ctx, CL_MEM_READ_WRITE, sizeof(cl_float) * samp->K_over_two, 0, &status); CHECK_CL_ERROR(status, "clCreateBuffer"); samp->X_black_save = clCreateBuffer(samp->ctx, CL_MEM_WRITE_ONLY, sizeof(cl_float) * samp->num_to_save * samp->K_over_two, 0, &status); CHECK_CL_ERROR(status, "clCreateBuffer"); samp->accepted_device = clCreateBuffer(samp->ctx, CL_MEM_READ_WRITE, samp->K_over_two * sizeof(cl_ulong), 0, &status); CHECK_CL_ERROR(status, "clCreateBuffer"); samp->indices_to_save_device = clCreateBuffer(samp->ctx, CL_MEM_READ_ONLY, samp->num_to_save * sizeof(cl_int), 0, &status); CHECK_CL_ERROR(status, "clCreateBuffer"); // allocate for the observations samp->data_device = clCreateBuffer(samp->ctx, CL_MEM_READ_WRITE, sizeof(cl_float) * samp->data_length, 0, &status); CHECK_CL_ERROR(status, "clCreateBuffer"); // data struct on device samp->data_st_device = clCreateBuffer(samp->ctx, CL_MEM_READ_WRITE, sizeof(data_struct), 0, &status); CHECK_CL_ERROR(status, "clCreateBuffer"); // allocate for the state array for randluxcl // use a 1d work group size_t rand_lux_state_buffer_size = samp->gdim[0] * 7 * sizeof(cl_float4); samp->ranluxcltab = clCreateBuffer(samp->ctx, CL_MEM_READ_WRITE, rand_lux_state_buffer_size, 0, &status); CHECK_CL_ERROR(status, "clCreateBuffer"); // -------------------------------------------------------------------------- // transfer to device // -------------------------------------------------------------------------- CALL_CL_GUARDED(clEnqueueWriteBuffer, ( samp->queue, samp->X_red_device, /*blocking*/ CL_TRUE, /*offset*/ 0, samp->N * samp->K_over_two * sizeof(cl_float), samp->X_red_host, 0, NULL, NULL)); CALL_CL_GUARDED(clEnqueueWriteBuffer, ( samp->queue, samp->log_pdf_red_device, /*blocking*/ CL_TRUE, /*offset*/ 0, samp->K_over_two * sizeof(cl_float), samp->log_pdf_red_host, 0, NULL, NULL)); CALL_CL_GUARDED(clEnqueueWriteBuffer, ( samp->queue, samp->X_black_device, /*blocking*/ CL_TRUE, /*offset*/ 0, samp->N * samp->K_over_two * sizeof(cl_float), samp->X_black_host, 0, NULL, NULL)); CALL_CL_GUARDED(clEnqueueWriteBuffer, ( samp->queue, samp->log_pdf_black_device, /*blocking*/ CL_TRUE, /*offset*/ 0, samp->K_over_two * sizeof(cl_float), samp->log_pdf_black_host, 0, NULL, NULL)); CALL_CL_GUARDED(clEnqueueWriteBuffer, ( samp->queue, samp->data_device, /*blocking*/ CL_TRUE, /*offset*/ 0, samp->data_length * sizeof(cl_float), samp->data_host, 0, NULL, NULL)); CALL_CL_GUARDED(clEnqueueWriteBuffer, ( samp->queue, samp->data_st_device, /*blocking*/ CL_TRUE, /*offset*/ 0, sizeof(data_struct), samp->data_st, 0, NULL, NULL)); CALL_CL_GUARDED(clEnqueueWriteBuffer, ( samp->queue, samp->indices_to_save_device, /*blocking*/ CL_TRUE, /*offset*/ 0, samp->num_to_save * sizeof(cl_int), samp->indices_to_save_host, 0, NULL, NULL)); CALL_CL_GUARDED(clFinish, (samp->queue)); // -------------------------------------------------------------------------- // Initialize random number generator // -------------------------------------------------------------------------- // int for state variable initialization cl_int ins = 1; SET_2_KERNEL_ARGS(samp->init_rand_lux_knl, ins, samp->ranluxcltab); CALL_CL_GUARDED(clEnqueueNDRangeKernel, (samp->queue, samp->init_rand_lux_knl, /*dimensions*/ 1, NULL, samp->gdim, samp->ldim, 0, NULL, NULL)); CALL_CL_GUARDED(clFinish, (samp->queue)); // -------------------------------------------------------------------------- // Sampler initialization is done // -------------------------------------------------------------------------- if(OUTPUT_LEVEL > 0) printf("Sampler initialized.\n"); return samp; }
void fft2D_new(cl_mem a, cl_mem c, cl_mem b,cl_mem d, int N, cl_kernel init,cl_kernel interm, cl_kernel fft1D,cl_kernel mat_trans, cl_command_queue queue,int direction) { #if 0 int Ns = 1; int y =0; int x =N*N; SET_7_KERNEL_ARGS(init, a, b, N, Ns,direction,y,y); size_t ldim[] = { 1 }; size_t gdim[] = { N*N/4 }; CALL_CL_GUARDED(clEnqueueNDRangeKernel, (queue, init, 1, NULL, gdim, ldim, 0, NULL, NULL)); #endif #if 1 int Ns = 1; int stride = 64; for(int blk=0; blk<stride;blk++) for(int j= 0;j<N/stride;j++) { int offset = blk*N/stride +j; int y =0; SET_7_KERNEL_ARGS(init, a, b, N, Ns,direction,offset,y); size_t ldim[] = { 1 }; size_t gdim[] = { N/4 }; CALL_CL_GUARDED(clEnqueueNDRangeKernel, (queue, init, 1, NULL, gdim, ldim, 0, NULL, NULL)); } #if 1 for(int blk=0; blk<stride;blk++) for(int j= 0;j<N/stride;j++) { int offset = blk*N/stride +j; if(N >= 4) { Ns = 4; SET_6_KERNEL_ARGS(interm, b, c, N, Ns,direction,offset); size_t ldim[] = { 16 }; size_t gdim[] = { N/4 }; CALL_CL_GUARDED(clEnqueueNDRangeKernel, (queue, interm, 1, NULL, gdim, ldim, 0, NULL, NULL)); } } clEnqueueCopyBuffer(queue,c,b, 0, 0, sizeof(float)*N*N*2,0,NULL,NULL); for(int blk=0; blk<stride;blk++) for(int j= 0;j<N/stride;j++) { int offset = blk*N/stride +j; if(N>=16) { Ns = 16; SET_6_KERNEL_ARGS(interm, b, c, N, Ns,direction,offset); size_t ldim[] = { 16 }; size_t gdim[] = { N/4 }; CALL_CL_GUARDED(clEnqueueNDRangeKernel, (queue, interm, 1, NULL, gdim, ldim, 0, NULL, NULL)); } } clEnqueueCopyBuffer(queue,c,b, 0, 0, sizeof(float)*N*N*2,0,NULL,NULL); if(N >=64) #endif for(Ns=64; Ns<N; Ns<<=2) { for(int blk=0; blk<stride;blk++) for(int j= 0;j<N/stride;j++) { int offset = blk*N/stride +j; SET_6_KERNEL_ARGS(fft1D, b, c, N, Ns,direction,offset); size_t ldim[] = { 1 }; size_t gdim[] = { N/4 }; CALL_CL_GUARDED(clEnqueueNDRangeKernel, (queue, fft1D, 1, NULL, gdim, ldim, 0, NULL, NULL)); //VecCopy(c,b,N,offset_line,vec_copy,queue); } clEnqueueCopyBuffer(queue,c,b, 0, 0, sizeof(float)*N*N*2,0,NULL,NULL); } #endif //CALL_CL_GUARDED(clFinish, (queue)); //printf("1D fine \n"); mat__trans(b,c,N,mat_trans,queue,0,1,1,1); #if 0 float test; CALL_CL_GUARDED(clFinish, (queue)); CALL_CL_GUARDED(clEnqueueReadBuffer, ( queue, c, /*blocking*/ CL_TRUE, /*offset*/ 2*sizeof(float)*N, sizeof(float), &test, 0, NULL, NULL)); printf("test = %f\n",test); #endif //CALL_CL_GUARDED(clFinish, (queue)); #if 0 for(int j= 0;j<N;j++) { fft_1D_new(c,b,d,N,init,interm,fft1D,queue,direction,j); } #endif #if 1 Ns = 1; for(int blk=0; blk<stride;blk++) for(int j= 0;j<N/stride;j++) { int offset = blk*N/stride +j; int y =0; SET_7_KERNEL_ARGS(init, c, b, N, Ns,direction,offset,y); size_t ldim[] = { 1 }; size_t gdim[] = { N/4 }; CALL_CL_GUARDED(clEnqueueNDRangeKernel, (queue, init, 1, NULL, gdim, ldim, 0, NULL, NULL)); } #if 1 for(int blk=0; blk<stride;blk++) for(int j= 0;j<N/stride;j++) { int offset = blk*N/stride +j; if(N >= 4) { Ns = 4; SET_6_KERNEL_ARGS(interm, b, d, N, Ns,direction,offset); size_t ldim[] = { 16 }; size_t gdim[] = { N/4 }; CALL_CL_GUARDED(clEnqueueNDRangeKernel, (queue, interm, 1, NULL, gdim, ldim, 0, NULL, NULL)); } } clEnqueueCopyBuffer(queue,d,b, 0, 0, sizeof(float)*N*N*2,0,NULL,NULL); for(int blk=0; blk<stride;blk++) for(int j= 0;j<N/stride;j++) { int offset = blk*N/stride +j; if(N>=16) { Ns = 16; SET_6_KERNEL_ARGS(interm, b, d, N, Ns,direction,offset); size_t ldim[] = { 16 }; size_t gdim[] = { N/4 }; CALL_CL_GUARDED(clEnqueueNDRangeKernel, (queue, interm, 1, NULL, gdim, ldim, 0, NULL, NULL)); } } clEnqueueCopyBuffer(queue,d,b, 0, 0, sizeof(float)*N*N*2,0,NULL,NULL); if(N >=64) #endif for(Ns=64; Ns<N; Ns<<=2) { for(int blk=0; blk<stride;blk++) for(int j= 0;j<N/stride;j++) { int offset = blk*N/stride +j; SET_6_KERNEL_ARGS(fft1D, b, d, N, Ns,direction,offset); size_t ldim[] = { 1 }; size_t gdim[] = { N/4 }; CALL_CL_GUARDED(clEnqueueNDRangeKernel, (queue, fft1D, 1, NULL, gdim, ldim, 0, NULL, NULL)); //VecCopy(c,b,N,offset_line,vec_copy,queue); } clEnqueueCopyBuffer(queue,d,b, 0, 0, sizeof(float)*N*N*2,0,NULL,NULL); } #endif //CALL_CL_GUARDED(clFinish, (queue)); if(direction == 1) mat__trans(b,c,N,mat_trans,queue,0,1,1,1); else mat__trans(b,c,N,mat_trans,queue,-1,1,1,1); }
int main (int argv, char **argc) { ///////////////////////// ////// SAME IN EVERY FILE ///////////////////////// // create context and command queue cl_context __sheets_context; cl_command_queue __sheets_queue; int _i; cl_int __cl_err; create_context_on(SHEETS_PLAT_NAME, SHEETS_DEV_NAME, 0, /* choose the first (only) available device */ &__sheets_context, &__sheets_queue, 0); // compile kernels for (_i = 0; _i < NKERNELS; _i++) { compiled_kernels[_i] = kernel_from_string(__sheets_context, kernel_strings[_i], kernel_names[_i], SHEETS_KERNEL_COMPILE_OPTS); } ////// [END] size_t __SIZE_wav = atoi(argc[1]); float wav[__SIZE_wav]; const char *file_name = "mytune.wav"; int in_thrsh_cnt = 0; timestamp_type st; timestamp_type end; get_timestamp(&st); for (_i = 0; _i < __SIZE_wav; _i++) { wav[_i] = (float) rand() / RAND_MAX; if (in_thrsh(wav[_i], 0.1112, 0.7888)) in_thrsh_cnt++; } get_timestamp(&end); printf("cpu execution took %f seconds\n", timestamp_diff_in_seconds(st, end)); get_timestamp(&st); ///////////////// ////// GFUNC CALL ///////////////// /// create variables for function arguments given as literals float __PRIM_band_restrict_ARG2 = 0.1112f; float __PRIM_band_restrict_ARG3 = 0.7888f; /// return array (always arg0) cl_mem __CLMEM_band_restrict_ARG0 = clCreateBuffer(__sheets_context, CL_MEM_WRITE_ONLY, sizeof(float) * __SIZE_wav, NULL, &__cl_err); CHECK_CL_ERROR(__cl_err, "clCreateBuffer"); /// input arrays cl_mem __CLMEM_band_restrict_ARG1 = clCreateBuffer(__sheets_context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float) * __SIZE_wav, (void *) wav, &__cl_err); CHECK_CL_ERROR(__cl_err, "clCreateBuffer"); /// write to device memory CALL_CL_GUARDED(clEnqueueWriteBuffer, (__sheets_queue, __CLMEM_band_restrict_ARG1, CL_TRUE, /* blocking write */ 0, /* no offset */ sizeof(float) * __SIZE_wav, wav, 0, /* no wait list */ NULL, NULL) ); /// set up kernel arguments SET_4_KERNEL_ARGS(compiled_kernels[0], __CLMEM_band_restrict_ARG0, __CLMEM_band_restrict_ARG1, __PRIM_band_restrict_ARG2, __PRIM_band_restrict_ARG3); /// enqueue kernel cl_event __CLEVENT_band_restrict_CALL; CALL_CL_GUARDED(clEnqueueNDRangeKernel, (__sheets_queue, compiled_kernels[0], 1, /* 1 dimension */ 0, /* 0 offset */ &__SIZE_wav, NULL, /* let OpenCL break things up */ 0, /* no events in wait list */ NULL, /* empty wait list */ &__CLEVENT_band_restrict_CALL) ); /// allocate space for cpu return array float out[__SIZE_wav]; CALL_CL_GUARDED(clEnqueueReadBuffer, (__sheets_queue, __CLMEM_band_restrict_ARG0, CL_TRUE, /* blocking read */ 0, /* 0 offset */ sizeof(float) * __SIZE_wav, /* read whole buffer */ (void *) out, /* host pointer */ 1, /* wait for gfunc to finish */ &__CLEVENT_band_restrict_CALL, /* "" */ NULL) /* no need to wait for this call though */ ); ////// [END] GFUNC CALL get_timestamp(&end); printf("gfunc call took %f seconds\n", timestamp_diff_in_seconds(st, end)); ////// Validate call int c = 0; for (_i = 0; _i < __SIZE_wav; _i++) { if (in_thrsh(out[_i], 0.1112, 0.7888)) { c++; } else if(out[_i]) { exit(1); } } printf("\n"); assert(in_thrsh_cnt == c); ////////////// ////// CLEANUP ////////////// CALL_CL_GUARDED(clReleaseMemObject, (__CLMEM_band_restrict_ARG0)); CALL_CL_GUARDED(clReleaseMemObject, (__CLMEM_band_restrict_ARG1)); for (_i = 0; _i < NKERNELS; _i++) { CALL_CL_GUARDED(clReleaseKernel, (compiled_kernels[_i])); } CALL_CL_GUARDED(clReleaseCommandQueue, (__sheets_queue)); CALL_CL_GUARDED(clReleaseContext, (__sheets_context)); return 0; }
void create_context_on(const char *plat_name, const char*dev_name, cl_uint idx, cl_context *ctx, cl_command_queue *queue, int enable_profiling) { cl_uint plat_count; CALL_CL_GUARDED(clGetPlatformIDs, (0, NULL, &plat_count)); cl_platform_id *platforms = (cl_platform_id *) malloc(plat_count*sizeof(cl_platform_id)); CHECK_SYS_ERROR(!platforms, "allocating platform array"); CALL_CL_GUARDED(clGetPlatformIDs, (plat_count, platforms, NULL)); for (cl_uint i = 0; i < plat_count; ++i) { char buf[100]; CALL_CL_GUARDED(clGetPlatformInfo, (platforms[i], CL_PLATFORM_VENDOR, sizeof(buf), buf, NULL)); if (!plat_name || strstr(buf, plat_name)) { cl_uint dev_count; CALL_CL_GUARDED(clGetDeviceIDs, (platforms[i], CL_DEVICE_TYPE_ALL, 0, NULL, &dev_count)); cl_device_id *devices = (cl_device_id *) malloc(dev_count*sizeof(cl_device_id)); CHECK_SYS_ERROR(!devices, "allocating device array"); CALL_CL_GUARDED(clGetDeviceIDs, (platforms[i], CL_DEVICE_TYPE_ALL, dev_count, devices, NULL)); for (cl_uint j = 0; j < dev_count; ++j) { char buf[100]; CALL_CL_GUARDED(clGetDeviceInfo, (devices[j], CL_DEVICE_NAME, sizeof(buf), buf, NULL)); if (!dev_name || strstr(buf, dev_name)) { if (idx == 0) { cl_platform_id plat = platforms[i]; cl_device_id dev = devices[j]; free(devices); free(platforms); cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties) plat, 0 }; cl_int status; *ctx = clCreateContext( cps, 1, &dev, NULL, NULL, &status); CHECK_CL_ERROR(status, "clCreateContext"); cl_command_queue_properties qprops = 0; if (enable_profiling) qprops |= CL_QUEUE_PROFILING_ENABLE; *queue = clCreateCommandQueue(*ctx, dev, qprops, &status); CHECK_CL_ERROR(status, "clCreateCommandQueue"); return; } else --idx; } } free(devices); } } free(platforms); fputs("create_context_on: specified device not found.\n", stderr); abort(); }
void fft_1D_big(cl_mem a,cl_mem b,cl_mem c, int N, cl_kernel init_big, cl_kernel clean,cl_kernel mat_trans,cl_command_queue queue,int direction,int offset_line) { //handle complex-to-complex fft, accutal size = 2 * N //size_t ldim[] = { 128 }; //size_t gdim[] = { (N /ldim[0])/2}; int Ns = 1; int y =0; SET_7_KERNEL_ARGS(init_big, a, b, N, Ns,direction,offset_line,y); size_t ldim[] = { 16 }; size_t gdim[] = { N/4 }; CALL_CL_GUARDED(clEnqueueNDRangeKernel, (queue, init_big, 1, NULL, gdim, ldim, 0, NULL, NULL)); if (N ==64 ) return; else if( N == 256 || N == 1024) { cl_long offset = offset_line * N; SET_7_KERNEL_ARGS(clean, b, c, N, Ns,direction,offset_line,y); ldim[0] =4; CALL_CL_GUARDED(clEnqueueNDRangeKernel, (queue, clean, 1, NULL, gdim, ldim, 0, NULL, NULL)); if(N == 1024) { int option =0; float k =0; int n = 16; SET_8_KERNEL_ARGS(mat_trans, c, b, n, option,k,k,k,offset); size_t ldim[] = { 16, 16 }; size_t gdim[] = { 16, 64 }; CALL_CL_GUARDED(clEnqueueNDRangeKernel, (queue, mat_trans, 2, NULL, gdim, ldim, 0, NULL, NULL)); } else if(N ==256) { int option =0; float k =0; int n = 4; SET_8_KERNEL_ARGS(mat_trans, c, b, n, option,k,k,k,offset); size_t ldim[] = { 4, 4 }; size_t gdim[] = { 4, 64 }; CALL_CL_GUARDED(clEnqueueNDRangeKernel, (queue, mat_trans, 2, NULL, gdim, ldim, 0, NULL, NULL)); } } else { printf("FFT not implemented for this size!!!\n"); return; } }
void mgv(ftype f[], ftype u[], ftype dx, unsigned n1,unsigned n2,unsigned n3, size_t field_size, unsigned points, int use_alignment, unsigned dim_x, cl_context ctx, cl_command_queue queue, cl_kernel poisson_knl, int wg_dims , int wg_x, int wg_y, int wg_z, int z_div, int fetch_per_pt, int flops_per_pt){ // mgv does one v-cycle for the Poisson problem on a grid with mesh size dx // Inputs: f is right hand side, u is current approx dx is mesh size, n1 number of sweeps // on downward branch, n2 number of sweeps on upwardbranch, n3 number of sweeps on // coarsest grid. // Output: It just returns an updated version of u cl_ulong start_big; #ifdef DO_TIMING cl_event evt; cl_event *evt_ptr = &evt; #else cl_event *evt_ptr = NULL; #endif size_t i, isweep; item * ugrid, * head, * curr; int l = 0; ftype dxval[POINTS/2] = {0}; // this is huge and unnecessary. Try to cut downif time!! ftype h; unsigned nx[POINTS/2] = {0}; // --- Allocate common gpu memory---- cl_int status; cl_mem dev_buf_u = clCreateBuffer(ctx, CL_MEM_READ_WRITE, field_size * sizeof(ftype), 0, &status); CHECK_CL_ERROR(status, "clCreateBuffer"); cl_mem dev_buf_f = clCreateBuffer(ctx, CL_MEM_READ_ONLY, field_size * sizeof(ftype), 0, &status); CHECK_CL_ERROR(status, "clCreateBuffer"); cl_mem dev_buf_hist_u = clCreateBuffer(ctx, CL_MEM_READ_ONLY, field_size * sizeof(ftype), 0, &status); CHECK_CL_ERROR(status, "clCreateBuffer"); //cl_mem read_buf = clCreateBuffer(ctx, CL_MEM_READ_ONLY, field_size * sizeof(ftype), 0, &status); CHECK_CL_ERROR(status, "clCreateBuffer"); // ----------------------------------- dxval[0] = dx; nx[0] = points; //const size_t max_size = POINTS * POINTS * ((POINTS + 15)/16) * 16; // --------------- Allocatig the finest grid -------------------- ugrid = (item *)malloc(sizeof(item)); ugrid->uvec = malloc(field_size * sizeof(ftype)); ugrid->fvec = malloc(field_size * sizeof(ftype)); ugrid->rvec = malloc(field_size * sizeof(ftype)); ugrid->dim_other = nx[0]; ugrid->dim_x = dim_x; for(i = 0; i < field_size; i++){ ugrid->uvec[i] = u[i]; ugrid->fvec[i] = f[i]; ugrid->rvec[i] = 0; } head = ugrid; // head will always be the first one // ---------------- Set up the coarse grids ---------------------- while((nx[l] - 1) % 2 == 0 && nx[l] > 3){ l = l+1; nx[l] = (nx[l - 1] - 1) / 2 + 1; dxval[l] = 2 * dxval[l-1]; curr = (item *)malloc(sizeof(item)); curr->uvec = malloc(field_size * sizeof(ftype)); curr->fvec = malloc(field_size * sizeof(ftype)); curr->rvec = malloc(field_size * sizeof(ftype)); curr->dim_other = nx[l]; curr->field_start = 0; curr->dim_x = curr->dim_other; if(use_alignment) curr->dim_x = ((nx[l] + 15)/16) * 16; // initialize vectors for(i = 0; i < field_size; i++){ curr->uvec[i] = 0; curr->fvec[i] = 0; curr->rvec[i] = 0; } ugrid->next = curr; // curr gets attached to ugrid curr->prev = ugrid; ugrid = curr; } int nl = l; // this is the maximum number of grids that were created // --- at this point head contains the finest grid and ugrid contains the coarsest ----- curr = head; head->prev = NULL; ugrid->next = NULL; // ---------------- Now relax each of the different grids descending-------- for(l = 0; l < nl; l++){ // I stop right before nl (will be treated different) // ---------------------------------------------------------------------- // -------------------- GPU DESCENDING V-CYCLE -------------------------- // ---------------------------------------------------------------------- { if(curr->dim_other < CUTOFF){ for(isweep = 0; isweep < n1; isweep++){ gsrelax(curr, dxval[l]); } } else{ // ---GPU------GPU------GPU------GPU------GPU------GPU------GPU------GPU--- // // fill in the buffers inside the GPU with the current data CALL_CL_GUARDED(clEnqueueWriteBuffer, (queue, dev_buf_u, CL_TRUE, 0, field_size * sizeof(ftype), curr->uvec, 0, NULL, NULL)); CALL_CL_GUARDED(clEnqueueWriteBuffer, (queue, dev_buf_f, CL_TRUE, 0, field_size * sizeof(ftype), curr->fvec, 0, NULL, NULL)); CALL_CL_GUARDED(clEnqueueWriteBuffer, (queue, dev_buf_hist_u, CL_TRUE, 0, field_size * sizeof(ftype), curr->uvec, 0, NULL, NULL)); h = dxval[l] * dxval[l]; size_t gdim[] = { curr->dim_x-16, curr->dim_x-16, curr->dim_x/z_div }; size_t ldim[] = { wg_x, wg_y, wg_z }; for(i = 0; i < n1; i++){ // ---------------------------------------------------------------------- // invoke poisson kernel // ---------------------------------------------------------------------- //size_t u_size; //CALL_CL_GUARDED(clGetMemObjectInfo, (dev_buf_u, CL_MEM_SIZE, sizeof(u_size), &u_size, 0)); //int u_size_i = u_size; //printf("u_size=%d fstart=%d dim_x=%d dim_other=%d\n" , u_size_i, curr->field_start, curr->dim_x, curr->dim_other); curr->field_start = 0; SET_7_KERNEL_ARGS(poisson_knl, dev_buf_u, dev_buf_f, dev_buf_hist_u, curr->field_start, curr->dim_x, curr->dim_other, h); // run the kernel CALL_CL_GUARDED(clEnqueueNDRangeKernel, (queue, poisson_knl, /*dimensions*/ wg_dims, NULL, gdim, ldim, 0, NULL, evt_ptr)); #ifdef DO_TIMING // If timing is enabled, this wait can mean a significant performance hit. CALL_CL_GUARDED(clWaitForEvents, (1, &evt)); cl_ulong start, end; CALL_CL_GUARDED(clGetEventProfilingInfo, (evt, CL_PROFILING_COMMAND_START, sizeof(start), &start, NULL)); CALL_CL_GUARDED(clGetEventProfilingInfo, (evt, CL_PROFILING_COMMAND_END, sizeof(start), &end, NULL)); gbytes_accessed += 1e-9*(sizeof(ftype) * field_size * fetch_per_pt); start_big = start; seconds_taken += 1e-9*(end-start); mcells_updated += curr->dim_other*curr->dim_other*curr->dim_other/1e6; gflops_performed += 1e-9*curr->dim_x*curr->dim_x*curr->dim_x * flops_per_pt; CALL_CL_GUARDED(clReleaseEvent, (evt)); #endif CALL_CL_GUARDED(clFinish, (queue)); //ira adentro?? cl_mem tmp = dev_buf_u; dev_buf_u = dev_buf_hist_u; dev_buf_hist_u = tmp; } //when I'm done, read from buffer CALL_CL_GUARDED(clEnqueueReadBuffer, (queue, dev_buf_u, /*blocking*/ CL_TRUE, /*offset*/ 0, field_size * sizeof(ftype), curr->uvec, 0, NULL, NULL)); } } resid2(curr, dxval[l]); injf2c(curr, curr->next); //this function updates f_{i+1} curr = curr->next; } // ---------------------------------------------------------------------- // --------------- GPU ON THE COARSEST GRID ----------------------------- // ---------------------------------------------------------------------- { if(curr->dim_other < CUTOFF){ for(i = 0; i < n3; i++){ gsrelax(curr, dxval[nl]); } } else{ // ---GPU------GPU------GPU------GPU------GPU------GPU------GPU------GPU--- // // fill in the buffers inside the GPU with the current data CALL_CL_GUARDED(clEnqueueWriteBuffer, (queue, dev_buf_u, CL_TRUE, 0, field_size * sizeof(ftype), curr->uvec, 0, NULL, NULL)); CALL_CL_GUARDED(clEnqueueWriteBuffer, (queue, dev_buf_f, CL_TRUE, 0, field_size * sizeof(ftype), curr->fvec, 0, NULL, NULL)); CALL_CL_GUARDED(clEnqueueWriteBuffer, (queue, dev_buf_hist_u, CL_TRUE, 0, field_size * sizeof(ftype), curr->uvec, 0, NULL, NULL)); h = dxval[nl] * dxval[nl]; size_t gdim[] = { curr->dim_x - 16, curr->dim_x - 16, curr->dim_x/z_div }; size_t ldim[] = { wg_x, wg_y, wg_z }; for(i = 0; i < n3; i++){ // ---------------------------------------------------------------------- // invoke poisson kernel // ---------------------------------------------------------------------- curr->field_start = 0; SET_7_KERNEL_ARGS(poisson_knl, dev_buf_u, dev_buf_f, dev_buf_hist_u,curr->field_start, curr->dim_x, curr->dim_other, h); // run the kernel curr->field_start = 0; CALL_CL_GUARDED(clEnqueueNDRangeKernel, (queue, poisson_knl, /*dimensions*/ wg_dims, NULL, gdim, ldim, 0, NULL, evt_ptr)); #ifdef DO_TIMING // If timing is enabled, this wait can mean a significant performance hit. CALL_CL_GUARDED(clWaitForEvents, (1, &evt)); cl_ulong start, end; CALL_CL_GUARDED(clGetEventProfilingInfo, (evt, CL_PROFILING_COMMAND_START, sizeof(start), &start, NULL)); CALL_CL_GUARDED(clGetEventProfilingInfo, (evt, CL_PROFILING_COMMAND_END, sizeof(start), &end, NULL)); gbytes_accessed += 1e-9*(sizeof(ftype) * field_size * fetch_per_pt); seconds_taken += 1e-9*(end-start); mcells_updated += curr->dim_other*curr->dim_other*curr->dim_other/1e6; gflops_performed += 1e-9*curr->dim_x*curr->dim_x*curr->dim_x * flops_per_pt; CALL_CL_GUARDED(clReleaseEvent, (evt)); #endif CALL_CL_GUARDED(clFinish, (queue)); //ira adentro?? cl_mem tmp = dev_buf_u; dev_buf_u = dev_buf_hist_u; dev_buf_hist_u = tmp; } //when I'm done, read from buffer CALL_CL_GUARDED(clEnqueueReadBuffer, (queue, dev_buf_u, /*blocking*/ CL_TRUE, /*offset*/ 0, field_size * sizeof(ftype), curr->uvec, 0, NULL, NULL)); } } // ---------------------------------------------------------------------- // -----------Upward branch of the V-cycle ------------------------------ // ---------------------------------------------------------------------- for(l = nl-1; l >= 0; --l){ ctof(curr->prev, curr, field_size); //curr->prev is the finer of the two free(curr->uvec); //curr won't be needed anymore free(curr->fvec); free(curr->rvec); curr = curr->prev; curr->next = NULL; for(isweep = 0; isweep < n2; isweep++){ gsrelax(curr, dxval[l]); } // Update the grids n1 times using the GPU when necessary { if(curr->dim_other < CUTOFF){ for(isweep = 0; isweep < n2; isweep++){ gsrelax(curr, dxval[l]); } } else{ // ---GPU------GPU------GPU------GPU------GPU------GPU------GPU------GPU--- // // fill in the buffers inside the GPU with the current data CALL_CL_GUARDED(clEnqueueWriteBuffer, (queue, dev_buf_u, CL_TRUE, 0, field_size * sizeof(ftype), curr->uvec, 0, NULL, NULL)); CALL_CL_GUARDED(clEnqueueWriteBuffer, (queue, dev_buf_f, CL_TRUE, 0, field_size * sizeof(ftype), curr->fvec, 0, NULL, NULL)); CALL_CL_GUARDED(clEnqueueWriteBuffer, (queue, dev_buf_hist_u, CL_TRUE, 0, field_size * sizeof(ftype), curr->uvec, 0, NULL, NULL)); h = dxval[l] * dxval[l]; size_t gdim[] = { curr->dim_x-16, curr->dim_x-16, curr->dim_x/z_div }; size_t ldim[] = { wg_x, wg_y, wg_z }; for(i = 0; i < n1; i++){ // ---------------------------------------------------------------------- // invoke poisson kernel // ---------------------------------------------------------------------- curr->field_start = 0; SET_7_KERNEL_ARGS(poisson_knl, dev_buf_u, dev_buf_f, dev_buf_hist_u, curr->field_start, curr->dim_x, curr->dim_other, h); // run the kernel CALL_CL_GUARDED(clEnqueueNDRangeKernel, (queue, poisson_knl, /*dimensions*/ wg_dims, NULL, gdim, ldim, 0, NULL, evt_ptr)); #ifdef DO_TIMING // If timing is enabled, this wait can mean a significant performance hit. CALL_CL_GUARDED(clWaitForEvents, (1, &evt)); cl_ulong start, end; CALL_CL_GUARDED(clGetEventProfilingInfo, (evt, CL_PROFILING_COMMAND_START, sizeof(start), &start, NULL)); CALL_CL_GUARDED(clGetEventProfilingInfo, (evt, CL_PROFILING_COMMAND_END, sizeof(start), &end, NULL)); gbytes_accessed += 1e-9*(sizeof(ftype) * field_size * fetch_per_pt); seconds_taken += 1e-9*(end-start); tot_secs += 1e-9*(end-start_big); mcells_updated += curr->dim_other*curr->dim_other*curr->dim_other/1e6; gflops_performed += 1e-9*curr->dim_x*curr->dim_x*curr->dim_x * flops_per_pt; CALL_CL_GUARDED(clReleaseEvent, (evt)); #endif CALL_CL_GUARDED(clFinish, (queue)); //ira adentro?? cl_mem tmp = dev_buf_u; dev_buf_u = dev_buf_hist_u; dev_buf_hist_u = tmp; } //when I'm done, read from buffer CALL_CL_GUARDED(clEnqueueReadBuffer, (queue, dev_buf_u, /*blocking*/ CL_TRUE, /*offset*/ 0, field_size * sizeof(ftype), curr->uvec, 0, NULL, NULL)); } } } // ---------- and the solution is right there in the last curr curr->uvec for(i = 0; i < field_size; i++) u[i] = curr->uvec[i]; free(curr->uvec); //free(curr->fvec); free(curr->rvec); //free(ugrid->uvec); //free(ugrid->fvec); //free(ugrid->rvec); free(curr); CALL_CL_GUARDED(clReleaseMemObject, (dev_buf_u)); CALL_CL_GUARDED(clReleaseMemObject, (dev_buf_f)); CALL_CL_GUARDED(clReleaseMemObject, (dev_buf_hist_u)); }
void fft2D_big_new(cl_mem a, cl_mem c, cl_mem b,cl_mem d, int N, cl_kernel init_big, cl_kernel clean,cl_kernel mat_trans, cl_kernel mat_trans_3D, cl_command_queue queue,int direction) { int offset_line = 0; int Ns = 1; int y =0; SET_7_KERNEL_ARGS(init_big, a, b, N, Ns,direction,offset_line,y); size_t ldim[] = { 16 }; size_t gdim[] = { N*N/4 }; CALL_CL_GUARDED(clEnqueueNDRangeKernel, (queue, init_big, 1, NULL, gdim, ldim, 0, NULL, NULL)); if(N!=64) if(N == 1024) { int Ns =1; int y =0; //cl_long offset = offset_line * N; SET_7_KERNEL_ARGS(clean, b, c, N, Ns,direction,offset_line,y); size_t ldim[]={ 4 }; size_t gdim[] ={ N*N/4 }; CALL_CL_GUARDED(clEnqueueNDRangeKernel, (queue, clean, 1, NULL, gdim, ldim, 0, NULL, NULL)); int option =0; float k =0; int n = 16; SET_8_KERNEL_ARGS(mat_trans_3D, c, b, n, option,k,k,k,N); size_t ldim2[] = { 16, 16 ,1}; size_t gdim2[] = { 16, 64 ,N}; CALL_CL_GUARDED(clEnqueueNDRangeKernel, (queue, mat_trans_3D, 3, NULL, gdim2, ldim2, 0, NULL, NULL)); } else if(N ==256) { int Ns =1; int y =0; offset_line =0; SET_7_KERNEL_ARGS(clean, b, c, N, Ns,direction,offset_line,y); size_t ldim[] ={4}; size_t gdim[] ={N*N/4}; CALL_CL_GUARDED(clEnqueueNDRangeKernel, (queue, clean, 1, NULL, gdim, ldim, 0, NULL, NULL)); int option =0; float k =0; int n = 4; SET_8_KERNEL_ARGS(mat_trans_3D, c, b, n, option,k,k,k,N); size_t ldim2[] = { 4, 4 ,1}; size_t gdim2[] = { 4, 64, N }; CALL_CL_GUARDED(clEnqueueNDRangeKernel, (queue, mat_trans_3D, 3, NULL, gdim2, ldim2, 0, NULL, NULL)); } else { printf("FFT not implemented for this size!!!\n"); return; } //CALL_CL_GUARDED(clFinish, (queue)); //printf("1D fine \n"); mat__trans(b,c,N,mat_trans,queue,0,1,1,1); //CALL_CL_GUARDED(clFinish, (queue)); /* for(int j= 0;j<N;j++) { //fft_1D(c,b,d,N,fft_init,fft1D,queue,direction,j); fft_1D_big(c, b,d,N, init_big, clean,mat_trans,queue,direction,j); } */ Ns =1; SET_7_KERNEL_ARGS(init_big, c, b, N, Ns,direction,offset_line,y); CALL_CL_GUARDED(clEnqueueNDRangeKernel, (queue, init_big, 1, NULL, gdim, ldim, 0, NULL, NULL)); if (N !=64 ) if( N == 256 || N == 1024) { int Ns =1; int y = 0; int offset_line = 0; SET_7_KERNEL_ARGS(clean, b, d, N, Ns,direction,offset_line,y); size_t ldim[] = { 4 }; size_t gdim[] = { N*N/4 }; CALL_CL_GUARDED(clEnqueueNDRangeKernel, (queue, clean, 1, NULL, gdim, ldim, 0, NULL, NULL)); if(N == 1024) { int option =0; float k =0; int n = 16; SET_8_KERNEL_ARGS(mat_trans_3D, d, b, n, option,k,k,k,N); size_t ldim2[] = { 16, 16 ,1}; size_t gdim2[] = { 16, 64 ,N}; CALL_CL_GUARDED(clEnqueueNDRangeKernel, (queue, mat_trans_3D, 3, NULL, gdim2, ldim2, 0, NULL, NULL)); } else if(N ==256) { int option =0; float k =0; int n = 4; SET_8_KERNEL_ARGS(mat_trans_3D, d, b, n, option,k,k,k,N); size_t ldim2[] = { 4, 4 ,1}; size_t gdim2[] = { 4, 64, N }; CALL_CL_GUARDED(clEnqueueNDRangeKernel, (queue, mat_trans_3D, 3, NULL, gdim2, ldim2, 0, NULL, NULL)); } } else { printf("FFT not implemented for this size!!!\n"); return; } //CALL_CL_GUARDED(clFinish, (queue)); if(direction == 1) mat__trans(b,c,N,mat_trans,queue,0,1,1,1); else mat__trans(b,c,N,mat_trans,queue,-1,1,1,1); }
void run_sampler(sampler *samp){ /* Run the sampler and save output. Overlap sampling and communication with the device using two queues. While red walkers are being sampled, black walkers are being sent. This means that the first sampling iteration reads black walkers from the burn in, and the final iteration is thrown out. Runs in the following order. - Sample X_red non-blocking - Copy X_black to host non-blocking - Check both are finished - Sample X_black non-blocking - Copy X_red to host non-blocking - Check both are finished Input: sampler *samp Pointer to sampler structure which has been initialized. Burn-in should also be performed before running this routine. Run for samp->M total times. Output: Array samp->samples_host is filled with new samples. */ // start the kernel timer get_timestamp(& (samp->time1)); // reset beta and set to save (samp->data_st)->beta = 1.0f; (samp->data_st)->save = 1; // update the data structure accordingly CALL_CL_GUARDED(clEnqueueWriteBuffer, ( samp->queue, samp->data_st_device, /*blocking*/ CL_TRUE, /*offset*/ 0, sizeof(data_struct), samp->data_st, 0, NULL, NULL)); CALL_CL_GUARDED(clFinish, (samp->queue)); // run the sampler unsigned int buffer_position = 0; // since samples are read while update takes place, do not read the first set of samples char read_samples = 0; // main sampling loop for(int it=0; it < samp->M + 1; it++){ // update X_red SET_9_KERNEL_ARGS(samp->stretch_knl, samp->X_red_device, samp->log_pdf_red_device, samp->X_black_device, samp->ranluxcltab, samp->accepted_device, samp->data_device, samp->data_st_device, samp->indices_to_save_device, samp->X_red_save); CALL_CL_GUARDED(clEnqueueNDRangeKernel, (samp->queue, samp->stretch_knl, /*dimensions*/ 1, NULL, samp->gdim, samp->ldim, 0, NULL, NULL)); if(read_samples){ // read the constant samples while others are updating CALL_CL_GUARDED(clEnqueueReadBuffer, ( samp->queue_mem, samp->X_black_save, CL_FALSE, 0, samp->num_to_save * samp->K_over_two * sizeof(cl_float), samp->samples_host + buffer_position, 0, NULL, NULL)); buffer_position += samp->num_to_save * samp->K_over_two; } // both must finish before next iteration CALL_CL_GUARDED(clFinish, (samp->queue_mem)); CALL_CL_GUARDED(clFinish, (samp->queue)); // update X_black SET_9_KERNEL_ARGS(samp->stretch_knl, samp->X_black_device, samp->log_pdf_black_device, samp->X_red_device, samp->ranluxcltab, samp->accepted_device, samp->data_device, samp->data_st_device, samp->indices_to_save_device, samp->X_black_save); CALL_CL_GUARDED(clEnqueueNDRangeKernel, (samp->queue, samp->stretch_knl, /*dimensions*/ 1, NULL, samp->gdim, samp->ldim, 0, NULL, NULL)); if(read_samples){ // read the constant samples while others are updating CALL_CL_GUARDED(clEnqueueReadBuffer, ( samp->queue_mem, samp->X_red_save, CL_FALSE, 0, samp->num_to_save * samp->K_over_two * sizeof(cl_float), samp->samples_host + buffer_position, 0, NULL, NULL)); buffer_position += samp->num_to_save * samp->K_over_two; } // both must finish before next iteration CALL_CL_GUARDED(clFinish, (samp->queue_mem)); CALL_CL_GUARDED(clFinish, (samp->queue)); if( ((it % (MAX(samp->M/10,1))) == 0) && (OUTPUT_LEVEL > 0) ) printf("Sample iteration %d\n", it); read_samples = 1; } // make sure everything is back in place CALL_CL_GUARDED(clFinish, (samp->queue)); // take the end of the kernel timing get_timestamp(&(samp->time2)); // save the acceptance probability CALL_CL_GUARDED(clEnqueueReadBuffer, ( samp->queue, samp->accepted_device, CL_TRUE, 0, samp->K_over_two * sizeof(cl_ulong), samp->accepted_host, 0, NULL, NULL)); // ensure that all reads are finished CALL_CL_GUARDED(clFinish, (samp->queue)); samp->accepted_total = 0; for(int i=0; i<samp->K_over_two; i++) samp->accepted_total += samp->accepted_host[i]; // end total timing get_timestamp(&(samp->time2_total)); if(OUTPUT_LEVEL > 0) printf("Sampler kernel ran and completed.\n\n"); }
void flush_cl_queue() { CALL_CL_GUARDED(clFinish, (clData.queue)); }
void print_device_info(cl_device_id device) { // adapted from http://graphics.stanford.edu/~yoel/notes/clInfo.c #define LONG_PROPS \ defn(VENDOR_ID), \ defn(MAX_COMPUTE_UNITS), \ defn(MAX_WORK_ITEM_DIMENSIONS), \ defn(MAX_WORK_GROUP_SIZE), \ defn(PREFERRED_VECTOR_WIDTH_CHAR), \ defn(PREFERRED_VECTOR_WIDTH_SHORT), \ defn(PREFERRED_VECTOR_WIDTH_INT), \ defn(PREFERRED_VECTOR_WIDTH_LONG), \ defn(PREFERRED_VECTOR_WIDTH_FLOAT), \ defn(PREFERRED_VECTOR_WIDTH_DOUBLE), \ defn(MAX_CLOCK_FREQUENCY), \ defn(ADDRESS_BITS), \ defn(MAX_MEM_ALLOC_SIZE), \ defn(IMAGE_SUPPORT), \ defn(MAX_READ_IMAGE_ARGS), \ defn(MAX_WRITE_IMAGE_ARGS), \ defn(IMAGE2D_MAX_WIDTH), \ defn(IMAGE2D_MAX_HEIGHT), \ defn(IMAGE3D_MAX_WIDTH), \ defn(IMAGE3D_MAX_HEIGHT), \ defn(IMAGE3D_MAX_DEPTH), \ defn(MAX_SAMPLERS), \ defn(MAX_PARAMETER_SIZE), \ defn(MEM_BASE_ADDR_ALIGN), \ defn(MIN_DATA_TYPE_ALIGN_SIZE), \ defn(GLOBAL_MEM_CACHELINE_SIZE), \ defn(GLOBAL_MEM_CACHE_SIZE), \ defn(GLOBAL_MEM_SIZE), \ defn(MAX_CONSTANT_BUFFER_SIZE), \ defn(MAX_CONSTANT_ARGS), \ defn(LOCAL_MEM_SIZE), \ defn(ERROR_CORRECTION_SUPPORT), \ defn(PROFILING_TIMER_RESOLUTION), \ defn(ENDIAN_LITTLE), \ defn(AVAILABLE), \ defn(COMPILER_AVAILABLE), #define STR_PROPS \ defn(NAME), \ defn(VENDOR), \ defn(PROFILE), \ defn(VERSION), \ defn(EXTENSIONS), #define HEX_PROPS \ defn(SINGLE_FP_CONFIG), \ defn(QUEUE_PROPERTIES), printf("---------------------------------------------------------------------\n"); static struct { cl_device_info param; const char *name; } longProps[] = { #define defn(X) { CL_DEVICE_##X, #X } LONG_PROPS #undef defn { 0, NULL }, }; static struct { cl_device_info param; const char *name; } hexProps[] = { #define defn(X) { CL_DEVICE_##X, #X } HEX_PROPS #undef defn { 0, NULL }, }; static struct { cl_device_info param; const char *name; } strProps[] = { #define defn(X) { CL_DEVICE_##X, #X } STR_PROPS #undef defn { CL_DRIVER_VERSION, "DRIVER_VERSION" }, { 0, NULL }, }; cl_int status; size_t size; char buf[65536]; long long val; /* Avoids unpleasant surprises for some params */ int ii; for (ii = 0; strProps[ii].name != NULL; ii++) { status = clGetDeviceInfo(device, strProps[ii].param, sizeof buf, buf, &size); if (status != CL_SUCCESS) { printf("Unable to get %s: %s!\n", strProps[ii].name, cl_error_to_str(status)); continue; } if (size > sizeof buf) { printf("Large %s (%zd bytes)! Truncating to %ld!\n", strProps[ii].name, size, sizeof buf); } printf("%s: %s\n", strProps[ii].name, buf); } printf("\n"); status = clGetDeviceInfo(device, CL_DEVICE_TYPE, sizeof val, &val, NULL); if (status == CL_SUCCESS) { printf("Type: "); if (val & CL_DEVICE_TYPE_DEFAULT) { val &= ~CL_DEVICE_TYPE_DEFAULT; printf("Default "); } if (val & CL_DEVICE_TYPE_CPU) { val &= ~CL_DEVICE_TYPE_CPU; printf("CPU "); } if (val & CL_DEVICE_TYPE_GPU) { val &= ~CL_DEVICE_TYPE_GPU; printf("GPU "); } if (val & CL_DEVICE_TYPE_ACCELERATOR) { val &= ~CL_DEVICE_TYPE_ACCELERATOR; printf("Accelerator "); } if (val != 0) { printf("Unknown (0x%llx) ", val); } printf("\n"); } else { printf("Unable to get TYPE: %s!\n", cl_error_to_str(status)); } status = clGetDeviceInfo(device, CL_DEVICE_EXECUTION_CAPABILITIES, sizeof val, &val, NULL); if (status == CL_SUCCESS) { printf("EXECUTION_CAPABILITIES: "); if (val & CL_EXEC_KERNEL) { val &= ~CL_EXEC_KERNEL; printf("Kernel "); } if (val & CL_EXEC_NATIVE_KERNEL) { val &= ~CL_EXEC_NATIVE_KERNEL; printf("Native "); } if (val) printf("Unknown (0x%llx) ", val); printf("\n"); } else { printf("Unable to get EXECUTION_CAPABILITIES: %s!\n", cl_error_to_str(status)); } status = clGetDeviceInfo(device, CL_DEVICE_GLOBAL_MEM_CACHE_TYPE, sizeof val, &val, NULL); if (status == CL_SUCCESS) { static const char *cacheTypes[] = { "None", "Read-Only", "Read-Write" }; static int numTypes = sizeof cacheTypes / sizeof cacheTypes[0]; printf("GLOBAL_MEM_CACHE_TYPE: %s (%lld)\n", val < numTypes ? cacheTypes[val] : "???", val); } else { printf("Unable to get GLOBAL_MEM_CACHE_TYPE: %s!\n", cl_error_to_str(status)); } status = clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_TYPE, sizeof val, &val, NULL); if (status == CL_SUCCESS) { static const char *lmemTypes[] = { "???", "Local", "Global" }; static int numTypes = sizeof lmemTypes / sizeof lmemTypes[0]; printf("CL_DEVICE_LOCAL_MEM_TYPE: %s (%lld)\n", val < numTypes ? lmemTypes[val] : "???", val); } else { printf("Unable to get CL_DEVICE_LOCAL_MEM_TYPE: %s!\n", cl_error_to_str(status)); } for (ii = 0; hexProps[ii].name != NULL; ii++) { status = clGetDeviceInfo(device, hexProps[ii].param, sizeof val, &val, &size); if (status != CL_SUCCESS) { printf("Unable to get %s: %s!\n", hexProps[ii].name, cl_error_to_str(status)); continue; } if (size > sizeof val) { printf("Large %s (%zd bytes)! Truncating to %ld!\n", hexProps[ii].name, size, sizeof val); } printf("%s: 0x%llx\n", hexProps[ii].name, val); } printf("\n"); for (ii = 0; longProps[ii].name != NULL; ii++) { status = clGetDeviceInfo(device, longProps[ii].param, sizeof val, &val, &size); if (status != CL_SUCCESS) { printf("Unable to get %s: %s!\n", longProps[ii].name, cl_error_to_str(status)); continue; } if (size > sizeof val) { printf("Large %s (%zd bytes)! Truncating to %ld!\n", longProps[ii].name, size, sizeof val); } printf("%s: %lld\n", longProps[ii].name, val); } { size_t size = sizeof(size_t); CALL_CL_GUARDED(clGetDeviceInfo, (device, CL_DEVICE_MAX_WORK_ITEM_SIZES, 0, 0, &size)); size_t res_vec[size/sizeof(size_t)]; // C99 VLA yay! CALL_CL_GUARDED(clGetDeviceInfo, (device, CL_DEVICE_MAX_WORK_ITEM_SIZES, size, res_vec, &size)); printf("MAX_WORK_GROUP_SIZES: "); // a tiny lie for (size_t i = 0; i < size/sizeof(size_t); ++i) printf("%zd ", res_vec[i]); printf("\n"); } printf("---------------------------------------------------------------------\n"); }
void main(int argc, char** argv) { //int k = atoi(argv[1]); //int N = pow(2,k); int N=1024; int k=10; float * a = (float *) malloc(sizeof(float)*N* N * 2); float * b = (float *) malloc(sizeof(float) *N*N * 2); float * c = (float *) malloc(sizeof(float) * N*N* 2); float p = 2*M_PI ; for (int i =0; i< N*N; i++) { a[2*i] = 1; a[2*i+1] = 0; b[2*i] = 1; b[2*i+1] = 0; } #if 0 srand(1); for(int i =0;i<N*N;i++) { a[2*i]=sin(i%N *2 *M_PI); //printf("%f\n",uu[2*i]); a[2*i+1] =0 ; } #endif print_platforms_devices(); cl_context ctx; cl_command_queue queue; create_context_on("NVIDIA","GeForce GTX 590",0,&ctx,&queue,0); cl_int status; cl_mem buf_a = clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(float) *N *N* 2 , 0, &status); CHECK_CL_ERROR(status, "clCreateBuffer"); cl_mem buf_b = clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(float) * N *N* 2 , 0, &status); CHECK_CL_ERROR(status, "clCreateBuffer"); cl_mem buf_c = clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(float) * N *N* 2 , 0, &status); CHECK_CL_ERROR(status, "clCreateBuffer"); cl_mem buf_d = clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(float)*N *N* 2 , 0, &status); CHECK_CL_ERROR(status, "clCreateBuffer"); cl_mem buf_e = clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(float) *N *N* 2 , 0, &status); CHECK_CL_ERROR(status, "clCreateBuffer"); cl_mem buf_f = clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(float) *N *N* 2 , 0, &status); CHECK_CL_ERROR(status, "clCreateBuffer"); cl_mem buf_g = clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(float) *N *N* 2 , 0, &status); CHECK_CL_ERROR(status, "clCreateBuffer"); CALL_CL_GUARDED(clEnqueueWriteBuffer, ( queue, buf_a, /*blocking*/ CL_TRUE, /*offset*/ 0, sizeof(float) *N*N*2, a, 0, NULL, NULL)); CALL_CL_GUARDED(clEnqueueWriteBuffer, ( queue, buf_b, /*blocking*/ CL_TRUE, /*offset*/ 0, sizeof(float) *N *N* 2, b, 0, NULL, NULL)); CALL_CL_GUARDED(clEnqueueWriteBuffer, ( queue, buf_c, /*blocking*/ CL_TRUE, /*offset*/ 0, sizeof(float) *N* N*2, c, 0, NULL, NULL)); char *knl_text = read_file("vec_add.cl"); cl_kernel vec_add = kernel_from_string(ctx, knl_text, "sum", NULL); free(knl_text); knl_text = read_file("mat_etr_mul.cl"); cl_kernel mat_etr_mul = kernel_from_string(ctx, knl_text, "mult", NULL); free(knl_text); knl_text = read_file("radix-4-float.cl"); cl_kernel fft1D = kernel_from_string(ctx, knl_text, "fft1D", NULL); free(knl_text); knl_text = read_file("radix-4-init.cl"); cl_kernel fft_init = kernel_from_string(ctx, knl_text, "fft1D_init", NULL); free(knl_text); knl_text = read_file("radix-4-interm.cl"); cl_kernel fft_interm = kernel_from_string(ctx, knl_text, "fft1D", NULL); free(knl_text); knl_text = read_file("transpose-soln-gpu.cl"); cl_kernel mat_trans = kernel_from_string(ctx, knl_text, "transpose", NULL); free(knl_text); knl_text = read_file("radix-4-modi.cl"); cl_kernel fft_init_w = kernel_from_string(ctx, knl_text, "fft1D_init", NULL); free(knl_text); knl_text = read_file("vec_zero.cl"); cl_kernel vec_zero = kernel_from_string(ctx, knl_text, "zero", NULL); free(knl_text); knl_text = read_file("reduction.cl"); cl_kernel reduct_mul = kernel_from_string(ctx, knl_text, "reduction_mult", NULL); free(knl_text); knl_text = read_file("reduction1D.cl"); cl_kernel reduct = kernel_from_string(ctx, knl_text, "reduction", NULL); free(knl_text); knl_text = read_file("reduction-init.cl"); cl_kernel reduct_init = kernel_from_string(ctx, knl_text, "reduction_init", NULL); free(knl_text); knl_text = read_file("reduct-energy.cl"); cl_kernel reduct_eng = kernel_from_string(ctx, knl_text, "reduction_eng", NULL); free(knl_text); knl_text = read_file("resid.cl"); cl_kernel resid = kernel_from_string(ctx, knl_text, "resid", NULL); free(knl_text); knl_text = read_file("resid-init.cl"); cl_kernel resid_init = kernel_from_string(ctx, knl_text, "resid_init", NULL); free(knl_text); knl_text = read_file("radix-4-big.cl"); cl_kernel fft_big = kernel_from_string(ctx, knl_text, "fft1D_big", NULL); free(knl_text); knl_text = read_file("radix-4-big-clean.cl"); cl_kernel fft_clean = kernel_from_string(ctx, knl_text, "fft1D_clean", NULL); free(knl_text); knl_text = read_file("radix-4-2D.cl"); cl_kernel fft_2D = kernel_from_string(ctx, knl_text, "fft2D_big", NULL); free(knl_text); knl_text = read_file("radix-4-2D-clean.cl"); cl_kernel fft_2D_clean = kernel_from_string(ctx, knl_text, "fft2D_clean", NULL); free(knl_text); knl_text = read_file("mat-trans-3D.cl"); cl_kernel mat_trans_3D = kernel_from_string(ctx, knl_text, "transpose_3D", NULL); free(knl_text); int Ns =1 ; int direction = 1; timestamp_type time1, time2; struct parameter param; param.N = N; param.epsilon = 0.1; param.s =1; float kk =1e-4; param.h = 2*PI/N; param.N = N; param.maxCG = 1000; param.maxN = 5; //Minimum and starting time step float mink = 1e-7; float startk = 1e-4; // Tolerances param.Ntol = 1e-4; param.cgtol = 1e-7; float ksafety = 0.8; float kfact = 1.3; float kfact2 = 1/1.3; float Nfact = 0.7; float CGfact = 0.7; double elapsed ; CALL_CL_GUARDED(clFinish, (queue)); get_timestamp(&time1); //for(int s=0;s<100;s++) //fft_1D_big(buf_a,buf_b,buf_c,N,fft_big,fft_clean,mat_trans,queue,direction,0); //fft_1D_new(buf_a,buf_b,buf_c,N,fft_init,fft_interm, fft1D,queue,direction,0); //fft_1D(buf_a,buf_b,buf_c,N,fft_init, fft1D,queue,direction,0); //fft2D(buf_a,buf_b,buf_c,buf_d,N,fft_init,fft1D,mat_trans,queue, 1); //fft2D_new(buf_a,buf_b,buf_c,buf_d,N,fft_init,fft_interm,fft1D,mat_trans,queue, 1); //fft2D_big(buf_a,buf_b,buf_c,buf_d,N,fft_big,fft_clean,mat_trans,queue,direction); //fft2D_big_new(buf_a,buf_b,buf_c,buf_d,N,fft_2D,fft_2D_clean, //mat_trans,mat_trans_3D,queue,direction); //fft_w(buf_a,buf_b,buf_c,buf_d,buf_e,N,0.1,0,1,fft_init_w,fft_init,fft1D,mat_trans,queue); #if 0 frhs(buf_a,buf_b,buf_c,buf_d,buf_e,¶m,fft1D_init,fft1D,mat_trans, vec_add, queue); #endif #if 0 float E1 = energy(buf_a, buf_b, buf_c,buf_d, buf_e,buf_f,1e-4, ¶m, fft_init,fft1D,mat_trans,reduct_eng, reduct,queue); #endif //float reside = residual(buf_a,buf_b,resid,resid_init,queue,N*N); /*fft_d_q(buf_a,buf_b,buf_c,buf_d, N,0.1,k ,1, fft1D_init, fft1D,mat_trans,queue);*/ //for(int j= 0;j<N;j++) //{ //fft_1D_w_orig(buf_a,buf_b,buf_c,N,fft1D_init,fft1D,queue,1,j); //} //fft_shar(buf_a,buf_b,buf_c,buf_d,N,0.1,0,1,fft1D_init,fft1D,mat_trans,queue); //mat__trans(buf_a,buf_b,N,mat_trans,queue,4,0.1,0,1); //double elapsed = reduction_mult(buf_a, buf_b,buf_c,N*N,reduct_mul,reduct,queue); CALL_CL_GUARDED(clFinish, (queue)); get_timestamp(&time1); fft_1D_big(buf_a,buf_b,buf_c,N*N,fft_big,fft_clean,mat_trans,queue,direction,0); CALL_CL_GUARDED(clFinish, (queue)); get_timestamp(&time2); elapsed = timestamp_diff_in_seconds(time1,time2); printf("Hierarchy 1D FFT of size %d array on gpu takes %f s\n", N*N,elapsed); printf("achieve %f GFLOPS \n",6*2*N*N*k/elapsed*1e-9); printf("---------------------------------------------\n"); CALL_CL_GUARDED(clFinish, (queue)); get_timestamp(&time1); fft2D(buf_a,buf_b,buf_c,buf_d,N,fft_init,fft1D,mat_trans,queue, 1); CALL_CL_GUARDED(clFinish, (queue)); get_timestamp(&time2); elapsed = timestamp_diff_in_seconds(time1,time2); printf("Navie 2D FFT of size %d * %d matrix on gpu takes %f s\n", N,N,elapsed); printf("achieve %f GFLOPS \n",6*2*N*N*k/elapsed*1e-9); printf("---------------------------------------------\n"); //printf("data access from global achieve %f GB/s\n",sizeof(float)*2*16*N*N/elapsed*1e-9); CALL_CL_GUARDED(clFinish, (queue)); get_timestamp(&time1); fft2D_new(buf_a,buf_b,buf_c,buf_d,N,fft_init,fft_interm,fft1D,mat_trans,queue, 1); CALL_CL_GUARDED(clFinish, (queue)); get_timestamp(&time2); elapsed = timestamp_diff_in_seconds(time1,time2); printf("local data exchange 2D FFT of size %d * %d matrix on gpu takes %f s\n", N,N,elapsed); printf("achieve %f GFLOPS \n",6*2*N*N*k/elapsed*1e-9); printf("---------------------------------------------\n"); CALL_CL_GUARDED(clFinish, (queue)); get_timestamp(&time1); fft2D_big(buf_a,buf_b,buf_c,buf_d,N,fft_big,fft_clean,mat_trans,queue,direction); CALL_CL_GUARDED(clFinish, (queue)); get_timestamp(&time2); elapsed = timestamp_diff_in_seconds(time1,time2); printf("Hierarchy 2D FFT of size %d * %d matrix on gpu takes %f s\n", N,N,elapsed); printf("achieve %f GFLOPS \n",6*2*N*N*k/elapsed*1e-9); printf("---------------------------------------------\n"); CALL_CL_GUARDED(clFinish, (queue)); get_timestamp(&time1); fft2D_big_new(buf_a,buf_b,buf_c,buf_d,N,fft_2D,fft_2D_clean, mat_trans,mat_trans_3D,queue,direction); CALL_CL_GUARDED(clFinish, (queue)); get_timestamp(&time2); elapsed = timestamp_diff_in_seconds(time1,time2); printf("Using 2D kernel 2D FFT of size %d * %d matrix on gpu takes %f s\n", N,N,elapsed); printf("achieve %f GFLOPS \n",6*2*N*N*k/elapsed*1e-9); printf("---------------------------------------------\n"); get_timestamp(&time1); direction = -1; //fft_1D(buf_b,buf_c,buf_d,N,fft_init, fft1D,queue,direction,0); fft2D(buf_b,buf_c,buf_d,buf_e,N,fft_init,fft1D,mat_trans,queue, direction); //fft2D_new(buf_b,buf_c,buf_e,buf_d,N,fft_init,fft_interm,fft1D,mat_trans,queue, -1); //fft2D_big(buf_b,buf_c,buf_d,buf_e,N,fft_big,fft_clean,mat_trans,queue,direction); CALL_CL_GUARDED(clFinish, (queue)); get_timestamp(&time2); elapsed = timestamp_diff_in_seconds(time1,time2); //printf("1D inverse %f s\n", elapsed); #if 0 float test; CALL_CL_GUARDED(clFinish, (queue)); CALL_CL_GUARDED(clEnqueueReadBuffer, ( queue, buf_b, /*blocking*/ CL_TRUE, /*offset*/ 0, sizeof(float), &test, 0, NULL, NULL)); printf("test success and %f \n",test); #endif #if 0 CALL_CL_GUARDED(clFinish, (queue)); CALL_CL_GUARDED(clEnqueueReadBuffer, ( queue, buf_c, /*blocking*/ CL_TRUE, /*offset*/ 0, 2*N*N* sizeof(float), c, 0, NULL, NULL)); /*for(int i =0; i< N; i++) { printf("a%f+ i*",a[2*i]); printf("%f\n",a[2*i+1]); }*/ int T = 10<N? 10:N ; for(int i =0; i< T; i++) { printf("%f + i*",a[2*i]); printf("%f\t",a[2*i+1]); printf("%f + i*",c[2*i]); printf("%f\n",c[2*i+1]); } #endif /* for( Ns = 1;Ns < N; Ns *= 2 ) { for (int j = 0; j<N/2; j++) { fftiteration(j,N,Ns,a,b); } float * d; d = a ; a = b; b = d; //printf("ok\n"); } */ CALL_CL_GUARDED(clReleaseMemObject, (buf_a)); CALL_CL_GUARDED(clReleaseMemObject, (buf_b)); CALL_CL_GUARDED(clReleaseMemObject, (buf_c)); CALL_CL_GUARDED(clReleaseMemObject, (buf_d)); CALL_CL_GUARDED(clReleaseMemObject, (buf_e)); CALL_CL_GUARDED(clReleaseKernel, (fft1D)); CALL_CL_GUARDED(clReleaseKernel, (fft_init)); CALL_CL_GUARDED(clReleaseKernel, (vec_add)); CALL_CL_GUARDED(clReleaseKernel, (reduct_mul)); CALL_CL_GUARDED(clReleaseKernel, (reduct)); CALL_CL_GUARDED(clReleaseKernel, (mat_trans)); CALL_CL_GUARDED(clReleaseCommandQueue, (queue)); CALL_CL_GUARDED(clReleaseContext, (ctx)); }
void create_context_on(const char *plat_name, const char*dev_name, cl_uint idx, cl_context *ctx, cl_command_queue *queue, int enable_profiling) { char dev_sel_buf[MAX_NAME_LEN]; char platform_sel_buf[MAX_NAME_LEN]; // get number of platforms cl_uint plat_count; CALL_CL_GUARDED(clGetPlatformIDs, (0, NULL, &plat_count)); // allocate memory, get list of platform handles cl_platform_id *platforms = (cl_platform_id *) malloc(plat_count*sizeof(cl_platform_id)); CHECK_SYS_ERROR(!platforms, "allocating platform array"); CALL_CL_GUARDED(clGetPlatformIDs, (plat_count, platforms, NULL)); // print menu, if requested #ifndef CL_HELPER_FORCE_INTERACTIVE if (plat_name == CHOOSE_INTERACTIVELY) // yes, we want exactly that pointer #endif { puts("Choose platform:"); for (cl_uint i = 0; i < plat_count; ++i) { char buf[MAX_NAME_LEN]; CALL_CL_GUARDED(clGetPlatformInfo, (platforms[i], CL_PLATFORM_VENDOR, sizeof(buf), buf, NULL)); printf("[%d] %s\n", i, buf); } printf("Enter choice: "); fflush(stdout); char *sel = read_a_line(); if (!sel) { fprintf(stderr, "error reading line from stdin"); abort(); } int sel_int = MIN(MAX(0, atoi(sel)), (int) plat_count-1); free(sel); CALL_CL_GUARDED(clGetPlatformInfo, (platforms[sel_int], CL_PLATFORM_VENDOR, sizeof(platform_sel_buf), platform_sel_buf, NULL)); plat_name = platform_sel_buf; } // iterate over platforms for (cl_uint i = 0; i < plat_count; ++i) { // get platform name char buf[MAX_NAME_LEN]; CALL_CL_GUARDED(clGetPlatformInfo, (platforms[i], CL_PLATFORM_VENDOR, sizeof(buf), buf, NULL)); // does it match? if (!plat_name || strstr(buf, plat_name)) { // get number of devices in platform cl_uint dev_count; CALL_CL_GUARDED(clGetDeviceIDs, (platforms[i], CL_DEVICE_TYPE_ALL, 0, NULL, &dev_count)); // allocate memory, get list of device handles in platform cl_device_id *devices = (cl_device_id *) malloc(dev_count*sizeof(cl_device_id)); CHECK_SYS_ERROR(!devices, "allocating device array"); CALL_CL_GUARDED(clGetDeviceIDs, (platforms[i], CL_DEVICE_TYPE_ALL, dev_count, devices, NULL)); // {{{ print device menu, if requested #ifndef CL_HELPER_FORCE_INTERACTIVE if (dev_name == CHOOSE_INTERACTIVELY) // yes, we want exactly that pointer #endif { puts("Choose device:"); for (cl_uint j = 0; j < dev_count; ++j) { char buf[MAX_NAME_LEN]; CALL_CL_GUARDED(clGetDeviceInfo, (devices[j], CL_DEVICE_NAME, sizeof(buf), buf, NULL)); printf("[%d] %s\n", j, buf); } printf("Enter choice: "); fflush(stdout); char *sel = read_a_line(); if (!sel) { fprintf(stderr, "error reading line from stdin"); abort(); } int int_sel = MIN(MAX(0, atoi(sel)), (int) dev_count-1); free(sel); CALL_CL_GUARDED(clGetDeviceInfo, (devices[int_sel], CL_DEVICE_NAME, sizeof(dev_sel_buf), dev_sel_buf, NULL)); dev_name = dev_sel_buf; } // }}} // iterate over devices for (cl_uint j = 0; j < dev_count; ++j) { // get device name char buf[MAX_NAME_LEN]; CALL_CL_GUARDED(clGetDeviceInfo, (devices[j], CL_DEVICE_NAME, sizeof(buf), buf, NULL)); // does it match? if (!dev_name || strstr(buf, dev_name)) { if (idx == 0) { cl_platform_id plat = platforms[i]; cl_device_id dev = devices[j]; free(devices); free(platforms); cl_int status; // create a context #if OPENCL_SHARE_WITH_OPENGL #if __APPLE__ // CGLContextObj kCGLContext = CGLGetCurrentContext(); // CGLShareGroupObj kCGLShareGroup = CGLGetShareGroup(kCGLContext); // cl_context_properties cps[] = { // CL_CONTEXT_PROPERTY_USE_CGL_SHAREGROUP_APPLE, (cl_context_properties)kCGLShareGroup, // CL_CONTEXT_PLATFORM, (cl_context_properties) plat, 0 }; // CGLContextObj gl_context = CGLGetCurrentContext(); CGLShareGroupObj share_group = CGLGetShareGroup(gl_context); cl_context_properties properties[] = { CL_CONTEXT_PROPERTY_USE_CGL_SHAREGROUP_APPLE, (cl_context_properties)share_group, 0 }; *ctx = clCreateContext(properties, 0, 0, 0, 0, 0); clGetGLContextInfoAPPLE(*ctx, gl_context, CL_CGL_DEVICE_FOR_CURRENT_VIRTUAL_SCREEN_APPLE, sizeof(dev), &dev, NULL); #elif WIN32 cl_context_properties cps[] = { CL_GL_CONTEXT_KHR, (cl_context_properties) wglGetCurrentContext(), CL_WGL_HDC_KHR, (cl_context_properties) wglGetCurrentDC(), CL_CONTEXT_PLATFORM, (cl_context_properties) plat, 0}; //Probably won't work because &dev should correspond to glContext *ctx = clCreateContext(cps, 1, &dev, NULL, NULL, &status); CHECK_CL_ERROR(status, "clCreateContext"); #else // Linux cl_context_properties cps[] = { CL_GL_CONTEXT_KHR, ( cl_context_properties) glXGetCurrentContext(), CL_GLX_DISPLAY_KHR, (cl_context_properties) glXGetCurrentDisplay(), CL_CONTEXT_PLATFORM, (cl_context_properties) plat, 0 }; //Probably won't work because &dev should correspond to glContext *ctx = clCreateContext(cps, 1, &dev, NULL, NULL, &status); CHECK_CL_ERROR(status, "clCreateContext"); #endif #else // create a context cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties) plat, 0 }; // create a command queue cl_command_queue_properties qprops = 0; if (enable_profiling) qprops |= CL_QUEUE_PROFILING_ENABLE; *queue = clCreateCommandQueue(*ctx, dev, qprops, &status); CHECK_CL_ERROR(status, "clCreateCommandQueue"); #endif // *ctx = clCreateContext( // cps, 1, &dev, NULL, NULL, &status); // CHECK_CL_ERROR(status, "clCreateContext"); // // create a command queue cl_command_queue_properties qprops = 0; if (enable_profiling) qprops |= CL_QUEUE_PROFILING_ENABLE; *queue = clCreateCommandQueue(*ctx, dev, qprops, &status); CHECK_CL_ERROR(status, "clCreateCommandQueue"); return; } else --idx; } } free(devices); } } free(platforms); fputs("create_context_on: specified device not found.\n", stderr); abort(); }
int main() { int enable_profiling = 0; #ifdef DO_TIMING enable_profiling = 1; #endif //print_platforms_devices(); cl_context ctx; cl_command_queue queue; create_context_on("NVIDIA", NULL, 0, &ctx, &queue, enable_profiling); // -------------------------------------------------------------------------- // load kernels // -------------------------------------------------------------------------- // read the cl file char buf[100]; sprintf(buf, "mg-kernel-ver%d.cl", VERSION); char *knl_text = read_file(buf); //get work group dimensions and gflop info. int wg_dims , wg_x, wg_y, wg_z, z_div, fetch_per_pt, flops_per_pt; if (sscanf(knl_text, "// workgroup: (%d,%d,%d) z_div:%d fetch_per_pt:%d flops_per_pt:%d", &wg_x, &wg_y, &wg_z, &z_div, &fetch_per_pt, &flops_per_pt) == 6) { wg_dims = 3; } else if (sscanf(knl_text, "// workgroup: (%d,%d) fetch_per_pt:%d flops_per_pt:%d", &wg_x, &wg_y, &fetch_per_pt, &flops_per_pt) == 4) { wg_dims = 2; wg_z = -1; z_div = -1; } else { perror("reading workgroup spec"); abort(); } #ifdef USE_DOUBLE char *compile_opt = "-DFTYPE=double"; #else char *compile_opt = "-DFTYPE=float"; #endif // creation of the kernel cl_kernel poisson_knl = kernel_from_string(ctx, knl_text, "fd_update", compile_opt); free(knl_text); // my compiler complains about this one. OJO!! // -------------------------------------------------------------------------- // set up grid // -------------------------------------------------------------------------- const unsigned points = POINTS; const ftype minus_bdry = -1, plus_bdry = 1; // We're dividing into (points-1) intervals. ftype dx = (plus_bdry-minus_bdry)/(points-1); // -------------------------------------------------------------------------- // allocate and initialize CPU memory // -------------------------------------------------------------------------- int use_alignment; unsigned dim_other = points; //if order 2 then 1 point extra on each side #ifdef USE_ALIGNMENT // adjusts dimension so that the next row starts in a number divisible by 16 unsigned dim_x = ((dim_other + 15) / 16) * 16; unsigned field_start = 0; use_alignment = 1; #else unsigned dim_x = dim_other; unsigned field_start = 0;// this one puts me right at the beginning use_alignment = 0; #endif // --------Allocate forcing uexact, r and u vectors ------------------------- const size_t field_size = 0+dim_x*dim_x*dim_x; // extra large to fit the 2^n constrain in GPU ftype *f = malloc(field_size*sizeof(ftype)); CHECK_SYS_ERROR(!f, "allocating f"); ftype *u = malloc (field_size*sizeof(ftype)); CHECK_SYS_ERROR(!u, "allocating u"); ftype *uexact = malloc (field_size*sizeof(ftype)); CHECK_SYS_ERROR(!uexact, "allocating uexact"); ftype *r = malloc(field_size * sizeof(ftype)); CHECK_SYS_ERROR(!r, "allocating residual r"); // -------------------------------------------------------------------------- // initialize // -------------------------------------------------------------------------- // zero out (necessary to initialize everything bec. I measure norms) for (size_t i = 0; i < field_size; ++i){ f[i] = 0; u[i] = 0; uexact[i] = 0; r[i] = 0; } // set up the forcing field init_f (points, f, dx, field_start, dim_x, dim_other, minus_bdry); // Initialize u with initial boundary conditions init_u ( points, u , minus_bdry, plus_bdry, dx, field_start, dim_x, dim_other); // Initialize the exact solution init_uexact(points, u, uexact, dx, field_size, field_start, dim_x, dim_other); // -------------------------------------------------------------------------- // Setup the v-cycles // -------------------------------------------------------------------------- unsigned n1, n2, n3, ncycles; n1 = 50; n2 = 60; n3 = 1; ncycles = 2; ftype *sweeps = malloc (ncycles*sizeof(ftype)); ftype *rnorm = malloc (ncycles*sizeof(ftype)); ftype *enorm = malloc (ncycles*sizeof(ftype)); ftype rtol = 1.0e-05; // Find the norm of the residual (choose your method) sweeps[0] =0; resid (r, f, u, dx, field_size, field_start, dim_x, dim_other); rnorm[0] = norm( r , field_size) * dx; U_error(u, uexact, r, field_size); enorm[0] = norm( r, field_size ) * dx; for(unsigned icycle = 1; icycle <= ncycles; icycle++){ mgv(f, u, dx, n1, n2, n3, field_size, points, use_alignment, dim_x, ctx, queue, poisson_knl, wg_dims , wg_x, wg_y, wg_z, z_div, fetch_per_pt, flops_per_pt); //update u through a v-cycle sweeps[icycle] = sweeps[icycle -1] + (4 * (n1 + n2)/3); resid (r, f, u, dx, field_size, field_start, dim_x, dim_other); rnorm[icycle] = norm( r, field_size ) * dx; U_error(u, uexact, r, field_size); enorm[icycle] = norm( r, field_size ) * dx; //cfacts = (rnorm(icycle)/rnorm(icycle - 1))^(1 / (n1 + n2)) not necessary //disp something here if I want to. //printf("norm of the cycle %f", enorm[icycle]); if(rnorm[icycle] <= rtol * rnorm[0]) break; } #ifdef DO_TIMING printf(" ftype:%d ver:%d align:%d pts:%d\tgflops:%.1f\tmcells:%.1f\tgbytes:%.1f [/sec]\tout_gflops:%.6f\n", (int) sizeof(ftype), VERSION, use_alignment, points, gflops_performed/seconds_taken, mcells_updated/seconds_taken, gbytes_accessed/seconds_taken, gflops_performed/tot_secs); #endif // -------------------------------------------------------------------------- // clean up // -------------------------------------------------------------------------- CALL_CL_GUARDED(clReleaseKernel, (poisson_knl)); CALL_CL_GUARDED(clReleaseCommandQueue, (queue)); CALL_CL_GUARDED(clReleaseContext, (ctx)); }
void free_sampler(sampler* samp){ /* Free all resources allocated by the sampler and the sampler itself. */ // free up OpenCL memory CALL_CL_GUARDED(clReleaseMemObject, (samp->X_red_device)); CALL_CL_GUARDED(clReleaseMemObject, (samp->log_pdf_red_device)); CALL_CL_GUARDED(clReleaseMemObject, (samp->X_red_save)); CALL_CL_GUARDED(clReleaseMemObject, (samp->X_black_device)); CALL_CL_GUARDED(clReleaseMemObject, (samp->log_pdf_black_device)); CALL_CL_GUARDED(clReleaseMemObject, (samp->X_black_save)); CALL_CL_GUARDED(clReleaseMemObject, (samp->accepted_device)); CALL_CL_GUARDED(clReleaseMemObject, (samp->data_device)); CALL_CL_GUARDED(clReleaseMemObject, (samp->ranluxcltab)); CALL_CL_GUARDED(clReleaseMemObject, (samp->data_st_device)); CALL_CL_GUARDED(clReleaseMemObject, (samp->indices_to_save_device)); // kernels, context and queues CALL_CL_GUARDED(clReleaseKernel, (samp->stretch_knl)); CALL_CL_GUARDED(clReleaseKernel, (samp->init_rand_lux_knl)); CALL_CL_GUARDED(clReleaseCommandQueue, (samp->queue)); CALL_CL_GUARDED(clReleaseCommandQueue, (samp->queue_mem)); CALL_CL_GUARDED(clReleaseContext, (samp->ctx)); // free host resources free(samp->X_red_host); free(samp->log_pdf_red_host); free(samp->X_black_host); free(samp->log_pdf_black_host); free(samp->samples_host); free(samp->accepted_host); free(samp->data_host); free(samp->data_st); free(samp->indices_to_save_host); // data resources free(samp->acor_times); free(samp->acor_pass); free(samp->means); free(samp->sigma); free(samp->err_bar); free(samp); }
int main(int argc, char **argv) { if (argc != 3) { fprintf(stderr, "need two arguments!\n"); abort(); } const long n = atol(argv[1]); const long size = n*n; const int ntrips = atoi(argv[2]); cl_context ctx; cl_command_queue queue; create_context_on(CHOOSE_INTERACTIVELY, CHOOSE_INTERACTIVELY, 0, &ctx, &queue, 0); cl_int status; // -------------------------------------------------------------------------- // load kernels // -------------------------------------------------------------------------- char *knl_text = read_file("transpose-soln.cl"); cl_kernel knl = kernel_from_string(ctx, knl_text, "transpose", NULL); free(knl_text); // -------------------------------------------------------------------------- // allocate and initialize CPU memory // -------------------------------------------------------------------------- #ifdef USE_PINNED cl_mem buf_a_host = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(value_type) * size, 0, &status); CHECK_CL_ERROR(status, "clCreateBuffer"); cl_mem buf_b_host = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(value_type) * size, 0, &status); CHECK_CL_ERROR(status, "clCreateBuffer"); value_type *a = (value_type *) clEnqueueMapBuffer(queue, buf_a_host, /*blocking*/ CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, /*offs*/ 0, sizeof(value_type)*size, 0, NULL, NULL, &status); CHECK_CL_ERROR(status, "clEnqueueMapBuffer"); value_type *b = (value_type *) clEnqueueMapBuffer(queue, buf_b_host, /*blocking*/ CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, /*offs*/ 0, sizeof(value_type)*size, 0, NULL, NULL, &status); CHECK_CL_ERROR(status, "clEnqueueMapBuffer"); #else value_type *a = (value_type *) malloc(sizeof(value_type) * size); if (!a) { perror("alloc x"); abort(); } value_type *b = (value_type *) malloc(sizeof(value_type) * size); if (!b) { perror("alloc y"); abort(); } #endif for (size_t j = 0; j < n; ++j) for (size_t i = 0; i < n; ++i) a[i + j*n] = i + j*n; // -------------------------------------------------------------------------- // allocate device memory // -------------------------------------------------------------------------- cl_mem buf_a = clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(value_type) * size, 0, &status); CHECK_CL_ERROR(status, "clCreateBuffer"); cl_mem buf_b = clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(value_type) * size, 0, &status); CHECK_CL_ERROR(status, "clCreateBuffer"); // -------------------------------------------------------------------------- // transfer to device // -------------------------------------------------------------------------- CALL_CL_GUARDED(clFinish, (queue)); timestamp_type time1, time2; get_timestamp(&time1); CALL_CL_GUARDED(clEnqueueWriteBuffer, ( queue, buf_a, /*blocking*/ CL_FALSE, /*offset*/ 0, size * sizeof(value_type), a, 0, NULL, NULL)); CALL_CL_GUARDED(clEnqueueWriteBuffer, ( queue, buf_b, /*blocking*/ CL_FALSE, /*offset*/ 0, size * sizeof(value_type), b, 0, NULL, NULL)); get_timestamp(&time2); double elapsed = timestamp_diff_in_seconds(time1,time2); printf("transfer: %f s\n", elapsed); printf("transfer: %f GB/s\n", 2*size*sizeof(value_type)/1e9/elapsed); // -------------------------------------------------------------------------- // run code on device // -------------------------------------------------------------------------- CALL_CL_GUARDED(clFinish, (queue)); get_timestamp(&time1); for (int trip = 0; trip < ntrips; ++trip) { SET_3_KERNEL_ARGS(knl, buf_a, buf_b, n); size_t ldim[] = { 16, 16 }; size_t gdim[] = { n, n }; CALL_CL_GUARDED(clEnqueueNDRangeKernel, (queue, knl, /*dimensions*/ 2, NULL, gdim, ldim, 0, NULL, NULL)); } CALL_CL_GUARDED(clFinish, (queue)); get_timestamp(&time2); elapsed = timestamp_diff_in_seconds(time1,time2)/ntrips; printf("%f s\n", elapsed); printf("%f GB/s\n", 2*size*sizeof(value_type)/1e9/elapsed); CALL_CL_GUARDED(clEnqueueReadBuffer, ( queue, buf_b, /*blocking*/ CL_FALSE, /*offset*/ 0, size * sizeof(value_type), b, 0, NULL, NULL)); CALL_CL_GUARDED(clFinish, (queue)); for (size_t i = 0; i < n; ++i) for (size_t j = 0; j < n; ++j) if (a[i + j*n] != b[j + i*n]) { printf("bad %d %d\n", i, j); abort(); } // -------------------------------------------------------------------------- // clean up // -------------------------------------------------------------------------- CALL_CL_GUARDED(clFinish, (queue)); CALL_CL_GUARDED(clReleaseMemObject, (buf_a)); CALL_CL_GUARDED(clReleaseMemObject, (buf_b)); CALL_CL_GUARDED(clReleaseKernel, (knl)); CALL_CL_GUARDED(clReleaseCommandQueue, (queue)); CALL_CL_GUARDED(clReleaseContext, (ctx)); #ifdef USE_PINNED CALL_CL_GUARDED(clReleaseMemObject, (buf_a_host)); CALL_CL_GUARDED(clReleaseMemObject, (buf_b_host)); #else free(a); free(b); #endif return 0; }
void chcg(float k,struct parameter * p_param, cl_mem temp,cl_mem rhs, cl_mem temp2, bool *fail, cl_mem temp3,cl_mem temp4,cl_mem temp5,cl_mem temp6,cl_mem temp7,cl_mem temp8,cl_mem temp9,cl_kernel fft_2D, cl_kernel fft_2D_clean, cl_kernel fft_init_w,cl_kernel vec_add, cl_kernel vec_zero, cl_kernel mat_trans,cl_kernel mat_trans_3D,cl_kernel reduct,cl_kernel reduct_init, cl_kernel reduct_mul,cl_kernel resid, cl_kernel resid_init,cl_command_queue queue) { * fail = false; int N = p_param->N; // fft2(rk)./q & rk =rhs fft_d_q(rhs,temp2,temp9,temp3,N,p_param->epsilon,k, p_param->s,fft_2D,fft_2D_clean,mat_trans,mat_trans_3D,queue); //CALL_CL_GUARDED(clFinish, (queue)); //printf("I am here!\n"); #if 0 float test; CALL_CL_GUARDED(clFinish, (queue)); CALL_CL_GUARDED(clEnqueueReadBuffer, ( queue, rhs, /*blocking*/ CL_TRUE, /*offset*/ 0, sizeof(float), &test, 0, NULL, NULL)); printf("test success and %f \n",test); #endif fft2D(temp2,temp3,temp9,temp4,N,fft_2D,fft_2D_clean,mat_trans,mat_trans_3D, queue,-1); //------------------------ //temp3 = zk temp =unew //------------------------ // linvzk = real(ifft2(fft2(zk)./nlap_s2)); real part; fft_d_nlaps2(temp3,temp2,temp9,temp4,N,p_param->epsilon,k,p_param->s,fft_2D,fft_2D_clean,mat_trans,mat_trans_3D,queue); #if 0 float test; CALL_CL_GUARDED(clFinish, (queue)); CALL_CL_GUARDED(clEnqueueReadBuffer, ( queue, temp2, /*blocking*/ CL_TRUE, /*offset*/ 0, sizeof(float), &test, 0, NULL, NULL)); printf("test success and temp2 %f \n",test); #endif fft2D(temp2,temp4,temp9,temp5,N,fft_2D,fft_2D_clean,mat_trans,mat_trans_3D,queue,-1); //----------------------------- //linvzk = temp4 //----------------------------- //----------------------------------- //pk = temp5, temp8 = rk //------------------------------------- clEnqueueCopyBuffer(queue,temp3,temp5, 0,0, sizeof(float)*N*N*2,0,NULL,NULL); //vec_copy(rhs,temp8,2*N*N,p_knl,queue); clEnqueueCopyBuffer(queue,rhs,temp8, 0,0, sizeof(float)*N*N*2,0,NULL,NULL); //------------------ //temp2 = xk //-------------------- vec__zero(temp2,(2*N*N),vec_zero,queue); CALL_CL_GUARDED(clFinish, (queue)); //printf("I am here!\n"); #if 0 float test; CALL_CL_GUARDED(clFinish, (queue)); CALL_CL_GUARDED(clEnqueueReadBuffer, ( queue, temp4, /*blocking*/ CL_TRUE, /*offset*/ 0, sizeof(float), &test, 0, NULL, NULL)); printf("test success and %f \n",test); #endif float ipnew = reduction_mult(temp4, rhs,temp9,(N*N),reduct_mul,reduct,queue); float ipold; float reside = 1; int iter = 0; float cgalpha; float beta; while(reside > p_param->cgtol && iter < p_param->maxCG) { ipold = ipnew; //printf("ipold = %f\n",ipnew); //fft((3*u.^2 -1) .* pk).*nlap_s fft_w(temp, temp5, temp7,temp4,N,p_param->epsilon,k,p_param->s,fft_2D,fft_init_w,fft_2D_clean,mat_trans,mat_trans_3D,queue); fft2D(temp4, temp6,temp9,temp3,N,fft_2D,fft_2D_clean,mat_trans,mat_trans_3D,queue,-1); #if 0 float test; CALL_CL_GUARDED(clFinish, (queue)); CALL_CL_GUARDED(clEnqueueReadBuffer, ( queue, temp6, /*blocking*/ CL_TRUE, /*offset*/ 0, sizeof(float), &test, 0, NULL, NULL)); printf("test success and Apk = %f \n",test*k/p_param->epsilon); #endif //--------------------------- // temp4 = pk + Apk1 //--------------------------- vec__add(temp5, temp6, temp4,1,k/p_param->epsilon,2*N*N,vec_add, queue); //Apk2 = -alpha*k*epsilon*real(ifft2(fft2(pk).*sharmonic)); fft_shar(temp5, temp6,temp9,temp7,N, p_param->epsilon,k,p_param->s,fft_2D,fft_2D_clean,mat_trans,mat_trans_3D,queue); fft2D(temp6, temp7,temp9,temp3,N,fft_2D,fft_2D_clean,mat_trans,mat_trans_3D,queue,-1); //temp4 = Apk vec__add(temp4, temp7, temp4, 1, -k*p_param->epsilon, 2*N*N,vec_add, queue); //linvpk = real(ifft2(fft2(pk)./nlap_s2)); temp6 = linvpk fft_d_nlaps2(temp5,temp7,temp9,temp3,N,p_param->epsilon,k,p_param->s,fft_2D,fft_2D_clean,mat_trans,mat_trans_3D,queue); fft2D(temp7,temp6,temp9,temp3,N,fft_2D,fft_2D_clean,mat_trans,mat_trans_3D,queue,-1); if( ipold > 1e-9) cgalpha = ipold / reduction_mult(temp6, temp4,temp9,N*N,reduct_mul,reduct,queue); else cgalpha =0; vec__add(temp2, temp5,temp2,1, cgalpha,2*N*N,vec_add,queue); //--------------------- // update temp8 =rk //--------------------- vec__add(temp8 , temp4, temp8, 1, -cgalpha,2*N*N,vec_add,queue); //temp3 = zk fft_d_q(temp8,temp6,temp9,temp3,N,p_param->epsilon,k,p_param->s,fft_2D,fft_2D_clean,mat_trans,mat_trans_3D,queue); fft2D(temp6,temp3,temp9,temp4,N,fft_2D,fft_2D_clean,mat_trans,mat_trans_3D,queue,-1); //linvzk = real(ifft2(fft2(zk)./nlap_s2)); temp6 = linvzk fft_d_nlaps2(temp3,temp7,temp9,temp4,N,p_param->epsilon,k,p_param->s,fft_2D,fft_2D_clean,mat_trans,mat_trans_3D,queue); fft2D(temp7,temp6,temp9,temp4,N,fft_2D,fft_2D_clean,mat_trans,mat_trans_3D,queue,-1); ipnew = reduction_mult(temp8, temp6,temp9,N*N,reduct_mul,reduct,queue); CALL_CL_GUARDED(clFinish, (queue)); //printf("ipnew = %f\n",ipnew); #if 0 float test; CALL_CL_GUARDED(clFinish, (queue)); CALL_CL_GUARDED(clEnqueueReadBuffer, ( queue, temp8, /*blocking*/ CL_TRUE, /*offset*/ 0, sizeof(float), &test, 0, NULL, NULL)); printf("test success and %f \n",test); #endif if (ipold >1e-9) beta = ipnew /ipold; else beta =0; vec__add(temp3,temp5,temp5,1,beta,2*N*N,vec_add,queue); //printf("In cg step here!\n"); reside = residual(temp8,temp9,resid,resid_init,queue,N*N); CALL_CL_GUARDED(clFinish, (queue)); //printf("cg residual is %f\n",reside); iter ++; p_param->cgloc ++; } if( reside > p_param->cgtol) { printf("too many CG steps\n"); *fail = true; } }
float reduct_energy(cl_mem a,cl_mem b, cl_mem c,cl_mem d, int N, float epsilon, cl_kernel reduct_eng,cl_kernel reduct, cl_command_queue queue) { int n = N ; float output; //CALL_CL_GUARDED(clFinish, (queue)); //printf("aha, n = %d\n",n); if(n > 128) { SET_6_KERNEL_ARGS(reduct_eng, a, b, c, d,n,epsilon); size_t ldim[] = { 128 }; size_t gdim[] = { n }; CALL_CL_GUARDED(clEnqueueNDRangeKernel, (queue, reduct_eng, /*dimensions*/ 1, NULL, gdim, ldim, 0, NULL, NULL)); n /= 128; CALL_CL_GUARDED(clFinish, (queue)); //printf("aha, n = %d\n",n); while(n>=128) { SET_2_KERNEL_ARGS(reduct, d, n); size_t ldim[] = { 128 }; size_t gdim[] = { n }; CALL_CL_GUARDED(clEnqueueNDRangeKernel, (queue, reduct, /*dimensions*/ 1, NULL, gdim, ldim, 0, NULL, NULL)); n /= 128; CALL_CL_GUARDED(clFinish, (queue)); //printf("aha, n = %d\n",n); } if(n != 1) { SET_2_KERNEL_ARGS(reduct, d, n); size_t ldim[] = { n }; size_t gdim[] = { n }; CALL_CL_GUARDED(clEnqueueNDRangeKernel, (queue, reduct, /*dimensions*/ 1, NULL, gdim, ldim, 0, NULL, NULL)); } CALL_CL_GUARDED(clEnqueueReadBuffer, ( queue, d, /*blocking*/ CL_TRUE, /*offset*/ 0, sizeof(float), &output, 0, NULL, NULL)); } else { SET_6_KERNEL_ARGS(reduct_eng, a, b, c, d,n,epsilon); size_t ldim[] = { n }; size_t gdim[] = { n }; CALL_CL_GUARDED(clEnqueueNDRangeKernel, (queue, reduct_eng, /*dimensions*/ 1, NULL, gdim, ldim, 0, NULL, NULL)); CALL_CL_GUARDED(clEnqueueReadBuffer, ( queue, d, /*blocking*/ CL_TRUE, /*offset*/ 0, sizeof(float), &output, 0, NULL, NULL)); } CALL_CL_GUARDED(clFinish, (queue)); //printf("reduct energy = %f\n",output); return output; }