static void release_opencl()
{
  DTIMER_START(T_RELEASE);

  free(g_chk);

  clReleaseMemObject(m_u);
  clReleaseMemObject(m_u0);
  clReleaseMemObject(m_u1);
  clReleaseMemObject(m_twiddle);
  if (device_type == CL_DEVICE_TYPE_CPU) {
    clReleaseMemObject(m_ty1);
    clReleaseMemObject(m_ty2);
  }
  clReleaseMemObject(m_chk);

  clReleaseKernel(k_compute_indexmap);
  clReleaseKernel(k_compute_ics);
  clReleaseKernel(k_cffts1);
  clReleaseKernel(k_cffts2);
  clReleaseKernel(k_cffts3);
  clReleaseKernel(k_evolve);
  clReleaseKernel(k_checksum);
  clReleaseProgram(program);
  clReleaseCommandQueue(cmd_queue);
  clReleaseContext(context);

  DTIMER_STOP(T_RELEASE);

#ifdef TIMER_DETAIL
  print_opencl_timers();
#endif
}
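//---------------------------------------------------------------------
// Note: DTIMER_START/DTIMER_STOP and CHECK_FINISH are used throughout
// these listings but defined in a shared header that is not shown here.
// The block below is only a plausible sketch of their behavior -- detail
// timers that compile away unless TIMER_DETAIL is defined, plus a queue
// drain so per-kernel timings are meaningful. The real definitions may
// differ.
//---------------------------------------------------------------------
#ifdef TIMER_DETAIL
#define DTIMER_START(id)    timer_start(id)
#define DTIMER_STOP(id)     timer_stop(id)
#define CHECK_FINISH()      clFinish(cmd_queue)
#else
#define DTIMER_START(id)
#define DTIMER_STOP(id)
#define CHECK_FINISH()
#endif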
//---------------------------------------------------------------------
// set the boundary values of dependent variables
//---------------------------------------------------------------------
void setbv()
{
  DTIMER_START(t_setbv);

  cl_int ecode;

  ecode = clEnqueueNDRangeKernel(cmd_queue, k_setbv1, SETBV1_DIM, NULL,
                                 setbv1_gws, setbv1_lws,
                                 0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueNDRangeKernel()");

  ecode = clEnqueueNDRangeKernel(cmd_queue, k_setbv2, SETBV2_DIM, NULL,
                                 setbv2_gws, setbv2_lws,
                                 0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueNDRangeKernel()");

  ecode = clEnqueueNDRangeKernel(cmd_queue, k_setbv3, SETBV3_DIM, NULL,
                                 setbv3_gws, setbv3_lws,
                                 0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueNDRangeKernel()");

  CHECK_FINISH();
  DTIMER_STOP(t_setbv);
}
void release_opencl()
{
  DTIMER_START(T_RELEASE);
  clReleaseMemObject(pgq);
  clReleaseMemObject(pgsx);
  clReleaseMemObject(pgsy);
  clReleaseKernel(kernel);
  clReleaseProgram(program);
  clReleaseCommandQueue(cmd_queue);
  clReleaseContext(context);
  DTIMER_STOP(T_RELEASE);

#ifdef TIMER_DETAIL
  if (timers_enabled) {
    int i;
    double tt;
    double t_opencl = 0.0, t_buffer = 0.0, t_kernel = 0.0;
    unsigned cnt;

    for (i = T_OPENCL_API; i < T_END; i++)
      t_opencl += timer_read(i);

    for (i = T_BUFFER_CREATE; i <= T_BUFFER_WRITE; i++)
      t_buffer += timer_read(i);

    // EP has a single kernel timer.
    t_kernel = timer_read(T_KERNEL_EMBAR);

    printf("\nOpenCL timers -\n");
    printf("Kernel    : %9.3f (%.2f%%)\n",
           t_kernel, t_kernel/t_opencl * 100.0);

    cnt = timer_count(T_KERNEL_EMBAR);
    tt = timer_read(T_KERNEL_EMBAR);
    printf("- embar   : %9.3lf (%u, %.3f, %.2f%%)\n",
           tt, cnt, tt/cnt, tt/t_kernel * 100.0);

    printf("Buffer    : %9.3lf (%.2f%%)\n",
           t_buffer, t_buffer/t_opencl * 100.0);
    printf("- creation: %9.3lf\n", timer_read(T_BUFFER_CREATE));
    printf("- read    : %9.3lf\n", timer_read(T_BUFFER_READ));
    printf("- write   : %9.3lf\n", timer_read(T_BUFFER_WRITE));

    tt = timer_read(T_OPENCL_API);
    printf("API       : %9.3lf (%.2f%%)\n", tt, tt/t_opencl * 100.0);

    tt = timer_read(T_BUILD);
    printf("BUILD     : %9.3lf (%.2f%%)\n", tt, tt/t_opencl * 100.0);

    tt = timer_read(T_RELEASE);
    printf("RELEASE   : %9.3lf (%.2f%%)\n", tt, tt/t_opencl * 100.0);

    printf("Total     : %9.3lf\n", t_opencl);
  }
#endif
}
//---------------------------------------------------------------------
// touch all the big data
//---------------------------------------------------------------------
static void init_ui(cl_mem *u0, cl_mem *u1, cl_mem *twiddle,
                    int d1, int d2, int d3)
{
  cl_kernel k_init_ui;
  cl_int ecode;

  DTIMER_START(T_OPENCL_API);
  // Create a kernel
  k_init_ui = clCreateKernel(program, "init_ui", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for init_ui");
  DTIMER_STOP(T_OPENCL_API);

  int n = d3 * d2 * (d1+1);
  ecode  = clSetKernelArg(k_init_ui, 0, sizeof(cl_mem), (void*)u0);
  ecode |= clSetKernelArg(k_init_ui, 1, sizeof(cl_mem), (void*)u1);
  ecode |= clSetKernelArg(k_init_ui, 2, sizeof(cl_mem), (void*)twiddle);
  ecode |= clSetKernelArg(k_init_ui, 3, sizeof(int), (void*)&n);
  clu_CheckError(ecode, "clSetKernelArg() for init_ui");

  size_t local_ws  = work_item_sizes[0];
  size_t global_ws = clu_RoundWorkSize((size_t)n, local_ws);
  ecode = clEnqueueNDRangeKernel(cmd_queue, k_init_ui, 1, NULL,
                                 &global_ws, &local_ws,
                                 0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueNDRangeKernel() for init_ui");

  ecode = clFinish(cmd_queue);
  clu_CheckError(ecode, "clFinish()");

  DTIMER_START(T_RELEASE);
  clReleaseKernel(k_init_ui);
  DTIMER_STOP(T_RELEASE);
}
void create_seq(double seed, double a)
{
  cl_kernel k_cs;
  cl_int ecode;
  size_t cs_lws[1], cs_gws[1];

  DTIMER_START(T_OPENCL_API);
  // Create a kernel
  k_cs = clCreateKernel(program, "create_seq", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for create_seq");
  DTIMER_STOP(T_OPENCL_API);

  DTIMER_START(T_KERNEL_CREATE_SEQ);
  // Set kernel arguments
  ecode  = clSetKernelArg(k_cs, 0, sizeof(cl_mem), (void*)&m_key_array);
  ecode |= clSetKernelArg(k_cs, 1, sizeof(cl_double), (void*)&seed);
  ecode |= clSetKernelArg(k_cs, 2, sizeof(cl_double), (void*)&a);
  clu_CheckError(ecode, "clSetKernelArg() for create_seq");

  // Launch the kernel
  cs_lws[0] = CREATE_SEQ_GROUP_SIZE;
  cs_gws[0] = CREATE_SEQ_GLOBAL_SIZE;
  ecode = clEnqueueNDRangeKernel(cmd_queue, k_cs, 1, NULL,
                                 cs_gws, cs_lws,
                                 0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueNDRangeKernel() for create_seq");

  ecode = clFinish(cmd_queue);
  clu_CheckError(ecode, "clFinish");
  DTIMER_STOP(T_KERNEL_CREATE_SEQ);

  DTIMER_START(T_RELEASE);
  clReleaseKernel(k_cs);
  DTIMER_STOP(T_RELEASE);
}
//---------------------------------------------------------------------
// Fill in array u0 with initial conditions from
// random number generator
//---------------------------------------------------------------------
static void compute_initial_conditions(cl_mem *u0, int d1, int d2, int d3)
{
  int k;
  double start, an, dummy, starts[NZ];
  size_t local_ws, global_ws, temp;
  cl_mem m_starts;
  cl_int ecode;

  start = SEED;

  //---------------------------------------------------------------------
  // Jump to the starting element for our first plane.
  //---------------------------------------------------------------------
  an = ipow46(A, 0);
  dummy = randlc(&start, an);
  an = ipow46(A, 2*NX*NY);

  starts[0] = start;
  for (k = 1; k < dims[2]; k++) {
    dummy = randlc(&start, an);
    starts[k] = start;
  }

  if (device_type == CL_DEVICE_TYPE_CPU) {
    m_starts = clCreateBuffer(context,
                              CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
                              sizeof(double) * NZ,
                              starts,
                              &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_starts");

    local_ws  = 1;
    global_ws = clu_RoundWorkSize((size_t)d2, local_ws);
  } else { //GPU
    m_starts = clCreateBuffer(context,
                              CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                              sizeof(double) * NZ,
                              starts,
                              &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_starts");

    temp = d2 / max_compute_units;
    local_ws  = temp == 0 ? 1
              : ((temp > work_item_sizes[0]) ? work_item_sizes[0] : temp);
    global_ws = clu_RoundWorkSize((size_t)d2, local_ws);
  }

  ecode  = clSetKernelArg(k_compute_ics, 0, sizeof(cl_mem), u0);
  ecode |= clSetKernelArg(k_compute_ics, 1, sizeof(cl_mem), &m_starts);
  clu_CheckError(ecode, "clSetKernelArg() for compute_initial_conditions");

  ecode = clEnqueueNDRangeKernel(cmd_queue, k_compute_ics, 1, NULL,
                                 &global_ws, &local_ws,
                                 0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueNDRangeKernel()");

  ecode = clFinish(cmd_queue);
  clu_CheckError(ecode, "clFinish()");

  DTIMER_START(T_RELEASE);
  clReleaseMemObject(m_starts);
  DTIMER_STOP(T_RELEASE);
}
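//---------------------------------------------------------------------
// clu_RoundWorkSize() appears in every launch-size computation in these
// listings. Its source is not shown; the sketch below captures the
// assumed contract: round the global work size up to the next multiple
// of the work-group size, since OpenCL requires the global size to be
// divisible by the local size (kernels are then expected to bounds-check
// the padded work-items).
//---------------------------------------------------------------------
size_t clu_RoundWorkSize(size_t work_size, size_t group_size)
{
  size_t rem = work_size % group_size;
  // No padding needed when already a multiple of group_size.
  return (rem == 0) ? work_size : work_size + (group_size - rem);
}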
int main(int argc, char *argv[])
{
  int i;
  int iter;
  double total_time, mflops;
  logical verified;
  char Class;

  if (argc == 1) {
    fprintf(stderr, "Usage: %s <kernel directory>\n", argv[0]);
    exit(-1);
  }

  //---------------------------------------------------------------------
  // Run the entire problem once to make sure all data is touched.
  // This reduces variable startup costs, which is important for such a
  // short benchmark. The other NPB 2 implementations are similar.
  //---------------------------------------------------------------------
  for (i = 1; i <= T_max; i++) {
    timer_clear(i);
  }

  setup();
  setup_opencl(argc, argv);

  init_ui(&m_u0, &m_u1, &m_twiddle, dims[0], dims[1], dims[2]);
  compute_indexmap(&m_twiddle, dims[0], dims[1], dims[2]);
  compute_initial_conditions(&m_u1, dims[0], dims[1], dims[2]);
  fft_init(dims[0]);
  fft(1, &m_u1, &m_u0);

  //---------------------------------------------------------------------
  // Start over from the beginning. Note that all operations must
  // be timed, in contrast to other benchmarks.
  //---------------------------------------------------------------------
  for (i = 1; i <= T_max; i++) {
    timer_clear(i);
  }

  timer_start(T_total);
  if (timers_enabled) timer_start(T_setup);

  DTIMER_START(T_compute_im);
  compute_indexmap(&m_twiddle, dims[0], dims[1], dims[2]);
  DTIMER_STOP(T_compute_im);

  DTIMER_START(T_compute_ics);
  compute_initial_conditions(&m_u1, dims[0], dims[1], dims[2]);
  DTIMER_STOP(T_compute_ics);

  DTIMER_START(T_fft_init);
  fft_init(dims[0]);
  DTIMER_STOP(T_fft_init);

  if (timers_enabled) timer_stop(T_setup);
  if (timers_enabled) timer_start(T_fft);
  fft(1, &m_u1, &m_u0);
  if (timers_enabled) timer_stop(T_fft);

  for (iter = 1; iter <= niter; iter++) {
    if (timers_enabled) timer_start(T_evolve);
    evolve(&m_u0, &m_u1, &m_twiddle, dims[0], dims[1], dims[2]);
    if (timers_enabled) timer_stop(T_evolve);

    if (timers_enabled) timer_start(T_fft);
    fft(-1, &m_u1, &m_u1);
    if (timers_enabled) timer_stop(T_fft);

    if (timers_enabled) timer_start(T_checksum);
    checksum(iter, &m_u1, dims[0], dims[1], dims[2]);
    if (timers_enabled) timer_stop(T_checksum);
  }

  verify(NX, NY, NZ, niter, &verified, &Class);

  timer_stop(T_total);
  total_time = timer_read(T_total);

  if (total_time != 0.0) {
    mflops = 1.0e-6 * (double)NTOTAL *
             (14.8157 + 7.19641 * log((double)NTOTAL)
              + (5.23518 + 7.21113 * log((double)NTOTAL)) * niter)
             / total_time;
  } else {
    mflops = 0.0;
  }

  c_print_results("FT", Class, NX, NY, NZ, niter,
                  total_time, mflops, " floating point", verified,
                  NPBVERSION, COMPILETIME, CS1, CS2, CS3, CS4, CS5, CS6, CS7,
                  clu_GetDeviceTypeName(device_type), device_name);

  if (timers_enabled) print_timers();

  release_opencl();

  fflush(stdout);

  return 0;
}
//---------------------------------------------------------------------
// Set up the OpenCL environment.
//---------------------------------------------------------------------
static void setup_opencl(int argc, char *argv[])
{
  size_t temp;
  cl_int ecode;
  char *source_dir = "FT";
  if (argc > 1) source_dir = argv[1];

#ifdef TIMER_DETAIL
  if (timers_enabled) {
    int i;
    for (i = T_OPENCL_API; i < T_END; i++) timer_clear(i);
  }
#endif

  DTIMER_START(T_OPENCL_API);

  // 1. Find the default device type and get a device for the device type
  device_type = clu_GetDefaultDeviceType();
  device = clu_GetAvailableDevice(device_type);
  device_name = clu_GetDeviceName(device);

  // Device information
  ecode = clGetDeviceInfo(device,
                          CL_DEVICE_MAX_WORK_ITEM_SIZES,
                          sizeof(work_item_sizes),
                          &work_item_sizes,
                          NULL);
  clu_CheckError(ecode, "clGetDeviceInfo()");

  ecode = clGetDeviceInfo(device,
                          CL_DEVICE_MAX_WORK_GROUP_SIZE,
                          sizeof(size_t),
                          &max_work_group_size,
                          NULL);
  clu_CheckError(ecode, "clGetDeviceInfo()");

  // FIXME: The below values are experimental.
  if (max_work_group_size > 64) {
    max_work_group_size = 64;
    int i;
    for (i = 0; i < 3; i++) {
      if (work_item_sizes[i] > 64) {
        work_item_sizes[i] = 64;
      }
    }
  }

  ecode = clGetDeviceInfo(device,
                          CL_DEVICE_MAX_COMPUTE_UNITS,
                          sizeof(cl_uint),
                          &max_compute_units,
                          NULL);
  clu_CheckError(ecode, "clGetDeviceInfo()");

  // 2. Create a context for the specified device
  context = clCreateContext(NULL, 1, &device, NULL, NULL, &ecode);
  clu_CheckError(ecode, "clCreateContext()");

  // 3. Create a command queue
  cmd_queue = clCreateCommandQueue(context, device, 0, &ecode);
  clu_CheckError(ecode, "clCreateCommandQueue()");

  DTIMER_STOP(T_OPENCL_API);

  // 4. Build the program
  DTIMER_START(T_BUILD);
  char *source_file;
  char build_option[50];
  if (device_type == CL_DEVICE_TYPE_CPU) {
    source_file = "ft_cpu.cl";
    sprintf(build_option, "-I. -DCLASS=%d -DUSE_CPU", CLASS);

    COMPUTE_IMAP_DIM = COMPUTE_IMAP_DIM_CPU;
    EVOLVE_DIM = EVOLVE_DIM_CPU;
    CFFTS_DIM = CFFTS_DIM_CPU;
  } else if (device_type == CL_DEVICE_TYPE_GPU) {
    char vendor[50];
    ecode = clGetDeviceInfo(device, CL_DEVICE_VENDOR, 50, vendor, NULL);
    clu_CheckError(ecode, "clGetDeviceInfo()");
    if (strncmp(vendor, DEV_VENDOR_NVIDIA, strlen(DEV_VENDOR_NVIDIA)) == 0) {
      source_file = "ft_gpu_nvidia.cl";
      CFFTS_LSIZE = 32;
    } else {
      source_file = "ft_gpu.cl";
      CFFTS_LSIZE = 64;
    }
    sprintf(build_option, "-I. -DCLASS=\'%c\' -DLSIZE=%lu",
            CLASS, CFFTS_LSIZE);

    COMPUTE_IMAP_DIM = COMPUTE_IMAP_DIM_GPU;
    EVOLVE_DIM = EVOLVE_DIM_GPU;
    CFFTS_DIM = CFFTS_DIM_GPU;
  } else {
    fprintf(stderr, "Set the environment variable OPENCL_DEVICE_TYPE!\n");
    exit(EXIT_FAILURE);
  }
  program = clu_MakeProgram(context, device, source_dir, source_file,
                            build_option);
  DTIMER_STOP(T_BUILD);

  // 5. Create buffers
  DTIMER_START(T_BUFFER_CREATE);
  m_u = clCreateBuffer(context, CL_MEM_READ_ONLY,
                       sizeof(dcomplex) * NXP, NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_u");

  m_u0 = clCreateBuffer(context, CL_MEM_READ_WRITE,
                        sizeof(dcomplex) * NTOTALP, NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_u0");

  m_u1 = clCreateBuffer(context, CL_MEM_READ_WRITE,
                        sizeof(dcomplex) * NTOTALP, NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_u1");

  m_twiddle = clCreateBuffer(context, CL_MEM_READ_WRITE,
                             sizeof(double) * NTOTALP, NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_twiddle");

  if (device_type == CL_DEVICE_TYPE_CPU) {
    size_t ty1_size, ty2_size;
    if (CFFTS_DIM == 2) {
      ty1_size = sizeof(dcomplex) * NX * NY * NZ;
      ty2_size = sizeof(dcomplex) * NX * NY * NZ;
    } else {
      fprintf(stderr, "Wrong CFFTS_DIM: %u\n", CFFTS_DIM);
      exit(EXIT_FAILURE);
    }

    m_ty1 = clCreateBuffer(context, CL_MEM_READ_WRITE, ty1_size, NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_ty1");
    m_ty2 = clCreateBuffer(context, CL_MEM_READ_WRITE, ty2_size, NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_ty2");
  }

  if (device_type == CL_DEVICE_TYPE_CPU) {
    temp = 1024 / max_compute_units;
    checksum_local_ws  = temp == 0 ? 1 : temp;
    checksum_global_ws = clu_RoundWorkSize((size_t)1024, checksum_local_ws);
  } else if (device_type == CL_DEVICE_TYPE_GPU) {
    checksum_local_ws  = 32;
    checksum_global_ws = clu_RoundWorkSize((size_t)1024, checksum_local_ws);
  }
  checksum_wg_num = checksum_global_ws / checksum_local_ws;
  m_chk = clCreateBuffer(context, CL_MEM_READ_WRITE,
                         sizeof(dcomplex) * checksum_wg_num, NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_chk");
  g_chk = (dcomplex *)malloc(sizeof(dcomplex) * checksum_wg_num);
  DTIMER_STOP(T_BUFFER_CREATE);

  // 6. Create kernels
  DTIMER_START(T_OPENCL_API);
  double ap = -4.0 * ALPHA * PI * PI;
  int d1 = dims[0];
  int d2 = dims[1];
  int d3 = dims[2];

  k_compute_indexmap = clCreateKernel(program, "compute_indexmap", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for compute_indexmap");
  ecode  = clSetKernelArg(k_compute_indexmap, 0, sizeof(cl_mem), &m_twiddle);
  ecode |= clSetKernelArg(k_compute_indexmap, 1, sizeof(int), &d1);
  ecode |= clSetKernelArg(k_compute_indexmap, 2, sizeof(int), &d2);
  ecode |= clSetKernelArg(k_compute_indexmap, 3, sizeof(int), &d3);
  ecode |= clSetKernelArg(k_compute_indexmap, 4, sizeof(double), &ap);
  clu_CheckError(ecode, "clSetKernelArg() for compute_indexmap");

  if (COMPUTE_IMAP_DIM == 3) {
    cimap_lws[0] = d1 < work_item_sizes[0] ? d1 : work_item_sizes[0];
    temp = max_work_group_size / cimap_lws[0];
    cimap_lws[1] = d2 < temp ? d2 : temp;
    temp = temp / cimap_lws[1];
    cimap_lws[2] = d3 < temp ? d3 : temp;
    cimap_gws[0] = clu_RoundWorkSize((size_t)d1, cimap_lws[0]);
    cimap_gws[1] = clu_RoundWorkSize((size_t)d2, cimap_lws[1]);
    cimap_gws[2] = clu_RoundWorkSize((size_t)d3, cimap_lws[2]);
  } else if (COMPUTE_IMAP_DIM == 2) {
    cimap_lws[0] = d2 < work_item_sizes[0] ? d2 : work_item_sizes[0];
    temp = max_work_group_size / cimap_lws[0];
    cimap_lws[1] = d3 < temp ? d3 : temp;
    cimap_gws[0] = clu_RoundWorkSize((size_t)d2, cimap_lws[0]);
    cimap_gws[1] = clu_RoundWorkSize((size_t)d3, cimap_lws[1]);
  } else {
    //temp = d3 / max_compute_units;
    temp = 1;
    cimap_lws[0] = temp == 0 ? 1 : temp;
    cimap_gws[0] = clu_RoundWorkSize((size_t)d3, cimap_lws[0]);
  }

  k_compute_ics = clCreateKernel(program, "compute_initial_conditions",
                                 &ecode);
  clu_CheckError(ecode, "clCreateKernel() for compute_initial_conditions");
  ecode  = clSetKernelArg(k_compute_ics, 2, sizeof(int), &d1);
  ecode |= clSetKernelArg(k_compute_ics, 3, sizeof(int), &d2);
  ecode |= clSetKernelArg(k_compute_ics, 4, sizeof(int), &d3);
  clu_CheckError(ecode, "clSetKernelArg() for compute_initial_conditions");

  k_cffts1 = clCreateKernel(program, "cffts1", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for cffts1");
  ecode = clSetKernelArg(k_cffts1, 2, sizeof(cl_mem), &m_u);
  if (device_type == CL_DEVICE_TYPE_CPU) {
    ecode |= clSetKernelArg(k_cffts1, 8, sizeof(cl_mem), &m_ty1);
    ecode |= clSetKernelArg(k_cffts1, 9, sizeof(cl_mem), &m_ty2);
  }
  clu_CheckError(ecode, "clSetKernelArg() for k_cffts1");

  k_cffts2 = clCreateKernel(program, "cffts2", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for cffts2");
  ecode = clSetKernelArg(k_cffts2, 2, sizeof(cl_mem), &m_u);
  if (device_type == CL_DEVICE_TYPE_CPU) {
    ecode |= clSetKernelArg(k_cffts2, 8, sizeof(cl_mem), &m_ty1);
    ecode |= clSetKernelArg(k_cffts2, 9, sizeof(cl_mem), &m_ty2);
  }
  clu_CheckError(ecode, "clSetKernelArg() for k_cffts2");

  k_cffts3 = clCreateKernel(program, "cffts3", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for cffts3");
  ecode = clSetKernelArg(k_cffts3, 2, sizeof(cl_mem), &m_u);
  if (device_type == CL_DEVICE_TYPE_CPU) {
    ecode |= clSetKernelArg(k_cffts3, 8, sizeof(cl_mem), &m_ty1);
    ecode |= clSetKernelArg(k_cffts3, 9, sizeof(cl_mem), &m_ty2);
  }
  clu_CheckError(ecode, "clSetKernelArg() for k_cffts3");

  k_evolve = clCreateKernel(program, "evolve", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for evolve");

  k_checksum = clCreateKernel(program, "checksum", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for checksum");
  ecode  = clSetKernelArg(k_checksum, 1, sizeof(cl_mem), &m_chk);
  ecode |= clSetKernelArg(k_checksum, 2, sizeof(dcomplex)*checksum_local_ws,
                          NULL);
  ecode |= clSetKernelArg(k_checksum, 3, sizeof(int), &dims[0]);
  ecode |= clSetKernelArg(k_checksum, 4, sizeof(int), &dims[1]);
  clu_CheckError(ecode, "clSetKernelArg() for checksum");
  DTIMER_STOP(T_OPENCL_API);
}
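//---------------------------------------------------------------------
// The checksum kernel configured above reduces within each work-group
// and writes one partial dcomplex sum per group into m_chk, which is
// why g_chk is sized to checksum_wg_num. The checksum() host function
// is not part of this listing; the sketch below shows how it presumably
// finishes the reduction. The dcmplx_* helper names follow the NPB C
// conventions and should be treated as assumptions.
//---------------------------------------------------------------------
static void checksum_host_reduce(int iteration)
{
  cl_int ecode;
  dcomplex chk = dcmplx(0.0, 0.0);
  size_t i;

  // Blocking read of the per-work-group partial sums.
  ecode = clEnqueueReadBuffer(cmd_queue, m_chk, CL_TRUE, 0,
                              sizeof(dcomplex) * checksum_wg_num,
                              g_chk, 0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueReadBuffer() for m_chk");

  // Finish the reduction on the host.
  for (i = 0; i < checksum_wg_num; i++) {
    chk = dcmplx_add(chk, g_chk[i]);
  }

  // Normalize by the problem size, as the NPB FT checksum requires.
  chk = dcmplx_div2(chk, (double)NTOTAL);
  printf(" T =%5d     Checksum =%22.12E%22.12E\n",
         iteration, chk.real, chk.imag);
}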
static void release_opencl()
{
  DTIMER_START(T_RELEASE);
  clReleaseMemObject(m_key_array);
  clReleaseMemObject(m_key_buff1);
  clReleaseMemObject(m_key_buff2);
  clReleaseMemObject(m_index_array);
  clReleaseMemObject(m_rank_array);
  clReleaseMemObject(m_partial_vals);
  clReleaseMemObject(m_passed_verification);
  if (device_type == CL_DEVICE_TYPE_GPU) {
    clReleaseMemObject(m_key_scan);
    clReleaseMemObject(m_sum);
  } else {
    clReleaseMemObject(m_bucket_ptrs);
    clReleaseMemObject(m_bucket_size);
  }
  clReleaseKernel(k_rank0);
  clReleaseKernel(k_rank1);
  clReleaseKernel(k_rank2);
  if (device_type == CL_DEVICE_TYPE_GPU) {
    clReleaseKernel(k_rank3_0);
    clReleaseKernel(k_rank3_1);
    clReleaseKernel(k_rank3_2);
  } else {
    clReleaseKernel(k_rank3);
  }
  clReleaseKernel(k_rank4);
  clReleaseProgram(program);
  clReleaseCommandQueue(cmd_queue);
  clReleaseContext(context);
  DTIMER_STOP(T_RELEASE);

#ifdef TIMER_DETAIL
  if (timer_on) {
    int i;
    double tt;
    double t_opencl = 0.0, t_buffer = 0.0, t_kernel = 0.0;
    unsigned cnt;

    for (i = T_OPENCL_API; i < T_END; i++)
      t_opencl += timer_read(i);

    for (i = T_BUFFER_CREATE; i <= T_BUFFER_WRITE; i++)
      t_buffer += timer_read(i);

    for (i = T_KERNEL_CREATE_SEQ; i <= T_KERNEL_FV2; i++)
      t_kernel += timer_read(i);

    printf("\nOpenCL timers -\n");
    printf("Kernel      : %9.3f (%.2f%%)\n",
           t_kernel, t_kernel/t_opencl * 100.0);

    cnt = timer_count(T_KERNEL_CREATE_SEQ);
    tt = timer_read(T_KERNEL_CREATE_SEQ);
    printf("- create_seq: %9.3lf (%u, %.3f, %.2f%%)\n",
           tt, cnt, tt/cnt, tt/t_kernel * 100.0);

    cnt = timer_count(T_KERNEL_RANK0);
    tt = timer_read(T_KERNEL_RANK0);
    printf("- rank0     : %9.3lf (%u, %.3f, %.2f%%)\n",
           tt, cnt, tt/cnt, tt/t_kernel * 100.0);

    cnt = timer_count(T_KERNEL_RANK1);
    tt = timer_read(T_KERNEL_RANK1);
    printf("- rank1     : %9.3lf (%u, %.3f, %.2f%%)\n",
           tt, cnt, tt/cnt, tt/t_kernel * 100.0);

    cnt = timer_count(T_KERNEL_RANK2);
    tt = timer_read(T_KERNEL_RANK2);
    printf("- rank2     : %9.3lf (%u, %.3f, %.2f%%)\n",
           tt, cnt, tt/cnt, tt/t_kernel * 100.0);

    cnt = timer_count(T_KERNEL_RANK3);
    tt = timer_read(T_KERNEL_RANK3);
    printf("- rank3     : %9.3lf (%u, %.3f, %.2f%%)\n",
           tt, cnt, tt/cnt, tt/t_kernel * 100.0);

    cnt = timer_count(T_KERNEL_RANK4);
    tt = timer_read(T_KERNEL_RANK4);
    printf("- rank4     : %9.3lf (%u, %.3f, %.2f%%)\n",
           tt, cnt, tt/cnt, tt/t_kernel * 100.0);

    cnt = timer_count(T_KERNEL_FV0);
    tt = timer_read(T_KERNEL_FV0);
    printf("- fv0       : %9.3lf (%u, %.3f, %.2f%%)\n",
           tt, cnt, tt/cnt, tt/t_kernel * 100.0);

    cnt = timer_count(T_KERNEL_FV1);
    tt = timer_read(T_KERNEL_FV1);
    printf("- fv1       : %9.3lf (%u, %.3f, %.2f%%)\n",
           tt, cnt, tt/cnt, tt/t_kernel * 100.0);

    cnt = timer_count(T_KERNEL_FV2);
    tt = timer_read(T_KERNEL_FV2);
    printf("- fv2       : %9.3lf (%u, %.3f, %.2f%%)\n",
           tt, cnt, tt/cnt, tt/t_kernel * 100.0);

    printf("Buffer      : %9.3lf (%.2f%%)\n",
           t_buffer, t_buffer/t_opencl * 100.0);
    printf("- creation  : %9.3lf\n", timer_read(T_BUFFER_CREATE));
    printf("- read      : %9.3lf\n", timer_read(T_BUFFER_READ));
    printf("- write     : %9.3lf\n", timer_read(T_BUFFER_WRITE));

    tt = timer_read(T_OPENCL_API);
    printf("API         : %9.3lf (%.2f%%)\n", tt, tt/t_opencl * 100.0);

    tt = timer_read(T_BUILD);
    printf("BUILD       : %9.3lf (%.2f%%)\n", tt, tt/t_opencl * 100.0);

    tt = timer_read(T_RELEASE);
    printf("RELEASE     : %9.3lf (%.2f%%)\n", tt, tt/t_opencl * 100.0);

    printf("Total       : %9.3lf\n", t_opencl);
  }
#endif
}
//---------------------------------------------------------------------
// Set up the OpenCL environment.
//---------------------------------------------------------------------
static void setup_opencl(int argc, char *argv[])
{
  cl_int ecode;
  char *source_dir = "IS";
  if (argc > 1) source_dir = argv[1];

#ifdef TIMER_DETAIL
  if (timer_on) {
    int i;
    for (i = T_OPENCL_API; i < T_END; i++) timer_clear(i);
  }
#endif

  DTIMER_START(T_OPENCL_API);

  // 1. Find the default device type and get a device for the device type
  device_type = clu_GetDefaultDeviceType();
  device = clu_GetAvailableDevice(device_type);
  device_name = clu_GetDeviceName(device);

  // Device information
  ecode = clGetDeviceInfo(device,
                          CL_DEVICE_MAX_WORK_ITEM_SIZES,
                          sizeof(work_item_sizes),
                          &work_item_sizes,
                          NULL);
  clu_CheckError(ecode, "clGetDeviceInfo()");

  ecode = clGetDeviceInfo(device,
                          CL_DEVICE_MAX_WORK_GROUP_SIZE,
                          sizeof(size_t),
                          &max_work_group_size,
                          NULL);
  clu_CheckError(ecode, "clGetDeviceInfo()");

  // FIXME: The below values are experimental.
  if (max_work_group_size > 256) {
    max_work_group_size = 256;
    int i;
    for (i = 0; i < 3; i++) {
      if (work_item_sizes[i] > 256) {
        work_item_sizes[i] = 256;
      }
    }
  }

  // 2. Create a context for the specified device
  context = clCreateContext(NULL, 1, &device, NULL, NULL, &ecode);
  clu_CheckError(ecode, "clCreateContext()");

  // 3. Create a command queue
  cmd_queue = clCreateCommandQueue(context, device, 0, &ecode);
  clu_CheckError(ecode, "clCreateCommandQueue()");

  DTIMER_STOP(T_OPENCL_API);

  // 4. Build the program
  DTIMER_START(T_BUILD);
  char *source_file;
  char build_option[30];
  if (device_type == CL_DEVICE_TYPE_CPU) {
    source_file = "is_cpu.cl";
    sprintf(build_option, "-DCLASS=%d -I.", CLASS);

    CREATE_SEQ_GROUP_SIZE = 64;
    CREATE_SEQ_GLOBAL_SIZE = CREATE_SEQ_GROUP_SIZE * 256;

    RANK_GROUP_SIZE = 1;
    RANK_GLOBAL_SIZE = RANK_GROUP_SIZE * 128;
    RANK1_GROUP_SIZE = 1;
    RANK1_GLOBAL_SIZE = RANK1_GROUP_SIZE * RANK_GLOBAL_SIZE;
    RANK2_GROUP_SIZE = RANK_GROUP_SIZE;
    RANK2_GLOBAL_SIZE = RANK_GLOBAL_SIZE;

    FV2_GROUP_SIZE = 64;
    FV2_GLOBAL_SIZE = FV2_GROUP_SIZE * 256;
  } else if (device_type == CL_DEVICE_TYPE_GPU) {
    source_file = "is_gpu.cl";
    sprintf(build_option, "-DCLASS=\'%c\' -I.", CLASS);

    CREATE_SEQ_GROUP_SIZE = 64;
    CREATE_SEQ_GLOBAL_SIZE = CREATE_SEQ_GROUP_SIZE * 256;

    RANK1_GROUP_SIZE = work_item_sizes[0];
    RANK1_GLOBAL_SIZE = MAX_KEY;
    RANK2_GROUP_SIZE = work_item_sizes[0];
    RANK2_GLOBAL_SIZE = NUM_KEYS;

    FV2_GROUP_SIZE = work_item_sizes[0];
    FV2_GLOBAL_SIZE = NUM_KEYS;
  } else {
    fprintf(stderr, "%s: not supported.\n",
            clu_GetDeviceTypeName(device_type));
    exit(EXIT_FAILURE);
  }
  program = clu_MakeProgram(context, device, source_dir, source_file,
                            build_option);
  DTIMER_STOP(T_BUILD);

  // 5. Create buffers
  DTIMER_START(T_BUFFER_CREATE);
  m_key_array = clCreateBuffer(context, CL_MEM_READ_WRITE,
                               sizeof(INT_TYPE) * SIZE_OF_BUFFERS,
                               NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_key_array");

  m_key_buff1 = clCreateBuffer(context, CL_MEM_READ_WRITE,
                               sizeof(INT_TYPE) * MAX_KEY,
                               NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_key_buff1");

  m_key_buff2 = clCreateBuffer(context, CL_MEM_READ_WRITE,
                               sizeof(INT_TYPE) * SIZE_OF_BUFFERS,
                               NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_key_buff2");

  size_t test_array_size = sizeof(INT_TYPE) * TEST_ARRAY_SIZE;
  m_index_array = clCreateBuffer(context,
                                 CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                 test_array_size, test_index_array, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_index_array");

  m_rank_array = clCreateBuffer(context,
                                CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                test_array_size, test_rank_array, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_rank_array");

  m_partial_vals = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
                                  test_array_size, NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_partial_vals");

  m_passed_verification = clCreateBuffer(context, CL_MEM_READ_WRITE,
                                         sizeof(cl_int), NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_passed_verification");

  if (device_type == CL_DEVICE_TYPE_GPU) {
    m_key_scan = clCreateBuffer(context, CL_MEM_READ_WRITE,
                                sizeof(INT_TYPE) * MAX_KEY, NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_key_scan");

    m_sum = clCreateBuffer(context, CL_MEM_READ_WRITE,
                           sizeof(INT_TYPE) * work_item_sizes[0],
                           NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_sum");
  } else {
    size_t bs_size = RANK_GLOBAL_SIZE * sizeof(INT_TYPE) * NUM_BUCKETS;
    m_bucket_size = clCreateBuffer(context, CL_MEM_READ_WRITE,
                                   bs_size, NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_bucket_size");

    m_bucket_ptrs = clCreateBuffer(context, CL_MEM_READ_WRITE,
                                   bs_size, NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_bucket_ptrs");
  }
  DTIMER_STOP(T_BUFFER_CREATE);

  // 6. Create kernels
  DTIMER_START(T_OPENCL_API);
  k_rank0 = clCreateKernel(program, "rank0", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for rank0");
  ecode  = clSetKernelArg(k_rank0, 0, sizeof(cl_mem), (void*)&m_key_array);
  ecode |= clSetKernelArg(k_rank0, 1, sizeof(cl_mem), (void*)&m_partial_vals);
  ecode |= clSetKernelArg(k_rank0, 2, sizeof(cl_mem), (void*)&m_index_array);
  clu_CheckError(ecode, "clSetKernelArg() for rank0");

  if (device_type == CL_DEVICE_TYPE_GPU) {
    k_rank1 = clCreateKernel(program, "rank1", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for rank1");
    ecode = clSetKernelArg(k_rank1, 0, sizeof(cl_mem), (void*)&m_key_buff1);
    clu_CheckError(ecode, "clSetKernelArg() for rank1");

    k_rank2 = clCreateKernel(program, "rank2", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for rank2");
    ecode  = clSetKernelArg(k_rank2, 0, sizeof(cl_mem), (void*)&m_key_buff1);
    ecode |= clSetKernelArg(k_rank2, 1, sizeof(cl_mem), (void*)&m_key_array);
    clu_CheckError(ecode, "clSetKernelArg() for rank2");

    k_rank3_0 = clCreateKernel(program, "rank3_0", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for rank3_0");
    ecode  = clSetKernelArg(k_rank3_0, 0, sizeof(cl_mem), (void*)&m_key_buff1);
    ecode |= clSetKernelArg(k_rank3_0, 1, sizeof(cl_mem), (void*)&m_key_buff1);
    ecode |= clSetKernelArg(k_rank3_0, 2, sizeof(cl_mem), (void*)&m_sum);
    ecode |= clSetKernelArg(k_rank3_0, 3,
                            sizeof(INT_TYPE) * work_item_sizes[0] * 2, NULL);
    clu_CheckError(ecode, "clSetKernelArg() for rank3_0");

    k_rank3_1 = clCreateKernel(program, "rank3_1", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for rank3_1");
    // Accumulate error codes with |= so no failure is silently dropped.
    ecode  = clSetKernelArg(k_rank3_1, 0, sizeof(cl_mem), (void*)&m_sum);
    ecode |= clSetKernelArg(k_rank3_1, 1, sizeof(cl_mem), (void*)&m_sum);
    ecode |= clSetKernelArg(k_rank3_1, 2,
                            sizeof(INT_TYPE) * work_item_sizes[0] * 2, NULL);
    clu_CheckError(ecode, "clSetKernelArg() for rank3_1");

    k_rank3_2 = clCreateKernel(program, "rank3_2", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for rank3_2");
    ecode  = clSetKernelArg(k_rank3_2, 0, sizeof(cl_mem), (void*)&m_key_buff1);
    ecode |= clSetKernelArg(k_rank3_2, 1, sizeof(cl_mem), (void*)&m_key_buff1);
    ecode |= clSetKernelArg(k_rank3_2, 2, sizeof(cl_mem), (void*)&m_sum);
    clu_CheckError(ecode, "clSetKernelArg() for rank3_2");
  } else {
    k_rank1 = clCreateKernel(program, "rank1", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for rank1");
    ecode  = clSetKernelArg(k_rank1, 0, sizeof(cl_mem), (void*)&m_key_array);
    ecode |= clSetKernelArg(k_rank1, 1, sizeof(cl_mem), (void*)&m_bucket_size);
    clu_CheckError(ecode, "clSetKernelArg() for rank1");

    k_rank2 = clCreateKernel(program, "rank2", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for rank2");
    ecode  = clSetKernelArg(k_rank2, 0, sizeof(cl_mem), (void*)&m_key_array);
    ecode |= clSetKernelArg(k_rank2, 1, sizeof(cl_mem), (void*)&m_bucket_size);
    ecode |= clSetKernelArg(k_rank2, 2, sizeof(cl_mem), (void*)&m_bucket_ptrs);
    ecode |= clSetKernelArg(k_rank2, 3, sizeof(cl_mem), (void*)&m_key_buff2);
    clu_CheckError(ecode, "clSetKernelArg() for rank2");

    k_rank3 = clCreateKernel(program, "rank3", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for rank3");
    ecode  = clSetKernelArg(k_rank3, 0, sizeof(cl_mem), (void*)&m_bucket_size);
    ecode |= clSetKernelArg(k_rank3, 1, sizeof(cl_mem), (void*)&m_bucket_ptrs);
    ecode |= clSetKernelArg(k_rank3, 2, sizeof(cl_mem), (void*)&m_key_buff1);
    ecode |= clSetKernelArg(k_rank3, 3, sizeof(cl_mem), (void*)&m_key_buff2);
    clu_CheckError(ecode, "clSetKernelArg() for rank3");
  }

  k_rank4 = clCreateKernel(program, "rank4", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for rank4");
  ecode  = clSetKernelArg(k_rank4, 0, sizeof(cl_mem), (void*)&m_partial_vals);
  ecode |= clSetKernelArg(k_rank4, 1, sizeof(cl_mem), (void*)&m_key_buff1);
  ecode |= clSetKernelArg(k_rank4, 2, sizeof(cl_mem), (void*)&m_rank_array);
  ecode |= clSetKernelArg(k_rank4, 3, sizeof(cl_mem),
                          (void*)&m_passed_verification);
  clu_CheckError(ecode, "clSetKernelArg() for rank4");
  DTIMER_STOP(T_OPENCL_API);
}
int main(int argc, char **argv)
{
  int i, iteration;
  double timecounter;
  FILE *fp;
  cl_int ecode;

  if (argc == 1) {
    fprintf(stderr, "Usage: %s <kernel directory>\n", argv[0]);
    exit(-1);
  }

  /* Initialize timers */
  timer_on = 0;
  if ((fp = fopen("timer.flag", "r")) != NULL) {
    fclose(fp);
    timer_on = 1;
  }
  timer_clear(0);
  if (timer_on) {
    timer_clear(1);
    timer_clear(2);
    timer_clear(3);
  }

  if (timer_on) timer_start(3);

  /* Initialize the verification arrays if a valid class */
  for (i = 0; i < TEST_ARRAY_SIZE; i++) {
    switch (CLASS) {
      case 'S':
        test_index_array[i] = S_test_index_array[i];
        test_rank_array[i]  = S_test_rank_array[i];
        break;
      case 'A':
        test_index_array[i] = A_test_index_array[i];
        test_rank_array[i]  = A_test_rank_array[i];
        break;
      case 'W':
        test_index_array[i] = W_test_index_array[i];
        test_rank_array[i]  = W_test_rank_array[i];
        break;
      case 'B':
        test_index_array[i] = B_test_index_array[i];
        test_rank_array[i]  = B_test_rank_array[i];
        break;
      case 'C':
        test_index_array[i] = C_test_index_array[i];
        test_rank_array[i]  = C_test_rank_array[i];
        break;
      case 'D':
        test_index_array[i] = D_test_index_array[i];
        test_rank_array[i]  = D_test_rank_array[i];
        break;
    }
  }

  /* Set up the OpenCL environment. */
  setup_opencl(argc, argv);

  /* Printout initial NPB info */
  printf("\n\n NAS Parallel Benchmarks (NPB3.3-OCL) - IS Benchmark\n\n");
  printf(" Size: %ld (class %c)\n", (long)TOTAL_KEYS, CLASS);
  printf(" Iterations: %d\n", MAX_ITERATIONS);

  if (timer_on) timer_start(1);

  /* Generate random number sequence and subsequent keys on all procs */
  create_seq(314159265.00,    /* Random number gen seed */
             1220703125.00);  /* Random number gen mult */

  if (timer_on) timer_stop(1);

  /* Do one iteration for free (i.e., untimed) to guarantee
     initialization of all data and code pages and respective tables */
  rank(1);

  /* Start verification counter */
  passed_verification = 0;

  DTIMER_START(T_BUFFER_WRITE);
  ecode = clEnqueueWriteBuffer(cmd_queue, m_passed_verification,
                               CL_TRUE, 0, sizeof(cl_int),
                               &passed_verification, 0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueWriteBuffer() for m_passed_verification");
  DTIMER_STOP(T_BUFFER_WRITE);

  if (CLASS != 'S') printf("\n iteration\n");

  /* Start timer */
  timer_start(0);

  /* This is the main iteration */
  for (iteration = 1; iteration <= MAX_ITERATIONS; iteration++) {
    if (CLASS != 'S') printf(" %d\n", iteration);
    rank(iteration);
  }

  DTIMER_START(T_BUFFER_READ);
  ecode = clEnqueueReadBuffer(cmd_queue, m_passed_verification,
                              CL_TRUE, 0, sizeof(cl_int),
                              &passed_verification, 0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueReadBuffer() for m_passed_verification");
  DTIMER_STOP(T_BUFFER_READ);

  /* End of timing, obtain maximum time of all processors */
  timer_stop(0);
  timecounter = timer_read(0);

  /* This tests that keys are in sequence: sorting of last ranked key seq
     occurs here, but is an untimed operation */
  if (timer_on) timer_start(2);
  full_verify();
  if (timer_on) timer_stop(2);

  if (timer_on) timer_stop(3);

  /* The final printout */
  if (passed_verification != 5*MAX_ITERATIONS + 1)
    passed_verification = 0;
  c_print_results("IS", CLASS, (int)(TOTAL_KEYS/64), 64, 0, MAX_ITERATIONS,
                  timecounter,
                  ((double)(MAX_ITERATIONS*TOTAL_KEYS))/timecounter/1000000.,
                  "keys ranked", passed_verification,
                  NPBVERSION, COMPILETIME, CC, CLINK, C_LIB, C_INC,
                  CFLAGS, CLINKFLAGS, "",
                  clu_GetDeviceTypeName(device_type), device_name);

  /* Print additional timers */
  if (timer_on) {
    double t_total, t_percent;

    t_total = timer_read(3);
    printf("\nAdditional timers -\n");
    printf(" Total execution: %8.3f\n", t_total);
    if (t_total == 0.0) t_total = 1.0;
    timecounter = timer_read(1);
    t_percent = timecounter/t_total * 100.;
    printf(" Initialization : %8.3f (%5.2f%%)\n", timecounter, t_percent);
    timecounter = timer_read(0);
    t_percent = timecounter/t_total * 100.;
    printf(" Benchmarking   : %8.3f (%5.2f%%)\n", timecounter, t_percent);
    timecounter = timer_read(2);
    t_percent = timecounter/t_total * 100.;
    printf(" Sorting        : %8.3f (%5.2f%%)\n", timecounter, t_percent);
  }

  release_opencl();

  fflush(stdout);

  return 0;
  /**************************/
}                               /*  E N D  P R O G R A M  */
void rank(int iteration)
{
  size_t r1_lws[1], r1_gws[1];
  size_t r2_lws[1], r2_gws[1];
  size_t r3_lws[1], r3_gws[1];
  cl_int ecode;

  DTIMER_START(T_KERNEL_RANK0);
  // rank0 -- a single-work-item task; clEnqueueTask is equivalent to a
  // 1x1 NDRange launch
  ecode = clSetKernelArg(k_rank0, 3, sizeof(cl_int), (void*)&iteration);
  clu_CheckError(ecode, "clSetKernelArg() for rank0: iteration");

  ecode = clEnqueueTask(cmd_queue, k_rank0, 0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueTask() for rank0");
  CHECK_FINISH();
  DTIMER_STOP(T_KERNEL_RANK0);

  DTIMER_START(T_KERNEL_RANK1);
  // rank1
  r1_lws[0] = RANK1_GROUP_SIZE;
  r1_gws[0] = RANK1_GLOBAL_SIZE;
  ecode = clEnqueueNDRangeKernel(cmd_queue, k_rank1, 1, NULL,
                                 r1_gws, r1_lws, 0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueNDRangeKernel() for rank1");
  CHECK_FINISH();
  DTIMER_STOP(T_KERNEL_RANK1);

  DTIMER_START(T_KERNEL_RANK2);
  // rank2
  r2_lws[0] = RANK2_GROUP_SIZE;
  r2_gws[0] = RANK2_GLOBAL_SIZE;
  ecode = clEnqueueNDRangeKernel(cmd_queue, k_rank2, 1, NULL,
                                 r2_gws, r2_lws, 0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueNDRangeKernel() for rank2");
  CHECK_FINISH();
  DTIMER_STOP(T_KERNEL_RANK2);

  DTIMER_START(T_KERNEL_RANK3);
  // rank3
  if (device_type == CL_DEVICE_TYPE_GPU) {
    r3_lws[0] = work_item_sizes[0];
    r3_gws[0] = work_item_sizes[0] * work_item_sizes[0];
    if (r3_gws[0] > MAX_KEY) r3_gws[0] = MAX_KEY;
    ecode = clEnqueueNDRangeKernel(cmd_queue, k_rank3_0, 1, NULL,
                                   r3_gws, r3_lws, 0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueNDRangeKernel() for rank3_0");

    r3_lws[0] = work_item_sizes[0];
    r3_gws[0] = work_item_sizes[0];
    ecode = clEnqueueNDRangeKernel(cmd_queue, k_rank3_1, 1, NULL,
                                   r3_gws, r3_lws, 0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueNDRangeKernel() for rank3_1");

    r3_lws[0] = work_item_sizes[0];
    r3_gws[0] = work_item_sizes[0] * work_item_sizes[0];
    if (r3_gws[0] > MAX_KEY) r3_gws[0] = MAX_KEY;
    ecode = clEnqueueNDRangeKernel(cmd_queue, k_rank3_2, 1, NULL,
                                   r3_gws, r3_lws, 0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueNDRangeKernel() for rank3_2");
  } else {
    r3_lws[0] = RANK_GROUP_SIZE;
    r3_gws[0] = RANK_GLOBAL_SIZE;
    ecode = clEnqueueNDRangeKernel(cmd_queue, k_rank3, 1, NULL,
                                   r3_gws, r3_lws, 0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueNDRangeKernel() for rank3");
  }
  CHECK_FINISH();
  DTIMER_STOP(T_KERNEL_RANK3);

  // rank4 - partial verification
  DTIMER_START(T_KERNEL_RANK4);
  ecode = clSetKernelArg(k_rank4, 4, sizeof(cl_int), (void*)&iteration);
  clu_CheckError(ecode, "clSetKernelArg() for rank4");

  ecode = clEnqueueTask(cmd_queue, k_rank4, 0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueTask() for rank4");

  ecode = clFinish(cmd_queue);
  clu_CheckError(ecode, "clFinish");
  DTIMER_STOP(T_KERNEL_RANK4);
}
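//---------------------------------------------------------------------
// On the GPU path, rank3 is a textbook work-group prefix sum split into
// three launches: rank3_0 scans each block of m_key_buff1 and emits one
// block sum per group into m_sum, rank3_1 scans those block sums in a
// single group, and rank3_2 adds the scanned block sums back into the
// blocks. The kernels themselves live in is_gpu.cl and are not shown
// here; the host-side C model below is only a sketch of the assumed
// decomposition (it reproduces the serial IS inclusive scan).
//---------------------------------------------------------------------
static void rank3_model(INT_TYPE *buf, INT_TYPE n, INT_TYPE block)
{
  INT_TYPE nblocks = (n + block - 1) / block;
  INT_TYPE b, i, carry = 0;

  for (b = 0; b < nblocks; b++) {
    INT_TYPE lo = b * block;
    INT_TYPE hi = (b + 1) * block < n ? (b + 1) * block : n;

    // Phase 0 (rank3_0): inclusive scan within the block.
    for (i = lo + 1; i < hi; i++) buf[i] += buf[i - 1];

    // Phases 1+2 (rank3_1, rank3_2): propagate the running total of all
    // preceding blocks into this block.
    for (i = lo; i < hi; i++) buf[i] += carry;
    carry = buf[hi - 1];
  }
}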
void full_verify(void)
{
  cl_kernel k_fv1, k_fv2;
  cl_mem m_j;
  INT_TYPE *g_j;
  INT_TYPE j = 0, i;
  size_t j_size;
  size_t fv1_lws[1], fv1_gws[1];
  size_t fv2_lws[1], fv2_gws[1];
  cl_int ecode;

  DTIMER_START(T_BUFFER_CREATE);
  // Create buffers
  j_size = sizeof(INT_TYPE) * (FV2_GLOBAL_SIZE / FV2_GROUP_SIZE);
  m_j = clCreateBuffer(context, CL_MEM_READ_WRITE, j_size, NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer for m_j");
  DTIMER_STOP(T_BUFFER_CREATE);

  DTIMER_START(T_OPENCL_API);
  // Create kernels
  k_fv1 = clCreateKernel(program, "full_verify1", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for full_verify1");
  k_fv2 = clCreateKernel(program, "full_verify2", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for full_verify2");
  DTIMER_STOP(T_OPENCL_API);

  if (device_type == CL_DEVICE_TYPE_GPU) {
    cl_kernel k_fv0;
    size_t fv0_lws[1], fv0_gws[1];

    DTIMER_START(T_OPENCL_API);
    // Create kernels
    k_fv0 = clCreateKernel(program, "full_verify0", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for full_verify0");
    DTIMER_STOP(T_OPENCL_API);

    // Kernel execution
    DTIMER_START(T_KERNEL_FV0);
    ecode  = clSetKernelArg(k_fv0, 0, sizeof(cl_mem), (void*)&m_key_array);
    ecode |= clSetKernelArg(k_fv0, 1, sizeof(cl_mem), (void*)&m_key_buff2);
    clu_CheckError(ecode, "clSetKernelArg() for full_verify0");

    fv0_lws[0] = work_item_sizes[0];
    fv0_gws[0] = NUM_KEYS;
    ecode = clEnqueueNDRangeKernel(cmd_queue, k_fv0, 1, NULL,
                                   fv0_gws, fv0_lws, 0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueNDRangeKernel() for full_verify0");
    CHECK_FINISH();
    DTIMER_STOP(T_KERNEL_FV0);

    DTIMER_START(T_KERNEL_FV1);
    ecode  = clSetKernelArg(k_fv1, 0, sizeof(cl_mem), (void*)&m_key_buff2);
    ecode |= clSetKernelArg(k_fv1, 1, sizeof(cl_mem), (void*)&m_key_buff1);
    ecode |= clSetKernelArg(k_fv1, 2, sizeof(cl_mem), (void*)&m_key_array);
    clu_CheckError(ecode, "clSetKernelArg() for full_verify1");

    fv1_lws[0] = work_item_sizes[0];
    fv1_gws[0] = NUM_KEYS;
    ecode = clEnqueueNDRangeKernel(cmd_queue, k_fv1, 1, NULL,
                                   fv1_gws, fv1_lws, 0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueNDRangeKernel() for full_verify1");
    CHECK_FINISH();
    DTIMER_STOP(T_KERNEL_FV1);

    DTIMER_START(T_KERNEL_FV2);
    ecode  = clSetKernelArg(k_fv2, 0, sizeof(cl_mem), (void*)&m_key_array);
    ecode |= clSetKernelArg(k_fv2, 1, sizeof(cl_mem), (void*)&m_j);
    ecode |= clSetKernelArg(k_fv2, 2, sizeof(INT_TYPE)*FV2_GROUP_SIZE, NULL);
    clu_CheckError(ecode, "clSetKernelArg() for full_verify2");

    fv2_lws[0] = FV2_GROUP_SIZE;
    fv2_gws[0] = FV2_GLOBAL_SIZE;
    ecode = clEnqueueNDRangeKernel(cmd_queue, k_fv2, 1, NULL,
                                   fv2_gws, fv2_lws, 0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueNDRangeKernel() for full_verify2");
    CHECK_FINISH();
    DTIMER_STOP(T_KERNEL_FV2);

    g_j = (INT_TYPE *)malloc(j_size);

    DTIMER_START(T_BUFFER_READ);
    ecode = clEnqueueReadBuffer(cmd_queue, m_j, CL_TRUE, 0, j_size, g_j,
                                0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueReadBuffer() for m_j");
    DTIMER_STOP(T_BUFFER_READ);

    // reduction
    for (i = 0; i < j_size/sizeof(INT_TYPE); i++) {
      j += g_j[i];
    }

    DTIMER_START(T_RELEASE);
    clReleaseKernel(k_fv0);
    DTIMER_STOP(T_RELEASE);
  } else {
    // Kernel execution
    DTIMER_START(T_KERNEL_FV1);
    ecode  = clSetKernelArg(k_fv1, 0, sizeof(cl_mem), (void*)&m_bucket_ptrs);
    ecode |= clSetKernelArg(k_fv1, 1, sizeof(cl_mem), (void*)&m_key_buff2);
    ecode |= clSetKernelArg(k_fv1, 2, sizeof(cl_mem), (void*)&m_key_buff1);
    ecode |= clSetKernelArg(k_fv1, 3, sizeof(cl_mem), (void*)&m_key_array);
    clu_CheckError(ecode, "clSetKernelArg() for full_verify1");

    fv1_lws[0] = RANK_GROUP_SIZE;
    fv1_gws[0] = RANK_GLOBAL_SIZE;
    ecode = clEnqueueNDRangeKernel(cmd_queue, k_fv1, 1, NULL,
                                   fv1_gws, fv1_lws, 0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueNDRangeKernel() for full_verify1");
    CHECK_FINISH();
    DTIMER_STOP(T_KERNEL_FV1);

    DTIMER_START(T_KERNEL_FV2);
    ecode  = clSetKernelArg(k_fv2, 0, sizeof(cl_mem), (void*)&m_key_array);
    ecode |= clSetKernelArg(k_fv2, 1, sizeof(cl_mem), (void*)&m_j);
    ecode |= clSetKernelArg(k_fv2, 2, sizeof(INT_TYPE)*FV2_GROUP_SIZE, NULL);
    clu_CheckError(ecode, "clSetKernelArg() for full_verify2");

    fv2_lws[0] = FV2_GROUP_SIZE;
    fv2_gws[0] = FV2_GLOBAL_SIZE;
    ecode = clEnqueueNDRangeKernel(cmd_queue, k_fv2, 1, NULL,
                                   fv2_gws, fv2_lws, 0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueNDRangeKernel() for full_verify2");
    CHECK_FINISH();
    DTIMER_STOP(T_KERNEL_FV2);

    g_j = (INT_TYPE *)malloc(j_size);

    DTIMER_START(T_BUFFER_READ);
    ecode = clEnqueueReadBuffer(cmd_queue, m_j, CL_TRUE, 0, j_size, g_j,
                                0, NULL, NULL);
    clu_CheckError(ecode, "clEnqueueReadBuffer() for m_j");
    DTIMER_STOP(T_BUFFER_READ);

    // reduction
    for (i = 0; i < j_size/sizeof(INT_TYPE); i++) {
      j += g_j[i];
    }
  }

  DTIMER_START(T_RELEASE);
  free(g_j);
  clReleaseMemObject(m_j);
  clReleaseKernel(k_fv1);
  clReleaseKernel(k_fv2);
  DTIMER_STOP(T_RELEASE);

  if (j != 0)
    printf("Full_verify: number of keys out of sort: %ld\n", (long)j);
  else
    passed_verification++;
}
//---------------------------------------------------------------------
// Set up the OpenCL environment.
//---------------------------------------------------------------------
void setup_opencl(int argc, char *argv[])
{
  cl_int err_code;
  char *source_dir = "EP";
  if (argc > 1) source_dir = argv[1];

#ifdef TIMER_DETAIL
  if (timers_enabled) {
    int i;
    for (i = T_OPENCL_API; i < T_END; i++) timer_clear(i);
  }
#endif

  DTIMER_START(T_OPENCL_API);

  // 1. Find the default device type and get a device for the device type
  device_type = clu_GetDefaultDeviceType();
  device = clu_GetAvailableDevice(device_type);
  device_name = clu_GetDeviceName(device);

  // 2. Create a context for the specified device
  context = clCreateContext(NULL, 1, &device, NULL, NULL, &err_code);
  clu_CheckError(err_code, "clCreateContext()");

  // 3. Create a command queue
  cmd_queue = clCreateCommandQueue(context, device, 0, &err_code);
  clu_CheckError(err_code, "clCreateCommandQueue()");

  DTIMER_STOP(T_OPENCL_API);

  // 4. Build the program
  DTIMER_START(T_BUILD);
  char *source_file;
  char build_option[30];
  sprintf(build_option, "-DM=%d -I.", M);
  if (device_type == CL_DEVICE_TYPE_CPU) {
    source_file = "ep_cpu.cl";
    GROUP_SIZE = 16;
  } else {
    source_file = "ep_gpu.cl";
    GROUP_SIZE = 64;
  }
  program = clu_MakeProgram(context, device, source_dir, source_file,
                            build_option);
  DTIMER_STOP(T_BUILD);

  // 5. Create buffers
  DTIMER_START(T_BUFFER_CREATE);
  gq_size  = np / GROUP_SIZE * NQ * sizeof(double);
  gsx_size = np / GROUP_SIZE * sizeof(double);
  gsy_size = np / GROUP_SIZE * sizeof(double);

  pgq = clCreateBuffer(context, CL_MEM_READ_WRITE, gq_size, NULL, &err_code);
  clu_CheckError(err_code, "clCreateBuffer() for pgq");
  pgsx = clCreateBuffer(context, CL_MEM_READ_WRITE, gsx_size, NULL, &err_code);
  clu_CheckError(err_code, "clCreateBuffer() for pgsx");
  pgsy = clCreateBuffer(context, CL_MEM_READ_WRITE, gsy_size, NULL, &err_code);
  clu_CheckError(err_code, "clCreateBuffer() for pgsy");
  DTIMER_STOP(T_BUFFER_CREATE);

  // 6. Create a kernel
  DTIMER_START(T_OPENCL_API);
  kernel = clCreateKernel(program, "embar", &err_code);
  clu_CheckError(err_code, "clCreateKernel()");
  DTIMER_STOP(T_OPENCL_API);
}
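//---------------------------------------------------------------------
// pgq, pgsx, and pgsy hold one partial result per work-group (there are
// np/GROUP_SIZE groups), which main() later reduces on the host. The
// embar kernel in ep_cpu.cl / ep_gpu.cl presumably ends with a local
// tree reduction followed by a single write per group. That kernel is
// not shown here; the self-contained OpenCL C kernel below (hypothetical
// name group_sum, local size assumed a power of two) illustrates the
// idiom.
//---------------------------------------------------------------------
__kernel void group_sum(__local double *scratch,
                        __global const double *in,
                        __global double *partial)
{
  size_t gid = get_global_id(0);
  size_t lid = get_local_id(0);
  size_t lsz = get_local_size(0);

  scratch[lid] = in[gid];            // each work-item loads one value

  // Tree reduction in local memory; halve the active range each round.
  for (size_t s = lsz / 2; s > 0; s >>= 1) {
    barrier(CLK_LOCAL_MEM_FENCE);    // wait for the previous round
    if (lid < s) scratch[lid] += scratch[lid + s];
  }

  if (lid == 0)                      // one result per work-group
    partial[get_group_id(0)] = scratch[0];
}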
int main(int argc, char *argv[])
{
  double Mops, t1, t2;
  double tsx, tsy, tm, an, tt, gc;
  double sx_verify_value, sy_verify_value, sx_err, sy_err;
  int i, nit;
  int k_offset, j;
  logical verified;
  char size[16];
  FILE *fp;

  if (argc == 1) {
    fprintf(stderr, "Usage: %s <kernel directory>\n", argv[0]);
    exit(-1);
  }

  if ((fp = fopen("timer.flag", "r")) == NULL) {
    timers_enabled = false;
  } else {
    timers_enabled = true;
    fclose(fp);
  }

  //--------------------------------------------------------------------
  // Because the size of the problem is too large to store in a 32-bit
  // integer for some classes, we put it into a string (for printing).
  // Have to strip off the decimal point put in there by the floating
  // point print statement (internal file)
  //--------------------------------------------------------------------
  sprintf(size, "%15.0lf", pow(2.0, M+1));
  j = 14;
  if (size[j] == '.') j--;
  size[j+1] = '\0';
  printf("\n\n NAS Parallel Benchmarks (NPB3.3-OCL) - EP Benchmark\n");
  printf("\n Number of random numbers generated: %15s\n", size);

  verified = false;

  //--------------------------------------------------------------------
  // Compute the number of "batches" of random number pairs generated
  // per processor. Adjust if the number of processors does not evenly
  // divide the total number
  //--------------------------------------------------------------------
  np = NN;

  setup_opencl(argc, argv);

  timer_clear(0);
  timer_start(0);

  //--------------------------------------------------------------------
  // Compute AN = A ^ (2 * NK) (mod 2^46).
  //--------------------------------------------------------------------
  t1 = A;
  for (i = 0; i < MK + 1; i++) {
    t2 = randlc(&t1, t1);
  }
  an = t1;
  tt = S;

  //--------------------------------------------------------------------
  // Each instance of this loop may be performed independently. We compute
  // the k offsets separately to take into account the fact that some nodes
  // have more numbers to generate than others
  //--------------------------------------------------------------------
  k_offset = -1;

  DTIMER_START(T_KERNEL_EMBAR);

  // Launch the kernel
  int q_size  = GROUP_SIZE * NQ * sizeof(cl_double);
  int sx_size = GROUP_SIZE * sizeof(cl_double);
  int sy_size = GROUP_SIZE * sizeof(cl_double);
  err_code  = clSetKernelArg(kernel, 0, q_size, NULL);
  err_code |= clSetKernelArg(kernel, 1, sx_size, NULL);
  err_code |= clSetKernelArg(kernel, 2, sy_size, NULL);
  err_code |= clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&pgq);
  err_code |= clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&pgsx);
  err_code |= clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&pgsy);
  err_code |= clSetKernelArg(kernel, 6, sizeof(cl_int), (void*)&k_offset);
  err_code |= clSetKernelArg(kernel, 7, sizeof(cl_double), (void*)&an);
  clu_CheckError(err_code, "clSetKernelArg()");

  size_t localWorkSize[]  = { GROUP_SIZE };
  size_t globalWorkSize[] = { np };
  err_code = clEnqueueNDRangeKernel(cmd_queue, kernel, 1, NULL,
                                    globalWorkSize, localWorkSize,
                                    0, NULL, NULL);
  clu_CheckError(err_code, "clEnqueueNDRangeKernel()");
  CHECK_FINISH();
  DTIMER_STOP(T_KERNEL_EMBAR);

  double (*gq)[NQ] = (double (*)[NQ])malloc(gq_size);
  double *gsx = (double*)malloc(gsx_size);
  double *gsy = (double*)malloc(gsy_size);

  gc  = 0.0;
  tsx = 0.0;
  tsy = 0.0;
  for (i = 0; i < NQ; i++) {
    q[i] = 0.0;
  }

  // 9. Get the result
  DTIMER_START(T_BUFFER_READ);
  err_code = clEnqueueReadBuffer(cmd_queue, pgq, CL_FALSE, 0, gq_size, gq,
                                 0, NULL, NULL);
  clu_CheckError(err_code, "clEnqueueReadBuffer()");
  err_code = clEnqueueReadBuffer(cmd_queue, pgsx, CL_FALSE, 0, gsx_size, gsx,
                                 0, NULL, NULL);
  clu_CheckError(err_code, "clEnqueueReadBuffer()");
  err_code = clEnqueueReadBuffer(cmd_queue, pgsy, CL_TRUE, 0, gsy_size, gsy,
                                 0, NULL, NULL);
  clu_CheckError(err_code, "clEnqueueReadBuffer()");
  DTIMER_STOP(T_BUFFER_READ);

  // Reduce the per-work-group partial results on the host. The last read
  // above is blocking, so all three buffers are valid at this point.
  for (i = 0; i < np/localWorkSize[0]; i++) {
    for (j = 0; j < NQ; j++) {
      q[j] = q[j] + gq[i][j];
    }
    tsx = tsx + gsx[i];
    tsy = tsy + gsy[i];
  }

  for (i = 0; i < NQ; i++) {
    gc = gc + q[i];
  }

  timer_stop(0);
  tm = timer_read(0);

  nit = 0;
  verified = true;
  if (M == 24) {
    sx_verify_value = -3.247834652034740e+3;
    sy_verify_value = -6.958407078382297e+3;
  } else if (M == 25) {
    sx_verify_value = -2.863319731645753e+3;
    sy_verify_value = -6.320053679109499e+3;
  } else if (M == 28) {
    sx_verify_value = -4.295875165629892e+3;
    sy_verify_value = -1.580732573678431e+4;
  } else if (M == 30) {
    sx_verify_value =  4.033815542441498e+4;
    sy_verify_value = -2.660669192809235e+4;
  } else if (M == 32) {
    sx_verify_value =  4.764367927995374e+4;
    sy_verify_value = -8.084072988043731e+4;
  } else if (M == 36) {
    sx_verify_value =  1.982481200946593e+5;
    sy_verify_value = -1.020596636361769e+5;
  } else if (M == 40) {
    sx_verify_value = -5.319717441530e+05;
    sy_verify_value = -3.688834557731e+05;
  } else {
    verified = false;
  }

  if (verified) {
    sx_err = fabs((tsx - sx_verify_value) / sx_verify_value);
    sy_err = fabs((tsy - sy_verify_value) / sy_verify_value);
    verified = ((sx_err <= EPSILON) && (sy_err <= EPSILON));
  }

  Mops = pow(2.0, M+1) / tm / 1000000.0;

  printf("\nEP Benchmark Results:\n\n");
  printf("CPU Time =%10.4lf\n", tm);
  printf("N = 2^%5d\n", M);
  printf("No. Gaussian Pairs = %15.0lf\n", gc);
  printf("Sums = %25.15lE %25.15lE\n", tsx, tsy);
  printf("Counts: \n");
  for (i = 0; i < NQ; i++) {
    printf("%3d%15.0lf\n", i, q[i]);
  }

  c_print_results("EP", CLASS, M+1, 0, 0, nit,
                  tm, Mops,
                  "Random numbers generated",
                  verified, NPBVERSION, COMPILETIME,
                  CS1, CS2, CS3, CS4, CS5, CS6, CS7,
                  clu_GetDeviceTypeName(device_type), device_name);

  if (timers_enabled) {
    if (tm <= 0.0) tm = 1.0;
    tt = timer_read(0);
    printf("\nTotal time: %9.3lf (%6.2lf)\n", tt, tt*100.0/tm);
  }

  free(gq);
  free(gsx);
  free(gsy);

  release_opencl();

  fflush(stdout);

  return 0;
}