int main(int argc, char *argv[]) { int width, height; width = height = 0; /* read image file */ if (argc >= 2) decode_init(argv[1], &width, &height); /* openGL */ init_gl(argc, argv); createGLBufTex(width, height, GL_BUFFER_SORC); createGLBufTex(width/2, height, GL_BUFFER_DEST); /* openCL */ init_cl(); clloadProgram("./algorithm.cl"); createCLBufferFromGL(); setImageWidth(width); setKernelRange(width/2, height); transferParam(); glutMainLoop(); exit: decode_close(); exit_cl(); exit_gl(); err: return 0; }
int main (int argc, char* argv[]) { /* Start GL processing */ init_gl(argc, argv); /* Initialize CL data structures */ init_cl(); /* Create CL and GL data objects */ configure_shared_data(); /* Execute kernel */ execute_kernel(); /* Set callback functions */ glutDisplayFunc(display); glutReshapeFunc(reshape); /* Start processing loop */ glutMainLoop(); /* Deallocate OpenCL resources */ clReleaseMemObject(in_texture); clReleaseMemObject(out_buffer); clReleaseKernel(kernel); clReleaseCommandQueue(queue); clReleaseProgram(program); clReleaseContext(context); /* Deallocate OpenGL resources */ glDeleteBuffers(2, vbo); return 0; }
void ConcurrentMarkThread::run() { initialize_in_thread(); _vtime_start = os::elapsedVTime(); wait_for_universe_init(); G1CollectedHeap* g1h = G1CollectedHeap::heap(); G1CollectorPolicy* g1_policy = g1h->g1_policy(); G1MMUTracker *mmu_tracker = g1_policy->mmu_tracker(); Thread *current_thread = Thread::current(); while (!_should_terminate) { // wait until started is set. sleepBeforeNextCycle(); { ResourceMark rm; HandleMark hm; double cycle_start = os::elapsedVTime(); double mark_start_sec = os::elapsedTime(); char verbose_str[128]; if (PrintGC) { gclog_or_tty->date_stamp(PrintGCDateStamps); gclog_or_tty->stamp(PrintGCTimeStamps); gclog_or_tty->print_cr("[GC concurrent-mark-start]"); } if (!g1_policy->in_young_gc_mode()) { // this ensures the flag is not set if we bail out of the marking // cycle; normally the flag is cleared immediately after cleanup g1h->set_marking_complete(); if (g1_policy->adaptive_young_list_length()) { double now = os::elapsedTime(); double init_prediction_ms = g1_policy->predict_init_time_ms(); jlong sleep_time_ms = mmu_tracker->when_ms(now, init_prediction_ms); os::sleep(current_thread, sleep_time_ms, false); } // We don't have to skip here if we've been asked to restart, because // in the worst case we just enqueue a new VM operation to start a // marking. Note that the init operation resets has_aborted() CMCheckpointRootsInitialClosure init_cl(_cm); strcpy(verbose_str, "GC initial-mark"); VM_CGC_Operation op(&init_cl, verbose_str); VMThread::execute(&op); } int iter = 0; do { iter++; if (!cm()->has_aborted()) { _cm->markFromRoots(); } double mark_end_time = os::elapsedVTime(); double mark_end_sec = os::elapsedTime(); _vtime_mark_accum += (mark_end_time - cycle_start); if (!cm()->has_aborted()) { if (g1_policy->adaptive_young_list_length()) { double now = os::elapsedTime(); double remark_prediction_ms = g1_policy->predict_remark_time_ms(); jlong sleep_time_ms = mmu_tracker->when_ms(now, remark_prediction_ms); os::sleep(current_thread, sleep_time_ms, false); } if (PrintGC) { gclog_or_tty->date_stamp(PrintGCDateStamps); gclog_or_tty->stamp(PrintGCTimeStamps); gclog_or_tty->print_cr("[GC concurrent-mark-end, %1.7lf sec]", mark_end_sec - mark_start_sec); } CMCheckpointRootsFinalClosure final_cl(_cm); sprintf(verbose_str, "GC remark"); VM_CGC_Operation op(&final_cl, verbose_str); VMThread::execute(&op); } if (cm()->restart_for_overflow() && G1TraceMarkStackOverflow) { gclog_or_tty->print_cr("Restarting conc marking because of MS overflow " "in remark (restart #%d).", iter); } if (cm()->restart_for_overflow()) { if (PrintGC) { gclog_or_tty->date_stamp(PrintGCDateStamps); gclog_or_tty->stamp(PrintGCTimeStamps); gclog_or_tty->print_cr("[GC concurrent-mark-restart-for-overflow]"); } } } while (cm()->restart_for_overflow()); double counting_start_time = os::elapsedVTime(); // YSR: These look dubious (i.e. redundant) !!! FIX ME slt()->manipulatePLL(SurrogateLockerThread::acquirePLL); slt()->manipulatePLL(SurrogateLockerThread::releaseAndNotifyPLL); if (!cm()->has_aborted()) { double count_start_sec = os::elapsedTime(); if (PrintGC) { gclog_or_tty->date_stamp(PrintGCDateStamps); gclog_or_tty->stamp(PrintGCTimeStamps); gclog_or_tty->print_cr("[GC concurrent-count-start]"); } _sts.join(); _cm->calcDesiredRegions(); _sts.leave(); if (!cm()->has_aborted()) { double count_end_sec = os::elapsedTime(); if (PrintGC) { gclog_or_tty->date_stamp(PrintGCDateStamps); gclog_or_tty->stamp(PrintGCTimeStamps); gclog_or_tty->print_cr("[GC concurrent-count-end, %1.7lf]", count_end_sec - count_start_sec); } } } double end_time = os::elapsedVTime(); _vtime_count_accum += (end_time - counting_start_time); // Update the total virtual time before doing this, since it will try // to measure it to get the vtime for this marking. We purposely // neglect the presumably-short "completeCleanup" phase here. _vtime_accum = (end_time - _vtime_start); if (!cm()->has_aborted()) { if (g1_policy->adaptive_young_list_length()) { double now = os::elapsedTime(); double cleanup_prediction_ms = g1_policy->predict_cleanup_time_ms(); jlong sleep_time_ms = mmu_tracker->when_ms(now, cleanup_prediction_ms); os::sleep(current_thread, sleep_time_ms, false); } CMCleanUp cl_cl(_cm); sprintf(verbose_str, "GC cleanup"); VM_CGC_Operation op(&cl_cl, verbose_str); VMThread::execute(&op); } else { g1h->set_marking_complete(); } // Check if cleanup set the free_regions_coming flag. If it // hasn't, we can just skip the next step. if (g1h->free_regions_coming()) { // The following will finish freeing up any regions that we // found to be empty during cleanup. We'll do this part // without joining the suspendible set. If an evacuation pause // takes places, then we would carry on freeing regions in // case they are needed by the pause. If a Full GC takes // places, it would wait for us to process the regions // reclaimed by cleanup. double cleanup_start_sec = os::elapsedTime(); if (PrintGC) { gclog_or_tty->date_stamp(PrintGCDateStamps); gclog_or_tty->stamp(PrintGCTimeStamps); gclog_or_tty->print_cr("[GC concurrent-cleanup-start]"); } // Now do the remainder of the cleanup operation. _cm->completeCleanup(); // Notify anyone who's waiting that there are no more free // regions coming. We have to do this before we join the STS, // otherwise we might deadlock: a GC worker could be blocked // waiting for the notification whereas this thread will be // blocked for the pause to finish while it's trying to join // the STS, which is conditional on the GC workers finishing. g1h->reset_free_regions_coming(); _sts.join(); g1_policy->record_concurrent_mark_cleanup_completed(); _sts.leave(); double cleanup_end_sec = os::elapsedTime(); if (PrintGC) { gclog_or_tty->date_stamp(PrintGCDateStamps); gclog_or_tty->stamp(PrintGCTimeStamps); gclog_or_tty->print_cr("[GC concurrent-cleanup-end, %1.7lf]", cleanup_end_sec - cleanup_start_sec); } } guarantee(cm()->cleanup_list_is_empty(), "at this point there should be no regions on the cleanup list"); if (cm()->has_aborted()) { if (PrintGC) { gclog_or_tty->date_stamp(PrintGCDateStamps); gclog_or_tty->stamp(PrintGCTimeStamps); gclog_or_tty->print_cr("[GC concurrent-mark-abort]"); } } // we now want to allow clearing of the marking bitmap to be // suspended by a collection pause. _sts.join(); _cm->clearNextBitmap(); _sts.leave(); } // Update the number of full collections that have been // completed. This will also notify the FullGCCount_lock in case a // Java thread is waiting for a full GC to happen (e.g., it // called System.gc() with +ExplicitGCInvokesConcurrent). _sts.join(); g1h->increment_full_collections_completed(true /* concurrent */); _sts.leave(); } assert(_should_terminate, "just checking"); terminate(); }
void go_cl(config c) { cl_int error; cl_platform_id platform; cl_device_id device; cl_context context; cl_command_queue commands; cl_program program; cl_kernel kernel; cl_event ev; cl_mem mem; float *result = NULL; char buf[256]; int work_size[2] = { 128, 128 }; int offset[2]; int x, y; cl_float4 camera_pos = { c.camera_pos.x, c.camera_pos.y, c.camera_pos.z, 0 }; cl_float4 camera_dir = { c.camera_target.x - c.camera_pos.x, c.camera_target.y - c.camera_pos.y, c.camera_target.z - c.camera_pos.z, 0 }; cl_float4 light_pos = { c.light_pos.x, c.light_pos.y, c.light_pos.z, 0 }; cl_int2 image_size = { c.width, c.height }; sprintf(buf, "-DBAILOUT=%d -DSCALE=%f -DFOV=%f", c.bailout, c.scale, c.fov); printf("Starting\n"); init_cl(&platform, &device, &context, &commands); dump_info(platform, device); printf("Creating kernel\n"); program = load_program_from_file("kernel.cl", context, device, buf); kernel = create_kernel(program, "test"); printf("Setting memory\n"); mem = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * c.width * c.height, NULL, &error); check_error(error, "Could not allocate buffer"); error = clSetKernelArg(kernel, 0, sizeof(cl_mem), &mem); error = clSetKernelArg(kernel, 1, sizeof(cl_float4), &camera_pos); error = clSetKernelArg(kernel, 2, sizeof(cl_float4), &camera_dir); error = clSetKernelArg(kernel, 3, sizeof(cl_float4), &light_pos); error = clSetKernelArg(kernel, 4, sizeof(cl_int2), &image_size); clFinish(commands); printf("Running\n"); for (x = 0; x < c.width; x += work_size[0]) { for (y = 0; y < c.height; y += work_size[1]) { offset[0] = x; offset[1] = y; error = clEnqueueNDRangeKernel(commands, kernel, 2, offset, work_size, NULL, 0, NULL, &ev); printf("."); } } clFinish(commands); printf("\nWriting image\n"); result = malloc(sizeof(float) * c.width * c.height); error = clEnqueueReadBuffer(commands, mem, CL_TRUE, 0, sizeof(float) * c.width * c.height, result, 0, NULL, &ev); clFinish(commands); save(result, c, 0, 0); free(result); clReleaseMemObject(mem); release_cl(context, program, commands, kernel); }
int main (int argc, char** argv) { /*---------------------- Initialize OpenGL and OpenCL ----------------------*/ if (init_gl(argc,argv,&glinfo, window_size, "RT") != 0){ std::cerr << "Failed to initialize GL" << std::endl; exit(1); } else { std::cout << "Initialized GL succesfully" << std::endl; } if (init_cl(glinfo,&clinfo) != CL_SUCCESS){ std::cerr << "Failed to initialize CL" << std::endl; exit(1); } else { std::cout << "Initialized CL succesfully" << std::endl; } print_cl_info(clinfo); /* Initialize device interface */ if (device.initialize(clinfo)) { std::cerr << "Failed to initialize device interface" << std::endl; exit(1); } /* Initialize generic gpu library */ DeviceFunctionLibrary::initialize(clinfo); /*---------------------- Create shared GL-CL texture ----------------------*/ gl_tex = create_tex_gl(window_size[0],window_size[1]); tex_id = device.new_memory(); DeviceMemory& tex_mem = device.memory(tex_id); if (tex_mem.initialize_from_gl_texture(gl_tex)) { std::cerr << "Failed to create memory object from gl texture" << std::endl; exit(1); } /*---------------------- Set up scene ---------------------------*/ frames = 15; const std::string pack = "pack1OBJ"; // const std::string pack = "pack2OBJ"; double wave_attenuation = 1.f; scenes.resize(frames); for (uint32_t i = 0; i < frames ; ++i) { scenes[i].initialize(); std::stringstream grid_path,visual_path; grid_path << "models/obj/" << pack << "/gridFluid" << i+1 << ".obj"; mesh_id grid_mid = scenes[i].load_obj_file_as_aggregate(grid_path.str()); object_id grid_oid = scenes[i].add_object(grid_mid); Object& grid = scenes[i].object(grid_oid); grid.mat.diffuse = White; grid.mat.reflectiveness = 0.95f; grid.mat.refractive_index = 1.5f; grid.geom.setScale(makeVector(1.f,wave_attenuation,1.f)); /* ---- Solids ------ */ // visual_path << "models/obj/" << pack << "/visual" << visual_frame << ".obj"; // mesh_id visual_mid = scenes[i].load_obj_file_as_aggregate(visual_path.str()); // object_id visual_oid = scenes[i].geometry.add_object(visual_mid); // Object& visual = scenes[i].geometry.object(visual_oid); // visual.mat.diffuse = Red; // for (uint32_t j = 0; j < bridge_parts ; ++j) { // std::stringstream bridge_path; // bridge_path << "models/obj/" << pack << "/bridge" << j << i+1 << ".obj"; // mesh_id bridge_mid = scenes[i].load_obj_file_as_aggregate(bridge_path.str()); // object_id bridge_oid = scenes[i].geometry.add_object(bridge_mid); // Object& bridge = scenes[i].geometry.object(bridge_oid); // bridge.mat.diffuse = Green; // } // mesh_id teapot_mesh_id = scenes[i].load_obj_file_as_aggregate("models/obj/teapot2.obj"); // mesh_id teapot_mesh_id = scenes[i].load_obj_file_as_aggregate("models/obj/teapot-low_res.obj"); // object_id teapot_obj_id = scenes[i].geometry.add_object(teapot_mesh_id); // Object& teapot_obj = scenes[i].geometry.object(teapot_obj_id); // teapot_obj.geom.setPos(makeVector(-1.f,0.f,0.f)); // teapot_obj.geom.setScale(makeVector(3.f,3.f,3.f)); // teapot_obj.mat.diffuse = Green; // teapot_obj.mat.shininess = 1.f; // teapot_obj.mat.reflectiveness = 0.3f; /* ------------------*/ // scenes[i].set_accelerator_type(KDTREE_ACCELERATOR); scenes[i].set_accelerator_type(BVH_ACCELERATOR); scenes[i].create_aggregate_mesh(); scenes[i].create_aggregate_accelerator(); Mesh& scene_mesh = scenes[i].get_aggregate_mesh(); if (scenes[i].transfer_aggregate_mesh_to_device() || scenes[i].transfer_aggregate_accelerator_to_device()) std::cerr << "Failed to transfer scene info to device"<< std::endl; /*---------------------- Print scene data ----------------------*/ std::cerr << "Scene " << i << " stats: " << std::endl; std::cerr << "\tTriangle count: " << scene_mesh.triangleCount() << std::endl; std::cerr << "\tVertex count: " << scene_mesh.vertexCount() << std::endl; std::cerr << std::endl; } /*---------------------- Set initial Camera paramaters -----------------------*/ camera.set(makeVector(0,3,-30), makeVector(0,0,1), makeVector(0,1,0), M_PI/4., window_size[0] / (float)window_size[1]); /*---------------------------- Set tile size ------------------------------*/ best_tile_size = clinfo.max_compute_units * clinfo.max_work_item_sizes[0]; best_tile_size *= 64; best_tile_size = std::min(pixel_count, best_tile_size); /*---------------------- Initialize ray bundles -----------------------------*/ int32_t ray_bundle_size = best_tile_size * 4; if (ray_bundle_1.initialize(ray_bundle_size)) { std::cerr << "Error initializing ray bundle 1" << std::endl; std::cerr.flush(); exit(1); } if (ray_bundle_2.initialize(ray_bundle_size)) { std::cerr << "Error initializing ray bundle 2" << std::endl; std::cerr.flush(); exit(1); } std::cout << "Initialized ray bundles succesfully" << std::endl; /*---------------------- Initialize hit bundle -----------------------------*/ int32_t hit_bundle_size = ray_bundle_size; if (hit_bundle.initialize(hit_bundle_size)) { std::cerr << "Error initializing hit bundle" << std::endl; std::cerr.flush(); exit(1); } std::cout << "Initialized hit bundle succesfully" << std::endl; /*----------------------- Initialize cubemap ---------------------------*/ if (cubemap.initialize("textures/cubemap/Path/posx.jpg", "textures/cubemap/Path/negx.jpg", "textures/cubemap/Path/posy.jpg", "textures/cubemap/Path/negy.jpg", "textures/cubemap/Path/posz.jpg", "textures/cubemap/Path/negz.jpg")) { std::cerr << "Failed to initialize cubemap." << std::endl; exit(1); } std::cerr << "Initialized cubemap succesfully." << std::endl; /*------------------------ Initialize FrameBuffer ---------------------------*/ if (framebuffer.initialize(window_size)) { std::cerr << "Error initializing framebuffer." << std::endl; exit(1); } std::cout << "Initialized framebuffer succesfully." << std::endl; /* ------------------ Initialize ray tracer kernel ----------------------*/ if (tracer.initialize()){ std::cerr << "Failed to initialize tracer." << std::endl; return 0; } std::cerr << "Initialized tracer succesfully." << std::endl; /* ------------------ Initialize Primary Ray Generator ----------------------*/ if (prim_ray_gen.initialize()) { std::cerr << "Error initializing primary ray generator." << std::endl; exit(1); } std::cout << "Initialized primary ray generator succesfully." << std::endl; /* ------------------ Initialize Secondary Ray Generator ----------------------*/ if (sec_ray_gen.initialize()) { std::cerr << "Error initializing secondary ray generator." << std::endl; exit(1); } sec_ray_gen.set_max_rays(ray_bundle_1.count()); std::cout << "Initialized secondary ray generator succesfully." << std::endl; /*------------------------ Initialize RayShader ---------------------------*/ if (ray_shader.initialize()) { std::cerr << "Error initializing ray shader." << std::endl; exit(1); } std::cout << "Initialized ray shader succesfully." << std::endl; /*----------------------- Enable timing in all clases -------------------*/ framebuffer.timing(true); prim_ray_gen.timing(true); sec_ray_gen.timing(true); tracer.timing(true); ray_shader.timing(true); /*------------------------- Count mem usage -----------------------------------*/ int32_t total_cl_mem = 0; total_cl_mem += pixel_count * 4; /* 4bpp texture */ // for (uint32_t i = 0; i < frames; ++i) // total_cl_mem += scene_info[i].size(); total_cl_mem += ray_bundle_1.mem().size() + ray_bundle_2.mem().size(); total_cl_mem += hit_bundle.mem().size(); total_cl_mem += cubemap.positive_x_mem().size() * 6; total_cl_mem += framebuffer.image_mem().size(); std::cout << "\nMemory stats: " << std::endl; std::cout << "\tTotal opencl mem usage: " << total_cl_mem/1e6 << " MB." << std::endl; // for (uint32_t i = 0; i < frames; ++i) // std::cout << "\tScene " << i << " mem usage: " << scene_info[i].size()/1e6 << " MB." << std::endl; std::cout << "\tFramebuffer+Tex mem usage: " << (framebuffer.image_mem().size() + pixel_count * 4)/1e6 << " MB."<< std::endl; std::cout << "\tCubemap mem usage: " << (cubemap.positive_x_mem().size()*6)/1e6 << " MB."<< std::endl; std::cout << "\tRay mem usage: " << (ray_bundle_1.mem().size()*2)/1e6 << " MB."<< std::endl; std::cout << "\tRay hit info mem usage: " << hit_bundle.mem().size()/1e6 << " MB."<< std::endl; /* !! ---------------------- Test area ---------------- */ std::cerr << std::endl; std::cerr << "Misc info: " << std::endl; std::cerr << "Tile size: " << best_tile_size << std::endl; std::cerr << "Tiles: " << pixel_count / (float)best_tile_size << std::endl; std::cerr << "color_cl size: " << sizeof(color_cl) << std::endl; std::cerr << "directional_light_cl size: " << sizeof(directional_light_cl) << std::endl; std::cerr << "ray_cl size: " << sizeof(ray_cl) << std::endl; std::cerr << "sample_cl size: " << sizeof(sample_cl) << std::endl; std::cerr << "sample_trace_info_cl size: " << sizeof(sample_trace_info_cl) << std::endl; /*------------------------ Set GLUT and misc functions -----------------------*/ rt_time.snap_time(); seq_time.snap_time(); glutKeyboardFunc(gl_key); glutMotionFunc(gl_mouse); glutDisplayFunc(gl_loop); glutIdleFunc(gl_loop); glutMainLoop(); clinfo.release_resources(); return 0; }
int main(int argc, char **argv) { glfwSetErrorCallback(error_callback); /* Initialize the library */ if (!glfwInit()){ fprintf(stderr, "Initialization failed.\n"); return 1; } /* Create a windowed mode window and its OpenGL context */ window = glfwCreateWindow(width, height, "Hello World", NULL, NULL); if (!window) { glfwTerminate(); fprintf(stderr, "Error creating window.\n"); return 1; } /* Make the window's context current */ glfwMakeContextCurrent(window); glfwSetInputMode(window, GLFW_STICKY_MOUSE_BUTTONS, 1); glfwSetKeyCallback(window, key_callback); glfwSetMouseButtonCallback(window, mouse_button_callback); glfwSetCursorPosCallback(window, cursor_pos_callback); //**************************** generowanie przykładowych piksli hs_init(&argc, &argv); initOctTree(); hs_exit(); float *piksele = malloc(height*width*3*sizeof(*piksele)); printf("sizeof(OctTreeNode)=%d\n", (int)sizeof(OctTreeNode)); //**************************** init_cl(); turnCamera(0.f,0.f,0.f); // Calculates initial camera direction fflush(stderr); /* Loop until the user closes the window */ while (!glfwWindowShouldClose(window)) { /* Render here */ for (int i = 0; i < height * width * 3; i++) piksele[i] = 0.0; clock_t start = clock(); captureOctTree(camera_pos, camera_target, up, width, height, piksele); clock_t end = clock(); // show render time in window title char title[16]; snprintf(title, 16, "%d ms", (int)((end - start) / (CLOCKS_PER_SEC / 1000))); glfwSetWindowTitle(window, title); /* Swap front and back buffers */ glfwSwapBuffers(window); /* Poll for and process events */ glfwPollEvents(); } if (num_platforms > 0) { clReleaseMemObject(mainOctCL); clReleaseMemObject(image); clReleaseKernel(kernel); clReleaseCommandQueue(queue); } glfwDestroyWindow(window); glfwTerminate(); return 0; }
cl_int cl_runner::execute(const char *filename) { if (filename == NULL || m_clContext == NULL) return -1; if (!m_bInitCL) init_cl(); // Error code cl_int err_num = CL_SUCCESS; char *source_content = NULL; size_t src_length = 0; bool source_need_free = false; #if 1 std::ifstream ifs; source_need_free = true; err_num = clLoadProgramSource(filename, (const char **)&source_content, (size_t *)&src_length); if (err_num != CL_SUCCESS || src_length == 0 || source_content == NULL) { DOL_TRACE1("cl_runner: Error load program source. ErrCode = %d \n", err_num); return err_num; } #else std::ifstream ifs(filename, std::ios_base::binary); if (!ifs.good()) { DOL_TRACE("cl_runner: Error load program source, open source file failed.\n"); return -1; } // get file length ifs.seekg(0, std::ios_base::end); size_t length = ifs.tellg(); ifs.seekg(0, std::ios_base::beg); // read program source std::vector<char> data(length + 1); ifs.read(&data[0], length); data[length] = 0; // create and build program source_content = &data[0]; src_length = length; #endif // Create the program err_num = CL_SUCCESS; m_clProgram = clCreateProgramWithSource(m_clContext, 1, (const char **)&source_content, &src_length, &err_num); // 加载文件内容 if (source_need_free) { if (source_content != NULL) { free(source_content); source_content = NULL; } } ifs.close(); DOL_ASSERT(err_num == CL_SUCCESS); if (err_num != CL_SUCCESS || m_clProgram == NULL) { DOL_TRACE1("cl_runner: Error create program with source. ErrCode = %d \n", err_num); return err_num; } // Build the program err_num = clBuildProgram(m_clProgram, 1, &m_clDeviceId, NULL, NULL, NULL); // 编译cl程序 DOL_ASSERT(err_num == CL_SUCCESS); if (err_num != CL_SUCCESS) { DOL_TRACE1("cl_runner: Error build program. ErrCode = %d \n", err_num); return err_num; } // Show the log char *build_log; size_t log_size; // First call to know the proper size err_num = clGetProgramBuildInfo(m_clProgram, m_clDeviceId, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); if (err_num != CL_SUCCESS) { DOL_TRACE1("cl_runner: Error get program build info step 1. ErrCode = %d \n", err_num); return err_num; } build_log = new char[log_size + 1]; // 编译CL的出错记录 if (build_log == NULL) return (cl_int)-1; // Second call to get the log err_num = clGetProgramBuildInfo(m_clProgram, m_clDeviceId, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL); if (err_num != CL_SUCCESS) { if (build_log) { delete build_log; build_log = NULL; } DOL_TRACE1("cl_runner: Error get program build info step 2. ErrCode = %d \n", err_num); return err_num; } build_log[log_size] = '\0'; std::string strLog(build_log); strLog += "\n"; DOL_TRACE(strLog.c_str()); // 因为cl程序是在运行时编译的,在运行过程中如果出错,显示编译CL文件的错误,以便查找问题 if (build_log) { delete build_log; build_log = NULL; } // set seed for rand() ::srand(FIXED_SRAND_SEED); const unsigned int DATA_SIZE = 1048576; std::vector<CL_FLOAT_T> a(DATA_SIZE), b(DATA_SIZE), ret(DATA_SIZE); for (unsigned int i = 0; i < DATA_SIZE; ++i) { a[i] = std::rand() / (CL_FLOAT_T)RAND_MAX; b[i] = std::rand() / (CL_FLOAT_T)RAND_MAX; ret[i] = 0.0; } // Allocate the buffer memory objects cl_mem cl_a = clCreateBuffer(m_clContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(CL_FLOAT_T) * DATA_SIZE, (void *)&a[0], NULL); cl_mem cl_b = clCreateBuffer(m_clContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(CL_FLOAT_T) * DATA_SIZE, (void *)&b[0], NULL); cl_mem cl_ret = clCreateBuffer(m_clContext, CL_MEM_READ_WRITE, sizeof(CL_FLOAT_T) * DATA_SIZE, (void *)NULL, NULL); cl_mem cl_num = clCreateBuffer(m_clContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_uint), (void *)&DATA_SIZE, NULL); // 创建Kernel对应的函数 // Extracting the kernel #if defined(USE_CL_DOUBLE) && (USE_CL_DOUBLE != 0) m_clKernel = clCreateKernel(m_clProgram, "vector_add_double", &err_num); // 这个引号中的字符串要对应cl文件中的kernel函数 #else m_clKernel = clCreateKernel(m_clProgram, "vector_add_float", &err_num); // 这个引号中的字符串要对应cl文件中的kernel函数 #endif DOL_ASSERT(err_num == CL_SUCCESS); if (err_num != CL_SUCCESS) { DOL_TRACE1("cl_runner: Error create kernel. ErrCode = %d \n", err_num); return err_num; } if (m_clKernel != NULL) { // Set the args values err_num = clSetKernelArg(m_clKernel, 0, sizeof(cl_mem), &cl_a); err_num |= clSetKernelArg(m_clKernel, 1, sizeof(cl_mem), &cl_b); err_num |= clSetKernelArg(m_clKernel, 2, sizeof(cl_mem), &cl_ret); //err_num |= clSetKernelArg(m_clKernel, 3, sizeof(cl_mem), &cl_num); err_num |= clSetKernelArg(m_clKernel, 3, sizeof(cl_uint), &DATA_SIZE); if (err_num != CL_SUCCESS) return err_num; // Set work-item dimensions size_t globalWorkSize[2]; size_t localWorkSize[2] = { 0 }; globalWorkSize[0] = DATA_SIZE; globalWorkSize[1] = DATA_SIZE; sw_kernel.start(); // Execute kernel err_num = clEnqueueNDRangeKernel(m_clCmdQueue, m_clKernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL); sw_kernel.stop(); if (err_num != CL_SUCCESS) { if (err_num == CL_INVALID_KERNEL_ARGS) DOL_TRACE("Invalid kernel args \n"); else DOL_TRACE1("cl_runner: Error enqueue NDRange kernel. ErrCode = %d \n", err_num); //return err_num; } // Read output array if (err_num == CL_SUCCESS) { sw_kernel_readBuffer.start(); err_num = clEnqueueReadBuffer(m_clCmdQueue, cl_ret, CL_TRUE, 0, sizeof(CL_FLOAT_T) * DATA_SIZE, &ret[0], 0, NULL, NULL); sw_kernel_readBuffer.stop(); if (err_num != CL_SUCCESS) { DOL_TRACE1("cl_runner: Error enqueue read buffer. ErrCode = %d \n", err_num); //return err_num; } if (err_num == CL_SUCCESS) { bool correct = true; CL_FLOAT_T diff; for (unsigned int i = 0; i < DATA_SIZE; ++i) { diff = a[i] + b[i] - ret[i]; if (fabs(diff) > 0.0001) { correct = false; break; } } if (correct) std::cout << "cl_runner: Data is correct" << endl; else std::cout << "cl_runner: Data is incorrect" << endl; } else { std::cerr << "cl_runner: Can't run kernel or read back data" << endl; } } } if (cl_a) clReleaseMemObject(cl_a); if (cl_b) clReleaseMemObject(cl_b); if (cl_ret) clReleaseMemObject(cl_ret); if (cl_num) clReleaseMemObject(cl_num); if (m_clKernel) { clReleaseKernel(m_clKernel); m_clKernel = NULL; } if (m_clProgram) { clReleaseProgram(m_clProgram); m_clProgram = NULL; } return err_num; }
void transform(int deviceNumber, int levels, unsigned w, unsigned h, unsigned bits) { uint64_t cbinput=uint64_t(w)*bits/8; std::vector<uint64_t> input(2*(cbinput/8)); std::vector<uint64_t> output(2*(cbinput/8)); std::vector<uint32_t> unpackedInput(2*(cbinput/4)); std::vector<uint32_t> unpackedOutput(2*(cbinput/4)); uint64_t* inputWriteptr = &input[0]; uint64_t* inputReadptr = &input[cbinput/8]; uint32_t* unpackedInputReadptr = &unpackedInput[cbinput/4]; uint32_t* unpackedInputWriteptr = &unpackedInput[0]; uint32_t* unpackedOutputReadptr = &unpackedOutput[cbinput/4]; uint32_t* unpackedOutputWriteptr = &unpackedOutput[0]; uint64_t* outputWriteptr = &output[0]; uint64_t* outputReadptr = &output[cbinput/8]; std::vector<uint32_t> gpuReadOffsets(2*levels+1,0); std::vector<uint32_t> gpuWriteOffsets(2*levels+1,0); std::vector<uint32_t> aboveOverrides(2*levels,0); std::vector<uint32_t> belowOverrides(2*levels,0); int j=0; for (int i=0; i<2*levels+1; i++) { gpuWriteOffsets[i] = j; j+= 2; j = j - (4 & -(j >= 4)); j = j + (4 & -(j < 0)); gpuReadOffsets[i] = j; } auto cl_instance = init_cl(levels,w,h,bits,"pipeline_kernels.cl",deviceNumber); tbb::task_group group; bool finished = false; int fullness = 0; bool full = false; int tailEnd = 4*levels+5; int image_line = 0; while(1){ for (int i=0; i<levels; i++) { if (image_line == 4 + 2*i) aboveOverrides[i] = 0x0 ^ -(levels < 0); else aboveOverrides[i] = 0xFFFFFFFF ^ -(levels < 0); if (image_line == 3 + 2*i) belowOverrides[i] = 0x0 ^ -(levels < 0); else belowOverrides[i] = 0xFFFFFFFF ^ -(levels < 0); } for (int i=levels; i<2*levels; i++) { if (image_line == 4 + 2*i) aboveOverrides[i] = 0xFFFFFFFF ^ -(levels < 0); else aboveOverrides[i] = 0x0 ^ -(levels < 0); if (image_line == 3 + 2*i) belowOverrides[i] = 0xFFFFFFFF ^ -(levels < 0); else belowOverrides[i] = 0x0 ^ -(levels < 0); } group.run([&](){ if(!finished && !read_blob(STDIN_FILENO, cbinput, inputWriteptr)) finished = true; unpack_blob_32(cbinput, inputReadptr, unpackedInputWriteptr); pack_blob_32(cbinput, unpackedOutputReadptr, outputWriteptr); if (fullness >= 4*levels+6 || full) { full = true; write_blob(STDOUT_FILENO, cbinput, outputReadptr); } else { fullness++; } }); group.run([&](){ process_opencl_packed_line(levels, w, bits, gpuReadOffsets, gpuWriteOffsets, unpackedInputReadptr, unpackedOutputWriteptr, aboveOverrides, belowOverrides, cl_instance); for (int i=0; i<2*levels+1; i++) { int j = gpuWriteOffsets[i]; j++; j = j - (4 & -(j >= 4)); gpuWriteOffsets[i] = j; j = gpuReadOffsets[i]; j++; j = j - (4 & -(j >= 4)); gpuReadOffsets[i] = j; } }); group.wait(); if (tailEnd == 0) { break; } if (finished) tailEnd--; std::swap(inputReadptr, inputWriteptr); std::swap(unpackedInputReadptr, unpackedInputWriteptr); std::swap(unpackedOutputReadptr, unpackedOutputWriteptr); std::swap(outputReadptr, outputWriteptr); image_line++; if (image_line == h) image_line = 0; } }
int test_cl_devices(int levels, unsigned w, unsigned h, unsigned bits, std::string source) { std::vector<cl::Platform> platforms; cl::Platform::get(&platforms); if(platforms.size()==0) throw std::runtime_error("No OpenCL platforms found."); int selectedPlatform=0; if(getenv("HPCE_SELECT_PLATFORM")){ selectedPlatform=atoi(getenv("HPCE_SELECT_PLATFORM")); } cl::Platform platform=platforms.at(selectedPlatform); std::vector<cl::Device> devices; platform.getDevices(CL_DEVICE_TYPE_ALL, &devices); if(devices.size()==0){ throw std::runtime_error("No opencl devices found.\n"); } uint64_t best = 0xFFFFFFFFFFFFFFFFul; int best_device = -1; for(unsigned i=0;i<devices.size();i++){ uint64_t microseconds = 0xFFFFFFFFFFFFFFFFul; bool failedAttempt = false; try { uint64_t cbinput=uint64_t(w)*bits/8; std::vector<uint32_t> gpuReadOffsets(2*levels+1,0); std::vector<uint32_t> gpuWriteOffsets(2*levels+1,0); std::vector<uint32_t> aboveOverrides(2*levels,0); std::vector<uint32_t> belowOverrides(2*levels,0); std::vector<uint32_t> unpackedInput(2*(cbinput/4)); std::vector<uint32_t> unpackedOutput(2*(cbinput/4)); uint32_t* unpackedInputReadptr = &unpackedInput[cbinput/4]; uint32_t* unpackedOutputWriteptr = &unpackedOutput[0]; auto cl_instance = init_cl(levels,w,h,bits,"pipeline_kernels.cl",i); timeval before, after; gettimeofday(&before, NULL); for(int i=0; i<100; i++) { process_opencl_packed_line(levels, w, bits, gpuReadOffsets, gpuWriteOffsets, unpackedInputReadptr, unpackedOutputWriteptr, aboveOverrides, belowOverrides, cl_instance); } gettimeofday(&after, NULL); microseconds =(after.tv_sec - before.tv_sec)*1000000L + after.tv_usec - before.tv_usec; } catch (std::exception) { failedAttempt = true; } if (failedAttempt) continue; if (microseconds < best) { best_device = i; best = microseconds; } } return best_device; }