void Init_barracuda() { id_times = rb_intern("times"); id_new = rb_intern("new"); id_to_s = rb_intern("to_s"); id_data_type = rb_intern("data_type"); id_buffer_data = rb_intern("buffer_data"); rb_hTypes = rb_hash_new(); rb_define_method(rb_mKernel, "Type", type_new, 1); types_hash_init(); rb_mBarracuda = rb_define_module("Barracuda"); rb_define_const(rb_mBarracuda, "VERSION", rb_str_new2(VERSION_STRING)); rb_define_const(rb_mBarracuda, "TYPES", rb_hTypes); rb_eProgramSyntaxError = rb_define_class_under(rb_mBarracuda, "SyntaxError", rb_eSyntaxError); rb_eOpenCLError = rb_define_class_under(rb_mBarracuda, "OpenCLError", rb_eStandardError); rb_cProgram = rb_define_class_under(rb_mBarracuda, "Program", rb_cObject); rb_define_alloc_func(rb_cProgram, program_s_allocate); rb_define_method(rb_cProgram, "initialize", program_initialize, -1); rb_define_method(rb_cProgram, "compile", program_compile, 1); rb_define_method(rb_cProgram, "method_missing", program_method_missing, -1); rb_cBuffer = rb_define_class_under(rb_mBarracuda, "Buffer", rb_cArray); rb_define_method(rb_cBuffer, "initialize", buffer_initialize, -1); rb_define_method(rb_cBuffer, "outvar", buffer_outvar, 0); rb_define_method(rb_cBuffer, "outvar?", buffer_is_outvar, 0); rb_define_method(rb_cBuffer, "mark_dirty", buffer_mark_dirty, 0); rb_define_method(rb_cBuffer, "dirty?", buffer_dirty, 0); rb_cType = rb_define_class_under(rb_mBarracuda, "Type", rb_cObject); rb_define_method(rb_cType, "initialize", type_initialize, 1); rb_define_method(rb_cType, "method_missing", type_method_missing, 1); rb_define_method(rb_cType, "object", type_object, 0); rb_define_method(rb_cArray, "outvar", array_to_outvar, 0); rb_define_method(rb_cObject, "to_type", object_to_type, 1); rb_define_method(rb_cFixnum, "to_type", fixnum_to_type, 1); rb_define_method(rb_cObject, "data_type", object_data_type_get, 0); rb_define_method(rb_cArray, "data_type", array_data_type_get, 0); rb_define_method(rb_cFixnum, "data_type", fixnum_data_type_get, 0); rb_define_method(rb_cBignum, "data_type", bignum_data_type_get, 0); rb_define_method(rb_cFloat, "data_type", float_data_type_get, 0); init_opencl(); }
int main (int argc, char* const *argv) { char filepath[MAXPATHLEN]; filepath[0] = '\0'; process_arguments(argc, argv, filepath); // Perform typical OpenCL setup in order to obtain a context and command // queue. init_opencl(); // Check if the current architecture is compatible with the specified test options if (device_type == CL_DEVICE_TYPE_CPU) { #if __LP64__ if (is32bit) fprintf(stderr, "Warning: user specified the 'cpu32' option on the 64bit architecture.\n"); #else if (!is32bit) fprintf(stderr, "Warning: user specified the 'cpu64' option on the 32bit architecture.\n"); #endif } else if (device_type == CL_DEVICE_TYPE_GPU) { cl_int err; cl_uint address_bits = 0; err = clGetDeviceInfo(device, CL_DEVICE_ADDRESS_BITS, sizeof(address_bits), &address_bits, NULL); if (!is32bit && (address_bits == 32)) fprintf(stderr, "Warning: user specified the 'gpu64' option on the 32bit architecture.\n"); else if (is32bit && (address_bits == 64)) fprintf(stderr, "Warning: user specified the 'gpu32' option on the 64bit architecture.\n"); } // Obtain a CL program and kernel from our pre-compiled bitcode file and // test it by running the kernel on some test data. create_program_from_bitcode(filepath); // Close everything down. shutdown_opencl(); return 0; }
static PyObject * pl_allocate_memory(PyObject *self, PyObject *args) { PyObject *result = NULL; char format[4]; format[0] = IndexFormatUnit; format[1] = IndexFormatUnit; format[2] = IndexFormatUnit; format[3] = '\0'; if (PyArg_ParseTuple(args, format, &natoms, &nres, &nchains)) { create_atoms_array(&atoms, natoms); create_coords_array(&coords, natoms); create_distances_array(&distances, natoms); create_distances_array(&invdistances, natoms); create_bool_array(&atommask, natoms); create_distmask_array(&distmask, natoms); set_atom_coords(atoms, coords, natoms); create_residues_array(&residues, nres); create_chains_array(&chains, nchains); send_system_to_modules(atoms, coords, distances, invdistances, atommask, distmask, natoms, residues, nres, chains, nchains); #ifdef __USE_OPENCL_CODEPATH__ init_opencl(&clcontext, &cldevices, &clqueue); set_opencl_memory(&clcoords, &cldistmask, &cldistances, clcontext, coords, distmask, distances, natoms); compile_opencl_kernels(&clprogram, &cldistkernel, &cldistsqkernel, &clinvdistkernel, clcontext); send_opencl_to_modules(&clprogram, &cldistkernel, &cldistsqkernel, &clinvdistkernel, &clcontext, &clqueue, &clcoords, &cldistmask, &cldistances); #endif result = Py_BuildValue("i", 0); } return result; }
void run_opencl_test(use_gpu){ init_opencl(use_gpu); load_cl_kernels(&clData); allocate_cl_buffers(&clData); transfer_buffers_to_gpu(); flush_cl_queue(); run_cl_advect_density(&clData, dt); flush_cl_queue(); transfer_buffers_to_cpu(); flush_cl_queue(); // printf("dens[%d] = %3.2f\n",IX(16,3,0),g_dens[IX(16,3,0)]); // // if(g_dens[IX(16,3,0)] > 0.0f) // { // printf("Success!!\n"); // } // // for (int i = 0; i < clData.n; ++i) // { // if(i == 112) { // int j = i*clData.dn; // printf("debug_data1[%d] = %3.2f, %3.2f, %3.2f, %3.2f\n",i,clData.debug_data1[j], clData.debug_data1[j+1], clData.debug_data1[j+2], clData.debug_data1[j+3]); // } // // } cleanup_cl(&clData); }
int main(int argc, char **argv) { size_t g_work_size, l_work_size; cl_int error; if (argc != 2) { fprintf(stderr, "Usage: worms matchlist\n"); exit(EXIT_FAILURE); } init_opencl(); load_round_configs(argv[1]); prepare_job_scenario(); /* Start this processing scenario. */ g_work_size = NUM_STREAM_PROCS; l_work_size = NUM_STREAM_PROCS; error = clEnqueueNDRangeKernel(cmd_queue, kernel, 1, NULL, &g_work_size, &l_work_size, 0, NULL, &kernel_completion); check_error("enquing kernel", error); unsigned int output_data[256]; error = clEnqueueReadBuffer(cmd_queue, output_buf, true, 0, 1024, output_data, 1, &kernel_completion, NULL); unsigned int i; for (i = 0; i < 256; i++) { printf("%d,", output_data[i]); if ((i % 8) == 7) printf("\n"); } exit(EXIT_SUCCESS); }
ortho() : pump_factor_(4) { // glVertexPointer(4, GL_FLOAT, 0, ptr); //glEnableClientState( GL_VERTEX_ARRAY ); // glColorPointer( //std::ifstream is( "cryistal-castle-hidden-ramp.txt" ); // std::ifstream is( "house1.txt" ); //std::ifstream is( "cryistal-castle-tree-wave.txt" ); // assert( is.good() ); // height_fields_ = crystal_bits::load_crystal(is, pump_factor_); // std::cout << "hf: " << height_fields_.size() << "\n"; // // // // scene_static_.init_solid(height_fields_); // // scene_static_.init_solid_from_crystal(is, pump_factor_); // scene_static_.init_planes(); // uint64_t scene_hash = scene_static_.hash(); // // try { // std::ifstream is( "ff.bin" ); // // // light_static_ = light_static( is, scene_hash ); // } catch( std::runtime_error x ) { // // std::cerr << "load failed. recreating. error:\n" << x.what() << std::endl; // // light_static_ = setup_formfactors(scene_static_.planes(), scene_static_.solid()); // } // // if( !false ) { // std::ofstream os( "ff.bin" ); // light_static_.write(os, scene_hash); // } // // // light_dynamic_ = light_dynamic(scene_static_.planes().size() ); CL_OpenGLWindowDescription desc; desc.set_size( CL_Size( 1024, 768 ), true ); desc.set_depth_size(16); //std::cout << "depth: " << desc.get_depth_size(); wnd_ = CL_DisplayWindow(desc); CL_GraphicContext_GL gc = wnd_.get_gc(); // //CL_Mat4f proj = CL_Mat4f::ortho( 0, 1024, 0, 768, 100, -100 ); gc.set_active(); #ifdef TEST_OPENCL try { init_opencl(); } catch( cl::Error x ) { // std::array<void*, 256> bt; // //void *bt[256]; // // size_t size = backtrace( bt.data(), bt.size() ); // char **strings = backtrace_symbols( bt.data(), size ); // std::cout << "backtrace: " << size << "\n"; // for( size_t i = 0; i < size; ++i ) { // std::cout << i << " " << strings[i] << "\n"; // } // free( strings ); std::cerr << "opencl initialization failed\ncall: " << x.what() << "\nerror code: " << cl_str_error( x.err() ) << "\n"; throw; } #endif // throw 0; glMatrixMode(GL_PROJECTION); //hello CL_Mat4f proj = CL_Mat4f::perspective( 60, 1.5, 2, 200 ); // CL_Mat4f proj = CL_Mat4f::ortho( -20.0 * pump_factor_, 20.0 * pump_factor_, -15.0 * pump_factor_, 15.0 * pump_factor_, 0, 200 ); //CL_Mat4f proj = CL_Mat4f::ortho( -40, 40, -30, 30, 0, 200 ); glLoadMatrixf( proj.matrix ); //CL_Texture tex(gc, 64, 64 ); struct texel { GLubyte col[3]; GLubyte alpha; texel() { col[0] = 128; col[1] = 128; col[2] = 128; alpha = 255; } }; std::array<texel,64 * 64> tex_data; glLightModelf(GL_LIGHT_MODEL_LOCAL_VIEWER, 0.0); // glGenTextures(1, &texName); // glBindTexture(GL_TEXTURE_2D, texName); // glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT); // glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT); // glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); // glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); // glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, 64, 64, 0, GL_RGBA, GL_UNSIGNED_BYTE, tex_data.data()); // gc.set_map_mode(cl_user_projection); // gc.set_projection(proj); // // gc.flush_batcher(); // glMatrixMode(GL_PROJECTION); glMatrixMode(GL_MODELVIEW); glEnable(GL_DEPTH_TEST); glDepthMask(GL_TRUE); glDepthFunc(GL_LESS); glEnable(GL_TEXTURE_2D); glShadeModel(GL_FLAT); }
void runTimings(int use_gpu){ int ntrips = 10; char device_name[256]; timestamp_type time1, time2; //////////////////////////////////////////////////// ///GPU TIMINGS //////////////////////////////////////////////////// init_opencl(use_gpu); load_cl_kernels(&clData); allocate_cl_buffers(&clData); print_device_info_from_queue(clData.queue); get_device_name_from_queue(clData.queue, device_name, 256); transfer_buffers_to_gpu(); double advectionVelocityTimeGPU, advectionDensityTimeGPU, divergenceTimeGPU, projectJacobiTimeGPU, projectCGTimeGPU, pressureApplyTimeGPU; transfer_buffers_to_gpu(); get_timestamp(&time1); for(int i = 0; i < ntrips; ++i) { run_cl_advect_velocity(&clData, dt); } flush_cl_queue(); get_timestamp(&time2); advectionVelocityTimeGPU = timestamp_diff_in_seconds(time1,time2)/ntrips; get_timestamp(&time1); for(int i = 0; i < ntrips; ++i) { run_cl_calculate_divergence(&clData, dt); } flush_cl_queue(); get_timestamp(&time2); divergenceTimeGPU = timestamp_diff_in_seconds(time1,time2)/ntrips; transfer_buffers_to_cpu(); flush_cl_queue(); //This needs ntrips different divergence matrices to get accurate timings. //This is because by the time the second time it is called it will detect //the system is solved and exit after one matrix get_timestamp(&time1); for(int i = 0; i < ntrips; ++i) { transfer_cl_float_buffer_from_device(&clData,clData.buf_pressure,g_pressure,clData.n,true); transfer_cl_float_buffer_from_device(&clData,clData.buf_divergence,g_divergence,clData.n,true); run_cl_cg_no_mtx(&clData,g_pressure, g_divergence, g_cg_r, g_cg_d, g_cg_q, clData.n, 10, 0.0001f); flush_cl_queue(); transfer_cl_float_buffer_to_device(&clData,clData.buf_pressure,g_pressure,clData.n,true); } flush_cl_queue(); get_timestamp(&time2); projectCGTimeGPU = timestamp_diff_in_seconds(time1,time2)/ntrips; get_timestamp(&time1); for(int i = 0; i < ntrips; ++i) { for(int i = 0; i < 20; ++i) { run_cl_pressure_solve(&clData, dt); } } flush_cl_queue(); get_timestamp(&time2); projectJacobiTimeGPU = timestamp_diff_in_seconds(time1,time2)/ntrips; get_timestamp(&time1); for(int i = 0; i < ntrips; ++i) { run_cl_pressure_apply(&clData, dt); } flush_cl_queue(); get_timestamp(&time2); pressureApplyTimeGPU = timestamp_diff_in_seconds(time1,time2)/ntrips; get_timestamp(&time1); for(int i = 0; i < ntrips; ++i) { run_cl_advect_density(&clData, dt); } flush_cl_queue(); get_timestamp(&time2); advectionDensityTimeGPU = timestamp_diff_in_seconds(time1,time2)/ntrips; printf("%s\t%dx%dx%d\t%s\t%s\t %3.6f\ts\t", device_name,NX,NY,NZ,"GPU","Advection Velocity",advectionVelocityTimeGPU); printf("%.3f\tMegaCells/s\n",(NX*NY*NZ)*1e-6/advectionVelocityTimeGPU); printf("%s\t%dx%dx%d\t%s\t%s\t %3.6f\ts\t", device_name,NX,NY,NZ,"GPU","Advection Density",advectionDensityTimeGPU); printf("%.3f\tMegaCells/s\n",(NX*NY*NZ)*1e-6/advectionDensityTimeGPU); printf("%s\t%dx%dx%d\t%s\t%s\t %3.6f\ts\t", device_name,NX,NY,NZ,"GPU", "Divergence",divergenceTimeGPU); printf("%.3f\tMegaCells/s\n",(NX*NY*NZ)*1e-6/divergenceTimeGPU); printf("%s\t%dx%dx%d\t%s\t%s\t %3.6f\ts\t", device_name,NX,NY,NZ,"GPU", "Projection Jacobi",projectJacobiTimeGPU); printf("%.3f\tMegaCells/s\n",(NX*NY*NZ)*1e-6/projectJacobiTimeGPU); printf("%s\t%dx%dx%d\t%s\t%s\t %3.6f\ts\t",device_name,NX,NY,NZ,"GPU", "Projection Conjugate Gradient",projectCGTimeGPU); printf("%.3f\tMegaCells/s\n",(NX*NY*NZ)*1e-6/projectCGTimeGPU); printf("%s\t%dx%dx%d\t%s\t%s\t %3.6f\ts\t", device_name,NX,NY,NZ,"GPU","Pressure Apply",pressureApplyTimeGPU); printf("%.3f\tMegaCells/s\n",(NX*NY*NZ)*1e-6/pressureApplyTimeGPU); cleanup_cl(&clData); //////////////////////////////////////////////////// ///CPU TIMINGS //////////////////////////////////////////////////// double advectionVelocityTimeCPU, advectionDensityTimeCPU, divergenceTimeCPU, projectJacobiTimeCPU, projectCGTimeCPU, pressureApplyTimeCPU; get_timestamp(&time1); for(int i = 0; i < ntrips; ++i) { advect_velocity_RK2(dt, g_u, g_v, g_w, g_u_prev, g_v_prev, g_w_prev); } get_timestamp(&time2); advectionVelocityTimeCPU = timestamp_diff_in_seconds(time1,time2)/ntrips; //project(dt,g_u,g_v, g_w, g_divergence, g_pressure, g_pressure_prev, g_laplacian_matrix,useCG); get_timestamp(&time1); for(int i = 0; i < ntrips; ++i) { calculate_divergence(g_divergence, g_u, g_v, g_w, dt); } get_timestamp(&time2); divergenceTimeCPU = timestamp_diff_in_seconds(time1,time2)/ntrips; //This needs ntrips different divergence matrices to get accurate timings. //This is because by the time the second time it is called it will detect //the system is solved and exit after one matrix get_timestamp(&time1); for(int i = 0; i < ntrips; ++i) { pressure_solve_cg_no_matrix(g_pressure, g_divergence, g_cg_r, g_cg_d, g_cg_q); } get_timestamp(&time2); projectCGTimeCPU = timestamp_diff_in_seconds(time1,time2)/ntrips; get_timestamp(&time1); for(int i = 0; i < ntrips; ++i) { pressure_solve(g_pressure,g_pressure_prev, g_divergence, dt); } get_timestamp(&time2); projectJacobiTimeCPU = timestamp_diff_in_seconds(time1,time2)/ntrips; get_timestamp(&time1); for(int i = 0; i < ntrips; ++i) { pressure_apply(g_u, g_v, g_w, g_pressure, dt); } get_timestamp(&time2); pressureApplyTimeCPU = timestamp_diff_in_seconds(time1,time2)/ntrips; get_timestamp(&time1); for(int i = 0; i < ntrips; ++i) { advectRK2(dt,g_dens,g_dens_prev, g_u, g_v, g_w); } get_timestamp(&time2); advectionDensityTimeCPU = timestamp_diff_in_seconds(time1,time2)/ntrips; printf("%s\t%dx%dx%d\t%s\t%s\t %3.6f\ts\t", device_name,NX,NY,NZ,"CPU","Advection Velocity",advectionVelocityTimeCPU); printf("%.3f\tMegaCells/s\n",(NX*NY*NZ)*1e-6/advectionVelocityTimeCPU); printf("%s\t%dx%dx%d\t%s\t%s\t %3.6f\ts\t", device_name,NX,NY,NZ,"CPU","Advection Density",advectionDensityTimeCPU); printf("%.3f\tMegaCells/s\n",(NX*NY*NZ)*1e-6/advectionDensityTimeCPU); printf("%s\t%dx%dx%d\t%s\t%s\t %3.6f\ts\t", device_name,NX,NY,NZ,"CPU","Divergence",divergenceTimeCPU); printf("%.3f\tMegaCells/s\n",(NX*NY*NZ)*1e-6/divergenceTimeCPU); printf("%s\t%dx%dx%d\t%s\t%s\t %3.6f\ts\t", device_name,NX,NY,NZ,"CPU","Projection Jacobi",projectJacobiTimeCPU); printf("%.3f\tMegaCells/s\n",(NX*NY*NZ)*1e-6/projectJacobiTimeCPU); printf("%s\t%dx%dx%d\t%s\t%s\t %3.6f\ts\t", device_name,NX,NY,NZ,"CPU","Projection Conjugate Gradient",projectCGTimeCPU); printf("%.3f\tMegaCells/s\n",(NX*NY*NZ)*1e-6/projectCGTimeCPU); printf("%s\t%dx%dx%d\t%s\t%s\t %3.6f\ts\t", device_name,NX,NY,NZ,"CPU","Pressure Apply",pressureApplyTimeCPU); printf("%.3f\tMegaCells/s\n",(NX*NY*NZ)*1e-6/pressureApplyTimeCPU); }
int main ( int argc, char ** argv ) { // Parse command line options // int use_gpu = 1; int use_interop = 0; for(int i = 0; i < argc && argv; i++) { if(!argv[i]) continue; if(strstr(argv[i], "cpu")) use_gpu = 0; else if(strstr(argv[i], "gpu")) use_gpu = 1; else if(strstr(argv[i], "interop")) use_interop = 1; } printf("Parameter detect %s device (%s)\n",use_gpu==1?"GPU":"CPU",use_interop==1?"Share OpenGL":"Not Sharing OpenGL"); OPENCL_SHARE_WITH_OPENGL = use_interop; //testCG(); win_x = 512; win_y = 512; glutInit ( &argc, argv ); open_glut_window (); //test_opencl_opengl_interop(); dt = 0.1f; force = 10.0f; source = 10.0f; printf ( "\n\nHow to use this demo:\n\n" ); printf ( "\t Add densities with the left mouse button\n" ); printf ( "\t Add velocities with the left mouse button and dragging the mouse\n" ); printf ( "\t Toggle density/velocity display with the 'v' key\n" ); printf ( "\t Clear the simulation by pressing the 'x' key\n" ); printf ( "\t switch poisson solvers from jacobi to conjugate gradient by pressing the 'c' key\n" ); printf ( "\t switch advection scheme from RK2 to MacCormack by pressing the 'm' key\n" ); printf ( "\t toggle vorticity confinement by pressing the 'o' key\n" ); printf ( "\t Quit by pressing the 'q' key\n" ); dvel = 0; step = 0; maccormack = 0; vorticity = 0; useCG = 0; if ( !allocate_data () ) exit ( 1 ); clear_data (); //setupMatrix(g_laplacian_matrix); // FOR_EACH_FACE // { // //if(i < NX - NX*0.4 && i > NX*0.4 // // && // // j < NY - NY*0.4 && j > NY*0.4 ) // { // g_u_prev[IX(i,j,0)] = -0.01 * cosf(3.14159 * 2.0 * i/NX); // g_v_prev[IX(i,j,0)] = 0.01 * sinf(3.14159 * 2.0 * j/NY); // } // } #if RUN_TIMINGS runTimings(use_gpu); exit(0); #endif copy_grid(g_u_prev, g_u); copy_grid(g_v_prev, g_v); g_dens_prev[IX(16,3,0)] = 10.0f; //g_u_prev[IX(16,3,0)] = 10.0f; /* calculate_divergence(g_divergence, g_u_prev, g_v_prev, g_w_prev, dt); pressure_solve(g_pressure,g_pressure_prev, g_divergence, dt); pressure_apply(g_u_prev, g_v_prev, g_w_prev, g_pressure, dt); //project(dt,g_u_prev,g_v_prev, g_w_prev, g_divergence, g_pressure, g_pressure_prev); SWAP(g_u_prev,g_u); SWAP(g_v_prev,g_v); SWAP(g_w_prev,g_w); if(!check_divergence(g_u_prev, g_v_prev, g_w_prev)) { printf("Initial field wasn't divergence free!\n"); } */ //print_platforms_devices(); // run_opencl_test(use_gpu); // run_tests(); #if USE_OPENCL init_opencl(use_gpu); load_cl_kernels(&clData); allocate_cl_buffers(&clData); transfer_buffers_to_gpu(); flush_cl_queue(); #endif glutMainLoop (); #if USE_OPENCL cleanup_cl(&clData); #endif exit ( 0 ); }