int main(int argc, char* argv[]) { #pragma omp master { #ifdef _OPENMP int nthreads = omp_get_num_threads(); std::cout << "Using OpenMP - There are " << nthreads << " threads" << std::endl; #else std::cout << "Not using OpenMP" << '\n'; #endif } // ------------------------------------------------------------------------------------- // Create "tiy_log/" subdirectory (win) or "/home/<username>/tiy_log/" (linux) // ------------------------------------------------------------------------------------- std::string log_file_directory = "tiy_log/"; #ifdef WIN32 #else log_file_directory = std::string(getpwuid(getuid())->pw_dir) + "/" + log_file_directory; #endif boost::filesystem::path dir_path(log_file_directory); if (!boost::filesystem::is_directory(dir_path) && !boost::filesystem::create_directory(dir_path)) { std::cerr << "Could not create log subdirectory." << std::endl; std::cerr << "PRESS A KEY TO EXIT"; cv::destroyAllWindows(); cv::waitKey(1); std::cin.get(); return 0; } // ------------------------------------------------------------------------------------- // Input ARG // ------------------------------------------------------------------------------------- char *arg_camera_config_file = (char *)"config_camera.xml"; char *arg_object_config_file = (char *)"config_object.xml"; char *arg_run_parameter_config_file = (char *)"config_run_parameters.xml"; if (argc == 1) { std::cerr << "USING DEFAULT CONFIG FILES: config_camera.xml config_object.xml config_run_parameters.xml" << std::endl; } else if (argc!=1 && argc != 4) { std::cerr << "Usage: server <camera_config_file> <object_config_file> <run_parameters_config_file>" << std::endl; std::cerr << "default: server config_camera.xml config_object.xml config_run_parameters.xml" << std::endl; std::cerr << "PRESS A KEY TO EXIT"; cv::destroyAllWindows(); cv::waitKey(1); std::cin.get(); return 0; } else { arg_camera_config_file = argv[0]; arg_object_config_file = argv[1]; arg_run_parameter_config_file = argv[2]; } // 
------------------------------------------------------------------------------------- // Get Run Parameters from XML Config File // ------------------------------------------------------------------------------------- cv::FileStorage input_file_storage; if (!input_file_storage.open(arg_run_parameter_config_file, cv::FileStorage::READ)) { std::cerr << "could NOT open " << arg_run_parameter_config_file << std::endl; std::cerr << "PRESS A KEY TO EXIT"; cv::destroyAllWindows(); cv::waitKey(1); std::cin.get(); return 0; } int do_use_kalman_filter=-1, do_interactive_mode=-1, multicast_port=-1, do_show_graphics=-1, do_output_debug=-1, do_output_2D=-1, do_output_3D=-1, do_output_object=-1, do_output_virt_point=-1, do_log_2D=-1, do_log_3D=-1, do_log_object=-1, do_log_virt_point=-1, do_log_video=-1, do_log_frame=-1, do_send_object_pose=-1, do_send_virt_point_pose=-1; do_use_kalman_filter = (int)input_file_storage["do_use_kalman_filter"]; do_interactive_mode = (int)input_file_storage["do_interactive_mode"]; multicast_port = (int)input_file_storage["multicast_port"]; do_show_graphics = (int)input_file_storage["do_show_graphics"]; do_output_debug = (int)input_file_storage["do_output_debug"]; do_output_2D = (int)input_file_storage["do_output_2D"]; do_output_3D = (int)input_file_storage["do_output_3D"]; do_output_object = (int)input_file_storage["do_output_object"]; do_output_virt_point = (int)input_file_storage["do_output_virt_point"]; do_log_2D = (int)input_file_storage["do_log_2D"]; do_log_3D = (int)input_file_storage["do_log_3D"]; do_log_object = (int)input_file_storage["do_log_object"]; do_log_virt_point = (int)input_file_storage["do_log_virt_point"]; do_log_video = (int)input_file_storage["do_log_video"]; do_log_frame = (int)input_file_storage["do_log_frame"]; do_send_object_pose = (int)input_file_storage["do_send_object_pose"]; do_send_virt_point_pose = (int)input_file_storage["do_send_virt_point_pose"]; std::string multicast_adress = 
(std::string)input_file_storage["multicast_adress"]; std::string input_device_src = (std::string)input_file_storage["input_device_src"]; // (m: Mouse, k: Keyboard) std::string mouse_device_id = (std::string)input_file_storage["mouse_device_id"]; std::string keyboard_device_id = (std::string)input_file_storage["keyboard_device_id"]; std::string input_src = (std::string)input_file_storage["input_src"]; // (b: Basler Camera, o: OpenCV Camera, v: Video files, t: 2D point files) std::string video_left = (std::string)input_file_storage["video_left"]; std::string video_right = (std::string)input_file_storage["video_right"]; std::string points_2D_left = (std::string)input_file_storage["points_2D_left"]; std::string points_2D_right = (std::string)input_file_storage["points_2D_right"]; std::string log_points_2D_left = log_file_directory + (std::string)input_file_storage["log_points_2D_left"]; std::string log_points_2D_right = log_file_directory + (std::string)input_file_storage["log_points_2D_right"]; std::string log_points_3D = log_file_directory + (std::string)input_file_storage["log_points_3D"]; std::string log_object_pose = log_file_directory + (std::string)input_file_storage["log_object_pose"]; std::string log_virt_point_pose = log_file_directory + (std::string)input_file_storage["log_virt_point_pose"]; std::string log_video_left = log_file_directory + (std::string)input_file_storage["log_video_left"]; std::string log_video_right = log_file_directory + (std::string)input_file_storage["log_video_right"]; std::string log_frame_left_prefix = log_file_directory + (std::string)input_file_storage["log_frame_left_prefix"]; std::string log_frame_right_prefix = log_file_directory + (std::string)input_file_storage["log_frame_right_prefix"]; input_file_storage.release(); if (do_use_kalman_filter==-1 || do_interactive_mode==-1 || multicast_port==-1 || do_show_graphics==-1 || do_output_debug==-1 || do_output_2D==-1 || do_output_3D==-1 || do_output_object==-1 || 
do_output_virt_point==-1 || do_log_2D==-1 || do_log_3D==-1 || do_log_object==-1 || do_log_virt_point==-1 || do_log_video==-1 || do_log_frame==-1 || do_send_object_pose==-1 || do_send_virt_point_pose==-1 || multicast_adress.empty() || input_device_src.empty() || mouse_device_id.empty() || keyboard_device_id.empty() || input_src.empty() || video_left.empty() || video_right.empty() || points_2D_left.empty() || points_2D_right.empty() || log_points_2D_left.empty() || log_points_2D_right.empty() || log_points_3D.empty() || log_object_pose.empty() || log_virt_point_pose.empty() || log_video_left.empty() || log_video_right.empty() || log_frame_left_prefix.empty() || log_frame_right_prefix.empty()) { std::cerr << "Read all run parameters from " << arg_run_parameter_config_file << " failed" << std::endl; std::cerr << "PRESS A KEY TO EXIT"; cv::destroyAllWindows(); cv::waitKey(1); std::cin.get(); return 0; } if (do_log_video && (input_src == "v")) { std::cerr << "Cannot read video files and record to files at the same time." 
<< std::endl; std::cerr << "PRESS A KEY TO EXIT"; cv::destroyAllWindows(); cv::waitKey(1); std::cin.get(); return 0; } bool do_debugging = (do_output_debug != 0); // ------------------------------------------------------------------------------------- // Initialize Motion Capturing (segmentation/marker extraction, marker template fitting) // ------------------------------------------------------------------------------------- tiy::MarkerTracking m_track(do_debugging); if (!m_track.readConfigFiles(arg_camera_config_file, arg_object_config_file)) { std::cerr << "PRESS A KEY TO EXIT"; cv::destroyAllWindows(); cv::waitKey(1); std::cin.get(); return 0; } // ------------------------------------------------------------------------------------- // Input device // ------------------------------------------------------------------------------------- boost::scoped_ptr<tiy::MouseDevice> mouse_device; boost::scoped_ptr<tiy::KeyboardDevice> keyboard_device; #ifdef WIN32 mouse_device.reset(new tiy::WindowsMouse(do_debugging)); keyboard_device.reset(new tiy::WindowsKeyboard(do_debugging)); #else mouse_device.reset(new tiy::LinuxMouse(do_debugging)); keyboard_device.reset(new tiy::LinuxKeyboard(do_debugging)); #endif int read_intervall_ms = 1; if ((input_device_src == "m") && (!mouse_device->openAndReadMouse(mouse_device_id, read_intervall_ms))) { std::cout << "MouseDevice::openAndReadMouse() failed" << std::endl; std::cerr << "PRESS A KEY TO EXIT"; cv::destroyAllWindows(); cv::waitKey(1); std::cin.get(); return 0; } if (!keyboard_device->openAndReadKeyboard(keyboard_device_id, read_intervall_ms)) { std::cout << "KeyboardDevice::openAndReadKeyboard() failed" << std::endl; std::cerr << "PRESS A KEY TO EXIT"; cv::destroyAllWindows(); cv::waitKey(1); std::cin.get(); return 0; } // ------------------------------------------------------------------------------------- // Stereo camera // ------------------------------------------------------------------------------------- 
boost::scoped_ptr<tiy::StereoCamera> stereo_camera; std::string camera_id_left = m_track.left_camera_id; std::string camera_id_right = m_track.right_camera_id; if (input_src == "b") { #ifdef USE_aravis stereo_camera.reset(new tiy::BaslerGigEStereoCamera(do_debugging, camera_id_left, camera_id_right, m_track.frame_width, m_track.frame_height, m_track.camera_exposure, m_track.camera_gain, m_track.frame_rate)); #else std::cerr << "BaslerGigEStereoCamera not available, as aravis NOT found/used." << std::endl; std::cerr << "PRESS A KEY TO EXIT"; cv::destroyAllWindows(); cv::waitKey(1); std::cin.get(); return 0; #endif } else if (input_src == "o") stereo_camera.reset(new tiy::OpenCVStereoCamera(do_debugging, camera_id_left, camera_id_right, m_track.frame_width, m_track.frame_height, m_track.camera_exposure, m_track.camera_gain, m_track.frame_rate)); else if (input_src == "v") stereo_camera.reset(new tiy::OpenCVStereoCamera(do_debugging, camera_id_left, camera_id_right, m_track.frame_width, m_track.frame_height, m_track.camera_exposure, m_track.camera_gain, m_track.frame_rate, video_left, video_right)); else { std::cerr << "No input source \"input_src\" specified in the configuration file \"" << arg_run_parameter_config_file << "\"" << std::endl; std::cerr << "PRESS A KEY TO EXIT"; cv::destroyAllWindows(); cv::waitKey(1); std::cin.get(); return 0; } if (stereo_camera->openCam()) stereo_camera->startCam(); else { std::cerr << "MarkerTracking::connectStereoCamera() failed" << std::endl; std::cerr << "PRESS A KEY TO EXIT"; cv::destroyAllWindows(); cv::waitKey(1); std::cin.get(); return 0; } cv::Mat image_left = stereo_camera->createImage(); cv::Mat image_right = stereo_camera->createImage(); long long int frame_timestamp; // ------------------------------------------------------------------------------------- // BOOST ASIO MULTICAST SERVER // ------------------------------------------------------------------------------------- boost::asio::io_service server_io_service; 
tiy::MulticastServer multicast_server(server_io_service, boost::asio::ip::address::from_string(multicast_adress), multicast_port, do_debugging); boost::system::error_code error_c; boost::thread server_io_service_thread(boost::bind(&boost::asio::io_service::run, &server_io_service, error_c)); // ------------------------------------------------------------------------------------- // Logging // ------------------------------------------------------------------------------------- std::ofstream log_2D_left, log_2D_right, log_3D, log_object, log_virt_point; if (do_log_2D) { log_2D_left.open(log_points_2D_left.c_str()); log_2D_right.open(log_points_2D_right.c_str()); } if (do_log_3D) log_3D.open(log_points_3D.c_str()); if (do_log_object) log_object.open(log_object_pose.c_str()); if (do_log_virt_point) log_virt_point.open(log_virt_point_pose.c_str()); if (do_log_video) stereo_camera->startRecording(log_video_left, log_video_right); // ------------------------------------------------------------------------------------- // MAIN LOOP // ------------------------------------------------------------------------------------- int capture_counter = 1; bool is_base_temp = false; int test_points_counter = 0; // time measurement boost::posix_time::ptime start_time, end_time; start_time = boost::posix_time::microsec_clock::universal_time(); for(int i = 0; true; i++) { // ------------------------------------------------------------------------------------- // Grab stereo frame // ------------------------------------------------------------------------------------- if(!stereo_camera->grabFrame(image_left, image_right, frame_timestamp)) { if (input_src == "v") { std::cout << "Video file finished." 
<< std::endl; std::cerr << "PRESS A KEY TO EXIT"; cv::destroyAllWindows(); cv::waitKey(1); std::cin.get(); return 0; } std::cerr << "Grabbing failed" << std::endl; std::cerr << "PRESS A KEY TO EXIT"; cv::destroyAllWindows(); cv::waitKey(1); std::cin.get(); return 0; } if (do_log_video) stereo_camera->recordFrame(); // ------------------------------------------------------------------------------------- // Extract (or read from file) 2D points // ------------------------------------------------------------------------------------- cv::vector<cv::Point2f> points_2D_left, points_2D_right; #pragma omp parallel sections { #pragma omp section { if (input_src == "t") m_track.get2DPointsFromFile("testpoints_left", &points_2D_left, test_points_counter); else m_track.get2DPointsFromImage(image_left, &points_2D_left); } #pragma omp section { if (input_src == "t") m_track.get2DPointsFromFile("testpoints_right", &points_2D_right, test_points_counter); else m_track.get2DPointsFromImage(image_right, &points_2D_right); } } test_points_counter++; // ------------------------------------------------------------------------------------- // Compute 3D points from 2D points // ------------------------------------------------------------------------------------- cv::Mat points_3D = m_track.get3DPointsFrom2DPoints(points_2D_left, points_2D_right); // ------------------------------------------------------------------------------------- // Search for marker objects (templates) // ------------------------------------------------------------------------------------- std::vector<cv::Mat>RT_template_leftcam; std::vector<float>avg_dev; for(int t = 0; t < m_track.num_templates;t++) { RT_template_leftcam.push_back(cv::Mat::zeros(4,4,CV_32F)); avg_dev.push_back(0); } #pragma omp parallel for for(int r = 0; r < m_track.num_templates; r++) m_track.fit3DPointsToObjectTemplate(points_3D, r, RT_template_leftcam[r], &avg_dev[r]); // 
------------------------------------------------------------------------------------- // Update mouse and keyboard status // ------------------------------------------------------------------------------------- bool was_SPACE_pressed=false, was_ESC_pressed=false; keyboard_device->getStatusSinceLastReset(was_SPACE_pressed, was_ESC_pressed); if (was_ESC_pressed) { std::cerr << "PRESS A KEY TO EXIT"; cv::destroyAllWindows(); cv::waitKey(1); std::cin.get(); return 0; } keyboard_device->resetStatus(); bool was_left_button_pressed=false, was_left_button_released=false, is_left_button_pressed=false, was_right_button_pressed=false, was_right_button_released=false, is_right_button_pressed=false, has_mouse_wheel_changed=false; static int mouse_wheel_position=0; if (input_device_src == "m") { mouse_device->getStatusSinceLastReset(was_left_button_pressed, was_left_button_released, is_left_button_pressed, was_right_button_pressed, was_right_button_released, is_right_button_pressed, has_mouse_wheel_changed, mouse_wheel_position); mouse_device->resetStatus(); } // ------------------------------------------------------------------------------------- // OUTPUT (Send/Display/Log) the selected data // ------------------------------------------------------------------------------------- if (!do_interactive_mode || ((input_device_src == "m") && was_left_button_pressed) || ((input_device_src == "k") && was_SPACE_pressed)) { // ------------------------------------------------------------------------------------- // Send (publish the object/virtual point pose over multicast) // ------------------------------------------------------------------------------------- if(do_send_object_pose) { std::string send_string; for(int r = 0; r < m_track.num_templates; r++) { cv::Mat rodrigues_orientation = cv::Mat::zeros(3, 1, CV_32F); if (countNonZero(RT_template_leftcam[r])) Rodrigues(RT_template_leftcam[r](cv::Range(0,3),cv::Range(0,3)), rodrigues_orientation); int last_col = 
RT_template_leftcam[r].size.p[0] - 1; std::stringstream frame_timestamp_ss; // as boost::format not compatible with long long int frame_timestamp_ss << frame_timestamp; std::string send_buffer = (boost::format("%s\t%d\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t") % frame_timestamp_ss.str() % r % RT_template_leftcam[r].at<float>(0,last_col) % RT_template_leftcam[r].at<float>(1,last_col) % RT_template_leftcam[r].at<float>(2,last_col) % rodrigues_orientation.at<float>(0,0) % rodrigues_orientation.at<float>(1,0) % rodrigues_orientation.at<float>(2,0) ).str(); send_string += send_buffer; } multicast_server.sendString(send_string); if(do_debugging) std::cout << "-------------" << std::endl << "SENDING :" << send_string << std::endl << "----------------" << std::endl; } if(do_send_virt_point_pose) { std::string send_string; for(int r = 0; r < m_track.num_templates; r++) { cv::Mat RT_virt_point_to_leftcam = cv::Mat::zeros(4, 4, CV_32F); cv::Mat rodrigues_orientation = cv::Mat::zeros(3, 1, CV_32F); if (countNonZero(RT_template_leftcam[r]) && countNonZero(m_track.RT_virt_point_to_template[r] - cv::Mat::eye(4, 4, CV_32F))) { RT_virt_point_to_leftcam = RT_template_leftcam[r] * m_track.RT_virt_point_to_template[r]; Rodrigues(RT_virt_point_to_leftcam(cv::Range(0,3),cv::Range(0,3)), rodrigues_orientation); } int last_col = RT_virt_point_to_leftcam.size.p[0] - 1; std::stringstream frame_timestamp_ss; // as boost::format not compatible with long long int frame_timestamp_ss << frame_timestamp; std::string send_buffer = (boost::format("%s\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t") % frame_timestamp_ss.str() % RT_virt_point_to_leftcam.at<float>(0,last_col) % RT_virt_point_to_leftcam.at<float>(1,last_col) % RT_virt_point_to_leftcam.at<float>(2,last_col) % rodrigues_orientation.at<float>(0,0) % rodrigues_orientation.at<float>(1,0) % rodrigues_orientation.at<float>(2,0) ).str(); send_string += send_buffer; } multicast_server.sendString(send_string); if(do_debugging) std::cout << "-------------" 
<< std::endl << "SENDING :" << send_string << std::endl << "----------------" << std::endl; } // ------------------------------------------------------------------------------------- // Display // ------------------------------------------------------------------------------------- if (do_debugging) { if (was_left_button_pressed) std::cout << "LEFT" << std::endl; if (was_left_button_released) std::cout << "LEFT RELEASED" << std::endl; if (was_right_button_pressed) std::cout << "RIGHT" << std::endl; if (was_right_button_released) std::cout << "RIGHT RELEASED" << std::endl; if (has_mouse_wheel_changed) std::cout << "WHEEL: " << mouse_wheel_position << std::endl; if (is_left_button_pressed) std::cout << "LEFT STILL" << std::endl; if (is_right_button_pressed) std::cout << "RIGHT STILL" << std::endl; if (was_SPACE_pressed) std::cout << "SPACE" << std::endl; if (was_ESC_pressed) std::cout << "ESC" << std::endl; } if (do_output_2D) { std::cout << frame_timestamp; for(unsigned int p = 0; p < points_2D_left.size(); p++) std::cout << "\t" << points_2D_left[p].x << "\t" << points_2D_left[p].y; std::cout << std::endl; std::cout << frame_timestamp; for(unsigned int p = 0; p < points_2D_right.size(); p++) std::cout << "\t" << points_2D_right[p].x << "\t" << points_2D_right[p].y; std::cout << std::endl; } if (do_output_3D) { std::cout << frame_timestamp; for(int p = 0; p < points_3D.cols; p++) std::cout << "\t" << points_3D.at<float>(0,p) << "\t" << points_3D.at<float>(1,p) << "\t" << points_3D.at<float>(2,p); std::cout << std::endl; } if (do_output_object) { std::cout << frame_timestamp; for(int r = 0; r < m_track.num_templates; r++) { cv::Mat rodrigues_orientation = cv::Mat::zeros(3, 1, CV_32F); if (countNonZero(RT_template_leftcam[r])) Rodrigues(RT_template_leftcam[r](cv::Range(0,3),cv::Range(0,3)), rodrigues_orientation); int last_col = RT_template_leftcam[r].size.p[0] - 1; std::cout << "\t" << RT_template_leftcam[r].at<float>(0,last_col) << "\t" << 
RT_template_leftcam[r].at<float>(1,last_col) << "\t" << RT_template_leftcam[r].at<float>(2,last_col) << "\t" << rodrigues_orientation.at<float>(0,0) << "\t" << rodrigues_orientation.at<float>(1,0) << "\t" << rodrigues_orientation.at<float>(2,0); //std::cout << std::endl << "avg_dev = " << avg_dev[r]; } std::cout << std::endl; } if (do_output_virt_point) { std::cout << frame_timestamp; for(int r = 0; r < m_track.num_templates; r++) { cv::Mat RT_virt_point_to_leftcam = cv::Mat::zeros(4, 4, CV_32F); cv::Mat rodrigues_orientation = cv::Mat::zeros(3, 1, CV_32F); if (countNonZero(RT_template_leftcam[r]) && countNonZero(m_track.RT_virt_point_to_template[r] - cv::Mat::eye(4, 4, CV_32F))) { RT_virt_point_to_leftcam = RT_template_leftcam[r] * m_track.RT_virt_point_to_template[r]; Rodrigues(RT_virt_point_to_leftcam(cv::Range(0,3),cv::Range(0,3)), rodrigues_orientation); } int last_col = RT_virt_point_to_leftcam.size.p[0] - 1; std::cout << "\t" << RT_virt_point_to_leftcam.at<float>(0,last_col) << "\t" << RT_virt_point_to_leftcam.at<float>(1,last_col) << "\t" << RT_virt_point_to_leftcam.at<float>(2,last_col) << "\t" << rodrigues_orientation.at<float>(0,0) << "\t" << rodrigues_orientation.at<float>(1,0) << "\t" << rodrigues_orientation.at<float>(2,0); } std::cout << std::endl; } // ------------------------------------------------------------------------------------- // Log // ------------------------------------------------------------------------------------- if (do_log_2D) { log_2D_left << frame_timestamp; for(unsigned int p = 0; p < points_2D_left.size(); p++) log_2D_left << "\t" << points_2D_left[p].x << "\t" << points_2D_left[p].y; log_2D_left << std::endl; log_2D_right << frame_timestamp; for(unsigned int p = 0; p < points_2D_right.size(); p++) log_2D_right << "\t" << points_2D_right[p].x << "\t" << points_2D_right[p].y; log_2D_right << std::endl; } if (do_log_3D) { log_3D << frame_timestamp; for(int p = 0; p < points_3D.cols; p++) log_3D << "\t" << 
points_3D.at<float>(0,p) << "\t" << points_3D.at<float>(1,p) << "\t" << points_3D.at<float>(2,p); log_3D << std::endl; } if (do_log_object) { log_object << frame_timestamp; for(int r = 0; r < m_track.num_templates; r++) { cv::Mat rodrigues_orientation = cv::Mat::zeros(3, 1, CV_32F); if (countNonZero(RT_template_leftcam[r])) Rodrigues(RT_template_leftcam[r](cv::Range(0,3),cv::Range(0,3)), rodrigues_orientation); int last_col = RT_template_leftcam[r].size.p[0] - 1; log_object << "\t" << RT_template_leftcam[r].at<float>(0,last_col) << "\t" << RT_template_leftcam[r].at<float>(1,last_col) << "\t" << RT_template_leftcam[r].at<float>(2,last_col) << "\t" << rodrigues_orientation.at<float>(0,0) << "\t" << rodrigues_orientation.at<float>(1,0) << "\t" << rodrigues_orientation.at<float>(2,0); //log_object << std::endl << "avg_dev = " << avg_dev[r]; } log_object << std::endl; } if (do_log_virt_point) { log_virt_point << frame_timestamp; for(int r = 0; r < m_track.num_templates; r++) { cv::Mat RT_virt_point_to_leftcam = cv::Mat::zeros(4, 4, CV_32F); cv::Mat rodrigues_orientation = cv::Mat::zeros(3, 1, CV_32F); if (countNonZero(RT_template_leftcam[r]) && countNonZero(m_track.RT_virt_point_to_template[r] - cv::Mat::eye(4, 4, CV_32F))) { RT_virt_point_to_leftcam = RT_template_leftcam[r] * m_track.RT_virt_point_to_template[r]; Rodrigues(RT_virt_point_to_leftcam(cv::Range(0,3),cv::Range(0,3)), rodrigues_orientation); } int last_col = RT_virt_point_to_leftcam.size.p[0] - 1; log_virt_point << "\t" << RT_virt_point_to_leftcam.at<float>(0,last_col) << "\t" << RT_virt_point_to_leftcam.at<float>(1,last_col) << "\t" << RT_virt_point_to_leftcam.at<float>(2,last_col) << "\t" << rodrigues_orientation.at<float>(0,0) << "\t" << rodrigues_orientation.at<float>(1,0) << "\t" << rodrigues_orientation.at<float>(2,0); } log_virt_point << std::endl; } if (do_log_video) stereo_camera->recordFrame(); } // ------------------------------------------------------------------------------------- // Capture 
stereo frame // ------------------------------------------------------------------------------------- if (do_log_frame && (((input_device_src == "m") && was_left_button_pressed) || ((input_device_src == "k") && was_SPACE_pressed))) { std::string save_file; save_file = (boost::format("%s%03i.jpg") % log_frame_left_prefix % capture_counter).str(); cv::imwrite(save_file, image_left); save_file = (boost::format("%s%03i.jpg") % log_frame_right_prefix % capture_counter).str(); cv::imwrite(save_file, image_right); if (do_debugging) std::cout << frame_timestamp << "Frame captured." << std::endl; capture_counter++; } // ------------------------------------------------------------------------------------- // Visualize stereo frame with detected points // ------------------------------------------------------------------------------------- if(do_show_graphics && !(input_src == "t")) { // needed, as changing image content (costs 0.5-1.5 [ms]) cv::Mat image_left_cpy, image_right_cpy; image_left.copyTo(image_left_cpy); image_right.copyTo(image_right_cpy); for(unsigned int p=0; p < points_2D_left.size(); p++) cv::circle(image_left_cpy, points_2D_left[p], 2, cv::Scalar(0), 1, CV_AA, 0); for(unsigned int p=0; p < points_2D_right.size(); p++) cv::circle(image_right_cpy, points_2D_right[p], 2, cv::Scalar(0), 1, CV_AA, 0); cv::Mat object_rotation(3, 1, CV_32F); cv::Mat object_translation(3, 1, CV_32F); cv::vector<cv::Point2f> object_2D; for(int r = 0; r < m_track.num_templates; r++) { if (avg_dev[r] < std::numeric_limits<float>::infinity()) { Rodrigues(RT_template_leftcam[r](cv::Range(0,3),cv::Range(0,3)), object_rotation); object_translation = RT_template_leftcam[r](cv::Range(0,3),cv::Range(3,4)).t(); cv::vector<cv::Point3f> object_points; object_points.push_back(cv::Point3f(RT_template_leftcam[r].at<float>(0,3), RT_template_leftcam[r].at<float>(1,3), RT_template_leftcam[r].at<float>(2,3))); projectPoints(cv::Mat(object_points), cv::Mat::zeros(3,1,CV_32F), cv::Mat::zeros(3,1,CV_32F), 
m_track.KK_left, m_track.kc_left, object_2D); cv::circle(image_left_cpy, object_2D[0], 4, cv::Scalar(255,255,255), 1, CV_AA, 0); cv::circle(image_left_cpy, object_2D[0], 3, cv::Scalar(0,0,150), 1, CV_AA, 0); projectPoints(cv::Mat(object_points), m_track.om_leftcam_to_rightcam, m_track.T_leftcam_to_rightcam, m_track.KK_right, m_track.kc_right, object_2D); cv::circle(image_right_cpy, object_2D[0], 4, cv::Scalar(255,255,255), 1, CV_AA, 0); cv::circle(image_right_cpy, object_2D[0], 3, cv::Scalar(0,0,150), 1, CV_AA, 0); } } imshow("Image Left", image_left_cpy); imshow("Image Right", image_right_cpy); cv::waitKey(1); } // ------------------------------------------------------------------------------------- // END MEASURE of the computation time (of one cycle) // ------------------------------------------------------------------------------------- if (do_debugging) { end_time = boost::posix_time::microsec_clock::universal_time(); boost::posix_time::time_duration time_diff = end_time - start_time; std::cout << "comp_time = " << time_diff.total_microseconds() << " [us]" << std::endl; start_time = boost::posix_time::microsec_clock::universal_time(); } } //end MAIN LOOP if (log_2D_left.is_open()) log_2D_left.close(); if (log_2D_right.is_open()) log_2D_right.close(); if (log_3D.is_open()) log_3D.close(); if (log_object.is_open()) log_object.close(); stereo_camera->closeCam(); std::cerr << "PRESS A KEY TO EXIT"; cv::destroyAllWindows(); cv::waitKey(1); std::cin.get(); return 0; }
int main(int argc, char** argv) { sim_param_t params; if (get_params(argc, argv, ¶ms) != 0) exit(-1); // Create global sim_state_t* globalState = init_particles(¶ms); #pragma omp parallel shared(globalState, params) { int proc = omp_get_thread_num(); int nproc = omp_get_num_threads(); FILE* fp = fopen(params.fname, "w"); int nframes = params.nframes; int npframe = params.npframe; float dt = params.dt; int n = globalState->n; // Processor information and holder proc_info* pInfo = malloc(sizeof(proc_info)); pInfo->proc = proc; pInfo->nproc = nproc; pInfo->beg = round((proc/(double)nproc)*n); pInfo->end = round(((proc+1)/(double)nproc)*n); pInfo->forceAccu = calloc(3*n, sizeof(float)); // Never used this... if (proc == 0) { printf("Running in parallel with %d processor\n", nproc); } normalize_mass(globalState, pInfo, ¶ms); double t_start = omp_get_wtime(); if (proc == 0) { // We only write for one processor write_header(fp, n, nframes, params.h); write_frame_data(fp, n, globalState, NULL); } if (proc == 0) { hash_particles(globalState, params.h); } //hash_particles_parallel(globalState, pInfo, params.h); #pragma omp barrier // Need the hashing to be done compute_accel(globalState, pInfo, ¶ms); #pragma omp barrier leapfrog_start(globalState, pInfo, dt); check_state(globalState, pInfo); for (int frame = 1; frame < nframes; ++frame) { // We sort according to Z-Morton to ensure locality, need to implement paralle qsort if (frame % 5 == 0) { // Dividing into chunks of sorting each chunk // This alone turned out to better than sorting the entire array qsort(globalState->part+pInfo->beg, pInfo->end-pInfo->beg ,sizeof(particle_t),compPart); // Sorting the array consisting of sorted chunks // This turned out to actually lower the performance. That's why // I commented it. 
// #pragma omp barrier // if( pInfo->nproc >1 ) arraymerge(globalState->part, globalState->n, pInfo); //#pragma omp barrier*/ // Serial version /*#pragma omp single // Implied barrier qsort(globalState->part, n, sizeof(particle_t), compPart);*/ } /*else if (frame % 49) {*/ /*if (proc == 0) {*/ /*}*/ /*}*/ #pragma omp barrier // Need sort to finish for (int i = 0; i < npframe; ++i) { if (proc == 0 && npframe % 4 == 0) { // Ammortize hashing cost hash_particles(globalState, params.h); } #pragma omp barrier compute_accel(globalState, pInfo, ¶ms); leapfrog_step(globalState, pInfo, dt); check_state(globalState, pInfo); #pragma omp barrier } if (proc == 0) { printf("Frame: %d of %d - %2.1f%%\n",frame, nframes, 100*(float)frame/nframes); write_frame_data(fp, n, globalState, NULL); } } double t_end = omp_get_wtime(); if (proc == 0) { printf("Ran in %g seconds\n", t_end-t_start); } free(pInfo); fclose(fp); } free_state(globalState); }
int main(int argc, char *argv[]) { struct pngquant_options options = { .floyd = 1.f, // floyd-steinberg dithering }; options.liq = liq_attr_create(); if (!options.liq) { fputs("SSE-capable CPU is required for this build.\n", stderr); return WRONG_ARCHITECTURE; } unsigned int error_count=0, skipped_count=0, file_count=0; pngquant_error latest_error=SUCCESS; const char *newext = NULL, *output_file_path = NULL; fix_obsolete_options(argc, argv); int opt; do { opt = getopt_long(argc, argv, "Vvqfhs:Q:o:", long_options, NULL); switch (opt) { case 'v': options.verbose = true; break; case 'q': options.verbose = false; break; case arg_floyd: options.floyd = optarg ? atof(optarg) : 1.0; if (options.floyd < 0 || options.floyd > 1.f) { fputs("--floyd argument must be in 0..1 range\n", stderr); return INVALID_ARGUMENT; } break; case arg_ordered: options.floyd = 0; break; case 'f': options.force = true; break; case arg_no_force: options.force = false; break; case arg_ext: newext = optarg; break; case 'o': if (output_file_path) { fputs("--output option can be used only once\n", stderr); return INVALID_ARGUMENT; } output_file_path = optarg; break; case arg_iebug: // opacities above 238 will be rounded up to 255, because IE6 truncates <255 to 0. 
liq_set_min_opacity(options.liq, 238); options.ie_mode = true; break; case arg_transbug: liq_set_last_index_transparent(options.liq, true); break; case arg_skip_larger: options.skip_if_larger = true; break; case 's': { int speed = atoi(optarg); if (speed >= 10) { options.fast_compression = true; } if (speed == 11) { options.floyd = 0; speed = 10; } if (LIQ_OK != liq_set_speed(options.liq, speed)) { fputs("Speed should be between 1 (slow) and 11 (fast).\n", stderr); return INVALID_ARGUMENT; } } break; case 'Q': if (!parse_quality(optarg, options.liq, &options.min_quality_limit)) { fputs("Quality should be in format min-max where min and max are numbers in range 0-100.\n", stderr); return INVALID_ARGUMENT; } break; case arg_posterize: if (LIQ_OK != liq_set_min_posterization(options.liq, atoi(optarg))) { fputs("Posterization should be number of bits in range 0-4.\n", stderr); return INVALID_ARGUMENT; } break; case arg_map: { png24_image tmp = {}; if (SUCCESS != read_image(options.liq, optarg, false, &tmp, &options.fixed_palette_image, false, false)) { fprintf(stderr, " error: Unable to load %s", optarg); return INVALID_ARGUMENT; } } break; case 'h': print_full_version(stdout); print_usage(stdout); return SUCCESS; case 'V': puts(PNGQUANT_VERSION); return SUCCESS; case -1: break; default: return INVALID_ARGUMENT; } } while (opt != -1); int argn = optind; if (argn >= argc) { if (argn > 1) { fputs("No input files specified. 
See -h for help.\n", stderr); } else { print_full_version(stderr); print_usage(stderr); } return MISSING_ARGUMENT; } if (options.verbose) { liq_set_log_callback(options.liq, log_callback, NULL); options.log_callback = log_callback; } char *colors_end; unsigned long colors = strtoul(argv[argn], &colors_end, 10); if (colors_end != argv[argn] && '\0' == colors_end[0]) { if (LIQ_OK != liq_set_max_colors(options.liq, colors)) { fputs("Number of colors must be between 2 and 256.\n", stderr); return INVALID_ARGUMENT; } argn++; } if (newext && output_file_path) { fputs("--ext and --output options can't be used at the same time\n", stderr); return INVALID_ARGUMENT; } // new filename extension depends on options used. Typically basename-fs8.png if (newext == NULL) { newext = options.floyd > 0 ? "-ie-fs8.png" : "-ie-or8.png"; if (!options.ie_mode) { newext += 3; /* skip "-ie" */ } } if (argn == argc || (argn == argc-1 && 0==strcmp(argv[argn],"-"))) { options.using_stdin = true; argn = argc-1; } if (options.using_stdin && output_file_path) { fputs("--output can't be mixed with stdin\n", stderr); return INVALID_ARGUMENT; } const int num_files = argc-argn; if (output_file_path && num_files != 1) { fputs("Only one input file is allowed when --output is used\n", stderr); return INVALID_ARGUMENT; } #ifdef _OPENMP // if there's a lot of files, coarse parallelism can be used if (num_files > 2*omp_get_max_threads()) { omp_set_nested(0); omp_set_dynamic(1); } else { omp_set_nested(1); } #endif #pragma omp parallel for \ schedule(static, 1) reduction(+:skipped_count) reduction(+:error_count) reduction(+:file_count) shared(latest_error) for(int i=0; i < num_files; i++) { struct pngquant_options opts = options; opts.liq = liq_attr_copy(options.liq); const char *filename = opts.using_stdin ? 
"stdin" : argv[argn+i]; #ifdef _OPENMP struct buffered_log buf = {}; if (opts.log_callback && omp_get_num_threads() > 1 && num_files > 1) { liq_set_log_callback(opts.liq, log_callback_buferred, &buf); liq_set_log_flush_callback(opts.liq, log_callback_buferred_flush, &buf); options.log_callback = log_callback_buferred; options.log_callback_user_info = &buf; } #endif pngquant_error retval = SUCCESS; const char *outname = output_file_path; char *outname_free = NULL; if (!options.using_stdin) { if (!outname) { outname = outname_free = add_filename_extension(filename, newext); } if (!options.force && file_exists(outname)) { fprintf(stderr, " error: %s exists; not overwriting\n", outname); retval = NOT_OVERWRITING_ERROR; } } if (!retval) { retval = pngquant_file(filename, outname, &opts); } free(outname_free); liq_attr_destroy(opts.liq); if (retval) { #pragma omp critical { latest_error = retval; } if (retval == TOO_LOW_QUALITY || retval == TOO_LARGE_FILE) { skipped_count++; } else { error_count++; } } ++file_count; } if (error_count) { verbose_printf(&options, "There were errors quantizing %d file%s out of a total of %d file%s.", error_count, (error_count == 1)? "" : "s", file_count, (file_count == 1)? "" : "s"); } if (skipped_count) { verbose_printf(&options, "Skipped %d file%s out of a total of %d file%s.", skipped_count, (skipped_count == 1)? "" : "s", file_count, (file_count == 1)? "" : "s"); } if (!skipped_count && !error_count) { verbose_printf(&options, "No errors detected while quantizing %d image%s.", file_count, (file_count == 1)? 
"" : "s"); } liq_image_destroy(options.fixed_palette_image); liq_attr_destroy(options.liq); return latest_error; } pngquant_error pngquant_file(const char *filename, const char *outname, struct pngquant_options *options) { pngquant_error retval = SUCCESS; verbose_printf(options, "%s:", filename); liq_image *input_image = NULL; png24_image input_image_rwpng = {}; bool keep_input_pixels = options->skip_if_larger || (options->using_stdin && options->min_quality_limit); // original may need to be output to stdout if (!retval) { retval = read_image(options->liq, filename, options->using_stdin, &input_image_rwpng, &input_image, keep_input_pixels, options->verbose); } int quality_percent = 90; // quality on 0-100 scale, updated upon successful remap png8_image output_image = {}; if (!retval) { verbose_printf(options, " read %luKB file", (input_image_rwpng.file_size+1023UL)/1024UL); #if USE_LCMS if (input_image_rwpng.lcms_status == ICCP) { verbose_printf(options, " used embedded ICC profile to transform image to sRGB colorspace"); } else if (input_image_rwpng.lcms_status == GAMA_CHRM) { verbose_printf(options, " used gAMA and cHRM chunks to transform image to sRGB colorspace"); } else if (input_image_rwpng.lcms_status == ICCP_WARN_GRAY) { verbose_printf(options, " warning: ignored ICC profile in GRAY colorspace"); } #endif if (input_image_rwpng.gamma != 0.45455) { verbose_printf(options, " corrected image from gamma %2.1f to sRGB gamma", 1.0/input_image_rwpng.gamma); } // when using image as source of a fixed palette the palette is extracted using regular quantization liq_result *remap = liq_quantize_image(options->liq, options->fixed_palette_image ? options->fixed_palette_image : input_image); if (remap) { liq_set_output_gamma(remap, 0.45455); // fixed gamma ~2.2 for the web. 
PNG can't store exact 1/2.2 liq_set_dithering_level(remap, options->floyd); retval = prepare_output_image(remap, input_image, &output_image); if (!retval) { if (LIQ_OK != liq_write_remapped_image_rows(remap, input_image, output_image.row_pointers)) { retval = OUT_OF_MEMORY_ERROR; } set_palette(remap, &output_image); double palette_error = liq_get_quantization_error(remap); if (palette_error >= 0) { quality_percent = liq_get_quantization_quality(remap); verbose_printf(options, " mapped image to new colors...MSE=%.3f (Q=%d)", palette_error, quality_percent); } } liq_result_destroy(remap); } else { retval = TOO_LOW_QUALITY; } } if (!retval) { if (options->skip_if_larger) { // this is very rough approximation, but generally avoid losing more quality than is gained in file size. // Quality is squared, because even greater savings are needed to justify big quality loss. double quality = quality_percent/100.0; output_image.maximum_file_size = input_image_rwpng.file_size * quality*quality; } output_image.fast_compression = options->fast_compression; output_image.chunks = input_image_rwpng.chunks; input_image_rwpng.chunks = NULL; retval = write_image(&output_image, NULL, outname, options); if (TOO_LARGE_FILE == retval) { verbose_printf(options, " file exceeded expected size of %luKB", (unsigned long)output_image.maximum_file_size/1024UL); } } if (TOO_LARGE_FILE == retval || (TOO_LOW_QUALITY == retval && options->using_stdin)) { // when outputting to stdout it'd be nasty to create 0-byte file // so if quality is too low, output 24-bit original if (keep_input_pixels) { pngquant_error write_retval = write_image(NULL, &input_image_rwpng, outname, options); if (write_retval) { retval = write_retval; } } } liq_image_destroy(input_image); rwpng_free_image24(&input_image_rwpng); rwpng_free_image8(&output_image); return retval; }
/*
 * MPI version of the STREAM memory-bandwidth benchmark.
 * Each rank allocates its share of the three arrays, all ranks run the four
 * kernels (Copy/Scale/Add/Triad) NTIMES times between MPI barriers, and rank 0
 * gathers the per-rank timings/errors, reduces them, and prints the report.
 */
int main()
{
    int quantum, checktick();   /* checktick() declared here: estimates timer granularity (usec) */
    int BytesPerWord;
    int i,k;
    ssize_t j;
    STREAM_TYPE scalar;
    double t, times[4][NTIMES];        /* [kernel][iteration] timings for THIS rank */
    double *TimesByRank;               /* rank 0 only: gathered copy of every rank's times[][] */
    double t0,t1,tmin;
    int rc, numranks, myrank;
    STREAM_TYPE AvgError[3] = {0.0,0.0,0.0};  /* per-array average validation error on this rank */
    STREAM_TYPE *AvgErrByRank;                /* rank 0 only: gathered errors from all ranks */

    /* --- SETUP --- call MPI_Init() before anything else! --- */
    rc = MPI_Init(NULL, NULL);
    t0 = MPI_Wtime();
    if (rc != MPI_SUCCESS) {
        printf("ERROR: MPI Initialization failed with return code %d\n",rc);
        exit(1);
    }

    // if either of these fail there is something really screwed up!
    MPI_Comm_size(MPI_COMM_WORLD, &numranks);
    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);

    /* --- NEW FEATURE --- distribute requested storage across MPI ranks --- */
    array_elements = STREAM_ARRAY_SIZE / numranks;  // don't worry about rounding vs truncation
    array_alignment = 64;  // Can be modified -- provides partial support for adjusting relative alignment

    // Dynamically allocate the three arrays using "posix_memalign()"
    // NOTE that the OFFSET parameter is not used in this version of the code!
    array_bytes = array_elements * sizeof(STREAM_TYPE);
    k = posix_memalign((void **)&a, array_alignment, array_bytes);
    if (k != 0) {
        printf("Rank %d: Allocation of array a failed, return code is %d\n",myrank,k);
        MPI_Abort(MPI_COMM_WORLD, 2);
        exit(1);
    }
    k = posix_memalign((void **)&b, array_alignment, array_bytes);
    if (k != 0) {
        printf("Rank %d: Allocation of array b failed, return code is %d\n",myrank,k);
        MPI_Abort(MPI_COMM_WORLD, 2);
        exit(1);
    }
    k = posix_memalign((void **)&c, array_alignment, array_bytes);
    if (k != 0) {
        printf("Rank %d: Allocation of array c failed, return code is %d\n",myrank,k);
        MPI_Abort(MPI_COMM_WORLD, 2);
        exit(1);
    }

    // Initial informational printouts -- rank 0 handles all the output
    if (myrank == 0) {
        printf(HLINE);
        printf("STREAM version $Revision: 1.7 $\n");
        printf(HLINE);
        BytesPerWord = sizeof(STREAM_TYPE);
        printf("This system uses %d bytes per array element.\n", BytesPerWord);
        printf(HLINE);
#ifdef N
        printf("***** WARNING: ******\n");
        printf(" It appears that you set the preprocessor variable N when compiling this code.\n");
        printf(" This version of the code uses the preprocesor variable STREAM_ARRAY_SIZE to control the array size\n");
        printf(" Reverting to default value of STREAM_ARRAY_SIZE=%llu\n",(unsigned long long) STREAM_ARRAY_SIZE);
        printf("***** WARNING: ******\n");
#endif
        if (OFFSET != 0) {
            printf("***** WARNING: ******\n");
            printf(" This version ignores the OFFSET parameter.\n");
            printf("***** WARNING: ******\n");
        }
        printf("Total Aggregate Array size = %llu (elements)\n" , (unsigned long long) STREAM_ARRAY_SIZE);
        printf("Total Aggregate Memory per array = %.1f MiB (= %.1f GiB).\n",
            BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0),
            BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0/1024.0));
        printf("Total Aggregate memory required = %.1f MiB (= %.1f GiB).\n",
            (3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.),
            (3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024./1024.));
        printf("Data is distributed across %d MPI ranks\n",numranks);
        printf(" Array size per MPI rank = %llu (elements)\n" , (unsigned long long) array_elements);
        printf(" Memory per array per MPI rank = %.1f MiB (= %.1f GiB).\n",
            BytesPerWord * ( (double) array_elements / 1024.0/1024.0),
            BytesPerWord * ( (double) array_elements / 1024.0/1024.0/1024.0));
        printf(" Total memory per MPI rank = %.1f MiB (= %.1f GiB).\n",
            (3.0 * BytesPerWord) * ( (double) array_elements / 1024.0/1024.),
            (3.0 * BytesPerWord) * ( (double) array_elements / 1024.0/1024./1024.));
        printf(HLINE);
        printf("Each kernel will be executed %d times.\n", NTIMES);
        printf(" The *best* time for each kernel (excluding the first iteration)\n");
        printf(" will be used to compute the reported bandwidth.\n");
        printf("The SCALAR value used for this run is %f\n",SCALAR);

#ifdef _OPENMP
        printf(HLINE);
#pragma omp parallel
        {
#pragma omp master
            {
                k = omp_get_num_threads();
                printf ("Number of Threads requested for each MPI rank = %i\n",k);
            }
        }
#endif

#ifdef _OPENMP
        /* Count threads by incrementing a shared counter atomically --
           cross-checks the value reported above. */
        k = 0;
#pragma omp parallel
#pragma omp atomic
        k++;
        printf ("Number of Threads counted for rank 0 = %i\n",k);
#endif
    }

    /* --- SETUP --- initialize arrays and estimate precision of timer --- */
#pragma omp parallel for
    for (j=0; j<array_elements; j++) {
        a[j] = 1.0;
        b[j] = 2.0;
        c[j] = 0.0;
    }

    // Rank 0 needs to allocate arrays to hold error data and timing data from
    // all ranks for analysis and output.
    // Allocate and instantiate the arrays here -- after the primary arrays
    // have been instantiated -- so there is no possibility of having these
    // auxiliary arrays mess up the NUMA placement of the primary arrays.
    if (myrank == 0) {
        // There are 3 average error values for each rank (using STREAM_TYPE).
        // NOTE(review): the (double *) cast into a STREAM_TYPE* (and the
        // MPI_DOUBLE gathers below) assume STREAM_TYPE is double -- confirm
        // if STREAM_TYPE is ever redefined to float.
        AvgErrByRank = (double *) malloc(3 * sizeof(STREAM_TYPE) * numranks);
        if (AvgErrByRank == NULL) {
            printf("Ooops -- allocation of arrays to collect errors on MPI rank 0 failed\n");
            MPI_Abort(MPI_COMM_WORLD, 2);
        }
        memset(AvgErrByRank,0,3*sizeof(STREAM_TYPE)*numranks);

        // There are 4*NTIMES timing values for each rank (always doubles)
        TimesByRank = (double *) malloc(4 * NTIMES * sizeof(double) * numranks);
        if (TimesByRank == NULL) {
            printf("Ooops -- allocation of arrays to collect timing data on MPI rank 0 failed\n");
            MPI_Abort(MPI_COMM_WORLD, 3);
        }
        memset(TimesByRank,0,4*NTIMES*sizeof(double)*numranks);
    }

    // Simple check for granularity of the timer being used
    if (myrank == 0) {
        printf(HLINE);
        if  ( (quantum = checktick()) >= 1)
            printf("Your timer granularity/precision appears to be "
                "%d microseconds.\n", quantum);
        else {
            printf("Your timer granularity appears to be "
                "less than one microsecond.\n");
            quantum = 1;
        }
    }

    /* Get initial timing estimate to compare to timer granularity. */
    /* All ranks need to run this code since it changes the values in array a */
    t = MPI_Wtime();
#pragma omp parallel for
    for (j = 0; j < array_elements; j++)
        a[j] = 2.0E0 * a[j];
    t = 1.0E6 * (MPI_Wtime() - t);

    if (myrank == 0) {
        printf("Each test below will take on the order"
            " of %d microseconds.\n", (int) t  );
        printf("   (= %d timer ticks)\n", (int) (t/quantum) );
        printf("Increase the size of the arrays if this shows that\n");
        printf("you are not getting at least 20 timer ticks per test.\n");
        printf(HLINE);
        printf("WARNING -- The above is only a rough guideline.\n");
        printf("For best results, please be sure you know the\n");
        printf("precision of your system timer.\n");
        printf(HLINE);
#ifdef VERBOSE
        t1 = MPI_Wtime();
        printf("VERBOSE: total setup time for rank 0 = %f seconds\n",t1-t0);
        printf(HLINE);
#endif
    }

    /* --- MAIN LOOP --- repeat test cases NTIMES times --- */
    // This code has more barriers and timing calls than are actually needed, but
    // this should not cause a problem for arrays that are large enough to satisfy
    // the STREAM run rules.
    scalar = SCALAR;
    for (k=0; k<NTIMES; k++) {
        // kernel 1: Copy
        MPI_Barrier(MPI_COMM_WORLD);
        t0 = MPI_Wtime();
#ifdef TUNED
        tuned_STREAM_Copy();
#else
#pragma omp parallel for
        for (j=0; j<array_elements; j++)
            c[j] = a[j];
#endif
        MPI_Barrier(MPI_COMM_WORLD);
        t1 = MPI_Wtime();
        times[0][k] = t1 - t0;

        // kernel 2: Scale
        MPI_Barrier(MPI_COMM_WORLD);
        t0 = MPI_Wtime();
#ifdef TUNED
        tuned_STREAM_Scale(scalar);
#else
#pragma omp parallel for
        for (j=0; j<array_elements; j++)
            b[j] = scalar*c[j];
#endif
        MPI_Barrier(MPI_COMM_WORLD);
        t1 = MPI_Wtime();
        times[1][k] = t1-t0;

        // kernel 3: Add
        MPI_Barrier(MPI_COMM_WORLD);
        t0 = MPI_Wtime();
#ifdef TUNED
        tuned_STREAM_Add();
#else
#pragma omp parallel for
        for (j=0; j<array_elements; j++)
            c[j] = a[j]+b[j];
#endif
        MPI_Barrier(MPI_COMM_WORLD);
        t1 = MPI_Wtime();
        times[2][k] = t1-t0;

        // kernel 4: Triad
        MPI_Barrier(MPI_COMM_WORLD);
        t0 = MPI_Wtime();
#ifdef TUNED
        tuned_STREAM_Triad(scalar);
#else
#pragma omp parallel for
        for (j=0; j<array_elements; j++)
            a[j] = b[j]+scalar*c[j];
#endif
        MPI_Barrier(MPI_COMM_WORLD);
        t1 = MPI_Wtime();
        times[3][k] = t1-t0;
    }
    t0 = MPI_Wtime();   /* reused below as the start of the shutdown phase */

    /* --- SUMMARY --- */
    // Because of the MPI_Barrier() calls, the timings from any thread are equally valid.
    // The best estimate of the maximum performance is the minimum of the "outside the barrier"
    // timings across all the MPI ranks.

    // Gather all timing data to MPI rank 0
    MPI_Gather(times, 4*NTIMES, MPI_DOUBLE, TimesByRank, 4*NTIMES, MPI_DOUBLE, 0, MPI_COMM_WORLD);

    // Rank 0 processes all timing data
    if (myrank == 0) {
        // for each iteration and each kernel, collect the minimum time across all MPI ranks
        // and overwrite the rank 0 "times" variable with the minimum so the original post-
        // processing code can still be used.
        for (k=0; k<NTIMES; k++) {
            for (j=0; j<4; j++) {
                tmin = 1.0e36;
                for (i=0; i<numranks; i++) {
                    // printf("DEBUG: Timing: iter %d, kernel %lu, rank %d, tmin %f, TbyRank %f\n",k,j,i,tmin,TimesByRank[4*NTIMES*i+j*NTIMES+k]);
                    tmin = MIN(tmin, TimesByRank[4*NTIMES*i+j*NTIMES+k]);
                }
                // printf("DEBUG: Final Timing: iter %d, kernel %lu, final tmin %f\n",k,j,tmin);
                times[j][k] = tmin;
            }
        }

        // Back to the original code, but now using the minimum global timing across all ranks
        for (k=1; k<NTIMES; k++) /* note -- skip first iteration */
        {
            for (j=0; j<4; j++)
            {
                avgtime[j] = avgtime[j] + times[j][k];
                mintime[j] = MIN(mintime[j], times[j][k]);
                maxtime[j] = MAX(maxtime[j], times[j][k]);
            }
        }

        // note that "bytes[j]" is the aggregate array size, so no "numranks" is needed here
        printf("Function Best Rate MB/s Avg time Min time Max time\n");
        for (j=0; j<4; j++) {
            avgtime[j] = avgtime[j]/(double)(NTIMES-1);
            printf("%s%11.1f %11.6f %11.6f %11.6f\n", label[j],
                1.0E-06 * bytes[j]/mintime[j],
                avgtime[j],
                mintime[j],
                maxtime[j]);
        }
        printf(HLINE);
    }

    /* --- Every Rank Checks its Results --- */
#ifdef INJECTERROR
    a[11] = 100.0 * a[11];
#endif
    computeSTREAMerrors(&AvgError[0], &AvgError[1], &AvgError[2]);

    /* --- Collect the Average Errors for Each Array on Rank 0 --- */
    MPI_Gather(AvgError, 3, MPI_DOUBLE, AvgErrByRank, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD);

    /* -- Combined averaged errors and report on Rank 0 only --- */
    if (myrank == 0) {
#ifdef VERBOSE
        for (k=0; k<numranks; k++) {
            printf("VERBOSE: rank %d, AvgErrors %e %e %e\n",k,AvgErrByRank[3*k+0],
                AvgErrByRank[3*k+1],AvgErrByRank[3*k+2]);
        }
#endif
        checkSTREAMresults(AvgErrByRank,numranks);
        printf(HLINE);
    }

#ifdef VERBOSE
    if (myrank == 0) {
        t1 = MPI_Wtime();
        printf("VERBOSE: total shutdown time for rank %d = %f seconds\n",myrank,t1-t0);
    }
#endif

    free(a);
    free(b);
    free(c);
    if (myrank == 0) {
        free(TimesByRank);
        free(AvgErrByRank);
    }

    MPI_Finalize();
    return(0);
}
int main(int argc, char *argv[]) { //set number of threads here omp_set_num_threads(16); int i, j, k; int nthreads, tid, chunk; double sum; double **A, **B, **C; /***** Setting up matrix *****/ // initializing up matrix rows A = (double**) malloc(M * sizeof(double* )); B = (double**) malloc(M * sizeof(double* )); C = (double**) malloc(M * sizeof(double* )); // initializing up matrix columns for (i = 0; i < M; i++) { A[i]= (double*) malloc(M * sizeof(double)); B[i]= (double*) malloc(M * sizeof(double)); C[i]= (double*) malloc(M * sizeof(double)); } // adding values to matrix for (i = 0; i < M; i++) { for (j = 0; j < M; j++) { A[i][j] = j*1; B[i][j] = i*j+2; C[i][j] = j-i*2; } } double start, end; start = omp_get_wtime(); /***** Matrix multiplication *****/ chunk = CHUNKSIZE; #pragma omp parallel shared(A,B,C, nthreads, chunk)// private(i, tid) // spawns threads { tid = omp_get_thread_num(); if(tid == 0) { nthreads = omp_get_num_threads(); printf("Number of threads = %d\n", nthreads); } printf("Thread %d starting...\n", tid); // where to put the parallel for loops? outter, inner, deep inner? //#pragma omp for schedule(dynamic,chunk) // divides loop iterations #pragma omp for schedule(runtime) // divides loop //#pragma omp parallel for schedule(runtime) // spawns threads and divides loop //runtime - schedule is convenient for experimenting with different schedules and chunk sizes without having to modify and recompile the program for (i = 0; i < M; i++) { //#pragma omp for schedule(dynamic,chunk) for (j = 0; j < M; j++) { sum = 0; //#pragma omp for schedule(dynamic,chunk) //#pragma omp parallel for schedule(dynamic,chunk) // spawns threads and divides loop #pragma omp for schedule(dynamic,chunk) // divides loop for (k=0; k < M; k++) { sum += A[i][k]*B[k][j]; } C[i][j] = sum; } } } end = omp_get_wtime(); printf("Time of computation: %f\n", end-start); }
/// Build the smoothed-aggregation transfer operators for one AMG level.
/// Creates aggregates for A, forms the tentative prolongation P_tent, and
/// smooths it into the final prolongation P = (I - relax * D^-1 * A_F) *
/// P_tent, where A_F is A filtered of weak connections and D its (filtered)
/// diagonal.  The restriction R is the transpose of P.
/// Note: mutates prm (halves aggr.eps_strong for the next coarser level and,
/// when a nullspace is supplied, updates aggr.block_size).
static boost::tuple<
    boost::shared_ptr<Matrix>,
    boost::shared_ptr<Matrix>
    >
transfer_operators(const Matrix &A, params &prm)
{
    typedef typename backend::value_type<Matrix>::type value_type;
    typedef typename math::scalar_of<value_type>::type scalar_type;

    const size_t n = rows(A);

    BOOST_AUTO(Aptr, A.ptr_data());
    BOOST_AUTO(Acol, A.col_data());
    BOOST_AUTO(Aval, A.val_data());

    TIC("aggregates");
    Aggregates aggr(A, prm.aggr, prm.nullspace.cols);
    prm.aggr.eps_strong *= 0.5;  // relax the strength threshold for coarser levels
    TOC("aggregates");

    TIC("interpolation");
    boost::shared_ptr<Matrix> P_tent = tentative_prolongation<Matrix>(
            n, aggr.count, aggr.id, prm.nullspace, prm.aggr.block_size
            );

    boost::shared_ptr<Matrix> P = boost::make_shared<Matrix>();
    P->nrows = rows(*P_tent);
    P->ncols = cols(*P_tent);
    P->ptr.resize(n + 1, 0);  // ptr[i+1] accumulates row i's nonzero count below

#pragma omp parallel
    {
        // Per-thread scratch: marker[c] remembers which row (first pass) or
        // which slot in P (second pass) column c was last touched in.
        std::vector<ptrdiff_t> marker(P->ncols, -1);

#ifdef _OPENMP
        // Static partition of the rows into one contiguous chunk per thread,
        // so both passes below visit the same rows on the same thread.
        int nt = omp_get_num_threads();
        int tid = omp_get_thread_num();

        ptrdiff_t chunk_size  = (n + nt - 1) / nt;
        ptrdiff_t chunk_start = tid * chunk_size;
        ptrdiff_t chunk_end   = std::min<ptrdiff_t>(n, chunk_start + chunk_size);
#else
        ptrdiff_t chunk_start = 0;
        ptrdiff_t chunk_end   = n;
#endif

        // Count number of entries in P.
        for(ptrdiff_t i = chunk_start; i < chunk_end; ++i) {
            for(ptrdiff_t ja = Aptr[i], ea = Aptr[i+1]; ja < ea; ++ja) {
                ptrdiff_t ca = Acol[ja];

                // Skip weak off-diagonal connections.
                if (ca != i && !aggr.strong_connection[ja])
                    continue;

                for(ptrdiff_t jp = P_tent->ptr[ca], ep = P_tent->ptr[ca+1]; jp < ep; ++jp) {
                    ptrdiff_t cp = P_tent->col[jp];

                    if (marker[cp] != i) {
                        marker[cp] = i;
                        ++( P->ptr[i + 1] );
                    }
                }
            }
        }

        boost::fill(marker, -1);  // reset scratch for the fill pass

#pragma omp barrier
#pragma omp single
        {
            // Exclusive scan of the row counts gives the row offsets;
            // single + implied barrier makes the sizes visible to all threads.
            boost::partial_sum(P->ptr, P->ptr.begin());
            P->col.resize(P->ptr.back());
            P->val.resize(P->ptr.back());
        }

        // Fill the interpolation matrix.
        for(ptrdiff_t i = chunk_start; i < chunk_end; ++i) {

            // Diagonal of the filtered matrix is the original matrix
            // diagonal minus its weak connections.
            value_type dia = math::zero<value_type>();
            for(ptrdiff_t j = Aptr[i], e = Aptr[i+1]; j < e; ++j) {
                if (Acol[j] == i)
                    dia += Aval[j];
                else if (!aggr.strong_connection[j])
                    dia -= Aval[j];
            }
            dia = math::inverse(dia);

            ptrdiff_t row_beg = P->ptr[i];
            ptrdiff_t row_end = row_beg;
            for(ptrdiff_t ja = Aptr[i], ea = Aptr[i + 1]; ja < ea; ++ja) {
                ptrdiff_t ca = Acol[ja];

                // Skip weak off-diagonal connections.
                if (ca != i && !aggr.strong_connection[ja]) continue;

                // Smoothing weight: (1 - relax) on the diagonal,
                // -relax * D^-1 * A elsewhere.
                value_type va = (ca == i)
                    ? static_cast<value_type>(static_cast<scalar_type>(1 - prm.relax) * math::identity<value_type>())
                    : static_cast<value_type>(static_cast<scalar_type>(-prm.relax) * dia * Aval[ja]);

                for(ptrdiff_t jp = P_tent->ptr[ca], ep = P_tent->ptr[ca+1]; jp < ep; ++jp) {
                    ptrdiff_t cp = P_tent->col[jp];
                    value_type vp = P_tent->val[jp];

                    // marker[cp] < row_beg means column cp not yet present in
                    // row i; otherwise marker[cp] is its slot, so accumulate.
                    if (marker[cp] < row_beg) {
                        marker[cp] = row_end;
                        P->col[row_end] = cp;
                        P->val[row_end] = va * vp;
                        ++row_end;
                    } else {
                        P->val[ marker[cp] ] += va * vp;
                    }
                }
            }
        }
    }
    TOC("interpolation");

    boost::shared_ptr<Matrix> R = boost::make_shared<Matrix>();
    *R = transpose(*P);

    if (prm.nullspace.cols > 0)
        prm.aggr.block_size = prm.nullspace.cols;

    return boost::make_tuple(P, R);
}
void Voronoi_Charge() { double time0; int Mc_AN,Gc_AN,Mh_AN,h_AN,Gh_AN; int Cwan,GNc,GRc,Nog,Nh,MN,spin; double x,y,z,dx,dy,dz,fw; double Cxyz[4]; double FuzzyW,sum0,sum1; double magx,magy,magz; double tmagx,tmagy,tmagz; double tden,tmag,theta,phi,rho,mag; double den0,den1; double VC_S,T_VC0,T_VC1; double **VC; double TStime,TEtime; double S_coordinate[3]; int numprocs,myid,tag=999,ID; FILE *fp_VC; char file_VC[YOUSO10]; char buf[fp_bsize]; /* setvbuf */ MPI_Status stat; MPI_Request request; /* for OpenMP */ int OMPID,Nthrds,Nprocs; MPI_Comm_size(mpi_comm_level1,&numprocs); MPI_Comm_rank(mpi_comm_level1,&myid); dtime(&TStime); if (myid==Host_ID) printf("\n<Voronoi_Charge> calculate Voronoi charges\n");fflush(stdout); /***************************************************** allocation of array *****************************************************/ VC = (double**)malloc(sizeof(double*)*4); for (spin=0; spin<4; spin++){ VC[spin] = (double*)malloc(sizeof(double)*(atomnum+1)); } /***************************************************** calculation of Voronoi charge *****************************************************/ #pragma omp parallel shared(S_coordinate,GridVol,VC,Density_Grid,SpinP_switch,MGridListAtom,atv,CellListAtom,GridListAtom,NumOLG,WhatSpecies,M2G,Matomnum) private(OMPID,Nthrds,Nprocs,Mc_AN,Gc_AN,Cwan,sum0,sum1,tden,tmagx,tmagy,tmagz,Nog,GNc,GRc,Cxyz,x,y,z,FuzzyW,MN,den0,den1,theta,phi,rho,mag,magx,magy,magz,tmag) { /* get info. 
on OpenMP */ OMPID = omp_get_thread_num(); Nthrds = omp_get_num_threads(); Nprocs = omp_get_num_procs(); for (Mc_AN=1+OMPID; Mc_AN<=Matomnum; Mc_AN+=Nthrds){ Gc_AN = M2G[Mc_AN]; Cwan = WhatSpecies[Gc_AN]; sum0 = 0.0; sum1 = 0.0; tden = 0.0; tmagx = 0.0; tmagy = 0.0; tmagz = 0.0; for (Nog=0; Nog<NumOLG[Mc_AN][0]; Nog++){ /* calculate fuzzy weight */ GNc = GridListAtom[Mc_AN][Nog]; GRc = CellListAtom[Mc_AN][Nog]; Get_Grid_XYZ(GNc,Cxyz); x = Cxyz[1] + atv[GRc][1]; y = Cxyz[2] + atv[GRc][2]; z = Cxyz[3] + atv[GRc][3]; FuzzyW = Fuzzy_Weight(Gc_AN,Mc_AN,0,x,y,z); /* find charge */ MN = MGridListAtom[Mc_AN][Nog]; if (SpinP_switch<=1){ den0 = Density_Grid[0][MN]; den1 = Density_Grid[1][MN]; /* sum density */ sum0 += den0*FuzzyW; sum1 += den1*FuzzyW; } else{ den0 = Density_Grid[0][MN]; den1 = Density_Grid[1][MN]; theta = Density_Grid[2][MN]; phi = Density_Grid[3][MN]; rho = den0 + den1; mag = den0 - den1; magx = mag*sin(theta)*cos(phi); magy = mag*sin(theta)*sin(phi); magz = mag*cos(theta); /* sum density */ tden += rho*FuzzyW; tmagx += magx*FuzzyW; tmagy += magy*FuzzyW; tmagz += magz*FuzzyW; } } if (SpinP_switch<=1){ VC[0][Gc_AN] = sum0*GridVol; VC[1][Gc_AN] = sum1*GridVol; } else { tmag = sqrt(tmagx*tmagx + tmagy*tmagy + tmagz*tmagz); sum0 = 0.5*(tden + tmag); sum1 = 0.5*(tden - tmag); xyz2spherical( tmagx,tmagy,tmagz, 0.0,0.0,0.0, S_coordinate ); VC[0][Gc_AN] = sum0*GridVol; VC[1][Gc_AN] = sum1*GridVol; VC[2][Gc_AN] = S_coordinate[1]; VC[3][Gc_AN] = S_coordinate[2]; } } /* Mc_AN */ } /* #pragma omp parallel */ /***************************************************** MPI VC *****************************************************/ for (Gc_AN=1; Gc_AN<=atomnum; Gc_AN++){ ID = G2ID[Gc_AN]; MPI_Bcast(&VC[0][Gc_AN], 1, MPI_DOUBLE, ID, mpi_comm_level1); } for (Gc_AN=1; Gc_AN<=atomnum; Gc_AN++){ ID = G2ID[Gc_AN]; MPI_Bcast(&VC[1][Gc_AN], 1, MPI_DOUBLE, ID, mpi_comm_level1); } if (SpinP_switch==3){ for (Gc_AN=1; Gc_AN<=atomnum; Gc_AN++){ ID = G2ID[Gc_AN]; MPI_Bcast(&VC[2][Gc_AN], 1, 
MPI_DOUBLE, ID, mpi_comm_level1); } for (Gc_AN=1; Gc_AN<=atomnum; Gc_AN++){ ID = G2ID[Gc_AN]; MPI_Bcast(&VC[3][Gc_AN], 1, MPI_DOUBLE, ID, mpi_comm_level1); } } VC_S = 0.0; T_VC0 = 0.0; T_VC1 = 0.0; for (Gc_AN=1; Gc_AN<=atomnum; Gc_AN++){ VC_S += VC[0][Gc_AN] - VC[1][Gc_AN]; T_VC0 += VC[0][Gc_AN]; T_VC1 += VC[1][Gc_AN]; } /**************************************** file, *.VC ****************************************/ if ( myid==Host_ID ){ sprintf(file_VC,"%s%s.VC",filepath,filename); if ((fp_VC = fopen(file_VC,"w")) != NULL){ #ifdef xt3 setvbuf(fp_VC,buf,_IOFBF,fp_bsize); /* setvbuf */ #endif fprintf(fp_VC,"\n"); fprintf(fp_VC,"***********************************************************\n"); fprintf(fp_VC,"***********************************************************\n"); fprintf(fp_VC," Voronoi charges \n"); fprintf(fp_VC,"***********************************************************\n"); fprintf(fp_VC,"***********************************************************\n\n"); fprintf(fp_VC," Sum of Voronoi charges for up = %15.12f\n", T_VC0); fprintf(fp_VC," Sum of Voronoi charges for down = %15.12f\n", T_VC1); fprintf(fp_VC," Sum of Voronoi charges for total = %15.12f\n\n", T_VC0+T_VC1); fprintf(fp_VC," Total spin magnetic moment (muB) by Voronoi charges = %15.12f\n\n",VC_S); if (SpinP_switch<=1){ fprintf(fp_VC," Up spin Down spin Sum Diff\n"); for (Gc_AN=1; Gc_AN<=atomnum; Gc_AN++){ fprintf(fp_VC," Atom=%4d %12.9f %12.9f %12.9f %12.9f\n", Gc_AN, VC[0][Gc_AN], VC[1][Gc_AN], VC[0][Gc_AN] + VC[1][Gc_AN], VC[0][Gc_AN] - VC[1][Gc_AN]); } } else{ fprintf(fp_VC," Up spin Down spin Sum Diff Theta(Deg) Phi(Deg)\n"); for (Gc_AN=1; Gc_AN<=atomnum; Gc_AN++){ fprintf(fp_VC," Atom=%4d %12.9f %12.9f %12.9f %12.9f %8.4f %8.4f\n", Gc_AN, VC[0][Gc_AN], VC[1][Gc_AN], VC[0][Gc_AN] + VC[1][Gc_AN], VC[0][Gc_AN] - VC[1][Gc_AN], VC[2][Gc_AN]/PI*180.0,VC[3][Gc_AN]/PI*180.0); } } fclose(fp_VC); } else{ printf("Failure of saving the VC file.\n"); } } 
/***************************************************** freeing of array *****************************************************/ for (spin=0; spin<4; spin++){ free(VC[spin]); } free(VC); /* for time */ dtime(&TEtime); time0 = TEtime - TStime; }
/* * ----------------------------4th Week Progress-------------------------------- * Find all the valid collisions from the data file given as the third argument. * First argument is a limitation on how many sets the function will examine. * Second argument is a limitation on how many nanoseconds the function will be * running. If the time limit is exceeded, the function will stop examining the * file. * Fourth argument is the number of threads MPI should utilize. * This function uses OMP AND OMPI to go through the examination. * ----------------------------------------------------------------------------- */ int ompi_parallel_estimation(long num_coords, int max_nsecs, char *file_name, int threads) { // Check if file exists. If not, print a message and return !0. if(access(file_name, F_OK) == -1) { printf("File was not found!\n"); return(FAILURE); } // Initialize the MPI. After this line, multiple procs run at the same time. MPI_Init(NULL, NULL); // Get the total number of processes that are running. int total_procs; MPI_Comm_size(MPI_COMM_WORLD, &total_procs); // Get the rank of the process int proc_num; MPI_Comm_rank(MPI_COMM_WORLD, &proc_num); if(proc_num>total_procs) { MPI_Finalize(); } long *succesful_col; if(proc_num == 0) succesful_col = malloc(sizeof(long) * total_procs); // Calculate the total number of coordinates that are stored in the file. // The number is: NUM = FILE_SIZE / SIZE_OF_A_FLOAT / 3 // A coordinate is a set of 3 floats. // The function will check exactly (EXAM_COORDS) collisions from the file. long exam_coords = fsize(file_name) / sizeof(float) / 3; // If there is a limitation on how many collisions the function should go // through, change the value of exam_coords to that limitation. if(num_coords >= 0 && num_coords < exam_coords) exam_coords = num_coords; // If the limitation exceeds the total number of coordinates stored in the // data file, display a message. 
The total number of collision, the function // will go through, remains the total number of coordinates available in the // file. if(num_coords > exam_coords) printf("You have asked for more lines than the ones available. All the lines are going to be examined.\n"); // Set which coords each PROCESS will go through. We will repeat the same // divison, when we distribute these coords to every THREAD that is running // within ever process. long coord_from = (int)exam_coords/total_procs * proc_num; long coord_to = (int)exam_coords/total_procs * (proc_num+1) - 1; if(proc_num+1 == total_procs) coord_to += exam_coords % total_procs; // Create a variable where the total number of valid collisions from all the // threads will be added to. long sum = 0; // If there is a limitation on how many threads MPI should use (-1 means all // available threads), apply it. if(threads >= 1) omp_set_num_threads(threads); // Start using OpenMP. #pragma omp parallel shared(file_name) { // Get the total number of threads the OMP is running. int total_threads = omp_get_num_threads(); // Get the ID of this particular thread. int tid = omp_get_thread_num(); // Each file opens its own pointer to the data file. FILE *file_ptr = fopen(file_name, "rb"); // Same when we divided the sets and distributed them to each process, // but this time we are doing it for ever thread within EACH process. long coll_from = (int)(coord_to-coord_from+1)/total_threads * tid; long coll_to = (int)(coord_to-coord_from+1)/total_threads * (tid+1) - 1; if(tid+1 == total_threads) coll_to += (coord_to-coord_from+1) % total_threads; // Skip some bytes from the data file, in order to get to the set where // the thread must start examining from fseek(file_ptr, 3*(coord_from+coll_from)*sizeof(float), SEEK_SET); long coords_read; long valid_collisions=0; // The timespecs will keep track of the time, if a limitation has been // set. struct timespec start, current; // Before the start of the examination, get the current time. 
clock_gettime(CLOCK_MONOTONIC, &start); // The function will check all the collisions, increasing its sum // (valid_collisions) every time a collision is within the limits // defined in the start of the code. // Every time it goes though one set, if there has been set a limitation // on how many nanoseconds the function should run, check the current // time, get the difference from the timestamp when the examination // started running and if the time limit has been exceeded, stop the // loop. for(coords_read=coll_from; coords_read<coll_to+1; coords_read++) { if(process_coords(file_ptr)==0) valid_collisions++; if(max_nsecs!=-1&&calculate_difference(start,current,0)>max_nsecs){ clock_gettime(CLOCK_MONOTONIC, ¤t); printf("Reached maximum time limit.\n"); break; } } // Each threads closes its file pointer. fclose(file_ptr); #pragma omp barrier // Finally, add all the valid collision numbers, each thread has found // to the shared variable "sum". #pragma omp for reduction(+:sum) for(tid=0;tid<total_threads;tid++) sum += valid_collisions; //sums the total collisions of all threads } // After each process has calculated how many valid collisions there are in // its own section of data, the MPI adds all the different result into one // shared variable (final_count) with the help of MPI_Gather. MPI_Gather(&sum, 1, MPI_LONG, succesful_col, 1, MPI_LONG, 0, MPI_COMM_WORLD); if(proc_num == 0){ long final_count = 0; for(int i=0; i<total_procs; i++){ final_count+=succesful_col[i]; } printf("MPI Parallel Examine -> Valid collisions: %ld", final_count); } MPI_Finalize(); return(SUCCESS); }
/* * ----------------------------2nd Week Progress-------------------------------- * Find all the valid collisions from the data file given as the third argument. * First argument is a limitation on how many sets the function will examine. * Second argument is a limitation on how many nanoseconds the function will be * running. If the time limit is exceeded, the function will stop examining the * file. * Fourth argument is the number of threads MPI should utilize. * This function uses OMP, but NOT OMPI to go through the examination. * ----------------------------------------------------------------------------- */ int parallel_estimation(long num_coords, int max_nsecs, char *file_name, int threads) { // Check if file exists. If not, print a message and return !0. if(access(file_name, F_OK) == -1) { printf("File was not found!\n"); return(FAILURE); } // Create a variable where the total number of valid collisions from all the // threads will be added to. long sum = 0; // If there is a limitation on how many threads MPI should use (-1 means all // available threads), apply it. if(threads >= 1) omp_set_num_threads(threads); // Start using OpenMP. #pragma omp parallel shared(file_name) { // Calculate the total number of coordinates that are stored in the // file. // The number is: NUM = FILE_SIZE / SIZE_OF_A_FLOAT / 3 // A coordinate is a set of 3 floats. // The function will check exactly (EXAM_COORDS) collisions from the // file. long exam_coords = fsize(file_name) / sizeof(float) / 3; // If there is a limitation on how many collisions the function should // go through, change the value of exam_coords to that limitation. if(num_coords >=0 && num_coords < exam_coords) exam_coords = num_coords; // If the limitation exceeds the total number of coordinates stored in the // data file, display a message. The total number of collision, the function // will go through, remains the total number of coordinates available in the // file. 
if(num_coords > exam_coords) printf("You have asked for more lines than the ones available. All the lines are going to be examined.\n"); // Get the total number of threads the OMP is running. int total_threads = omp_get_num_threads(); // Get the ID of this particular thread. int tid = omp_get_thread_num(); // Each file opens its own pointer to the data file. FILE *file_ptr = fopen(file_name, "rb"); // Set which coords each thread will process. long coord_from = (int)exam_coords/total_threads * tid; long coord_to = (int)exam_coords/total_threads * (tid+1) - 1; if(tid+1 == total_threads) coord_to += exam_coords % total_threads; // Skip some bytes from the data file, in order to get to the set where // the thread must start examining from. fseek(file_ptr, 3*coord_from*sizeof(float), SEEK_SET); long coords_read; long valid_collisions=0; // The timespecs will keep track of the time, if a limitation has been // set. struct timespec start, current; // Before the start of the examination, get the current time. clock_gettime(CLOCK_MONOTONIC, &start); // The function will check all the collisions, increasing its sum // (valid_collisions) every time a collision is within the limits // defined in the start of the code. // Every time it goes though one set, if there has been set a limitation // on how many nanoseconds the function should run, check the current // time, get the difference from the timestamp when the examination // started running and if the time limit has been exceeded, stop the // loop. for(coords_read=coord_from; coords_read<coord_to+1; coords_read++) { if(process_coords(file_ptr)==0) valid_collisions++; if(max_nsecs!=-1&&calculate_difference(start,current,0)>max_nsecs){ clock_gettime(CLOCK_MONOTONIC, ¤t); printf("Reached maximum time limit.\n"); break; } } // Each threads closes its file pointer. fclose(file_ptr); #pragma omp barrier // Finally, add all the valid collision numbers, each thread has found // to the shared variable "sum". 
#pragma omp for reduction(+:sum) for(tid=0;tid<total_threads;tid++) sum+=valid_collisions; #pragma omp master printf("Non-MPI Parallel Examine -> Valid collisions: %ld\n", sum); } return(SUCCESS); }
/*
 * JNI binding for edu.berkeley.bid.UTILS.getnumthreads().
 * Returns the size of the current OpenMP thread team.
 *
 * NOTE(review): when invoked from Java this executes outside any OpenMP
 * parallel region, where omp_get_num_threads() is specified to return 1.
 * If the caller actually wants the number of threads OpenMP *would* use,
 * omp_get_max_threads() is the usual choice — confirm intent before changing.
 */
JNIEXPORT jint JNICALL Java_edu_berkeley_bid_UTILS_getnumthreads
(JNIEnv * env, jobject calling_obj) {
  return omp_get_num_threads();
}
/*
 * Demo: nested OpenMP parallelism. An outer team of 2 threads each spawns
 * an inner team of 5, and every thread reports its id plus the values of
 * omp_get_max_threads() / omp_get_num_threads() at its nesting level.
 */
int main(int argc, char * argv[])
{
    /* Enable nested parallel regions; any non-zero argument turns it on. */
    omp_set_nested(10);

#pragma omp parallel num_threads(2)
    {
        /* Query the outer team once, then report. */
        int outer_id  = omp_get_thread_num();
        int outer_max = omp_get_max_threads();
        int outer_num = omp_get_num_threads();
        printf("ID: %d, Max threads: %d, Num threads: %d \n",
               outer_id, outer_max, outer_num);

        /* Each outer thread becomes the master of an inner team of 5. */
#pragma omp parallel num_threads(5)
        {
            printf("Nested, ID: %d, Max threads: %d, Num threads: %d \n",
                   omp_get_thread_num(), omp_get_max_threads(),
                   omp_get_num_threads());
        }
    }
    return 0;
}
/*!
* \return	New filtered object with new values or NULL on error.
* \ingroup	WlzValuesFilters
* \brief	Applies a separable filter to the given object using the given
*		convolution kernels.
* \param	inObj			Input 2 or 3D spatial domain object
*					to be filtered which must have scalar
*					values.
* \param	cBufSz			Convolution kernel sizes (sz), each
*					kernel buffer is sized (2 * sz) + 1
*					with the centre indexed sz into the
*					buffer.
* \param	cBuf			Convolution kernel buffers.
* \param	direc			Set to non-zero in directions for which
*					the filter is to be applied.
* \param	gType			Required return object grey type.
*					Passing in WLZ_GREY_ERROR will
*					request the given input object's grey
*					type.
* \param	pad			Type of padding.
* \param	padVal			Padding value, only used when
*					pad == ALG_PAD_VALUE.
* \param	dstErr			Destination error pointer may be NULL.
*/
WlzObject			*WlzSepFilter(WlzObject *inObj,
				  WlzIVertex3 cBufSz,
				  double *cBuf[],
				  WlzIVertex3 direc,
				  WlzGreyType gType,
				  AlgPadType pad,
				  double padVal,
				  WlzErrorNum *dstErr)
{
  int		dim = 0,
  		vSz = 0,
  		nThr = 1;
  double	**iBuf = NULL,
  		**rBuf = NULL;
  double	*vBuf = NULL;
  WlzObject	*rnObj = NULL;
  WlzIVertex3	vBufSz = {0};
  WlzIBox3	bBox = {0};
  WlzErrorNum	errNum = WLZ_ERR_NONE;

#ifdef _OPENMP
  /* Discover the OpenMP team size: omp_get_num_threads() is only
   * meaningful inside a parallel region, hence the throw-away region. */
  #pragma omp parallel
  {
    #pragma omp master
    {
      nThr = omp_get_num_threads();
    }
  }
#endif
  /* Validate the input object: non-NULL, with domain and values, and of a
   * supported dimensionality. */
  if(inObj == NULL)
  {
    errNum = WLZ_ERR_OBJECT_NULL;
  }
  else if(inObj->domain.core == NULL)
  {
    errNum = WLZ_ERR_DOMAIN_NULL;
  }
  else if(inObj->values.core == NULL)
  {
    errNum = WLZ_ERR_VALUES_NULL;
  }
  else
  {
    switch(inObj->type)
    {
      case WLZ_2D_DOMAINOBJ:
        dim = 2;
	break;
      case WLZ_3D_DOMAINOBJ:
        dim = 3;
	break;
      default:
        errNum = WLZ_ERR_OBJECT_TYPE;
	break;
    }
  }
  /* WLZ_GREY_ERROR is a request to inherit the input object's grey type. */
  if((errNum == WLZ_ERR_NONE) && (gType == WLZ_GREY_ERROR))
  {
    gType = WlzGreyTypeFromObj(inObj, &errNum);
  }
  /* Only scalar grey types are supported as the return type.
   * NOTE(review): the inner errNum test duplicates the outer one. */
  if(errNum == WLZ_ERR_NONE)
  {
    if(errNum == WLZ_ERR_NONE)
    {
      switch(gType)
      {
	case WLZ_GREY_INT:    /* FALLTHROUGH */
	case WLZ_GREY_SHORT:  /* FALLTHROUGH */
	case WLZ_GREY_UBYTE:  /* FALLTHROUGH */
	case WLZ_GREY_FLOAT:  /* FALLTHROUGH */
	case WLZ_GREY_DOUBLE:
	  break;
	default:
	  errNum = WLZ_ERR_GREY_TYPE;
	  break;
      }
    }
  }
  /* Size the working buffers from the object's bounding box. */
  if(errNum == WLZ_ERR_NONE)
  {
    bBox = WlzBoundingBox3I(inObj, &errNum);
    if(errNum == WLZ_ERR_NONE)
    {
      vBufSz.vtX = bBox.xMax - bBox.xMin + 1;
      vBufSz.vtY = bBox.yMax - bBox.yMin + 1;
      if(dim == 3)
      {
        vBufSz.vtZ = bBox.zMax - bBox.zMin + 1;
      }
    }
  }
  /* Allocate one input and one result line buffer per thread, all carved
   * out of a single contiguous allocation (vBuf). */
  if(errNum == WLZ_ERR_NONE)
  {
    vSz = ALG_MAX3(vBufSz.vtX, vBufSz.vtY, vBufSz.vtZ);
    if(((iBuf = (double **)AlcMalloc(sizeof(double *) * 2 * nThr)) == NULL) ||
       ((vBuf = (double *)AlcMalloc(sizeof(double) * 2 * nThr * vSz)) == NULL))
    {
      errNum = WLZ_ERR_MEM_ALLOC;
    }
    else
    {
      int	idt;

      rBuf = iBuf + nThr;
      for(idt = 0; idt < nThr; ++idt)
      {
        iBuf[idt] = vBuf + (idt * vSz);
	rBuf[idt] = vBuf + ((nThr + idt) * vSz);
      }
    }
  }
  if(errNum == WLZ_ERR_NONE)
  {
    /* Convolve the object values, one pass per requested direction; each
     * pass consumes the previous pass' output (or the input object). */
    if(direc.vtX)
    {
      rnObj = WlzSepFilterX(inObj, dim, nThr, iBuf, rBuf,
			    cBufSz.vtX, cBuf[0], pad, padVal, &errNum);
    }
    if((errNum == WLZ_ERR_NONE) && direc.vtY)
    {
      WlzObject *tObj;

      tObj = WlzSepFilterY((rnObj)? rnObj: inObj, dim, nThr, iBuf, rBuf,
			   cBufSz.vtY, cBuf[1], pad, padVal, &errNum);
      (void )WlzFreeObj(rnObj);
      rnObj = tObj;
    }
    if((errNum == WLZ_ERR_NONE) && (dim == 3) && direc.vtZ)
    {
      WlzObject *tObj;

      tObj = WlzSepFilterZ((rnObj)? rnObj: inObj, bBox, nThr, iBuf, rBuf,
			   cBufSz.vtZ, cBuf[2], pad, padVal, &errNum);
      (void )WlzFreeObj(rnObj);
      rnObj = tObj;
    }
  }
  if((errNum == WLZ_ERR_NONE) && (rnObj != NULL) && (gType != WLZ_GREY_DOUBLE))
  {
    WlzObject *tObj;

    /* Convert object values to the required grey type (the filter passes
     * produce double values). */
    tObj = WlzConvertPix((rnObj)? rnObj: inObj, gType, &errNum);
    (void )WlzFreeObj(rnObj);
    rnObj = tObj;
  }
  /* On any error, release the partial result and return NULL. */
  if(errNum != WLZ_ERR_NONE)
  {
    (void )WlzFreeObj(rnObj);
    rnObj = NULL;
  }
  AlcFree(iBuf);
  AlcFree(vBuf);
  return(rnObj);
}
/*
 * Top-level Monte Carlo packet loop: propagates every packet in the storage
 * model, recording its final frequency and (sign-tagged) energy.
 *
 * storage             - simulation state; output_nus / output_energies are
 *                       filled per packet index.
 * virtual_packet_flag - when > 0, a virtual-packet pass is run for each
 *                       packet before the real transport pass.
 * nthreads            - OpenMP thread count (only honoured when > 0 and
 *                       compiled WITHOPENMP).
 * seed                - RNG seed; each thread offsets it by its thread id
 *                       so streams differ per thread.
 *
 * Note the unusual structure: the "#pragma omp parallel { ... }" braces are
 * inside "#ifdef WITHOPENMP" blocks, so the same for-loop body serves both
 * the parallel and the serial build.
 */
void
montecarlo_main_loop(storage_model_t * storage, int64_t virtual_packet_flag, int nthreads, unsigned long seed)
{
  // Per-thread progress counter (firstprivate below gives each thread its
  // own copy starting at 0).
  int64_t finished_packets = 0;
  storage->virt_packet_count = 0;
#ifdef WITH_VPACKET_LOGGING
  // Optional virtual-packet bookkeeping arrays, one slot per packet.
  storage->virt_packet_nus = (double *)safe_malloc(sizeof(double) * storage->no_of_packets);
  storage->virt_packet_energies = (double *)safe_malloc(sizeof(double) * storage->no_of_packets);
  storage->virt_packet_last_interaction_in_nu = (double *)safe_malloc(sizeof(double) * storage->no_of_packets);
  storage->virt_packet_last_interaction_type = (int64_t *)safe_malloc(sizeof(int64_t) * storage->no_of_packets);
  storage->virt_packet_last_line_interaction_in_id = (int64_t *)safe_malloc(sizeof(int64_t) * storage->no_of_packets);
  storage->virt_packet_last_line_interaction_out_id = (int64_t *)safe_malloc(sizeof(int64_t) * storage->no_of_packets);
  storage->virt_array_size = storage->no_of_packets;
#endif // WITH_VPACKET_LOGGING
#ifdef WITHOPENMP
  // Pin the team size: disable dynamic adjustment, then apply the caller's
  // thread count if one was given.
  omp_set_dynamic(0);
  if (nthreads > 0)
    {
      omp_set_num_threads(nthreads);
    }
#pragma omp parallel firstprivate(finished_packets)
  {
    // Per-thread RNG stream, decorrelated by the thread id.
    rk_state mt_state;
    rk_seed (seed + omp_get_thread_num(), &mt_state);
#pragma omp master
    {
      fprintf(stderr, "Running with OpenMP - %d threads\n", omp_get_num_threads());
      print_progress(0, storage->no_of_packets);
    }
#pragma omp for
#else
  // Serial build: a single RNG stream.
  rk_state mt_state;
  rk_seed (seed, &mt_state);
  fprintf(stderr, "Running without OpenMP\n");
#endif
  for (int64_t packet_index = 0; packet_index < storage->no_of_packets; ++packet_index)
    {
      int reabsorbed = 0;
      rpacket_t packet;
      rpacket_set_id(&packet, packet_index);
      rpacket_init(&packet, storage, packet_index, virtual_packet_flag);
      // Optional virtual-packet pass before the real transport pass; only
      // the real pass's reabsorption flag is kept.
      if (virtual_packet_flag > 0)
        {
          reabsorbed = montecarlo_one_packet(storage, &packet, -1, &mt_state);
        }
      reabsorbed = montecarlo_one_packet(storage, &packet, 0, &mt_state);
      storage->output_nus[packet_index] = rpacket_get_nu(&packet);
      // Reabsorbed packets are recorded with negated energy.
      if (reabsorbed == 1)
        {
          storage->output_energies[packet_index] = -rpacket_get_energy(&packet);
        }
      else
        {
          storage->output_energies[packet_index] = rpacket_get_energy(&packet);
        }
      if ( ++finished_packets%100 == 0 )
        {
#ifdef WITHOPENMP
          // WARNING: This only works with a static scheduler and gives an
          // approximation of progress (thread 0's count scaled by the team
          // size). The alternative would be a shared counter, but that
          // could decrease performance with many threads.
          if (omp_get_thread_num() == 0 )
            print_progress(finished_packets * omp_get_num_threads(), storage->no_of_packets);
#else
          print_progress(finished_packets, storage->no_of_packets);
#endif
        }
    }
#ifdef WITHOPENMP
  }
#endif
  print_progress(storage->no_of_packets, storage->no_of_packets);
  fprintf(stderr,"\n");
}
// Constructor.
// Loads site names and per-parameter shape files, builds one data matrix per
// site (rows = shape parameters, columns = positions), scales each matrix,
// computes all pairwise alignment distances/shifts (forward and reversed),
// finds the centroid site and writes the aligned output.
shapeAlign::shapeAlign(const string& nameList, const vector<string> &files, const int &minS, const int &maxS, const bool &win, const int &wS, const int &wE, const bool &ign, const int &iS, const int &iE, const int &E):
  nameFile(nameList), shapeFiles(files), shiftMin(minS), shiftMax(maxS), window(win), winStart(wS), winEnd(wE), ignore(ign), ignStart(iS), ignEnd(iE), thresh(E) {

  // Get number of shape parameters -- one file for each parameter,
  // so this is effectively the number of files
  m = files.size();

  // Get site names by reading the single-column file containing
  // names of sites
  ifstream file(nameFile.c_str());
  string line;
  while(getline(file,line))
    names.push_back(line);
  file.close();

  // Get number of sites
  nSites = names.size();
  cerr << "Read " << nSites << " site names." << endl;

  // Initialize matrices for tracking pairwise comparison info:
  // D = distances, S = shifts, R = reverse-orientation flags.
  D = gsl_matrix_calloc(nSites,nSites);
  S = gsl_matrix_calloc(nSites,nSites);
  R = gsl_matrix_calloc(nSites,nSites);

  cerr << "Reading shape files." << endl;
  matrices.resize(nSites); // Initialize an empty list of matrix references

  // create matrices containing the shape information. Add data
  // to these matrices on the fly. Each file contributes one row (f) of
  // every site's matrix.
  for (size_t f = 0; f < files.size(); f++){
    cerr << "\t" << files[f] << endl;
    ifstream shapeFile(files[f].c_str());
    int idx = 0; // line/site counter
    while(getline(shapeFile,line)){
      // Each line in the shape files represent a single site
      stringstream linestream(line);
      string s;
      vector <string> temp;
      while(linestream >> s) // Split on spaces and store data from each position in site
        temp.push_back(s);
      int n = temp.size(); // Get number of positions

      // there are five columns that need to be trimmed off:
      // The first three columns (identifier and NAs) and the
      // last two columns (NAs)

      // Initialize the matrix if the matrix has not previously been
      // initialized (only on the first file; later files are assumed to
      // have the same line count and width -- no check is performed).
      if (f == 0)
        matrices[idx] = gsl_matrix_alloc(m,n-5);

      // Parse columns 3 .. n-3 into row f of this site's matrix.
      // (The local stringstream named "stod" shadows std::stod.)
      for (size_t i = 0; i < matrices[idx]->size2; i++){
        double d;
        stringstream stod(temp[i+3]);
        stod >> d;
        gsl_matrix_set(matrices[idx],f,i,d);
      }

      // Increment the line counter
      idx++;
    }
  }
  cerr << "\tDone reading shape files." << endl;

  // Scale each matrix such that values to go from 1->2
  cerr << "Scaling matrices." << endl;
  for (size_t i = 0; i < nSites; i++)
    scaleMatrixZscore(matrices[i]);
  cerr << "\tDone scaling matrices." << endl;

  // Loop over the sites and compute all pairwise distances -- note that
  // distances are symmetric: D[a,b] = D[b,a]. But, the shifts computed
  // are not symmetric: S[a,b] = -S[b,a].
  for (size_t i = 0; i < nSites; i++){
    if ((i+1) % 100 == 0)
      cerr << "\tProcessing " << i+1 << " of " << nSites << endl;

    // Parallelize this portion: data races shouldn't be a concern
    // since no threads should be writing to the same block of
    // memory (each j writes only row/column pair (i,j)/(j,i)).
#pragma omp parallel
    {
#pragma omp master
      if (i==0)
        cerr << "Beginning all-by-all distance calculation using "
             << omp_get_num_threads() << " threads." << endl;

      // Upper triangle only; the symmetric entries are filled in directly.
#pragma omp for
      for (size_t j = i; j < nSites; j++){
        // Get optimal shift and distances for the simple comparison
        alignData results = getOptimalShift(matrices[i],matrices[j]);

        // Get the matrix representing the reverse of the matrices[j]
        gsl_matrix* rev = gsl_matrix_alloc(matrices[j]->size1,matrices[j]->size2);
        gsl_matrix_memcpy(rev,matrices[j]);
        reverse(rev);

        // Get the optimal shift and distance for the reverse matrix,
        // keeping whichever orientation scores better.
        alignData resultsRev = getOptimalShift(matrices[i],rev);
        if (results.score >= resultsRev.score){
          results.rev = 0;
        } else {
          results.score = resultsRev.score;
          results.shift = resultsRev.shift;
          results.rev = 1;
        }

        // Store the data in the matrices used for tracking
        // pairwise comparisons
        gsl_matrix_set(D,i,j,results.score);
        gsl_matrix_set(S,i,j,results.shift);
        gsl_matrix_set(R,i,j,results.rev);
        gsl_matrix_set(D,j,i,results.score);
        gsl_matrix_set(S,j,i,-1*results.shift);
        gsl_matrix_set(R,j,i,results.rev);

        // Clean up -- free memory associated with rev
        gsl_matrix_free(rev);
      }
    }
  }
  cerr << "\tDone with distance calculation." << endl;

  // Pick the site minimizing overall distance as the centroid.
  cerr << "Finding centroid." << endl;
  pair<int,double> C = getCentroid();
  cIdx = C.first;   // Index (w.r.t. names vector) of centroid
  cDist = C.second; // Distance of centroid to other sequences
  cerr << "\tCentroid: Site \"" << names[C.first] << "\"" << endl;
  cerr << "\tDistance: " << C.second << endl;

  printCentroid();
  printShifts();

  // cerr << "Printing matrices to files." << endl;
  // printShiftMatrix();
  // printDistanceMatrix();
  // printRevMatrix();
  // cerr << "\tDone." << endl;

  cerr << "Printing aligned data." << endl;
  printShiftedProfiles();
  cerr << "\tDone." << endl;
  cerr << "Job successfully completed." << endl;
}
/*
 * One half-sweep of the checkerboard SOR iteration: updates the "odd" grid
 * points (phimap1) from the "even" points (phimap2).
 *
 * forWhom - when non-zero, the per-charge corrections (prgfCrgValA) are
 *           added back to the charged grid points.
 * flag    - only used by the commented-out debug output.
 *
 * Statement order matters throughout: the Laplace-only main loop is run
 * first, then the dielectric-boundary points are recomputed individually,
 * boundary rows are reset, charge corrections are applied, and finally
 * periodic boundary values are wrapped around. When PARALLEL_OMP is
 * defined, everything up to the periodicity fix-up runs inside a single
 * parallel region whose braces straddle the #ifdef blocks.
 */
void CDelphiFastSOR::itrOddPoints(const int& forWhom, const int& flag)
{
   delphi_integer n,ix,iy,iz;
   delphi_integer star,fin;
   delphi_real temp1,temp2,temp3,temp4;
   delphi_integer itemp1,itemp2,itemp3,itemp4;

   //cout << "### oddpoints phimap1: " << flag << endl;

#ifdef PARALLEL_OMP
   int omp_num_threads,omp_thread_id;

   /*
    * set number of threads = number of processors
    */
   //omp_set_num_threads(2);
   omp_set_num_threads(omp_get_num_procs());

#pragma omp parallel default(shared) private(omp_thread_id,n,ix,iy,star,fin,temp1,temp2,temp3)
   {
      delphi_integer omp_index;
      omp_thread_id = omp_get_thread_num();
      if (0 == omp_thread_id) omp_num_threads = omp_get_num_threads();

      //cout << "thread " << omp_thread_id << " of " << omp_num_threads << " is alive\n";
#endif

   /* the following loops are about four times faster than the original loop over all grid points for
    * several reasons, the biggest being that we are only solving laplace's equation (unless salt is present),
    * which numerically much simpler, hence faster. we put all we leave out, back in below, ending up with
    * an equivalent calculation, but much faster.
    */
   if (fZero < abs(fIonStrength)) //----- the main loop is as below:
   {
#ifdef PARALLEL_OMP
#pragma omp for schedule(auto)
#endif
      // Salt present: SOR update with the salt screening map applied.
      for (n = 1; n < iGrid-1; n++)
      {
         star = sta1[n];
         fin = fi1[n];
         for (ix = star; ix <= fin; ix++)
         {
            temp1 = phimap2[ix-1] + phimap2[(ix-1)-1];
            temp2 = phimap2[(ix-1)+lat1] + phimap2[(ix-1)-lat2];
            temp3 = phimap2[(ix-1)+long1] + phimap2[(ix-1)-long2];
            phimap1[ix-1] = phimap1[ix-1]*om1 + (qmap1[ix-1]+temp1+temp2+temp3)*prgfSaltMap1[ix-1];
         }
      }
   }
   else //----- if there is no salt then the main loop is executed without sf saving about 15% in execution time
   {
#ifdef PARALLEL_OMP
#pragma omp for schedule(auto)
#endif
      // No salt: plain six-neighbour average (Laplace update).
      for (n = 1; n < iGrid-1; n++)
      {
         star = sta1[n];
         fin = fi1[n];
         for (ix = star; ix <= fin; ix++)
         {
            temp1 = phimap2[ix-1] + phimap2[(ix-1)-1];
            temp2 = phimap2[(ix-1)+lat1] + phimap2[(ix-1)-lat2];
            temp3 = phimap2[(ix-1)+long1] + phimap2[(ix-1)-long2];
            phimap1[ix-1] = phimap1[ix-1]*om1 + (temp1+temp2+temp3)*sixth;
            //cout << "phimap1: " << right << setw(10) << flag << setw(10) << ix << setw(20) << setprecision(5) << fixed << phimap1[ix-1] << endl;
            //if(flag==1)cout << "1phimap1: " << right << setw(10) << flag << setw(10) << ix << setw(20) << setprecision(5) << fixed << phimap1[ix-1] << endl;
            //if( flag==2 && ix==498 )
            //cout << "phimap1: " << right << setw(10) << flag << setw(8) << ix << setw(20) << setprecision(5) << fixed << phimap1[ix-1]
            //     << " " << om1 << " " << temp1 << " " << temp2 << " " << temp3 << " " << sixth <<endl;
         }
      }
   }

#ifdef PARALLEL_OMP
   //#pragma omp barrier
#endif

   /*
    * first we add back the dielectric boundary points, by recalculating them individually. note this is still
    * vectorised by means of a gathering load by the compiler.
    */
#ifdef PARALLEL_OMP
#pragma omp for schedule(auto)
#endif
   for (n = 0; n < iDielecBndyEven; n++)
   {
      ix = prgiBndyDielecIndex[n];
      temp1 = phimap2[(ix-1)-1]*prgfBndyDielec[n][0] + phimap2[ix-1]*prgfBndyDielec[n][1];
      temp2 = phimap2[(ix-1)-lat2]*prgfBndyDielec[n][2] + phimap2[(ix-1)+lat1]*prgfBndyDielec[n][3];
      temp3 = phimap2[(ix-1)-long2]*prgfBndyDielec[n][4] + phimap2[(ix-1)+long1]*prgfBndyDielec[n][5];
      phimap1[ix-1] += temp1 + temp2 + temp3;
      /*
      if(flag==1)cout << "2phimap1: " << right << setw(10) << flag << setw(10) << ix << setw(10) << setprecision(5) << fixed << phimap1[ix-1]
                      <<setw(10) << phimap2[(ix-1)-long2] <<setw(10) << prgfBndyDielec[n][4] <<setw(10) << phimap2[(ix-1)+long1]
                      <<setw(10) <<prgfBndyDielec[n][5] <<setw(10) << (ix-1)-long2 <<setw(10) << (ix-1)+long1 << endl;
      */
      //if( flag==1 && ix==498 )
      //   cout << "phimap1: " << right << setw(10) << ix << setw(20) << setprecision(5) << fixed << phimap1[ix-1]
      //        << " " << temp1 << " " << temp2 << " " << temp3 <<endl;
   }

   /*
    * Now reset boundary values altered in above loops.
    */
#ifdef PARALLEL_OMP
   star = (iGrid+1)/2;
   fin = (iGrid*(iGrid-1)-2)/2;
   // omp_index replaces the serial running iy so iterations are independent.
   omp_index = iGrid*(iGrid+1)/2-iGrid+1;//iy = iGrid*(iGrid+1)/2-iGrid+1;
#pragma omp for schedule(auto)
   for (n = 0; n < fin-star+1; n++)
   {
      iy = omp_index+(n+1)*iGrid;
      phimap1[iy-1] = bndx1[n];
      phimap1[iy+((iGrid+1)/2-1)-1] = bndx2[n];
   }
#else
   star = (iGrid+1)/2;
   fin = (iGrid*(iGrid-1)-2)/2;
   iy = iGrid*(iGrid+1)/2-iGrid+1;
   for (n = 0; n < fin-star+1; n++)
   {
      iy = iy+iGrid;
      phimap1[iy-1] = bndx1[n];
      phimap1[iy+((iGrid+1)/2-1)-1] = bndx2[n];
   }
#endif

   /*
    * next we add back an adjustment to all the charged grid points due to the charge assigned. the compiler
    * directive just reassures the vector compiler that all is well as far as recurrence is concerned, i.e. it
    * would think there is a recurrence below, where as in fact there is none.
    */
   if (0 != forWhom)
   {
#ifdef PARALLEL_OMP
#pragma omp for schedule(auto)
#endif
      for (n = 0; n < iCrgedGridEven; n++)
      {
         ix = prgiCrgPose[n];
         phimap1[ix-1] += prgfCrgValA[n];
         //if(flag==1)cout << "3phimap1: " << right << setw(10) << flag << setw(10) << ix << setw(20) << setprecision(5) << fixed << phimap1[ix-1] << endl;
      }
   }

#ifdef PARALLEL_OMP
   } // end of #pragma omp parallel
#endif

   /*
    * if periodic boundary condition option, force periodicity using wrap around update of boundary values:
    * 2nd slice-->last
    * last-1 slice-->first
    */
   if (rgbPeriodicBndy[2]) //----- z periodicity
   {
      for (iz = 0; iz < (iGrid-2)*(iGrid-2); iz += 2)
      {
         temp1 = ibndz[iz];          itemp1 = (delphi_integer)temp1;
         temp2 = temp1 + idif1z;     itemp2 = (delphi_integer)temp2;
         temp3 = temp2 + inc1za;     itemp3 = (delphi_integer)temp3;
         temp4 = temp1 + inc1zb;     itemp4 = (delphi_integer)temp4;
         phimap1[itemp1-1] = phimap2[itemp2-1];
         phimap1[itemp3-1] = phimap2[itemp4-1];
      }
   }

   if (rgbPeriodicBndy[1]) //----- y periodicity
   {
      for (iy = 0; iy < (iGrid-2)*(iGrid-2); iy += 2)
      {
         temp1 = ibndy[iy];          itemp1 = (delphi_integer)temp1;
         temp2 = temp1 + idif1y;     itemp2 = (delphi_integer)temp2;
         temp3 = temp2 + inc1ya;     itemp3 = (delphi_integer)temp3;
         temp4 = temp1 + inc1yb;     itemp4 = (delphi_integer)temp4;
         phimap1[itemp1-1] = phimap2[itemp2-1];
         phimap1[itemp3-1] = phimap2[itemp4-1];
      }
   }

   if (rgbPeriodicBndy[0]) //----- x periodicity
   {
      for (ix = 0; ix < (iGrid-2)*(iGrid-2); ix += 2)
      {
         temp1 = ibndx[ix];          itemp1 = (delphi_integer)temp1;
         temp2 = temp1 + idif1x;     itemp2 = (delphi_integer)temp2;
         temp3 = temp2 + inc1xa;     itemp3 = (delphi_integer)temp3;
         temp4 = temp1 + inc1xb;     itemp4 = (delphi_integer)temp4;
         phimap1[itemp1-1] = phimap2[itemp2-1];
         phimap1[itemp3-1] = phimap2[itemp4-1];
      }
   }
}
int main(int argc, char *argv[]) { uint64_t total_num_nodes = 0; qtimer_t timer; double total_time = 0.0; CHECK_VERBOSE(); { unsigned int tmp = (unsigned int)tree_type; NUMARG(tmp, "UTS_TREE_TYPE"); if (tmp <= BALANCED) { tree_type = (tree_t)tmp; } else { fprintf(stderr, "invalid tree type\n"); return EXIT_FAILURE; } tmp = (unsigned int)shape_fn; NUMARG(tmp, "UTS_SHAPE_FN"); if (tmp <= FIXED) { shape_fn = (shape_t)tmp; } else { fprintf(stderr, "invalid shape function\n"); return EXIT_FAILURE; } } DBLARG(bf_0, "UTS_BF_0"); NUMARG(root_seed, "UTS_ROOT_SEED"); NUMARG(tree_depth, "UTS_TREE_DEPTH"); DBLARG(non_leaf_prob, "UTS_NON_LEAF_PROB"); NUMARG(non_leaf_bf, "UTS_NON_LEAF_NUM"); NUMARG(shift_depth, "UTS_SHIFT_DEPTH"); NUMARG(num_samples, "UTS_NUM_SAMPLES"); #pragma omp parallel #pragma omp single #ifdef PRINT_STATS print_stats(); #else print_banner(); #endif timer = qtimer_create(); qtimer_start(timer); node_t root; root.height = 0; rng_init(root.state.state, root_seed); root.num_children = calc_num_children(&root); nodecount = 1; long retval; #pragma omp parallel #pragma omp single nowait #pragma omp task untied retval = visit(&root, root.num_children); total_num_nodes = retval; qtimer_stop(timer); total_time = qtimer_secs(timer); qtimer_destroy(timer); #ifdef PRINT_STATS printf("tree-size %lu\ntree-depth %d\nnum-leaves %llu\nperc-leaves %.2f\n", (unsigned long)total_num_nodes, (int)tree_height, (unsigned long long)num_leaves, num_leaves / (float)total_num_nodes * 100.0); printf("exec-time %.3f\ntotal-perf %.0f\npu-perf %.0f\n\n", total_time, total_num_nodes / total_time, total_num_nodes / total_time / omp_get_num_threads()); #else printf("Tree size = %lu, tree depth = %d, num leaves = %llu (%.2f%%)\n", (unsigned long)total_num_nodes, (int)tree_height, (unsigned long long)num_leaves, num_leaves / (float)total_num_nodes * 100.0); printf("Wallclock time = %.3f sec, performance = %.0f " "nodes/sec (%.0f nodes/sec per PE)\n\n", total_time, total_num_nodes / total_time, 
total_num_nodes / total_time / omp_get_num_threads()); #endif /* ifdef PRINT_STATS */ return 0; }
/*
 * Finds every edge of maximum weight in graph G and returns them through
 * maxIntWtListPtr / maxIntWtListSizePtr (the previous *maxIntWtListPtr is
 * freed and replaced). Returns the elapsed wall-clock time in seconds.
 *
 * Parallel strategy (when _OPENMP): each thread scans its share of the
 * vertices, keeping a private candidate list of edges at its private
 * running maximum; thread 0 then reduces the per-thread maxima, threads
 * whose maximum lost drop their candidates, and the surviving lists are
 * concatenated into the output array. Shared buffers (local_max, p_start,
 * p_end, maxIntWtList) are allocated by thread 0 and published across the
 * "#pragma omp barrier" calls — the barrier placement is load-bearing.
 *
 * NOTE(review): each thread's pList holds at most 1000 candidate edges and
 * pCount is never bounds-checked — a graph with >1000 max-weight edges in
 * one thread's share would overflow it. Confirm upstream guarantees.
 */
double getStartLists(graph* G, edge** maxIntWtListPtr, INT_T* maxIntWtListSizePtr)
{
    LONG_T *local_max, maxWeight;
    edge *maxIntWtList;
    LONG_T maxIntWtListSize;
    LONG_T *p_start, *p_end;
    double elapsed_time;

    elapsed_time = get_seconds();

#ifdef _OPENMP
    omp_set_num_threads(NUM_THREADS);
#pragma omp parallel
    {
#endif
        LONG_T i, j, n;
        edge* pList;
        LONG_T pCount, tmpListSize;
        int tid, nthreads;
#ifdef DIAGNOSTIC
        double elapsed_time_part;
#endif

#ifdef _OPENMP
        tid = omp_get_thread_num();
        nthreads = omp_get_num_threads();
#else
        tid = 0;
        nthreads = 1;
#endif

        n = G->n;

        /* Determine the maximum edge weight */
        if (tid == 0) {
            local_max = (LONG_T *) malloc(nthreads*sizeof(LONG_T));
        }

        /* Allocate memory for partial edge list on each thread */
        tmpListSize = 1000;
        pList = (edge *) malloc(tmpListSize*sizeof(edge));
        pCount = 0;

#ifdef _OPENMP
#pragma omp barrier
#endif

        local_max[tid] = -1;

#ifdef DIAGNOSTIC
        if (tid == 0) {
            elapsed_time_part = get_seconds();
        }
#endif

#ifdef _OPENMP
#pragma omp for
#endif
        // Scan this thread's vertices; a strictly larger weight resets the
        // candidate list, an equal weight appends to it.
        for (i=0; i<n; i++) {
            for (j=G->numEdges[i]; j<G->numEdges[i+1]; j++) {
                if (G->weight[j] > local_max[tid]) {
                    local_max[tid] = G->weight[j];
                    pCount = 0;
                    pList[pCount].startVertex = i;
                    pList[pCount].endVertex = G->endV[j];
                    pList[pCount].w = local_max[tid];
                    pList[pCount].e = j;
                    pCount++;
                } else if (G->weight[j] == local_max[tid]) {
                    pList[pCount].startVertex = i;
                    pList[pCount].endVertex = G->endV[j];
                    pList[pCount].w = local_max[tid];
                    pList[pCount].e = j;
                    pCount++;
                }
            }
        }

#ifdef _OPENMP
#pragma omp barrier
#endif

        // Thread 0 reduces the per-thread maxima to the global maximum.
        if (tid == 0) {
#ifdef DIAGNOSTIC
            if (tid == 0) {
                elapsed_time_part = get_seconds() - elapsed_time_part;
                fprintf(stderr, "Max. weight computation time: %lf seconds\n", elapsed_time_part);
            }
#endif
            maxWeight = local_max[0];
            for (i=1; i<nthreads; i++) {
                if (local_max[i] > maxWeight)
                    maxWeight = local_max[i];
            }
            // free(local_max);
        }

#ifdef _OPENMP
#pragma omp barrier
#endif

        // Threads whose private maximum lost discard their candidates.
        if (local_max[tid] != maxWeight) {
            pCount = 0;
        }

        /* Merge all the partial edge lists */
        if (tid == 0) {
            p_start = (LONG_T *) malloc(nthreads*sizeof(LONG_T));
            p_end = (LONG_T *) malloc(nthreads*sizeof(LONG_T));
        }

#ifdef _OPENMP
#pragma omp barrier
#endif

        p_end[tid] = pCount;
        p_start[tid] = 0;

#ifdef _OPENMP
#pragma omp barrier
#endif

        // Prefix-sum the per-thread counts into [p_start, p_end) output
        // ranges and allocate the merged result array.
        if (tid == 0) {
            for (i=1; i<nthreads; i++) {
                p_end[i] = p_end[i-1] + p_end[i];
                p_start[i] = p_end[i-1];
            }
            maxIntWtListSize = p_end[nthreads-1];
            free(*maxIntWtListPtr);
            maxIntWtList = (edge *) malloc((maxIntWtListSize)*sizeof(edge));
        }

#ifdef _OPENMP
#pragma omp barrier
#endif

        // Each thread copies its surviving candidates into its slot.
        for (j=p_start[tid]; j<p_end[tid]; j++) {
            (maxIntWtList[j]).startVertex = pList[j-p_start[tid]].startVertex;
            (maxIntWtList[j]).endVertex = pList[j-p_start[tid]].endVertex;
            (maxIntWtList[j]).e = pList[j-p_start[tid]].e;
            (maxIntWtList[j]).w = pList[j-p_start[tid]].w;
        }

#ifdef _OPENMP
#pragma omp barrier
#endif

        free(pList);

        if (tid == 0) {
            free(local_max);
            free(p_start);
            free(p_end);
            *maxIntWtListPtr = maxIntWtList;
            *maxIntWtListSizePtr = maxIntWtListSize;
        }

#ifdef _OPENMP
    }
#endif

    /* Verification */
#if 0
    maxIntWtList = *maxIntWtListPtr;
    for (int i=0; i<*maxIntWtListSizePtr; i++) {
        fprintf(stderr, "[%ld %ld %ld %ld] ", maxIntWtList[i].startVertex,
                maxIntWtList[i].endVertex, maxIntWtList[i].e, maxIntWtList[i].w);
    }
#endif

    elapsed_time = get_seconds() - elapsed_time;
    return elapsed_time;
}
/** Compute a kernel density estimate of Pdata over the histogram dimension
  * Xdim, writing the normalized density into Out (sized/zeroed here).
  * \param Out        Output set; resized to Xdim.Bins() and overwritten.
  * \param Pdata      Input 1D data values.
  * \param Increments Per-frame weights; their sum is used for normalization.
  * \param Xdim       Histogram dimension (bin count and coordinates).
  * \param bandwidth  Kernel bandwidth.
  * \return 0 on completion.
  *
  * OpenMP builds run the frame loop over at most numthreads_ threads, give
  * each thread a private histogram to avoid races, and merge the histograms
  * afterwards; serial builds accumulate directly into Out.
  */
int KDE::CalcKDE(DataSet_double& Out, DataSet_1D const& Pdata,
                 std::vector<double> const& Increments,
                 HistBin const& Xdim, double bandwidth) const
{
  int inSize = (int)Pdata.Size();
  // Allocate output set, set all to zero.
  Out.Zero( Xdim.Bins() );
  Out.SetDim( Dimension::X, Xdim );
  int outSize = (int)Out.Size();
  int frame, bin;
  double increment, val;
  double total = 0.0;
# ifdef _OPENMP
  // Record the ambient team size (queried inside a throw-away parallel
  // region, where omp_get_num_threads() is meaningful) so it can be
  // restored after this call.
  int original_num_threads;
# pragma omp parallel
  {
#   pragma omp master
    {
      original_num_threads = omp_get_num_threads();
    }
  }
  // Ensure we only execute with the desired number of threads
  if (numthreads_ < original_num_threads)
    omp_set_num_threads( numthreads_ );
# endif
  // Calculate KDE, loop over input data
# ifdef _OPENMP
  int mythread;
  double **P_thread;
# pragma omp parallel private(frame, bin, val, increment, mythread) reduction(+:total)
  {
    mythread = omp_get_thread_num();
    // Prevent race conditions by giving each thread its own histogram;
    // master allocates, the barrier publishes before any thread writes.
#   pragma omp master
    {
      P_thread = new double*[ numthreads_ ];
      for (int nt = 0; nt < numthreads_; nt++) {
        P_thread[nt] = new double[ outSize ];
        std::fill(P_thread[nt], P_thread[nt] + outSize, 0.0);
      }
    }
#   pragma omp barrier
#   pragma omp for
# endif
    for (frame = 0; frame < inSize; frame++) {
      val = Pdata.Dval(frame);
      increment = Increments[frame];
      total += increment;
      // Apply kernel across histogram
      for (bin = 0; bin < outSize; bin++)
# ifdef _OPENMP
        P_thread[mythread][bin] +=
# else
        Out[bin] +=
# endif
          (increment * (this->*Kernel_)( (Xdim.Coord(bin) - val) / bandwidth ));
    }
# ifdef _OPENMP
  } // END parallel block
  // Combine results from each thread histogram into Out
  for (int i = 0; i < numthreads_; i++) {
    for (int j = 0; j < outSize; j++)
      Out[j] += P_thread[i][j];
    delete[] P_thread[i];
  }
  delete[] P_thread;
  // Restore original number of threads
  if (original_num_threads != numthreads_)
    omp_set_num_threads( original_num_threads );
# endif
  // Normalize by total weight times bandwidth.
  for (unsigned int j = 0; j < Out.Size(); j++)
    Out[j] /= (total * bandwidth);
  return 0;
}
// Shallow-water (CLAMR-style) driver, part 1: parse args, build mesh/state,
// distribute cell counts across MPI ranks, and gather initial data to rank 0.
int main(int argc, char **argv)
{
   // Process command-line arguments, if any.
   int mype=0;
   int numpe=0;
   parseInput(argc, argv);
   L7_Init(&mype, &numpe, &argc, argv, do_quo_setup, lttrace_on);

   struct timeval tstart_setup;
   cpu_timer_start(&tstart_setup);

   double circ_radius = 6.0;
   // Scale the circle appropriately for the mesh size.
   circ_radius = circ_radius * (double) nx / 128.0;
   int boundary = 1;
   int parallel_in = 1;

#ifdef _OPENMP
   int nt = 0;
   int tid = 0;
   // NOTE(review): these calls are outside any parallel region, so
   // omp_get_num_threads() returns 1 here — the printed thread count does not
   // reflect the worker team size. Confirm whether a parallel region was intended.
   nt = omp_get_num_threads();
   tid = omp_get_thread_num();
   if (0 == tid) {
      printf("--- num openmp threads: %d\n", nt);
      fflush(stdout);
   }
#endif

   mesh = new Mesh(nx, ny, levmx, ndim, boundary, parallel_in, do_gpu_calc);
   if (DEBUG) {
      //if (mype == 0) mesh->print();
      char filename[10];
      sprintf(filename,"out%1d",mype);
      mesh->fp=fopen(filename,"w");
      //mesh->print_local();
   }
   mesh->init(nx, ny, circ_radius, initial_order, do_gpu_calc);

   // Aliases into the mesh; updated in place below.
   size_t &ncells = mesh->ncells;
   size_t &ncells_global = mesh->ncells_global;
   int &noffset = mesh->noffset;

   state = new State(mesh);
   state->init(do_gpu_calc);

   vector<int>   &nsizes = mesh->nsizes;
   vector<int>   &ndispl = mesh->ndispl;
   vector<real_t> &x  = mesh->x;
   vector<real_t> &dx = mesh->dx;
   vector<real_t> &y  = mesh->y;
   vector<real_t> &dy = mesh->dy;

   nsizes.resize(numpe);
   ndispl.resize(numpe);

   // Every rank learns every rank's cell count; displacements are the prefix sum.
   int ncells_int = ncells;
   MPI_Allgather(&ncells_int, 1, MPI_INT, &nsizes[0], 1, MPI_INT, MPI_COMM_WORLD);
   ndispl[0]=0;
   for (int ip=1; ip<numpe; ip++){
      ndispl[ip] = ndispl[ip-1] + nsizes[ip-1];
   }
   noffset = ndispl[mype];

   state->resize(ncells);
   state->fill_circle(circ_radius, 100.0, 7.0);

   // Neighbor arrays are rebuilt later; coordinate vectors are cleared to
   // force regeneration.
   mesh->nlft = NULL;
   mesh->nrht = NULL;
   mesh->nbot = NULL;
   mesh->ntop = NULL;
   x.clear();
   dx.clear();
   y.clear();
   dy.clear();

   // Kahan-type enhanced precision sum implementation.
   double H_sum = state->mass_sum(enhanced_precision_sum);
   if (mype == 0) printf ("Mass of initialized cells equal to %14.12lg\n", H_sum);
   H_sum_initial = H_sum;

   double cpu_time_main_setup = cpu_timer_stop(tstart_setup);
   // NOTE(review): "setup time time was" has a duplicated word in the runtime
   // string; left unchanged here because tooling may match on it.
   state->parallel_timer_output(numpe,mype,"CPU: setup time time was",cpu_time_main_setup);

   long long mem_used = memstats_memused();
   if (mem_used > 0) {
      state->parallel_memory_output(numpe,mype,"Memory used in startup ",mem_used);
      state->parallel_memory_output(numpe,mype,"Memory peak in startup ",memstats_mempeak());
      state->parallel_memory_output(numpe,mype,"Memory free at startup ",memstats_memfree());
      state->parallel_memory_output(numpe,mype,"Memory available at startup ",memstats_memtotal());
   }

   if (mype == 0) {
      printf("Iteration   0 timestep      n/a Sim Time      0.0 cells %ld Mass Sum %14.12lg\n", ncells_global, H_sum);
   }

   // Performance counters, reset before the main loop.
   mesh->cpu_calc_neigh_counter=0;
   mesh->cpu_time_calc_neighbors=0.0;
   mesh->cpu_rezone_counter=0;
   mesh->cpu_refine_smooth_counter=0;

#ifdef HAVE_GRAPHICS
#ifdef HAVE_OPENGL
   set_mysize(ncells_global);
   //vector<real_t> H_global;
   //vector<real_t> x_global;
   //vector<real_t> dx_global;
   //vector<real_t> y_global;
   //vector<real_t> dy_global;
   //vector<int> proc_global;
   // Only rank 0 needs the gathered global arrays.
   if (mype == 0){
      H_global.resize(ncells_global);
      x_global.resize(ncells_global);
      dx_global.resize(ncells_global);
      y_global.resize(ncells_global);
      dy_global.resize(ncells_global);
      proc_global.resize(ncells_global);
   }
   MPI_Gatherv(&x[0],  nsizes[mype], MPI_C_REAL, &x_global[0],  &nsizes[0], &ndispl[0], MPI_C_REAL, 0, MPI_COMM_WORLD);
   MPI_Gatherv(&dx[0], nsizes[mype], MPI_C_REAL, &dx_global[0], &nsizes[0], &ndispl[0], MPI_C_REAL, 0, MPI_COMM_WORLD);
   MPI_Gatherv(&y[0],  nsizes[mype], MPI_C_REAL, &y_global[0],  &nsizes[0], &ndispl[0], MPI_C_REAL, 0, MPI_COMM_WORLD);
   MPI_Gatherv(&dy[0], nsizes[mype], MPI_C_REAL, &dy_global[0], &nsizes[0], &ndispl[0], MPI_C_REAL, 0, MPI_COMM_WORLD);
   MPI_Gatherv(&state->H[0], nsizes[mype], MPI_C_REAL, &H_global[0], &nsizes[0], &ndispl[0], MPI_C_REAL, 0, MPI_COMM_WORLD);
set_cell_data(&H_global[0]); set_cell_coordinates(&x_global[0], &dx_global[0], &y_global[0], &dy_global[0]); if (view_mode == 0) { mesh->proc.resize(ncells); for (size_t ii = 0; ii<ncells; ii++){ mesh->proc[ii] = mesh->mype; } MPI_Gatherv(&mesh->proc[0], nsizes[mype], MPI_INT, &proc_global[0], &nsizes[0], &ndispl[0], MPI_C_REAL, 0, MPI_COMM_WORLD); } set_cell_proc(&proc_global[0]); #endif #ifdef HAVE_MPE set_mysize(ncells); set_cell_data(&state->H[0]); set_cell_coordinates(&mesh->x[0], &mesh->dx[0], &mesh->y[0], &mesh->dy[0]); set_cell_proc(&mesh->proc[0]); #endif set_window(mesh->xmin, mesh->xmax, mesh->ymin, mesh->ymax); set_viewmode(view_mode); set_outline((int)outline); init_display(&argc, argv, "Shallow Water", mype); set_circle_radius(circle_radius); draw_scene(); if (verbose) sleep(5); sleep(2); // Set flag to show mesh results rather than domain decomposition. view_mode = 1; // Clear superposition of circle on grid output. circle_radius = -1.0; MPI_Barrier(MPI_COMM_WORLD); cpu_timer_start(&tstart); set_idle_function(&do_calc); start_main_loop(); #else MPI_Barrier(MPI_COMM_WORLD); cpu_timer_start(&tstart); for (int it = 0; it < 10000000; it++) { do_calc(); } #endif return 0; }
// Shortest-path driver: reads a graph and a set of source vertices, runs
// Moore's algorithm from each source in parallel, then reports timing and
// per-thread load-balance statistics.
int main(int argc, char **argv )
{
   /* This is the shortest path project for CPSC424/524.
      Author: Bo Song, Yale University
      Date: 4/25/2016
      Credits: This program is based on the description provided by Andrew Sherman */

   double wct0, wct1, total_time, cput;
   char* sourceFile, * graphFile;
   // NOTE(review): count[], loopCount[] and updateCount[] are hard-coded to 8
   // slots; running with more than 8 OpenMP threads overruns these arrays.
   int count[8];

   #pragma omp parallel
   printf("num of threads = %d\n", omp_get_num_threads());

   for(int i = 0; i < 8; i++) count[i] = 0;
   for(int i = 0; i < 8; i++) loopCount[i] = 0;
   for(int i = 0; i < 8; i++) updateCount[i] = 0;

   if(argc != 3) {
      printf("serial <graphfile> <sourcefile>\n");
      return -1;
   }
   graphFile = argv[1];
   sourceFile = argv[2];

   timing(&wct0, &cput);
   printf("reading graph...\n");
   readGraph(graphFile);
   printf("reading source...\n");
   readSource(sourceFile);
   // print_adj_list(adj_listhead, N);

   // One Moore run per source; schedule(static, 1) deals sources round-robin
   // to threads. count[] tracks how many sources each thread handled.
   #pragma omp parallel
   #pragma omp for schedule(static, 1)
   for(int i = 0; i < num_sources; i++) {
      count[omp_get_thread_num()]++;
      moore(sources[i]);
   }

   timing(&wct1, &cput); //get the end time
   total_time = wct1 - wct0;
   printf("Message printed by master: Total elapsed time is %f seconds.\n",total_time);

   // free resources: walk and free each adjacency list (vertices are 1-based).
   for(int i = 1; i <= N; i++) {
      adj_node* node = adj_listhead[i];
      while(node) {
         adj_node* next = node->next;
         free(node);
         node = next;
      }
   }

   printf("Load balance among threads: ");
   long long sumLoop = 0, sumUpdate = 0;
   for(int i = 0; i < 8; i++) {
      printf("%d ", count[i]);
      sumLoop += loopCount[i];
      sumUpdate += updateCount[i];
   }
   // Fraction of relaxation-loop iterations that actually updated a distance.
   printf("portion = %f", (float)sumUpdate / sumLoop);
   printf("\n");
   free(sources);
}
//------------------------------------------------------------------------------------------------------------------------------
// Multigrid benchmark driver, part 1: discover the OpenMP configuration,
// initialize MPI with an appropriate threading level, and validate the
// command-line problem-size arguments.
int main(int argc, char **argv){
  int my_rank=0;
  int num_tasks=1;
  int OMP_Threads = 1;
  int OMP_Nested = 0;

  #ifdef _OPENMP
  // Query the real team size from inside a parallel region.
  #pragma omp parallel
  {
    #pragma omp master
    {
      OMP_Threads = omp_get_num_threads();
      OMP_Nested  = omp_get_nested();
    }
  }
  #endif

  #ifdef USE_MPI
  int actual_threading_model = -1;
  int requested_threading_model = -1;
  requested_threading_model = MPI_THREAD_SINGLE;
  //requested_threading_model = MPI_THREAD_FUNNELED;
  //requested_threading_model = MPI_THREAD_SERIALIZED;
  //requested_threading_model = MPI_THREAD_MULTIPLE;
  //MPI_Init(&argc, &argv);
  #ifdef _OPENMP
  // With OpenMP enabled, only the master thread makes MPI calls -> FUNNELED.
  requested_threading_model = MPI_THREAD_FUNNELED;
  //requested_threading_model = MPI_THREAD_SERIALIZED;
  //requested_threading_model = MPI_THREAD_MULTIPLE;
  //MPI_Init_thread(&argc, &argv, requested_threading_model, &actual_threading_model);
  #endif
  MPI_Init_thread(&argc, &argv, requested_threading_model, &actual_threading_model);
  MPI_Comm_size(MPI_COMM_WORLD, &num_tasks);
  MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
  //if(actual_threading_model>requested_threading_model)actual_threading_model=requested_threading_model;
  if(my_rank==0){
       // NOTE(review): MPI_THREAD_MULTIPLE is tested twice in this chain; the
       // second test is unreachable dead code (harmless, kept as-is).
       if(requested_threading_model == MPI_THREAD_MULTIPLE  )printf("Requested MPI_THREAD_MULTIPLE, ");
  else if(requested_threading_model == MPI_THREAD_SINGLE    )printf("Requested MPI_THREAD_SINGLE, ");
  else if(requested_threading_model == MPI_THREAD_FUNNELED  )printf("Requested MPI_THREAD_FUNNELED, ");
  else if(requested_threading_model == MPI_THREAD_SERIALIZED)printf("Requested MPI_THREAD_SERIALIZED, ");
  else if(requested_threading_model == MPI_THREAD_MULTIPLE  )printf("Requested MPI_THREAD_MULTIPLE, ");
  else                                                       printf("Requested Unknown MPI Threading Model (%d), ",requested_threading_model);
       if(actual_threading_model == MPI_THREAD_MULTIPLE  )printf("got MPI_THREAD_MULTIPLE\n");
  else if(actual_threading_model == MPI_THREAD_SINGLE    )printf("got MPI_THREAD_SINGLE\n");
  else if(actual_threading_model == MPI_THREAD_FUNNELED  )printf("got MPI_THREAD_FUNNELED\n");
  else if(actual_threading_model == MPI_THREAD_SERIALIZED)printf("got MPI_THREAD_SERIALIZED\n");
  else if(actual_threading_model == MPI_THREAD_MULTIPLE  )printf("got MPI_THREAD_MULTIPLE\n");
  else                                                    printf("got Unknown MPI Threading Model (%d)\n",actual_threading_model);
  }
  #ifdef USE_HPM // IBM HPM counters for BGQ...
  HPM_Init();
  #endif
  #endif // USE_MPI

  // Parse and validate the two required problem-size arguments.
  int log2_box_dim = 6;
  int target_boxes_per_rank = 1;
  if(argc==3){
    log2_box_dim=atoi(argv[1]);
    target_boxes_per_rank=atoi(argv[2]);
  }else{
    if(my_rank==0){printf("usage: ./a.out  [log2_box_dim]  [target_boxes_per_rank]\n");}
    #ifdef USE_MPI
    MPI_Finalize();
    #endif
    exit(0);
  }

  if(log2_box_dim<4){
    if(my_rank==0){printf("log2_box_dim must be at least 4\n");}
    #ifdef USE_MPI
    MPI_Finalize();
    #endif
    exit(0);
  }

  if(target_boxes_per_rank<1){
    if(my_rank==0){printf("target_boxes_per_rank must be at least 1\n");}
    #ifdef USE_MPI
    MPI_Finalize();
    #endif
    exit(0);
  }

  if(my_rank==0){
    if(OMP_Nested)fprintf(stdout,"%d MPI Tasks of %d threads (OMP_NESTED=TRUE)\n\n" ,num_tasks,OMP_Threads);
             else fprintf(stdout,"%d MPI Tasks of %d threads (OMP_NESTED=FALSE)\n\n",num_tasks,OMP_Threads);
  }

  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  // calculate the problem size...
  #ifndef MAX_COARSE_DIM
  #define MAX_COARSE_DIM 11
  #endif
  // Driver part 2: pick the largest cubic box decomposition whose fully
  // coarsened grid still fits MAX_COARSE_DIM, build the fine level and the
  // multigrid hierarchy, then run warm-up and timed solve passes.
  int64_t box_dim=1<<log2_box_dim;
  int64_t target_boxes = (int64_t)target_boxes_per_rank*(int64_t)num_tasks;
  int64_t boxes_in_i = -1;
  int64_t bi;
  for(bi=1;bi<1000;bi++){ // all possible problem sizes
    int64_t total_boxes = bi*bi*bi;
    if(total_boxes<=target_boxes){
      // Strip factors of two: the residual odd dimension is what the coarsest
      // grid would be, and it must not exceed MAX_COARSE_DIM.
      int64_t coarse_grid_dim = box_dim*bi;
      while( (coarse_grid_dim%2) == 0){coarse_grid_dim=coarse_grid_dim/2;}
      if(coarse_grid_dim<=MAX_COARSE_DIM){
        boxes_in_i = bi;
      }
    }
  }
  if(boxes_in_i<1){
    if(my_rank==0){printf("failed to find an acceptable problem size\n");}
    #ifdef USE_MPI
    MPI_Finalize();
    #endif
    exit(0);
  }

  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  // create the fine level...
  #ifdef USE_PERIODIC_BC
  int bc = BC_PERIODIC;
  #else
  int bc = BC_DIRICHLET;
  #endif
  level_type fine_grid;
  int ghosts=stencil_get_radius();
  create_level(&fine_grid,boxes_in_i,box_dim,ghosts,VECTORS_RESERVED,bc,my_rank,num_tasks);
  //create_level(&fine_grid,boxes_in_i,box_dim,ghosts,VECTORS_RESERVED,BC_PERIODIC ,my_rank,num_tasks);double h0=1.0/( (double)boxes_in_i*(double)box_dim );double a=2.0;double b=1.0; // Helmholtz w/Periodic
  //create_level(&fine_grid,boxes_in_i,box_dim,ghosts,VECTORS_RESERVED,BC_PERIODIC ,my_rank,num_tasks);double h0=1.0/( (double)boxes_in_i*(double)box_dim );double a=0.0;double b=1.0; // Poisson w/Periodic
  //create_level(&fine_grid,boxes_in_i,box_dim,ghosts,VECTORS_RESERVED,BC_DIRICHLET,my_rank,num_tasks);double h0=1.0/( (double)boxes_in_i*(double)box_dim );double a=2.0;double b=1.0; // Helmholtz w/Dirichlet
  //create_level(&fine_grid,boxes_in_i,box_dim,ghosts,VECTORS_RESERVED,BC_DIRICHLET,my_rank,num_tasks);double h0=1.0/( (double)boxes_in_i*(double)box_dim );double a=0.0;double b=1.0; // Poisson w/Dirichlet
  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  // Operator is a*u - b*div(grad(u)): a=2 selects Helmholtz, a=0 Poisson.
  #ifdef USE_HELMHOLTZ
  double a=2.0;double b=1.0; // Helmholtz
  if(my_rank==0)fprintf(stdout," Creating Helmholtz (a=%f, b=%f) test problem\n",a,b);
  #else
  double a=0.0;double b=1.0; // Poisson
  if(my_rank==0)fprintf(stdout," Creating Poisson (a=%f, b=%f) test problem\n",a,b);
  #endif

  double h0=1.0/( (double)boxes_in_i*(double)box_dim );
  initialize_problem(&fine_grid,h0,a,b);
  rebuild_operator(&fine_grid,NULL,a,b); // i.e. calculate Dinv and lambda_max
  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  mg_type all_grids;
  int minCoarseDim = 1;
  MGBuild(&all_grids,&fine_grid,a,b,minCoarseDim); // build the Multigrid Hierarchy
  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  int   doTiming;
  int    minSolves = 10; // do at least minSolves MGSolves
  double timePerSolve = 0;
  for(doTiming=0;doTiming<=1;doTiming++){ // first pass warms up, second pass times
  #ifdef USE_HPM // IBM performance counters for BGQ...
  if(doTiming)HPM_Start("FMGSolve()");
  #endif

  #ifdef USE_MPI
  double minTime   = 20.0; // minimum time in seconds that the benchmark should run
  double startTime = MPI_Wtime();
  if(doTiming==1){
    // if one needs to do more than minSolves to run for minTime, change minSolves
    if((minTime/timePerSolve)>minSolves)minSolves=(minTime/timePerSolve);
  }
  #endif

  if(my_rank==0){
    if(doTiming==0){fprintf(stdout,"\n\n===== warming up by running %d solves ===============================\n",minSolves);}
              else{fprintf(stdout,"\n\n===== running %d solves =============================================\n",minSolves);}
    fflush(stdout);
  }

  int numSolves = 0; // solves completed
  MGResetTimers(&all_grids);
  while( (numSolves<minSolves) ){
    // Each solve starts from a zeroed solution vector.
    zero_vector(all_grids.levels[0],VECTOR_U);
    #ifdef USE_FCYCLES
    FMGSolve(&all_grids,VECTOR_U,VECTOR_F,a,b,1e-15);
    #else
     MGSolve(&all_grids,VECTOR_U,VECTOR_F,a,b,1e-15);
    #endif
    numSolves++;
  }

  #ifdef USE_MPI
  if(doTiming==0){
    double endTime = MPI_Wtime();
    timePerSolve = (endTime-startTime)/numSolves;
    MPI_Bcast(&timePerSolve,1,MPI_DOUBLE,0,MPI_COMM_WORLD); // after warmup, process 0 broadcasts the average time per solve (consensus)
  }
  #endif

  #ifdef USE_HPM // IBM performance counters for BGQ...
  if(doTiming)HPM_Stop("FMGSolve()");
  #endif
  } // end of the doTiming warm-up/timed loop

  MGPrintTiming(&all_grids); // don't include the error check in the timing results

  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  // Compare the computed solution against the known analytic truth vector.
  if(my_rank==0){fprintf(stdout,"calculating error... ");}
  double fine_error = error(&fine_grid,VECTOR_U,VECTOR_UTRUE);
  if(my_rank==0){fprintf(stdout,"h = %22.15e  ||error|| = %22.15e\n\n",h0,fine_error);fflush(stdout);}

  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  // MGDestroy()
  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  #ifdef USE_MPI
  #ifdef USE_HPM // IBM performance counters for BGQ...
  HPM_Print();
  #endif
  MPI_Finalize();
  #endif
  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  return(0);
}
// Apply the Wilson-Dirac hopping operator: pV = D(pU, kappa=k) * pW.
//
// pU holds the four direction gauge fields back-to-back (x, y, z, t), each
// qcdNsite links long. Communication of the eight lattice boundary faces is
// overlapped with the interior computation: thread 0 posts all MPI receives
// and sends while every thread packs boundary buffers and applies the
// interior hopping terms. The `#pragma omp barrier` placement is load-bearing:
// each barrier guarantees a pack (or receive) is complete before the matching
// send (or boundary update) uses the buffer. Do not reorder.
void QCDDopr_Mult(QCDSpinor* pV,QCDMatrix* pU,QCDSpinor* pW,double k)
{
	MPI_Request reqSend[8];
	MPI_Request reqRecv[8];
	MPI_Status st;
	QCDMatrix* pUx;
	QCDMatrix* pUy;
	QCDMatrix* pUz;
	QCDMatrix* pUt;
	int i;

	// Same hopping parameter for all four directions.
	qcdtKappa[0] = k;
	qcdtKappa[1] = k;
	qcdtKappa[2] = k;
	qcdtKappa[3] = k;

	// Per-direction views into the packed gauge field.
	pUx = pU;
	pUy = pU + qcdNsite;
	pUz = pU + qcdNsite*2;
	pUt = pU + qcdNsite*3;

	/* #pragma omp parallel num_threads(8) */
#pragma omp parallel
	{
		int tid = 0,nid = 1;
		tid = omp_get_thread_num();
		nid = omp_get_num_threads();
		/* //debug */
		/* printf("nthreads: %d\n", nid); */
		/* printf("max_threads: %d\n", omp_get_max_threads()); */

		// Thread 0 posts all eight face receives up front.
		if(tid == 0){
			MPI_Irecv(qcdRecvBuf[QCD_TP],12*qcdNxyz,MPI_DOUBLE_PRECISION,qcdRankNeighbors[QCD_TP],QCD_TP,MPI_COMM_WORLD,&reqRecv[QCD_TP]);
			MPI_Irecv(qcdRecvBuf[QCD_TM],12*qcdNxyz,MPI_DOUBLE_PRECISION,qcdRankNeighbors[QCD_TM],QCD_TM,MPI_COMM_WORLD,&reqRecv[QCD_TM]);
			MPI_Irecv(qcdRecvBuf[QCD_XP],12*qcdNy*qcdNz*qcdNt,MPI_DOUBLE_PRECISION,qcdRankNeighbors[QCD_XP],QCD_XP,MPI_COMM_WORLD,&reqRecv[QCD_XP]);
			MPI_Irecv(qcdRecvBuf[QCD_XM],12*qcdNy*qcdNz*qcdNt,MPI_DOUBLE_PRECISION,qcdRankNeighbors[QCD_XM],QCD_XM,MPI_COMM_WORLD,&reqRecv[QCD_XM]);
			MPI_Irecv(qcdRecvBuf[QCD_YP],12*qcdNx*qcdNz*qcdNt,MPI_DOUBLE_PRECISION,qcdRankNeighbors[QCD_YP],QCD_YP,MPI_COMM_WORLD,&reqRecv[QCD_YP]);
			MPI_Irecv(qcdRecvBuf[QCD_YM],12*qcdNx*qcdNz*qcdNt,MPI_DOUBLE_PRECISION,qcdRankNeighbors[QCD_YM],QCD_YM,MPI_COMM_WORLD,&reqRecv[QCD_YM]);
			MPI_Irecv(qcdRecvBuf[QCD_ZP],12*qcdNx*qcdNy*qcdNt,MPI_DOUBLE_PRECISION,qcdRankNeighbors[QCD_ZP],QCD_ZP,MPI_COMM_WORLD,&reqRecv[QCD_ZP]);
			MPI_Irecv(qcdRecvBuf[QCD_ZM],12*qcdNx*qcdNy*qcdNt,MPI_DOUBLE_PRECISION,qcdRankNeighbors[QCD_ZM],QCD_ZM,MPI_COMM_WORLD,&reqRecv[QCD_ZM]);
		}

		//Send T: all threads pack, barrier, then thread 0 sends the buffer.
		QCDDopr_MakeTPB_dirac(qcdSendBuf[QCD_TP],pW,tid,nid);
#pragma omp barrier
		if(tid == 0){
			MPI_Isend(qcdSendBuf[QCD_TP],12*qcdNxyz,MPI_DOUBLE_PRECISION,qcdRankNeighbors[QCD_TM],QCD_TP,MPI_COMM_WORLD,&reqSend[QCD_TP]);
		}
		QCDDopr_MakeTMB_dirac(qcdSendBuf[QCD_TM],pUt + qcdNsite-qcdNxyz,pW + qcdNsite-qcdNxyz,tid,nid);
#pragma omp barrier
		if(tid == 0){
			MPI_Isend(qcdSendBuf[QCD_TM],12*qcdNxyz,MPI_DOUBLE_PRECISION,qcdRankNeighbors[QCD_TP],QCD_TM,MPI_COMM_WORLD,&reqSend[QCD_TM]);
		}

		//Send X
		QCDDopr_MakeXPB(qcdSendBuf[QCD_XP],pW,tid,nid);
#pragma omp barrier
		if(tid == 0){
			MPI_Isend(qcdSendBuf[QCD_XP],12*qcdNy*qcdNz*qcdNt,MPI_DOUBLE_PRECISION,qcdRankNeighbors[QCD_XM],QCD_XP,MPI_COMM_WORLD,&reqSend[QCD_XP]);
		}
		QCDDopr_MakeXMB(qcdSendBuf[QCD_XM],pUx + qcdNx-1,pW + qcdNx-1,tid,nid);
#pragma omp barrier
		if(tid == 0){
			MPI_Isend(qcdSendBuf[QCD_XM],12*qcdNy*qcdNz*qcdNt,MPI_DOUBLE_PRECISION,qcdRankNeighbors[QCD_XP],QCD_XM,MPI_COMM_WORLD,&reqSend[QCD_XM]);
		}

		//Send Y
		QCDDopr_MakeYPB(qcdSendBuf[QCD_YP],pW,tid,nid);
#pragma omp barrier
		if(tid == 0){
			MPI_Isend(qcdSendBuf[QCD_YP],12*qcdNx*qcdNz*qcdNt,MPI_DOUBLE_PRECISION,qcdRankNeighbors[QCD_YM],QCD_YP,MPI_COMM_WORLD,&reqSend[QCD_YP]);
		}
		QCDDopr_MakeYMB(qcdSendBuf[QCD_YM],pUy + qcdNxy-qcdNx,pW + qcdNxy-qcdNx,tid,nid);
#pragma omp barrier
		if(tid == 0){
			MPI_Isend(qcdSendBuf[QCD_YM],12*qcdNx*qcdNz*qcdNt,MPI_DOUBLE_PRECISION,qcdRankNeighbors[QCD_YP],QCD_YM,MPI_COMM_WORLD,&reqSend[QCD_YM]);
		}

		//Send Z
		QCDDopr_MakeZPB(qcdSendBuf[QCD_ZP],pW,tid,nid);
#pragma omp barrier
		if(tid == 0){
			MPI_Isend(qcdSendBuf[QCD_ZP],12*qcdNx*qcdNy*qcdNt,MPI_DOUBLE_PRECISION,qcdRankNeighbors[QCD_ZM],QCD_ZP,MPI_COMM_WORLD,&reqSend[QCD_ZP]);
		}
		QCDDopr_MakeZMB(qcdSendBuf[QCD_ZM],pUz + qcdNxyz-qcdNxy,pW + qcdNxyz-qcdNxy,tid,nid);
#pragma omp barrier
		if(tid == 0){
			MPI_Isend(qcdSendBuf[QCD_ZM],12*qcdNx*qcdNy*qcdNt,MPI_DOUBLE_PRECISION,qcdRankNeighbors[QCD_ZP],QCD_ZM,MPI_COMM_WORLD,&reqSend[QCD_ZM]);
		}

		// Interior: pV starts as a copy of pW (each thread copies its slice),
		// then the eight hopping terms are accumulated in turn.
		QCDLA_Equate(pV + tid*qcdNsite/nid,pW + tid*qcdNsite/nid, (tid+1)*qcdNsite/nid - tid*qcdNsite/nid);
#pragma omp barrier
		QCDDopr_TPin_dirac(pV,pUt,pW + qcdNxyz,tid,nid);
#pragma omp barrier
		QCDDopr_TMin_dirac(pV,pUt-qcdNxyz,pW - qcdNxyz,tid,nid);
#pragma omp barrier
		QCDDopr_XPin(pV,pUx,pW+1,tid,nid);
#pragma omp barrier
		QCDDopr_XMin(pV,pUx-1,pW-1,tid,nid);
#pragma omp barrier
		QCDDopr_YPin(pV,pUy,pW + qcdNx,tid,nid);
#pragma omp barrier
		QCDDopr_YMin(pV,pUy-qcdNx,pW - qcdNx,tid,nid);
#pragma omp barrier
		QCDDopr_ZPin(pV,pUz,pW + qcdNxy,tid,nid);
#pragma omp barrier
		QCDDopr_ZMin(pV,pUz-qcdNxy,pW - qcdNxy,tid,nid);

		// Boundary faces: wait for each receive, then every thread applies
		// that face's contribution. The barrier after each Wait publishes the
		// received buffer to all threads before it is read.
		if(tid == 0){
			MPI_Wait(&reqRecv[QCD_TP],&st);
		}
#pragma omp barrier
		QCDDopr_SetTPBnd_dirac(pV,pUt,qcdRecvBuf[QCD_TP],tid,nid);
		if(tid == 0){
			MPI_Wait(&reqRecv[QCD_TM],&st);
		}
#pragma omp barrier
		QCDDopr_SetTMBnd_dirac(pV,qcdRecvBuf[QCD_TM],tid,nid);
		if(tid == 0){
			MPI_Wait(&reqRecv[QCD_XP],&st);
		}
#pragma omp barrier
		QCDDopr_SetXPBnd(pV,pUx,qcdRecvBuf[QCD_XP],tid,nid);
		if(tid == 0){
			MPI_Wait(&reqRecv[QCD_XM],&st);
		}
#pragma omp barrier
		QCDDopr_SetXMBnd(pV,qcdRecvBuf[QCD_XM],tid,nid);
		if(tid == 0){
			MPI_Wait(&reqRecv[QCD_YP],&st);
		}
#pragma omp barrier
		QCDDopr_SetYPBnd(pV,pUy,qcdRecvBuf[QCD_YP],tid,nid);
		if(tid == 0){
			MPI_Wait(&reqRecv[QCD_YM],&st);
		}
#pragma omp barrier
		QCDDopr_SetYMBnd(pV,qcdRecvBuf[QCD_YM],tid,nid);
		if(tid == 0){
			MPI_Wait(&reqRecv[QCD_ZP],&st);
		}
#pragma omp barrier
		QCDDopr_SetZPBnd(pV,pUz,qcdRecvBuf[QCD_ZP],tid,nid);
		if(tid == 0){
			MPI_Wait(&reqRecv[QCD_ZM],&st);
		}
#pragma omp barrier
		QCDDopr_SetZMBnd(pV,qcdRecvBuf[QCD_ZM],tid,nid);

		// Drain all sends so the send buffers may be reused by the next call.
		if(tid == 0){
			MPI_Wait(&reqSend[QCD_TP],&st);
			MPI_Wait(&reqSend[QCD_TM],&st);
			MPI_Wait(&reqSend[QCD_XP],&st);
			MPI_Wait(&reqSend[QCD_XM],&st);
			MPI_Wait(&reqSend[QCD_YP],&st);
			MPI_Wait(&reqSend[QCD_YM],&st);
			MPI_Wait(&reqSend[QCD_ZP],&st);
			MPI_Wait(&reqSend[QCD_ZM],&st);
		}
#pragma omp barrier
	}
}
/* Main Program
 *
 * C-DAC demo: parallelization of a loop nest containing a recurrence.
 * The j loop carries a true dependence (each iteration reads the element
 * written by the previous one), so the independent i loop is parallelized
 * with `#pragma omp parallel for` instead. The parallel result is validated
 * against a serial recomputation.
 *
 * Fixes over the previous revision:
 *  - int return type on main (implicit int is invalid C99/C++).
 *  - total_threads initialized; it was read uninitialized when N == 1
 *    (the loops never execute, yet the value was printed).
 *  - each matrix row is freed before the row-pointer arrays; previously
 *    only the outer pointers were freed, leaking every row.
 */
int main(int argc,char **argv)
{
	double **InputMatrix, **CheckMatrix;
	int j, i, N, Noofthreads;
	int total_threads = 0; /* 0 until a parallel region records the team size */

	printf("\n\t\t---------------------------------------------------------------------------");
	printf("\n\t\t Centre for Development of Advanced Computing (C-DAC)");
	printf("\n\t\t Email : [email protected]");
	printf("\n\t\t---------------------------------------------------------------------------");
	printf("\n\t\t Objective : Parallization of a loop nest contating a recurrence relation.\n ");
	printf("\n\t\t Demonstrates the use of OpenMP Parallel for directive ");
	printf("\n\t\t..........................................................................\n");

	/* Checking for command line arguments */
	if( argc !=3 ){
		printf("\t\t Very Few Arguments\n ");
		printf("\t\t Syntax : exec <Threads> <matrix-size>\n");
		exit(-1);
	}
	Noofthreads=atoi(argv[1]);
	N=atoi(argv[2]);

	if ((Noofthreads!=1) && (Noofthreads!=2) && (Noofthreads!=4) && (Noofthreads!=8) && (Noofthreads!= 16) ) {
		printf("\n Number of threads should be 1,2,4,8 or 16 for the execution of program. \n\n");
		exit(-1);
	}

	/* printf("\n\t\t Enter the size of the Matrix\n");
	   scanf("%d", &N);*/

	/* Input Checking */
	if (N <= 0) {
		printf("\n\t\t Array Size Should Be Of Positive Sign \n");
		exit(1);
	}

	/* Dynamic Memory Allocation */
	InputMatrix = (double **) malloc(sizeof(double *) * N);
	CheckMatrix = (double **) malloc(sizeof(double *) * N);

	/* Initializing The Matrix Elements */
	for (i = 0; i < N; i++) {
		InputMatrix[i] = (double *) malloc(sizeof(double) * N);
		for (j = 0 ; j < N; j++)
			InputMatrix[i][j] = i + j;
	}

	/* CheckMatrix Is Also Same As Input Matrix */
	for (i = 0; i < N; i++) {
		CheckMatrix[i] = (double *) malloc(sizeof(double) * N);
		for (j = 0; j < N; j++)
			CheckMatrix[i][j] = InputMatrix[i][j];
	}

	/* set the number of threads */
	omp_set_num_threads(Noofthreads);

	/* Parallel computation: the j loop carries the recurrence, so each j step
	   runs the independent i loop in parallel. */
	struct timeval tv,tv1;
	gettimeofday(&tv,NULL);
	for (j = 1; j < N; j++)
		#pragma omp parallel for
		for (i = 1; i < N; i++) {
			if ( (omp_get_thread_num()) == 0) {
				total_threads=omp_get_num_threads();
			}
			InputMatrix[i][j] = InputMatrix[i][j] + InputMatrix[i][j - 1];
		}/* End of the parallel region */
	gettimeofday(&tv1,NULL);
	double t1=tv1.tv_sec-tv.tv_sec+(tv1.tv_usec-tv.tv_usec)*0.000001;

	/* For Validity Of Output: serial recomputation on CheckMatrix. */
	gettimeofday(&tv,NULL);
	for (j = 1; j < N; j++)
		for (i = 1; i < N; i++)
			CheckMatrix[i][j] = CheckMatrix[i][j] + CheckMatrix[i][j - 1];
	gettimeofday(&tv1,NULL);
	double t2=tv1.tv_sec-tv.tv_sec+(tv1.tv_usec-tv.tv_usec)*0.000001;

	/* Exact equality is valid here: both sides perform the identical sequence
	   of floating-point additions on identical data. */
	for (i = 0; i < N; i++)
		for (j = 0; j < N; j++)
			if (CheckMatrix[i][j] == InputMatrix[i][j]) {
				continue;
			} else {
				printf("\n\t\t The result of the serial and parallel calculation are not Equal \n");
				exit(1);
			}

	/* printf("\n The Output Matrix After Loop Nest Containing a Recurrence \n");
	for (i = 0; i < N; i++) {
		for (j = 0; j < N; j++)
			printf("%lf\t", InputMatrix[i][j]);
		printf("\n");
	}*/

	printf("\n\n\t\t Threads        : %d",total_threads);
	printf("\n\t\t Matrix Size    : %d ",N);
	printf("\n\n\t\t Serial And Parallel Calculation Are Same. \n");
	printf("\n\t\t paralle took %f serial took %f ",t1,t2);
	printf("\n\t\t..........................................................................\n");
	printf("\n");

	/* Freeing Of Allocated Memory: rows first, then the row-pointer arrays. */
	for (i = 0; i < N; i++) {
		free(InputMatrix[i]);
		free(CheckMatrix[i]);
	}
	free(InputMatrix);
	free(CheckMatrix);
	return 0;
}
/* Create an HPCC FFT plan for a 1-D transform of length n distributed over
 * the communicator comm.
 *
 * comm  : communicator over which the transform is distributed.
 * n     : global transform length.
 * dir   : FFTW_FORWARD or FFTW_BACKWARD; selects the timing table used.
 * flags : stored on the plan (not interpreted here).
 *
 * Returns the plan, or NULL if any workspace allocation failed (all
 * partially-allocated buffers are released first). A trailing warm-up call
 * to HPCC_pzfft1d with null in/out initializes the twiddle tables. */
hpcc_fftw_mpi_plan
HPCC_fftw_mpi_create_plan(MPI_Comm comm, s64Int_t n, fftw_direction dir, int flags)
{
  hpcc_fftw_mpi_plan p;
  fftw_complex *a = NULL, *b = NULL;
  int nxyz;
  int rank, size;

  MPI_Comm_size( comm, &size );
  MPI_Comm_rank( comm, &rank );

  p = (hpcc_fftw_mpi_plan)fftw_malloc( sizeof *p );
  if (! p) return p;

  nxyz = GetNXYZ( n, size );

  /* Twiddle-factor tables for the three pseudo-dimensions, plus scratch. */
  p->wx = (fftw_complex *)HPCC_fftw_malloc( (nxyz/2 + FFTE_NP) * (sizeof *p->wx) );
  p->wy = (fftw_complex *)HPCC_fftw_malloc( (nxyz/2 + FFTE_NP) * (sizeof *p->wy) );
  p->wz = (fftw_complex *)HPCC_fftw_malloc( (nxyz/2 + FFTE_NP) * (sizeof *p->wz) );
  p->work = (fftw_complex *)HPCC_fftw_malloc( n / size * 3 / 2 * (sizeof *p->work) );

  p->c_size = (nxyz+FFTE_NP) * (FFTE_NBLK + 1) + FFTE_NP;
#ifdef _OPENMP
  /* One cache-blocking buffer per OpenMP thread; the team size is queried
   * from inside a parallel region (outside it would always be 1). */
#pragma omp parallel
  {
#pragma omp single
    {
      int i;
      i = omp_get_num_threads();
      p->c = (fftw_complex *)HPCC_fftw_malloc( p->c_size * (sizeof *p->c) * i );
    }
  }
#else
  p->c = (fftw_complex *)HPCC_fftw_malloc( p->c_size * (sizeof *p->c) );
#endif

  /* If anything failed, free whatever succeeded (reverse order) and bail. */
  if (! p->wx || ! p->wy || ! p->wz || ! p->work || ! p->c) {
    if (p->c) HPCC_fftw_free( p->c );
    if (p->work) HPCC_fftw_free( p->work );
    if (p->wz) HPCC_fftw_free( p->wz );
    if (p->wy) HPCC_fftw_free( p->wy );
    if (p->wx) HPCC_fftw_free( p->wx );
    fftw_free( p );
    return NULL;
  }

  p->n = n;
  p->comm = comm;
  p->dir = dir;
  p->flags = flags;

  /* A complex value travels as two contiguous doubles. */
  MPI_Type_contiguous( 2, MPI_DOUBLE, &p->cmplx );
  MPI_Type_commit( &p->cmplx );

  if (FFTW_FORWARD == p->dir)
    p->timings = HPCC_fft_timings_forward;
  else
    p->timings = HPCC_fft_timings_backward;

  /* opt==0 warm-up pass; a and b are NULL by design here. */
  HPCC_pzfft1d( n, a, b, p->work, rank, size, 0, p );

  return p;
}
// Parse every FASTA record in fasta_filename and add each sequence's kmers
// to kcounter. Threads pull records from the shared reader; per-thread
// record counts feed the progress display and the final summary.
//
// Fixes over the previous revision:
//  - `end` was never assigned, so the reported elapsed time was garbage.
//  - `sum` could be printed uninitialized when monitoring was disabled.
//  - the final summary sat inside the parallel region and so was printed
//    once per thread; it is now printed once, after the region ends.
//  - record_counter was leaked.
void populate_kmer_counter_from_reads (KmerCounter& kcounter, string& fasta_filename)
{
    unsigned int kmer_length = kcounter.get_kmer_length(); // NOTE(review): unused; the length filter below uses the global KMER_SIZE — confirm they agree
    int i, myTid;
    unsigned long sum = 0, *record_counter = new unsigned long[omp_get_max_threads()];
    unsigned long start, end;

    // init record counter
    for (int i = 0; i < omp_get_max_threads(); i++) {
        record_counter[i] = 0;
    }

    cerr << "-storing Kmers..." << "\n";
    start = time(NULL);

    Fasta_reader fasta_reader(fasta_filename);
    unsigned int entry_num = 0;

    // NOTE(review): assumes Fasta_reader::hasNext()/getNext() are safe to call
    // concurrently from multiple threads — confirm.
    #pragma omp parallel private (myTid)
    {
        myTid = omp_get_thread_num();
        record_counter[myTid] = 0;

        while (fasta_reader.hasNext()) {
            Fasta_entry fe = fasta_reader.getNext();
            string accession = fe.get_accession();
            #pragma omp atomic
            entry_num++;
            record_counter[myTid]++;
            if (IRKE_COMMON::MONITOR >= 4) {
                cerr << "[" << entry_num << "] acc: " << accession << ", by thread no: " << myTid << "\n";
            }
            else if (IRKE_COMMON::MONITOR) {
                // Thread 0 periodically prints an approximate running total.
                if (myTid == 0 && record_counter[myTid] % 1000 == 0) {
                    sum = record_counter[0];
                    for (i=1; i<omp_get_num_threads(); i++) sum+= record_counter[i];
                    cerr << "\r [" << sum << "] sequences parsed. ";
                }
            }

            string seq = fe.get_sequence();
            // Too short to contain a (KMER_SIZE+1)-mer; skip.
            if (seq.length() < KMER_SIZE + 1) {
                continue;
            }
            kcounter.add_sequence(seq);
        }
    }

    // Summary: computed once after all threads finish, with a defined end
    // time and an exact total across every thread's counter.
    end = time(NULL);
    sum = 0;
    for (i = 0; i < omp_get_max_threads(); i++) {
        sum += record_counter[i];
    }
    delete[] record_counter;

    cerr << "\n" << " done parsing " << sum << " sequences, extracted "
         << kcounter.size() << " kmers, taking " << (end-start) << " seconds." << "\n";

    return;
}
/* OpenMP conformance test "omp_set_nested 002".
 *
 * From a single-threaded outer parallel region, spawn nested inner regions
 * with every team size 1..thds and verify (a) the reported team sizes and
 * thread ids, and (b) via buf[], that exactly the expected threads ran.
 * Prints SUCCESS/FAILED; returns 0 on success, 1 on failure.
 */
int
main ()
{
  int	thds, *buf;
  int	errors = 0;

  thds = omp_get_max_threads ();
  if (thds == 1) {
    printf ("should be run this program on multi thread.\n");
    exit (0);
  }
  /* buf has one slot per possible thread id, plus one guard slot that must
   * stay zero. */
  buf = (int *) malloc (sizeof(int) * (thds + 1));
  if (buf == NULL) {
    printf ("can not allocate memory.\n");
    exit (1);
  }
  /* Fixed team sizes (no dynamic adjustment) and nested parallelism on. */
  omp_set_dynamic (0);
  omp_set_nested (1);
  if (omp_get_nested () == 0) {
    printf ("nested parallelism is not implement.\n");
    goto END;
  }

  /* Outer region deliberately runs with a single thread. */
  omp_set_num_threads (1);
  #pragma omp parallel
  {
    int	i, j;

    if (omp_get_num_threads () != 1) {
      #pragma omp critical
      errors += 1;
    }
    /* Outer team has one thread, so no race on this unguarded increment. */
    if (omp_get_thread_num () != 0) {
      errors += 1;
    }

    for (i=1; i<=thds; i++) {
      memset (buf, 0, sizeof(int) * (thds+1));
      /* Nested region: request exactly i threads. */
      omp_set_num_threads (i);
      #pragma omp parallel
      {
	int	id = omp_get_thread_num ();

	if (omp_get_num_threads () != i) {
	  #pragma omp critical
	  errors += 1;
	}
	buf[id] += 1;
      }

      /* Each of the i expected ids must have run exactly once... */
      for (j=0; j<i; j++) {
	if (buf[j] != 1) {
	  #pragma omp critical
	  errors += 1;
	}
      }
      /* ...and no thread beyond id i-1 may have run. */
      for (j=i; j<=thds; j++) {
	if (buf[j] != 0) {
	  #pragma omp critical
	  errors += 1;
	}
      }
    }
  }

 END:
  if (errors == 0) {
    printf ("omp_set_nested 002 : SUCCESS\n");
    return 0;
  } else {
    printf ("omp_set_nested 002 : FAILED\n");
    return 1;
  }
}
/*
 * Benchmark driver for the PLQCD hopping-matrix (Wilson-Dirac) kernels.
 *
 * Usage: test_hopping input_file_name Nmul
 * Initializes PLQCD from the input file, fills random spinors and gauge
 * links, then times several kernel variants (assembly, SSE3 intrinsics,
 * AVX, MIC) selected by compile-time macros.  Each timing loop repeats
 * batches of Nmul eo/oe applications until ~30s of local work, then
 * MPI-reduces the time over processes and writes MFlops/process to a
 * per-process output file.
 */
int main(int argc, char **argv)
{
   //initialize plqcd
   int init_status;

   if(argc < 3) {
      fprintf(stderr,"Error. Must pass the name of the input file and the number of multiplications to be performed \n");
      fprintf(stderr,"Usage: %s input_file_name Nmul\n",argv[0]);
      exit(1);
   }

   init_status = init_plqcd(argc,argv);

   if(init_status != 0)
      printf("Error initializing plqcd\n");

   int proc_id;
   int i,j,k,Nmul;
   proc_id = ipr(plqcd_g.cpr);   // rank in the 4-d process grid
   Nmul=atoi(argv[2]);           // multiplications per timing batch

#if 0
   //Intialize the ranlux random number generator
   start_ranlux(0,1);
#endif

   int NPROCS=plqcd_g.nprocs[0]*plqcd_g.nprocs[1]*plqcd_g.nprocs[2]*plqcd_g.nprocs[3];

   // per-process output file name encodes the grid and thread count
   char ofname[128];
   char buff[128];
   strcpy(ofname,"test_hopping_output.procgrid.");
   sprintf(buff,"%d-%d-%d-%d.nthreads.%d.proc.%d",plqcd_g.nprocs[0],plqcd_g.nprocs[1],plqcd_g.nprocs[2],plqcd_g.nprocs[3],plqcd_g.nthread,proc_id);
   strcat(ofname,buff);

   FILE *ofp;
   //FILE *ofp_source;
   //if(proc_id==0)
   //{
   //   ofp_source = fopen("test_rand_vals.out","w");
   //}

   if(proc_id==0)
   {
      ofp=fopen(ofname,"w");
      fprintf(ofp,"INPUT GLOBALS:\n");
      fprintf(ofp,"----------------\n");
      fprintf(ofp,"NPROC0 %d, NPROC1 %d, NPROC2 %d, NPROC3 %d, NTHREAD %d\n",plqcd_g.nprocs[0],plqcd_g.nprocs[1],plqcd_g.nprocs[2],plqcd_g.nprocs[3], plqcd_g.nthread);
      fprintf(ofp,"L0 %d, L1 %d, L2 %d, L3 %d\n\n",plqcd_g.latdims[0],plqcd_g.latdims[1],plqcd_g.latdims[2],plqcd_g.latdims[3]);
      //printf("sizeof(spinor) %ld, sizeof(halfspinor) %ld, sizeof(su3) %ld \n",sizeof(spinor),sizeof(halfspinor),sizeof(su3));
   }

   int nthr;
#ifdef _OPENMP
#pragma omp parallel
   {
      nthr=omp_get_num_threads();
      if(omp_get_thread_num() == 0)
         if(proc_id==0)
            fprintf(ofp,"Number of threads as returned by openmp %d\n",nthr);
   }
#endif

   /*****************************************************
    *Testing the Dirac operator interface
    ****************************************************/

   // aligned allocations of the scalar-layout fields
   spinor *pin= (spinor *) amalloc(plqcd_g.VOLUME*sizeof(spinor), plqcd_g.ALIGN);
   if(pin==NULL) {
      fprintf(stderr,"ERROR: insufficient memory for spinor pin.\n");
      exit(2);
   }

   spinor *pout= (spinor *) amalloc(plqcd_g.VOLUME*sizeof(spinor), plqcd_g.ALIGN);
   if(pout==NULL) {
      fprintf(stderr,"ERROR: insufficient memory for spinor pout.\n");
      exit(2);
   }

   su3 *ufield= (su3 *) amalloc(4*plqcd_g.VOLUME*sizeof(su3), plqcd_g.ALIGN);
   if(ufield==NULL) {
      fprintf(stderr,"ERROR: insufficient memory for gauge field ufield.\n");
      exit(2);
   }

   //256 arrays
#ifdef AVX
   // AVX layout packs two lattice sites per element (VOLUME/2 entries)
   spinor_256 *pin_256= (spinor_256 *) amalloc(plqcd_g.VOLUME/2*sizeof(spinor_256), plqcd_g.ALIGN);
   if(pin_256==NULL) {
      fprintf(stderr,"ERROR: insufficient memory for spinor pin_256.\n");
      exit(2);
   }

   spinor_256 *pout_256= (spinor_256 *) amalloc(plqcd_g.VOLUME/2*sizeof(spinor_256), plqcd_g.ALIGN);
   if(pout_256==NULL) {
      fprintf(stderr,"ERROR: insufficient memory for spinor pout_256.\n");
      exit(2);
   }

   su3_256 *ufield_256= (su3_256 *) amalloc(4*plqcd_g.VOLUME/2*sizeof(su3_256), plqcd_g.ALIGN);
   if(ufield_256==NULL) {
      fprintf(stderr,"ERROR: insufficient memory for gauge field ufield_256.\n");
      exit(2);
   }
#endif

   //512 arrays
#ifdef MIC
   // MIC layout packs four lattice sites per element (VOLUME/4 entries)
   spinor_512 *pin_512= (spinor_512 *) amalloc(plqcd_g.VOLUME/4*sizeof(spinor_512), plqcd_g.ALIGN);
   if(pin_512==NULL) {
      fprintf(stderr,"ERROR: insufficient memory for spinor pin_512.\n");
      exit(2);
   }

   spinor_512 *pout_512= (spinor_512 *) amalloc(plqcd_g.VOLUME/4*sizeof(spinor_512), plqcd_g.ALIGN);
   if(pout_512==NULL) {
      fprintf(stderr,"ERROR: insufficient memory for spinor pout_512.\n");
      exit(2);
   }

   su3_512 *ufield_512= (su3_512 *) amalloc(4*plqcd_g.VOLUME/4*sizeof(su3_512), plqcd_g.ALIGN);
   if(ufield_512==NULL) {
      fprintf(stderr,"ERROR: insufficient memory for gauge field ufield_512.\n");
      exit(2);
   }
#endif

   //intialize the random number generator by a seed equals to the process rank
   srand((unsigned int) proc_id);

   //Initialize the input spinor and gauge links to random numbers
   // NOTE(review): the two lines below duplicate the srand/comment pair
   // above -- harmless (same seed), but probably a copy/paste leftover.
   //intialize the random number generator by a seed equals to the process rank
   srand((unsigned int) proc_id);

   //Initialize the input spinor and gauge links to random numbers
   double ru[18];   // scratch: 9 complex entries of one su3 link
   double rs[24];   // scratch: 12 complex components of one spinor

   for(i=0; i<plqcd_g.VOLUME; i++)
   {
      //ranlxd(rs,24);
      for(j=0; j<24; j++)
      {
         rs[j]= rand() / (double)RAND_MAX;
         //fprintf(stderr,"rs[%d]=%lf\n",j,rs[j]);
      }

      pin[i].s0.c0=rs[0]+I*rs[1];
      pin[i].s0.c1=rs[2]+I*rs[3];
      pin[i].s0.c2=rs[4]+I*rs[5];
      pin[i].s1.c0=rs[6]+I*rs[7];
      pin[i].s1.c1=rs[8]+I*rs[9];
      pin[i].s1.c2=rs[10]+I*rs[11];
      pin[i].s2.c0=rs[12]+I*rs[13];
      pin[i].s2.c1=rs[14]+I*rs[15];
      pin[i].s2.c2=rs[16]+I*rs[17];
      pin[i].s3.c0=rs[18]+I*rs[19];
      pin[i].s3.c1=rs[20]+I*rs[21];
      pin[i].s3.c2=rs[22]+I*rs[23];

      //ranlxd(rs,24);
      for(j=0; j<24; j++)
         rs[j]= rand() / (double)RAND_MAX;

      pout[i].s0.c0=rs[0]+I*rs[1];
      pout[i].s0.c1=rs[2]+I*rs[3];
      pout[i].s0.c2=rs[4]+I*rs[5];
      pout[i].s1.c0=rs[6]+I*rs[7];
      pout[i].s1.c1=rs[8]+I*rs[9];
      pout[i].s1.c2=rs[10]+I*rs[11];
      pout[i].s2.c0=rs[12]+I*rs[13];
      pout[i].s2.c1=rs[14]+I*rs[15];
      pout[i].s2.c2=rs[16]+I*rs[17];
      pout[i].s3.c0=rs[18]+I*rs[19];
      pout[i].s3.c1=rs[20]+I*rs[21];
      pout[i].s3.c2=rs[22]+I*rs[23];

      // 4 gauge links per site
      for(j=0; j<4; j++)
      {
         //ranlxd(ru,18);
         for(k=0; k<18; k++)
         {
            ru[k]= rand() / (double)RAND_MAX;
            //fprintf(stderr,"ru[%d]=%lf\n",k,ru[k]);
         }

         ufield[4*i+j].c00=ru[0]+I*ru[1];
         ufield[4*i+j].c01=ru[2]+I*ru[3];
         ufield[4*i+j].c02=ru[4]+I*ru[5];
         ufield[4*i+j].c10=ru[6]+I*ru[7];
         ufield[4*i+j].c11=ru[8]+I*ru[9];
         ufield[4*i+j].c12=ru[10]+I*ru[11];
         ufield[4*i+j].c20=ru[12]+I*ru[13];
         ufield[4*i+j].c21=ru[14]+I*ru[15];
         ufield[4*i+j].c22=ru[16]+I*ru[17];
      }
   }

#ifdef AVX
   // pack pairs of sites into the 256-bit layout
   for(i=0; i<plqcd_g.VOLUME; i +=2)
   {
      for(j=0; j<4; j++)
         copy_su3_to_su3_256(ufield_256+4*i/2+j, ufield+4*i+j, ufield+4*(i+1)+j);

      copy_spinor_to_spinor_256(pin_256+i/2, pin+i, pin+i+1);
      copy_spinor_to_spinor_256(pout_256+i/2, pout+i, pout+i+1);
   }
#endif

#ifdef MIC
   // pack quadruples of sites into the 512-bit layout
   for(i=0; i<plqcd_g.VOLUME; i +=4)
   {
      for(j=0; j<4; j++)
         copy_su3_to_su3_512(ufield_512+4*i/4+j, ufield+4*i+j, ufield+4*(i+1)+j, ufield+4*(i+2)+j, ufield+4*(i+3)+j);

      copy_spinor_to_spinor_512(pin_512+i/4, pin+i, pin+i+1, pin+i+2, pin+i+3);
      copy_spinor_to_spinor_512(pout_512+i/4, pout+i, pout+i+1, pout+i+2, pout+i+3);
   }
#endif

   double total,t1=0.0,t2=0.0,mytotal;
   int matvecs;

#ifdef ASSYMBLY
   //---------------------------------------------
   //1: non-blocking assymbly/c version
   //---------------------------------------------
   matvecs=0;
   total=0.0;
   mytotal =0.0;

   // repeat Nmul-sized batches until ~30s of local kernel time
   while(mytotal < 30)
   {
      MPI_Barrier(MPI_COMM_WORLD);
      for(i=0; i<Nmul; i++)
      {
         t1=plqcd_hopping_matrix_eo_sse3_assymbly(pin,pout,ufield);
         t2=plqcd_hopping_matrix_oe_sse3_assymbly(pin,pout,ufield);
         mytotal += t1+t2;
      }
      matvecs += Nmul;
   }

   MPI_Reduce(&mytotal,&total,1,MPI_DOUBLE,MPI_SUM,0, MPI_COMM_WORLD);
   MPI_Bcast(&total,1,MPI_DOUBLE,0, MPI_COMM_WORLD);
   if (proc_id==0)
   {
      total /= (double)(NPROCS);   // average time per process
   }

   if(proc_id==0)
   {
      fprintf(ofp,"non-blocking assymbly/c version:\n");
      fprintf(ofp,"------------------------------------------\n");
      fprintf(ofp,"test_hopping\tmult\t%d\ttotal(sec)\t%lf\tMFlops/process\t%lf\n", matvecs,total,matvecs*plqcd_g.VOLUME/2.0*1200/total/1e+6);
   }
#endif

#ifdef SSE3_INTRIN
   //---------------------------------------------
   //1: non-blocking sse3 with intrinsics version
   //---------------------------------------------
   matvecs=0;
   total=0.0;
   mytotal =0.0;

   while(mytotal < 30)
   {
      MPI_Barrier(MPI_COMM_WORLD);
      for(i=0; i<Nmul; i++)
      {
         t1=plqcd_hopping_matrix_eo_sse3_intrin(pin,pout,ufield);
         t2=plqcd_hopping_matrix_oe_sse3_intrin(pin,pout,ufield);
         mytotal += t1+t2;
      }
      matvecs += Nmul;
   }

   MPI_Reduce(&mytotal,&total,1,MPI_DOUBLE,MPI_SUM,0, MPI_COMM_WORLD);
   MPI_Bcast(&total,1,MPI_DOUBLE,0, MPI_COMM_WORLD);
   if (proc_id==0)
   {
      total /= (double)(NPROCS);
   }

   if(proc_id==0)
   {
      fprintf(ofp,"non-blocking sse3 with intrinsics version:\n");
      fprintf(ofp,"------------------------------------------\n");
      fprintf(ofp,"test_hopping\tmult\t%d\ttotal(sec)\t%lf\tMFlops/process\t%lf\n", matvecs,total,matvecs*plqcd_g.VOLUME/2.0*1200/total/1e+6);
   }

   //---------------------------------------------
   //2: blocking sse3 with intrinsics version
   //---------------------------------------------
   matvecs=0;
   total=0.0;
   mytotal =0.0;

   while(mytotal < 30)
   {
      MPI_Barrier(MPI_COMM_WORLD);
      for(i=0; i<Nmul; i++)
      {
         t1=plqcd_hopping_matrix_eo_sse3_intrin_blocking(pin,pout,ufield);
         t2=plqcd_hopping_matrix_oe_sse3_intrin_blocking(pin,pout,ufield);
         mytotal += t1+t2;
      }
      matvecs += Nmul;
   }

   MPI_Reduce(&mytotal,&total,1,MPI_DOUBLE,MPI_SUM,0, MPI_COMM_WORLD);
   MPI_Bcast(&total,1,MPI_DOUBLE,0, MPI_COMM_WORLD);
   if (proc_id==0)
   {
      total /= (double)(NPROCS);
   }

   if(proc_id==0)
   {
      fprintf(ofp,"blocking sse3 with intrinsics version:\n");
      fprintf(ofp,"------------------------------------------\n");
      fprintf(ofp,"test_hopping\tmult\t%d\ttotal(sec)\t%lf\tMFlops/process\t%lf\n", matvecs,total,matvecs*plqcd_g.VOLUME/2.0*1200/total/1e+6);
   }
#endif

#ifdef AVX
   //---------------------------------------------
   //2: avx version
   //---------------------------------------------
   matvecs=0;
   total=0.0;
   mytotal =0.0;

   // warm-up call before timing
   t1=plqcd_hopping_matrix_eo_intrin_256(pin_256,pout_256,ufield_256);

   while(mytotal < 30)
   {
      MPI_Barrier(MPI_COMM_WORLD);
      for(i=0; i<Nmul; i++)
      {
         t1=plqcd_hopping_matrix_eo_intrin_256(pin_256,pout_256,ufield_256);
         t2=plqcd_hopping_matrix_oe_intrin_256(pin_256,pout_256,ufield_256);
         mytotal += t1+t2;
      }
      matvecs += Nmul;
   }

   MPI_Reduce(&mytotal,&total,1,MPI_DOUBLE,MPI_SUM,0, MPI_COMM_WORLD);
   MPI_Bcast(&total,1,MPI_DOUBLE,0, MPI_COMM_WORLD);
   if (proc_id==0)
   {
      total /= (double)(NPROCS);
   }

   if(proc_id==0)
   {
      fprintf(ofp,"avxversion:\n");
      fprintf(ofp,"------------------------------------------\n");
      fprintf(ofp,"test_hopping\tmult\t%d\ttotal(sec)\t%lf\tMFlops/process\t%lf\n", matvecs,total,matvecs*plqcd_g.VOLUME/2.0*1200/total/1e+6);
   }
#endif

#ifdef MIC
#ifdef TEST_HOPPING_MIC
   //---------------------------------------------
   //3: MIC version full su3 matrix
   //---------------------------------------------
   matvecs=0;
   total=0.0;
   mytotal =0.0;

   // warm-up call before timing
   t1=plqcd_hopping_matrix_eo_single_mic(pin_512,pout_512,ufield_512);

   while(mytotal < 30)
   {
      MPI_Barrier(MPI_COMM_WORLD);
      for(i=0; i<Nmul; i++)
      {
         //t1=plqcd_hopping_matrix_eo_intrin_512(pin_512,pout_512,ufield_512);
         //t2=plqcd_hopping_matrix_oe_intrin_512(pin_512,pout_512,ufield_512);
         // NOTE(review): both timed calls use the eo kernel (the oe call is
         // only present commented-out above) -- confirm this is intended.
         t1=plqcd_hopping_matrix_eo_single_mic(pin_512,pout_512,ufield_512);
         t2=plqcd_hopping_matrix_eo_single_mic(pin_512,pout_512,ufield_512);
         mytotal += t1+t2;
      }
      matvecs += 2*Nmul;
   }

   MPI_Reduce(&mytotal,&total,1,MPI_DOUBLE,MPI_SUM,0, MPI_COMM_WORLD);
   MPI_Bcast(&total,1,MPI_DOUBLE,0, MPI_COMM_WORLD);
   if (proc_id==0)
   {
      total /= (double)(NPROCS);
   }

   if(proc_id==0)
   {
      fprintf(ofp,"mic version, 3x3 links:\n");
      fprintf(ofp,"------------------------------------------\n");
      fprintf(ofp,"test_hopping\tmult\t%d\ttotal(sec)\t%lf\tMFlops/process\t%lf\n", matvecs,total,(double )matvecs*plqcd_g.VOLUME/2.0*1200/total/1e+6);
   }

   //---------------------------------------------
   //3: MIC version full reduced su3 storage
   //---------------------------------------------
   matvecs=0;
   total=0.0;
   mytotal =0.0;

   // warm-up call before timing
   t1=plqcd_hopping_matrix_eo_single_mic_short(pin_512,pout_512,ufield_512);

   while(mytotal < 30)
   {
      MPI_Barrier(MPI_COMM_WORLD);
      for(i=0; i<Nmul; i++)
      {
         //t1=plqcd_hopping_matrix_eo_intrin_512(pin_512,pout_512,ufield_512);
         //t2=plqcd_hopping_matrix_oe_intrin_512(pin_512,pout_512,ufield_512);
         t1=plqcd_hopping_matrix_eo_single_mic_short(pin_512,pout_512,ufield_512);
         t2=plqcd_hopping_matrix_eo_single_mic_short(pin_512,pout_512,ufield_512);
         mytotal += t1+t2;
      }
      matvecs += 2*Nmul;
   }

   MPI_Reduce(&mytotal,&total,1,MPI_DOUBLE,MPI_SUM,0, MPI_COMM_WORLD);
   MPI_Bcast(&total,1,MPI_DOUBLE,0, MPI_COMM_WORLD);
   if (proc_id==0)
   {
      total /= (double)(NPROCS);
   }

   if(proc_id==0)
   {
      fprintf(ofp,"mic version, 2x3 links:\n");
      fprintf(ofp,"------------------------------------------\n");
      fprintf(ofp,"test_hopping\tmult\t%d\ttotal(sec)\t%lf\tMFlops/process\t%lf\n", matvecs,total,(double )matvecs*plqcd_g.VOLUME/2.0*1200/total/1e+6);
   }
#endif

#ifdef TEST_SU3MUL_MIC
   // micro-benchmark: bare su3 * vector multiply, 4 links per packed site
   matvecs=0;
   total=0.0;
   mytotal =0.0;

   //while(mytotal < 10)
   //{
   MPI_Barrier(MPI_COMM_WORLD);
   for(i=0; i<Nmul; i++)
   {
      t1=stop_watch(0.0);
#ifdef _OPENMP
#pragma omp parallel
      {
#endif
         __m512d U[3][3], gin[3],gout[3];
         su3_512 *u0;
         su3_vector_512 *hin,*hout;
#ifdef _OPENMP
#pragma omp for
#endif
         for(j=0; j< plqcd_g.VOLUME/4; j++)
         {
            u0 = &ufield_512[4*j];
            hin = &pin_512[j].s0;
            hout= &pout_512[j].s0;

            intrin_su3_load_512(U,u0);
            intrin_vector_load_512(gin,hin);
            intrin_su3_multiply_512(gout,U,gin);
            intrin_vector_store_512(hout,gout);

            u0++;
            hin++;
            hout++;
            intrin_su3_load_512(U,u0);
            intrin_vector_load_512(gin,hin);
            intrin_su3_multiply_512(gout,U,gin);
            intrin_vector_store_512(hout,gout);

            u0++;
            hin++;
            hout++;
            intrin_su3_load_512(U,u0);
            intrin_vector_load_512(gin,hin);
            intrin_su3_multiply_512(gout,U,gin);
            intrin_vector_store_512(hout,gout);

            u0++;
            hin++;
            hout++;
            intrin_su3_load_512(U,u0);
            intrin_vector_load_512(gin,hin);
            intrin_su3_multiply_512(gout,U,gin);
            intrin_vector_store_512(hout,gout);
         }
#ifdef _OPENMP
      }
#endif
      t2 = stop_watch(t1);
      mytotal += t2;
   }
   matvecs += 4*Nmul*plqcd_g.VOLUME;
   //}

   MPI_Reduce(&mytotal,&total,1,MPI_DOUBLE,MPI_SUM,0, MPI_COMM_WORLD);
   MPI_Bcast(&total,1,MPI_DOUBLE,0, MPI_COMM_WORLD);
   if (proc_id==0)
   {
      total /= (double)(NPROCS);
   }

   if(proc_id==0)
   {
      fprintf(ofp,"su3mul mic version:\n");
      fprintf(ofp,"------------------------------------------\n");
      fprintf(ofp,"test_hopping\tmult\t%d\ttotal(sec)\t%lf\tMFlops/process\t%lf\n", matvecs,total,matvecs*66.0/total/1e+6);
   }
#endif
#endif //MIC

   finalize_plqcd();

   return 0;
}
/**
 * Local Matrix Multiply
 *
 * Computes C = alpha * A * B + beta * C -- the same contract as the BLAS
 * DGEMM routine for the no-transpose case.
 *
 * alpha, beta : double-precision scalars
 * A (m x k), B (k x n), C (m x n) : column-major double matrices;
 *   A and B are read-only, the result is accumulated into C.
 * lda, ldb, ldc : leading dimensions (first-dimension strides) of A, B, C.
 *
 * Backend is chosen at compile time: MKL's dgemm (USE_MKL), an OpenMP
 * version that splits rows across threads (USE_BLOCKING), or a plain
 * OpenMP parallel-for over columns (default).
 */
void local_mm(const int m, const int n, const int k, const double alpha,
              const double *A, const int lda, const double *B, const int ldb,
              const double beta, double *C, const int ldc) {

  int row, col;

  /* Leading dimensions must cover the logical matrix sizes. */
  assert(lda >= m);
  assert(ldb >= k);
  assert(ldc >= m);

#ifdef USE_MKL
  /* Delegate the whole product to MKL ('N' = no transpose). */
  const char N = 'N';
  dgemm(&N, &N, &m, &n, &k, &alpha, A, &lda, B, &ldb, &beta, C, &ldc);
#else
# ifdef USE_BLOCKING
  /* Each thread owns a contiguous band of rows and walks all columns. */
#pragma omp parallel private(col, row) shared(C)
  {
    int tid;
    int nthreads;

    tid = omp_get_thread_num();
    nthreads = omp_get_num_threads();

    if (tid == 1) {
      /* one-time debug trace of the row/column split */
      fprintf(stderr, "nthreads=%i, tid*n/nthreads=%i, tid*m/nthreads=%i\n", nthreads, tid*n/nthreads, tid*m/nthreads);
      fprintf(stderr, "(tid+1)*n/nthreads=%i, (tid+1)*m/nthreads=%i\n", (tid+1)*n/nthreads, (tid+1)*m/nthreads);
    }

    for (col = 0; col < n; col++) {
      /* This thread's row band; the last CPU may get fewer rows. */
      const int row_lo = tid * ((float)m/nthreads + 0.5);
      const int row_hi = MIN((tid+1) * ((float)m/nthreads + 0.5), m);

      for (row = row_lo; row < row_hi; row++) {
        int kk;
        double acc = 0.0; /* running dot product of A's row with B's column */

        for (kk = 0; kk < k; kk++) {
          acc += A[(kk * lda) + row] * B[(col * ldb) + kk];
        }

        const int c_index = (col * ldc) + row;
        C[c_index] = (alpha * acc) + (beta * C[c_index]);
      }
    }
  }
# else /* OPEN_MP */
  /* Columns of C are independent, so parallelize the outer loop. */
#pragma omp parallel for private(col, row)
  for (col = 0; col < n; col++) {
    for (row = 0; row < m; row++) {
      int kk;
      double acc = 0.0; /* running dot product of A's row with B's column */

      for (kk = 0; kk < k; kk++) {
        acc += A[(kk * lda) + row] * B[(col * ldb) + kk];
      }

      const int c_index = (col * ldc) + row;
      C[c_index] = (alpha * acc) + (beta * C[c_index]);
    }
  }
# endif /* USE_BLOCKING, OPEN_MP */
#endif /* USE_MKL */
}
/*
 * Heterogeneous CPU+GPU SGEMV benchmark instrumented with hardware
 * performance counters (core PMU + CBo uncore, via MSR files).
 *
 * argv: gpuOuter gpuInner cpuInner minRuntime Md Nd Kd Mh Nh Kh
 * Two OpenMP threads run concurrently after a rendezvous: thread 0 drives
 * the CUDA SGEMV path, thread 1 runs the CPU BLAS path while counters are
 * collected.  Results and timestamps are printed as one CSV line.
 *
 * Fixes vs. previous revision:
 *  - mojibake "&curren;"-decoded '¤tMachine' restored to '&currentMachine'
 *    in all counter-API calls;
 *  - the CPU calibration loop timed against 0.0 (absolute clock value)
 *    instead of a start timestamp; it now uses the same start/stop
 *    read_timer() pattern as the measured section below.
 */
int main(int argc, char *argv[]) {

  int i,j,k;
  machineInformation currentMachine;
  counterSessionInfo session;

  initializeCUDA();

  // Set machine information from CounterHomeBrew.h
  currentMachine.cpu_model = CPU_MODEL;
  currentMachine.num_sockets = NUM_SOCKETS;
  currentMachine.num_phys_cores_per_socket = NUM_PHYS_CORES_PER_SOCKET;
  currentMachine.num_cores_per_socket = NUM_CORES_PER_SOCKET;
  currentMachine.num_cores = NUM_CORES;
  currentMachine.num_cbos = NUM_PHYS_CORES_PER_SOCKET; // should multiply by NUM_SOCKETS???
  currentMachine.core_gen_counter_num_max = CORE_GEN_COUNTER_MAX;
  currentMachine.cbo_counter_num_max = CBO_COUNTER_NUM_MAX;

  // Set session events, umasks and counters used
  // int32 core_event_numbers[] = {FP_COMP_OPS_EXE_EVTNR,SIMD_FP_256_EVTNR,0x51,0xF1,0x80};
  // int32 core_umasks[] = {FP_COMP_OPS_EXE_SCALAR_DOUBLE_UMASK,SIMD_FP_256_PACKED_DOUBLE_UMASK,0x01, 0x07,0x01};
  session.core_gen_counter_num_used = 5;
  int32 core_event_numbers[] = {0x10,0x10,0x11,0x51,0xF1};
  int32 core_umasks[] = {0x20,0x40,0x01,0x01, 0x07};
  session.cbo_counter_num_used = 1;
  int32 cbo_event_numbers[] = {0x37};
  int32 cbo_umasks[] = {0xf};
  session.cbo_filter = 0x1f;

  for (i = 0; i < session.core_gen_counter_num_used; i++) {
    session.core_event_numbers[i] = core_event_numbers[i];
    session.core_umasks[i] = core_umasks[i];
  }
  for (i = 0; i < session.cbo_counter_num_used; i++) {
    session.cbo_event_numbers[i] = cbo_event_numbers[i];
    session.cbo_umasks[i] = cbo_umasks[i];
  }

  int fd[NUM_CORES];

  // Arrays to hold counter data...
  counterData before;
  counterData after;

  // some data for doing a naive matmul to test flop counting...
  // initloop(N);

  // M,N,K are multiples of the block size....
  int gpuOuter = atoi(argv[1]);
  int gpuInner = atoi(argv[2]);
  int cpuInner = atoi(argv[3]);
  double minRuntime = atoi(argv[4]);
  int Md = atoi(argv[5])*block_size;
  int Nd = atoi(argv[6])*block_size;
  int Kd = atoi(argv[7])*block_size;
  int Mh = atoi(argv[8]);
  int Nh = atoi(argv[9]);
  int Kh = atoi(argv[10]);

  char *ts1,*ts2,*ts3,*ts4;
  char *ts5,*ts6,*ts7,*ts8;
  double fineTimeStamps[8];
  double gTime = 0.0;
  double cTime = 0.0;
  double seconds = 0.0;
  int num_iters;

  uint64 *coreSums;
  coreSums = (uint64*)calloc(currentMachine.num_sockets*session.core_gen_counter_num_used,sizeof(uint64));
  uint64 *sums;
  sums = (uint64*)calloc(currentMachine.num_sockets*session.cbo_counter_num_used,sizeof(uint64));

  // Calibration: double num_iters until one timed batch takes >= minRuntime.
  float *Atmp = NULL;
  float *Btmp = NULL;
  float *Ctmp = NULL;
  Atmp = (float*) malloc( Mh * Nh * sizeof(float) );
  Btmp = (float*) malloc( Nh * sizeof(float) );
  Ctmp = (float*) malloc( Mh * sizeof(float) );
  randomInit(Atmp,Mh*Nh);
  randomInit(Btmp,Nh);
  for (num_iters = cpuInner; seconds < minRuntime; num_iters *=2) {
    seconds = read_timer();  // fixed: was 0.0, which measured the absolute clock
    for (i =0; i < num_iters; i++)
      BLASFUNC( CblasColMajor,CblasNoTrans,Mh,Nh, 1, Atmp,Mh, Btmp,1, 1, Ctmp,1 );
    seconds = read_timer()-seconds;
  }
  // num_iters /= 2;
  free(Atmp);
  free(Btmp);
  free(Ctmp);

  // NOTE(review): readyThreads is polled without atomics/flush -- relies on
  // the compiler re-reading the shared variable; confirm or add #pragma omp atomic read.
  int readyThreads = 0;

#pragma omp parallel
  {
    int threadNum = omp_get_thread_num();
    int numThreads = omp_get_num_threads();
    assert(numThreads==2);

    if (threadNum == 0) {
      // -------------------- GPU driver thread --------------------
      cudaError_t error;
      int memSizeA = sizeof(float)*Md*Nd;
      int memSizeB = sizeof(float)*Nd;
      int memSizeC = sizeof(float)*Md;

      float *Ahost,*Bhost,*Chost;
      // use pinned memory on the host for BW and asynch memory transfers..
      int flags = cudaHostAllocDefault;
      ts5 = getTimeStamp();
      fineTimeStamps[0] = read_timer();
      error = cudaHostAlloc((void**)&Ahost,memSizeA,flags);if (error != cudaSuccess){printf("cudaHostMalloc Ahost returned error code %d, line(%d)\n", error, __LINE__);exit(EXIT_FAILURE);}
      error = cudaHostAlloc((void**)&Bhost,memSizeB,flags);if (error != cudaSuccess){printf("cudaHostMalloc Bhost returned error code %d, line(%d)\n", error, __LINE__);exit(EXIT_FAILURE);}
      error = cudaHostAlloc((void**)&Chost,memSizeC,flags);if (error != cudaSuccess){printf("cudaHostMalloc Chost returned error code %d, line(%d)\n", error, __LINE__);exit(EXIT_FAILURE);}

      // set local arrays
      randomInit(Ahost,Md*Nd);
      randomInit(Bhost,Nd);

      // allocate device memory
      float *Adevice,*Bdevice,*Cdevice;
      error = cudaMalloc((void**)&Adevice,memSizeA); if (error != cudaSuccess){printf("cudaMalloc Adevice returned error code %d, line(%d)\n", error, __LINE__);exit(EXIT_FAILURE);}
      error = cudaMalloc((void**)&Bdevice,memSizeB); if (error != cudaSuccess){printf("cudaMalloc Bdevice returned error code %d, line(%d)\n", error, __LINE__);exit(EXIT_FAILURE);}
      error = cudaMalloc((void**)&Cdevice,memSizeC); if (error != cudaSuccess){printf("cudaMalloc Cdevice returned error code %d, line(%d)\n", error, __LINE__);exit(EXIT_FAILURE);}
      fineTimeStamps[1] = read_timer();
      ts6 = getTimeStamp();

#pragma omp critical
      {
        readyThreads += 1;
      }
      // fprintf(stderr,"Incremented ready GPU\n");
      while (readyThreads < 2){sleep(1);fprintf(stderr,"Thread 0: %d\n",readyThreads);};

      //#pragma omp single
      //{
      cudaStream_t stream1;
      cudaStreamCreate ( &stream1) ;
      ts3 = getTimeStamp();
      fineTimeStamps[2] = read_timer();
      gTime = read_timer();
      for (int i = 0; i < gpuOuter; i++)
        GPUsgemv(gpuInner,Md,Nd,Kd,Adevice,Bdevice,Cdevice,Ahost,Bhost,Chost,&stream1);
      cudaStreamSynchronize(stream1);
      gTime = read_timer() - gTime;
      fineTimeStamps[3] = read_timer();
      ts4 = getTimeStamp();
      cudaFreeHost(Ahost);
      cudaFreeHost(Bhost);
      cudaFreeHost(Chost);
    } else {
      // -------------------- CPU + counters thread --------------------
      // uint64 min_iters = strtoull(argv[4],NULL,0);
      float *A = NULL;
      float *B = NULL;
      float *C = NULL;
      ts7 = getTimeStamp();
      fineTimeStamps[4] = read_timer();
      A = (float*) malloc( Mh * Nh * sizeof(float) );
      B = (float*) malloc( Nh * sizeof(float) );
      C = (float*) malloc( Mh * sizeof(float) );
      randomInit(A,Mh*Nh);
      randomInit(B,Nh);
      fineTimeStamps[5] = read_timer();
      ts8 = getTimeStamp();

#pragma omp critical
      {
        readyThreads += 1;
      }
      // fprintf(stderr,"Incremented ready CPU\n");
      while (readyThreads < 2){sleep(1);fprintf(stderr,"Thread 1: %d\n",readyThreads);};

      // open the msr files for each core on the machine
      for (i = 0; i < currentMachine.num_cores; i++)
        open_msr_file(i,&fd[i]);

      int socketsProgrammed = 0;
      for (i = 0; i < currentMachine.num_cores; i++) {
        int currentCoreFD = fd[i];

        stopCounters(i, currentCoreFD, &currentMachine, &session);
        programCoreFixedCounters(currentCoreFD);
        programGeneralPurposeRegisters(currentCoreFD, &currentMachine, &session);

        /* Program the Uncore as desired...*/
        // Only program the first physical core on each socket.
        // NOTE: Some assumptions about topology here...check /proc/cpuinfo to confirm.
        if (i % currentMachine.num_phys_cores_per_socket == 0 && socketsProgrammed < currentMachine.num_sockets) {
          programUncoreCounters( currentCoreFD, &currentMachine, &session);
          socketsProgrammed++;
        }
      }

      seconds = 0.0;

      // start the programmed counters...
      for (i = 0; i < currentMachine.num_cores; i++)
        startCounters( i, fd[i], &currentMachine, &session);

      /* READ COUNTERS BEFORE STUFF */
      readCounters(fd,&currentMachine,&session, &before);
      ts1 = getTimeStamp();
      fineTimeStamps[6] = read_timer();
      seconds = read_timer();

      /* DO STUFF */
      for (i =0; i < num_iters; i++)
        BLASFUNC( CblasColMajor,CblasNoTrans,Mh,Nh, 1, A,Mh, B,1, 1, C,1 );
      /* END DOING STUFF */

      seconds = read_timer()-seconds;
      fineTimeStamps[7] = read_timer();
      ts2 = getTimeStamp();

      /* READ COUNTERS AFTER STUFF */
      for (i = 0; i < currentMachine.num_cores; i++)
        stopCounters(i,fd[i],&currentMachine, &session);

      // printf("num_iters = %"PRIu64", runtime is %g\n",num_iters,seconds);
      readCounters(fd,&currentMachine,&session,&after);
      diffCounterData(&currentMachine, &session, &after, &before, &after);

      // accumulate per-core general-purpose counters
      for (i = 0; i < currentMachine.num_sockets; i++) {
        // printf("Socket %d\n",i);
        for (j = 0; j < currentMachine.num_cores_per_socket; j++) {
          // printf("%d,",j);
          for (k = 0; k < session.core_gen_counter_num_used; k++){
            // printf("%"PRIu64",",after.generalCore[i*currentMachine.num_cores_per_socket + j][k]);
            // bug in the indexing of the core sums???
            // coreSums[i*session.core_gen_counter_num_used + k] += after.generalCore[i*currentMachine.num_cores_per_socket + j][k];
            coreSums[k] += after.generalCore[i*currentMachine.num_cores_per_socket + j][k];
          }
          // printf("\n");
        }
      }

      // accumulate CBo uncore counters
      for (i = 0; i < currentMachine.num_sockets; i++) {
        // printf("%d,",i);
        for (j = 0; j < currentMachine.num_cbos; j++) {
          // printf("%d,",j);
          for (k = 0; k < session.cbo_counter_num_used; k++) {
            // printf("%llu,",after.cboUncore[i*currentMachine.num_phys_cores_per_socket + j][k]);
            // bug in the indexing of the core sums???
            // sums[i*session.cbo_counter_num_used + k] += after.cboUncore[i*currentMachine.num_phys_cores_per_socket + j][k];
            sums[k] += after.cboUncore[i*currentMachine.num_phys_cores_per_socket + j][k];
          }
        }
      }
      // printf("\n");

      // Stop counters, reset PMU, close msr files
      cleanup(fd,&currentMachine,&session);

      free(A);
      free(B);
      free(C);
    }
  } // end parallel region

  // one CSV line: timestamps, problem sizes, timings, then counter sums
  printf("%s,%s,%s,%s,%s,%s,%s,%s,%d,%d,%d,%d,%d,%d,%d,%d,%d,%f,%f,%f,",ts7,ts8,ts1,ts2,ts5,ts6,ts3,ts4,Mh,Nh,Kh,Md/block_size,Nd/block_size,Kd/block_size,num_iters,gpuOuter,gpuInner,seconds,gTime,(float)(gpuOuter*(Md*Kd+Nd+Md))/16.0);
  for (int i = 0; i < 8; i++)
    printf("%f,",fineTimeStamps[i]);
  for (j = 0; j < session.core_gen_counter_num_used; j++)
    printf("%llu,",coreSums[j]);
  for (j = 0; j < session.cbo_counter_num_used; j++)
    if (j == session.cbo_counter_num_used-1)
      printf("%llu",sums[j]);
    else
      printf("%llu,",sums[j]);
  printf("\n");

  free(sums);
  free(coreSums);

  return 0;
}
int main(int argc, char* argv[]) { int nt, ncmp, ncdp, nh, nh2, nm, nd, memsize, niter, reg, ix, ih, i3, i2, i1, iter, filt, nw, np; float t0, cmp0, cdp0, h0, dt, dcmp, dcdp, dh, apt, rho, aal, norm; bool verb, half, amp; float ***data, ***modl, **vrms, **mask, *off, *error=NULL; float **pp, **qq, *aa; char *errfile; sf_file in, out, vel, offset, err=NULL; sf_file fdip; int ompchunk = 1; int ompnth = 1; #ifdef _OPENMP int ompath=1; #endif /*------------------------------------------------------------*/ sf_init(argc,argv); if(! sf_getint("ompchunk",&ompchunk)) ompchunk=1; /* OpenMP data chunk size */ #ifdef _OPENMP if(! sf_getint("ompnth", &ompnth)) ompnth=0; /* OpenMP available threads */ #pragma omp parallel ompath=omp_get_num_threads(); if(ompnth<1) ompnth=ompath; omp_set_num_threads(ompnth); sf_warning("using %d threads of a total of %d",ompnth,ompath); #endif in = sf_input("in"); vel = sf_input("vel"); out = sf_output("out"); if (!sf_getbool("verb",&verb)) verb=false; /* verbosity flag */ if (!sf_getbool("half",&half)) half = true; /* if y, the third axis is half-offset instead of full offset */ if (!sf_getbool("amp",&)) amp = true; /* if y, use amplitue factor */ if (!sf_histint(in,"n1",&nt)) sf_error("No n1= in input"); if (!sf_histfloat(in,"d1",&dt)) sf_error("No d1= in input"); if (!sf_histfloat(in,"o1",&t0)) sf_error("No o1= in input"); if (!sf_histint(in,"n2",&ncmp)) sf_error("No n2= in input"); if (!sf_histfloat(in,"d2",&dcmp)) sf_error("No d2= in input"); if (!sf_histfloat(in,"o2",&cmp0)) sf_error("No o2= in input"); if (!sf_getint("ncdp",&ncdp)) ncdp = ncmp; if (!sf_getfloat("dcdp",&dcdp)) dcdp = dcmp; if (!sf_getfloat("cdp0",&cdp0)) cdp0 = cmp0; sf_putint(out,"n2",ncdp); sf_putfloat(out,"d2",dcdp); sf_putfloat(out,"o2",cdp0); if (!sf_histint(in,"n3",&nh)) sf_error("No n3= in input"); if (NULL != sf_getstring("offset")) { offset = sf_input("offset"); nh2 = sf_filesize(offset); if (nh2 != nh*ncmp) sf_error("Wrong dimensions in offset, it should be 
%d",nh*ncmp); off = sf_floatalloc(nh2); sf_floatread (off,nh2,offset); sf_fileclose(offset); if (!half) { for (ih = 0; ih < nh2; ih++) { off[ih] *= 0.5; } } } else { if (!sf_histfloat(in,"o3",&h0)) sf_error("No o3="); if (!sf_histfloat(in,"d3",&dh)) sf_error("No d3="); if (!half) dh *= 0.5,h0 *= 0.5; off = sf_floatalloc(nh*ncmp); for (ix = 0; ix < ncmp; ix++) { for (ih = 0; ih < nh; ih++) { off[ih*ncmp+ix] = h0 + ih*dh; } } offset = NULL; } if (!sf_getint("reg",®)) reg=0; /* regularization type */ if (!sf_getfloat("antialias",&aal)) aal = 1.0; /* antialiasing */ if (!sf_getfloat("apt",&apt)) apt=ncmp; /* migration aperture */ if (!sf_getfloat("rho",&rho)) rho = 1.-1./nt; /* Leaky integration constant */ if (!sf_getint("niter",&niter)) niter=5; /* number of iterations */ nm = nt*ncdp*nh; nd = nt*ncmp*nh; vrms = sf_floatalloc2(nt,ncdp); mask = sf_floatalloc2(ncmp,nh); data = sf_floatalloc3(nt,ncmp,nh); modl = sf_floatalloc3(nt,ncdp,nh); /* read velocity file */ sf_floatread(vrms[0],nt*ncdp,vel); sf_fileclose(vel); memsize = nm+nd+nt*ncdp+ncmp*nh; if (verb) sf_warning("memory needs: %f G (%f M)",4.*memsize/1024/1024/1024,4.*memsize/1024/1024); if (niter > 0) { errfile = sf_getstring("err"); /* output file for error */ if (NULL != errfile) { err = sf_output(errfile); sf_putint(err,"n1",niter); sf_putfloat(err,"d1",1); sf_putfloat(err,"o1",1); sf_putstring(err,"label1","Iteration Number"); sf_putstring(err,"label2","Relative Squared Error"); sf_putint(err,"n2",1); sf_putint(err,"n3",1); } error = sf_floatalloc(niter); } sf_floatread(data[0][0],nd,in); for (i3=0; i3 < nh; i3++) { for (i2=0; i2 < ncmp; i2++) { mask[i3][i2]=cblas_sdot(nt,data[i3][i2],1,data[i3][i2],1); } } tkirmig_init(ompnth,ompchunk,nt,dt,t0,ncmp,dcmp,cmp0,ncdp,dcdp,cdp0,nh,dh,h0,apt,aal,rho,vrms,off,mask,amp,verb); sf_cdstep_init(); if (verb) sf_warning("Iteration begin..."); if (reg == 0) sf_solver(tkirmig_lop,sf_cdstep,nm,nd,modl[0][0],data[0][0], niter,"nmem",0,"nfreq",niter,"err",error,"end"); else 
if (reg == 1) { filt=2; aa=sf_floatalloc(filt); aa[0]=1.; aa[1]=-1.; tcaih_init(filt,aa,nt,ncdp,nh); sf_solver_reg(tkirmig_lop,sf_cdstep,tcaih_lop,nm+filt*nt*ncdp,nm,nd, modl[0][0],data[0][0],niter,0.01,"nmem",0,"nfreq",niter, "err",error,"end"); } else if (reg == 2) { sf_causinth_init(nt,ncdp,nh); sf_solver_prec(tkirmig_lop,sf_cdstep,sf_causinth_lop,nm,nm,nd, modl[0][0],data[0][0],niter,0.01,"nmem",0,"nfreq",niter, "err",error,"end"); } else if (reg == 3) { sf_triangleh_init(3,nt,ncdp,nh); sf_solver_prec(tkirmig_lop,sf_cdstep,sf_triangleh_lop,nm,nm,nd, modl[0][0],data[0][0],niter,0.01,"nmem",0,"nfreq",niter, "err",error,"end"); } else if (reg == 4) { sf_warning("pwd constraints along t-x plane and smoothing along offset axis"); if (!sf_getstring("fdip")) sf_error("Need input dip file!"); if (!sf_getint("nw",&nw)) nw=3; fdip = sf_input("fdip"); if (!sf_histint(fdip,"n3",&np)) np=1; sf_warning("np=%d",np); pp = sf_floatalloc2(nt,ncdp); if (np > 1) { qq = sf_floatalloc2(nt,ncdp); } else { qq = NULL; } if (NULL != qq) { predicth2_init(nt,ncdp,nh,0.1,nw,pp,qq); } else { predicth_init(nt,ncdp,nh,0.1,nw,1,false); predict_set(pp); } sf_floatread(pp[0],nt*ncdp,fdip); if (NULL != qq) { sf_floatread(qq[0],nt*ncdp,fdip); sf_solver_prec(tkirmig_lop,sf_cdstep,predicth2_lop,nm,nm,nd, modl[0][0],data[0][0],niter,0.01,"nmem",0,"nfreq",niter, "err",error,"end"); predict2_close(); } else { sf_solver_prec(tkirmig_lop,sf_cdstep,predicth_lop,nm,nm,nd, modl[0][0],data[0][0],niter,0.01,"nmem",0,"nfreq",niter, "err",error,"end"); predict_close(); } } sf_cdstep_close(); sf_floatwrite(modl[0][0],nm,out); if (NULL != err) { for (i3=0; i3 < nh; i3++) { for (i2=0; i2 < ncmp; i2++) { for (i1=0; i1 < nt; i1++) { norm += data[i3][i2][i1]*data[i3][i2][i1]; } } } for (iter=0; iter < niter; iter++) error[iter] /=norm; sf_floatwrite(error,niter,err); } sf_warning("iter/niter=%d/%d, err=%f",iter,niter,error); exit(0); }