Example #1
int main(int argc, char* argv[])
{
#ifdef _OPENMP
  // omp_get_num_threads() only reports the team size inside a parallel region,
  // so open one before asking
  #pragma omp parallel
  {
    #pragma omp master
    {
      int nthreads = omp_get_num_threads();
      std::cout << "Using OpenMP - There are " << nthreads << " threads" << std::endl;
    }
  }
#else
  std::cout << "Not using OpenMP" << '\n';
#endif


  // -------------------------------------------------------------------------------------
  // Create "tiy_log/" subdirectory (win) or "/home/<username>/tiy_log/" (linux)
  // -------------------------------------------------------------------------------------
  std::string log_file_directory = "tiy_log/";
#ifndef WIN32
  log_file_directory = std::string(getpwuid(getuid())->pw_dir) + "/" + log_file_directory;
#endif
  boost::filesystem::path dir_path(log_file_directory);
  if (!boost::filesystem::is_directory(dir_path) && !boost::filesystem::create_directory(dir_path))
  {
	  std::cerr << "Could not create log subdirectory." << std::endl;
	  std::cerr << "PRESS A KEY TO EXIT"; cv::destroyAllWindows(); cv::waitKey(1); std::cin.get();
	  return 0;
  }


  // -------------------------------------------------------------------------------------
  // Input ARG
  // -------------------------------------------------------------------------------------
  char *arg_camera_config_file = (char *)"config_camera.xml";
  char *arg_object_config_file = (char *)"config_object.xml";
  char *arg_run_parameter_config_file = (char *)"config_run_parameters.xml";

  if (argc == 1)
  {
    std::cerr << "USING DEFAULT CONFIG FILES:  config_camera.xml config_object.xml config_run_parameters.xml" << std::endl;
  }
  else if (argc != 4)
  {
	std::cerr << "Usage: 	server <camera_config_file> <object_config_file> <run_parameters_config_file>" << std::endl;
	std::cerr << "default:  server config_camera.xml config_object.xml config_run_parameters.xml" << std::endl;
	std::cerr << "PRESS A KEY TO EXIT"; cv::destroyAllWindows(); cv::waitKey(1); std::cin.get();
	return 0;
  }
  else
  {
	arg_camera_config_file = argv[1];		// argv[0] is the program name
	arg_object_config_file = argv[2];
	arg_run_parameter_config_file = argv[3];
  }


  // -------------------------------------------------------------------------------------
  // Get Run Parameters from XML Config File
  // -------------------------------------------------------------------------------------
	cv::FileStorage input_file_storage;
	if (!input_file_storage.open(arg_run_parameter_config_file, cv::FileStorage::READ))
	{
		std::cerr << "could NOT open " << arg_run_parameter_config_file << std::endl;
		std::cerr << "PRESS A KEY TO EXIT"; cv::destroyAllWindows(); cv::waitKey(1); std::cin.get();
		return 0;
	}

	int do_use_kalman_filter=-1, do_interactive_mode=-1, multicast_port=-1, do_show_graphics=-1,
		do_output_debug=-1, do_output_2D=-1, do_output_3D=-1, do_output_object=-1, do_output_virt_point=-1,
		do_log_2D=-1, do_log_3D=-1, do_log_object=-1, do_log_virt_point=-1, do_log_video=-1, do_log_frame=-1,
		do_send_object_pose=-1, do_send_virt_point_pose=-1;

	do_use_kalman_filter = (int)input_file_storage["do_use_kalman_filter"];
	do_interactive_mode = (int)input_file_storage["do_interactive_mode"];
	multicast_port = (int)input_file_storage["multicast_port"];
	do_show_graphics = (int)input_file_storage["do_show_graphics"];
	do_output_debug = (int)input_file_storage["do_output_debug"];
	do_output_2D = (int)input_file_storage["do_output_2D"];
	do_output_3D = (int)input_file_storage["do_output_3D"];
	do_output_object = (int)input_file_storage["do_output_object"];
	do_output_virt_point = (int)input_file_storage["do_output_virt_point"];
	do_log_2D = (int)input_file_storage["do_log_2D"];
	do_log_3D = (int)input_file_storage["do_log_3D"];
	do_log_object = (int)input_file_storage["do_log_object"];
	do_log_virt_point = (int)input_file_storage["do_log_virt_point"];
	do_log_video = (int)input_file_storage["do_log_video"];
	do_log_frame = (int)input_file_storage["do_log_frame"];
	do_send_object_pose = (int)input_file_storage["do_send_object_pose"];
	do_send_virt_point_pose = (int)input_file_storage["do_send_virt_point_pose"];

	std::string multicast_adress = (std::string)input_file_storage["multicast_adress"];
	std::string input_device_src = (std::string)input_file_storage["input_device_src"];	// (m: Mouse, k: Keyboard)
	std::string mouse_device_id = (std::string)input_file_storage["mouse_device_id"];
	std::string keyboard_device_id = (std::string)input_file_storage["keyboard_device_id"];
	std::string input_src = (std::string)input_file_storage["input_src"];	// (b: Basler Camera, o: OpenCV Camera, v: Video files, t: 2D point files)
	std::string video_left = (std::string)input_file_storage["video_left"];
	std::string video_right = (std::string)input_file_storage["video_right"];
	std::string points_2D_left = (std::string)input_file_storage["points_2D_left"];
	std::string points_2D_right = (std::string)input_file_storage["points_2D_right"];
	std::string log_points_2D_left = log_file_directory + (std::string)input_file_storage["log_points_2D_left"];
	std::string log_points_2D_right = log_file_directory + (std::string)input_file_storage["log_points_2D_right"];
	std::string log_points_3D = log_file_directory + (std::string)input_file_storage["log_points_3D"];
	std::string log_object_pose = log_file_directory + (std::string)input_file_storage["log_object_pose"];
	std::string log_virt_point_pose = log_file_directory + (std::string)input_file_storage["log_virt_point_pose"];
	std::string log_video_left = log_file_directory + (std::string)input_file_storage["log_video_left"];
	std::string log_video_right = log_file_directory + (std::string)input_file_storage["log_video_right"];
	std::string log_frame_left_prefix = log_file_directory + (std::string)input_file_storage["log_frame_left_prefix"];
	std::string log_frame_right_prefix = log_file_directory + (std::string)input_file_storage["log_frame_right_prefix"];

	input_file_storage.release();

	if (do_use_kalman_filter==-1 || do_interactive_mode==-1 || multicast_port==-1 || do_show_graphics==-1 ||
		do_output_debug==-1 || do_output_2D==-1 || do_output_3D==-1 || do_output_object==-1 || do_output_virt_point==-1 ||
		do_log_2D==-1 || do_log_3D==-1 || do_log_object==-1 || do_log_virt_point==-1 || do_log_video==-1 || do_log_frame==-1 || 
		do_send_object_pose==-1 || do_send_virt_point_pose==-1 ||
		multicast_adress.empty() || input_device_src.empty() || mouse_device_id.empty() || 
		keyboard_device_id.empty() || input_src.empty() || video_left.empty() || video_right.empty() || 
		points_2D_left.empty() || points_2D_right.empty() ||
		log_points_2D_left.empty() || log_points_2D_right.empty() || log_points_3D.empty() ||
		log_object_pose.empty() || log_virt_point_pose.empty() || 
		log_video_left.empty() || log_video_right.empty() ||
		log_frame_left_prefix.empty() || log_frame_right_prefix.empty())
	{
		std::cerr << "Read all run parameters from " << arg_run_parameter_config_file << " failed" << std::endl;
		std::cerr << "PRESS A KEY TO EXIT"; cv::destroyAllWindows(); cv::waitKey(1); std::cin.get();
		return 0;
	}

	if (do_log_video && (input_src == "v"))
	{
		std::cerr << "Cannot read video files and record to files at the same time." << std::endl;
		std::cerr << "PRESS A KEY TO EXIT"; cv::destroyAllWindows(); cv::waitKey(1); std::cin.get();
		return 0;
	}

	bool do_debugging = (do_output_debug != 0);


  // -------------------------------------------------------------------------------------
  // Initialize Motion Capturing (segmentation/marker extraction, marker template fitting)
  // -------------------------------------------------------------------------------------
  tiy::MarkerTracking m_track(do_debugging);

  if (!m_track.readConfigFiles(arg_camera_config_file, arg_object_config_file))
  {
	  std::cerr << "PRESS A KEY TO EXIT"; cv::destroyAllWindows(); cv::waitKey(1); std::cin.get();
	  return 0;
  }


  // -------------------------------------------------------------------------------------
  // Input device
  // -------------------------------------------------------------------------------------
  boost::scoped_ptr<tiy::MouseDevice> mouse_device;
  boost::scoped_ptr<tiy::KeyboardDevice> keyboard_device;
#ifdef WIN32
  mouse_device.reset(new tiy::WindowsMouse(do_debugging));
  keyboard_device.reset(new tiy::WindowsKeyboard(do_debugging));
#else
  mouse_device.reset(new tiy::LinuxMouse(do_debugging));
  keyboard_device.reset(new tiy::LinuxKeyboard(do_debugging));
#endif

  int read_intervall_ms = 1;

  if ((input_device_src == "m") && (!mouse_device->openAndReadMouse(mouse_device_id, read_intervall_ms)))
  {
	std::cout << "MouseDevice::openAndReadMouse() failed" << std::endl;
	std::cerr << "PRESS A KEY TO EXIT"; cv::destroyAllWindows(); cv::waitKey(1); std::cin.get();
	return 0;
  }

  if (!keyboard_device->openAndReadKeyboard(keyboard_device_id, read_intervall_ms))
  {
	std::cout << "KeyboardDevice::openAndReadKeyboard() failed" << std::endl;
	std::cerr << "PRESS A KEY TO EXIT"; cv::destroyAllWindows(); cv::waitKey(1); std::cin.get();
	return 0;
  }


  // -------------------------------------------------------------------------------------
  // Stereo camera
  // -------------------------------------------------------------------------------------
  boost::scoped_ptr<tiy::StereoCamera> stereo_camera;

  std::string camera_id_left = m_track.left_camera_id;
  std::string camera_id_right = m_track.right_camera_id;
  if (input_src == "b")
  {
#ifdef USE_aravis
	  	  stereo_camera.reset(new tiy::BaslerGigEStereoCamera(do_debugging, camera_id_left, camera_id_right,
								m_track.frame_width, m_track.frame_height, m_track.camera_exposure, m_track.camera_gain, m_track.frame_rate));
#else
  	  	  std::cerr << "BaslerGigEStereoCamera not available, as aravis NOT found/used." << std::endl;
		  std::cerr << "PRESS A KEY TO EXIT"; cv::destroyAllWindows(); cv::waitKey(1); std::cin.get();
  	  	  return 0;
#endif
  }
  else if (input_src == "o")
  		  stereo_camera.reset(new tiy::OpenCVStereoCamera(do_debugging, camera_id_left, camera_id_right,
								m_track.frame_width, m_track.frame_height, m_track.camera_exposure, m_track.camera_gain, m_track.frame_rate));
  else if (input_src == "v")
  		  stereo_camera.reset(new tiy::OpenCVStereoCamera(do_debugging, camera_id_left, camera_id_right,
								m_track.frame_width, m_track.frame_height, m_track.camera_exposure, m_track.camera_gain, m_track.frame_rate, video_left, video_right));
  else
  {
	  std::cerr << "No input source \"input_src\" specified in the configuration file \"" << arg_run_parameter_config_file << "\"" << std::endl;
	  std::cerr << "PRESS A KEY TO EXIT"; cv::destroyAllWindows(); cv::waitKey(1); std::cin.get();
	  return 0;
  }


  if (stereo_camera->openCam())
	  stereo_camera->startCam();
  else
  {
	  std::cerr << "MarkerTracking::connectStereoCamera() failed" << std::endl;
	  std::cerr << "PRESS A KEY TO EXIT"; cv::destroyAllWindows(); cv::waitKey(1); std::cin.get();
	  return 0;
  }

  cv::Mat image_left = stereo_camera->createImage();
  cv::Mat image_right = stereo_camera->createImage();
  long long int frame_timestamp;


  // -------------------------------------------------------------------------------------
  // BOOST ASIO MULTICAST SERVER
  // -------------------------------------------------------------------------------------
  boost::asio::io_service server_io_service;
  tiy::MulticastServer multicast_server(server_io_service, boost::asio::ip::address::from_string(multicast_adress), multicast_port, do_debugging);

  boost::system::error_code error_c;
  boost::thread server_io_service_thread(boost::bind(&boost::asio::io_service::run, &server_io_service, error_c));
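  // run the io_service in a background thread so the asynchronous multicast
  // sends proceed while the main loop below keeps grabbing frames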


  // -------------------------------------------------------------------------------------
  // Logging
  // -------------------------------------------------------------------------------------
  std::ofstream log_2D_left, log_2D_right, log_3D, log_object, log_virt_point;
  if (do_log_2D)
  {
	  log_2D_left.open(log_points_2D_left.c_str());
	  log_2D_right.open(log_points_2D_right.c_str());
  }
  if (do_log_3D)
	  log_3D.open(log_points_3D.c_str());
  if (do_log_object)
	  log_object.open(log_object_pose.c_str());
  if (do_log_virt_point)
	  log_virt_point.open(log_virt_point_pose.c_str());
  if (do_log_video)
	  stereo_camera->startRecording(log_video_left, log_video_right);


  // -------------------------------------------------------------------------------------
  // MAIN LOOP
  // -------------------------------------------------------------------------------------
  int capture_counter = 1;
  bool is_base_temp = false;
  int test_points_counter = 0;

  // time measurement
  boost::posix_time::ptime start_time, end_time;
  start_time = boost::posix_time::microsec_clock::universal_time();

  for(int i = 0; true; i++)
    {
	  // -------------------------------------------------------------------------------------
	  // Grab stereo frame
	  // -------------------------------------------------------------------------------------
	  if(!stereo_camera->grabFrame(image_left, image_right, frame_timestamp))
      {
		  if (input_src == "v")
    	  {
			  std::cout << "Video file finished." << std::endl;
		  	  std::cerr << "PRESS A KEY TO EXIT"; cv::destroyAllWindows(); cv::waitKey(1); std::cin.get();
    		  return 0;
    	  }

    	  std::cerr << "Grabbing failed" << std::endl;
    	  std::cerr << "PRESS A KEY TO EXIT"; cv::destroyAllWindows(); cv::waitKey(1); std::cin.get();
		  return 0;
      }

	  if (do_log_video)
		  stereo_camera->recordFrame();


      // -------------------------------------------------------------------------------------
      // Extract (or read from file) 2D points
      // -------------------------------------------------------------------------------------
      cv::vector<cv::Point2f> points_2D_left, points_2D_right;
#pragma omp parallel sections
      {
#pragma omp section
        {
        	if (input_src == "t")
        		m_track.get2DPointsFromFile("testpoints_left", &points_2D_left, test_points_counter);
        	else
        		m_track.get2DPointsFromImage(image_left, &points_2D_left);
        }
#pragma omp section
        {
        	if (input_src == "t")
    	    	m_track.get2DPointsFromFile("testpoints_right", &points_2D_right, test_points_counter);
        	else
        		m_track.get2DPointsFromImage(image_right, &points_2D_right);
        }
      }
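      // note: the two sections above run concurrently (one per camera image),
      // so this block uses at most two threads regardless of the team size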
      test_points_counter++;


      // -------------------------------------------------------------------------------------
      // Compute 3D points from 2D points
      // -------------------------------------------------------------------------------------
      cv::Mat points_3D = m_track.get3DPointsFrom2DPoints(points_2D_left, points_2D_right);


      // -------------------------------------------------------------------------------------
      // Search for marker objects (templates)
      // -------------------------------------------------------------------------------------
      std::vector<cv::Mat>RT_template_leftcam;
      std::vector<float>avg_dev;

      for(int t = 0; t < m_track.num_templates;t++)
      {
    	  RT_template_leftcam.push_back(cv::Mat::zeros(4,4,CV_32F));
    	  avg_dev.push_back(0);
      }
#pragma omp parallel for
      for(int r = 0; r < m_track.num_templates; r++)	  
    	  m_track.fit3DPointsToObjectTemplate(points_3D, r, RT_template_leftcam[r], &avg_dev[r]);

		  
      // -------------------------------------------------------------------------------------
      // Update mouse and keyboard status
      // -------------------------------------------------------------------------------------
      bool was_SPACE_pressed=false, was_ESC_pressed=false;

      keyboard_device->getStatusSinceLastReset(was_SPACE_pressed, was_ESC_pressed);
      if (was_ESC_pressed)
      {
    	  std::cerr << "PRESS A KEY TO EXIT"; cv::destroyAllWindows(); cv::waitKey(1); std::cin.get();
    	  return 0;
      }
      keyboard_device->resetStatus();

	  bool was_left_button_pressed=false, was_left_button_released=false, is_left_button_pressed=false,
			  was_right_button_pressed=false, was_right_button_released=false, is_right_button_pressed=false,
				  has_mouse_wheel_changed=false;
	  static int mouse_wheel_position=0;

	  if (input_device_src == "m")
	  {
		  mouse_device->getStatusSinceLastReset(was_left_button_pressed, was_left_button_released, is_left_button_pressed,
										  was_right_button_pressed, was_right_button_released, is_right_button_pressed,
										  has_mouse_wheel_changed, mouse_wheel_position);
		  mouse_device->resetStatus();
	  }
	  

      // -------------------------------------------------------------------------------------
      // OUTPUT (Send/Display/Log) the selected data
      // -------------------------------------------------------------------------------------
	  if (!do_interactive_mode || ((input_device_src == "m") && was_left_button_pressed) || ((input_device_src == "k") && was_SPACE_pressed))
        {
	      // -------------------------------------------------------------------------------------
	      // Send (publish the object/virtual point pose over multicast)
	      // -------------------------------------------------------------------------------------
	      if(do_send_object_pose)
	        {
	    	  std::string send_string;
			  for(int r = 0; r < m_track.num_templates; r++)
			  {
				  cv::Mat rodrigues_orientation = cv::Mat::zeros(3, 1, CV_32F);
			      if (countNonZero(RT_template_leftcam[r]))
					 Rodrigues(RT_template_leftcam[r](cv::Range(0,3),cv::Range(0,3)), rodrigues_orientation);

			      int last_col = RT_template_leftcam[r].size.p[0] - 1;

				  std::stringstream frame_timestamp_ss; // as boost::format not compatible with long long int
				  frame_timestamp_ss << frame_timestamp;
				  std::string send_buffer = (boost::format("%s\t%d\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t") % frame_timestamp_ss.str() % r 
											% RT_template_leftcam[r].at<float>(0,last_col) % RT_template_leftcam[r].at<float>(1,last_col) % RT_template_leftcam[r].at<float>(2,last_col)
											% rodrigues_orientation.at<float>(0,0) % rodrigues_orientation.at<float>(1,0) % rodrigues_orientation.at<float>(2,0) ).str();

				  send_string += send_buffer;
			  }

			  multicast_server.sendString(send_string);

			  if(do_debugging)
			  	std::cout << "-------------" << std::endl << "SENDING :" << send_string << std::endl << "----------------" << std::endl;
	        }			
		  if(do_send_virt_point_pose)
	        {
	    	  std::string send_string;
			  for(int r = 0; r < m_track.num_templates; r++)
			  {			  
				  cv::Mat RT_virt_point_to_leftcam = cv::Mat::zeros(4, 4, CV_32F);				
				  cv::Mat rodrigues_orientation = cv::Mat::zeros(3, 1, CV_32F);
				  if (countNonZero(RT_template_leftcam[r]) && countNonZero(m_track.RT_virt_point_to_template[r] - cv::Mat::eye(4, 4, CV_32F)))
				  { 
					RT_virt_point_to_leftcam = RT_template_leftcam[r] * m_track.RT_virt_point_to_template[r];
					Rodrigues(RT_virt_point_to_leftcam(cv::Range(0,3),cv::Range(0,3)), rodrigues_orientation);
				  }
			  
			      int last_col = RT_virt_point_to_leftcam.size.p[0] - 1;

				  std::stringstream frame_timestamp_ss; // as boost::format not compatible with long long int
				  frame_timestamp_ss << frame_timestamp;
				  std::string send_buffer = (boost::format("%s\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t") % frame_timestamp_ss.str()
											% RT_virt_point_to_leftcam.at<float>(0,last_col) % RT_virt_point_to_leftcam.at<float>(1,last_col) % RT_virt_point_to_leftcam.at<float>(2,last_col)
											% rodrigues_orientation.at<float>(0,0) % rodrigues_orientation.at<float>(1,0) % rodrigues_orientation.at<float>(2,0) ).str();

				  send_string += send_buffer;
			  }
			  multicast_server.sendString(send_string);

			  if(do_debugging)
			  	std::cout << "-------------" << std::endl << "SENDING :" << send_string << std::endl << "----------------" << std::endl;
	        }
			
		  // -------------------------------------------------------------------------------------
		  // Display
		  // -------------------------------------------------------------------------------------
		  if (do_debugging)
		  {
			if (was_left_button_pressed)
				std::cout << "LEFT" << std::endl;
			if (was_left_button_released)
				std::cout << "LEFT RELEASED" << std::endl;
			if (was_right_button_pressed)
				std::cout << "RIGHT" << std::endl;
			if (was_right_button_released)
				std::cout << "RIGHT RELEASED" << std::endl;
			if (has_mouse_wheel_changed)
				std::cout << "WHEEL: " << mouse_wheel_position << std::endl;
			if (is_left_button_pressed)
				std::cout << "LEFT STILL" << std::endl;
			if (is_right_button_pressed)
				std::cout << "RIGHT STILL" << std::endl;

			if (was_SPACE_pressed)
				std::cout << "SPACE" << std::endl;
			if (was_ESC_pressed)
				std::cout << "ESC" << std::endl;
		  }
          if (do_output_2D)
            {
        	  std::cout << frame_timestamp;
        	  for(unsigned int p = 0; p < points_2D_left.size(); p++)
        		  std::cout << "\t" << points_2D_left[p].x << "\t" << points_2D_left[p].y;
        	  std::cout << std::endl;

        	  std::cout << frame_timestamp;
        	  for(unsigned int p = 0; p < points_2D_right.size(); p++)
        		  std::cout  << "\t" << points_2D_right[p].x << "\t" << points_2D_right[p].y;
        	  std::cout << std::endl;
            }
          if (do_output_3D)
            {
        	  std::cout << frame_timestamp;
			  for(int p = 0; p < points_3D.cols; p++)
				  std::cout  << "\t" << points_3D.at<float>(0,p) << "\t" << points_3D.at<float>(1,p) << "\t" << points_3D.at<float>(2,p);
			  std::cout << std::endl;
            }
    	  if (do_output_object)
    	    {
			  std::cout << frame_timestamp;
			  for(int r = 0; r < m_track.num_templates; r++)
			  {
				  cv::Mat rodrigues_orientation = cv::Mat::zeros(3, 1, CV_32F);
			      if (countNonZero(RT_template_leftcam[r]))
					 Rodrigues(RT_template_leftcam[r](cv::Range(0,3),cv::Range(0,3)), rodrigues_orientation);

			      int last_col = RT_template_leftcam[r].size.p[0] - 1;
			      std::cout << "\t" << RT_template_leftcam[r].at<float>(0,last_col) << "\t" << RT_template_leftcam[r].at<float>(1,last_col) << "\t" << RT_template_leftcam[r].at<float>(2,last_col) << "\t" << rodrigues_orientation.at<float>(0,0) << "\t" << rodrigues_orientation.at<float>(1,0) << "\t" << rodrigues_orientation.at<float>(2,0);
			      //std::cout << std::endl << "avg_dev = " << avg_dev[r];
			  }
			  std::cout << std::endl;
    	    }			
		  if (do_output_virt_point)
    	    {
			  std::cout << frame_timestamp;
			  for(int r = 0; r < m_track.num_templates; r++)
			  {				
				  cv::Mat RT_virt_point_to_leftcam = cv::Mat::zeros(4, 4, CV_32F);				
				  cv::Mat rodrigues_orientation = cv::Mat::zeros(3, 1, CV_32F);
				  if (countNonZero(RT_template_leftcam[r]) && countNonZero(m_track.RT_virt_point_to_template[r] - cv::Mat::eye(4, 4, CV_32F)))
				  { 
					RT_virt_point_to_leftcam = RT_template_leftcam[r] * m_track.RT_virt_point_to_template[r];
					Rodrigues(RT_virt_point_to_leftcam(cv::Range(0,3),cv::Range(0,3)), rodrigues_orientation);
				  }
				  
			      int last_col = RT_virt_point_to_leftcam.size.p[0] - 1;
			      std::cout << "\t" << RT_virt_point_to_leftcam.at<float>(0,last_col) << "\t" << RT_virt_point_to_leftcam.at<float>(1,last_col) << "\t" << RT_virt_point_to_leftcam.at<float>(2,last_col) << "\t" << rodrigues_orientation.at<float>(0,0) << "\t" << rodrigues_orientation.at<float>(1,0) << "\t" << rodrigues_orientation.at<float>(2,0);
			  }
			  std::cout << std::endl;
    	    }
			

		  // -------------------------------------------------------------------------------------
		  // Log
		  // -------------------------------------------------------------------------------------
		  if (do_log_2D)
          {
			  log_2D_left << frame_timestamp;
			  for(unsigned int p = 0; p < points_2D_left.size(); p++)
				  log_2D_left << "\t" << points_2D_left[p].x << "\t" << points_2D_left[p].y;
			  log_2D_left << std::endl;

			  log_2D_right << frame_timestamp;
			  for(unsigned int p = 0; p < points_2D_right.size(); p++)
				  log_2D_right  << "\t" << points_2D_right[p].x << "\t" << points_2D_right[p].y;
			  log_2D_right << std::endl;
          }
		  if (do_log_3D)
          {
			  log_3D << frame_timestamp;
			  for(int p = 0; p < points_3D.cols; p++)
				  log_3D  << "\t" << points_3D.at<float>(0,p) << "\t" << points_3D.at<float>(1,p) << "\t" << points_3D.at<float>(2,p);
			  log_3D << std::endl;
          }
		  if (do_log_object)
		  {
			  log_object << frame_timestamp;
			  for(int r = 0; r < m_track.num_templates; r++)
			  {
				  cv::Mat rodrigues_orientation = cv::Mat::zeros(3, 1, CV_32F);
			      if (countNonZero(RT_template_leftcam[r]))
					 Rodrigues(RT_template_leftcam[r](cv::Range(0,3),cv::Range(0,3)), rodrigues_orientation);

			      int last_col = RT_template_leftcam[r].size.p[0] - 1;
			      log_object << "\t" << RT_template_leftcam[r].at<float>(0,last_col) << "\t" << RT_template_leftcam[r].at<float>(1,last_col) << "\t" << RT_template_leftcam[r].at<float>(2,last_col) << "\t" << rodrigues_orientation.at<float>(0,0) << "\t" << rodrigues_orientation.at<float>(1,0) << "\t" << rodrigues_orientation.at<float>(2,0);
			      //log_object << std::endl << "avg_dev = " << avg_dev[r];
			  }
			  log_object << std::endl;
		  }
		  if (do_log_virt_point)
		  {
			  log_virt_point << frame_timestamp;			  
			  for(int r = 0; r < m_track.num_templates; r++)
			  {				  					
				  cv::Mat RT_virt_point_to_leftcam = cv::Mat::zeros(4, 4, CV_32F);				
				  cv::Mat rodrigues_orientation = cv::Mat::zeros(3, 1, CV_32F);
				  if (countNonZero(RT_template_leftcam[r]) && countNonZero(m_track.RT_virt_point_to_template[r] - cv::Mat::eye(4, 4, CV_32F)))
				  { 
					RT_virt_point_to_leftcam = RT_template_leftcam[r] * m_track.RT_virt_point_to_template[r];
					Rodrigues(RT_virt_point_to_leftcam(cv::Range(0,3),cv::Range(0,3)), rodrigues_orientation);
				  }

			      int last_col = RT_virt_point_to_leftcam.size.p[0] - 1;
				  log_virt_point << "\t" << RT_virt_point_to_leftcam.at<float>(0,last_col) << "\t" << RT_virt_point_to_leftcam.at<float>(1,last_col) << "\t" << RT_virt_point_to_leftcam.at<float>(2,last_col) << "\t" << rodrigues_orientation.at<float>(0,0) << "\t" << rodrigues_orientation.at<float>(1,0) << "\t" << rodrigues_orientation.at<float>(2,0);
			  }
			  log_virt_point << std::endl;
		  }
		  if (do_log_video)
			  stereo_camera->recordFrame();
        }

	  // -------------------------------------------------------------------------------------
      // Capture stereo frame
      // -------------------------------------------------------------------------------------
	  if (do_log_frame && (((input_device_src == "m") && was_left_button_pressed) || ((input_device_src == "k") && was_SPACE_pressed)))
		{			
		  std::string save_file;

		  save_file = (boost::format("%s%03i.jpg") % log_frame_left_prefix % capture_counter).str();
		  cv::imwrite(save_file, image_left);

		  save_file = (boost::format("%s%03i.jpg") % log_frame_right_prefix % capture_counter).str();
		  cv::imwrite(save_file, image_right);

		  if (do_debugging)
			  std::cout << frame_timestamp << "Frame captured." << std::endl;

		  capture_counter++;
		}


      // -------------------------------------------------------------------------------------
      // Visualize stereo frame with detected points
      // -------------------------------------------------------------------------------------
      if(do_show_graphics && !(input_src == "t"))
        {
    	  // copies needed, as drawing the overlays changes the image content (costs 0.5-1.5 ms)
    	  cv::Mat image_left_cpy, image_right_cpy;
    	  image_left.copyTo(image_left_cpy);
    	  image_right.copyTo(image_right_cpy);

          for(unsigned int p=0; p < points_2D_left.size(); p++)
              cv::circle(image_left_cpy, points_2D_left[p], 2, cv::Scalar(0), 1, CV_AA, 0);
          for(unsigned int p=0; p < points_2D_right.size(); p++)
              cv::circle(image_right_cpy, points_2D_right[p], 2, cv::Scalar(0), 1, CV_AA, 0);

          cv::Mat object_rotation(3, 1, CV_32F);
          cv::Mat object_translation(3, 1, CV_32F);
          cv::vector<cv::Point2f> object_2D;

          for(int r = 0; r < m_track.num_templates; r++)
            {
			  if (avg_dev[r] < std::numeric_limits<float>::infinity())
              {
                  Rodrigues(RT_template_leftcam[r](cv::Range(0,3),cv::Range(0,3)), object_rotation);
                  object_translation = RT_template_leftcam[r](cv::Range(0,3),cv::Range(3,4)).t();
                  cv::vector<cv::Point3f> object_points;
                  object_points.push_back(cv::Point3f(RT_template_leftcam[r].at<float>(0,3), RT_template_leftcam[r].at<float>(1,3), RT_template_leftcam[r].at<float>(2,3)));
                  projectPoints(cv::Mat(object_points), cv::Mat::zeros(3,1,CV_32F), cv::Mat::zeros(3,1,CV_32F), m_track.KK_left, m_track.kc_left, object_2D);
                  cv::circle(image_left_cpy, object_2D[0], 4, cv::Scalar(255,255,255), 1, CV_AA, 0);
                  cv::circle(image_left_cpy, object_2D[0], 3, cv::Scalar(0,0,150), 1, CV_AA, 0);
                  projectPoints(cv::Mat(object_points), m_track.om_leftcam_to_rightcam, m_track.T_leftcam_to_rightcam, m_track.KK_right, m_track.kc_right, object_2D);
                  cv::circle(image_right_cpy, object_2D[0], 4, cv::Scalar(255,255,255), 1, CV_AA, 0);
                  cv::circle(image_right_cpy, object_2D[0], 3, cv::Scalar(0,0,150), 1, CV_AA, 0);
              }
            }

		  imshow("Image Left", image_left_cpy);
		  imshow("Image Right", image_right_cpy);

	      cv::waitKey(1);
        }


      // -------------------------------------------------------------------------------------
      // END MEASURE of the computation time (of one cycle)
      // -------------------------------------------------------------------------------------
      if (do_debugging)
      {
		end_time = boost::posix_time::microsec_clock::universal_time();
		boost::posix_time::time_duration time_diff = end_time - start_time;

		std::cout << "comp_time = " << time_diff.total_microseconds() << " [us]" << std::endl;

		start_time = boost::posix_time::microsec_clock::universal_time();
      }
    } //end MAIN LOOP

	if (log_2D_left.is_open())
		log_2D_left.close();
	if (log_2D_right.is_open())
		log_2D_right.close();
	if (log_3D.is_open())
		log_3D.close();
	if (log_object.is_open())
		log_object.close();
	if (log_virt_point.is_open())
		log_virt_point.close();

	stereo_camera->closeCam();

  std::cerr << "PRESS A KEY TO EXIT"; cv::destroyAllWindows(); cv::waitKey(1); std::cin.get();
  return 0;
}
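The server above publishes each object pose as one tab-separated datagram (timestamp, template id, x, y, z and a Rodrigues rotation vector). Below is a minimal sketch of a matching receiver; the multicast address and port are placeholder assumptions standing in for the values read from config_run_parameters.xml.

#include <boost/asio.hpp>
#include <iostream>

int main()
{
  boost::asio::io_service io_service;
  boost::asio::ip::udp::endpoint listen_endpoint(
      boost::asio::ip::address::from_string("0.0.0.0"), 30000); // port: assumption
  boost::asio::ip::udp::socket socket(io_service);
  socket.open(listen_endpoint.protocol());
  socket.set_option(boost::asio::ip::udp::socket::reuse_address(true));
  socket.bind(listen_endpoint);
  // join the group the server sends to ("multicast_adress" in the config; assumption here)
  socket.set_option(boost::asio::ip::multicast::join_group(
      boost::asio::ip::address::from_string("239.255.0.1")));

  char data[1024];
  boost::asio::ip::udp::endpoint sender;
  for (;;)
  {
    std::size_t len = socket.receive_from(boost::asio::buffer(data), sender);
    std::cout.write(data, len);  // tab-separated pose fields, one datagram per update
    std::cout << std::endl;
  }
}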
Example #2
int main(int argc, char** argv)
{
  sim_param_t params;
  if (get_params(argc, argv, &params) != 0)
    exit(-1);

  // Create global
  sim_state_t* globalState = init_particles(&params);

#pragma omp parallel shared(globalState, params) 
  {
    int proc = omp_get_thread_num();
    int nproc = omp_get_num_threads();

    // only rank 0 writes, so only rank 0 should create/truncate the output file
    FILE* fp    = (proc == 0) ? fopen(params.fname, "w") : NULL;
    int nframes = params.nframes;
    int npframe = params.npframe;
    float dt    = params.dt;
    int n       = globalState->n;

    // Processor information and holder
    proc_info* pInfo = malloc(sizeof(proc_info)); 
    pInfo->proc = proc;
    pInfo->nproc = nproc;
    pInfo->beg = round((proc/(double)nproc)*n);
    pInfo->end = round(((proc+1)/(double)nproc)*n);
    pInfo->forceAccu = calloc(3*n, sizeof(float)); // Never used this...
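    // e.g. n = 10 particles on nproc = 4 threads yields the ranges
    // [0,3), [3,5), [5,8), [8,10) -- chunk sizes 3,2,3,2 from round()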


    if (proc == 0) {
      printf("Running in parallel with %d processor\n", nproc);
    }

    normalize_mass(globalState, pInfo, &params);

    double t_start = omp_get_wtime();

    if (proc == 0) { // We only write for one processor
      write_header(fp, n, nframes, params.h);
      write_frame_data(fp, n, globalState, NULL);
    }

    if (proc == 0) {
      hash_particles(globalState, params.h);
    }
    //hash_particles_parallel(globalState, pInfo, params.h);

#pragma omp barrier // Need the hashing to be done

    compute_accel(globalState, pInfo, &params);

#pragma omp barrier
    leapfrog_start(globalState, pInfo, dt);
    check_state(globalState, pInfo);
    for (int frame = 1; frame < nframes; ++frame) {

      // We sort according to Z-Morton order to ensure locality; a parallel qsort is still to be implemented
      if (frame % 5 == 0) {

        // Dividing into chunks and sorting each chunk.
        // This alone turned out to be better than sorting the entire array.
        qsort(globalState->part+pInfo->beg, pInfo->end-pInfo->beg, sizeof(particle_t), compPart);

        // Merging the sorted chunks afterwards actually lowered performance,
        // so it is left disabled:
        // #pragma omp barrier
        // if (pInfo->nproc > 1) arraymerge(globalState->part, globalState->n, pInfo);
        // #pragma omp barrier

        // Serial version:
        // #pragma omp single // Implied barrier
        //   qsort(globalState->part, n, sizeof(particle_t), compPart);
      }

#pragma omp barrier // Need sort to finish

      for (int i = 0; i < npframe; ++i) {
        if (proc == 0 && i % 4 == 0) { // Amortize hashing cost: re-hash every 4th step
          hash_particles(globalState, params.h);
        }

#pragma omp barrier
        compute_accel(globalState, pInfo, &params);
        leapfrog_step(globalState, pInfo, dt);
        check_state(globalState, pInfo);
#pragma omp barrier
      }

      if (proc == 0) {
        printf("Frame: %d of %d - %2.1f%%\n", frame, nframes,
            100*(float)frame/nframes);
        write_frame_data(fp, n, globalState, NULL);
      }
    } // end frame loop

    double t_end = omp_get_wtime();

    if (proc == 0) {
      printf("Ran in %g seconds\n", t_end-t_start);
    }

    free(pInfo);
    if (proc == 0)
      fclose(fp);
  } // end omp parallel

  free_state(globalState);
  return 0;
}
Example #3
int main(int argc, char *argv[])
{
    struct pngquant_options options = {
        .floyd = 1.f, // floyd-steinberg dithering
    };
    options.liq = liq_attr_create();

    if (!options.liq) {
        fputs("SSE-capable CPU is required for this build.\n", stderr);
        return WRONG_ARCHITECTURE;
    }

    unsigned int error_count=0, skipped_count=0, file_count=0;
    pngquant_error latest_error=SUCCESS;
    const char *newext = NULL, *output_file_path = NULL;

    fix_obsolete_options(argc, argv);

    int opt;
    do {
        opt = getopt_long(argc, argv, "Vvqfhs:Q:o:", long_options, NULL);
        switch (opt) {
            case 'v':
                options.verbose = true;
                break;
            case 'q':
                options.verbose = false;
                break;

            case arg_floyd:
                options.floyd = optarg ? atof(optarg) : 1.0;
                if (options.floyd < 0 || options.floyd > 1.f) {
                    fputs("--floyd argument must be in 0..1 range\n", stderr);
                    return INVALID_ARGUMENT;
                }
                break;
            case arg_ordered: options.floyd = 0; break;

            case 'f': options.force = true; break;
            case arg_no_force: options.force = false; break;

            case arg_ext: newext = optarg; break;
            case 'o':
                if (output_file_path) {
                    fputs("--output option can be used only once\n", stderr);
                    return INVALID_ARGUMENT;
                }
                output_file_path = optarg; break;

            case arg_iebug:
                // opacities above 238 will be rounded up to 255, because IE6 truncates <255 to 0.
                liq_set_min_opacity(options.liq, 238);
                options.ie_mode = true;
                break;

            case arg_transbug:
                liq_set_last_index_transparent(options.liq, true);
                break;

            case arg_skip_larger:
                options.skip_if_larger = true;
                break;

            case 's':
                {
                    int speed = atoi(optarg);
                    if (speed >= 10) {
                        options.fast_compression = true;
                    }
                    if (speed == 11) {
                        options.floyd = 0;
                        speed = 10;
                    }
                    if (LIQ_OK != liq_set_speed(options.liq, speed)) {
                        fputs("Speed should be between 1 (slow) and 11 (fast).\n", stderr);
                        return INVALID_ARGUMENT;
                    }
                }
                break;

            case 'Q':
                if (!parse_quality(optarg, options.liq, &options.min_quality_limit)) {
                    fputs("Quality should be in format min-max where min and max are numbers in range 0-100.\n", stderr);
                    return INVALID_ARGUMENT;
                }
                break;

            case arg_posterize:
                if (LIQ_OK != liq_set_min_posterization(options.liq, atoi(optarg))) {
                    fputs("Posterization should be number of bits in range 0-4.\n", stderr);
                    return INVALID_ARGUMENT;
                }
                break;

            case arg_map:
                {
                    png24_image tmp = {};
                    if (SUCCESS != read_image(options.liq, optarg, false, &tmp, &options.fixed_palette_image, false, false)) {
                        fprintf(stderr, "  error: Unable to load %s", optarg);
                        return INVALID_ARGUMENT;
                    }
                }
                break;

            case 'h':
                print_full_version(stdout);
                print_usage(stdout);
                return SUCCESS;

            case 'V':
                puts(PNGQUANT_VERSION);
                return SUCCESS;

            case -1: break;

            default:
                return INVALID_ARGUMENT;
        }
    } while (opt != -1);

    int argn = optind;

    if (argn >= argc) {
        if (argn > 1) {
            fputs("No input files specified. See -h for help.\n", stderr);
        } else {
            print_full_version(stderr);
            print_usage(stderr);
        }
        return MISSING_ARGUMENT;
    }

    if (options.verbose) {
        liq_set_log_callback(options.liq, log_callback, NULL);
        options.log_callback = log_callback;
    }

    char *colors_end;
    unsigned long colors = strtoul(argv[argn], &colors_end, 10);
    if (colors_end != argv[argn] && '\0' == colors_end[0]) {
        if (LIQ_OK != liq_set_max_colors(options.liq, colors)) {
            fputs("Number of colors must be between 2 and 256.\n", stderr);
            return INVALID_ARGUMENT;
        }
        argn++;
    }

    if (newext && output_file_path) {
        fputs("--ext and --output options can't be used at the same time\n", stderr);
        return INVALID_ARGUMENT;
    }

    // new filename extension depends on options used. Typically basename-fs8.png
    if (newext == NULL) {
        newext = options.floyd > 0 ? "-ie-fs8.png" : "-ie-or8.png";
        if (!options.ie_mode) {
            newext += 3;    /* skip "-ie" */
        }
    }

    if (argn == argc || (argn == argc-1 && 0==strcmp(argv[argn],"-"))) {
        options.using_stdin = true;
        argn = argc-1;
    }

    if (options.using_stdin && output_file_path) {
        fputs("--output can't be mixed with stdin\n", stderr);
        return INVALID_ARGUMENT;
    }

    const int num_files = argc-argn;

    if (output_file_path && num_files != 1) {
        fputs("Only one input file is allowed when --output is used\n", stderr);
        return INVALID_ARGUMENT;
    }

#ifdef _OPENMP
    // if there's a lot of files, coarse parallelism can be used
    if (num_files > 2*omp_get_max_threads()) {
        omp_set_nested(0);
        omp_set_dynamic(1);
    } else {
        omp_set_nested(1);
    }
#endif
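    // rationale: with many files the per-file loop below already provides
    // enough parallelism, so nested parallelism inside libimagequant is
    // disabled; with few files, nesting lets each file use extra threads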

    #pragma omp parallel for \
        schedule(static, 1) reduction(+:skipped_count) reduction(+:error_count) reduction(+:file_count) shared(latest_error)
    for(int i=0; i < num_files; i++) {
        struct pngquant_options opts = options;
        opts.liq = liq_attr_copy(options.liq);

        const char *filename = opts.using_stdin ? "stdin" : argv[argn+i];

        #ifdef _OPENMP
        struct buffered_log buf = {};
        if (opts.log_callback && omp_get_num_threads() > 1 && num_files > 1) {
            liq_set_log_callback(opts.liq, log_callback_buferred, &buf);
            liq_set_log_flush_callback(opts.liq, log_callback_buferred_flush, &buf);
            options.log_callback = log_callback_buferred;
            options.log_callback_user_info = &buf;
        }
        #endif


        pngquant_error retval = SUCCESS;

        const char *outname = output_file_path;
        char *outname_free = NULL;
        if (!options.using_stdin) {
            if (!outname) {
                outname = outname_free = add_filename_extension(filename, newext);
            }
            if (!options.force && file_exists(outname)) {
                fprintf(stderr, "  error:  %s exists; not overwriting\n", outname);
                retval = NOT_OVERWRITING_ERROR;
            }
        }

        if (!retval) {
            retval = pngquant_file(filename, outname, &opts);
        }

        free(outname_free);

        liq_attr_destroy(opts.liq);

        if (retval) {
            #pragma omp critical
            {
                latest_error = retval;
            }
            if (retval == TOO_LOW_QUALITY || retval == TOO_LARGE_FILE) {
                skipped_count++;
            } else {
                error_count++;
            }
        }
        ++file_count;
    }

    if (error_count) {
        verbose_printf(&options, "There were errors quantizing %d file%s out of a total of %d file%s.",
                       error_count, (error_count == 1)? "" : "s", file_count, (file_count == 1)? "" : "s");
    }
    if (skipped_count) {
        verbose_printf(&options, "Skipped %d file%s out of a total of %d file%s.",
                       skipped_count, (skipped_count == 1)? "" : "s", file_count, (file_count == 1)? "" : "s");
    }
    if (!skipped_count && !error_count) {
        verbose_printf(&options, "No errors detected while quantizing %d image%s.",
                       file_count, (file_count == 1)? "" : "s");
    }

    liq_image_destroy(options.fixed_palette_image);
    liq_attr_destroy(options.liq);

    return latest_error;
}

pngquant_error pngquant_file(const char *filename, const char *outname, struct pngquant_options *options)
{
    pngquant_error retval = SUCCESS;

    verbose_printf(options, "%s:", filename);

    liq_image *input_image = NULL;
    png24_image input_image_rwpng = {};
    bool keep_input_pixels = options->skip_if_larger || (options->using_stdin && options->min_quality_limit); // original may need to be output to stdout
    if (!retval) {
        retval = read_image(options->liq, filename, options->using_stdin, &input_image_rwpng, &input_image, keep_input_pixels, options->verbose);
    }

    int quality_percent = 90; // quality on 0-100 scale, updated upon successful remap
    png8_image output_image = {};
    if (!retval) {
        verbose_printf(options, "  read %luKB file", (input_image_rwpng.file_size+1023UL)/1024UL);

#if USE_LCMS
        if (input_image_rwpng.lcms_status == ICCP) {
            verbose_printf(options, "  used embedded ICC profile to transform image to sRGB colorspace");
        } else if (input_image_rwpng.lcms_status == GAMA_CHRM) {
            verbose_printf(options, "  used gAMA and cHRM chunks to transform image to sRGB colorspace");
        } else if (input_image_rwpng.lcms_status == ICCP_WARN_GRAY) {
            verbose_printf(options, "  warning: ignored ICC profile in GRAY colorspace");
        }
#endif

        if (input_image_rwpng.gamma != 0.45455) {
            verbose_printf(options, "  corrected image from gamma %2.1f to sRGB gamma",
                           1.0/input_image_rwpng.gamma);
        }

        // when using image as source of a fixed palette the palette is extracted using regular quantization
        liq_result *remap = liq_quantize_image(options->liq, options->fixed_palette_image ? options->fixed_palette_image : input_image);

        if (remap) {
            liq_set_output_gamma(remap, 0.45455); // fixed gamma ~2.2 for the web. PNG can't store exact 1/2.2
            liq_set_dithering_level(remap, options->floyd);

            retval = prepare_output_image(remap, input_image, &output_image);
            if (!retval) {
                if (LIQ_OK != liq_write_remapped_image_rows(remap, input_image, output_image.row_pointers)) {
                    retval = OUT_OF_MEMORY_ERROR;
                }

                set_palette(remap, &output_image);

                double palette_error = liq_get_quantization_error(remap);
                if (palette_error >= 0) {
                    quality_percent = liq_get_quantization_quality(remap);
                    verbose_printf(options, "  mapped image to new colors...MSE=%.3f (Q=%d)", palette_error, quality_percent);
                }
            }
            liq_result_destroy(remap);
        } else {
            retval = TOO_LOW_QUALITY;
        }
    }

    if (!retval) {

        if (options->skip_if_larger) {
            // this is very rough approximation, but generally avoid losing more quality than is gained in file size.
            // Quality is squared, because even greater savings are needed to justify big quality loss.
            double quality = quality_percent/100.0;
            output_image.maximum_file_size = input_image_rwpng.file_size * quality*quality;
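            // e.g. a remap that reached quality_percent == 70 tolerates an
            // output of at most 0.7 * 0.7 = 49% of the input file size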
        }

        output_image.fast_compression = options->fast_compression;
        output_image.chunks = input_image_rwpng.chunks; input_image_rwpng.chunks = NULL;
        retval = write_image(&output_image, NULL, outname, options);

        if (TOO_LARGE_FILE == retval) {
            verbose_printf(options, "  file exceeded expected size of %luKB", (unsigned long)output_image.maximum_file_size/1024UL);
        }
    }

    if (TOO_LARGE_FILE == retval || (TOO_LOW_QUALITY == retval && options->using_stdin)) {
        // when outputting to stdout it'd be nasty to create 0-byte file
        // so if quality is too low, output 24-bit original
        if (keep_input_pixels) {
            pngquant_error write_retval = write_image(NULL, &input_image_rwpng, outname, options);
            if (write_retval) {
                retval = write_retval;
            }
        }
    }

    liq_image_destroy(input_image);
    rwpng_free_image24(&input_image_rwpng);
    rwpng_free_image8(&output_image);

    return retval;
}
Example #4
int
main()
    {
    int			quantum, checktick();
    int			BytesPerWord;
    int			i,k;
    ssize_t		j;
    STREAM_TYPE		scalar;
    double		t, times[4][NTIMES];
	double		*TimesByRank;
	double		t0,t1,tmin;
	int         rc, numranks, myrank;
	STREAM_TYPE	AvgError[3] = {0.0,0.0,0.0};
	STREAM_TYPE *AvgErrByRank;

    /* --- SETUP --- call MPI_Init() before anything else! --- */

    rc = MPI_Init(NULL, NULL);
	t0 = MPI_Wtime();
    if (rc != MPI_SUCCESS) {
       printf("ERROR: MPI Initialization failed with return code %d\n",rc);
       exit(1);
    }
	// if either of these fail there is something really screwed up!
	MPI_Comm_size(MPI_COMM_WORLD, &numranks);
	MPI_Comm_rank(MPI_COMM_WORLD, &myrank);

    /* --- NEW FEATURE --- distribute requested storage across MPI ranks --- */
	array_elements = STREAM_ARRAY_SIZE / numranks;		// don't worry about rounding vs truncation
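	// e.g. STREAM_ARRAY_SIZE = 10000000 on 3 ranks gives 3333333 elements per
	// rank; the remainder (1 element here) is simply never allocated or touched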
    array_alignment = 64;						// Can be modified -- provides partial support for adjusting relative alignment

	// Dynamically allocate the three arrays using "posix_memalign()"
	// NOTE that the OFFSET parameter is not used in this version of the code!
    array_bytes = array_elements * sizeof(STREAM_TYPE);
    k = posix_memalign((void **)&a, array_alignment, array_bytes);
    if (k != 0) {
        printf("Rank %d: Allocation of array a failed, return code is %d\n",myrank,k);
		MPI_Abort(MPI_COMM_WORLD, 2);
        exit(1);
    }
    k = posix_memalign((void **)&b, array_alignment, array_bytes);
    if (k != 0) {
        printf("Rank %d: Allocation of array b failed, return code is %d\n",myrank,k);
		MPI_Abort(MPI_COMM_WORLD, 2);
        exit(1);
    }
    k = posix_memalign((void **)&c, array_alignment, array_bytes);
    if (k != 0) {
        printf("Rank %d: Allocation of array c failed, return code is %d\n",myrank,k);
		MPI_Abort(MPI_COMM_WORLD, 2);
        exit(1);
    }

	// Initial informational printouts -- rank 0 handles all the output
	if (myrank == 0) {
		printf(HLINE);
		printf("STREAM version $Revision: 1.7 $\n");
		printf(HLINE);
		BytesPerWord = sizeof(STREAM_TYPE);
		printf("This system uses %d bytes per array element.\n",
		BytesPerWord);

		printf(HLINE);
#ifdef N
		printf("*****  WARNING: ******\n");
		printf("      It appears that you set the preprocessor variable N when compiling this code.\n");
		printf("      This version of the code uses the preprocesor variable STREAM_ARRAY_SIZE to control the array size\n");
		printf("      Reverting to default value of STREAM_ARRAY_SIZE=%llu\n",(unsigned long long) STREAM_ARRAY_SIZE);
		printf("*****  WARNING: ******\n");
#endif
		if (OFFSET != 0) {
			printf("*****  WARNING: ******\n");
			printf("   This version ignores the OFFSET parameter.\n");
			printf("*****  WARNING: ******\n");
		}

		printf("Total Aggregate Array size = %llu (elements)\n" , (unsigned long long) STREAM_ARRAY_SIZE);
		printf("Total Aggregate Memory per array = %.1f MiB (= %.1f GiB).\n", 
			BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0),
			BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0/1024.0));
		printf("Total Aggregate memory required = %.1f MiB (= %.1f GiB).\n",
			(3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.),
			(3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024./1024.));
		printf("Data is distributed across %d MPI ranks\n",numranks);
		printf("   Array size per MPI rank = %llu (elements)\n" , (unsigned long long) array_elements);
		printf("   Memory per array per MPI rank = %.1f MiB (= %.1f GiB).\n", 
			BytesPerWord * ( (double) array_elements / 1024.0/1024.0),
			BytesPerWord * ( (double) array_elements / 1024.0/1024.0/1024.0));
		printf("   Total memory per MPI rank = %.1f MiB (= %.1f GiB).\n",
			(3.0 * BytesPerWord) * ( (double) array_elements / 1024.0/1024.),
			(3.0 * BytesPerWord) * ( (double) array_elements / 1024.0/1024./1024.));

		printf(HLINE);
		printf("Each kernel will be executed %d times.\n", NTIMES);
		printf(" The *best* time for each kernel (excluding the first iteration)\n"); 
		printf(" will be used to compute the reported bandwidth.\n");
		printf("The SCALAR value used for this run is %f\n",SCALAR);

#ifdef _OPENMP
		printf(HLINE);
#pragma omp parallel
		{
#pragma omp master
			{
				k = omp_get_num_threads();
				printf("Number of Threads requested for each MPI rank = %i\n", k);
			}
		}
#endif

#ifdef _OPENMP
		k = 0;
#pragma omp parallel
#pragma omp atomic 
			k++;
		printf ("Number of Threads counted for rank 0 = %i\n",k);
#endif

	}

    /* --- SETUP --- initialize arrays and estimate precision of timer --- */

#pragma omp parallel for
    for (j=0; j<array_elements; j++) {
	    a[j] = 1.0;
	    b[j] = 2.0;
	    c[j] = 0.0;
	}

	// Rank 0 needs to allocate arrays to hold error data and timing data from
	// all ranks for analysis and output.
	// Allocate and instantiate the arrays here -- after the primary arrays 
	// have been instantiated -- so there is no possibility of having these 
	// auxiliary arrays mess up the NUMA placement of the primary arrays.

	if (myrank == 0) {
		// There are 3 average error values for each rank (using STREAM_TYPE).
		AvgErrByRank = (STREAM_TYPE *) malloc(3 * sizeof(STREAM_TYPE) * numranks);
		if (AvgErrByRank == NULL) {
			printf("Ooops -- allocation of arrays to collect errors on MPI rank 0 failed\n");
			MPI_Abort(MPI_COMM_WORLD, 2);
		}
		memset(AvgErrByRank,0,3*sizeof(STREAM_TYPE)*numranks);

		// There are 4*NTIMES timing values for each rank (always doubles)
		TimesByRank = (double *) malloc(4 * NTIMES * sizeof(double) * numranks);
		if (TimesByRank == NULL) {
			printf("Ooops -- allocation of arrays to collect timing data on MPI rank 0 failed\n");
			MPI_Abort(MPI_COMM_WORLD, 3);
		}
		memset(TimesByRank,0,4*NTIMES*sizeof(double)*numranks);
	}

	// Simple check for granularity of the timer being used
	if (myrank == 0) {
		printf(HLINE);

		if ((quantum = checktick()) >= 1)
			printf("Your timer granularity/precision appears to be "
				"%d microseconds.\n", quantum);
		else {
			printf("Your timer granularity appears to be "
				"less than one microsecond.\n");
			quantum = 1;
		}
	}

    /* Get initial timing estimate to compare to timer granularity. */
	/* All ranks need to run this code since it changes the values in array a */
    t = MPI_Wtime();
#pragma omp parallel for
    for (j = 0; j < array_elements; j++)
		a[j] = 2.0E0 * a[j];
    t = 1.0E6 * (MPI_Wtime() - t);

	if (myrank == 0) {
		printf("Each test below will take on the order"
		" of %d microseconds.\n", (int) t  );
		printf("   (= %d timer ticks)\n", (int) (t/quantum) );
		printf("Increase the size of the arrays if this shows that\n");
		printf("you are not getting at least 20 timer ticks per test.\n");

		printf(HLINE);

		printf("WARNING -- The above is only a rough guideline.\n");
		printf("For best results, please be sure you know the\n");
		printf("precision of your system timer.\n");
		printf(HLINE);
#ifdef VERBOSE
		t1 = MPI_Wtime();
		printf("VERBOSE: total setup time for rank 0 = %f seconds\n",t1-t0);
		printf(HLINE);
#endif
	}
    
    /*	--- MAIN LOOP --- repeat test cases NTIMES times --- */

    // This code has more barriers and timing calls than are actually needed, but
    // this should not cause a problem for arrays that are large enough to satisfy
    // the STREAM run rules.

    scalar = SCALAR;
    for (k=0; k<NTIMES; k++)
	{
		// kernel 1: Copy
		MPI_Barrier(MPI_COMM_WORLD);
		t0 = MPI_Wtime();
#ifdef TUNED
        tuned_STREAM_Copy();
#else
#pragma omp parallel for
		for (j=0; j<array_elements; j++)
			c[j] = a[j];
#endif
		MPI_Barrier(MPI_COMM_WORLD);
		t1 = MPI_Wtime();
		times[0][k] = t1 - t0;

		// kernel 2: Scale
		MPI_Barrier(MPI_COMM_WORLD);
		t0 = MPI_Wtime();
#ifdef TUNED
        tuned_STREAM_Scale(scalar);
#else
#pragma omp parallel for
		for (j=0; j<array_elements; j++)
			b[j] = scalar*c[j];
#endif
		MPI_Barrier(MPI_COMM_WORLD);
		t1 = MPI_Wtime();
		times[1][k] = t1-t0;
	
		// kernel 3: Add
		MPI_Barrier(MPI_COMM_WORLD);
		t0 = MPI_Wtime();
#ifdef TUNED
        tuned_STREAM_Add();
#else
#pragma omp parallel for
		for (j=0; j<array_elements; j++)
			c[j] = a[j]+b[j];
#endif
		MPI_Barrier(MPI_COMM_WORLD);
		t1 = MPI_Wtime();
		times[2][k] = t1-t0;
	
		// kernel 4: Triad
		MPI_Barrier(MPI_COMM_WORLD);
		t0 = MPI_Wtime();
#ifdef TUNED
        tuned_STREAM_Triad(scalar);
#else
#pragma omp parallel for
		for (j=0; j<array_elements; j++)
			a[j] = b[j]+scalar*c[j];
#endif
		MPI_Barrier(MPI_COMM_WORLD);
		t1 = MPI_Wtime();
		times[3][k] = t1-t0;
	}
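	// words moved per element per iteration: Copy and Scale touch two arrays
	// (one read, one write), Add and Triad touch three (two reads, one write);
	// the bytes[] table used in the bandwidth report below follows this count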

	t0 = MPI_Wtime();

    /*	--- SUMMARY --- */

	// Because of the MPI_Barrier() calls, the timings from any thread are equally valid. 
    // The best estimate of the maximum performance is the minimum of the "outside the barrier"
    // timings across all the MPI ranks.

	// Gather all timing data to MPI rank 0
	MPI_Gather(times, 4*NTIMES, MPI_DOUBLE, TimesByRank, 4*NTIMES, MPI_DOUBLE, 0, MPI_COMM_WORLD);

	// Rank 0 processes all timing data
	if (myrank == 0) {
		// for each iteration and each kernel, collect the minimum time across all MPI ranks
		// and overwrite the rank 0 "times" variable with the minimum so the original post-
		// processing code can still be used.
		for (k=0; k<NTIMES; k++) {
			for (j=0; j<4; j++) {
				tmin = 1.0e36;
				for (i=0; i<numranks; i++) {
					// printf("DEBUG: Timing: iter %d, kernel %lu, rank %d, tmin %f, TbyRank %f\n",k,j,i,tmin,TimesByRank[4*NTIMES*i+j*NTIMES+k]);
					tmin = MIN(tmin, TimesByRank[4*NTIMES*i+j*NTIMES+k]);
				}
				// printf("DEBUG: Final Timing: iter %d, kernel %lu, final tmin %f\n",k,j,tmin);
				times[j][k] = tmin;
			}
		}

	// Back to the original code, but now using the minimum global timing across all ranks
		for (k=1; k<NTIMES; k++) /* note -- skip first iteration */
		{
		for (j=0; j<4; j++)
			{
			avgtime[j] = avgtime[j] + times[j][k];
			mintime[j] = MIN(mintime[j], times[j][k]);
			maxtime[j] = MAX(maxtime[j], times[j][k]);
			}
		}
    
		// note that "bytes[j]" is the aggregate array size, so no "numranks" is needed here
		printf("Function    Best Rate MB/s  Avg time     Min time     Max time\n");
		for (j=0; j<4; j++) {
			avgtime[j] = avgtime[j]/(double)(NTIMES-1);

			printf("%s%11.1f  %11.6f  %11.6f  %11.6f\n", label[j],
			   1.0E-06 * bytes[j]/mintime[j],
			   avgtime[j],
			   mintime[j],
			   maxtime[j]);
		}
		printf(HLINE);
	}

    /* --- Every Rank Checks its Results --- */
#ifdef INJECTERROR
	a[11] = 100.0 * a[11];
#endif
	computeSTREAMerrors(&AvgError[0], &AvgError[1], &AvgError[2]);
	/* --- Collect the Average Errors for Each Array on Rank 0 --- */
	MPI_Gather(AvgError, 3, MPI_DOUBLE, AvgErrByRank, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD);

	/* -- Combined averaged errors and report on Rank 0 only --- */
	if (myrank == 0) {
#ifdef VERBOSE
		for (k=0; k<numranks; k++) {
			printf("VERBOSE: rank %d, AvgErrors %e %e %e\n",k,AvgErrByRank[3*k+0],
				AvgErrByRank[3*k+1],AvgErrByRank[3*k+2]);
		}
#endif
		checkSTREAMresults(AvgErrByRank,numranks);
		printf(HLINE);
	}

#ifdef VERBOSE
	if (myrank == 0) {
		t1 = MPI_Wtime();
		printf("VERBOSE: total shutdown time for rank %d = %f seconds\n",myrank,t1-t0);
	}
#endif

	free(a);
	free(b);
	free(c);
	if (myrank == 0) {
		free(TimesByRank);
		free(AvgErrByRank);
	}

    MPI_Finalize();
	return(0);
}
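
A note on the summary step above: gathering every rank's timings to rank 0 and taking the element-wise minimum in a loop can also be written as a single reduction. A minimal sketch (the helper name is hypothetical, not part of the benchmark):

#include <mpi.h>

/* Element-wise minimum of each timing slot across all ranks, delivered to
 * rank 0 -- equivalent to the MPI_Gather plus manual MIN loops above.
 * n would be 4*NTIMES in the benchmark. */
static void reduce_min_times(const double *local_times, double *min_times, int n)
{
    MPI_Reduce(local_times, min_times, n, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);
}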
Exemplo n.º 5
int main(int argc, char *argv[]) {
	//set number of threads here 
	omp_set_num_threads(16);

	int i, j, k;
	int nthreads, tid, chunk;
	double sum;
	double **A, **B, **C;
	
/***** Setting up matrix *****/
	// allocating matrix rows
	A = (double**) malloc(M * sizeof(double* ));
	B = (double**) malloc(M * sizeof(double* ));
	C = (double**) malloc(M * sizeof(double* ));	
	// allocating matrix columns
	for (i = 0; i < M; i++) 
	{
		A[i]= (double*) malloc(M * sizeof(double)); 
		B[i]= (double*) malloc(M * sizeof(double)); 
		C[i]= (double*) malloc(M * sizeof(double));
	}
	// adding values to matrix
	
	for (i = 0; i < M; i++) 
	{
		for (j = 0; j < M; j++) 
		{ 
			A[i][j] = j*1;
			B[i][j] = i*j+2; 
			C[i][j] = j-i*2;
		}
	}

	double start, end;
	start = omp_get_wtime();
/***** Matrix multiplication *****/	
	chunk = CHUNKSIZE;
	#pragma omp parallel shared(A,B,C, nthreads, chunk) private(i, j, k, tid, sum) // spawns threads; the loop indices and the accumulator must be private to avoid data races
	{
		tid = omp_get_thread_num();
		if(tid == 0)
		{
			nthreads = omp_get_num_threads();
			printf("Number of threads = %d\n", nthreads);
		}
		printf("Thread %d starting...\n", tid);
	
		// where to put the parallel for loops? outer, inner, innermost?
		
		//#pragma omp for schedule(dynamic,chunk) // divides loop iterations
		#pragma omp for schedule(runtime) // divides loop
		//#pragma omp parallel for schedule(runtime) // spawns threads and divides loop
		//runtime - schedule is convenient for experimenting with different schedules and chunk sizes without having to modify and recompile the program
		
		for (i = 0; i < M; i++) 
		{
			//#pragma omp for schedule(dynamic,chunk)
			for (j = 0; j < M; j++)
			{
				sum = 0;
				
				//#pragma omp for schedule(dynamic,chunk)
				//#pragma omp parallel for schedule(dynamic,chunk) // spawns threads and divides loop
				// NOTE: an "omp for" may not be nested directly inside the "omp for"
				// above (nested work-sharing is non-conforming), so the k loop runs
				// sequentially within each thread.
				for (k=0; k < M; k++) 
				{
					sum += A[i][k]*B[k][j];
				}
				C[i][j] = sum;
			}
		}
	
	}

	end = omp_get_wtime();

	printf("Time of computation: %f\n", end-start);
}
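
For comparison, a minimal race-free sketch of the same multiplication: one parallel-for over the output elements, with every loop index and the accumulator private by construction. This is an alternative formulation, not the code above:

#include <omp.h>

void matmul_sketch(int n, double **A, double **B, double **C)
{
	// collapse(2) distributes the whole i,j iteration space; variables
	// declared inside the loops are automatically private to each thread
	#pragma omp parallel for collapse(2) schedule(runtime)
	for (int i = 0; i < n; i++) {
		for (int j = 0; j < n; j++) {
			double sum = 0.0;
			for (int k = 0; k < n; k++)
				sum += A[i][k] * B[k][j];
			C[i][j] = sum;
		}
	}
}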
Exemplo n.º 6
    static boost::tuple< boost::shared_ptr<Matrix>, boost::shared_ptr<Matrix> >
    transfer_operators(const Matrix &A, params &prm)
    {
        typedef typename backend::value_type<Matrix>::type value_type;
        typedef typename math::scalar_of<value_type>::type scalar_type;

        const size_t n = rows(A);

        BOOST_AUTO(Aptr, A.ptr_data());
        BOOST_AUTO(Acol, A.col_data());
        BOOST_AUTO(Aval, A.val_data());

        TIC("aggregates");
        Aggregates aggr(A, prm.aggr, prm.nullspace.cols);
        prm.aggr.eps_strong *= 0.5;
        TOC("aggregates");

        TIC("interpolation");
        boost::shared_ptr<Matrix> P_tent = tentative_prolongation<Matrix>(
                n, aggr.count, aggr.id, prm.nullspace, prm.aggr.block_size
                );

        boost::shared_ptr<Matrix> P = boost::make_shared<Matrix>();
        P->nrows = rows(*P_tent);
        P->ncols = cols(*P_tent);

        P->ptr.resize(n + 1, 0);

#pragma omp parallel
        {
            std::vector<ptrdiff_t> marker(P->ncols, -1);

#ifdef _OPENMP
            int nt  = omp_get_num_threads();
            int tid = omp_get_thread_num();

            ptrdiff_t chunk_size  = (n + nt - 1) / nt;
            ptrdiff_t chunk_start = tid * chunk_size;
            ptrdiff_t chunk_end   = std::min<ptrdiff_t>(n, chunk_start + chunk_size);
#else
            ptrdiff_t chunk_start = 0;
            ptrdiff_t chunk_end   = n;
#endif

            // Count number of entries in P.
            for(ptrdiff_t i = chunk_start; i < chunk_end; ++i) {
                for(ptrdiff_t ja = Aptr[i], ea = Aptr[i+1]; ja < ea; ++ja) {
                    ptrdiff_t ca = Acol[ja];

                    // Skip weak off-diagonal connections.
                    if (ca != i && !aggr.strong_connection[ja])
                        continue;

                    for(ptrdiff_t jp = P_tent->ptr[ca], ep = P_tent->ptr[ca+1]; jp < ep; ++jp) {
                        ptrdiff_t cp = P_tent->col[jp];

                        if (marker[cp] != i) {
                            marker[cp] = i;
                            ++( P->ptr[i + 1] );
                        }
                    }
                }
            }

            boost::fill(marker, -1);

#pragma omp barrier
#pragma omp single
            {
                boost::partial_sum(P->ptr, P->ptr.begin());
                P->col.resize(P->ptr.back());
                P->val.resize(P->ptr.back());
            }

            // Fill the interpolation matrix.
            for(ptrdiff_t i = chunk_start; i < chunk_end; ++i) {

                // Diagonal of the filtered matrix is the original matrix
                // diagonal minus its weak connections.
                value_type dia = math::zero<value_type>();
                for(ptrdiff_t j = Aptr[i], e = Aptr[i+1]; j < e; ++j) {
                    if (Acol[j] == i)
                        dia += Aval[j];
                    else if (!aggr.strong_connection[j])
                        dia -= Aval[j];
                }
                dia = math::inverse(dia);

                ptrdiff_t row_beg = P->ptr[i];
                ptrdiff_t row_end = row_beg;
                for(ptrdiff_t ja = Aptr[i], ea = Aptr[i + 1]; ja < ea; ++ja) {
                    ptrdiff_t ca = Acol[ja];

                    // Skip weak off-diagonal connections.
                    if (ca != i && !aggr.strong_connection[ja]) continue;

                    value_type va = (ca == i)
                        ? static_cast<value_type>(static_cast<scalar_type>(1 - prm.relax) * math::identity<value_type>())
                        : static_cast<value_type>(static_cast<scalar_type>(-prm.relax) * dia * Aval[ja]);

                    for(ptrdiff_t jp = P_tent->ptr[ca], ep = P_tent->ptr[ca+1]; jp < ep; ++jp) {
                        ptrdiff_t cp = P_tent->col[jp];
                        value_type vp = P_tent->val[jp];

                        if (marker[cp] < row_beg) {
                            marker[cp] = row_end;
                            P->col[row_end] = cp;
                            P->val[row_end] = va * vp;
                            ++row_end;
                        } else {
                            P->val[ marker[cp] ] += va * vp;
                        }
                    }
                }
            }
        }
        TOC("interpolation");

        boost::shared_ptr<Matrix> R = boost::make_shared<Matrix>();
        *R = transpose(*P);

        if (prm.nullspace.cols > 0)
            prm.aggr.block_size = prm.nullspace.cols;

        return boost::make_tuple(P, R);
    }
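
The manual chunking above (divide n rows into one contiguous block per thread, clamping the last block) is a reusable pattern; a minimal sketch, assuming it is called from inside an active parallel region:

#include <stddef.h>
#ifdef _OPENMP
#include <omp.h>
#endif

/* Compute this thread's half-open row range [*beg, *end) out of n rows,
 * using ceiling division so every row is covered exactly once. */
static void chunk_bounds(ptrdiff_t n, ptrdiff_t *beg, ptrdiff_t *end)
{
#ifdef _OPENMP
    ptrdiff_t nt  = omp_get_num_threads();
    ptrdiff_t tid = omp_get_thread_num();
    ptrdiff_t chunk = (n + nt - 1) / nt;   /* ceiling division */
    *beg = tid * chunk;
    *end = (*beg + chunk < n) ? *beg + chunk : n;
#else
    *beg = 0;
    *end = n;
#endif
}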
Exemplo n.º 7
void Voronoi_Charge()
{
  double time0;
  int Mc_AN,Gc_AN,Mh_AN,h_AN,Gh_AN;
  int Cwan,GNc,GRc,Nog,Nh,MN,spin;
  double x,y,z,dx,dy,dz,fw;
  double Cxyz[4];
  double FuzzyW,sum0,sum1;
  double magx,magy,magz;
  double tmagx,tmagy,tmagz;
  double tden,tmag,theta,phi,rho,mag;
  double den0,den1;
  double VC_S,T_VC0,T_VC1;
  double **VC;
  double TStime,TEtime;
  double S_coordinate[3];
  int numprocs,myid,tag=999,ID;
  FILE *fp_VC;
  char file_VC[YOUSO10];
  char buf[fp_bsize];          /* setvbuf */

  MPI_Status stat;
  MPI_Request request;

  /* for OpenMP */
  int OMPID,Nthrds,Nprocs;

  MPI_Comm_size(mpi_comm_level1,&numprocs);
  MPI_Comm_rank(mpi_comm_level1,&myid);

  dtime(&TStime);
  if (myid==Host_ID){ printf("\n<Voronoi_Charge>  calculate Voronoi charges\n"); fflush(stdout); }

  /*****************************************************
    allocation of array
  *****************************************************/

  VC = (double**)malloc(sizeof(double*)*4);
  for (spin=0; spin<4; spin++){
    VC[spin] = (double*)malloc(sizeof(double)*(atomnum+1));
  }

  /*****************************************************
            calculation of Voronoi charge
  *****************************************************/

#pragma omp parallel shared(S_coordinate,GridVol,VC,Density_Grid,SpinP_switch,MGridListAtom,atv,CellListAtom,GridListAtom,NumOLG,WhatSpecies,M2G,Matomnum) private(OMPID,Nthrds,Nprocs,Mc_AN,Gc_AN,Cwan,sum0,sum1,tden,tmagx,tmagy,tmagz,Nog,GNc,GRc,Cxyz,x,y,z,FuzzyW,MN,den0,den1,theta,phi,rho,mag,magx,magy,magz,tmag)
  {

    /* get info. on OpenMP */ 

    OMPID = omp_get_thread_num();
    Nthrds = omp_get_num_threads();
    Nprocs = omp_get_num_procs();

    for (Mc_AN=1+OMPID; Mc_AN<=Matomnum; Mc_AN+=Nthrds){

      Gc_AN = M2G[Mc_AN];    
      Cwan = WhatSpecies[Gc_AN];

      sum0 = 0.0;
      sum1 = 0.0;

      tden  = 0.0;
      tmagx = 0.0;
      tmagy = 0.0;
      tmagz = 0.0;

      for (Nog=0; Nog<NumOLG[Mc_AN][0]; Nog++){

	/* calculate fuzzy weight */

	GNc = GridListAtom[Mc_AN][Nog];
	GRc = CellListAtom[Mc_AN][Nog];

	Get_Grid_XYZ(GNc,Cxyz);
	x = Cxyz[1] + atv[GRc][1];
	y = Cxyz[2] + atv[GRc][2]; 
	z = Cxyz[3] + atv[GRc][3];
	FuzzyW = Fuzzy_Weight(Gc_AN,Mc_AN,0,x,y,z);

	/* find charge */

	MN = MGridListAtom[Mc_AN][Nog];

	if (SpinP_switch<=1){

	  den0  = Density_Grid[0][MN];
	  den1  = Density_Grid[1][MN];

	  /* sum density */
	  sum0 += den0*FuzzyW; 
	  sum1 += den1*FuzzyW; 
	}

	else{

	  den0  = Density_Grid[0][MN];
	  den1  = Density_Grid[1][MN];
	  theta = Density_Grid[2][MN];
	  phi   = Density_Grid[3][MN];

	  rho = den0 + den1;
	  mag = den0 - den1;
	  magx = mag*sin(theta)*cos(phi);
	  magy = mag*sin(theta)*sin(phi);
	  magz = mag*cos(theta);

	  /* sum density */
 
	  tden  +=  rho*FuzzyW; 
	  tmagx += magx*FuzzyW; 
	  tmagy += magy*FuzzyW; 
	  tmagz += magz*FuzzyW; 
	}

      }

      if (SpinP_switch<=1){
	VC[0][Gc_AN] = sum0*GridVol; 
	VC[1][Gc_AN] = sum1*GridVol;
      }

      else {

	tmag = sqrt(tmagx*tmagx + tmagy*tmagy + tmagz*tmagz); 
	sum0 = 0.5*(tden + tmag);
	sum1 = 0.5*(tden - tmag);

	xyz2spherical( tmagx,tmagy,tmagz, 0.0,0.0,0.0, S_coordinate ); 

	VC[0][Gc_AN] = sum0*GridVol; 
	VC[1][Gc_AN] = sum1*GridVol;
	VC[2][Gc_AN] = S_coordinate[1];
	VC[3][Gc_AN] = S_coordinate[2];
      }

    } /* Mc_AN */

  } /* #pragma omp parallel */

  /*****************************************************
    MPI VC
  *****************************************************/

  for (Gc_AN=1; Gc_AN<=atomnum; Gc_AN++){
    ID = G2ID[Gc_AN];
    MPI_Bcast(&VC[0][Gc_AN], 1, MPI_DOUBLE, ID, mpi_comm_level1);
  }

  for (Gc_AN=1; Gc_AN<=atomnum; Gc_AN++){
    ID = G2ID[Gc_AN];
    MPI_Bcast(&VC[1][Gc_AN], 1, MPI_DOUBLE, ID, mpi_comm_level1);
  }

  if (SpinP_switch==3){

    for (Gc_AN=1; Gc_AN<=atomnum; Gc_AN++){
      ID = G2ID[Gc_AN];
      MPI_Bcast(&VC[2][Gc_AN], 1, MPI_DOUBLE, ID, mpi_comm_level1);
    }

    for (Gc_AN=1; Gc_AN<=atomnum; Gc_AN++){
      ID = G2ID[Gc_AN];
      MPI_Bcast(&VC[3][Gc_AN], 1, MPI_DOUBLE, ID, mpi_comm_level1);
    }
  }

  VC_S = 0.0;
  T_VC0 = 0.0;
  T_VC1 = 0.0;
  for (Gc_AN=1; Gc_AN<=atomnum; Gc_AN++){
    VC_S += VC[0][Gc_AN] - VC[1][Gc_AN];  
    T_VC0 += VC[0][Gc_AN];
    T_VC1 += VC[1][Gc_AN];
  }

  /****************************************
   file, *.VC
  ****************************************/

  if ( myid==Host_ID ){

    sprintf(file_VC,"%s%s.VC",filepath,filename);

    if ((fp_VC = fopen(file_VC,"w")) != NULL){

#ifdef xt3
      setvbuf(fp_VC,buf,_IOFBF,fp_bsize);  /* setvbuf */
#endif

      fprintf(fp_VC,"\n");
      fprintf(fp_VC,"***********************************************************\n");
      fprintf(fp_VC,"***********************************************************\n");
      fprintf(fp_VC,"                     Voronoi charges                       \n");
      fprintf(fp_VC,"***********************************************************\n");
      fprintf(fp_VC,"***********************************************************\n\n");

      fprintf(fp_VC,"  Sum of Voronoi charges for up    = %15.12f\n", T_VC0);
      fprintf(fp_VC,"  Sum of Voronoi charges for down  = %15.12f\n", T_VC1);
      fprintf(fp_VC,"  Sum of Voronoi charges for total = %15.12f\n\n",
              T_VC0+T_VC1);

      fprintf(fp_VC,"  Total spin magnetic moment (muB) by Voronoi charges  = %15.12f\n\n",VC_S);

      if (SpinP_switch<=1){

	fprintf(fp_VC,"                     Up spin      Down spin     Sum           Diff\n");
	for (Gc_AN=1; Gc_AN<=atomnum; Gc_AN++){
	  fprintf(fp_VC,"       Atom=%4d  %12.9f %12.9f  %12.9f  %12.9f\n",
		  Gc_AN, VC[0][Gc_AN], VC[1][Gc_AN],
		  VC[0][Gc_AN] + VC[1][Gc_AN],
		  VC[0][Gc_AN] - VC[1][Gc_AN]);
	}
      }

      else{
	fprintf(fp_VC,"                     Up spin      Down spin     Sum           Diff        Theta(Deg)   Phi(Deg)\n");
	for (Gc_AN=1; Gc_AN<=atomnum; Gc_AN++){
	  fprintf(fp_VC,"       Atom=%4d  %12.9f %12.9f  %12.9f  %12.9f  %8.4f    %8.4f\n",
		  Gc_AN, VC[0][Gc_AN], VC[1][Gc_AN],
		  VC[0][Gc_AN] + VC[1][Gc_AN],
		  VC[0][Gc_AN] - VC[1][Gc_AN],
                  VC[2][Gc_AN]/PI*180.0,VC[3][Gc_AN]/PI*180.0);
	}
      }

      fclose(fp_VC);
    }
    else{
      printf("Failure of saving the VC file.\n");
    }

  }

  /*****************************************************
    freeing of array
  *****************************************************/

  for (spin=0; spin<4; spin++){
    free(VC[spin]);
  }
  free(VC);

  /* for time */
  dtime(&TEtime);
  time0 = TEtime - TStime;

}
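
The atom loop above uses a hand-rolled cyclic distribution (thread t takes indices t+1, t+1+Nthrds, ...) instead of an omp-for directive; a minimal stand-alone sketch of the same idiom:

#include <omp.h>

void cyclic_distribution(int natoms)
{
    #pragma omp parallel
    {
        int tid = omp_get_thread_num();
        int nt  = omp_get_num_threads();
        /* equivalent to "#pragma omp for schedule(static,1)" on a 1..natoms loop */
        for (int i = 1 + tid; i <= natoms; i += nt) {
            /* per-atom work would go here */
        }
    }
}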
Exemplo n.º 8
/* 
 * ----------------------------4th Week Progress--------------------------------
 * Find all the valid collisions from the data file given as the third argument.
 * First argument is a limitation on how many sets the function will examine.
 * Second argument is a limitation on how many nanoseconds the function will be
 * running. If the time limit is exceeded, the function will stop examining the
 * file.
 * Fourth argument is the number of threads MPI should utilize.
 * This function uses OMP AND OMPI to go through the examination.
 * -----------------------------------------------------------------------------
 */
int ompi_parallel_estimation(long num_coords, int max_nsecs, char *file_name, int threads) {
	// Check if file exists. If not, print a message and return !0.
	if(access(file_name, F_OK) == -1) {
		printf("File was not found!\n");
		return(FAILURE);
	}
	
	// Initialize the MPI. After this line, multiple procs run at the same time.
	MPI_Init(NULL, NULL);

    // Get the total number of processes that are running.
    int total_procs;
    MPI_Comm_size(MPI_COMM_WORLD, &total_procs);    

    // Get the rank of the process
    int proc_num;
    MPI_Comm_rank(MPI_COMM_WORLD, &proc_num);
	if(proc_num>total_procs)
	{
		MPI_Finalize();
	}
    
    long *succesful_col;
	if(proc_num == 0)
		 succesful_col = malloc(sizeof(long) * total_procs); 

    // Calculate the total number of coordinates that are stored in the file.
    // The number is: NUM = FILE_SIZE / SIZE_OF_A_FLOAT / 3
    // A coordinate is a set of 3 floats.
    // The function will check exactly (EXAM_COORDS) collisions from the file.
    long exam_coords = fsize(file_name) / sizeof(float) / 3;
    
    // If there is a limitation on how many collisions the function should go
	// through, change the value of exam_coords to that limitation.
    if(num_coords >= 0 && num_coords < exam_coords)
        exam_coords = num_coords;
        
    // If the limitation exceeds the total number of coordinates stored in the
    // data file, display a message. The total number of collisions the function
    // will go through remains the total number of coordinates available in the
    // file.
    if(num_coords > exam_coords)
        printf("You have asked for more lines than the ones available. All the lines are going to be examined.\n");
    
    // Set which coords each PROCESS will go through. We will repeat the same
    // division when we distribute these coords to every THREAD that is running
    // within every process. 
    long coord_from = (int)exam_coords/total_procs * proc_num;
    long coord_to = (int)exam_coords/total_procs * (proc_num+1) - 1;
    if(proc_num+1 == total_procs)
        coord_to += exam_coords % total_procs;

	// Create a variable where the total number of valid collisions from all the
	// threads will be added to.
	long sum = 0;
	
	// If there is a limitation on how many threads MPI should use (-1 means all
	// available threads), apply it.
	if(threads >= 1)
		omp_set_num_threads(threads);
		
	// Start using OpenMP.
	#pragma omp parallel shared(file_name)
	{
		// Get the total number of threads the OMP is running.
		int total_threads = omp_get_num_threads();
		
		// Get the ID of this particular thread.
		int tid = omp_get_thread_num();
		
		// Each thread opens its own pointer to the data file.
		FILE *file_ptr = fopen(file_name, "rb");
		
		// Same as when we divided the sets among the processes, but this time
		// we are doing it for every thread within EACH process.
		long coll_from = (int)(coord_to-coord_from+1)/total_threads * tid;
		long coll_to = (int)(coord_to-coord_from+1)/total_threads * (tid+1) - 1;
		if(tid+1 == total_threads)
			coll_to += (coord_to-coord_from+1) % total_threads;

		// Skip some bytes from the data file, in order to get to the set where
		// the thread must start examining from
		fseek(file_ptr, 3*(coord_from+coll_from)*sizeof(float), SEEK_SET);
		
		long coords_read;
		long valid_collisions=0;

		// The timespecs will keep track of the time, if a limitation has been
		// set.
    	struct timespec start, current;
    
		// Before the start of the examination, get the current time.
		clock_gettime(CLOCK_MONOTONIC, &start);

		// The function will check all the collisions, increasing its sum
		// (valid_collisions) every time a collision is within the limits
		// defined in the start of the code.
		// Every time it goes through one set, if a limit has been set on how
		// many nanoseconds the function should run, sample the current time,
		// take the difference from the timestamp when the examination started
		// running, and stop the loop if the time limit has been exceeded.
		for(coords_read=coll_from; coords_read<coll_to+1; coords_read++) {
			if(process_coords(file_ptr)==0)
				valid_collisions++;
			if(max_nsecs!=-1&&calculate_difference(start,current,0)>max_nsecs){
				clock_gettime(CLOCK_MONOTONIC, &current);
				printf("Reached maximum time limit.\n");
				break;
			}
		}

		// Each thread closes its file pointer.
		fclose(file_ptr);

		#pragma omp barrier
		
		// Finally, add all the valid collision numbers, each thread has found
		// to the shared variable "sum".
		#pragma omp  for reduction(+:sum)
			for(tid=0;tid<total_threads;tid++)
		  		sum += valid_collisions; //sums the total collisions of all threads
	}
	
	// After each process has calculated how many valid collisions there are in
	// its own section of data, the MPI adds all the different result into one
	// shared variable (final_count) with the help of MPI_Gather.
	MPI_Gather(&sum, 1, MPI_LONG, succesful_col, 1, MPI_LONG, 0, MPI_COMM_WORLD);
	if(proc_num == 0){
		long final_count = 0;
		for(int i=0; i<total_procs; i++){
			final_count+=succesful_col[i];
		}
		printf("MPI Parallel Examine -> Valid collisions: %ld", final_count); 
	}

	MPI_Finalize();
	
	return(SUCCESS);
}
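
A sketch of a simpler formulation of the counting above (is_valid() is a hypothetical stand-in for process_coords()): an OpenMP reduction on the examination loop itself replaces the one-iteration-per-thread summing loop, and MPI_Reduce replaces the gather-then-add on rank 0.

#include <mpi.h>

extern int is_valid(long i);   /* assumed validity test */

long count_valid(long from, long to)
{
    long sum = 0;
    #pragma omp parallel for reduction(+:sum)
    for (long i = from; i < to; i++)
        if (is_valid(i))
            sum++;

    long total = 0;
    MPI_Reduce(&sum, &total, 1, MPI_LONG, MPI_SUM, 0, MPI_COMM_WORLD);
    return total;   /* meaningful on rank 0 only */
}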
Exemplo n.º 9
/* 
 * ----------------------------2nd Week Progress--------------------------------
 * Find all the valid collisions from the data file given as the third argument.
 * First argument is a limitation on how many sets the function will examine.
 * Second argument is a limitation on how many nanoseconds the function will be
 * running. If the time limit is exceeded, the function will stop examining the
 * file.
 * Fourth argument is the number of threads MPI should utilize.
 * This function uses OMP, but NOT OMPI to go through the examination.
 * -----------------------------------------------------------------------------
 */
int parallel_estimation(long num_coords, int max_nsecs, char *file_name, int threads) {
	// Check if file exists. If not, print a message and return !0.
	if(access(file_name, F_OK) == -1) {
		printf("File was not found!\n");
		return(FAILURE);
	}

	// Create a variable where the total number of valid collisions from all the
	// threads will be added to.
	long sum = 0;
	
	// If there is a limitation on how many threads MPI should use (-1 means all
	// available threads), apply it.
	if(threads >= 1)
		omp_set_num_threads(threads);
		
	// Start using OpenMP.
	#pragma omp parallel shared(file_name)
	{
		// Calculate the total number of coordinates that are stored in the 
		// file.
    	// The number is: NUM = FILE_SIZE / SIZE_OF_A_FLOAT / 3
    	// A coordinate is a set of 3 floats.
    	// The function will check exactly (EXAM_COORDS) collisions from the
    	// file.
		long exam_coords = fsize(file_name) / sizeof(float) / 3;
		
		// If there is a limitation on how many collisions the function should
		// go through, change the value of exam_coords to that limitation.
		if(num_coords >=0 && num_coords < exam_coords)
			exam_coords = num_coords;
			
		// If the limitation exceeds the total number of coordinates stored in the
		// data file, display a message. The total number of collisions the function
		// will go through remains the total number of coordinates available in the
		// file.
		if(num_coords > exam_coords)
			printf("You have asked for more lines than the ones available. All the lines are going to be examined.\n");

		// Get the total number of threads the OMP is running.
		int total_threads = omp_get_num_threads();
		
		// Get the ID of this particular thread.
		int tid = omp_get_thread_num();
		
		// Each thread opens its own pointer to the data file.
		FILE *file_ptr = fopen(file_name, "rb");
		
		// Set which coords each thread will process.
		long coord_from = (int)exam_coords/total_threads * tid;
		long coord_to = (int)exam_coords/total_threads * (tid+1) - 1;
		if(tid+1 == total_threads)
			coord_to += exam_coords % total_threads;

		// Skip some bytes from the data file, in order to get to the set where
		// the thread must start examining from.
		fseek(file_ptr, 3*coord_from*sizeof(float), SEEK_SET);
		
		long coords_read;
		long valid_collisions=0;

		// The timespecs will keep track of the time, if a limitation has been
		// set.
    	struct timespec start, current;
    
		// Before the start of the examination, get the current time.
		clock_gettime(CLOCK_MONOTONIC, &start);

		// The function will check all the collisions, increasing its sum
		// (valid_collisions) every time a collision is within the limits
		// defined in the start of the code.
		// Every time it goes through one set, if a limit has been set on how
		// many nanoseconds the function should run, sample the current time,
		// take the difference from the timestamp when the examination started
		// running, and stop the loop if the time limit has been exceeded.
		for(coords_read=coord_from; coords_read<coord_to+1; coords_read++) {
			if(process_coords(file_ptr)==0)
				valid_collisions++;
			if(max_nsecs!=-1&&calculate_difference(start,current,0)>max_nsecs){
				clock_gettime(CLOCK_MONOTONIC, &current);
				printf("Reached maximum time limit.\n");
				break;
			}
		}

		// Each thread closes its file pointer.
		fclose(file_ptr);
		
		#pragma omp barrier
		
		// Finally, add all the valid collision numbers, each thread has found
		// to the shared variable "sum".
		#pragma omp  for reduction(+:sum)
			for(tid=0;tid<total_threads;tid++)
		  		sum+=valid_collisions;
		
		#pragma omp master	
			printf("Non-MPI Parallel Examine -> Valid collisions: %ld\n", sum);
	}
	return(SUCCESS);
}
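
The time-limit check in both estimation functions is easiest to get right when the clock is sampled before the comparison, so a stale timestamp is never read. A minimal sketch, assuming POSIX clock_gettime():

#include <time.h>

static long elapsed_ns(struct timespec a, struct timespec b)
{
    return (b.tv_sec - a.tv_sec) * 1000000000L + (b.tv_nsec - a.tv_nsec);
}

static int timed_out(struct timespec start, long max_nsecs)
{
    struct timespec now;
    clock_gettime(CLOCK_MONOTONIC, &now);   /* sample first, then compare */
    return max_nsecs != -1 && elapsed_ns(start, now) > max_nsecs;
}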
Exemplo n.º 10
JNIEXPORT jint JNICALL Java_edu_berkeley_bid_UTILS_getnumthreads
(JNIEnv * env, jobject calling_obj) {
  return omp_get_num_threads();
}
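
Note that omp_get_num_threads() returns 1 when called outside a parallel region, so the binding above always reports one thread. A query API usually wants omp_get_max_threads() instead; a sketch (not the original binding):

#include <omp.h>

int get_thread_count(void)
{
    /* upper bound on the team size of the next parallel region */
    return omp_get_max_threads();
}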
Exemplo n.º 11
int main(int argc, char * argv[])    
{  
    omp_set_nested(10);     // any non-zero value enables nested parallelism  
#pragma omp parallel num_threads(2)  
    {  
        printf("ID: %d, Max threads: %d, Num threads: %d \n",omp_get_thread_num(), omp_get_max_threads(), omp_get_num_threads());  
#pragma omp parallel num_threads(5)  
        printf("Nested, ID: %d, Max threads: %d, Num threads: %d \n",omp_get_thread_num(), omp_get_max_threads(), omp_get_num_threads());  
  }  
	return 0;    
}  
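
omp_set_nested() is deprecated since OpenMP 5.0; the modern spelling of the same experiment sets a limit on active nesting levels. A minimal sketch:

#include <omp.h>
#include <stdio.h>

int main(void)
{
    omp_set_max_active_levels(2);   /* allow two nested levels of parallelism */
    #pragma omp parallel num_threads(2)
    {
        #pragma omp parallel num_threads(5)
        printf("outer thread %d, inner thread %d\n",
               omp_get_ancestor_thread_num(1), omp_get_thread_num());
    }
    return 0;
}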
Exemplo n.º 12
/*!
* \return	New filtered object with new values or NULL on error.
* \ingroup	WlzValuesFilters
* \brief	Applies a separable filter to the given object using the given
* 		convolution kernels.
* \param	inObj			Input 2 or 3D spatial domain object
* 					to be filtered which must have scalar
* 					values.
* \param	cBufSz			Convolution kernel sizes (sz), each
* 					kernel buffer is sized (2 * sz) + 1
* 					with the centre indexed sz into the
* 					buffer.
* \param	cBuf			Convolution kernel buffers.
* \param	direc			Set to non-zero in directions for which
* 					the filter is to be applied.
* \param	gType			Required return object grey type.
* 					Passing in WLZ_GREY_ERROR will
* 					request the given input object's grey
* 					type.
* \param	pad			Type of padding.
* \param	padVal			Padding value, only used when
* 					pad == ALG_PAD_VALUE.
* \param	dstErr			Destination error pointer may be NULL.
*/
WlzObject			*WlzSepFilter(WlzObject *inObj,
				  WlzIVertex3 cBufSz,
				  double *cBuf[],
				  WlzIVertex3 direc,
				  WlzGreyType gType,
				  AlgPadType pad,
				  double padVal,
				  WlzErrorNum *dstErr)
{
  int		dim = 0,
  		vSz = 0,
  		nThr = 1;
  double	**iBuf = NULL,
  		**rBuf = NULL;
  double	*vBuf = NULL;
  WlzObject	*rnObj = NULL;
  WlzIVertex3	vBufSz = {0};
  WlzIBox3	bBox = {0};
  WlzErrorNum	errNum = WLZ_ERR_NONE;

#ifdef _OPENMP
#pragma omp parallel
  {
#pragma omp master
    {
      nThr = omp_get_num_threads();
    }
  }
#endif
  if(inObj == NULL)
  {
    errNum = WLZ_ERR_OBJECT_NULL;
  }
  else if(inObj->domain.core == NULL)
  {
    errNum = WLZ_ERR_DOMAIN_NULL;
  }
  else if(inObj->values.core == NULL)
  {
    errNum = WLZ_ERR_VALUES_NULL;
  }
  else
  {
    switch(inObj->type)
    {
      case WLZ_2D_DOMAINOBJ:
        dim = 2;
	break;
      case WLZ_3D_DOMAINOBJ:
        dim = 3;
        break;
      default:
        errNum = WLZ_ERR_OBJECT_TYPE;
	break;
    }
  }
  if((errNum == WLZ_ERR_NONE) && (gType == WLZ_GREY_ERROR))
  {
    gType = WlzGreyTypeFromObj(inObj, &errNum);
  }
  if(errNum == WLZ_ERR_NONE)
  {
    switch(gType)
    {
      case WLZ_GREY_INT:    /* FALLTHROUGH */
      case WLZ_GREY_SHORT:  /* FALLTHROUGH */
      case WLZ_GREY_UBYTE:  /* FALLTHROUGH */
      case WLZ_GREY_FLOAT:  /* FALLTHROUGH */
      case WLZ_GREY_DOUBLE:
	break;
      default:
	errNum = WLZ_ERR_GREY_TYPE;
	break;
    }
  }
  if(errNum == WLZ_ERR_NONE)
  {
    bBox = WlzBoundingBox3I(inObj, &errNum);
    if(errNum == WLZ_ERR_NONE)
    {
      vBufSz.vtX = bBox.xMax - bBox.xMin + 1;
      vBufSz.vtY = bBox.yMax - bBox.yMin + 1;
      if(dim == 3)
      {
	vBufSz.vtZ = bBox.zMax - bBox.zMin + 1;
      }
    }
  }
  if(errNum == WLZ_ERR_NONE)
  {
    vSz = ALG_MAX3(vBufSz.vtX, vBufSz.vtY, vBufSz.vtZ);
    if(((iBuf = (double **)AlcMalloc(sizeof(double *) * 2 * nThr)) == NULL) ||
       ((vBuf = (double *)AlcMalloc(sizeof(double) * 2 * nThr * vSz)) == NULL))
    {
      errNum = WLZ_ERR_MEM_ALLOC;
    }
    else
    {
      int	idt;

      rBuf = iBuf + nThr;
      for(idt = 0; idt < nThr; ++idt)
      {
        iBuf[idt] = vBuf + (idt * vSz);
	rBuf[idt] = vBuf + ((nThr + idt) * vSz);
      }
    }
  }
  if(errNum == WLZ_ERR_NONE)
  {
    /* Convolve the object values. */
    if(direc.vtX)
    {
      rnObj = WlzSepFilterX(inObj, dim, nThr,
			    iBuf, rBuf, cBufSz.vtX, cBuf[0],
			    pad, padVal, &errNum);
    }
    if((errNum == WLZ_ERR_NONE) && direc.vtY)
    {
      WlzObject *tObj;

      tObj = WlzSepFilterY((rnObj)? rnObj: inObj, dim, nThr,
                            iBuf, rBuf, cBufSz.vtY, cBuf[1],
			    pad, padVal, &errNum);
      (void )WlzFreeObj(rnObj);
      rnObj = tObj;
    }
    if((errNum == WLZ_ERR_NONE) && (dim == 3) && direc.vtZ)
    {
      WlzObject *tObj;

      tObj = WlzSepFilterZ((rnObj)? rnObj: inObj, bBox, nThr,
                            iBuf, rBuf, cBufSz.vtZ, cBuf[2],
			    pad, padVal, &errNum);
      (void )WlzFreeObj(rnObj);
      rnObj = tObj;
    }
  }
  if((errNum == WLZ_ERR_NONE) && (rnObj != NULL) && (gType != WLZ_GREY_DOUBLE))
  {
    WlzObject *tObj;

    /* Convert object values to the required grey type. */
    tObj = WlzConvertPix((rnObj)? rnObj: inObj, gType, &errNum);
    (void )WlzFreeObj(rnObj);
    rnObj = tObj;
  }
  if(errNum != WLZ_ERR_NONE)
  {
    (void )WlzFreeObj(rnObj);
    rnObj = NULL;
  }
  AlcFree(iBuf);
  AlcFree(vBuf);
  return(rnObj);
}
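
The buffer setup above (discover the team size once, then slice a single allocation into per-thread scratch rows) is a common idiom; a minimal sketch:

#ifdef _OPENMP
#include <omp.h>
#endif
#include <stdlib.h>

/* Returns nthr pointers into one contiguous block of nthr*slice doubles;
 * free(buf[0]) then free(buf) releases everything. */
double **alloc_thread_buffers(size_t slice, int *nthr_out)
{
    int nthr = 1;
#ifdef _OPENMP
#pragma omp parallel
#pragma omp master
    nthr = omp_get_num_threads();
#endif
    double **buf = malloc(sizeof(double *) * nthr);
    double  *mem = malloc(sizeof(double) * nthr * slice);
    if (buf == NULL || mem == NULL) { free(buf); free(mem); return NULL; }
    for (int t = 0; t < nthr; t++)
        buf[t] = mem + (size_t)t * slice;
    *nthr_out = nthr;
    return buf;
}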
Exemplo n.º 13
void
montecarlo_main_loop(storage_model_t * storage, int64_t virtual_packet_flag, int nthreads, unsigned long seed)
{
  int64_t finished_packets = 0;
  storage->virt_packet_count = 0;
#ifdef WITH_VPACKET_LOGGING
  storage->virt_packet_nus = (double *)safe_malloc(sizeof(double) * storage->no_of_packets);
  storage->virt_packet_energies = (double *)safe_malloc(sizeof(double) * storage->no_of_packets);
  storage->virt_packet_last_interaction_in_nu = (double *)safe_malloc(sizeof(double) * storage->no_of_packets);
  storage->virt_packet_last_interaction_type = (int64_t *)safe_malloc(sizeof(int64_t) * storage->no_of_packets);
  storage->virt_packet_last_line_interaction_in_id = (int64_t *)safe_malloc(sizeof(int64_t) * storage->no_of_packets);
  storage->virt_packet_last_line_interaction_out_id = (int64_t *)safe_malloc(sizeof(int64_t) * storage->no_of_packets);
  storage->virt_array_size = storage->no_of_packets;
#endif // WITH_VPACKET_LOGGING
#ifdef WITHOPENMP
  omp_set_dynamic(0);
  if (nthreads > 0)
    {
      omp_set_num_threads(nthreads);
    }

#pragma omp parallel firstprivate(finished_packets)
    {
      rk_state mt_state;
      rk_seed (seed + omp_get_thread_num(), &mt_state);
#pragma omp master
      {
        fprintf(stderr, "Running with OpenMP - %d threads\n", omp_get_num_threads());
        print_progress(0, storage->no_of_packets);
      }
#pragma omp for
#else
      rk_state mt_state;
      rk_seed (seed, &mt_state);
      fprintf(stderr, "Running without OpenMP\n");
#endif
      for (int64_t packet_index = 0; packet_index < storage->no_of_packets; ++packet_index)
        {
          int reabsorbed = 0;
          rpacket_t packet;
          rpacket_set_id(&packet, packet_index);
          rpacket_init(&packet, storage, packet_index, virtual_packet_flag);
          if (virtual_packet_flag > 0)
            {
              reabsorbed = montecarlo_one_packet(storage, &packet, -1, &mt_state);
            }
          reabsorbed = montecarlo_one_packet(storage, &packet, 0, &mt_state);
          storage->output_nus[packet_index] = rpacket_get_nu(&packet);
          if (reabsorbed == 1)
            {
              storage->output_energies[packet_index] = -rpacket_get_energy(&packet);
            }
          else
            {
              storage->output_energies[packet_index] = rpacket_get_energy(&packet);
            }
          if ( ++finished_packets%100 == 0 )
            {
#ifdef WITHOPENMP
              // WARNING: This only works with a static scheduler and gives an approximation of progress.
              // The alternative would be to have a shared variable but that could potentially decrease performance when using many threads.
              if (omp_get_thread_num() == 0 )
                print_progress(finished_packets * omp_get_num_threads(), storage->no_of_packets);
#else
              print_progress(finished_packets, storage->no_of_packets);
#endif
            }
        }
#ifdef WITHOPENMP
    }
#endif
  print_progress(storage->no_of_packets, storage->no_of_packets);
  fprintf(stderr,"\n");
}
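
Seeding each thread's generator with the base seed plus the thread number, as above, keeps the streams decorrelated and the run reproducible for a fixed thread count. A minimal sketch of the same pattern using POSIX rand_r() in place of the rk_state generator:

#ifdef _OPENMP
#include <omp.h>
#endif
#include <stdlib.h>

void per_thread_rng_demo(unsigned int base_seed)
{
    #pragma omp parallel
    {
        unsigned int seed = base_seed;
#ifdef _OPENMP
        seed += (unsigned int)omp_get_thread_num();   /* one stream per thread */
#endif
        int sample = rand_r(&seed);   /* state is thread-private: no locking */
        (void)sample;
    }
}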
Exemplo n.º 14
// Constructor
shapeAlign::shapeAlign(const string& nameList, const vector<string> &files,
	const int &minS, const int &maxS, const bool &win, const int &wS,
	const int &wE, const bool &ign, const int &iS, const int &iE, const int &E):
	nameFile(nameList), shapeFiles(files), shiftMin(minS), shiftMax(maxS),
	window(win), winStart(wS), winEnd(wE), ignore(ign), ignStart(iS), ignEnd(iE),
	thresh(E)
{

	// Get number of shape parameters -- one file for each parameter,
	// so this is effectively the number of files
	m = files.size();

	// Get site names by reading the single-column file containing
	// names of sites
	ifstream file(nameFile.c_str());
	string line;

	while(getline(file,line))
		names.push_back(line);
	file.close();

	// Get number of sites
	nSites = names.size();

	cerr << "Read " << nSites << " site names." << endl;

	// Initialize matrices for tracking pairwise comparison info
	D = gsl_matrix_calloc(nSites,nSites);
	S = gsl_matrix_calloc(nSites,nSites);
	R = gsl_matrix_calloc(nSites,nSites);

	cerr << "Reading shape files." << endl;

	matrices.resize(nSites);	// Initialize an empty list of matrix references

	// create matrices containing the shape information. Add data
	// to these matrices on the fly.
	for (size_t f = 0; f < files.size(); f++){

		cerr << "\t" << files[f] << endl;

		ifstream shapeFile(files[f].c_str());
		int idx = 0;	// line/site counter
		while(getline(shapeFile,line)){		// Each line in the shape files represent a single site
			stringstream linestream(line);
			string s;
			vector <string> temp;
			while(linestream >> s)	// Split on spaces and store data from each position in site
				temp.push_back(s);

			int n = temp.size();	// Get number of positions

			// there are five columns that need to be trimmed off:
			// The first three columns (identifier and NAs) and the
			// last two columns (NAs)

			// Initialize the matrix if the matrix has not previously been
			// initialized
			if (f == 0)	matrices[idx] = gsl_matrix_alloc(m,n-5);

			for (size_t i = 0; i < matrices[idx]->size2; i++){
				double d;
				stringstream stod(temp[i+3]);
				stod >> d;
				gsl_matrix_set(matrices[idx],f,i,d);
				}
			// Increment the line counter
			idx++;
		}
	}


	cerr << "\tDone reading shape files." << endl;

	// Scale each matrix such that values to go from 1->2
	cerr << "Scaling matrices." << endl;

	for (size_t i = 0; i < nSites; i++)
		scaleMatrixZscore(matrices[i]);

	cerr << "\tDone scaling matrices." << endl;

	// Loop over the sites and compute all pairwise distances -- note that
	// distances are symmetric: D[a,b] = D[b,a]. But, the shifts computed
	// are not symmetric: S[a,b] = -S[b,a].
	for (size_t i = 0; i < nSites; i++){

		if ((i+1) % 100 == 0)
			cerr << "\tProcessing " << i+1 << " of " << nSites << endl;

		// Parallelize this portion: data races shouldn't be a concern
		// since no threads should be writing to the same block of
		// memory

		#pragma omp parallel
		{
			#pragma omp master
			if (i==0)
				cerr << "Beginning all-by-all distance calculation using "
					 << omp_get_num_threads() << " threads." << endl;


			#pragma omp for
			for (size_t j = i; j < nSites; j++){

				// Get optimal shift and distances for the simple comparison
				alignData results = getOptimalShift(matrices[i],matrices[j]);

				// Get the matrix representing the reverse of the matrices[j]
				gsl_matrix* rev = gsl_matrix_alloc(matrices[j]->size1,matrices[j]->size2);
				gsl_matrix_memcpy(rev,matrices[j]);
				reverse(rev);

				// Get the optimal shift and distance for the reverse matrix
				alignData resultsRev = getOptimalShift(matrices[i],rev);

				if (results.score >= resultsRev.score){
					results.rev = 0;
				} else {
					results.score = resultsRev.score;
					results.shift = resultsRev.shift;
					results.rev = 1;
				}

				// Store the data in the matrices used for tracking
				// pairwise comparisons
				gsl_matrix_set(D,i,j,results.score);
				gsl_matrix_set(S,i,j,results.shift);
				gsl_matrix_set(R,i,j,results.rev);

				gsl_matrix_set(D,j,i,results.score);
				gsl_matrix_set(S,j,i,-1*results.shift);
				gsl_matrix_set(R,j,i,results.rev);

				// Clean up -- free memory associated with rev
				gsl_matrix_free(rev);
			}
		}
	}

	cerr << "\tDone with distance calculation." << endl;

	cerr << "Finding centroid." << endl;
	pair<int,double> C = getCentroid();
	cIdx = C.first;		// Index (w.r.t. names vector) of centroid
	cDist = C.second;	// Distance of centroid to other sequences
	cerr << "\tCentroid: Site \"" << names[C.first] << "\"" << endl;
	cerr << "\tDistance: "  << C.second << endl;
	printCentroid();
	printShifts();

// 	cerr << "Printing matrices to files." << endl;
// 	printShiftMatrix();
	// printDistanceMatrix();
// 	printRevMatrix();
// 	cerr << "\tDone." << endl;

	cerr << "Printing aligned data." << endl;
	printShiftedProfiles();
	cerr << "\tDone." << endl;

	cerr << "Job successfully completed." << endl;

}
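
The parallel region above is race-free because each (i,j) pair writes only cells (i,j) and (j,i), which no other iteration touches; a minimal row-major sketch of the same symmetric-fill pattern (dist() is a hypothetical stand-in for getOptimalShift()):

extern double dist(int i, int j);   /* assumed pairwise metric */

void pairwise_fill(int n, double *D /* n*n, row-major */)
{
    for (int i = 0; i < n; i++) {
        #pragma omp parallel for
        for (int j = i; j < n; j++) {
            double d = dist(i, j);
            D[i * n + j] = d;
            D[j * n + i] = d;   /* the distance is symmetric; a shift would be negated */
        }
    }
}

Exemplo n.º 15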
void CDelphiFastSOR::itrOddPoints(const int& forWhom, const int& flag)
{
   delphi_integer n,ix,iy,iz;
   delphi_integer star,fin;
   delphi_real temp1,temp2,temp3,temp4;
   delphi_integer itemp1,itemp2,itemp3,itemp4;

    //cout << "### oddpoints phimap1: " << flag << endl;
#ifdef PARALLEL_OMP
   int omp_num_threads,omp_thread_id;

   /*
    * set number of threads = number of processors
    */
   //omp_set_num_threads(2);
   omp_set_num_threads(omp_get_num_procs());

   #pragma omp parallel default(shared) private(omp_thread_id,n,ix,iy,star,fin,temp1,temp2,temp3)
   {
      delphi_integer omp_index;

      omp_thread_id = omp_get_thread_num();

      if (0 == omp_thread_id) omp_num_threads = omp_get_num_threads();

      //cout << "thread " << omp_thread_id << " of " << omp_num_threads << " is alive\n";
#endif

      /* the following loops are about four times faster than the original loop over all grid points for
       * several reasons, the biggest being that we are only solving laplace's equation (unless salt is present),
       * which is numerically much simpler, hence faster. we put everything we leave out back in below, ending
       * up with an equivalent calculation, but much faster.
       */
      if (fZero < abs(fIonStrength))  //----- the main loop is as below:
      {
#ifdef PARALLEL_OMP
         #pragma omp for schedule(auto)
#endif
         for (n = 1; n < iGrid-1; n++)
         {
            star = sta1[n]; fin = fi1[n];
            for (ix = star; ix <= fin; ix++)
            {
               temp1 = phimap2[ix-1]         + phimap2[(ix-1)-1];
               temp2 = phimap2[(ix-1)+lat1]  + phimap2[(ix-1)-lat2];
               temp3 = phimap2[(ix-1)+long1] + phimap2[(ix-1)-long2];
               phimap1[ix-1] = phimap1[ix-1]*om1 + (qmap1[ix-1]+temp1+temp2+temp3)*prgfSaltMap1[ix-1];
            }
         }
      }
      else //----- if there is no salt then the main loop is executed without sf saving about 15% in execution time
      {
#ifdef PARALLEL_OMP
         #pragma omp for schedule(auto)
#endif
         for (n = 1; n < iGrid-1; n++)
         {
            star = sta1[n]; fin = fi1[n];
            for (ix = star; ix <= fin; ix++)
            {
               temp1 = phimap2[ix-1]         + phimap2[(ix-1)-1];
               temp2 = phimap2[(ix-1)+lat1]  + phimap2[(ix-1)-lat2];
               temp3 = phimap2[(ix-1)+long1] + phimap2[(ix-1)-long2];
               phimap1[ix-1] = phimap1[ix-1]*om1 + (temp1+temp2+temp3)*sixth;
                //cout << "phimap1: " << right << setw(10) << flag << setw(10) << ix << setw(20) << setprecision(5) << fixed << phimap1[ix-1] << endl;
               //if(flag==1)cout << "1phimap1: " << right << setw(10) << flag << setw(10) << ix << setw(20) << setprecision(5) << fixed << phimap1[ix-1] << endl;

               //if( flag==2 && ix==498 )
               //cout << "phimap1: " << right << setw(10) << flag << setw(8) << ix << setw(20) << setprecision(5) << fixed << phimap1[ix-1]
               //     << " " << om1 << " " << temp1  << " " << temp2 << " " << temp3 << " " << sixth <<endl;

            }
         }
      }

#ifdef PARALLEL_OMP
      //#pragma omp barrier
#endif

      /*
       * first we add back the dielectric boundary points, by recalculating them individually. note this is still
       * vectorised by means of a gathering load by the compiler.
       */
#ifdef PARALLEL_OMP
      #pragma omp for schedule(auto)
#endif
      for (n = 0; n < iDielecBndyEven; n++)
      {
         ix = prgiBndyDielecIndex[n];
         temp1 = phimap2[(ix-1)-1]*prgfBndyDielec[n][0]     + phimap2[ix-1]*prgfBndyDielec[n][1];
         temp2 = phimap2[(ix-1)-lat2]*prgfBndyDielec[n][2]  + phimap2[(ix-1)+lat1]*prgfBndyDielec[n][3];
         temp3 = phimap2[(ix-1)-long2]*prgfBndyDielec[n][4] + phimap2[(ix-1)+long1]*prgfBndyDielec[n][5];
         phimap1[ix-1] += temp1 + temp2 + temp3;
        /*
        if(flag==1)cout << "2phimap1: " << right << setw(10) << flag << setw(10) << ix << setw(10) << setprecision(5) << fixed << phimap1[ix-1]
            <<setw(10) << phimap2[(ix-1)-long2] <<setw(10) << prgfBndyDielec[n][4]
            <<setw(10) << phimap2[(ix-1)+long1] <<setw(10) <<prgfBndyDielec[n][5]
            <<setw(10) << (ix-1)-long2 <<setw(10) << (ix-1)+long1
            << endl;
        */
        //if( flag==1 && ix==498 )
          //  cout << "phimap1: " << right << setw(10) << ix << setw(20) << setprecision(5) << fixed << phimap1[ix-1]
            //    << " " << temp1  << " " << temp2 << " " << temp3 <<endl;
      }


      /*
       * Now reset boundary values altered in above loops.
       */
#ifdef PARALLEL_OMP
      star = (iGrid+1)/2; fin = (iGrid*(iGrid-1)-2)/2; omp_index = iGrid*(iGrid+1)/2-iGrid+1;//iy = iGrid*(iGrid+1)/2-iGrid+1;
      #pragma omp for schedule(auto)
      for (n = 0; n < fin-star+1; n++)
      {
         iy = omp_index+(n+1)*iGrid;
         phimap1[iy-1] = bndx1[n];
         phimap1[iy+((iGrid+1)/2-1)-1] = bndx2[n];
      }
#else
      star = (iGrid+1)/2; fin = (iGrid*(iGrid-1)-2)/2; iy = iGrid*(iGrid+1)/2-iGrid+1;
      for (n = 0; n < fin-star+1; n++)
      {
         iy = iy+iGrid; phimap1[iy-1] = bndx1[n]; phimap1[iy+((iGrid+1)/2-1)-1] = bndx2[n];
      }
#endif

      /*
       * next we add back an adjustment to all the charged grid points due to the charge assigned. the compiler
       * directive just reassures the vector compiler that all is well as far as recurrence is concerned, i.e. it
       * would think there is a recurrence below, where as in fact there is none.
       */
      if (0 != forWhom)
      {
#ifdef PARALLEL_OMP
         #pragma omp for schedule(auto)
#endif
         for (n = 0; n < iCrgedGridEven; n++)
         {
            ix = prgiCrgPose[n]; phimap1[ix-1] += prgfCrgValA[n];
            //if(flag==1)cout << "3phimap1: " << right << setw(10) << flag << setw(10) << ix << setw(20) << setprecision(5) << fixed << phimap1[ix-1] << endl;

         }
      }

#ifdef PARALLEL_OMP
   } // end of #pragma omp parallel
#endif

   /*
    * if periodic boundary condition option, force periodicity using wrap around update of boundary values:
    *    2nd slice-->last
    *    last-1 slice-->first
    */
   if (rgbPeriodicBndy[2]) //----- z periodicity
   {
      for (iz = 0; iz < (iGrid-2)*(iGrid-2); iz += 2)
      {
         temp1 = ibndz[iz];      itemp1 = (delphi_integer)temp1;
         temp2 = temp1 + idif1z; itemp2 = (delphi_integer)temp2;
         temp3 = temp2 + inc1za; itemp3 = (delphi_integer)temp3;
         temp4 = temp1 + inc1zb; itemp4 = (delphi_integer)temp4;
         phimap1[itemp1-1] = phimap2[itemp2-1];
         phimap1[itemp3-1] = phimap2[itemp4-1];
      }
   }

   if (rgbPeriodicBndy[1]) //----- y periodicity
   {
      for (iy = 0; iy < (iGrid-2)*(iGrid-2); iy += 2)
      {
         temp1 = ibndy[iy];      itemp1 = (delphi_integer)temp1;
         temp2 = temp1 + idif1y; itemp2 = (delphi_integer)temp2;
         temp3 = temp2 + inc1ya; itemp3 = (delphi_integer)temp3;
         temp4 = temp1 + inc1yb; itemp4 = (delphi_integer)temp4;
         phimap1[itemp1-1] = phimap2[itemp2-1];
         phimap1[itemp3-1] = phimap2[itemp4-1];
      }
   }

   if (rgbPeriodicBndy[0]) //----- x periodicity
   {
      for (ix = 0; ix < (iGrid-2)*(iGrid-2); ix += 2)
      {
         temp1 = ibndx[ix];      itemp1 = (delphi_integer)temp1;
         temp2 = temp1 + idif1x; itemp2 = (delphi_integer)temp2;
         temp3 = temp2 + inc1xa; itemp3 = (delphi_integer)temp3;
         temp4 = temp1 + inc1xb; itemp4 = (delphi_integer)temp4;
         phimap1[itemp1-1] = phimap2[itemp2-1];
         phimap1[itemp3-1] = phimap2[itemp4-1];
      }
   }
}
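
The odd/even split above is the classic red-black trick: points of one color read only points of the other color, so each half-sweep is a race-free parallel loop. A 1-D stand-in sketch (not the solver's actual stencil):

void half_sweep(int n, const double *other, double *mine,
                double om1, double weight)
{
    /* mine[i] depends only on the other color's values, so iterations
     * are independent and the loop parallelizes directly */
    #pragma omp parallel for
    for (int i = 1; i < n - 1; i++)
        mine[i] = mine[i] * om1 + (other[i - 1] + other[i + 1]) * weight;
}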
Exemplo n.º 16
int main(int   argc,
         char *argv[])
{
    uint64_t total_num_nodes = 0;
    qtimer_t timer;
    double   total_time = 0.0;

    CHECK_VERBOSE();

    {
        unsigned int tmp = (unsigned int)tree_type;
        NUMARG(tmp, "UTS_TREE_TYPE");
        if (tmp <= BALANCED) {
            tree_type = (tree_t)tmp;
        } else {
            fprintf(stderr, "invalid tree type\n");
            return EXIT_FAILURE;
        }
        tmp = (unsigned int)shape_fn;
        NUMARG(tmp, "UTS_SHAPE_FN");
        if (tmp <= FIXED) {
            shape_fn = (shape_t)tmp;
        } else {
            fprintf(stderr, "invalid shape function\n");
            return EXIT_FAILURE;
        }
    }
    DBLARG(bf_0, "UTS_BF_0");
    NUMARG(root_seed, "UTS_ROOT_SEED");
    NUMARG(tree_depth, "UTS_TREE_DEPTH");
    DBLARG(non_leaf_prob, "UTS_NON_LEAF_PROB");
    NUMARG(non_leaf_bf, "UTS_NON_LEAF_NUM");
    NUMARG(shift_depth, "UTS_SHIFT_DEPTH");
    NUMARG(num_samples, "UTS_NUM_SAMPLES");

#pragma omp parallel
#pragma omp single
#ifdef PRINT_STATS
    print_stats();
#else
    print_banner();
#endif

    timer = qtimer_create();
    qtimer_start(timer);

    node_t root;
    root.height = 0;
    rng_init(root.state.state, root_seed);
    root.num_children = calc_num_children(&root);

    nodecount = 1;
    long retval;
#pragma omp parallel
#pragma omp single nowait
#pragma omp task untied
    retval = visit(&root, root.num_children);

    total_num_nodes = retval;

    qtimer_stop(timer);

    total_time = qtimer_secs(timer);

    qtimer_destroy(timer);

#ifdef PRINT_STATS
    printf("tree-size %lu\ntree-depth %d\nnum-leaves %llu\nperc-leaves %.2f\n",
           (unsigned long)total_num_nodes,
           (int)tree_height,
           (unsigned long long)num_leaves,
           num_leaves / (float)total_num_nodes * 100.0);
    printf("exec-time %.3f\ntotal-perf %.0f\npu-perf %.0f\n\n",
           total_time,
           total_num_nodes / total_time,
           total_num_nodes / total_time / omp_get_max_threads()); /* num_threads() is 1 outside a parallel region */
#else
    printf("Tree size = %lu, tree depth = %d, num leaves = %llu (%.2f%%)\n",
           (unsigned long)total_num_nodes,
           (int)tree_height,
           (unsigned long long)num_leaves,
           num_leaves / (float)total_num_nodes * 100.0);
    printf("Wallclock time = %.3f sec, performance = %.0f "
           "nodes/sec (%.0f nodes/sec per PE)\n\n",
           total_time,
           total_num_nodes / total_time,
           total_num_nodes / total_time / omp_get_max_threads()); /* num_threads() is 1 outside a parallel region */
#endif /* ifdef PRINT_STATS */

    return 0;
}
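
The single untied task launched above fans out recursively inside visit(); a minimal sketch of that task-per-child pattern with a hypothetical node type (not the benchmark's visit()). It must be called from inside a parallel/single region, as in main() above:

typedef struct sketch_node {
    int                 num_children;
    struct sketch_node *children;
} sketch_node;

long visit_count(sketch_node *n)
{
    long total = 1;   /* count this node */
    for (int i = 0; i < n->num_children; i++) {
        #pragma omp task untied shared(total)
        {
            long sub = visit_count(&n->children[i]);
            #pragma omp atomic
            total += sub;
        }
    }
    #pragma omp taskwait   /* wait for the child tasks before returning */
    return total;
}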
Exemplo n.º 17
double getStartLists(graph* G, edge** maxIntWtListPtr, 
        INT_T* maxIntWtListSizePtr) {
    
    LONG_T *local_max, maxWeight;
    
    edge *maxIntWtList;
    LONG_T maxIntWtListSize;

    LONG_T *p_start, *p_end;
    double elapsed_time;
    elapsed_time = get_seconds();

#ifdef _OPENMP
    omp_set_num_threads(NUM_THREADS);
#pragma omp parallel
{
#endif    

    LONG_T i, j, n;
    edge* pList;
    LONG_T pCount, tmpListSize;
    int tid, nthreads;
#ifdef DIAGNOSTIC
    double elapsed_time_part;
#endif
    
#ifdef _OPENMP
    tid = omp_get_thread_num();
    nthreads = omp_get_num_threads();
#else
    tid = 0;
    nthreads = 1;
#endif

    n = G->n;

    /* Determine the maximum edge weight */

    if (tid == 0) {
        local_max = (LONG_T *) malloc(nthreads*sizeof(LONG_T));
    }

    /* Allocate memory for partial edge list on each thread */
    tmpListSize = 1000;
    pList = (edge *) malloc(tmpListSize*sizeof(edge));
    pCount = 0;

#ifdef _OPENMP
#pragma omp barrier
#endif

    local_max[tid] = -1;

#ifdef DIAGNOSTIC
    if (tid == 0) {
       elapsed_time_part = get_seconds();
    }
#endif

    
#ifdef _OPENMP    
#pragma omp for
#endif
    for (i=0; i<n; i++) {
        for (j=G->numEdges[i]; j<G->numEdges[i+1]; j++) {
            if (G->weight[j] > local_max[tid]) {
                local_max[tid] = G->weight[j];
                pCount = 0;
                pList[pCount].startVertex = i;
                pList[pCount].endVertex = G->endV[j];
                pList[pCount].w = local_max[tid];
                pList[pCount].e = j;
                pCount++;
            } else if (G->weight[j] == local_max[tid]) {
                pList[pCount].startVertex = i;
                pList[pCount].endVertex = G->endV[j];
                pList[pCount].w = local_max[tid];
                pList[pCount].e = j;
                pCount++; 
            }
        }
    }

#ifdef _OPENMP
#pragma omp barrier
#endif

    if (tid == 0) {
 
#ifdef DIAGNOSTIC
    if (tid == 0) {
       elapsed_time_part = get_seconds() - elapsed_time_part;
       fprintf(stderr, "Max. weight computation time: %lf seconds\n", elapsed_time_part);
    }
#endif

       maxWeight = local_max[0];

        for (i=1; i<nthreads; i++) {
            if (local_max[i] > maxWeight)
                  maxWeight = local_max[i];
        }
        // free(local_max);
    }

#ifdef _OPENMP
#pragma omp barrier
#endif
 
    if (local_max[tid] != maxWeight) {
        pCount = 0;
    }

    /* Merge all the partial edge lists */
    if (tid == 0) {
        p_start = (LONG_T *) malloc(nthreads*sizeof(LONG_T));
        p_end = (LONG_T *) malloc(nthreads*sizeof(LONG_T));
    }

#ifdef _OPENMP    
    #pragma omp barrier
#endif
    
    p_end[tid] = pCount;
    p_start[tid] = 0;

#ifdef _OPENMP    
    #pragma omp barrier
#endif

    if (tid == 0) {
        for (i=1; i<nthreads; i++) {
            p_end[i] = p_end[i-1] + p_end[i];
            p_start[i] = p_end[i-1]; 
        }

        maxIntWtListSize = p_end[nthreads-1];
        free(*maxIntWtListPtr);
        maxIntWtList = (edge *) malloc((maxIntWtListSize)*sizeof(edge));
    }

#ifdef _OPENMP    
    #pragma omp barrier
#endif
    
    for (j=p_start[tid]; j<p_end[tid]; j++) {
        (maxIntWtList[j]).startVertex = pList[j-p_start[tid]].startVertex;
        (maxIntWtList[j]).endVertex = pList[j-p_start[tid]].endVertex;
        (maxIntWtList[j]).e = pList[j-p_start[tid]].e;
        (maxIntWtList[j]).w = pList[j-p_start[tid]].w;
    } 
 
   
#ifdef _OPENMP
    #pragma omp barrier
#endif

    free(pList);

    if (tid == 0) {
        free(local_max);
        free(p_start);
        free(p_end);
        *maxIntWtListPtr = maxIntWtList;
        *maxIntWtListSizePtr = maxIntWtListSize;
    }
    
#ifdef _OPENMP
}
#endif

    /* Verification */
#if 0
    maxIntWtList = *maxIntWtListPtr;
    for (int i=0; i<*maxIntWtListSizePtr; i++) {
        fprintf(stderr, "[%ld %ld %ld %ld] ", maxIntWtList[i].startVertex, 
                maxIntWtList[i].endVertex, maxIntWtList[i].e, maxIntWtList[i].w);
    }
#endif

    elapsed_time = get_seconds() - elapsed_time;
    return elapsed_time;
}
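
The merge step above is an exclusive prefix sum over per-thread counts: it gives every thread a private write window in the final array, so all threads can copy their partial lists concurrently. A minimal sketch of that offset computation:

/* start[t]..end[t]-1 is thread t's slot range in the merged array;
 * end[nthreads-1] is the total merged size. Arrays hold nthreads entries. */
void prefix_offsets(int nthreads, const long *count, long *start, long *end)
{
    long acc = 0;
    for (int t = 0; t < nthreads; t++) {
        start[t] = acc;
        acc += count[t];
        end[t] = acc;
    }
}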
Exemplo n.º 18
int KDE::CalcKDE(DataSet_double& Out, DataSet_1D const& Pdata,
                 std::vector<double> const& Increments,
                 HistBin const& Xdim, double bandwidth) const
{
  int inSize = (int)Pdata.Size();
  // Allocate output set, set all to zero.
  Out.Zero( Xdim.Bins() );
  Out.SetDim( Dimension::X, Xdim );
  int outSize = (int)Out.Size();

  int frame, bin;
  double increment, val;
  double total = 0.0;
# ifdef _OPENMP
  int original_num_threads;
# pragma omp parallel
  {
#   pragma omp master
    {
      original_num_threads = omp_get_num_threads();
    }
  }
  // Ensure we only execute with the desired number of threads
  if (numthreads_ < original_num_threads)
    omp_set_num_threads( numthreads_ );
# endif
  // Calculate KDE, loop over input data
# ifdef _OPENMP
  int mythread;
  double **P_thread;
# pragma omp parallel private(frame, bin, val, increment, mythread) reduction(+:total)
  {
    mythread = omp_get_thread_num();
    // Prevent race conditions by giving each thread its own histogram
#   pragma omp master
    {
      P_thread = new double*[ numthreads_ ];
      for (int nt = 0; nt < numthreads_; nt++) {
        P_thread[nt] = new double[ outSize ];
        std::fill(P_thread[nt], P_thread[nt] + outSize, 0.0);
      }
    }
#   pragma omp barrier
#   pragma omp for
# endif
    for (frame = 0; frame < inSize; frame++) {
      val = Pdata.Dval(frame);
      increment = Increments[frame];
      total += increment;
      // Apply kernel across histogram
      for (bin = 0; bin < outSize; bin++)
#       ifdef _OPENMP
        P_thread[mythread][bin] +=
#       else
        Out[bin] +=
#       endif
          (increment * (this->*Kernel_)( (Xdim.Coord(bin) - val) / bandwidth ));
    }
# ifdef _OPENMP
  } // END parallel block
  // Combine results from each thread histogram into Out
  for (int i = 0; i < numthreads_; i++) {
    for (int j = 0; j < outSize; j++)
      Out[j] += P_thread[i][j];
    delete[] P_thread[i];
  }
  delete[] P_thread;
  // Restore original number of threads
  if (original_num_threads != numthreads_)
    omp_set_num_threads( original_num_threads );
# endif
  // Normalize
  for (unsigned int j = 0; j < Out.Size(); j++)
    Out[j] /= (total * bandwidth);
  return 0;
}
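
On OpenMP 4.5 and later, the hand-rolled per-thread histograms above can often be replaced by an array-section reduction when the bin count is modest; a minimal sketch (hypothetical bin_of[] precomputed bin indices):

void histogram_sketch(int n, const int *bin_of, double *hist, int nbins)
{
    /* each thread gets a private copy of hist[0..nbins-1]; the copies are
     * summed into the original array at the end of the loop */
    #pragma omp parallel for reduction(+:hist[:nbins])
    for (int i = 0; i < n; i++)
        hist[bin_of[i]] += 1.0;
}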
Exemplo n.º 19
int main(int argc, char **argv) {

   //  Process command-line arguments, if any.
   int mype=0;
   int numpe=0;
   parseInput(argc, argv);
   L7_Init(&mype, &numpe, &argc, argv, do_quo_setup, lttrace_on);

   struct timeval tstart_setup;
   cpu_timer_start(&tstart_setup);

   double circ_radius = 6.0;
   //  Scale the circle appropriately for the mesh size.
   circ_radius = circ_radius * (double) nx / 128.0;
   int boundary = 1;
   int parallel_in = 1;

#ifdef _OPENMP
   int nt = 0;
   int tid = 0;

   nt = omp_get_max_threads();   /* omp_get_num_threads() returns 1 outside a parallel region */
   tid = omp_get_thread_num();

   if (0 == tid) {
        printf("--- num openmp threads: %d\n", nt);        fflush(stdout);
   }     
#endif

   mesh = new Mesh(nx, ny, levmx, ndim, boundary, parallel_in, do_gpu_calc);
   if (DEBUG) {
      //if (mype == 0) mesh->print();

      char filename[10];
      sprintf(filename,"out%1d",mype);
      mesh->fp=fopen(filename,"w");

      //mesh->print_local();
   }
   mesh->init(nx, ny, circ_radius, initial_order, do_gpu_calc);

   size_t &ncells = mesh->ncells;
   size_t &ncells_global = mesh->ncells_global;
   int &noffset = mesh->noffset;

   state = new State(mesh);
   state->init(do_gpu_calc);

   vector<int>   &nsizes     = mesh->nsizes;
   vector<int>   &ndispl     = mesh->ndispl;

   vector<real_t> &x  = mesh->x;
   vector<real_t> &dx = mesh->dx;
   vector<real_t> &y  = mesh->y;
   vector<real_t> &dy = mesh->dy;

   nsizes.resize(numpe);
   ndispl.resize(numpe);

   int ncells_int = ncells;
   MPI_Allgather(&ncells_int, 1, MPI_INT, &nsizes[0], 1, MPI_INT, MPI_COMM_WORLD);

   ndispl[0]=0;
   for (int ip=1; ip<numpe; ip++){
      ndispl[ip] = ndispl[ip-1] + nsizes[ip-1];
   }
   noffset = ndispl[mype];

   state->resize(ncells);

   state->fill_circle(circ_radius, 100.0, 7.0);

   mesh->nlft = NULL;
   mesh->nrht = NULL;
   mesh->nbot = NULL;
   mesh->ntop = NULL;

   x.clear();
   dx.clear();
   y.clear();
   dy.clear();

   //  Kahan-type enhanced precision sum implementation.
   double H_sum = state->mass_sum(enhanced_precision_sum);
   if (mype == 0) printf ("Mass of initialized cells equal to %14.12lg\n", H_sum);
   H_sum_initial = H_sum;

   double cpu_time_main_setup = cpu_timer_stop(tstart_setup);
   state->parallel_timer_output(numpe,mype,"CPU:  setup time               time was",cpu_time_main_setup);

   long long mem_used = memstats_memused();
   if (mem_used > 0) {
      state->parallel_memory_output(numpe,mype,"Memory used      in startup ",mem_used);
      state->parallel_memory_output(numpe,mype,"Memory peak      in startup ",memstats_mempeak());
      state->parallel_memory_output(numpe,mype,"Memory free      at startup ",memstats_memfree());
      state->parallel_memory_output(numpe,mype,"Memory available at startup ",memstats_memtotal());
   }

   if (mype == 0) {
      printf("Iteration   0 timestep      n/a Sim Time      0.0 cells %ld Mass Sum %14.12lg\n", ncells_global, H_sum);
   }

   mesh->cpu_calc_neigh_counter=0;
   mesh->cpu_time_calc_neighbors=0.0;
   mesh->cpu_rezone_counter=0;
   mesh->cpu_refine_smooth_counter=0;

#ifdef HAVE_GRAPHICS
#ifdef HAVE_OPENGL
   set_mysize(ncells_global);
   //vector<real_t> H_global;
   //vector<real_t> x_global;
   //vector<real_t> dx_global;
   //vector<real_t> y_global;
   //vector<real_t> dy_global;
   //vector<int> proc_global;
   if (mype == 0){
      H_global.resize(ncells_global);
      x_global.resize(ncells_global);
      dx_global.resize(ncells_global);
      y_global.resize(ncells_global);
      dy_global.resize(ncells_global);
      proc_global.resize(ncells_global);
   }
   MPI_Gatherv(&x[0],  nsizes[mype], MPI_C_REAL, &x_global[0],  &nsizes[0], &ndispl[0], MPI_C_REAL, 0, MPI_COMM_WORLD);
   MPI_Gatherv(&dx[0], nsizes[mype], MPI_C_REAL, &dx_global[0], &nsizes[0], &ndispl[0], MPI_C_REAL, 0, MPI_COMM_WORLD);
   MPI_Gatherv(&y[0],  nsizes[mype], MPI_C_REAL, &y_global[0],  &nsizes[0], &ndispl[0], MPI_C_REAL, 0, MPI_COMM_WORLD);
   MPI_Gatherv(&dy[0], nsizes[mype], MPI_C_REAL, &dy_global[0], &nsizes[0], &ndispl[0], MPI_C_REAL, 0, MPI_COMM_WORLD);
   MPI_Gatherv(&state->H[0], nsizes[mype], MPI_C_REAL, &H_global[0], &nsizes[0], &ndispl[0], MPI_C_REAL, 0, MPI_COMM_WORLD);

   set_cell_data(&H_global[0]);
   set_cell_coordinates(&x_global[0], &dx_global[0], &y_global[0], &dy_global[0]);

   if (view_mode == 0) {
      mesh->proc.resize(ncells);
      for (size_t ii = 0; ii<ncells; ii++){
         mesh->proc[ii] = mesh->mype;
      }
   
      MPI_Gatherv(&mesh->proc[0],  nsizes[mype], MPI_INT, &proc_global[0],  &nsizes[0], &ndispl[0], MPI_INT, 0, MPI_COMM_WORLD);
   }

   set_cell_proc(&proc_global[0]);
#endif
#ifdef HAVE_MPE
   set_mysize(ncells);
   set_cell_data(&state->H[0]);
   set_cell_coordinates(&mesh->x[0], &mesh->dx[0], &mesh->y[0], &mesh->dy[0]);
   set_cell_proc(&mesh->proc[0]);
#endif

   set_window(mesh->xmin, mesh->xmax, mesh->ymin, mesh->ymax);
   set_viewmode(view_mode);
   set_outline((int)outline);
   init_display(&argc, argv, "Shallow Water", mype);

   set_circle_radius(circle_radius);
   draw_scene();
   if (verbose) sleep(5);
   sleep(2);

   //  Set flag to show mesh results rather than domain decomposition.
   view_mode = 1;
   
   //  Clear superposition of circle on grid output.
   circle_radius = -1.0;
   
   MPI_Barrier(MPI_COMM_WORLD);
   cpu_timer_start(&tstart);

   set_idle_function(&do_calc);
   start_main_loop();
#else
   MPI_Barrier(MPI_COMM_WORLD);
   cpu_timer_start(&tstart);
   for (int it = 0; it < 10000000; it++) {
      do_calc();
   }
#endif
   
   return 0;
}
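
The mass sum above is computed with an enhanced_precision_sum option. A compensated (Kahan-type) summation is the textbook technique behind such an option; the following is a self-contained sketch, not the actual State::mass_sum implementation.

#include <stdio.h>

double kahan_sum(const double *x, int n)
{
    double sum = 0.0, c = 0.0;       /* c carries the lost low-order bits */
    for (int i = 0; i < n; i++) {
        double y = x[i] - c;         /* apply the running correction */
        double t = sum + y;          /* low bits of y may round away here */
        c = (t - sum) - y;           /* recover exactly what was lost */
        sum = t;
    }
    return sum;
}

int main(void)
{
    double x[3] = {1e16, 1.0, 1.0};
    /* naive left-to-right addition loses both 1.0s and returns 1e16;
       the compensated sum returns 10000000000000002 */
    printf("%.0f\n", kahan_sum(x, 3));
    return 0;
}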
Exemplo n.º 20
int main(int argc, char **argv ) {

    /*
      This is the shortest path project for CPSC424/524.

      Author: Bo Song, Yale University

      Date: 4/25/2016

      Credits: This program is based on the description provided by Andrew Sherman
    */

    double wct0, wct1, total_time, cput;
    char* sourceFile, * graphFile;
    int count[8];
    #pragma omp parallel
    {
        #pragma omp master
        printf("num of threads = %d\n", omp_get_num_threads());
    }
    for(int i = 0; i < 8; i++) count[i] = 0;
    for(int i = 0; i < 8; i++) loopCount[i] = 0;
    for(int i = 0; i < 8; i++) updateCount[i] = 0;
    if(argc != 3) {
        printf("serial <graphfile> <sourcefile>\n");
        return -1;
    }
    graphFile = argv[1];
    sourceFile = argv[2];
    timing(&wct0, &cput);
    printf("reading graph...\n");
    readGraph(graphFile);
    printf("reading source...\n");
    readSource(sourceFile);
    // print_adj_list(adj_listhead, N);
    #pragma omp parallel
    #pragma omp for schedule(static, 1)
    for(int i = 0; i < num_sources; i++) {
        count[omp_get_thread_num()]++;
        moore(sources[i]);
    }
    timing(&wct1, &cput); //get the end time
    total_time = wct1 - wct0;
    printf("Message printed by master: Total elapsed time is %f seconds.\n",total_time);
    // free resources
    for(int i = 1; i <= N; i++) {
        adj_node* node = adj_listhead[i];
        while(node) {
            adj_node* next = node->next;
            free(node);
            node = next;
        }
    }
    printf("Load balance among threads: ");
    long long sumLoop = 0, sumUpdate = 0;
    for(int i = 0; i < 8; i++) {
        printf("%d ", count[i]);
        sumLoop += loopCount[i];
        sumUpdate += updateCount[i];
    }
    printf("portion = %f", (float)sumUpdate / sumLoop);
    printf("\n");
    free(sources);
    return 0;
}
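
The schedule(static, 1) clause above deals the sources to threads round-robin, which is exactly the load balance the count[] array measures. Below is a self-contained sketch of that bookkeeping; schedule(dynamic) would be the usual alternative when per-source cost varies unpredictably.

#include <omp.h>
#include <stdio.h>

int main(void)
{
    int count[64] = {0};                    /* assumes at most 64 threads */

    #pragma omp parallel for schedule(static, 1)
    for (int i = 0; i < 100; i++)
        count[omp_get_thread_num()]++;      /* iteration i goes to thread i % nthreads */

    for (int t = 0; t < omp_get_max_threads() && t < 64; t++)
        printf("thread %d ran %d iterations\n", t, count[t]);
    return 0;
}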
Exemplo n.º 21
//------------------------------------------------------------------------------------------------------------------------------
int main(int argc, char **argv){
  int my_rank=0;
  int num_tasks=1;
  int OMP_Threads = 1;
  int OMP_Nested = 0;

  #ifdef _OPENMP
  #pragma omp parallel
  {
    #pragma omp master
    {
      OMP_Threads = omp_get_num_threads();
      OMP_Nested  = omp_get_nested();
    }
  }
  #endif


  #ifdef USE_MPI
  int    actual_threading_model = -1;
  int requested_threading_model = -1;
      requested_threading_model = MPI_THREAD_SINGLE;
    //requested_threading_model = MPI_THREAD_FUNNELED;
    //requested_threading_model = MPI_THREAD_SERIALIZED;
    //requested_threading_model = MPI_THREAD_MULTIPLE;
  //MPI_Init(&argc, &argv);
  #ifdef _OPENMP
      requested_threading_model = MPI_THREAD_FUNNELED;
    //requested_threading_model = MPI_THREAD_SERIALIZED;
    //requested_threading_model = MPI_THREAD_MULTIPLE;
  //MPI_Init_thread(&argc, &argv, requested_threading_model, &actual_threading_model);
  #endif
  MPI_Init_thread(&argc, &argv, requested_threading_model, &actual_threading_model);
  MPI_Comm_size(MPI_COMM_WORLD, &num_tasks);
  MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
//if(actual_threading_model>requested_threading_model)actual_threading_model=requested_threading_model;
  if(my_rank==0){
       if(requested_threading_model == MPI_THREAD_MULTIPLE  )printf("Requested MPI_THREAD_MULTIPLE, ");
  else if(requested_threading_model == MPI_THREAD_SINGLE    )printf("Requested MPI_THREAD_SINGLE, ");
  else if(requested_threading_model == MPI_THREAD_FUNNELED  )printf("Requested MPI_THREAD_FUNNELED, ");
  else if(requested_threading_model == MPI_THREAD_SERIALIZED)printf("Requested MPI_THREAD_SERIALIZED, ");
  else                                                       printf("Requested Unknown MPI Threading Model (%d), ",requested_threading_model);
       if(actual_threading_model    == MPI_THREAD_MULTIPLE  )printf("got MPI_THREAD_MULTIPLE\n");
  else if(actual_threading_model    == MPI_THREAD_SINGLE    )printf("got MPI_THREAD_SINGLE\n");
  else if(actual_threading_model    == MPI_THREAD_FUNNELED  )printf("got MPI_THREAD_FUNNELED\n");
  else if(actual_threading_model    == MPI_THREAD_SERIALIZED)printf("got MPI_THREAD_SERIALIZED\n");
  else                                                       printf("got Unknown MPI Threading Model (%d)\n",actual_threading_model);
  }
  #ifdef USE_HPM // IBM HPM counters for BGQ...
  HPM_Init();
  #endif
  #endif // USE_MPI


  int log2_box_dim = 6;
  int target_boxes_per_rank = 1;

  if(argc==3){
           log2_box_dim=atoi(argv[1]);
     target_boxes_per_rank=atoi(argv[2]);
  }else{
    if(my_rank==0){printf("usage: ./a.out  [log2_box_dim]  [target_boxes_per_rank]\n");}
    #ifdef USE_MPI
    MPI_Finalize();
    #endif
    exit(0);
  }

  if(log2_box_dim<4){
    if(my_rank==0){printf("log2_box_dim must be at least 4\n");}
    #ifdef USE_MPI
    MPI_Finalize();
    #endif
    exit(0);
  }

  if(target_boxes_per_rank<1){
    if(my_rank==0){printf("target_boxes_per_rank must be at least 1\n");}
    #ifdef USE_MPI
    MPI_Finalize();
    #endif
    exit(0);
  }

  if(my_rank==0){
    if(OMP_Nested)fprintf(stdout,"%d MPI Tasks of %d threads (OMP_NESTED=TRUE)\n\n" ,num_tasks,OMP_Threads);
             else fprintf(stdout,"%d MPI Tasks of %d threads (OMP_NESTED=FALSE)\n\n",num_tasks,OMP_Threads);
  }
  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  // calculate the problem size...
  #ifndef MAX_COARSE_DIM
  #define MAX_COARSE_DIM 11
  #endif
  int64_t box_dim=1<<log2_box_dim;
  int64_t target_boxes = (int64_t)target_boxes_per_rank*(int64_t)num_tasks;
  int64_t boxes_in_i = -1;
  int64_t bi;
  for(bi=1;bi<1000;bi++){ // all possible problem sizes
    int64_t total_boxes = bi*bi*bi;
    if(total_boxes<=target_boxes){
      int64_t coarse_grid_dim = box_dim*bi;
      while( (coarse_grid_dim%2) == 0){coarse_grid_dim=coarse_grid_dim/2;}
      if(coarse_grid_dim<=MAX_COARSE_DIM){
        boxes_in_i = bi;
      }
    }
  }
  if(boxes_in_i<1){
    if(my_rank==0){printf("failed to find an acceptable problem size\n");}
    #ifdef USE_MPI
    MPI_Finalize();
    #endif
    exit(0);
  }
  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  // create the fine level...
  #ifdef USE_PERIODIC_BC
  int bc = BC_PERIODIC;
  #else
  int bc = BC_DIRICHLET;
  #endif
  level_type fine_grid;
  int ghosts=stencil_get_radius();
  create_level(&fine_grid,boxes_in_i,box_dim,ghosts,VECTORS_RESERVED,bc,my_rank,num_tasks);
  //create_level(&fine_grid,boxes_in_i,box_dim,ghosts,VECTORS_RESERVED,BC_PERIODIC ,my_rank,num_tasks);double h0=1.0/( (double)boxes_in_i*(double)box_dim );double a=2.0;double b=1.0; // Helmholtz w/Periodic
  //create_level(&fine_grid,boxes_in_i,box_dim,ghosts,VECTORS_RESERVED,BC_PERIODIC ,my_rank,num_tasks);double h0=1.0/( (double)boxes_in_i*(double)box_dim );double a=0.0;double b=1.0; //   Poisson w/Periodic
  //create_level(&fine_grid,boxes_in_i,box_dim,ghosts,VECTORS_RESERVED,BC_DIRICHLET,my_rank,num_tasks);double h0=1.0/( (double)boxes_in_i*(double)box_dim );double a=2.0;double b=1.0; // Helmholtz w/Dirichlet
  //create_level(&fine_grid,boxes_in_i,box_dim,ghosts,VECTORS_RESERVED,BC_DIRICHLET,my_rank,num_tasks);double h0=1.0/( (double)boxes_in_i*(double)box_dim );double a=0.0;double b=1.0; //   Poisson w/Dirichlet
  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  #ifdef USE_HELMHOLTZ
  double a=2.0;double b=1.0; // Helmholtz
  if(my_rank==0)fprintf(stdout,"  Creating Helmholtz (a=%f, b=%f) test problem\n",a,b);
  #else
  double a=0.0;double b=1.0; // Poisson
  if(my_rank==0)fprintf(stdout,"  Creating Poisson (a=%f, b=%f) test problem\n",a,b);
  #endif
  double h0=1.0/( (double)boxes_in_i*(double)box_dim );
  initialize_problem(&fine_grid,h0,a,b);
  rebuild_operator(&fine_grid,NULL,a,b); // i.e. calculate Dinv and lambda_max
  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  mg_type all_grids;
  int minCoarseDim = 1;
  MGBuild(&all_grids,&fine_grid,a,b,minCoarseDim); // build the Multigrid Hierarchy
  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
     int     doTiming;
     int    minSolves = 10; // do at least minSolves MGSolves
  double timePerSolve = 0;
  for(doTiming=0;doTiming<=1;doTiming++){ // first pass warms up, second pass times

    #ifdef USE_HPM // IBM performance counters for BGQ...
    if(doTiming)HPM_Start("FMGSolve()");
    #endif

    #ifdef USE_MPI
    double minTime   = 20.0; // minimum time in seconds that the benchmark should run
    double startTime = MPI_Wtime();
    if(doTiming==1){
      if((minTime/timePerSolve)>minSolves)minSolves=(minTime/timePerSolve); // if one needs to do more than minSolves to run for minTime, change minSolves
    }
    #endif

    if(my_rank==0){
      if(doTiming==0){fprintf(stdout,"\n\n===== warming up by running %d solves ===============================\n",minSolves);}
                 else{fprintf(stdout,"\n\n===== running %d solves =============================================\n",minSolves);}
      fflush(stdout);
    }

    int numSolves =  0; // solves completed
    MGResetTimers(&all_grids);
    while( (numSolves<minSolves) ){
      zero_vector(all_grids.levels[0],VECTOR_U);
      #ifdef USE_FCYCLES
      FMGSolve(&all_grids,VECTOR_U,VECTOR_F,a,b,1e-15);
      #else
       MGSolve(&all_grids,VECTOR_U,VECTOR_F,a,b,1e-15);
      #endif
      numSolves++;
    }

    #ifdef USE_MPI
    if(doTiming==0){
      double endTime = MPI_Wtime();
      timePerSolve = (endTime-startTime)/numSolves;
      MPI_Bcast(&timePerSolve,1,MPI_DOUBLE,0,MPI_COMM_WORLD); // after warmup, process 0 broadcasts the average time per solve (consensus)
    }
    #endif

    #ifdef USE_HPM // IBM performance counters for BGQ...
    if(doTiming)HPM_Stop("FMGSolve()");
    #endif
  }
  MGPrintTiming(&all_grids); // don't include the error check in the timing results
  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  if(my_rank==0){fprintf(stdout,"calculating error...  ");}
  double fine_error = error(&fine_grid,VECTOR_U,VECTOR_UTRUE);
  if(my_rank==0){fprintf(stdout,"h = %22.15e  ||error|| = %22.15e\n\n",h0,fine_error);fflush(stdout);}
  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  // MGDestroy()
  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  #ifdef USE_MPI
  #ifdef USE_HPM // IBM performance counters for BGQ...
  HPM_Print();
  #endif
  MPI_Finalize();
  #endif
  //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  return(0);
}
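
The threading-model negotiation at the top of this example boils down to a request/provided handshake: ask for MPI_THREAD_FUNNELED when OpenMP is compiled in, then check what the library actually granted. A minimal sketch follows (the threading levels are ordered constants in the MPI standard, so they can be compared directly).

#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    int requested = MPI_THREAD_FUNNELED;   /* threads exist, but only the master calls MPI */
    int provided  = -1;

    MPI_Init_thread(&argc, &argv, requested, &provided);
    if (provided < requested)              /* the library may grant less than asked for */
        fprintf(stderr, "warning: MPI granted a weaker threading level (%d < %d)\n",
                provided, requested);
    MPI_Finalize();
    return 0;
}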
Exemplo n.º 22
void QCDDopr_Mult(QCDSpinor* pV,QCDMatrix* pU,QCDSpinor* pW,double k)
{
	MPI_Request reqSend[8];
	MPI_Request reqRecv[8];
	MPI_Status st;
	QCDMatrix* pUx;
	QCDMatrix* pUy;
	QCDMatrix* pUz;
	QCDMatrix* pUt;
	int i;

	qcdtKappa[0] = k;
	qcdtKappa[1] = k;
	qcdtKappa[2] = k;
	qcdtKappa[3] = k;

	pUx = pU;
	pUy = pU + qcdNsite;
	pUz = pU + qcdNsite*2;
	pUt = pU + qcdNsite*3;

/* #pragma omp parallel num_threads(8) */
#pragma omp parallel
	{
	int tid = 0,nid = 1;

	tid = omp_get_thread_num();
	nid = omp_get_num_threads();

	/* //debug */
	/* printf("nthreads: %d\n", nid); */
	/* printf("max_threads: %d\n", omp_get_max_threads()); */

	if(tid == 0){
		MPI_Irecv(qcdRecvBuf[QCD_TP],12*qcdNxyz,MPI_DOUBLE_PRECISION,qcdRankNeighbors[QCD_TP],QCD_TP,MPI_COMM_WORLD,&reqRecv[QCD_TP]);
		MPI_Irecv(qcdRecvBuf[QCD_TM],12*qcdNxyz,MPI_DOUBLE_PRECISION,qcdRankNeighbors[QCD_TM],QCD_TM,MPI_COMM_WORLD,&reqRecv[QCD_TM]);

		MPI_Irecv(qcdRecvBuf[QCD_XP],12*qcdNy*qcdNz*qcdNt,MPI_DOUBLE_PRECISION,qcdRankNeighbors[QCD_XP],QCD_XP,MPI_COMM_WORLD,&reqRecv[QCD_XP]);
		MPI_Irecv(qcdRecvBuf[QCD_XM],12*qcdNy*qcdNz*qcdNt,MPI_DOUBLE_PRECISION,qcdRankNeighbors[QCD_XM],QCD_XM,MPI_COMM_WORLD,&reqRecv[QCD_XM]);

		MPI_Irecv(qcdRecvBuf[QCD_YP],12*qcdNx*qcdNz*qcdNt,MPI_DOUBLE_PRECISION,qcdRankNeighbors[QCD_YP],QCD_YP,MPI_COMM_WORLD,&reqRecv[QCD_YP]);
		MPI_Irecv(qcdRecvBuf[QCD_YM],12*qcdNx*qcdNz*qcdNt,MPI_DOUBLE_PRECISION,qcdRankNeighbors[QCD_YM],QCD_YM,MPI_COMM_WORLD,&reqRecv[QCD_YM]);

		MPI_Irecv(qcdRecvBuf[QCD_ZP],12*qcdNx*qcdNy*qcdNt,MPI_DOUBLE_PRECISION,qcdRankNeighbors[QCD_ZP],QCD_ZP,MPI_COMM_WORLD,&reqRecv[QCD_ZP]);
		MPI_Irecv(qcdRecvBuf[QCD_ZM],12*qcdNx*qcdNy*qcdNt,MPI_DOUBLE_PRECISION,qcdRankNeighbors[QCD_ZM],QCD_ZM,MPI_COMM_WORLD,&reqRecv[QCD_ZM]);
	}

	//Send T
	QCDDopr_MakeTPB_dirac(qcdSendBuf[QCD_TP],pW,tid,nid);
#pragma omp barrier
	if(tid == 0){
		MPI_Isend(qcdSendBuf[QCD_TP],12*qcdNxyz,MPI_DOUBLE_PRECISION,qcdRankNeighbors[QCD_TM],QCD_TP,MPI_COMM_WORLD,&reqSend[QCD_TP]);
	}

	QCDDopr_MakeTMB_dirac(qcdSendBuf[QCD_TM],pUt + qcdNsite-qcdNxyz,pW + qcdNsite-qcdNxyz,tid,nid);
#pragma omp barrier
	if(tid == 0){
		MPI_Isend(qcdSendBuf[QCD_TM],12*qcdNxyz,MPI_DOUBLE_PRECISION,qcdRankNeighbors[QCD_TP],QCD_TM,MPI_COMM_WORLD,&reqSend[QCD_TM]);
	}

	//Send X
	QCDDopr_MakeXPB(qcdSendBuf[QCD_XP],pW,tid,nid);
#pragma omp barrier
	if(tid == 0){
		MPI_Isend(qcdSendBuf[QCD_XP],12*qcdNy*qcdNz*qcdNt,MPI_DOUBLE_PRECISION,qcdRankNeighbors[QCD_XM],QCD_XP,MPI_COMM_WORLD,&reqSend[QCD_XP]);
	}

	QCDDopr_MakeXMB(qcdSendBuf[QCD_XM],pUx + qcdNx-1,pW + qcdNx-1,tid,nid);
#pragma omp barrier
	if(tid == 0){
		MPI_Isend(qcdSendBuf[QCD_XM],12*qcdNy*qcdNz*qcdNt,MPI_DOUBLE_PRECISION,qcdRankNeighbors[QCD_XP],QCD_XM,MPI_COMM_WORLD,&reqSend[QCD_XM]);
	}


	//Send Y
	QCDDopr_MakeYPB(qcdSendBuf[QCD_YP],pW,tid,nid);
#pragma omp barrier
	if(tid == 0){
		MPI_Isend(qcdSendBuf[QCD_YP],12*qcdNx*qcdNz*qcdNt,MPI_DOUBLE_PRECISION,qcdRankNeighbors[QCD_YM],QCD_YP,MPI_COMM_WORLD,&reqSend[QCD_YP]);
	}

	QCDDopr_MakeYMB(qcdSendBuf[QCD_YM],pUy + qcdNxy-qcdNx,pW + qcdNxy-qcdNx,tid,nid);
#pragma omp barrier
	if(tid == 0){
		MPI_Isend(qcdSendBuf[QCD_YM],12*qcdNx*qcdNz*qcdNt,MPI_DOUBLE_PRECISION,qcdRankNeighbors[QCD_YP],QCD_YM,MPI_COMM_WORLD,&reqSend[QCD_YM]);
	}

	//Send Z
	QCDDopr_MakeZPB(qcdSendBuf[QCD_ZP],pW,tid,nid);
#pragma omp barrier
	if(tid == 0){
		MPI_Isend(qcdSendBuf[QCD_ZP],12*qcdNx*qcdNy*qcdNt,MPI_DOUBLE_PRECISION,qcdRankNeighbors[QCD_ZM],QCD_ZP,MPI_COMM_WORLD,&reqSend[QCD_ZP]);
	}

	QCDDopr_MakeZMB(qcdSendBuf[QCD_ZM],pUz + qcdNxyz-qcdNxy,pW + qcdNxyz-qcdNxy,tid,nid);
#pragma omp barrier
	if(tid == 0){
		MPI_Isend(qcdSendBuf[QCD_ZM],12*qcdNx*qcdNy*qcdNt,MPI_DOUBLE_PRECISION,qcdRankNeighbors[QCD_ZP],QCD_ZM,MPI_COMM_WORLD,&reqSend[QCD_ZM]);
	}

	QCDLA_Equate(pV + tid*qcdNsite/nid,pW + tid*qcdNsite/nid, (tid+1)*qcdNsite/nid - tid*qcdNsite/nid);
#pragma omp barrier

	QCDDopr_TPin_dirac(pV,pUt,pW + qcdNxyz,tid,nid);
#pragma omp barrier
	QCDDopr_TMin_dirac(pV,pUt-qcdNxyz,pW - qcdNxyz,tid,nid);
#pragma omp barrier
	QCDDopr_XPin(pV,pUx,pW+1,tid,nid);
#pragma omp barrier
	QCDDopr_XMin(pV,pUx-1,pW-1,tid,nid);
#pragma omp barrier

	QCDDopr_YPin(pV,pUy,pW + qcdNx,tid,nid);
#pragma omp barrier
	QCDDopr_YMin(pV,pUy-qcdNx,pW - qcdNx,tid,nid);
#pragma omp barrier
	QCDDopr_ZPin(pV,pUz,pW + qcdNxy,tid,nid);
#pragma omp barrier
	QCDDopr_ZMin(pV,pUz-qcdNxy,pW - qcdNxy,tid,nid);

	if(tid == 0){
		MPI_Wait(&reqRecv[QCD_TP],&st);
	}
#pragma omp barrier
	QCDDopr_SetTPBnd_dirac(pV,pUt,qcdRecvBuf[QCD_TP],tid,nid);
	if(tid == 0){
		MPI_Wait(&reqRecv[QCD_TM],&st);
	}
#pragma omp barrier
	QCDDopr_SetTMBnd_dirac(pV,qcdRecvBuf[QCD_TM],tid,nid);

	if(tid == 0){
		MPI_Wait(&reqRecv[QCD_XP],&st);
	}
#pragma omp barrier
	QCDDopr_SetXPBnd(pV,pUx,qcdRecvBuf[QCD_XP],tid,nid);
	if(tid == 0){
		MPI_Wait(&reqRecv[QCD_XM],&st);
	}
#pragma omp barrier
	QCDDopr_SetXMBnd(pV,qcdRecvBuf[QCD_XM],tid,nid);

	if(tid == 0){
		MPI_Wait(&reqRecv[QCD_YP],&st);
	}
#pragma omp barrier
	QCDDopr_SetYPBnd(pV,pUy,qcdRecvBuf[QCD_YP],tid,nid);
	if(tid == 0){
		MPI_Wait(&reqRecv[QCD_YM],&st);
	}
#pragma omp barrier
	QCDDopr_SetYMBnd(pV,qcdRecvBuf[QCD_YM],tid,nid);

	if(tid == 0){
		MPI_Wait(&reqRecv[QCD_ZP],&st);
	}
#pragma omp barrier
	QCDDopr_SetZPBnd(pV,pUz,qcdRecvBuf[QCD_ZP],tid,nid);
	if(tid == 0){
		MPI_Wait(&reqRecv[QCD_ZM],&st);
	}
#pragma omp barrier
	QCDDopr_SetZMBnd(pV,qcdRecvBuf[QCD_ZM],tid,nid);

	if(tid == 0){
		MPI_Wait(&reqSend[QCD_TP],&st);
		MPI_Wait(&reqSend[QCD_TM],&st);
		MPI_Wait(&reqSend[QCD_XP],&st);
		MPI_Wait(&reqSend[QCD_XM],&st);
		MPI_Wait(&reqSend[QCD_YP],&st);
		MPI_Wait(&reqSend[QCD_YM],&st);
		MPI_Wait(&reqSend[QCD_ZP],&st);
		MPI_Wait(&reqSend[QCD_ZM],&st);
	}
#pragma omp barrier

	}
}
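
The routine above is a funneled halo exchange: thread 0 posts all MPI traffic while the whole team packs, computes, and unpacks between barriers. The skeleton of that overlap pattern fits in a short runnable sketch; a single double stands in for the halo buffers, and none of the helper routines above are assumed.

#include <mpi.h>
#include <omp.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    int provided, rank, size;
    MPI_Init_thread(&argc, &argv, MPI_THREAD_FUNNELED, &provided);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    double halo_in = 0.0, halo_out = (double)rank, interior = 0.0;
    MPI_Request reqs[2];

    #pragma omp parallel
    {
        if (omp_get_thread_num() == 0) {           /* funneled: only thread 0 calls MPI */
            int right = (rank + 1) % size, left = (rank + size - 1) % size;
            MPI_Irecv(&halo_in, 1, MPI_DOUBLE, left, 0, MPI_COMM_WORLD, &reqs[0]);
            MPI_Isend(&halo_out, 1, MPI_DOUBLE, right, 0, MPI_COMM_WORLD, &reqs[1]);
        }
        #pragma omp for reduction(+:interior)      /* interior work overlaps the messages */
        for (int i = 0; i < 1000000; i++)
            interior += 1e-6;
        if (omp_get_thread_num() == 0)
            MPI_Waitall(2, reqs, MPI_STATUSES_IGNORE);
        #pragma omp barrier                        /* halo_in is now valid for every thread */
    }
    printf("rank %d: interior=%g halo=%g\n", rank, interior, halo_in);
    MPI_Finalize();
    return 0;
}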
Exemplo n.º 23
/* Main Program */
int main(int argc, char **argv)
{
	double        **InputMatrix, **CheckMatrix;
	int             j, i, N,Noofthreads,total_threads;

	printf("\n\t\t---------------------------------------------------------------------------");
        printf("\n\t\t Centre for Development of Advanced Computing (C-DAC)");
        printf("\n\t\t Email : [email protected]");
        printf("\n\t\t---------------------------------------------------------------------------");
        printf("\n\t\t Objective : Parallization of a loop nest contating a recurrence relation.\n ");
        printf("\n\t\t Demonstrates the use of OpenMP Parallel for directive  ");
        printf("\n\t\t..........................................................................\n");


	 /* Checking for command line arguments */
        if( argc !=3 ){

           printf("\t\t Very Few Arguments\n ");
           printf("\t\t Syntax : exec <Threads> <matrix-size>\n");
           exit(-1);
        }

              

        Noofthreads=atoi(argv[1]);
        N=atoi(argv[2]);

        if ((Noofthreads!=1) && (Noofthreads!=2) && (Noofthreads!=4) && (Noofthreads!=8) && (Noofthreads!= 16) ) {
               printf("\n Number of threads should be 1,2,4,8 or 16 for the execution of program. \n\n");
               exit(-1);
         }

/*	printf("\n\t\t Enter the size of the Matrix\n");
	scanf("%d", &N);*/

	/* Input Checking */
	if (N <= 0) {
		printf("\n\t\t Array Size Should Be Of Positive Sign \n");
		exit(1);
	}

	/* Dynamic Memory Allocation */
	InputMatrix = (double **) malloc(sizeof(double *) * N);
	CheckMatrix = (double **) malloc(sizeof(double *) * N);

	/* Initializing The Matrix Elements */
	for (i = 0; i < N; i++) {
		InputMatrix[i] = (double *) malloc(sizeof(double) * N);
		for (j = 0 ; j < N; j++)
			InputMatrix[i][j] = i + j;
	}

	/* CheckMatrix Is Also Same As Input Matrix */
	for (i =0; i < N; i++) {
		CheckMatrix[i] = (double *) malloc(sizeof(double) * N);
		for (j = 0; j < N; j++)
			CheckMatrix[i][j] = InputMatrix[i][j];
	}

        /* set the number of threads */
	omp_set_num_threads(Noofthreads);

	/* OpenMP Parallel For Directive : fork a team of threads.
           The j loop carries a recurrence (each iteration writes an element of
           InputMatrix that is read by the next iteration), which is difficult
           to remove, so the i loop is parallelised instead.
             */
struct timeval tv,tv1;
gettimeofday(&tv,NULL);
	for (j = 1; j < N; j++)
	#pragma omp parallel for
	for (i = 1; i < N; i++) {
		if (omp_get_thread_num() == 0)
			total_threads = omp_get_num_threads();
		InputMatrix[i][j] = InputMatrix[i][j] + InputMatrix[i][j - 1];
	} /* End of the parallel region */
         
gettimeofday(&tv1,NULL);
double t1=tv1.tv_sec-tv.tv_sec+(tv1.tv_usec-tv.tv_usec)*0.000001;

	/* For Validity Of Output */
	/* Serial Calculation */
gettimeofday(&tv,NULL);
	for (j = 1; j < N; j++)
		for (i = 1; i < N; i++)
			CheckMatrix[i][j] = CheckMatrix[i][j] + CheckMatrix[i][j - 1];

         
gettimeofday(&tv1,NULL);
double t2=tv1.tv_sec-tv.tv_sec+(tv1.tv_usec-tv.tv_usec)*0.000001;


	for (i = 0; i < N; i++)
		for (j = 0; j < N; j++)
			if (CheckMatrix[i][j] == InputMatrix[i][j]) {
				continue;
			} else {
				printf("\n\t\t The result of the serial and parallel calculation are not Equal \n");
				exit(1);
			}



/*	printf("\n The Output Matrix After Loop Nest Containing a Recurrence \n");
	for (i = 0; i < N; i++) {
		for (j = 0; j < N; j++)
			printf("%lf\t", InputMatrix[i][j]);
		printf("\n");
	}*/

        printf("\n\n\t\t Threads     : %d",total_threads);	
        printf("\n\t\t Matrix Size : %d ",N); 
	printf("\n\n\t\t Serial And Parallel Calculation Are Same. \n");
	printf("\n\t\t paralle took %f serial took %f ",t1,t2);
        printf("\n\t\t..........................................................................\n");
	printf("\n");

	/* Freeing Of Allocated Memory */
	for (i = 0; i < N; i++) {
		free(InputMatrix[i]);
		free(CheckMatrix[i]);
	}
	free(InputMatrix);
	free(CheckMatrix);

	return 0;
}
Exemplo n.º 24
hpcc_fftw_mpi_plan
HPCC_fftw_mpi_create_plan(MPI_Comm comm, s64Int_t n, fftw_direction dir, int flags) {
  hpcc_fftw_mpi_plan p;
  fftw_complex *a = NULL, *b = NULL;
  int nxyz;
  int rank, size;

  MPI_Comm_size( comm, &size );
  MPI_Comm_rank( comm, &rank );

  p = (hpcc_fftw_mpi_plan)fftw_malloc( sizeof *p );
  if (! p) return p;

  nxyz = GetNXYZ( n, size );

  p->wx = (fftw_complex *)HPCC_fftw_malloc( (nxyz/2 + FFTE_NP) * (sizeof *p->wx) );
  p->wy = (fftw_complex *)HPCC_fftw_malloc( (nxyz/2 + FFTE_NP) * (sizeof *p->wy) );
  p->wz = (fftw_complex *)HPCC_fftw_malloc( (nxyz/2 + FFTE_NP) * (sizeof *p->wz) );
  p->work = (fftw_complex *)HPCC_fftw_malloc( n / size * 3 / 2 * (sizeof *p->work) );

  p->c_size = (nxyz+FFTE_NP) * (FFTE_NBLK + 1) + FFTE_NP;
#ifdef _OPENMP
#pragma omp parallel
  {
#pragma omp single
    {
      int i;
      i = omp_get_num_threads();
      p->c = (fftw_complex *)HPCC_fftw_malloc( p->c_size * (sizeof *p->c) * i );
    }
  }
#else
  p->c = (fftw_complex *)HPCC_fftw_malloc( p->c_size * (sizeof *p->c) );
#endif

  if (! p->wx || ! p->wy || ! p->wz || ! p->work || ! p->c) {
    if (p->c) HPCC_fftw_free( p->c );
    if (p->work) HPCC_fftw_free( p->work );
    if (p->wz) HPCC_fftw_free( p->wz );
    if (p->wy) HPCC_fftw_free( p->wy );
    if (p->wx) HPCC_fftw_free( p->wx );
    fftw_free( p );
    return NULL;
  }

  p->n = n;
  p->comm = comm;
  p->dir = dir;
  p->flags = flags;

  MPI_Type_contiguous( 2, MPI_DOUBLE, &p->cmplx );
  MPI_Type_commit( &p->cmplx );

  if (FFTW_FORWARD == p->dir)
    p->timings = HPCC_fft_timings_forward;
  else
    p->timings = HPCC_fft_timings_backward;

  HPCC_pzfft1d( n, a, b, p->work, rank, size, 0, p );

  return p;
}
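
Above, p->c is sized c_size times the team size so that each OpenMP thread can later carve a private slice out of one contiguous allocation. A self-contained sketch of that pattern, with plain doubles standing in for fftw_complex:

#include <omp.h>
#include <stdlib.h>

int main(void)
{
    size_t c_size = 1024;                   /* per-thread scratch length */
    double *c = NULL;

    #pragma omp parallel
    {
        #pragma omp single                  /* one allocation; implicit barrier follows */
        c = malloc(c_size * sizeof *c * omp_get_num_threads());

        double *my_c = c + (size_t)omp_get_thread_num() * c_size;
        my_c[0] = (double)omp_get_thread_num();   /* each thread owns its own slice */
    }
    free(c);
    return 0;
}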
Exemplo n.º 25
void populate_kmer_counter_from_reads (KmerCounter& kcounter, string& fasta_filename) {
    unsigned int kmer_length = kcounter.get_kmer_length();
    int i, myTid;
    unsigned long sum,
        *record_counter = new unsigned long[omp_get_max_threads()];
    unsigned long start, end;

    // init record counter
    for (int i = 0; i < omp_get_max_threads(); i++) {
        record_counter[i] = 0;
    }


    cerr << "-storing Kmers..." << "\n";
    start = time(NULL);

    Fasta_reader fasta_reader(fasta_filename);

    unsigned int entry_num = 0;

#pragma omp parallel private (myTid)
    {
        myTid = omp_get_thread_num();
        record_counter[myTid] = 0;

        while (fasta_reader.hasNext()) {
            Fasta_entry fe = fasta_reader.getNext();
            string accession = fe.get_accession();

#pragma omp atomic
            entry_num++;
            record_counter[myTid]++;
            
            if (IRKE_COMMON::MONITOR >= 4) {
                cerr << "[" << entry_num << "] acc: " << accession << ", by thread no: " << myTid << "\n";;
            }
            else if (IRKE_COMMON::MONITOR) {
                if (myTid == 0 && record_counter[myTid] % 1000 == 0)
                    {
                        sum = record_counter[0];
                        for (i=1; i<omp_get_num_threads(); i++)
                            sum+= record_counter[i];
                        cerr << "\r [" << sum << "] sequences parsed.     ";
                    }
            }
            
            string seq = fe.get_sequence();

            if (seq.length() < KMER_SIZE + 1) {
                continue;
            }
            kcounter.add_sequence(seq);

        }
    }

    // Report once, outside the parallel region ('end' was never assigned in the
    // original, and every thread printed the summary).
    end = time(NULL);
    sum = 0;
    for (i = 0; i < omp_get_max_threads(); i++)
        sum += record_counter[i];
    cerr << "\n done parsing " << sum << " sequences, extracted " << kcounter.size()
         << " kmers, taking " << (end - start) << " seconds." << "\n";


    delete[] record_counter;

    return;
}
Exemplo n.º 26
int
main ()
{
  int	thds, *buf;

  int	errors = 0;


  thds = omp_get_max_threads ();
  if (thds == 1) {
    printf ("should be run this program on multi thread.\n");
    exit (0);
  }
  buf = (int *) malloc (sizeof(int) * (thds + 1));
  if (buf == NULL) {
    printf ("can not allocate memory.\n");
    exit (1);
  }

  omp_set_dynamic (0);
  omp_set_nested (1);
  if (omp_get_nested () == 0) {
    printf ("nested parallelism is not implement.\n");
    goto END;
  }


  omp_set_num_threads (1);

  #pragma omp parallel 
  {
    int	i, j;

    if (omp_get_num_threads () != 1) {
      #pragma omp critical
      errors += 1;
    }
    if (omp_get_thread_num () != 0) {
      errors += 1;
    }

    for (i=1; i<=thds; i++) {

      memset (buf, 0, sizeof(int) * (thds+1));
      omp_set_num_threads (i);

      #pragma omp parallel
      {
	int	id = omp_get_thread_num ();

	if (omp_get_num_threads () != i) {
	  #pragma omp critical
	  errors += 1;
	}
	buf[id] += 1;
      }

      for (j=0; j<i; j++) {
	if (buf[j] != 1) {
	  #pragma omp critical
	  errors += 1;
	}	
      }
      for (j=i; j<=thds; j++) {
	if (buf[j] != 0) {
	  #pragma omp critical
	  errors += 1;
	}	
      }
    }
  }


 END:
  if (errors == 0) {
    printf ("omp_set_nested 002 : SUCCESS\n");
    return 0;
  } else {
    printf ("omp_set_nested 002 : FAILED\n");
    return 1;
  }
}
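
What the test above verifies, reduced to its core: once nesting is enabled, an inner parallel region forks its own team per outer thread. A minimal sketch; note that omp_set_nested is deprecated since OpenMP 5.0 in favor of omp_set_max_active_levels.

#include <omp.h>
#include <stdio.h>

int main(void)
{
    omp_set_nested(1);                      /* enable nested teams */

    #pragma omp parallel num_threads(2)
    {
        #pragma omp parallel num_threads(2) /* each outer thread forks its own team */
        {
            #pragma omp critical
            printf("outer thread %d / inner thread %d\n",
                   omp_get_ancestor_thread_num(1), omp_get_thread_num());
        }
    }
    return 0;
}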
Exemplo n.º 27
int main(int argc, char **argv)
{
    //initialize plqcd
    int init_status;

    if(argc < 3) {
        fprintf(stderr,"Error. Must pass the name of the input file and the number of multiplications to be performed \n");
        fprintf(stderr,"Usage: %s input_file_name Nmul\n",argv[0]);
        exit(1);
    }

    init_status = init_plqcd(argc,argv);

    if(init_status != 0)
        printf("Error initializing plqcd\n");

    int proc_id;
    int i,j,k,Nmul;
    proc_id = ipr(plqcd_g.cpr);

    Nmul=atoi(argv[2]);

#if 0
    //Intialize the ranlux random number generator
    start_ranlux(0,1);
#endif

    int NPROCS=plqcd_g.nprocs[0]*plqcd_g.nprocs[1]*plqcd_g.nprocs[2]*plqcd_g.nprocs[3];

    char ofname[128];

    char buff[128];

    strcpy(ofname,"test_hopping_output.procgrid.");

    sprintf(buff,"%d-%d-%d-%d.nthreads.%d.proc.%d",plqcd_g.nprocs[0],plqcd_g.nprocs[1],plqcd_g.nprocs[2],plqcd_g.nprocs[3],plqcd_g.nthread,proc_id);



    strcat(ofname,buff);


    FILE *ofp;

    //FILE *ofp_source;

    //if(proc_id==0)
    //{
    //     ofp_source = fopen("test_rand_vals.out","w");
    //}

    if(proc_id==0)
    {
        ofp=fopen(ofname,"w");
        fprintf(ofp,"INPUT GLOBALS:\n");
        fprintf(ofp,"----------------\n");
        fprintf(ofp,"NPROC0 %d, NPROC1 %d, NPROC2 %d, NPROC3 %d, NTHREAD %d\n",plqcd_g.nprocs[0],plqcd_g.nprocs[1],plqcd_g.nprocs[2],plqcd_g.nprocs[3], plqcd_g.nthread);
        fprintf(ofp,"L0 %d, L1 %d, L2 %d, L3 %d\n\n",plqcd_g.latdims[0],plqcd_g.latdims[1],plqcd_g.latdims[2],plqcd_g.latdims[3]);
        //printf("sizeof(spinor) %ld, sizeof(halfspinor) %ld, sizeof(su3) %ld \n",sizeof(spinor),sizeof(halfspinor),sizeof(su3));
    }


    int nthr;
#ifdef _OPENMP
    #pragma omp parallel
    {
        nthr=omp_get_num_threads();
        if(omp_get_thread_num() == 0)
            if(proc_id==0)
                fprintf(ofp,"Number of threads as returned by openmp %d\n",nthr);
    }
#endif


    /*****************************************************
     *Testing the Dirac operator interface
     ****************************************************/




    spinor *pin= (spinor *) amalloc(plqcd_g.VOLUME*sizeof(spinor), plqcd_g.ALIGN);
    if(pin==NULL)
    {
        fprintf(stderr,"ERROR: insufficient memory for spinor pin.\n");
        exit(2);
    }

    spinor *pout= (spinor *) amalloc(plqcd_g.VOLUME*sizeof(spinor), plqcd_g.ALIGN);
    if(pout==NULL)
    {
        fprintf(stderr,"ERROR: insufficient memory for spinor pout.\n");
        exit(2);
    }

    su3 *ufield= (su3 *) amalloc(4*plqcd_g.VOLUME*sizeof(su3), plqcd_g.ALIGN);
    if(ufield==NULL)
    {
        fprintf(stderr,"ERROR: insufficient memory for gauge field ufield.\n");
        exit(2);
    }


    //256 arrays
#ifdef AVX
    spinor_256 *pin_256= (spinor_256 *) amalloc(plqcd_g.VOLUME/2*sizeof(spinor_256), plqcd_g.ALIGN);
    if(pin_256==NULL)
    {
        fprintf(stderr,"ERROR: insufficient memory for spinor pin_256.\n");
        exit(2);
    }


    spinor_256 *pout_256= (spinor_256 *) amalloc(plqcd_g.VOLUME/2*sizeof(spinor_256), plqcd_g.ALIGN);
    if(pout_256==NULL)
    {
        fprintf(stderr,"ERROR: insufficient memory for spinor pout_256.\n");
        exit(2);
    }


    su3_256 *ufield_256= (su3_256 *) amalloc(4*plqcd_g.VOLUME/2*sizeof(su3_256), plqcd_g.ALIGN);

    if(ufield_256==NULL)
    {
        fprintf(stderr,"ERROR: insufficient memory for gauge field ufield_256.\n");
        exit(2);
    }
#endif


    //512 arrays
#ifdef MIC
    spinor_512 *pin_512= (spinor_512 *) amalloc(plqcd_g.VOLUME/4*sizeof(spinor_512), plqcd_g.ALIGN);
    if(pin_512==NULL)
    {
        fprintf(stderr,"ERROR: insufficient memory for spinor pin_512.\n");
        exit(2);
    }


    spinor_512 *pout_512= (spinor_512 *) amalloc(plqcd_g.VOLUME/4*sizeof(spinor_512), plqcd_g.ALIGN);
    if(pout_512==NULL)
    {
        fprintf(stderr,"ERROR: insufficient memory for spinor pout_512.\n");
        exit(2);
    }


    su3_512 *ufield_512= (su3_512 *) amalloc(4*plqcd_g.VOLUME/4*sizeof(su3_512), plqcd_g.ALIGN);

    if(ufield_512==NULL)
    {
        fprintf(stderr,"ERROR: insufficient memory for gauge field ufield_512.\n");
        exit(2);
    }
#endif





    //initialize the random number generator with a seed equal to the process rank
    srand((unsigned int) proc_id);

    //Initialize the input spinor and gauge links to random numbers
    double ru[18];
    double rs[24];

    for(i=0; i<plqcd_g.VOLUME; i++)
    {
        //ranlxd(rs,24);
        for(j=0; j<24; j++)
        {
            rs[j]= rand() / (double)RAND_MAX;
            //fprintf(stderr,"rs[%d]=%lf\n",j,rs[j]);
        }

        pin[i].s0.c0=rs[0]+I*rs[1];
        pin[i].s0.c1=rs[2]+I*rs[3];
        pin[i].s0.c2=rs[4]+I*rs[5];
        pin[i].s1.c0=rs[6]+I*rs[7];
        pin[i].s1.c1=rs[8]+I*rs[9];
        pin[i].s1.c2=rs[10]+I*rs[11];
        pin[i].s2.c0=rs[12]+I*rs[13];
        pin[i].s2.c1=rs[14]+I*rs[15];
        pin[i].s2.c2=rs[16]+I*rs[17];
        pin[i].s3.c0=rs[18]+I*rs[19];
        pin[i].s3.c1=rs[20]+I*rs[21];
        pin[i].s3.c2=rs[22]+I*rs[23];


        //ranlxd(rs,24);
        for(j=0; j<24; j++)
            rs[j]= rand() / (double)RAND_MAX;

        pout[i].s0.c0=rs[0]+I*rs[1];
        pout[i].s0.c1=rs[2]+I*rs[3];
        pout[i].s0.c2=rs[4]+I*rs[5];
        pout[i].s1.c0=rs[6]+I*rs[7];
        pout[i].s1.c1=rs[8]+I*rs[9];
        pout[i].s1.c2=rs[10]+I*rs[11];
        pout[i].s2.c0=rs[12]+I*rs[13];
        pout[i].s2.c1=rs[14]+I*rs[15];
        pout[i].s2.c2=rs[16]+I*rs[17];
        pout[i].s3.c0=rs[18]+I*rs[19];
        pout[i].s3.c1=rs[20]+I*rs[21];
        pout[i].s3.c2=rs[22]+I*rs[23];

        for(j=0; j<4; j++)
        {
            //ranlxd(ru,18);
            for(k=0; k<18; k++)
            {
                ru[k]= rand() / (double)RAND_MAX;
                //fprintf(stderr,"ru[%d]=%lf\n",k,ru[k]);
            }


            ufield[4*i+j].c00=ru[0]+I*ru[1];
            ufield[4*i+j].c01=ru[2]+I*ru[3];
            ufield[4*i+j].c02=ru[4]+I*ru[5];
            ufield[4*i+j].c10=ru[6]+I*ru[7];
            ufield[4*i+j].c11=ru[8]+I*ru[9];
            ufield[4*i+j].c12=ru[10]+I*ru[11];
            ufield[4*i+j].c20=ru[12]+I*ru[13];
            ufield[4*i+j].c21=ru[14]+I*ru[15];
            ufield[4*i+j].c22=ru[16]+I*ru[17];
        }

    }

#ifdef AVX
    for(i=0; i<plqcd_g.VOLUME; i +=2)
    {
        for(j=0; j<4; j++)
            copy_su3_to_su3_256(ufield_256+4*i/2+j, ufield+4*i+j, ufield+4*(i+1)+j);

        copy_spinor_to_spinor_256(pin_256+i/2, pin+i, pin+i+1);
        copy_spinor_to_spinor_256(pout_256+i/2, pout+i, pout+i+1);
    }
#endif

#ifdef MIC
    for(i=0; i<plqcd_g.VOLUME; i +=4)
    {
        for(j=0; j<4; j++)
            copy_su3_to_su3_512(ufield_512+4*i/4+j, ufield+4*i+j, ufield+4*(i+1)+j, ufield+4*(i+2)+j, ufield+4*(i+3)+j);

        copy_spinor_to_spinor_512(pin_512+i/4, pin+i, pin+i+1, pin+i+2, pin+i+3);
        copy_spinor_to_spinor_512(pout_512+i/4, pout+i, pout+i+1, pout+i+2, pout+i+3);
    }
#endif


    double total,t1=0.0,t2=0.0,mytotal;
    int  matvecs;


#ifdef ASSYMBLY
    //---------------------------------------------
    //1: non-blocking assembly/C version
    //---------------------------------------------
    matvecs=0;
    total=0.0;
    mytotal =0.0;

    while(mytotal < 30)
    {
        MPI_Barrier(MPI_COMM_WORLD);
        for(i=0; i<Nmul; i++)
        {
            t1=plqcd_hopping_matrix_eo_sse3_assymbly(pin,pout,ufield);
            t2=plqcd_hopping_matrix_oe_sse3_assymbly(pin,pout,ufield);
            mytotal += t1+t2;
        }
        matvecs += Nmul;
    }

    MPI_Reduce(&mytotal,&total,1,MPI_DOUBLE,MPI_SUM,0, MPI_COMM_WORLD);
    MPI_Bcast(&total,1,MPI_DOUBLE,0, MPI_COMM_WORLD);


    if (proc_id==0)
    {
        total /= (double)(NPROCS);
    }


    if(proc_id==0)
    {
        fprintf(ofp,"non-blocking assymbly/c version:\n");
        fprintf(ofp,"------------------------------------------\n");
        fprintf(ofp,"test_hopping\tmult\t%d\ttotal(sec)\t%lf\tMFlops/process\t%lf\n",
                matvecs,total,matvecs*plqcd_g.VOLUME/2.0*1200/total/1e+6);
    }
#endif


#ifdef SSE3_INTRIN
    //---------------------------------------------
    //1: non-blocking sse3 with intrinsics version
    //---------------------------------------------
    matvecs=0;
    total=0.0;
    mytotal =0.0;

    while(mytotal < 30)
    {
        MPI_Barrier(MPI_COMM_WORLD);
        for(i=0; i<Nmul; i++)
        {
            t1=plqcd_hopping_matrix_eo_sse3_intrin(pin,pout,ufield);
            t2=plqcd_hopping_matrix_oe_sse3_intrin(pin,pout,ufield);
            mytotal += t1+t2;
        }
        matvecs += Nmul;
    }

    MPI_Reduce(&mytotal,&total,1,MPI_DOUBLE,MPI_SUM,0, MPI_COMM_WORLD);
    MPI_Bcast(&total,1,MPI_DOUBLE,0, MPI_COMM_WORLD);


    if (proc_id==0)
    {
        total /= (double)(NPROCS);
    }


    if(proc_id==0)
    {
        fprintf(ofp,"non-blocking sse3 with intrinsics version:\n");
        fprintf(ofp,"------------------------------------------\n");
        fprintf(ofp,"test_hopping\tmult\t%d\ttotal(sec)\t%lf\tMFlops/process\t%lf\n",
                matvecs,total,matvecs*plqcd_g.VOLUME/2.0*1200/total/1e+6);
    }



    //---------------------------------------------
    //2: blocking sse3 with intrinsics version
    //---------------------------------------------
    matvecs=0;
    total=0.0;
    mytotal =0.0;

    while(mytotal < 30)
    {
        MPI_Barrier(MPI_COMM_WORLD);
        for(i=0; i<Nmul; i++)
        {
            t1=plqcd_hopping_matrix_eo_sse3_intrin_blocking(pin,pout,ufield);
            t2=plqcd_hopping_matrix_oe_sse3_intrin_blocking(pin,pout,ufield);
            mytotal += t1+t2;
        }
        matvecs += Nmul;
    }

    MPI_Reduce(&mytotal,&total,1,MPI_DOUBLE,MPI_SUM,0, MPI_COMM_WORLD);
    MPI_Bcast(&total,1,MPI_DOUBLE,0, MPI_COMM_WORLD);


    if (proc_id==0)
    {
        total /= (double)(NPROCS);
    }


    if(proc_id==0)
    {
        fprintf(ofp,"blocking sse3 with intrinsics version:\n");
        fprintf(ofp,"------------------------------------------\n");
        fprintf(ofp,"test_hopping\tmult\t%d\ttotal(sec)\t%lf\tMFlops/process\t%lf\n",
                matvecs,total,matvecs*plqcd_g.VOLUME/2.0*1200/total/1e+6);
    }
#endif


#ifdef AVX
    //---------------------------------------------
    //2: avx version
    //---------------------------------------------
    matvecs=0;
    total=0.0;
    mytotal =0.0;

    t1=plqcd_hopping_matrix_eo_intrin_256(pin_256,pout_256,ufield_256);
    while(mytotal < 30)
    {
        MPI_Barrier(MPI_COMM_WORLD);
        for(i=0; i<Nmul; i++)
        {
            t1=plqcd_hopping_matrix_eo_intrin_256(pin_256,pout_256,ufield_256);
            t2=plqcd_hopping_matrix_oe_intrin_256(pin_256,pout_256,ufield_256);
            mytotal += t1+t2;
        }
        matvecs += Nmul;
    }

    MPI_Reduce(&mytotal,&total,1,MPI_DOUBLE,MPI_SUM,0, MPI_COMM_WORLD);
    MPI_Bcast(&total,1,MPI_DOUBLE,0, MPI_COMM_WORLD);


    if (proc_id==0)
    {
        total /= (double)(NPROCS);
    }


    if(proc_id==0)
    {
        fprintf(ofp,"avxversion:\n");
        fprintf(ofp,"------------------------------------------\n");
        fprintf(ofp,"test_hopping\tmult\t%d\ttotal(sec)\t%lf\tMFlops/process\t%lf\n",
                matvecs,total,matvecs*plqcd_g.VOLUME/2.0*1200/total/1e+6);
    }
#endif


#ifdef MIC

#ifdef TEST_HOPPING_MIC
    //---------------------------------------------
    //3: MIC version full su3 matrix
    //---------------------------------------------
    matvecs=0;
    total=0.0;
    mytotal =0.0;

    t1=plqcd_hopping_matrix_eo_single_mic(pin_512,pout_512,ufield_512);

    while(mytotal < 30)
    {
        MPI_Barrier(MPI_COMM_WORLD);
        for(i=0; i<Nmul; i++)
        {
            //t1=plqcd_hopping_matrix_eo_intrin_512(pin_512,pout_512,ufield_512);
            //t2=plqcd_hopping_matrix_oe_intrin_512(pin_512,pout_512,ufield_512);
            t1=plqcd_hopping_matrix_eo_single_mic(pin_512,pout_512,ufield_512);
            t2=plqcd_hopping_matrix_eo_single_mic(pin_512,pout_512,ufield_512);
            mytotal += t1+t2;
        }
        matvecs += 2*Nmul;
    }

    MPI_Reduce(&mytotal,&total,1,MPI_DOUBLE,MPI_SUM,0, MPI_COMM_WORLD);
    MPI_Bcast(&total,1,MPI_DOUBLE,0, MPI_COMM_WORLD);


    if (proc_id==0)
    {
        total /= (double)(NPROCS);
    }


    if(proc_id==0)
    {
        fprintf(ofp,"mic version, 3x3 links:\n");
        fprintf(ofp,"------------------------------------------\n");
        fprintf(ofp,"test_hopping\tmult\t%d\ttotal(sec)\t%lf\tMFlops/process\t%lf\n",
                matvecs,total,(double )matvecs*plqcd_g.VOLUME/2.0*1200/total/1e+6);
    }


    //---------------------------------------------
    //3: MIC version full reduced su3 storage
    //---------------------------------------------
    matvecs=0;
    total=0.0;
    mytotal =0.0;

    t1=plqcd_hopping_matrix_eo_single_mic_short(pin_512,pout_512,ufield_512);

    while(mytotal < 30)
    {
        MPI_Barrier(MPI_COMM_WORLD);
        for(i=0; i<Nmul; i++)
        {
            //t1=plqcd_hopping_matrix_eo_intrin_512(pin_512,pout_512,ufield_512);
            //t2=plqcd_hopping_matrix_oe_intrin_512(pin_512,pout_512,ufield_512);
            t1=plqcd_hopping_matrix_eo_single_mic_short(pin_512,pout_512,ufield_512);
            t2=plqcd_hopping_matrix_eo_single_mic_short(pin_512,pout_512,ufield_512);
            mytotal += t1+t2;
        }
        matvecs += 2*Nmul;
    }

    MPI_Reduce(&mytotal,&total,1,MPI_DOUBLE,MPI_SUM,0, MPI_COMM_WORLD);
    MPI_Bcast(&total,1,MPI_DOUBLE,0, MPI_COMM_WORLD);


    if (proc_id==0)
    {
        total /= (double)(NPROCS);
    }


    if(proc_id==0)
    {
        fprintf(ofp,"mic version, 2x3 links:\n");
        fprintf(ofp,"------------------------------------------\n");
        fprintf(ofp,"test_hopping\tmult\t%d\ttotal(sec)\t%lf\tMFlops/process\t%lf\n",
                matvecs,total,(double )matvecs*plqcd_g.VOLUME/2.0*1200/total/1e+6);
    }

#endif

#ifdef TEST_SU3MUL_MIC

    matvecs=0;
    total=0.0;
    mytotal =0.0;

    //while(mytotal < 10)
    //{
    MPI_Barrier(MPI_COMM_WORLD);
    for(i=0; i<Nmul; i++)
    {
        t1=stop_watch(0.0);

#ifdef _OPENMP
        #pragma omp parallel
        {
#endif
            __m512d U[3][3], gin[3],gout[3];
            su3_512 *u0;
            su3_vector_512 *hin,*hout;
#ifdef _OPENMP
            #pragma omp for
#endif
            for(j=0; j< plqcd_g.VOLUME/4; j++)
            {
                u0  = &ufield_512[4*j];
                hin = &pin_512[j].s0;
                hout= &pout_512[j].s0;

                intrin_su3_load_512(U,u0);
                intrin_vector_load_512(gin,hin);
                intrin_su3_multiply_512(gout,U,gin);
                intrin_vector_store_512(hout,gout);

                u0++;
                hin++;
                hout++;

                intrin_su3_load_512(U,u0);
                intrin_vector_load_512(gin,hin);
                intrin_su3_multiply_512(gout,U,gin);
                intrin_vector_store_512(hout,gout);
                u0++;
                hin++;
                hout++;

                intrin_su3_load_512(U,u0);
                intrin_vector_load_512(gin,hin);
                intrin_su3_multiply_512(gout,U,gin);
                intrin_vector_store_512(hout,gout);
                u0++;
                hin++;
                hout++;

                intrin_su3_load_512(U,u0);
                intrin_vector_load_512(gin,hin);
                intrin_su3_multiply_512(gout,U,gin);
                intrin_vector_store_512(hout,gout);

            }
#ifdef _OPENMP
        }
#endif

        t2 = stop_watch(t1);
        mytotal += t2;
    }
    matvecs += 4*Nmul*plqcd_g.VOLUME;
    //}

    MPI_Reduce(&mytotal,&total,1,MPI_DOUBLE,MPI_SUM,0, MPI_COMM_WORLD);
    MPI_Bcast(&total,1,MPI_DOUBLE,0, MPI_COMM_WORLD);

    if (proc_id==0)
    {
        total /= (double)(NPROCS);
    }


    if(proc_id==0)
    {
        fprintf(ofp,"su3mul mic version:\n");
        fprintf(ofp,"------------------------------------------\n");
        fprintf(ofp,"test_hopping\tmult\t%d\ttotal(sec)\t%lf\tMFlops/process\t%lf\n",
                matvecs,total,matvecs*66.0/total/1e+6);
    }
#endif

#endif //MIC

    finalize_plqcd();

    return 0;
}
Exemplo n.º 28
/**
 *
 *  Local Matrix Multiply
 *   Computes C = alpha * A * B + beta * C
 *
 *
 *  Similar to the DGEMM routine in BLAS
 *
 *
 *  alpha and beta are double-precision scalars
 *
 *  A, B, and C are matrices of double-precision elements
 *  stored in column-major format
 *
 *  The output is stored in C
 *  A and B are not modified during computation
 *
 *
 *  m - number of rows of matrix A and rows of C
 *  n - number of columns of matrix B and columns of C
 *  k - number of columns of matrix A and rows of B
 *
 *  lda, ldb, and ldc specifies the size of the first dimension of the matrices
 *
 **/
void local_mm(const int m, const int n, const int k, const double alpha,
    const double *A, const int lda, const double *B, const int ldb,
    const double beta, double *C, const int ldc) {

  int row, col;

  /* Verify the sizes of lda, ldb, and ldc */
  assert(lda >= m);
  assert(ldb >= k);
  assert(ldc >= m);

#ifdef USE_MKL
  const char N = 'N';
  dgemm(&N, &N, &m, &n, &k, &alpha, A, &lda, B, &ldb, &beta, C, &ldc);
#else
# ifdef USE_BLOCKING

  /*
   * Z = 256 KB = 256 * 1024 = 262144
   * b = sqrt(X) = 512
   * sizeof(double) = 8
   *
   * n = 1024
   * m = n^3 / b = 2097152
   *
   *
   *
   */


#pragma omp parallel private(col, row) shared(C)
  {
  int tid;
  int nthreads;

  tid = omp_get_thread_num();
  nthreads = omp_get_num_threads();


  if (tid == 1)
  {
    fprintf(stderr, "nthreads=%i, tid*n/nthreads=%i, tid*m/nthreads=%i\n",
        nthreads, tid*n/nthreads, tid*m/nthreads);
    fprintf(stderr, "(tid+1)*n/nthreads=%i, (tid+1)*m/nthreads=%i\n",
        (tid+1)*n/nthreads, (tid+1)*m/nthreads);

    //fprintf(stderr, "MATRIX A=\n");
    //print_matrix(m, n, A);

    //fprintf(stderr, "\bMATRIX B=\n");
    //print_matrix(m, n, B);
  }

  /* Iterate over the columns of C */
  for (col = 0; col < n; col++) {

    /* Spread the computations among the CPUs; the last CPU may get fewer rows. */
    int row_min = tid * ((float)m/nthreads + 0.5);
    int row_max = MIN((tid+1) * ((float)m/nthreads + 0.5), m);

    /* Iterate over the rows of C */
    for (row = row_min; row < row_max; row++) {

      int k_iter;
      double dotprod = 0.0; /* Accumulates the sum of the dot-product */

      /* Iterate over column of A, row of B */
      for (k_iter = 0; k_iter < k; k_iter++) {
        int a_index, b_index;
        a_index = (k_iter * lda) + row; /* Compute index of A element */
        b_index = (col * ldb) + k_iter; /* Compute index of B element */
        dotprod += A[a_index] * B[b_index]; /* Compute product of A and B */
      } /* k_iter */

      int c_index = (col * ldc) + row;
      C[c_index] = (alpha * dotprod) + (beta * C[c_index]);
    } /* row */
  } /* col */

  }

# else /* OPEN_MP */

#pragma omp parallel for private(col, row)
  /* Iterate over the columns of C */
  for (col = 0; col < n; col++) {

    /* Iterate over the rows of C */
    for (row = 0; row < m; row++) {

      int k_iter;
      double dotprod = 0.0; /* Accumulates the sum of the dot-product */

      /* Iterate over column of A, row of B */
      for (k_iter = 0; k_iter < k; k_iter++) {
        int a_index, b_index;
        a_index = (k_iter * lda) + row; /* Compute index of A element */
        b_index = (col * ldb) + k_iter; /* Compute index of B element */
        dotprod += A[a_index] * B[b_index]; /* Compute product of A and B */
      } /* k_iter */

      int c_index = (col * ldc) + row;
      C[c_index] = (alpha * dotprod) + (beta * C[c_index]);
    } /* row */
  } /* col */
# endif /* USE_BLOCKING, OPEN_MP */
#endif /* USE_MKL */

}
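
A hypothetical driver for local_mm, showing the column-major convention described in the header comment: with B the 2x2 identity and beta = 0, C should reproduce A.

#include <stdio.h>

void local_mm(const int m, const int n, const int k, const double alpha,
    const double *A, const int lda, const double *B, const int ldb,
    const double beta, double *C, const int ldc);

int main(void)
{
    double A[4] = {1.0, 3.0, 2.0, 4.0};   /* column-major [[1,2],[3,4]] */
    double B[4] = {1.0, 0.0, 0.0, 1.0};   /* 2x2 identity */
    double C[4] = {0.0, 0.0, 0.0, 0.0};

    local_mm(2, 2, 2, 1.0, A, 2, B, 2, 0.0, C, 2);

    /* element (i,j) lives at C[j*ldc + i]; expect 1 2 / 3 4 */
    printf("%g %g\n%g %g\n", C[0], C[2], C[1], C[3]);
    return 0;
}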
Exemplo n.º 29
int main(int argc, char *argv[]) {
  int i,j,k;
  machineInformation currentMachine;
  counterSessionInfo session;

  initializeCUDA();

  // Set machine information from CounterHomeBrew.h
  currentMachine.cpu_model = CPU_MODEL;
  currentMachine.num_sockets = NUM_SOCKETS;
  currentMachine.num_phys_cores_per_socket = NUM_PHYS_CORES_PER_SOCKET;
  currentMachine.num_cores_per_socket = NUM_CORES_PER_SOCKET;
  currentMachine.num_cores = NUM_CORES;
  currentMachine.num_cbos = NUM_PHYS_CORES_PER_SOCKET; // should multiply by NUM_SOCKETS???
  currentMachine.core_gen_counter_num_max = CORE_GEN_COUNTER_MAX;
  currentMachine.cbo_counter_num_max = CBO_COUNTER_NUM_MAX;

  // Set session events, umasks and counters used
  //  int32 core_event_numbers[] = {FP_COMP_OPS_EXE_EVTNR,SIMD_FP_256_EVTNR,0x51,0xF1,0x80};
  // int32 core_umasks[] = {FP_COMP_OPS_EXE_SCALAR_DOUBLE_UMASK,SIMD_FP_256_PACKED_DOUBLE_UMASK,0x01, 0x07,0x01};

  session.core_gen_counter_num_used = 5;
  int32 core_event_numbers[] = {0x10,0x10,0x11,0x51,0xF1};
  int32 core_umasks[] = {0x20,0x40,0x01,0x01, 0x07};

  session.cbo_counter_num_used = 1;
  int32 cbo_event_numbers[] = {0x37};
  int32 cbo_umasks[] = {0xf};
  session.cbo_filter = 0x1f;

  for (i = 0; i < session.core_gen_counter_num_used; i++) {
    session.core_event_numbers[i] = core_event_numbers[i];
    session.core_umasks[i] = core_umasks[i];
  }
  for (i = 0; i < session.cbo_counter_num_used; i++) {
    session.cbo_event_numbers[i] = cbo_event_numbers[i];
    session.cbo_umasks[i] = cbo_umasks[i];
  }

  int fd[NUM_CORES];

  // Arrays to hold counter data...
  counterData before;
  counterData after;

  // some data for doing a naive matmul to test flop counting...
  // initloop(N);
  

  // M,N,K are multiples of the block size....
  int gpuOuter = atoi(argv[1]);
  int gpuInner = atoi(argv[2]);
  int cpuInner = atoi(argv[3]);
  double minRuntime = atoi(argv[4]);
  int Md = atoi(argv[5])*block_size;
  int Nd = atoi(argv[6])*block_size;
  int Kd = atoi(argv[7])*block_size;
  int Mh = atoi(argv[8]);
  int Nh = atoi(argv[9]);
  int Kh = atoi(argv[10]);

  char *ts1,*ts2,*ts3,*ts4;
  char *ts5,*ts6,*ts7,*ts8;
  double fineTimeStamps[8];
  double gTime = 0.0;
  double cTime = 0.0;
  double seconds = 0.0;
  int num_iters;

  uint64 *coreSums;
  coreSums = (uint64*)calloc(currentMachine.num_sockets*session.core_gen_counter_num_used,sizeof(uint64));

  uint64 *sums;
  sums = (uint64*)calloc(currentMachine.num_sockets*session.cbo_counter_num_used,sizeof(uint64));

  float *Atmp = NULL;
  float *Btmp = NULL;
  float *Ctmp = NULL;
  Atmp = (float*) malloc( Mh * Nh * sizeof(float) );
  Btmp = (float*) malloc( Nh * sizeof(float) );
  Ctmp = (float*) malloc( Mh * sizeof(float) );
  randomInit(Atmp,Mh*Nh);
  randomInit(Btmp,Nh);

  for (num_iters = cpuInner; seconds < minRuntime; num_iters *=2) {
    seconds = read_timer();   /* mark the start; subtracting from 0.0 yielded a timestamp, not a duration */
    for (i =0; i < num_iters; i++)
      BLASFUNC( CblasColMajor,CblasNoTrans,Mh,Nh, 1, Atmp,Mh, Btmp,1, 1, Ctmp,1 );
    seconds = read_timer()-seconds;
  }
  //  num_iters /= 2;

  free(Atmp);
  free(Btmp);
  free(Ctmp);

  int readyThreads = 0;
  #pragma omp parallel
  {
    int threadNum = omp_get_thread_num();
    int numThreads = omp_get_num_threads();
    assert(numThreads==2);
    if (threadNum == 0) {
      cudaError_t error;
      int memSizeA = sizeof(float)*Md*Nd;
      int memSizeB = sizeof(float)*Nd;
      int memSizeC = sizeof(float)*Md;
      
      float *Ahost,*Bhost,*Chost;
      // use pinned memory on the host for BW and asynch memory transfers..
      int flags = cudaHostAllocDefault;
      ts5 = getTimeStamp();
      fineTimeStamps[0] = read_timer();
      error = cudaHostAlloc((void**)&Ahost,memSizeA,flags);if (error != cudaSuccess){printf("cudaHostMalloc Ahost returned error code %d, line(%d)\n", error, __LINE__);exit(EXIT_FAILURE);}
      error = cudaHostAlloc((void**)&Bhost,memSizeB,flags);if (error != cudaSuccess){printf("cudaHostMalloc Bhost returned error code %d, line(%d)\n", error, __LINE__);exit(EXIT_FAILURE);}
      error = cudaHostAlloc((void**)&Chost,memSizeC,flags);if (error != cudaSuccess){printf("cudaHostMalloc Chost returned error code %d, line(%d)\n", error, __LINE__);exit(EXIT_FAILURE);}
      // set local arrays
      randomInit(Ahost,Md*Nd);
      randomInit(Bhost,Nd);

      // allocate device memory
      float *Adevice,*Bdevice,*Cdevice;
      error = cudaMalloc((void**)&Adevice,memSizeA); if (error != cudaSuccess){printf("cudaMalloc Adevice returned error code %d, line(%d)\n", error, __LINE__);exit(EXIT_FAILURE);}
      error = cudaMalloc((void**)&Bdevice,memSizeB); if (error != cudaSuccess){printf("cudaMalloc Bdevice returned error code %d, line(%d)\n", error, __LINE__);exit(EXIT_FAILURE);}
      error = cudaMalloc((void**)&Cdevice,memSizeC); if (error != cudaSuccess){printf("cudaMalloc Cdevice returned error code %d, line(%d)\n", error, __LINE__);exit(EXIT_FAILURE);}
      fineTimeStamps[1] = read_timer();
      ts6 = getTimeStamp();
#pragma omp critical
      {
	readyThreads += 1;
      }
      //     fprintf(stderr,"Incremented ready GPU\n");
      while (readyThreads < 2){sleep(1);fprintf(stderr,"Thread 0: %d\n",readyThreads);};

      //#pragma omp single 
      //{
      cudaStream_t stream1;
      cudaStreamCreate ( &stream1) ;
      ts3 = getTimeStamp();
      fineTimeStamps[2] = read_timer();
      gTime = read_timer();
      for (int i = 0; i < gpuOuter; i++) 
	GPUsgemv(gpuInner,Md,Nd,Kd,Adevice,Bdevice,Cdevice,Ahost,Bhost,Chost,&stream1);
      cudaStreamSynchronize(stream1);
      gTime = read_timer() - gTime;
      fineTimeStamps[3] = read_timer();
      ts4 = getTimeStamp();
      cudaFreeHost(Ahost);
      cudaFreeHost(Bhost);
      cudaFreeHost(Chost);

    } else {
      //  uint64 min_iters = strtoull(argv[4],NULL,0);
      float *A = NULL;
      float *B = NULL;
      float *C = NULL;
      ts7 = getTimeStamp();
      fineTimeStamps[4] = read_timer();
      A = (float*) malloc( Mh * Nh * sizeof(float) );
      B = (float*) malloc( Nh * sizeof(float) );
      C = (float*) malloc( Mh * sizeof(float) );
      randomInit(A,Mh*Nh);
      randomInit(B,Nh);
      fineTimeStamps[5] = read_timer();
      ts8 = getTimeStamp();
#pragma omp critical
      {
	readyThreads += 1;
      }
      //   fprintf(stderr,"Incremented ready CPU\n");
      while (readyThreads < 2){sleep(1);fprintf(stderr,"Thread 1: %d\n",readyThreads);};
                  
      // open the msr files for each core on the machine
      for (i = 0; i < currentMachine.num_cores; i++)
	open_msr_file(i,&fd[i]);
      
      
      int socketsProgrammed = 0;
      for (i = 0; i < currentMachine.num_cores; i++) {
	int currentCoreFD = fd[i];
	
	stopCounters(i, currentCoreFD, &currentMachine, &session);
	programCoreFixedCounters(currentCoreFD);    
	programGeneralPurposeRegisters(currentCoreFD, &currentMachine, &session);
	
	/* Program the Uncore as desired...*/
	// Only program the first physical core on each socket. 
	// NOTE: Some assumptions about topology here...check /proc/cpuinfo to confirm.
	if (i % currentMachine.num_phys_cores_per_socket == 0 && socketsProgrammed < currentMachine.num_sockets) {
	  programUncoreCounters( currentCoreFD, &currentMachine, &session);
	  socketsProgrammed++;
	}
      }
      
      seconds = 0.0;
      
      // start the programmed counters...
      for (i = 0; i < currentMachine.num_cores; i++)
	startCounters( i, fd[i], &currentMachine, &session);
      
      /* READ COUNTERS BEFORE STUFF */
      readCounters(fd,&currentMachine,&session, &before);
      ts1 = getTimeStamp();
      fineTimeStamps[6] = read_timer();
      seconds = read_timer();
      
      /* DO STUFF */    
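      // BLASFUNC appears to be a cblas_sgemv-style macro: each iteration
      // computes the matrix-vector product C = 1*A*B + 1*C for the
      // Mh x Nh column-major matrix A.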
      for (i =0; i < num_iters; i++)
	BLASFUNC( CblasColMajor,CblasNoTrans,Mh,Nh, 1, A,Mh, B,1, 1, C,1 );
      
      /* END DOING STUFF */
      
      seconds = read_timer()-seconds;
      fineTimeStamps[7] = read_timer();
      ts2 = getTimeStamp();
      
      /* READ COUNTERS AFTER STUFF */    
      for (i = 0; i < currentMachine.num_cores; i++)
	stopCounters(i,fd[i],&currentMachine, &session);
      
      //  printf("num_iters = %"PRIu64", runtime is %g\n",num_iters,seconds);
      
      readCounters(fd,&currentMachine,&session,&after);
      diffCounterData(&currentMachine, &session, &after, &before, &after);
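      // "after" now holds the per-counter deltas (after - before) over the
      // timed region; the loops below sum the core and uncore deltas per event.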
      
      for (i = 0; i < currentMachine.num_sockets; i++) {
	//    printf("Socket %d\n",i);
	for (j = 0; j < currentMachine.num_cores_per_socket; j++) {
	  //   printf("%d,",j);
	  for (k = 0; k < session.core_gen_counter_num_used; k++){
	    //	printf("%"PRIu64",",after.generalCore[i*currentMachine.num_cores_per_socket + j][k]);
	    // bug in the indexing of the core sums???
	    //        coreSums[i*session.core_gen_counter_num_used + k] += after.generalCore[i*currentMachine.num_cores_per_socket + j][k];
	    coreSums[k] += after.generalCore[i*currentMachine.num_cores_per_socket + j][k];
	  }
	  //	printf("\n");
	}
      }
      
      for (i = 0; i < currentMachine.num_sockets; i++) {
	//	printf("%d,",i);
	for (j = 0; j < currentMachine.num_cbos; j++) {
	  //	  printf("%d,",j);
	  for (k = 0; k < session.cbo_counter_num_used; k++) {
	    //	    printf("%llu,",after.cboUncore[i*currentMachine.num_phys_cores_per_socket + j][k]);
	    // bug in the indexing of the core sums???
	    //        sums[i*session.cbo_counter_num_used + k] += after.cboUncore[i*currentMachine.num_phys_cores_per_socket + j][k];
	    sums[k] += after.cboUncore[i*currentMachine.num_phys_cores_per_socket + j][k];
	  }
	}
      }
      //      printf("\n");
            
      // Stop counters, reset PMU, close msr files
      cleanup(fd,&currentMachine,&session);
      
      
      free(A);
      free(B);
      free(C);
    }
  } // end parallel region

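  // Emit one CSV row: timestamps, problem sizes, block/iteration counts,
  // wall-clock times, then the per-event core and uncore counter sums.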
  printf("%s,%s,%s,%s,%s,%s,%s,%s,%d,%d,%d,%d,%d,%d,%d,%d,%d,%f,%f,%f,",ts7,ts8,ts1,ts2,ts5,ts6,ts3,ts4,Mh,Nh,Kh,Md/block_size,Nd/block_size,Kd/block_size,num_iters,gpuOuter,gpuInner,seconds,gTime,(float)(gpuOuter*(Md*Kd+Nd+Md))/16.0);
  for (int i = 0; i < 8; i++)
    printf("%f,",fineTimeStamps[i]);

  for (j = 0; j < session.core_gen_counter_num_used; j++)
    printf("%llu,",coreSums[j]);
  for (j = 0; j < session.cbo_counter_num_used; j++)
    if (j == session.cbo_counter_num_used-1)
      printf("%llu",sums[j]);
    else
      printf("%llu,",sums[j]);
  printf("\n");
  
  free(sums);
  free(coreSums);
  
  return 0;
}
Example no. 30
int main(int argc, char* argv[])
{
    int nt, ncmp, ncdp, nh, nh2, nm, nd, memsize, niter, reg, ix, ih, i3, i2, i1, iter, filt, nw, np;
    float t0, cmp0, cdp0, h0, dt, dcmp, dcdp, dh, apt, rho, aal, norm;
    bool verb, half, amp;
    float ***data, ***modl, **vrms, **mask, *off, *error=NULL;
    float **pp, **qq, *aa;
    char *errfile;
    sf_file in, out, vel, offset, err=NULL;
    sf_file fdip;

    int ompchunk = 1;
    int ompnth = 1;

#ifdef _OPENMP
    int ompath=1;
#endif

    /*------------------------------------------------------------*/
    sf_init(argc,argv);

    if(! sf_getint("ompchunk",&ompchunk)) ompchunk=1;
    /* OpenMP data chunk size */
#ifdef _OPENMP
    if(! sf_getint("ompnth",  &ompnth))     ompnth=0;
    /* OpenMP available threads */

#pragma omp parallel
    ompath=omp_get_num_threads();
    if(ompnth<1) ompnth=ompath;
    omp_set_num_threads(ompnth);
    sf_warning("using %d threads of a total of %d",ompnth,ompath);
#endif

    in = sf_input("in");
    vel = sf_input("vel");
    out = sf_output("out");

    if (!sf_getbool("verb",&verb)) verb=false;
    /* verbosity flag */

    if (!sf_getbool("half",&half)) half = true;
    /* if y, the third axis is half-offset instead of full offset */

    if (!sf_getbool("amp",&amp)) amp = true;
    /* if y, use amplitude factor */

    if (!sf_histint(in,"n1",&nt)) sf_error("No n1= in input");
    if (!sf_histfloat(in,"d1",&dt)) sf_error("No d1= in input");
    if (!sf_histfloat(in,"o1",&t0)) sf_error("No o1= in input");

    if (!sf_histint(in,"n2",&ncmp)) sf_error("No n2= in input");
    if (!sf_histfloat(in,"d2",&dcmp)) sf_error("No d2= in input");
    if (!sf_histfloat(in,"o2",&cmp0)) sf_error("No o2= in input");

    if (!sf_getint("ncdp",&ncdp)) ncdp = ncmp;
    if (!sf_getfloat("dcdp",&dcdp)) dcdp = dcmp;
    if (!sf_getfloat("cdp0",&cdp0)) cdp0 = cmp0;

    sf_putint(out,"n2",ncdp);
    sf_putfloat(out,"d2",dcdp);
    sf_putfloat(out,"o2",cdp0);

    if (!sf_histint(in,"n3",&nh)) sf_error("No n3= in input");

    if (NULL != sf_getstring("offset")) {
        offset = sf_input("offset");
        nh2 = sf_filesize(offset);

        if (nh2 != nh*ncmp) sf_error("Wrong dimensions in offset: expected %d elements",nh*ncmp);

        off = sf_floatalloc(nh2);
        sf_floatread (off,nh2,offset);
        sf_fileclose(offset);
        if (!half) {
           for (ih = 0; ih < nh2; ih++) {
               off[ih] *= 0.5;
            }
        }
    } else {
        if (!sf_histfloat(in,"o3",&h0)) sf_error("No o3=");
        if (!sf_histfloat(in,"d3",&dh)) sf_error("No d3=");

        if (!half) { dh *= 0.5f; h0 *= 0.5f; }

        off = sf_floatalloc(nh*ncmp);
        for (ix = 0; ix < ncmp; ix++) {
            for (ih = 0; ih < nh; ih++) {
                off[ih*ncmp+ix] = h0 + ih*dh;
            }
        }
        offset = NULL;
    }

    if (!sf_getint("reg",&reg)) reg=0;
    /* regularization type */ 

    if (!sf_getfloat("antialias",&aal)) aal = 1.0;
    /* antialiasing */

    if (!sf_getfloat("apt",&apt)) apt=ncmp;
    /* migration aperture */

    if (!sf_getfloat("rho",&rho)) rho = 1.-1./nt;
    /* Leaky integration constant */

    if (!sf_getint("niter",&niter)) niter=5;
    /* number of iterations */


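    /* model: nt*ncdp*nh samples; data: nt*ncmp*nh samples */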
    nm = nt*ncdp*nh;
    nd = nt*ncmp*nh;

    vrms = sf_floatalloc2(nt,ncdp);
    mask = sf_floatalloc2(ncmp,nh);
    data = sf_floatalloc3(nt,ncmp,nh);
    modl = sf_floatalloc3(nt,ncdp,nh);

    /* read velocity file */
    sf_floatread(vrms[0],nt*ncdp,vel);
    sf_fileclose(vel);

    memsize = nm+nd+nt*ncdp+ncmp*nh;
    if (verb) sf_warning("memory needs: %f G (%f M)",4.*memsize/1024/1024/1024,4.*memsize/1024/1024);

    if (niter > 0) {
        errfile = sf_getstring("err");
        /* output file for error */
        if (NULL != errfile) {
            err = sf_output(errfile);
            sf_putint(err,"n1",niter);
            sf_putfloat(err,"d1",1);
            sf_putfloat(err,"o1",1);
            sf_putstring(err,"label1","Iteration Number");
            sf_putstring(err,"label2","Relative Squared Error");
            sf_putint(err,"n2",1);
            sf_putint(err,"n3",1);
        }
        error = sf_floatalloc(niter);
    }

    sf_floatread(data[0][0],nd,in);

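    /* Energy of each trace (dot product with itself), passed to tkirmig_init
       as a mask -- presumably so dead (all-zero) traces carry no weight. */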
    for (i3=0; i3 < nh; i3++) {
        for (i2=0; i2 < ncmp; i2++) {
            mask[i3][i2]=cblas_sdot(nt,data[i3][i2],1,data[i3][i2],1);
        }
    }

    tkirmig_init(ompnth,ompchunk,nt,dt,t0,ncmp,dcmp,cmp0,ncdp,dcdp,cdp0,nh,dh,h0,apt,aal,rho,vrms,off,mask,amp,verb);

    sf_cdstep_init();

    if (verb) sf_warning("Iteration begin...");

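    /* Regularization switch, inferred from the operator names: reg=0 plain
       solver, reg=1 convolution (tcaih) regularization, reg=2 causal
       integration preconditioner, reg=3 triangle smoothing along offset,
       reg=4 plane-wave destruction (dip) preconditioner. */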
    if (reg == 0)
       sf_solver(tkirmig_lop,sf_cdstep,nm,nd,modl[0][0],data[0][0],
                 niter,"nmem",0,"nfreq",niter,"err",error,"end");

    else if (reg == 1) {
       filt=2;
       aa=sf_floatalloc(filt);
       aa[0]=1.;
       aa[1]=-1.;
       tcaih_init(filt,aa,nt,ncdp,nh);
       sf_solver_reg(tkirmig_lop,sf_cdstep,tcaih_lop,nm+filt*nt*ncdp,nm,nd,
                    modl[0][0],data[0][0],niter,0.01,"nmem",0,"nfreq",niter,
                    "err",error,"end");
    }
    else if (reg == 2) {
       sf_causinth_init(nt,ncdp,nh);
       sf_solver_prec(tkirmig_lop,sf_cdstep,sf_causinth_lop,nm,nm,nd,
                      modl[0][0],data[0][0],niter,0.01,"nmem",0,"nfreq",niter,
                      "err",error,"end");
    }
    else if (reg == 3) {
       sf_triangleh_init(3,nt,ncdp,nh);
       sf_solver_prec(tkirmig_lop,sf_cdstep,sf_triangleh_lop,nm,nm,nd,
                      modl[0][0],data[0][0],niter,0.01,"nmem",0,"nfreq",niter,
                      "err",error,"end");
    }

    else if (reg == 4) {
       sf_warning("pwd constraints along t-x plane and smoothing along offset axis");
       if (!sf_getstring("fdip")) sf_error("Need input dip file!");
       if (!sf_getint("nw",&nw)) nw=3;
       fdip = sf_input("fdip");

       if (!sf_histint(fdip,"n3",&np)) np=1;
       sf_warning("np=%d",np);
       pp = sf_floatalloc2(nt,ncdp);

       if (np > 1) {
          qq = sf_floatalloc2(nt,ncdp);
       } else {
          qq = NULL;
       }

       if (NULL != qq) {
          predicth2_init(nt,ncdp,nh,0.1,nw,pp,qq);
       } else {
          predicth_init(nt,ncdp,nh,0.1,nw,1,false);
          predict_set(pp);
       }

       sf_floatread(pp[0],nt*ncdp,fdip);

       if (NULL != qq) {
          sf_floatread(qq[0],nt*ncdp,fdip);
          sf_solver_prec(tkirmig_lop,sf_cdstep,predicth2_lop,nm,nm,nd,
                      modl[0][0],data[0][0],niter,0.01,"nmem",0,"nfreq",niter,
                      "err",error,"end");
         predict2_close();
       } else {
         sf_solver_prec(tkirmig_lop,sf_cdstep,predicth_lop,nm,nm,nd,
                      modl[0][0],data[0][0],niter,0.01,"nmem",0,"nfreq",niter,
                      "err",error,"end");
         predict_close();
      }
    }

    sf_cdstep_close();
    sf_floatwrite(modl[0][0],nm,out);

    if (NULL != err) {
       norm = 0.0f;   /* norm was declared but never initialized */
       for (i3=0; i3 < nh; i3++) {
           for (i2=0; i2 < ncmp; i2++) {
               for (i1=0; i1 < nt; i1++) {
                   norm += data[i3][i2][i1]*data[i3][i2][i1];
               }
           }
        }
        
        for (iter=0; iter < niter; iter++) error[iter] /=norm;
        sf_floatwrite(error,niter,err);
    }

    sf_warning("iter/niter=%d/%d, err=%f",iter,niter,error);

    exit(0);
}