Example #1
double tpi_dot( int n , const VECTOR_SCALAR * x , const VECTOR_SCALAR * y )
{
  struct tpi_work_vector tmp = { 0.0 , 0.0 , NULL , NULL , NULL , 0 };
  double result = 0.0 ;
  tmp.x = x ;
  tmp.y = y ;
  tmp.n = n ;
  if ( x != y ) {
    TPI_Run_threads_reduce( tpi_work_dot_partial , & tmp ,
                            tpi_work_dot_join , tpi_work_dot_init ,
                            sizeof(result) , & result );
  }
  else {
    TPI_Run_threads_reduce( tpi_work_dot_partial_self , & tmp ,
                            tpi_work_dot_join , tpi_work_dot_init ,
                            sizeof(result) , & result );
  }
#if defined HAVE_MPI
  {
    double tmp = result ;
    MPI_Allreduce( & tmp , & result , 1 , MPI_DOUBLE , MPI_SUM , MPI_COMM_WORLD );
  }
#endif
  return result ;
}
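
The partial, join, and init callbacks used above are not shown in this example. Below is a minimal sketch of what they could look like, assuming TPI's convention that each callback receives a TPI_Work pointer whose 'info' member refers to the shared argument (&tmp above) and whose 'reduce' member refers to the calling thread's private copy of the reduction value; only the x, y, and n members of struct tpi_work_vector are taken from the example, everything else here is an assumption.

/* Sketch only: reconstructed reduction callbacks for tpi_dot().
 * Assumes TPI_Work exposes 'info', 'reduce', 'rank', and 'count';
 * consult TPI.h for the actual declarations. */
#include <TPI.h>

static void tpi_work_dot_init( TPI_Work * work )
{ *((double *) work->reduce) = 0.0 ; }   /* identity value for summation */

static void tpi_work_dot_join( TPI_Work * work , const void * src )
{ *((double *) work->reduce) += *((const double *) src) ; }  /* merge a partial sum */

static void tpi_work_dot_partial( TPI_Work * work )
{
  const struct tpi_work_vector * const h =
    (const struct tpi_work_vector *) work->info ;
  double * const s = (double *) work->reduce ;

  /* Each thread sums a contiguous block of the index range. */
  const int beg = (int)( (long) h->n *   work->rank       / work->count );
  const int end = (int)( (long) h->n * ( work->rank + 1 ) / work->count );
  int i ;

  for ( i = beg ; i < end ; ++i ) { *s += h->x[i] * h->y[i] ; }
}
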
Example #2
 static typename WDP::ReductionType
 parallel_reduce(int beg, int end, WDP wd) {
   typedef typename WDP::ReductionType ReductionType;
   ReductionType result = wd.identity();
   WDPPlusRange<WDP> wdp_plus(beg,end,wd);
   TPI_Run_threads_reduce(tpi_reduction_work<WDP>, &wdp_plus,
                          tpi_reduction_join<WDP>,
                          tpi_reduction_init<WDP>, sizeof(result), &result);
   return result;
 }
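
Here WDPPlusRange<WDP> evidently bundles the user's work-data-pair functor wd with the [beg,end) index range, and the templated tpi_reduction_work, tpi_reduction_join, and tpi_reduction_init helpers (not shown here) presumably unpack that bundle inside TPI's C callback signature to drive the functor's per-element work and pairwise joins, with wd.identity() supplying the initial value. Since TPI is handed the reduction state as a sizeof(result)-byte buffer, this adapter is most naturally used with plain-value ReductionTypes.
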
Example #3
void test_tpi_reduce( const int ntest , const int nthread[] , const int ntrial )
{
  int j ;

  fprintf( stdout , "\n\"TEST TPI_Run_threads(reduce) / TPI_Run_threads_reduce\"\n" );
  fprintf( stdout , "\"#Thread\" , \"#Trial\" , \"TPI_Run_threads(avg-msec)\" , \"TPI_Run_threads(stddev-msec)\" , \"TPI_Run_threads_reduce(avg-msec)\" , \"TPI_Run_threads_reduce(stddev-msec)\"\n");

  for ( j = 0 ; j < ntest ; ++j ) {
    const int nth = nthread[j];

    double dt_lock_total   = 0.0 ;
    double dt_lock_total_2 = 0.0 ;
    double dt_reduce_total    = 0.0 ;
    double dt_reduce_total_2  = 0.0 ;
    int i ;

    int result = TPI_Init( nth );

    if ( result != nth ) {
      fprintf(stderr,"%d != TPI_Init(%d) : FAILED\n", result , nth );
    }

    for ( i = 0 ; i < ntrial ; ++i ) {
      double t , dt ;
      int value = 0 ;
      int * const ptr = & value ;

      t = TPI_Walltime();
      TPI_Run_threads( test_reduce_via_lock , & ptr , 1 );
      dt = TPI_Walltime() - t ;
      dt_lock_total += dt ;
      dt_lock_total_2 += dt * dt ;

      if ( value != nth ) {
        fprintf(stderr,
                "TPI_Run_threads(reduce,...) : FAILED at trial %d\n",
                i );
        abort();
      }

      value = 0 ;

      t = TPI_Walltime();
      TPI_Run_threads_reduce( test_reduce_via_nolock , NULL ,
                              test_reduce_join , test_reduce_init ,
                              sizeof(value) , & value );
  
      dt = TPI_Walltime() - t ;
      dt_reduce_total += dt ;
      dt_reduce_total_2 += dt * dt ;

      if ( value != nth ) {
        fprintf(stderr,
                "TPI_Run_threads_reduce(...) : FAILED at trial %d\n",
                i );
        abort();
      }
    }

    TPI_Finalize();

    if ( 1 < ntrial ) {
      const double lock_mean = 1.0e6 * dt_lock_total / ntrial ;
      const double lock_sdev = 1.0e6 * sqrt( ( ntrial * dt_lock_total_2 -
                                       dt_lock_total * dt_lock_total ) /
                                     ( ntrial * ( ntrial - 1 ) ) );

      const double reduce_mean = 1.0e6 * dt_reduce_total / ntrial ;
      const double reduce_sdev = 1.0e6 * sqrt( ( ntrial * dt_reduce_total_2 -
                                         dt_reduce_total * dt_reduce_total) /
                                       ( ntrial * ( ntrial - 1 ) ) );
      
      fprintf(stdout,"%d , %d , %10g , %10g , %10g , %10g\n",
              nth, ntrial, lock_mean, lock_sdev, reduce_mean, reduce_sdev);
    }
  }
}
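
The benchmark above compares a lock-protected shared counter (test_reduce_via_lock, run through TPI_Run_threads with a lock count of 1) against TPI's per-thread reduction. The callbacks themselves are not shown; the following is a minimal sketch consistent with the value == nth checks (every thread contributes 1), under the same assumed TPI_Work conventions as the sketch after Example #1.

/* Sketch only: callbacks consistent with the 'value == nth' checks above. */
#include <TPI.h>

static void test_reduce_via_lock( TPI_Work * work )
{
  int * const value = *((int * const *) work->info );
  TPI_Lock( 0 );        /* serialize updates to the shared counter */
  *value += 1 ;
  TPI_Unlock( 0 );
}

static void test_reduce_init( TPI_Work * work )
{ *((int *) work->reduce) = 0 ; }

static void test_reduce_join( TPI_Work * work , const void * src )
{ *((int *) work->reduce) += *((const int *) src) ; }

static void test_reduce_via_nolock( TPI_Work * work )
{ *((int *) work->reduce) += 1 ; }   /* private per-thread copy, no lock needed */
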
Example #4
double dcrs_apply_and_dot(
  const struct distributed_crs_matrix * matrix ,
  VECTOR_SCALAR * x ,
  VECTOR_SCALAR * y ,
  const int overlap_communication )
{
  struct work_dcrs info ;

  double result = 0.0 ;

  info.matrix = matrix ;
  info.x      = x ;
  info.y      = y ;

  if ( overlap_communication &&
       matrix->n_internal_row < matrix->n_local_row ) {

    double remote_result = 0 ;

    /* Start the internal matrix-vector multiply */
    /* result += dot( output = A * input , input ); */

    info.jBeg = 0 ;
    info.jEnd = matrix->n_internal_row ;

    /*  Divide internal work evenly among worker threads.
     *  This leaves the primary thread completely out of the computation.
     */
    TPI_Start_threads_reduce( tpi_work_dcrs_apply_and_dot , & info , 
                              tpi_work_dot_join ,
                              tpi_work_dot_init ,
                              sizeof(result) , & result );

    get_off_process_entries( matrix , x );

    TPI_Wait(); /* Wait for internal result */

    info.jBeg = matrix->n_internal_row ;
    info.jEnd = matrix->n_local_row ;

    TPI_Run_threads_reduce( tpi_work_dcrs_apply_and_dot , & info , 
                            tpi_work_dot_join ,
                            tpi_work_dot_init ,
                            sizeof(remote_result) , & remote_result );

    result += remote_result ;
  }
  else {
    info.jBeg = 0 ;
    info.jEnd = matrix->n_local_row ;

    get_off_process_entries( matrix , x );

    TPI_Run_threads_reduce( tpi_work_dcrs_apply_and_dot , & info , 
                            tpi_work_dot_join ,
                            tpi_work_dot_init ,
                            sizeof(result) , & result );
  }

  result = comm_sum( result );

  return result ;
}
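
The overlap branch starts the multiply on the owned interior rows with TPI_Start_threads_reduce, performs the off-process gather on the calling thread, then calls TPI_Wait() before reducing over the remaining boundary rows. The work routine itself is not shown; below is a minimal sketch of how it could partition the [jBeg,jEnd) row range, assuming the TPI_Work conventions used in the earlier sketches and a conventional CRS layout whose member names (row_begin, col_index, values) are hypothetical placeholders rather than the library's actual names.

/* Sketch only: a possible tpi_work_dcrs_apply_and_dot.  The CRS members
 * 'row_begin', 'col_index', and 'values' are hypothetical placeholders. */
static void tpi_work_dcrs_apply_and_dot( TPI_Work * work )
{
  const struct work_dcrs * const info = (const struct work_dcrs *) work->info ;
  const struct distributed_crs_matrix * const M = info->matrix ;
  double * const dot = (double *) work->reduce ;

  /* Split the caller's row range [jBeg,jEnd) evenly among the threads. */
  const int nrow = info->jEnd - info->jBeg ;
  const int beg  = info->jBeg + (int)( (long) nrow *   work->rank       / work->count );
  const int end  = info->jBeg + (int)( (long) nrow * ( work->rank + 1 ) / work->count );
  int row , k ;

  for ( row = beg ; row < end ; ++row ) {
    /* y[row] = A(row,:) * x ; then accumulate dot( y , x ). */
    double tmp = 0.0 ;
    for ( k = M->row_begin[row] ; k < M->row_begin[row+1] ; ++k ) {
      tmp += M->values[k] * info->x[ M->col_index[k] ];
    }
    info->y[row] = tmp ;
    *dot += tmp * info->x[row] ;
  }
}
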