Example #1
/* Assumed headers: <TPI.h> is the Trilinos ThreadPool interface; the
 * test_work / test_reduce_* callbacks are defined elsewhere in the test
 * source (an illustrative sketch follows the function below).
 */
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <TPI.h>

void test_tpi_work_async(
  const int ntest , const int nthread[] , const int nwork , const int ntrial )
{
  int * const flags = (int *) malloc( sizeof(int) * nwork );
  int j ;

  fprintf( stdout , "\n\"TEST TPI_Start / TPI_Start_reduce\"\n" );
  fprintf( stdout , "\"#Thread\" , \"#Work\" , \"#Trial\" , \"TPI_Start(avg-msec)\" , \"TPI_Start(stddev-msec)\" , \"TPI_Start_reduce(avg-msec)\" , \"TPI_Start_reduce(stddev-msec)\"\n");

  for ( j = 0 ; j < ntest ; ++j ) {
    const int nth = nthread[j];

    double dt_work_total   = 0.0 ;
    double dt_work_total_2 = 0.0 ;
    double dt_reduce_total    = 0.0 ;
    double dt_reduce_total_2  = 0.0 ;
    int i , k ;

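    /* Start a pool of 'nth' threads; TPI_Init returns the thread count on success. */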
    int result = TPI_Init( nth );

    if ( result != nth ) {
      fprintf(stderr,"%d != TPI_Init(%d) : FAILED\n", result , nth );
    }

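    /* Each trial times one asynchronous plain run and one asynchronous reducing run. */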
    for ( i = 0 ; i < ntrial ; ++i ) {
      double t , dt ;
      int value = 0 ;

      for ( k = 0 ; k < nwork ; ++k ) { flags[k] = 0 ; }

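      /* Time TPI_Start / TPI_Wait over 'nwork' work items. */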
      t = TPI_Walltime();
      TPI_Start( test_work , & flags , nwork , 0 );
      TPI_Wait();
      dt = TPI_Walltime() - t ;
      dt_work_total += dt ;
      dt_work_total_2 += dt * dt ;

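      /* Verify every work item ran: stop at the first unset flag. */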
      for ( k = 0 ; k < nwork && flags[k] ; ++k );

      if ( k < nwork ) {
        fprintf(stderr, "TPI_Run(...) : FAILED at trial %d\n", i );
        abort();
      }

      for ( k = 0 ; k < nwork ; ++k ) { flags[k] = 0 ; }

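      /* Time TPI_Start_reduce / TPI_Wait; each thread's partial count is joined into 'value'. */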
      t = TPI_Walltime();

      TPI_Start_reduce( test_reduce_work , & flags , nwork ,
                        test_reduce_join , test_reduce_init ,
                        sizeof(value) , & value );
      TPI_Wait();
      dt = TPI_Walltime() - t ;
      dt_reduce_total += dt ;
      dt_reduce_total_2 += dt * dt ;

      for ( k = 0 ; k < nwork && flags[k] ; ++k );

      if ( value != nwork || k < nwork ) {
        fprintf(stderr, "TPI_Run_reduce(...) : FAILED at trial %d\n", i );
        abort();
      }
    }

    TPI_Finalize();

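    /* Report mean and sample standard deviation of the timings, converted to microseconds. */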
    if ( 1 < ntrial ) {
      const double work_mean = 1.0e6 * dt_work_total / ntrial ;
      const double work_sdev = 1.0e6 * sqrt( ( ntrial * dt_work_total_2 -
                                       dt_work_total * dt_work_total ) /
                                     ( ntrial * ( ntrial - 1 ) ) );

      const double reduce_mean = 1.0e6 * dt_reduce_total / ntrial ;
      const double reduce_sdev = 1.0e6 * sqrt( ( ntrial * dt_reduce_total_2 -
                                         dt_reduce_total * dt_reduce_total) /
                                       ( ntrial * ( ntrial - 1 ) ) );
      
      fprintf(stdout,"%d , %d , %d , %10g , %10g , %10g , %10g\n",
              nth, ntrial, nwork, work_mean, work_sdev, reduce_mean, reduce_sdev);
    }
  }

  free( flags );
}
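
For reference, a minimal sketch of the helper callbacks the test above assumes
(test_work, test_reduce_work, test_reduce_init, test_reduce_join). The real
definitions live elsewhere in the test source; these bodies are illustrative
only, written against the TPI_Work interface declared in TPI.h, and chosen so
the checks above hold (every flag set, and value == nwork after the join).

static void test_work( TPI_Work * work )
{
  /* 'info' is the '& flags' passed to TPI_Start: a pointer to the array. */
  int * const * const flags = (int * const *) work->info ;
  (*flags)[ work->rank ] = 1 ;  /* mark this work item as executed */
}

static void test_reduce_init( TPI_Work * work )
{
  *((int *) work->reduce) = 0 ;  /* zero this thread's private tally */
}

static void test_reduce_join( TPI_Work * work , const void * reduce )
{
  /* fold another thread's tally into this thread's tally */
  *((int *) work->reduce) += *((const int *) reduce) ;
}

static void test_reduce_work( TPI_Work * work )
{
  int * const * const flags = (int * const *) work->info ;
  (*flags)[ work->rank ] = 1 ;
  *((int *) work->reduce) += 1 ;  /* per-item count; joined total == nwork */
}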
Example #2
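/* Applies a distributed CRS matrix and accumulates dot( A * x , x ), per the
 * inline comment below.  VECTOR_SCALAR, struct work_dcrs, the tpi_work_*
 * callbacks, get_off_process_entries(), and comm_sum() are assumed to be
 * declared elsewhere in the surrounding sources.
 */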
double dcrs_apply_and_dot(
  const struct distributed_crs_matrix * matrix ,
  VECTOR_SCALAR * x ,
  VECTOR_SCALAR * y ,
  const int overlap_communication )
{
  struct work_dcrs info ;

  double result = 0.0 ;

  info.matrix = matrix ;
  info.x      = x ;
  info.y      = y ;

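  /* Overlap communication with computation: the internal rows reference only
   * on-process entries of 'x', so they can be applied while off-process
   * entries are being fetched.
   */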
  if ( overlap_communication &&
       matrix->n_internal_row < matrix->n_local_row ) {

    double remote_result = 0.0 ;

    /* Start the internal matrix-vector multiply */
    /* result += dot( output = A * input , input ); */

    info.jBeg = 0 ;
    info.jEnd = matrix->n_internal_row ;

    /*  Divide internal work evenly among worker threads.
     *  This leaves the primary thread completely out of the computation.
     */
    TPI_Start_threads_reduce( tpi_work_dcrs_apply_and_dot , & info , 
                              tpi_work_dot_join ,
                              tpi_work_dot_init ,
                              sizeof(result) , & result );

    get_off_process_entries( matrix , x );

    TPI_Wait(); /* Wait for internal result */

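    /* Off-process entries of 'x' have arrived; apply the remaining boundary rows. */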
    info.jBeg = matrix->n_internal_row ;
    info.jEnd = matrix->n_local_row ;

    TPI_Run_threads_reduce( tpi_work_dcrs_apply_and_dot , & info , 
                            tpi_work_dot_join ,
                            tpi_work_dot_init ,
                            sizeof(remote_result) , & remote_result );

    result += remote_result ;
  }
  else {
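    /* No overlap: fetch off-process entries first, then apply all local rows at once. */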
    info.jBeg = 0 ;
    info.jEnd = matrix->n_local_row ;

    get_off_process_entries( matrix , x );

    TPI_Run_threads_reduce( tpi_work_dcrs_apply_and_dot , & info , 
                            tpi_work_dot_join ,
                            tpi_work_dot_init ,
                            sizeof(result) , & result );
  }

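  /* Sum the per-process partial dot products to get the global result. */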
  result = comm_sum( result );

  return result ;
}