double tpi_dot( int n , const VECTOR_SCALAR * x , const VECTOR_SCALAR * y )
{
  struct tpi_work_vector tmp = { 0.0 , 0.0 , NULL , NULL , NULL , 0 };
  double result = 0.0 ;

  tmp.x = x ;
  tmp.y = y ;
  tmp.n = n ;

  /* Use the self-dot kernel when x and y alias the same vector. */
  if ( x != y ) {
    TPI_Run_threads_reduce( tpi_work_dot_partial , & tmp ,
                            tpi_work_dot_join , tpi_work_dot_init ,
                            sizeof(result) , & result );
  }
  else {
    TPI_Run_threads_reduce( tpi_work_dot_partial_self , & tmp ,
                            tpi_work_dot_join , tpi_work_dot_init ,
                            sizeof(result) , & result );
  }

#if defined HAVE_MPI
  /* Sum the thread-parallel local result across all MPI processes. */
  {
    double tmp = result ;
    MPI_Allreduce( & tmp , & result , 1 , MPI_DOUBLE , MPI_SUM ,
                   MPI_COMM_WORLD );
  }
#endif

  return result ;
}
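/* Example driver (sketch): computes a dot product with tpi_dot above.
 * Assumes VECTOR_SCALAR is typedef'd to double and that TPI_Init /
 * TPI_Finalize bracket all threaded calls, as in the timing test below.
 * Illustration only, not part of the original source. */
#include <stdio.h>
#include <stdlib.h>

int main( void )
{
  const int n = 1000 ;
  const int nthread = 4 ;
  VECTOR_SCALAR * const x = (VECTOR_SCALAR *) malloc( n * sizeof(VECTOR_SCALAR) );
  VECTOR_SCALAR * const y = (VECTOR_SCALAR *) malloc( n * sizeof(VECTOR_SCALAR) );
  int i ;

  for ( i = 0 ; i < n ; ++i ) { x[i] = 1.0 ; y[i] = 2.0 ; }

  TPI_Init( nthread );                            /* spawn the worker thread pool */
  printf( "dot = %g\n" , tpi_dot( n , x , y ) ); /* expect 2000 */
  TPI_Finalize();                                 /* join the worker threads */

  free( x );
  free( y );
  return 0 ;
}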
template< class WDP >
static typename WDP::ReductionType
parallel_reduce( int beg , int end , WDP wd )
{
  typedef typename WDP::ReductionType ReductionType ;

  // Start from the reduction identity supplied by the work-data pair.
  ReductionType result = wd.identity();

  // Bundle the work-data pair with its iteration range for the C callbacks.
  WDPPlusRange<WDP> wdp_plus( beg , end , wd );

  TPI_Run_threads_reduce( tpi_reduction_work<WDP> , & wdp_plus ,
                          tpi_reduction_join<WDP> ,
                          tpi_reduction_init<WDP> ,
                          sizeof(result) , & result );
  return result ;
}
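/* For reference: the C-level contract those template adapters satisfy.
 * TPI_Run_threads_reduce takes a work subprogram, a join callback, and an
 * init callback.  Minimal hand-written sketch for a double sum, assuming
 * the TPI_Work layout ('info', 'reduce', 'count', 'rank') declared in
 * ThreadPool's TPI.h; illustration only, not the generated adapters. */

static void sum_init( TPI_Work * work )
{
  /* Each thread's private reduction buffer starts at the identity. */
  *((double *) work->reduce) = 0.0 ;
}

static void sum_join( TPI_Work * work , const void * reduce )
{
  /* Fold another thread's partial value into this thread's buffer. */
  *((double *) work->reduce) += *((const double *) reduce) ;
}

static void sum_work( TPI_Work * work )
{
  /* Demonstration payload: each thread contributes its rank. */
  *((double *) work->reduce) += (double) work->rank ;
}

/* Usage: 'total' receives 0 + 1 + ... + (nthreads - 1).
 *
 *   double total = 0.0 ;
 *   TPI_Run_threads_reduce( sum_work , NULL , sum_join , sum_init ,
 *                           sizeof(total) , & total );
 */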
void test_tpi_reduce( const int ntest , const int nthread[] , const int ntrial )
{
  int j ;

  fprintf( stdout , "\n\"TEST TPI_Run_threads(reduce) / TPI_Run_threads_reduce\"\n" );
  fprintf( stdout , "\"#Thread\" , \"#Trial\" , \"TPI_Run_threads(avg-usec)\" , \"TPI_Run_threads(stddev-usec)\" , \"TPI_Run_threads_reduce(avg-usec)\" , \"TPI_Run_threads_reduce(stddev-usec)\"\n");

  for ( j = 0 ; j < ntest ; ++j ) {
    const int nth = nthread[j];

    double dt_lock_total     = 0.0 ;
    double dt_lock_total_2   = 0.0 ;
    double dt_reduce_total   = 0.0 ;
    double dt_reduce_total_2 = 0.0 ;
    int i ;

    int result = TPI_Init( nth );

    if ( result != nth ) {
      fprintf(stderr,"%d != TPI_Init(%d) : FAILED\n", result , nth );
    }

    for ( i = 0 ; i < ntrial ; ++i ) {
      double t , dt ;
      int value = 0 ;
      int * const ptr = & value ;

      /* Time the lock-based reduction: each thread increments the
       * shared counter while holding lock #0. */
      t = TPI_Walltime();
      TPI_Run_threads( test_reduce_via_lock , & ptr , 1 );
      dt = TPI_Walltime() - t ;
      dt_lock_total   += dt ;
      dt_lock_total_2 += dt * dt ;

      if ( value != nth ) {
        fprintf(stderr, "TPI_Run_threads(reduce,...) : FAILED at trial %d\n", i );
        abort();
      }

      value = 0 ;

      /* Time the lock-free reduction: each thread accumulates into a
       * private buffer that is joined after the threads complete. */
      t = TPI_Walltime();
      TPI_Run_threads_reduce( test_reduce_via_nolock , NULL ,
                              test_reduce_join , test_reduce_init ,
                              sizeof(value) , & value );
      dt = TPI_Walltime() - t ;
      dt_reduce_total   += dt ;
      dt_reduce_total_2 += dt * dt ;

      if ( value != nth ) {
        fprintf(stderr, "TPI_Run_threads_reduce(...) : FAILED at trial %d\n", i );
        abort();
      }
    }

    TPI_Finalize();

    if ( 1 < ntrial ) {
      /* Report mean and sample standard deviation, seconds -> microseconds. */
      const double lock_mean = 1.0e6 * dt_lock_total / ntrial ;
      const double lock_sdev = 1.0e6 * sqrt( ( ntrial * dt_lock_total_2 -
                                               dt_lock_total * dt_lock_total ) /
                                             ( ntrial * ( ntrial - 1 ) ) );
      const double reduce_mean = 1.0e6 * dt_reduce_total / ntrial ;
      const double reduce_sdev = 1.0e6 * sqrt( ( ntrial * dt_reduce_total_2 -
                                                 dt_reduce_total * dt_reduce_total ) /
                                               ( ntrial * ( ntrial - 1 ) ) );

      fprintf(stdout,"%d , %d , %10g , %10g , %10g , %10g\n",
              nth, ntrial, lock_mean, lock_sdev, reduce_mean, reduce_sdev);
    }
  }
}
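/* Plausible reconstruction (not the verbatim original) of the lock-based
 * kernel timed above.  Every thread increments the shared counter under
 * lock #0, which the trailing argument of TPI_Run_threads(...,1) reserved.
 * Assumes work->info carries the 'int * const ptr' passed as '& ptr'.
 * The lock-free counterparts (test_reduce_via_nolock, test_reduce_join,
 * test_reduce_init) follow the same shape as the sum_* sketch above,
 * with an int accumulator. */
static void test_reduce_via_lock( TPI_Work * work )
{
  int * const value = *((int * const *) work->info);

  if ( TPI_Lock(0) )   { abort(); }  /* acquire the single pool lock */
  *value += 1 ;                      /* one guarded increment per thread */
  if ( TPI_Unlock(0) ) { abort(); }  /* release */
}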
double dcrs_apply_and_dot( const struct distributed_crs_matrix * matrix ,
                           VECTOR_SCALAR * x ,
                           VECTOR_SCALAR * y ,
                           const int overlap_communication )
{
  struct work_dcrs info ;

  double result = 0.0 ;

  info.matrix = matrix ;
  info.x      = x ;
  info.y      = y ;

  if ( overlap_communication &&
       matrix->n_internal_row < matrix->n_local_row ) {

    double remote_result = 0 ;

    /* Start the internal matrix-vector multiply:
     *   result += dot( output = A * input , input );
     * Divide the internal work evenly among the worker threads.
     * This leaves the primary thread completely out of the computation,
     * free to perform the off-process communication. */
    info.jBeg = 0 ;
    info.jEnd = matrix->n_internal_row ;

    TPI_Start_threads_reduce( tpi_work_dcrs_apply_and_dot , & info ,
                              tpi_work_dot_join ,
                              tpi_work_dot_init ,
                              sizeof(result) , & result );

    get_off_process_entries( matrix , x );

    TPI_Wait(); /* Wait for the internal result */

    /* Apply the remaining rows, which need the off-process entries. */
    info.jBeg = matrix->n_internal_row ;
    info.jEnd = matrix->n_local_row ;

    TPI_Run_threads_reduce( tpi_work_dcrs_apply_and_dot , & info ,
                            tpi_work_dot_join ,
                            tpi_work_dot_init ,
                            sizeof(remote_result) , & remote_result );

    result += remote_result ;
  }
  else {
    info.jBeg = 0 ;
    info.jEnd = matrix->n_local_row ;

    get_off_process_entries( matrix , x );

    TPI_Run_threads_reduce( tpi_work_dcrs_apply_and_dot , & info ,
                            tpi_work_dot_join ,
                            tpi_work_dot_init ,
                            sizeof(result) , & result );
  }

  result = comm_sum( result );

  return result ;
}
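/* Sketch of comm_sum: global sum of a scalar across MPI processes,
 * mirroring the HAVE_MPI block in tpi_dot above; the actual helper may
 * differ.  Without MPI the local value is already the global value. */
static double comm_sum( double local )
{
#if defined HAVE_MPI
  double global = 0.0 ;
  MPI_Allreduce( & local , & global , 1 , MPI_DOUBLE , MPI_SUM ,
                 MPI_COMM_WORLD );
  return global ;
#else
  return local ;
#endif
}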