void txddot( double * s , unsigned n , const double * x , const double * y ) { int p_size ; if ( ! TPI_Size( & p_size ) ) { double* tmp; const int ntmp = 4 * p_size ; tmp = malloc(ntmp * sizeof(double)); { struct TaskXY data = { tmp , x , y , n , BLOCKING_SIZE }; int i ; for ( i = 0 ; i < ntmp ; ++i ) { tmp[i] = 0 ; } if ( data.block ) { TPI_Run( & task_xddot_xy_work_blocking , & data , 0 ); } else { TPI_Run( & task_xddot_xy_work , & data , 0 ); } for ( i = 0 ; i < p_size ; ++i ) { xdsum_add_dsum( s , tmp + 4 * i ); } } free(tmp); } }
void tdaxpby( unsigned n , double a , const double * x , double b , double * y , int block ) { int p_size ; TPI_Size( & p_size ); { unsigned *tmp = malloc( p_size ); struct TaskXY data = { a , b , x , y , n , tmp }; int i ; for ( i = 0 ; i < p_size ; ++i ) { tmp[i] = i ; } if ( 0 < block ) { TPI_Run( & task_axpby_work_block , & data , 0 ); } else if ( block < 0 ) { TPI_Set_lock_size( p_size ); TPI_Run( & task_axpby_work_steal , & data , 0 ); } else { TPI_Run( & task_axpby_work , & data , 0 ); } free(tmp); } }
void txddot1( double * s , unsigned n , const double * x ) { struct TaskX data ; data.x_sum = s ; data.x_beg = x ; data.number = n ; TPI_Set_lock_size( 1 ); TPI_Run( & task_xddot_x_work , & data , 0 ); }
/// Run a member function of `worker` through TPI_Run.
///
/// The object/method pair is bundled into a stack-allocated
/// WorkerMethodHelper<Worker>; the helper's static `run` entry point is cast
/// to the C callback type and the helper's address is passed as the
/// callback argument.
///
/// @param worker  object the method is bound to
/// @param method  member function taking a ThreadPool
/// @param n       forwarded unchanged as the third TPI_Run argument
/// @return        whatever TPI_Run returns
///
/// NOTE(review): the helper is a local, so this assumes TPI_Run does not
/// keep the pointer after it returns - confirm.
inline int Run( Worker & worker, void (Worker::*method)(ThreadPool) , int n )
{
  WorkerMethodHelper<Worker> helper( worker , method );

  TPI_parallel_subprogram const entry =
    reinterpret_cast<TPI_parallel_subprogram>( & WorkerMethodHelper<Worker>::run );

  return TPI_Run( entry , & helper , n );
}
/// Run a member function of `worker` through the work-based TPI_Run.
///
/// The object/method pair is bundled into a stack-allocated
/// WorkerMethodHelper<Worker>; the helper's static `run` entry point is cast
/// to TPI_work_subprogram and the helper's address is passed as the
/// callback argument.
///
/// @param worker      object the method is bound to
/// @param method      member function taking a Work reference
/// @param work_count  forwarded unchanged as the third TPI_Run argument
/// @param lock_count  forwarded unchanged as the fourth TPI_Run argument
/// @return            whatever TPI_Run returns
///
/// NOTE(review): the helper is a local, so this assumes TPI_Run does not
/// keep the pointer after it returns - confirm.
inline int Run( Worker & worker, void (Worker::*method)(Work &) , int work_count , int lock_count )
{
  WorkerMethodHelper<Worker> helper( worker , method );

  TPI_work_subprogram const entry =
    reinterpret_cast<TPI_work_subprogram>( & WorkerMethodHelper<Worker>::run );

  return TPI_Run( entry , & helper , work_count , lock_count );
}
/*
 * txblas_cr_mxv: launch the txblas_task_cr_mxv task, which receives the
 * matrix arrays, the input vector x and the output vector y packed into a
 * txblasTask_cr_Matrix descriptor.
 *
 * NOTE(review): "cr" presumably means compressed-row storage, with pc / ia /
 * a being the row offsets, column indices and coefficients - confirm against
 * the task implementation.
 */
void txblas_cr_mxv(
  const unsigned nr ,    /* Number rows */
  const unsigned pc[] ,
  const unsigned ia[] ,
  const double   a[] ,
  const double   x[] ,   /* Input vector */
  double         y[] )   /* Output vector */
{
  /* Field order must match the descriptor's declaration order. */
  txblasTask_cr_Matrix task = { nr , pc , ia , a , x , y };

  TPI_Run( & txblas_task_cr_mxv , & task , 0 );
}
void tddot( double * s , unsigned n , const double * x , const double * y ) { int p_size ; if ( ! TPI_Size( & p_size ) ) { double* tmp = malloc( p_size * sizeof(double)); struct TaskXY data = { tmp , x , y , n , BLOCKING_SIZE }; int i ; for ( i = 0 ; i < p_size ; ++i ) { tmp[i] = 0 ; } if ( data.block ) { TPI_Run( & task_ddot_xy_work_blocking , & data , 0 ); } else { TPI_Run( & task_ddot_xy_work , & data , 0 ); } for ( i = 1 ; i < p_size ; ++i ) { tmp[0] += tmp[i] ; } *s = tmp[0] ; free(tmp); } }
void test_tpi_work( const int ntest , const int nthread[] , const int nwork , const int ntrial ) { int * const flags = (int *) malloc( sizeof(int) * nwork ); int j ; fprintf( stdout , "\n\"TEST TPI_Run / TPI_Run_reduce\"\n" ); fprintf( stdout , "\"#Thread\" , \"#Work\" , \"#Trial\" , \"TPI_Run(avg-msec)\" , \"TPI_Run(stddev-msec)\" , \"TPI_Run_reduce(avg-msec)\" , \"TPI_Run_reduce(stddev-msec)\"\n"); for ( j = 0 ; j < ntest ; ++j ) { const int nth = nthread[j]; double dt_work_total = 0.0 ; double dt_work_total_2 = 0.0 ; double dt_reduce_total = 0.0 ; double dt_reduce_total_2 = 0.0 ; int i , k ; int result = TPI_Init( nth ); if ( result != nth ) { fprintf(stderr,"%d != TPI_Init(%d) : FAILED\n", result , nth ); } for ( i = 0 ; i < ntrial ; ++i ) { double t , dt ; int value = 0 ; for ( k = 0 ; k < nwork ; ++k ) { flags[k] = 0 ; } t = TPI_Walltime(); TPI_Run( test_work , & flags , nwork , 0 ); dt = TPI_Walltime() - t ; dt_work_total += dt ; dt_work_total_2 += dt * dt ; for ( k = 0 ; k < nwork && flags[k] ; ++k ); if ( k < nwork ) { fprintf(stderr, "TPI_Run(...) : FAILED at trial %d\n", i ); abort(); } for ( k = 0 ; k < nwork ; ++k ) { flags[k] = 0 ; } t = TPI_Walltime(); TPI_Run_reduce( test_reduce_work , & flags , nwork , test_reduce_join , test_reduce_init , sizeof(value) , & value ); dt = TPI_Walltime() - t ; dt_reduce_total += dt ; dt_reduce_total_2 += dt * dt ; for ( k = 0 ; k < nwork && flags[k] ; ++k ); if ( value != nwork || k < nwork ) { fprintf(stderr, "TPI_Run_reduce(...) 
: FAILED at trial %d\n", i ); abort(); } } TPI_Finalize(); if ( 1 < ntrial ) { const double work_mean = 1.0e6 * dt_work_total / ntrial ; const double work_sdev = 1.0e6 * sqrt( ( ntrial * dt_work_total_2 - dt_work_total * dt_work_total ) / ( ntrial * ( ntrial - 1 ) ) ); const double reduce_mean = 1.0e6 * dt_reduce_total / ntrial ; const double reduce_sdev = 1.0e6 * sqrt( ( ntrial * dt_reduce_total_2 - dt_reduce_total * dt_reduce_total) / ( ntrial * ( ntrial - 1 ) ) ); fprintf(stdout,"%d , %d , %d , %10g , %10g , %10g , %10g\n", nth, ntrial, nwork, work_mean, work_sdev, reduce_mean, reduce_sdev); } } free( flags ); }
inline int Run( void (*func)( void * , ThreadPool ) , void * arg , int n ) { return TPI_Run( reinterpret_cast< TPI_parallel_subprogram >(func), arg , n ); }