void test_tpi_block( const int ntest , const int nthread[] , const int ntrial ) { int i, j ; fprintf( stdout , "\n\"TEST TPI_Block / TPI_Unblock\"\n" ); fprintf( stdout , "\"#Thread\" , \"#Trial\" , \"TPI_Block(avg-msec)\" , \"TPI_Block(stddev-msec)\" , \"TPI_Unblock(avg-msec)\" , \"TPI_Unblock(stddev-msec)\"\n"); for ( j = 0 ; j < ntest ; ++j ) { const int nth = nthread[j]; double dt_block_total = 0.0 ; double dt_block_total_2 = 0.0 ; double dt_unblock_total = 0.0 ; double dt_unblock_total_2 = 0.0 ; int result = TPI_Init( nth ); if ( result != nth ) { fprintf(stderr,"%d != TPI_Init(%d) : FAILED\n", result , nth ); abort(); } for ( i = 0 ; i < ntrial ; ++i ) { double t , dt ; t = TPI_Walltime(); TPI_Block(); dt = TPI_Walltime() - t ; dt_block_total += dt ; dt_block_total_2 += dt * dt ; t = TPI_Walltime(); TPI_Unblock(); dt = TPI_Walltime() - t ; dt_unblock_total += dt ; dt_unblock_total_2 += dt * dt ; } TPI_Finalize(); if ( 1 < ntrial ) { const double block_mean = 1.0e6 * dt_block_total / ntrial ; const double block_sdev = 1.0e6 * sqrt( ( ntrial * dt_block_total_2 - dt_block_total * dt_block_total ) / ( ntrial * ( ntrial - 1 ) ) ); const double unblock_mean = 1.0e6 * dt_unblock_total / ntrial ; const double unblock_sdev = 1.0e6 * sqrt( ( ntrial * dt_unblock_total_2 - dt_unblock_total * dt_unblock_total) / ( ntrial * ( ntrial - 1 ) ) ); fprintf(stdout,"%d , %d , %10g , %10g , %10g , %10g\n", nth , ntrial , block_mean , block_sdev , unblock_mean , unblock_sdev ); } } }
/// (Re)configure the node for `numThreads` threads.
///
/// If a previous call recorded at least one thread, TPI_Finalize() is called
/// first. The new count is then stored in curNumThreads_, and TPI_Init() is
/// called only when that count is at least 1. The TPI_Init() return value is
/// not inspected.
void TPINode::init(int numThreads) {
  const bool hadThreads = (curNumThreads_ >= 1);
  if (hadThreads) {
    TPI_Finalize();
  }

  curNumThreads_ = numThreads;

  // A count below 1 leaves TPI uninitialized.
  if (curNumThreads_ < 1) {
    return;
  }
  TPI_Init(curNumThreads_);
}
/*
 * Time TPI_Init() and TPI_Finalize() for each requested thread count.
 *
 *   ntest    number of entries of nthread[] to exercise
 *   nthread  thread counts passed to TPI_Init()
 *   ntrial   number of Init/Finalize pairs timed per thread count
 *
 * One CSV row (mean and sample standard deviation of each call) is
 * written to stdout per thread count; the row is skipped unless
 * ntrial > 1 because the standard deviation divides by ntrial - 1.
 * Aborts if any TPI_Init() does not return the requested thread count.
 *
 * NOTE(review): timings are scaled by 1.0e6 while the column headers
 * say "msec"; if TPI_Walltime() reports seconds these are microseconds
 * - confirm against the TPI_Walltime() documentation.
 */
void test_tpi_init( const int ntest , const int nthread[] , const int ntrial )
{
  int j ;

  fprintf( stdout , "\n\"TEST TPI_Init / TPI_Finalize\"\n" );
  fprintf( stdout , "\"#Thread\" , \"#Trial\" , \"TPI_Init(avg-msec)\" , \"TPI_Init(stddev-msec)\" , \"TPI_Finalize(avg-msec)\" , \"TPI_Finalize(stddev-msec)\"\n");

  for ( j = 0 ; j < ntest ; ++j ) {
    const int nth = nthread[j];
    double dt_init_total   = 0.0 ;
    double dt_init_total_2 = 0.0 ;
    double dt_fin_total    = 0.0 ;
    double dt_fin_total_2  = 0.0 ;
    int i ;
    int result ;

    for ( i = 0 ; i < ntrial ; ++i ) {
      double t , dt ;

      t = TPI_Walltime();
      result = TPI_Init( nth );
      dt = TPI_Walltime() - t ;
      dt_init_total   += dt ;
      dt_init_total_2 += dt * dt ;

      /* Checked after the timing so the check is not part of the measurement. */
      if ( result != nth ) {
        fprintf(stderr,"%d != TPI_Init(%d) : FAILED at trial %d\n", result , nth , i );
        abort();
      }

      t = TPI_Walltime();
      TPI_Finalize();
      dt = TPI_Walltime() - t ;
      dt_fin_total   += dt ;
      dt_fin_total_2 += dt * dt ;
    }

    if ( 1 < ntrial ) {
      /* Do the count arithmetic in double: the int product
       * ntrial * ( ntrial - 1 ) overflows for ntrial > 46341. */
      const double count = (double) ntrial ;
      const double denom = count * ( count - 1.0 );

      /* Rounding can push a near-zero variance slightly negative,
       * which would make sqrt() return NaN; clamp it at zero. */
      const double init_var =
        ( count * dt_init_total_2 - dt_init_total * dt_init_total ) / denom ;
      const double fin_var =
        ( count * dt_fin_total_2 - dt_fin_total * dt_fin_total ) / denom ;

      const double init_mean = 1.0e6 * dt_init_total / count ;
      const double init_sdev = 1.0e6 * sqrt( init_var < 0.0 ? 0.0 : init_var );
      const double fin_mean  = 1.0e6 * dt_fin_total / count ;
      const double fin_sdev  = 1.0e6 * sqrt( fin_var < 0.0 ? 0.0 : fin_var );

      fprintf(stdout,"%d , %d , %10g , %10g , %10g , %10g\n",
              nth , ntrial , init_mean , init_sdev , fin_mean , fin_sdev );
    }
  }
}
int TPI_Init( int n ) { ThreadPool * const pool = local_thread_pool(); int result = ! pool || pool->m_number_threads ? TPI_ERROR_ACTIVE : 0 ; if ( ! result && n <= 0 ) { result = TPI_ERROR_SIZE ; } if ( ! result ) { pthread_attr_t thread_attr ; if ( pthread_attr_init( & thread_attr ) ) { result = TPI_ERROR_INTERNAL ; } else { pthread_attr_setscope( & thread_attr, PTHREAD_SCOPE_SYSTEM ); pthread_attr_setdetachstate( & thread_attr, PTHREAD_CREATE_DETACHED ); pthread_mutex_lock( & pool->m_pool_lock ); { int n_thread = 1 ; /* Count myself among the threads */ pool->m_number_threads = n_thread ; for ( ; n_thread < n && ! result ; ++n_thread ) { pthread_t pt ; if ( pthread_create( & pt, & thread_attr, & local_thread_pool_driver, pool ) ) { result = TPI_ERROR_INTERNAL ; } else { /* Wait for start */ pthread_cond_wait( & pool->m_pool_cond , & pool->m_pool_lock ); } } } pthread_attr_destroy( & thread_attr ); pthread_mutex_unlock( & pool->m_pool_lock ); } if ( result ) { TPI_Finalize(); } } return result ; }
void test_tpi_work_async( const int ntest , const int nthread[] , const int nwork , const int ntrial ) { int * const flags = (int *) malloc( sizeof(int) * nwork ); int j ; fprintf( stdout , "\n\"TEST TPI_Start / TPI_Start_reduce\"\n" ); fprintf( stdout , "\"#Thread\" , \"#Work\" , \"#Trial\" , \"TPI_Start(avg-msec)\" , \"TPI_Start(stddev-msec)\" , \"TPI_Start_reduce(avg-msec)\" , \"TPI_Start_reduce(stddev-msec)\"\n"); for ( j = 0 ; j < ntest ; ++j ) { const int nth = nthread[j]; double dt_work_total = 0.0 ; double dt_work_total_2 = 0.0 ; double dt_reduce_total = 0.0 ; double dt_reduce_total_2 = 0.0 ; int i , k ; int result = TPI_Init( nth ); if ( result != nth ) { fprintf(stderr,"%d != TPI_Init(%d) : FAILED\n", result , nth ); } for ( i = 0 ; i < ntrial ; ++i ) { double t , dt ; int value = 0 ; for ( k = 0 ; k < nwork ; ++k ) { flags[k] = 0 ; } t = TPI_Walltime(); TPI_Start( test_work , & flags , nwork , 0 ); TPI_Wait(); dt = TPI_Walltime() - t ; dt_work_total += dt ; dt_work_total_2 += dt * dt ; for ( k = 0 ; k < nwork && flags[k] ; ++k ); if ( k < nwork ) { fprintf(stderr, "TPI_Run(...) : FAILED at trial %d\n", i ); abort(); } for ( k = 0 ; k < nwork ; ++k ) { flags[k] = 0 ; } t = TPI_Walltime(); TPI_Start_reduce( test_reduce_work , & flags , nwork , test_reduce_join , test_reduce_init , sizeof(value) , & value ); TPI_Wait(); dt = TPI_Walltime() - t ; dt_reduce_total += dt ; dt_reduce_total_2 += dt * dt ; for ( k = 0 ; k < nwork && flags[k] ; ++k ); if ( value != nwork || k < nwork ) { fprintf(stderr, "TPI_Run_reduce(...) 
: FAILED at trial %d\n", i ); abort(); } } TPI_Finalize(); if ( 1 < ntrial ) { const double work_mean = 1.0e6 * dt_work_total / ntrial ; const double work_sdev = 1.0e6 * sqrt( ( ntrial * dt_work_total_2 - dt_work_total * dt_work_total ) / ( ntrial * ( ntrial - 1 ) ) ); const double reduce_mean = 1.0e6 * dt_reduce_total / ntrial ; const double reduce_sdev = 1.0e6 * sqrt( ( ntrial * dt_reduce_total_2 - dt_reduce_total * dt_reduce_total) / ( ntrial * ( ntrial - 1 ) ) ); fprintf(stdout,"%d , %d , %d , %10g , %10g , %10g , %10g\n", nth, ntrial, nwork, work_mean, work_sdev, reduce_mean, reduce_sdev); } } free( flags ); }
void test_tpi_reduce( const int ntest , const int nthread[] , const int ntrial ) { int j ; fprintf( stdout , "\n\"TEST TPI_Run_threads(reduce) / TPI_Run_threads_reduce\"\n" ); fprintf( stdout , "\"#Thread\" , \"#Trial\" , \"TPI_Run_threads(avg-msec)\" , \"TPI_Run_threads(stddev-msec)\" , \"TPI_Run_threads_reduce(avg-msec)\" , \"TPI_Run_threads_reduce(stddev-msec)\"\n"); for ( j = 0 ; j < ntest ; ++j ) { const int nth = nthread[j]; double dt_lock_total = 0.0 ; double dt_lock_total_2 = 0.0 ; double dt_reduce_total = 0.0 ; double dt_reduce_total_2 = 0.0 ; int i ; int result = TPI_Init( nth ); if ( result != nth ) { fprintf(stderr,"%d != TPI_Init(%d) : FAILED\n", result , nth ); } for ( i = 0 ; i < ntrial ; ++i ) { double t , dt ; int value = 0 ; int * const ptr = & value ; t = TPI_Walltime(); TPI_Run_threads( test_reduce_via_lock , & ptr , 1 ); dt = TPI_Walltime() - t ; dt_lock_total += dt ; dt_lock_total_2 += dt * dt ; if ( value != nth ) { fprintf(stderr, "TPI_Run_threads(reduce,...) : FAILED at trial %d\n", i ); abort(); } value = 0 ; t = TPI_Walltime(); TPI_Run_threads_reduce( test_reduce_via_nolock , NULL , test_reduce_join , test_reduce_init , sizeof(value) , & value ); dt = TPI_Walltime() - t ; dt_reduce_total += dt ; dt_reduce_total_2 += dt * dt ; if ( value != nth ) { fprintf(stderr, "TPI_Run_threads_reduce(...) : FAILED at trial %d\n", i ); abort(); } } TPI_Finalize(); if ( 1 < ntrial ) { const double lock_mean = 1.0e6 * dt_lock_total / ntrial ; const double lock_sdev = 1.0e6 * sqrt( ( ntrial * dt_lock_total_2 - dt_lock_total * dt_lock_total ) / ( ntrial * ( ntrial - 1 ) ) ); const double reduce_mean = 1.0e6 * dt_reduce_total / ntrial ; const double reduce_sdev = 1.0e6 * sqrt( ( ntrial * dt_reduce_total_2 - dt_reduce_total * dt_reduce_total) / ( ntrial * ( ntrial - 1 ) ) ); fprintf(stdout,"%d , %d , %10g , %10g , %10g , %10g\n", nth, ntrial, lock_mean, lock_sdev, reduce_mean, reduce_sdev); } } }
/// Forwards to TPI_Finalize() and returns its result unchanged.
inline int Finalize()
{
  const int status = TPI_Finalize();
  return status;
}
int main( int argc , char ** argv ) { const int ghost = 1 ; const int max_cube = 20 ; int ncube[20] = { 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 }; FILE * print_file = stdout ; int print_iter = 500 ; int max_iter = 50 ; VECTOR_SCALAR tolerance = 0.0 ; /* Force max iterations */ int gbox[3][2] = { { 0 , 16 } , { 0 , 16 } , { 0 , 16 } }; int nt = 0 ; int trials = 5 ; int ntest ; int np = 1; int my_p = 0 ; #ifdef HAVE_MPI MPI_Init( & argc , & argv ); MPI_Comm_size( MPI_COMM_WORLD , & np ); MPI_Comm_rank( MPI_COMM_WORLD , & my_p ); #endif if ( ! my_p ) { const char arg_threads[] = "threads=" ; const char arg_cube[] = "cube=" ; const char arg_box[] = "box=" ; const char arg_max[] = "max_iter=" ; const char arg_trials[] = "trials=" ; const char arg_print[] = "print_iter=" ; const char arg_file[] = "print_file=" ; int i ; for ( i = 1 ; i < argc ; ++i ) { if ( ! strncmp(argv[i],arg_threads,strlen(arg_threads)) ) { sscanf(argv[i]+strlen(arg_threads),"%d",&nt); } else if ( ! strncmp(argv[i],arg_box,strlen(arg_box)) ) { sscanf(argv[i]+strlen(arg_box),"%d%*[x]%d%*[x]%d", & gbox[0][1] , & gbox[1][1] , & gbox[2][1] ); } else if ( ! strncmp(argv[i],arg_cube,strlen(arg_cube)) ) { sscanf(argv[i]+strlen(arg_cube), "%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d", ncube+0, ncube+1, ncube+2, ncube+3, ncube+4, ncube+5, ncube+6, ncube+7, ncube+8, ncube+9, ncube+10, ncube+11, ncube+12, ncube+13, ncube+14, ncube+15, ncube+16, ncube+17, ncube+18, ncube+19); } else if ( ! strncmp(argv[i],arg_max,strlen(arg_max)) ) { sscanf(argv[i]+strlen(arg_max),"%d",&max_iter); } else if ( ! strncmp(argv[i],arg_trials,strlen(arg_trials)) ) { sscanf(argv[i]+strlen(arg_trials),"%d",&trials); } else if ( ! strncmp(argv[i],arg_print,strlen(arg_print)) ) { sscanf(argv[i]+strlen(arg_print),"%d",&print_iter); } else if ( ! 
strncmp(argv[i],arg_file,strlen(arg_file)) ) { char buffer[256] ; sscanf(argv[i]+strlen(arg_file),"%s",buffer); print_file = fopen(buffer,"a"); } } } #ifdef HAVE_MPI { MPI_Bcast( & nt , 1 , MPI_INT , 0 , MPI_COMM_WORLD ); MPI_Bcast( & gbox[0][0] , 6 , MPI_INT , 0 , MPI_COMM_WORLD ); MPI_Bcast( ncube , max_cube , MPI_INT , 0 , MPI_COMM_WORLD ); MPI_Bcast( & max_iter , 1 , MPI_INT , 0 , MPI_COMM_WORLD ); MPI_Bcast( & print_iter , 1 , MPI_INT , 0 , MPI_COMM_WORLD ); MPI_Bcast( & trials , 1 , MPI_INT , 0 , MPI_COMM_WORLD ); } #endif if ( nt ) { TPI_Init( nt ); TPI_Block(); TPI_Unblock(); } if ( ! my_p ) { fprintf(print_file,"\"PROC\" , \"THREAD\" , \"EQUATION\" , \"NON-ZERO\" , \"MXV\" , \"AXPBY\" , \"DOT\" , \"Xerror\" , \"Iter\"\n"); fprintf(print_file,"\"COUNT\" , \"COUNT\" , \"COUNT\" , \"COUNT\" , \"Mflops\" , \"Mflops\" , \"Mflops\" , \"L2norm\" , \"COUNT\"\n"); } for ( ntest = 0 ; ! ntest || ( ntest < max_cube && ncube[ntest] ) ; ++ntest ) { struct cgsolve_data cgdata ; if ( ncube[ntest] ) { gbox[0][1] = gbox[1][1] = gbox[2][1] = ncube[ntest] ; } hpccg_alloc_and_fill( np, my_p, (const int (*)[2]) gbox, ghost, &cgdata); cgdata.max_iter = max_iter ; cgdata.print_iter = print_iter ; cgdata.tolerance = tolerance ; { double dt_mxv[2] = { 0 , 0 }; double dt_axpby[2] = { 0 , 0 }; double dt_dot[2] = { 0 , 0 }; VECTOR_SCALAR norm_resid = 0.0 ; int iter_count = 0 ; int iter_total = 0 ; int k ; VECTOR_SCALAR * const b = (VECTOR_SCALAR *) malloc( sizeof(VECTOR_SCALAR) * cgdata.nRow ); VECTOR_SCALAR * const x = (VECTOR_SCALAR *) malloc( sizeof(VECTOR_SCALAR) * cgdata.nRow ); VECTOR_SCALAR * const xexact = (VECTOR_SCALAR *) malloc( sizeof(VECTOR_SCALAR) * cgdata.nRow ); { const VECTOR_SCALAR value = 1.0 /* 1.0 / 3.0 */ ; int i ; for ( i = 0 ; i < cgdata.nRow ; ++i ) xexact[i] = value ; } for ( k = 0 ; k < trials ; ++k ) { int i ; for ( i = 0 ; i < cgdata.nRow ; ++i ) { x[i] = 0.0 ; } cgsolve_set_lhs( & cgdata , xexact , b ); cgsolve( & cgdata, b, x, & iter_count, & 
norm_resid, dt_mxv , dt_axpby , dt_dot ); iter_total += iter_count ; } { int nnzGlobal = cgdata.A_pc[ cgdata.nRow ]; double error[2] = { 0 , 0 }; for ( k = 0 ; k < cgdata.nRow ; ++k ) { error[0] += ( x[k] - xexact[k] ) * ( x[k] - xexact[k] ); error[1] += xexact[k] * xexact[k] ; } #ifdef HAVE_MPI { double error_global[2] = { 0.0 , 0.0 }; int nnz = nnzGlobal ; MPI_Allreduce( & nnz , & nnzGlobal , 1 , MPI_INT , MPI_SUM , MPI_COMM_WORLD ); MPI_Allreduce( error , error_global , 2 , MPI_DOUBLE , MPI_SUM , MPI_COMM_WORLD ); error[0] = error_global[0]; error[1] = error_global[1]; } #endif error[0] = sqrt( error[0] ); error[1] = sqrt( error[1] ); if ( ! my_p ) { const int nRowGlobal = ( gbox[0][1] - gbox[0][0] ) * ( gbox[1][1] - gbox[1][0] ) * ( gbox[2][1] - gbox[2][0] ); const double mflop_mxv = 1.0e-6 * ( iter_total ) * 2 * nnzGlobal / dt_mxv[0] ; const double mflop_axpby = 1.0e-6 * ( iter_total * 3 ) * 3 * nRowGlobal / dt_axpby[0] ; const double mflop_dot = 1.0e-6 * ( iter_total * 2 ) * 2 * nRowGlobal / dt_dot[0] ; fprintf(print_file,"%8d , %8d , %8d , %8d , %10g , %10g , %10g , %g , %d\n", np , nt , nRowGlobal , nnzGlobal , mflop_mxv , mflop_axpby , mflop_dot , error[0] / error[1] , iter_total ); fflush(print_file); } } free( xexact ); free( x ); free( b ); } free( cgdata.A_a ); free( cgdata.A_ia ); free( cgdata.A_pc ); free( cgdata.recv_pc ); free( cgdata.send_pc ); free( cgdata.send_id ); } if ( nt ) { TPI_Finalize(); } #ifdef HAVE_MPI MPI_Finalize(); #endif return 0 ; }
/// Calls TPI_Finalize() if the recorded thread count (curNumThreads_) is at least 1.
TPINode::~TPINode() {
  // No recorded threads: nothing to finalize.
  if (curNumThreads_ < 1) {
    return;
  }
  TPI_Finalize();
}