/* Thread-pool work routine: this rank's share of an x.y reduction over a
 * blocked, round-robin partition of the two vectors.
 *
 *   arg  -- pointer to a struct TaskXY.  Reads block, number, x_beg and
 *           y_beg; writes four doubles at xy_sum + 4 * rank.
 *   pool -- handle passed to TPI_Rank to obtain this thread's rank and
 *           the pool size.
 *
 * Rank r handles blocks r, r + size, r + 2*size, ... of 'block' elements
 * each; the final block is shortened to fit 'number'.  Nothing is done
 * (and xy_sum is not written) when TPI_Rank returns non-zero.
 *
 * Precondition: t->block must be non-zero, otherwise the walk below never
 * advances.
 */
static void task_xddot_xy_work_blocking( void * arg , TPI_ThreadPool pool )
{
  int p_size , p_rank ;

  if ( ! TPI_Rank( pool , & p_rank , & p_size ) ) {

    struct TaskXY * const t = (struct TaskXY *) arg ;

    const unsigned block_size   = t->block ;
    const unsigned block_start  = block_size * p_rank ;
    const unsigned block_stride = block_size * p_size ;
    const unsigned n_total      = t->number ;

    /* All xddot calls share this four-double accumulator. */
    double s_local[4] = { 0 , 0 , 0 , 0 };

    /* Walk by element offset instead of by pointer so that no pointer
     * beyond one-past-the-end of the arrays is ever formed. */
    unsigned offset = block_start ;

    while ( offset < n_total ) {
      const unsigned remaining = n_total - offset ;
      const unsigned len = block_size < remaining ? block_size : remaining ;

      /* The second operand is the y vector (y_beg), walked in step
       * with x; using x_beg here would yield x.x instead of x.y. */
      xddot( s_local , len , t->x_beg + offset , t->y_beg + offset );

      /* Stop before advancing when the next stride would reach or pass
       * the end; this also keeps 'offset' from wrapping around. */
      if ( remaining <= block_stride ) { break ; }

      offset += block_stride ;
    }

    {
      /* Each rank owns a private group of four result slots. */
      double * const xy_sum = t->xy_sum + 4 * p_rank ;
      xy_sum[0] = s_local[0] ;
      xy_sum[1] = s_local[1] ;
      xy_sum[2] = s_local[2] ;
      xy_sum[3] = s_local[3] ;
    }
  }
}
/* Thread-pool work routine: runs xddot on this rank's contiguous slice of
 * the x and y vectors and stores the four-double result in this rank's
 * group of slots, xy_sum[4*rank .. 4*rank+3].
 *
 *   arg  -- pointer to a struct TaskXY (reads number, x_beg, y_beg,
 *           xy_sum).
 *   pool -- handle passed to TPI_Rank for rank / size.
 *
 * The slice is [ number*rank/size , number*(rank+1)/size ).
 * Nothing is done when TPI_Rank returns non-zero.
 */
static void task_xddot_xy_work( void * arg , TPI_ThreadPool pool )
{
  int p_size , p_rank ;

  if ( 0 == TPI_Rank( pool , & p_rank , & p_size ) ) {

    struct TaskXY * const task = (struct TaskXY *) arg ;

    const unsigned count = task->number ;
    const unsigned first = ( count * ( p_rank ) ) / p_size ;
    const unsigned last  = ( count * ( p_rank + 1 ) ) / p_size ;
    const unsigned slice = ( last - first );

    const double * const x_slice = task->x_beg + first ;
    const double * const y_slice = task->y_beg + first ;

    double acc[4] = { 0 , 0 , 0 , 0 };

    xddot( acc , slice , x_slice , y_slice );

    {
      double * const out = task->xy_sum + 4 * p_rank ;
      unsigned k ;

      for ( k = 0 ; k < 4 ; ++k ) {
        out[k] = acc[k] ;
      }
    }
  }
}
/* Thread-pool work routine: this rank's partial x.y dot product over a
 * blocked, round-robin partition of the two vectors.
 *
 *   arg  -- pointer to a struct TaskXY.  Reads block, number, x_beg and
 *           y_beg; writes xy_sum[rank].
 *   pool -- handle passed to TPI_Rank to obtain this thread's rank and
 *           the pool size.
 *
 * Rank r handles blocks r, r + size, r + 2*size, ... of 'block' elements
 * each; the final block is shortened to fit 'number'.  The per-block
 * ddot results are summed in block order.  Nothing is done (and xy_sum
 * is not written) when TPI_Rank returns non-zero.
 *
 * Precondition: t->block must be non-zero, otherwise the walk below never
 * advances.
 */
static void task_ddot_xy_work_blocking( void * arg , TPI_ThreadPool pool )
{
  int p_size , p_rank ;

  if ( ! TPI_Rank( pool , & p_rank , & p_size ) ) {

    struct TaskXY * const t = (struct TaskXY *) arg ;

    const unsigned block_size   = t->block ;
    const unsigned block_start  = block_size * p_rank ;
    const unsigned block_stride = block_size * p_size ;
    const unsigned n_total      = t->number ;

    double local = 0.0 ;

    /* Walk by element offset instead of by pointer so that no pointer
     * beyond one-past-the-end of the arrays is ever formed. */
    unsigned offset = block_start ;

    while ( offset < n_total ) {
      const unsigned remaining = n_total - offset ;
      const unsigned len = block_size < remaining ? block_size : remaining ;

      /* The second operand is the y vector (y_beg), walked in step
       * with x; using x_beg here would yield x.x instead of x.y. */
      local += ddot( len , t->x_beg + offset , t->y_beg + offset );

      /* Stop before advancing when the next stride would reach or pass
       * the end; this also keeps 'offset' from wrapping around. */
      if ( remaining <= block_stride ) { break ; }

      offset += block_stride ;
    }

    /* One result slot per rank. */
    t->xy_sum[ p_rank ] = local ;
  }
}
/* Thread-pool work routine: runs dot1_unroll on this rank's contiguous
 * slice of the x vector, then folds the two partial values into the
 * shared x_sum while holding pool lock 0.
 *
 *   arg  -- pointer to a struct TaskX (reads number, x_beg, x_sum).
 *   pool -- handle used for TPI_Rank and for TPI_Lock / TPI_Unlock.
 *
 * The slice is [ number*rank/size , number*(rank+1)/size ).
 * Nothing is done when TPI_Rank returns non-zero.
 */
static void task_xddot_x_work( void * arg , TPI_ThreadPool pool )
{
  int p_size , p_rank ;

  if ( 0 == TPI_Rank( pool , & p_rank , & p_size ) ) {

    struct TaskX * const task = (struct TaskX *) arg ;

    const unsigned rank_after = p_rank + 1 ;
    const unsigned count      = task->number ;
    const unsigned first      = ( ( count * p_rank ) / p_size );
    const unsigned length     = ( ( count * rank_after ) / p_size ) - first ;

    double partial[2] = { 0 , 0 };

    dot1_unroll( partial , task->x_beg + first , length );

    /* x_sum is shared by all ranks; update it only under lock 0. */
    TPI_Lock(pool,0);
    {
      double * const v = task->x_sum ;

      SUM_ADD( v , partial[0] );
      SUM_ADD( v , partial[1] );
    }
    TPI_Unlock(pool,0);
  }
}
/* Thread-pool work routine: runs norm1 on this rank's contiguous slice
 * of the x vector, then folds the two partial values into the shared
 * x_sum while holding pool lock 0.
 *
 *   arg  -- pointer to a struct TaskX (reads number, x_beg, x_sum).
 *   pool -- handle used for TPI_Rank and for TPI_Lock / TPI_Unlock.
 *
 * The slice is the half-open pointer range
 * [ x_beg + number*rank/size , x_beg + number*(rank+1)/size ).
 * Nothing is done when TPI_Rank returns non-zero.
 */
static void task_norm1_work( void * arg , TPI_ThreadPool pool )
{
  int p_size , p_rank ;

  if ( 0 == TPI_Rank( pool , & p_rank , & p_size ) ) {

    struct TaskX * const task = (struct TaskX *) arg ;

    const unsigned rank_after = p_rank + 1 ;
    const unsigned count      = task->number ;

    const double * const slice_beg =
      task->x_beg + ( count * p_rank ) / p_size ;
    const double * const slice_end =
      task->x_beg + ( count * rank_after ) / p_size ;

    double * const v = task->x_sum ;

    double partial[2] = { 0 , 0 };

    norm1( partial , slice_beg , slice_end );

    /* x_sum is shared by all ranks; update it only under lock 0. */
    TPI_Lock( pool , 0 );
    SUM_ADD( v , partial[0] );
    SUM_ADD( v , partial[1] );
    TPI_Unlock( pool , 0 );
  }
}
/* Thread-pool work routine: runs add_array on this rank's contiguous
 * slice of the x vector into a four-double partial, then merges that
 * partial into the shared x_sum via xdsum_add_dsum while holding pool
 * lock 0.
 *
 *   arg  -- pointer to a struct TaskX (reads number, x_beg, x_sum).
 *   pool -- handle used for TPI_Rank and for TPI_Lock / TPI_Unlock.
 *
 * The slice is the half-open pointer range
 * [ x_beg + number*rank/size , x_beg + number*(rank+1)/size ).
 * Nothing is done when TPI_Rank returns non-zero.
 */
static void task_sum_work( void * arg , TPI_ThreadPool pool )
{
  int p_size , p_rank ;

  if ( 0 == TPI_Rank( pool , & p_rank , & p_size ) ) {

    struct TaskX * const task = (struct TaskX *) arg ;

    const unsigned rank_after = p_rank + 1 ;
    const unsigned count      = task->number ;

    const double * const slice_beg =
      task->x_beg + ( count * p_rank ) / p_size ;
    const double * const slice_end =
      task->x_beg + ( count * rank_after ) / p_size ;

    double * const shared_sum = task->x_sum ;

    double partial[4] = { 0 , 0 , 0 , 0 };

    add_array( partial , slice_beg , slice_end );

    /* x_sum is shared by all ranks; update it only under lock 0. */
    TPI_Lock( pool , 0 );
    xdsum_add_dsum( shared_sum , partial );
    TPI_Unlock( pool , 0 );
  }
}
/* Thread-pool work routine: applies daxpby_work to the x / y vectors in
 * BLOCK-element pieces, taking pieces first from this rank's own block
 * counter and then from the counters of other ranks once its own are
 * exhausted.
 *
 *   arg  -- pointer to a struct TaskXY.  Reads alpha, beta, number,
 *           x_beg, y_beg; reads and updates the counters in iter[].
 *   pool -- handle used for TPI_Rank and the per-rank locks.
 *
 * iter[p] holds the index of the next block owned by rank p and is
 * advanced by the pool size each time a block is claimed; lock p guards
 * iter[p].
 * NOTE(review): presumably the caller initialises iter[p] = p for each
 * rank before dispatch -- confirm against the caller.
 *
 * Nothing is done when TPI_Rank returns non-zero.
 */
static void task_axpby_work_steal( void * arg , TPI_ThreadPool pool )
{
  enum { BLOCK = UNROLL * 128 };

  int p_size ;
  int p_rank ;

  if ( ! TPI_Rank( pool , & p_rank , & p_size ) ) {

    struct TaskXY * const t = (struct TaskXY *) arg ;

    const double a = t->alpha ;
    const double b = t->beta ;
    const unsigned n = t->number ;
    const double * const x = t->x_beg ;
    double * const y = t->y_beg ;
    unsigned * const all_iter = t->iter ;

    /* This rank's own counter is iter[p_rank], the slot guarded by lock
     * p_rank below and visited by other ranks in the steal loop.
     * Indexing by p_size instead would address one past the per-rank
     * counters and be shared, unsynchronised, by every rank. */
    unsigned * const my_iter = all_iter + p_rank ;

    {
      unsigned i ;
      for ( i = 0 ; i < n ; ) {
        /* Claim the next block from my own counter. */
        TPI_Lock( pool , p_rank );
        i = *my_iter * BLOCK ;
        *my_iter += p_size ;
        TPI_Unlock( pool , p_rank );

        if ( i < n ) {
          const unsigned len = BLOCK < n - i ? BLOCK : n - i ;
          daxpby_work( len, a, x + i, b, y + i );
        }
      }
    }

    /* Finished my work, steal work from someone else */
    {
      int working ;
      int p = 0 ;

      for ( working = 1 ; working ; ) {
        working = 0 ;

        for ( p = 0 ; p < p_size ; ++p ) {
          /* Unlocked peek; the value is re-read under the lock. */
          if ( all_iter[p] * BLOCK < n ) {

            /* A zero return from TPI_Trylock is treated as "acquired". */
            if ( ! TPI_Trylock( pool , p ) ) {
              const unsigned i = all_iter[p] * BLOCK ;
              all_iter[p] += p_size ;
              TPI_Unlock( pool , p );

              if ( i < n ) {
                const unsigned len = BLOCK < n - i ? BLOCK : n - i ;
                daxpby_work( len, a, x + i, b, y + i );
              }
            }

            /* Rank p still appeared to have work, so make another pass
             * even if the lock was busy this time. */
            working = 1 ;
          }
        }
      }
    }
  }
}
/* Thread-pool work routine: computes y = A * x for this rank's
 * contiguous range of rows of a matrix stored as row offsets (pc),
 * column indices (ia) and coefficients (a).
 *
 *   data -- pointer to a txblasTask_cr_Matrix.  Reads number_row,
 *           pc_begin, ia_begin, a_begin, x_begin; writes y_begin[row]
 *           for each row in the range.
 *   pool -- handle passed to TPI_Rank for rank / size.
 *
 * Rows [ number_row*rank/size , number_row*(rank+1)/size ) are handled.
 * Within a row the entries are consumed four at a time, followed by the
 * leftover entries one at a time.
 * Nothing is done when TPI_Rank returns non-zero.
 */
static void txblas_task_cr_mxv( void * data , TPI_ThreadPool pool )
{
  enum { STRIDE = 4 };

  int p_size , p_rank ;

  if ( 0 == TPI_Rank( pool , & p_rank , & p_size ) ) {

    txblasTask_cr_Matrix * const mat = (txblasTask_cr_Matrix*) data ;

    const unsigned row_first = ( mat->number_row * ( p_rank ) ) / p_size ;
    const unsigned row_limit = ( mat->number_row * ( p_rank + 1 ) ) / p_size ;

    const unsigned * const row_ptr_end = mat->pc_begin + row_limit ;
    const unsigned * const col_base    = mat->ia_begin ;
    const double   * const val_base    = mat->a_begin ;
    const double   * const x_base      = mat->x_begin ;

    double         * out     = mat->y_begin + row_first ;
    const unsigned * row_ptr = mat->pc_begin + row_first ;

    /* Column-index and coefficient cursors start at the first entry of
     * the first row and run continuously across all rows in the range. */
    const unsigned * col = col_base + *row_ptr ;
    const double   * val = val_base + *row_ptr ;

    for ( ; row_ptr < row_ptr_end ; ++out ) {

      /* The next row offset marks the end of the current row. */
      const unsigned * const col_end = col_base + *++row_ptr ;

      /* End of the portion whose length is a multiple of STRIDE. */
      const unsigned * const col_blk =
        col_end - ( col_end - col ) % STRIDE ;

      double row_sum = 0 ;

      while ( col < col_blk ) {
        row_sum += val[0] * x_base[ col[0] ] + val[1] * x_base[ col[1] ] + val[2] * x_base[ col[2] ] + val[3] * x_base[ col[3] ] ;
        col += STRIDE ;
        val += STRIDE ;
      }

      while ( col < col_end ) {
        row_sum += *val * x_base[ *col ];
        ++col ;
        ++val ;
      }

      *out = row_sum ;
    }
  }
}
/* Thread-pool work routine: calls daxpby_work on this rank's contiguous
 * slice of the x / y vectors.
 *
 *   arg  -- pointer to a struct TaskXY (reads number, alpha, beta,
 *           x_beg, y_beg).
 *   pool -- handle passed to TPI_Rank for rank / size.
 *
 * Every rank gets number / size elements; the first (number % size)
 * ranks get one additional element each.
 * Nothing is done when TPI_Rank returns non-zero.
 */
static void task_axpby_work( void * arg , TPI_ThreadPool pool )
{
  int p_size ;
  int p_rank ;

  if ( 0 == TPI_Rank( pool , & p_rank , & p_size ) ) {

    struct TaskXY * const task = (struct TaskXY *) arg ;

    const int      leftover   = task->number % p_size ;
    const unsigned share      = task->number / p_size ;
    const int      gets_extra = p_rank < leftover ;

    /* Start = rank * share, shifted by the extra elements handed to
     * lower-numbered ranks. */
    const unsigned first  = p_rank * share + ( gets_extra ? p_rank : leftover );
    const unsigned length = share + ( gets_extra ? 1 : 0 );

    daxpby_work( length ,
                 task->alpha , task->x_beg + first ,
                 task->beta  , task->y_beg + first );
  }
}
/* Thread-pool work routine: stores ddot of this rank's contiguous slice
 * of the x and y vectors into xy_sum[rank].
 *
 *   arg  -- pointer to a struct TaskXY (reads number, x_beg, y_beg;
 *           writes xy_sum[rank]).
 *   pool -- handle passed to TPI_Rank for rank / size.
 *
 * The slice is [ number*rank/size , number*(rank+1)/size ).
 * Nothing is done when TPI_Rank returns non-zero.
 */
static void task_ddot_xy_work( void * arg , TPI_ThreadPool pool )
{
  int p_size , p_rank ;

  if ( 0 == TPI_Rank( pool , & p_rank , & p_size ) ) {

    struct TaskXY * const task = (struct TaskXY *) arg ;

    const unsigned count = task->number ;
    const unsigned first = ( count * ( p_rank ) ) / p_size ;
    const unsigned last  = ( count * ( p_rank + 1 ) ) / p_size ;
    const unsigned slice = ( last - first );

    const double * x_slice = task->x_beg + first ;
    const double * y_slice = task->y_beg + first ;

    const double partial = ddot( slice , x_slice , y_slice );

    task->xy_sum[ p_rank ] = partial ;
  }
}
/* Thread-pool work routine: calls daxpby_work on this rank's share of
 * the x / y vectors, using a blocked round-robin partition for large
 * vectors and an even contiguous partition otherwise.
 *
 *   arg  -- pointer to a struct TaskXY (reads number, alpha, beta,
 *           x_beg, y_beg).
 *   pool -- handle passed to TPI_Rank for rank / size.
 *
 * Blocked path (more than one thread and number > 2 * BLOCK * size):
 * rank r handles blocks r, r + size, r + 2*size, ... of BLOCK elements,
 * with the final block shortened to fit.
 * Even path: every rank gets number / size elements and the first
 * (number % size) ranks get one additional element each.
 * Nothing is done when TPI_Rank returns non-zero.
 */
static void task_axpby_work_block( void * arg , TPI_ThreadPool pool )
{
  enum { BLOCK = UNROLL * 1024 };

  int p_size ;
  int p_rank ;

  if ( ! TPI_Rank( pool , & p_rank , & p_size ) ) {

    struct TaskXY * const t = (struct TaskXY *) arg ;

    const unsigned inc = BLOCK * p_size ;
    const unsigned num = t->number ;

    if ( 1 < p_size && 2 * inc < num ) { /* More than two rounds of blocks */

      const double a = t->alpha ;
      const double b = t->beta ;
      const double * const x = t->x_beg ;
      double * const y = t->y_beg ;

      unsigned i = BLOCK * p_rank ;

      /* The remaining count stays unsigned: storing it in an int made
       * the loop test negative, and the loop skip all work, whenever
       * more than INT_MAX elements remained. */
      while ( i < num ) {
        const unsigned remaining = num - i ;
        const unsigned len =
          (unsigned) BLOCK < remaining ? (unsigned) BLOCK : remaining ;

        daxpby_work( len , a , x + i , b , y + i );

        /* Stop before advancing when the next stride would reach or
         * pass the end; this also keeps 'i' from wrapping around. */
        if ( remaining <= inc ) { break ; }

        i += inc ;
      }
    }
    else { /* Even partitioning */

      const int      n_rem = num % p_size ;
      const unsigned n_num = num / p_size ;
      const unsigned n_beg = p_rank*n_num + (p_rank < n_rem ? p_rank : n_rem);
      const unsigned n_len = n_num + ( p_rank < n_rem ? 1 : 0 );

      daxpby_work( n_len , t->alpha , t->x_beg + n_beg ,
                           t->beta  , t->y_beg + n_beg );
    }
  }
}
/* Reference-taking wrapper around TPI_Rank: forwards the addresses of
 * 'rank' and 'size' to TPI_Rank and returns its result unchanged. */
inline int Rank( ThreadPool pool , int & rank , int & size )
{
  int * const rank_out = & rank ;
  int * const size_out = & size ;

  return TPI_Rank( pool , rank_out , size_out );
}