// Compute the half-open index range [*start, *end) of the space [0, n)
// that belongs to the calling thread.
//
//   thread           thread info node; supplies the number of ways of
//                    parallelism (n_way) and this thread's work id.
//   n                extent of the dimension being partitioned.
//   bf               block factor; every partition is a multiple of bf,
//                    except the one that also absorbs the n % bf edge.
//   handle_edge_low  if FALSE, the leftover edge is given to the last
//                    thread (high end of the range); otherwise it is
//                    given to thread 0 (low end of the range).
//   start, end       outputs: first index and one-past-last index.
//
// NOTE(review): bf and n_way are used as divisors, so both are assumed to
// be nonzero -- confirm against callers.
//
// Partition widths, in units of bf, for n_way = 4. A '+' marks the thread
// that additionally receives the n % bf leftover elements. "big" is the
// number of threads that get one extra block.
//
//   n / bf  n % bf  edge_low  big    thr0  thr1  thr2  thr3
//       12      =0     f        0       3     3     3     3
//       12      >0     f        0       3     3     3     3+
//       13      >0     f        1       4     3     3     3+
//       14      >0     f        2       4     4     3     3+
//       15      >0     f        3       4     4     4     3+
//       15      =0     f        3       4     4     4     3
//
//       12      =0     t        0       3     3     3     3
//       12      >0     t        0       3+    3     3     3
//       13      >0     t        1       3+    3     3     4
//       14      >0     t        2       3+    3     4     4
//       15      >0     t        3       3+    4     4     4
//       15      =0     t        3       3     4     4     4
void bli_thread_get_range_sub
     (
       thrinfo_t* thread,
       dim_t      n,
       dim_t      bf,
       bool_t     handle_edge_low,
       dim_t*     start,
       dim_t*     end
     )
{
	const dim_t num_ways = bli_thread_n_way( thread );
	const dim_t my_id    = bli_thread_work_id( thread );

	// Break n into whole blocks of bf plus a leftover edge.
	const dim_t whole_blks = n / bf;
	const dim_t edge       = n % bf;

	// Every thread gets base_blks blocks; n_bigger of them get one more.
	const dim_t base_blks = whole_blks / num_ways;
	const dim_t n_bigger  = whole_blks % num_ways;

	// Widths, in rows/columns, of the two kinds of partition. When no
	// thread needs an extra block the two widths coincide.
	const dim_t small_w = base_blks * bf;
	const dim_t big_w   = ( n_bigger != 0 ? base_blks + 1 : base_blks ) * bf;

	dim_t lo;
	dim_t hi;

	if ( handle_edge_low != FALSE )
	{
		// Edge at the low end: the smaller partitions come first (thread 0
		// also absorbs the edge) and the bigger partitions come last.
		const dim_t n_smaller = num_ways - n_bigger;

		if ( my_id < n_smaller )
		{
			lo = ( my_id     ) * small_w;
			hi = ( my_id + 1 ) * small_w;

			if ( my_id == 0 )
			{
				// Thread 0 grows by the edge...
				hi += edge;
			}
			else
			{
				// ...and everyone after it is shifted by the edge.
				lo += edge;
				hi += edge;
			}
		}
		else
		{
			// The bigger partitions begin after all smaller ones plus the
			// edge that thread 0 absorbed.
			const dim_t big_begin = n_smaller * small_w + edge;
			const dim_t k         = my_id - n_smaller;

			lo = big_begin + ( k     ) * big_w;
			hi = big_begin + ( k + 1 ) * big_w;
		}
	}
	else
	{
		// Edge at the high end: the bigger partitions come first and the
		// smaller ones come last, with the final thread absorbing the edge.
		if ( my_id < n_bigger )
		{
			lo = ( my_id     ) * big_w;
			hi = ( my_id + 1 ) * big_w;
		}
		else
		{
			const dim_t small_begin = n_bigger * big_w;
			const dim_t k           = my_id - n_bigger;

			lo = small_begin + ( k     ) * small_w;
			hi = small_begin + ( k + 1 ) * small_w;

			// Only the last thread's range is extended by the edge.
			if ( my_id == num_ways - 1 )
				hi += edge;
		}
	}

	*start = lo;
	*end   = hi;
}
// Print a summary of the level-3 thrinfo_t trees to standard output.
//
//   threads   array of thrinfo_t tree roots, indexed by thread; the number
//             of entries visited is bli_thread_num_threads( threads[0] ).
//
// Each tree is walked seven levels deep via bli_thrinfo_sub_node(); the
// levels are labelled jc, kc (pc), pb, ic, pa, jr and ir in the output.
// First the thread counts ("xx_nt") and ways of parallelism ("xx_way") of
// thread 0's tree are printed once, then the communicator ids and work ids
// of every thread.
//
// NOTE(review): assumes every tree really has seven levels (no sub-node
// lookup is checked for NULL) -- confirm against how the trees are built.
void bli_l3_thrinfo_print_paths
     (
       thrinfo_t** threads
     )
{
	dim_t n_threads = bli_thread_num_threads( threads[0] );

	// Walk thread 0's tree; its figures are used for the one-time header.
	thrinfo_t* jc_info = threads[0];
	thrinfo_t* pc_info = bli_thrinfo_sub_node( jc_info );
	thrinfo_t* pb_info = bli_thrinfo_sub_node( pc_info );
	thrinfo_t* ic_info = bli_thrinfo_sub_node( pb_info );
	thrinfo_t* pa_info = bli_thrinfo_sub_node( ic_info );
	thrinfo_t* jr_info = bli_thrinfo_sub_node( pa_info );
	thrinfo_t* ir_info = bli_thrinfo_sub_node( jr_info );

	dim_t jc_way = bli_thread_n_way( jc_info );
	dim_t pc_way = bli_thread_n_way( pc_info );
	dim_t pb_way = bli_thread_n_way( pb_info );
	dim_t ic_way = bli_thread_n_way( ic_info );
	dim_t pa_way = bli_thread_n_way( pa_info );
	dim_t jr_way = bli_thread_n_way( jr_info );
	dim_t ir_way = bli_thread_n_way( ir_info );

	// The thread count reported under a given label is read from the node
	// one level further down (e.g. "jc" comes from the pc node); the "gl"
	// column comes from the root itself.
	dim_t gl_nt = bli_thread_num_threads( jc_info );
	dim_t jc_nt = bli_thread_num_threads( pc_info );
	dim_t pc_nt = bli_thread_num_threads( pb_info );
	dim_t pb_nt = bli_thread_num_threads( ic_info );
	dim_t ic_nt = bli_thread_num_threads( pa_info );
	dim_t pa_nt = bli_thread_num_threads( jr_info );
	dim_t jr_nt = bli_thread_num_threads( ir_info );

	printf( " gl jc kc pb ic pa jr ir\n" );
	printf( "xx_nt: %4lu %4lu %4lu %4lu %4lu %4lu %4lu %4lu\n",
	        ( unsigned long )gl_nt,
	        ( unsigned long )jc_nt,
	        ( unsigned long )pc_nt,
	        ( unsigned long )pb_nt,
	        ( unsigned long )ic_nt,
	        ( unsigned long )pa_nt,
	        ( unsigned long )jr_nt,
	        ( unsigned long )1 );
	printf( "\n" );
	printf( " jc kc pb ic pa jr ir\n" );
	printf( "xx_way: %4lu %4lu %4lu %4lu %4lu %4lu %4lu\n",
	        ( unsigned long )jc_way,
	        ( unsigned long )pc_way,
	        ( unsigned long )pb_way,
	        ( unsigned long )ic_way,
	        ( unsigned long )pa_way,
	        ( unsigned long )jr_way,
	        ( unsigned long )ir_way );
	printf( "=================================================\n" );

	// The loop counter has its own name so that the per-thread
	// gl_comm_id declared in the body does not shadow it.
	for ( dim_t t = 0; t < n_threads; ++t )
	{
		jc_info = threads[t];
		pc_info = bli_thrinfo_sub_node( jc_info );
		pb_info = bli_thrinfo_sub_node( pc_info );
		ic_info = bli_thrinfo_sub_node( pb_info );
		pa_info = bli_thrinfo_sub_node( ic_info );
		jr_info = bli_thrinfo_sub_node( pa_info );
		ir_info = bli_thrinfo_sub_node( jr_info );

		// As with the thread counts above, the communicator id shown under
		// a label is read from the node one level below it.
		dim_t gl_comm_id = bli_thread_ocomm_id( jc_info );
		dim_t jc_comm_id = bli_thread_ocomm_id( pc_info );
		dim_t pc_comm_id = bli_thread_ocomm_id( pb_info );
		dim_t pb_comm_id = bli_thread_ocomm_id( ic_info );
		dim_t ic_comm_id = bli_thread_ocomm_id( pa_info );
		dim_t pa_comm_id = bli_thread_ocomm_id( jr_info );
		dim_t jr_comm_id = bli_thread_ocomm_id( ir_info );

		dim_t jc_work_id = bli_thread_work_id( jc_info );
		dim_t pc_work_id = bli_thread_work_id( pc_info );
		dim_t pb_work_id = bli_thread_work_id( pb_info );
		dim_t ic_work_id = bli_thread_work_id( ic_info );
		dim_t pa_work_id = bli_thread_work_id( pa_info );
		dim_t jr_work_id = bli_thread_work_id( jr_info );
		dim_t ir_work_id = bli_thread_work_id( ir_info );

		// Column labels follow the order in which the comm ids are printed
		// (gl, jc, kc, pb, ic, pa, jr). The work-id row below starts at jc
		// and ends at ir, i.e. it is offset by one level from these labels.
		printf( " gl jc kc pb ic pa jr \n" );
		printf( "comm ids: %4lu %4lu %4lu %4lu %4lu %4lu %4lu\n",
		        ( unsigned long )gl_comm_id,
		        ( unsigned long )jc_comm_id,
		        ( unsigned long )pc_comm_id,
		        ( unsigned long )pb_comm_id,
		        ( unsigned long )ic_comm_id,
		        ( unsigned long )pa_comm_id,
		        ( unsigned long )jr_comm_id );
		// Every argument is cast to unsigned long, so every conversion
		// must be %lu.
		printf( "work ids: %4lu %4lu %4lu %4lu %4lu %4lu %4lu\n",
		        ( unsigned long )jc_work_id,
		        ( unsigned long )pc_work_id,
		        ( unsigned long )pb_work_id,
		        ( unsigned long )ic_work_id,
		        ( unsigned long )pa_work_id,
		        ( unsigned long )jr_work_id,
		        ( unsigned long )ir_work_id );
		printf( "---------------------------------------\n" );
	}
}
siz_t bli_thread_get_range_weighted_sub ( thrinfo_t* thread, doff_t diagoff, uplo_t uplo, dim_t m, dim_t n, dim_t bf, bool_t handle_edge_low, dim_t* j_start_thr, dim_t* j_end_thr ) { dim_t n_way = bli_thread_n_way( thread ); dim_t my_id = bli_thread_work_id( thread ); dim_t bf_left = n % bf; dim_t j; dim_t off_j; doff_t diagoff_j; dim_t n_left; dim_t width_j; dim_t offm_inc, offn_inc; double tri_dim, tri_area; double area_total, area_per_thr; siz_t area = 0; // In this function, we assume that the caller has already determined // that (a) the diagonal intersects the submatrix, and (b) the submatrix // is either lower- or upper-stored. if ( bli_is_lower( uplo ) ) { // Prune away the unstored region above the diagonal, if it exists, // and then to the right of where the diagonal intersects the bottom, // if it exists. (Also, we discard the offset deltas since we don't // need to actually index into the subpartition.) bli_prune_unstored_region_top_l( diagoff, m, n, offm_inc ); bli_prune_unstored_region_right_l( diagoff, m, n, offn_inc ); // We don't need offm_inc, offn_inc here. These statements should // prevent compiler warnings. ( void )offm_inc; ( void )offn_inc; // Now that pruning has taken place, we know that diagoff >= 0. // Compute the total area of the submatrix, accounting for the // location of the diagonal, and divide it by the number of ways // of parallelism. tri_dim = ( double )( n - diagoff - 1 ); tri_area = tri_dim * ( tri_dim + 1.0 ) / 2.0; area_total = ( double )m * ( double )n - tri_area; area_per_thr = area_total / ( double )n_way; // Initialize some variables prior to the loop: the offset to the // current subpartition, the remainder of the n dimension, and // the diagonal offset of the current subpartition. off_j = 0; diagoff_j = diagoff; n_left = n; // Iterate over the subpartition indices corresponding to each // thread/caucus participating in the n_way parallelism. 
for ( j = 0; j < n_way; ++j ) { // Compute the width of the jth subpartition, taking the // current diagonal offset into account, if needed. width_j = bli_thread_get_range_width_l ( diagoff_j, m, n_left, j, n_way, bf, bf_left, area_per_thr, handle_edge_low ); // If the current thread belongs to caucus j, this is his // subpartition. So we compute the implied index range and // end our search. if ( j == my_id ) { *j_start_thr = off_j; *j_end_thr = off_j + width_j; area = bli_find_area_trap_l( m, width_j, diagoff_j ); break; } // Shift the current subpartition's starting and diagonal offsets, // as well as the remainder of the n dimension, according to the // computed width, and then iterate to the next subpartition. off_j += width_j; diagoff_j -= width_j; n_left -= width_j; } } else // if ( bli_is_upper( uplo ) ) { // Express the upper-stored case in terms of the lower-stored case. // First, we convert the upper-stored trapezoid to an equivalent // lower-stored trapezoid by rotating it 180 degrees. bli_rotate180_trapezoid( diagoff, uplo ); // Now that the trapezoid is "flipped" in the n dimension, negate // the bool that encodes whether to handle the edge case at the // low (or high) end of the index range. bli_toggle_bool( handle_edge_low ); // Compute the appropriate range for the rotated trapezoid. area = bli_thread_get_range_weighted_sub ( thread, diagoff, uplo, m, n, bf, handle_edge_low, j_start_thr, j_end_thr ); // Reverse the indexing basis for the subpartition ranges so that // the indices, relative to left-to-right iteration through the // unrotated upper-stored trapezoid, map to the correct columns // (relative to the diagonal). This amounts to subtracting the // range from n. bli_reverse_index_direction( *j_start_thr, *j_end_thr, n ); } return area; }