Beispiel #1
0
void bli_thread_get_range_sub
     (
       thrinfo_t* thread,
       dim_t      n,
       dim_t      bf,
       bool_t     handle_edge_low,
       dim_t*     start,
       dim_t*     end
     )
{
	// Splits the index range [0, n) into n_way contiguous sub-ranges and
	// writes the sub-range belonging to this thread's work id to *start
	// (inclusive) and *end (exclusive).
	//
	//   thread           - supplies n_way and the caller's work id.
	//   n                - length of the range being partitioned.
	//   bf               - block factor; every sub-range is a multiple of
	//                      bf, except the one that also absorbs the
	//                      leftover (n % bf) "edge" elements.
	//   handle_edge_low  - if nonzero, the edge is given to work id 0;
	//                      otherwise it is given to work id n_way-1.
	//   start, end       - outputs.
	//
	// Whole blocks are spread as evenly as possible: every thread gets
	// floor(n_blocks / n_way) blocks and (n_blocks % n_way) "wide" threads
	// get one more. The wide threads sit at the end of the range opposite
	// the edge. Example with n_way = 4, 13 whole blocks and a nonzero
	// edge ('+' marks the thread that also receives the edge):
	//
	//   handle_edge_low   thr0  thr1  thr2  thr3
	//   false               4     3     3     3+
	//   true                3+    3     3     4

	const dim_t n_way     = bli_thread_n_way( thread );
	const dim_t work_id   = bli_thread_work_id( thread );

	// The range being divided is [range_beg, range_end).
	const dim_t range_beg = 0;
	const dim_t range_end = n;
	const dim_t extent    = range_end - range_beg;

	// Number of whole blocks, and the size of the leftover edge.
	const dim_t n_blocks  = extent / bf;
	const dim_t n_edge    = extent % bf;

	// Block count of a narrow thread, the number of wide threads, and
	// the block count of a wide thread (one more than narrow, but only
	// if wide threads actually exist).
	const dim_t blk_narrow = n_blocks / n_way;
	const dim_t n_th_wide  = n_blocks % n_way;
	const dim_t blk_wide   = ( n_th_wide != 0 ? blk_narrow + 1
	                                          : blk_narrow );

	// The same quantities expressed in units of rows/columns.
	const dim_t w_narrow   = blk_narrow * bf;
	const dim_t w_wide     = blk_wide   * bf;

	if ( !handle_edge_low )
	{
		// Edge at the high end: wide threads occupy the lowest work ids,
		// followed by the narrow threads.
		if ( work_id < n_th_wide )
		{
			*start = range_beg + ( work_id     ) * w_wide;
			*end   = range_beg + ( work_id + 1 ) * w_wide;
			return;
		}

		// Narrow group: begins right after all of the wide sub-ranges.
		const dim_t narrow_beg = range_beg + n_th_wide * w_wide;
		const dim_t narrow_idx = work_id - n_th_wide;

		*start = narrow_beg + ( narrow_idx     ) * w_narrow;
		*end   = narrow_beg + ( narrow_idx + 1 ) * w_narrow;

		// The last thread additionally takes the edge.
		if ( work_id == n_way - 1 ) *end += n_edge;

		return;
	}

	// Edge at the low end: narrow threads occupy the lowest work ids,
	// followed by the wide threads.
	const dim_t n_th_narrow = n_way - n_th_wide;

	if ( work_id < n_th_narrow )
	{
		*start = range_beg + ( work_id     ) * w_narrow;
		*end   = range_beg + ( work_id + 1 ) * w_narrow;

		// Work id 0 absorbs the edge by extending its end; every other
		// narrow thread is shifted up by the size of the edge.
		if ( work_id == 0 )
		{
			*end += n_edge;
		}
		else
		{
			*start += n_edge;
			*end   += n_edge;
		}
	}
	else
	{
		// Wide group: begins after the narrow sub-ranges and the edge.
		const dim_t wide_beg = range_beg + n_th_narrow * w_narrow
		                                 + n_edge;
		const dim_t wide_idx = work_id - n_th_narrow;

		*start = wide_beg + ( wide_idx     ) * w_wide;
		*end   = wide_beg + ( wide_idx + 1 ) * w_wide;
	}
}
Beispiel #2
0
// Debugging aid: prints to stdout a summary of the level-3 thrinfo_t
// trees referenced by threads[].
//
//   threads - array of root thrinfo_t pointers, indexed 0 .. nt-1, where
//             nt is bli_thread_num_threads( threads[0] ). Each root is
//             expected to have six nested sub-nodes (reached through
//             bli_thrinfo_sub_node()); they are referred to below as
//             jc, pc (printed as "kc"), pb, ic, pa, jr and ir.
//
// Output: first the per-level thread counts and ways of parallelism,
// taken from the tree of threads[0]; then, for every thread, one line
// of ocomm ids and one line of work ids.
void bli_l3_thrinfo_print_paths
     (
       thrinfo_t** threads
     )
{
	dim_t n_threads = bli_thread_num_threads( threads[0] );

	// Walk the tree of thread 0 to obtain the global (per-level) values.
	thrinfo_t* jc_info  = threads[0];
	thrinfo_t* pc_info  = bli_thrinfo_sub_node( jc_info );
	thrinfo_t* pb_info  = bli_thrinfo_sub_node( pc_info );
	thrinfo_t* ic_info  = bli_thrinfo_sub_node( pb_info );
	thrinfo_t* pa_info  = bli_thrinfo_sub_node( ic_info );
	thrinfo_t* jr_info  = bli_thrinfo_sub_node( pa_info );
	thrinfo_t* ir_info  = bli_thrinfo_sub_node( jr_info );

	dim_t jc_way = bli_thread_n_way( jc_info );
	dim_t pc_way = bli_thread_n_way( pc_info );
	dim_t pb_way = bli_thread_n_way( pb_info );
	dim_t ic_way = bli_thread_n_way( ic_info );
	dim_t pa_way = bli_thread_n_way( pa_info );
	dim_t jr_way = bli_thread_n_way( jr_info );
	dim_t ir_way = bli_thread_n_way( ir_info );

	// Note the one-level offset: the count printed under column X is
	// read from the node one level below X's own node. The final
	// ("ir") column is printed as the constant 1.
	dim_t gl_nt = bli_thread_num_threads( jc_info );
	dim_t jc_nt = bli_thread_num_threads( pc_info );
	dim_t pc_nt = bli_thread_num_threads( pb_info );
	dim_t pb_nt = bli_thread_num_threads( ic_info );
	dim_t ic_nt = bli_thread_num_threads( pa_info );
	dim_t pa_nt = bli_thread_num_threads( jr_info );
	dim_t jr_nt = bli_thread_num_threads( ir_info );

	printf( "            gl   jc   kc   pb   ic   pa   jr   ir\n" );
	printf( "xx_nt:    %4lu %4lu %4lu %4lu %4lu %4lu %4lu %4lu\n",
	( unsigned long )gl_nt,
	( unsigned long )jc_nt,
	( unsigned long )pc_nt,
	( unsigned long )pb_nt,
	( unsigned long )ic_nt,
	( unsigned long )pa_nt,
	( unsigned long )jr_nt,
	( unsigned long )1 );
	printf( "\n" );
	printf( "            jc   kc   pb   ic   pa   jr   ir\n" );
	printf( "xx_way:   %4lu %4lu %4lu %4lu %4lu %4lu %4lu\n",
	( unsigned long )jc_way,
	( unsigned long )pc_way,
	( unsigned long )pb_way,
	( unsigned long )ic_way,
	( unsigned long )pa_way,
	( unsigned long )jr_way,
	( unsigned long )ir_way );
	printf( "=================================================\n" );

	// The loop counter has its own name so that it is not shadowed by
	// the per-thread gl_comm_id value declared inside the loop body.
	for ( dim_t t = 0; t < n_threads; ++t )
	{
		// Walk the tree that belongs to thread t.
		jc_info = threads[t];
		pc_info = bli_thrinfo_sub_node( jc_info );
		pb_info = bli_thrinfo_sub_node( pc_info );
		ic_info = bli_thrinfo_sub_node( pb_info );
		pa_info = bli_thrinfo_sub_node( ic_info );
		jr_info = bli_thrinfo_sub_node( pa_info );
		ir_info = bli_thrinfo_sub_node( jr_info );

		// Same one-level offset as for the thread counts above.
		dim_t gl_comm_id = bli_thread_ocomm_id( jc_info );
		dim_t jc_comm_id = bli_thread_ocomm_id( pc_info );
		dim_t pc_comm_id = bli_thread_ocomm_id( pb_info );
		dim_t pb_comm_id = bli_thread_ocomm_id( ic_info );
		dim_t ic_comm_id = bli_thread_ocomm_id( pa_info );
		dim_t pa_comm_id = bli_thread_ocomm_id( jr_info );
		dim_t jr_comm_id = bli_thread_ocomm_id( ir_info );

		dim_t jc_work_id = bli_thread_work_id( jc_info );
		dim_t pc_work_id = bli_thread_work_id( pc_info );
		dim_t pb_work_id = bli_thread_work_id( pb_info );
		dim_t ic_work_id = bli_thread_work_id( ic_info );
		dim_t pa_work_id = bli_thread_work_id( pa_info );
		dim_t jr_work_id = bli_thread_work_id( jr_info );
		dim_t ir_work_id = bli_thread_work_id( ir_info );

		// Column labels follow the order in which the values are passed
		// below (gl, jc, kc, pb, ic, pa, jr), matching the headers
		// printed above.
		printf( "            gl   jc   kc   pb   ic   pa   jr  \n" );
		printf( "comm ids: %4lu %4lu %4lu %4lu %4lu %4lu %4lu\n",
		( unsigned long )gl_comm_id,
		( unsigned long )jc_comm_id,
		( unsigned long )pc_comm_id,
		( unsigned long )pb_comm_id,
		( unsigned long )ic_comm_id,
		( unsigned long )pa_comm_id,
		( unsigned long )jr_comm_id );
		// All arguments are cast to unsigned long, so every conversion
		// uses %lu.
		printf( "work ids: %4lu %4lu %4lu %4lu %4lu %4lu %4lu\n",
		( unsigned long )jc_work_id,
		( unsigned long )pc_work_id,
		( unsigned long )pb_work_id,
		( unsigned long )ic_work_id,
		( unsigned long )pa_work_id,
		( unsigned long )jr_work_id,
		( unsigned long )ir_work_id );
		printf( "---------------------------------------\n" );
	}

}
Beispiel #3
0
siz_t bli_thread_get_range_weighted_sub
     (
       thrinfo_t* thread,
       doff_t     diagoff,
       uplo_t     uplo,
       dim_t      m,
       dim_t      n,
       dim_t      bf,
       bool_t     handle_edge_low,
       dim_t*     j_start_thr,
       dim_t*     j_end_thr
     )
{
	// Computes the column sub-range [*j_start_thr, *j_end_thr) of an
	// m x n lower- or upper-stored region assigned to the calling thread,
	// with sub-range widths chosen by bli_thread_get_range_width_l()
	// against a target of (total stored area / n_way) per thread.
	//
	// Returns the area of the calling thread's sub-range as reported by
	// bli_find_area_trap_l(); the value stays 0 if no sub-range index
	// matches the thread's work id.
	//
	// Precondition (stated by the original authors): the caller has
	// already established that the diagonal intersects the submatrix and
	// that the submatrix is either lower- or upper-stored.

	const dim_t n_way   = bli_thread_n_way( thread );
	const dim_t my_id   = bli_thread_work_id( thread );

	// Edge size relative to the n passed in, i.e. taken before the
	// pruning step below is applied to n.
	const dim_t bf_left = n % bf;

	if ( bli_is_lower( uplo ) )
	{
		siz_t  area = 0;

		// Offset deltas produced by the pruning steps. They are not
		// needed here since we never index into the subpartition.
		dim_t  offm_inc, offn_inc;

		// Prune the unstored region above the diagonal (if any), then
		// the region to the right of where the diagonal meets the bottom
		// edge (if any). NOTE(review): diagoff/m/n are passed by name, so
		// these are presumably macros that update them in place.
		bli_prune_unstored_region_top_l( diagoff, m, n, offm_inc );
		bli_prune_unstored_region_right_l( diagoff, m, n, offn_inc );

		// Reference the deltas so the compiler does not warn about them.
		( void )offm_inc;
		( void )offn_inc;

		// Per the original authors, diagoff >= 0 after pruning.

		// Stored area = full rectangle minus the unstored triangle; the
		// per-thread target is an equal share of that.
		double tri_dim      = ( double )( n - diagoff - 1 );
		double tri_area     = tri_dim * ( tri_dim + 1.0 ) / 2.0;
		double area_total   = ( double )m * ( double )n - tri_area;
		double area_per_thr = area_total / ( double )n_way;

		// Running state while sweeping left to right over sub-ranges:
		// starting column, diagonal offset relative to that column, and
		// the number of columns not yet assigned.
		dim_t  col_off   = 0;
		doff_t diag_cur  = diagoff;
		dim_t  cols_left = n;

		for ( dim_t p = 0; p < n_way; ++p )
		{
			// Width of sub-range p, given what remains of the region.
			const dim_t width =
			bli_thread_get_range_width_l
			(
			  diag_cur, m, cols_left,
			  p, n_way,
			  bf, bf_left,
			  area_per_thr,
			  handle_edge_low
			);

			if ( p != my_id )
			{
				// Not ours: step past this sub-range and keep going.
				col_off   += width;
				diag_cur  -= width;
				cols_left -= width;
				continue;
			}

			// This is the calling thread's sub-range; record it and stop.
			*j_start_thr = col_off;
			*j_end_thr   = col_off + width;

			area = bli_find_area_trap_l( m, width, diag_cur );

			break;
		}

		return area;
	}

	// Upper-stored case: reduce it to the lower-stored case.

	// Rotate the trapezoid by 180 degrees so that it becomes an
	// equivalent lower-stored one.
	bli_rotate180_trapezoid( diagoff, uplo );

	// The rotation flips the n dimension, so the end of the range that
	// receives the edge case flips as well.
	bli_toggle_bool( handle_edge_low );

	// Solve the rotated (lower-stored) problem.
	siz_t area_rot = bli_thread_get_range_weighted_sub
	(
	  thread, diagoff, uplo, m, n, bf,
	  handle_edge_low,
	  j_start_thr, j_end_thr
	);

	// Map the resulting range back onto left-to-right column indices of
	// the original, unrotated trapezoid (a reflection with respect to n).
	bli_reverse_index_direction( *j_start_thr, *j_end_thr, n );

	return area_rot;
}