// Assigns the calling thread an area-weighted index range along the m
// dimension of *a, where the full range runs from 0 at the bottom end to
// m-1 at the top end.
//
//   thr    thread info, forwarded unchanged to the range helpers.
//   a      matrix object whose structure, diagonal offset and dimensions
//          drive the partitioning.
//   bmult  blocksize object used to look up the blocking factor for a.
//   start  output: receives the start of this thread's range.
//   end    output: receives the end of this thread's range.
//
// Returns the area value produced by whichever helper computed the range.
siz_t bli_thread_get_range_weighted_b2t
     (
       thrinfo_t* thr,
       obj_t*     a,
       blksz_t*   bmult,
       dim_t*     start,
       dim_t*     end
     )
{
	// Dense or zeros: anything that is not an upper-/lower-stored object
	// crossed by its diagonal is handed to bli_thread_get_range_b2t().
	if ( !( bli_obj_intersects_diag( *a ) &&
	        bli_obj_is_upper_or_lower( *a ) ) )
	{
		return bli_thread_get_range_b2t( thr, a, bmult, start, end );
	}

	// NOTE(review): bli_rotate180_trapezoid() below is given only diagoff
	// and uplo; it presumably reads m and n from this scope, so these
	// locals must keep these exact names -- confirm against its definition.
	doff_t diagoff    = bli_obj_diag_offset( *a );
	uplo_t uplo       = bli_obj_uplo( *a );
	dim_t  m          = bli_obj_length( *a );
	dim_t  n          = bli_obj_width( *a );
	dim_t  blk_factor = bli_blksz_get_def_for_obj( a, bmult );

	// Honor an implicit transposition on the object.
	if ( bli_obj_has_trans( *a ) )
	{
		bli_reflect_about_diag( diagoff, uplo, m, n );
	}

	// Unconditionally reflect about the diagonal and then rotate the
	// trapezoid by 180 degrees before delegating. (Presumably this lets a
	// routine that partitions one dimension serve the bottom-to-top m
	// ordering -- TODO confirm.)
	bli_reflect_about_diag( diagoff, uplo, m, n );
	bli_rotate180_trapezoid( diagoff, uplo );

	// NOTE(review): bli_thread_get_range_weighted_sub() is not defined in
	// the visible portion of this file; verify it is declared elsewhere.
	return bli_thread_get_range_weighted_sub
	(
	  thr, diagoff, uplo, m, n, blk_factor,
	  TRUE, start, end
	);
}
siz_t bli_thread_get_range_weighted ( thrinfo_t* thread, doff_t diagoff, uplo_t uplo, dim_t m, dim_t n, dim_t bf, bool_t handle_edge_low, dim_t* j_start_thr, dim_t* j_end_thr ) { dim_t n_way = thread->n_way; dim_t my_id = thread->work_id; dim_t bf_left = n % bf; dim_t j; dim_t off_j; doff_t diagoff_j; dim_t n_left; dim_t width_j; dim_t offm_inc, offn_inc; double tri_dim, tri_area; double area_total, area_per_thr; siz_t area = 0; // In this function, we assume that the caller has already determined // that (a) the diagonal intersects the submatrix, and (b) the submatrix // is either lower- or upper-stored. if ( bli_is_lower( uplo ) ) { // Prune away the unstored region above the diagonal, if it exists, // and then to the right of where the diagonal intersects the bottom, // if it exists. (Also, we discard the offset deltas since we don't // need to actually index into the subpartition.) bli_prune_unstored_region_top_l( diagoff, m, n, offm_inc ); bli_prune_unstored_region_right_l( diagoff, m, n, offn_inc ); // We don't need offm_inc, offn_inc here. These statements should // prevent compiler warnings. ( void )offm_inc; ( void )offn_inc; // Now that pruning has taken place, we know that diagoff >= 0. // Compute the total area of the submatrix, accounting for the // location of the diagonal, and divide it by the number of ways // of parallelism. tri_dim = ( double )( n - diagoff - 1 ); tri_area = tri_dim * ( tri_dim + 1.0 ) / 2.0; area_total = ( double )m * ( double )n - tri_area; area_per_thr = area_total / ( double )n_way; // Initialize some variables prior to the loop: the offset to the // current subpartition, the remainder of the n dimension, and // the diagonal offset of the current subpartition. off_j = 0; diagoff_j = diagoff; n_left = n; // Iterate over the subpartition indices corresponding to each // thread/caucus participating in the n_way parallelism. 
for ( j = 0; j < n_way; ++j ) { // Compute the width of the jth subpartition, taking the // current diagonal offset into account, if needed. width_j = bli_thread_get_range_width_l( diagoff_j, m, n_left, j, n_way, bf, bf_left, area_per_thr, handle_edge_low ); // If the current thread belongs to caucus j, this is his // subpartition. So we compute the implied index range and // end our search. if ( j == my_id ) { *j_start_thr = off_j; *j_end_thr = off_j + width_j; area = bli_find_area_trap_l( m, width_j, diagoff_j ); break; } // Shift the current subpartition's starting and diagonal offsets, // as well as the remainder of the n dimension, according to the // computed width, and then iterate to the next subpartition. off_j += width_j; diagoff_j -= width_j; n_left -= width_j; } } else // if ( bli_is_upper( uplo ) ) { // Express the upper-stored case in terms of the lower-stored case. // First, we convert the upper-stored trapezoid to an equivalent // lower-stored trapezoid by rotating it 180 degrees. bli_rotate180_trapezoid( diagoff, uplo ); // Now that the trapezoid is "flipped" in the n dimension, negate // the bool that encodes whether to handle the edge case at the // low (or high) end of the index range. bli_toggle_bool( handle_edge_low ); // Compute the appropriate range for the rotated trapezoid. area = bli_thread_get_range_weighted( thread, diagoff, uplo, m, n, bf, handle_edge_low, j_start_thr, j_end_thr ); // Reverse the indexing basis for the subpartition ranges so that // the indices, relative to left-to-right iteration through the // unrotated upper-stored trapezoid, map to the correct columns // (relative to the diagonal). This amounts to subtracting the // range from n. bli_reverse_index_direction( *j_start_thr, *j_end_thr, n ); } return area; }