// Validate a uplo value: returns BLIS_SUCCESS when uplo tests as lower or
// upper (per bli_is_lower()/bli_is_upper()), and BLIS_INVALID_UPLO otherwise.
err_t bli_check_valid_uplo( uplo_t uplo )
{
	// Either of the two recognized values is acceptable.
	if ( bli_is_lower( uplo ) || bli_is_upper( uplo ) )
	{
		return BLIS_SUCCESS;
	}

	return BLIS_INVALID_UPLO;
}
int main( int argc, char** argv ) { bli_init(); #if 0 obj_t a, b, c; obj_t aa, bb, cc; dim_t m, n, k; num_t dt; uplo_t uploa, uplob, uploc; { dt = BLIS_DOUBLE; m = 6; k = 6; n = 6; bli_obj_create( dt, m, k, 0, 0, &a ); bli_obj_create( dt, k, n, 0, 0, &b ); bli_obj_create( dt, m, n, 0, 0, &c ); uploa = BLIS_UPPER; uploa = BLIS_LOWER; bli_obj_set_struc( BLIS_TRIANGULAR, a ); bli_obj_set_uplo( uploa, a ); bli_obj_set_diag_offset( -2, a ); uplob = BLIS_UPPER; uplob = BLIS_LOWER; bli_obj_set_struc( BLIS_TRIANGULAR, b ); bli_obj_set_uplo( uplob, b ); bli_obj_set_diag_offset( -2, b ); uploc = BLIS_UPPER; //uploc = BLIS_LOWER; //uploc = BLIS_ZEROS; //uploc = BLIS_DENSE; bli_obj_set_struc( BLIS_HERMITIAN, c ); //bli_obj_set_struc( BLIS_TRIANGULAR, c ); bli_obj_set_uplo( uploc, c ); bli_obj_set_diag_offset( 1, c ); bli_obj_alias_to( a, aa ); (void)aa; bli_obj_alias_to( b, bb ); (void)bb; bli_obj_alias_to( c, cc ); (void)cc; bli_randm( &a ); bli_randm( &b ); bli_randm( &c ); //bli_mkherm( &a ); //bli_mktrim( &a ); bli_prune_unref_mparts( &cc, BLIS_M, &aa, BLIS_N ); bli_printm( "c orig", &c, "%4.1f", "" ); bli_printm( "c alias", &cc, "%4.1f", "" ); bli_printm( "a orig", &a, "%4.1f", "" ); bli_printm( "a alias", &aa, "%4.1f", "" ); //bli_obj_print( "a struct", &a ); } #endif dim_t p_begin, p_max, p_inc; gint_t m_input, n_input; char uploa_ch; doff_t diagoffa; dim_t bf; dim_t n_way; char part_dim_ch; bool_t go_fwd; char out_ch; obj_t a; thrinfo_t thrinfo; dim_t m, n; uplo_t uploa; bool_t part_m_dim, part_n_dim; bool_t go_bwd; dim_t p; num_t dt; dim_t start, end; dim_t width; siz_t area; gint_t t_begin, t_stop, t_inc; dim_t t; if ( argc == 13 ) { sscanf( argv[1], "%lu", &p_begin ); sscanf( argv[2], "%lu", &p_max ); sscanf( argv[3], "%lu", &p_inc ); sscanf( argv[4], "%ld", &m_input ); sscanf( argv[5], "%ld", &n_input ); sscanf( argv[6], "%c", &uploa_ch ); sscanf( argv[7], "%ld", &diagoffa ); sscanf( argv[8], "%lu", &bf ); sscanf( argv[9], "%lu", &n_way ); sscanf( argv[10], "%c", 
&part_dim_ch ); sscanf( argv[11], "%lu", &go_fwd ); sscanf( argv[12], "%c", &out_ch ); } else { printf( "\n" ); printf( " %s\n", argv[0] ); printf( "\n" ); printf( " Simulate the dimension ranges assigned to threads when\n" ); printf( " partitioning a matrix for parallelism in BLIS.\n" ); printf( "\n" ); printf( " Usage:\n" ); printf( "\n" ); printf( " %s p_beg p_max p_inc m n uplo doff bf n_way part_dim go_fwd out\n", argv[0] ); printf( "\n" ); printf( " p_beg: the first problem size p to test.\n" ); printf( " p_max: the maximum problem size p to test.\n" ); printf( " p_inc: the increase in problem size p between tests.\n" ); printf( " m: the m dimension:\n" ); printf( " n: the n dimension:\n" ); printf( " if m,n = -1: bind m,n to problem size p.\n" ); printf( " if m,n = 0: bind m,n to p_max.\n" ); printf( " if m,n > 0: hold m,n = c constant for all p.\n" ); printf( " uplo: the uplo field of the matrix being partitioned:\n" ); printf( " 'l': lower-stored (BLIS_LOWER)\n" ); printf( " 'u': upper-stored (BLIS_UPPER)\n" ); printf( " 'd': densely-stored (BLIS_DENSE)\n" ); printf( " doff: the diagonal offset of the matrix being partitioned.\n" ); printf( " bf: the simulated blocking factor. all thread ranges must\n" ); printf( " be a multiple of bf, except for the range that contains\n" ); printf( " the edge case (if one exists). the blocking factor\n" ); printf( " would typically correspond to a register blocksize.\n" ); printf( " n_way: the number of ways of parallelism for which we are\n" ); printf( " partitioning (i.e.: the number of threads, or thread\n" ); printf( " groups).\n" ); printf( " part_dim: the dimension to partition:\n" ); printf( " 'm': partition the m dimension.\n" ); printf( " 'n': partition the n dimension.\n" ); printf( " go_fwd: the direction to partition:\n" ); printf( " '1': forward, e.g. left-to-right (part_dim = 'm') or\n" ); printf( " top-to-bottom (part_dim = 'n')\n" ); printf( " '0': backward, e.g. 
right-to-left (part_dim = 'm') or\n" ); printf( " bottom-to-top (part_dim = 'n')\n" ); printf( " NOTE: reversing the direction does not change the\n" ); printf( " subpartitions' widths, but it does change which end of\n" ); printf( " the index range receives the edge case, if it exists.\n" ); printf( " out: the type of output per thread-column:\n" ); printf( " 'w': the width (and area) of the thread's subpartition\n" ); printf( " 'r': the actual ranges of the thread's subpartition\n" ); printf( " where the start and end points of each range are\n" ); printf( " inclusive and exclusive, respectively.\n" ); printf( "\n" ); exit(1); } if ( m_input == 0 ) m_input = p_max; if ( n_input == 0 ) n_input = p_max; if ( part_dim_ch == 'm' ) { part_m_dim = TRUE; part_n_dim = FALSE; } else { part_m_dim = FALSE; part_n_dim = TRUE; } go_bwd = !go_fwd; if ( uploa_ch == 'l' ) uploa = BLIS_LOWER; else if ( uploa_ch == 'u' ) uploa = BLIS_UPPER; else uploa = BLIS_DENSE; if ( part_n_dim ) { if ( bli_is_upper( uploa ) ) { t_begin = n_way-1; t_stop = -1; t_inc = -1; } else /* if lower or dense */ { t_begin = 0; t_stop = n_way; t_inc = 1; } } else // if ( part_m_dim ) { if ( bli_is_lower( uploa ) ) { t_begin = n_way-1; t_stop = -1; t_inc = -1; } else /* if upper or dense */ { t_begin = 0; t_stop = n_way; t_inc = 1; } } printf( "\n" ); printf( " part: %3s doff: %3ld bf: %3ld output: %s\n", ( part_n_dim ? ( go_fwd ? "l2r" : "r2l" ) : ( go_fwd ? "t2b" : "b2t" ) ), diagoffa, bf, ( out_ch == 'w' ? "width(area)" : "ranges" ) ); printf( " uplo: %3c nt: %3ld\n", uploa_ch, n_way ); printf( "\n" ); printf( " " ); for ( t = t_begin; t != t_stop; t += t_inc ) { if ( part_n_dim ) { if ( t == t_begin ) printf( "left... " ); else if ( t == t_stop-t_inc ) printf( " ...right" ); else printf( " " ); } else // if ( part_m_dim ) { if ( t == t_begin ) printf( "top... 
" ); else if ( t == t_stop-t_inc ) printf( " ...bottom" ); else printf( " " ); } } printf( "\n" ); printf( "%4c x %4c ", 'm', 'n' ); for ( t = t_begin; t != t_stop; t += t_inc ) { printf( "%9s %lu ", "thread", t ); } printf( "\n" ); printf( "-------------" ); for ( t = t_begin; t != t_stop; t += t_inc ) { printf( "-------------" ); } printf( "\n" ); for ( p = p_begin; p <= p_max; p += p_inc ) { if ( m_input < 0 ) m = ( dim_t )p; else m = ( dim_t )m_input; if ( n_input < 0 ) n = ( dim_t )p; else n = ( dim_t )n_input; dt = BLIS_DOUBLE; bli_obj_create( dt, m, n, 0, 0, &a ); bli_obj_set_struc( BLIS_TRIANGULAR, a ); bli_obj_set_uplo( uploa, a ); bli_obj_set_diag_offset( diagoffa, a ); bli_randm( &a ); printf( "%4lu x %4lu ", m, n ); for ( t = t_begin; t != t_stop; t += t_inc ) { thrinfo.n_way = n_way; thrinfo.work_id = t; if ( part_n_dim && go_fwd ) area = bli_get_range_weighted_l2r( &thrinfo, &a, bf, &start, &end ); else if ( part_n_dim && go_bwd ) area = bli_get_range_weighted_r2l( &thrinfo, &a, bf, &start, &end ); else if ( part_m_dim && go_fwd ) area = bli_get_range_weighted_t2b( &thrinfo, &a, bf, &start, &end ); else // ( part_m_dim && go_bwd ) area = bli_get_range_weighted_b2t( &thrinfo, &a, bf, &start, &end ); width = end - start; if ( out_ch == 'w' ) printf( "%4lu(%6lu) ", width, area ); else printf( "[%4lu,%4lu) ", start, end ); } printf( "\n" ); bli_obj_free( &a ); } bli_finalize(); return 0; }
siz_t bli_thread_get_range_weighted ( thrinfo_t* thread, doff_t diagoff, uplo_t uplo, dim_t m, dim_t n, dim_t bf, bool_t handle_edge_low, dim_t* j_start_thr, dim_t* j_end_thr ) { dim_t n_way = thread->n_way; dim_t my_id = thread->work_id; dim_t bf_left = n % bf; dim_t j; dim_t off_j; doff_t diagoff_j; dim_t n_left; dim_t width_j; dim_t offm_inc, offn_inc; double tri_dim, tri_area; double area_total, area_per_thr; siz_t area = 0; // In this function, we assume that the caller has already determined // that (a) the diagonal intersects the submatrix, and (b) the submatrix // is either lower- or upper-stored. if ( bli_is_lower( uplo ) ) { // Prune away the unstored region above the diagonal, if it exists, // and then to the right of where the diagonal intersects the bottom, // if it exists. (Also, we discard the offset deltas since we don't // need to actually index into the subpartition.) bli_prune_unstored_region_top_l( diagoff, m, n, offm_inc ); bli_prune_unstored_region_right_l( diagoff, m, n, offn_inc ); // We don't need offm_inc, offn_inc here. These statements should // prevent compiler warnings. ( void )offm_inc; ( void )offn_inc; // Now that pruning has taken place, we know that diagoff >= 0. // Compute the total area of the submatrix, accounting for the // location of the diagonal, and divide it by the number of ways // of parallelism. tri_dim = ( double )( n - diagoff - 1 ); tri_area = tri_dim * ( tri_dim + 1.0 ) / 2.0; area_total = ( double )m * ( double )n - tri_area; area_per_thr = area_total / ( double )n_way; // Initialize some variables prior to the loop: the offset to the // current subpartition, the remainder of the n dimension, and // the diagonal offset of the current subpartition. off_j = 0; diagoff_j = diagoff; n_left = n; // Iterate over the subpartition indices corresponding to each // thread/caucus participating in the n_way parallelism. 
for ( j = 0; j < n_way; ++j ) { // Compute the width of the jth subpartition, taking the // current diagonal offset into account, if needed. width_j = bli_thread_get_range_width_l( diagoff_j, m, n_left, j, n_way, bf, bf_left, area_per_thr, handle_edge_low ); // If the current thread belongs to caucus j, this is his // subpartition. So we compute the implied index range and // end our search. if ( j == my_id ) { *j_start_thr = off_j; *j_end_thr = off_j + width_j; area = bli_find_area_trap_l( m, width_j, diagoff_j ); break; } // Shift the current subpartition's starting and diagonal offsets, // as well as the remainder of the n dimension, according to the // computed width, and then iterate to the next subpartition. off_j += width_j; diagoff_j -= width_j; n_left -= width_j; } } else // if ( bli_is_upper( uplo ) ) { // Express the upper-stored case in terms of the lower-stored case. // First, we convert the upper-stored trapezoid to an equivalent // lower-stored trapezoid by rotating it 180 degrees. bli_rotate180_trapezoid( diagoff, uplo ); // Now that the trapezoid is "flipped" in the n dimension, negate // the bool that encodes whether to handle the edge case at the // low (or high) end of the index range. bli_toggle_bool( handle_edge_low ); // Compute the appropriate range for the rotated trapezoid. area = bli_thread_get_range_weighted( thread, diagoff, uplo, m, n, bf, handle_edge_low, j_start_thr, j_end_thr ); // Reverse the indexing basis for the subpartition ranges so that // the indices, relative to left-to-right iteration through the // unrotated upper-stored trapezoid, map to the correct columns // (relative to the diagonal). This amounts to subtracting the // range from n. bli_reverse_index_direction( *j_start_thr, *j_end_thr, n ); } return area; }