Пример #1
0
//
// Computes the sub-range [*start, *end) that thread thr should work on when
// the partitioned dimension is traversed in direction direct, and returns
// whatever the selected bli_thread_get_range_*() helper returns.
//
// The operation family stored in cntx decides (1) which operand is handed
// to the range helper and (2) whether the weighted or the plain helper is
// used. The blocksize multiple comes from the blocksize id stored in cntl,
// except for trsm, where it is overridden (see below). Operand a is not
// consulted by this function.
//
siz_t bli_thread_get_range_ndim
     (
       dir_t      direct,
       thrinfo_t* thr,
       obj_t*     a,
       obj_t*     b,
       obj_t*     c,
       cntl_t*    cntl,
       cntx_t*    cntx,
       dim_t*     start,
       dim_t*     end
     )
{
	bszid_t      bmult_id = bli_cntl_bszid( cntl );
	const opid_t op_fam   = bli_cntx_get_family( cntx );

	// trsm currently implements its right-side cases with left-side
	// micro-kernels, so the register blocksizes used for packing A and B
	// trade places. Select the blocksize id accordingly.
	if ( op_fam == BLIS_TRSM )
	{
		bmult_id = ( bli_obj_root_is_triangular( *b ) ? BLIS_MR : BLIS_NR );
	}

	blksz_t* const bmult = bli_cntx_get_bmult( bmult_id, cntx );

	// Only herk partitions with respect to C; gemm, trmm, and trsm (and
	// anything else) use B.
	obj_t* const part_obj = ( op_fam == BLIS_HERK ? c : b );

	// Weighted partitioning is used for herk and trmm only.
	// NOTE: hemm and symm (which belong to the gemm family) must NOT use
	// weighted partitioning: the weighted helper would skip the unstored
	// region of the structured matrix even though, once packed, that
	// region is dense and full.
	const bool_t weighted = ( op_fam == BLIS_HERK || op_fam == BLIS_TRMM );

	// Forward traversal maps to the left-to-right helpers ...
	if ( direct == BLIS_FWD )
	{
		return ( weighted
		         ? bli_thread_get_range_weighted_l2r( thr, part_obj, bmult, start, end )
		         : bli_thread_get_range_l2r( thr, part_obj, bmult, start, end ) );
	}

	// ... and any other direction maps to the right-to-left helpers.
	return ( weighted
	         ? bli_thread_get_range_weighted_r2l( thr, part_obj, bmult, start, end )
	         : bli_thread_get_range_r2l( thr, part_obj, bmult, start, end ) );
}
Пример #2
0
// Blocked trmm variant that walks the n dimension of B and C from right to
// left (partitions are acquired with bli_acquire_mpart_r2l()).
//
// Outline of what the code below does:
//   1. Prune unreferenced regions via bli_trmm_prune_unref_mparts_n().
//   2. The "ochief" thread initializes the pack object for A and invokes
//      bli_scalm_int() on C; the A pack object is then shared with the
//      other threads through bli_thread_obroadcast().
//   3. The "ichief" thread initializes the pack objects for B1 and C1,
//      which are shared through bli_thread_ibroadcast().
//   4. A is packed once; then each thread iterates over its own index range
//      [my_start, my_end), packing B1 and C1, calling bli_trmm_int() on the
//      packed operands, and unpacking C1.
//   5. Pack buffers are released by the respective chief threads.
//
// Parameters:
//   a, b, c - operands; b and c are partitioned, a is packed whole.
//   cntx    - context forwarded to every packm/scalm/trmm call.
//   cntl    - control tree node supplying the blocksize id and the
//             sub-nodes for packm, scalm, gemm, and unpackm.
//   thread  - thread info used for range selection, broadcasts, barriers,
//             and to obtain the sub-thrinfo objects passed downward.
//
// NOTE(review): the exact scope of the "o" and "i" communicators is not
// visible here; presumably "o" is the outer group that shares A and "i" the
// inner group that shares B1/C1 -- confirm against the thrinfo API.
void bli_trmm_blk_var2b( obj_t*  a,
                         obj_t*  b,
                         obj_t*  c,
                         cntx_t* cntx,
                         gemm_t* cntl,
                         thrinfo_t* thread )
{
    // Per-thread storage for the pack objects. Only the copies owned by the
    // chief threads are actually used; see the broadcasts below.
    obj_t a_pack_s;
    obj_t b1_pack_s, c1_pack_s;
    
    // Views of the current column panel of B and C (private to each thread).
    obj_t b1, c1; 
    // Pointers to the shared pack objects, assigned by the broadcasts below.
    obj_t*  a_pack = NULL;
    obj_t*  b1_pack = NULL;
    obj_t*  c1_pack = NULL;

	dim_t i;
	dim_t b_alg;

	// Prune any zero region that exists along the partitioning dimension.
	bli_trmm_prune_unref_mparts_n( a, b, c );

    if( bli_thread_am_ochief( thread ) ) { 
        // Initialize object for packing A
        bli_obj_init_pack( &a_pack_s );
        bli_packm_init( a, &a_pack_s,
                        cntx, bli_cntl_sub_packm_a( cntl ) );

        // Scale C by beta (if instructed).
        // Note that the scalar passed here is BLIS_ONE.
        bli_scalm_int( &BLIS_ONE,
                       c,  
                       cntx, bli_cntl_sub_scalm( cntl ) );
    }   
    // Every thread receives a pointer via the broadcast; presumably it is
    // the ochief's a_pack_s -- confirm against bli_thread_obroadcast().
    a_pack = bli_thread_obroadcast( thread, &a_pack_s );

    // Initialize pack objects for B and C that are passed into packm_init().
    if( bli_thread_am_ichief( thread ) ) { 
        bli_obj_init_pack( &b1_pack_s );
        bli_obj_init_pack( &c1_pack_s );
    }   
    b1_pack = bli_thread_ibroadcast( thread, &b1_pack_s );
    c1_pack = bli_thread_ibroadcast( thread, &c1_pack_s );

	// Pack A (if instructed).
	bli_packm_int( a, a_pack,
	               cntx, bli_cntl_sub_packm_a( cntl ),
                   bli_thrinfo_sub_opackm( thread ) );

    // Determine this thread's share [my_start, my_end) of the n dimension
    // using weighted right-to-left range partitioning on B, with the
    // blocksize multiple selected by this control tree node's blocksize id.
    dim_t my_start, my_end;
    bli_thread_get_range_weighted_r2l( thread, b,
                                bli_cntx_get_bmult( bli_cntl_bszid( cntl ), cntx ),
                                &my_start, &my_end );

	// Partition along the n dimension.
	for ( i = my_start; i < my_end; i += b_alg )
	{
		// Determine the current algorithmic blocksize.
		b_alg = bli_determine_blocksize_b( i, my_end, b,
		                                   bli_cntl_bszid( cntl ), cntx );

		// Acquire partitions for B1 and C1.
		bli_acquire_mpart_r2l( BLIS_SUBPART1,
		                       i, b_alg, b, &b1 );
		bli_acquire_mpart_r2l( BLIS_SUBPART1,
		                       i, b_alg, c, &c1 );

		// Initialize objects for packing B1 and C1.
        if( bli_thread_am_ichief( thread ) ) {
            bli_packm_init( &b1, b1_pack,
                            cntx, bli_cntl_sub_packm_b( cntl ) );
            bli_packm_init( &c1, c1_pack,
                            cntx, bli_cntl_sub_packm_c( cntl ) );
        }
        // Barrier placed between the chief's packm_init() calls and the
        // packing below; presumably so that no thread uses the shared pack
        // objects before they are initialized.
        bli_thread_ibarrier( thread );

		// Pack B1 (if instructed).
		bli_packm_int( &b1, b1_pack,
		               cntx, bli_cntl_sub_packm_b( cntl ),
                       bli_thrinfo_sub_ipackm( thread ) );

		// Pack C1 (if instructed).
		bli_packm_int( &c1, c1_pack,
		               cntx, bli_cntl_sub_packm_c( cntl ),
                       bli_thrinfo_sub_ipackm( thread ) );

		// Perform trmm subproblem.
		bli_trmm_int( &BLIS_ONE,
		              a_pack,
		              b1_pack,
		              &BLIS_ONE,
		              c1_pack,
		              cntx,
		              bli_cntl_sub_gemm( cntl ),
                      bli_thrinfo_sub_self( thread ) );
        // Barrier placed between the subproblem and the unpack of C1 (and
        // the re-initialization of the pack objects in the next iteration).
        bli_thread_ibarrier( thread );

        // Unpack C1 (if C1 was packed).
        bli_unpackm_int( c1_pack, &c1,
                         cntx, bli_cntl_sub_unpackm_c( cntl ),
                         bli_thrinfo_sub_ipackm( thread ) );
	}

	// If any packing buffers were acquired within packm, release them back
	// to the memory manager.
    // The outer barrier precedes the release of the shared A pack buffer;
    // each buffer is released only by the chief that initialized it.
    bli_thread_obarrier( thread );
    if( bli_thread_am_ochief( thread ) )
        bli_packm_release( a_pack, bli_cntl_sub_packm_a( cntl ) );
    if( bli_thread_am_ichief( thread ) ) {
        bli_packm_release( b1_pack, bli_cntl_sub_packm_b( cntl ) );
        bli_packm_release( c1_pack, bli_cntl_sub_packm_c( cntl ) );
    }
}
Пример #3
0
int main( int argc, char** argv )
{
	//bli_init();

#if 0
	obj_t a, b, c;
	obj_t aa, bb, cc;
	dim_t m, n, k;
	num_t dt;
	uplo_t uploa, uplob, uploc;

	{
		dt = BLIS_DOUBLE;

		m = 6;
		k = 6;
		n = 6;

		bli_obj_create( dt, m, k, 0, 0, &a );
		bli_obj_create( dt, k, n, 0, 0, &b );
		bli_obj_create( dt, m, n, 0, 0, &c );

		uploa = BLIS_UPPER;
		uploa = BLIS_LOWER;
		bli_obj_set_struc( BLIS_TRIANGULAR, &a );
		bli_obj_set_uplo( uploa, &a );
		bli_obj_set_diag_offset( -2, &a );

		uplob = BLIS_UPPER;
		uplob = BLIS_LOWER;
		bli_obj_set_struc( BLIS_TRIANGULAR, &b );
		bli_obj_set_uplo( uplob, &b );
		bli_obj_set_diag_offset( -2, &b );

		uploc = BLIS_UPPER;
		//uploc = BLIS_LOWER;
		//uploc = BLIS_ZEROS;
		//uploc = BLIS_DENSE;
		bli_obj_set_struc( BLIS_HERMITIAN, &c );
		//bli_obj_set_struc( BLIS_TRIANGULAR, &c );
		bli_obj_set_uplo( uploc, &c );
		bli_obj_set_diag_offset(  1, &c );

		bli_obj_alias_to( &a, &aa ); (void)aa;
		bli_obj_alias_to( &b, &bb ); (void)bb;
		bli_obj_alias_to( &c, &cc ); (void)cc;

		bli_randm( &a );
		bli_randm( &b );
		bli_randm( &c );
		//bli_mkherm( &a );
		//bli_mktrim( &a );

		bli_prune_unref_mparts( &cc, BLIS_M,
		                        &aa, BLIS_N );

		bli_printm( "c orig", &c, "%4.1f", "" );
		bli_printm( "c alias", &cc, "%4.1f", "" );
		bli_printm( "a orig", &a, "%4.1f", "" );
		bli_printm( "a alias", &aa, "%4.1f", "" );
		//bli_obj_print( "a struct", &a );
	}
#endif

	dim_t  p_begin, p_max, p_inc;
	gint_t m_input, n_input;
	char   uploa_ch;
	doff_t diagoffa;
	dim_t  bf;
	dim_t  n_way;
	char   part_dim_ch;
	bool_t go_fwd;
	char   out_ch;

	obj_t   a;
	blksz_t bfs;

	thrinfo_t thrinfo;
	dim_t  m, n;
	uplo_t uploa;
	bool_t part_m_dim, part_n_dim;
	bool_t go_bwd;
	dim_t  p;
	num_t  dt;
	dim_t  start, end;

	dim_t  width;
	siz_t  area;

	gint_t t_begin, t_stop, t_inc;
	dim_t  t;

	if ( argc == 13 )
	{
		sscanf( argv[1], "%u", &p_begin );
		sscanf( argv[2], "%u", &p_max );
		sscanf( argv[3], "%u", &p_inc );
		sscanf( argv[4], "%d", &m_input );
		sscanf( argv[5], "%d", &n_input );
		sscanf( argv[6], "%c",  &uploa_ch );
		sscanf( argv[7], "%d", &diagoffa );
		sscanf( argv[8], "%u", &bf );
		sscanf( argv[9], "%u", &n_way );
		sscanf( argv[10], "%c", &part_dim_ch );
		sscanf( argv[11], "%u", &go_fwd );
		sscanf( argv[12], "%c", &out_ch );
	}
	else
	{
		printf( "\n" );
		printf( " %s\n", argv[0] );
		printf( "\n" );
		printf( "  Simulate the dimension ranges assigned to threads when\n" );
		printf( "  partitioning a matrix for parallelism in BLIS.\n" );
		printf( "\n" );
		printf( " Usage:\n" );
		printf( "\n" );
		printf( "  %s p_beg p_max p_inc m n uplo doff bf n_way part_dim go_fwd out\n", argv[0] );
		printf( "\n" );
		printf( "  p_beg:    the first problem size p to test.\n" );
		printf( "  p_max:    the maximum problem size p to test.\n" );
		printf( "  p_inc:    the increase in problem size p between tests.\n" );
		printf( "  m:        the m dimension:\n" );
		printf( "  n:        the n dimension:\n" );
		printf( "            if m,n = -1: bind m,n to problem size p.\n" );
		printf( "            if m,n =  0: bind m,n to p_max.\n" );
		printf( "            if m,n >  0: hold m,n = c constant for all p.\n" );
		printf( "  uplo:     the uplo field of the matrix being partitioned:\n" );
		printf( "            'l': lower-stored (BLIS_LOWER)\n" );
		printf( "            'u': upper-stored (BLIS_UPPER)\n" );
		printf( "            'd': densely-stored (BLIS_DENSE)\n" );
		printf( "  doff:     the diagonal offset of the matrix being partitioned.\n" );
		printf( "  bf:       the simulated blocking factor. all thread ranges must\n" );
		printf( "            be a multiple of bf, except for the range that contains\n" );
		printf( "            the edge case (if one exists). the blocking factor\n" );
		printf( "            would typically correspond to a register blocksize.\n" );
		printf( "  n_way:    the number of ways of parallelism for which we are\n" );
		printf( "            partitioning (i.e.: the number of threads, or thread\n" );
		printf( "            groups).\n" );
		printf( "  part_dim: the dimension to partition:\n" );
		printf( "            'm': partition the m dimension.\n" );
		printf( "            'n': partition the n dimension.\n" );
		printf( "  go_fwd:   the direction to partition:\n" );
		printf( "            '1': forward, e.g. left-to-right (part_dim = 'm') or\n" );
		printf( "                 top-to-bottom (part_dim = 'n')\n" );
		printf( "            '0': backward, e.g. right-to-left (part_dim = 'm') or\n" );
		printf( "                 bottom-to-top (part_dim = 'n')\n" );
		printf( "            NOTE: reversing the direction does not change the\n" );
		printf( "            subpartitions' widths, but it does change which end of\n" );
		printf( "            the index range receives the edge case, if it exists.\n" );
		printf( "  out:      the type of output per thread-column:\n" );
		printf( "            'w': the width (and area) of the thread's subpartition\n" );
		printf( "            'r': the actual ranges of the thread's subpartition\n" );
		printf( "                 where the start and end points of each range are\n" );
		printf( "                 inclusive and exclusive, respectively.\n" );
		printf( "\n" );

		exit(1);
	}

	if ( m_input == 0 ) m_input = p_max;
	if ( n_input == 0 ) n_input = p_max;

	if ( part_dim_ch == 'm' ) { part_m_dim = TRUE;  part_n_dim = FALSE; }
	else                      { part_m_dim = FALSE; part_n_dim = TRUE;  }

	go_bwd = !go_fwd;

	if      ( uploa_ch == 'l' ) uploa = BLIS_LOWER;
	else if ( uploa_ch == 'u' ) uploa = BLIS_UPPER;
	else                        uploa = BLIS_DENSE;

	if ( part_n_dim )
	{
		if ( bli_is_upper( uploa ) ) { t_begin = n_way-1; t_stop = -1;    t_inc = -1; }
		else /* if lower or dense */ { t_begin = 0;       t_stop = n_way; t_inc =  1; }
	}
	else // if ( part_m_dim )
	{
		if ( bli_is_lower( uploa ) ) { t_begin = n_way-1; t_stop = -1;    t_inc = -1; }
		else /* if upper or dense */ { t_begin = 0;       t_stop = n_way; t_inc =  1; }
	}

	printf( "\n" );
	printf( "  part: %3s   doff: %3d   bf: %3d   output: %s\n",
	        ( part_n_dim ? ( go_fwd ? "l2r" : "r2l" )
	                     : ( go_fwd ? "t2b" : "b2t" ) ),
	        ( int )diagoffa, ( int )bf,
            ( out_ch == 'w' ? "width(area)" : "ranges" ) );
	printf( "              uplo: %3c   nt: %3u\n", uploa_ch, ( unsigned )n_way );
	printf( "\n" );

	printf( "             " );
	for ( t = t_begin; t != t_stop; t += t_inc )
	{
		if ( part_n_dim )
		{
			if      ( t == t_begin      ) printf( "left...      " );
			else if ( t == t_stop-t_inc ) printf( "     ...right" );
			else                          printf( "             " );
		}
		else // if ( part_m_dim )
		{
			if      ( t == t_begin      ) printf( "top...       " );
			else if ( t == t_stop-t_inc ) printf( "    ...bottom" );
			else                          printf( "             " );
		}
	}
	printf( "\n" );


	printf( "%4c x %4c  ", 'm', 'n' );
	for ( t = t_begin; t != t_stop; t += t_inc )
	{
		printf( "%9s %u  ", "thread", ( unsigned )t );
	}
	printf( "\n" );
	printf( "-------------" );
	for ( t = t_begin; t != t_stop; t += t_inc )
	{
		printf( "-------------" );
	}
	printf( "\n" );


	for ( p = p_begin; p <= p_max; p += p_inc )
	{
		if ( m_input < 0 ) m = ( dim_t )p;
		else               m = ( dim_t )m_input;
		if ( n_input < 0 ) n = ( dim_t )p;
		else               n = ( dim_t )n_input;

		dt = BLIS_DOUBLE;
		
		bli_obj_create( dt, m, n, 0, 0, &a );

		bli_obj_set_struc( BLIS_TRIANGULAR, &a );
		bli_obj_set_uplo( uploa, &a );
		bli_obj_set_diag_offset( diagoffa, &a );

		bli_randm( &a );

		bli_blksz_init_easy( &bfs, bf, bf, bf, bf );

		printf( "%4u x %4u  ", ( unsigned )m, ( unsigned )n );

		for ( t = t_begin; t != t_stop; t += t_inc )
		{
			thrinfo.n_way   = n_way;
			thrinfo.work_id = t;

			if      ( part_n_dim && go_fwd )
				area = bli_thread_get_range_weighted_l2r( &thrinfo, &a, &bfs, &start, &end );
			else if ( part_n_dim && go_bwd )
				area = bli_thread_get_range_weighted_r2l( &thrinfo, &a, &bfs, &start, &end );
			else if ( part_m_dim && go_fwd )
				area = bli_thread_get_range_weighted_t2b( &thrinfo, &a, &bfs, &start, &end );
			else // ( part_m_dim && go_bwd )
				area = bli_thread_get_range_weighted_b2t( &thrinfo, &a, &bfs, &start, &end );

			width = end - start;

			if ( out_ch == 'w' ) printf( "%4u(%6u) ", ( unsigned )width,
			                                            ( unsigned )area );
			else                 printf( "[%4u,%4u)  ", ( unsigned )start,
			                                              ( unsigned )end );
		}

		printf( "\n" );

		bli_obj_free( &a );
	}

	//bli_finalize();

	return 0;
}