Example #1
siz_t bli_thread_get_range_weighted_b2t
     (
       thrinfo_t* thr,
       obj_t*     a,
       blksz_t*   bmult,
       dim_t*     start,
       dim_t*     end
     )
{
	siz_t area;

	// This function assigns area-weighted ranges in the m dimension
	// where the total range spans 0 to m-1 with 0 at the bottom end and
	// m-1 at the top end.

	if ( bli_obj_intersects_diag( *a ) &&
	     bli_obj_is_upper_or_lower( *a ) )
	{
		doff_t diagoff = bli_obj_diag_offset( *a );
		uplo_t uplo    = bli_obj_uplo( *a );
		dim_t  m       = bli_obj_length( *a );
		dim_t  n       = bli_obj_width( *a );
		dim_t  bf      = bli_blksz_get_def_for_obj( a, bmult );

		// Support implicit transposition.
		if ( bli_obj_has_trans( *a ) )
		{
			bli_reflect_about_diag( diagoff, uplo, m, n );
		}

		bli_reflect_about_diag( diagoff, uplo, m, n );

		bli_rotate180_trapezoid( diagoff, uplo );

		area = bli_thread_get_range_weighted_sub
		(
		  thr, diagoff, uplo, m, n, bf,
		  TRUE, start, end
		);
	}
	else // if dense or zeros
	{
		area = bli_thread_get_range_b2t
		(
		  thr, a, bmult,
		  start, end
		);
	}

	return area;
}
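
The weighted range functions above ultimately hand each thread a contiguous [start, end) subrange of the m dimension; the area weighting only changes how wide each thread's subrange is when A is trapezoidal. As a rough illustration of what start/end mean, here is a minimal, self-contained sketch of the unweighted case (the dense branch): the range [0, m) is split into chunks whose sizes are multiples of the blocking factor bf. The helper name and the exact rounding policy are assumptions for illustration only, not the BLIS implementation.

#include <stdio.h>

/* Hypothetical helper (not a BLIS function): splits the range [0,m) into
   n_way contiguous chunks whose sizes are multiples of the blocking
   factor bf (except possibly the last chunk, which absorbs the edge). */
static void range_unweighted( int work_id, int n_way, int m, int bf,
                              int* start, int* end )
{
	int n_bf        = ( m + bf - 1 ) / bf;  /* number of bf-sized blocks */
	int bf_lo       = n_bf / n_way;         /* blocks per thread (floor) */
	int bf_left     = n_bf % n_way;         /* leftover blocks           */
	int n_bf_mine   = bf_lo + ( work_id < bf_left ? 1 : 0 );

	/* Sum the blocks assigned to lower-numbered threads. */
	int n_bf_before = work_id * bf_lo + ( work_id < bf_left ? work_id : bf_left );

	*start = n_bf_before * bf;
	*end   = *start + n_bf_mine * bf;
	if ( *end > m ) *end = m;               /* clip the edge case */
}

int main( void )
{
	int start, end;
	for ( int t = 0; t < 4; t++ )
	{
		range_unweighted( t, 4, 1000, 8, &start, &end );
		printf( "thread %d: [%d, %d)\n", t, start, end );
	}
	return 0;
}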
Example #2
void bli_obj_print( char* label, obj_t* obj )
{
	FILE*  file     = stdout;

	if ( bli_error_checking_is_enabled() )
		bli_obj_print_check( label, obj );

	fprintf( file, "\n" );
	fprintf( file, "%s\n", label );
	fprintf( file, "\n" );

	fprintf( file, " m x n           %lu x %lu\n", ( unsigned long int )bli_obj_length( *obj ),
	                                               ( unsigned long int )bli_obj_width( *obj ) );
	fprintf( file, "\n" );

	fprintf( file, " offm, offn      %lu, %lu\n", ( unsigned long int )bli_obj_row_off( *obj ),
	                                              ( unsigned long int )bli_obj_col_off( *obj ) );
	fprintf( file, " diagoff         %ld\n", ( signed long int )bli_obj_diag_offset( *obj ) );
	fprintf( file, "\n" );

	fprintf( file, " buf             %p\n",  ( void* )bli_obj_buffer( *obj ) );
	fprintf( file, " elem size       %lu\n", ( unsigned long int )bli_obj_elem_size( *obj ) );
	fprintf( file, " rs, cs          %ld, %ld\n", ( signed long int )bli_obj_row_stride( *obj ),
	                                              ( signed long int )bli_obj_col_stride( *obj ) );
	fprintf( file, " is              %ld\n", ( signed long int )bli_obj_imag_stride( *obj ) );
	fprintf( file, " m_padded        %lu\n", ( unsigned long int )bli_obj_padded_length( *obj ) );
	fprintf( file, " n_padded        %lu\n", ( unsigned long int )bli_obj_padded_width( *obj ) );
	fprintf( file, " ps              %lu\n", ( unsigned long int )bli_obj_panel_stride( *obj ) );
	fprintf( file, "\n" );

	fprintf( file, " info            %lX\n", ( unsigned long int )(*obj).info );
	fprintf( file, " - is complex    %lu\n", ( unsigned long int )bli_obj_is_complex( *obj ) );
	fprintf( file, " - is d. prec    %lu\n", ( unsigned long int )bli_obj_is_double_precision( *obj ) );
	fprintf( file, " - datatype      %lu\n", ( unsigned long int )bli_obj_datatype( *obj ) );
	fprintf( file, " - target dt     %lu\n", ( unsigned long int )bli_obj_target_datatype( *obj ) );
	fprintf( file, " - exec dt       %lu\n", ( unsigned long int )bli_obj_execution_datatype( *obj ) );
	fprintf( file, " - has trans     %lu\n", ( unsigned long int )bli_obj_has_trans( *obj ) );
	fprintf( file, " - has conj      %lu\n", ( unsigned long int )bli_obj_has_conj( *obj ) );
	fprintf( file, " - unit diag?    %lu\n", ( unsigned long int )bli_obj_has_unit_diag( *obj ) );
	fprintf( file, " - struc type    %lu\n", ( unsigned long int )bli_obj_struc( *obj ) >> BLIS_STRUC_SHIFT );
	fprintf( file, " - uplo type     %lu\n", ( unsigned long int )bli_obj_uplo( *obj ) >> BLIS_UPLO_SHIFT );
	fprintf( file, "   - is upper    %lu\n", ( unsigned long int )bli_obj_is_upper( *obj ) );
	fprintf( file, "   - is lower    %lu\n", ( unsigned long int )bli_obj_is_lower( *obj ) );
	fprintf( file, "   - is dense    %lu\n", ( unsigned long int )bli_obj_is_dense( *obj ) );
	fprintf( file, " - pack schema   %lu\n", ( unsigned long int )bli_obj_pack_schema( *obj ) >> BLIS_PACK_SCHEMA_SHIFT );
	fprintf( file, " - packinv diag? %lu\n", ( unsigned long int )bli_obj_has_inverted_diag( *obj ) );
	fprintf( file, " - pack ordifup  %lu\n", ( unsigned long int )bli_obj_is_pack_rev_if_upper( *obj ) );
	fprintf( file, " - pack ordiflo  %lu\n", ( unsigned long int )bli_obj_is_pack_rev_if_lower( *obj ) );
	fprintf( file, " - packbuf type  %lu\n", ( unsigned long int )bli_obj_pack_buffer_type( *obj ) >> BLIS_PACK_BUFFER_SHIFT );
	fprintf( file, "\n" );
}
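
A short usage sketch, assuming a standard BLIS installation and the public object API (bli_init, bli_obj_create, bli_randm, bli_obj_free): create a small matrix, fill it with random values, and dump its metadata with bli_obj_print.

#include "blis.h"

int main( void )
{
	obj_t a;

	bli_init();

	// Create a 4 x 3 double-precision real matrix with default
	// (column-major) strides and fill it with random values.
	bli_obj_create( BLIS_DOUBLE, 4, 3, 0, 0, &a );
	bli_randm( &a );

	// Print the object's dimensions, strides, and info bitfield.
	bli_obj_print( "matrix 'a'", &a );

	bli_obj_free( &a );
	bli_finalize();

	return 0;
}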
Example #3
err_t bli_gemmsup_ref
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm
     )
{
	// Check parameters.
	if ( bli_error_checking_is_enabled() )
		bli_gemm_check( alpha, a, b, beta, c, cntx );

#if 0
	// FGVZ: The datatype-specific variant is now responsible for checking for
	// alpha == 0.0.

	// If alpha is zero, scale by beta and return.
	if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
	{
		bli_scalm( beta, c );
		return BLIS_SUCCESS;
	}
#endif

#if 0
	// FGVZ: Will this be needed for constructing thrinfo_t's (recall: the
	// sba needs to be attached to the rntm; see below)? Or will those nodes
	// just be created "locally," in an exposed manner?

	// Parse and interpret the contents of the rntm_t object to properly
	// set the ways of parallelism for each loop, and then make any
	// additional modifications necessary for the current operation.
	bli_rntm_set_ways_for_op
	(
	  BLIS_GEMM,
	  BLIS_LEFT, // ignored for gemm/hemm/symm
	  bli_obj_length( &c_local ),
	  bli_obj_width( &c_local ),
	  bli_obj_width( &a_local ),
	  rntm
	);

	// FGVZ: the sba needs to be attached to the rntm. But it needs
	// to be done in the thread region, since it needs a thread id.
	//bli_sba_rntm_set_pool( tid, array, rntm_p );
#endif

#if 0
	// FGVZ: The datatype-specific variant is now responsible for inducing a
	// transposition, if needed.

	// Induce transpositions on A and/or B if either object is marked for
	// transposition. We can induce "fast" transpositions since the objects
	// are guaranteed to not have structure or be packed.
	if ( bli_obj_has_trans( a ) )
	{
		bli_obj_induce_fast_trans( a );
		bli_obj_toggle_trans( a );
	}
	if ( bli_obj_has_trans( b ) )
	{
		bli_obj_induce_fast_trans( b );
		bli_obj_toggle_trans( b );
	}
#endif

#if 0
	//bli_gemmsup_ref_var2
	//bli_gemmsup_ref_var1
	#if 0
	bli_gemmsup_ref_var1n
	#else
	#endif
	const stor3_t stor_id = bli_obj_stor3_from_strides( c, a, b );
	const bool_t  is_rrr_rrc_rcr_crr = ( stor_id == BLIS_RRR ||
	                                     stor_id == BLIS_RRC ||
	                                     stor_id == BLIS_RCR ||
	                                     stor_id == BLIS_CRR );
	if ( is_rrr_rrc_rcr_crr )
	{
		bli_gemmsup_ref_var2m
		(
		  BLIS_NO_TRANSPOSE, alpha, a, b, beta, c, stor_id, cntx, rntm
		);
	}
	else
	{
		bli_gemmsup_ref_var2m
		(
		  BLIS_TRANSPOSE, alpha, a, b, beta, c, stor_id, cntx, rntm
		);
	}
#else
	const stor3_t stor_id = bli_obj_stor3_from_strides( c, a, b );

	// Don't use the small/unpacked implementation if one of the matrices
	// uses general stride.
	if ( stor_id == BLIS_XXX ) return BLIS_FAILURE;

	const bool_t  is_rrr_rrc_rcr_crr = ( stor_id == BLIS_RRR ||
	                                     stor_id == BLIS_RRC ||
	                                     stor_id == BLIS_RCR ||
	                                     stor_id == BLIS_CRR );
	const bool_t  is_rcc_crc_ccr_ccc = !is_rrr_rrc_rcr_crr;

	const num_t   dt       = bli_obj_dt( c );
	const bool_t  row_pref = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx );

	const bool_t  is_primary = ( row_pref ? is_rrr_rrc_rcr_crr
	                                      : is_rcc_crc_ccr_ccc );

	if ( is_primary )
	{
		// This branch handles:
		//  - rrr rrc rcr crr for row-preferential kernels
		//  - rcc crc ccr ccc for column-preferential kernels

		const dim_t m  = bli_obj_length( c );
		const dim_t n  = bli_obj_width( c );
		const dim_t NR = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx );
		const dim_t MR = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx );
		const dim_t mu = m / MR;
		const dim_t nu = n / NR;

		if ( mu >= nu )
		{
			// block-panel macrokernel; m -> mc, mr; n -> nc, nr: var2()
			bli_gemmsup_ref_var2m( BLIS_NO_TRANSPOSE,
			                       alpha, a, b, beta, c, stor_id, cntx, rntm );
		}
		else // if ( mu < nu )
		{
			// panel-block macrokernel; m -> nc*,mr; n -> mc*,nr: var1()
			bli_gemmsup_ref_var1n( BLIS_NO_TRANSPOSE,
			                       alpha, a, b, beta, c, stor_id, cntx, rntm );
		}
	}
	else
	{
		// This branch handles:
		//  - rrr rrc rcr crr for column-preferential kernels
		//  - rcc crc ccr ccc for row-preferential kernels

		const dim_t mt = bli_obj_width( c );
		const dim_t nt = bli_obj_length( c );
		const dim_t NR = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx );
		const dim_t MR = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx );
		const dim_t mu = mt / MR;
		const dim_t nu = nt / NR;

		if ( mu >= nu )
		{
			// panel-block macrokernel; m -> nc, nr; n -> mc, mr: var2() + trans
			bli_gemmsup_ref_var2m( BLIS_TRANSPOSE,
			                       alpha, a, b, beta, c, stor_id, cntx, rntm );
		}
		else // if ( mu < nu )
		{
			// block-panel macrokernel; m -> mc*,nr; n -> nc*,mr: var1() + trans
			bli_gemmsup_ref_var1n( BLIS_TRANSPOSE,
			                       alpha, a, b, beta, c, stor_id, cntx, rntm );
		}
		// *requires nudging of mc,nc up to be a multiple of nr,mr.
	}
#endif

	// Return success so that the caller knows that we computed the solution.
	return BLIS_SUCCESS;
}
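
The choice between var2m (block-panel) and var1n (panel-block) above comes down to comparing mu = m/MR against nu = n/NR, i.e. whether C has more micro-panels in the m dimension or the n dimension. A tiny standalone sketch of just that heuristic, with MR/NR hard-coded rather than queried from the context:

#include <stdio.h>

/* Illustrative only: mimics the mu/nu heuristic above. In BLIS, MR and NR
   would come from the context (bli_cntx_get_blksz_def_dt); here they are
   fixed constants. */
static const char* choose_macrokernel( int m, int n, int MR, int NR )
{
	int mu = m / MR;   /* number of full micro-panel rows    */
	int nu = n / NR;   /* number of full micro-panel columns */

	/* More micro-panels in m than in n: favor the block-panel
	   variant (var2m); otherwise favor panel-block (var1n). */
	return ( mu >= nu ) ? "var2m (block-panel)" : "var1n (panel-block)";
}

int main( void )
{
	printf( "%s\n", choose_macrokernel( 480, 64, 6, 8 ) );  /* tall C */
	printf( "%s\n", choose_macrokernel( 64, 480, 6, 8 ) );  /* wide C */
	return 0;
}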
Example #4
void bli_trmm3_front( side_t  side,
                      obj_t*  alpha,
                      obj_t*  a,
                      obj_t*  b,
                      obj_t*  beta,
                      obj_t*  c,
                      trmm_t* l_cntl,
                      trmm_t* r_cntl )
{
	trmm_t* cntl;
	obj_t   a_local;
	obj_t   b_local;
	obj_t   c_local;

	// Check parameters.
	if ( bli_error_checking_is_enabled() )
		bli_trmm3_check( side, alpha, a, b, beta, c );

	// If alpha is zero, scale by beta and return.
	if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
	{
		bli_scalm( beta, c );
		return;
	}

	// Alias A, B, and C so we can tweak the objects if necessary.
	bli_obj_alias_to( *a, a_local );
	bli_obj_alias_to( *b, b_local );
	bli_obj_alias_to( *c, c_local );

	// We do not explicitly implement the cases where A is transposed.
	// However, we can still handle them. Specifically, if A is marked as
	// needing a transposition, we simply induce a transposition. This
	// allows us to only explicitly implement the no-transpose cases. Once
	// the transposition is induced, the correct algorithm will be called,
	// since, for example, an algorithm over a transposed lower triangular
	// matrix A moves in the same direction (forwards) as a non-transposed
	// upper triangular matrix. And with the transposition induced, the
	// matrix now appears to be upper triangular, so the upper triangular
	// algorithm will grab the correct partitions, as if it were upper
	// triangular (with no transpose) all along.
	if ( bli_obj_has_trans( a_local ) )
	{
		bli_obj_induce_trans( a_local );
		bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, a_local );
	}

#if 0
	if ( bli_is_right( side ) )
	{
		bli_obj_induce_trans( a_local );
		bli_obj_induce_trans( b_local );
		bli_obj_induce_trans( c_local );

		bli_toggle_side( side );
	}
#endif

#if 1
	// If A is being multiplied from the right, swap A and B so that
	// the triangular matrix will actually be on the right.
	if ( bli_is_right( side ) )
	{
		bli_obj_swap( a_local, b_local );
	}

	// An optimization: If C is row-stored, transpose the entire operation
	// so as to allow the macro-kernel more favorable access patterns
	// through C. (The effect of the transposition of A and B is negligible
	// because those operands are always packed to contiguous memory.)
	if ( bli_obj_is_row_stored( c_local ) )
	{
		bli_obj_swap( a_local, b_local );

		bli_obj_induce_trans( a_local );
		bli_obj_induce_trans( b_local );
		bli_obj_induce_trans( c_local );

		bli_toggle_side( side );
	}
#endif

	// Set each alias as the root object.
	// NOTE: We MUST wait until we are done potentially swapping the objects
	// before setting the root fields!
	bli_obj_set_as_root( a_local );
	bli_obj_set_as_root( b_local );
	bli_obj_set_as_root( c_local );

	// Choose the control tree.
	if ( bli_is_left( side ) ) cntl = l_cntl;
	else                       cntl = r_cntl;

    trmm_thrinfo_t** infos = bli_create_trmm_thrinfo_paths( FALSE );
    dim_t n_threads = thread_num_threads( infos[0] );

    // Invoke the internal back-end.
    bli_level3_thread_decorator( n_threads,   
                                 (level3_int_t) bli_trmm_int, 
                                 alpha, 
                                 &a_local,  
                                 &b_local,  
                                 beta, 
                                 &c_local,  
                                 (void*) cntl, 
                                 (void**) infos );

    bli_trmm_thrinfo_free_paths( infos, n_threads );
}
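
For reference, user code would normally reach this front-end through the public trmm3 interface rather than calling it directly. A minimal sketch, assuming a current BLIS build with the pointer-based object API (bli_trmm3, bli_obj_set_struc, bli_obj_set_uplo, bli_setsc); the scalar setup and dimensions are illustrative:

#include "blis.h"

int main( void )
{
	obj_t a, b, c;
	obj_t alpha, beta;

	bli_init();

	// A is 5 x 5 and will be marked lower triangular; B and C are 5 x 4.
	bli_obj_create( BLIS_DOUBLE, 5, 5, 0, 0, &a );
	bli_obj_create( BLIS_DOUBLE, 5, 4, 0, 0, &b );
	bli_obj_create( BLIS_DOUBLE, 5, 4, 0, 0, &c );
	bli_obj_create_1x1( BLIS_DOUBLE, &alpha );
	bli_obj_create_1x1( BLIS_DOUBLE, &beta );

	bli_randm( &a );
	bli_randm( &b );
	bli_randm( &c );
	bli_setsc( 1.0, 0.0, &alpha );
	bli_setsc( 0.5, 0.0, &beta );

	// Mark A as lower triangular so the front-end picks the
	// triangular (left-side) algorithm.
	bli_obj_set_struc( BLIS_TRIANGULAR, &a );
	bli_obj_set_uplo( BLIS_LOWER, &a );

	// C := beta * C + alpha * A * B, with the triangular A on the left.
	bli_trmm3( BLIS_LEFT, &alpha, &a, &b, &beta, &c );

	bli_obj_free( &a ); bli_obj_free( &b ); bli_obj_free( &c );
	bli_obj_free( &alpha ); bli_obj_free( &beta );
	bli_finalize();

	return 0;
}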
Example #5
void bli_trmm_int( obj_t*  alpha,
                   obj_t*  a,
                   obj_t*  b,
                   obj_t*  beta,
                   obj_t*  c,
                   trmm_t* cntl )
{
	obj_t     a_local;
	obj_t     b_local;
	obj_t     c_local;
	bool_t    side, uplo;
	varnum_t  n;
	impl_t    i;
	FUNCPTR_T f;

	// Check parameters.
	if ( bli_error_checking_is_enabled() )
		bli_trmm_int_check( alpha, a, b, beta, c, cntl );

	// If C has a zero dimension, return early.
	if ( bli_obj_has_zero_dim( *c ) ) return;

	// If A or B has a zero dimension, scale C by beta and return early.
	if ( bli_obj_has_zero_dim( *a ) ||
	     bli_obj_has_zero_dim( *b ) )
	{
		bli_scalm( beta, c );
		return;
	}

	// Alias A and B in case we need to update attached scalars.
	bli_obj_alias_to( *a, a_local );
	bli_obj_alias_to( *b, b_local );

	// Alias C in case we need to induce a transposition.
	bli_obj_alias_to( *c, c_local );

	// If we are about to call a leaf-level implementation, and matrix C
	// still needs a transposition, then we must induce one by swapping the
	// strides and dimensions. Note that this transposition would normally
	// be handled explicitly in the packing of C, but if C is not being
	// packed, this is our last chance to handle the transposition.
	if ( cntl_is_leaf( cntl ) && bli_obj_has_trans( *c ) )
	{
		bli_obj_induce_trans( c_local );
		bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local );
	}

	// If alpha is non-unit, typecast and apply it to the scalar attached
	// to B.
	if ( !bli_obj_equals( alpha, &BLIS_ONE ) )
	{
		bli_obj_scalar_apply_scalar( alpha, &b_local );
	}

	// If beta is non-unit, typecast and apply it to the scalar attached
	// to C.
	if ( !bli_obj_equals( beta, &BLIS_ONE ) )
	{
		bli_obj_scalar_apply_scalar( beta, &c_local );
	}

	// Set two bools: one based on the implied side parameter (the structure
	// of the root object) and one based on the uplo field of the triangular
	// matrix's root object (whether that is matrix A or matrix B).
	if ( bli_obj_root_is_triangular( *a ) )
	{
		side = 0;
		if ( bli_obj_root_is_lower( *a ) ) uplo = 0;
		else                               uplo = 1;
	}
	else // if ( bli_obj_root_is_triangular( *b ) )
	{
		side = 1;
		// Set a bool based on the uplo field of B's root object.
		if ( bli_obj_root_is_lower( *b ) ) uplo = 0;
		else                               uplo = 1;
	}

	// Extract the variant number and implementation type.
	n = cntl_var_num( cntl );
	i = cntl_impl_type( cntl );

	// Index into the variant array to extract the correct function pointer.
	f = vars[side][uplo][n][i];

	// Invoke the variant.
	f( &a_local,
	   &b_local,
	   &c_local,
	   cntl );
}
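
The vars[side][uplo][n][i] lookup above is just a statically initialized array of function pointers. A self-contained sketch of the pattern, using stand-in variant functions (hypothetical names, not BLIS symbols):

#include <stdio.h>

typedef void (*var_fp)( void );

static void var_l_lo_blk( void ) { puts( "left,  lower, blocked" ); }
static void var_l_up_blk( void ) { puts( "left,  upper, blocked" ); }
static void var_r_lo_blk( void ) { puts( "right, lower, blocked" ); }
static void var_r_up_blk( void ) { puts( "right, upper, blocked" ); }

/* [side][uplo][variant number][implementation type]; here just one
   variant and one implementation per (side, uplo) pair. */
static var_fp vars[2][2][1][1] =
{
	{ { { var_l_lo_blk } }, { { var_l_up_blk } } },
	{ { { var_r_lo_blk } }, { { var_r_up_blk } } },
};

int main( void )
{
	int side = 1;  /* 0 = left, 1 = right  */
	int uplo = 0;  /* 0 = lower, 1 = upper */
	int n    = 0;  /* variant number       */
	int i    = 0;  /* implementation type  */

	// Index into the table and invoke the selected variant.
	var_fp f = vars[side][uplo][n][i];
	f();
	return 0;
}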
Example #6
void bli_trmm3_front( side_t  side,
                      obj_t*  alpha,
                      obj_t*  a,
                      obj_t*  b,
                      obj_t*  beta,
                      obj_t*  c,
                      gemm_t* cntl )
{
	obj_t   a_local;
	obj_t   b_local;
	obj_t   c_local;

	// Check parameters.
	if ( bli_error_checking_is_enabled() )
		bli_trmm3_check( side, alpha, a, b, beta, c );

	// If alpha is zero, scale by beta and return.
	if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
	{
		bli_scalm( beta, c );
		return;
	}

	// Alias A, B, and C so we can tweak the objects if necessary.
	bli_obj_alias_to( *a, a_local );
	bli_obj_alias_to( *b, b_local );
	bli_obj_alias_to( *c, c_local );

	// We do not explicitly implement the cases where A is transposed.
	// However, we can still handle them. Specifically, if A is marked as
	// needing a transposition, we simply induce a transposition. This
	// allows us to only explicitly implement the no-transpose cases. Once
	// the transposition is induced, the correct algorithm will be called,
	// since, for example, an algorithm over a transposed lower triangular
	// matrix A moves in the same direction (forwards) as a non-transposed
	// upper triangular matrix. And with the transposition induced, the
	// matrix now appears to be upper triangular, so the upper triangular
	// algorithm will grab the correct partitions, as if it were upper
	// triangular (with no transpose) all along.
	if ( bli_obj_has_trans( a_local ) )
	{
		bli_obj_induce_trans( a_local );
		bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, a_local );
	}

#if 0

	// If A is being multiplied from the right, transpose all operands
	// so that we can perform the computation as if A were being multiplied
	// from the left.
	if ( bli_is_right( side ) )
	{
		bli_toggle_side( side );
		bli_obj_induce_trans( a_local );
		bli_obj_induce_trans( b_local );
		bli_obj_induce_trans( c_local );
	}

#else

	// An optimization: If C is stored by rows and the micro-kernel prefers
	// contiguous columns, or if C is stored by columns and the micro-kernel
	// prefers contiguous rows, transpose the entire operation to allow the
	// micro-kernel to access elements of C in its preferred manner.
	if (
	     ( bli_obj_is_row_stored( c_local ) &&
	       bli_func_prefers_contig_cols( bli_obj_datatype( c_local ),
	                                     bli_gemm_cntl_ukrs( cntl ) ) ) ||
	     ( bli_obj_is_col_stored( c_local ) &&
	       bli_func_prefers_contig_rows( bli_obj_datatype( c_local ),
	                                     bli_gemm_cntl_ukrs( cntl ) ) )
	   )
	{
		bli_toggle_side( side );
		bli_obj_induce_trans( a_local );
		bli_obj_induce_trans( b_local );
		bli_obj_induce_trans( c_local );
	}

	// If A is being multiplied from the right, swap A and B so that
	// the triangular matrix will actually be on the right.
	if ( bli_is_right( side ) )
	{
		bli_obj_swap( a_local, b_local );
	}

#endif

	// Set each alias as the root object.
	// NOTE: We MUST wait until we are done potentially swapping the objects
	// before setting the root fields!
	bli_obj_set_as_root( a_local );
	bli_obj_set_as_root( b_local );
	bli_obj_set_as_root( c_local );

	// Notice that, unlike trmm_r, there is no dependency in the jc loop
	// for trmm3_r, so we can pass in FALSE for jc_dependency.
	trmm_thrinfo_t** infos = bli_create_trmm_thrinfo_paths( FALSE );
    dim_t n_threads = thread_num_threads( infos[0] );

    // Invoke the internal back-end.
    bli_level3_thread_decorator( n_threads,   
                                 (level3_int_t) bli_trmm_int, 
                                 alpha, 
                                 &a_local,  
                                 &b_local,  
                                 beta, 
                                 &c_local,  
                                 (void*) cntl, 
                                 (void**) infos );

    bli_trmm_thrinfo_free_paths( infos, n_threads );

}
Example #7
void bli_trsv_int( obj_t*  alpha,
                   obj_t*  a,
                   obj_t*  x,
                   cntx_t* cntx,
                   trsv_t* cntl )
{
	varnum_t  n;
	impl_t    i;
	bool_t    uplo;
	FUNCPTR_T f;
	obj_t     a_local;

	// Check parameters.
	if ( bli_error_checking_is_enabled() )
		bli_trsv_check( alpha, a, x );

	// If A or x has a zero dimension, return early.
	if ( bli_obj_has_zero_dim( a ) ) return;
	if ( bli_obj_has_zero_dim( x ) ) return;

	// Alias A in case we need to induce a transformation (ie: transposition).
	bli_obj_alias_to( a, &a_local );

	// NOTE: to support cases where B is complex and A is real, we will
	// need to have the default side case be BLIS_RIGHT and then express
	// the left case in terms of it, rather than the other way around.

	// Determine uplo (for indexing to the correct function pointer).
	if ( bli_obj_is_lower( &a_local ) ) uplo = 0;
	else                                uplo = 1;

	// We do not explicitly implement the cases where A is transposed.
	// However, we can still handle them. Specifically, if A is marked as
	// needing a transposition, we simply toggle the uplo value to cause the
	// correct algorithm to be induced. When that algorithm partitions into
	// A, it will grab the correct subpartitions, which will inherit A's
	// transposition bit and thus downstream subproblems will do the right
	// thing. Alternatively, we could accomplish the same end goal by
	// inducing a transposition, via bli_obj_induce_trans(), in the code
	// block below. That macro function swaps dimensions, strides, and
	// offsets. As an example, given a lower triangular, column-major matrix
	// that needs a transpose, we would induce that transposition by recasting
	// the object as an upper triangular, row-major matrix (with no transpose
	// needed). Note that how we choose to handle transposition here does NOT
	// affect the optimal choice of kernel (ie: a column-major column panel
	// matrix with transpose times a vector would use the same kernel as a
	// row-major row panel matrix with no transpose times a vector).
	if ( bli_obj_has_trans( &a_local ) )
	{
		//bli_obj_induce_trans( &a_local );
		//bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &a_local );
		if ( uplo == 1 ) uplo = 0;
		else             uplo = 1;
	}

	// Extract the variant number and implementation type.
	n = bli_cntl_var_num( cntl );
	i = bli_cntl_impl_type( cntl );

	// Index into the variant array to extract the correct function pointer.
	f = vars[uplo][n][i];

	// Invoke the variant.
	f( alpha,
	   &a_local,
	   x,
	   cntx,
	   cntl );
}
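
User code would normally invoke this through the public trsv interface. A minimal sketch, assuming a current BLIS build with the pointer-based object API; the triangular structure, uplo, and diagonal are set explicitly on A before solving:

#include "blis.h"

int main( void )
{
	obj_t a, x;

	bli_init();

	// A is a 6 x 6 matrix (to be treated as lower triangular);
	// x is a length-6 vector, overwritten with the solution.
	bli_obj_create( BLIS_DOUBLE, 6, 6, 0, 0, &a );
	bli_obj_create( BLIS_DOUBLE, 6, 1, 0, 0, &x );

	bli_randm( &a );
	bli_randm( &x );

	// Mark A as triangular, lower-stored, with a non-unit diagonal.
	// (In real code one would make the diagonal well-conditioned.)
	bli_obj_set_struc( BLIS_TRIANGULAR, &a );
	bli_obj_set_uplo( BLIS_LOWER, &a );
	bli_obj_set_diag( BLIS_NONUNIT_DIAG, &a );

	// Solve A * x_new = x in place: x := inv(A) * x.
	bli_trsv( &BLIS_ONE, &a, &x );

	bli_obj_free( &a );
	bli_obj_free( &x );
	bli_finalize();

	return 0;
}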
Example #8
void bli_gemmsup_ref_var1n
     (
       trans_t trans,
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       stor3_t eff_id,
       cntx_t* cntx,
       rntm_t* rntm
     )
{
#if 0
	obj_t at, bt;

	bli_obj_alias_to( a, &at );
	bli_obj_alias_to( b, &bt );

	// Induce transpositions on A and/or B if either object is marked for
	// transposition. We can induce "fast" transpositions since the objects
	// are guaranteed to not have structure or be packed.
	if ( bli_obj_has_trans( &at ) ) { bli_obj_induce_fast_trans( &at ); }
	if ( bli_obj_has_trans( &bt ) ) { bli_obj_induce_fast_trans( &bt ); }

	const num_t    dt_exec   = bli_obj_dt( c );

	const conj_t   conja     = bli_obj_conj_status( a );
	const conj_t   conjb     = bli_obj_conj_status( b );

	const dim_t    m         = bli_obj_length( c );
	const dim_t    n         = bli_obj_width( c );

	const dim_t    k         = bli_obj_width( &at );

	void* restrict buf_a     = bli_obj_buffer_at_off( &at );
	const inc_t    rs_a      = bli_obj_row_stride( &at );
	const inc_t    cs_a      = bli_obj_col_stride( &at );

	void* restrict buf_b     = bli_obj_buffer_at_off( &bt );
	const inc_t    rs_b      = bli_obj_row_stride( &bt );
	const inc_t    cs_b      = bli_obj_col_stride( &bt );

	void* restrict buf_c     = bli_obj_buffer_at_off( c );
	const inc_t    rs_c      = bli_obj_row_stride( c );
	const inc_t    cs_c      = bli_obj_col_stride( c );

	void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt_exec, alpha );
	void* restrict buf_beta  = bli_obj_buffer_for_1x1( dt_exec, beta );

#else

	const num_t    dt_exec   = bli_obj_dt( c );

	const conj_t   conja     = bli_obj_conj_status( a );
	const conj_t   conjb     = bli_obj_conj_status( b );

	const dim_t    m         = bli_obj_length( c );
	const dim_t    n         = bli_obj_width( c );
	      dim_t    k;

	void* restrict buf_a = bli_obj_buffer_at_off( a );
	      inc_t    rs_a;
	      inc_t    cs_a;

	void* restrict buf_b = bli_obj_buffer_at_off( b );
	      inc_t    rs_b;
	      inc_t    cs_b;

	if ( bli_obj_has_notrans( a ) )
	{
		k     = bli_obj_width( a );

		rs_a  = bli_obj_row_stride( a );
		cs_a  = bli_obj_col_stride( a );
	}
	else // if ( bli_obj_has_trans( a ) )
	{
		// Assign the variables with an implicit transposition.
		k     = bli_obj_length( a );

		rs_a  = bli_obj_col_stride( a );
		cs_a  = bli_obj_row_stride( a );
	}

	if ( bli_obj_has_notrans( b ) )
	{
		rs_b  = bli_obj_row_stride( b );
		cs_b  = bli_obj_col_stride( b );
	}
	else // if ( bli_obj_has_trans( b ) )
	{
		// Assign the variables with an implicit transposition.
		rs_b  = bli_obj_col_stride( b );
		cs_b  = bli_obj_row_stride( b );
	}

	void* restrict buf_c     = bli_obj_buffer_at_off( c );
	const inc_t    rs_c      = bli_obj_row_stride( c );
	const inc_t    cs_c      = bli_obj_col_stride( c );

	void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt_exec, alpha );
	void* restrict buf_beta  = bli_obj_buffer_for_1x1( dt_exec, beta );

#endif

	// Index into the type combination array to extract the correct
	// function pointer.
	FUNCPTR_T f = ftypes_var1n[dt_exec];

	if ( bli_is_notrans( trans ) )
	{
		// Invoke the function.
		f
		(
		  conja,
		  conjb,
		  m,
		  n,
		  k,
		  buf_alpha,
		  buf_a, rs_a, cs_a,
		  buf_b, rs_b, cs_b,
		  buf_beta,
		  buf_c, rs_c, cs_c,
		  eff_id,
		  cntx,
		  rntm
		);
	}
	else
	{
		// Invoke the function (transposing the operation).
		f
		(
		  conjb,             // swap the conj values.
		  conja,
		  n,                 // swap the m and n dimensions.
		  m,
		  k,
		  buf_alpha,
		  buf_b, cs_b, rs_b, // swap the positions of A and B.
		  buf_a, cs_a, rs_a, // swap the strides of A and B.
		  buf_beta,
		  buf_c, cs_c, rs_c, // swap the strides of C.
		  bli_stor3_trans( eff_id ), // transpose the stor3_t id.
		  cntx,
		  rntm
		);
	}
}
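
The transposed branch above relies on the fact that swapping dimensions and strides is enough to present a matrix as its transpose without moving any data: element (i, j) of A^T lives at offset j*rs_a + i*cs_a of A's buffer. A small standalone demonstration of that stride-swap trick:

#include <stdio.h>

/* Prints an m x n matrix stored with row stride rs and column stride cs.
   Passing (n, m, cs, rs) for an existing (m, n, rs, cs) matrix prints its
   transpose without touching the data -- the same stride swap used when
   the operation above is invoked with swapped dimensions and strides. */
static void print_matrix( const char* label, int m, int n,
                          const double* buf, int rs, int cs )
{
	printf( "%s:\n", label );
	for ( int i = 0; i < m; i++ )
	{
		for ( int j = 0; j < n; j++ )
			printf( " %5.1f", buf[ i*rs + j*cs ] );
		printf( "\n" );
	}
}

int main( void )
{
	// A 2 x 3 row-major matrix: rs = 3, cs = 1.
	double a[6] = { 1, 2, 3,
	                4, 5, 6 };

	print_matrix( "A  (2x3)", 2, 3, a, 3, 1 );
	print_matrix( "A' (3x2)", 3, 2, a, 1, 3 );  // swap dims and strides
	return 0;
}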
Example #9
void bli_gemm_int( obj_t*  alpha,
                   obj_t*  a,
                   obj_t*  b,
                   obj_t*  beta,
                   obj_t*  c,
                   gemm_t* cntl,
                   gemm_thrinfo_t* thread )
{
	obj_t     a_local;
	obj_t     b_local;
	obj_t     c_local;
	varnum_t  n;
	impl_t    i;
	FUNCPTR_T f;

	// Check parameters.
	if ( bli_error_checking_is_enabled() )
		bli_gemm_int_check( alpha, a, b, beta, c, cntl );

	// If C has a zero dimension, return early.
	if ( bli_obj_has_zero_dim( *c ) ) return;

	// If A or B has a zero dimension, scale C by beta and return early.
	if ( bli_obj_has_zero_dim( *a ) ||
	     bli_obj_has_zero_dim( *b ) )
	{
        if( thread_am_ochief( thread ) )
		    bli_scalm( beta, c );
        thread_obarrier( thread );
		return;
	}

	// If A or B is marked as being filled with zeros, scale C by beta and
	// return early.
	if ( bli_obj_is_zeros( *a ) ||
	     bli_obj_is_zeros( *b ) )
	{
        if( thread_am_ochief( thread ) )
		    bli_scalm( beta, c );
        thread_obarrier( thread );
		return;
	}

	// Alias A and B in case we need to update attached scalars.
	bli_obj_alias_to( *a, a_local );
	bli_obj_alias_to( *b, b_local );

	// Alias C in case we need to induce a transposition.
	bli_obj_alias_to( *c, c_local );

	// If we are about to call a leaf-level implementation, and matrix C
	// still needs a transposition, then we must induce one by swapping the
	// strides and dimensions. Note that this transposition would normally
	// be handled explicitly in the packing of C, but if C is not being
	// packed, this is our last chance to handle the transposition.
	if ( cntl_is_leaf( cntl ) && bli_obj_has_trans( *c ) )
	{
        //if( thread_am_ochief( thread ) ) {
            bli_obj_induce_trans( c_local );
            bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local );
       // }
	}

	// If alpha is non-unit, typecast and apply it to the scalar attached
	// to B.
	if ( !bli_obj_equals( alpha, &BLIS_ONE ) )
	{
        bli_obj_scalar_apply_scalar( alpha, &b_local );
	}

	// If beta is non-unit, typecast and apply it to the scalar attached
	// to C.
	if ( !bli_obj_equals( beta, &BLIS_ONE ) )
	{
        bli_obj_scalar_apply_scalar( beta, &c_local );
	}

	// Extract the variant number and implementation type.
	n = cntl_var_num( cntl );
	i = cntl_impl_type( cntl );

	// Index into the variant array to extract the correct function pointer.
	f = vars[n][i];

	// Invoke the variant.
	f( &a_local,
	   &b_local,
	   &c_local,
	   cntl,
       thread );
}
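
bli_gemm_int is an internal node invoked from within the threaded control tree; applications reach it through the public gemm front-end. A minimal usage sketch, assuming a current BLIS build with the object API:

#include "blis.h"

int main( void )
{
	obj_t a, b, c;

	bli_init();

	// C (3x5) := 1.0 * A (3x4) * B (4x5) + 0.0 * C
	bli_obj_create( BLIS_DOUBLE, 3, 4, 0, 0, &a );
	bli_obj_create( BLIS_DOUBLE, 4, 5, 0, 0, &b );
	bli_obj_create( BLIS_DOUBLE, 3, 5, 0, 0, &c );

	bli_randm( &a );
	bli_randm( &b );
	bli_setm( &BLIS_ZERO, &c );

	bli_gemm( &BLIS_ONE, &a, &b, &BLIS_ZERO, &c );

	bli_obj_print( "c", &c );

	bli_obj_free( &a );
	bli_obj_free( &b );
	bli_obj_free( &c );
	bli_finalize();

	return 0;
}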
Example #10
void bli_trsm_front( side_t  side,
                     obj_t*  alpha,
                     obj_t*  a,
                     obj_t*  b,
                     cntx_t* cntx,
                     trsm_t* l_cntl,
                     trsm_t* r_cntl )
{
	trsm_t* cntl;
	obj_t   a_local;
	obj_t   b_local;
	obj_t   c_local;

	// Check parameters.
	if ( bli_error_checking_is_enabled() )
		bli_trsm_check( side, alpha, a, b, &BLIS_ZERO, b, cntx );

	// If alpha is zero, scale B by alpha (ie: set B to zero) and return.
	if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
	{
		bli_scalm( alpha, b );
		return;
	}

	// Reinitialize the memory allocator to accommodate the blocksizes
	// in the current context.
	bli_mem_reinit( cntx );

	// Alias A and B so we can tweak the objects if necessary.
	bli_obj_alias_to( *a, a_local );
	bli_obj_alias_to( *b, b_local );
	bli_obj_alias_to( *b, c_local );

	// We do not explicitly implement the cases where A is transposed.
	// However, we can still handle them. Specifically, if A is marked as
	// needing a transposition, we simply induce a transposition. This
	// allows us to only explicitly implement the no-transpose cases. Once
	// the transposition is induced, the correct algorithm will be called,
	// since, for example, an algorithm over a transposed lower triangular
	// matrix A moves in the same direction (forwards) as a non-transposed
	// upper triangular matrix. And with the transposition induced, the
	// matrix now appears to be upper triangular, so the upper triangular
	// algorithm will grab the correct partitions, as if it were upper
	// triangular (with no transpose) all along.
	if ( bli_obj_has_trans( a_local ) )
	{
		bli_obj_induce_trans( a_local );
		bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, a_local );
	}

#if 0

	// If A is being solved against from the right, transpose all operands
	// so that we can perform the computation as if A were being solved
	// from the left.
	if ( bli_is_right( side ) )
	{
		bli_toggle_side( side );
		bli_obj_induce_trans( a_local );
		bli_obj_induce_trans( b_local );
		bli_obj_induce_trans( c_local );
	}

#else

	// If A is being solved against from the right, swap A and B so that
	// the triangular matrix will actually be on the right.
	if ( bli_is_right( side ) )
	{
		bli_obj_swap( a_local, b_local );
	}

#endif

	// Set each alias as the root object.
	// NOTE: We MUST wait until we are done potentially swapping the objects
	// before setting the root fields!
	bli_obj_set_as_root( a_local );
	bli_obj_set_as_root( b_local );
	bli_obj_set_as_root( c_local );

	// Choose the control tree.
	if ( bli_is_left( side ) ) cntl = l_cntl;
	else                       cntl = r_cntl;

    trsm_thrinfo_t** infos = bli_create_trsm_thrinfo_paths( bli_is_right( side ) );
    dim_t n_threads = thread_num_threads( infos[0] );
    
    // Invoke the internal back-end.
    bli_level3_thread_decorator( n_threads,   
                                 (l3_int_t) bli_trsm_int, 
                                 alpha, 
                                 &a_local,  
                                 &b_local,  
                                 alpha, 
                                 &c_local,  
                                 (void*) cntx, 
                                 (void*) cntl, 
                                 (void**) infos );

    bli_trsm_thrinfo_free_paths( infos, n_threads );

}
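
Again for reference, applications normally reach this front-end through the public trsm interface, which overwrites B with the solution. A minimal sketch, assuming a current BLIS build with the object API:

#include "blis.h"

int main( void )
{
	obj_t a, b;

	bli_init();

	// A is 4 x 4 (treated as lower triangular); B is 4 x 2 and is
	// overwritten with the solution X of A * X = B.
	bli_obj_create( BLIS_DOUBLE, 4, 4, 0, 0, &a );
	bli_obj_create( BLIS_DOUBLE, 4, 2, 0, 0, &b );

	bli_randm( &a );
	bli_randm( &b );

	bli_obj_set_struc( BLIS_TRIANGULAR, &a );
	bli_obj_set_uplo( BLIS_LOWER, &a );
	bli_obj_set_diag( BLIS_NONUNIT_DIAG, &a );

	// Solve with A on the left: B := inv(A) * B.
	bli_trsm( BLIS_LEFT, &BLIS_ONE, &a, &b );

	bli_obj_free( &a );
	bli_obj_free( &b );
	bli_finalize();

	return 0;
}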
Example #11
void bli_trsm_int
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     )
{
	obj_t        a_local;
	obj_t        b_local;
	obj_t        c_local;
	trsm_var_oft f;

	// Check parameters.
	if ( bli_error_checking_is_enabled() )
		bli_gemm_basic_check( alpha, a, b, beta, c, cntx );

	// If C has a zero dimension, return early.
	if ( bli_obj_has_zero_dim( c ) ) return;

	// If A or B has a zero dimension, scale C by beta and return early.
	if ( bli_obj_has_zero_dim( a ) ||
	     bli_obj_has_zero_dim( b ) )
	{
		if ( bli_thread_am_ochief( thread ) )
		    bli_scalm( beta, c );
		bli_thread_obarrier( thread );
		return;
	}

	// Alias A and B in case we need to update attached scalars.
	bli_obj_alias_to( a, &a_local );
	bli_obj_alias_to( b, &b_local );

	// Alias C in case we need to induce a transposition.
	bli_obj_alias_to( c, &c_local );

	// If we are about to call a leaf-level implementation, and matrix C
	// still needs a transposition, then we must induce one by swapping the
	// strides and dimensions. Note that this transposition would normally
	// be handled explicitly in the packing of C, but if C is not being
	// packed, this is our last chance to handle the transposition.
	if ( bli_cntl_is_leaf( cntl ) && bli_obj_has_trans( c ) )
	{
		bli_obj_induce_trans( &c_local );
		bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &c_local );
	}

	// If beta is non-unit, apply it to the scalar attached to C.
	if ( !bli_obj_equals( beta, &BLIS_ONE ) )
	{
		bli_obj_scalar_apply_scalar( beta, &c_local );
	}

	// Set two bools: one based on the implied side parameter (the structure
	// of the root object) and one based on the uplo field of the triangular
	// matrix's root object (whether that is matrix A or matrix B).
	if ( bli_obj_root_is_triangular( a ) )
	{
		// If alpha is non-unit, typecast and apply it to the scalar
		// attached to B (the non-triangular matrix).
		if ( !bli_obj_equals( alpha, &BLIS_ONE ) )
		{
			bli_obj_scalar_apply_scalar( alpha, &b_local );
		}
	}
	else // if ( bli_obj_root_is_triangular( b ) )
	{
		// If alpha is non-unit, typecast and apply it to the scalar
		// attached to A (the non-triangular matrix).
		if ( !bli_obj_equals( alpha, &BLIS_ONE ) )
		{
            bli_obj_scalar_apply_scalar( alpha, &a_local );
		}
	}

	// FGVZ->TMS: Is this barrier still needed?
	bli_thread_obarrier( thread );

	// Create the next node in the thrinfo_t structure.
	bli_thrinfo_grow( rntm, cntl, thread );

	// Extract the function pointer from the current control tree node.
	f = bli_cntl_var_func( cntl );

	// Invoke the variant.
	f
	(
	  &a_local,
	  &b_local,
	  &c_local,
	  cntx,
	  rntm,
	  cntl,
	  thread
	);
}
Example #12
//
// Define object-based interface.
//
void bli_dotxf( obj_t*  alpha,
                obj_t*  a,
                obj_t*  x,
                obj_t*  beta,
                obj_t*  y )
{
	num_t     dt        = bli_obj_datatype( *x );

	conj_t    conja     = bli_obj_conj_status( *a );
	conj_t    conjx     = bli_obj_conj_status( *x );

	dim_t     m         = bli_obj_vector_dim( *y );
	dim_t     b_n       = bli_obj_vector_dim( *x );

	void*     buf_a     = bli_obj_buffer_at_off( *a );
	inc_t     rs_a      = bli_obj_row_stride( *a );
	inc_t     cs_a      = bli_obj_col_stride( *a );

	void*     buf_x     = bli_obj_buffer_at_off( *x );
	inc_t     inc_x     = bli_obj_vector_inc( *x );

	void*     buf_y     = bli_obj_buffer_at_off( *y );
	inc_t     inc_y     = bli_obj_vector_inc( *y );

	obj_t     alpha_local;
	void*     buf_alpha;

	obj_t     beta_local;
	void*     buf_beta;

	FUNCPTR_T f         = ftypes[dt];

	if ( bli_error_checking_is_enabled() )
	    bli_dotxf_check( alpha, a, x, beta, y );

	// Create local copy-casts of the scalars (and apply internal conjugation
	// if needed).
	bli_obj_scalar_init_detached_copy_of( dt,
	                                      BLIS_NO_CONJUGATE,
	                                      alpha,
	                                      &alpha_local );
	bli_obj_scalar_init_detached_copy_of( dt,
	                                      BLIS_NO_CONJUGATE,
	                                      beta,
	                                      &beta_local );

	// Extract the scalar buffers.
	buf_alpha = bli_obj_buffer_for_1x1( dt, alpha_local );
	buf_beta  = bli_obj_buffer_for_1x1( dt, beta_local );

	// Support cases where matrix A requires a transposition.
	if ( bli_obj_has_trans( *a ) ) { bli_swap_incs( rs_a, cs_a ); }

	// Invoke the void pointer-based function.
	f( conja,
	   conjx,
	   m,
	   b_n,
	   buf_alpha,
	   buf_a, rs_a, cs_a,
	   buf_x, inc_x,
	   buf_beta,
	   buf_y, inc_y );
}
Example #13
File: bli_ger_int.c  Project: figual/blis
void bli_ger_int( conj_t  conjx,
                  conj_t  conjy,
                  obj_t*  alpha,
                  obj_t*  x,
                  obj_t*  y,
                  obj_t*  a,
                  cntx_t* cntx,
                  ger_t*  cntl )
{
	varnum_t  n;
	impl_t    i;
	FUNCPTR_T f;
	obj_t     alpha_local;
	obj_t     x_local;
	obj_t     y_local;
	obj_t     a_local;

	// Check parameters.
	if ( bli_error_checking_is_enabled() )
		bli_ger_check( alpha, x, y, a );

	// If A has a zero dimension, return early.
	if ( bli_obj_has_zero_dim( a ) ) return;

	// If x or y has a zero dimension, return early.
	if ( bli_obj_has_zero_dim( x ) ||
	     bli_obj_has_zero_dim( y ) ) return;

	// Alias the objects, applying conjx and conjy to x and y, respectively.
	bli_obj_alias_with_conj( conjx, x, &x_local );
	bli_obj_alias_with_conj( conjy, y, &y_local );
	bli_obj_alias_to( a, &a_local );

	// If matrix A is marked for conjugation, we interpret this as a request
	// to apply a conjugation to the other operands.
	if ( bli_obj_has_conj( &a_local ) )
	{
		bli_obj_toggle_conj( &a_local );

		bli_obj_toggle_conj( &x_local );
		bli_obj_toggle_conj( &y_local );

		bli_obj_scalar_init_detached_copy_of( bli_obj_dt( alpha ),
		                                      BLIS_CONJUGATE,
		                                      alpha,
		                                      &alpha_local );
	}
	else
	{
		bli_obj_alias_to( alpha, &alpha_local );
	}

	// If we are about to call a leaf-level implementation, and matrix A
	// still needs a transposition, then we must induce one by swapping the
	// strides and dimensions.
	if ( bli_cntl_is_leaf( cntl ) && bli_obj_has_trans( &a_local ) )
	{
		bli_obj_induce_trans( &a_local );
		bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &a_local );
	}

	// Extract the variant number and implementation type.
	n = bli_cntl_var_num( cntl );
	i = bli_cntl_impl_type( cntl );

	// Index into the variant array to extract the correct function pointer.
	f = vars[n][i];

	// Invoke the variant.
	f( &alpha_local,
	   &x_local,
	   &y_local,
	   &a_local,
	   cntx,
	   cntl );
}
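
The operation being dispatched here is a rank-1 update, A := A + alpha * conjx(x) * conjy(y)^T. To make the access pattern concrete, here is a tiny reference loop for the real double-precision case with explicit row/column strides; it is purely illustrative and ignores conjugation, other datatypes, and blocking.

#include <stdio.h>

/* Reference rank-1 update: A := A + alpha * x * y^T, where A is m x n
   and stored with row stride rs_a and column stride cs_a. */
static void ger_ref( int m, int n, double alpha,
                     const double* x, int incx,
                     const double* y, int incy,
                     double* a, int rs_a, int cs_a )
{
	for ( int j = 0; j < n; j++ )
		for ( int i = 0; i < m; i++ )
			a[ i*rs_a + j*cs_a ] += alpha * x[ i*incx ] * y[ j*incy ];
}

int main( void )
{
	double x[3] = { 1, 2, 3 };
	double y[2] = { 10, 20 };
	double a[6] = { 0 };            /* 3 x 2, column-major: rs = 1, cs = 3 */

	ger_ref( 3, 2, 1.0, x, 1, y, 1, a, 1, 3 );

	for ( int i = 0; i < 3; i++ )
		printf( "%6.1f %6.1f\n", a[ i + 0*3 ], a[ i + 1*3 ] );
	return 0;
}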