Пример #1
// Residual check for the xpbym test: rebuilds the expected result
// beta * y_orig + x in temporaries and measures how far y is from it.
//   params - test parameters; not referenced in the visible body.
//   x      - input operand; copied into a temporary before use.
//   beta   - scalar applied to the copy of y_orig.
//   y      - computed result; overwritten with the difference by bli_subm().
//   y_orig - contents of y before the operation under test.
//   resid  - receives the value extracted from the norm by bli_getsc().
// NOTE(review): the parentheses and braces of this definition appear to have
// been stripped from this listing; the code tokens are left untouched.
void libblis_test_xpbym_check
       test_params_t* params,
       obj_t*         x,
       obj_t*         beta,
       obj_t*         y,
       obj_t*         y_orig,
       double*        resid
	num_t  dt      = bli_obj_dt( y );
	num_t  dt_real = bli_obj_dt_proj_to_real( y );

	dim_t  m       = bli_obj_length( y );
	dim_t  n       = bli_obj_width( y );

	obj_t  x_temp, y_temp;
	obj_t  norm;

	double junk;

	// Pre-conditions:
	// - x is randomized.
	// - y_orig is randomized.
	// Note:
	// - beta should have a non-zero imaginary component in the complex
	//   cases in order to more fully exercise the implementation.
	//   (The original note said "alpha", but this function has no alpha
	//   parameter; beta is the only scalar involved.)
	// Under these conditions, we assume that the implementation for
	//   y := beta * y_orig + conjx(x)
	// is functioning correctly if
	//   normf( y - ( beta * y_orig + conjx(x) ) )
	// is negligible.

	// The norm is stored in a scalar of the real projection of y's datatype.
	bli_obj_scalar_init_detached( dt_real, &norm );

	// Work on m x n temporaries so that x and y_orig are not modified.
    bli_obj_create( dt, m, n, 0, 0, &x_temp );
    bli_obj_create( dt, m, n, 0, 0, &y_temp );

    bli_copym( x,      &x_temp );
    bli_copym( y_orig, &y_temp );

	// y_temp := beta * y_orig + x (the reference result).
    bli_scalm( beta, &y_temp );
	bli_addm( &x_temp, &y_temp );

	// y is updated in place with the reference result subtracted, then its
	// norm is taken; the second output of bli_getsc() is discarded in junk.
    bli_subm( &y_temp, y );
    bli_normfm( y, &norm );
    bli_getsc( &norm, resid, &junk );

	// Release the temporaries created above.
    bli_obj_free( &x_temp );
    bli_obj_free( &y_temp );
Пример #2
// Residual check for the scalm test: adds -beta * y_orig to y and reports
// the norm of what is left.
//   beta   - scalar that was applied by the operation under test.
//   y      - computed result; modified in place by bli_addm() below.
//   y_orig - contents of y before the operation under test.
//   resid  - receives the value extracted from the norm by bli_getsc().
// NOTE(review): the braces of this definition appear to have been stripped
// from this listing; the code tokens are left untouched.
void libblis_test_scalm_check( obj_t*  beta,
                               obj_t*  y,
                               obj_t*  y_orig,
                               double* resid )
	num_t  dt      = bli_obj_datatype( *y );
	num_t  dt_real = bli_obj_datatype_proj_to_real( *y );

	dim_t  m       = bli_obj_length( *y );
	dim_t  n       = bli_obj_width( *y );

	obj_t  norm_y_r;
	obj_t  nbeta;

	obj_t  y2;

	double junk;

	// Pre-conditions:
	// - y_orig is randomized.
	// Note:
	// - beta should have a non-zero imaginary component in the complex
	//   cases in order to more fully exercise the implementation.
	// Under these conditions, we assume that the implementation for
	//   y := conjbeta(beta) * y_orig
	// is functioning correctly if
	//   normf( y + -conjbeta(beta) * y_orig )
	// is negligible.

	// y2 is an m x n working copy of y_orig, so y_orig itself is untouched.
	bli_obj_create( dt, m, n, 0, 0, &y2 );
	bli_copym( y_orig, &y2 );

	// nbeta shares y's datatype; the norm uses its real projection.
	bli_obj_scalar_init_detached( dt,      &nbeta );
	bli_obj_scalar_init_detached( dt_real, &norm_y_r );

	// nbeta := -beta.
	bli_copysc( beta, &nbeta );
	bli_mulsc( &BLIS_MINUS_ONE, &nbeta );

	// y := y + ( -beta * y_orig ), followed by the norm of the updated y.
	bli_scalm( &nbeta, &y2 );
	bli_addm( &y2, y );
	bli_normfm( y, &norm_y_r );

	// The second output of bli_getsc() is discarded in junk.
	bli_getsc( &norm_y_r, resid, &junk );

	bli_obj_free( &y2 );
Пример #3
// Runs the scalm operation under test, selecting the call by interface type.
//   iface - interface selector that the switch dispatches on.
//   beta  - scalar passed to bli_scalm().
//   y     - object passed to bli_scalm().
// NOTE(review): the case labels, break statements and braces of the switch
// are not visible in this listing. Presumably bli_scalm() belongs to the
// supported interface case and the error message to the default case;
// confirm against the upstream source.
void libblis_test_scalm_impl( iface_t   iface,
                              obj_t*    beta,
                              obj_t*    y )
	switch ( iface )
		bli_scalm( beta, y );

		libblis_test_printf_error( "Invalid interface type.\n" );
Пример #4
// Front-end for syrk: checks parameters, handles alpha == 0, builds local
// aliases of the operands and hands the work to the threaded herk back-end.
//   alpha, beta - scalars of the operation.
//   a           - input matrix; its alias is also transposed to act as the
//                 right-hand operand.
//   c           - matrix that is scaled by beta when alpha is zero and whose
//                 alias is marked as root.
//   cntl        - control tree; queried for micro-kernel preferences and
//                 passed to the thread decorator.
// NOTE(review): braces, return statements and some call arguments appear to
// have been stripped from this listing; the code tokens are left untouched.
void bli_syrk_front( obj_t*  alpha,
                     obj_t*  a,
                     obj_t*  beta,
                     obj_t*  c,
                     gemm_t* cntl )
	obj_t   a_local;
	obj_t   at_local;
	obj_t   c_local;

	// Check parameters.
	if ( bli_error_checking_is_enabled() )
		bli_syrk_check( alpha, a, beta, c );

	// If alpha is zero, scale by beta and return.
	// NOTE(review): the return statement is not visible in this listing.
	if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
		bli_scalm( beta, c );

	// Alias A and C in case we need to apply transformations.
	bli_obj_alias_to( *a, a_local );
	bli_obj_alias_to( *c, c_local );
	bli_obj_set_as_root( c_local );

	// For syrk, the right-hand "B" operand is simply A^T.
	bli_obj_alias_to( *a, at_local );
	bli_obj_induce_trans( at_local );

	// An optimization: If C is stored by rows and the micro-kernel prefers
	// contiguous columns, or if C is stored by columns and the micro-kernel
	// prefers contiguous rows, transpose the entire operation to allow the
	// micro-kernel to access elements of C in its preferred manner.
	// NOTE(review): the closing parenthesis of this condition is not visible
	// in this listing.
	if (
	     ( bli_obj_is_row_stored( c_local ) &&
	       bli_func_prefers_contig_cols( bli_obj_datatype( c_local ),
	                                     bli_gemm_cntl_ukrs( cntl ) ) ) ||
	     ( bli_obj_is_col_stored( c_local ) &&
	       bli_func_prefers_contig_rows( bli_obj_datatype( c_local ),
	                                     bli_gemm_cntl_ukrs( cntl ) ) )
		bli_obj_induce_trans( c_local );
    // The thread count is read from the first entry of the herk thread-info
    // paths created here.
    herk_thrinfo_t** infos = bli_create_herk_thrinfo_paths();
    dim_t n_threads = thread_num_threads( infos[0] );

    // Invoke the internal back-end.
    // NOTE(review): only part of the argument list is visible here.
    bli_level3_thread_decorator( n_threads,   
                                 (level3_int_t) bli_herk_int, 
                                 (void*) cntl, 
                                 (void**) infos );

    // Release the thread-info paths created above.
    bli_herk_thrinfo_free_paths( infos, n_threads );

Пример #5
// Runs one syr2k experiment: builds the operands for the current problem
// size, times n_repeats runs, and reports performance and residual.
//   params   - test parameters; supplies n_repeats and is passed to the
//              object-creation helper.
//   op       - operation descriptor; supplies the dimension specifiers.
//   iface    - interface selector forwarded to libblis_test_syr2k_impl().
//   datatype - datatype used for all scalars and matrices.
//   pc_str   - parameter characters: [0] uplo of C, [1] trans of A,
//              [2] trans of B.
//   sc_str   - storage characters: [0] for A, [1] for B, [2] for C/C_save.
//   p_cur    - current problem size, mapped to m and k.
//   perf     - output: performance estimate of the fastest repeat.
//   resid    - output: residual computed by libblis_test_syr2k_check().
// NOTE(review): braces and the else line of the alpha/beta selection appear
// to have been stripped from this listing; the code tokens are left
// untouched.
void libblis_test_syr2k_experiment( test_params_t* params,
                                    test_op_t*     op,
                                    iface_t        iface,
                                    num_t          datatype,
                                    char*          pc_str,
                                    char*          sc_str,
                                    unsigned int   p_cur,
                                    double*        perf,
                                    double*        resid )
	unsigned int n_repeats = params->n_repeats;
	unsigned int i;

	// time_min starts at a large sentinel so the first timing replaces it.
	double       time_min  = 1e9;
	double       time;

	dim_t        m, k;

	uplo_t       uploc;
	trans_t      transa, transb;

	obj_t        kappa;
	obj_t        alpha, a, b, beta, c;
	obj_t        c_save;

	// Map the dimension specifier to actual dimensions.
	m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );
	k = libblis_test_get_dim_from_prob_size( op->dim_spec[1], p_cur );

	// Map parameter characters to BLIS constants.
	bli_param_map_char_to_blis_uplo( pc_str[0], &uploc );
	bli_param_map_char_to_blis_trans( pc_str[1], &transa );
	bli_param_map_char_to_blis_trans( pc_str[2], &transb );

	// Create test scalars.
	bli_obj_scalar_init_detached( datatype, &kappa );
	bli_obj_scalar_init_detached( datatype, &alpha );
	bli_obj_scalar_init_detached( datatype, &beta );

	// Create test operands (vectors and/or matrices).
	libblis_test_mobj_create( params, datatype, transa,
	                          sc_str[0], m, k, &a );
	libblis_test_mobj_create( params, datatype, transb,
	                          sc_str[1], m, k, &b );
	libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE,
	                          sc_str[2], m, m, &c );
	libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE,
	                          sc_str[2], m, m, &c_save );

	// Set alpha and beta.
	// NOTE(review): the first pair of bli_setsc() calls is presumably the
	// real branch and the second pair the else (complex) branch; the
	// separating else line is not visible in this listing.
	if ( bli_obj_is_real( c ) )
		bli_setsc(  0.8, 0.0, &alpha );
		bli_setsc( -1.0, 0.0, &beta );
		// For syr2k, both alpha and beta may be complex since, unlike her2k,
		// C is symmetric in both the real and complex cases.
		bli_setsc(  0.8, 0.5, &alpha );
		bli_setsc( -1.0, 0.5, &beta );

	// Randomize A and B.
	bli_randm( &a );
	bli_randm( &b );

	// Set the structure and uplo properties of C.
	bli_obj_set_struc( BLIS_SYMMETRIC, c );
	bli_obj_set_uplo( uploc, c );

	// Randomize C, make it densely symmetric, and zero the unstored triangle
	// to ensure the implementation reads only from the stored region.
	bli_randm( &c );
	bli_mksymm( &c );
	bli_mktrim( &c );

	// Save C and set its structure and uplo properties.
	bli_obj_set_struc( BLIS_SYMMETRIC, c_save );
	bli_obj_set_uplo( uploc, c_save );
	bli_copym( &c, &c_save );

	// Normalize by k.
	bli_setsc( 1.0/( double )k, 0.0, &kappa );
	bli_scalm( &kappa, &a );
	bli_scalm( &kappa, &b );

	// Apply the remaining parameters.
	bli_obj_set_conjtrans( transa, a );
	bli_obj_set_conjtrans( transb, b );

	// Repeat the experiment n_repeats times and record results.
	// C is restored from c_save before each timed run.
	for ( i = 0; i < n_repeats; ++i )
		bli_copym( &c_save, &c );

		time = bli_clock();

		libblis_test_syr2k_impl( iface, &alpha, &a, &b, &beta, &c );

		time_min = bli_clock_min_diff( time_min, time );

	// Estimate the performance of the best experiment repeat.
	// The flop count is 2*m*m*k, multiplied by 4 when C is complex.
	*perf = ( 2.0 * m * m * k ) / time_min / FLOPS_PER_UNIT_PERF;
	if ( bli_obj_is_complex( c ) ) *perf *= 4.0;

	// Perform checks.
	libblis_test_syr2k_check( &alpha, &a, &b, &beta, &c, &c_save, resid );

	// Zero out performance and residual if output matrix is empty.
	libblis_test_check_empty_problem( &c, perf, resid );

	// Free the test objects.
	bli_obj_free( &a );
	bli_obj_free( &b );
	bli_obj_free( &c );
	bli_obj_free( &c_save );
Пример #6
// Internal back-end for hemv/symv: validates the operands, handles empty
// operands, and dispatches to the variant selected by the control tree.
//   conjh - selects the check routine: hemv when conjugated, symv otherwise;
//           also forwarded to the variant.
//   alpha, beta - scalars of the operation.
//   a     - matrix operand; aliased locally so its conj/trans bits can be
//           toggled.
//   x, y  - vector operands; y is scaled by beta when x is empty.
//   cntx  - context; not referenced in the visible body.
//   cntl  - control tree supplying the variant number and implementation
//           type.
// NOTE(review): braces, return statements, the declarations of f and vars,
// and most arguments of the final call are not visible in this listing; the
// code tokens are left untouched.
void bli_hemv_int( conj_t  conjh,
                   obj_t*  alpha,
                   obj_t*  a,
                   obj_t*  x,
                   obj_t*  beta,
                   obj_t*  y,
                   cntx_t* cntx,
                   hemv_t* cntl )
	varnum_t  n;
	impl_t    i;
	obj_t     a_local;

	// Check parameters.
	if ( bli_error_checking_is_enabled() )
		if ( bli_is_conj( conjh ) ) bli_hemv_check( alpha, a, x, beta, y );
		else                        bli_symv_check( alpha, a, x, beta, y );

	// If y has a zero dimension, return early.
	if ( bli_obj_has_zero_dim( *y ) ) return;

	// If x has a zero dimension, scale y by beta and return early.
	// NOTE(review): the return statement is not visible in this listing.
	if ( bli_obj_has_zero_dim( *x ) )
		bli_scalm( beta, y );

	// Alias A in case we need to induce the upper triangular case.
	bli_obj_alias_to( *a, a_local );

	// Our blocked algorithms only [explicitly] implement the lower triangular
	// case, so if matrix A is stored as upper triangular, we must toggle the
	// transposition (and conjugation) bits so that the diagonal partitioning
	// routines grab the correct partitions corresponding to the upper
	// triangular case. But we only need to do this for blocked algorithms,
	// since unblocked algorithms are responsible for handling the upper case
	// explicitly (and they should not be inspecting the transposition bit anyway).
	if ( bli_cntl_is_blocked( cntl ) && bli_obj_is_upper( *a ) )
		bli_obj_toggle_conj( a_local );
		bli_obj_toggle_trans( a_local );

	// Extract the variant number and implementation type.
	n = bli_cntl_var_num( cntl );
	i = bli_cntl_impl_type( cntl );

	// Index into the variant array to extract the correct function pointer.
	f = vars[n][i];

	// Invoke the variant.
	f( conjh,
	   cntl );
Пример #7
// Front-end for her2k: checks parameters, handles alpha == 0, builds the
// aliased and conjugate-transposed operands, and invokes the back-end.
//   alpha, beta - scalars of the operation; a conjugated copy of alpha is
//                 prepared in alpha_conj.
//   a, b        - input matrices; aliases of each are also transposed and
//                 conjugated to form the right-hand operands.
//   c           - matrix that is scaled by beta when alpha is zero and whose
//                 alias is marked as root.
//   cntl        - control tree forwarded to the back-end calls.
// NOTE(review): braces, return statements, the #else/#endif of the
// preprocessor region and most call arguments are not visible in this
// listing; the code tokens are left untouched.
void bli_her2k_front( obj_t*  alpha,
                      obj_t*  a,
                      obj_t*  b,
                      obj_t*  beta,
                      obj_t*  c,
                      herk_t* cntl )
	obj_t    alpha_conj;
	obj_t    c_local;
	obj_t    a_local;
	obj_t    bh_local;
	obj_t    b_local;
	obj_t    ah_local;

	// Check parameters.
	if ( bli_error_checking_is_enabled() )
		bli_her2k_check( alpha, a, b, beta, c );

	// If alpha is zero, scale by beta and return.
	// NOTE(review): the return statement is not visible in this listing.
	if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
		bli_scalm( beta, c );

	// Alias A, B, and C in case we need to apply transformations.
	bli_obj_alias_to( *a, a_local );
	bli_obj_alias_to( *b, b_local );
	bli_obj_alias_to( *c, c_local );
	bli_obj_set_as_root( c_local );

	// For her2k, the first and second right-hand "B" operands are simply B'
	// and A'.
	bli_obj_alias_to( *b, bh_local );
	bli_obj_induce_trans( bh_local );
	bli_obj_toggle_conj( bh_local );
	bli_obj_alias_to( *a, ah_local );
	bli_obj_induce_trans( ah_local );
	bli_obj_toggle_conj( ah_local );

	// Initialize a conjugated copy of alpha.
	// NOTE(review): only the datatype and destination arguments are visible
	// here; the remaining arguments appear to have been stripped.
	bli_obj_scalar_init_detached_copy_of( bli_obj_datatype( *a ),
	                                      &alpha_conj );

	// An optimization: If C is row-stored, transpose the entire operation
	// so as to allow the macro-kernel more favorable access patterns
	// through C. (The effect of the transposition of A and A' is negligible
	// because those operands are always packed to contiguous memory.)
	if ( bli_obj_is_row_stored( c_local ) )
		bli_obj_swap( a_local, bh_local );
		bli_obj_swap( b_local, ah_local );

		bli_obj_induce_trans( a_local );
		bli_obj_induce_trans( bh_local );
		bli_obj_induce_trans( b_local );
		bli_obj_induce_trans( ah_local );

		bli_obj_induce_trans( c_local );

	// NOTE(review): presumably the single bli_her2k_int() call is the
	// disabled alternative and the two bli_herk_int() calls are the active
	// path; the directive separating them is not visible in this listing.
#if 0
	// Invoke the internal back-end.
	bli_her2k_int( alpha,
	               cntl );
	// Invoke herk twice, using beta only the first time.
	bli_herk_int( alpha,
	              cntl );

	bli_herk_int( &alpha_conj,
	              cntl );
Пример #8
// Reference handler for small/unpacked gemm: classifies the storage of C, A
// and B, declines general-stride operands, and picks a var1n or var2m
// macro-kernel with or without a transposition.
//   alpha, beta - scalars forwarded to the selected variant.
//   a, b, c     - matrix operands forwarded to the selected variant.
//   cntx        - context; queried for kernel row preference and the MR/NR
//                 blocksizes.
//   rntm        - runtime object forwarded to the selected variant.
// Returns BLIS_FAILURE when the storage combination is BLIS_XXX. The final
// comment says success is returned otherwise, but that return statement is
// not visible in this listing.
// NOTE(review): braces, parentheses of the parameter list, else lines,
// #endif directives and some call names/arguments appear to have been
// stripped from this listing, so the #if 0 regions below have no visible
// end; the code tokens are left untouched.
err_t bli_gemmsup_ref
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm
	// Check parameters.
	if ( bli_error_checking_is_enabled() )
		bli_gemm_check( alpha, a, b, beta, c, cntx );

#if 0
	// FGVZ: The datatype-specific variant is now responsible for checking for
	// alpha == 0.0.

	// If alpha is zero, scale by beta and return.
	if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
		bli_scalm( beta, c );
		return BLIS_SUCCESS;

#if 0
	// FGVZ: Will this be needed for constructing thrinfo_t's (recall: the
	// sba needs to be attached to the rntm; see below)? Or will those nodes
	// just be created "locally," in an exposed manner?

	// Parse and interpret the contents of the rntm_t object to properly
	// set the ways of parallelism for each loop, and then make any
	// additional modifications necessary for the current operation.
	// NOTE(review): the name of the call these arguments belong to is not
	// visible in this listing.
	  BLIS_LEFT, // ignored for gemm/hemm/symm
	  bli_obj_length( &c_local ),
	  bli_obj_width( &c_local ),
	  bli_obj_width( &a_local ),

	// FGVZ: the sba needs to be attached to the rntm. But it needs
	// to be done in the thread region, since it needs a thread id.
	//bli_sba_rntm_set_pool( tid, array, rntm_p );

#if 0
	// FGVZ: The datatype-specific variant is now responsible for inducing a
	// transposition, if needed.

	// Induce transpositions on A and/or B if either object is marked for
	// transposition. We can induce "fast" transpositions since the objects
	// are guaranteed to not have structure or be packed.
	if ( bli_obj_has_trans( a ) )
		bli_obj_induce_fast_trans( a );
		bli_obj_toggle_trans( a );
	if ( bli_obj_has_trans( b ) )
		bli_obj_induce_fast_trans( b );
		bli_obj_toggle_trans( b );

#if 0
	#if 0
	const stor3_t stor_id = bli_obj_stor3_from_strides( c, a, b );
	const bool_t  is_rrr_rrc_rcr_crr = ( stor_id == BLIS_RRR ||
	                                     stor_id == BLIS_RRC ||
	                                     stor_id == BLIS_RCR ||
	                                     stor_id == BLIS_CRR );
	// NOTE(review): the names of the calls these argument lists belong to
	// are not visible in this listing.
	if ( is_rrr_rrc_rcr_crr )
		  BLIS_NO_TRANSPOSE, alpha, a, b, beta, c, stor_id, cntx, rntm
		  BLIS_TRANSPOSE, alpha, a, b, beta, c, stor_id, cntx, rntm
	// Combined storage classification of C, A and B, derived from their
	// strides.
	const stor3_t stor_id = bli_obj_stor3_from_strides( c, a, b );

	// Don't use the small/unpacked implementation if one of the matrices
	// uses general stride.
	if ( stor_id == BLIS_XXX ) return BLIS_FAILURE;

	// The storage combinations are split into two complementary groups.
	const bool_t  is_rrr_rrc_rcr_crr = ( stor_id == BLIS_RRR ||
	                                     stor_id == BLIS_RRC ||
	                                     stor_id == BLIS_RCR ||
	                                     stor_id == BLIS_CRR );
	const bool_t  is_rcc_crc_ccr_ccc = !is_rrr_rrc_rcr_crr;

	const num_t   dt       = bli_obj_dt( c );
	const bool_t  row_pref = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx );

	// The case is "primary" when the storage group matches the kernel's
	// row/column preference.
	const bool_t  is_primary = ( row_pref ? is_rrr_rrc_rcr_crr
	                                      : is_rcc_crc_ccr_ccc );

	if ( is_primary )
		// This branch handles:
		//  - rrr rrc rcr crr for row-preferential kernels
		//  - rcc crc ccr ccc for column-preferential kernels

		// mu and nu are the integer quotients of m and n by MR and NR.
		const dim_t m  = bli_obj_length( c );
		const dim_t n  = bli_obj_width( c );
		const dim_t NR = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
		const dim_t MR = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
		const dim_t mu = m / MR;
		const dim_t nu = n / NR;

		if ( mu >= nu )
			// block-panel macrokernel; m -> mc, mr; n -> nc, nr: var2()
			bli_gemmsup_ref_var2m( BLIS_NO_TRANSPOSE,
			                       alpha, a, b, beta, c, stor_id, cntx, rntm );
		else // if ( mu < nu )
			// panel-block macrokernel; m -> nc*,mr; n -> mc*,nr: var1()
			bli_gemmsup_ref_var1n( BLIS_NO_TRANSPOSE,
			                       alpha, a, b, beta, c, stor_id, cntx, rntm );
		// NOTE(review): presumably the else branch of is_primary starts
		// here; the else line is not visible in this listing.
		// This branch handles:
		//  - rrr rrc rcr crr for column-preferential kernels
		//  - rcc crc ccr ccc for row-preferential kernels

		// Here the roles of C's length and width are exchanged before the
		// same mu/nu comparison is made.
		const dim_t mt = bli_obj_width( c );
		const dim_t nt = bli_obj_length( c );
		const dim_t NR = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
		const dim_t MR = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
		const dim_t mu = mt / MR;
		const dim_t nu = nt / NR;

		if ( mu >= nu )
			// panel-block macrokernel; m -> nc, nr; n -> mc, mr: var2() + trans
			bli_gemmsup_ref_var2m( BLIS_TRANSPOSE,
			                       alpha, a, b, beta, c, stor_id, cntx, rntm );
		else // if ( mu < nu )
			// block-panel macrokernel; m -> mc*,nr; n -> nc*,mr: var1() + trans
			bli_gemmsup_ref_var1n( BLIS_TRANSPOSE,
			                       alpha, a, b, beta, c, stor_id, cntx, rntm );
		// *requires nudging of mc,nc up to be a multiple of nr,mr.

	// Return success so that the caller knows that we computed the solution.
Пример #9
// Front-end for trmm3: checks parameters, handles alpha == 0, normalizes the
// operands (transposition of A, side, row-stored C), picks the control tree
// for the resulting side, and launches the threaded trmm back-end.
//   side    - side on which A is applied; may be toggled below.
//   alpha, beta - scalars of the operation.
//   a       - matrix whose transposition is induced when marked.
//   b, c    - remaining operands; c is scaled by beta when alpha is zero.
//   l_cntl  - control tree used when side is left after normalization.
//   r_cntl  - control tree used otherwise.
// NOTE(review): braces, return statements, #else/#endif directives and some
// call arguments appear to have been stripped from this listing; the code
// tokens are left untouched.
void bli_trmm3_front( side_t  side,
                      obj_t*  alpha,
                      obj_t*  a,
                      obj_t*  b,
                      obj_t*  beta,
                      obj_t*  c,
                      trmm_t* l_cntl,
                      trmm_t* r_cntl )
	trmm_t* cntl;
	obj_t   a_local;
	obj_t   b_local;
	obj_t   c_local;

	// Check parameters.
	if ( bli_error_checking_is_enabled() )
		bli_trmm3_check( side, alpha, a, b, beta, c );

	// If alpha is zero, scale by beta and return.
	// NOTE(review): the return statement is not visible in this listing.
	if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
		bli_scalm( beta, c );

	// Alias A, B, and C so we can tweak the objects if necessary.
	bli_obj_alias_to( *a, a_local );
	bli_obj_alias_to( *b, b_local );
	bli_obj_alias_to( *c, c_local );

	// We do not explicitly implement the cases where A is transposed.
	// However, we can still handle them. Specifically, if A is marked as
	// needing a transposition, we simply induce a transposition. This
	// allows us to only explicitly implement the no-transpose cases. Once
	// the transposition is induced, the correct algorithm will be called,
	// since, for example, an algorithm over a transposed lower triangular
	// matrix A moves in the same direction (forwards) as a non-transposed
	// upper triangular matrix. And with the transposition induced, the
	// matrix now appears to be upper triangular, so the upper triangular
	// algorithm will grab the correct partitions, as if it were upper
	// triangular (with no transpose) all along.
	if ( bli_obj_has_trans( a_local ) )
		bli_obj_induce_trans( a_local );
		bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, a_local );

	// NOTE(review): the end of this disabled region is not visible in this
	// listing.
#if 0
	if ( bli_is_right( side ) )
		bli_obj_induce_trans( a_local );
		bli_obj_induce_trans( b_local );
		bli_obj_induce_trans( c_local );

		bli_toggle_side( side );

#if 1
	// If A is being multiplied from the right, swap A and B so that
	// the matrix will actually be on the right.
	if ( bli_is_right( side ) )
		bli_obj_swap( a_local, b_local );

	// An optimization: If C is row-stored, transpose the entire operation
	// so as to allow the macro-kernel more favorable access patterns
	// through C. (The effect of the transposition of A and B is negligible
	// because those operands are always packed to contiguous memory.)
	if ( bli_obj_is_row_stored( c_local ) )
		bli_obj_swap( a_local, b_local );

		bli_obj_induce_trans( a_local );
		bli_obj_induce_trans( b_local );
		bli_obj_induce_trans( c_local );

		bli_toggle_side( side );

	// Set each alias as the root object.
	// NOTE: We MUST wait until we are done potentially swapping the objects
	// before setting the root fields!
	bli_obj_set_as_root( a_local );
	bli_obj_set_as_root( b_local );
	bli_obj_set_as_root( c_local );

	// Choose the control tree.
	if ( bli_is_left( side ) ) cntl = l_cntl;
	else                       cntl = r_cntl;

    // The thread count is read from the first entry of the trmm thread-info
    // paths created here.
    trmm_thrinfo_t** infos = bli_create_trmm_thrinfo_paths( FALSE );
    dim_t n_threads = thread_num_threads( infos[0] );

    // Invoke the internal back-end.
    // NOTE(review): only part of the argument list is visible here.
    bli_level3_thread_decorator( n_threads,   
                                 (level3_int_t) bli_trmm_int, 
                                 (void*) cntl, 
                                 (void**) infos );

    // Release the thread-info paths created above.
    bli_trmm_thrinfo_free_paths( infos, n_threads );
Пример #10
// Front-end for gemm: checks parameters, handles alpha == 0, optionally
// transposes the whole operation to suit the micro-kernel, launches the
// threaded gemm back-end, and updates the global flop counter.
//   alpha, beta - scalars of the operation.
//   a, b        - input matrices; their aliases may be swapped and
//                 transposed.
//   c           - matrix that is scaled by beta when alpha is zero.
//   cntl        - control tree; queried for micro-kernel preferences and
//                 passed to the thread decorator.
// NOTE(review): braces, return statements and some call arguments appear to
// have been stripped from this listing; the code tokens are left untouched.
void bli_gemm_front( obj_t*  alpha,
                     obj_t*  a,
                     obj_t*  b,
                     obj_t*  beta,
                     obj_t*  c,
                     gemm_t* cntl )
	obj_t   a_local;
	obj_t   b_local;
	obj_t   c_local;

	// Check parameters.
	if ( bli_error_checking_is_enabled() )
		bli_gemm_check( alpha, a, b, beta, c );

	// If alpha is zero, scale by beta and return.
	// NOTE(review): the return statement is not visible in this listing.
	if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
		bli_scalm( beta, c );

	// Alias A, B, and C in case we need to apply transformations.
	bli_obj_alias_to( *a, a_local );
	bli_obj_alias_to( *b, b_local );
	bli_obj_alias_to( *c, c_local );

	// An optimization: If C is stored by rows and the micro-kernel prefers
	// contiguous columns, or if C is stored by columns and the micro-kernel
	// prefers contiguous rows, transpose the entire operation to allow the
	// micro-kernel to access elements of C in its preferred manner.
	// NOTE(review): the closing parenthesis of this condition is not visible
	// in this listing.
	if (
	     ( bli_obj_is_row_stored( c_local ) &&
	       bli_func_prefers_contig_cols( bli_obj_datatype( c_local ),
	                                     bli_gemm_cntl_ukrs( cntl ) ) ) ||
	     ( bli_obj_is_col_stored( c_local ) &&
	       bli_func_prefers_contig_rows( bli_obj_datatype( c_local ),
	                                     bli_gemm_cntl_ukrs( cntl ) ) )
		bli_obj_swap( a_local, b_local );

		bli_obj_induce_trans( a_local );
		bli_obj_induce_trans( b_local );
		bli_obj_induce_trans( c_local );

    // The thread count is read from the first entry of the gemm thread-info
    // paths created here.
    gemm_thrinfo_t** infos = bli_create_gemm_thrinfo_paths();
    dim_t n_threads = thread_num_threads( infos[0] );

	// Invoke the internal back-end.
	// NOTE(review): only part of the argument list is visible here.
    bli_level3_thread_decorator( n_threads,   
                                 (level3_int_t) bli_gemm_int, 
                                 (void*) cntl, 
                                 (void**) infos );

    // Release the thread-info paths created above.
    bli_gemm_thrinfo_free_paths( infos, n_threads );

	// Increment the global flop counter.
	// The count is 2 * length(C) * width(C) * width-after-trans(a_local),
	// multiplied by 4 when C is complex.
	bli_flop_count_inc( 2.0 * bli_obj_length( *c )
	                        * bli_obj_width( *c )
	                        * bli_obj_width_after_trans( a_local )
	                        * ( bli_obj_is_complex( *c ) ? 4.0 : 1.0 ) );
Пример #11
// Internal gemm back-end (context/thrinfo API): handles empty operands,
// folds alpha and beta into the scalars attached to the local aliases,
// grows the thread-info structure and selects the variant from the control
// tree.
//   alpha  - applied to the scalar attached to the alias of B when non-unit.
//   beta   - applied to the scalar attached to the alias of C when non-unit;
//            also used to scale C directly on the early-exit paths.
//   a, b, c - matrix operands.
//   cntx   - context; supplies the induced method and is passed to
//            bli_thrinfo_grow().
//   cntl   - control tree node supplying the variant function pointer.
//   thread - thread info; used for the chief test and the barriers.
// NOTE(review): the parentheses of the parameter list, braces, return
// statements and the final variant call are not visible in this listing;
// the code tokens are left untouched.
void bli_gemm_int
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       cntl_t* cntl,
       thrinfo_t* thread
	obj_t     a_local;
	obj_t     b_local;
	obj_t     c_local;
	gemm_voft f;

	// Check parameters.
	if ( bli_error_checking_is_enabled() )
		bli_gemm_basic_check( alpha, a, b, beta, c, cntx );

	// If C has a zero dimension, return early.
	if ( bli_obj_has_zero_dim( *c ) ) return;

	// If A or B has a zero dimension, scale C by beta and return early.
	// Only the thread for which bli_thread_am_ochief() holds performs the
	// scaling; the barrier call follows it.
	// NOTE(review): the return statement is not visible in this listing.
	if ( bli_obj_has_zero_dim( *a ) ||
	     bli_obj_has_zero_dim( *b ) )
        if ( bli_thread_am_ochief( thread ) )
		    bli_scalm( beta, c );
        bli_thread_obarrier( thread );

	// If A or B is marked as being filled with zeros, scale C by beta and
	// return early.
	// NOTE(review): the return statement is not visible in this listing.
	if ( bli_obj_is_zeros( *a ) ||
	     bli_obj_is_zeros( *b ) )
		// This should never execute.

        if ( bli_thread_am_ochief( thread ) )
		    bli_scalm( beta, c );
        bli_thread_obarrier( thread );

	// Alias A, B, and C in case we need to update attached scalars.
	bli_obj_alias_to( *a, a_local );
	bli_obj_alias_to( *b, b_local );
	bli_obj_alias_to( *c, c_local );

	// If alpha is non-unit, typecast and apply it to the scalar attached
	// to B.
	if ( !bli_obj_equals( alpha, &BLIS_ONE ) )
        bli_obj_scalar_apply_scalar( alpha, &b_local );

	// If beta is non-unit, typecast and apply it to the scalar attached
	// to C.
	if ( !bli_obj_equals( beta, &BLIS_ONE ) )
        bli_obj_scalar_apply_scalar( beta, &c_local );

	// Create the next node in the thrinfo_t structure.
	bli_thrinfo_grow( cntx, cntl, thread );

	// Extract the function pointer from the current control tree node.
	f = bli_cntl_var_func( cntl );

	// Somewhat hackish support for 3m3, 3m2, and 4m1b method implementations.
	// When the context's induced method is not BLIS_NAT, the generic variant
	// pointer is replaced by the method-specific one.
		ind_t im = bli_cntx_get_ind_method( cntx );

		if ( im != BLIS_NAT )
			if      ( im == BLIS_3M3  && f == bli_gemm_packa    ) f = bli_gemm3m3_packa;
			else if ( im == BLIS_3M2  && f == bli_gemm_ker_var2 ) f = bli_gemm3m2_ker_var2;
			else if ( im == BLIS_4M1B && f == bli_gemm_ker_var2 ) f = bli_gemm4mb_ker_var2;

	// Invoke the variant.
Пример #12
// Internal gemm back-end (older control-tree API): handles empty operands,
// induces a pending transposition of C at leaf level, folds alpha and beta
// into the scalars attached to the local aliases, and dispatches to the
// variant selected by the control tree.
//   alpha  - applied to the scalar attached to the alias of B when non-unit.
//   beta   - applied to the scalar attached to the alias of C when non-unit;
//            also used to scale C directly on the early-exit paths.
//   a, b, c - matrix operands.
//   cntl   - control tree supplying the variant number and implementation
//            type.
//   thread - thread info; used for the chief test and the barriers, and
//            forwarded to the variant.
// NOTE(review): braces, return statements, the declarations of f and vars,
// and most arguments of the final call are not visible in this listing; the
// code tokens are left untouched.
void bli_gemm_int( obj_t*  alpha,
                   obj_t*  a,
                   obj_t*  b,
                   obj_t*  beta,
                   obj_t*  c,
                   gemm_t* cntl,
                   gemm_thrinfo_t* thread )
	obj_t     a_local;
	obj_t     b_local;
	obj_t     c_local;
	varnum_t  n;
	impl_t    i;

	// Check parameters.
	if ( bli_error_checking_is_enabled() )
		bli_gemm_int_check( alpha, a, b, beta, c, cntl );

	// If C has a zero dimension, return early.
	if ( bli_obj_has_zero_dim( *c ) ) return;

	// If A or B has a zero dimension, scale C by beta and return early.
	// Only the thread for which thread_am_ochief() holds performs the
	// scaling; the barrier call follows it.
	// NOTE(review): the return statement is not visible in this listing.
	if ( bli_obj_has_zero_dim( *a ) ||
	     bli_obj_has_zero_dim( *b ) )
        if( thread_am_ochief( thread ) )
		    bli_scalm( beta, c );
        thread_obarrier( thread );

	// If A or B is marked as being filled with zeros, scale C by beta and
	// return early.
	// NOTE(review): the return statement is not visible in this listing.
	if ( bli_obj_is_zeros( *a ) ||
	     bli_obj_is_zeros( *b ) )
        if( thread_am_ochief( thread ) )
		    bli_scalm( beta, c );
        thread_obarrier( thread );

	// Alias A and B in case we need to update attached scalars.
	bli_obj_alias_to( *a, a_local );
	bli_obj_alias_to( *b, b_local );

	// Alias C in case we need to induce a transposition.
	bli_obj_alias_to( *c, c_local );

	// If we are about to call a leaf-level implementation, and matrix C
	// still needs a transposition, then we must induce one by swapping the
	// strides and dimensions. Note that this transposition would normally
	// be handled explicitly in the packing of C, but if C is not being
	// packed, this is our last chance to handle the transposition.
	// (A chief-only guard around these two calls is commented out below.)
	if ( cntl_is_leaf( cntl ) && bli_obj_has_trans( *c ) )
        //if( thread_am_ochief( thread ) ) {
            bli_obj_induce_trans( c_local );
            bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local );
       // }

	// If alpha is non-unit, typecast and apply it to the scalar attached
	// to B.
	if ( !bli_obj_equals( alpha, &BLIS_ONE ) )
        bli_obj_scalar_apply_scalar( alpha, &b_local );

	// If beta is non-unit, typecast and apply it to the scalar attached
	// to C.
	if ( !bli_obj_equals( beta, &BLIS_ONE ) )
        bli_obj_scalar_apply_scalar( beta, &c_local );

	// Extract the variant number and implementation type.
	n = cntl_var_num( cntl );
	i = cntl_impl_type( cntl );

	// Index into the variant array to extract the correct function pointer.
	f = vars[n][i];

	// Invoke the variant.
	f( &a_local,
       thread );
Пример #13
// Front-end for trsm: checks parameters, handles alpha == 0, reinitializes
// the memory allocator for the context, normalizes the operands, picks the
// control tree for the side, and launches the threaded trsm back-end.
//   side    - side from which A is solved against.
//   alpha   - scalar of the operation.
//   a       - matrix whose transposition is induced when marked.
//   b       - right-hand side; aliased twice, as b_local and as c_local.
//   cntx    - context; passed to the check, the allocator reinit and the
//             thread decorator.
//   l_cntl  - control tree used when side is left.
//   r_cntl  - control tree used otherwise.
// NOTE(review): braces, return statements, #else/#endif directives and some
// call arguments appear to have been stripped from this listing; the code
// tokens are left untouched.
void bli_trsm_front( side_t  side,
                     obj_t*  alpha,
                     obj_t*  a,
                     obj_t*  b,
                     cntx_t* cntx,
                     trsm_t* l_cntl,
                     trsm_t* r_cntl )
	trsm_t* cntl;
	obj_t   a_local;
	obj_t   b_local;
	obj_t   c_local;

	// Check parameters.
	// BLIS_ZERO and b are passed in the beta and C positions of the check.
	if ( bli_error_checking_is_enabled() )
		bli_trsm_check( side, alpha, a, b, &BLIS_ZERO, b, cntx );

	// If alpha is zero, scale B by alpha and return.
	// (trsm has no beta parameter; the original comment said "scale by
	// beta", but the call below uses alpha.)
	// NOTE(review): the return statement is not visible in this listing.
	if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
		bli_scalm( alpha, b );

	// Reinitialize the memory allocator to accommodate the blocksizes
	// in the current context.
	bli_mem_reinit( cntx );

	// Alias A and B so we can tweak the objects if necessary.
	// c_local is a second alias of B.
	bli_obj_alias_to( *a, a_local );
	bli_obj_alias_to( *b, b_local );
	bli_obj_alias_to( *b, c_local );

	// We do not explicitly implement the cases where A is transposed.
	// However, we can still handle them. Specifically, if A is marked as
	// needing a transposition, we simply induce a transposition. This
	// allows us to only explicitly implement the no-transpose cases. Once
	// the transposition is induced, the correct algorithm will be called,
	// since, for example, an algorithm over a transposed lower triangular
	// matrix A moves in the same direction (forwards) as a non-transposed
	// upper triangular matrix. And with the transposition induced, the
	// matrix now appears to be upper triangular, so the upper triangular
	// algorithm will grab the correct partitions, as if it were upper
	// triangular (with no transpose) all along.
	if ( bli_obj_has_trans( a_local ) )
		bli_obj_induce_trans( a_local );
		bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, a_local );

	// NOTE(review): presumably the transpose-all-operands block is the
	// disabled alternative and the swap block is the active path; the
	// directives separating them are not visible in this listing.
#if 0

	// If A is being solved against from the right, transpose all operands
	// so that we can perform the computation as if A were being solved
	// from the left.
	if ( bli_is_right( side ) )
		bli_toggle_side( side );
		bli_obj_induce_trans( a_local );
		bli_obj_induce_trans( b_local );
		bli_obj_induce_trans( c_local );


	// If A is being solved against from the right, swap A and B so that
	// the triangular matrix will actually be on the right.
	if ( bli_is_right( side ) )
		bli_obj_swap( a_local, b_local );


	// Set each alias as the root object.
	// NOTE: We MUST wait until we are done potentially swapping the objects
	// before setting the root fields!
	bli_obj_set_as_root( a_local );
	bli_obj_set_as_root( b_local );
	bli_obj_set_as_root( c_local );

	// Choose the control tree.
	if ( bli_is_left( side ) ) cntl = l_cntl;
	else                       cntl = r_cntl;

    // The thread count is read from the first entry of the trsm thread-info
    // paths, which are created with the result of the right-side test.
    trsm_thrinfo_t** infos = bli_create_trsm_thrinfo_paths( bli_is_right( side ) );
    dim_t n_threads = thread_num_threads( infos[0] );
    // Invoke the internal back-end.
    // NOTE(review): only part of the argument list is visible here.
    bli_level3_thread_decorator( n_threads,   
                                 (l3_int_t) bli_trsm_int, 
                                 (void*) cntx, 
                                 (void*) cntl, 
                                 (void**) infos );

    // Release the thread-info paths created above.
    bli_trsm_thrinfo_free_paths( infos, n_threads );

Пример #14
// Internal trsm back-end: handles empty operands, induces a pending
// transposition of C at leaf level, folds beta and alpha into the scalars
// attached to the local aliases, synchronizes the threads, grows the
// thread-info structure and selects the variant from the control tree.
//   alpha  - applied to the scalar attached to the non-triangular operand
//            when non-unit.
//   beta   - applied to the scalar attached to the alias of C when non-unit;
//            also used to scale C directly on the early-exit path.
//   a, b, c - matrix operands.
//   cntx   - context; passed to the parameter check.
//   rntm   - runtime object; passed to bli_thrinfo_grow().
//   cntl   - control tree node supplying the variant function pointer.
//   thread - thread info; used for the chief test and the barriers.
// NOTE(review): the parentheses of the parameter list, braces, return
// statements and the final variant call are not visible in this listing;
// the code tokens are left untouched.
void bli_trsm_int
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
	obj_t        a_local;
	obj_t        b_local;
	obj_t        c_local;
	trsm_var_oft f;

	// Check parameters.
	// Note that the gemm basic check is what is called here.
	if ( bli_error_checking_is_enabled() )
		bli_gemm_basic_check( alpha, a, b, beta, c, cntx );

	// If C has a zero dimension, return early.
	if ( bli_obj_has_zero_dim( c ) ) return;

	// If A or B has a zero dimension, scale C by beta and return early.
	// Only the thread for which bli_thread_am_ochief() holds performs the
	// scaling; the barrier call follows it.
	// NOTE(review): the return statement is not visible in this listing.
	if ( bli_obj_has_zero_dim( a ) ||
	     bli_obj_has_zero_dim( b ) )
		if ( bli_thread_am_ochief( thread ) )
		    bli_scalm( beta, c );
		bli_thread_obarrier( thread );

	// Alias A and B in case we need to update attached scalars.
	bli_obj_alias_to( a, &a_local );
	bli_obj_alias_to( b, &b_local );

	// Alias C in case we need to induce a transposition.
	bli_obj_alias_to( c, &c_local );

	// If we are about to call a leaf-level implementation, and matrix C
	// still needs a transposition, then we must induce one by swapping the
	// strides and dimensions. Note that this transposition would normally
	// be handled explicitly in the packing of C, but if C is not being
	// packed, this is our last chance to handle the transposition.
	if ( bli_cntl_is_leaf( cntl ) && bli_obj_has_trans( c ) )
		bli_obj_induce_trans( &c_local );
		bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &c_local );

	// If beta is non-unit, apply it to the scalar attached to C.
	if ( !bli_obj_equals( beta, &BLIS_ONE ) )
		bli_obj_scalar_apply_scalar( beta, &c_local );

	// Branch on which operand's root object is triangular (A or B) and
	// attach alpha to the other, non-triangular operand.
	// NOTE(review): the original comment here spoke of setting two bools,
	// but no such variables are assigned in the visible code.
	if ( bli_obj_root_is_triangular( a ) )
		// If alpha is non-unit, typecast and apply it to the scalar
		// attached to B (the non-triangular matrix).
		if ( !bli_obj_equals( alpha, &BLIS_ONE ) )
			bli_obj_scalar_apply_scalar( alpha, &b_local );
	else // if ( bli_obj_root_is_triangular( b ) )
		// If alpha is non-unit, typecast and apply it to the scalar
		// attached to A (the non-triangular matrix).
		if ( !bli_obj_equals( alpha, &BLIS_ONE ) )
            bli_obj_scalar_apply_scalar( alpha, &a_local );

	// FGVZ->TMS: Is this barrier still needed?
	bli_thread_obarrier( thread );

	// Create the next node in the thrinfo_t structure.
	bli_thrinfo_grow( rntm, cntl, thread );

	// Extract the function pointer from the current control tree node.
	f = bli_cntl_var_func( cntl );

	// Invoke the variant.
Пример #15
// bli_her2k_front: front-end for the Hermitian rank-2k update on object
// operands. From the visible code it:
//   - optionally runs bli_her2k_check() on the arguments,
//   - handles alpha == 0 by scaling C by beta and calling bli_setid() with
//     BLIS_ZERO on C,
//   - aliases A, B and C, and builds transposed + conjugate-toggled aliases
//     of B and A (bh_local, ah_local),
//   - optionally transposes the whole operation when the gemm micro-kernel
//     dislikes the storage of C,
//   - records threading information in the context, and
//   - calls bli_setid() with BLIS_ZERO on the C alias at the end.
// NOTE(review): this excerpt contains no braces and no `return`, and the
// back-end invocations are reduced to two argument fragments, so the exact
// grouping of statements under each `if` cannot be confirmed from here.
void bli_her2k_front
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       cntl_t* cntl

	obj_t    alpha_conj;
	obj_t    c_local;
	obj_t    a_local;
	obj_t    bh_local;
	obj_t    b_local;
	obj_t    ah_local;

	// Check parameters.
	if ( bli_error_checking_is_enabled() )
		bli_her2k_check( alpha, a, b, beta, c, cntx );

	// If alpha is zero, scale by beta, zero the imaginary components of
	// the diagonal elements, and return.
	// NOTE(review): no `return` statement is visible after these two calls.
	if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
		bli_scalm( beta, c );
		bli_setid( &BLIS_ZERO, c );

	// Alias A, B, and C in case we need to apply transformations.
	bli_obj_alias_to( a, &a_local );
	bli_obj_alias_to( b, &b_local );
	bli_obj_alias_to( c, &c_local );
	bli_obj_set_as_root( &c_local );

	// For her2k, the first and second right-hand "B" operands are simply B'
	// and A'.
	bli_obj_alias_to( b, &bh_local );
	bli_obj_induce_trans( &bh_local );
	bli_obj_toggle_conj( &bh_local );
	bli_obj_alias_to( a, &ah_local );
	bli_obj_induce_trans( &ah_local );
	bli_obj_toggle_conj( &ah_local );

	// Initialize a conjugated copy of alpha.
	// NOTE(review): only the datatype and destination arguments are visible
	// here; the conjugation flag and source scalar implied by the comment
	// above are not shown in this excerpt.
	bli_obj_scalar_init_detached_copy_of( bli_obj_dt( a ),
	                                      &alpha_conj );

	// An optimization: If C is stored by rows and the micro-kernel prefers
	// contiguous columns, or if C is stored by columns and the micro-kernel
	// prefers contiguous rows, transpose the entire operation to allow the
	// micro-kernel to access elements of C in its preferred manner.
	// The transposition is expressed as: swap each left operand with its
	// right-hand partner, then induce a transpose on all five aliases.
	if ( bli_cntx_l3_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
		bli_obj_swap( &a_local, &bh_local );
		bli_obj_swap( &b_local, &ah_local );

		bli_obj_induce_trans( &a_local );
		bli_obj_induce_trans( &bh_local );
		bli_obj_induce_trans( &b_local );
		bli_obj_induce_trans( &ah_local );

		bli_obj_induce_trans( &c_local );

	// Record the threading for each level within the context.
	// The three trailing arguments are the length and width of the C alias
	// and the width of the A alias.
	bli_cntx_set_thrloop_from_env( BLIS_HER2K, BLIS_LEFT, cntx,
                                   bli_obj_length( &c_local ),
                                   bli_obj_width( &c_local ),
                                   bli_obj_width( &a_local ) );

	// Invoke herk twice, using beta only the first time.

	// Invoke the internal back-end.
	// NOTE(review): only the "operation family id" argument of each of the
	// two invocations survives in this excerpt.
	  BLIS_HERK, // operation family id

	  BLIS_HERK, // operation family id

	// The Hermitian rank-2k product was computed as A*B'+B*A', even for
	// the diagonal elements. Mathematically, the imaginary components of
	// diagonal elements of a Hermitian rank-2k product should always be
	// zero. However, in practice, they sometimes accumulate meaningless
	// non-zero values. To prevent this, we explicitly set those values
	// to zero before returning.
	bli_setid( &BLIS_ZERO, &c_local );
Пример #16
// bli_trmm_int: internal dispatcher for the triangular matrix multiply.
// From the visible code it:
//   - optionally runs bli_trmm_int_check(),
//   - returns early when C has a zero dimension,
//   - scales C by beta when A or B has a zero dimension,
//   - aliases A, B and C into local objects,
//   - induces a transposition on the C alias when cntl is a leaf node and
//     C is marked as transposed,
//   - applies non-unit alpha to B's attached scalar and non-unit beta to
//     C's attached scalar,
//   - derives `side` and `uplo` indices from whichever operand has a
//     triangular root, and
//   - looks up a function pointer in `vars` and calls it.
// NOTE(review): this excerpt contains no braces; `f` and `vars` are used
// but not declared in the visible lines.
void bli_trmm_int( obj_t*  alpha,
                   obj_t*  a,
                   obj_t*  b,
                   obj_t*  beta,
                   obj_t*  c,
                   trmm_t* cntl )
	obj_t     a_local;
	obj_t     b_local;
	obj_t     c_local;
	bool_t    side, uplo;
	varnum_t  n;
	impl_t    i;

	// Check parameters.
	if ( bli_error_checking_is_enabled() )
		bli_trmm_int_check( alpha, a, b, beta, c, cntl );

	// If C has a zero dimension, return early.
	if ( bli_obj_has_zero_dim( *c ) ) return;

	// If A or B has a zero dimension, scale C by beta and return early.
	// NOTE(review): no `return` statement is visible after the scalm call.
	if ( bli_obj_has_zero_dim( *a ) ||
	     bli_obj_has_zero_dim( *b ) )
		bli_scalm( beta, c );

	// Alias A and B in case we need to update attached scalars.
	bli_obj_alias_to( *a, a_local );
	bli_obj_alias_to( *b, b_local );

	// Alias C in case we need to induce a transposition.
	bli_obj_alias_to( *c, c_local );

	// If we are about to call a leaf-level implementation, and matrix C
	// still needs a transposition, then we must induce one by swapping the
	// strides and dimensions. Note that this transposition would normally
	// be handled explicitly in the packing of C, but if C is not being
	// packed, this is our last chance to handle the transposition.
	if ( cntl_is_leaf( cntl ) && bli_obj_has_trans( *c ) )
		bli_obj_induce_trans( c_local );
		bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local );

	// If alpha is non-unit, typecast and apply it to the scalar attached
	// to B.
	if ( !bli_obj_equals( alpha, &BLIS_ONE ) )
		bli_obj_scalar_apply_scalar( alpha, &b_local );

	// If beta is non-unit, typecast and apply it to the scalar attached
	// to C.
	if ( !bli_obj_equals( beta, &BLIS_ONE ) )
		bli_obj_scalar_apply_scalar( beta, &c_local );

	// Set two bools: one based on the implied side parameter (the structure
	// of the root object) and one based on the uplo field of the triangular
	// matrix's root object (whether that is matrix A or matrix B).
	// Encoding used below: side is 0 when A is triangular and 1 otherwise;
	// uplo is 0 for a lower root and 1 otherwise.
	if ( bli_obj_root_is_triangular( *a ) )
		side = 0;
		if ( bli_obj_root_is_lower( *a ) ) uplo = 0;
		else                               uplo = 1;
	else // if ( bli_obj_root_is_triangular( *b ) )
		side = 1;
		// Set a bool based on the uplo field of B's root object.
		if ( bli_obj_root_is_lower( *b ) ) uplo = 0;
		else                               uplo = 1;

	// Extract the variant number and implementation type.
	n = cntl_var_num( cntl );
	i = cntl_impl_type( cntl );

	// Index into the variant array to extract the correct function pointer.
	f = vars[side][uplo][n][i];

	// Invoke the variant.
	// NOTE(review): only the first and last arguments of this call are
	// visible; the b_local and c_local aliases prepared above are
	// presumably passed in lines missing from this excerpt.
	f( &a_local,
	   cntl );
Пример #17
// libblis_test_gemm_experiment: runs one gemm test case for a given
// datatype, parameter combination (pc_str), storage combination (sc_str)
// and problem size index (p_cur).
//
// Outputs:
//   *perf  - performance estimate computed from the minimum time recorded
//            across params->n_repeats runs.
//   *resid - residual written by libblis_test_gemm_check().
//
// NOTE(review): this excerpt contains no braces or `else` lines, so the
// extent of the `if` and `for` bodies below is inferred from indentation
// only.
void libblis_test_gemm_experiment( test_params_t* params,
                                   test_op_t*     op,
                                   iface_t        iface,
                                   num_t          datatype,
                                   char*          pc_str,
                                   char*          sc_str,
                                   unsigned int   p_cur,
                                   double*        perf,
                                   double*        resid )
	unsigned int n_repeats = params->n_repeats;
	unsigned int i;

	// Initial "best time"; replaced via bli_clock_min_diff() in the loop.
	double       time_min  = 1e9;
	double       time;

	dim_t        m, n, k;

	trans_t      transa;
	trans_t      transb;

	obj_t        kappa;
	obj_t        alpha, a, b, beta, c;
	obj_t        c_save;

	// Map the dimension specifier to actual dimensions.
	m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );
	n = libblis_test_get_dim_from_prob_size( op->dim_spec[1], p_cur );
	k = libblis_test_get_dim_from_prob_size( op->dim_spec[2], p_cur );

	// Map parameter characters to BLIS constants.
	bli_param_map_char_to_blis_trans( pc_str[0], &transa );
	bli_param_map_char_to_blis_trans( pc_str[1], &transb );

	// Create test scalars.
	bli_obj_scalar_init_detached( datatype, &kappa );
	bli_obj_scalar_init_detached( datatype, &alpha );
	bli_obj_scalar_init_detached( datatype, &beta );

	// Create test operands (vectors and/or matrices).
	// A is m x k, B is k x n, and C / c_save are m x n.
	libblis_test_mobj_create( params, datatype, transa,
	                          sc_str[0], m, k, &a );
	libblis_test_mobj_create( params, datatype, transb,
	                          sc_str[1], k, n, &b );
	libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE,
	                          sc_str[2], m, n, &c );
	libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE,
	                          sc_str[2], m, n, &c_save );

	// Set alpha and beta.
	// NOTE(review): the first pair has zero imaginary parts and the second
	// pair non-zero ones; presumably these were the real and complex
	// branches of an if/else whose `else` is missing from this excerpt.
	if ( bli_obj_is_real( c ) )
		bli_setsc(  1.2,  0.0, &alpha );
		bli_setsc( -1.0,  0.0, &beta );
		bli_setsc(  1.2,  0.8, &alpha );
		bli_setsc( -1.0,  1.0, &beta );

	// Randomize A, B, and C, and save C.
	bli_randm( &a );
	bli_randm( &b );
	bli_randm( &c );
	bli_copym( &c, &c_save );

	// Normalize by k.
	// Both A and B are scaled by 1/k.
	bli_setsc( 1.0/( double )k, 0.0, &kappa );
	bli_scalm( &kappa, &a );
	bli_scalm( &kappa, &b );

	// Apply the parameters.
	bli_obj_set_conjtrans( transa, a );
	bli_obj_set_conjtrans( transb, b );

	// Repeat the experiment n_repeats times and record results. 
	// C is restored from c_save before each timed call.
	for ( i = 0; i < n_repeats; ++i )
		bli_copym( &c_save, &c );

		time = bli_clock();

		libblis_test_gemm_impl( iface, &alpha, &a, &b, &beta, &c );

		time_min = bli_clock_min_diff( time_min, time );

	// Estimate the performance of the best experiment repeat.
	// Uses 2*m*n*k operations, multiplied by 4 for complex datatypes.
	*perf = ( 2.0 * m * n * k ) / time_min / FLOPS_PER_UNIT_PERF;
	if ( bli_obj_is_complex( c ) ) *perf *= 4.0;

	// Perform checks.
	libblis_test_gemm_check( &alpha, &a, &b, &beta, &c, &c_save, resid );

	// Zero out performance and residual if output matrix is empty.
	libblis_test_check_empty_problem( &c, perf, resid );

	// Free the test objects.
	bli_obj_free( &a );
	bli_obj_free( &b );
	bli_obj_free( &c );
	bli_obj_free( &c_save );
Пример #18
// bli_herk_front: front-end for the Hermitian rank-k update on object
// operands. From the visible code it:
//   - optionally runs bli_herk_check(),
//   - handles alpha == 0 by scaling C by beta and calling bli_setid() with
//     BLIS_ZERO on C,
//   - reinitializes the memory allocator for the given context,
//   - aliases A and C and builds a transposed + conjugate-toggled alias of
//     A (ah_local) to act as the right-hand operand,
//   - optionally adapts the operation when the native gemm micro-kernel
//     dislikes the storage of C,
//   - creates thread-info paths, runs bli_herk_int through the level-3
//     thread decorator, and frees the paths, and
//   - calls bli_setid() with BLIS_ZERO on the C alias at the end.
// NOTE(review): this excerpt contains no braces and no `return`, so the
// grouping of statements under each `if` cannot be confirmed from here.
void bli_herk_front( obj_t*  alpha,
                     obj_t*  a,
                     obj_t*  beta,
                     obj_t*  c,
                     cntx_t* cntx,
                     gemm_t* cntl )
	obj_t   a_local;
	obj_t   ah_local;
	obj_t   c_local;

	// Check parameters.
	if ( bli_error_checking_is_enabled() )
		bli_herk_check( alpha, a, beta, c, cntx );

	// If alpha is zero, scale by beta, zero the imaginary components of
	// the diagonal elements, and return.
	// NOTE(review): no `return` statement is visible after these two calls.
	if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
		bli_scalm( beta, c );
		bli_setid( &BLIS_ZERO, c );

	// Reinitialize the memory allocator to accommodate the blocksizes
	// in the current context.
	bli_mem_reinit( cntx );

	// Alias A and C in case we need to apply transformations.
	bli_obj_alias_to( *a, a_local );
	bli_obj_alias_to( *c, c_local );
	bli_obj_set_as_root( c_local );

	// For herk, the right-hand "B" operand is simply A'.
	bli_obj_alias_to( *a, ah_local );
	bli_obj_induce_trans( ah_local );
	bli_obj_toggle_conj( ah_local );

	// An optimization: If C is stored by rows and the micro-kernel prefers
	// contiguous columns, or if C is stored by columns and the micro-kernel
	// prefers contiguous rows, transpose the entire operation to allow the
	// micro-kernel to access elements of C in its preferred manner.
	// Here that is done by toggling conjugation on both A aliases and
	// inducing a transpose on the C alias only.
	if ( bli_cntx_l3_nat_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
		bli_obj_toggle_conj( a_local );
		bli_obj_toggle_conj( ah_local );

		bli_obj_induce_trans( c_local );

    // Build the thread-info paths and read the thread count from the first.
    thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_HERK, BLIS_LEFT );
    dim_t n_threads = bli_thread_num_threads( infos[0] );

    // Invoke the internal back-end.
    // NOTE(review): the operand arguments (alpha, aliases, beta) are not
    // visible in this call; presumably those lines are missing from the
    // excerpt.
    bli_l3_thread_decorator( n_threads,
                                 (l3_int_t) bli_herk_int, 
                                 (void*) cntx, 
                                 (void*) cntl, 
                                 (void**) infos );

    bli_l3_thrinfo_free_paths( infos, n_threads );

	// The Hermitian rank-k product was computed as A*A', even for the
	// diagonal elements. Mathematically, the imaginary components of
	// diagonal elements of a Hermitian rank-k product should always be
	// zero. However, in practice, they sometimes accumulate meaningless
	// non-zero values. To prevent this, we explicitly set those values
	// to zero before returning.
	bli_setid( &BLIS_ZERO, &c_local );

Пример #19
// libblis_test_trmm3_experiment: runs one trmm3 test case for a given
// datatype, parameter combination (pc_str), storage combination (sc_str)
// and problem size index (p_cur).
//
// pc_str characters, in order: side, uplo of A, trans of A, diag of A,
// trans of B.
//
// Outputs:
//   *perf  - performance estimate computed from the minimum time recorded
//            across params->n_repeats runs.
//   *resid - residual written by libblis_test_trmm3_check().
//
// NOTE(review): this excerpt contains no braces or `else` lines, so the
// extent of the `if` and `for` bodies below is inferred from indentation
// only.
void libblis_test_trmm3_experiment( test_params_t* params,
                                    test_op_t*     op,
                                    mt_impl_t      impl,
                                    num_t          datatype,
                                    char*          pc_str,
                                    char*          sc_str,
                                    unsigned int   p_cur,
                                    double*        perf,
                                    double*        resid )
	unsigned int n_repeats = params->n_repeats;
	unsigned int i;

	// Initial "best time"; replaced via bli_clock_min_diff() in the loop.
	double       time_min  = 1e9;
	double       time;

	dim_t        m, n;
	dim_t        mn_side;

	side_t       side;
	uplo_t       uploa;
	trans_t      transa;
	diag_t       diaga;
	trans_t      transb;

	obj_t        kappa;
	obj_t        alpha, a, b, beta, c;
	obj_t        c_save;

	// Map the dimension specifier to actual dimensions.
	m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );
	n = libblis_test_get_dim_from_prob_size( op->dim_spec[1], p_cur );

	// Map parameter characters to BLIS constants.
	bli_param_map_char_to_blis_side( pc_str[0], &side );
	bli_param_map_char_to_blis_uplo( pc_str[1], &uploa );
	bli_param_map_char_to_blis_trans( pc_str[2], &transa );
	bli_param_map_char_to_blis_diag( pc_str[3], &diaga );
	bli_param_map_char_to_blis_trans( pc_str[4], &transb );

	// Create test scalars.
	bli_obj_scalar_init_detached( datatype, &kappa );
	bli_obj_scalar_init_detached( datatype, &alpha );
	bli_obj_scalar_init_detached( datatype, &beta );

	// Create test operands (vectors and/or matrices).
	// A is square (mn_side x mn_side); B, C and c_save are m x n.
	// NOTE(review): mn_side is presumably m or n depending on `side` --
	// confirm against bli_set_dim_with_side.
	bli_set_dim_with_side( side, m, n, mn_side );
	libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE,
	                          sc_str[0], mn_side, mn_side, &a );
	libblis_test_mobj_create( params, datatype, transb,
	                          sc_str[1], m,       n,       &b );
	libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE,
	                          sc_str[2], m,       n,       &c );
	libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE,
	                          sc_str[2], m,       n,       &c_save );

	// Set alpha and beta.
	// NOTE(review): the first pair has zero imaginary parts and the second
	// pair non-zero ones; presumably these were the real and complex
	// branches of an if/else whose `else` is missing from this excerpt.
	if ( bli_obj_is_real( c ) )
		bli_setsc(  0.8,  0.0, &alpha );
		bli_setsc( -1.0,  0.0, &beta );
		bli_setsc(  0.8,  0.6, &alpha );
		bli_setsc( -1.0,  0.5, &beta );

	// Set the structure and uplo properties of A.
	bli_obj_set_struc( BLIS_TRIANGULAR, a );
	bli_obj_set_uplo( uploa, a );

	// Randomize A, make it densely triangular.
	bli_randm( &a );
	bli_mktrim( &a );

	// Randomize B and C, and save C.
	bli_randm( &b );
	bli_randm( &c );
	bli_copym( &c, &c_save );

	// Normalize by m.
	// Only B is scaled by 1/m.
	bli_setsc( 1.0/( double )m, 0.0, &kappa );
	bli_scalm( &kappa, &b );

	// Apply the remaining parameters.
	bli_obj_set_conjtrans( transa, a );
	bli_obj_set_diag( diaga, a );
	bli_obj_set_conjtrans( transb, b );

	// Repeat the experiment n_repeats times and record results. 
	// C is restored from c_save before each timed call.
	for ( i = 0; i < n_repeats; ++i )
		bli_copym( &c_save, &c );

		time = bli_clock();

		libblis_test_trmm3_impl( impl, side, &alpha, &a, &b, &beta, &c );

		time_min = bli_clock_min_diff( time_min, time );

	// Estimate the performance of the best experiment repeat.
	// Uses mn_side*m*n operations, multiplied by 4 for complex datatypes.
	*perf = ( 1.0 * mn_side * m * n ) / time_min / FLOPS_PER_UNIT_PERF;
	if ( bli_obj_is_complex( c ) ) *perf *= 4.0;

	// Perform checks.
	libblis_test_trmm3_check( side, &alpha, &a, &b, &beta, &c, &c_save, resid );

	// Zero out performance and residual if output matrix is empty.
	libblis_test_check_empty_problem( &c, perf, resid );

	// Free the test objects.
	bli_obj_free( &a );
	bli_obj_free( &b );
	bli_obj_free( &c );
	bli_obj_free( &c_save );
Пример #20
// bli_symm_front (gemm_t control-tree version): front-end for the
// symmetric matrix multiply on object operands. From the visible code it:
//   - optionally runs bli_symm_check(),
//   - scales C by beta when alpha equals BLIS_ZERO,
//   - aliases A, B and C,
//   - optionally transposes the operation (toggle side, transpose the B
//     and C aliases) when the storage of C does not match the gemm
//     micro-kernel's preference,
//   - swaps the A and B aliases when the side is right, and
//   - creates gemm thread-info paths, runs bli_gemm_int through the
//     level-3 thread decorator, and frees the paths.
// NOTE(review): this excerpt contains no braces and no `return`, and the
// closing parenthesis of the storage-preference `if (` is not visible, so
// statement grouping cannot be confirmed from here.
void bli_symm_front( side_t  side,
                     obj_t*  alpha,
                     obj_t*  a,
                     obj_t*  b,
                     obj_t*  beta,
                     obj_t*  c,
                     gemm_t* cntl )
	obj_t   a_local;
	obj_t   b_local;
	obj_t   c_local;

	// Check parameters.
	if ( bli_error_checking_is_enabled() )
		bli_symm_check( side, alpha, a, b, beta, c );

	// If alpha is zero, scale by beta and return.
	// NOTE(review): no `return` statement is visible after the scalm call.
	if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
		bli_scalm( beta, c );

	// Alias A, B, and C in case we need to apply transformations.
	bli_obj_alias_to( *a, a_local );
	bli_obj_alias_to( *b, b_local );
	bli_obj_alias_to( *c, c_local );

	// An optimization: If C is stored by rows and the micro-kernel prefers
	// contiguous columns, or if C is stored by columns and the micro-kernel
	// prefers contiguous rows, transpose the entire operation to allow the
	// micro-kernel to access elements of C in its preferred manner.
	// Only the B and C aliases are transposed here; the A alias is left
	// as-is.
	if (
	     ( bli_obj_is_row_stored( c_local ) &&
	       bli_func_prefers_contig_cols( bli_obj_datatype( c_local ),
	                                     cntl_gemm_ukrs( cntl ) ) ) ||
	     ( bli_obj_is_col_stored( c_local ) &&
	       bli_func_prefers_contig_rows( bli_obj_datatype( c_local ),
	                                     cntl_gemm_ukrs( cntl ) ) )
		bli_toggle_side( side );
		bli_obj_induce_trans( b_local );
		bli_obj_induce_trans( c_local );

	// Swap A and B if multiplying A from the right so that "B" contains
	// the symmetric matrix.
	if ( bli_is_right( side ) )
		bli_obj_swap( a_local, b_local );

    // Build the thread-info paths and read the thread count from the first.
    gemm_thrinfo_t** infos = bli_create_gemm_thrinfo_paths();
    dim_t n_threads = thread_num_threads( infos[0] );
    // Invoke the internal back-end.
    // NOTE(review): the operand arguments (alpha, aliases, beta) are not
    // visible in this call; presumably those lines are missing from the
    // excerpt.
    bli_level3_thread_decorator( n_threads,   
                                 (level3_int_t) bli_gemm_int, 
                                 (void*) cntl, 
                                 (void**) infos );

     bli_gemm_thrinfo_free_paths( infos, n_threads );
Пример #21
// bli_symm_front (context-based version): front-end for the symmetric
// matrix multiply on object operands. From the visible code it:
//   - optionally runs bli_symm_check(),
//   - scales C by beta when alpha equals BLIS_ZERO,
//   - reinitializes the memory system for the given context,
//   - aliases A, B and C,
//   - optionally transposes the operation (toggle side, transpose the B
//     and C aliases) when the gemm micro-kernel dislikes the storage of C,
//   - swaps the A and B aliases when the side is right, and
//   - sets the operation family and threading information in the context.
// NOTE(review): this excerpt contains no braces and no `return`, and the
// back-end invocation announced by the final comment is not visible.
void bli_symm_front
       side_t  side,
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       cntl_t* cntl
	obj_t   a_local;
	obj_t   b_local;
	obj_t   c_local;

	// Check parameters.
	if ( bli_error_checking_is_enabled() )
		bli_symm_check( side, alpha, a, b, beta, c, cntx );

	// If alpha is zero, scale by beta and return.
	// NOTE(review): no `return` statement is visible after the scalm call.
	if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
		bli_scalm( beta, c );

	// Reinitialize the memory allocator to accommodate the blocksizes
	// in the current context.
	bli_memsys_reinit( cntx );

	// Alias A, B, and C in case we need to apply transformations.
	bli_obj_alias_to( *a, a_local );
	bli_obj_alias_to( *b, b_local );
	bli_obj_alias_to( *c, c_local );

	// An optimization: If C is stored by rows and the micro-kernel prefers
	// contiguous columns, or if C is stored by columns and the micro-kernel
	// prefers contiguous rows, transpose the entire operation to allow the
	// micro-kernel to access elements of C in its preferred manner.
	// Only the B and C aliases are transposed here; the A alias is left
	// as-is.
	if ( bli_cntx_l3_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
		bli_toggle_side( side );
		bli_obj_induce_trans( b_local );
		bli_obj_induce_trans( c_local );

	// Swap A and B if multiplying A from the right so that "B" contains
	// the symmetric matrix.
	if ( bli_is_right( side ) )
		bli_obj_swap( a_local, b_local );

	// Set the operation family id in the context.
	// The family is set to BLIS_GEMM, not a symm-specific id.
	bli_cntx_set_family( BLIS_GEMM, cntx );

	// Record the threading for each level within the context.
	bli_cntx_set_thrloop_from_env( BLIS_SYMM, BLIS_LEFT, cntx );

	// Invoke the internal back-end.
Пример #22
// bli_gemm_front: front-end for the general matrix multiply on object
// operands. It first calls bli_gemm_small_matrix() with all arguments and
// tests whether the returned status differs from BLIS_SUCCESS. The
// remaining visible code:
//   - optionally runs bli_gemm_check(),
//   - scales C by beta when alpha equals BLIS_ZERO,
//   - reinitializes the memory system for the given context,
//   - aliases A, B and C,
//   - optionally transposes the operation (swap A and B, transpose all
//     three aliases) when the gemm micro-kernel dislikes the storage of C,
//   - sets the operation family and threading information in the context.
// NOTE(review): this excerpt contains no braces and no `return`; judging
// by indentation, everything after the status test forms its body, but
// that cannot be confirmed from here. The back-end invocation announced by
// the final comment is not visible.
void bli_gemm_front
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       cntl_t* cntl
    gint_t status = bli_gemm_small_matrix(alpha, a, b, beta, c, cntx, cntl);
    if(BLIS_SUCCESS != status)
	    obj_t   a_local;
	    obj_t   b_local;
	    obj_t   c_local;

	    // Check parameters.
	    if ( bli_error_checking_is_enabled() )
		    bli_gemm_check( alpha, a, b, beta, c, cntx );

	    // If alpha is zero, scale by beta and return.
	    // NOTE(review): no `return` statement is visible after the scalm
	    // call.
	    if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
		    bli_scalm( beta, c );

	    // Reinitialize the memory allocator to accommodate the blocksizes
	    // in the current context.
	    bli_memsys_reinit( cntx );

	    // Alias A, B, and C in case we need to apply transformations.
	    bli_obj_alias_to( *a, a_local );
	    bli_obj_alias_to( *b, b_local );
	    bli_obj_alias_to( *c, c_local );

	    // An optimization: If C is stored by rows and the micro-kernel prefers
	    // contiguous columns, or if C is stored by columns and the micro-kernel
	    // prefers contiguous rows, transpose the entire operation to allow the
	    // micro-kernel to access elements of C in its preferred manner.
	    if ( bli_cntx_l3_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
		    bli_obj_swap( a_local, b_local );

		    bli_obj_induce_trans( a_local );
		    bli_obj_induce_trans( b_local );
		    bli_obj_induce_trans( c_local );

	    // Set the operation family id in the context.
	    bli_cntx_set_family( BLIS_GEMM, cntx );

	    // Record the threading for each level within the context.
	    // The three trailing arguments are the length and width of the C
	    // alias and the width of the A alias.
	    bli_cntx_set_thrloop_from_env( BLIS_GEMM, BLIS_LEFT, cntx,
                                       bli_obj_length( c_local ),
                                       bli_obj_width( c_local ),
                                       bli_obj_width( a_local ) );

	    // Invoke the internal back-end via the thread handler.
Пример #23
// bli_her2k_front (gemm_t control-tree version): front-end for the
// Hermitian rank-2k update on object operands. From the visible code it:
//   - optionally runs bli_her2k_check(),
//   - handles alpha == 0 by scaling C by beta and calling bli_setid() with
//     BLIS_ZERO on C,
//   - aliases A, B and C, and builds transposed + conjugate-toggled aliases
//     of B and A (bh_local, ah_local),
//   - optionally transposes the whole operation when the storage of C does
//     not match the gemm micro-kernel's preference,
//   - creates herk thread-info paths, runs bli_herk_int twice through the
//     level-3 thread decorator, and frees the paths, and
//   - calls bli_setid() with BLIS_ZERO on the C alias at the end.
// NOTE(review): this excerpt contains no braces and no `return`; the
// `#if 0` below has no visible `#else`/`#endif`, and the closing
// parenthesis of the storage-preference `if (` is not visible, so
// statement grouping cannot be confirmed from here.
void bli_her2k_front( obj_t*  alpha,
                      obj_t*  a,
                      obj_t*  b,
                      obj_t*  beta,
                      obj_t*  c,
                      gemm_t* cntl )
	obj_t    alpha_conj;
	obj_t    c_local;
	obj_t    a_local;
	obj_t    bh_local;
	obj_t    b_local;
	obj_t    ah_local;

	// Check parameters.
	if ( bli_error_checking_is_enabled() )
		bli_her2k_check( alpha, a, b, beta, c );

	// If alpha is zero, scale by beta, zero the imaginary components of
	// the diagonal elements, and return.
	// NOTE(review): no `return` statement is visible after these two calls.
	if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
		bli_scalm( beta, c );
		bli_setid( &BLIS_ZERO, c );

	// Alias A, B, and C in case we need to apply transformations.
	bli_obj_alias_to( *a, a_local );
	bli_obj_alias_to( *b, b_local );
	bli_obj_alias_to( *c, c_local );
	bli_obj_set_as_root( c_local );

	// For her2k, the first and second right-hand "B" operands are simply B'
	// and A'.
	bli_obj_alias_to( *b, bh_local );
	bli_obj_induce_trans( bh_local );
	bli_obj_toggle_conj( bh_local );
	bli_obj_alias_to( *a, ah_local );
	bli_obj_induce_trans( ah_local );
	bli_obj_toggle_conj( ah_local );

	// Initialize a conjugated copy of alpha.
	// NOTE(review): only the datatype and destination arguments are visible
	// here; the conjugation flag and source scalar implied by the comment
	// above are not shown in this excerpt.
	bli_obj_scalar_init_detached_copy_of( bli_obj_datatype( *a ),
	                                      &alpha_conj );

	// An optimization: If C is stored by rows and the micro-kernel prefers
	// contiguous columns, or if C is stored by columns and the micro-kernel
	// prefers contiguous rows, transpose the entire operation to allow the
	// micro-kernel to access elements of C in its preferred manner.
	// The transposition is expressed as: swap each left operand with its
	// right-hand partner, then induce a transpose on all five aliases.
	if (
	     ( bli_obj_is_row_stored( c_local ) &&
	       bli_func_prefers_contig_cols( bli_obj_datatype( c_local ),
	                                     bli_gemm_cntl_ukrs( cntl ) ) ) ||
	     ( bli_obj_is_col_stored( c_local ) &&
	       bli_func_prefers_contig_rows( bli_obj_datatype( c_local ),
	                                     bli_gemm_cntl_ukrs( cntl ) ) )
		bli_obj_swap( a_local, bh_local );
		bli_obj_swap( b_local, ah_local );

		bli_obj_induce_trans( a_local );
		bli_obj_induce_trans( bh_local );
		bli_obj_induce_trans( b_local );
		bli_obj_induce_trans( ah_local );

		bli_obj_induce_trans( c_local );

#if 0
	// Invoke the internal back-end.
	// NOTE(review): presumably a disabled direct call to bli_her2k_int;
	// only its first and last arguments are visible.
	bli_her2k_int( alpha,
	               cntl );

	// Invoke herk twice, using beta only the first time.
    herk_thrinfo_t** infos = bli_create_herk_thrinfo_paths();
    dim_t n_threads = thread_num_threads( infos[0] );

    // Invoke the internal back-end.
    // NOTE(review): the operand arguments of both decorator calls are not
    // visible, so the two calls look identical in this excerpt.
    bli_level3_thread_decorator( n_threads,   
                                 (level3_int_t) bli_herk_int, 
                                 (void*) cntl, 
                                 (void**) infos );

    bli_level3_thread_decorator( n_threads,   
                                 (level3_int_t) bli_herk_int, 
                                 (void*) cntl, 
                                 (void**) infos );

    bli_herk_thrinfo_free_paths( infos, n_threads );


	// The Hermitian rank-2k product was computed as A*B'+B*A', even for
	// the diagonal elements. Mathematically, the imaginary components of
	// diagonal elements of a Hermitian rank-2k product should always be
	// zero. However, in practice, they sometimes accumulate meaningless
	// non-zero values. To prevent this, we explicitly set those values
	// to zero before returning.
	bli_setid( &BLIS_ZERO, &c_local );

Пример #24
// bli_trmm3_front: front-end for the trmm3 operation (triangular matrix
// multiply with a separate output matrix C) on object operands. From the
// visible code it:
//   - optionally runs bli_trmm3_check(),
//   - scales C by beta when alpha equals BLIS_ZERO,
//   - aliases A, B and C,
//   - induces a transposition on the A alias when it is marked transposed,
//   - optionally transposes the whole operation when the storage of C does
//     not match the gemm micro-kernel's preference,
//   - swaps the A and B aliases when the side is right,
//   - marks each alias as a root object, and
//   - creates trmm thread-info paths, runs bli_trmm_int through the
//     level-3 thread decorator, and frees the paths.
// NOTE(review): this excerpt contains no braces and no `return`; the
// `#if 0` below has no visible `#else`/`#endif`, and the closing
// parenthesis of the storage-preference `if (` is not visible, so which
// statements are actually compiled cannot be confirmed from here.
void bli_trmm3_front( side_t  side,
                      obj_t*  alpha,
                      obj_t*  a,
                      obj_t*  b,
                      obj_t*  beta,
                      obj_t*  c,
                      gemm_t* cntl )
	obj_t   a_local;
	obj_t   b_local;
	obj_t   c_local;

	// Check parameters.
	if ( bli_error_checking_is_enabled() )
		bli_trmm3_check( side, alpha, a, b, beta, c );

	// If alpha is zero, scale by beta and return.
	// NOTE(review): no `return` statement is visible after the scalm call.
	if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
		bli_scalm( beta, c );

	// Alias A, B, and C so we can tweak the objects if necessary.
	bli_obj_alias_to( *a, a_local );
	bli_obj_alias_to( *b, b_local );
	bli_obj_alias_to( *c, c_local );

	// We do not explicitly implement the cases where A is transposed.
	// However, we can still handle them. Specifically, if A is marked as
	// needing a transposition, we simply induce a transposition. This
	// allows us to only explicitly implement the no-transpose cases. Once
	// the transposition is induced, the correct algorithm will be called,
	// since, for example, an algorithm over a transposed lower triangular
	// matrix A moves in the same direction (forwards) as a non-transposed
	// upper triangular matrix. And with the transposition induced, the
	// matrix now appears to be upper triangular, so the upper triangular
	// algorithm will grab the correct partitions, as if it were upper
	// triangular (with no transpose) all along.
	if ( bli_obj_has_trans( a_local ) )
		bli_obj_induce_trans( a_local );
		bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, a_local );

#if 0

	// If A is being multiplied from the right, transpose all operands
	// so that we can perform the computation as if A were being multiplied
	// from the left.
	if ( bli_is_right( side ) )
		bli_toggle_side( side );
		bli_obj_induce_trans( a_local );
		bli_obj_induce_trans( b_local );
		bli_obj_induce_trans( c_local );


	// An optimization: If C is stored by rows and the micro-kernel prefers
	// contiguous columns, or if C is stored by columns and the micro-kernel
	// prefers contiguous rows, transpose the entire operation to allow the
	// micro-kernel to access elements of C in its preferred manner.
	if (
	     ( bli_obj_is_row_stored( c_local ) &&
	       bli_func_prefers_contig_cols( bli_obj_datatype( c_local ),
	                                     bli_gemm_cntl_ukrs( cntl ) ) ) ||
	     ( bli_obj_is_col_stored( c_local ) &&
	       bli_func_prefers_contig_rows( bli_obj_datatype( c_local ),
	                                     bli_gemm_cntl_ukrs( cntl ) ) )
		bli_toggle_side( side );
		bli_obj_induce_trans( a_local );
		bli_obj_induce_trans( b_local );
		bli_obj_induce_trans( c_local );

	// If A is being multiplied from the right, swap A and B so that
	// the matrix will actually be on the right.
	if ( bli_is_right( side ) )
		bli_obj_swap( a_local, b_local );


	// Set each alias as the root object.
	// NOTE: We MUST wait until we are done potentially swapping the objects
	// before setting the root fields!
	bli_obj_set_as_root( a_local );
	bli_obj_set_as_root( b_local );
	bli_obj_set_as_root( c_local );

	// Notice that, unlike trmm_r, there is no dependency in the jc loop
	// for trmm3_r, so we can pass in FALSE for jc_dependency.
	trmm_thrinfo_t** infos = bli_create_trmm_thrinfo_paths( FALSE );
    dim_t n_threads = thread_num_threads( infos[0] );

    // Invoke the internal back-end.
    // NOTE(review): the operand arguments (alpha, aliases, beta) are not
    // visible in this call; presumably those lines are missing from the
    // excerpt.
    bli_level3_thread_decorator( n_threads,   
                                 (level3_int_t) bli_trmm_int, 
                                 (void*) cntl, 
                                 (void**) infos );

    bli_trmm_thrinfo_free_paths( infos, n_threads );

Пример #25
// libblis_test_gemmtrsm_ukr_experiment: runs one test case of the fused
// gemm+trsm micro-kernel for a given datatype, parameter combination
// (pc_str), storage combination (sc_str) and problem size index (p_cur).
//
// m and n are fixed to the gemm_mr / gemm_nr blocksizes for the datatype
// and stored in op->dim_aux[0..1]; only k comes from the problem size.
//
// Outputs:
//   *perf  - performance estimate computed from the minimum time recorded
//            across params->n_repeats runs.
//   *resid - residual written by libblis_test_gemmtrsm_ukr_check().
//
// NOTE(review): this excerpt contains no braces or `else` lines, so the
// extent of the `if` and `for` bodies below is inferred from indentation
// only.
void libblis_test_gemmtrsm_ukr_experiment( test_params_t* params,
                                           test_op_t*     op,
                                           mt_impl_t      impl,
                                           num_t          datatype,
                                           char*          pc_str,
                                           char*          sc_str,
                                           unsigned int   p_cur,
                                           double*        perf,
                                           double*        resid )
	unsigned int n_repeats = params->n_repeats;
	unsigned int i;

	// Initial "best time"; replaced via bli_clock_min_diff() in the loop.
	double       time_min  = 1e9;
	double       time;

	dim_t        m, n, k;

	// Storage characters passed to libblis_test_mobj_create for A_big and B.
	char         sc_a = 'c';
	char         sc_b = 'r';

	side_t       side = BLIS_LEFT;
	uplo_t       uploa;

	obj_t        kappa;
	obj_t        alpha;
	obj_t        a_big, a, b;
	obj_t        b11, c11;
	obj_t        ap, bp;
	obj_t        a1xp, a11p, bx1p, b11p;
	obj_t        c11_save;

	// Map the dimension specifier to actual dimensions.
	k = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );

	// Fix m and n to MR and NR, respectively.
	m = bli_blksz_for_type( datatype, gemm_mr );
	n = bli_blksz_for_type( datatype, gemm_nr );

	// Store the register blocksizes so that the driver can retrieve the
	// values later when printing results.
	op->dim_aux[0] = m;
	op->dim_aux[1] = n;

	// Map parameter characters to BLIS constants.
	bli_param_map_char_to_blis_uplo( pc_str[0], &uploa );

	// Create test scalars.
	bli_obj_scalar_init_detached( datatype, &kappa );
	bli_obj_scalar_init_detached( datatype, &alpha );

	// Create test operands (vectors and/or matrices).
	// A_big is (k+m) x (k+m), B is (k+m) x n, and C11 / c11_save are m x n.
	libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE,
	                          sc_a,      k+m, k+m, &a_big );
	libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE,
	                          sc_b,      k+m, n,   &b );
	libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE,
	                          sc_str[0], m,   n,   &c11 );
	libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE,
	                          sc_str[0], m,   n,   &c11_save );

	// Set alpha.
	// Both visible calls set alpha to (2.0, 0.0).
	// NOTE(review): presumably these were the real and complex branches of
	// an if/else whose `else` is missing from this excerpt.
	if ( bli_obj_is_real( b ) )
		bli_setsc(  2.0,  0.0, &alpha );
		bli_setsc(  2.0,  0.0, &alpha );

	// Set the structure, uplo, and diagonal offset properties of A.
	// NOTE(review): only the structure and uplo are set in the visible
	// lines; no diagonal-offset call appears here.
	bli_obj_set_struc( BLIS_TRIANGULAR, a_big );
	bli_obj_set_uplo( uploa, a_big );

	// Randomize A and make it densely triangular.
	bli_randm( &a_big );

	// Normalize B and save.
	// B is randomized and then scaled by 1/m.
	bli_randm( &b );
	bli_setsc( 1.0/( double )m, 0.0, &kappa );
	bli_scalm( &kappa, &b );

	// Use the last m rows of A_big as A.
	bli_acquire_mpart_t2b( BLIS_SUBPART1, k, m, &a_big, &a );

	// Locate the B11 block of B, copy to C11, and save.
	// NOTE(review): the two acquire calls differ only in their offset
	// (k vs 0); presumably they were the lower and upper branches of an
	// if/else whose `else` is missing from this excerpt.
	if ( bli_obj_is_lower( a ) ) 
		bli_acquire_mpart_t2b( BLIS_SUBPART1, k, m, &b, &b11 );
		bli_acquire_mpart_t2b( BLIS_SUBPART1, 0, m, &b, &b11 );
	bli_copym( &b11, &c11 );
	bli_copym( &c11, &c11_save );

	// Initialize pack objects.
	bli_obj_init_pack( &ap );
	bli_obj_init_pack( &bp );

	// Create pack objects for a and b.
	// NOTE(review): both calls show only `gemm_mr` plus the source and
	// destination objects; further arguments are presumably missing from
	// this excerpt.
	libblis_test_pobj_create( gemm_mr,
	                          &a, &ap );
	libblis_test_pobj_create( gemm_mr,
	                          &b, &bp );

	// Pack the contents of a to ap.
	bli_packm_blk_var3( &a, &ap );

	// Pack the contents of b to bp.
	bli_packm_blk_var2( &b, &bp );

	// Create subpartitions from the a and b panels.
	bli_gemmtrsm_ukr_make_subparts( k, &ap, &bp,
	                                &a1xp, &a11p, &bx1p, &b11p );

	// Repeat the experiment n_repeats times and record results. 
	// C11 is restored and B is re-packed before each timed call.
	for ( i = 0; i < n_repeats; ++i )
		bli_copym( &c11_save, &c11 );

		// Re-pack the contents of b to bp.
		bli_packm_blk_var2( &b, &bp );

		time = bli_clock();

		libblis_test_gemmtrsm_ukr_impl( impl, side, &alpha,
		                                &a1xp, &a11p, &bx1p, &b11p, &c11 );

		time_min = bli_clock_min_diff( time_min, time );

	// Estimate the performance of the best experiment repeat.
	// Uses 2*m*n*k + m*m*n operations, multiplied by 4 for complex
	// datatypes.
	*perf = ( 2.0 * m * n * k + 1.0 * m * m * n ) / time_min / FLOPS_PER_UNIT_PERF;
	if ( bli_obj_is_complex( b ) ) *perf *= 4.0;

	// Perform checks.
	libblis_test_gemmtrsm_ukr_check( side, &alpha,
	                                 &a1xp, &a11p, &bx1p, &b11p, &c11, &c11_save, resid );

	// Zero out performance and residual if output matrix is empty.
	//libblis_test_check_empty_problem( &c11, perf, resid );

	// Release packing buffers within pack objects.
	bli_obj_release_pack( &ap );
	bli_obj_release_pack( &bp );

	// Free the test objects.
	// `a` and `b11` are sub-partitions acquired above and are not freed
	// separately here.
	bli_obj_free( &a_big );
	bli_obj_free( &b );
	bli_obj_free( &c11 );
	bli_obj_free( &c11_save );
Пример #26
void bli_syr2k_front
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       cntl_t* cntl

	obj_t    c_local;
	obj_t    a_local;
	obj_t    bt_local;
	obj_t    b_local;
	obj_t    at_local;

	// Check parameters.
	if ( bli_error_checking_is_enabled() )
		bli_syr2k_check( alpha, a, b, beta, c, cntx );

	// If alpha is zero, scale by beta and return.
	if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
		bli_scalm( beta, c );

	// Alias A, B, and C in case we need to apply transformations.
	bli_obj_alias_to( a, &a_local );
	bli_obj_alias_to( b, &b_local );
	bli_obj_alias_to( c, &c_local );
	bli_obj_set_as_root( &c_local );

	// For syr2k, the first and second right-hand "B" operands are simply B'
	// and A'.
	bli_obj_alias_to( b, &bt_local );
	bli_obj_induce_trans( &bt_local );
	bli_obj_alias_to( a, &at_local );
	bli_obj_induce_trans( &at_local );

	// An optimization: If C is stored by rows and the micro-kernel prefers
	// contiguous columns, or if C is stored by columns and the micro-kernel
	// prefers contiguous rows, transpose the entire operation to allow the
	// micro-kernel to access elements of C in its preferred manner.
	if ( bli_cntx_l3_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
		bli_obj_induce_trans( &c_local );

	// Record the threading for each level within the context.
	bli_cntx_set_thrloop_from_env( BLIS_SYR2K, BLIS_LEFT, cntx,
                                   bli_obj_length( &c_local ),
                                   bli_obj_width( &c_local ),
                                   bli_obj_width( &a_local ) );

	// Invoke herk twice, using beta only the first time.

	// Invoke the internal back-end.
	  BLIS_HERK, // operation family id

	  BLIS_HERK, // operation family id