Exemplo n.º 1
0
void bli_gemm_front
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       cntl_t* cntl
     )
{
#ifdef BLIS_SMALL_MATRIX_ENABLE
#ifndef BLIS_ENABLE_MULTITHREADING
    gint_t status = bli_gemm_small_matrix(alpha, a, b, beta, c, cntx, cntl);
    if(BLIS_SUCCESS != status)
#endif
#endif
    {
	    obj_t   a_local;
	    obj_t   b_local;
	    obj_t   c_local;

	    // Check parameters.
	    if ( bli_error_checking_is_enabled() )
		    bli_gemm_check( alpha, a, b, beta, c, cntx );

	    // If alpha is zero, scale by beta and return.
	    if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
	    {
		    bli_scalm( beta, c );
		    return;
	    }

	    // Reinitialize the memory allocator to accommodate the blocksizes
	    // in the current context.
	    bli_memsys_reinit( cntx );

	    // Alias A, B, and C in case we need to apply transformations.
	    bli_obj_alias_to( *a, a_local );
	    bli_obj_alias_to( *b, b_local );
	    bli_obj_alias_to( *c, c_local );

	    // An optimization: If C is stored by rows and the micro-kernel prefers
	    // contiguous columns, or if C is stored by columns and the micro-kernel
	    // prefers contiguous rows, transpose the entire operation to allow the
	    // micro-kernel to access elements of C in its preferred manner.
	    if ( bli_cntx_l3_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
	    {
		    bli_obj_swap( a_local, b_local );

		    bli_obj_induce_trans( a_local );
		    bli_obj_induce_trans( b_local );
		    bli_obj_induce_trans( c_local );
	    }

	    // Set the operation family id in the context.
	    bli_cntx_set_family( BLIS_GEMM, cntx );

	    // Record the threading for each level within the context.
	    bli_cntx_set_thrloop_from_env( BLIS_GEMM, BLIS_LEFT, cntx,
                                       bli_obj_length( c_local ),
                                       bli_obj_width( c_local ),
                                       bli_obj_width( a_local ) );

	    // Invoke the internal back-end via the thread handler.
	    bli_l3_thread_decorator
	    (
	      bli_gemm_int,
	      alpha,
	      &a_local,
	      &b_local,
	      beta,
	      &c_local,
	      cntx,
	      cntl
	    );
    }
}
Exemplo n.º 2
0
void bli_her2k_front
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       cntl_t* cntl
     )
{
	bli_init_once();

	obj_t    alpha_conj;
	obj_t    c_local;
	obj_t    a_local;
	obj_t    bh_local;
	obj_t    b_local;
	obj_t    ah_local;

	// Check parameters.
	if ( bli_error_checking_is_enabled() )
		bli_her2k_check( alpha, a, b, beta, c, cntx );

	// If alpha is zero, scale by beta, zero the imaginary components of
	// the diagonal elements, and return.
	if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
	{
		bli_scalm( beta, c );
		bli_setid( &BLIS_ZERO, c );
		return;
	}

	// Alias A, B, and C in case we need to apply transformations.
	bli_obj_alias_to( a, &a_local );
	bli_obj_alias_to( b, &b_local );
	bli_obj_alias_to( c, &c_local );
	bli_obj_set_as_root( &c_local );

	// For her2k, the first and second right-hand "B" operands are simply B'
	// and A'.
	bli_obj_alias_to( b, &bh_local );
	bli_obj_induce_trans( &bh_local );
	bli_obj_toggle_conj( &bh_local );
	bli_obj_alias_to( a, &ah_local );
	bli_obj_induce_trans( &ah_local );
	bli_obj_toggle_conj( &ah_local );

	// Initialize a conjugated copy of alpha.
	bli_obj_scalar_init_detached_copy_of( bli_obj_dt( a ),
	                                      BLIS_CONJUGATE,
	                                      alpha,
	                                      &alpha_conj );

	// An optimization: If C is stored by rows and the micro-kernel prefers
	// contiguous columns, or if C is stored by columns and the micro-kernel
	// prefers contiguous rows, transpose the entire operation to allow the
	// micro-kernel to access elements of C in its preferred manner.
	if ( bli_cntx_l3_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
	{
		bli_obj_swap( &a_local, &bh_local );
		bli_obj_swap( &b_local, &ah_local );

		bli_obj_induce_trans( &a_local );
		bli_obj_induce_trans( &bh_local );
		bli_obj_induce_trans( &b_local );
		bli_obj_induce_trans( &ah_local );

		bli_obj_induce_trans( &c_local );
	}

	// Record the threading for each level within the context.
	bli_cntx_set_thrloop_from_env( BLIS_HER2K, BLIS_LEFT, cntx,
                                   bli_obj_length( &c_local ),
                                   bli_obj_width( &c_local ),
                                   bli_obj_width( &a_local ) );

	// Invoke herk twice, using beta only the first time.

	// Invoke the internal back-end.
	bli_l3_thread_decorator
	(
	  bli_gemm_int,
	  BLIS_HERK, // operation family id
	  alpha,
	  &a_local,
	  &bh_local,
	  beta,
	  &c_local,
	  cntx,
	  cntl
	);

	bli_l3_thread_decorator
	(
	  bli_gemm_int,
	  BLIS_HERK, // operation family id
	  &alpha_conj,
	  &b_local,
	  &ah_local,
	  &BLIS_ONE,
	  &c_local,
	  cntx,
	  cntl
	);

	// The Hermitian rank-2k product was computed as A*B'+B*A', even for
	// the diagonal elements. Mathematically, the imaginary components of
	// diagonal elements of a Hermitian rank-2k product should always be
	// zero. However, in practice, they sometimes accumulate meaningless
	// non-zero values. To prevent this, we explicitly set those values
	// to zero before returning.
	bli_setid( &BLIS_ZERO, &c_local );
}
Exemplo n.º 3
0
void bli_symm_front
     (
       side_t  side,
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       cntl_t* cntl
     )
{
	obj_t   a_local;
	obj_t   b_local;
	obj_t   c_local;

	// Check parameters.
	if ( bli_error_checking_is_enabled() )
		bli_symm_check( side, alpha, a, b, beta, c, cntx );

	// If alpha is zero, scale by beta and return.
	if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
	{
		bli_scalm( beta, c );
		return;
	}

	// Reinitialize the memory allocator to accommodate the blocksizes
	// in the current context.
	bli_memsys_reinit( cntx );

	// Alias A, B, and C in case we need to apply transformations.
	bli_obj_alias_to( *a, a_local );
	bli_obj_alias_to( *b, b_local );
	bli_obj_alias_to( *c, c_local );

	// An optimization: If C is stored by rows and the micro-kernel prefers
	// contiguous columns, or if C is stored by columns and the micro-kernel
	// prefers contiguous rows, transpose the entire operation to allow the
	// micro-kernel to access elements of C in its preferred manner.
	if ( bli_cntx_l3_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
	{
		bli_toggle_side( side );
		bli_obj_induce_trans( b_local );
		bli_obj_induce_trans( c_local );
	}

	// Swap A and B if multiplying A from the right so that "B" contains
	// the symmetric matrix.
	if ( bli_is_right( side ) )
	{
		bli_obj_swap( a_local, b_local );
	}

	// Set the operation family id in the context.
	bli_cntx_set_family( BLIS_GEMM, cntx );

	// Record the threading for each level within the context.
	bli_cntx_set_thrloop_from_env( BLIS_SYMM, BLIS_LEFT, cntx );

	// Invoke the internal back-end.
	bli_l3_thread_decorator
	(
	  bli_gemm_int,
	  alpha,
	  &a_local,
	  &b_local,
	  beta,
	  &c_local,
	  cntx,
	  cntl
	);
}
Exemplo n.º 4
0
void bli_syr2k_front
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       cntl_t* cntl
     )
{
	bli_init_once();

	obj_t    c_local;
	obj_t    a_local;
	obj_t    bt_local;
	obj_t    b_local;
	obj_t    at_local;

	// Check parameters.
	if ( bli_error_checking_is_enabled() )
		bli_syr2k_check( alpha, a, b, beta, c, cntx );

	// If alpha is zero, scale by beta and return.
	if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
	{
		bli_scalm( beta, c );
		return;
	}

	// Alias A, B, and C in case we need to apply transformations.
	bli_obj_alias_to( a, &a_local );
	bli_obj_alias_to( b, &b_local );
	bli_obj_alias_to( c, &c_local );
	bli_obj_set_as_root( &c_local );

	// For syr2k, the first and second right-hand "B" operands are simply B'
	// and A'.
	bli_obj_alias_to( b, &bt_local );
	bli_obj_induce_trans( &bt_local );
	bli_obj_alias_to( a, &at_local );
	bli_obj_induce_trans( &at_local );

	// An optimization: If C is stored by rows and the micro-kernel prefers
	// contiguous columns, or if C is stored by columns and the micro-kernel
	// prefers contiguous rows, transpose the entire operation to allow the
	// micro-kernel to access elements of C in its preferred manner.
	if ( bli_cntx_l3_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
	{
		bli_obj_induce_trans( &c_local );
	}

	// Record the threading for each level within the context.
	bli_cntx_set_thrloop_from_env( BLIS_SYR2K, BLIS_LEFT, cntx,
                                   bli_obj_length( &c_local ),
                                   bli_obj_width( &c_local ),
                                   bli_obj_width( &a_local ) );

	// Invoke herk twice, using beta only the first time.

	// Invoke the internal back-end.
	bli_l3_thread_decorator
	(
	  bli_gemm_int,
	  BLIS_HERK, // operation family id
	  alpha,
	  &a_local,
	  &bt_local,
	  beta,
	  &c_local,
	  cntx,
	  cntl
	);

	bli_l3_thread_decorator
	(
	  bli_gemm_int,
	  BLIS_HERK, // operation family id
	  alpha,
	  &b_local,
	  &at_local,
	  &BLIS_ONE,
	  &c_local,
	  cntx,
	  cntl
	);
}