Пример #1
0
void libblis_test_dotxaxpyf_experiment
     (
       test_params_t* params,
       test_op_t*     op,
       iface_t        iface,
       num_t          datatype,
       char*          pc_str,
       char*          sc_str,
       unsigned int   p_cur,
       double*        perf,
       double*        resid
     )
{
	unsigned int n_repeats = params->n_repeats;
	unsigned int i;

	double       time_min  = 1e9;
	double       time;

	dim_t        m, b_n;

	conj_t       conjat, conja, conjw, conjx;

	obj_t        alpha, at, a, w, x, beta, y, z;
	obj_t        y_save, z_save;

	cntx_t       cntx;

	// Initialize a context.
	bli_dotxaxpyf_cntx_init( &cntx );

	// Map the dimension specifier to an actual dimension.
	m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );

	// Query the operation's fusing factor for the current datatype.
	b_n = bli_cntx_get_blksz_def_dt( datatype, BLIS_XF, &cntx );

	// Store the fusing factor so that the driver can retrieve the value
	// later when printing results.
	op->dim_aux[0] = b_n;

	// Map parameter characters to BLIS constants.
	bli_param_map_char_to_blis_conj( pc_str[0], &conjat );
	bli_param_map_char_to_blis_conj( pc_str[1], &conja );
	bli_param_map_char_to_blis_conj( pc_str[2], &conjw );
	bli_param_map_char_to_blis_conj( pc_str[3], &conjx );

	// Create test scalars.
	bli_obj_scalar_init_detached( datatype, &alpha );
	bli_obj_scalar_init_detached( datatype, &beta );

	// Create test operands (vectors and/or matrices).
	libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE,
	                                            sc_str[0], m, b_n, &a );
	libblis_test_vobj_create( params, datatype, sc_str[1], m, &w );
	libblis_test_vobj_create( params, datatype, sc_str[2], b_n, &x );
	libblis_test_vobj_create( params, datatype, sc_str[3], b_n, &y );
	libblis_test_vobj_create( params, datatype, sc_str[3], b_n, &y_save );
	libblis_test_vobj_create( params, datatype, sc_str[4], m, &z );
	libblis_test_vobj_create( params, datatype, sc_str[4], m, &z_save );

	// Set alpha.
	if ( bli_obj_is_real( y ) )
	{
		bli_setsc(  1.2,  0.0, &alpha );
		bli_setsc( -1.0,  0.0, &beta );
	}
	else
	{
		bli_setsc(  1.2,  0.1, &alpha );
		bli_setsc( -1.0, -0.1, &beta );
	}

	// Randomize A, w, x, y, and z, and save y and z.
	libblis_test_mobj_randomize( params, FALSE, &a );
	libblis_test_vobj_randomize( params, FALSE, &w );
	libblis_test_vobj_randomize( params, FALSE, &x );
	libblis_test_vobj_randomize( params, FALSE, &y );
	libblis_test_vobj_randomize( params, FALSE, &z );
	bli_copyv( &y, &y_save );
	bli_copyv( &z, &z_save );

	// Create an alias to a for at. (Note that it should NOT actually be
	// marked for transposition since the transposition is part of the dotxf
	// subproblem.)
	bli_obj_alias_to( a, at );

	// Apply the parameters.
	bli_obj_set_conj( conjat, at );
	bli_obj_set_conj( conja, a );
	bli_obj_set_conj( conjw, w );
	bli_obj_set_conj( conjx, x );

	// Repeat the experiment n_repeats times and record results. 
	for ( i = 0; i < n_repeats; ++i )
	{
		bli_copyv( &y_save, &y );
		bli_copyv( &z_save, &z );

		time = bli_clock();

		libblis_test_dotxaxpyf_impl( iface,
		                             &alpha, &at, &a, &w, &x, &beta, &y, &z,
		                             &cntx );

		time_min = bli_clock_min_diff( time_min, time );
	}

	// Estimate the performance of the best experiment repeat.
	*perf = ( 2.0 * m * b_n + 2.0 * m * b_n ) / time_min / FLOPS_PER_UNIT_PERF;
	if ( bli_obj_is_complex( y ) ) *perf *= 4.0;

	// Perform checks.
	libblis_test_dotxaxpyf_check( params, &alpha, &at, &a, &w, &x, &beta, &y, &z, &y_save, &z_save, resid );

	// Zero out performance and residual if either output vector is empty.
	libblis_test_check_empty_problem( &y, perf, resid );
	libblis_test_check_empty_problem( &z, perf, resid );

	// Free the test objects.
	bli_obj_free( &a );
	bli_obj_free( &w );
	bli_obj_free( &x );
	bli_obj_free( &y );
	bli_obj_free( &z );
	bli_obj_free( &y_save );
	bli_obj_free( &z_save );

	// Finalize the context.
	bli_dotxaxpyf_cntx_finalize( &cntx );
}
Пример #2
0
int main( int argc, char** argv )
{
	obj_t    a, c;
	obj_t    c_save;
	obj_t    alpha;
	dim_t    m, n;
	dim_t    p;
	dim_t    p_begin, p_max, p_inc;
	int      m_input, n_input;
	ind_t    ind;
	num_t    dt;
	char     dt_ch;
	int      r, n_repeats;
	side_t   side;
	uplo_t   uploa;
	trans_t  transa;
	diag_t   diaga;
	f77_char f77_side;
	f77_char f77_uploa;
	f77_char f77_transa;
	f77_char f77_diaga;

	double   dtime;
	double   dtime_save;
	double   gflops;

	//bli_init();

	//bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING );

	n_repeats = 3;

	dt      = DT;

	ind     = IND;

	p_begin = P_BEGIN;
	p_max   = P_MAX;
	p_inc   = P_INC;

	m_input = -1;
	n_input = -1;


	// Supress compiler warnings about unused variable 'ind'.
	( void )ind;

#if 0

	cntx_t* cntx;

	ind_t ind_mod = ind;

	// A hack to use 3m1 as 1mpb (with 1m as 1mbp).
	if ( ind == BLIS_3M1 ) ind_mod = BLIS_1M;

	// Initialize a context for the current induced method and datatype.
	cntx = bli_gks_query_ind_cntx( ind_mod, dt );

	// Set k to the kc blocksize for the current datatype.
	k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx );

#elif 1

	//k_input = 256;

#endif

	// Choose the char corresponding to the requested datatype.
	if      ( bli_is_float( dt ) )    dt_ch = 's';
	else if ( bli_is_double( dt ) )   dt_ch = 'd';
	else if ( bli_is_scomplex( dt ) ) dt_ch = 'c';
	else                              dt_ch = 'z';

#if 0
	side   = BLIS_LEFT;
#else
	side   = BLIS_RIGHT;
#endif
#if 0
	uploa  = BLIS_LOWER;
#else
	uploa  = BLIS_UPPER;
#endif
	transa = BLIS_NO_TRANSPOSE;
	diaga  = BLIS_NONUNIT_DIAG;

	bli_param_map_blis_to_netlib_side( side, &f77_side );
	bli_param_map_blis_to_netlib_uplo( uploa, &f77_uploa );
	bli_param_map_blis_to_netlib_trans( transa, &f77_transa );
	bli_param_map_blis_to_netlib_diag( diaga, &f77_diaga );

	// Begin with initializing the last entry to zero so that
	// matlab allocates space for the entire array once up-front.
	for ( p = p_begin; p + p_inc <= p_max; p += p_inc ) ;

	printf( "data_%s_%ctrsm_%s", THR_STR, dt_ch, STR );
	printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n",
	        ( unsigned long )(p - p_begin + 1)/p_inc + 1,
	        ( unsigned long )0,
	        ( unsigned long )0, 0.0 );


	for ( p = p_begin; p <= p_max; p += p_inc )
	{

		if ( m_input < 0 ) m = p / ( dim_t )abs(m_input);
		else               m =     ( dim_t )    m_input;
		if ( n_input < 0 ) n = p / ( dim_t )abs(n_input);
		else               n =     ( dim_t )    n_input;

		bli_obj_create( dt, 1, 1, 0, 0, &alpha );

		if ( bli_is_left( side ) )
			bli_obj_create( dt, m, m, 0, 0, &a );
        else
			bli_obj_create( dt, n, n, 0, 0, &a );
		bli_obj_create( dt, m, n, 0, 0, &c );
		//bli_obj_create( dt, m, n, n, 1, &c );
		bli_obj_create( dt, m, n, 0, 0, &c_save );

		bli_randm( &a );
		bli_randm( &c );

		bli_obj_set_struc( BLIS_TRIANGULAR, &a );
		bli_obj_set_uplo( uploa, &a );
		bli_obj_set_conjtrans( transa, &a );
		bli_obj_set_diag( diaga, &a );

		bli_randm( &a );
		bli_mktrim( &a );

		// Load the diagonal of A to make it more likely to be invertible.
		bli_shiftd( &BLIS_TWO, &a );

		bli_setsc(  (2.0/1.0), 0.0, &alpha );

		bli_copym( &c, &c_save );
	
#if 0 //def BLIS
		bli_ind_disable_all_dt( dt );
		bli_ind_enable_dt( ind, dt );
#endif

		dtime_save = DBL_MAX;

		for ( r = 0; r < n_repeats; ++r )
		{
			bli_copym( &c_save, &c );

			dtime = bli_clock();

#ifdef PRINT
			bli_printm( "a", &a, "%4.1f", "" );
			bli_printm( "c", &c, "%4.1f", "" );
#endif

#ifdef BLIS

			bli_trsm( side,
			          &alpha,
			          &a,
			          &c );

#else

			if ( bli_is_float( dt ) )
			{
				f77_int   mm     = bli_obj_length( &c );
				f77_int   kk     = bli_obj_width( &c );
				f77_int   lda    = bli_obj_col_stride( &a );
				f77_int   ldc    = bli_obj_col_stride( &c );
				float*    alphap = ( float* )bli_obj_buffer( &alpha );
				float*    ap     = ( float* )bli_obj_buffer( &a );
				float*    cp     = ( float* )bli_obj_buffer( &c );

				strsm_( &f77_side,
						&f77_uploa,
						&f77_transa,
						&f77_diaga,
						&mm,
						&kk,
						alphap,
						ap, &lda,
						cp, &ldc );
			}
			else if ( bli_is_double( dt ) )
			{
				f77_int   mm     = bli_obj_length( &c );
				f77_int   kk     = bli_obj_width( &c );
				f77_int   lda    = bli_obj_col_stride( &a );
				f77_int   ldc    = bli_obj_col_stride( &c );
				double*   alphap = ( double* )bli_obj_buffer( &alpha );
				double*   ap     = ( double* )bli_obj_buffer( &a );
				double*   cp     = ( double* )bli_obj_buffer( &c );

				dtrsm_( &f77_side,
						&f77_uploa,
						&f77_transa,
						&f77_diaga,
						&mm,
						&kk,
						alphap,
						ap, &lda,
						cp, &ldc );
			}
			else if ( bli_is_scomplex( dt ) )
			{
				f77_int   mm     = bli_obj_length( &c );
				f77_int   kk     = bli_obj_width( &c );
				f77_int   lda    = bli_obj_col_stride( &a );
				f77_int   ldc    = bli_obj_col_stride( &c );
#ifdef EIGEN
				float*    alphap = ( float*    )bli_obj_buffer( &alpha );
				float*    ap     = ( float*    )bli_obj_buffer( &a );
				float*    cp     = ( float*    )bli_obj_buffer( &c );
#else
				scomplex* alphap = ( scomplex* )bli_obj_buffer( &alpha );
				scomplex* ap     = ( scomplex* )bli_obj_buffer( &a );
				scomplex* cp     = ( scomplex* )bli_obj_buffer( &c );
#endif

				ctrsm_( &f77_side,
						&f77_uploa,
						&f77_transa,
						&f77_diaga,
						&mm,
						&kk,
						alphap,
						ap, &lda,
						cp, &ldc );
			}
			else if ( bli_is_dcomplex( dt ) )
			{
				f77_int   mm     = bli_obj_length( &c );
				f77_int   kk     = bli_obj_width( &c );
				f77_int   lda    = bli_obj_col_stride( &a );
				f77_int   ldc    = bli_obj_col_stride( &c );
#ifdef EIGEN
				double*   alphap = ( double*   )bli_obj_buffer( &alpha );
				double*   ap     = ( double*   )bli_obj_buffer( &a );
				double*   cp     = ( double*   )bli_obj_buffer( &c );
#else
				dcomplex* alphap = ( dcomplex* )bli_obj_buffer( &alpha );
				dcomplex* ap     = ( dcomplex* )bli_obj_buffer( &a );
				dcomplex* cp     = ( dcomplex* )bli_obj_buffer( &c );
#endif

				ztrsm_( &f77_side,
						&f77_uploa,
						&f77_transa,
						&f77_diaga,
						&mm,
						&kk,
						alphap,
						ap, &lda,
						cp, &ldc );
			}
#endif

#ifdef PRINT
			bli_printm( "c after", &c, "%4.1f", "" );
			exit(1);
#endif

			dtime_save = bli_clock_min_diff( dtime_save, dtime );
		}

		if ( bli_is_left( side ) )
			gflops = ( 1.0 * m * m * n ) / ( dtime_save * 1.0e9 );
		else
			gflops = ( 1.0 * m * n * n ) / ( dtime_save * 1.0e9 );

		if ( bli_is_complex( dt ) ) gflops *= 4.0;

		printf( "data_%s_%ctrsm_%s", THR_STR, dt_ch, STR );
		printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n",
		        ( unsigned long )(p - p_begin + 1)/p_inc + 1,
		        ( unsigned long )m,
		        ( unsigned long )n, gflops );

		bli_obj_free( &alpha );

		bli_obj_free( &a );
		bli_obj_free( &c );
		bli_obj_free( &c_save );
	}

	//bli_finalize();

	return 0;
}
Пример #3
0
err_t bli_gemmsup_ref
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm
     )
{
	// Check parameters.
	if ( bli_error_checking_is_enabled() )
		bli_gemm_check( alpha, a, b, beta, c, cntx );

#if 0
	// FGVZ: The datatype-specific variant is now responsible for checking for
	// alpha == 0.0.

	// If alpha is zero, scale by beta and return.
	if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
	{
		bli_scalm( beta, c );
		return BLIS_SUCCESS;
	}
#endif

#if 0
	// FGVZ: Will this be needed for constructing thrinfo_t's (recall: the
	// sba needs to be attached to the rntm; see below)? Or will those nodes
	// just be created "locally," in an exposed manner?

	// Parse and interpret the contents of the rntm_t object to properly
	// set the ways of parallelism for each loop, and then make any
	// additional modifications necessary for the current operation.
	bli_rntm_set_ways_for_op
	(
	  BLIS_GEMM,
	  BLIS_LEFT, // ignored for gemm/hemm/symm
	  bli_obj_length( &c_local ),
	  bli_obj_width( &c_local ),
	  bli_obj_width( &a_local ),
	  rntm
	);

	// FGVZ: the sba needs to be attached to the rntm. But it needs
	// to be done in the thread region, since it needs a thread id.
	//bli_sba_rntm_set_pool( tid, array, rntm_p );
#endif

#if 0
	// FGVZ: The datatype-specific variant is now responsible for inducing a
	// transposition, if needed.

	// Induce transpositions on A and/or B if either object is marked for
	// transposition. We can induce "fast" transpositions since they objects
	// are guaranteed to not have structure or be packed.
	if ( bli_obj_has_trans( a ) )
	{
		bli_obj_induce_fast_trans( a );
		bli_obj_toggle_trans( a );
	}
	if ( bli_obj_has_trans( b ) )
	{
		bli_obj_induce_fast_trans( b );
		bli_obj_toggle_trans( b );
	}
#endif

#if 0
	//bli_gemmsup_ref_var2
	//bli_gemmsup_ref_var1
	#if 0
	bli_gemmsup_ref_var1n
	#else
	#endif
	const stor3_t stor_id = bli_obj_stor3_from_strides( c, a, b );
	const bool_t  is_rrr_rrc_rcr_crr = ( stor_id == BLIS_RRR ||
	                                     stor_id == BLIS_RRC ||
	                                     stor_id == BLIS_RCR ||
	                                     stor_id == BLIS_CRR );
	if ( is_rrr_rrc_rcr_crr )
	{
		bli_gemmsup_ref_var2m
		(
		  BLIS_NO_TRANSPOSE, alpha, a, b, beta, c, stor_id, cntx, rntm
		);
	}
	else
	{
		bli_gemmsup_ref_var2m
		(
		  BLIS_TRANSPOSE, alpha, a, b, beta, c, stor_id, cntx, rntm
		);
	}
#else
	const stor3_t stor_id = bli_obj_stor3_from_strides( c, a, b );

	// Don't use the small/unpacked implementation if one of the matrices
	// uses general stride.
	if ( stor_id == BLIS_XXX ) return BLIS_FAILURE;

	const bool_t  is_rrr_rrc_rcr_crr = ( stor_id == BLIS_RRR ||
	                                     stor_id == BLIS_RRC ||
	                                     stor_id == BLIS_RCR ||
	                                     stor_id == BLIS_CRR );
	const bool_t  is_rcc_crc_ccr_ccc = !is_rrr_rrc_rcr_crr;

	const num_t   dt       = bli_obj_dt( c );
	const bool_t  row_pref = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx );

	const bool_t  is_primary = ( row_pref ? is_rrr_rrc_rcr_crr
	                                      : is_rcc_crc_ccr_ccc );

	if ( is_primary )
	{
		// This branch handles:
		//  - rrr rrc rcr crr for row-preferential kernels
		//  - rcc crc ccr ccc for column-preferential kernels

		const dim_t m  = bli_obj_length( c );
		const dim_t n  = bli_obj_width( c );
		const dim_t NR = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
		const dim_t MR = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
		const dim_t mu = m / MR;
		const dim_t nu = n / NR;

		if ( mu >= nu )
		{
			// block-panel macrokernel; m -> mc, mr; n -> nc, nr: var2()
			bli_gemmsup_ref_var2m( BLIS_NO_TRANSPOSE,
			                       alpha, a, b, beta, c, stor_id, cntx, rntm );
		}
		else // if ( mu < nu )
		{
			// panel-block macrokernel; m -> nc*,mr; n -> mc*,nr: var1()
			bli_gemmsup_ref_var1n( BLIS_NO_TRANSPOSE,
			                       alpha, a, b, beta, c, stor_id, cntx, rntm );
		}
	}
	else
	{
		// This branch handles:
		//  - rrr rrc rcr crr for column-preferential kernels
		//  - rcc crc ccr ccc for row-preferential kernels

		const dim_t mt = bli_obj_width( c );
		const dim_t nt = bli_obj_length( c );
		const dim_t NR = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
		const dim_t MR = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
		const dim_t mu = mt / MR;
		const dim_t nu = nt / NR;

		if ( mu >= nu )
		{
			// panel-block macrokernel; m -> nc, nr; n -> mc, mr: var2() + trans
			bli_gemmsup_ref_var2m( BLIS_TRANSPOSE,
			                       alpha, a, b, beta, c, stor_id, cntx, rntm );
		}
		else // if ( mu < nu )
		{
			// block-panel macrokernel; m -> mc*,nr; n -> nc*,mr: var1() + trans
			bli_gemmsup_ref_var1n( BLIS_TRANSPOSE,
			                       alpha, a, b, beta, c, stor_id, cntx, rntm );
		}
		// *requires nudging of mc,nc up to be a multiple of nr,mr.
	}
#endif

	// Return success so that the caller knows that we computed the solution.
	return BLIS_SUCCESS;
}
Пример #4
0
siz_t bli_packv_init_pack
     (
       pack_t  schema,
       bszid_t bmult_id,
       obj_t*  a,
       obj_t*  p,
       cntx_t* cntx
     )
{
	num_t     dt     = bli_obj_dt( a );
	dim_t     dim_a  = bli_obj_vector_dim( a );
	dim_t     bmult  = bli_cntx_get_blksz_def_dt( dt, bmult_id, cntx );

	membrk_t* membrk = bli_cntx_membrk( cntx );

#if 0
	mem_t*    mem_p;
#endif
	dim_t     m_p_pad;
	siz_t     size_p;
	inc_t     rs_p, cs_p;
	void*     buf;


	// We begin by copying the basic fields of c.
	bli_obj_alias_to( a, p );

	// Update the dimensions.
	bli_obj_set_dims( dim_a, 1, p );

	// Reset the view offsets to (0,0).
	bli_obj_set_offs( 0, 0, p );

	// Set the pack schema in the p object to the value in the control tree
	// node.
	bli_obj_set_pack_schema( schema, p );

	// Compute the dimensions padded by the dimension multiples.
	m_p_pad = bli_align_dim_to_mult( bli_obj_vector_dim( p ), bmult );

	// Compute the size of the packed buffer.
	size_p = m_p_pad * 1 * bli_obj_elem_size( p );

#if 0
	// Extract the address of the mem_t object within p that will track
	// properties of the packed buffer.
	mem_p = bli_obj_pack_mem( *p );

	if ( bli_mem_is_unalloc( mem_p ) )
	{
		// If the mem_t object of p has not yet been allocated, then acquire
		// a memory block suitable for a vector.
		bli_membrk_acquire_v( membrk,
		                      size_p,
		                      mem_p );
	}
	else
	{
 		// If the mem_t object has already been allocated, then release and
		// re-acquire the memory so there is sufficient space.
		if ( bli_mem_size( mem_p ) < size_p )
		{
			bli_membrk_release( mem_p );

			bli_membrk_acquire_v( membrk,
			                      size_p,
			                      mem_p );
		}
	}

	// Grab the buffer address from the mem_t object and copy it to the
	// main object buffer field. (Sometimes this buffer address will be
	// copied when the value is already up-to-date, because it persists
	// in the main object buffer field across loop iterations.)
	buf = bli_mem_buffer( mem_p );
	bli_obj_set_buffer( buf, p );
#endif

	// Save the padded (packed) dimensions into the packed object.
	bli_obj_set_padded_dims( m_p_pad, 1, p );

	// Set the row and column strides of p based on the pack schema.
	if ( schema == BLIS_PACKED_VECTOR )
	{
		// Set the strides to reflect a column-stored vector. Note that the
		// column stride may never be used, and is only useful to determine
		// how much space beyond the vector would need to be zero-padded, if
		// zero-padding was needed.
		rs_p = 1;
		cs_p = bli_obj_padded_length( p );

		bli_obj_set_strides( rs_p, cs_p, p );
	}

	return size_p;
}
Пример #5
0
void bli_daxpyf_int_var1
     (
       conj_t  conja,
       conj_t  conjx,
       dim_t   m,
       dim_t   b_n,
       double* alpha,
       double* a, inc_t inca, inc_t lda,
       double* x, inc_t incx,
       double* y, inc_t incy,
       cntx_t* cntx
     )
{
	double*  restrict alpha_cast = alpha;
	double*  restrict a_cast = a;
	double*  restrict x_cast = x;
	double*  restrict y_cast = y;
	dim_t             i;

	const dim_t       n_elem_per_reg = 2;
	const dim_t       n_iter_unroll  = 2;

	dim_t             m_pre;
	dim_t             m_run;
	dim_t             m_left;

    double*  restrict a0;
    double*  restrict a1;
    double*  restrict a2;
    double*  restrict a3;
    double*  restrict y0;
    double            a0c, a1c, a2c, a3c;
    double            chi0, chi1, chi2, chi3;

	v2df_t            a00v, a01v, a02v, a03v, y0v;
	v2df_t            a10v, a11v, a12v, a13v, y1v;
	v2df_t            chi0v, chi1v, chi2v, chi3v;

	bool_t            use_ref = FALSE;


	if ( bli_zero_dim2( m, b_n ) ) return;

	m_pre = 0;

	// If there is anything that would interfere with our use of aligned
	// vector loads/stores, call the reference implementation.
	if ( b_n < bli_cntx_get_blksz_def_dt( BLIS_DOUBLE, BLIS_AF, cntx ) )
	{
		use_ref = TRUE;
	}
	else if ( inca != 1 || incx != 1 || incy != 1 ||
	          bli_is_unaligned_to( lda*sizeof(double), 16 ) )
	{
		use_ref = TRUE;
	}
	else if ( bli_is_unaligned_to( a, 16 ) ||
	          bli_is_unaligned_to( y, 16 ) )
	{
		use_ref = TRUE;

		if ( bli_is_unaligned_to( a, 16 ) &&
		     bli_is_unaligned_to( y, 16 ) )
		{
			use_ref = FALSE;
			m_pre   = 1;
		}
	}

	// Call the reference implementation if needed.
	if ( use_ref == TRUE )
	{
		BLIS_DAXPYF_KERNEL_REF( conja,
		                        conjx,
		                        m,
		                        b_n,
		                        alpha_cast,
		                        a_cast, inca, lda,
		                        x_cast, incx,
		                        y_cast, incy,
		                        cntx );
		return;
	}


	m_run       = ( m - m_pre ) / ( n_elem_per_reg * n_iter_unroll );
	m_left      = ( m - m_pre ) % ( n_elem_per_reg * n_iter_unroll );

	a0   = a_cast + 0*lda;
	a1   = a_cast + 1*lda;
	a2   = a_cast + 2*lda;
	a3   = a_cast + 3*lda;
	y0   = y_cast;

	chi0 = *(x_cast + 0*incx);
	chi1 = *(x_cast + 1*incx);
	chi2 = *(x_cast + 2*incx);
	chi3 = *(x_cast + 3*incx);

	PASTEMAC2(d,d,scals)( *alpha_cast, chi0 );
	PASTEMAC2(d,d,scals)( *alpha_cast, chi1 );
	PASTEMAC2(d,d,scals)( *alpha_cast, chi2 );
	PASTEMAC2(d,d,scals)( *alpha_cast, chi3 );

	if ( m_pre == 1 )
	{
		a0c = *a0;
		a1c = *a1;
		a2c = *a2;
		a3c = *a3;

		*y0 += chi0 * a0c + 
		       chi1 * a1c + 
		       chi2 * a2c + 
		       chi3 * a3c;

		a0 += inca;
		a1 += inca;
		a2 += inca;
		a3 += inca;
		y0 += incy;
	}

	chi0v.v = _mm_loaddup_pd( ( double* )&chi0 );
	chi1v.v = _mm_loaddup_pd( ( double* )&chi1 );
	chi2v.v = _mm_loaddup_pd( ( double* )&chi2 );
	chi3v.v = _mm_loaddup_pd( ( double* )&chi3 );

	for ( i = 0; i < m_run; ++i )
	{
		y0v.v = _mm_load_pd( ( double* )(y0 + 0*n_elem_per_reg) );

		a00v.v = _mm_load_pd( ( double* )(a0 + 0*n_elem_per_reg) );
		a01v.v = _mm_load_pd( ( double* )(a1 + 0*n_elem_per_reg) );

		y0v.v += chi0v.v * a00v.v;
		y0v.v += chi1v.v * a01v.v;

		a02v.v = _mm_load_pd( ( double* )(a2 + 0*n_elem_per_reg) );
		a03v.v = _mm_load_pd( ( double* )(a3 + 0*n_elem_per_reg) );

		y0v.v += chi2v.v * a02v.v;
		y0v.v += chi3v.v * a03v.v;

		_mm_store_pd( ( double* )(y0 + 0*n_elem_per_reg), y0v.v );


		y1v.v = _mm_load_pd( ( double* )(y0 + 1*n_elem_per_reg) );

		a10v.v = _mm_load_pd( ( double* )(a0 + 1*n_elem_per_reg) );
		a11v.v = _mm_load_pd( ( double* )(a1 + 1*n_elem_per_reg) );

		y1v.v += chi0v.v * a10v.v;
		y1v.v += chi1v.v * a11v.v;

		a12v.v = _mm_load_pd( ( double* )(a2 + 1*n_elem_per_reg) );
		a13v.v = _mm_load_pd( ( double* )(a3 + 1*n_elem_per_reg) );

		y1v.v += chi2v.v * a12v.v;
		y1v.v += chi3v.v * a13v.v;

		_mm_store_pd( ( double* )(y0 + 1*n_elem_per_reg), y1v.v );


		a0 += n_elem_per_reg * n_iter_unroll;
		a1 += n_elem_per_reg * n_iter_unroll;
		a2 += n_elem_per_reg * n_iter_unroll;
		a3 += n_elem_per_reg * n_iter_unroll;
		y0 += n_elem_per_reg * n_iter_unroll;
	}

	if ( m_left > 0 )
	{
		for ( i = 0; i < m_left; ++i )
		{
			a0c = *a0;
			a1c = *a1;
			a2c = *a2;
			a3c = *a3;

			*y0 += chi0 * a0c + 
			       chi1 * a1c + 
			       chi2 * a2c + 
			       chi3 * a3c;

			a0 += inca;
			a1 += inca;
			a2 += inca;
			a3 += inca;
			y0 += incy;
		}
	}
}
    C11 := beta * C11 + alpha * A1 * B1

  where A1 is MR x k, B1 is k x NR, C11 is MR x NR, and alpha and beta are
  scalars.

  For more info, please refer to the BLIS website's wiki on kernels:

    https://github.com/flame/blis/wiki/KernelsHowTo

  and/or contact the blis-devel mailing list.

  -FGVZ
*/
	const num_t        dt     = BLIS_DCOMPLEX;

	const dim_t        mr     = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx );
	const dim_t        nr     = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx );

	const inc_t        packmr = bli_cntx_get_blksz_max_dt( dt, BLIS_MR, cntx );
	const inc_t        packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx );

	const inc_t        cs_a   = packmr;
	const inc_t        rs_b   = packnr;

	const inc_t        rs_ab  = 1;
	const inc_t        cs_ab  = mr;

	dim_t              l, j, i;

	dcomplex           ab[ bli_zmr *
	                       bli_znr ];