void bli_cntx_init_sandybridge( cntx_t* cntx )
{
	blksz_t blkszs[ BLIS_NUM_BLKSZS ];

	// Set default kernel blocksizes and functions.
	bli_cntx_init_sandybridge_ref( cntx );

	// -------------------------------------------------------------------------

	// Update the context with optimized native gemm micro-kernels and
	// their storage preferences.
	bli_cntx_set_l3_nat_ukrs
	(
	  4,
	  BLIS_GEMM_UKR, BLIS_FLOAT,    bli_sgemm_sandybridge_asm_8x8, FALSE,
	  BLIS_GEMM_UKR, BLIS_DOUBLE,   bli_dgemm_sandybridge_asm_8x4, FALSE,
	  BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_sandybridge_asm_8x4, FALSE,
	  BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_sandybridge_asm_4x4, FALSE,
	  cntx
	);

	// Initialize level-3 blocksize objects with architecture-specific values.
	//                                           s      d      c      z
	bli_blksz_init_easy( &blkszs[ BLIS_MR ],     8,     8,     8,     4 );
	bli_blksz_init_easy( &blkszs[ BLIS_NR ],     8,     4,     4,     4 );
	bli_blksz_init_easy( &blkszs[ BLIS_MC ],   128,    96,    96,    64 );
	bli_blksz_init_easy( &blkszs[ BLIS_KC ],   384,   256,   256,   192 );
	bli_blksz_init_easy( &blkszs[ BLIS_NC ],  4096,  4096,  4096,  4096 );

	// Update the context with the current architecture's register and cache
	// blocksizes (and multiples) for native execution.
	bli_cntx_set_blkszs
	(
	  BLIS_NAT, 5,
	  BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
	  BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
	  BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
	  BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
	  BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
	  cntx
	);
}
void bli_cntx_init_cortexa15( cntx_t* cntx )
{
	blksz_t blkszs[ BLIS_NUM_BLKSZS ];

	// Set default kernel blocksizes and functions.
	bli_cntx_init_cortexa15_ref( cntx );

	// -------------------------------------------------------------------------

	// Update the context with optimized native gemm micro-kernels and
	// their storage preferences.
	bli_cntx_set_l3_nat_ukrs
	(
	  2,
	  BLIS_GEMM_UKR, BLIS_FLOAT,    bli_sgemm_armv7a_int_4x4, FALSE,
	  BLIS_GEMM_UKR, BLIS_DOUBLE,   bli_dgemm_armv7a_int_4x4, FALSE,
	  cntx
	);

	// Initialize level-3 blocksize objects with architecture-specific values.
	//                                           s      d      c      z
	bli_blksz_init_easy( &blkszs[ BLIS_MR ],     4,     4,     0,     0 );
	bli_blksz_init_easy( &blkszs[ BLIS_NR ],     4,     4,     0,     0 );
	bli_blksz_init_easy( &blkszs[ BLIS_MC ],   336,   176,     0,     0 );
	bli_blksz_init_easy( &blkszs[ BLIS_KC ],   528,   368,     0,     0 );
	bli_blksz_init_easy( &blkszs[ BLIS_NC ],  4096,  4096,     0,     0 );

	// Update the context with the current architecture's register and cache
	// blocksizes (and multiples) for native execution.
	bli_cntx_set_blkszs
	(
	  BLIS_NAT, 5,
	  BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
	  BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
	  BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
	  BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
	  BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
	  cntx
	);
}
void bli_cntx_init_haswell( cntx_t* cntx )
{
	blksz_t blkszs[ BLIS_NUM_BLKSZS ];

	// Set default kernel blocksizes and functions.
	bli_cntx_init_haswell_ref( cntx );

	// -------------------------------------------------------------------------

	// Update the context with optimized native gemm micro-kernels and
	// their storage preferences.
	bli_cntx_set_l3_nat_ukrs
	(
	  8,
	  // gemm
	  BLIS_GEMM_UKR,       BLIS_FLOAT,    bli_sgemm_zen_asm_6x16,       TRUE,
	  BLIS_GEMM_UKR,       BLIS_DOUBLE,   bli_dgemm_zen_asm_6x8,        TRUE,
	  BLIS_GEMM_UKR,       BLIS_SCOMPLEX, bli_cgemm_zen_asm_3x8,        TRUE,
	  BLIS_GEMM_UKR,       BLIS_DCOMPLEX, bli_zgemm_zen_asm_3x4,        TRUE,
	  // gemmtrsm_l
	  BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT,    bli_sgemmtrsm_l_zen_asm_6x16, TRUE,
	  BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE,   bli_dgemmtrsm_l_zen_asm_6x8,  TRUE,
	  // gemmtrsm_u
	  BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT,    bli_sgemmtrsm_u_zen_asm_6x16, TRUE,
	  BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE,   bli_dgemmtrsm_u_zen_asm_6x8,  TRUE,
	  cntx
	);

	bli_cntx_set_l1f_kers
	(
	  4,
	  // axpyf
	  BLIS_AXPYF_KER,     BLIS_FLOAT,  bli_saxpyf_zen_int_8,
	  BLIS_AXPYF_KER,     BLIS_DOUBLE, bli_daxpyf_zen_int_8,
	  // dotxf
	  BLIS_DOTXF_KER,     BLIS_FLOAT,  bli_sdotxf_zen_int_8,
	  BLIS_DOTXF_KER,     BLIS_DOUBLE, bli_ddotxf_zen_int_8,
	  cntx
	);

	// Update the context with optimized level-1v kernels.
	bli_cntx_set_l1v_kers
	(
	  10,
	  // amaxv
	  BLIS_AMAXV_KER,  BLIS_FLOAT,  bli_samaxv_zen_int,
	  BLIS_AMAXV_KER,  BLIS_DOUBLE, bli_damaxv_zen_int,
	  // axpyv
#if 0
	  BLIS_AXPYV_KER,  BLIS_FLOAT,  bli_saxpyv_zen_int,
	  BLIS_AXPYV_KER,  BLIS_DOUBLE, bli_daxpyv_zen_int,
#else
	  BLIS_AXPYV_KER,  BLIS_FLOAT,  bli_saxpyv_zen_int10,
	  BLIS_AXPYV_KER,  BLIS_DOUBLE, bli_daxpyv_zen_int10,
#endif
	  // dotv
	  BLIS_DOTV_KER,   BLIS_FLOAT,  bli_sdotv_zen_int,
	  BLIS_DOTV_KER,   BLIS_DOUBLE, bli_ddotv_zen_int,
	  // dotxv
	  BLIS_DOTXV_KER,  BLIS_FLOAT,  bli_sdotxv_zen_int,
	  BLIS_DOTXV_KER,  BLIS_DOUBLE, bli_ddotxv_zen_int,
	  // scalv
#if 0
	  BLIS_SCALV_KER,  BLIS_FLOAT,  bli_sscalv_zen_int,
	  BLIS_SCALV_KER,  BLIS_DOUBLE, bli_dscalv_zen_int,
#else
	  BLIS_SCALV_KER,  BLIS_FLOAT,  bli_sscalv_zen_int10,
	  BLIS_SCALV_KER,  BLIS_DOUBLE, bli_dscalv_zen_int10,
#endif
	  cntx
	);

	// Initialize level-3 blocksize objects with architecture-specific values.
	//                                           s      d      c      z
	bli_blksz_init_easy( &blkszs[ BLIS_MR ],     6,     6,     3,     3 );
	bli_blksz_init_easy( &blkszs[ BLIS_NR ],    16,     8,     8,     4 );
	bli_blksz_init_easy( &blkszs[ BLIS_MC ],   144,    72,   144,    72 );
	bli_blksz_init_easy( &blkszs[ BLIS_KC ],   256,   256,   256,   256 );
	bli_blksz_init_easy( &blkszs[ BLIS_NC ],  4080,  4080,  4080,  4080 );
	bli_blksz_init_easy( &blkszs[ BLIS_AF ],     8,     8,     8,     8 );
	bli_blksz_init_easy( &blkszs[ BLIS_DF ],     8,     8,     8,     8 );

	// Update the context with the current architecture's register and cache
	// blocksizes (and multiples) for native execution.
	bli_cntx_set_blkszs
	(
	  BLIS_NAT, 7,
	  // level-3
	  BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
	  BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
	  BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
	  BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
	  BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
	  // level-1f
	  BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF,
	  BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF,
	  cntx
	);
}
Exemple #4
0
void GENBARNAME(cntx_init)
     (
       cntx_t* cntx
     )
{
	blksz_t  blkszs[ BLIS_NUM_BLKSZS ];
	func_t*  funcs;
	mbool_t* mbools;
	dim_t    i;


	// -- Clear the context ----------------------------------------------------

	bli_cntx_clear( cntx );


	// -- Set blocksizes -------------------------------------------------------

	//                                          s     d     c     z
	bli_blksz_init_easy( &blkszs[ BLIS_KR ],    1,    1,    1,    1 );
	bli_blksz_init_easy( &blkszs[ BLIS_MR ],    8,    4,    4,    2 );
	bli_blksz_init_easy( &blkszs[ BLIS_NR ],    4,    4,    2,    2 );
	bli_blksz_init_easy( &blkszs[ BLIS_MC ],  512,  256,  256,  128 );
	bli_blksz_init_easy( &blkszs[ BLIS_KC ],  256,  256,  256,  256 );
	bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4096, 4096, 4096, 4096 );
	bli_blksz_init_easy( &blkszs[ BLIS_M2 ], 1000, 1000, 1000, 1000 );
	bli_blksz_init_easy( &blkszs[ BLIS_N2 ], 1000, 1000, 1000, 1000 );
	bli_blksz_init_easy( &blkszs[ BLIS_AF ],    8,    4,    4,    2 );
	bli_blksz_init_easy( &blkszs[ BLIS_DF ],    8,    4,    4,    2 );
	bli_blksz_init_easy( &blkszs[ BLIS_XF ],    8,    4,    4,    2 );

	// Initialize the context with the default blocksize objects and their
	// multiples.
	bli_cntx_set_blkszs
	(
	  BLIS_NAT, 11,
	  BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
	  BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
	  BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
	  BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
	  BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
	  BLIS_KR, &blkszs[ BLIS_KR ], BLIS_KR,
	  BLIS_M2, &blkszs[ BLIS_M2 ], BLIS_M2,
	  BLIS_N2, &blkszs[ BLIS_N2 ], BLIS_N2,
	  BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF,
	  BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF,
	  BLIS_XF, &blkszs[ BLIS_XF ], BLIS_XF,
	  cntx
	);


	// -- Set level-3 virtual micro-kernels ------------------------------------

	funcs = bli_cntx_l3_vir_ukrs_buf( cntx );

	gen_func_init_co( &funcs[ BLIS_GEMM_UKR ],       gemm1m_ukr_name       );
	gen_func_init_co( &funcs[ BLIS_GEMMTRSM_L_UKR ], gemmtrsm1m_l_ukr_name );
	gen_func_init_co( &funcs[ BLIS_GEMMTRSM_U_UKR ], gemmtrsm1m_u_ukr_name );
	gen_func_init_co( &funcs[ BLIS_TRSM_L_UKR ],     trsm1m_l_ukr_name     );
	gen_func_init_co( &funcs[ BLIS_TRSM_U_UKR ],     trsm1m_u_ukr_name     );


	// -- Set level-3 native micro-kernels and preferences ---------------------

	funcs  = bli_cntx_l3_nat_ukrs_buf( cntx );
	mbools = bli_cntx_l3_nat_ukrs_prefs_buf( cntx );

	gen_func_init( &funcs[ BLIS_GEMM_UKR ],       gemm_ukr_name       );
	gen_func_init( &funcs[ BLIS_GEMMTRSM_L_UKR ], gemmtrsm_l_ukr_name );
	gen_func_init( &funcs[ BLIS_GEMMTRSM_U_UKR ], gemmtrsm_u_ukr_name );
	gen_func_init( &funcs[ BLIS_TRSM_L_UKR ],     trsm_l_ukr_name     );
	gen_func_init( &funcs[ BLIS_TRSM_U_UKR ],     trsm_u_ukr_name     );

	bli_mbool_init( &mbools[ BLIS_GEMM_UKR ],       FALSE, FALSE, FALSE, FALSE );
	bli_mbool_init( &mbools[ BLIS_GEMMTRSM_L_UKR ], FALSE, FALSE, FALSE, FALSE );
	bli_mbool_init( &mbools[ BLIS_GEMMTRSM_U_UKR ], FALSE, FALSE, FALSE, FALSE );
	bli_mbool_init( &mbools[ BLIS_TRSM_L_UKR ],     FALSE, FALSE, FALSE, FALSE );
	bli_mbool_init( &mbools[ BLIS_TRSM_U_UKR ],     FALSE, FALSE, FALSE, FALSE );


	// -- Set level-1f kernels -------------------------------------------------

	funcs = bli_cntx_l1f_kers_buf( cntx );

	gen_func_init( &funcs[ BLIS_AXPY2V_KER ],    axpy2v_ker_name    );
	gen_func_init( &funcs[ BLIS_DOTAXPYV_KER ],  dotaxpyv_ker_name  );
	gen_func_init( &funcs[ BLIS_AXPYF_KER ],     axpyf_ker_name     );
	gen_func_init( &funcs[ BLIS_DOTXF_KER ],     dotxf_ker_name     );
	gen_func_init( &funcs[ BLIS_DOTXAXPYF_KER ], dotxaxpyf_ker_name );


	// -- Set level-1v kernels -------------------------------------------------

	funcs = bli_cntx_l1v_kers_buf( cntx );

	gen_func_init( &funcs[ BLIS_ADDV_KER ],    addv_ker_name    );
	gen_func_init( &funcs[ BLIS_AMAXV_KER ],   amaxv_ker_name   );
	gen_func_init( &funcs[ BLIS_AXPBYV_KER ],  axpbyv_ker_name  );
	gen_func_init( &funcs[ BLIS_AXPYV_KER ],   axpyv_ker_name   );
	gen_func_init( &funcs[ BLIS_COPYV_KER ],   copyv_ker_name   );
	gen_func_init( &funcs[ BLIS_DOTV_KER ],    dotv_ker_name    );
	gen_func_init( &funcs[ BLIS_DOTXV_KER ],   dotxv_ker_name   );
	gen_func_init( &funcs[ BLIS_INVERTV_KER ], invertv_ker_name );
	gen_func_init( &funcs[ BLIS_SCALV_KER ],   scalv_ker_name   );
	gen_func_init( &funcs[ BLIS_SCAL2V_KER ],  scal2v_ker_name  );
	gen_func_init( &funcs[ BLIS_SETV_KER ],    setv_ker_name    );
	gen_func_init( &funcs[ BLIS_SUBV_KER ],    subv_ker_name    );
	gen_func_init( &funcs[ BLIS_SWAPV_KER ],   swapv_ker_name   );
	gen_func_init( &funcs[ BLIS_XPBYV_KER ],   xpbyv_ker_name   );


	// -- Set level-1m (packm/unpackm) kernels ---------------------------------

	funcs = bli_cntx_packm_kers_buf( cntx );

	// Initialize all packm kernel func_t entries to NULL.
	for ( i = BLIS_PACKM_0XK_KER; i <= BLIS_PACKM_31XK_KER; ++i )
	{
		bli_func_init_null( &funcs[ i ] );
	}

	gen_func_init( &funcs[ BLIS_PACKM_2XK_KER ],  packm_2xk_ker_name );
	gen_func_init( &funcs[ BLIS_PACKM_3XK_KER ],  packm_3xk_ker_name );
	gen_func_init( &funcs[ BLIS_PACKM_4XK_KER ],  packm_4xk_ker_name );
	gen_func_init( &funcs[ BLIS_PACKM_6XK_KER ],  packm_6xk_ker_name );
	gen_func_init( &funcs[ BLIS_PACKM_8XK_KER ],  packm_8xk_ker_name );
	gen_func_init( &funcs[ BLIS_PACKM_10XK_KER ], packm_10xk_ker_name );
	gen_func_init( &funcs[ BLIS_PACKM_12XK_KER ], packm_12xk_ker_name );
	gen_func_init( &funcs[ BLIS_PACKM_14XK_KER ], packm_14xk_ker_name );
	gen_func_init( &funcs[ BLIS_PACKM_16XK_KER ], packm_16xk_ker_name );
	gen_func_init( &funcs[ BLIS_PACKM_24XK_KER ], packm_24xk_ker_name );
	gen_func_init( &funcs[ BLIS_PACKM_30XK_KER ], packm_30xk_ker_name );

	funcs = bli_cntx_unpackm_kers_buf( cntx );

	// Initialize all packm kernel func_t entries to NULL.
	for ( i = BLIS_UNPACKM_0XK_KER; i <= BLIS_UNPACKM_31XK_KER; ++i )
	{
		bli_func_init_null( &funcs[ i ] );
	}

	gen_func_init( &funcs[ BLIS_UNPACKM_2XK_KER ],  unpackm_2xk_ker_name );
	gen_func_init( &funcs[ BLIS_UNPACKM_4XK_KER ],  unpackm_4xk_ker_name );
	gen_func_init( &funcs[ BLIS_UNPACKM_6XK_KER ],  unpackm_6xk_ker_name );
	gen_func_init( &funcs[ BLIS_UNPACKM_8XK_KER ],  unpackm_8xk_ker_name );
	gen_func_init( &funcs[ BLIS_UNPACKM_10XK_KER ], unpackm_10xk_ker_name );
	gen_func_init( &funcs[ BLIS_UNPACKM_12XK_KER ], unpackm_12xk_ker_name );
	gen_func_init( &funcs[ BLIS_UNPACKM_14XK_KER ], unpackm_14xk_ker_name );
	gen_func_init( &funcs[ BLIS_UNPACKM_16XK_KER ], unpackm_16xk_ker_name );


	// -- Set miscellaneous fields ---------------------------------------------

	bli_cntx_set_method( BLIS_NAT, cntx );

	bli_cntx_set_schema_a_block( BLIS_PACKED_ROW_PANELS, cntx );
	bli_cntx_set_schema_b_panel( BLIS_PACKED_COL_PANELS, cntx );
	bli_cntx_set_schema_c_panel( BLIS_NOT_PACKED,        cntx );

	bli_cntx_set_anti_pref( FALSE, cntx );

	bli_cntx_set_thrloop( 1, 1, 1, 1, 1, cntx );

	bli_cntx_set_membrk( bli_memsys_global_membrk(), cntx );
}
Exemple #5
0
void bli_cntx_init_zen( cntx_t* cntx )
{
	blksz_t blkszs[ BLIS_NUM_BLKSZS ];
	blksz_t thresh[ BLIS_NUM_THRESH ];

	// Set default kernel blocksizes and functions.
	bli_cntx_init_zen_ref( cntx );

	// -------------------------------------------------------------------------

	// Update the context with optimized native gemm micro-kernels and
	// their storage preferences.
	bli_cntx_set_l3_nat_ukrs
	(
	  8,
	  // gemm
	  BLIS_GEMM_UKR,       BLIS_FLOAT,    bli_sgemm_haswell_asm_6x16,       TRUE,
	  BLIS_GEMM_UKR,       BLIS_DOUBLE,   bli_dgemm_haswell_asm_6x8,        TRUE,
	  BLIS_GEMM_UKR,       BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8,        TRUE,
	  BLIS_GEMM_UKR,       BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4,        TRUE,
	  // gemmtrsm_l
	  BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT,    bli_sgemmtrsm_l_haswell_asm_6x16, TRUE,
	  BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE,   bli_dgemmtrsm_l_haswell_asm_6x8,  TRUE,
	  // gemmtrsm_u
	  BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT,    bli_sgemmtrsm_u_haswell_asm_6x16, TRUE,
	  BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE,   bli_dgemmtrsm_u_haswell_asm_6x8,  TRUE,
	  cntx
	);

	// Update the context with optimized level-1f kernels.
	bli_cntx_set_l1f_kers
	(
	  4,
	  // axpyf
	  BLIS_AXPYF_KER,     BLIS_FLOAT,  bli_saxpyf_zen_int_8,
	  BLIS_AXPYF_KER,     BLIS_DOUBLE, bli_daxpyf_zen_int_8,
	  // dotxf
	  BLIS_DOTXF_KER,     BLIS_FLOAT,  bli_sdotxf_zen_int_8,
	  BLIS_DOTXF_KER,     BLIS_DOUBLE, bli_ddotxf_zen_int_8,
	  cntx
	);

	// Update the context with optimized level-1v kernels.
	bli_cntx_set_l1v_kers
	(
	  10,
	  // amaxv
	  BLIS_AMAXV_KER,  BLIS_FLOAT,  bli_samaxv_zen_int,
	  BLIS_AMAXV_KER,  BLIS_DOUBLE, bli_damaxv_zen_int,
	  // axpyv
#if 0
	  BLIS_AXPYV_KER,  BLIS_FLOAT,  bli_saxpyv_zen_int,
	  BLIS_AXPYV_KER,  BLIS_DOUBLE, bli_daxpyv_zen_int,
#else
	  BLIS_AXPYV_KER,  BLIS_FLOAT,  bli_saxpyv_zen_int10,
	  BLIS_AXPYV_KER,  BLIS_DOUBLE, bli_daxpyv_zen_int10,
#endif
	  // dotv
	  BLIS_DOTV_KER,   BLIS_FLOAT,  bli_sdotv_zen_int,
	  BLIS_DOTV_KER,   BLIS_DOUBLE, bli_ddotv_zen_int,
	  // dotxv
	  BLIS_DOTXV_KER,  BLIS_FLOAT,  bli_sdotxv_zen_int,
	  BLIS_DOTXV_KER,  BLIS_DOUBLE, bli_ddotxv_zen_int,
	  // scalv
#if 0
	  BLIS_SCALV_KER,  BLIS_FLOAT,  bli_sscalv_zen_int,
	  BLIS_SCALV_KER,  BLIS_DOUBLE, bli_dscalv_zen_int,
#else
	  BLIS_SCALV_KER,  BLIS_FLOAT,  bli_sscalv_zen_int10,
	  BLIS_SCALV_KER,  BLIS_DOUBLE, bli_dscalv_zen_int10,
#endif
	  cntx
	);

	// Initialize level-3 blocksize objects with architecture-specific values.
	//                                           s      d      c      z
	bli_blksz_init_easy( &blkszs[ BLIS_MR ],     6,     6,     3,     3 );
	bli_blksz_init_easy( &blkszs[ BLIS_NR ],    16,     8,     8,     4 );
#ifdef BLIS_ENABLE_ZEN_BLOCK_SIZES
	// Zen optmized level 3 cache block sizes
	bli_blksz_init_easy( &blkszs[ BLIS_MC ],  1020,   510,   510,   255 );
	bli_blksz_init_easy( &blkszs[ BLIS_KC ],  1024,  1024,  1024,  1024 );
#else
	bli_blksz_init_easy( &blkszs[ BLIS_MC ],   144,    72,   144,    72 );
	bli_blksz_init_easy( &blkszs[ BLIS_KC ],   256,   256,   256,   256 );
#endif
	bli_blksz_init_easy( &blkszs[ BLIS_NC ],  8160,  4080,  4080,  3056 );
	bli_blksz_init_easy( &blkszs[ BLIS_AF ],     8,     8,    -1,    -1 );
	bli_blksz_init_easy( &blkszs[ BLIS_DF ],     8,     8,    -1,    -1 );

	// Update the context with the current architecture's register and cache
	// blocksizes (and multiples) for native execution.
	bli_cntx_set_blkszs
	(
	  BLIS_NAT, 7,
	  // level-3
	  BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
	  BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
	  BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
	  BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
	  BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
	  // level-1f
	  BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF,
	  BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF,
	  cntx
	);

	// -------------------------------------------------------------------------

	// Initialize sup thresholds with architecture-appropriate values.
	//                                          s     d     c     z
	bli_blksz_init_easy( &thresh[ BLIS_MT ],   -1,  256,   -1,   -1 );
	bli_blksz_init_easy( &thresh[ BLIS_NT ],   -1,  100,   -1,   -1 );
	bli_blksz_init_easy( &thresh[ BLIS_KT ],   -1,  120,   -1,   -1 );

	// Initialize the context with the sup thresholds.
	bli_cntx_set_l3_sup_thresh
	(
	  3,
	  BLIS_MT, &thresh[ BLIS_MT ],
	  BLIS_NT, &thresh[ BLIS_NT ],
	  BLIS_KT, &thresh[ BLIS_KT ],
	  cntx
	);

	// Update the context with optimized small/unpacked gemm kernels.
	bli_cntx_set_l3_sup_kers
	(
	  8,
	  //BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref,
	  BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
	  BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE,
	  BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
	  BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
	  BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
	  BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE,
	  BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
	  BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
	  cntx
	);

	// Initialize level-3 sup blocksize objects with architecture-specific
	// values.
	//                                           s      d      c      z
	bli_blksz_init     ( &blkszs[ BLIS_MR ],    -1,     6,    -1,    -1,
	                                            -1,     9,    -1,    -1 );
	bli_blksz_init_easy( &blkszs[ BLIS_NR ],    -1,     8,    -1,    -1 );
	bli_blksz_init_easy( &blkszs[ BLIS_MC ],    -1,    72,    -1,    -1 );
	bli_blksz_init_easy( &blkszs[ BLIS_KC ],    -1,   256,    -1,    -1 );
	bli_blksz_init_easy( &blkszs[ BLIS_NC ],    -1,  4080,    -1,    -1 );

	// Update the context with the current architecture's register and cache
	// blocksizes for small/unpacked level-3 problems.
	bli_cntx_set_l3_sup_blkszs
	(
	  5,
	  BLIS_NC, &blkszs[ BLIS_NC ],
	  BLIS_KC, &blkszs[ BLIS_KC ],
	  BLIS_MC, &blkszs[ BLIS_MC ],
	  BLIS_NR, &blkszs[ BLIS_NR ],
	  BLIS_MR, &blkszs[ BLIS_MR ],
	  cntx
	);
}