void bli_cntx_init_sandybridge( cntx_t* cntx )
{
	blksz_t blkszs[ BLIS_NUM_BLKSZS ];

	// Set default kernel blocksizes and functions.
	bli_cntx_init_sandybridge_ref( cntx );

	// -------------------------------------------------------------------------

	// Update the context with optimized native gemm micro-kernels and
	// their storage preferences.
	bli_cntx_set_l3_nat_ukrs
	(
	  4,
	  BLIS_GEMM_UKR, BLIS_FLOAT,    bli_sgemm_sandybridge_asm_8x8, FALSE,
	  BLIS_GEMM_UKR, BLIS_DOUBLE,   bli_dgemm_sandybridge_asm_8x4, FALSE,
	  BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_sandybridge_asm_8x4, FALSE,
	  BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_sandybridge_asm_4x4, FALSE,
	  cntx
	);

	// Initialize level-3 blocksize objects with architecture-specific values.
	//                                           s      d      c      z
	bli_blksz_init_easy( &blkszs[ BLIS_MR ],     8,     8,     8,     4 );
	bli_blksz_init_easy( &blkszs[ BLIS_NR ],     8,     4,     4,     4 );
	bli_blksz_init_easy( &blkszs[ BLIS_MC ],   128,    96,    96,    64 );
	bli_blksz_init_easy( &blkszs[ BLIS_KC ],   384,   256,   256,   192 );
	bli_blksz_init_easy( &blkszs[ BLIS_NC ],  4096,  4096,  4096,  4096 );

	// Update the context with the current architecture's register and cache
	// blocksizes (and multiples) for native execution.
	bli_cntx_set_blkszs
	(
	  BLIS_NAT, 5,
	  BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
	  BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
	  BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
	  BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
	  BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
	  cntx
	);
}
void bli_cntx_init_cortexa15( cntx_t* cntx )
{
	blksz_t blkszs[ BLIS_NUM_BLKSZS ];

	// Set default kernel blocksizes and functions.
	bli_cntx_init_cortexa15_ref( cntx );

	// -------------------------------------------------------------------------

	// Update the context with optimized native gemm micro-kernels and
	// their storage preferences.
	bli_cntx_set_l3_nat_ukrs
	(
	  2,
	  BLIS_GEMM_UKR, BLIS_FLOAT,    bli_sgemm_armv7a_int_4x4, FALSE,
	  BLIS_GEMM_UKR, BLIS_DOUBLE,   bli_dgemm_armv7a_int_4x4, FALSE,
	  cntx
	);

	// Initialize level-3 blocksize objects with architecture-specific values.
	//                                           s      d      c      z
	bli_blksz_init_easy( &blkszs[ BLIS_MR ],     4,     4,     0,     0 );
	bli_blksz_init_easy( &blkszs[ BLIS_NR ],     4,     4,     0,     0 );
	bli_blksz_init_easy( &blkszs[ BLIS_MC ],   336,   176,     0,     0 );
	bli_blksz_init_easy( &blkszs[ BLIS_KC ],   528,   368,     0,     0 );
	bli_blksz_init_easy( &blkszs[ BLIS_NC ],  4096,  4096,     0,     0 );

	// Update the context with the current architecture's register and cache
	// blocksizes (and multiples) for native execution.
	bli_cntx_set_blkszs
	(
	  BLIS_NAT, 5,
	  BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
	  BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
	  BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
	  BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
	  BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
	  cntx
	);
}
void bli_cntx_init_haswell( cntx_t* cntx )
{
	blksz_t blkszs[ BLIS_NUM_BLKSZS ];

	// Set default kernel blocksizes and functions.
	bli_cntx_init_haswell_ref( cntx );

	// -------------------------------------------------------------------------

	// Update the context with optimized native gemm micro-kernels and
	// their storage preferences.
	bli_cntx_set_l3_nat_ukrs
	(
	  8,
	  // gemm
	  BLIS_GEMM_UKR,       BLIS_FLOAT,    bli_sgemm_zen_asm_6x16,       TRUE,
	  BLIS_GEMM_UKR,       BLIS_DOUBLE,   bli_dgemm_zen_asm_6x8,        TRUE,
	  BLIS_GEMM_UKR,       BLIS_SCOMPLEX, bli_cgemm_zen_asm_3x8,        TRUE,
	  BLIS_GEMM_UKR,       BLIS_DCOMPLEX, bli_zgemm_zen_asm_3x4,        TRUE,
	  // gemmtrsm_l
	  BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT,    bli_sgemmtrsm_l_zen_asm_6x16, TRUE,
	  BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE,   bli_dgemmtrsm_l_zen_asm_6x8,  TRUE,
	  // gemmtrsm_u
	  BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT,    bli_sgemmtrsm_u_zen_asm_6x16, TRUE,
	  BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE,   bli_dgemmtrsm_u_zen_asm_6x8,  TRUE,
	  cntx
	);

	bli_cntx_set_l1f_kers
	(
	  4,
	  // axpyf
	  BLIS_AXPYF_KER,     BLIS_FLOAT,  bli_saxpyf_zen_int_8,
	  BLIS_AXPYF_KER,     BLIS_DOUBLE, bli_daxpyf_zen_int_8,
	  // dotxf
	  BLIS_DOTXF_KER,     BLIS_FLOAT,  bli_sdotxf_zen_int_8,
	  BLIS_DOTXF_KER,     BLIS_DOUBLE, bli_ddotxf_zen_int_8,
	  cntx
	);

	// Update the context with optimized level-1v kernels.
	bli_cntx_set_l1v_kers
	(
	  10,
	  // amaxv
	  BLIS_AMAXV_KER,  BLIS_FLOAT,  bli_samaxv_zen_int,
	  BLIS_AMAXV_KER,  BLIS_DOUBLE, bli_damaxv_zen_int,
	  // axpyv
#if 0
	  BLIS_AXPYV_KER,  BLIS_FLOAT,  bli_saxpyv_zen_int,
	  BLIS_AXPYV_KER,  BLIS_DOUBLE, bli_daxpyv_zen_int,
#else
	  BLIS_AXPYV_KER,  BLIS_FLOAT,  bli_saxpyv_zen_int10,
	  BLIS_AXPYV_KER,  BLIS_DOUBLE, bli_daxpyv_zen_int10,
#endif
	  // dotv
	  BLIS_DOTV_KER,   BLIS_FLOAT,  bli_sdotv_zen_int,
	  BLIS_DOTV_KER,   BLIS_DOUBLE, bli_ddotv_zen_int,
	  // dotxv
	  BLIS_DOTXV_KER,  BLIS_FLOAT,  bli_sdotxv_zen_int,
	  BLIS_DOTXV_KER,  BLIS_DOUBLE, bli_ddotxv_zen_int,
	  // scalv
#if 0
	  BLIS_SCALV_KER,  BLIS_FLOAT,  bli_sscalv_zen_int,
	  BLIS_SCALV_KER,  BLIS_DOUBLE, bli_dscalv_zen_int,
#else
	  BLIS_SCALV_KER,  BLIS_FLOAT,  bli_sscalv_zen_int10,
	  BLIS_SCALV_KER,  BLIS_DOUBLE, bli_dscalv_zen_int10,
#endif
	  cntx
	);

	// Initialize level-3 blocksize objects with architecture-specific values.
	//                                           s      d      c      z
	bli_blksz_init_easy( &blkszs[ BLIS_MR ],     6,     6,     3,     3 );
	bli_blksz_init_easy( &blkszs[ BLIS_NR ],    16,     8,     8,     4 );
	bli_blksz_init_easy( &blkszs[ BLIS_MC ],   144,    72,   144,    72 );
	bli_blksz_init_easy( &blkszs[ BLIS_KC ],   256,   256,   256,   256 );
	bli_blksz_init_easy( &blkszs[ BLIS_NC ],  4080,  4080,  4080,  4080 );
	bli_blksz_init_easy( &blkszs[ BLIS_AF ],     8,     8,     8,     8 );
	bli_blksz_init_easy( &blkszs[ BLIS_DF ],     8,     8,     8,     8 );

	// Update the context with the current architecture's register and cache
	// blocksizes (and multiples) for native execution.
	bli_cntx_set_blkszs
	(
	  BLIS_NAT, 7,
	  // level-3
	  BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
	  BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
	  BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
	  BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
	  BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
	  // level-1f
	  BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF,
	  BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF,
	  cntx
	);
}
Example #4
0
void GENBARNAME(cntx_init)
     (
       cntx_t* cntx
     )
{
	blksz_t  blkszs[ BLIS_NUM_BLKSZS ];
	func_t*  funcs;
	mbool_t* mbools;
	dim_t    i;


	// -- Clear the context ----------------------------------------------------

	bli_cntx_clear( cntx );


	// -- Set blocksizes -------------------------------------------------------

	//                                          s     d     c     z
	bli_blksz_init_easy( &blkszs[ BLIS_KR ],    1,    1,    1,    1 );
	bli_blksz_init_easy( &blkszs[ BLIS_MR ],    8,    4,    4,    2 );
	bli_blksz_init_easy( &blkszs[ BLIS_NR ],    4,    4,    2,    2 );
	bli_blksz_init_easy( &blkszs[ BLIS_MC ],  512,  256,  256,  128 );
	bli_blksz_init_easy( &blkszs[ BLIS_KC ],  256,  256,  256,  256 );
	bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4096, 4096, 4096, 4096 );
	bli_blksz_init_easy( &blkszs[ BLIS_M2 ], 1000, 1000, 1000, 1000 );
	bli_blksz_init_easy( &blkszs[ BLIS_N2 ], 1000, 1000, 1000, 1000 );
	bli_blksz_init_easy( &blkszs[ BLIS_AF ],    8,    4,    4,    2 );
	bli_blksz_init_easy( &blkszs[ BLIS_DF ],    8,    4,    4,    2 );
	bli_blksz_init_easy( &blkszs[ BLIS_XF ],    8,    4,    4,    2 );

	// Initialize the context with the default blocksize objects and their
	// multiples.
	bli_cntx_set_blkszs
	(
	  BLIS_NAT, 11,
	  BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
	  BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
	  BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
	  BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
	  BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
	  BLIS_KR, &blkszs[ BLIS_KR ], BLIS_KR,
	  BLIS_M2, &blkszs[ BLIS_M2 ], BLIS_M2,
	  BLIS_N2, &blkszs[ BLIS_N2 ], BLIS_N2,
	  BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF,
	  BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF,
	  BLIS_XF, &blkszs[ BLIS_XF ], BLIS_XF,
	  cntx
	);


	// -- Set level-3 virtual micro-kernels ------------------------------------

	funcs = bli_cntx_l3_vir_ukrs_buf( cntx );

	gen_func_init_co( &funcs[ BLIS_GEMM_UKR ],       gemm1m_ukr_name       );
	gen_func_init_co( &funcs[ BLIS_GEMMTRSM_L_UKR ], gemmtrsm1m_l_ukr_name );
	gen_func_init_co( &funcs[ BLIS_GEMMTRSM_U_UKR ], gemmtrsm1m_u_ukr_name );
	gen_func_init_co( &funcs[ BLIS_TRSM_L_UKR ],     trsm1m_l_ukr_name     );
	gen_func_init_co( &funcs[ BLIS_TRSM_U_UKR ],     trsm1m_u_ukr_name     );


	// -- Set level-3 native micro-kernels and preferences ---------------------

	funcs  = bli_cntx_l3_nat_ukrs_buf( cntx );
	mbools = bli_cntx_l3_nat_ukrs_prefs_buf( cntx );

	gen_func_init( &funcs[ BLIS_GEMM_UKR ],       gemm_ukr_name       );
	gen_func_init( &funcs[ BLIS_GEMMTRSM_L_UKR ], gemmtrsm_l_ukr_name );
	gen_func_init( &funcs[ BLIS_GEMMTRSM_U_UKR ], gemmtrsm_u_ukr_name );
	gen_func_init( &funcs[ BLIS_TRSM_L_UKR ],     trsm_l_ukr_name     );
	gen_func_init( &funcs[ BLIS_TRSM_U_UKR ],     trsm_u_ukr_name     );

	bli_mbool_init( &mbools[ BLIS_GEMM_UKR ],       FALSE, FALSE, FALSE, FALSE );
	bli_mbool_init( &mbools[ BLIS_GEMMTRSM_L_UKR ], FALSE, FALSE, FALSE, FALSE );
	bli_mbool_init( &mbools[ BLIS_GEMMTRSM_U_UKR ], FALSE, FALSE, FALSE, FALSE );
	bli_mbool_init( &mbools[ BLIS_TRSM_L_UKR ],     FALSE, FALSE, FALSE, FALSE );
	bli_mbool_init( &mbools[ BLIS_TRSM_U_UKR ],     FALSE, FALSE, FALSE, FALSE );


	// -- Set level-1f kernels -------------------------------------------------

	funcs = bli_cntx_l1f_kers_buf( cntx );

	gen_func_init( &funcs[ BLIS_AXPY2V_KER ],    axpy2v_ker_name    );
	gen_func_init( &funcs[ BLIS_DOTAXPYV_KER ],  dotaxpyv_ker_name  );
	gen_func_init( &funcs[ BLIS_AXPYF_KER ],     axpyf_ker_name     );
	gen_func_init( &funcs[ BLIS_DOTXF_KER ],     dotxf_ker_name     );
	gen_func_init( &funcs[ BLIS_DOTXAXPYF_KER ], dotxaxpyf_ker_name );


	// -- Set level-1v kernels -------------------------------------------------

	funcs = bli_cntx_l1v_kers_buf( cntx );

	gen_func_init( &funcs[ BLIS_ADDV_KER ],    addv_ker_name    );
	gen_func_init( &funcs[ BLIS_AMAXV_KER ],   amaxv_ker_name   );
	gen_func_init( &funcs[ BLIS_AXPBYV_KER ],  axpbyv_ker_name  );
	gen_func_init( &funcs[ BLIS_AXPYV_KER ],   axpyv_ker_name   );
	gen_func_init( &funcs[ BLIS_COPYV_KER ],   copyv_ker_name   );
	gen_func_init( &funcs[ BLIS_DOTV_KER ],    dotv_ker_name    );
	gen_func_init( &funcs[ BLIS_DOTXV_KER ],   dotxv_ker_name   );
	gen_func_init( &funcs[ BLIS_INVERTV_KER ], invertv_ker_name );
	gen_func_init( &funcs[ BLIS_SCALV_KER ],   scalv_ker_name   );
	gen_func_init( &funcs[ BLIS_SCAL2V_KER ],  scal2v_ker_name  );
	gen_func_init( &funcs[ BLIS_SETV_KER ],    setv_ker_name    );
	gen_func_init( &funcs[ BLIS_SUBV_KER ],    subv_ker_name    );
	gen_func_init( &funcs[ BLIS_SWAPV_KER ],   swapv_ker_name   );
	gen_func_init( &funcs[ BLIS_XPBYV_KER ],   xpbyv_ker_name   );


	// -- Set level-1m (packm/unpackm) kernels ---------------------------------

	funcs = bli_cntx_packm_kers_buf( cntx );

	// Initialize all packm kernel func_t entries to NULL.
	for ( i = BLIS_PACKM_0XK_KER; i <= BLIS_PACKM_31XK_KER; ++i )
	{
		bli_func_init_null( &funcs[ i ] );
	}

	gen_func_init( &funcs[ BLIS_PACKM_2XK_KER ],  packm_2xk_ker_name );
	gen_func_init( &funcs[ BLIS_PACKM_3XK_KER ],  packm_3xk_ker_name );
	gen_func_init( &funcs[ BLIS_PACKM_4XK_KER ],  packm_4xk_ker_name );
	gen_func_init( &funcs[ BLIS_PACKM_6XK_KER ],  packm_6xk_ker_name );
	gen_func_init( &funcs[ BLIS_PACKM_8XK_KER ],  packm_8xk_ker_name );
	gen_func_init( &funcs[ BLIS_PACKM_10XK_KER ], packm_10xk_ker_name );
	gen_func_init( &funcs[ BLIS_PACKM_12XK_KER ], packm_12xk_ker_name );
	gen_func_init( &funcs[ BLIS_PACKM_14XK_KER ], packm_14xk_ker_name );
	gen_func_init( &funcs[ BLIS_PACKM_16XK_KER ], packm_16xk_ker_name );
	gen_func_init( &funcs[ BLIS_PACKM_24XK_KER ], packm_24xk_ker_name );
	gen_func_init( &funcs[ BLIS_PACKM_30XK_KER ], packm_30xk_ker_name );

	funcs = bli_cntx_unpackm_kers_buf( cntx );

	// Initialize all packm kernel func_t entries to NULL.
	for ( i = BLIS_UNPACKM_0XK_KER; i <= BLIS_UNPACKM_31XK_KER; ++i )
	{
		bli_func_init_null( &funcs[ i ] );
	}

	gen_func_init( &funcs[ BLIS_UNPACKM_2XK_KER ],  unpackm_2xk_ker_name );
	gen_func_init( &funcs[ BLIS_UNPACKM_4XK_KER ],  unpackm_4xk_ker_name );
	gen_func_init( &funcs[ BLIS_UNPACKM_6XK_KER ],  unpackm_6xk_ker_name );
	gen_func_init( &funcs[ BLIS_UNPACKM_8XK_KER ],  unpackm_8xk_ker_name );
	gen_func_init( &funcs[ BLIS_UNPACKM_10XK_KER ], unpackm_10xk_ker_name );
	gen_func_init( &funcs[ BLIS_UNPACKM_12XK_KER ], unpackm_12xk_ker_name );
	gen_func_init( &funcs[ BLIS_UNPACKM_14XK_KER ], unpackm_14xk_ker_name );
	gen_func_init( &funcs[ BLIS_UNPACKM_16XK_KER ], unpackm_16xk_ker_name );


	// -- Set miscellaneous fields ---------------------------------------------

	bli_cntx_set_method( BLIS_NAT, cntx );

	bli_cntx_set_schema_a_block( BLIS_PACKED_ROW_PANELS, cntx );
	bli_cntx_set_schema_b_panel( BLIS_PACKED_COL_PANELS, cntx );
	bli_cntx_set_schema_c_panel( BLIS_NOT_PACKED,        cntx );

	bli_cntx_set_anti_pref( FALSE, cntx );

	bli_cntx_set_thrloop( 1, 1, 1, 1, 1, cntx );

	bli_cntx_set_membrk( bli_memsys_global_membrk(), cntx );
}
Example #5
0
void bli_cntx_init_zen( cntx_t* cntx )
{
	blksz_t blkszs[ BLIS_NUM_BLKSZS ];
	blksz_t thresh[ BLIS_NUM_THRESH ];

	// Set default kernel blocksizes and functions.
	bli_cntx_init_zen_ref( cntx );

	// -------------------------------------------------------------------------

	// Update the context with optimized native gemm micro-kernels and
	// their storage preferences.
	bli_cntx_set_l3_nat_ukrs
	(
	  8,
	  // gemm
	  BLIS_GEMM_UKR,       BLIS_FLOAT,    bli_sgemm_haswell_asm_6x16,       TRUE,
	  BLIS_GEMM_UKR,       BLIS_DOUBLE,   bli_dgemm_haswell_asm_6x8,        TRUE,
	  BLIS_GEMM_UKR,       BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8,        TRUE,
	  BLIS_GEMM_UKR,       BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4,        TRUE,
	  // gemmtrsm_l
	  BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT,    bli_sgemmtrsm_l_haswell_asm_6x16, TRUE,
	  BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE,   bli_dgemmtrsm_l_haswell_asm_6x8,  TRUE,
	  // gemmtrsm_u
	  BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT,    bli_sgemmtrsm_u_haswell_asm_6x16, TRUE,
	  BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE,   bli_dgemmtrsm_u_haswell_asm_6x8,  TRUE,
	  cntx
	);

	// Update the context with optimized level-1f kernels.
	bli_cntx_set_l1f_kers
	(
	  4,
	  // axpyf
	  BLIS_AXPYF_KER,     BLIS_FLOAT,  bli_saxpyf_zen_int_8,
	  BLIS_AXPYF_KER,     BLIS_DOUBLE, bli_daxpyf_zen_int_8,
	  // dotxf
	  BLIS_DOTXF_KER,     BLIS_FLOAT,  bli_sdotxf_zen_int_8,
	  BLIS_DOTXF_KER,     BLIS_DOUBLE, bli_ddotxf_zen_int_8,
	  cntx
	);

	// Update the context with optimized level-1v kernels.
	bli_cntx_set_l1v_kers
	(
	  10,
	  // amaxv
	  BLIS_AMAXV_KER,  BLIS_FLOAT,  bli_samaxv_zen_int,
	  BLIS_AMAXV_KER,  BLIS_DOUBLE, bli_damaxv_zen_int,
	  // axpyv
#if 0
	  BLIS_AXPYV_KER,  BLIS_FLOAT,  bli_saxpyv_zen_int,
	  BLIS_AXPYV_KER,  BLIS_DOUBLE, bli_daxpyv_zen_int,
#else
	  BLIS_AXPYV_KER,  BLIS_FLOAT,  bli_saxpyv_zen_int10,
	  BLIS_AXPYV_KER,  BLIS_DOUBLE, bli_daxpyv_zen_int10,
#endif
	  // dotv
	  BLIS_DOTV_KER,   BLIS_FLOAT,  bli_sdotv_zen_int,
	  BLIS_DOTV_KER,   BLIS_DOUBLE, bli_ddotv_zen_int,
	  // dotxv
	  BLIS_DOTXV_KER,  BLIS_FLOAT,  bli_sdotxv_zen_int,
	  BLIS_DOTXV_KER,  BLIS_DOUBLE, bli_ddotxv_zen_int,
	  // scalv
#if 0
	  BLIS_SCALV_KER,  BLIS_FLOAT,  bli_sscalv_zen_int,
	  BLIS_SCALV_KER,  BLIS_DOUBLE, bli_dscalv_zen_int,
#else
	  BLIS_SCALV_KER,  BLIS_FLOAT,  bli_sscalv_zen_int10,
	  BLIS_SCALV_KER,  BLIS_DOUBLE, bli_dscalv_zen_int10,
#endif
	  cntx
	);

	// Initialize level-3 blocksize objects with architecture-specific values.
	//                                           s      d      c      z
	bli_blksz_init_easy( &blkszs[ BLIS_MR ],     6,     6,     3,     3 );
	bli_blksz_init_easy( &blkszs[ BLIS_NR ],    16,     8,     8,     4 );
#ifdef BLIS_ENABLE_ZEN_BLOCK_SIZES
	// Zen optmized level 3 cache block sizes
	bli_blksz_init_easy( &blkszs[ BLIS_MC ],  1020,   510,   510,   255 );
	bli_blksz_init_easy( &blkszs[ BLIS_KC ],  1024,  1024,  1024,  1024 );
#else
	bli_blksz_init_easy( &blkszs[ BLIS_MC ],   144,    72,   144,    72 );
	bli_blksz_init_easy( &blkszs[ BLIS_KC ],   256,   256,   256,   256 );
#endif
	bli_blksz_init_easy( &blkszs[ BLIS_NC ],  8160,  4080,  4080,  3056 );
	bli_blksz_init_easy( &blkszs[ BLIS_AF ],     8,     8,    -1,    -1 );
	bli_blksz_init_easy( &blkszs[ BLIS_DF ],     8,     8,    -1,    -1 );

	// Update the context with the current architecture's register and cache
	// blocksizes (and multiples) for native execution.
	bli_cntx_set_blkszs
	(
	  BLIS_NAT, 7,
	  // level-3
	  BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
	  BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
	  BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
	  BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
	  BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
	  // level-1f
	  BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF,
	  BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF,
	  cntx
	);

	// -------------------------------------------------------------------------

	// Initialize sup thresholds with architecture-appropriate values.
	//                                          s     d     c     z
	bli_blksz_init_easy( &thresh[ BLIS_MT ],   -1,  256,   -1,   -1 );
	bli_blksz_init_easy( &thresh[ BLIS_NT ],   -1,  100,   -1,   -1 );
	bli_blksz_init_easy( &thresh[ BLIS_KT ],   -1,  120,   -1,   -1 );

	// Initialize the context with the sup thresholds.
	bli_cntx_set_l3_sup_thresh
	(
	  3,
	  BLIS_MT, &thresh[ BLIS_MT ],
	  BLIS_NT, &thresh[ BLIS_NT ],
	  BLIS_KT, &thresh[ BLIS_KT ],
	  cntx
	);

	// Update the context with optimized small/unpacked gemm kernels.
	bli_cntx_set_l3_sup_kers
	(
	  8,
	  //BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref,
	  BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
	  BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE,
	  BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
	  BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
	  BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
	  BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE,
	  BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
	  BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
	  cntx
	);

	// Initialize level-3 sup blocksize objects with architecture-specific
	// values.
	//                                           s      d      c      z
	bli_blksz_init     ( &blkszs[ BLIS_MR ],    -1,     6,    -1,    -1,
	                                            -1,     9,    -1,    -1 );
	bli_blksz_init_easy( &blkszs[ BLIS_NR ],    -1,     8,    -1,    -1 );
	bli_blksz_init_easy( &blkszs[ BLIS_MC ],    -1,    72,    -1,    -1 );
	bli_blksz_init_easy( &blkszs[ BLIS_KC ],    -1,   256,    -1,    -1 );
	bli_blksz_init_easy( &blkszs[ BLIS_NC ],    -1,  4080,    -1,    -1 );

	// Update the context with the current architecture's register and cache
	// blocksizes for small/unpacked level-3 problems.
	bli_cntx_set_l3_sup_blkszs
	(
	  5,
	  BLIS_NC, &blkszs[ BLIS_NC ],
	  BLIS_KC, &blkszs[ BLIS_KC ],
	  BLIS_MC, &blkszs[ BLIS_MC ],
	  BLIS_NR, &blkszs[ BLIS_NR ],
	  BLIS_MR, &blkszs[ BLIS_MR ],
	  cntx
	);
}
Example #6
0
int main( int argc, char** argv )
{
	//bli_init();

#if 0
	obj_t a, b, c;
	obj_t aa, bb, cc;
	dim_t m, n, k;
	num_t dt;
	uplo_t uploa, uplob, uploc;

	{
		dt = BLIS_DOUBLE;

		m = 6;
		k = 6;
		n = 6;

		bli_obj_create( dt, m, k, 0, 0, &a );
		bli_obj_create( dt, k, n, 0, 0, &b );
		bli_obj_create( dt, m, n, 0, 0, &c );

		uploa = BLIS_UPPER;
		uploa = BLIS_LOWER;
		bli_obj_set_struc( BLIS_TRIANGULAR, &a );
		bli_obj_set_uplo( uploa, &a );
		bli_obj_set_diag_offset( -2, &a );

		uplob = BLIS_UPPER;
		uplob = BLIS_LOWER;
		bli_obj_set_struc( BLIS_TRIANGULAR, &b );
		bli_obj_set_uplo( uplob, &b );
		bli_obj_set_diag_offset( -2, &b );

		uploc = BLIS_UPPER;
		//uploc = BLIS_LOWER;
		//uploc = BLIS_ZEROS;
		//uploc = BLIS_DENSE;
		bli_obj_set_struc( BLIS_HERMITIAN, &c );
		//bli_obj_set_struc( BLIS_TRIANGULAR, &c );
		bli_obj_set_uplo( uploc, &c );
		bli_obj_set_diag_offset(  1, &c );

		bli_obj_alias_to( &a, &aa ); (void)aa;
		bli_obj_alias_to( &b, &bb ); (void)bb;
		bli_obj_alias_to( &c, &cc ); (void)cc;

		bli_randm( &a );
		bli_randm( &b );
		bli_randm( &c );
		//bli_mkherm( &a );
		//bli_mktrim( &a );

		bli_prune_unref_mparts( &cc, BLIS_M,
		                        &aa, BLIS_N );

		bli_printm( "c orig", &c, "%4.1f", "" );
		bli_printm( "c alias", &cc, "%4.1f", "" );
		bli_printm( "a orig", &a, "%4.1f", "" );
		bli_printm( "a alias", &aa, "%4.1f", "" );
		//bli_obj_print( "a struct", &a );
	}
#endif

	dim_t  p_begin, p_max, p_inc;
	gint_t m_input, n_input;
	char   uploa_ch;
	doff_t diagoffa;
	dim_t  bf;
	dim_t  n_way;
	char   part_dim_ch;
	bool_t go_fwd;
	char   out_ch;

	obj_t   a;
	blksz_t bfs;

	thrinfo_t thrinfo;
	dim_t  m, n;
	uplo_t uploa;
	bool_t part_m_dim, part_n_dim;
	bool_t go_bwd;
	dim_t  p;
	num_t  dt;
	dim_t  start, end;

	dim_t  width;
	siz_t  area;

	gint_t t_begin, t_stop, t_inc;
	dim_t  t;

	if ( argc == 13 )
	{
		sscanf( argv[1], "%u", &p_begin );
		sscanf( argv[2], "%u", &p_max );
		sscanf( argv[3], "%u", &p_inc );
		sscanf( argv[4], "%d", &m_input );
		sscanf( argv[5], "%d", &n_input );
		sscanf( argv[6], "%c",  &uploa_ch );
		sscanf( argv[7], "%d", &diagoffa );
		sscanf( argv[8], "%u", &bf );
		sscanf( argv[9], "%u", &n_way );
		sscanf( argv[10], "%c", &part_dim_ch );
		sscanf( argv[11], "%u", &go_fwd );
		sscanf( argv[12], "%c", &out_ch );
	}
	else
	{
		printf( "\n" );
		printf( " %s\n", argv[0] );
		printf( "\n" );
		printf( "  Simulate the dimension ranges assigned to threads when\n" );
		printf( "  partitioning a matrix for parallelism in BLIS.\n" );
		printf( "\n" );
		printf( " Usage:\n" );
		printf( "\n" );
		printf( "  %s p_beg p_max p_inc m n uplo doff bf n_way part_dim go_fwd out\n", argv[0] );
		printf( "\n" );
		printf( "  p_beg:    the first problem size p to test.\n" );
		printf( "  p_max:    the maximum problem size p to test.\n" );
		printf( "  p_inc:    the increase in problem size p between tests.\n" );
		printf( "  m:        the m dimension:\n" );
		printf( "  n:        the n dimension:\n" );
		printf( "            if m,n = -1: bind m,n to problem size p.\n" );
		printf( "            if m,n =  0: bind m,n to p_max.\n" );
		printf( "            if m,n >  0: hold m,n = c constant for all p.\n" );
		printf( "  uplo:     the uplo field of the matrix being partitioned:\n" );
		printf( "            'l': lower-stored (BLIS_LOWER)\n" );
		printf( "            'u': upper-stored (BLIS_UPPER)\n" );
		printf( "            'd': densely-stored (BLIS_DENSE)\n" );
		printf( "  doff:     the diagonal offset of the matrix being partitioned.\n" );
		printf( "  bf:       the simulated blocking factor. all thread ranges must\n" );
		printf( "            be a multiple of bf, except for the range that contains\n" );
		printf( "            the edge case (if one exists). the blocking factor\n" );
		printf( "            would typically correspond to a register blocksize.\n" );
		printf( "  n_way:    the number of ways of parallelism for which we are\n" );
		printf( "            partitioning (i.e.: the number of threads, or thread\n" );
		printf( "            groups).\n" );
		printf( "  part_dim: the dimension to partition:\n" );
		printf( "            'm': partition the m dimension.\n" );
		printf( "            'n': partition the n dimension.\n" );
		printf( "  go_fwd:   the direction to partition:\n" );
		printf( "            '1': forward, e.g. left-to-right (part_dim = 'm') or\n" );
		printf( "                 top-to-bottom (part_dim = 'n')\n" );
		printf( "            '0': backward, e.g. right-to-left (part_dim = 'm') or\n" );
		printf( "                 bottom-to-top (part_dim = 'n')\n" );
		printf( "            NOTE: reversing the direction does not change the\n" );
		printf( "            subpartitions' widths, but it does change which end of\n" );
		printf( "            the index range receives the edge case, if it exists.\n" );
		printf( "  out:      the type of output per thread-column:\n" );
		printf( "            'w': the width (and area) of the thread's subpartition\n" );
		printf( "            'r': the actual ranges of the thread's subpartition\n" );
		printf( "                 where the start and end points of each range are\n" );
		printf( "                 inclusive and exclusive, respectively.\n" );
		printf( "\n" );

		exit(1);
	}

	if ( m_input == 0 ) m_input = p_max;
	if ( n_input == 0 ) n_input = p_max;

	if ( part_dim_ch == 'm' ) { part_m_dim = TRUE;  part_n_dim = FALSE; }
	else                      { part_m_dim = FALSE; part_n_dim = TRUE;  }

	go_bwd = !go_fwd;

	if      ( uploa_ch == 'l' ) uploa = BLIS_LOWER;
	else if ( uploa_ch == 'u' ) uploa = BLIS_UPPER;
	else                        uploa = BLIS_DENSE;

	if ( part_n_dim )
	{
		if ( bli_is_upper( uploa ) ) { t_begin = n_way-1; t_stop = -1;    t_inc = -1; }
		else /* if lower or dense */ { t_begin = 0;       t_stop = n_way; t_inc =  1; }
	}
	else // if ( part_m_dim )
	{
		if ( bli_is_lower( uploa ) ) { t_begin = n_way-1; t_stop = -1;    t_inc = -1; }
		else /* if upper or dense */ { t_begin = 0;       t_stop = n_way; t_inc =  1; }
	}

	printf( "\n" );
	printf( "  part: %3s   doff: %3d   bf: %3d   output: %s\n",
	        ( part_n_dim ? ( go_fwd ? "l2r" : "r2l" )
	                     : ( go_fwd ? "t2b" : "b2t" ) ),
	        ( int )diagoffa, ( int )bf,
            ( out_ch == 'w' ? "width(area)" : "ranges" ) );
	printf( "              uplo: %3c   nt: %3u\n", uploa_ch, ( unsigned )n_way );
	printf( "\n" );

	printf( "             " );
	for ( t = t_begin; t != t_stop; t += t_inc )
	{
		if ( part_n_dim )
		{
			if      ( t == t_begin      ) printf( "left...      " );
			else if ( t == t_stop-t_inc ) printf( "     ...right" );
			else                          printf( "             " );
		}
		else // if ( part_m_dim )
		{
			if      ( t == t_begin      ) printf( "top...       " );
			else if ( t == t_stop-t_inc ) printf( "    ...bottom" );
			else                          printf( "             " );
		}
	}
	printf( "\n" );


	printf( "%4c x %4c  ", 'm', 'n' );
	for ( t = t_begin; t != t_stop; t += t_inc )
	{
		printf( "%9s %u  ", "thread", ( unsigned )t );
	}
	printf( "\n" );
	printf( "-------------" );
	for ( t = t_begin; t != t_stop; t += t_inc )
	{
		printf( "-------------" );
	}
	printf( "\n" );


	for ( p = p_begin; p <= p_max; p += p_inc )
	{
		if ( m_input < 0 ) m = ( dim_t )p;
		else               m = ( dim_t )m_input;
		if ( n_input < 0 ) n = ( dim_t )p;
		else               n = ( dim_t )n_input;

		dt = BLIS_DOUBLE;
		
		bli_obj_create( dt, m, n, 0, 0, &a );

		bli_obj_set_struc( BLIS_TRIANGULAR, &a );
		bli_obj_set_uplo( uploa, &a );
		bli_obj_set_diag_offset( diagoffa, &a );

		bli_randm( &a );

		bli_blksz_init_easy( &bfs, bf, bf, bf, bf );

		printf( "%4u x %4u  ", ( unsigned )m, ( unsigned )n );

		for ( t = t_begin; t != t_stop; t += t_inc )
		{
			thrinfo.n_way   = n_way;
			thrinfo.work_id = t;

			if      ( part_n_dim && go_fwd )
				area = bli_thread_get_range_weighted_l2r( &thrinfo, &a, &bfs, &start, &end );
			else if ( part_n_dim && go_bwd )
				area = bli_thread_get_range_weighted_r2l( &thrinfo, &a, &bfs, &start, &end );
			else if ( part_m_dim && go_fwd )
				area = bli_thread_get_range_weighted_t2b( &thrinfo, &a, &bfs, &start, &end );
			else // ( part_m_dim && go_bwd )
				area = bli_thread_get_range_weighted_b2t( &thrinfo, &a, &bfs, &start, &end );

			width = end - start;

			if ( out_ch == 'w' ) printf( "%4u(%6u) ", ( unsigned )width,
			                                            ( unsigned )area );
			else                 printf( "[%4u,%4u)  ", ( unsigned )start,
			                                              ( unsigned )end );
		}

		printf( "\n" );

		bli_obj_free( &a );
	}

	//bli_finalize();

	return 0;
}