// Initialize a context for the sandybridge sub-configuration.
//
// The context is first filled with the reference defaults and is then
// updated with the optimized native gemm micro-kernels and the register and
// cache blocksizes listed below.
//
// cntx: the context to initialize; it is modified in place.
void bli_cntx_init_sandybridge( cntx_t* cntx )
{
    blksz_t blkszs[ BLIS_NUM_BLKSZS ];

    // Begin with the default kernels, blocksizes, and functions.
    bli_cntx_init_sandybridge_ref( cntx );

    // -------------------------------------------------------------------------

    // Install one optimized native gemm micro-kernel per datatype. Each entry
    // is (kernel id, datatype, function, storage preference); the leading
    // integer is the number of entries that follow.
    bli_cntx_set_l3_nat_ukrs
    (
      4,
      BLIS_GEMM_UKR, BLIS_FLOAT,    bli_sgemm_sandybridge_asm_8x8, FALSE,
      BLIS_GEMM_UKR, BLIS_DOUBLE,   bli_dgemm_sandybridge_asm_8x4, FALSE,
      BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_sandybridge_asm_8x4, FALSE,
      BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_sandybridge_asm_4x4, FALSE,
      cntx
    );

    // Architecture-specific level-3 blocksizes, one value per datatype.
    //                                           s     d     c     z
    bli_blksz_init_easy( &blkszs[ BLIS_MR ],     8,    8,    8,    4 );
    bli_blksz_init_easy( &blkszs[ BLIS_NR ],     8,    4,    4,    4 );
    bli_blksz_init_easy( &blkszs[ BLIS_MC ],   128,   96,   96,   64 );
    bli_blksz_init_easy( &blkszs[ BLIS_KC ],   384,  256,  256,  192 );
    bli_blksz_init_easy( &blkszs[ BLIS_NC ],  4096, 4096, 4096, 4096 );

    // Store the register and cache blocksizes (and their multiples) in the
    // context for native execution. Each entry is (blocksize id, blocksize
    // object, id of the blocksize used as its multiple).
    bli_cntx_set_blkszs
    (
      BLIS_NAT, 5,
      BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
      BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
      BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
      BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
      BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
      cntx
    );
}
// Initialize a context for the cortexa15 sub-configuration.
//
// The context is first filled with the reference defaults and is then
// updated with optimized native gemm micro-kernels for the single- and
// double-precision real datatypes, along with matching blocksizes.
//
// cntx: the context to initialize; it is modified in place.
void bli_cntx_init_cortexa15( cntx_t* cntx )
{
    blksz_t blkszs[ BLIS_NUM_BLKSZS ];

    // Begin with the default kernels, blocksizes, and functions.
    bli_cntx_init_cortexa15_ref( cntx );

    // -------------------------------------------------------------------------

    // Install the optimized native gemm micro-kernels. Only the float and
    // double kernels are registered here. Each entry is (kernel id, datatype,
    // function, storage preference).
    bli_cntx_set_l3_nat_ukrs
    (
      2,
      BLIS_GEMM_UKR, BLIS_FLOAT,  bli_sgemm_armv7a_int_4x4, FALSE,
      BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv7a_int_4x4, FALSE,
      cntx
    );

    // Architecture-specific level-3 blocksizes, one value per datatype.
    // NOTE(review): the complex columns are 0; presumably this leaves the
    // reference values in effect for c and z -- confirm against
    // bli_cntx_set_blkszs.
    //                                           s     d     c     z
    bli_blksz_init_easy( &blkszs[ BLIS_MR ],     4,    4,    0,    0 );
    bli_blksz_init_easy( &blkszs[ BLIS_NR ],     4,    4,    0,    0 );
    bli_blksz_init_easy( &blkszs[ BLIS_MC ],   336,  176,    0,    0 );
    bli_blksz_init_easy( &blkszs[ BLIS_KC ],   528,  368,    0,    0 );
    bli_blksz_init_easy( &blkszs[ BLIS_NC ],  4096, 4096,    0,    0 );

    // Store the register and cache blocksizes (and their multiples) in the
    // context for native execution. Each entry is (blocksize id, blocksize
    // object, id of the blocksize used as its multiple).
    bli_cntx_set_blkszs
    (
      BLIS_NAT, 5,
      BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
      BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
      BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
      BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
      BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
      cntx
    );
}
// Initialize a context for the haswell sub-configuration.
//
// The context is first filled with the reference defaults and is then
// updated with optimized level-3 micro-kernels, level-1f and level-1v
// kernels, and the blocksizes that accompany them.
//
// cntx: the context to initialize; it is modified in place.
void bli_cntx_init_haswell( cntx_t* cntx )
{
    blksz_t blkszs[ BLIS_NUM_BLKSZS ];

    // Begin with the default kernels, blocksizes, and functions.
    bli_cntx_init_haswell_ref( cntx );

    // -------------------------------------------------------------------------

    // Install the optimized native level-3 micro-kernels. Each entry is
    // (kernel id, datatype, function, storage preference).
    bli_cntx_set_l3_nat_ukrs
    (
      8,

      // gemm
      BLIS_GEMM_UKR,       BLIS_FLOAT,    bli_sgemm_zen_asm_6x16,       TRUE,
      BLIS_GEMM_UKR,       BLIS_DOUBLE,   bli_dgemm_zen_asm_6x8,        TRUE,
      BLIS_GEMM_UKR,       BLIS_SCOMPLEX, bli_cgemm_zen_asm_3x8,        TRUE,
      BLIS_GEMM_UKR,       BLIS_DCOMPLEX, bli_zgemm_zen_asm_3x4,        TRUE,

      // gemmtrsm_l
      BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT,    bli_sgemmtrsm_l_zen_asm_6x16, TRUE,
      BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE,   bli_dgemmtrsm_l_zen_asm_6x8,  TRUE,

      // gemmtrsm_u
      BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT,    bli_sgemmtrsm_u_zen_asm_6x16, TRUE,
      BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE,   bli_dgemmtrsm_u_zen_asm_6x8,  TRUE,

      cntx
    );

    // Install the optimized level-1f kernels.
    bli_cntx_set_l1f_kers
    (
      4,

      // axpyf
      BLIS_AXPYF_KER, BLIS_FLOAT,  bli_saxpyf_zen_int_8,
      BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_8,

      // dotxf
      BLIS_DOTXF_KER, BLIS_FLOAT,  bli_sdotxf_zen_int_8,
      BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8,

      cntx
    );

    // Install the optimized level-1v kernels. For axpyv and scalv the
    // "_int10" variants are the ones compiled in; the alternatives are kept
    // behind "#if 0" so that they can be switched back on.
    bli_cntx_set_l1v_kers
    (
      10,

      // amaxv
      BLIS_AMAXV_KER, BLIS_FLOAT,  bli_samaxv_zen_int,
      BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int,

      // axpyv
#if 0
      BLIS_AXPYV_KER, BLIS_FLOAT,  bli_saxpyv_zen_int,
      BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int,
#else
      BLIS_AXPYV_KER, BLIS_FLOAT,  bli_saxpyv_zen_int10,
      BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10,
#endif

      // dotv
      BLIS_DOTV_KER,  BLIS_FLOAT,  bli_sdotv_zen_int,
      BLIS_DOTV_KER,  BLIS_DOUBLE, bli_ddotv_zen_int,

      // dotxv
      BLIS_DOTXV_KER, BLIS_FLOAT,  bli_sdotxv_zen_int,
      BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_zen_int,

      // scalv
#if 0
      BLIS_SCALV_KER, BLIS_FLOAT,  bli_sscalv_zen_int,
      BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int,
#else
      BLIS_SCALV_KER, BLIS_FLOAT,  bli_sscalv_zen_int10,
      BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10,
#endif

      cntx
    );

    // Architecture-specific blocksizes, one value per datatype.
    //                                           s     d     c     z
    bli_blksz_init_easy( &blkszs[ BLIS_MR ],     6,    6,    3,    3 );
    bli_blksz_init_easy( &blkszs[ BLIS_NR ],    16,    8,    8,    4 );
    bli_blksz_init_easy( &blkszs[ BLIS_MC ],   144,   72,  144,   72 );
    bli_blksz_init_easy( &blkszs[ BLIS_KC ],   256,  256,  256,  256 );
    bli_blksz_init_easy( &blkszs[ BLIS_NC ],  4080, 4080, 4080, 4080 );
    bli_blksz_init_easy( &blkszs[ BLIS_AF ],     8,    8,    8,    8 );
    bli_blksz_init_easy( &blkszs[ BLIS_DF ],     8,    8,    8,    8 );

    // Store the register and cache blocksizes (and their multiples) in the
    // context for native execution. Each entry is (blocksize id, blocksize
    // object, id of the blocksize used as its multiple).
    bli_cntx_set_blkszs
    (
      BLIS_NAT, 7,

      // level-3
      BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
      BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
      BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
      BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
      BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,

      // level-1f
      BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF,
      BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF,

      cntx
    );
}
// Initialize a context with the default (reference) blocksizes, kernels, and
// miscellaneous settings.
//
// The function name is produced by the GENBARNAME macro. The context is
// cleared first, so every field set below overwrites a blank slate.
//
// cntx: the context to initialize; it is modified in place.
void GENBARNAME(cntx_init)
     (
       cntx_t* cntx
     )
{
    blksz_t  blkszs[ BLIS_NUM_BLKSZS ];
    func_t*  funcs;
    mbool_t* mbools;
    dim_t    ker_id;

    // -- Clear the context ----------------------------------------------------

    bli_cntx_clear( cntx );

    // -- Set blocksizes -------------------------------------------------------

    // Default blocksizes, one value per datatype.
    //                                          s     d     c     z
    bli_blksz_init_easy( &blkszs[ BLIS_KR ],    1,    1,    1,    1 );
    bli_blksz_init_easy( &blkszs[ BLIS_MR ],    8,    4,    4,    2 );
    bli_blksz_init_easy( &blkszs[ BLIS_NR ],    4,    4,    2,    2 );
    bli_blksz_init_easy( &blkszs[ BLIS_MC ],  512,  256,  256,  128 );
    bli_blksz_init_easy( &blkszs[ BLIS_KC ],  256,  256,  256,  256 );
    bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4096, 4096, 4096, 4096 );
    bli_blksz_init_easy( &blkszs[ BLIS_M2 ], 1000, 1000, 1000, 1000 );
    bli_blksz_init_easy( &blkszs[ BLIS_N2 ], 1000, 1000, 1000, 1000 );
    bli_blksz_init_easy( &blkszs[ BLIS_AF ],    8,    4,    4,    2 );
    bli_blksz_init_easy( &blkszs[ BLIS_DF ],    8,    4,    4,    2 );
    bli_blksz_init_easy( &blkszs[ BLIS_XF ],    8,    4,    4,    2 );

    // Store the default blocksize objects and their multiples in the
    // context. Each entry is (blocksize id, blocksize object, id of the
    // blocksize used as its multiple).
    bli_cntx_set_blkszs
    (
      BLIS_NAT, 11,
      BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
      BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
      BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
      BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
      BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
      BLIS_KR, &blkszs[ BLIS_KR ], BLIS_KR,
      BLIS_M2, &blkszs[ BLIS_M2 ], BLIS_M2,
      BLIS_N2, &blkszs[ BLIS_N2 ], BLIS_N2,
      BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF,
      BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF,
      BLIS_XF, &blkszs[ BLIS_XF ], BLIS_XF,
      cntx
    );

    // -- Set level-3 virtual micro-kernels ------------------------------------

    funcs = bli_cntx_l3_vir_ukrs_buf( cntx );

    gen_func_init_co( &funcs[ BLIS_GEMM_UKR ],       gemm1m_ukr_name       );
    gen_func_init_co( &funcs[ BLIS_GEMMTRSM_L_UKR ], gemmtrsm1m_l_ukr_name );
    gen_func_init_co( &funcs[ BLIS_GEMMTRSM_U_UKR ], gemmtrsm1m_u_ukr_name );
    gen_func_init_co( &funcs[ BLIS_TRSM_L_UKR ],     trsm1m_l_ukr_name     );
    gen_func_init_co( &funcs[ BLIS_TRSM_U_UKR ],     trsm1m_u_ukr_name     );

    // -- Set level-3 native micro-kernels and preferences ---------------------

    funcs  = bli_cntx_l3_nat_ukrs_buf( cntx );
    mbools = bli_cntx_l3_nat_ukrs_prefs_buf( cntx );

    gen_func_init( &funcs[ BLIS_GEMM_UKR ],       gemm_ukr_name       );
    gen_func_init( &funcs[ BLIS_GEMMTRSM_L_UKR ], gemmtrsm_l_ukr_name );
    gen_func_init( &funcs[ BLIS_GEMMTRSM_U_UKR ], gemmtrsm_u_ukr_name );
    gen_func_init( &funcs[ BLIS_TRSM_L_UKR ],     trsm_l_ukr_name     );
    gen_func_init( &funcs[ BLIS_TRSM_U_UKR ],     trsm_u_ukr_name     );

    // Every native micro-kernel gets a preference of FALSE for all four
    // datatypes.
    //                                                  s      d      c      z
    bli_mbool_init( &mbools[ BLIS_GEMM_UKR ],       FALSE, FALSE, FALSE, FALSE );
    bli_mbool_init( &mbools[ BLIS_GEMMTRSM_L_UKR ], FALSE, FALSE, FALSE, FALSE );
    bli_mbool_init( &mbools[ BLIS_GEMMTRSM_U_UKR ], FALSE, FALSE, FALSE, FALSE );
    bli_mbool_init( &mbools[ BLIS_TRSM_L_UKR ],     FALSE, FALSE, FALSE, FALSE );
    bli_mbool_init( &mbools[ BLIS_TRSM_U_UKR ],     FALSE, FALSE, FALSE, FALSE );

    // -- Set level-1f kernels -------------------------------------------------

    funcs = bli_cntx_l1f_kers_buf( cntx );

    gen_func_init( &funcs[ BLIS_AXPY2V_KER ],    axpy2v_ker_name    );
    gen_func_init( &funcs[ BLIS_DOTAXPYV_KER ],  dotaxpyv_ker_name  );
    gen_func_init( &funcs[ BLIS_AXPYF_KER ],     axpyf_ker_name     );
    gen_func_init( &funcs[ BLIS_DOTXF_KER ],     dotxf_ker_name     );
    gen_func_init( &funcs[ BLIS_DOTXAXPYF_KER ], dotxaxpyf_ker_name );

    // -- Set level-1v kernels -------------------------------------------------

    funcs = bli_cntx_l1v_kers_buf( cntx );

    gen_func_init( &funcs[ BLIS_ADDV_KER ],    addv_ker_name    );
    gen_func_init( &funcs[ BLIS_AMAXV_KER ],   amaxv_ker_name   );
    gen_func_init( &funcs[ BLIS_AXPBYV_KER ],  axpbyv_ker_name  );
    gen_func_init( &funcs[ BLIS_AXPYV_KER ],   axpyv_ker_name   );
    gen_func_init( &funcs[ BLIS_COPYV_KER ],   copyv_ker_name   );
    gen_func_init( &funcs[ BLIS_DOTV_KER ],    dotv_ker_name    );
    gen_func_init( &funcs[ BLIS_DOTXV_KER ],   dotxv_ker_name   );
    gen_func_init( &funcs[ BLIS_INVERTV_KER ], invertv_ker_name );
    gen_func_init( &funcs[ BLIS_SCALV_KER ],   scalv_ker_name   );
    gen_func_init( &funcs[ BLIS_SCAL2V_KER ],  scal2v_ker_name  );
    gen_func_init( &funcs[ BLIS_SETV_KER ],    setv_ker_name    );
    gen_func_init( &funcs[ BLIS_SUBV_KER ],    subv_ker_name    );
    gen_func_init( &funcs[ BLIS_SWAPV_KER ],   swapv_ker_name   );
    gen_func_init( &funcs[ BLIS_XPBYV_KER ],   xpbyv_ker_name   );

    // -- Set level-1m (packm/unpackm) kernels ---------------------------------

    funcs = bli_cntx_packm_kers_buf( cntx );

    // Null out every packm slot first; only the panel widths listed below
    // receive a kernel.
    for ( ker_id = BLIS_PACKM_0XK_KER; ker_id <= BLIS_PACKM_31XK_KER; ++ker_id )
    {
        bli_func_init_null( &funcs[ ker_id ] );
    }

    gen_func_init( &funcs[ BLIS_PACKM_2XK_KER ],  packm_2xk_ker_name  );
    gen_func_init( &funcs[ BLIS_PACKM_3XK_KER ],  packm_3xk_ker_name  );
    gen_func_init( &funcs[ BLIS_PACKM_4XK_KER ],  packm_4xk_ker_name  );
    gen_func_init( &funcs[ BLIS_PACKM_6XK_KER ],  packm_6xk_ker_name  );
    gen_func_init( &funcs[ BLIS_PACKM_8XK_KER ],  packm_8xk_ker_name  );
    gen_func_init( &funcs[ BLIS_PACKM_10XK_KER ], packm_10xk_ker_name );
    gen_func_init( &funcs[ BLIS_PACKM_12XK_KER ], packm_12xk_ker_name );
    gen_func_init( &funcs[ BLIS_PACKM_14XK_KER ], packm_14xk_ker_name );
    gen_func_init( &funcs[ BLIS_PACKM_16XK_KER ], packm_16xk_ker_name );
    gen_func_init( &funcs[ BLIS_PACKM_24XK_KER ], packm_24xk_ker_name );
    gen_func_init( &funcs[ BLIS_PACKM_30XK_KER ], packm_30xk_ker_name );

    funcs = bli_cntx_unpackm_kers_buf( cntx );

    // Null out every unpackm slot first; only the panel widths listed below
    // receive a kernel.
    for ( ker_id = BLIS_UNPACKM_0XK_KER; ker_id <= BLIS_UNPACKM_31XK_KER; ++ker_id )
    {
        bli_func_init_null( &funcs[ ker_id ] );
    }

    gen_func_init( &funcs[ BLIS_UNPACKM_2XK_KER ],  unpackm_2xk_ker_name  );
    gen_func_init( &funcs[ BLIS_UNPACKM_4XK_KER ],  unpackm_4xk_ker_name  );
    gen_func_init( &funcs[ BLIS_UNPACKM_6XK_KER ],  unpackm_6xk_ker_name  );
    gen_func_init( &funcs[ BLIS_UNPACKM_8XK_KER ],  unpackm_8xk_ker_name  );
    gen_func_init( &funcs[ BLIS_UNPACKM_10XK_KER ], unpackm_10xk_ker_name );
    gen_func_init( &funcs[ BLIS_UNPACKM_12XK_KER ], unpackm_12xk_ker_name );
    gen_func_init( &funcs[ BLIS_UNPACKM_14XK_KER ], unpackm_14xk_ker_name );
    gen_func_init( &funcs[ BLIS_UNPACKM_16XK_KER ], unpackm_16xk_ker_name );

    // -- Set miscellaneous fields ---------------------------------------------

    bli_cntx_set_method( BLIS_NAT, cntx );

    bli_cntx_set_schema_a_block( BLIS_PACKED_ROW_PANELS, cntx );
    bli_cntx_set_schema_b_panel( BLIS_PACKED_COL_PANELS, cntx );
    bli_cntx_set_schema_c_panel( BLIS_NOT_PACKED,        cntx );

    bli_cntx_set_anti_pref( FALSE, cntx );

    bli_cntx_set_thrloop( 1, 1, 1, 1, 1, cntx );

    bli_cntx_set_membrk( bli_memsys_global_membrk(), cntx );
}
// Initialize a context for the zen sub-configuration.
//
// The context is first filled with the reference defaults and is then
// updated with:
//   - optimized native level-3 micro-kernels, level-1f and level-1v kernels;
//   - register/cache blocksizes for native execution;
//   - thresholds, kernels, and blocksizes for the small/unpacked ("sup")
//     level-3 code path (double precision only).
//
// cntx: the context to initialize; it is modified in place.
void bli_cntx_init_zen( cntx_t* cntx )
{
    blksz_t blkszs[ BLIS_NUM_BLKSZS ];
    blksz_t thresh[ BLIS_NUM_THRESH ];

    // Begin with the default kernels, blocksizes, and functions.
    bli_cntx_init_zen_ref( cntx );

    // -------------------------------------------------------------------------

    // Install the optimized native level-3 micro-kernels. Each entry is
    // (kernel id, datatype, function, storage preference).
    bli_cntx_set_l3_nat_ukrs
    (
      8,

      // gemm
      BLIS_GEMM_UKR,       BLIS_FLOAT,    bli_sgemm_haswell_asm_6x16,       TRUE,
      BLIS_GEMM_UKR,       BLIS_DOUBLE,   bli_dgemm_haswell_asm_6x8,        TRUE,
      BLIS_GEMM_UKR,       BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8,        TRUE,
      BLIS_GEMM_UKR,       BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4,        TRUE,

      // gemmtrsm_l
      BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT,    bli_sgemmtrsm_l_haswell_asm_6x16, TRUE,
      BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE,   bli_dgemmtrsm_l_haswell_asm_6x8,  TRUE,

      // gemmtrsm_u
      BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT,    bli_sgemmtrsm_u_haswell_asm_6x16, TRUE,
      BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE,   bli_dgemmtrsm_u_haswell_asm_6x8,  TRUE,

      cntx
    );

    // Install the optimized level-1f kernels.
    bli_cntx_set_l1f_kers
    (
      4,

      // axpyf
      BLIS_AXPYF_KER, BLIS_FLOAT,  bli_saxpyf_zen_int_8,
      BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_8,

      // dotxf
      BLIS_DOTXF_KER, BLIS_FLOAT,  bli_sdotxf_zen_int_8,
      BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8,

      cntx
    );

    // Install the optimized level-1v kernels. For axpyv and scalv the
    // "_int10" variants are the ones compiled in; the alternatives are kept
    // behind "#if 0" so that they can be switched back on.
    bli_cntx_set_l1v_kers
    (
      10,

      // amaxv
      BLIS_AMAXV_KER, BLIS_FLOAT,  bli_samaxv_zen_int,
      BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int,

      // axpyv
#if 0
      BLIS_AXPYV_KER, BLIS_FLOAT,  bli_saxpyv_zen_int,
      BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int,
#else
      BLIS_AXPYV_KER, BLIS_FLOAT,  bli_saxpyv_zen_int10,
      BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10,
#endif

      // dotv
      BLIS_DOTV_KER,  BLIS_FLOAT,  bli_sdotv_zen_int,
      BLIS_DOTV_KER,  BLIS_DOUBLE, bli_ddotv_zen_int,

      // dotxv
      BLIS_DOTXV_KER, BLIS_FLOAT,  bli_sdotxv_zen_int,
      BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_zen_int,

      // scalv
#if 0
      BLIS_SCALV_KER, BLIS_FLOAT,  bli_sscalv_zen_int,
      BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int,
#else
      BLIS_SCALV_KER, BLIS_FLOAT,  bli_sscalv_zen_int10,
      BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10,
#endif

      cntx
    );

    // Architecture-specific blocksizes for native execution, one value per
    // datatype.
    //                                           s     d     c     z
    bli_blksz_init_easy( &blkszs[ BLIS_MR ],     6,    6,    3,    3 );
    bli_blksz_init_easy( &blkszs[ BLIS_NR ],    16,    8,    8,    4 );
#ifdef BLIS_ENABLE_ZEN_BLOCK_SIZES
    // Zen-optimized level-3 cache blocksizes.
    bli_blksz_init_easy( &blkszs[ BLIS_MC ],  1020,  510,  510,  255 );
    bli_blksz_init_easy( &blkszs[ BLIS_KC ],  1024, 1024, 1024, 1024 );
#else
    bli_blksz_init_easy( &blkszs[ BLIS_MC ],   144,   72,  144,   72 );
    bli_blksz_init_easy( &blkszs[ BLIS_KC ],   256,  256,  256,  256 );
#endif
    bli_blksz_init_easy( &blkszs[ BLIS_NC ],  8160, 4080, 4080, 3056 );
    // NOTE(review): -1 presumably means "keep the value already in the
    // context" for that datatype -- confirm against bli_cntx_set_blkszs.
    bli_blksz_init_easy( &blkszs[ BLIS_AF ],     8,    8,   -1,   -1 );
    bli_blksz_init_easy( &blkszs[ BLIS_DF ],     8,    8,   -1,   -1 );

    // Store the register and cache blocksizes (and their multiples) in the
    // context for native execution. Each entry is (blocksize id, blocksize
    // object, id of the blocksize used as its multiple).
    bli_cntx_set_blkszs
    (
      BLIS_NAT, 7,

      // level-3
      BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
      BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
      BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
      BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
      BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,

      // level-1f
      BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF,
      BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF,

      cntx
    );

    // -------------------------------------------------------------------------

    // Sup thresholds; only the double-precision column is populated.
    //                                          s     d     c     z
    bli_blksz_init_easy( &thresh[ BLIS_MT ],   -1,  256,   -1,   -1 );
    bli_blksz_init_easy( &thresh[ BLIS_NT ],   -1,  100,   -1,   -1 );
    bli_blksz_init_easy( &thresh[ BLIS_KT ],   -1,  120,   -1,   -1 );

    // Store the sup thresholds in the context.
    bli_cntx_set_l3_sup_thresh
    (
      3,
      BLIS_MT, &thresh[ BLIS_MT ],
      BLIS_NT, &thresh[ BLIS_NT ],
      BLIS_KT, &thresh[ BLIS_KT ],
      cntx
    );

    // Install the optimized small/unpacked gemm kernels, one per storage
    // combination id (BLIS_RRR ... BLIS_CCC), for double precision.
    bli_cntx_set_l3_sup_kers
    (
      8,
      BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
      BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE,
      BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
      BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
      BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
      BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE,
      BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
      BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
      cntx
    );

    // Sup blocksizes. The blkszs array is reused here; its native values
    // were already stored in the context by bli_cntx_set_blkszs above.
    // NOTE(review): for BLIS_MR the second group of four arguments to
    // bli_blksz_init() looks like per-datatype maximum values (9 for
    // double) -- confirm against the bli_blksz_init() declaration.
    //                                           s     d     c     z
    bli_blksz_init     ( &blkszs[ BLIS_MR ],    -1,    6,   -1,   -1,
                                                -1,    9,   -1,   -1 );
    bli_blksz_init_easy( &blkszs[ BLIS_NR ],    -1,    8,   -1,   -1 );
    bli_blksz_init_easy( &blkszs[ BLIS_MC ],    -1,   72,   -1,   -1 );
    bli_blksz_init_easy( &blkszs[ BLIS_KC ],    -1,  256,   -1,   -1 );
    bli_blksz_init_easy( &blkszs[ BLIS_NC ],    -1, 4080,   -1,   -1 );

    // Store the register and cache blocksizes for small/unpacked level-3
    // problems in the context.
    bli_cntx_set_l3_sup_blkszs
    (
      5,
      BLIS_NC, &blkszs[ BLIS_NC ],
      BLIS_KC, &blkszs[ BLIS_KC ],
      BLIS_MC, &blkszs[ BLIS_MC ],
      BLIS_NR, &blkszs[ BLIS_NR ],
      BLIS_MR, &blkszs[ BLIS_MR ],
      cntx
    );
}
int main( int argc, char** argv ) { //bli_init(); #if 0 obj_t a, b, c; obj_t aa, bb, cc; dim_t m, n, k; num_t dt; uplo_t uploa, uplob, uploc; { dt = BLIS_DOUBLE; m = 6; k = 6; n = 6; bli_obj_create( dt, m, k, 0, 0, &a ); bli_obj_create( dt, k, n, 0, 0, &b ); bli_obj_create( dt, m, n, 0, 0, &c ); uploa = BLIS_UPPER; uploa = BLIS_LOWER; bli_obj_set_struc( BLIS_TRIANGULAR, &a ); bli_obj_set_uplo( uploa, &a ); bli_obj_set_diag_offset( -2, &a ); uplob = BLIS_UPPER; uplob = BLIS_LOWER; bli_obj_set_struc( BLIS_TRIANGULAR, &b ); bli_obj_set_uplo( uplob, &b ); bli_obj_set_diag_offset( -2, &b ); uploc = BLIS_UPPER; //uploc = BLIS_LOWER; //uploc = BLIS_ZEROS; //uploc = BLIS_DENSE; bli_obj_set_struc( BLIS_HERMITIAN, &c ); //bli_obj_set_struc( BLIS_TRIANGULAR, &c ); bli_obj_set_uplo( uploc, &c ); bli_obj_set_diag_offset( 1, &c ); bli_obj_alias_to( &a, &aa ); (void)aa; bli_obj_alias_to( &b, &bb ); (void)bb; bli_obj_alias_to( &c, &cc ); (void)cc; bli_randm( &a ); bli_randm( &b ); bli_randm( &c ); //bli_mkherm( &a ); //bli_mktrim( &a ); bli_prune_unref_mparts( &cc, BLIS_M, &aa, BLIS_N ); bli_printm( "c orig", &c, "%4.1f", "" ); bli_printm( "c alias", &cc, "%4.1f", "" ); bli_printm( "a orig", &a, "%4.1f", "" ); bli_printm( "a alias", &aa, "%4.1f", "" ); //bli_obj_print( "a struct", &a ); } #endif dim_t p_begin, p_max, p_inc; gint_t m_input, n_input; char uploa_ch; doff_t diagoffa; dim_t bf; dim_t n_way; char part_dim_ch; bool_t go_fwd; char out_ch; obj_t a; blksz_t bfs; thrinfo_t thrinfo; dim_t m, n; uplo_t uploa; bool_t part_m_dim, part_n_dim; bool_t go_bwd; dim_t p; num_t dt; dim_t start, end; dim_t width; siz_t area; gint_t t_begin, t_stop, t_inc; dim_t t; if ( argc == 13 ) { sscanf( argv[1], "%u", &p_begin ); sscanf( argv[2], "%u", &p_max ); sscanf( argv[3], "%u", &p_inc ); sscanf( argv[4], "%d", &m_input ); sscanf( argv[5], "%d", &n_input ); sscanf( argv[6], "%c", &uploa_ch ); sscanf( argv[7], "%d", &diagoffa ); sscanf( argv[8], "%u", &bf ); sscanf( argv[9], "%u", &n_way ); 
sscanf( argv[10], "%c", &part_dim_ch ); sscanf( argv[11], "%u", &go_fwd ); sscanf( argv[12], "%c", &out_ch ); } else { printf( "\n" ); printf( " %s\n", argv[0] ); printf( "\n" ); printf( " Simulate the dimension ranges assigned to threads when\n" ); printf( " partitioning a matrix for parallelism in BLIS.\n" ); printf( "\n" ); printf( " Usage:\n" ); printf( "\n" ); printf( " %s p_beg p_max p_inc m n uplo doff bf n_way part_dim go_fwd out\n", argv[0] ); printf( "\n" ); printf( " p_beg: the first problem size p to test.\n" ); printf( " p_max: the maximum problem size p to test.\n" ); printf( " p_inc: the increase in problem size p between tests.\n" ); printf( " m: the m dimension:\n" ); printf( " n: the n dimension:\n" ); printf( " if m,n = -1: bind m,n to problem size p.\n" ); printf( " if m,n = 0: bind m,n to p_max.\n" ); printf( " if m,n > 0: hold m,n = c constant for all p.\n" ); printf( " uplo: the uplo field of the matrix being partitioned:\n" ); printf( " 'l': lower-stored (BLIS_LOWER)\n" ); printf( " 'u': upper-stored (BLIS_UPPER)\n" ); printf( " 'd': densely-stored (BLIS_DENSE)\n" ); printf( " doff: the diagonal offset of the matrix being partitioned.\n" ); printf( " bf: the simulated blocking factor. all thread ranges must\n" ); printf( " be a multiple of bf, except for the range that contains\n" ); printf( " the edge case (if one exists). the blocking factor\n" ); printf( " would typically correspond to a register blocksize.\n" ); printf( " n_way: the number of ways of parallelism for which we are\n" ); printf( " partitioning (i.e.: the number of threads, or thread\n" ); printf( " groups).\n" ); printf( " part_dim: the dimension to partition:\n" ); printf( " 'm': partition the m dimension.\n" ); printf( " 'n': partition the n dimension.\n" ); printf( " go_fwd: the direction to partition:\n" ); printf( " '1': forward, e.g. left-to-right (part_dim = 'm') or\n" ); printf( " top-to-bottom (part_dim = 'n')\n" ); printf( " '0': backward, e.g. 
right-to-left (part_dim = 'm') or\n" ); printf( " bottom-to-top (part_dim = 'n')\n" ); printf( " NOTE: reversing the direction does not change the\n" ); printf( " subpartitions' widths, but it does change which end of\n" ); printf( " the index range receives the edge case, if it exists.\n" ); printf( " out: the type of output per thread-column:\n" ); printf( " 'w': the width (and area) of the thread's subpartition\n" ); printf( " 'r': the actual ranges of the thread's subpartition\n" ); printf( " where the start and end points of each range are\n" ); printf( " inclusive and exclusive, respectively.\n" ); printf( "\n" ); exit(1); } if ( m_input == 0 ) m_input = p_max; if ( n_input == 0 ) n_input = p_max; if ( part_dim_ch == 'm' ) { part_m_dim = TRUE; part_n_dim = FALSE; } else { part_m_dim = FALSE; part_n_dim = TRUE; } go_bwd = !go_fwd; if ( uploa_ch == 'l' ) uploa = BLIS_LOWER; else if ( uploa_ch == 'u' ) uploa = BLIS_UPPER; else uploa = BLIS_DENSE; if ( part_n_dim ) { if ( bli_is_upper( uploa ) ) { t_begin = n_way-1; t_stop = -1; t_inc = -1; } else /* if lower or dense */ { t_begin = 0; t_stop = n_way; t_inc = 1; } } else // if ( part_m_dim ) { if ( bli_is_lower( uploa ) ) { t_begin = n_way-1; t_stop = -1; t_inc = -1; } else /* if upper or dense */ { t_begin = 0; t_stop = n_way; t_inc = 1; } } printf( "\n" ); printf( " part: %3s doff: %3d bf: %3d output: %s\n", ( part_n_dim ? ( go_fwd ? "l2r" : "r2l" ) : ( go_fwd ? "t2b" : "b2t" ) ), ( int )diagoffa, ( int )bf, ( out_ch == 'w' ? "width(area)" : "ranges" ) ); printf( " uplo: %3c nt: %3u\n", uploa_ch, ( unsigned )n_way ); printf( "\n" ); printf( " " ); for ( t = t_begin; t != t_stop; t += t_inc ) { if ( part_n_dim ) { if ( t == t_begin ) printf( "left... " ); else if ( t == t_stop-t_inc ) printf( " ...right" ); else printf( " " ); } else // if ( part_m_dim ) { if ( t == t_begin ) printf( "top... 
" ); else if ( t == t_stop-t_inc ) printf( " ...bottom" ); else printf( " " ); } } printf( "\n" ); printf( "%4c x %4c ", 'm', 'n' ); for ( t = t_begin; t != t_stop; t += t_inc ) { printf( "%9s %u ", "thread", ( unsigned )t ); } printf( "\n" ); printf( "-------------" ); for ( t = t_begin; t != t_stop; t += t_inc ) { printf( "-------------" ); } printf( "\n" ); for ( p = p_begin; p <= p_max; p += p_inc ) { if ( m_input < 0 ) m = ( dim_t )p; else m = ( dim_t )m_input; if ( n_input < 0 ) n = ( dim_t )p; else n = ( dim_t )n_input; dt = BLIS_DOUBLE; bli_obj_create( dt, m, n, 0, 0, &a ); bli_obj_set_struc( BLIS_TRIANGULAR, &a ); bli_obj_set_uplo( uploa, &a ); bli_obj_set_diag_offset( diagoffa, &a ); bli_randm( &a ); bli_blksz_init_easy( &bfs, bf, bf, bf, bf ); printf( "%4u x %4u ", ( unsigned )m, ( unsigned )n ); for ( t = t_begin; t != t_stop; t += t_inc ) { thrinfo.n_way = n_way; thrinfo.work_id = t; if ( part_n_dim && go_fwd ) area = bli_thread_get_range_weighted_l2r( &thrinfo, &a, &bfs, &start, &end ); else if ( part_n_dim && go_bwd ) area = bli_thread_get_range_weighted_r2l( &thrinfo, &a, &bfs, &start, &end ); else if ( part_m_dim && go_fwd ) area = bli_thread_get_range_weighted_t2b( &thrinfo, &a, &bfs, &start, &end ); else // ( part_m_dim && go_bwd ) area = bli_thread_get_range_weighted_b2t( &thrinfo, &a, &bfs, &start, &end ); width = end - start; if ( out_ch == 'w' ) printf( "%4u(%6u) ", ( unsigned )width, ( unsigned )area ); else printf( "[%4u,%4u) ", ( unsigned )start, ( unsigned )end ); } printf( "\n" ); bli_obj_free( &a ); } //bli_finalize(); return 0; }