const num_t dt = PASTEMAC(ch,type); \ \ /* Alias some constants to simpler names. */ \ const dim_t MR = pd_a; \ const dim_t NR = pd_b; \ const dim_t PACKMR = cs_a; \ const dim_t PACKNR = rs_b; \ \ /* Cast the micro-kernel address to its function pointer type. */ \ /* NOTE: We use the upper-triangular gemmtrsm ukernel because, while the current macro-kernel targets the "rl" case (right-side/lower- triangular), it becomes upper-triangular after the kernel operation is transposed so that all kernel instances are of the "left" variety (since those are the only trsm ukernels that exist). */ \ PASTECH(ch,gemmtrsm_ukr_ft) \ gemmtrsm_ukr = bli_cntx_get_l3_ukr_dt( dt, BLIS_GEMMTRSM_U_UKR, cntx ); \ PASTECH(ch,gemm_ukr_ft) \ gemm_ukr = bli_cntx_get_l3_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ \ /* Temporary C buffer for edge cases. Note that the strides of this temporary buffer are set so that they match the storage of the original C matrix. For example, if C is column-stored, ct will be column-stored as well. */ \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ const inc_t rs_ct = ( bli_is_col_stored( rs_c, cs_c ) ? 1 : NR ); \ const inc_t cs_ct = ( bli_is_col_stored( rs_c, cs_c ) ? MR : 1 ); \ \ ctype* restrict zero = PASTEMAC(ch,0); \ ctype* restrict minus_one = PASTEMAC(ch,m1); \
cntx_t* cntx, \ thrinfo_t* thread \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ /* Alias some constants to simpler names. */ \ const dim_t MR = pd_a; \ const dim_t NR = pd_b; \ /*const dim_t PACKMR = cs_a;*/ \ /*const dim_t PACKNR = rs_b;*/ \ \ /* Query the context for the micro-kernel address and cast it to its function pointer type. */ \ PASTECH(ch,gemm_ukr_ft) \ gemm_ukr = bli_cntx_get_l3_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ \ /* Temporary C buffer for edge cases. Note that the strides of this temporary buffer are set so that they match the storage of the original C matrix. For example, if C is column-stored, ct will be column-stored as well. */ \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ const bool_t col_pref = bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ ctype* restrict zero = PASTEMAC(ch,0); \ ctype* restrict a_cast = a; \ ctype* restrict b_cast = b; \
( \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a, \ ctype* restrict b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ) \ { \ /* Typed wrapper: looks up a micro-kernel function pointer in cntx and calls it with exactly the arguments this function received. It performs no computation of its own. */ const num_t dt = PASTEMAC(ch,type); \ \ /* Query the context for the function address of the micro-kernel registered under (dt, kerid). The pointer is stored in f, whose type name is pasted together from ch and tname by PASTECH2(ch,tname,_ft). NOTE(review): the result is called without a NULL check; presumably the context always has this kernel populated -- confirm. */ \ PASTECH2(ch,tname,_ft) f = bli_cntx_get_l3_ukr_dt( dt, kerid, cntx ); \ \ /* Invoke the typed function for the given datatype, forwarding every parameter unchanged and in the same order as this wrapper's own signature. */ \ f( \ k, \ alpha, \ a, \ b, \ beta, \ c, rs_c, cs_c, \ data, \ cntx \ ); \ } \ /* NOTE(review): presumably instantiates the template above once per datatype with opname=gemm_ukernel, tname=gemm_ukr and kerid=BLIS_GEMM_UKR; the template's #define header is not visible in this excerpt -- confirm against it. */ INSERT_GENTFUNC_BASIC2( gemm_ukernel, gemm_ukr, BLIS_GEMM_UKR )