/* NOTE(review): this chunk is the tail of a BLAS-to-BLIS compatibility
   wrapper macro (a level-2 triangular routine, judging by the
   trans/diag/uplo parameters and the single vector x); the macro header
   and the declarations of blis_uploa, blis_transa, blis_diaga, m0, x0,
   incx0, rs_a, cs_a, one_p, and init_result lie above this chunk --
   confirm against the full file. */ \
\
/* Map the BLAS trans/diag characters to their BLIS equivalents. */ \
bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); \
bli_param_map_netlib_to_blis_diag( *diaga, &blis_diaga ); \
\
/* Convert/typecast negative values of m to zero. */ \
bli_convert_blas_dim1( *m, m0 ); \
\
/* If the input increments are negative, adjust the pointers so we can
   use positive increments instead. */ \
bli_convert_blas_incv( m0, x, *incx, x0, incx0 ); \
\
/* Set the row and column strides of A. (BLAS matrices are column-stored
   with leading dimension *lda, hence unit row stride.) */ \
rs_a = 1; \
cs_a = *lda; \
\
/* Acquire a pointer to the global scalar constant BLIS_ONE. */ \
one_p = PASTEMAC(ch,1); \
\
/* Call BLIS interface. (one_p is passed in what is presumably the alpha
   position of the typed BLIS API; the corresponding BLAS routine has no
   alpha parameter -- verify against the callee's signature.) */ \
PASTEMAC(ch,blisname)( blis_uploa, \
                       blis_transa, \
                       blis_diaga, \
                       m0, \
                       one_p, \
                       a, rs_a, cs_a, \
                       x0, incx0 ); \
\
/* Finalize BLIS (if it was initialized above). */ \
bli_finalize_auto( init_result ); \
}

#ifdef BLIS_ENABLE_BLAS2BLIS
/* NOTE(review): mid-body chunk of a trsm-style macro-kernel macro (the
   BLIS_GEMMTRSM_U_UKR query suggests the upper-triangular variant); the
   macro header and the declarations of dt, cntx, MR, NR, rs_c, cs_c,
   and the operands a/b/c/alpha1/alpha2 are outside this view. */ \
\
/* Query the context for the gemmtrsm and gemm micro-kernel addresses
   and cast them to their function pointer types. */ \
PASTECH(ch,gemmtrsm_ukr_ft) \
               gemmtrsm_ukr = bli_cntx_get_l3_ukr_dt( dt, BLIS_GEMMTRSM_U_UKR, cntx ); \
PASTECH(ch,gemm_ukr_ft) \
               gemm_ukr = bli_cntx_get_l3_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. Note that the strides of this
   temporary buffer are set so that they match the storage of the
   original C matrix. For example, if C is column-stored, ct will be
   column-stored as well. */ \
/* NOTE(review): sibling macro-kernels elsewhere in this source size ct
   by the micro-kernel's preferred output orientation (col_pref) rather
   than by C's actual storage -- confirm this difference is intentional
   for the trsm path. */ \
ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
                    / sizeof( ctype ) ] \
                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const inc_t     rs_ct = ( bli_is_col_stored( rs_c, cs_c ) ? 1 : NR ); \
const inc_t     cs_ct = ( bli_is_col_stored( rs_c, cs_c ) ? MR : 1 ); \
\
/* Scalar constants and restrict-qualified aliases of the operands. */ \
ctype* restrict zero        = PASTEMAC(ch,0); \
ctype* restrict minus_one   = PASTEMAC(ch,m1); \
ctype* restrict a_cast      = a; \
ctype* restrict b_cast      = b; \
ctype* restrict c_cast      = c; \
ctype* restrict alpha1_cast = alpha1; \
ctype* restrict alpha2_cast = alpha2; \
ctype* restrict b1; \
ctype* restrict c1; \
\
/* Loop bookkeeping declarations; these are consumed by the loop nest
   below (outside this view). diagoffb_j presumably tracks the diagonal
   offset of B's current panel -- confirm in the loop body. */ \
doff_t          diagoffb_j; \
dim_t           k_full; \
dim_t           m_iter, m_left; \
dim_t           n_iter, n_left; \
dim_t           m_cur; \
dim_t           n_cur; \
function pointer type. */ \
/* NOTE(review): mid-body chunk of a macro-kernel macro; diagoffc_ij
   suggests an operation whose C operand is triangular/symmetric (e.g. a
   herk/syrk-style update). The opening of the comment closed above and
   the macro header are outside this view. */ \
PASTECH(ch,gemm_ukr_ft) \
               gemm_ukr = bli_cntx_get_l3_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. Note that the strides of this
   temporary buffer are set so that they match the storage of the
   original C matrix. For example, if C is column-stored, ct will be
   column-stored as well. */ \
ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
                    / sizeof( ctype ) ] \
                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
/* Choose ct's strides according to the micro-kernel's preferred output
   orientation. */ \
const bool_t    col_pref = bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t     rs_ct = ( col_pref ? 1 : NR ); \
const inc_t     cs_ct = ( col_pref ? MR : 1 ); \
\
/* Scalar constant and restrict-qualified aliases of the operands. */ \
ctype* restrict zero       = PASTEMAC(ch,0); \
ctype* restrict a_cast     = a; \
ctype* restrict b_cast     = b; \
ctype* restrict c_cast     = c; \
ctype* restrict alpha_cast = alpha; \
ctype* restrict beta_cast  = beta; \
ctype* restrict b1; \
ctype* restrict c1; \
\
/* Loop bookkeeping declarations; consumed by the loop nest below
   (outside this view). */ \
doff_t          diagoffc_ij; \
dim_t           m_iter, m_left; \
dim_t           n_iter, n_left; \
dim_t           m_cur; \
dim_t           n_cur; \
dim_t           i, j, jp; \
inc_t           rstep_a; \
const inc_t cs_b = 1; \
\
/* NOTE(review): mid-body chunk of the gemmtrsm4m1 micro-kernel macro
   (per the debug labels below); declarations above and the remainder of
   the alpha-scaling branch below are outside this view. */ \
\
/* Real-domain scalar constants +1.0 and -1.0. */ \
ctype_r* restrict one_r       = PASTEMAC(chr,1); \
ctype_r* restrict minus_one_r = PASTEMAC(chr,m1); \
\
/* Split alpha into its real and imaginary components. */ \
ctype_r alpha_r = PASTEMAC(ch,real)( *alpha ); \
ctype_r alpha_i = PASTEMAC(ch,imag)( *alpha ); \
\
/* Addresses of the next micro-panels of A and B, carried in the
   auxinfo_t (typically used for prefetching -- confirm in the kernel
   body below). */ \
void* a_next = bli_auxinfo_next_a( data ); \
void* b_next = bli_auxinfo_next_b( data ); \
\
dim_t i, j; \
\
/* Debug output, disabled (commented out). */ \
/* PASTEMAC(chr,fprintm)( stdout, "gemmtrsm4m1_ukr: a1112p_r", m, k+m, \
   a11_r,      1, PASTEMAC(chr,packmr), "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "gemmtrsm4m1_ukr: a1112p_i", m, k+m, \
   a11_r+is_a, 1, PASTEMAC(chr,packmr), "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "gemmtrsm4m1_ukr: b1121p_r", k+m, n, \
   b11_r,      PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "gemmtrsm4m1_ukr: b1121p_i", k+m, n, \
   b11_r+is_b, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \
*/ \
\
\
/* NOTE(review): the comment below ("Copy the contents of c...") appears
   stale -- the guarded code actually tests alpha's imaginary part, not
   a copy of c. Verify against upstream history before removing. */ \
/* Copy the contents of c to a temporary buffer ct. */ \
if ( !PASTEMAC(chr,eq0)( alpha_i ) ) \
{ \
	/* We can handle a non-zero imaginary component on alpha, but
	   to do so we have to manually scale b and then use alpha == 1
	   for the micro-kernel calls. */ \
function pointer type. */ \
/* NOTE(review): mid-body chunk of a macro-kernel macro; diagoffa_i
   suggests an operation whose A operand is triangular/structured (e.g.
   trmm). Note it queries the "virtual" micro-kernel slot (l3_vir),
   presumably the induced-method-aware kernel -- confirm against the
   context API. The opening of the comment closed above and the macro
   header are outside this view. */ \
PASTECH(ch,gemm_ukr_ft) \
               gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. Note that the strides of this
   temporary buffer are set so that they match the storage of the
   original C matrix. For example, if C is column-stored, ct will be
   column-stored as well. */ \
ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
                    / sizeof( ctype ) ] \
                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
/* Choose ct's strides according to the (virtual) micro-kernel's
   preferred output orientation. */ \
const bool_t    col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t     rs_ct = ( col_pref ? 1 : NR ); \
const inc_t     cs_ct = ( col_pref ? MR : 1 ); \
\
/* Scalar constants and restrict-qualified aliases of the operands. */ \
ctype* restrict one        = PASTEMAC(ch,1); \
ctype* restrict zero       = PASTEMAC(ch,0); \
ctype* restrict a_cast     = a; \
ctype* restrict b_cast     = b; \
ctype* restrict c_cast     = c; \
ctype* restrict alpha_cast = alpha; \
ctype* restrict beta_cast  = beta; \
ctype* restrict b1; \
ctype* restrict c1; \
\
/* Loop bookkeeping declarations; consumed by the loop nest below
   (outside this view). */ \
doff_t          diagoffa_i; \
dim_t           k_full; \
dim_t           m_iter, m_left; \
dim_t           n_iter, n_left; \
dim_t           m_cur; \
dim_t           n_cur; \
\
/* NOTE(review): tail of an object-API front-end macro (GENFRONT); the
   macro header, the dt query, and the uploa/transa/diaga extraction are
   above this view. The invocation at the bottom instantiates it for
   trmv via the trmv_unb_var1 variant. */ \
\
/* Extract dimensions, raw buffers, and strides from the objects. */ \
dim_t m         = bli_obj_length( a ); \
\
void* buf_a     = bli_obj_buffer_at_off( a ); \
inc_t rs_a      = bli_obj_row_stride( a ); \
inc_t cs_a      = bli_obj_col_stride( a ); \
\
void* buf_x     = bli_obj_buffer_at_off( x ); \
inc_t incx      = bli_obj_vector_inc( x ); \
\
void* buf_alpha = bli_obj_buffer_for_1x1( dt, alpha ); \
\
/* Query a type-specific function pointer, except one that uses
   void* instead of typed pointers. */ \
PASTECH2(opname,_unb,_vft) f = \
PASTEMAC(varname,_qfp)( dt ); \
\
/* Invoke the datatype-specific variant through the void*-based
   function pointer. */ \
f \
( \
  uploa, \
  transa, \
  diaga, \
  m, \
  buf_alpha, \
  buf_a, rs_a, cs_a, \
  buf_x, incx, \
  cntx \
); \
} \

GENFRONT( trmv, trmv_unb_var1 )
const inc_t irstep_c = rs_c * MR; \
const inc_t irstep_a = rs_a * MR; \
*/ \
\
/* NOTE(review): mid-body chunk of a gemmsup (skinny/unpacked level-3)
   variant macro; the two lines above are the tail of a commented-out
   block whose opening `slash-star` lies outside this view. The macro
   header and the NC/MC/KC/NR/MR blocksizes are also declared above. */ \
\
/* Query the context for the sup microkernel address and cast it to its
   function pointer type. */ \
PASTECH(ch,gemmsup_ker_ft) \
               gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); \
\
/* Restrict-qualified aliases of the operand base addresses and
   scalars. */ \
ctype* restrict a_00       = a; \
ctype* restrict b_00       = b; \
ctype* restrict c_00       = c; \
ctype* restrict alpha_cast = alpha; \
ctype* restrict beta_cast  = beta; \
\
ctype* restrict one = PASTEMAC(ch,1); \
\
auxinfo_t       aux; \
\
/* Compute number of primary and leftover components of the outer
   dimensions.
   NOTE: Functionally speaking, we compute jc_iter as:
     jc_iter = n / NC; if ( jc_left ) ++jc_iter;
   However, this is implemented as:
     jc_iter = ( n + NC - 1 ) / NC;
   This avoids a branch at the cost of two additional integer
   instructions. The pc_iter, mc_iter, nr_iter, and mr_iter variables
   are computed in similar manner. */ \
const dim_t jc_iter = ( n + NC - 1 ) / NC; \
const dim_t jc_left = n % NC; \
\
\
/* NOTE(review): mid-body chunk of the gemm4m1 micro-kernel macro (per
   the debug labels below); the declarations above and the kernel body
   below are outside this view. */ \
\
/* Addresses of the next micro-panels of A and B, carried in the
   auxinfo_t (typically used for prefetching -- confirm in the kernel
   body). */ \
void* a_next = bli_auxinfo_next_a( data ); \
void* b_next = bli_auxinfo_next_b( data ); \
\
/* Loop extents and stride bookkeeping for C and the temporary ct,
   assigned later (outside this view). */ \
dim_t n_iter; \
dim_t n_elem; \
\
inc_t incc, ldc; \
inc_t incct, ldct; \
\
dim_t i, j; \
\
\
/* Debug output, disabled (commented out). */ \
/* PASTEMAC(chr,fprintm)( stdout, "gemm4m1_ukr: ap_r", m, k, \
   a_r, 1, PASTEMAC(chr,packmr), "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "gemm4m1_ukr: ap_i", m, k, \
   a_i, 1, PASTEMAC(chr,packmr), "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "gemm4m1_ukr: bp_r", k, n, \
   b_r, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "gemm4m1_ukr: bp_i", k, n, \
   b_i, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \
*/ \
\
\
/* SAFETY CHECK: The higher level implementation should never allow an
   alpha with non-zero imaginary component to be passed in, because it
   can't be applied properly using the 4m method. If alpha is not real,
   then something is very wrong. */ \
if ( !PASTEMAC(chr,eq0)( *alpha_i ) ) \
	bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \