void bli_trsm_rl_ker_var2( obj_t*  a,
                           obj_t*  b,
                           obj_t*  c,
                           trsm_t* cntl,
                           trsm_thrinfo_t* thread )
{
	num_t     dt_exec   = bli_obj_execution_datatype( *c );

	doff_t    diagoffb  = bli_obj_diag_offset( *b );

	pack_t    schema_a  = bli_obj_pack_schema( *a );
	pack_t    schema_b  = bli_obj_pack_schema( *b );

	dim_t     m         = bli_obj_length( *c );
	dim_t     n         = bli_obj_width( *c );
	dim_t     k         = bli_obj_width( *a );

	void*     buf_a     = bli_obj_buffer_at_off( *a );
	inc_t     cs_a      = bli_obj_col_stride( *a );
	dim_t     pd_a      = bli_obj_panel_dim( *a );
	inc_t     ps_a      = bli_obj_panel_stride( *a );

	void*     buf_b     = bli_obj_buffer_at_off( *b );
	inc_t     rs_b      = bli_obj_row_stride( *b );
	dim_t     pd_b      = bli_obj_panel_dim( *b );
	inc_t     ps_b      = bli_obj_panel_stride( *b );

	void*     buf_c     = bli_obj_buffer_at_off( *c );
	inc_t     rs_c      = bli_obj_row_stride( *c );
	inc_t     cs_c      = bli_obj_col_stride( *c );

	void*     buf_alpha1;
	void*     buf_alpha2;

	FUNCPTR_T f;

	func_t*   gemmtrsm_ukrs;
	func_t*   gemm_ukrs;
	void*     gemmtrsm_ukr;
	void*     gemm_ukr;

	// Grab the address of the internal scalar buffer for the scalar
	// attached to A. This will be the alpha scalar used in the gemmtrsm
	// subproblems (ie: the scalar that would be applied to the packed
	// copy of A prior to it being updated by the trsm subproblem). This
	// scalar may be unit, if for example it was applied during packing.
	buf_alpha1 = bli_obj_internal_scalar_buffer( *a );

	// Grab the address of the internal scalar buffer for the scalar
	// attached to C. This will be the "beta" scalar used in the gemm-only
	// subproblems that correspond to micro-panels that do not intersect
	// the diagonal. We need this separate scalar because it's possible
	// that the alpha attached to A was reset, if it was applied during
	// packing.
	buf_alpha2 = bli_obj_internal_scalar_buffer( *c );

	// Index into the type combination array to extract the correct
	// function pointer (see the illustrative sketch following this
	// function).
	f = ftypes[dt_exec];

	// Extract from the control tree node the func_t objects containing
	// the gemmtrsm and gemm micro-kernel function addresses, and then
	// query the function addresses corresponding to the current datatype.
	gemmtrsm_ukrs = cntl_gemmtrsm_u_ukrs( cntl );
	gemm_ukrs     = cntl_gemm_ukrs( cntl );
	gemmtrsm_ukr  = bli_func_obj_query( dt_exec, gemmtrsm_ukrs );
	gemm_ukr      = bli_func_obj_query( dt_exec, gemm_ukrs );

	// Invoke the function.
	f( diagoffb,
	   schema_a,
	   schema_b,
	   m,
	   n,
	   k,
	   buf_alpha1,
	   buf_a, cs_a, pd_a, ps_a,
	   buf_b, rs_b, pd_b, ps_b,
	   buf_alpha2,
	   buf_c, rs_c, cs_c,
	   gemmtrsm_ukr,
	   gemm_ukr,
	   thread );
}
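// Illustrative sketch (not part of the original source): the FUNCPTR_T
// typedef below is inferred from the call to f() above, and the ftypes
// table shows how a datatype-indexed dispatch array is conventionally
// laid out. The typed kernel names (bli_strsm_rl_ker_var2, etc.) follow
// BLIS's PASTEMAC naming convention; in the real library this array is
// generated by macro expansion rather than written out by hand, and it
// would appear above the function that uses it, so treat this block as
// an assumption shown for context only.

typedef void (*FUNCPTR_T)
     (
       doff_t  diagoffb,
       pack_t  schema_a,
       pack_t  schema_b,
       dim_t   m,
       dim_t   n,
       dim_t   k,
       void*   alpha1,
       void*   a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
       void*   b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
       void*   alpha2,
       void*   c, inc_t rs_c, inc_t cs_c,
       void*   gemmtrsm_ukr,
       void*   gemm_ukr,
       trsm_thrinfo_t* thread
     );

// One typed implementation per floating-point datatype, indexed by num_t
// so that ftypes[dt_exec] selects the matching macro-kernel. The use of
// designated initializers here is illustrative; the actual index values
// come from BLIS's num_t definitions.
static FUNCPTR_T ftypes[] =
{
	[BLIS_FLOAT]    = bli_strsm_rl_ker_var2,
	[BLIS_SCOMPLEX] = bli_ctrsm_rl_ker_var2,
	[BLIS_DOUBLE]   = bli_dtrsm_rl_ker_var2,
	[BLIS_DCOMPLEX] = bli_ztrsm_rl_ker_var2,
};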
void bli_trsm_lu_ker_var2( obj_t*  a,
                           obj_t*  b,
                           obj_t*  c,
                           trsm_t* cntl )
{
	num_t     dt_exec   = bli_obj_execution_datatype( *c );

	doff_t    diagoffa  = bli_obj_diag_offset( *a );

	dim_t     m         = bli_obj_length( *c );
	dim_t     n         = bli_obj_width( *c );
	dim_t     k         = bli_obj_width( *a );

	void*     buf_a     = bli_obj_buffer_at_off( *a );
	inc_t     cs_a      = bli_obj_col_stride( *a );
	inc_t     pd_a      = bli_obj_panel_dim( *a );
	inc_t     ps_a      = bli_obj_panel_stride( *a );

	void*     buf_b     = bli_obj_buffer_at_off( *b );
	inc_t     rs_b      = bli_obj_row_stride( *b );
	inc_t     pd_b      = bli_obj_panel_dim( *b );
	inc_t     ps_b      = bli_obj_panel_stride( *b );

	void*     buf_c     = bli_obj_buffer_at_off( *c );
	inc_t     rs_c      = bli_obj_row_stride( *c );
	inc_t     cs_c      = bli_obj_col_stride( *c );

	void*     buf_alpha1;
	void*     buf_alpha2;

	FUNCPTR_T f;

	func_t*   gemmtrsm_ukrs;
	func_t*   gemm_ukrs;
	void*     gemmtrsm_ukr;
	void*     gemm_ukr;

	// Grab the address of the internal scalar buffer for the scalar
	// attached to B. This will be the alpha scalar used in the gemmtrsm
	// subproblems (ie: the scalar that would be applied to the packed
	// copy of B prior to it being updated by the trsm subproblem). This
	// scalar may be unit, if for example it was applied during packing.
	buf_alpha1 = bli_obj_internal_scalar_buffer( *b );

	// Grab the address of the internal scalar buffer for the scalar
	// attached to C. This will be the "beta" scalar used in the gemm-only
	// subproblems that correspond to micro-panels that do not intersect
	// the diagonal. We need this separate scalar because it's possible
	// that the alpha attached to B was reset, if it was applied during
	// packing.
	buf_alpha2 = bli_obj_internal_scalar_buffer( *c );

	// Index into the type combination array to extract the correct
	// function pointer.
	f = ftypes[dt_exec];

	// Adjust cs_a and rs_b if A and B were packed for 4m or 3m. This
	// is needed because cs_a and rs_b are used to index into the
	// micro-panels of A and B, respectively, and since the pointer
	// types in the macro-kernel (scomplex or dcomplex) will result
	// in pointer arithmetic that moves twice as far as it should,
	// given the datatypes actually stored (float or double), we must
	// halve the strides to compensate. (See the illustrative sketch
	// following this function.)
	if ( bli_obj_is_panel_packed_4m( *a ) ||
	     bli_obj_is_panel_packed_3m( *a ) )
	{
		cs_a /= 2;
		rs_b /= 2;
	}

	// Extract from the control tree node the func_t objects containing
	// the gemmtrsm and gemm micro-kernel function addresses, and then
	// query the function addresses corresponding to the current datatype.
	gemmtrsm_ukrs = cntl_gemmtrsm_u_ukrs( cntl );
	gemm_ukrs     = cntl_gemm_ukrs( cntl );
	gemmtrsm_ukr  = bli_func_obj_query( dt_exec, gemmtrsm_ukrs );
	gemm_ukr      = bli_func_obj_query( dt_exec, gemm_ukrs );

	// Invoke the function.
	f( diagoffa,
	   m,
	   n,
	   k,
	   buf_alpha1,
	   buf_a, cs_a, pd_a, ps_a,
	   buf_b, rs_b, pd_b, ps_b,
	   buf_alpha2,
	   buf_c, rs_c, cs_c,
	   gemmtrsm_ukr,
	   gemm_ukr );
}
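// Illustrative sketch (not part of the original source): a standalone
// demonstration of why the 4m/3m case above halves cs_a and rs_b. The
// dcomplex_t struct below is a stand-in for BLIS's dcomplex; only the
// sizes matter. When a micro-panel physically stores doubles but the
// macro-kernel indexes it through a complex pointer, each unit of
// stride covers twice as many bytes as intended, so halving the stride
// restores the correct byte offset.
#include <stdio.h>

typedef struct { double real; double imag; } dcomplex_t; // stand-in type

int main( void )
{
	double      panel[ 64 ];               // storage as packed: plain doubles
	dcomplex_t* zp = ( dcomplex_t* )panel; // how the complex macro-kernel views it
	double*     dp = panel;
	long        cs = 8;                    // hypothetical column stride, in elements

	// Intended advance: cs doubles = cs * 8 bytes. Through a dcomplex_t
	// pointer, the same stride covers cs * 16 bytes; cs/2 compensates.
	printf( "intended step:       %ld bytes\n", ( long )( ( char* )( dp + cs )     - ( char* )dp ) );
	printf( "complex-typed step:  %ld bytes\n", ( long )( ( char* )( zp + cs )     - ( char* )zp ) );
	printf( "halved complex step: %ld bytes\n", ( long )( ( char* )( zp + cs / 2 ) - ( char* )zp ) );

	return 0;
}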