dim_t bli_gemm_determine_kc_f( dim_t i, dim_t dim, obj_t* a, obj_t* b, blksz_t* bsize ) { num_t dt; dim_t mnr; dim_t b_alg, b_max; dim_t b_use; // We assume that this function is being called from an algorithm that // is moving "forward" (ie: top to bottom, left to right, top-left // to bottom-right). // Extract the execution datatype and use it to query the corresponding // blocksize and blocksize maximum values from the blksz_t object. dt = bli_obj_execution_datatype( *a ); b_alg = bli_blksz_get_def( dt, bsize ); b_max = bli_blksz_get_max( dt, bsize ); // Nudge the default and maximum kc blocksizes up to the nearest // multiple of MR if A is Hermitian or symmetric, or NR if B is // Hermitian or symmetric. If neither case applies, then we leave // the blocksizes unchanged. if ( bli_obj_root_is_herm_or_symm( *a ) ) { mnr = bli_blksz_get_mr( dt, bsize ); b_alg = bli_align_dim_to_mult( b_alg, mnr ); b_max = bli_align_dim_to_mult( b_max, mnr ); } else if ( bli_obj_root_is_herm_or_symm( *b ) ) { mnr = bli_blksz_get_nr( dt, bsize ); b_alg = bli_align_dim_to_mult( b_alg, mnr ); b_max = bli_align_dim_to_mult( b_max, mnr ); } b_use = bli_determine_blocksize_f_sub( i, dim, b_alg, b_max ); return b_use; }
dim_t bli_determine_blocksize_b( dim_t i, dim_t dim, obj_t* obj, bszid_t bszid, cntx_t* cntx ) { num_t dt; blksz_t* bsize; dim_t b_alg, b_max; dim_t b_use; // Extract the execution datatype and use it to query the corresponding // blocksize and blocksize maximum values from the blksz_t object. dt = bli_obj_execution_datatype( *obj ); bsize = bli_cntx_get_blksz( bszid, cntx ); b_alg = bli_blksz_get_def( dt, bsize ); b_max = bli_blksz_get_max( dt, bsize ); b_use = bli_determine_blocksize_b_sub( i, dim, b_alg, b_max ); return b_use; }
void bli_trsm_rl_ker_var2( obj_t* a, obj_t* b, obj_t* c, trsm_t* cntl, trsm_thrinfo_t* thread ) { num_t dt_exec = bli_obj_execution_datatype( *c ); doff_t diagoffb = bli_obj_diag_offset( *b ); pack_t schema_a = bli_obj_pack_schema( *a ); pack_t schema_b = bli_obj_pack_schema( *b ); dim_t m = bli_obj_length( *c ); dim_t n = bli_obj_width( *c ); dim_t k = bli_obj_width( *a ); void* buf_a = bli_obj_buffer_at_off( *a ); inc_t cs_a = bli_obj_col_stride( *a ); dim_t pd_a = bli_obj_panel_dim( *a ); inc_t ps_a = bli_obj_panel_stride( *a ); void* buf_b = bli_obj_buffer_at_off( *b ); inc_t rs_b = bli_obj_row_stride( *b ); dim_t pd_b = bli_obj_panel_dim( *b ); inc_t ps_b = bli_obj_panel_stride( *b ); void* buf_c = bli_obj_buffer_at_off( *c ); inc_t rs_c = bli_obj_row_stride( *c ); inc_t cs_c = bli_obj_col_stride( *c ); void* buf_alpha1; void* buf_alpha2; FUNCPTR_T f; func_t* gemmtrsm_ukrs; func_t* gemm_ukrs; void* gemmtrsm_ukr; void* gemm_ukr; // Grab the address of the internal scalar buffer for the scalar // attached to A. This will be the alpha scalar used in the gemmtrsm // subproblems (ie: the scalar that would be applied to the packed // copy of A prior to it being updated by the trsm subproblem). This // scalar may be unit, if for example it was applied during packing. buf_alpha1 = bli_obj_internal_scalar_buffer( *a ); // Grab the address of the internal scalar buffer for the scalar // attached to C. This will be the "beta" scalar used in the gemm-only // subproblems that correspond to micro-panels that do not intersect // the diagonal. We need this separate scalar because it's possible // that the alpha attached to B was reset, if it was applied during // packing. buf_alpha2 = bli_obj_internal_scalar_buffer( *c ); // Index into the type combination array to extract the correct // function pointer. f = ftypes[dt_exec]; // Extract from the control tree node the func_t objects containing // the gemmtrsm and gemm micro-kernel function addresses, and then // query the function addresses corresponding to the current datatype. gemmtrsm_ukrs = cntl_gemmtrsm_u_ukrs( cntl ); gemm_ukrs = cntl_gemm_ukrs( cntl ); gemmtrsm_ukr = bli_func_obj_query( dt_exec, gemmtrsm_ukrs ); gemm_ukr = bli_func_obj_query( dt_exec, gemm_ukrs ); // Invoke the function. f( diagoffb, schema_a, schema_b, m, n, k, buf_alpha1, buf_a, cs_a, pd_a, ps_a, buf_b, rs_b, pd_b, ps_b, buf_alpha2, buf_c, rs_c, cs_c, gemmtrsm_ukr, gemm_ukr, thread ); }
void bli_trsm_ru_ker_var2( obj_t* a, obj_t* b, obj_t* c, trsm_t* cntl ) { num_t dt_exec = bli_obj_execution_datatype( *c ); doff_t diagoffb = bli_obj_diag_offset( *b ); dim_t m = bli_obj_length( *c ); dim_t n = bli_obj_width( *c ); dim_t k = bli_obj_width( *a ); void* buf_a = bli_obj_buffer_at_off( *a ); inc_t cs_a = bli_obj_col_stride( *a ); inc_t pd_a = bli_obj_panel_dim( *a ); inc_t ps_a = bli_obj_panel_stride( *a ); void* buf_b = bli_obj_buffer_at_off( *b ); inc_t rs_b = bli_obj_row_stride( *b ); inc_t pd_b = bli_obj_panel_dim( *b ); inc_t ps_b = bli_obj_panel_stride( *b ); void* buf_c = bli_obj_buffer_at_off( *c ); inc_t rs_c = bli_obj_row_stride( *c ); inc_t cs_c = bli_obj_col_stride( *c ); void* buf_alpha1; void* buf_alpha2; FUNCPTR_T f; func_t* gemmtrsm_ukrs; func_t* gemm_ukrs; void* gemmtrsm_ukr; void* gemm_ukr; // Grab the address of the internal scalar buffer for the scalar // attached to A. This will be the alpha scalar used in the gemmtrsm // subproblems (ie: the scalar that would be applied to the packed // copy of A prior to it being updated by the trsm subproblem). This // scalar may be unit, if for example it was applied during packing. buf_alpha1 = bli_obj_internal_scalar_buffer( *a ); // Grab the address of the internal scalar buffer for the scalar // attached to C. This will be the "beta" scalar used in the gemm-only // subproblems that correspond to micro-panels that do not intersect // the diagonal. We need this separate scalar because it's possible // that the alpha attached to B was reset, if it was applied during // packing. buf_alpha2 = bli_obj_internal_scalar_buffer( *c ); // Index into the type combination array to extract the correct // function pointer. f = ftypes[dt_exec]; // Adjust cs_a and rs_b if A and B were packed for 4m or 3m. This // is needed because cs_a and rs_b are used to index into the // micro-panels of A and B, respectively, and since the pointer // types in the macro-kernel (scomplex or dcomplex) will result // in pointer arithmetic that moves twice as far as it should, // given the datatypes actually stored (float or double), we must // halve the strides to compensate. if ( bli_obj_is_panel_packed_4m( *a ) || bli_obj_is_panel_packed_3m( *a ) ) { cs_a /= 2; rs_b /= 2; } // Extract from the control tree node the func_t objects containing // the gemmtrsm and gemm micro-kernel function addresses, and then // query the function addresses corresponding to the current datatype. gemmtrsm_ukrs = cntl_gemmtrsm_l_ukrs( cntl ); gemm_ukrs = cntl_gemm_ukrs( cntl ); gemmtrsm_ukr = bli_func_obj_query( dt_exec, gemmtrsm_ukrs ); gemm_ukr = bli_func_obj_query( dt_exec, gemm_ukrs ); // Invoke the function. f( diagoffb, m, n, k, buf_alpha1, buf_a, cs_a, pd_a, ps_a, buf_b, rs_b, pd_b, ps_b, buf_alpha2, buf_c, rs_c, cs_c, gemmtrsm_ukr, gemm_ukr ); }
void bli_herk_l_ker_var2( obj_t* a, obj_t* b, obj_t* c, gemm_t* cntl, herk_thrinfo_t* thread ) { num_t dt_exec = bli_obj_execution_datatype( *c ); doff_t diagoffc = bli_obj_diag_offset( *c ); pack_t schema_a = bli_obj_pack_schema( *a ); pack_t schema_b = bli_obj_pack_schema( *b ); dim_t m = bli_obj_length( *c ); dim_t n = bli_obj_width( *c ); dim_t k = bli_obj_width( *a ); void* buf_a = bli_obj_buffer_at_off( *a ); inc_t cs_a = bli_obj_col_stride( *a ); inc_t pd_a = bli_obj_panel_dim( *a ); inc_t ps_a = bli_obj_panel_stride( *a ); void* buf_b = bli_obj_buffer_at_off( *b ); inc_t rs_b = bli_obj_row_stride( *b ); inc_t pd_b = bli_obj_panel_dim( *b ); inc_t ps_b = bli_obj_panel_stride( *b ); void* buf_c = bli_obj_buffer_at_off( *c ); inc_t rs_c = bli_obj_row_stride( *c ); inc_t cs_c = bli_obj_col_stride( *c ); obj_t scalar_a; obj_t scalar_b; void* buf_alpha; void* buf_beta; FUNCPTR_T f; func_t* gemm_ukrs; void* gemm_ukr; // Detach and multiply the scalars attached to A and B. bli_obj_scalar_detach( a, &scalar_a ); bli_obj_scalar_detach( b, &scalar_b ); bli_mulsc( &scalar_a, &scalar_b ); // Grab the addresses of the internal scalar buffers for the scalar // merged above and the scalar attached to C. buf_alpha = bli_obj_internal_scalar_buffer( scalar_b ); buf_beta = bli_obj_internal_scalar_buffer( *c ); // Index into the type combination array to extract the correct // function pointer. f = ftypes[dt_exec]; // Extract from the control tree node the func_t object containing // the gemm micro-kernel function addresses, and then query the // function address corresponding to the current datatype. gemm_ukrs = cntl_gemm_ukrs( cntl ); gemm_ukr = bli_func_obj_query( dt_exec, gemm_ukrs ); // Invoke the function. f( diagoffc, schema_a, schema_b, m, n, k, buf_alpha, buf_a, cs_a, pd_a, ps_a, buf_b, rs_b, pd_b, ps_b, buf_beta, buf_c, rs_c, cs_c, gemm_ukr, thread ); }
void bli_trsm_rl_ker_var2 ( obj_t* a, obj_t* b, obj_t* c, cntx_t* cntx, cntl_t* cntl, thrinfo_t* thread ) { num_t dt_exec = bli_obj_execution_datatype( *c ); doff_t diagoffb = bli_obj_diag_offset( *b ); pack_t schema_a = bli_obj_pack_schema( *a ); pack_t schema_b = bli_obj_pack_schema( *b ); dim_t m = bli_obj_length( *c ); dim_t n = bli_obj_width( *c ); dim_t k = bli_obj_width( *a ); void* buf_a = bli_obj_buffer_at_off( *a ); inc_t cs_a = bli_obj_col_stride( *a ); dim_t pd_a = bli_obj_panel_dim( *a ); inc_t ps_a = bli_obj_panel_stride( *a ); void* buf_b = bli_obj_buffer_at_off( *b ); inc_t rs_b = bli_obj_row_stride( *b ); dim_t pd_b = bli_obj_panel_dim( *b ); inc_t ps_b = bli_obj_panel_stride( *b ); void* buf_c = bli_obj_buffer_at_off( *c ); inc_t rs_c = bli_obj_row_stride( *c ); inc_t cs_c = bli_obj_col_stride( *c ); void* buf_alpha1; void* buf_alpha2; FUNCPTR_T f; // Grab the address of the internal scalar buffer for the scalar // attached to A (the non-triangular matrix). This will be the alpha // scalar used in the gemmtrsm subproblems (ie: the scalar that would // be applied to the packed copy of A prior to it being updated by // the trsm subproblem). This scalar may be unit, if for example it // was applied during packing. buf_alpha1 = bli_obj_internal_scalar_buffer( *a ); // Grab the address of the internal scalar buffer for the scalar // attached to C. This will be the "beta" scalar used in the gemm-only // subproblems that correspond to micro-panels that do not intersect // the diagonal. We need this separate scalar because it's possible // that the alpha attached to B was reset, if it was applied during // packing. buf_alpha2 = bli_obj_internal_scalar_buffer( *c ); // Index into the type combination array to extract the correct // function pointer. f = ftypes[dt_exec]; // Invoke the function. f( diagoffb, schema_a, schema_b, m, n, k, buf_alpha1, buf_a, cs_a, pd_a, ps_a, buf_b, rs_b, pd_b, ps_b, buf_alpha2, buf_c, rs_c, cs_c, cntx, thread ); }
void bli_trsm_blk_var1f( obj_t* a, obj_t* b, obj_t* c, trsm_t* cntl, trsm_thrinfo_t* thread ) { obj_t b_pack_s; obj_t a1_pack_s; obj_t a1, c1; obj_t* b_pack = NULL; obj_t* a1_pack = NULL; dim_t i; dim_t b_alg; dim_t m_trans; dim_t offA; // Initialize object for packing B. if( thread_am_ochief( thread ) ) { bli_obj_init_pack( &b_pack_s ); bli_packm_init( b, &b_pack_s, cntl_sub_packm_b( cntl ) ); } b_pack = thread_obroadcast( thread, &b_pack_s ); // Initialize object for packing B. if( thread_am_ichief( thread ) ) { bli_obj_init_pack( &a1_pack_s ); } a1_pack = thread_ibroadcast( thread, &a1_pack_s ); // Pack B1 (if instructed). bli_packm_int( b, b_pack, cntl_sub_packm_b( cntl ), trsm_thread_sub_opackm( thread ) ); // Set the default length of and offset to the non-zero part of A. m_trans = bli_obj_length_after_trans( *a ); offA = 0; // If A is lower triangular, we have to adjust where the non-zero part of // A begins. if ( bli_obj_is_lower( *a ) ) offA = bli_abs( bli_obj_diag_offset_after_trans( *a ) ); dim_t start, end; num_t dt = bli_obj_execution_datatype( *a ); bli_get_range_t2b( thread, offA, m_trans, //bli_lcm( bli_info_get_default_nr( BLIS_TRSM, dt ), bli_info_get_default_mr( BLIS_TRSM, dt ) ), bli_info_get_default_mc( BLIS_TRSM, dt ), &start, &end ); // Partition along the remaining portion of the m dimension. for ( i = start; i < end; i += b_alg ) { // Determine the current algorithmic blocksize. b_alg = bli_determine_blocksize_f( i, end, a, cntl_blocksize( cntl ) ); // Acquire partitions for A1 and C1. bli_acquire_mpart_t2b( BLIS_SUBPART1, i, b_alg, a, &a1 ); bli_acquire_mpart_t2b( BLIS_SUBPART1, i, b_alg, c, &c1 ); // Initialize object for packing A1. if( thread_am_ichief( thread ) ) { bli_packm_init( &a1, a1_pack, cntl_sub_packm_a( cntl ) ); } thread_ibarrier( thread ); // Pack A1 (if instructed). bli_packm_int( &a1, a1_pack, cntl_sub_packm_a( cntl ), trsm_thread_sub_ipackm( thread ) ); // Perform trsm subproblem. bli_trsm_int( &BLIS_ONE, a1_pack, b_pack, &BLIS_ONE, &c1, cntl_sub_trsm( cntl ), trsm_thread_sub_trsm( thread ) ); thread_ibarrier( thread ); } // If any packing buffers were acquired within packm, release them back // to the memory manager. thread_obarrier( thread ); if( thread_am_ochief( thread ) ) bli_packm_release( b_pack, cntl_sub_packm_b( cntl ) ); if( thread_am_ichief( thread ) ) bli_packm_release( a1_pack, cntl_sub_packm_a( cntl ) ); }
void bli_trsm_blk_var1b( obj_t* a, obj_t* b, obj_t* c, trsm_t* cntl, trsm_thrinfo_t* thread ) { obj_t b_pack_s; obj_t a1_pack_s; obj_t a1, c1; obj_t* b_pack = NULL; obj_t* a1_pack = NULL; dim_t i; dim_t b_alg; // Prune any zero region that exists along the partitioning dimension. bli_trsm_prune_unref_mparts_m( a, b, c ); // Initialize object for packing B. if( thread_am_ochief( thread ) ) { bli_obj_init_pack( &b_pack_s ); bli_packm_init( b, &b_pack_s, cntl_sub_packm_b( cntl ) ); } b_pack = thread_obroadcast( thread, &b_pack_s ); // Initialize object for packing B. if( thread_am_ichief( thread ) ) { bli_obj_init_pack( &a1_pack_s ); } a1_pack = thread_ibroadcast( thread, &a1_pack_s ); // Pack B1 (if instructed). bli_packm_int( b, b_pack, cntl_sub_packm_b( cntl ), trsm_thread_sub_opackm( thread ) ); dim_t my_start, my_end; num_t dt = bli_obj_execution_datatype( *a ); dim_t bf = ( bli_obj_root_is_triangular( *a ) ? bli_info_get_default_mr( BLIS_TRSM, dt ) : bli_info_get_default_nr( BLIS_TRSM, dt ) ); bli_get_range_b2t( thread, a, bf, &my_start, &my_end ); // Partition along the remaining portion of the m dimension. for ( i = my_start; i < my_end; i += b_alg ) { // Determine the current algorithmic blocksize. b_alg = bli_determine_blocksize_b( i, my_end, a, cntl_blocksize( cntl ) ); // Acquire partitions for A1 and C1. bli_acquire_mpart_b2t( BLIS_SUBPART1, i, b_alg, a, &a1 ); bli_acquire_mpart_b2t( BLIS_SUBPART1, i, b_alg, c, &c1 ); // Initialize object for packing A1. if( thread_am_ichief( thread ) ) { bli_packm_init( &a1, a1_pack, cntl_sub_packm_a( cntl ) ); } thread_ibarrier( thread ); // Pack A1 (if instructed). bli_packm_int( &a1, a1_pack, cntl_sub_packm_a( cntl ), trsm_thread_sub_ipackm( thread ) ); // Perform trsm subproblem. bli_trsm_int( &BLIS_ONE, a1_pack, b_pack, &BLIS_ONE, &c1, cntl_sub_trsm( cntl ), trsm_thread_sub_trsm( thread ) ); thread_ibarrier( thread ); } // If any packing buffers were acquired within packm, release them back // to the memory manager. thread_obarrier( thread ); if( thread_am_ochief( thread ) ) bli_packm_release( b_pack, cntl_sub_packm_b( cntl ) ); if( thread_am_ichief( thread ) ) bli_packm_release( a1_pack, cntl_sub_packm_a( cntl ) ); }
void bli_trsm_blk_var2b( obj_t* a, obj_t* b, obj_t* c, trsm_t* cntl, trsm_thrinfo_t* thread ) { obj_t a_pack_s; obj_t b1_pack_s, c1_pack_s; obj_t b1, c1; obj_t* a_pack = NULL; obj_t* b1_pack = NULL; obj_t* c1_pack = NULL; dim_t i; dim_t b_alg; dim_t n_trans; // Initialize pack objects for A that are passed into packm_init(). if( thread_am_ochief( thread ) ) { bli_obj_init_pack( &a_pack_s ); // Initialize object for packing A. bli_packm_init( a, &a_pack_s, cntl_sub_packm_a( cntl ) ); // Scale C by beta (if instructed). bli_scalm_int( &BLIS_ONE, c, cntl_sub_scalm( cntl ) ); } a_pack = thread_obroadcast( thread, &a_pack_s ); // Initialize pack objects for B and C that are passed into packm_init(). if( thread_am_ichief( thread ) ) { bli_obj_init_pack( &b1_pack_s ); bli_obj_init_pack( &c1_pack_s ); } b1_pack = thread_ibroadcast( thread, &b1_pack_s ); c1_pack = thread_ibroadcast( thread, &c1_pack_s ); // Pack A (if instructed). bli_packm_int( a, a_pack, cntl_sub_packm_a( cntl ), trmm_thread_sub_opackm( thread ) ); // Query dimension in partitioning direction. n_trans = bli_obj_width_after_trans( *b ); dim_t start, end; num_t dt = bli_obj_execution_datatype( *a ); bli_get_range_r2l( thread, 0, n_trans, //bli_lcm( bli_info_get_default_nr( BLIS_TRSM, dt ), // bli_info_get_default_mr( BLIS_TRSM, dt ) ), bli_lcm( bli_blksz_get_nr( dt, cntl_blocksize( cntl ) ), bli_blksz_get_mr( dt, cntl_blocksize( cntl ) ) ), &start, &end ); // Partition along the n dimension. for ( i = start; i < end; i += b_alg ) { // Determine the current algorithmic blocksize. b_alg = bli_determine_blocksize_b( i, end, b, cntl_blocksize( cntl ) ); // Acquire partitions for B1 and C1. bli_acquire_mpart_r2l( BLIS_SUBPART1, i, b_alg, b, &b1 ); bli_acquire_mpart_r2l( BLIS_SUBPART1, i, b_alg, c, &c1 ); // Initialize objects for packing A1 and B1. if( thread_am_ichief( thread ) ) { bli_packm_init( &b1, b1_pack, cntl_sub_packm_b( cntl ) ); bli_packm_init( &c1, c1_pack, cntl_sub_packm_c( cntl ) ); } thread_ibarrier( thread ); // Pack B1 (if instructed). bli_packm_int( &b1, b1_pack, cntl_sub_packm_b( cntl ), trsm_thread_sub_ipackm( thread ) ); // Pack C1 (if instructed). bli_packm_int( &c1, c1_pack, cntl_sub_packm_c( cntl ), trsm_thread_sub_ipackm( thread ) ); // Perform trsm subproblem. bli_trsm_int( &BLIS_ONE, a_pack, b1_pack, &BLIS_ONE, c1_pack, cntl_sub_trsm( cntl ), trsm_thread_sub_trsm( thread ) ); thread_ibarrier( thread ); // Unpack C1 (if C1 was packed). bli_unpackm_int( c1_pack, &c1, cntl_sub_unpackm_c( cntl ), trsm_thread_sub_ipackm( thread ) ); } // If any packing buffers were acquired within packm, release them back // to the memory manager. thread_obarrier( thread ); if( thread_am_ochief( thread ) ) bli_packm_release( a_pack, cntl_sub_packm_a( cntl ) ); if( thread_am_ichief( thread ) ) { bli_packm_release( b1_pack, cntl_sub_packm_b( cntl ) ); bli_packm_release( c1_pack, cntl_sub_packm_c( cntl ) ); } }
void bli_trmm_ru_ker_var2( obj_t* a, obj_t* b, obj_t* c, cntx_t* cntx, gemm_t* cntl, thrinfo_t* thread ) { num_t dt_exec = bli_obj_execution_datatype( *c ); doff_t diagoffb = bli_obj_diag_offset( *b ); pack_t schema_a = bli_obj_pack_schema( *a ); pack_t schema_b = bli_obj_pack_schema( *b ); dim_t m = bli_obj_length( *c ); dim_t n = bli_obj_width( *c ); dim_t k = bli_obj_width( *a ); void* buf_a = bli_obj_buffer_at_off( *a ); inc_t cs_a = bli_obj_col_stride( *a ); dim_t pd_a = bli_obj_panel_dim( *a ); inc_t ps_a = bli_obj_panel_stride( *a ); void* buf_b = bli_obj_buffer_at_off( *b ); inc_t rs_b = bli_obj_row_stride( *b ); dim_t pd_b = bli_obj_panel_dim( *b ); inc_t ps_b = bli_obj_panel_stride( *b ); void* buf_c = bli_obj_buffer_at_off( *c ); inc_t rs_c = bli_obj_row_stride( *c ); inc_t cs_c = bli_obj_col_stride( *c ); obj_t scalar_a; obj_t scalar_b; void* buf_alpha; void* buf_beta; FUNCPTR_T f; // Detach and multiply the scalars attached to A and B. bli_obj_scalar_detach( a, &scalar_a ); bli_obj_scalar_detach( b, &scalar_b ); bli_mulsc( &scalar_a, &scalar_b ); // Grab the addresses of the internal scalar buffers for the scalar // merged above and the scalar attached to C. buf_alpha = bli_obj_internal_scalar_buffer( scalar_b ); buf_beta = bli_obj_internal_scalar_buffer( *c ); // Index into the type combination array to extract the correct // function pointer. f = ftypes[dt_exec]; // Invoke the function. f( diagoffb, schema_a, schema_b, m, n, k, buf_alpha, buf_a, cs_a, pd_a, ps_a, buf_b, rs_b, pd_b, ps_b, buf_beta, buf_c, rs_c, cs_c, cntx, thread ); }
void bli_trmm_ru_ker_var2( obj_t* a, obj_t* b, obj_t* c, trmm_t* cntl, trmm_thrinfo_t* thread ) { num_t dt_exec = bli_obj_execution_datatype( *c ); doff_t diagoffb = bli_obj_diag_offset( *b ); dim_t m = bli_obj_length( *c ); dim_t n = bli_obj_width( *c ); dim_t k = bli_obj_width( *a ); void* buf_a = bli_obj_buffer_at_off( *a ); inc_t cs_a = bli_obj_col_stride( *a ); inc_t pd_a = bli_obj_panel_dim( *a ); inc_t ps_a = bli_obj_panel_stride( *a ); void* buf_b = bli_obj_buffer_at_off( *b ); inc_t rs_b = bli_obj_row_stride( *b ); inc_t pd_b = bli_obj_panel_dim( *b ); inc_t ps_b = bli_obj_panel_stride( *b ); void* buf_c = bli_obj_buffer_at_off( *c ); inc_t rs_c = bli_obj_row_stride( *c ); inc_t cs_c = bli_obj_col_stride( *c ); obj_t scalar_a; obj_t scalar_b; void* buf_alpha; void* buf_beta; FUNCPTR_T f; func_t* gemm_ukrs; void* gemm_ukr; // Detach and multiply the scalars attached to A and B. bli_obj_scalar_detach( a, &scalar_a ); bli_obj_scalar_detach( b, &scalar_b ); bli_mulsc( &scalar_a, &scalar_b ); // Grab the addresses of the internal scalar buffers for the scalar // merged above and the scalar attached to C. buf_alpha = bli_obj_internal_scalar_buffer( scalar_b ); buf_beta = bli_obj_internal_scalar_buffer( *c ); // Index into the type combination array to extract the correct // function pointer. f = ftypes[dt_exec]; // Adjust cs_a and rs_b if A and B were packed for 4m or 3m. This // is needed because cs_a and rs_b are used to index into the // micro-panels of A and B, respectively, and since the pointer // types in the macro-kernel (scomplex or dcomplex) will result // in pointer arithmetic that moves twice as far as it should, // given the datatypes actually stored (float or double), we must // halve the strides to compensate. if ( bli_obj_is_panel_packed_4m( *a ) || bli_obj_is_panel_packed_3m( *a ) ) { cs_a /= 2; rs_b /= 2; } // Extract from the control tree node the func_t object containing // the gemm micro-kernel function addresses, and then query the // function address corresponding to the current datatype. gemm_ukrs = cntl_gemm_ukrs( cntl ); gemm_ukr = bli_func_obj_query( dt_exec, gemm_ukrs ); // Invoke the function. f( diagoffb, m, n, k, buf_alpha, buf_a, cs_a, pd_a, ps_a, buf_b, rs_b, pd_b, ps_b, buf_beta, buf_c, rs_c, cs_c, gemm_ukr, thread ); }
void bli_gemm_ker_var2 ( obj_t* a, obj_t* b, obj_t* c, cntx_t* cntx, cntl_t* cntl, thrinfo_t* thread ) { num_t dt_exec = bli_obj_execution_datatype( *c ); pack_t schema_a = bli_obj_pack_schema( *a ); pack_t schema_b = bli_obj_pack_schema( *b ); dim_t m = bli_obj_length( *c ); dim_t n = bli_obj_width( *c ); dim_t k = bli_obj_width( *a ); void* buf_a = bli_obj_buffer_at_off( *a ); inc_t cs_a = bli_obj_col_stride( *a ); inc_t is_a = bli_obj_imag_stride( *a ); dim_t pd_a = bli_obj_panel_dim( *a ); inc_t ps_a = bli_obj_panel_stride( *a ); void* buf_b = bli_obj_buffer_at_off( *b ); inc_t rs_b = bli_obj_row_stride( *b ); inc_t is_b = bli_obj_imag_stride( *b ); dim_t pd_b = bli_obj_panel_dim( *b ); inc_t ps_b = bli_obj_panel_stride( *b ); void* buf_c = bli_obj_buffer_at_off( *c ); inc_t rs_c = bli_obj_row_stride( *c ); inc_t cs_c = bli_obj_col_stride( *c ); obj_t scalar_a; obj_t scalar_b; void* buf_alpha; void* buf_beta; FUNCPTR_T f; // Detach and multiply the scalars attached to A and B. bli_obj_scalar_detach( a, &scalar_a ); bli_obj_scalar_detach( b, &scalar_b ); bli_mulsc( &scalar_a, &scalar_b ); // Grab the addresses of the internal scalar buffers for the scalar // merged above and the scalar attached to C. buf_alpha = bli_obj_internal_scalar_buffer( scalar_b ); buf_beta = bli_obj_internal_scalar_buffer( *c ); // If 1m is being employed on a column- or row-stored matrix with a // real-valued beta, we can use the real domain macro-kernel, which // eliminates a little overhead associated with the 1m virtual // micro-kernel. #if 1 if ( bli_is_1m_packed( schema_a ) ) { bli_l3_ind_recast_1m_params ( dt_exec, schema_a, c, m, n, k, pd_a, ps_a, pd_b, ps_b, rs_c, cs_c ); } #endif // Index into the type combination array to extract the correct // function pointer. f = ftypes[dt_exec]; // Invoke the function. f( schema_a, schema_b, m, n, k, buf_alpha, buf_a, cs_a, is_a, pd_a, ps_a, buf_b, rs_b, is_b, pd_b, ps_b, buf_beta, buf_c, rs_c, cs_c, cntx, thread ); }