bool_t bli_obj_scalar_equals( obj_t* a, obj_t* beta )
{
    obj_t  scalar_a;
    bool_t r_val;

    // Detach the scalar attached to A and compare it to beta.
    bli_obj_scalar_detach( a, &scalar_a );

    r_val = bli_obj_equals( &scalar_a, beta );

    return r_val;
}
void bli_obj_scalar_apply_scalar( obj_t* alpha, obj_t* a )
{
    obj_t alpha_cast;
    obj_t scalar_a;

    // Make a copy-cast of alpha of the same datatype as A. This step
    // gives us the opportunity to typecast alpha.
    bli_obj_scalar_init_detached_copy_of( bli_obj_datatype( *a ),
                                          BLIS_NO_CONJUGATE,
                                          alpha,
                                          &alpha_cast );

    // Detach the scalar from A.
    bli_obj_scalar_detach( a, &scalar_a );

    // Scale the detached scalar by alpha.
    bli_mulsc( &alpha_cast, &scalar_a );

    // Copy the internal scalar in scalar_a to A.
    bli_obj_copy_internal_scalar( scalar_a, *a );
}
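// --- Illustrative usage sketch (not part of the BLIS sources) --------------
// A minimal example of how the two scalar helpers above might be exercised,
// assuming the standard BLIS object API (bli_obj_create_1x1(), bli_setsc(),
// bli_obj_create(), bli_obj_free()). The scenario and the function name
// example_scalar_helpers() are hypothetical.
static void example_scalar_helpers( void )
{
    obj_t alpha;
    obj_t a;

    // Create a 1x1 scalar object holding 2.0 and a 4x4 double-precision A.
    bli_obj_create_1x1( BLIS_DOUBLE, &alpha );
    bli_setsc( 2.0, 0.0, &alpha );
    bli_obj_create( BLIS_DOUBLE, 4, 4, 0, 0, &a );

    // Fold alpha into the scalar attached to A; any kernel that honors the
    // attached scalar will subsequently treat A as 2.0 * A.
    bli_obj_scalar_apply_scalar( &alpha, &a );

    // The scalar attached to A should now compare equal to alpha.
    if ( bli_obj_scalar_equals( &a, &alpha ) )
    {
        // ... scalars match ...
    }

    bli_obj_free( &alpha );
    bli_obj_free( &a );
}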
void bli_herk_l_ker_var2( obj_t*  a,
                          obj_t*  b,
                          obj_t*  c,
                          gemm_t* cntl,
                          herk_thrinfo_t* thread )
{
    num_t   dt_exec  = bli_obj_execution_datatype( *c );

    doff_t  diagoffc = bli_obj_diag_offset( *c );

    pack_t  schema_a = bli_obj_pack_schema( *a );
    pack_t  schema_b = bli_obj_pack_schema( *b );

    dim_t   m        = bli_obj_length( *c );
    dim_t   n        = bli_obj_width( *c );
    dim_t   k        = bli_obj_width( *a );

    void*   buf_a    = bli_obj_buffer_at_off( *a );
    inc_t   cs_a     = bli_obj_col_stride( *a );
    inc_t   pd_a     = bli_obj_panel_dim( *a );
    inc_t   ps_a     = bli_obj_panel_stride( *a );

    void*   buf_b    = bli_obj_buffer_at_off( *b );
    inc_t   rs_b     = bli_obj_row_stride( *b );
    inc_t   pd_b     = bli_obj_panel_dim( *b );
    inc_t   ps_b     = bli_obj_panel_stride( *b );

    void*   buf_c    = bli_obj_buffer_at_off( *c );
    inc_t   rs_c     = bli_obj_row_stride( *c );
    inc_t   cs_c     = bli_obj_col_stride( *c );

    obj_t   scalar_a;
    obj_t   scalar_b;

    void*   buf_alpha;
    void*   buf_beta;

    FUNCPTR_T f;

    func_t* gemm_ukrs;
    void*   gemm_ukr;

    // Detach and multiply the scalars attached to A and B.
    bli_obj_scalar_detach( a, &scalar_a );
    bli_obj_scalar_detach( b, &scalar_b );
    bli_mulsc( &scalar_a, &scalar_b );

    // Grab the addresses of the internal scalar buffers for the scalar
    // merged above and the scalar attached to C.
    buf_alpha = bli_obj_internal_scalar_buffer( scalar_b );
    buf_beta  = bli_obj_internal_scalar_buffer( *c );

    // Index into the type combination array to extract the correct
    // function pointer.
    f = ftypes[dt_exec];

    // Extract from the control tree node the func_t object containing
    // the gemm micro-kernel function addresses, and then query the
    // function address corresponding to the current datatype.
    gemm_ukrs = cntl_gemm_ukrs( cntl );
    gemm_ukr  = bli_func_obj_query( dt_exec, gemm_ukrs );

    // Invoke the function.
    f( diagoffc,
       schema_a,
       schema_b,
       m,
       n,
       k,
       buf_alpha,
       buf_a, cs_a, pd_a, ps_a,
       buf_b, rs_b, pd_b, ps_b,
       buf_beta,
       buf_c, rs_c, cs_c,
       gemm_ukr,
       thread );
}
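// --- Illustrative sketch (assumption, not the verbatim BLIS source) --------
// The ftypes array indexed by dt_exec above is generated in BLIS from the
// type-specific variants via macro machinery (GENARRAY/PASTEMAC). Assuming
// the FUNCPTR_T typedef declared earlier in the variant's source file, a
// hand-expanded equivalent would look roughly like this:
//
//     static FUNCPTR_T ftypes[BLIS_NUM_FP_TYPES] =
//     {
//         bli_sherk_l_ker_var2,   // BLIS_FLOAT
//         bli_cherk_l_ker_var2,   // BLIS_SCOMPLEX
//         bli_dherk_l_ker_var2,   // BLIS_DOUBLE
//         bli_zherk_l_ker_var2    // BLIS_DCOMPLEX
//     };
//
// so that f = ftypes[dt_exec] selects the macro-kernel instantiated for the
// execution datatype of C. (The ordering shown follows the num_t encoding
// and is stated here as an assumption, not quoted from the BLIS sources.)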
void bli_herk_u_ker_var2
     (
       obj_t*  a,
       obj_t*  b,
       obj_t*  c,
       cntx_t* cntx,
       cntl_t* cntl,
       thrinfo_t* thread
     )
{
    num_t   dt_exec  = bli_obj_exec_dt( c );

    doff_t  diagoffc = bli_obj_diag_offset( c );

    pack_t  schema_a = bli_obj_pack_schema( a );
    pack_t  schema_b = bli_obj_pack_schema( b );

    dim_t   m        = bli_obj_length( c );
    dim_t   n        = bli_obj_width( c );
    dim_t   k        = bli_obj_width( a );

    void*   buf_a    = bli_obj_buffer_at_off( a );
    inc_t   cs_a     = bli_obj_col_stride( a );
    inc_t   is_a     = bli_obj_imag_stride( a );
    dim_t   pd_a     = bli_obj_panel_dim( a );
    inc_t   ps_a     = bli_obj_panel_stride( a );

    void*   buf_b    = bli_obj_buffer_at_off( b );
    inc_t   rs_b     = bli_obj_row_stride( b );
    inc_t   is_b     = bli_obj_imag_stride( b );
    dim_t   pd_b     = bli_obj_panel_dim( b );
    inc_t   ps_b     = bli_obj_panel_stride( b );

    void*   buf_c    = bli_obj_buffer_at_off( c );
    inc_t   rs_c     = bli_obj_row_stride( c );
    inc_t   cs_c     = bli_obj_col_stride( c );

    obj_t   scalar_a;
    obj_t   scalar_b;

    void*   buf_alpha;
    void*   buf_beta;

    FUNCPTR_T f;

    // Detach and multiply the scalars attached to A and B.
    bli_obj_scalar_detach( a, &scalar_a );
    bli_obj_scalar_detach( b, &scalar_b );
    bli_mulsc( &scalar_a, &scalar_b );

    // Grab the addresses of the internal scalar buffers for the scalar
    // merged above and the scalar attached to C.
    buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
    buf_beta  = bli_obj_internal_scalar_buffer( c );

    // Index into the type combination array to extract the correct
    // function pointer.
    f = ftypes[dt_exec];

    // Invoke the function.
    f( diagoffc,
       schema_a,
       schema_b,
       m,
       n,
       k,
       buf_alpha,
       buf_a, cs_a, is_a, pd_a, ps_a,
       buf_b, rs_b, is_b, pd_b, ps_b,
       buf_beta,
       buf_c, rs_c, cs_c,
       cntx,
       thread );
}
void bli_packm_blk_var1( obj_t* c,
                         obj_t* p,
                         packm_thrinfo_t* t )
{
    num_t   dt_cp    = bli_obj_datatype( *c );

    struc_t strucc   = bli_obj_struc( *c );
    doff_t  diagoffc = bli_obj_diag_offset( *c );
    diag_t  diagc    = bli_obj_diag( *c );
    uplo_t  uploc    = bli_obj_uplo( *c );
    trans_t transc   = bli_obj_conjtrans_status( *c );
    pack_t  schema   = bli_obj_pack_schema( *p );
    bool_t  invdiag  = bli_obj_has_inverted_diag( *p );
    bool_t  revifup  = bli_obj_is_pack_rev_if_upper( *p );
    bool_t  reviflo  = bli_obj_is_pack_rev_if_lower( *p );

    dim_t   m_p      = bli_obj_length( *p );
    dim_t   n_p      = bli_obj_width( *p );
    dim_t   m_max_p  = bli_obj_padded_length( *p );
    dim_t   n_max_p  = bli_obj_padded_width( *p );

    void*   buf_c    = bli_obj_buffer_at_off( *c );
    inc_t   rs_c     = bli_obj_row_stride( *c );
    inc_t   cs_c     = bli_obj_col_stride( *c );

    void*   buf_p    = bli_obj_buffer_at_off( *p );
    inc_t   rs_p     = bli_obj_row_stride( *p );
    inc_t   cs_p     = bli_obj_col_stride( *p );
    inc_t   is_p     = bli_obj_imag_stride( *p );
    dim_t   pd_p     = bli_obj_panel_dim( *p );
    inc_t   ps_p     = bli_obj_panel_stride( *p );

    obj_t   kappa;
    // Initialize the pointer to NULL to stop gcc from complaining. (2-9-16 GH)
    obj_t*  kappa_p  = NULL;
    void*   buf_kappa;

    func_t* packm_kers;
    void*   packm_ker;

    FUNCPTR_T f;

    // Treatment of kappa (i.e., scaling during packing) depends on
    // whether we are executing an induced method.
    if ( bli_is_ind_packed( schema ) )
    {
        // The value for kappa we use will depend on whether the scalar
        // attached to A has a nonzero imaginary component. If it does,
        // then we will apply the scalar during packing to facilitate
        // implementing induced complex domain algorithms in terms of
        // real domain micro-kernels. (In the aforementioned situation,
        // applying a real scalar is easy, but applying a complex one is
        // harder, so we avoid the need altogether with the code below.)
        if ( thread_am_ochief( t ) )
        {
            if ( bli_obj_scalar_has_nonzero_imag( p ) )
            {
                // Detach the scalar.
                bli_obj_scalar_detach( p, &kappa );

                // Reset the attached scalar (to 1.0).
                bli_obj_scalar_reset( p );

                kappa_p = &kappa;
            }
            else
            {
                // If the internal scalar of A has only a real component, then
                // we will apply it later (in the micro-kernel), and so we will
                // use BLIS_ONE to indicate no scaling during packing.
                kappa_p = &BLIS_ONE;
            }
        }
        kappa_p = thread_obroadcast( t, kappa_p );

        // Acquire the buffer for the kappa chosen above.
        buf_kappa = bli_obj_buffer_for_1x1( dt_cp, *kappa_p );
    }
    else // if ( bli_is_nat_packed( schema ) )
    {
        // This branch is for native execution, where we assume that
        // the micro-kernel will always apply the alpha scalar of the
        // higher-level operation. Thus, we use BLIS_ONE for kappa so
        // that the underlying packm implementation does not perform
        // any scaling during packing.
        buf_kappa = bli_obj_buffer_for_const( dt_cp, BLIS_ONE );
    }

    // Choose the correct func_t object based on the pack_t schema.
    if      ( bli_is_4mi_packed( schema ) )   packm_kers = packm_struc_cxk_4mi_kers;
    else if ( bli_is_3mi_packed( schema ) ||
              bli_is_3ms_packed( schema ) )   packm_kers = packm_struc_cxk_3mis_kers;
    else if ( bli_is_ro_packed( schema ) ||
              bli_is_io_packed( schema ) ||
              bli_is_rpi_packed( schema ) )   packm_kers = packm_struc_cxk_rih_kers;
    else                                      packm_kers = packm_struc_cxk_kers;

    // Query the datatype-specific function pointer from the func_t object.
    packm_ker = bli_func_obj_query( dt_cp, packm_kers );

    // Index into the type combination array to extract the correct
    // function pointer.
    f = ftypes[dt_cp];

    // Invoke the function.
    f( strucc,
       diagoffc,
       diagc,
       uploc,
       transc,
       schema,
       invdiag,
       revifup,
       reviflo,
       m_p,
       n_p,
       m_max_p,
       n_max_p,
       buf_kappa,
       buf_c, rs_c, cs_c,
       buf_p, rs_p, cs_p,
              is_p,
              pd_p, ps_p,
       packm_ker,
       t );
}
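// --- Illustrative note (restating the kappa logic above) -------------------
// A concrete view of the decision made for induced-method packing, assuming a
// complex packed object P with a hypothetical attached scalar:
//
//   attached scalar of P    kappa used while packing       scalar left on P
//   --------------------    ------------------------       ----------------
//   2.0 + 0.0i              BLIS_ONE  (defer to ukernel)   2.0 + 0.0i
//   2.0 + 1.0i              2.0 + 1.0i (apply now)         1.0 + 0.0i
//
// In the second case the complex scalar is folded in during packing so that
// the real-domain micro-kernels never need to apply a complex alpha.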
void bli_trmm_rl_ker_var2( obj_t*  a,
                           obj_t*  b,
                           obj_t*  c,
                           trmm_t* cntl )
{
    num_t   dt_exec  = bli_obj_execution_datatype( *c );

    doff_t  diagoffb = bli_obj_diag_offset( *b );

    dim_t   m        = bli_obj_length( *c );
    dim_t   n        = bli_obj_width( *c );
    dim_t   k        = bli_obj_width( *a );

    void*   buf_a    = bli_obj_buffer_at_off( *a );
    inc_t   rs_a     = bli_obj_row_stride( *a );
    inc_t   cs_a     = bli_obj_col_stride( *a );
    inc_t   ps_a     = bli_obj_panel_stride( *a );

    void*   buf_b    = bli_obj_buffer_at_off( *b );
    inc_t   rs_b     = bli_obj_row_stride( *b );
    inc_t   cs_b     = bli_obj_col_stride( *b );
    inc_t   ps_b     = bli_obj_panel_stride( *b );

    void*   buf_c    = bli_obj_buffer_at_off( *c );
    inc_t   rs_c     = bli_obj_row_stride( *c );
    inc_t   cs_c     = bli_obj_col_stride( *c );

    obj_t   scalar_a;
    obj_t   scalar_b;

    void*   buf_alpha;
    void*   buf_beta;

    FUNCPTR_T f;

    // Detach and multiply the scalars attached to A and B.
    bli_obj_scalar_detach( a, &scalar_a );
    bli_obj_scalar_detach( b, &scalar_b );
    bli_mulsc( &scalar_a, &scalar_b );

    // Grab the addresses of the internal scalar buffers for the scalar
    // merged above and the scalar attached to C.
    buf_alpha = bli_obj_internal_scalar_buffer( scalar_b );
    buf_beta  = bli_obj_internal_scalar_buffer( *c );

    // Index into the type combination array to extract the correct
    // function pointer.
    f = ftypes[dt_exec];

    // Invoke the function.
    f( diagoffb,
       m,
       n,
       k,
       buf_alpha,
       buf_a, rs_a, cs_a, ps_a,
       buf_b, rs_b, cs_b, ps_b,
       buf_beta,
       buf_c, rs_c, cs_c );
}
void bli_packm_blk_var1_md
     (
       obj_t*  c,
       obj_t*  p,
       cntx_t* cntx,
       cntl_t* cntl,
       thrinfo_t* t
     )
{
    num_t   dt_c     = bli_obj_dt( c );
    num_t   dt_p     = bli_obj_dt( p );

    trans_t transc   = bli_obj_conjtrans_status( c );
    pack_t  schema   = bli_obj_pack_schema( p );

    dim_t   m_p      = bli_obj_length( p );
    dim_t   n_p      = bli_obj_width( p );
    dim_t   m_max_p  = bli_obj_padded_length( p );
    dim_t   n_max_p  = bli_obj_padded_width( p );

    void*   buf_c    = bli_obj_buffer_at_off( c );
    inc_t   rs_c     = bli_obj_row_stride( c );
    inc_t   cs_c     = bli_obj_col_stride( c );

    void*   buf_p    = bli_obj_buffer_at_off( p );
    inc_t   rs_p     = bli_obj_row_stride( p );
    inc_t   cs_p     = bli_obj_col_stride( p );
    inc_t   is_p     = bli_obj_imag_stride( p );
    dim_t   pd_p     = bli_obj_panel_dim( p );
    inc_t   ps_p     = bli_obj_panel_stride( p );

    obj_t   kappa;
    void*   buf_kappa;

    FUNCPTR_T f;

    // Treatment of kappa (i.e., scaling during packing) depends on
    // whether we are executing an induced method.
    if ( bli_is_nat_packed( schema ) )
    {
        // This branch is for native execution, where we assume that
        // the micro-kernel will always apply the alpha scalar of the
        // higher-level operation. Thus, we use BLIS_ONE for kappa so
        // that the underlying packm implementation does not perform
        // any scaling during packing.
        buf_kappa = bli_obj_buffer_for_const( dt_p, &BLIS_ONE );
    }
    else // if ( bli_is_ind_packed( schema ) )
    {
        obj_t* kappa_p;

        // The value for kappa we use will depend on whether the scalar
        // attached to A has a nonzero imaginary component. If it does,
        // then we will apply the scalar during packing to facilitate
        // implementing induced complex domain algorithms in terms of
        // real domain micro-kernels. (In the aforementioned situation,
        // applying a real scalar is easy, but applying a complex one is
        // harder, so we avoid the need altogether with the code below.)
        if ( bli_obj_scalar_has_nonzero_imag( p ) )
        {
            // Detach the scalar.
            bli_obj_scalar_detach( p, &kappa );

            // Reset the attached scalar (to 1.0).
            bli_obj_scalar_reset( p );

            kappa_p = &kappa;
        }
        else
        {
            // If the internal scalar of A has only a real component, then
            // we will apply it later (in the micro-kernel), and so we will
            // use BLIS_ONE to indicate no scaling during packing.
            kappa_p = &BLIS_ONE;
        }

        // Acquire the buffer for the kappa chosen above.
        buf_kappa = bli_obj_buffer_for_1x1( dt_p, kappa_p );
    }

    // Index into the type combination array to extract the correct
    // function pointer.
    f = ftypes[dt_c][dt_p];

    // Invoke the function.
    f( transc,
       schema,
       m_p,
       n_p,
       m_max_p,
       n_max_p,
       buf_kappa,
       buf_c, rs_c, cs_c,
       buf_p, rs_p, cs_p,
              is_p,
              pd_p, ps_p,
       cntx,
       t );
}
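// --- Illustrative note (assumption) -----------------------------------------
// Unlike the single-datatype variants elsewhere in this section, the
// mixed-datatype packm dispatches on two types at once: f = ftypes[dt_c][dt_p]
// indexes a two-dimensional table whose (dt_c, dt_p) entry packs a dt_c-typed
// source matrix into a dt_p-typed packed buffer; for example,
// ftypes[BLIS_FLOAT][BLIS_DOUBLE] would select the float-to-double
// instantiation. The table layout is inferred from the indexing expression
// above, not quoted from the BLIS sources.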
void bli_trmm_ru_ker_var2( obj_t*  a,
                           obj_t*  b,
                           obj_t*  c,
                           trmm_t* cntl,
                           trmm_thrinfo_t* thread )
{
    num_t   dt_exec  = bli_obj_execution_datatype( *c );

    doff_t  diagoffb = bli_obj_diag_offset( *b );

    dim_t   m        = bli_obj_length( *c );
    dim_t   n        = bli_obj_width( *c );
    dim_t   k        = bli_obj_width( *a );

    void*   buf_a    = bli_obj_buffer_at_off( *a );
    inc_t   cs_a     = bli_obj_col_stride( *a );
    inc_t   pd_a     = bli_obj_panel_dim( *a );
    inc_t   ps_a     = bli_obj_panel_stride( *a );

    void*   buf_b    = bli_obj_buffer_at_off( *b );
    inc_t   rs_b     = bli_obj_row_stride( *b );
    inc_t   pd_b     = bli_obj_panel_dim( *b );
    inc_t   ps_b     = bli_obj_panel_stride( *b );

    void*   buf_c    = bli_obj_buffer_at_off( *c );
    inc_t   rs_c     = bli_obj_row_stride( *c );
    inc_t   cs_c     = bli_obj_col_stride( *c );

    obj_t   scalar_a;
    obj_t   scalar_b;

    void*   buf_alpha;
    void*   buf_beta;

    FUNCPTR_T f;

    func_t* gemm_ukrs;
    void*   gemm_ukr;

    // Detach and multiply the scalars attached to A and B.
    bli_obj_scalar_detach( a, &scalar_a );
    bli_obj_scalar_detach( b, &scalar_b );
    bli_mulsc( &scalar_a, &scalar_b );

    // Grab the addresses of the internal scalar buffers for the scalar
    // merged above and the scalar attached to C.
    buf_alpha = bli_obj_internal_scalar_buffer( scalar_b );
    buf_beta  = bli_obj_internal_scalar_buffer( *c );

    // Index into the type combination array to extract the correct
    // function pointer.
    f = ftypes[dt_exec];

    // Adjust cs_a and rs_b if A and B were packed for 4m or 3m. This
    // is needed because cs_a and rs_b are used to index into the
    // micro-panels of A and B, respectively, and since the pointer
    // types in the macro-kernel (scomplex or dcomplex) will result
    // in pointer arithmetic that moves twice as far as it should,
    // given the datatypes actually stored (float or double), we must
    // halve the strides to compensate.
    if ( bli_obj_is_panel_packed_4m( *a ) ||
         bli_obj_is_panel_packed_3m( *a ) ) { cs_a /= 2; rs_b /= 2; }

    // Extract from the control tree node the func_t object containing
    // the gemm micro-kernel function addresses, and then query the
    // function address corresponding to the current datatype.
    gemm_ukrs = cntl_gemm_ukrs( cntl );
    gemm_ukr  = bli_func_obj_query( dt_exec, gemm_ukrs );

    // Invoke the function.
    f( diagoffb,
       m,
       n,
       k,
       buf_alpha,
       buf_a, cs_a, pd_a, ps_a,
       buf_b, rs_b, pd_b, ps_b,
       buf_beta,
       buf_c, rs_c, cs_c,
       gemm_ukr,
       thread );
}
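// --- Worked example of the stride halving above (hypothetical values) -------
// Suppose A was packed for 4m with dcomplex micro-panels whose underlying
// storage is double, and suppose cs_a = 8 as computed in units of stored
// (real) elements. The macro-kernel indexes micro-panels through a dcomplex
// pointer, so "a + cs_a" would advance 8 * sizeof(dcomplex) = 128 bytes,
// twice the intended 8 * sizeof(double) = 64 bytes. Halving cs_a to 4 makes
// the dcomplex pointer arithmetic land on the intended address
// (4 * 16 = 64 bytes); the same reasoning applies to rs_b.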
void bli_gemm_ker_var2
     (
       obj_t*  a,
       obj_t*  b,
       obj_t*  c,
       cntx_t* cntx,
       cntl_t* cntl,
       thrinfo_t* thread
     )
{
    num_t   dt_exec  = bli_obj_execution_datatype( *c );

    pack_t  schema_a = bli_obj_pack_schema( *a );
    pack_t  schema_b = bli_obj_pack_schema( *b );

    dim_t   m        = bli_obj_length( *c );
    dim_t   n        = bli_obj_width( *c );
    dim_t   k        = bli_obj_width( *a );

    void*   buf_a    = bli_obj_buffer_at_off( *a );
    inc_t   cs_a     = bli_obj_col_stride( *a );
    inc_t   is_a     = bli_obj_imag_stride( *a );
    dim_t   pd_a     = bli_obj_panel_dim( *a );
    inc_t   ps_a     = bli_obj_panel_stride( *a );

    void*   buf_b    = bli_obj_buffer_at_off( *b );
    inc_t   rs_b     = bli_obj_row_stride( *b );
    inc_t   is_b     = bli_obj_imag_stride( *b );
    dim_t   pd_b     = bli_obj_panel_dim( *b );
    inc_t   ps_b     = bli_obj_panel_stride( *b );

    void*   buf_c    = bli_obj_buffer_at_off( *c );
    inc_t   rs_c     = bli_obj_row_stride( *c );
    inc_t   cs_c     = bli_obj_col_stride( *c );

    obj_t   scalar_a;
    obj_t   scalar_b;

    void*   buf_alpha;
    void*   buf_beta;

    FUNCPTR_T f;

    // Detach and multiply the scalars attached to A and B.
    bli_obj_scalar_detach( a, &scalar_a );
    bli_obj_scalar_detach( b, &scalar_b );
    bli_mulsc( &scalar_a, &scalar_b );

    // Grab the addresses of the internal scalar buffers for the scalar
    // merged above and the scalar attached to C.
    buf_alpha = bli_obj_internal_scalar_buffer( scalar_b );
    buf_beta  = bli_obj_internal_scalar_buffer( *c );

    // If 1m is being employed on a column- or row-stored matrix with a
    // real-valued beta, we can use the real domain macro-kernel, which
    // eliminates a little overhead associated with the 1m virtual
    // micro-kernel.
#if 1
    if ( bli_is_1m_packed( schema_a ) )
    {
        bli_l3_ind_recast_1m_params
        (
          dt_exec,
          schema_a,
          c,
          m, n, k,
          pd_a, ps_a,
          pd_b, ps_b,
          rs_c, cs_c
        );
    }
#endif

    // Index into the type combination array to extract the correct
    // function pointer.
    f = ftypes[dt_exec];

    // Invoke the function.
    f( schema_a,
       schema_b,
       m,
       n,
       k,
       buf_alpha,
       buf_a, cs_a, is_a, pd_a, ps_a,
       buf_b, rs_b, is_b, pd_b, ps_b,
       buf_beta,
       buf_c, rs_c, cs_c,
       cntx,
       thread );
}
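// --- Illustrative usage sketch (not part of the BLIS sources) ---------------
// The kernel variant above sits several layers below the user-level object
// API. This sketch shows the kind of entry point whose alpha and beta
// ultimately arrive here as the scalars attached to the packed A/B objects
// and to C (detached and merged at the top of the function). The function
// name example_gemm_call() and the scalar values are hypothetical.
static void example_gemm_call( obj_t* a, obj_t* b, obj_t* c )
{
    obj_t alpha;
    obj_t beta;

    // alpha = 1.5 and beta = 1.0, created as 1x1 scalar objects.
    bli_obj_create_1x1( BLIS_DOUBLE, &alpha );
    bli_obj_create_1x1( BLIS_DOUBLE, &beta );
    bli_setsc( 1.5, 0.0, &alpha );
    bli_setsc( 1.0, 0.0, &beta );

    // C := beta * C + alpha * A * B. The alpha and beta given here propagate
    // down as the scalars that bli_gemm_ker_var2() detaches from A and B and
    // reads from C via bli_obj_internal_scalar_buffer().
    bli_gemm( &alpha, a, b, &beta, c );

    bli_obj_free( &alpha );
    bli_obj_free( &beta );
}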